refactor(skills): Restructure skills to dual-platform architecture
Major refactoring of ourdigital-custom-skills with new numbering system:

## Structure Changes
- Each skill now has code/ (Claude Code) and desktop/ (Claude Desktop) versions
- New progressive numbering: 01-09 General, 10-19 SEO, 20-29 GTM, 30-39 OurDigital, 40-49 Jamie

## Skill Reorganization
- 01-notion-organizer (from 02)
- 10-18: SEO tools split into focused skills (technical, on-page, local, schema, vitals, gsc, gateway)
- 20-21: GTM audit and manager
- 30-32: OurDigital designer, research, presentation
- 40-41: Jamie brand editor and audit

## New Files
- .claude/commands/: Slash command definitions for all skills
- CLAUDE.md: Updated with new skill structure documentation
- REFACTORING_PLAN.md: Migration documentation
- COMPATIBILITY_REPORT.md, SKILLS_COMPARISON.md: Analysis docs

## Removed
- Old skill directories (02-05, 10-14, 20-21 old numbering)
- Consolidated into new structure with _archive/ for reference

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,207 @@
|
||||
"""
|
||||
Base Client - Shared async client utilities
|
||||
===========================================
|
||||
Purpose: Rate-limited async operations for API clients
|
||||
Python: 3.10+
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
from asyncio import Semaphore
|
||||
from datetime import datetime
|
||||
from typing import Any, Callable, TypeVar
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from tenacity import (
|
||||
retry,
|
||||
stop_after_attempt,
|
||||
wait_exponential,
|
||||
retry_if_exception_type,
|
||||
)
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv()
|
||||
|
||||
# Logging setup
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
class RateLimiter:
    """Rate limiter using token bucket algorithm.

    Must be used from inside a running asyncio event loop (acquire() reads
    the loop's monotonic clock). Not safe to share across event loops.
    """

    def __init__(self, rate: float, per: float = 1.0):
        """
        Initialize rate limiter.

        Args:
            rate: Number of requests allowed
            per: Time period in seconds (default: 1 second)
        """
        self.rate = rate
        self.per = per
        self.tokens = rate
        # Monotonic timestamp of the last refill. Set lazily on the first
        # acquire() because the event loop clock requires a running loop.
        # (Previously datetime.now() was used; wall-clock time can jump
        # backwards/forwards under NTP or DST changes, corrupting the
        # elapsed-time math below.)
        self.last_update: float | None = None
        self._lock = asyncio.Lock()

    async def acquire(self) -> None:
        """Acquire a token, waiting if necessary.

        Holding the lock while sleeping intentionally serializes waiters,
        so concurrent callers are released at the configured rate.
        """
        async with self._lock:
            now = asyncio.get_running_loop().time()
            if self.last_update is None:
                self.last_update = now
            elapsed = now - self.last_update
            # Refill proportionally to elapsed time, capped at bucket size.
            self.tokens = min(self.rate, self.tokens + elapsed * (self.rate / self.per))
            self.last_update = now

            if self.tokens < 1:
                # Not enough budget: sleep exactly long enough for one
                # token to accrue, then consume it.
                wait_time = (1 - self.tokens) * (self.per / self.rate)
                await asyncio.sleep(wait_time)
                self.tokens = 0
            else:
                self.tokens -= 1
|
||||
|
||||
|
||||
class BaseAsyncClient:
    """Base class for async API clients with rate limiting.

    Combines a concurrency cap (Semaphore), a token-bucket RateLimiter,
    and tenacity-based retries. Subclasses wrap individual API calls in
    _rate_limited_request() or fan out many calls via batch_requests().
    """

    def __init__(
        self,
        max_concurrent: int = 5,
        requests_per_second: float = 3.0,
        logger: logging.Logger | None = None,
    ):
        """
        Initialize base client.

        Args:
            max_concurrent: Maximum concurrent requests
            requests_per_second: Rate limit
            logger: Logger instance
        """
        self.semaphore = Semaphore(max_concurrent)
        self.rate_limiter = RateLimiter(requests_per_second)
        # Default logger is named after the concrete subclass.
        self.logger = logger or logging.getLogger(self.__class__.__name__)
        # NOTE(review): "retries" is never incremented anywhere in this
        # class — confirm whether tenacity's retry statistics should
        # feed it, or drop the key.
        self.stats = {
            "requests": 0,
            "success": 0,
            "errors": 0,
            "retries": 0,
        }

    # Up to 3 attempts with exponential backoff (2s min, 10s cap).
    # NOTE(review): retry_if_exception_type(Exception) retries *every*
    # failure, including programming errors; and because the decorator
    # wraps the whole method, the "requests"/"errors" counters below
    # increase once per attempt, not per logical request — confirm that
    # the attempt-level counting is intended.
    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        retry=retry_if_exception_type(Exception),
    )
    async def _rate_limited_request(
        self,
        coro: Callable[[], Any],
    ) -> Any:
        """Execute a request with rate limiting and retry.

        Args:
            coro: Zero-argument callable that returns an awaitable
                (called fresh on each retry attempt).

        Returns:
            Whatever the awaited request returns.

        Raises:
            Exception: Re-raises the last failure once retries are exhausted.
        """
        async with self.semaphore:
            await self.rate_limiter.acquire()
            self.stats["requests"] += 1
            try:
                result = await coro()
                self.stats["success"] += 1
                return result
            except Exception as e:
                self.stats["errors"] += 1
                self.logger.error(f"Request failed: {e}")
                raise

    async def batch_requests(
        self,
        requests: list[Callable[[], Any]],
        desc: str = "Processing",
    ) -> list[Any]:
        """Execute multiple requests concurrently.

        Args:
            requests: Zero-argument callables, each returning an awaitable.
            desc: Progress-bar label (only used when tqdm is installed).

        Returns:
            One result per request; failures surface as {"error": str}
            dicts. NOTE(review): the tqdm path returns results in
            completion order, the gather path in input order — confirm
            callers do not rely on ordering.
        """
        # tqdm is an optional dependency; fall back to a plain gather.
        try:
            from tqdm.asyncio import tqdm
            has_tqdm = True
        except ImportError:
            has_tqdm = False

        async def execute(req: Callable) -> Any:
            # Convert failures into error dicts so one bad request
            # cannot sink the whole batch.
            try:
                return await self._rate_limited_request(req)
            except Exception as e:
                return {"error": str(e)}

        tasks = [execute(req) for req in requests]

        if has_tqdm:
            results = []
            for coro in tqdm.as_completed(tasks, total=len(tasks), desc=desc):
                result = await coro
                results.append(result)
            return results
        else:
            # execute() already swallows exceptions, so return_exceptions
            # here is a belt-and-braces guard.
            return await asyncio.gather(*tasks, return_exceptions=True)

    def print_stats(self) -> None:
        """Log request statistics at INFO level (retries not reported)."""
        self.logger.info("=" * 40)
        self.logger.info("Request Statistics:")
        self.logger.info(f" Total Requests: {self.stats['requests']}")
        self.logger.info(f" Successful: {self.stats['success']}")
        self.logger.info(f" Errors: {self.stats['errors']}")
        self.logger.info("=" * 40)
|
||||
|
||||
|
||||
class ConfigManager:
    """Central access point for API configuration and credentials.

    All values are read lazily from the process environment, which is
    populated from a .env file (via python-dotenv) at construction time.
    """

    def __init__(self):
        # Populate os.environ from .env so the properties below see it.
        load_dotenv()

    @property
    def google_credentials_path(self) -> str | None:
        """Path to the Google service account credentials file, if any."""
        # Prefer SEO-specific credentials, fallback to general credentials
        preferred = os.path.expanduser("~/.credential/ourdigital-seo-agent.json")
        if os.path.exists(preferred):
            return preferred
        return os.getenv("GOOGLE_APPLICATION_CREDENTIALS")

    @property
    def pagespeed_api_key(self) -> str | None:
        """PageSpeed Insights API key, if configured."""
        return os.getenv("PAGESPEED_API_KEY")

    @property
    def custom_search_api_key(self) -> str | None:
        """Custom Search API key, if configured."""
        return os.getenv("CUSTOM_SEARCH_API_KEY")

    @property
    def custom_search_engine_id(self) -> str | None:
        """Custom Search Engine ID, if configured."""
        return os.getenv("CUSTOM_SEARCH_ENGINE_ID")

    @property
    def notion_token(self) -> str | None:
        """Notion API token (NOTION_TOKEN preferred over NOTION_API_KEY)."""
        return os.getenv("NOTION_TOKEN") or os.getenv("NOTION_API_KEY")

    def validate_google_credentials(self) -> bool:
        """Return True when Google credentials are configured and on disk."""
        path = self.google_credentials_path
        return bool(path) and os.path.exists(path)

    def get_required(self, key: str) -> str:
        """Return the environment variable *key* or raise if unset/empty.

        Raises:
            ValueError: When the variable is missing or empty.
        """
        value = os.getenv(key)
        if value:
            return value
        raise ValueError(f"Missing required environment variable: {key}")


# Singleton config instance
config = ConfigManager()
|
||||
@@ -0,0 +1,569 @@
|
||||
"""
|
||||
Page Analyzer - Extract SEO metadata from web pages
|
||||
===================================================
|
||||
Purpose: Comprehensive page-level SEO data extraction
|
||||
Python: 3.10+
|
||||
Usage:
|
||||
from page_analyzer import PageAnalyzer, PageMetadata
|
||||
analyzer = PageAnalyzer()
|
||||
metadata = analyzer.analyze_url("https://example.com/page")
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class LinkData:
    """Represents a link found on a page."""

    url: str  # absolute URL (extractor resolves relatives against the page)
    anchor_text: str  # visible anchor text (extractor truncates to 100 chars)
    is_internal: bool  # True for same domain or sub/parent domain of the page
    is_nofollow: bool = False  # True when rel contains "nofollow"
    link_type: str = "body"  # body, nav, footer, etc.
|
||||
|
||||
|
||||
@dataclass
class HeadingData:
    """Represents a heading found on a page."""

    level: int  # 1-6 (h1..h6)
    text: str  # whitespace-stripped heading text
|
||||
|
||||
|
||||
@dataclass
class SchemaData:
    """Represents schema.org structured data."""

    schema_type: str  # e.g. "Article" (@type for JSON-LD, itemtype tail for microdata)
    properties: dict  # raw JSON-LD payload; empty for microdata detections
    format: str = "json-ld"  # json-ld, microdata, rdfa
|
||||
|
||||
|
||||
@dataclass
class OpenGraphData:
    """Represents Open Graph metadata.

    All fields default to None and are filled from <meta property="og:*">
    and <meta name="twitter:*"> tags when present.
    """

    og_title: str | None = None
    og_description: str | None = None
    og_image: str | None = None
    og_url: str | None = None
    og_type: str | None = None
    og_site_name: str | None = None
    og_locale: str | None = None
    twitter_card: str | None = None
    twitter_title: str | None = None
    twitter_description: str | None = None
    twitter_image: str | None = None
|
||||
|
||||
|
||||
@dataclass
class PageMetadata:
    """Complete SEO metadata for a page."""

    # Basic info
    url: str  # the URL as requested (before any redirects)
    status_code: int = 0  # 0 until a response is received
    content_type: str = ""  # Content-Type response header
    response_time_ms: float = 0  # total fetch time, milliseconds
    analyzed_at: datetime = field(default_factory=datetime.now)

    # Meta tags
    title: str | None = None  # <title> text, stripped
    title_length: int = 0  # character count of title
    meta_description: str | None = None
    meta_description_length: int = 0
    canonical_url: str | None = None  # absolute, resolved by the analyzer
    robots_meta: str | None = None  # robots content, may append googlebot info

    # Language
    html_lang: str | None = None  # <html lang="..."> value
    hreflang_tags: list[dict] = field(default_factory=list)  # [{"lang": "en", "url": "..."}]

    # Headings
    headings: list[HeadingData] = field(default_factory=list)  # all h1-h6, in document order per level
    h1_count: int = 0
    h1_text: str | None = None  # text of the first H1, if any

    # Open Graph & Social
    open_graph: OpenGraphData = field(default_factory=OpenGraphData)

    # Schema/Structured Data
    schema_data: list[SchemaData] = field(default_factory=list)
    schema_types_found: list[str] = field(default_factory=list)  # de-duplicated type names

    # Links
    internal_links: list[LinkData] = field(default_factory=list)
    external_links: list[LinkData] = field(default_factory=list)
    internal_link_count: int = 0
    external_link_count: int = 0

    # Images
    images_total: int = 0
    images_without_alt: int = 0
    images_with_alt: int = 0

    # Content metrics
    word_count: int = 0  # whitespace-separated tokens of visible text

    # Issues found
    issues: list[str] = field(default_factory=list)  # blocking SEO problems
    warnings: list[str] = field(default_factory=list)  # non-blocking recommendations

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization.

        Note: serializes counts and summaries only — the full headings,
        link lists, schema payloads, and twitter_* fields are omitted.
        """
        return {
            "url": self.url,
            "status_code": self.status_code,
            "content_type": self.content_type,
            "response_time_ms": self.response_time_ms,
            "analyzed_at": self.analyzed_at.isoformat(),
            "title": self.title,
            "title_length": self.title_length,
            "meta_description": self.meta_description,
            "meta_description_length": self.meta_description_length,
            "canonical_url": self.canonical_url,
            "robots_meta": self.robots_meta,
            "html_lang": self.html_lang,
            "hreflang_tags": self.hreflang_tags,
            "h1_count": self.h1_count,
            "h1_text": self.h1_text,
            "headings_count": len(self.headings),
            "schema_types_found": self.schema_types_found,
            "internal_link_count": self.internal_link_count,
            "external_link_count": self.external_link_count,
            "images_total": self.images_total,
            "images_without_alt": self.images_without_alt,
            "word_count": self.word_count,
            "issues": self.issues,
            "warnings": self.warnings,
            "open_graph": {
                "og_title": self.open_graph.og_title,
                "og_description": self.open_graph.og_description,
                "og_image": self.open_graph.og_image,
                "og_url": self.open_graph.og_url,
                "og_type": self.open_graph.og_type,
            },
        }

    def get_summary(self) -> str:
        """Get a brief human-readable summary of the page analysis."""
        lines = [
            f"URL: {self.url}",
            f"Status: {self.status_code}",
            # Titles longer than 50 chars are truncated with an ellipsis.
            f"Title: {self.title[:50] + '...' if self.title and len(self.title) > 50 else self.title}",
            f"Description: {'✓' if self.meta_description else '✗ Missing'}",
            f"Canonical: {'✓' if self.canonical_url else '✗ Missing'}",
            f"H1: {self.h1_count} found",
            f"Schema: {', '.join(self.schema_types_found) if self.schema_types_found else 'None'}",
            f"Links: {self.internal_link_count} internal, {self.external_link_count} external",
            f"Images: {self.images_total} total, {self.images_without_alt} without alt",
        ]
        if self.issues:
            lines.append(f"Issues: {len(self.issues)}")
        return "\n".join(lines)
|
||||
|
||||
|
||||
class PageAnalyzer:
    """Analyze web pages for SEO metadata.

    Fetches a URL with a shared requests.Session and extracts meta tags,
    headings, Open Graph/Twitter data, schema.org structured data, links,
    image alt coverage, and content metrics into a PageMetadata.
    """

    DEFAULT_USER_AGENT = "Mozilla/5.0 (compatible; OurDigitalSEOBot/1.0; +https://ourdigital.org)"

    def __init__(
        self,
        user_agent: str | None = None,
        timeout: int = 30,
    ):
        """
        Initialize page analyzer.

        Args:
            user_agent: Custom user agent string
            timeout: Request timeout in seconds
        """
        self.user_agent = user_agent or self.DEFAULT_USER_AGENT
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": self.user_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9,ko;q=0.8",
        })

    def analyze_url(self, url: str) -> PageMetadata:
        """
        Analyze a URL and extract SEO metadata.

        Args:
            url: URL to analyze

        Returns:
            PageMetadata object with all extracted data. Network/parse
            failures are recorded in metadata.issues, never raised.
        """
        metadata = PageMetadata(url=url)

        try:
            # Fetch page (redirects followed; final response is analyzed)
            start_time = datetime.now()
            response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
            metadata.response_time_ms = (datetime.now() - start_time).total_seconds() * 1000
            metadata.status_code = response.status_code
            metadata.content_type = response.headers.get("Content-Type", "")

            if response.status_code != 200:
                metadata.issues.append(f"HTTP {response.status_code} status")
                if response.status_code >= 400:
                    # Error pages have no useful SEO content to extract.
                    return metadata

            # Parse HTML
            soup = BeautifulSoup(response.text, "html.parser")
            # Fix: resolve relative URLs against the *final* URL after
            # redirects (previously the requested URL was used, which
            # mis-resolves links on redirected pages).
            base_url = response.url

            # Extract all metadata
            self._extract_basic_meta(soup, metadata)
            self._extract_canonical(soup, metadata, base_url)
            self._extract_robots_meta(soup, metadata)
            self._extract_hreflang(soup, metadata)
            self._extract_headings(soup, metadata)
            self._extract_open_graph(soup, metadata)
            self._extract_schema(soup, metadata)
            self._extract_links(soup, metadata, base_url)
            self._extract_images(soup, metadata)
            # Must run last: it decomposes script/style nodes in the soup.
            self._extract_content_metrics(soup, metadata)

            # Run SEO checks
            self._run_seo_checks(metadata)

        except requests.RequestException as e:
            metadata.issues.append(f"Request failed: {str(e)}")
            logger.error(f"Failed to analyze {url}: {e}")
        except Exception as e:
            # Boundary catch-all: record and keep the partial result.
            metadata.issues.append(f"Analysis error: {str(e)}")
            logger.error(f"Error analyzing {url}: {e}")

        return metadata

    def _extract_basic_meta(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract title, meta description, and html lang attribute."""
        # Title — use get_text() instead of .string: .string is None when
        # <title> contains nested nodes, which silently dropped the title.
        title_tag = soup.find("title")
        if title_tag:
            title_text = title_tag.get_text()
            if title_text.strip():
                metadata.title = title_text.strip()
                metadata.title_length = len(metadata.title)

        # Meta description (name matched case-insensitively)
        desc_tag = soup.find("meta", attrs={"name": re.compile(r"^description$", re.I)})
        if desc_tag and desc_tag.get("content"):
            metadata.meta_description = desc_tag["content"].strip()
            metadata.meta_description_length = len(metadata.meta_description)

        # HTML lang
        html_tag = soup.find("html")
        if html_tag and html_tag.get("lang"):
            metadata.html_lang = html_tag["lang"]

    def _extract_canonical(self, soup: BeautifulSoup, metadata: PageMetadata, base_url: str) -> None:
        """Extract the canonical URL, resolved to an absolute URL."""
        canonical = soup.find("link", rel="canonical")
        if canonical and canonical.get("href"):
            metadata.canonical_url = urljoin(base_url, canonical["href"])

    def _extract_robots_meta(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract robots meta tag (and any googlebot-specific variant)."""
        robots = soup.find("meta", attrs={"name": re.compile(r"^robots$", re.I)})
        if robots and robots.get("content"):
            metadata.robots_meta = robots["content"]

        # Also check for googlebot-specific directives and append them.
        googlebot = soup.find("meta", attrs={"name": re.compile(r"^googlebot$", re.I)})
        if googlebot and googlebot.get("content"):
            if metadata.robots_meta:
                metadata.robots_meta += f" | googlebot: {googlebot['content']}"
            else:
                metadata.robots_meta = f"googlebot: {googlebot['content']}"

    def _extract_hreflang(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract hreflang alternate-language link tags."""
        hreflang_tags = soup.find_all("link", rel="alternate", hreflang=True)
        for tag in hreflang_tags:
            if tag.get("href") and tag.get("hreflang"):
                metadata.hreflang_tags.append({
                    "lang": tag["hreflang"],
                    "url": tag["href"]
                })

    def _extract_headings(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract all non-empty h1-h6 headings and H1 statistics."""
        for level in range(1, 7):
            for heading in soup.find_all(f"h{level}"):
                text = heading.get_text(strip=True)
                if text:
                    metadata.headings.append(HeadingData(level=level, text=text))

        # Count H1s specifically (first H1's text is kept for reporting)
        h1_tags = soup.find_all("h1")
        metadata.h1_count = len(h1_tags)
        if h1_tags:
            metadata.h1_text = h1_tags[0].get_text(strip=True)

    def _extract_open_graph(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract Open Graph (property=) and Twitter Card (name=) data."""
        og = metadata.open_graph

        # Open Graph tags use the `property` attribute
        og_mappings = {
            "og:title": "og_title",
            "og:description": "og_description",
            "og:image": "og_image",
            "og:url": "og_url",
            "og:type": "og_type",
            "og:site_name": "og_site_name",
            "og:locale": "og_locale",
        }

        for og_prop, attr_name in og_mappings.items():
            tag = soup.find("meta", property=og_prop)
            if tag and tag.get("content"):
                setattr(og, attr_name, tag["content"])

        # Twitter Card tags use the `name` attribute
        twitter_mappings = {
            "twitter:card": "twitter_card",
            "twitter:title": "twitter_title",
            "twitter:description": "twitter_description",
            "twitter:image": "twitter_image",
        }

        for tw_name, attr_name in twitter_mappings.items():
            tag = soup.find("meta", attrs={"name": tw_name})
            if tag and tag.get("content"):
                setattr(og, attr_name, tag["content"])

    def _extract_schema(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract schema.org structured data (JSON-LD plus basic microdata)."""
        # JSON-LD: each <script type="application/ld+json"> may hold one
        # object or a list; malformed JSON is skipped silently.
        for script in soup.find_all("script", type="application/ld+json"):
            try:
                data = json.loads(script.string)
                if isinstance(data, list):
                    for item in data:
                        self._process_schema_item(item, metadata, "json-ld")
                else:
                    self._process_schema_item(data, metadata, "json-ld")
            except (json.JSONDecodeError, TypeError):
                continue

        # Microdata (basic detection: itemscope/itemtype only, no properties)
        for item in soup.find_all(itemscope=True):
            itemtype = item.get("itemtype", "")
            if itemtype:
                schema_type = itemtype.split("/")[-1]
                if schema_type not in metadata.schema_types_found:
                    metadata.schema_types_found.append(schema_type)
                    metadata.schema_data.append(SchemaData(
                        schema_type=schema_type,
                        properties={},
                        format="microdata"
                    ))

    def _process_schema_item(self, data: dict, metadata: PageMetadata, format_type: str) -> None:
        """Record a single schema.org item, recursing into @graph containers."""
        if not isinstance(data, dict):
            return

        schema_type = data.get("@type", "Unknown")
        if isinstance(schema_type, list):
            # Multi-typed nodes: keep the first type only.
            schema_type = schema_type[0] if schema_type else "Unknown"

        if schema_type not in metadata.schema_types_found:
            metadata.schema_types_found.append(schema_type)

        metadata.schema_data.append(SchemaData(
            schema_type=schema_type,
            properties=data,
            format=format_type
        ))

        # Process nested @graph items
        if "@graph" in data:
            for item in data["@graph"]:
                self._process_schema_item(item, metadata, format_type)

    def _extract_links(self, soup: BeautifulSoup, metadata: PageMetadata, base_url: str) -> None:
        """Classify anchors into internal/external links with nofollow flags."""
        parsed_base = urlparse(base_url)
        base_domain = parsed_base.netloc.lower()

        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]

            # Skip fragment-only and non-HTTP pseudo-links
            if href.startswith(("#", "javascript:", "mailto:", "tel:")):
                continue

            # Resolve relative URLs
            full_url = urljoin(base_url, href)
            parsed_url = urlparse(full_url)

            # Get anchor text
            anchor_text = a_tag.get_text(strip=True)[:100]  # Limit length

            # rel may be parsed as a string or a token list by bs4
            rel = a_tag.get("rel", [])
            if isinstance(rel, str):
                rel = rel.split()
            is_nofollow = "nofollow" in rel

            # Internal = same domain, or sub/parent domain of the page
            link_domain = parsed_url.netloc.lower()
            is_internal = (
                link_domain == base_domain or
                link_domain.endswith(f".{base_domain}") or
                base_domain.endswith(f".{link_domain}")
            )

            link_data = LinkData(
                url=full_url,
                anchor_text=anchor_text,
                is_internal=is_internal,
                is_nofollow=is_nofollow,
            )

            if is_internal:
                metadata.internal_links.append(link_data)
            else:
                metadata.external_links.append(link_data)

        metadata.internal_link_count = len(metadata.internal_links)
        metadata.external_link_count = len(metadata.external_links)

    def _extract_images(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Count images and their alt-text coverage (empty alt counts as missing)."""
        images = soup.find_all("img")
        metadata.images_total = len(images)

        for img in images:
            alt = img.get("alt", "").strip()
            if alt:
                metadata.images_with_alt += 1
            else:
                metadata.images_without_alt += 1

    def _extract_content_metrics(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Compute the visible word count.

        WARNING: mutates the soup (removes script/style/noscript nodes),
        so this must run after all other extractors.
        """
        for element in soup(["script", "style", "noscript"]):
            element.decompose()

        # Whitespace-separated token count of the remaining visible text
        text = soup.get_text(separator=" ", strip=True)
        words = text.split()
        metadata.word_count = len(words)

    def _run_seo_checks(self, metadata: PageMetadata) -> None:
        """Populate issues (blocking) and warnings (advisory) from the data."""
        # Title checks
        if not metadata.title:
            metadata.issues.append("Missing title tag")
        elif metadata.title_length < 30:
            metadata.warnings.append(f"Title too short ({metadata.title_length} chars, recommend 50-60)")
        elif metadata.title_length > 60:
            metadata.warnings.append(f"Title too long ({metadata.title_length} chars, recommend 50-60)")

        # Meta description checks
        if not metadata.meta_description:
            metadata.issues.append("Missing meta description")
        elif metadata.meta_description_length < 120:
            metadata.warnings.append(f"Meta description too short ({metadata.meta_description_length} chars)")
        elif metadata.meta_description_length > 160:
            metadata.warnings.append(f"Meta description too long ({metadata.meta_description_length} chars)")

        # Canonical check
        if not metadata.canonical_url:
            metadata.warnings.append("Missing canonical tag")
        elif metadata.canonical_url != metadata.url:
            metadata.warnings.append(f"Canonical points to different URL: {metadata.canonical_url}")

        # H1 checks
        if metadata.h1_count == 0:
            metadata.issues.append("Missing H1 tag")
        elif metadata.h1_count > 1:
            metadata.warnings.append(f"Multiple H1 tags ({metadata.h1_count})")

        # Image alt check
        if metadata.images_without_alt > 0:
            metadata.warnings.append(f"{metadata.images_without_alt} images missing alt text")

        # Schema check
        if not metadata.schema_types_found:
            metadata.warnings.append("No structured data found")

        # Open Graph check
        if not metadata.open_graph.og_title:
            metadata.warnings.append("Missing Open Graph tags")

        # Robots meta check
        if metadata.robots_meta:
            robots_lower = metadata.robots_meta.lower()
            if "noindex" in robots_lower:
                metadata.issues.append("Page is set to noindex")
            if "nofollow" in robots_lower:
                metadata.warnings.append("Page is set to nofollow")
|
||||
|
||||
|
||||
def main():
    """CLI entry point for testing.

    Analyzes the given URL and prints either a JSON dump or a
    human-readable report to stdout.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Page SEO Analyzer")
    cli.add_argument("url", help="URL to analyze")
    cli.add_argument("--json", "-j", action="store_true", help="Output as JSON")
    options = cli.parse_args()

    report = PageAnalyzer().analyze_url(options.url)

    if options.json:
        print(json.dumps(report.to_dict(), indent=2, ensure_ascii=False))
        return

    divider = "=" * 60
    print(divider)
    print("PAGE ANALYSIS REPORT")
    print(divider)
    print(report.get_summary())
    print()

    if report.issues:
        print("ISSUES:")
        for issue in report.issues:
            print(f" ✗ {issue}")

    if report.warnings:
        print("\nWARNINGS:")
        for warning in report.warnings:
            print(f" ⚠ {warning}")

    if report.hreflang_tags:
        print(f"\nHREFLANG TAGS ({len(report.hreflang_tags)}):")
        # Show at most the first five alternates.
        for tag in report.hreflang_tags[:5]:
            print(f" {tag['lang']}: {tag['url']}")

    if report.schema_types_found:
        print("\nSCHEMA TYPES:")
        for schema_type in report.schema_types_found:
            print(f" - {schema_type}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,17 @@
|
||||
# 10-seo-technical-audit dependencies
|
||||
# Install: pip install -r requirements.txt
|
||||
|
||||
# Web Scraping & Parsing
|
||||
lxml>=5.1.0
|
||||
beautifulsoup4>=4.12.0
|
||||
requests>=2.31.0
|
||||
aiohttp>=3.9.0
|
||||
|
||||
# Async & Retry
|
||||
tenacity>=8.2.0
|
||||
tqdm>=4.66.0
|
||||
|
||||
# Environment & CLI
|
||||
python-dotenv>=1.0.0
|
||||
rich>=13.7.0
|
||||
typer>=0.9.0
|
||||
@@ -0,0 +1,540 @@
|
||||
"""
|
||||
Robots.txt Checker - Analyze robots.txt configuration
|
||||
=====================================================
|
||||
Purpose: Parse and analyze robots.txt for SEO compliance
|
||||
Python: 3.10+
|
||||
Usage:
|
||||
python robots_checker.py --url https://example.com/robots.txt
|
||||
python robots_checker.py --url https://example.com --test-url /admin/
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from urllib.robotparser import RobotFileParser
|
||||
|
||||
import requests
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class RobotsIssue:
    """Represents a robots.txt issue."""

    severity: str  # "error", "warning", "info"
    message: str  # human-readable description of the problem
    line_number: int | None = None  # 1-based line in robots.txt, when known
    directive: str | None = None  # directive involved (e.g. "disallow"), when known
    suggestion: str | None = None  # recommended fix, when one exists
|
||||
|
||||
|
||||
@dataclass
class UserAgentRules:
    """Rules for a specific user-agent."""

    user_agent: str  # value of the User-agent directive (e.g. "*", "Googlebot")
    disallow: list[str] = field(default_factory=list)  # Disallow path patterns
    allow: list[str] = field(default_factory=list)  # Allow path patterns
    crawl_delay: float | None = None  # Crawl-delay in seconds, if specified
|
||||
|
||||
|
||||
@dataclass
class RobotsResult:
    """Complete robots.txt analysis result."""

    url: str  # the robots.txt URL (or the site URL it was derived from)
    accessible: bool = True  # False when robots.txt could not be fetched
    content: str = ""  # raw robots.txt body (not serialized by to_dict)
    rules: list[UserAgentRules] = field(default_factory=list)
    sitemaps: list[str] = field(default_factory=list)  # Sitemap: directive URLs
    issues: list[RobotsIssue] = field(default_factory=list)
    stats: dict = field(default_factory=dict)  # summary counters
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON output.

        Note: the raw `content` field is deliberately omitted.
        """
        return {
            "url": self.url,
            "accessible": self.accessible,
            "sitemaps": self.sitemaps,
            "rules": [
                {
                    "user_agent": r.user_agent,
                    "disallow": r.disallow,
                    "allow": r.allow,
                    "crawl_delay": r.crawl_delay,
                }
                for r in self.rules
            ],
            "issues": [
                {
                    "severity": i.severity,
                    "message": i.message,
                    "line_number": i.line_number,
                    "directive": i.directive,
                    "suggestion": i.suggestion,
                }
                for i in self.issues
            ],
            "stats": self.stats,
            "timestamp": self.timestamp,
        }
|
||||
|
||||
|
||||
class RobotsChecker:
    """Analyze robots.txt configuration.

    Fetches a site's robots.txt, parses it into per-user-agent rule
    groups, and audits it for syntax problems, risky blocking rules,
    and missing sitemap declarations.
    """

    # Common user agents (informational lookup table for reports/consumers).
    USER_AGENTS = {
        "*": "All bots",
        "Googlebot": "Google crawler",
        "Googlebot-Image": "Google Image crawler",
        "Googlebot-News": "Google News crawler",
        "Googlebot-Video": "Google Video crawler",
        "Bingbot": "Bing crawler",
        "Slurp": "Yahoo crawler",
        "DuckDuckBot": "DuckDuckGo crawler",
        "Baiduspider": "Baidu crawler",
        "Yandex": "Yandex crawler",
        "facebot": "Facebook crawler",
        "Twitterbot": "Twitter crawler",
        "LinkedInBot": "LinkedIn crawler",
    }

    # Paths that should generally not be blocked
    IMPORTANT_PATHS = [
        "/",
        "/*.css",
        "/*.js",
        "/*.jpg",
        "/*.jpeg",
        "/*.png",
        "/*.gif",
        "/*.svg",
        "/*.webp",
    ]

    # Paths commonly blocked
    COMMON_BLOCKED = [
        "/admin",
        "/wp-admin",
        "/login",
        "/private",
        "/api",
        "/cgi-bin",
        "/tmp",
        "/search",
    ]

    # Directives recognized by major crawlers; anything else is flagged.
    # Hoisted to a class constant so the set is not rebuilt per parsed line
    # (fix: previously recreated inside _analyze_syntax's loop).
    VALID_DIRECTIVES = frozenset({
        "user-agent", "disallow", "allow",
        "crawl-delay", "sitemap", "host",
    })

    def __init__(self):
        # One shared session gives connection pooling and a consistent UA.
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (compatible; SEOAuditBot/1.0)"
        })

    def fetch_robots(self, url: str) -> str | None:
        """Fetch robots.txt content.

        Args:
            url: The robots.txt URL itself, or any URL on the site
                (the path is rewritten to /robots.txt when needed).

        Returns:
            The robots.txt body, or None when the server answers 404
            (i.e. the site has no robots.txt).

        Raises:
            RuntimeError: on network failure or any non-200/404 status.
        """
        # Normalize: accept a bare domain/page URL and point at /robots.txt.
        parsed = urlparse(url)
        if not parsed.path.endswith("robots.txt"):
            robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        else:
            robots_url = url

        try:
            response = self.session.get(robots_url, timeout=10)
            if response.status_code == 200:
                return response.text
            elif response.status_code == 404:
                return None
            else:
                raise RuntimeError(f"HTTP {response.status_code}")
        except requests.RequestException as e:
            # Chain the original exception for easier debugging
            # (fix: previously raised without "from e").
            raise RuntimeError(f"Failed to fetch robots.txt: {e}") from e

    def parse_robots(self, content: str) -> tuple[list[UserAgentRules], list[str]]:
        """Parse robots.txt content.

        Returns:
            (rules, sitemaps): per-user-agent rule groups in file order,
            and all declared Sitemap URLs.
        """
        rules: list[UserAgentRules] = []
        sitemaps: list[str] = []
        current_rules = None  # rule group being filled for the latest User-agent

        for line in content.split("\n"):
            line = line.strip()

            # Skip empty lines and comments
            if not line or line.startswith("#"):
                continue

            # Lines without a colon carry no directive; they are reported
            # separately by _analyze_syntax, so just skip them here.
            if ":" not in line:
                continue

            directive, value = line.split(":", 1)
            directive = directive.strip().lower()
            value = value.strip()

            if directive == "user-agent":
                # A new User-agent starts a new rule group; flush the old one.
                if current_rules:
                    rules.append(current_rules)
                current_rules = UserAgentRules(user_agent=value)

            elif directive == "disallow" and current_rules:
                if value:  # Empty disallow means allow all
                    current_rules.disallow.append(value)

            elif directive == "allow" and current_rules:
                if value:
                    current_rules.allow.append(value)

            elif directive == "crawl-delay" and current_rules:
                try:
                    current_rules.crawl_delay = float(value)
                except ValueError:
                    pass  # non-numeric crawl-delay: ignore silently

            elif directive == "sitemap":
                if value:
                    sitemaps.append(value)

        # Don't forget last user-agent
        if current_rules:
            rules.append(current_rules)

        return rules, sitemaps

    def analyze(self, url: str) -> RobotsResult:
        """Analyze robots.txt.

        Fetches, parses, and audits the file. Fetch failures are recorded
        as issues on the returned RobotsResult rather than raised.
        """
        result = RobotsResult(url=url)

        # Fetch robots.txt
        try:
            content = self.fetch_robots(url)
            if content is None:
                result.accessible = False
                result.issues.append(RobotsIssue(
                    severity="info",
                    message="No robots.txt found (returns 404)",
                    suggestion="Consider creating a robots.txt file",
                ))
                return result
        except RuntimeError as e:
            result.accessible = False
            result.issues.append(RobotsIssue(
                severity="error",
                message=str(e),
            ))
            return result

        result.content = content
        result.rules, result.sitemaps = self.parse_robots(content)

        # Run the individual audit passes; each appends to result.issues.
        self._analyze_syntax(result)
        self._analyze_rules(result)
        self._analyze_sitemaps(result)

        # Calculate stats
        result.stats = {
            "user_agents_count": len(result.rules),
            "user_agents": [r.user_agent for r in result.rules],
            "total_disallow_rules": sum(len(r.disallow) for r in result.rules),
            "total_allow_rules": sum(len(r.allow) for r in result.rules),
            "sitemaps_count": len(result.sitemaps),
            "has_crawl_delay": any(r.crawl_delay for r in result.rules),
            "content_length": len(content),
        }

        return result

    def _analyze_syntax(self, result: RobotsResult) -> None:
        """Check for syntax issues (missing colons, unknown directives)."""
        for line_num, line in enumerate(result.content.split("\n"), 1):
            line = line.strip()

            # Skip empty lines and comments
            if not line or line.startswith("#"):
                continue

            # A directive line must contain a colon.
            if ":" not in line:
                result.issues.append(RobotsIssue(
                    severity="warning",
                    message=f"Invalid line (missing colon): {line[:50]}",
                    line_number=line_num,
                ))
                continue

            directive = line.split(":", 1)[0].strip().lower()

            if directive not in self.VALID_DIRECTIVES:
                result.issues.append(RobotsIssue(
                    severity="info",
                    message=f"Unknown directive: {directive}",
                    line_number=line_num,
                    directive=directive,
                ))

    def _analyze_rules(self, result: RobotsResult) -> None:
        """Analyze blocking rules for risky or noteworthy patterns."""
        # Check if there are any rules
        if not result.rules:
            result.issues.append(RobotsIssue(
                severity="info",
                message="No user-agent rules defined",
                suggestion="Add User-agent: * rules to control crawling",
            ))
            return

        # Check for wildcard rule
        has_wildcard = any(r.user_agent == "*" for r in result.rules)
        if not has_wildcard:
            result.issues.append(RobotsIssue(
                severity="info",
                message="No wildcard (*) user-agent defined",
                suggestion="Consider adding User-agent: * as fallback",
            ))

        # Check for blocking important resources
        for rules in result.rules:
            for disallow in rules.disallow:
                # Check if blocking root
                if disallow == "/":
                    result.issues.append(RobotsIssue(
                        severity="error",
                        message=f"Blocking entire site for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                        suggestion="This will prevent indexing. Is this intentional?",
                    ))

                # Check if blocking CSS/JS
                if any(ext in disallow.lower() for ext in [".css", ".js"]):
                    result.issues.append(RobotsIssue(
                        severity="warning",
                        message=f"Blocking CSS/JS files for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                        suggestion="May affect rendering and SEO",
                    ))

                # Check for blocking images
                if any(ext in disallow.lower() for ext in [".jpg", ".png", ".gif", ".webp"]):
                    result.issues.append(RobotsIssue(
                        severity="info",
                        message=f"Blocking image files for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                    ))

            # Check crawl delay
            if rules.crawl_delay:
                if rules.crawl_delay > 10:
                    result.issues.append(RobotsIssue(
                        severity="warning",
                        message=f"High crawl-delay ({rules.crawl_delay}s) for {rules.user_agent}",
                        directive=f"Crawl-delay: {rules.crawl_delay}",
                        suggestion="May significantly slow indexing",
                    ))
                elif rules.crawl_delay > 0:
                    result.issues.append(RobotsIssue(
                        severity="info",
                        message=f"Crawl-delay set to {rules.crawl_delay}s for {rules.user_agent}",
                    ))

    def _analyze_sitemaps(self, result: RobotsResult) -> None:
        """Analyze sitemap declarations (presence and absolute URLs)."""
        if not result.sitemaps:
            result.issues.append(RobotsIssue(
                severity="warning",
                message="No sitemap declared in robots.txt",
                suggestion="Add Sitemap: directive to help crawlers find your sitemap",
            ))
        else:
            for sitemap in result.sitemaps:
                # The protocol requires fully-qualified sitemap URLs.
                if not sitemap.startswith("http"):
                    result.issues.append(RobotsIssue(
                        severity="warning",
                        message=f"Sitemap URL should be absolute: {sitemap}",
                        directive=f"Sitemap: {sitemap}",
                    ))

    def test_url(self, robots_url: str, test_path: str,
                 user_agent: str = "Googlebot") -> dict:
        """Test if a specific URL is allowed.

        Uses the stdlib RobotFileParser, which fetches robots.txt itself.
        Returns a dict with "allowed" True/False, or "allowed": None plus
        an "error" key when robots.txt could not be read.
        """
        # Use Python's built-in parser
        rp = RobotFileParser()

        # Ensure robots.txt URL
        parsed = urlparse(robots_url)
        if not parsed.path.endswith("robots.txt"):
            robots_txt_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        else:
            robots_txt_url = robots_url

        rp.set_url(robots_txt_url)
        try:
            rp.read()
        except Exception as e:
            return {
                "path": test_path,
                "user_agent": user_agent,
                "allowed": None,
                "error": str(e),
            }

        # Build full URL for testing
        base_url = f"{parsed.scheme}://{parsed.netloc}"
        full_url = urljoin(base_url, test_path)

        allowed = rp.can_fetch(user_agent, full_url)

        return {
            "path": test_path,
            "user_agent": user_agent,
            "allowed": allowed,
            "full_url": full_url,
        }

    def generate_report(self, result: RobotsResult) -> str:
        """Generate human-readable analysis report."""
        lines = [
            "=" * 60,
            "Robots.txt Analysis Report",
            "=" * 60,
            f"URL: {result.url}",
            f"Accessible: {'Yes' if result.accessible else 'No'}",
            f"Timestamp: {result.timestamp}",
            "",
        ]

        # Stats only make sense when the file was actually fetched.
        if result.accessible:
            lines.append("Statistics:")
            for key, value in result.stats.items():
                if key == "user_agents":
                    lines.append(f"  {key}: {', '.join(value) if value else 'None'}")
                else:
                    lines.append(f"  {key}: {value}")
            lines.append("")

        if result.sitemaps:
            lines.append(f"Sitemaps ({len(result.sitemaps)}):")
            for sitemap in result.sitemaps:
                lines.append(f"  - {sitemap}")
            lines.append("")

        if result.rules:
            lines.append("Rules Summary:")
            for rules in result.rules:
                lines.append(f"\n  User-agent: {rules.user_agent}")
                if rules.disallow:
                    lines.append(f"    Disallow: {len(rules.disallow)} rules")
                    # Show at most 5 disallow patterns to keep reports short.
                    for d in rules.disallow[:5]:
                        lines.append(f"      - {d}")
                    if len(rules.disallow) > 5:
                        lines.append(f"      ... and {len(rules.disallow) - 5} more")
                if rules.allow:
                    lines.append(f"    Allow: {len(rules.allow)} rules")
                    for a in rules.allow[:3]:
                        lines.append(f"      - {a}")
                if rules.crawl_delay:
                    lines.append(f"    Crawl-delay: {rules.crawl_delay}s")
            lines.append("")

        if result.issues:
            lines.append("Issues Found:")
            # Group findings by severity for readability.
            errors = [i for i in result.issues if i.severity == "error"]
            warnings = [i for i in result.issues if i.severity == "warning"]
            infos = [i for i in result.issues if i.severity == "info"]

            if errors:
                lines.append(f"\n  ERRORS ({len(errors)}):")
                for issue in errors:
                    lines.append(f"    - {issue.message}")
                    if issue.directive:
                        lines.append(f"      Directive: {issue.directive}")
                    if issue.suggestion:
                        lines.append(f"      Suggestion: {issue.suggestion}")

            if warnings:
                lines.append(f"\n  WARNINGS ({len(warnings)}):")
                for issue in warnings:
                    lines.append(f"    - {issue.message}")
                    if issue.suggestion:
                        lines.append(f"      Suggestion: {issue.suggestion}")

            if infos:
                lines.append(f"\n  INFO ({len(infos)}):")
                for issue in infos:
                    lines.append(f"    - {issue.message}")

            lines.append("")

        lines.append("=" * 60)

        return "\n".join(lines)
|
||||
|
||||
|
||||
def main():
    """Main entry point for CLI usage.

    Two modes:
      * --test-url PATH: check whether PATH is allowed for --user-agent.
      * default: full robots.txt analysis, printed as a text report, or
        as JSON with --json / written to a file with --output.
    """
    parser = argparse.ArgumentParser(
        description="Analyze robots.txt configuration",
    )
    parser.add_argument("--url", "-u", required=True,
                        help="URL to robots.txt or domain")
    parser.add_argument("--test-url", "-t",
                        help="Test if specific URL path is allowed")
    parser.add_argument("--user-agent", "-a", default="Googlebot",
                        help="User agent for testing (default: Googlebot)")
    parser.add_argument("--output", "-o", help="Output file for JSON report")
    parser.add_argument("--json", action="store_true", help="Output as JSON")

    args = parser.parse_args()

    checker = RobotsChecker()

    if args.test_url:
        # Test specific URL
        test_result = checker.test_url(args.url, args.test_url, args.user_agent)
        if args.json:
            print(json.dumps(test_result, indent=2))
        elif test_result["allowed"] is None:
            # Fix: "allowed" is None when robots.txt could not be read;
            # previously this fell through to "BLOCKED", which was wrong.
            print(f"URL: {test_result['path']}")
            print(f"User-Agent: {test_result['user_agent']}")
            print(f"Status: ERROR ({test_result.get('error', 'unknown')})")
        else:
            status = "ALLOWED" if test_result["allowed"] else "BLOCKED"
            print(f"URL: {test_result['path']}")
            print(f"User-Agent: {test_result['user_agent']}")
            print(f"Status: {status}")
    else:
        # Full analysis
        result = checker.analyze(args.url)

        if args.json or args.output:
            output = json.dumps(result.to_dict(), ensure_ascii=False, indent=2)
            if args.output:
                with open(args.output, "w", encoding="utf-8") as f:
                    f.write(output)
                logger.info(f"Report written to {args.output}")
            else:
                print(output)
        else:
            print(checker.generate_report(result))
|
||||
|
||||
|
||||
# Allow running this module directly as a CLI script.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,969 @@
|
||||
"""
|
||||
Sitemap Crawler - Sequential page analysis from sitemap
|
||||
=======================================================
|
||||
Purpose: Crawl sitemap URLs one by one, analyze each page, save to Notion
|
||||
Python: 3.10+
|
||||
Usage:
|
||||
from sitemap_crawler import SitemapCrawler
|
||||
crawler = SitemapCrawler()
|
||||
crawler.crawl_sitemap("https://example.com/sitemap.xml", delay=2.0)
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
import xml.etree.ElementTree as ET
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Callable, Generator
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
from notion_client import Client
|
||||
|
||||
from base_client import config
|
||||
from page_analyzer import PageAnalyzer, PageMetadata
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default database for page analysis data
|
||||
DEFAULT_PAGES_DATABASE_ID = "2c8581e5-8a1e-8035-880b-e38cefc2f3ef"
|
||||
|
||||
# Default limits to prevent excessive resource usage
|
||||
DEFAULT_MAX_PAGES = 500
|
||||
DEFAULT_DELAY_SECONDS = 2.0
|
||||
|
||||
# Progress tracking directory
|
||||
PROGRESS_DIR = Path.home() / ".claude" / "seo-audit-progress"
|
||||
PROGRESS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
@dataclass
|
||||
class CrawlProgress:
|
||||
"""Track crawl progress."""
|
||||
total_urls: int = 0
|
||||
processed_urls: int = 0
|
||||
successful_urls: int = 0
|
||||
failed_urls: int = 0
|
||||
skipped_urls: int = 0
|
||||
start_time: datetime = field(default_factory=datetime.now)
|
||||
current_url: str = ""
|
||||
audit_id: str = ""
|
||||
site: str = ""
|
||||
status: str = "running" # running, completed, failed
|
||||
error_message: str = ""
|
||||
summary_page_id: str = ""
|
||||
|
||||
def get_progress_percent(self) -> float:
|
||||
if self.total_urls == 0:
|
||||
return 0.0
|
||||
return (self.processed_urls / self.total_urls) * 100
|
||||
|
||||
def get_elapsed_time(self) -> str:
|
||||
elapsed = datetime.now() - self.start_time
|
||||
minutes = int(elapsed.total_seconds() // 60)
|
||||
seconds = int(elapsed.total_seconds() % 60)
|
||||
return f"{minutes}m {seconds}s"
|
||||
|
||||
def get_eta(self) -> str:
|
||||
if self.processed_urls == 0:
|
||||
return "calculating..."
|
||||
elapsed = (datetime.now() - self.start_time).total_seconds()
|
||||
avg_time_per_url = elapsed / self.processed_urls
|
||||
remaining_urls = self.total_urls - self.processed_urls
|
||||
eta_seconds = remaining_urls * avg_time_per_url
|
||||
minutes = int(eta_seconds // 60)
|
||||
seconds = int(eta_seconds % 60)
|
||||
return f"{minutes}m {seconds}s"
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Convert to dictionary for JSON serialization."""
|
||||
return {
|
||||
"audit_id": self.audit_id,
|
||||
"site": self.site,
|
||||
"status": self.status,
|
||||
"total_urls": self.total_urls,
|
||||
"processed_urls": self.processed_urls,
|
||||
"successful_urls": self.successful_urls,
|
||||
"failed_urls": self.failed_urls,
|
||||
"progress_percent": round(self.get_progress_percent(), 1),
|
||||
"elapsed_time": self.get_elapsed_time(),
|
||||
"eta": self.get_eta(),
|
||||
"current_url": self.current_url,
|
||||
"start_time": self.start_time.isoformat(),
|
||||
"error_message": self.error_message,
|
||||
"summary_page_id": self.summary_page_id,
|
||||
"updated_at": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
def save_to_file(self, filepath: Path | None = None) -> Path:
|
||||
"""Save progress to JSON file."""
|
||||
if filepath is None:
|
||||
filepath = PROGRESS_DIR / f"{self.audit_id}.json"
|
||||
with open(filepath, "w") as f:
|
||||
json.dump(self.to_dict(), f, indent=2)
|
||||
return filepath
|
||||
|
||||
@classmethod
|
||||
def load_from_file(cls, filepath: Path) -> "CrawlProgress":
|
||||
"""Load progress from JSON file."""
|
||||
with open(filepath, "r") as f:
|
||||
data = json.load(f)
|
||||
progress = cls()
|
||||
progress.audit_id = data.get("audit_id", "")
|
||||
progress.site = data.get("site", "")
|
||||
progress.status = data.get("status", "unknown")
|
||||
progress.total_urls = data.get("total_urls", 0)
|
||||
progress.processed_urls = data.get("processed_urls", 0)
|
||||
progress.successful_urls = data.get("successful_urls", 0)
|
||||
progress.failed_urls = data.get("failed_urls", 0)
|
||||
progress.current_url = data.get("current_url", "")
|
||||
progress.error_message = data.get("error_message", "")
|
||||
progress.summary_page_id = data.get("summary_page_id", "")
|
||||
if data.get("start_time"):
|
||||
progress.start_time = datetime.fromisoformat(data["start_time"])
|
||||
return progress
|
||||
|
||||
|
||||
def get_active_crawls() -> list[CrawlProgress]:
    """Return every crawl whose saved status is still "running"."""
    running = []
    for path in PROGRESS_DIR.glob("*.json"):
        try:
            snapshot = CrawlProgress.load_from_file(path)
        except Exception:
            # Skip unreadable or corrupt progress files.
            continue
        if snapshot.status == "running":
            running.append(snapshot)
    return running
|
||||
|
||||
|
||||
def get_all_crawls() -> list[CrawlProgress]:
    """Return all recorded crawl jobs, active and completed (newest file first)."""
    found = []
    for path in sorted(PROGRESS_DIR.glob("*.json"), reverse=True):
        try:
            found.append(CrawlProgress.load_from_file(path))
        except Exception:
            continue  # skip corrupt progress files
    return found
|
||||
|
||||
|
||||
def get_crawl_status(audit_id: str) -> CrawlProgress | None:
    """Look up one crawl's saved progress by audit ID; None when absent."""
    candidate = PROGRESS_DIR / f"{audit_id}.json"
    if not candidate.exists():
        return None
    return CrawlProgress.load_from_file(candidate)
|
||||
|
||||
|
||||
@dataclass
class CrawlResult:
    """Result of a complete sitemap crawl (immutable summary + collected pages)."""
    site: str
    sitemap_url: str
    audit_id: str
    total_pages: int
    successful_pages: int
    failed_pages: int
    start_time: datetime
    end_time: datetime
    pages_analyzed: list[PageMetadata] = field(default_factory=list)
    notion_page_ids: list[str] = field(default_factory=list)
    summary_page_id: str | None = None

    def get_duration(self) -> str:
        """Return the crawl's wall-clock duration formatted as "Xm Ys"."""
        total = (self.end_time - self.start_time).total_seconds()
        return f"{int(total // 60)}m {int(total % 60)}s"
|
||||
|
||||
|
||||
class SitemapCrawler:
|
||||
"""Crawl sitemap URLs and analyze each page."""
|
||||
|
||||
def __init__(
    self,
    notion_token: str | None = None,
    database_id: str | None = None,
):
    """
    Initialize sitemap crawler.

    Args:
        notion_token: Notion API token (falls back to config.notion_token)
        database_id: Notion database ID for storing results
    """
    self.notion_token = notion_token or config.notion_token
    self.database_id = database_id or DEFAULT_PAGES_DATABASE_ID
    self.analyzer = PageAnalyzer()

    # Without a token we can still crawl and analyze; results simply
    # are not persisted to Notion.
    self.notion = Client(auth=self.notion_token) if self.notion_token else None
    if self.notion is None:
        logger.warning("Notion token not configured, results will not be saved")
|
||||
|
||||
def fetch_sitemap_urls(self, sitemap_url: str, _seen: set | None = None) -> list[str]:
    """
    Fetch and parse URLs from a sitemap, recursing into sitemap indexes.

    Args:
        sitemap_url: URL of the sitemap
        _seen: internal set of already-visited sitemap URLs; guards
            against sitemap indexes that reference themselves or each
            other, which previously caused unbounded recursion (fix)

    Returns:
        De-duplicated list of page URLs, in document order

    Raises:
        Exception: re-raised (after logging) when the sitemap cannot be
            fetched or parsed
    """
    if _seen is None:
        _seen = set()
    if sitemap_url in _seen:
        logger.warning("Skipping already-visited sitemap: %s", sitemap_url)
        return []
    _seen.add(sitemap_url)

    try:
        response = requests.get(sitemap_url, timeout=30)
        response.raise_for_status()

        # Parse XML
        root = ET.fromstring(response.content)

        # Handle namespace
        namespaces = {
            "sm": "http://www.sitemaps.org/schemas/sitemap/0.9"
        }

        urls = []

        # Check if this is a sitemap index
        sitemap_tags = root.findall(".//sm:sitemap/sm:loc", namespaces)
        if sitemap_tags:
            # This is a sitemap index, recursively fetch child sitemaps
            logger.info(f"Found sitemap index with {len(sitemap_tags)} child sitemaps")
            for loc in sitemap_tags:
                if loc.text:
                    urls.extend(self.fetch_sitemap_urls(loc.text, _seen))
        else:
            # Regular sitemap, extract URLs
            url_tags = root.findall(".//sm:url/sm:loc", namespaces)
            if not url_tags:
                # Some sitemaps omit the namespace; retry without it
                url_tags = root.findall(".//url/loc")

            urls.extend(loc.text for loc in url_tags if loc.text)

        # Remove duplicates while preserving order (dict keeps insertion order)
        unique_urls = list(dict.fromkeys(urls))

        logger.info(f"Found {len(unique_urls)} unique URLs in sitemap")
        return unique_urls

    except Exception as e:
        logger.error(f"Failed to fetch sitemap: {e}")
        raise
|
||||
|
||||
def crawl_sitemap(
    self,
    sitemap_url: str,
    delay: float = DEFAULT_DELAY_SECONDS,
    max_pages: int = DEFAULT_MAX_PAGES,
    progress_callback: Callable[[CrawlProgress], None] | None = None,
    save_to_notion: bool = True,
    url_filter: Callable[[str], bool] | None = None,
) -> CrawlResult:
    """
    Crawl all URLs in a sitemap sequentially.

    Progress is written to a JSON file after every URL (via
    CrawlProgress.save_to_file) so other processes can poll the crawl,
    and optionally reported through progress_callback.

    Args:
        sitemap_url: URL of the sitemap
        delay: Seconds to wait between requests (default: 2.0s)
        max_pages: Maximum number of pages to process (default: 500)
        progress_callback: Function called with progress updates
        save_to_notion: Whether to save results to Notion
        url_filter: Optional function to filter URLs (return True to include)

    Returns:
        CrawlResult with all analyzed pages

    Raises:
        Exception: re-raised from fetch_sitemap_urls or from an
            unexpected failure inside the crawl loop; the progress file
            is marked status="failed" first.
    """
    # Parse site info
    parsed_sitemap = urlparse(sitemap_url)
    site = f"{parsed_sitemap.scheme}://{parsed_sitemap.netloc}"
    site_domain = parsed_sitemap.netloc

    # Generate audit ID (domain + timestamp makes it unique per run)
    audit_id = f"{site_domain}-pages-{datetime.now().strftime('%Y%m%d-%H%M%S')}"

    logger.info(f"Starting sitemap crawl: {sitemap_url}")
    logger.info(f"Audit ID: {audit_id}")
    logger.info(f"Delay between requests: {delay}s")

    # Initialize progress tracking
    progress = CrawlProgress(
        audit_id=audit_id,
        site=site,
        status="running",
    )

    # Fetch URLs
    urls = self.fetch_sitemap_urls(sitemap_url)

    # Apply URL filter if provided
    if url_filter:
        urls = [url for url in urls if url_filter(url)]
        logger.info(f"After filtering: {len(urls)} URLs")

    # Apply max pages limit (default: 500 to prevent excessive resource usage)
    if len(urls) > max_pages:
        logger.warning(f"Sitemap has {len(urls)} URLs, limiting to {max_pages} pages")
        logger.warning(f"Use max_pages parameter to adjust this limit")
        urls = urls[:max_pages]
    logger.info(f"Processing {len(urls)} pages (max: {max_pages})")

    # Update progress with total URLs
    progress.total_urls = len(urls)
    progress.save_to_file()

    # Initialize result (end_time is provisional; overwritten after the loop)
    result = CrawlResult(
        site=site,
        sitemap_url=sitemap_url,
        audit_id=audit_id,
        total_pages=len(urls),
        successful_pages=0,
        failed_pages=0,
        start_time=datetime.now(),
        end_time=datetime.now(),
    )

    # Process each URL
    try:
        for i, url in enumerate(urls):
            progress.current_url = url
            progress.processed_urls = i
            progress.save_to_file()  # Save progress to file

            if progress_callback:
                progress_callback(progress)

            logger.info(f"[{i+1}/{len(urls)}] Analyzing: {url}")

            try:
                # Analyze page
                metadata = self.analyzer.analyze_url(url)
                result.pages_analyzed.append(metadata)

                # Only HTTP 200 pages count as successful
                if metadata.status_code == 200:
                    progress.successful_urls += 1
                    result.successful_pages += 1

                    # Save to Notion
                    if save_to_notion and self.notion:
                        page_id = self._save_page_to_notion(metadata, audit_id, site)
                        if page_id:
                            result.notion_page_ids.append(page_id)
                else:
                    progress.failed_urls += 1
                    result.failed_pages += 1

            except Exception as e:
                # One bad page must not abort the whole crawl: log, count
                # as failed, move on.
                logger.error(f"Failed to analyze {url}: {e}")
                progress.failed_urls += 1
                result.failed_pages += 1

            # Wait before next request (politeness delay)
            if i < len(urls) - 1:  # Don't wait after last URL
                time.sleep(delay)

        # Final progress update
        progress.processed_urls = len(urls)
        progress.status = "completed"
        if progress_callback:
            progress_callback(progress)

    except Exception as e:
        # Unexpected failure outside per-page handling: persist the
        # failed state so pollers see it, then propagate.
        progress.status = "failed"
        progress.error_message = str(e)
        progress.save_to_file()
        raise

    # Update result
    result.end_time = datetime.now()

    # Create summary page
    if save_to_notion and self.notion:
        summary_id = self._create_crawl_summary_page(result)
        result.summary_page_id = summary_id
        progress.summary_page_id = summary_id

    # Save final progress
    progress.save_to_file()

    logger.info(f"Crawl complete: {result.successful_pages}/{result.total_pages} pages analyzed")
    logger.info(f"Duration: {result.get_duration()}")

    return result
|
||||
|
||||
def _save_page_to_notion(
    self,
    metadata: PageMetadata,
    audit_id: str,
    site: str,
) -> str | None:
    """Persist one analyzed page into the Notion database.

    Returns the created Notion page ID, or None when creation fails;
    failures are logged and swallowed so one bad page does not abort
    the crawl.
    """
    try:
        page_props = {
            "Issue": {"title": [{"text": {"content": f"📄 {metadata.url}"}}]},
            "Category": {"select": {"name": "On-page SEO"}},
            "Priority": {"select": {"name": self._determine_priority(metadata)}},
            "Site": {"url": site},
            "URL": {"url": metadata.url},
            "Audit ID": {"rich_text": [{"text": {"content": audit_id}}]},
            "Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
        }

        created = self.notion.pages.create(
            parent={"database_id": self.database_id},
            properties=page_props,
            children=self._build_page_content(metadata),
        )
        return created["id"]

    except Exception as e:
        logger.error(f"Failed to save to Notion: {e}")
        return None
|
||||
|
||||
def _determine_priority(self, metadata: PageMetadata) -> str:
    """Map issue/warning counts to a Notion priority label.

    3+ issues -> High; any issue, or 3+ warnings -> Medium; else Low.
    """
    issue_count = len(metadata.issues)
    if issue_count >= 3:
        return "High"
    if issue_count >= 1 or len(metadata.warnings) >= 3:
        return "Medium"
    return "Low"
|
||||
|
||||
def _build_page_content(self, metadata: PageMetadata) -> list[dict]:
    """Build Notion page content blocks from metadata.

    Assembles, in order: a status callout, a meta-tag table, headings,
    structured-data, Open Graph, link and image summaries, an optional
    hreflang list (capped at 10 tags), and finally per-issue/warning
    to-do blocks. Returns the ``children`` list ready to pass to
    ``notion.pages.create``.
    """
    children = []

    # Status summary callout — icon/color escalate with the issue count
    # (0 issues = green/gray, 1-2 = warning/yellow, 3+ = error/red).
    status_emoji = "✅" if not metadata.issues else "⚠️" if len(metadata.issues) < 3 else "❌"
    children.append({
        "object": "block",
        "type": "callout",
        "callout": {
            "rich_text": [
                {"type": "text", "text": {"content": f"Status: {metadata.status_code} | "}},
                {"type": "text", "text": {"content": f"Response: {metadata.response_time_ms:.0f}ms | "}},
                {"type": "text", "text": {"content": f"Issues: {len(metadata.issues)} | "}},
                {"type": "text", "text": {"content": f"Warnings: {len(metadata.warnings)}"}},
            ],
            "icon": {"type": "emoji", "emoji": status_emoji},
            "color": "gray_background" if not metadata.issues else "yellow_background" if len(metadata.issues) < 3 else "red_background",
        }
    })

    # Meta Tags Section
    children.append({
        "object": "block",
        "type": "heading_2",
        "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Meta Tags"}}]}
    })

    # Meta tags table — values truncated to 50 chars so the table stays readable.
    meta_rows = [
        {"type": "table_row", "table_row": {"cells": [
            [{"type": "text", "text": {"content": "Tag"}, "annotations": {"bold": True}}],
            [{"type": "text", "text": {"content": "Value"}, "annotations": {"bold": True}}],
            [{"type": "text", "text": {"content": "Status"}, "annotations": {"bold": True}}],
        ]}},
        {"type": "table_row", "table_row": {"cells": [
            [{"type": "text", "text": {"content": "Title"}}],
            [{"type": "text", "text": {"content": (metadata.title or "—")[:50]}}],
            [{"type": "text", "text": {"content": f"✓ {metadata.title_length} chars" if metadata.title else "✗ Missing"}}],
        ]}},
        {"type": "table_row", "table_row": {"cells": [
            [{"type": "text", "text": {"content": "Description"}}],
            [{"type": "text", "text": {"content": (metadata.meta_description or "—")[:50]}}],
            [{"type": "text", "text": {"content": f"✓ {metadata.meta_description_length} chars" if metadata.meta_description else "✗ Missing"}}],
        ]}},
        {"type": "table_row", "table_row": {"cells": [
            [{"type": "text", "text": {"content": "Canonical"}}],
            [{"type": "text", "text": {"content": (metadata.canonical_url or "—")[:50]}}],
            [{"type": "text", "text": {"content": "✓" if metadata.canonical_url else "✗ Missing"}}],
        ]}},
        {"type": "table_row", "table_row": {"cells": [
            [{"type": "text", "text": {"content": "Robots"}}],
            [{"type": "text", "text": {"content": metadata.robots_meta or "—"}}],
            [{"type": "text", "text": {"content": "✓" if metadata.robots_meta else "—"}}],
        ]}},
        {"type": "table_row", "table_row": {"cells": [
            [{"type": "text", "text": {"content": "Lang"}}],
            [{"type": "text", "text": {"content": metadata.html_lang or "—"}}],
            [{"type": "text", "text": {"content": "✓" if metadata.html_lang else "—"}}],
        ]}},
    ]

    children.append({
        "object": "block",
        "type": "table",
        "table": {
            "table_width": 3,
            "has_column_header": True,
            "has_row_header": False,
            "children": meta_rows
        }
    })

    # Headings Section
    children.append({
        "object": "block",
        "type": "heading_2",
        "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Headings"}}]}
    })

    children.append({
        "object": "block",
        "type": "paragraph",
        "paragraph": {"rich_text": [
            {"type": "text", "text": {"content": f"H1: {metadata.h1_count} | "}},
            {"type": "text", "text": {"content": f"Total headings: {len(metadata.headings)}"}},
        ]}
    })

    # Quote the page's H1 text (truncated) when present.
    if metadata.h1_text:
        children.append({
            "object": "block",
            "type": "quote",
            "quote": {"rich_text": [{"type": "text", "text": {"content": metadata.h1_text[:200]}}]}
        })

    # Schema Data Section
    children.append({
        "object": "block",
        "type": "heading_2",
        "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Structured Data"}}]}
    })

    if metadata.schema_types_found:
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": "Schema types found: "}},
                {"type": "text", "text": {"content": ", ".join(metadata.schema_types_found)}, "annotations": {"code": True}},
            ]}
        })
    else:
        # Missing structured data is surfaced as a warning callout rather than silence.
        children.append({
            "object": "block",
            "type": "callout",
            "callout": {
                "rich_text": [{"type": "text", "text": {"content": "No structured data found on this page"}}],
                "icon": {"type": "emoji", "emoji": "⚠️"},
                "color": "yellow_background",
            }
        })

    # Open Graph Section
    children.append({
        "object": "block",
        "type": "heading_2",
        "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Open Graph"}}]}
    })

    # og:title is treated as the marker for "OG configured at all".
    og = metadata.open_graph
    og_status = "✓ Configured" if og.og_title else "✗ Missing"
    children.append({
        "object": "block",
        "type": "paragraph",
        "paragraph": {"rich_text": [
            {"type": "text", "text": {"content": f"Status: {og_status}\n"}},
            {"type": "text", "text": {"content": f"og:title: {og.og_title or '—'}\n"}},
            {"type": "text", "text": {"content": f"og:type: {og.og_type or '—'}"}},
        ]}
    })

    # Links Section
    children.append({
        "object": "block",
        "type": "heading_2",
        "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Links"}}]}
    })

    children.append({
        "object": "block",
        "type": "paragraph",
        "paragraph": {"rich_text": [
            {"type": "text", "text": {"content": f"Internal links: {metadata.internal_link_count}\n"}},
            {"type": "text", "text": {"content": f"External links: {metadata.external_link_count}"}},
        ]}
    })

    # Images Section
    children.append({
        "object": "block",
        "type": "heading_2",
        "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Images"}}]}
    })

    children.append({
        "object": "block",
        "type": "paragraph",
        "paragraph": {"rich_text": [
            {"type": "text", "text": {"content": f"Total: {metadata.images_total} | "}},
            {"type": "text", "text": {"content": f"With alt: {metadata.images_with_alt} | "}},
            {"type": "text", "text": {"content": f"Without alt: {metadata.images_without_alt}"}},
        ]}
    })

    # Hreflang Section (if present) — only the first 10 tags are listed.
    if metadata.hreflang_tags:
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Hreflang Tags"}}]}
        })

        for tag in metadata.hreflang_tags[:10]:
            children.append({
                "object": "block",
                "type": "bulleted_list_item",
                "bulleted_list_item": {"rich_text": [
                    {"type": "text", "text": {"content": f"{tag['lang']}: "}},
                    {"type": "text", "text": {"content": tag['url'], "link": {"url": tag['url']}}},
                ]}
            })

    # Issues & Warnings Section — each finding becomes an unchecked to-do
    # so the team can tick items off as they are fixed.
    if metadata.issues or metadata.warnings:
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Issues & Warnings"}}]}
        })

        for issue in metadata.issues:
            children.append({
                "object": "block",
                "type": "to_do",
                "to_do": {
                    "rich_text": [
                        {"type": "text", "text": {"content": "❌ "}, "annotations": {"bold": True}},
                        {"type": "text", "text": {"content": issue}},
                    ],
                    "checked": False,
                }
            })

        for warning in metadata.warnings:
            children.append({
                "object": "block",
                "type": "to_do",
                "to_do": {
                    "rich_text": [
                        {"type": "text", "text": {"content": "⚠️ "}, "annotations": {"bold": True}},
                        {"type": "text", "text": {"content": warning}},
                    ],
                    "checked": False,
                }
            })

    return children
|
||||
def _create_crawl_summary_page(self, result: CrawlResult) -> str | None:
    """Create a summary page for the crawl in the Notion database.

    The page contains a header callout, an aggregate statistics table,
    and a pointer telling readers to filter the database by the audit ID
    for per-page entries. Returns the new page ID, or None on failure
    (the error is logged, not raised).
    """
    try:
        site_domain = urlparse(result.site).netloc

        # Calculate statistics across all analyzed pages.
        total_issues = sum(len(p.issues) for p in result.pages_analyzed)
        total_warnings = sum(len(p.warnings) for p in result.pages_analyzed)
        pages_with_issues = sum(1 for p in result.pages_analyzed if p.issues)
        pages_without_schema = sum(1 for p in result.pages_analyzed if not p.schema_types_found)
        pages_without_description = sum(1 for p in result.pages_analyzed if not p.meta_description)

        children = []

        # Header callout
        children.append({
            "object": "block",
            "type": "callout",
            "callout": {
                "rich_text": [
                    {"type": "text", "text": {"content": f"Sitemap Crawl Complete\n\n"}},
                    {"type": "text", "text": {"content": f"Audit ID: {result.audit_id}\n"}},
                    {"type": "text", "text": {"content": f"Duration: {result.get_duration()}\n"}},
                    {"type": "text", "text": {"content": f"Pages: {result.successful_pages}/{result.total_pages}"}},
                ],
                "icon": {"type": "emoji", "emoji": "📊"},
                "color": "blue_background",
            }
        })

        # Statistics table
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Statistics"}}]}
        })

        stats_rows = [
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Metric"}, "annotations": {"bold": True}}],
                [{"type": "text", "text": {"content": "Count"}, "annotations": {"bold": True}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Total Pages"}}],
                [{"type": "text", "text": {"content": str(result.total_pages)}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Successfully Analyzed"}}],
                [{"type": "text", "text": {"content": str(result.successful_pages)}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Pages with Issues"}}],
                [{"type": "text", "text": {"content": str(pages_with_issues)}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Total Issues"}}],
                [{"type": "text", "text": {"content": str(total_issues)}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Total Warnings"}}],
                [{"type": "text", "text": {"content": str(total_warnings)}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Pages without Schema"}}],
                [{"type": "text", "text": {"content": str(pages_without_schema)}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Pages without Description"}}],
                [{"type": "text", "text": {"content": str(pages_without_description)}}],
            ]}},
        ]

        children.append({
            "object": "block",
            "type": "table",
            "table": {
                "table_width": 2,
                "has_column_header": True,
                "has_row_header": False,
                "children": stats_rows
            }
        })

        # Pages list — individual page rows live as separate database entries,
        # so the summary just points readers at the Audit ID filter.
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Analyzed Pages"}}]}
        })

        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Filter by Audit ID in the database to see all {result.successful_pages} page entries."}}
            ]}
        })

        # Create the summary page
        response = self.notion.pages.create(
            parent={"database_id": self.database_id},
            properties={
                "Issue": {"title": [{"text": {"content": f"📊 Sitemap Crawl: {site_domain}"}}]},
                "Category": {"select": {"name": "Technical SEO"}},
                "Priority": {"select": {"name": "High"}},
                "Site": {"url": result.site},
                "Audit ID": {"rich_text": [{"text": {"content": result.audit_id}}]},
                "Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
            },
            children=children,
        )

        logger.info(f"Created crawl summary page: {response['id']}")
        return response["id"]

    except Exception as e:
        logger.error(f"Failed to create summary page: {e}")
        return None
|
||||
|
||||
def print_progress_status(progress: CrawlProgress) -> None:
    """Print a formatted progress banner for one crawl job to stdout.

    Shows status, counts, elapsed time and ETA (ETA only while the job is
    still running), plus the summary-page link and error message when set.
    """
    status_emoji = {
        "running": "🔄",
        "completed": "✅",
        "failed": "❌",
    }.get(progress.status, "❓")

    # One triple-quoted f-string keeps the banner layout in one place.
    print(f"""
{'=' * 60}
{status_emoji} SEO Page Analysis - {progress.status.upper()}
{'=' * 60}
Audit ID: {progress.audit_id}
Site: {progress.site}
Status: {progress.status}

Progress: {progress.processed_urls}/{progress.total_urls} pages ({progress.get_progress_percent():.1f}%)
Successful: {progress.successful_urls}
Failed: {progress.failed_urls}
Elapsed: {progress.get_elapsed_time()}
ETA: {progress.get_eta() if progress.status == 'running' else 'N/A'}

Current URL: {progress.current_url[:60] + '...' if len(progress.current_url) > 60 else progress.current_url}
""")

    if progress.summary_page_id:
        # Notion page URLs drop the dashes from the page ID.
        print(f"Summary: https://www.notion.so/{progress.summary_page_id.replace('-', '')}")

    if progress.error_message:
        print(f"Error: {progress.error_message}")

    print("=" * 60)
|
||||
|
||||
def _run_status(args) -> None:
    """Handle the ``status`` sub-command: one crawl, or the active/all list."""
    if args.audit_id:
        # Show specific crawl status
        progress = get_crawl_status(args.audit_id)
        if progress:
            print_progress_status(progress)
        else:
            print(f"No crawl found with audit ID: {args.audit_id}")
        return

    # Show active crawls (or everything with --all)
    if args.all:
        crawls = get_all_crawls()
        label = "All"
    else:
        crawls = get_active_crawls()
        label = "Active"

    if not crawls:
        print(f"No {label.lower()} crawl jobs found.")
        return

    print(f"\n{label} Crawl Jobs ({len(crawls)}):")
    print("-" * 60)
    for p in crawls:
        status_emoji = {"running": "🔄", "completed": "✅", "failed": "❌"}.get(p.status, "❓")
        print(f"{status_emoji} {p.audit_id}")
        print(f" Site: {p.site}")
        print(f" Progress: {p.processed_urls}/{p.total_urls} ({p.get_progress_percent():.1f}%)")
        print()


def _run_list() -> None:
    """Handle the ``list`` sub-command: tabulate recorded crawl jobs."""
    crawls = get_all_crawls()
    if not crawls:
        print("No crawl jobs found.")
        return

    print(f"\nAll Crawl Jobs ({len(crawls)}):")
    print("-" * 80)
    print(f"{'Status':<10} {'Audit ID':<45} {'Progress':<15}")
    print("-" * 80)
    for p in crawls[:20]:  # Show last 20
        status_emoji = {"running": "🔄", "completed": "✅", "failed": "❌"}.get(p.status, "❓")
        progress_str = f"{p.processed_urls}/{p.total_urls}"
        print(f"{status_emoji} {p.status:<7} {p.audit_id:<45} {progress_str:<15}")
    if len(crawls) > 20:
        print(f"... and {len(crawls) - 20} more")


def _run_crawl(args) -> None:
    """Handle the ``crawl`` sub-command: run the crawler and print a summary."""
    # Handle --no-limit option
    max_pages = args.max_pages
    if args.no_limit:
        max_pages = 999999  # Effectively unlimited
        print("⚠️ WARNING: Page limit disabled. This may take a very long time!")

    def progress_callback(progress: CrawlProgress):
        # Single-line, carriage-return progress display.
        pct = progress.get_progress_percent()
        print(f"\r[{pct:5.1f}%] {progress.processed_urls}/{progress.total_urls} pages | "
              f"Success: {progress.successful_urls} | Failed: {progress.failed_urls} | "
              f"ETA: {progress.get_eta()}", end="", flush=True)

    crawler = SitemapCrawler()
    result = crawler.crawl_sitemap(
        args.sitemap_url,
        delay=args.delay,
        max_pages=max_pages,
        progress_callback=progress_callback,
        save_to_notion=not args.no_notion,
    )

    print()  # New line after progress
    print()
    print("=" * 60)
    print("CRAWL COMPLETE")
    print("=" * 60)
    print(f"Audit ID: {result.audit_id}")
    print(f"Total Pages: {result.total_pages}")
    print(f"Successful: {result.successful_pages}")
    print(f"Failed: {result.failed_pages}")
    print(f"Duration: {result.get_duration()}")
    if result.summary_page_id:
        print(f"Summary Page: https://www.notion.so/{result.summary_page_id.replace('-', '')}")


def main():
    """CLI entry point: crawl / status / list sub-commands.

    A bare sitemap URL (``script.py https://…/sitemap.xml``) is accepted as
    shorthand for ``crawl <url>``.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Sitemap Crawler with Background Support")
    subparsers = parser.add_subparsers(dest="command", help="Commands")

    # Crawl command
    crawl_parser = subparsers.add_parser("crawl", help="Start crawling a sitemap")
    crawl_parser.add_argument("sitemap_url", help="URL of the sitemap to crawl")
    crawl_parser.add_argument("--delay", "-d", type=float, default=DEFAULT_DELAY_SECONDS,
                              help=f"Delay between requests in seconds (default: {DEFAULT_DELAY_SECONDS})")
    crawl_parser.add_argument("--max-pages", "-m", type=int, default=DEFAULT_MAX_PAGES,
                              help=f"Maximum pages to process (default: {DEFAULT_MAX_PAGES})")
    crawl_parser.add_argument("--no-notion", action="store_true",
                              help="Don't save to Notion")
    crawl_parser.add_argument("--no-limit", action="store_true",
                              help="Remove page limit (use with caution)")

    # Status command
    status_parser = subparsers.add_parser("status", help="Check crawl progress")
    status_parser.add_argument("audit_id", nargs="?", help="Specific audit ID to check (optional)")
    status_parser.add_argument("--all", "-a", action="store_true", help="Show all crawls (not just active)")

    # List command
    subparsers.add_parser("list", help="List all crawl jobs")

    # BUG FIX: the bare-URL shorthand must be applied BEFORE parse_args().
    # The sub-command is an argparse positional with fixed choices, so an
    # unrecognized URL token makes parse_args() exit with "invalid choice"
    # and a post-parse fallback would never run. Rewriting argv up front
    # also lets crawl flags (--delay, --no-notion, …) work with the
    # shorthand instead of being silently replaced by defaults.
    argv = sys.argv[1:]
    if argv and (argv[0].startswith("http") or argv[0].endswith(".xml")):
        argv = ["crawl", *argv]

    args = parser.parse_args(argv)

    if args.command is None:
        parser.print_help()
        return

    if args.command == "status":
        _run_status(args)
    elif args.command == "list":
        _run_list()
    elif args.command == "crawl":
        _run_crawl(args)
||||
|
||||
# Allow running this module directly as a CLI script.
if __name__ == "__main__":
    main()
||||
@@ -0,0 +1,467 @@
|
||||
"""
|
||||
Sitemap Validator - Validate XML sitemaps
|
||||
==========================================
|
||||
Purpose: Parse and validate XML sitemaps for SEO compliance
|
||||
Python: 3.10+
|
||||
Usage:
|
||||
python sitemap_validator.py --url https://example.com/sitemap.xml
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import gzip
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from io import BytesIO
|
||||
from typing import Any
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import aiohttp
|
||||
import requests
|
||||
from lxml import etree
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class SitemapIssue:
    """A single finding produced while validating a sitemap.

    Findings are collected on SitemapResult; any "error"-severity issue
    marks the whole sitemap as invalid.
    """

    severity: str  # "error", "warning", "info"
    message: str  # human-readable description of the finding
    url: str | None = None  # sitemap or entry URL the finding applies to, if any
    suggestion: str | None = None  # optional remediation hint for the report
||||
|
||||
@dataclass
class SitemapEntry:
    """Represents a single <url> entry parsed from a sitemap."""

    loc: str  # page URL from <loc> (required)
    lastmod: str | None = None  # raw <lastmod> text as found in the XML
    changefreq: str | None = None  # lowercased <changefreq> value, if present
    priority: float | None = None  # parsed <priority>; None when missing or unparseable
    status_code: int | None = None  # HTTP status from the optional URL check; 0 means the request failed
|
||||
|
||||
@dataclass
class SitemapResult:
    """Complete sitemap validation result for one sitemap URL.

    Aggregates parsed entries (or child sitemap URLs for an index),
    validation issues, and summary statistics. ``valid`` ends up False
    whenever any "error"-severity issue was recorded.
    """

    url: str  # the sitemap URL that was validated
    sitemap_type: str  # "urlset" or "sitemapindex"
    entries: list[SitemapEntry] = field(default_factory=list)  # parsed <url> entries (urlset only)
    child_sitemaps: list[str] = field(default_factory=list)  # child sitemap URLs (sitemapindex only)
    issues: list[SitemapIssue] = field(default_factory=list)  # findings collected during validation
    valid: bool = True  # overall verdict; cleared on any "error" issue
    stats: dict = field(default_factory=dict)  # counters (url_count, invalid_*, url_status_codes, …)
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())  # when the result was created

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON output.

        Issues are fully expanded; entries are summarized as a count to
        keep the payload small.
        """
        return {
            "url": self.url,
            "sitemap_type": self.sitemap_type,
            "valid": self.valid,
            "stats": self.stats,
            "issues": [
                {
                    "severity": i.severity,
                    "message": i.message,
                    "url": i.url,
                    "suggestion": i.suggestion,
                }
                for i in self.issues
            ],
            "entries_count": len(self.entries),
            "child_sitemaps": self.child_sitemaps,
            "timestamp": self.timestamp,
        }
|
||||
|
||||
class SitemapValidator:
|
||||
"""Validate XML sitemaps."""
|
||||
|
||||
SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
|
||||
MAX_URLS = 50000
|
||||
MAX_SIZE_BYTES = 50 * 1024 * 1024 # 50MB
|
||||
|
||||
VALID_CHANGEFREQ = {
|
||||
"always", "hourly", "daily", "weekly",
|
||||
"monthly", "yearly", "never"
|
||||
}
|
||||
|
||||
def __init__(self, check_urls: bool = False, max_concurrent: int = 10):
|
||||
self.check_urls = check_urls
|
||||
self.max_concurrent = max_concurrent
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update({
|
||||
"User-Agent": "Mozilla/5.0 (compatible; SEOAuditBot/1.0)"
|
||||
})
|
||||
|
||||
def fetch_sitemap(self, url: str) -> tuple[bytes, bool]:
|
||||
"""Fetch sitemap content, handling gzip compression."""
|
||||
try:
|
||||
response = self.session.get(url, timeout=30)
|
||||
response.raise_for_status()
|
||||
|
||||
content = response.content
|
||||
is_gzipped = False
|
||||
|
||||
# Check if gzipped
|
||||
if url.endswith(".gz") or response.headers.get(
|
||||
"Content-Encoding"
|
||||
) == "gzip":
|
||||
try:
|
||||
content = gzip.decompress(content)
|
||||
is_gzipped = True
|
||||
except gzip.BadGzipFile:
|
||||
pass
|
||||
|
||||
return content, is_gzipped
|
||||
except requests.RequestException as e:
|
||||
raise RuntimeError(f"Failed to fetch sitemap: {e}")
|
||||
|
||||
def parse_sitemap(self, content: bytes) -> tuple[str, list[dict]]:
|
||||
"""Parse sitemap XML content."""
|
||||
try:
|
||||
root = etree.fromstring(content)
|
||||
except etree.XMLSyntaxError as e:
|
||||
raise ValueError(f"Invalid XML: {e}")
|
||||
|
||||
# Remove namespace for easier parsing
|
||||
nsmap = {"sm": self.SITEMAP_NS}
|
||||
|
||||
# Check if it's a sitemap index or urlset
|
||||
if root.tag == f"{{{self.SITEMAP_NS}}}sitemapindex":
|
||||
sitemap_type = "sitemapindex"
|
||||
entries = []
|
||||
for sitemap in root.findall("sm:sitemap", nsmap):
|
||||
entry = {}
|
||||
loc = sitemap.find("sm:loc", nsmap)
|
||||
if loc is not None and loc.text:
|
||||
entry["loc"] = loc.text.strip()
|
||||
lastmod = sitemap.find("sm:lastmod", nsmap)
|
||||
if lastmod is not None and lastmod.text:
|
||||
entry["lastmod"] = lastmod.text.strip()
|
||||
if entry.get("loc"):
|
||||
entries.append(entry)
|
||||
elif root.tag == f"{{{self.SITEMAP_NS}}}urlset":
|
||||
sitemap_type = "urlset"
|
||||
entries = []
|
||||
for url in root.findall("sm:url", nsmap):
|
||||
entry = {}
|
||||
loc = url.find("sm:loc", nsmap)
|
||||
if loc is not None and loc.text:
|
||||
entry["loc"] = loc.text.strip()
|
||||
lastmod = url.find("sm:lastmod", nsmap)
|
||||
if lastmod is not None and lastmod.text:
|
||||
entry["lastmod"] = lastmod.text.strip()
|
||||
changefreq = url.find("sm:changefreq", nsmap)
|
||||
if changefreq is not None and changefreq.text:
|
||||
entry["changefreq"] = changefreq.text.strip().lower()
|
||||
priority = url.find("sm:priority", nsmap)
|
||||
if priority is not None and priority.text:
|
||||
try:
|
||||
entry["priority"] = float(priority.text.strip())
|
||||
except ValueError:
|
||||
entry["priority"] = None
|
||||
if entry.get("loc"):
|
||||
entries.append(entry)
|
||||
else:
|
||||
raise ValueError(f"Unknown sitemap type: {root.tag}")
|
||||
|
||||
return sitemap_type, entries
|
||||
|
||||
def validate(self, url: str) -> SitemapResult:
|
||||
"""Validate a sitemap URL."""
|
||||
result = SitemapResult(url=url, sitemap_type="unknown")
|
||||
|
||||
# Fetch sitemap
|
||||
try:
|
||||
content, is_gzipped = self.fetch_sitemap(url)
|
||||
except RuntimeError as e:
|
||||
result.issues.append(SitemapIssue(
|
||||
severity="error",
|
||||
message=str(e),
|
||||
url=url,
|
||||
))
|
||||
result.valid = False
|
||||
return result
|
||||
|
||||
# Check size
|
||||
if len(content) > self.MAX_SIZE_BYTES:
|
||||
result.issues.append(SitemapIssue(
|
||||
severity="error",
|
||||
message=f"Sitemap exceeds 50MB limit ({len(content) / 1024 / 1024:.2f}MB)",
|
||||
url=url,
|
||||
suggestion="Split sitemap into smaller files using sitemap index",
|
||||
))
|
||||
|
||||
# Parse XML
|
||||
try:
|
||||
sitemap_type, entries = self.parse_sitemap(content)
|
||||
except ValueError as e:
|
||||
result.issues.append(SitemapIssue(
|
||||
severity="error",
|
||||
message=str(e),
|
||||
url=url,
|
||||
))
|
||||
result.valid = False
|
||||
return result
|
||||
|
||||
result.sitemap_type = sitemap_type
|
||||
|
||||
# Process entries
|
||||
if sitemap_type == "sitemapindex":
|
||||
result.child_sitemaps = [e["loc"] for e in entries]
|
||||
result.stats = {
|
||||
"child_sitemaps_count": len(entries),
|
||||
}
|
||||
else:
|
||||
# Validate URL entries
|
||||
url_count = len(entries)
|
||||
result.stats["url_count"] = url_count
|
||||
|
||||
if url_count > self.MAX_URLS:
|
||||
result.issues.append(SitemapIssue(
|
||||
severity="error",
|
||||
message=f"Sitemap exceeds 50,000 URL limit ({url_count} URLs)",
|
||||
url=url,
|
||||
suggestion="Split into multiple sitemaps with sitemap index",
|
||||
))
|
||||
|
||||
if url_count == 0:
|
||||
result.issues.append(SitemapIssue(
|
||||
severity="warning",
|
||||
message="Sitemap is empty (no URLs)",
|
||||
url=url,
|
||||
))
|
||||
|
||||
# Validate individual entries
|
||||
seen_urls = set()
|
||||
invalid_lastmod = 0
|
||||
invalid_changefreq = 0
|
||||
invalid_priority = 0
|
||||
|
||||
for entry in entries:
|
||||
loc = entry.get("loc", "")
|
||||
|
||||
# Check for duplicates
|
||||
if loc in seen_urls:
|
||||
result.issues.append(SitemapIssue(
|
||||
severity="warning",
|
||||
message="Duplicate URL in sitemap",
|
||||
url=loc,
|
||||
))
|
||||
seen_urls.add(loc)
|
||||
|
||||
# Validate lastmod format
|
||||
lastmod = entry.get("lastmod")
|
||||
if lastmod:
|
||||
if not self._validate_date(lastmod):
|
||||
invalid_lastmod += 1
|
||||
|
||||
# Validate changefreq
|
||||
changefreq = entry.get("changefreq")
|
||||
if changefreq and changefreq not in self.VALID_CHANGEFREQ:
|
||||
invalid_changefreq += 1
|
||||
|
||||
# Validate priority
|
||||
priority = entry.get("priority")
|
||||
if priority is not None:
|
||||
if not (0.0 <= priority <= 1.0):
|
||||
invalid_priority += 1
|
||||
|
||||
# Create entry object
|
||||
result.entries.append(SitemapEntry(
|
||||
loc=loc,
|
||||
lastmod=lastmod,
|
||||
changefreq=changefreq,
|
||||
priority=priority,
|
||||
))
|
||||
|
||||
# Add summary issues
|
||||
if invalid_lastmod > 0:
|
||||
result.issues.append(SitemapIssue(
|
||||
severity="warning",
|
||||
message=f"{invalid_lastmod} URLs with invalid lastmod format",
|
||||
suggestion="Use ISO 8601 format (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS+TZ)",
|
||||
))
|
||||
|
||||
if invalid_changefreq > 0:
|
||||
result.issues.append(SitemapIssue(
|
||||
severity="info",
|
||||
message=f"{invalid_changefreq} URLs with invalid changefreq",
|
||||
suggestion="Use: always, hourly, daily, weekly, monthly, yearly, never",
|
||||
))
|
||||
|
||||
if invalid_priority > 0:
|
||||
result.issues.append(SitemapIssue(
|
||||
severity="warning",
|
||||
message=f"{invalid_priority} URLs with invalid priority (must be 0.0-1.0)",
|
||||
))
|
||||
|
||||
result.stats.update({
|
||||
"invalid_lastmod": invalid_lastmod,
|
||||
"invalid_changefreq": invalid_changefreq,
|
||||
"invalid_priority": invalid_priority,
|
||||
"has_lastmod": sum(1 for e in result.entries if e.lastmod),
|
||||
"has_changefreq": sum(1 for e in result.entries if e.changefreq),
|
||||
"has_priority": sum(1 for e in result.entries if e.priority is not None),
|
||||
})
|
||||
|
||||
# Check URLs if requested
|
||||
if self.check_urls and result.entries:
|
||||
asyncio.run(self._check_url_status(result))
|
||||
|
||||
# Determine validity
|
||||
result.valid = not any(i.severity == "error" for i in result.issues)
|
||||
|
||||
return result
|
||||
|
||||
def _validate_date(self, date_str: str) -> bool:
|
||||
"""Validate ISO 8601 date format."""
|
||||
patterns = [
|
||||
r"^\d{4}-\d{2}-\d{2}$",
|
||||
r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}",
|
||||
]
|
||||
return any(re.match(p, date_str) for p in patterns)
|
||||
|
||||
async def _check_url_status(self, result: SitemapResult) -> None:
|
||||
"""Check HTTP status of URLs in sitemap."""
|
||||
semaphore = asyncio.Semaphore(self.max_concurrent)
|
||||
|
||||
async def check_url(entry: SitemapEntry) -> None:
|
||||
async with semaphore:
|
||||
try:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.head(
|
||||
entry.loc,
|
||||
timeout=aiohttp.ClientTimeout(total=10),
|
||||
allow_redirects=True,
|
||||
) as response:
|
||||
entry.status_code = response.status
|
||||
except Exception:
|
||||
entry.status_code = 0
|
||||
|
||||
await asyncio.gather(*[check_url(e) for e in result.entries[:100]])
|
||||
|
||||
# Count status codes
|
||||
status_counts = {}
|
||||
for entry in result.entries:
|
||||
if entry.status_code:
|
||||
status_counts[entry.status_code] = (
|
||||
status_counts.get(entry.status_code, 0) + 1
|
||||
)
|
||||
|
||||
result.stats["url_status_codes"] = status_counts
|
||||
|
||||
# Add issues for non-200 URLs
|
||||
error_count = sum(
|
||||
1 for e in result.entries
|
||||
if e.status_code and e.status_code >= 400
|
||||
)
|
||||
if error_count > 0:
|
||||
result.issues.append(SitemapIssue(
|
||||
severity="warning",
|
||||
message=f"{error_count} URLs returning error status codes (4xx/5xx)",
|
||||
suggestion="Remove or fix broken URLs in sitemap",
|
||||
))
|
||||
|
||||
def generate_report(self, result: SitemapResult) -> str:
    """Render a human-readable, plain-text validation report.

    Sections: header with URL/type/validity/timestamp, statistics,
    child sitemaps (capped at 10 shown), and issues grouped by
    severity (errors, then warnings, then infos).
    """
    divider = "=" * 60
    out = [
        divider,
        "Sitemap Validation Report",
        divider,
        f"URL: {result.url}",
        f"Type: {result.sitemap_type}",
        f"Valid: {'Yes' if result.valid else 'No'}",
        f"Timestamp: {result.timestamp}",
        "",
        "Statistics:",
    ]
    out.extend(f"  {name}: {value}" for name, value in result.stats.items())
    out.append("")

    children = result.child_sitemaps
    if children:
        out.append(f"Child Sitemaps ({len(children)}):")
        out.extend(f"  - {child}" for child in children[:10])
        if len(children) > 10:
            out.append(f"  ... and {len(children) - 10} more")
        out.append("")

    if result.issues:
        out.append("Issues Found:")
        # Partition by severity; unknown severities are ignored, as before.
        buckets = {"error": [], "warning": [], "info": []}
        for issue in result.issues:
            if issue.severity in buckets:
                buckets[issue.severity].append(issue)

        # (label, items, show URL line, show suggestion line)
        sections = (
            ("ERRORS", buckets["error"], True, True),
            ("WARNINGS", buckets["warning"], False, True),
            ("INFO", buckets["info"], False, False),
        )
        for label, items, with_url, with_suggestion in sections:
            if not items:
                continue
            out.append(f"\n  {label} ({len(items)}):")
            for issue in items:
                out.append(f"    - {issue.message}")
                if with_url and issue.url:
                    out.append(f"      URL: {issue.url}")
                if with_suggestion and issue.suggestion:
                    out.append(f"      Suggestion: {issue.suggestion}")

    out.append("")
    out.append(divider)

    return "\n".join(out)
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, validate a sitemap, emit a report.

    Output modes:
      * default        -- human-readable text report to stdout
      * ``--json``     -- JSON report to stdout
      * ``--output``   -- JSON report written to the given file
    """
    cli = argparse.ArgumentParser(
        description="Validate XML sitemaps",
    )
    cli.add_argument("--url", "-u", required=True, help="Sitemap URL to validate")
    cli.add_argument("--check-urls", action="store_true",
                     help="Check HTTP status of URLs (slower)")
    cli.add_argument("--output", "-o", help="Output file for JSON report")
    cli.add_argument("--json", action="store_true", help="Output as JSON")

    opts = cli.parse_args()

    validator = SitemapValidator(check_urls=opts.check_urls)
    outcome = validator.validate(opts.url)

    if not (opts.json or opts.output):
        # Default: plain-text report on stdout.
        print(validator.generate_report(outcome))
        return

    payload = json.dumps(outcome.to_dict(), ensure_ascii=False, indent=2)
    if opts.output:
        with open(opts.output, "w", encoding="utf-8") as fh:
            fh.write(payload)
        logger.info(f"Report written to {opts.output}")
    else:
        print(payload)
|
||||
|
||||
|
||||
# Script entry point: run the CLI only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user