directory changes and restructuring

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-22 02:01:41 +09:00
parent eea49f9f8c
commit 236be6c580
598 changed files with 0 additions and 0 deletions

View File

@@ -0,0 +1,969 @@
"""
Sitemap Crawler - Sequential page analysis from sitemap
=======================================================
Purpose: Crawl sitemap URLs one by one, analyze each page, save to Notion
Python: 3.10+
Usage:
from sitemap_crawler import SitemapCrawler
crawler = SitemapCrawler()
crawler.crawl_sitemap("https://example.com/sitemap.xml", delay=2.0)
"""
import json
import logging
import time
import xml.etree.ElementTree as ET
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Callable, Generator
from urllib.parse import urlparse
import requests
from notion_client import Client
from base_client import config
from page_analyzer import PageAnalyzer, PageMetadata
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
# Default database for page analysis data
DEFAULT_PAGES_DATABASE_ID = "2c8581e5-8a1e-8035-880b-e38cefc2f3ef"
# Default limits to prevent excessive resource usage
DEFAULT_MAX_PAGES = 500
DEFAULT_DELAY_SECONDS = 2.0
# Progress tracking directory
PROGRESS_DIR = Path.home() / ".claude" / "seo-audit-progress"
PROGRESS_DIR.mkdir(parents=True, exist_ok=True)
@dataclass
class CrawlProgress:
"""Track crawl progress."""
total_urls: int = 0
processed_urls: int = 0
successful_urls: int = 0
failed_urls: int = 0
skipped_urls: int = 0
start_time: datetime = field(default_factory=datetime.now)
current_url: str = ""
audit_id: str = ""
site: str = ""
status: str = "running" # running, completed, failed
error_message: str = ""
summary_page_id: str = ""
def get_progress_percent(self) -> float:
if self.total_urls == 0:
return 0.0
return (self.processed_urls / self.total_urls) * 100
def get_elapsed_time(self) -> str:
elapsed = datetime.now() - self.start_time
minutes = int(elapsed.total_seconds() // 60)
seconds = int(elapsed.total_seconds() % 60)
return f"{minutes}m {seconds}s"
def get_eta(self) -> str:
if self.processed_urls == 0:
return "calculating..."
elapsed = (datetime.now() - self.start_time).total_seconds()
avg_time_per_url = elapsed / self.processed_urls
remaining_urls = self.total_urls - self.processed_urls
eta_seconds = remaining_urls * avg_time_per_url
minutes = int(eta_seconds // 60)
seconds = int(eta_seconds % 60)
return f"{minutes}m {seconds}s"
def to_dict(self) -> dict:
"""Convert to dictionary for JSON serialization."""
return {
"audit_id": self.audit_id,
"site": self.site,
"status": self.status,
"total_urls": self.total_urls,
"processed_urls": self.processed_urls,
"successful_urls": self.successful_urls,
"failed_urls": self.failed_urls,
"progress_percent": round(self.get_progress_percent(), 1),
"elapsed_time": self.get_elapsed_time(),
"eta": self.get_eta(),
"current_url": self.current_url,
"start_time": self.start_time.isoformat(),
"error_message": self.error_message,
"summary_page_id": self.summary_page_id,
"updated_at": datetime.now().isoformat(),
}
def save_to_file(self, filepath: Path | None = None) -> Path:
"""Save progress to JSON file."""
if filepath is None:
filepath = PROGRESS_DIR / f"{self.audit_id}.json"
with open(filepath, "w") as f:
json.dump(self.to_dict(), f, indent=2)
return filepath
@classmethod
def load_from_file(cls, filepath: Path) -> "CrawlProgress":
"""Load progress from JSON file."""
with open(filepath, "r") as f:
data = json.load(f)
progress = cls()
progress.audit_id = data.get("audit_id", "")
progress.site = data.get("site", "")
progress.status = data.get("status", "unknown")
progress.total_urls = data.get("total_urls", 0)
progress.processed_urls = data.get("processed_urls", 0)
progress.successful_urls = data.get("successful_urls", 0)
progress.failed_urls = data.get("failed_urls", 0)
progress.current_url = data.get("current_url", "")
progress.error_message = data.get("error_message", "")
progress.summary_page_id = data.get("summary_page_id", "")
if data.get("start_time"):
progress.start_time = datetime.fromisoformat(data["start_time"])
return progress
def get_active_crawls() -> list[CrawlProgress]:
    """Return progress records for every crawl whose status is "running"."""
    running = []
    for path in PROGRESS_DIR.glob("*.json"):
        try:
            record = CrawlProgress.load_from_file(path)
        except Exception:
            continue  # unreadable or corrupt progress file — ignore it
        if record.status == "running":
            running.append(record)
    return running
def get_all_crawls() -> list[CrawlProgress]:
    """Return every recorded crawl job, newest progress file first."""
    records = []
    for path in sorted(PROGRESS_DIR.glob("*.json"), reverse=True):
        try:
            records.append(CrawlProgress.load_from_file(path))
        except Exception:
            continue  # skip files that fail to parse
    return records
def get_crawl_status(audit_id: str) -> CrawlProgress | None:
    """Look up a single crawl's progress by audit ID; None if unknown."""
    progress_file = PROGRESS_DIR / f"{audit_id}.json"
    if not progress_file.exists():
        return None
    return CrawlProgress.load_from_file(progress_file)
@dataclass
class CrawlResult:
    """Final summary of one complete sitemap crawl."""

    site: str
    sitemap_url: str
    audit_id: str
    total_pages: int
    successful_pages: int
    failed_pages: int
    start_time: datetime
    end_time: datetime
    pages_analyzed: list[PageMetadata] = field(default_factory=list)
    notion_page_ids: list[str] = field(default_factory=list)
    summary_page_id: str | None = None

    def get_duration(self) -> str:
        """Return the crawl's wall-clock duration formatted as '<m>m <s>s'."""
        total_seconds = int((self.end_time - self.start_time).total_seconds())
        minutes, seconds = divmod(total_seconds, 60)
        return f"{minutes}m {seconds}s"
class SitemapCrawler:
    """Crawl sitemap URLs and analyze each page.

    Workflow: fetch the sitemap (recursing into sitemap indexes), run
    PageAnalyzer over each URL sequentially with a polite inter-request
    delay, and optionally persist per-page results plus a final summary
    page to a Notion database.
    """

    def __init__(
        self,
        notion_token: str | None = None,
        database_id: str | None = None,
    ):
        """
        Initialize sitemap crawler.

        Args:
            notion_token: Notion API token (falls back to config.notion_token)
            database_id: Notion database ID for storing results
                (falls back to DEFAULT_PAGES_DATABASE_ID)
        """
        self.notion_token = notion_token or config.notion_token
        self.database_id = database_id or DEFAULT_PAGES_DATABASE_ID
        self.analyzer = PageAnalyzer()
        # Without a token the crawl still runs; results just are not saved.
        if self.notion_token:
            self.notion = Client(auth=self.notion_token)
        else:
            self.notion = None
            logger.warning("Notion token not configured, results will not be saved")

    def fetch_sitemap_urls(self, sitemap_url: str) -> list[str]:
        """
        Fetch and parse URLs from a sitemap.

        Recurses into child sitemaps when the document is a sitemap index.

        Args:
            sitemap_url: URL of the sitemap

        Returns:
            List of unique URLs found in the sitemap (original order kept)

        Raises:
            Exception: network/XML failures are logged, then re-raised
        """
        try:
            response = requests.get(sitemap_url, timeout=30)
            response.raise_for_status()
            # Parse XML
            root = ET.fromstring(response.content)
            # Handle the standard sitemaps.org namespace
            namespaces = {
                "sm": "http://www.sitemaps.org/schemas/sitemap/0.9"
            }
            urls = []
            # Check if this is a sitemap index (<sitemap><loc> entries)
            sitemap_tags = root.findall(".//sm:sitemap/sm:loc", namespaces)
            if sitemap_tags:
                # This is a sitemap index, recursively fetch child sitemaps
                logger.info(f"Found sitemap index with {len(sitemap_tags)} child sitemaps")
                for loc in sitemap_tags:
                    if loc.text:
                        child_urls = self.fetch_sitemap_urls(loc.text)
                        urls.extend(child_urls)
            else:
                # Regular sitemap, extract URLs
                url_tags = root.findall(".//sm:url/sm:loc", namespaces)
                if not url_tags:
                    # Try without namespace (some generators omit it)
                    url_tags = root.findall(".//url/loc")
                for loc in url_tags:
                    if loc.text:
                        urls.append(loc.text)
            # Remove duplicates while preserving order
            seen = set()
            unique_urls = []
            for url in urls:
                if url not in seen:
                    seen.add(url)
                    unique_urls.append(url)
            logger.info(f"Found {len(unique_urls)} unique URLs in sitemap")
            return unique_urls
        except Exception as e:
            logger.error(f"Failed to fetch sitemap: {e}")
            raise

    def crawl_sitemap(
        self,
        sitemap_url: str,
        delay: float = DEFAULT_DELAY_SECONDS,
        max_pages: int = DEFAULT_MAX_PAGES,
        progress_callback: Callable[[CrawlProgress], None] | None = None,
        save_to_notion: bool = True,
        url_filter: Callable[[str], bool] | None = None,
    ) -> CrawlResult:
        """
        Crawl all URLs in a sitemap sequentially.

        Args:
            sitemap_url: URL of the sitemap
            delay: Seconds to wait between requests (default: 2.0s)
            max_pages: Maximum number of pages to process (default: 500)
            progress_callback: Function called with progress updates
            save_to_notion: Whether to save results to Notion
            url_filter: Optional function to filter URLs (return True to include)

        Returns:
            CrawlResult with all analyzed pages

        Raises:
            Exception: re-raised after the progress file is marked "failed"
        """
        # Parse site info from the sitemap URL (scheme + host = site root)
        parsed_sitemap = urlparse(sitemap_url)
        site = f"{parsed_sitemap.scheme}://{parsed_sitemap.netloc}"
        site_domain = parsed_sitemap.netloc
        # Generate audit ID: domain + timestamp keeps each run unique
        audit_id = f"{site_domain}-pages-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
        logger.info(f"Starting sitemap crawl: {sitemap_url}")
        logger.info(f"Audit ID: {audit_id}")
        logger.info(f"Delay between requests: {delay}s")
        # Initialize progress tracking (persisted so `status` can observe it)
        progress = CrawlProgress(
            audit_id=audit_id,
            site=site,
            status="running",
        )
        # Fetch URLs
        urls = self.fetch_sitemap_urls(sitemap_url)
        # Apply URL filter if provided
        if url_filter:
            urls = [url for url in urls if url_filter(url)]
            logger.info(f"After filtering: {len(urls)} URLs")
        # Apply max pages limit (default: 500 to prevent excessive resource usage)
        if len(urls) > max_pages:
            logger.warning(f"Sitemap has {len(urls)} URLs, limiting to {max_pages} pages")
            logger.warning(f"Use max_pages parameter to adjust this limit")
            urls = urls[:max_pages]
        logger.info(f"Processing {len(urls)} pages (max: {max_pages})")
        # Update progress with total URLs
        progress.total_urls = len(urls)
        progress.save_to_file()
        # Initialize result (end_time is provisional; overwritten at the end)
        result = CrawlResult(
            site=site,
            sitemap_url=sitemap_url,
            audit_id=audit_id,
            total_pages=len(urls),
            successful_pages=0,
            failed_pages=0,
            start_time=datetime.now(),
            end_time=datetime.now(),
        )
        # Process each URL
        try:
            for i, url in enumerate(urls):
                progress.current_url = url
                progress.processed_urls = i
                progress.save_to_file()  # Save progress to file
                if progress_callback:
                    progress_callback(progress)
                logger.info(f"[{i+1}/{len(urls)}] Analyzing: {url}")
                try:
                    # Analyze page
                    metadata = self.analyzer.analyze_url(url)
                    result.pages_analyzed.append(metadata)
                    if metadata.status_code == 200:
                        progress.successful_urls += 1
                        result.successful_pages += 1
                        # Save to Notion (only successful pages are recorded)
                        if save_to_notion and self.notion:
                            page_id = self._save_page_to_notion(metadata, audit_id, site)
                            if page_id:
                                result.notion_page_ids.append(page_id)
                    else:
                        # Non-200 responses count as failures, not saved
                        progress.failed_urls += 1
                        result.failed_pages += 1
                except Exception as e:
                    # Per-URL failures don't abort the whole crawl
                    logger.error(f"Failed to analyze {url}: {e}")
                    progress.failed_urls += 1
                    result.failed_pages += 1
                # Wait before next request (rate-limiting courtesy)
                if i < len(urls) - 1:  # Don't wait after last URL
                    time.sleep(delay)
            # Final progress update
            progress.processed_urls = len(urls)
            progress.status = "completed"
            if progress_callback:
                progress_callback(progress)
        except Exception as e:
            # Record the failure so `status` commands can report it, then re-raise
            progress.status = "failed"
            progress.error_message = str(e)
            progress.save_to_file()
            raise
        # Update result
        result.end_time = datetime.now()
        # Create summary page
        if save_to_notion and self.notion:
            summary_id = self._create_crawl_summary_page(result)
            result.summary_page_id = summary_id
            progress.summary_page_id = summary_id
        # Save final progress
        progress.save_to_file()
        logger.info(f"Crawl complete: {result.successful_pages}/{result.total_pages} pages analyzed")
        logger.info(f"Duration: {result.get_duration()}")
        return result

    def _save_page_to_notion(
        self,
        metadata: PageMetadata,
        audit_id: str,
        site: str,
    ) -> str | None:
        """Save page metadata to the Notion database.

        Returns the created page ID, or None on failure (errors are logged,
        never raised, so one bad save doesn't abort the crawl).
        """
        try:
            # Build database properties; names must match the target DB schema
            properties = {
                "Issue": {"title": [{"text": {"content": f"📄 {metadata.url}"}}]},
                "Category": {"select": {"name": "On-page SEO"}},
                "Priority": {"select": {"name": self._determine_priority(metadata)}},
                "Site": {"url": site},
                "URL": {"url": metadata.url},
                "Audit ID": {"rich_text": [{"text": {"content": audit_id}}]},
                "Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
            }
            # Build page content
            children = self._build_page_content(metadata)
            response = self.notion.pages.create(
                parent={"database_id": self.database_id},
                properties=properties,
                children=children,
            )
            return response["id"]
        except Exception as e:
            logger.error(f"Failed to save to Notion: {e}")
            return None

    def _determine_priority(self, metadata: PageMetadata) -> str:
        """Map issue/warning counts to a coarse priority label.

        3+ issues -> High; any issue or 3+ warnings -> Medium; else Low.
        """
        if len(metadata.issues) >= 3:
            return "High"
        elif len(metadata.issues) >= 1:
            return "Medium"
        elif len(metadata.warnings) >= 3:
            return "Medium"
        else:
            return "Low"

    def _build_page_content(self, metadata: PageMetadata) -> list[dict]:
        """Build Notion page content blocks from metadata.

        Produces a fixed sequence of sections: status callout, meta-tag
        table, headings, structured data, Open Graph, links, images,
        optional hreflang list, and an issues/warnings checklist.
        """
        children = []
        # Status summary callout
        # NOTE(review): the "" branches below look like emoji glyphs stripped
        # in transit; the Notion API rejects an empty "emoji" icon — confirm
        # against the original source before relying on this payload.
        status_emoji = "" if not metadata.issues else "⚠️" if len(metadata.issues) < 3 else ""
        children.append({
            "object": "block",
            "type": "callout",
            "callout": {
                "rich_text": [
                    {"type": "text", "text": {"content": f"Status: {metadata.status_code} | "}},
                    {"type": "text", "text": {"content": f"Response: {metadata.response_time_ms:.0f}ms | "}},
                    {"type": "text", "text": {"content": f"Issues: {len(metadata.issues)} | "}},
                    {"type": "text", "text": {"content": f"Warnings: {len(metadata.warnings)}"}},
                ],
                "icon": {"type": "emoji", "emoji": status_emoji},
                "color": "gray_background" if not metadata.issues else "yellow_background" if len(metadata.issues) < 3 else "red_background",
            }
        })
        # Meta Tags Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Meta Tags"}}]}
        })
        # Meta tags table: header row + one row per tag; values truncated to
        # 50 chars to keep cells compact
        meta_rows = [
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Tag"}, "annotations": {"bold": True}}],
                [{"type": "text", "text": {"content": "Value"}, "annotations": {"bold": True}}],
                [{"type": "text", "text": {"content": "Status"}, "annotations": {"bold": True}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Title"}}],
                [{"type": "text", "text": {"content": (metadata.title or "")[:50]}}],
                [{"type": "text", "text": {"content": f"{metadata.title_length} chars" if metadata.title else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Description"}}],
                [{"type": "text", "text": {"content": (metadata.meta_description or "")[:50]}}],
                [{"type": "text", "text": {"content": f"{metadata.meta_description_length} chars" if metadata.meta_description else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Canonical"}}],
                [{"type": "text", "text": {"content": (metadata.canonical_url or "")[:50]}}],
                [{"type": "text", "text": {"content": "" if metadata.canonical_url else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Robots"}}],
                [{"type": "text", "text": {"content": metadata.robots_meta or ""}}],
                [{"type": "text", "text": {"content": "" if metadata.robots_meta else ""}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Lang"}}],
                [{"type": "text", "text": {"content": metadata.html_lang or ""}}],
                [{"type": "text", "text": {"content": "" if metadata.html_lang else ""}}],
            ]}},
        ]
        children.append({
            "object": "block",
            "type": "table",
            "table": {
                "table_width": 3,
                "has_column_header": True,
                "has_row_header": False,
                "children": meta_rows
            }
        })
        # Headings Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Headings"}}]}
        })
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"H1: {metadata.h1_count} | "}},
                {"type": "text", "text": {"content": f"Total headings: {len(metadata.headings)}"}},
            ]}
        })
        # Quote the first H1's text (truncated) when present
        if metadata.h1_text:
            children.append({
                "object": "block",
                "type": "quote",
                "quote": {"rich_text": [{"type": "text", "text": {"content": metadata.h1_text[:200]}}]}
            })
        # Schema Data Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Structured Data"}}]}
        })
        if metadata.schema_types_found:
            children.append({
                "object": "block",
                "type": "paragraph",
                "paragraph": {"rich_text": [
                    {"type": "text", "text": {"content": "Schema types found: "}},
                    {"type": "text", "text": {"content": ", ".join(metadata.schema_types_found)}, "annotations": {"code": True}},
                ]}
            })
        else:
            # Missing structured data is surfaced as a warning callout
            children.append({
                "object": "block",
                "type": "callout",
                "callout": {
                    "rich_text": [{"type": "text", "text": {"content": "No structured data found on this page"}}],
                    "icon": {"type": "emoji", "emoji": "⚠️"},
                    "color": "yellow_background",
                }
            })
        # Open Graph Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Open Graph"}}]}
        })
        # og:title presence alone decides the "Configured" status
        og = metadata.open_graph
        og_status = "✓ Configured" if og.og_title else "✗ Missing"
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Status: {og_status}\n"}},
                {"type": "text", "text": {"content": f"og:title: {og.og_title or ''}\n"}},
                {"type": "text", "text": {"content": f"og:type: {og.og_type or ''}"}},
            ]}
        })
        # Links Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Links"}}]}
        })
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Internal links: {metadata.internal_link_count}\n"}},
                {"type": "text", "text": {"content": f"External links: {metadata.external_link_count}"}},
            ]}
        })
        # Images Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Images"}}]}
        })
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Total: {metadata.images_total} | "}},
                {"type": "text", "text": {"content": f"With alt: {metadata.images_with_alt} | "}},
                {"type": "text", "text": {"content": f"Without alt: {metadata.images_without_alt}"}},
            ]}
        })
        # Hreflang Section (if present) — capped at 10 entries
        if metadata.hreflang_tags:
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Hreflang Tags"}}]}
            })
            for tag in metadata.hreflang_tags[:10]:
                children.append({
                    "object": "block",
                    "type": "bulleted_list_item",
                    "bulleted_list_item": {"rich_text": [
                        {"type": "text", "text": {"content": f"{tag['lang']}: "}},
                        {"type": "text", "text": {"content": tag['url'], "link": {"url": tag['url']}}},
                    ]}
                })
        # Issues & Warnings Section — rendered as an actionable to-do list
        if metadata.issues or metadata.warnings:
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Issues & Warnings"}}]}
            })
            for issue in metadata.issues:
                children.append({
                    "object": "block",
                    "type": "to_do",
                    "to_do": {
                        "rich_text": [
                            # NOTE(review): empty bold prefix — probably a
                            # stripped marker emoji; confirm original glyph.
                            {"type": "text", "text": {"content": ""}, "annotations": {"bold": True}},
                            {"type": "text", "text": {"content": issue}},
                        ],
                        "checked": False,
                    }
                })
            for warning in metadata.warnings:
                children.append({
                    "object": "block",
                    "type": "to_do",
                    "to_do": {
                        "rich_text": [
                            {"type": "text", "text": {"content": "⚠️ "}, "annotations": {"bold": True}},
                            {"type": "text", "text": {"content": warning}},
                        ],
                        "checked": False,
                    }
                })
        return children

    def _create_crawl_summary_page(self, result: CrawlResult) -> str | None:
        """Create a summary page for the crawl.

        Aggregates issue/warning statistics across all analyzed pages into a
        single Notion page. Returns the page ID, or None on failure (logged,
        never raised).
        """
        try:
            site_domain = urlparse(result.site).netloc
            # Calculate statistics across all analyzed pages
            total_issues = sum(len(p.issues) for p in result.pages_analyzed)
            total_warnings = sum(len(p.warnings) for p in result.pages_analyzed)
            pages_with_issues = sum(1 for p in result.pages_analyzed if p.issues)
            pages_without_schema = sum(1 for p in result.pages_analyzed if not p.schema_types_found)
            pages_without_description = sum(1 for p in result.pages_analyzed if not p.meta_description)
            children = []
            # Header callout
            children.append({
                "object": "block",
                "type": "callout",
                "callout": {
                    "rich_text": [
                        {"type": "text", "text": {"content": f"Sitemap Crawl Complete\n\n"}},
                        {"type": "text", "text": {"content": f"Audit ID: {result.audit_id}\n"}},
                        {"type": "text", "text": {"content": f"Duration: {result.get_duration()}\n"}},
                        {"type": "text", "text": {"content": f"Pages: {result.successful_pages}/{result.total_pages}"}},
                    ],
                    "icon": {"type": "emoji", "emoji": "📊"},
                    "color": "blue_background",
                }
            })
            # Statistics table
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Statistics"}}]}
            })
            stats_rows = [
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Metric"}, "annotations": {"bold": True}}],
                    [{"type": "text", "text": {"content": "Count"}, "annotations": {"bold": True}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Pages"}}],
                    [{"type": "text", "text": {"content": str(result.total_pages)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Successfully Analyzed"}}],
                    [{"type": "text", "text": {"content": str(result.successful_pages)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages with Issues"}}],
                    [{"type": "text", "text": {"content": str(pages_with_issues)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Issues"}}],
                    [{"type": "text", "text": {"content": str(total_issues)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Warnings"}}],
                    [{"type": "text", "text": {"content": str(total_warnings)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages without Schema"}}],
                    [{"type": "text", "text": {"content": str(pages_without_schema)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages without Description"}}],
                    [{"type": "text", "text": {"content": str(pages_without_description)}}],
                ]}},
            ]
            children.append({
                "object": "block",
                "type": "table",
                "table": {
                    "table_width": 2,
                    "has_column_header": True,
                    "has_row_header": False,
                    "children": stats_rows
                }
            })
            # Pages list — points readers at the database filter rather than
            # duplicating every page entry here
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Analyzed Pages"}}]}
            })
            children.append({
                "object": "block",
                "type": "paragraph",
                "paragraph": {"rich_text": [
                    {"type": "text", "text": {"content": f"Filter by Audit ID in the database to see all {result.successful_pages} page entries."}}
                ]}
            })
            # Create the summary page
            response = self.notion.pages.create(
                parent={"database_id": self.database_id},
                properties={
                    "Issue": {"title": [{"text": {"content": f"📊 Sitemap Crawl: {site_domain}"}}]},
                    "Category": {"select": {"name": "Technical SEO"}},
                    "Priority": {"select": {"name": "High"}},
                    "Site": {"url": result.site},
                    "Audit ID": {"rich_text": [{"text": {"content": result.audit_id}}]},
                    "Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
                },
                children=children,
            )
            logger.info(f"Created crawl summary page: {response['id']}")
            return response["id"]
        except Exception as e:
            logger.error(f"Failed to create summary page: {e}")
            return None
def print_progress_status(progress: CrawlProgress) -> None:
    """Print a human-readable status report for one crawl to stdout."""
    # NOTE: some status icons appear as empty strings in the mapping below;
    # preserved as-is to keep console output unchanged.
    icon_by_status = {
        "running": "🔄",
        "completed": "",
        "failed": "",
    }
    icon = icon_by_status.get(progress.status, "")
    bar = "=" * 60
    eta = progress.get_eta() if progress.status == "running" else "N/A"
    shown_url = progress.current_url
    if len(shown_url) > 60:
        shown_url = shown_url[:60] + "..."
    print(f"""
{bar}
{icon} SEO Page Analysis - {progress.status.upper()}
{bar}
Audit ID: {progress.audit_id}
Site: {progress.site}
Status: {progress.status}
Progress: {progress.processed_urls}/{progress.total_urls} pages ({progress.get_progress_percent():.1f}%)
Successful: {progress.successful_urls}
Failed: {progress.failed_urls}
Elapsed: {progress.get_elapsed_time()}
ETA: {eta}
Current URL: {shown_url}
""")
    if progress.summary_page_id:
        print(f"Summary: https://www.notion.so/{progress.summary_page_id.replace('-', '')}")
    if progress.error_message:
        print(f"Error: {progress.error_message}")
    print(bar)
def main():
    """CLI entry point.

    Subcommands:
        crawl <sitemap_url>  -- start a crawl
        status [audit_id]    -- show progress (all active crawls if no ID)
        list                 -- list all recorded crawl jobs

    A bare URL argument with no subcommand is treated as `crawl <url>`.
    """
    import argparse
    parser = argparse.ArgumentParser(description="Sitemap Crawler with Background Support")
    subparsers = parser.add_subparsers(dest="command", help="Commands")
    # Crawl command
    crawl_parser = subparsers.add_parser("crawl", help="Start crawling a sitemap")
    crawl_parser.add_argument("sitemap_url", help="URL of the sitemap to crawl")
    crawl_parser.add_argument("--delay", "-d", type=float, default=DEFAULT_DELAY_SECONDS,
                              help=f"Delay between requests in seconds (default: {DEFAULT_DELAY_SECONDS})")
    crawl_parser.add_argument("--max-pages", "-m", type=int, default=DEFAULT_MAX_PAGES,
                              help=f"Maximum pages to process (default: {DEFAULT_MAX_PAGES})")
    crawl_parser.add_argument("--no-notion", action="store_true",
                              help="Don't save to Notion")
    crawl_parser.add_argument("--no-limit", action="store_true",
                              help="Remove page limit (use with caution)")
    # Status command
    status_parser = subparsers.add_parser("status", help="Check crawl progress")
    status_parser.add_argument("audit_id", nargs="?", help="Specific audit ID to check (optional)")
    status_parser.add_argument("--all", "-a", action="store_true", help="Show all crawls (not just active)")
    # List command
    list_parser = subparsers.add_parser("list", help="List all crawl jobs")
    args = parser.parse_args()
    # Default to crawl if no command specified but URL provided
    if args.command is None:
        # Check if first positional arg looks like a URL
        import sys
        if len(sys.argv) > 1 and (sys.argv[1].startswith("http") or sys.argv[1].endswith(".xml")):
            # Synthesize the namespace attributes argparse would have set
            # for the `crawl` subcommand, using defaults for the options.
            args.command = "crawl"
            args.sitemap_url = sys.argv[1]
            args.delay = DEFAULT_DELAY_SECONDS
            args.max_pages = DEFAULT_MAX_PAGES
            args.no_notion = False
            args.no_limit = False
        else:
            parser.print_help()
            return
    if args.command == "status":
        if args.audit_id:
            # Show specific crawl status
            progress = get_crawl_status(args.audit_id)
            if progress:
                print_progress_status(progress)
            else:
                print(f"No crawl found with audit ID: {args.audit_id}")
        else:
            # Show active crawls (or all, with --all)
            if args.all:
                crawls = get_all_crawls()
                label = "All"
            else:
                crawls = get_active_crawls()
                label = "Active"
            if crawls:
                print(f"\n{label} Crawl Jobs ({len(crawls)}):")
                print("-" * 60)
                for p in crawls:
                    # NOTE(review): empty strings here look like stripped
                    # status emojis — confirm against the original source.
                    status_emoji = {"running": "🔄", "completed": "", "failed": ""}.get(p.status, "")
                    print(f"{status_emoji} {p.audit_id}")
                    print(f"  Site: {p.site}")
                    print(f"  Progress: {p.processed_urls}/{p.total_urls} ({p.get_progress_percent():.1f}%)")
                    print()
            else:
                print(f"No {label.lower()} crawl jobs found.")
        return
    if args.command == "list":
        crawls = get_all_crawls()
        if crawls:
            print(f"\nAll Crawl Jobs ({len(crawls)}):")
            print("-" * 80)
            print(f"{'Status':<10} {'Audit ID':<45} {'Progress':<15}")
            print("-" * 80)
            for p in crawls[:20]:  # Show last 20
                status_emoji = {"running": "🔄", "completed": "", "failed": ""}.get(p.status, "")
                progress_str = f"{p.processed_urls}/{p.total_urls}"
                print(f"{status_emoji} {p.status:<7} {p.audit_id:<45} {progress_str:<15}")
            if len(crawls) > 20:
                print(f"... and {len(crawls) - 20} more")
        else:
            print("No crawl jobs found.")
        return
    if args.command == "crawl":
        # Handle --no-limit option
        max_pages = args.max_pages
        if args.no_limit:
            max_pages = 999999  # Effectively unlimited
            print("⚠️ WARNING: Page limit disabled. This may take a very long time!")
        def progress_callback(progress: CrawlProgress):
            # Single-line, in-place progress display (carriage-return rewrite)
            pct = progress.get_progress_percent()
            print(f"\r[{pct:5.1f}%] {progress.processed_urls}/{progress.total_urls} pages | "
                  f"Success: {progress.successful_urls} | Failed: {progress.failed_urls} | "
                  f"ETA: {progress.get_eta()}", end="", flush=True)
        crawler = SitemapCrawler()
        result = crawler.crawl_sitemap(
            args.sitemap_url,
            delay=args.delay,
            max_pages=max_pages,
            progress_callback=progress_callback,
            save_to_notion=not args.no_notion,
        )
        print()  # New line after progress
        print()
        print("=" * 60)
        print("CRAWL COMPLETE")
        print("=" * 60)
        print(f"Audit ID: {result.audit_id}")
        print(f"Total Pages: {result.total_pages}")
        print(f"Successful: {result.successful_pages}")
        print(f"Failed: {result.failed_pages}")
        print(f"Duration: {result.get_duration()}")
        if result.summary_page_id:
            print(f"Summary Page: https://www.notion.so/{result.summary_page_id.replace('-', '')}")
# Script entry point: `python sitemap_crawler.py crawl <sitemap-url>` etc.
if __name__ == "__main__":
    main()