"""
Sitemap Crawler - Sequential page analysis from sitemap
=======================================================
Purpose: Crawl sitemap URLs one by one, analyze each page, save to Notion
Python: 3.10+

Usage:
    from sitemap_crawler import SitemapCrawler
    crawler = SitemapCrawler()
    crawler.crawl_sitemap("https://example.com/sitemap.xml", delay=2.0)
"""

import argparse
import gzip
import json
import logging
import sys
import time
import xml.etree.ElementTree as ET
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Callable, Generator
from urllib.parse import urlparse

import requests
from notion_client import Client

from base_client import config
from page_analyzer import PageAnalyzer, PageMetadata

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Default database for page analysis data
DEFAULT_PAGES_DATABASE_ID = "2c8581e5-8a1e-8035-880b-e38cefc2f3ef"

# Default limits to prevent excessive resource usage
DEFAULT_MAX_PAGES = 500
DEFAULT_DELAY_SECONDS = 2.0

# Progress tracking directory
PROGRESS_DIR = Path.home() / ".claude" / "seo-audit-progress"
PROGRESS_DIR.mkdir(parents=True, exist_ok=True)

# Notion request limits (per Notion's published "Request limits" page).
# NOTE(review): Notion may count characters differently from Python code
# points for astral characters - confirm if emoji-heavy text is expected.
NOTION_TEXT_LIMIT = 2000    # max characters in one rich-text content string
NOTION_MAX_CHILDREN = 100   # max blocks in one `children` array

SITEMAP_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
_GZIP_MAGIC = b"\x1f\x8b"
_STATUS_EMOJI = {"running": "🔄", "completed": "✅", "failed": "❌"}


def _format_duration(total_seconds: float) -> str:
    """Render a number of seconds as ``"<m>m <s>s"`` (negative clamps to 0)."""
    minutes, seconds = divmod(int(max(total_seconds, 0)), 60)
    return f"{minutes}m {seconds}s"


@dataclass
class CrawlProgress:
    """Track crawl progress and persist it as JSON under PROGRESS_DIR."""

    total_urls: int = 0
    processed_urls: int = 0
    successful_urls: int = 0
    failed_urls: int = 0
    skipped_urls: int = 0
    start_time: datetime = field(default_factory=datetime.now)
    current_url: str = ""
    audit_id: str = ""
    site: str = ""
    status: str = "running"  # running, completed, failed
    error_message: str = ""
    summary_page_id: str = ""
    # Set when the crawl finishes or fails, so that elapsed time stops
    # growing once the job is over (including after reload from disk).
    end_time: datetime | None = None

    def get_progress_percent(self) -> float:
        """Return processed/total as a percentage (0.0 when total is 0)."""
        if self.total_urls == 0:
            return 0.0
        return (self.processed_urls / self.total_urls) * 100

    def get_elapsed_time(self) -> str:
        """Return time from start to end_time (or to now if still running)."""
        reference = self.end_time or datetime.now()
        return _format_duration((reference - self.start_time).total_seconds())

    def get_eta(self) -> str:
        """Estimate remaining time from the average time per processed URL."""
        if self.processed_urls == 0:
            return "calculating..."
        elapsed = (datetime.now() - self.start_time).total_seconds()
        per_url = elapsed / self.processed_urls
        remaining = self.total_urls - self.processed_urls
        return _format_duration(remaining * per_url)

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization."""
        return {
            "audit_id": self.audit_id,
            "site": self.site,
            "status": self.status,
            "total_urls": self.total_urls,
            "processed_urls": self.processed_urls,
            "successful_urls": self.successful_urls,
            "failed_urls": self.failed_urls,
            "skipped_urls": self.skipped_urls,
            "progress_percent": round(self.get_progress_percent(), 1),
            "elapsed_time": self.get_elapsed_time(),
            "eta": self.get_eta(),
            "current_url": self.current_url,
            "start_time": self.start_time.isoformat(),
            "end_time": self.end_time.isoformat() if self.end_time else None,
            "error_message": self.error_message,
            "summary_page_id": self.summary_page_id,
            "updated_at": datetime.now().isoformat(),
        }

    def save_to_file(self, filepath: Path | None = None) -> Path:
        """Save progress to JSON file (default: PROGRESS_DIR/<audit_id>.json)."""
        if filepath is None:
            filepath = PROGRESS_DIR / f"{self.audit_id}.json"
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, indent=2)
        return filepath

    @classmethod
    def load_from_file(cls, filepath: Path) -> "CrawlProgress":
        """Load progress from JSON file.

        Raises:
            OSError: the file cannot be read.
            ValueError: the file is not valid JSON or holds a bad timestamp.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)

        progress = cls()
        progress.audit_id = data.get("audit_id", "")
        progress.site = data.get("site", "")
        progress.status = data.get("status", "unknown")
        progress.total_urls = data.get("total_urls", 0)
        progress.processed_urls = data.get("processed_urls", 0)
        progress.successful_urls = data.get("successful_urls", 0)
        progress.failed_urls = data.get("failed_urls", 0)
        progress.skipped_urls = data.get("skipped_urls", 0)
        progress.current_url = data.get("current_url", "")
        progress.error_message = data.get("error_message", "")
        # Older files may hold an explicit null here; normalise to "".
        progress.summary_page_id = data.get("summary_page_id") or ""
        if data.get("start_time"):
            progress.start_time = datetime.fromisoformat(data["start_time"])
        if data.get("end_time"):
            progress.end_time = datetime.fromisoformat(data["end_time"])
        return progress


def _try_load_progress(filepath: Path) -> CrawlProgress | None:
    """Load one progress file; log and return None if it is unreadable."""
    try:
        return CrawlProgress.load_from_file(filepath)
    except (OSError, ValueError, TypeError, AttributeError) as exc:
        # A file can be half-written while a crawl is saving, or corrupt.
        logger.warning("Skipping unreadable progress file %s: %s", filepath, exc)
        return None


def get_active_crawls() -> list[CrawlProgress]:
    """Get all active (running) crawl jobs."""
    loaded = (_try_load_progress(p) for p in PROGRESS_DIR.glob("*.json"))
    return [p for p in loaded if p is not None and p.status == "running"]


def get_all_crawls() -> list[CrawlProgress]:
    """Get all crawl jobs (active and completed), by file name descending."""
    paths = sorted(PROGRESS_DIR.glob("*.json"), reverse=True)
    loaded = (_try_load_progress(p) for p in paths)
    return [p for p in loaded if p is not None]


def get_crawl_status(audit_id: str) -> CrawlProgress | None:
    """Get status of a specific crawl by audit ID.

    Returns None when no progress file exists for the ID or when the file
    cannot be read (a warning is logged in the latter case).
    """
    filepath = PROGRESS_DIR / f"{audit_id}.json"
    if filepath.exists():
        return _try_load_progress(filepath)
    return None


@dataclass
class CrawlResult:
    """Result of a complete sitemap crawl."""

    site: str
    sitemap_url: str
    audit_id: str
    total_pages: int
    successful_pages: int
    failed_pages: int
    start_time: datetime
    end_time: datetime
    pages_analyzed: list[PageMetadata] = field(default_factory=list)
    notion_page_ids: list[str] = field(default_factory=list)
    summary_page_id: str | None = None

    def get_duration(self) -> str:
        """Return end_time - start_time formatted as ``"<m>m <s>s"``."""
        return _format_duration((self.end_time - self.start_time).total_seconds())


# --- Notion block builders -------------------------------------------------

def _text(content: str, *, link: str | None = None, **annotations: bool) -> dict:
    """Build one Notion rich-text item, truncated to NOTION_TEXT_LIMIT."""
    item: dict = {"type": "text", "text": {"content": content[:NOTION_TEXT_LIMIT]}}
    if link is not None:
        item["text"]["link"] = {"url": link}
    if annotations:
        item["annotations"] = annotations
    return item


def _block(block_type: str, payload: dict) -> dict:
    """Wrap a payload as a Notion block object of the given type."""
    return {"object": "block", "type": block_type, block_type: payload}


def _heading(title: str) -> dict:
    """Build a heading_2 block."""
    return _block("heading_2", {"rich_text": [_text(title)]})


def _paragraph(*parts: str) -> dict:
    """Build a paragraph block with one rich-text item per string part."""
    return _block("paragraph", {"rich_text": [_text(part) for part in parts]})


def _callout(parts: list[str], emoji: str, color: str) -> dict:
    """Build a callout block with an emoji icon and background color."""
    return _block("callout", {
        "rich_text": [_text(part) for part in parts],
        "icon": {"type": "emoji", "emoji": emoji},
        "color": color,
    })


def _todo(marker: str, message: str) -> dict:
    """Build an unchecked to_do block prefixed with a bold marker."""
    return _block("to_do", {
        "rich_text": [_text(marker, bold=True), _text(message)],
        "checked": False,
    })


def _table(header: list[str], rows: list[list[str]]) -> dict:
    """Build a table block with a bold header row followed by plain rows."""
    def as_row(cells: list[dict]) -> dict:
        return {"type": "table_row", "table_row": {"cells": [[c] for c in cells]}}

    table_rows = [as_row([_text(title, bold=True) for title in header])]
    table_rows.extend(as_row([_text(value) for value in row]) for row in rows)
    return _block("table", {
        "table_width": len(header),
        "has_column_header": True,
        "has_row_header": False,
        "children": table_rows,
    })


def _loc_texts(root: ET.Element, parent_tag: str) -> list[str]:
    """Return stripped, non-empty <loc> texts under <parent_tag> elements.

    Tries the sitemaps.org namespace first, then un-namespaced tags.
    """
    nodes = root.findall(f".//sm:{parent_tag}/sm:loc", SITEMAP_NS)
    if not nodes:
        nodes = root.findall(f".//{parent_tag}/loc")
    # <loc> values are frequently wrapped in whitespace/newlines.
    return [text for node in nodes if (text := (node.text or "").strip())]


class SitemapCrawler:
    """Crawl sitemap URLs and analyze each page."""

    def __init__(
        self,
        notion_token: str | None = None,
        database_id: str | None = None,
    ):
        """
        Initialize sitemap crawler.

        Args:
            notion_token: Notion API token (falls back to config.notion_token)
            database_id: Notion database ID for storing results
        """
        self.notion_token = notion_token or config.notion_token
        self.database_id = database_id or DEFAULT_PAGES_DATABASE_ID
        self.analyzer = PageAnalyzer()

        if self.notion_token:
            self.notion = Client(auth=self.notion_token)
        else:
            self.notion = None
            logger.warning("Notion token not configured, results will not be saved")

    def fetch_sitemap_urls(self, sitemap_url: str) -> list[str]:
        """
        Fetch and parse URLs from a sitemap (following sitemap indexes).

        Args:
            sitemap_url: URL of the sitemap

        Returns:
            List of unique URLs found, in first-seen order

        Raises:
            Exception: any fetch or parse error is logged and re-raised.
        """
        try:
            urls = self._collect_sitemap_urls(sitemap_url, set())
        except Exception as e:
            logger.error(f"Failed to fetch sitemap: {e}")
            raise

        # dict preserves insertion order, so this de-duplicates stably.
        unique_urls = list(dict.fromkeys(urls))
        logger.info(f"Found {len(unique_urls)} unique URLs in sitemap")
        return unique_urls

    def _collect_sitemap_urls(self, sitemap_url: str, visited: set[str]) -> list[str]:
        """Recursively gather page URLs; `visited` guards against index cycles."""
        if sitemap_url in visited:
            logger.warning(f"Skipping already-visited sitemap: {sitemap_url}")
            return []
        visited.add(sitemap_url)

        response = requests.get(sitemap_url, timeout=30)
        response.raise_for_status()

        payload = response.content
        if payload[:2] == _GZIP_MAGIC:
            # e.g. sitemap.xml.gz served as a plain file
            payload = gzip.decompress(payload)

        # NOTE(review): ElementTree is not hardened against malicious XML
        # (entity expansion); consider defusedxml for untrusted sitemaps.
        root = ET.fromstring(payload)

        child_sitemaps = _loc_texts(root, "sitemap")
        if not child_sitemaps:
            return _loc_texts(root, "url")

        logger.info(f"Found sitemap index with {len(child_sitemaps)} child sitemaps")
        urls: list[str] = []
        for child in child_sitemaps:
            urls.extend(self._collect_sitemap_urls(child, visited))
        return urls

    def crawl_sitemap(
        self,
        sitemap_url: str,
        delay: float = DEFAULT_DELAY_SECONDS,
        max_pages: int = DEFAULT_MAX_PAGES,
        progress_callback: Callable[[CrawlProgress], None] | None = None,
        save_to_notion: bool = True,
        url_filter: Callable[[str], bool] | None = None,
    ) -> CrawlResult:
        """
        Crawl all URLs in a sitemap sequentially.

        Args:
            sitemap_url: URL of the sitemap
            delay: Seconds to wait between requests (default: 2.0s)
            max_pages: Maximum number of pages to process (default: 500)
            progress_callback: Function called with progress updates
            save_to_notion: Whether to save results to Notion
            url_filter: Optional function to filter URLs (return True to include)

        Returns:
            CrawlResult with all analyzed pages

        Raises:
            Exception: errors from fetching the sitemap, saving progress or
                the callback propagate after the progress file is marked
                "failed". Per-page analysis errors are counted, not raised.
        """
        parsed_sitemap = urlparse(sitemap_url)
        site = f"{parsed_sitemap.scheme}://{parsed_sitemap.netloc}"
        site_domain = parsed_sitemap.netloc

        audit_id = f"{site_domain}-pages-{datetime.now().strftime('%Y%m%d-%H%M%S')}"

        logger.info(f"Starting sitemap crawl: {sitemap_url}")
        logger.info(f"Audit ID: {audit_id}")
        logger.info(f"Delay between requests: {delay}s")

        progress = CrawlProgress(audit_id=audit_id, site=site, status="running")

        try:
            urls = self.fetch_sitemap_urls(sitemap_url)

            if url_filter:
                urls = [url for url in urls if url_filter(url)]
                logger.info(f"After filtering: {len(urls)} URLs")

            # Apply max pages limit (default: 500 to prevent excessive resource usage)
            if len(urls) > max_pages:
                logger.warning(f"Sitemap has {len(urls)} URLs, limiting to {max_pages} pages")
                logger.warning("Use max_pages parameter to adjust this limit")
                urls = urls[:max_pages]

            logger.info(f"Processing {len(urls)} pages (max: {max_pages})")

            progress.total_urls = len(urls)
            progress.save_to_file()

            result = CrawlResult(
                site=site,
                sitemap_url=sitemap_url,
                audit_id=audit_id,
                total_pages=len(urls),
                successful_pages=0,
                failed_pages=0,
                start_time=datetime.now(),
                end_time=datetime.now(),
            )

            last_index = len(urls) - 1
            for i, url in enumerate(urls):
                progress.current_url = url
                progress.processed_urls = i
                progress.save_to_file()

                if progress_callback:
                    progress_callback(progress)

                logger.info(f"[{i+1}/{len(urls)}] Analyzing: {url}")

                try:
                    metadata = self.analyzer.analyze_url(url)
                    result.pages_analyzed.append(metadata)

                    if metadata.status_code == 200:
                        progress.successful_urls += 1
                        result.successful_pages += 1

                        if save_to_notion and self.notion:
                            page_id = self._save_page_to_notion(metadata, audit_id, site)
                            if page_id:
                                result.notion_page_ids.append(page_id)
                    else:
                        progress.failed_urls += 1
                        result.failed_pages += 1

                except Exception as e:
                    # One bad page must not abort the whole crawl.
                    logger.error(f"Failed to analyze {url}: {e}")
                    progress.failed_urls += 1
                    result.failed_pages += 1

                # Don't wait after the last URL; time.sleep rejects negatives.
                if i < last_index and delay > 0:
                    time.sleep(delay)

            progress.processed_urls = len(urls)
            progress.status = "completed"
            progress.end_time = datetime.now()
            if progress_callback:
                progress_callback(progress)

        except BaseException as e:
            # BaseException on purpose: Ctrl-C / SystemExit would otherwise
            # leave the progress file stuck at "running". Always re-raised.
            progress.status = "failed"
            progress.error_message = str(e) or type(e).__name__
            progress.end_time = datetime.now()
            progress.save_to_file()
            raise

        result.end_time = datetime.now()

        if save_to_notion and self.notion:
            summary_id = self._create_crawl_summary_page(result)
            result.summary_page_id = summary_id
            progress.summary_page_id = summary_id or ""

        progress.save_to_file()

        logger.info(f"Crawl complete: {result.successful_pages}/{result.total_pages} pages analyzed")
        logger.info(f"Duration: {result.get_duration()}")

        return result

    def _save_page_to_notion(
        self,
        metadata: PageMetadata,
        audit_id: str,
        site: str,
    ) -> str | None:
        """Save page metadata to Notion database.

        Returns the created page ID, or None if anything fails (the error is
        logged, never raised).
        """
        try:
            title = f"📄 {metadata.url}"[:NOTION_TEXT_LIMIT]
            properties = {
                "Issue": {"title": [{"text": {"content": title}}]},
                "Category": {"select": {"name": "On-page SEO"}},
                "Priority": {"select": {"name": self._determine_priority(metadata)}},
                "Site": {"url": site},
                "URL": {"url": metadata.url},
                "Audit ID": {"rich_text": [{"text": {"content": audit_id}}]},
                "Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
            }

            children = self._build_page_content(metadata)

            response = self.notion.pages.create(
                parent={"database_id": self.database_id},
                properties=properties,
                children=children,
            )
            return response["id"]

        except Exception as e:
            logger.error(f"Failed to save to Notion: {e}")
            return None

    def _determine_priority(self, metadata: PageMetadata) -> str:
        """Determine priority based on issues found.

        High: 3+ issues. Medium: 1-2 issues, or 3+ warnings. Low: otherwise.
        """
        issue_count = len(metadata.issues)
        if issue_count >= 3:
            return "High"
        if issue_count >= 1 or len(metadata.warnings) >= 3:
            return "Medium"
        return "Low"

    def _build_page_content(self, metadata: PageMetadata) -> list[dict]:
        """Build Notion page content blocks from metadata.

        The result is capped at NOTION_MAX_CHILDREN blocks; overflow is
        replaced by a single trailing note.
        """
        issue_count = len(metadata.issues)
        if issue_count == 0:
            status_emoji, status_color = "✅", "gray_background"
        elif issue_count < 3:
            status_emoji, status_color = "⚠️", "yellow_background"
        else:
            status_emoji, status_color = "❌", "red_background"

        children = [
            # Status summary callout
            _callout(
                [
                    f"Status: {metadata.status_code} | ",
                    f"Response: {metadata.response_time_ms:.0f}ms | ",
                    f"Issues: {issue_count} | ",
                    f"Warnings: {len(metadata.warnings)}",
                ],
                status_emoji,
                status_color,
            ),
            # Meta Tags Section
            _heading("Meta Tags"),
            _table(
                ["Tag", "Value", "Status"],
                [
                    [
                        "Title",
                        (metadata.title or "—")[:50],
                        f"✓ {metadata.title_length} chars" if metadata.title else "✗ Missing",
                    ],
                    [
                        "Description",
                        (metadata.meta_description or "—")[:50],
                        f"✓ {metadata.meta_description_length} chars"
                        if metadata.meta_description else "✗ Missing",
                    ],
                    [
                        "Canonical",
                        (metadata.canonical_url or "—")[:50],
                        "✓" if metadata.canonical_url else "✗ Missing",
                    ],
                    [
                        "Robots",
                        metadata.robots_meta or "—",
                        "✓" if metadata.robots_meta else "—",
                    ],
                    [
                        "Lang",
                        metadata.html_lang or "—",
                        "✓" if metadata.html_lang else "—",
                    ],
                ],
            ),
            # Headings Section
            _heading("Headings"),
            _paragraph(
                f"H1: {metadata.h1_count} | ",
                f"Total headings: {len(metadata.headings)}",
            ),
        ]

        if metadata.h1_text:
            children.append(_block("quote", {"rich_text": [_text(metadata.h1_text[:200])]}))

        # Schema Data Section
        children.append(_heading("Structured Data"))
        if metadata.schema_types_found:
            children.append(_block("paragraph", {"rich_text": [
                _text("Schema types found: "),
                _text(", ".join(metadata.schema_types_found), code=True),
            ]}))
        else:
            children.append(_callout(
                ["No structured data found on this page"], "⚠️", "yellow_background",
            ))

        # Open Graph Section
        og = metadata.open_graph
        og_status = "✓ Configured" if og.og_title else "✗ Missing"
        children.append(_heading("Open Graph"))
        children.append(_paragraph(
            f"Status: {og_status}\n",
            f"og:title: {og.og_title or '—'}\n",
            f"og:type: {og.og_type or '—'}",
        ))

        # Links Section
        children.append(_heading("Links"))
        children.append(_paragraph(
            f"Internal links: {metadata.internal_link_count}\n",
            f"External links: {metadata.external_link_count}",
        ))

        # Images Section
        children.append(_heading("Images"))
        children.append(_paragraph(
            f"Total: {metadata.images_total} | ",
            f"With alt: {metadata.images_with_alt} | ",
            f"Without alt: {metadata.images_without_alt}",
        ))

        # Hreflang Section (if present); only the first 10 tags are listed
        if metadata.hreflang_tags:
            children.append(_heading("Hreflang Tags"))
            for tag in metadata.hreflang_tags[:10]:
                children.append(_block("bulleted_list_item", {"rich_text": [
                    _text(f"{tag['lang']}: "),
                    _text(tag["url"], link=tag["url"]),
                ]}))

        # Issues & Warnings Section
        if metadata.issues or metadata.warnings:
            children.append(_heading("Issues & Warnings"))
            children.extend(_todo("❌ ", issue) for issue in metadata.issues)
            children.extend(_todo("⚠️ ", warning) for warning in metadata.warnings)

        # Notion rejects a create request with too many children; keep the
        # first blocks and say how many were dropped.
        if len(children) > NOTION_MAX_CHILDREN:
            kept = NOTION_MAX_CHILDREN - 1
            omitted = len(children) - kept
            children = children[:kept]
            children.append(_paragraph(f"… {omitted} more items omitted (Notion block limit)"))

        return children

    def _create_crawl_summary_page(self, result: CrawlResult) -> str | None:
        """Create a summary page for the crawl.

        Returns the created page ID, or None if anything fails (the error is
        logged, never raised).
        """
        try:
            site_domain = urlparse(result.site).netloc
            pages = result.pages_analyzed

            # Calculate statistics
            total_issues = sum(len(p.issues) for p in pages)
            total_warnings = sum(len(p.warnings) for p in pages)
            pages_with_issues = sum(1 for p in pages if p.issues)
            pages_without_schema = sum(1 for p in pages if not p.schema_types_found)
            pages_without_description = sum(1 for p in pages if not p.meta_description)

            children = [
                # Header callout
                _callout(
                    [
                        "Sitemap Crawl Complete\n\n",
                        f"Audit ID: {result.audit_id}\n",
                        f"Duration: {result.get_duration()}\n",
                        f"Pages: {result.successful_pages}/{result.total_pages}",
                    ],
                    "📊",
                    "blue_background",
                ),
                # Statistics table
                _heading("Statistics"),
                _table(
                    ["Metric", "Count"],
                    [
                        ["Total Pages", str(result.total_pages)],
                        ["Successfully Analyzed", str(result.successful_pages)],
                        ["Pages with Issues", str(pages_with_issues)],
                        ["Total Issues", str(total_issues)],
                        ["Total Warnings", str(total_warnings)],
                        ["Pages without Schema", str(pages_without_schema)],
                        ["Pages without Description", str(pages_without_description)],
                    ],
                ),
                # Pages list
                _heading("Analyzed Pages"),
                _paragraph(
                    f"Filter by Audit ID in the database to see all "
                    f"{result.successful_pages} page entries."
                ),
            ]

            response = self.notion.pages.create(
                parent={"database_id": self.database_id},
                properties={
                    "Issue": {"title": [{"text": {"content": f"📊 Sitemap Crawl: {site_domain}"}}]},
                    "Category": {"select": {"name": "Technical SEO"}},
                    "Priority": {"select": {"name": "High"}},
                    "Site": {"url": result.site},
                    "Audit ID": {"rich_text": [{"text": {"content": result.audit_id}}]},
                    "Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
                },
                children=children,
            )

            logger.info(f"Created crawl summary page: {response['id']}")
            return response["id"]

        except Exception as e:
            logger.error(f"Failed to create summary page: {e}")
            return None


def print_progress_status(progress: CrawlProgress) -> None:
    """Print formatted progress status."""
    status_emoji = _STATUS_EMOJI.get(progress.status, "❓")
    rule = "=" * 60

    current = progress.current_url
    if len(current) > 60:
        current = current[:60] + "..."
    eta = progress.get_eta() if progress.status == "running" else "N/A"

    print("\n".join([
        "",
        rule,
        f"{status_emoji} SEO Page Analysis - {progress.status.upper()}",
        rule,
        f"Audit ID: {progress.audit_id}",
        f"Site: {progress.site}",
        f"Status: {progress.status}",
        "",
        f"Progress: {progress.processed_urls}/{progress.total_urls} pages "
        f"({progress.get_progress_percent():.1f}%)",
        f"Successful: {progress.successful_urls}",
        f"Failed: {progress.failed_urls}",
        f"Elapsed: {progress.get_elapsed_time()}",
        f"ETA: {eta}",
        "",
        f"Current URL: {current}",
        "",
    ]))

    if progress.summary_page_id:
        print(f"Summary: https://www.notion.so/{progress.summary_page_id.replace('-', '')}")

    if progress.error_message:
        print(f"Error: {progress.error_message}")

    print(rule)


def main(argv: list[str] | None = None) -> None:
    """CLI entry point.

    Args:
        argv: Argument list without the program name; defaults to
            sys.argv[1:]. A leading URL / *.xml argument is treated as
            shorthand for the ``crawl`` command.
    """
    parser = argparse.ArgumentParser(description="Sitemap Crawler with Background Support")
    subparsers = parser.add_subparsers(dest="command", help="Commands")

    # Crawl command
    crawl_parser = subparsers.add_parser("crawl", help="Start crawling a sitemap")
    crawl_parser.add_argument("sitemap_url", help="URL of the sitemap to crawl")
    crawl_parser.add_argument("--delay", "-d", type=float, default=DEFAULT_DELAY_SECONDS,
                              help=f"Delay between requests in seconds (default: {DEFAULT_DELAY_SECONDS})")
    crawl_parser.add_argument("--max-pages", "-m", type=int, default=DEFAULT_MAX_PAGES,
                              help=f"Maximum pages to process (default: {DEFAULT_MAX_PAGES})")
    crawl_parser.add_argument("--no-notion", action="store_true",
                              help="Don't save to Notion")
    crawl_parser.add_argument("--no-limit", action="store_true",
                              help="Remove page limit (use with caution)")

    # Status command
    status_parser = subparsers.add_parser("status", help="Check crawl progress")
    status_parser.add_argument("audit_id", nargs="?", help="Specific audit ID to check (optional)")
    status_parser.add_argument("--all", "-a", action="store_true", help="Show all crawls (not just active)")

    # List command
    subparsers.add_parser("list", help="List all crawl jobs")

    cli_args = list(sys.argv[1:] if argv is None else argv)
    # Default to crawl when the first argument looks like a sitemap URL.
    # This has to happen BEFORE parsing: argparse rejects an unknown
    # subcommand outright, so it can never be detected afterwards.
    if cli_args and (cli_args[0].startswith("http") or cli_args[0].endswith(".xml")):
        cli_args.insert(0, "crawl")

    args = parser.parse_args(cli_args)

    if args.command is None:
        parser.print_help()
        return

    if args.command == "status":
        if args.audit_id:
            # Show specific crawl status
            progress = get_crawl_status(args.audit_id)
            if progress:
                print_progress_status(progress)
            else:
                print(f"No crawl found with audit ID: {args.audit_id}")
            return

        if args.all:
            crawls, label = get_all_crawls(), "All"
        else:
            crawls, label = get_active_crawls(), "Active"

        if not crawls:
            print(f"No {label.lower()} crawl jobs found.")
            return

        print(f"\n{label} Crawl Jobs ({len(crawls)}):")
        print("-" * 60)
        for p in crawls:
            status_emoji = _STATUS_EMOJI.get(p.status, "❓")
            print(f"{status_emoji} {p.audit_id}")
            print(f"   Site: {p.site}")
            print(f"   Progress: {p.processed_urls}/{p.total_urls} ({p.get_progress_percent():.1f}%)")
            print()
        return

    if args.command == "list":
        crawls = get_all_crawls()
        if not crawls:
            print("No crawl jobs found.")
            return

        print(f"\nAll Crawl Jobs ({len(crawls)}):")
        print("-" * 80)
        print(f"{'Status':<10} {'Audit ID':<45} {'Progress':<15}")
        print("-" * 80)
        for p in crawls[:20]:  # Show the first 20 entries
            status_emoji = _STATUS_EMOJI.get(p.status, "❓")
            progress_str = f"{p.processed_urls}/{p.total_urls}"
            print(f"{status_emoji} {p.status:<7} {p.audit_id:<45} {progress_str:<15}")
        if len(crawls) > 20:
            print(f"... and {len(crawls) - 20} more")
        return

    if args.command == "crawl":
        # Handle --no-limit option
        max_pages = args.max_pages
        if args.no_limit:
            max_pages = 999999  # Effectively unlimited
            print("⚠️ WARNING: Page limit disabled. This may take a very long time!")

        def progress_callback(progress: CrawlProgress) -> None:
            pct = progress.get_progress_percent()
            print(f"\r[{pct:5.1f}%] {progress.processed_urls}/{progress.total_urls} pages | "
                  f"Success: {progress.successful_urls} | Failed: {progress.failed_urls} | "
                  f"ETA: {progress.get_eta()}", end="", flush=True)

        crawler = SitemapCrawler()
        result = crawler.crawl_sitemap(
            args.sitemap_url,
            delay=args.delay,
            max_pages=max_pages,
            progress_callback=progress_callback,
            save_to_notion=not args.no_notion,
        )

        print()  # New line after progress
        print()
        print("=" * 60)
        print("CRAWL COMPLETE")
        print("=" * 60)
        print(f"Audit ID: {result.audit_id}")
        print(f"Total Pages: {result.total_pages}")
        print(f"Successful: {result.successful_pages}")
        print(f"Failed: {result.failed_pages}")
        print(f"Duration: {result.get_duration()}")
        if result.summary_page_id:
            print(f"Summary Page: https://www.notion.so/{result.summary_page_id.replace('-', '')}")


if __name__ == "__main__":
    main()