Renumber 12 existing skills to new ranges: - SEO: 11→13, 12→18, 13→16, 14→17, 15→14, 16→15, 17→29, 18→30, 19→12 - GTM: 20→60, 21→61, 22→62 Update cross-references in gateway architect/builder skills, GTM guardian README, CLAUDE.md (skill tables + directory layout), and AGENTS.md (domain routing ranges). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
970 lines
37 KiB
Python
970 lines
37 KiB
Python
"""
|
|
Sitemap Crawler - Sequential page analysis from sitemap
|
|
=======================================================
|
|
Purpose: Crawl sitemap URLs one by one, analyze each page, save to Notion
|
|
Python: 3.10+
|
|
Usage:
|
|
from sitemap_crawler import SitemapCrawler
|
|
crawler = SitemapCrawler()
|
|
crawler.crawl_sitemap("https://example.com/sitemap.xml", delay=2.0)
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import time
|
|
import xml.etree.ElementTree as ET
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Callable, Generator
|
|
from urllib.parse import urlparse
|
|
|
|
import requests
|
|
from notion_client import Client
|
|
|
|
from base_client import config
|
|
from page_analyzer import PageAnalyzer, PageMetadata
|
|
|
|
# Module-level logging: timestamped INFO lines on the root handler.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Default database for page analysis data
# NOTE(review): hard-coded Notion database UUID — presumably the shared
# SEO-audit database; callers can override via SitemapCrawler(database_id=...).
DEFAULT_PAGES_DATABASE_ID = "2c8581e5-8a1e-8035-880b-e38cefc2f3ef"

# Default limits to prevent excessive resource usage
DEFAULT_MAX_PAGES = 500
DEFAULT_DELAY_SECONDS = 2.0

# Progress tracking directory.
# The directory is created at import time (side effect) so that progress
# files can always be written without extra checks.
PROGRESS_DIR = Path.home() / ".claude" / "seo-audit-progress"
PROGRESS_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
@dataclass
|
|
class CrawlProgress:
|
|
"""Track crawl progress."""
|
|
total_urls: int = 0
|
|
processed_urls: int = 0
|
|
successful_urls: int = 0
|
|
failed_urls: int = 0
|
|
skipped_urls: int = 0
|
|
start_time: datetime = field(default_factory=datetime.now)
|
|
current_url: str = ""
|
|
audit_id: str = ""
|
|
site: str = ""
|
|
status: str = "running" # running, completed, failed
|
|
error_message: str = ""
|
|
summary_page_id: str = ""
|
|
|
|
def get_progress_percent(self) -> float:
|
|
if self.total_urls == 0:
|
|
return 0.0
|
|
return (self.processed_urls / self.total_urls) * 100
|
|
|
|
def get_elapsed_time(self) -> str:
|
|
elapsed = datetime.now() - self.start_time
|
|
minutes = int(elapsed.total_seconds() // 60)
|
|
seconds = int(elapsed.total_seconds() % 60)
|
|
return f"{minutes}m {seconds}s"
|
|
|
|
def get_eta(self) -> str:
|
|
if self.processed_urls == 0:
|
|
return "calculating..."
|
|
elapsed = (datetime.now() - self.start_time).total_seconds()
|
|
avg_time_per_url = elapsed / self.processed_urls
|
|
remaining_urls = self.total_urls - self.processed_urls
|
|
eta_seconds = remaining_urls * avg_time_per_url
|
|
minutes = int(eta_seconds // 60)
|
|
seconds = int(eta_seconds % 60)
|
|
return f"{minutes}m {seconds}s"
|
|
|
|
def to_dict(self) -> dict:
|
|
"""Convert to dictionary for JSON serialization."""
|
|
return {
|
|
"audit_id": self.audit_id,
|
|
"site": self.site,
|
|
"status": self.status,
|
|
"total_urls": self.total_urls,
|
|
"processed_urls": self.processed_urls,
|
|
"successful_urls": self.successful_urls,
|
|
"failed_urls": self.failed_urls,
|
|
"progress_percent": round(self.get_progress_percent(), 1),
|
|
"elapsed_time": self.get_elapsed_time(),
|
|
"eta": self.get_eta(),
|
|
"current_url": self.current_url,
|
|
"start_time": self.start_time.isoformat(),
|
|
"error_message": self.error_message,
|
|
"summary_page_id": self.summary_page_id,
|
|
"updated_at": datetime.now().isoformat(),
|
|
}
|
|
|
|
def save_to_file(self, filepath: Path | None = None) -> Path:
|
|
"""Save progress to JSON file."""
|
|
if filepath is None:
|
|
filepath = PROGRESS_DIR / f"{self.audit_id}.json"
|
|
with open(filepath, "w") as f:
|
|
json.dump(self.to_dict(), f, indent=2)
|
|
return filepath
|
|
|
|
@classmethod
|
|
def load_from_file(cls, filepath: Path) -> "CrawlProgress":
|
|
"""Load progress from JSON file."""
|
|
with open(filepath, "r") as f:
|
|
data = json.load(f)
|
|
progress = cls()
|
|
progress.audit_id = data.get("audit_id", "")
|
|
progress.site = data.get("site", "")
|
|
progress.status = data.get("status", "unknown")
|
|
progress.total_urls = data.get("total_urls", 0)
|
|
progress.processed_urls = data.get("processed_urls", 0)
|
|
progress.successful_urls = data.get("successful_urls", 0)
|
|
progress.failed_urls = data.get("failed_urls", 0)
|
|
progress.current_url = data.get("current_url", "")
|
|
progress.error_message = data.get("error_message", "")
|
|
progress.summary_page_id = data.get("summary_page_id", "")
|
|
if data.get("start_time"):
|
|
progress.start_time = datetime.fromisoformat(data["start_time"])
|
|
return progress
|
|
|
|
|
|
def get_active_crawls() -> list[CrawlProgress]:
    """Return every persisted crawl job whose status is "running".

    Unreadable or corrupt progress files are silently skipped.
    """
    running: list[CrawlProgress] = []
    for path in PROGRESS_DIR.glob("*.json"):
        try:
            job = CrawlProgress.load_from_file(path)
        except Exception:
            continue
        if job.status == "running":
            running.append(job)
    return running
|
|
|
|
|
|
def get_all_crawls() -> list[CrawlProgress]:
    """Return all persisted crawl jobs (active and completed).

    Files are visited in reverse filename order, so timestamped audit IDs
    come back newest first. Corrupt files are skipped.
    """
    jobs: list[CrawlProgress] = []
    for path in sorted(PROGRESS_DIR.glob("*.json"), reverse=True):
        try:
            jobs.append(CrawlProgress.load_from_file(path))
        except Exception:
            continue
    return jobs
|
|
|
|
|
|
def get_crawl_status(audit_id: str) -> CrawlProgress | None:
    """Load the progress record for *audit_id*, or None when no file exists."""
    path = PROGRESS_DIR / f"{audit_id}.json"
    return CrawlProgress.load_from_file(path) if path.exists() else None
|
|
|
|
|
|
@dataclass
class CrawlResult:
    """Outcome of one full sitemap crawl, returned by ``crawl_sitemap``."""

    site: str
    sitemap_url: str
    audit_id: str
    total_pages: int
    successful_pages: int
    failed_pages: int
    start_time: datetime
    end_time: datetime
    pages_analyzed: list[PageMetadata] = field(default_factory=list)
    notion_page_ids: list[str] = field(default_factory=list)
    summary_page_id: str | None = None

    def get_duration(self) -> str:
        """Return the crawl's wall-clock duration formatted as '<M>m <S>s'."""
        total = (self.end_time - self.start_time).total_seconds()
        return f"{int(total // 60)}m {int(total % 60)}s"
|
|
|
|
|
|
class SitemapCrawler:
    """Crawl sitemap URLs and analyze each page.

    Workflow: fetch the sitemap (recursing into sitemap indexes), analyze
    each URL sequentially with ``PageAnalyzer``, persist per-page results to
    a Notion database, and write progress snapshots to ``PROGRESS_DIR`` so
    other processes can monitor the crawl.
    """

    def __init__(
        self,
        notion_token: str | None = None,
        database_id: str | None = None,
    ):
        """
        Initialize sitemap crawler.

        Args:
            notion_token: Notion API token (falls back to ``config.notion_token``)
            database_id: Notion database ID for storing results
                (falls back to ``DEFAULT_PAGES_DATABASE_ID``)
        """
        self.notion_token = notion_token or config.notion_token
        self.database_id = database_id or DEFAULT_PAGES_DATABASE_ID
        self.analyzer = PageAnalyzer()

        # Without a token the crawl still runs; results just aren't persisted.
        if self.notion_token:
            self.notion = Client(auth=self.notion_token)
        else:
            self.notion = None
            logger.warning("Notion token not configured, results will not be saved")

    def fetch_sitemap_urls(self, sitemap_url: str) -> list[str]:
        """
        Fetch and parse URLs from a sitemap.

        Handles both regular sitemaps and sitemap indexes; for an index,
        every child sitemap is fetched recursively. Duplicate URLs are
        removed while preserving first-seen order.

        Args:
            sitemap_url: URL of the sitemap

        Returns:
            List of URLs found in the sitemap

        Raises:
            Any request or XML parsing error (logged, then re-raised).
        """
        try:
            response = requests.get(sitemap_url, timeout=30)
            response.raise_for_status()

            # Parse XML
            root = ET.fromstring(response.content)

            # Handle namespace (standard sitemap protocol namespace)
            namespaces = {
                "sm": "http://www.sitemaps.org/schemas/sitemap/0.9"
            }

            urls = []

            # Check if this is a sitemap index
            sitemap_tags = root.findall(".//sm:sitemap/sm:loc", namespaces)
            if sitemap_tags:
                # This is a sitemap index, recursively fetch child sitemaps
                logger.info(f"Found sitemap index with {len(sitemap_tags)} child sitemaps")
                for loc in sitemap_tags:
                    if loc.text:
                        child_urls = self.fetch_sitemap_urls(loc.text)
                        urls.extend(child_urls)
            else:
                # Regular sitemap, extract URLs
                url_tags = root.findall(".//sm:url/sm:loc", namespaces)
                if not url_tags:
                    # Try without namespace (some generators omit it)
                    url_tags = root.findall(".//url/loc")

                for loc in url_tags:
                    if loc.text:
                        urls.append(loc.text)

            # Remove duplicates while preserving order
            seen = set()
            unique_urls = []
            for url in urls:
                if url not in seen:
                    seen.add(url)
                    unique_urls.append(url)

            logger.info(f"Found {len(unique_urls)} unique URLs in sitemap")
            return unique_urls

        except Exception as e:
            logger.error(f"Failed to fetch sitemap: {e}")
            raise

    def crawl_sitemap(
        self,
        sitemap_url: str,
        delay: float = DEFAULT_DELAY_SECONDS,
        max_pages: int = DEFAULT_MAX_PAGES,
        progress_callback: Callable[[CrawlProgress], None] | None = None,
        save_to_notion: bool = True,
        url_filter: Callable[[str], bool] | None = None,
    ) -> CrawlResult:
        """
        Crawl all URLs in a sitemap sequentially.

        Progress is written to a JSON file after every URL so external
        observers (the ``status`` CLI command) can follow along. A failure
        to analyze one page is recorded and the crawl continues; a failure
        outside the per-page handler marks the whole crawl "failed" and
        re-raises.

        Args:
            sitemap_url: URL of the sitemap
            delay: Seconds to wait between requests (default: 2.0s)
            max_pages: Maximum number of pages to process (default: 500)
            progress_callback: Function called with progress updates
            save_to_notion: Whether to save results to Notion
            url_filter: Optional function to filter URLs (return True to include)

        Returns:
            CrawlResult with all analyzed pages
        """
        # Parse site info
        parsed_sitemap = urlparse(sitemap_url)
        site = f"{parsed_sitemap.scheme}://{parsed_sitemap.netloc}"
        site_domain = parsed_sitemap.netloc

        # Generate audit ID (domain + timestamp keeps IDs unique per run)
        audit_id = f"{site_domain}-pages-{datetime.now().strftime('%Y%m%d-%H%M%S')}"

        logger.info(f"Starting sitemap crawl: {sitemap_url}")
        logger.info(f"Audit ID: {audit_id}")
        logger.info(f"Delay between requests: {delay}s")

        # Initialize progress tracking
        progress = CrawlProgress(
            audit_id=audit_id,
            site=site,
            status="running",
        )

        # Fetch URLs
        urls = self.fetch_sitemap_urls(sitemap_url)

        # Apply URL filter if provided
        if url_filter:
            urls = [url for url in urls if url_filter(url)]
            logger.info(f"After filtering: {len(urls)} URLs")

        # Apply max pages limit (default: 500 to prevent excessive resource usage)
        if len(urls) > max_pages:
            logger.warning(f"Sitemap has {len(urls)} URLs, limiting to {max_pages} pages")
            logger.warning(f"Use max_pages parameter to adjust this limit")
            urls = urls[:max_pages]
            logger.info(f"Processing {len(urls)} pages (max: {max_pages})")

        # Update progress with total URLs
        progress.total_urls = len(urls)
        progress.save_to_file()

        # Initialize result (end_time is provisional; overwritten after the loop)
        result = CrawlResult(
            site=site,
            sitemap_url=sitemap_url,
            audit_id=audit_id,
            total_pages=len(urls),
            successful_pages=0,
            failed_pages=0,
            start_time=datetime.now(),
            end_time=datetime.now(),
        )

        # Process each URL
        try:
            for i, url in enumerate(urls):
                progress.current_url = url
                progress.processed_urls = i
                progress.save_to_file()  # Save progress to file

                if progress_callback:
                    progress_callback(progress)

                logger.info(f"[{i+1}/{len(urls)}] Analyzing: {url}")

                try:
                    # Analyze page
                    metadata = self.analyzer.analyze_url(url)
                    result.pages_analyzed.append(metadata)

                    # Only HTTP 200 responses count as successful analyses
                    if metadata.status_code == 200:
                        progress.successful_urls += 1
                        result.successful_pages += 1

                        # Save to Notion
                        if save_to_notion and self.notion:
                            page_id = self._save_page_to_notion(metadata, audit_id, site)
                            if page_id:
                                result.notion_page_ids.append(page_id)
                    else:
                        progress.failed_urls += 1
                        result.failed_pages += 1

                except Exception as e:
                    # Per-page failures are logged and counted; the crawl continues.
                    logger.error(f"Failed to analyze {url}: {e}")
                    progress.failed_urls += 1
                    result.failed_pages += 1

                # Wait before next request (politeness / rate limiting)
                if i < len(urls) - 1:  # Don't wait after last URL
                    time.sleep(delay)

            # Final progress update
            progress.processed_urls = len(urls)
            progress.status = "completed"
            if progress_callback:
                progress_callback(progress)

        except Exception as e:
            # Non-page-level failure: persist the failed state, then re-raise.
            progress.status = "failed"
            progress.error_message = str(e)
            progress.save_to_file()
            raise

        # Update result
        result.end_time = datetime.now()

        # Create summary page
        if save_to_notion and self.notion:
            summary_id = self._create_crawl_summary_page(result)
            result.summary_page_id = summary_id
            progress.summary_page_id = summary_id

        # Save final progress
        progress.save_to_file()

        logger.info(f"Crawl complete: {result.successful_pages}/{result.total_pages} pages analyzed")
        logger.info(f"Duration: {result.get_duration()}")

        return result

    def _save_page_to_notion(
        self,
        metadata: PageMetadata,
        audit_id: str,
        site: str,
    ) -> str | None:
        """Save page metadata to Notion database.

        Creates one database entry per analyzed page, with the detailed
        findings rendered as page content blocks.

        Returns:
            The created Notion page ID, or None on failure (logged, not raised).
        """
        try:
            # Build properties
            # NOTE(review): property names ("Issue", "Category", ...) must match
            # the target database schema — verify against the Notion database.
            properties = {
                "Issue": {"title": [{"text": {"content": f"📄 {metadata.url}"}}]},
                "Category": {"select": {"name": "On-page SEO"}},
                "Priority": {"select": {"name": self._determine_priority(metadata)}},
                "Site": {"url": site},
                "URL": {"url": metadata.url},
                "Audit ID": {"rich_text": [{"text": {"content": audit_id}}]},
                "Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
            }

            # Build page content
            children = self._build_page_content(metadata)

            response = self.notion.pages.create(
                parent={"database_id": self.database_id},
                properties=properties,
                children=children,
            )

            return response["id"]

        except Exception as e:
            logger.error(f"Failed to save to Notion: {e}")
            return None

    def _determine_priority(self, metadata: PageMetadata) -> str:
        """Determine priority based on issues found.

        3+ issues -> High; any issue or 3+ warnings -> Medium; else Low.
        """
        if len(metadata.issues) >= 3:
            return "High"
        elif len(metadata.issues) >= 1:
            return "Medium"
        elif len(metadata.warnings) >= 3:
            return "Medium"
        else:
            return "Low"

    def _build_page_content(self, metadata: PageMetadata) -> list[dict]:
        """Build Notion page content blocks from metadata.

        Returns a list of Notion block dicts: a status callout followed by
        sections for meta tags, headings, structured data, Open Graph,
        links, images, hreflang (when present), and issues/warnings.
        """
        children = []

        # Status summary callout: green when clean, yellow <3 issues, red otherwise
        status_emoji = "✅" if not metadata.issues else "⚠️" if len(metadata.issues) < 3 else "❌"
        children.append({
            "object": "block",
            "type": "callout",
            "callout": {
                "rich_text": [
                    {"type": "text", "text": {"content": f"Status: {metadata.status_code} | "}},
                    {"type": "text", "text": {"content": f"Response: {metadata.response_time_ms:.0f}ms | "}},
                    {"type": "text", "text": {"content": f"Issues: {len(metadata.issues)} | "}},
                    {"type": "text", "text": {"content": f"Warnings: {len(metadata.warnings)}"}},
                ],
                "icon": {"type": "emoji", "emoji": status_emoji},
                "color": "gray_background" if not metadata.issues else "yellow_background" if len(metadata.issues) < 3 else "red_background",
            }
        })

        # Meta Tags Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Meta Tags"}}]}
        })

        # Meta tags table (values truncated to 50 chars for readability)
        meta_rows = [
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Tag"}, "annotations": {"bold": True}}],
                [{"type": "text", "text": {"content": "Value"}, "annotations": {"bold": True}}],
                [{"type": "text", "text": {"content": "Status"}, "annotations": {"bold": True}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Title"}}],
                [{"type": "text", "text": {"content": (metadata.title or "—")[:50]}}],
                [{"type": "text", "text": {"content": f"✓ {metadata.title_length} chars" if metadata.title else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Description"}}],
                [{"type": "text", "text": {"content": (metadata.meta_description or "—")[:50]}}],
                [{"type": "text", "text": {"content": f"✓ {metadata.meta_description_length} chars" if metadata.meta_description else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Canonical"}}],
                [{"type": "text", "text": {"content": (metadata.canonical_url or "—")[:50]}}],
                [{"type": "text", "text": {"content": "✓" if metadata.canonical_url else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Robots"}}],
                [{"type": "text", "text": {"content": metadata.robots_meta or "—"}}],
                [{"type": "text", "text": {"content": "✓" if metadata.robots_meta else "—"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Lang"}}],
                [{"type": "text", "text": {"content": metadata.html_lang or "—"}}],
                [{"type": "text", "text": {"content": "✓" if metadata.html_lang else "—"}}],
            ]}},
        ]

        children.append({
            "object": "block",
            "type": "table",
            "table": {
                "table_width": 3,
                "has_column_header": True,
                "has_row_header": False,
                "children": meta_rows
            }
        })

        # Headings Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Headings"}}]}
        })

        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"H1: {metadata.h1_count} | "}},
                {"type": "text", "text": {"content": f"Total headings: {len(metadata.headings)}"}},
            ]}
        })

        # Show the H1 text itself (truncated) when the page has one
        if metadata.h1_text:
            children.append({
                "object": "block",
                "type": "quote",
                "quote": {"rich_text": [{"type": "text", "text": {"content": metadata.h1_text[:200]}}]}
            })

        # Schema Data Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Structured Data"}}]}
        })

        if metadata.schema_types_found:
            children.append({
                "object": "block",
                "type": "paragraph",
                "paragraph": {"rich_text": [
                    {"type": "text", "text": {"content": "Schema types found: "}},
                    {"type": "text", "text": {"content": ", ".join(metadata.schema_types_found)}, "annotations": {"code": True}},
                ]}
            })
        else:
            # Missing structured data is flagged as a warning callout
            children.append({
                "object": "block",
                "type": "callout",
                "callout": {
                    "rich_text": [{"type": "text", "text": {"content": "No structured data found on this page"}}],
                    "icon": {"type": "emoji", "emoji": "⚠️"},
                    "color": "yellow_background",
                }
            })

        # Open Graph Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Open Graph"}}]}
        })

        # og:title presence is used as the indicator that OG is configured at all
        og = metadata.open_graph
        og_status = "✓ Configured" if og.og_title else "✗ Missing"
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Status: {og_status}\n"}},
                {"type": "text", "text": {"content": f"og:title: {og.og_title or '—'}\n"}},
                {"type": "text", "text": {"content": f"og:type: {og.og_type or '—'}"}},
            ]}
        })

        # Links Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Links"}}]}
        })

        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Internal links: {metadata.internal_link_count}\n"}},
                {"type": "text", "text": {"content": f"External links: {metadata.external_link_count}"}},
            ]}
        })

        # Images Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Images"}}]}
        })

        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Total: {metadata.images_total} | "}},
                {"type": "text", "text": {"content": f"With alt: {metadata.images_with_alt} | "}},
                {"type": "text", "text": {"content": f"Without alt: {metadata.images_without_alt}"}},
            ]}
        })

        # Hreflang Section (if present); capped at 10 entries to bound page size
        if metadata.hreflang_tags:
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Hreflang Tags"}}]}
            })

            for tag in metadata.hreflang_tags[:10]:
                children.append({
                    "object": "block",
                    "type": "bulleted_list_item",
                    "bulleted_list_item": {"rich_text": [
                        {"type": "text", "text": {"content": f"{tag['lang']}: "}},
                        {"type": "text", "text": {"content": tag['url'], "link": {"url": tag['url']}}},
                    ]}
                })

        # Issues & Warnings Section — rendered as unchecked to-dos so they
        # can be ticked off as they are fixed
        if metadata.issues or metadata.warnings:
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Issues & Warnings"}}]}
            })

            for issue in metadata.issues:
                children.append({
                    "object": "block",
                    "type": "to_do",
                    "to_do": {
                        "rich_text": [
                            {"type": "text", "text": {"content": "❌ "}, "annotations": {"bold": True}},
                            {"type": "text", "text": {"content": issue}},
                        ],
                        "checked": False,
                    }
                })

            for warning in metadata.warnings:
                children.append({
                    "object": "block",
                    "type": "to_do",
                    "to_do": {
                        "rich_text": [
                            {"type": "text", "text": {"content": "⚠️ "}, "annotations": {"bold": True}},
                            {"type": "text", "text": {"content": warning}},
                        ],
                        "checked": False,
                    }
                })

        return children

    def _create_crawl_summary_page(self, result: CrawlResult) -> str | None:
        """Create a summary page for the crawl.

        Aggregates statistics across all analyzed pages into one Notion
        entry in the same database as the per-page entries.

        Returns:
            The created Notion page ID, or None on failure (logged, not raised).
        """
        try:
            site_domain = urlparse(result.site).netloc

            # Calculate statistics
            total_issues = sum(len(p.issues) for p in result.pages_analyzed)
            total_warnings = sum(len(p.warnings) for p in result.pages_analyzed)
            pages_with_issues = sum(1 for p in result.pages_analyzed if p.issues)
            pages_without_schema = sum(1 for p in result.pages_analyzed if not p.schema_types_found)
            pages_without_description = sum(1 for p in result.pages_analyzed if not p.meta_description)

            children = []

            # Header callout
            children.append({
                "object": "block",
                "type": "callout",
                "callout": {
                    "rich_text": [
                        {"type": "text", "text": {"content": f"Sitemap Crawl Complete\n\n"}},
                        {"type": "text", "text": {"content": f"Audit ID: {result.audit_id}\n"}},
                        {"type": "text", "text": {"content": f"Duration: {result.get_duration()}\n"}},
                        {"type": "text", "text": {"content": f"Pages: {result.successful_pages}/{result.total_pages}"}},
                    ],
                    "icon": {"type": "emoji", "emoji": "📊"},
                    "color": "blue_background",
                }
            })

            # Statistics table
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Statistics"}}]}
            })

            stats_rows = [
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Metric"}, "annotations": {"bold": True}}],
                    [{"type": "text", "text": {"content": "Count"}, "annotations": {"bold": True}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Pages"}}],
                    [{"type": "text", "text": {"content": str(result.total_pages)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Successfully Analyzed"}}],
                    [{"type": "text", "text": {"content": str(result.successful_pages)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages with Issues"}}],
                    [{"type": "text", "text": {"content": str(pages_with_issues)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Issues"}}],
                    [{"type": "text", "text": {"content": str(total_issues)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Warnings"}}],
                    [{"type": "text", "text": {"content": str(total_warnings)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages without Schema"}}],
                    [{"type": "text", "text": {"content": str(pages_without_schema)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages without Description"}}],
                    [{"type": "text", "text": {"content": str(pages_without_description)}}],
                ]}},
            ]

            children.append({
                "object": "block",
                "type": "table",
                "table": {
                    "table_width": 2,
                    "has_column_header": True,
                    "has_row_header": False,
                    "children": stats_rows
                }
            })

            # Pages list (per-page entries are found via the shared Audit ID)
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Analyzed Pages"}}]}
            })

            children.append({
                "object": "block",
                "type": "paragraph",
                "paragraph": {"rich_text": [
                    {"type": "text", "text": {"content": f"Filter by Audit ID in the database to see all {result.successful_pages} page entries."}}
                ]}
            })

            # Create the summary page
            response = self.notion.pages.create(
                parent={"database_id": self.database_id},
                properties={
                    "Issue": {"title": [{"text": {"content": f"📊 Sitemap Crawl: {site_domain}"}}]},
                    "Category": {"select": {"name": "Technical SEO"}},
                    "Priority": {"select": {"name": "High"}},
                    "Site": {"url": result.site},
                    "Audit ID": {"rich_text": [{"text": {"content": result.audit_id}}]},
                    "Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
                },
                children=children,
            )

            logger.info(f"Created crawl summary page: {response['id']}")
            return response["id"]

        except Exception as e:
            logger.error(f"Failed to create summary page: {e}")
            return None
|
|
|
|
|
|
def print_progress_status(progress: CrawlProgress) -> None:
    """Print a formatted, human-readable status report for one crawl job."""
    # Map the job state to an emoji; unknown states fall back to "❓".
    emoji_by_status = {
        "running": "🔄",
        "completed": "✅",
        "failed": "❌",
    }
    status_emoji = emoji_by_status.get(progress.status, "❓")

    # Truncate very long URLs so the report stays on one line.
    url_display = progress.current_url
    if len(url_display) > 60:
        url_display = url_display[:60] + '...'

    # An ETA only makes sense while the crawl is still running.
    eta_display = progress.get_eta() if progress.status == 'running' else 'N/A'
    divider = '=' * 60

    print(f"""
{divider}
{status_emoji} SEO Page Analysis - {progress.status.upper()}
{divider}
Audit ID: {progress.audit_id}
Site: {progress.site}
Status: {progress.status}

Progress: {progress.processed_urls}/{progress.total_urls} pages ({progress.get_progress_percent():.1f}%)
Successful: {progress.successful_urls}
Failed: {progress.failed_urls}
Elapsed: {progress.get_elapsed_time()}
ETA: {eta_display}

Current URL: {url_display}
""")

    if progress.summary_page_id:
        # Notion web URLs use the page id without dashes.
        print(f"Summary: https://www.notion.so/{progress.summary_page_id.replace('-', '')}")

    if progress.error_message:
        print(f"Error: {progress.error_message}")

    print("=" * 60)
|
|
|
|
|
|
def main():
    """CLI entry point.

    Commands:
        crawl <sitemap_url>   Start crawling a sitemap.
        status [audit_id]     Show one crawl's progress, or list active/all crawls.
        list                  Tabular listing of all crawl jobs.

    A bare sitemap URL with no subcommand is treated as ``crawl <url>``.
    """
    import argparse
    import sys

    # Backward-compatible shorthand: `script.py https://site/sitemap.xml`.
    # Bug fix: the previous implementation checked for this AFTER
    # parser.parse_args(), but argparse rejects a bare URL as an invalid
    # subcommand choice and exits first — the fallback was dead code.
    # Inserting the implicit "crawl" command before parsing makes it work.
    if (
        len(sys.argv) > 1
        and sys.argv[1] not in ("crawl", "status", "list")
        and not sys.argv[1].startswith("-")
        and (sys.argv[1].startswith("http") or sys.argv[1].endswith(".xml"))
    ):
        sys.argv.insert(1, "crawl")

    parser = argparse.ArgumentParser(description="Sitemap Crawler with Background Support")
    subparsers = parser.add_subparsers(dest="command", help="Commands")

    # Crawl command
    crawl_parser = subparsers.add_parser("crawl", help="Start crawling a sitemap")
    crawl_parser.add_argument("sitemap_url", help="URL of the sitemap to crawl")
    crawl_parser.add_argument("--delay", "-d", type=float, default=DEFAULT_DELAY_SECONDS,
                              help=f"Delay between requests in seconds (default: {DEFAULT_DELAY_SECONDS})")
    crawl_parser.add_argument("--max-pages", "-m", type=int, default=DEFAULT_MAX_PAGES,
                              help=f"Maximum pages to process (default: {DEFAULT_MAX_PAGES})")
    crawl_parser.add_argument("--no-notion", action="store_true",
                              help="Don't save to Notion")
    crawl_parser.add_argument("--no-limit", action="store_true",
                              help="Remove page limit (use with caution)")

    # Status command
    status_parser = subparsers.add_parser("status", help="Check crawl progress")
    status_parser.add_argument("audit_id", nargs="?", help="Specific audit ID to check (optional)")
    status_parser.add_argument("--all", "-a", action="store_true", help="Show all crawls (not just active)")

    # List command (takes no extra arguments)
    subparsers.add_parser("list", help="List all crawl jobs")

    args = parser.parse_args()

    # No command and no URL shorthand: show usage.
    if args.command is None:
        parser.print_help()
        return

    if args.command == "status":
        if args.audit_id:
            # Show specific crawl status
            progress = get_crawl_status(args.audit_id)
            if progress:
                print_progress_status(progress)
            else:
                print(f"No crawl found with audit ID: {args.audit_id}")
        else:
            # Show active crawls (or all with --all)
            if args.all:
                crawls = get_all_crawls()
                label = "All"
            else:
                crawls = get_active_crawls()
                label = "Active"

            if crawls:
                print(f"\n{label} Crawl Jobs ({len(crawls)}):")
                print("-" * 60)
                for p in crawls:
                    status_emoji = {"running": "🔄", "completed": "✅", "failed": "❌"}.get(p.status, "❓")
                    print(f"{status_emoji} {p.audit_id}")
                    print(f"  Site: {p.site}")
                    print(f"  Progress: {p.processed_urls}/{p.total_urls} ({p.get_progress_percent():.1f}%)")
                    print()
            else:
                print(f"No {label.lower()} crawl jobs found.")
        return

    if args.command == "list":
        crawls = get_all_crawls()
        if crawls:
            print(f"\nAll Crawl Jobs ({len(crawls)}):")
            print("-" * 80)
            print(f"{'Status':<10} {'Audit ID':<45} {'Progress':<15}")
            print("-" * 80)
            for p in crawls[:20]:  # Show last 20
                status_emoji = {"running": "🔄", "completed": "✅", "failed": "❌"}.get(p.status, "❓")
                progress_str = f"{p.processed_urls}/{p.total_urls}"
                print(f"{status_emoji} {p.status:<7} {p.audit_id:<45} {progress_str:<15}")
            if len(crawls) > 20:
                print(f"... and {len(crawls) - 20} more")
        else:
            print("No crawl jobs found.")
        return

    if args.command == "crawl":
        # Handle --no-limit option
        max_pages = args.max_pages
        if args.no_limit:
            max_pages = 999999  # Effectively unlimited
            print("⚠️  WARNING: Page limit disabled. This may take a very long time!")

        def progress_callback(progress: CrawlProgress):
            # Single-line, carriage-return progress display.
            pct = progress.get_progress_percent()
            print(f"\r[{pct:5.1f}%] {progress.processed_urls}/{progress.total_urls} pages | "
                  f"Success: {progress.successful_urls} | Failed: {progress.failed_urls} | "
                  f"ETA: {progress.get_eta()}", end="", flush=True)

        crawler = SitemapCrawler()
        result = crawler.crawl_sitemap(
            args.sitemap_url,
            delay=args.delay,
            max_pages=max_pages,
            progress_callback=progress_callback,
            save_to_notion=not args.no_notion,
        )

        print()  # New line after progress
        print()
        print("=" * 60)
        print("CRAWL COMPLETE")
        print("=" * 60)
        print(f"Audit ID: {result.audit_id}")
        print(f"Total Pages: {result.total_pages}")
        print(f"Successful: {result.successful_pages}")
        print(f"Failed: {result.failed_pages}")
        print(f"Duration: {result.get_duration()}")
        if result.summary_page_id:
            print(f"Summary Page: https://www.notion.so/{result.summary_page_id.replace('-', '')}")
|
|
|
|
|
|
# Allow running this module directly as a CLI script.
if __name__ == "__main__":
    main()
|