refactor(skills): Restructure skills to dual-platform architecture
Major refactoring of ourdigital-custom-skills with new numbering system: ## Structure Changes - Each skill now has code/ (Claude Code) and desktop/ (Claude Desktop) versions - New progressive numbering: 01-09 General, 10-19 SEO, 20-29 GTM, 30-39 OurDigital, 40-49 Jamie ## Skill Reorganization - 01-notion-organizer (from 02) - 10-18: SEO tools split into focused skills (technical, on-page, local, schema, vitals, gsc, gateway) - 20-21: GTM audit and manager - 30-32: OurDigital designer, research, presentation - 40-41: Jamie brand editor and audit ## New Files - .claude/commands/: Slash command definitions for all skills - CLAUDE.md: Updated with new skill structure documentation - REFACTORING_PLAN.md: Migration documentation - COMPATIBILITY_REPORT.md, SKILLS_COMPARISON.md: Analysis docs ## Removed - Old skill directories (02-05, 10-14, 20-21 old numbering) - Consolidated into new structure with _archive/ for reference 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,969 @@
|
||||
"""
|
||||
Sitemap Crawler - Sequential page analysis from sitemap
|
||||
=======================================================
|
||||
Purpose: Crawl sitemap URLs one by one, analyze each page, save to Notion
|
||||
Python: 3.10+
|
||||
Usage:
|
||||
from sitemap_crawler import SitemapCrawler
|
||||
crawler = SitemapCrawler()
|
||||
crawler.crawl_sitemap("https://example.com/sitemap.xml", delay=2.0)
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
import xml.etree.ElementTree as ET
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Callable, Generator
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
from notion_client import Client
|
||||
|
||||
from base_client import config
|
||||
from page_analyzer import PageAnalyzer, PageMetadata
|
||||
|
||||
# Module-level logging: timestamped INFO-level messages to stderr.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Default database for page analysis data
# NOTE(review): hard-coded Notion database UUID — presumably the shared
# "SEO audit" database; confirm it matches the target workspace.
DEFAULT_PAGES_DATABASE_ID = "2c8581e5-8a1e-8035-880b-e38cefc2f3ef"

# Default limits to prevent excessive resource usage
DEFAULT_MAX_PAGES = 500          # cap on pages crawled per run
DEFAULT_DELAY_SECONDS = 2.0      # politeness delay between page fetches

# Progress tracking directory.
# Progress JSON files (one per audit ID) live here so that a separate
# process can poll crawl status. The directory is created eagerly at
# import time — a deliberate side effect so later writes cannot fail on
# a missing directory.
PROGRESS_DIR = Path.home() / ".claude" / "seo-audit-progress"
PROGRESS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
@dataclass
|
||||
class CrawlProgress:
|
||||
"""Track crawl progress."""
|
||||
total_urls: int = 0
|
||||
processed_urls: int = 0
|
||||
successful_urls: int = 0
|
||||
failed_urls: int = 0
|
||||
skipped_urls: int = 0
|
||||
start_time: datetime = field(default_factory=datetime.now)
|
||||
current_url: str = ""
|
||||
audit_id: str = ""
|
||||
site: str = ""
|
||||
status: str = "running" # running, completed, failed
|
||||
error_message: str = ""
|
||||
summary_page_id: str = ""
|
||||
|
||||
def get_progress_percent(self) -> float:
|
||||
if self.total_urls == 0:
|
||||
return 0.0
|
||||
return (self.processed_urls / self.total_urls) * 100
|
||||
|
||||
def get_elapsed_time(self) -> str:
|
||||
elapsed = datetime.now() - self.start_time
|
||||
minutes = int(elapsed.total_seconds() // 60)
|
||||
seconds = int(elapsed.total_seconds() % 60)
|
||||
return f"{minutes}m {seconds}s"
|
||||
|
||||
def get_eta(self) -> str:
|
||||
if self.processed_urls == 0:
|
||||
return "calculating..."
|
||||
elapsed = (datetime.now() - self.start_time).total_seconds()
|
||||
avg_time_per_url = elapsed / self.processed_urls
|
||||
remaining_urls = self.total_urls - self.processed_urls
|
||||
eta_seconds = remaining_urls * avg_time_per_url
|
||||
minutes = int(eta_seconds // 60)
|
||||
seconds = int(eta_seconds % 60)
|
||||
return f"{minutes}m {seconds}s"
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Convert to dictionary for JSON serialization."""
|
||||
return {
|
||||
"audit_id": self.audit_id,
|
||||
"site": self.site,
|
||||
"status": self.status,
|
||||
"total_urls": self.total_urls,
|
||||
"processed_urls": self.processed_urls,
|
||||
"successful_urls": self.successful_urls,
|
||||
"failed_urls": self.failed_urls,
|
||||
"progress_percent": round(self.get_progress_percent(), 1),
|
||||
"elapsed_time": self.get_elapsed_time(),
|
||||
"eta": self.get_eta(),
|
||||
"current_url": self.current_url,
|
||||
"start_time": self.start_time.isoformat(),
|
||||
"error_message": self.error_message,
|
||||
"summary_page_id": self.summary_page_id,
|
||||
"updated_at": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
def save_to_file(self, filepath: Path | None = None) -> Path:
|
||||
"""Save progress to JSON file."""
|
||||
if filepath is None:
|
||||
filepath = PROGRESS_DIR / f"{self.audit_id}.json"
|
||||
with open(filepath, "w") as f:
|
||||
json.dump(self.to_dict(), f, indent=2)
|
||||
return filepath
|
||||
|
||||
@classmethod
|
||||
def load_from_file(cls, filepath: Path) -> "CrawlProgress":
|
||||
"""Load progress from JSON file."""
|
||||
with open(filepath, "r") as f:
|
||||
data = json.load(f)
|
||||
progress = cls()
|
||||
progress.audit_id = data.get("audit_id", "")
|
||||
progress.site = data.get("site", "")
|
||||
progress.status = data.get("status", "unknown")
|
||||
progress.total_urls = data.get("total_urls", 0)
|
||||
progress.processed_urls = data.get("processed_urls", 0)
|
||||
progress.successful_urls = data.get("successful_urls", 0)
|
||||
progress.failed_urls = data.get("failed_urls", 0)
|
||||
progress.current_url = data.get("current_url", "")
|
||||
progress.error_message = data.get("error_message", "")
|
||||
progress.summary_page_id = data.get("summary_page_id", "")
|
||||
if data.get("start_time"):
|
||||
progress.start_time = datetime.fromisoformat(data["start_time"])
|
||||
return progress
|
||||
|
||||
|
||||
def get_active_crawls() -> list[CrawlProgress]:
    """Return every crawl job whose persisted status is "running"."""
    running: list[CrawlProgress] = []
    for progress_file in PROGRESS_DIR.glob("*.json"):
        try:
            job = CrawlProgress.load_from_file(progress_file)
        except Exception:
            # Unreadable or corrupt progress files are silently skipped.
            continue
        if job.status == "running":
            running.append(job)
    return running
|
||||
|
||||
|
||||
def get_all_crawls() -> list[CrawlProgress]:
    """Return every recorded crawl job (any status), newest file name first."""
    jobs: list[CrawlProgress] = []
    # File names start with "<domain>-pages-<timestamp>", so a reverse
    # lexicographic sort puts the most recent audits first.
    for progress_file in sorted(PROGRESS_DIR.glob("*.json"), reverse=True):
        try:
            jobs.append(CrawlProgress.load_from_file(progress_file))
        except Exception:
            # Skip files that cannot be parsed.
            continue
    return jobs
|
||||
|
||||
|
||||
def get_crawl_status(audit_id: str) -> CrawlProgress | None:
    """Look up one crawl's progress by audit ID, or None if unknown."""
    progress_file = PROGRESS_DIR / f"{audit_id}.json"
    if not progress_file.exists():
        return None
    return CrawlProgress.load_from_file(progress_file)
|
||||
|
||||
|
||||
@dataclass
class CrawlResult:
    """Outcome of one complete sitemap crawl."""

    # Identity of the crawl
    site: str
    sitemap_url: str
    audit_id: str
    # Aggregate counts
    total_pages: int
    successful_pages: int
    failed_pages: int
    # Timing
    start_time: datetime
    end_time: datetime
    # Collected artifacts
    pages_analyzed: list[PageMetadata] = field(default_factory=list)
    notion_page_ids: list[str] = field(default_factory=list)
    summary_page_id: str | None = None

    def get_duration(self) -> str:
        """Return the crawl's wall-clock duration formatted as "Xm Ys"."""
        total = (self.end_time - self.start_time).total_seconds()
        return f"{int(total // 60)}m {int(total % 60)}s"
|
||||
|
||||
|
||||
class SitemapCrawler:
    """Crawl sitemap URLs and analyze each page.

    Pages are fetched sequentially (with a politeness delay), analyzed by
    PageAnalyzer, and — when a Notion client is configured — each page plus
    a final summary is written to a Notion database.
    """

    def __init__(
        self,
        notion_token: str | None = None,
        database_id: str | None = None,
    ):
        """
        Initialize sitemap crawler.

        Args:
            notion_token: Notion API token (falls back to config.notion_token)
            database_id: Notion database ID for storing results
                (falls back to DEFAULT_PAGES_DATABASE_ID)
        """
        self.notion_token = notion_token or config.notion_token
        self.database_id = database_id or DEFAULT_PAGES_DATABASE_ID
        self.analyzer = PageAnalyzer()

        if self.notion_token:
            self.notion = Client(auth=self.notion_token)
        else:
            # No token: self.notion stays None and every Notion write
            # downstream is skipped (crawling itself still works).
            self.notion = None
            logger.warning("Notion token not configured, results will not be saved")

    def fetch_sitemap_urls(self, sitemap_url: str) -> list[str]:
        """
        Fetch and parse URLs from a sitemap.

        Sitemap indexes are followed recursively; duplicate URLs are
        removed while preserving first-seen order.

        Args:
            sitemap_url: URL of the sitemap

        Returns:
            List of URLs found in the sitemap

        Raises:
            Exception: re-raises any fetch/parse failure after logging it.
        """
        try:
            response = requests.get(sitemap_url, timeout=30)
            response.raise_for_status()

            # Parse XML
            root = ET.fromstring(response.content)

            # Handle namespace (standard sitemaps.org schema)
            namespaces = {
                "sm": "http://www.sitemaps.org/schemas/sitemap/0.9"
            }

            urls = []

            # Check if this is a sitemap index
            sitemap_tags = root.findall(".//sm:sitemap/sm:loc", namespaces)
            if sitemap_tags:
                # This is a sitemap index, recursively fetch child sitemaps
                logger.info(f"Found sitemap index with {len(sitemap_tags)} child sitemaps")
                for loc in sitemap_tags:
                    if loc.text:
                        child_urls = self.fetch_sitemap_urls(loc.text)
                        urls.extend(child_urls)
            else:
                # Regular sitemap, extract URLs
                url_tags = root.findall(".//sm:url/sm:loc", namespaces)
                if not url_tags:
                    # Try without namespace (some generators omit it)
                    url_tags = root.findall(".//url/loc")

                for loc in url_tags:
                    if loc.text:
                        urls.append(loc.text)

            # Remove duplicates while preserving order
            seen = set()
            unique_urls = []
            for url in urls:
                if url not in seen:
                    seen.add(url)
                    unique_urls.append(url)

            logger.info(f"Found {len(unique_urls)} unique URLs in sitemap")
            return unique_urls

        except Exception as e:
            logger.error(f"Failed to fetch sitemap: {e}")
            raise

    def crawl_sitemap(
        self,
        sitemap_url: str,
        delay: float = DEFAULT_DELAY_SECONDS,
        max_pages: int = DEFAULT_MAX_PAGES,
        progress_callback: Callable[[CrawlProgress], None] | None = None,
        save_to_notion: bool = True,
        url_filter: Callable[[str], bool] | None = None,
    ) -> CrawlResult:
        """
        Crawl all URLs in a sitemap sequentially.

        Progress is persisted to a JSON file after every page so other
        processes can poll it (see get_crawl_status).

        Args:
            sitemap_url: URL of the sitemap
            delay: Seconds to wait between requests (default: 2.0s)
            max_pages: Maximum number of pages to process (default: 500)
            progress_callback: Function called with progress updates
            save_to_notion: Whether to save results to Notion
            url_filter: Optional function to filter URLs (return True to include)

        Returns:
            CrawlResult with all analyzed pages

        Raises:
            Exception: re-raised after marking the progress file "failed".
        """
        # Parse site info
        parsed_sitemap = urlparse(sitemap_url)
        site = f"{parsed_sitemap.scheme}://{parsed_sitemap.netloc}"
        site_domain = parsed_sitemap.netloc

        # Generate audit ID (unique per run: domain + timestamp)
        audit_id = f"{site_domain}-pages-{datetime.now().strftime('%Y%m%d-%H%M%S')}"

        logger.info(f"Starting sitemap crawl: {sitemap_url}")
        logger.info(f"Audit ID: {audit_id}")
        logger.info(f"Delay between requests: {delay}s")

        # Initialize progress tracking
        progress = CrawlProgress(
            audit_id=audit_id,
            site=site,
            status="running",
        )

        # Fetch URLs
        urls = self.fetch_sitemap_urls(sitemap_url)

        # Apply URL filter if provided
        if url_filter:
            urls = [url for url in urls if url_filter(url)]
            logger.info(f"After filtering: {len(urls)} URLs")

        # Apply max pages limit (default: 500 to prevent excessive resource usage)
        if len(urls) > max_pages:
            logger.warning(f"Sitemap has {len(urls)} URLs, limiting to {max_pages} pages")
            logger.warning(f"Use max_pages parameter to adjust this limit")
            urls = urls[:max_pages]
            logger.info(f"Processing {len(urls)} pages (max: {max_pages})")

        # Update progress with total URLs
        progress.total_urls = len(urls)
        progress.save_to_file()

        # Initialize result (end_time is provisional; overwritten after the loop)
        result = CrawlResult(
            site=site,
            sitemap_url=sitemap_url,
            audit_id=audit_id,
            total_pages=len(urls),
            successful_pages=0,
            failed_pages=0,
            start_time=datetime.now(),
            end_time=datetime.now(),
        )

        # Process each URL
        try:
            for i, url in enumerate(urls):
                progress.current_url = url
                # NOTE: processed_urls lags by one inside the loop (set to i
                # before page i is analyzed); the final update below corrects it.
                progress.processed_urls = i
                progress.save_to_file()  # Save progress to file

                if progress_callback:
                    progress_callback(progress)

                logger.info(f"[{i+1}/{len(urls)}] Analyzing: {url}")

                try:
                    # Analyze page
                    metadata = self.analyzer.analyze_url(url)
                    result.pages_analyzed.append(metadata)

                    if metadata.status_code == 200:
                        progress.successful_urls += 1
                        result.successful_pages += 1

                        # Save to Notion (only successful pages are persisted)
                        if save_to_notion and self.notion:
                            page_id = self._save_page_to_notion(metadata, audit_id, site)
                            if page_id:
                                result.notion_page_ids.append(page_id)
                    else:
                        # Non-200 responses count as failures.
                        progress.failed_urls += 1
                        result.failed_pages += 1

                except Exception as e:
                    # A single bad page must not abort the whole crawl.
                    logger.error(f"Failed to analyze {url}: {e}")
                    progress.failed_urls += 1
                    result.failed_pages += 1

                # Wait before next request (politeness delay)
                if i < len(urls) - 1:  # Don't wait after last URL
                    time.sleep(delay)

            # Final progress update
            progress.processed_urls = len(urls)
            progress.status = "completed"
            if progress_callback:
                progress_callback(progress)

        except Exception as e:
            # Unexpected failure outside per-page handling (e.g. in
            # progress persistence): record it, then propagate.
            progress.status = "failed"
            progress.error_message = str(e)
            progress.save_to_file()
            raise

        # Update result
        result.end_time = datetime.now()

        # Create summary page
        if save_to_notion and self.notion:
            summary_id = self._create_crawl_summary_page(result)
            result.summary_page_id = summary_id
            progress.summary_page_id = summary_id

        # Save final progress
        progress.save_to_file()

        logger.info(f"Crawl complete: {result.successful_pages}/{result.total_pages} pages analyzed")
        logger.info(f"Duration: {result.get_duration()}")

        return result

    def _save_page_to_notion(
        self,
        metadata: PageMetadata,
        audit_id: str,
        site: str,
    ) -> str | None:
        """Save page metadata to Notion database.

        Returns the created Notion page ID, or None on failure (errors are
        logged, never raised, so one bad write cannot abort the crawl).
        """
        try:
            # Build properties — names must match the target database schema
            # ("Issue" is the title property).
            properties = {
                "Issue": {"title": [{"text": {"content": f"📄 {metadata.url}"}}]},
                "Category": {"select": {"name": "On-page SEO"}},
                "Priority": {"select": {"name": self._determine_priority(metadata)}},
                "Site": {"url": site},
                "URL": {"url": metadata.url},
                "Audit ID": {"rich_text": [{"text": {"content": audit_id}}]},
                "Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
            }

            # Build page content
            children = self._build_page_content(metadata)

            response = self.notion.pages.create(
                parent={"database_id": self.database_id},
                properties=properties,
                children=children,
            )

            return response["id"]

        except Exception as e:
            logger.error(f"Failed to save to Notion: {e}")
            return None

    def _determine_priority(self, metadata: PageMetadata) -> str:
        """Determine priority based on issues found.

        3+ issues -> High; any issue or 3+ warnings -> Medium; else Low.
        """
        if len(metadata.issues) >= 3:
            return "High"
        elif len(metadata.issues) >= 1:
            return "Medium"
        elif len(metadata.warnings) >= 3:
            return "Medium"
        else:
            return "Low"

    def _build_page_content(self, metadata: PageMetadata) -> list[dict]:
        """Build Notion page content blocks from metadata.

        Returns a list of Notion block objects: a status callout, then
        sections for meta tags, headings, structured data, Open Graph,
        links, images, optional hreflang tags, and issues/warnings.
        """
        children = []

        # Status summary callout: green when clean, yellow for <3 issues,
        # red for 3 or more.
        status_emoji = "✅" if not metadata.issues else "⚠️" if len(metadata.issues) < 3 else "❌"
        children.append({
            "object": "block",
            "type": "callout",
            "callout": {
                "rich_text": [
                    {"type": "text", "text": {"content": f"Status: {metadata.status_code} | "}},
                    {"type": "text", "text": {"content": f"Response: {metadata.response_time_ms:.0f}ms | "}},
                    {"type": "text", "text": {"content": f"Issues: {len(metadata.issues)} | "}},
                    {"type": "text", "text": {"content": f"Warnings: {len(metadata.warnings)}"}},
                ],
                "icon": {"type": "emoji", "emoji": status_emoji},
                "color": "gray_background" if not metadata.issues else "yellow_background" if len(metadata.issues) < 3 else "red_background",
            }
        })

        # Meta Tags Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Meta Tags"}}]}
        })

        # Meta tags table — first row is the bold header; long values are
        # truncated to 50 chars to keep the table readable.
        meta_rows = [
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Tag"}, "annotations": {"bold": True}}],
                [{"type": "text", "text": {"content": "Value"}, "annotations": {"bold": True}}],
                [{"type": "text", "text": {"content": "Status"}, "annotations": {"bold": True}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Title"}}],
                [{"type": "text", "text": {"content": (metadata.title or "—")[:50]}}],
                [{"type": "text", "text": {"content": f"✓ {metadata.title_length} chars" if metadata.title else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Description"}}],
                [{"type": "text", "text": {"content": (metadata.meta_description or "—")[:50]}}],
                [{"type": "text", "text": {"content": f"✓ {metadata.meta_description_length} chars" if metadata.meta_description else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Canonical"}}],
                [{"type": "text", "text": {"content": (metadata.canonical_url or "—")[:50]}}],
                [{"type": "text", "text": {"content": "✓" if metadata.canonical_url else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Robots"}}],
                [{"type": "text", "text": {"content": metadata.robots_meta or "—"}}],
                [{"type": "text", "text": {"content": "✓" if metadata.robots_meta else "—"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Lang"}}],
                [{"type": "text", "text": {"content": metadata.html_lang or "—"}}],
                [{"type": "text", "text": {"content": "✓" if metadata.html_lang else "—"}}],
            ]}},
        ]

        children.append({
            "object": "block",
            "type": "table",
            "table": {
                "table_width": 3,
                "has_column_header": True,
                "has_row_header": False,
                "children": meta_rows
            }
        })

        # Headings Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Headings"}}]}
        })

        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"H1: {metadata.h1_count} | "}},
                {"type": "text", "text": {"content": f"Total headings: {len(metadata.headings)}"}},
            ]}
        })

        if metadata.h1_text:
            # Show the (truncated) H1 text as a quote block.
            children.append({
                "object": "block",
                "type": "quote",
                "quote": {"rich_text": [{"type": "text", "text": {"content": metadata.h1_text[:200]}}]}
            })

        # Schema Data Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Structured Data"}}]}
        })

        if metadata.schema_types_found:
            children.append({
                "object": "block",
                "type": "paragraph",
                "paragraph": {"rich_text": [
                    {"type": "text", "text": {"content": "Schema types found: "}},
                    {"type": "text", "text": {"content": ", ".join(metadata.schema_types_found)}, "annotations": {"code": True}},
                ]}
            })
        else:
            # Missing structured data is surfaced as a warning callout.
            children.append({
                "object": "block",
                "type": "callout",
                "callout": {
                    "rich_text": [{"type": "text", "text": {"content": "No structured data found on this page"}}],
                    "icon": {"type": "emoji", "emoji": "⚠️"},
                    "color": "yellow_background",
                }
            })

        # Open Graph Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Open Graph"}}]}
        })

        og = metadata.open_graph
        # og:title presence is used as the proxy for "OG configured at all".
        og_status = "✓ Configured" if og.og_title else "✗ Missing"
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Status: {og_status}\n"}},
                {"type": "text", "text": {"content": f"og:title: {og.og_title or '—'}\n"}},
                {"type": "text", "text": {"content": f"og:type: {og.og_type or '—'}"}},
            ]}
        })

        # Links Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Links"}}]}
        })

        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Internal links: {metadata.internal_link_count}\n"}},
                {"type": "text", "text": {"content": f"External links: {metadata.external_link_count}"}},
            ]}
        })

        # Images Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Images"}}]}
        })

        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Total: {metadata.images_total} | "}},
                {"type": "text", "text": {"content": f"With alt: {metadata.images_with_alt} | "}},
                {"type": "text", "text": {"content": f"Without alt: {metadata.images_without_alt}"}},
            ]}
        })

        # Hreflang Section (if present)
        if metadata.hreflang_tags:
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Hreflang Tags"}}]}
            })

            # Cap at 10 entries to keep the page (and API payload) small.
            for tag in metadata.hreflang_tags[:10]:
                children.append({
                    "object": "block",
                    "type": "bulleted_list_item",
                    "bulleted_list_item": {"rich_text": [
                        {"type": "text", "text": {"content": f"{tag['lang']}: "}},
                        {"type": "text", "text": {"content": tag['url'], "link": {"url": tag['url']}}},
                    ]}
                })

        # Issues & Warnings Section — rendered as unchecked to-dos so they
        # can be ticked off in Notion as they are fixed.
        if metadata.issues or metadata.warnings:
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Issues & Warnings"}}]}
            })

            for issue in metadata.issues:
                children.append({
                    "object": "block",
                    "type": "to_do",
                    "to_do": {
                        "rich_text": [
                            {"type": "text", "text": {"content": "❌ "}, "annotations": {"bold": True}},
                            {"type": "text", "text": {"content": issue}},
                        ],
                        "checked": False,
                    }
                })

            for warning in metadata.warnings:
                children.append({
                    "object": "block",
                    "type": "to_do",
                    "to_do": {
                        "rich_text": [
                            {"type": "text", "text": {"content": "⚠️ "}, "annotations": {"bold": True}},
                            {"type": "text", "text": {"content": warning}},
                        ],
                        "checked": False,
                    }
                })

        return children

    def _create_crawl_summary_page(self, result: CrawlResult) -> str | None:
        """Create a summary page for the crawl.

        Writes one Notion page with aggregate statistics for the whole run.
        Returns the page ID, or None on failure (errors are logged, not raised).
        """
        try:
            site_domain = urlparse(result.site).netloc

            # Calculate statistics across all analyzed pages
            total_issues = sum(len(p.issues) for p in result.pages_analyzed)
            total_warnings = sum(len(p.warnings) for p in result.pages_analyzed)
            pages_with_issues = sum(1 for p in result.pages_analyzed if p.issues)
            pages_without_schema = sum(1 for p in result.pages_analyzed if not p.schema_types_found)
            pages_without_description = sum(1 for p in result.pages_analyzed if not p.meta_description)

            children = []

            # Header callout
            children.append({
                "object": "block",
                "type": "callout",
                "callout": {
                    "rich_text": [
                        {"type": "text", "text": {"content": f"Sitemap Crawl Complete\n\n"}},
                        {"type": "text", "text": {"content": f"Audit ID: {result.audit_id}\n"}},
                        {"type": "text", "text": {"content": f"Duration: {result.get_duration()}\n"}},
                        {"type": "text", "text": {"content": f"Pages: {result.successful_pages}/{result.total_pages}"}},
                    ],
                    "icon": {"type": "emoji", "emoji": "📊"},
                    "color": "blue_background",
                }
            })

            # Statistics table
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Statistics"}}]}
            })

            stats_rows = [
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Metric"}, "annotations": {"bold": True}}],
                    [{"type": "text", "text": {"content": "Count"}, "annotations": {"bold": True}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Pages"}}],
                    [{"type": "text", "text": {"content": str(result.total_pages)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Successfully Analyzed"}}],
                    [{"type": "text", "text": {"content": str(result.successful_pages)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages with Issues"}}],
                    [{"type": "text", "text": {"content": str(pages_with_issues)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Issues"}}],
                    [{"type": "text", "text": {"content": str(total_issues)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Warnings"}}],
                    [{"type": "text", "text": {"content": str(total_warnings)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages without Schema"}}],
                    [{"type": "text", "text": {"content": str(pages_without_schema)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages without Description"}}],
                    [{"type": "text", "text": {"content": str(pages_without_description)}}],
                ]}},
            ]

            children.append({
                "object": "block",
                "type": "table",
                "table": {
                    "table_width": 2,
                    "has_column_header": True,
                    "has_row_header": False,
                    "children": stats_rows
                }
            })

            # Pages list — individual page entries live in the same database;
            # point the reader at the Audit ID filter instead of duplicating them.
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Analyzed Pages"}}]}
            })

            children.append({
                "object": "block",
                "type": "paragraph",
                "paragraph": {"rich_text": [
                    {"type": "text", "text": {"content": f"Filter by Audit ID in the database to see all {result.successful_pages} page entries."}}
                ]}
            })

            # Create the summary page
            response = self.notion.pages.create(
                parent={"database_id": self.database_id},
                properties={
                    "Issue": {"title": [{"text": {"content": f"📊 Sitemap Crawl: {site_domain}"}}]},
                    "Category": {"select": {"name": "Technical SEO"}},
                    "Priority": {"select": {"name": "High"}},
                    "Site": {"url": result.site},
                    "Audit ID": {"rich_text": [{"text": {"content": result.audit_id}}]},
                    "Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
                },
                children=children,
            )

            logger.info(f"Created crawl summary page: {response['id']}")
            return response["id"]

        except Exception as e:
            logger.error(f"Failed to create summary page: {e}")
            return None
|
||||
|
||||
|
||||
def print_progress_status(progress: CrawlProgress) -> None:
    """Print a human-readable status report for one crawl job."""
    icons = {
        "running": "🔄",
        "completed": "✅",
        "failed": "❌",
    }
    status_emoji = icons.get(progress.status, "❓")

    # Pre-compute the variable pieces so the report template stays readable.
    eta_display = progress.get_eta() if progress.status == 'running' else 'N/A'
    if len(progress.current_url) > 60:
        url_display = progress.current_url[:60] + '...'
    else:
        url_display = progress.current_url
    divider = '=' * 60

    print(f"""
{divider}
{status_emoji} SEO Page Analysis - {progress.status.upper()}
{divider}
Audit ID: {progress.audit_id}
Site: {progress.site}
Status: {progress.status}

Progress: {progress.processed_urls}/{progress.total_urls} pages ({progress.get_progress_percent():.1f}%)
Successful: {progress.successful_urls}
Failed: {progress.failed_urls}
Elapsed: {progress.get_elapsed_time()}
ETA: {eta_display}

Current URL: {url_display}
""")

    if progress.summary_page_id:
        # Notion page URLs use the ID without dashes.
        print(f"Summary: https://www.notion.so/{progress.summary_page_id.replace('-', '')}")

    if progress.error_message:
        print(f"Error: {progress.error_message}")

    print("=" * 60)
|
||||
|
||||
|
||||
def main():
    """CLI entry point for the sitemap crawler.

    Subcommands:
        crawl   Start crawling a sitemap (a bare URL argument also works).
        status  Show progress for one audit ID, or all active crawls.
        list    List all known crawl jobs (most recent 20 shown).

    Reads arguments from ``sys.argv``; prints human-readable progress and
    results to stdout. Returns None.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Sitemap Crawler with Background Support")
    subparsers = parser.add_subparsers(dest="command", help="Commands")

    # Crawl command
    crawl_parser = subparsers.add_parser("crawl", help="Start crawling a sitemap")
    crawl_parser.add_argument("sitemap_url", help="URL of the sitemap to crawl")
    crawl_parser.add_argument("--delay", "-d", type=float, default=DEFAULT_DELAY_SECONDS,
                              help=f"Delay between requests in seconds (default: {DEFAULT_DELAY_SECONDS})")
    crawl_parser.add_argument("--max-pages", "-m", type=int, default=DEFAULT_MAX_PAGES,
                              help=f"Maximum pages to process (default: {DEFAULT_MAX_PAGES})")
    crawl_parser.add_argument("--no-notion", action="store_true",
                              help="Don't save to Notion")
    crawl_parser.add_argument("--no-limit", action="store_true",
                              help="Remove page limit (use with caution)")

    # Status command
    status_parser = subparsers.add_parser("status", help="Check crawl progress")
    status_parser.add_argument("audit_id", nargs="?", help="Specific audit ID to check (optional)")
    status_parser.add_argument("--all", "-a", action="store_true", help="Show all crawls (not just active)")

    # List command (takes no extra arguments, so the parser object is not kept)
    subparsers.add_parser("list", help="List all crawl jobs")

    args = parser.parse_args()

    # Convenience: a bare URL/xml-path argument is treated as an implicit
    # "crawl" command so `script.py https://site/sitemap.xml` just works.
    if args.command is None:
        if len(sys.argv) > 1 and (sys.argv[1].startswith("http") or sys.argv[1].endswith(".xml")):
            args.command = "crawl"
            args.sitemap_url = sys.argv[1]
            args.delay = DEFAULT_DELAY_SECONDS
            args.max_pages = DEFAULT_MAX_PAGES
            args.no_notion = False
            args.no_limit = False
        else:
            parser.print_help()
            return

    if args.command == "status":
        if args.audit_id:
            # Show one specific crawl's status.
            progress = get_crawl_status(args.audit_id)
            if progress:
                print_progress_status(progress)
            else:
                print(f"No crawl found with audit ID: {args.audit_id}")
        else:
            # Show active crawls by default; --all widens to every crawl.
            if args.all:
                crawls = get_all_crawls()
                label = "All"
            else:
                crawls = get_active_crawls()
                label = "Active"

            if crawls:
                print(f"\n{label} Crawl Jobs ({len(crawls)}):")
                print("-" * 60)
                for p in crawls:
                    status_emoji = {"running": "🔄", "completed": "✅", "failed": "❌"}.get(p.status, "❓")
                    print(f"{status_emoji} {p.audit_id}")
                    print(f"   Site: {p.site}")
                    print(f"   Progress: {p.processed_urls}/{p.total_urls} ({p.get_progress_percent():.1f}%)")
                    print()
            else:
                print(f"No {label.lower()} crawl jobs found.")
        return

    if args.command == "list":
        crawls = get_all_crawls()
        if crawls:
            print(f"\nAll Crawl Jobs ({len(crawls)}):")
            print("-" * 80)
            print(f"{'Status':<10} {'Audit ID':<45} {'Progress':<15}")
            print("-" * 80)
            for p in crawls[:20]:  # Show last 20
                status_emoji = {"running": "🔄", "completed": "✅", "failed": "❌"}.get(p.status, "❓")
                progress_str = f"{p.processed_urls}/{p.total_urls}"
                print(f"{status_emoji} {p.status:<7} {p.audit_id:<45} {progress_str:<15}")
            if len(crawls) > 20:
                print(f"... and {len(crawls) - 20} more")
        else:
            print("No crawl jobs found.")
        return

    if args.command == "crawl":
        # Handle --no-limit option
        max_pages = args.max_pages
        if args.no_limit:
            max_pages = 999999  # Effectively unlimited
            print("⚠️  WARNING: Page limit disabled. This may take a very long time!")

        def progress_callback(progress: CrawlProgress):
            # Single-line, carriage-return progress display with ETA.
            pct = progress.get_progress_percent()
            print(f"\r[{pct:5.1f}%] {progress.processed_urls}/{progress.total_urls} pages | "
                  f"Success: {progress.successful_urls} | Failed: {progress.failed_urls} | "
                  f"ETA: {progress.get_eta()}", end="", flush=True)

        crawler = SitemapCrawler()
        result = crawler.crawl_sitemap(
            args.sitemap_url,
            delay=args.delay,
            max_pages=max_pages,
            progress_callback=progress_callback,
            save_to_notion=not args.no_notion,
        )

        print()  # New line after progress
        print()
        print("=" * 60)
        print("CRAWL COMPLETE")
        print("=" * 60)
        print(f"Audit ID: {result.audit_id}")
        print(f"Total Pages: {result.total_pages}")
        print(f"Successful: {result.successful_pages}")
        print(f"Failed: {result.failed_pages}")
        print(f"Duration: {result.get_duration()}")
        if result.summary_page_id:
            print(f"Summary Page: https://www.notion.so/{result.summary_page_id.replace('-', '')}")
||||
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user