refactor(skills): Restructure skills to dual-platform architecture

Major refactoring of ourdigital-custom-skills with new numbering system:

## Structure Changes
- Each skill now has code/ (Claude Code) and desktop/ (Claude Desktop) versions
- New progressive numbering: 01-09 General, 10-19 SEO, 20-29 GTM, 30-39 OurDigital, 40-49 Jamie

## Skill Reorganization
- 01-notion-organizer (from 02)
- 10-18: SEO tools split into focused skills (technical, on-page, local, schema, vitals, gsc, gateway)
- 20-21: GTM audit and manager
- 30-32: OurDigital designer, research, presentation
- 40-41: Jamie brand editor and audit

## New Files
- .claude/commands/: Slash command definitions for all skills
- CLAUDE.md: Updated with new skill structure documentation
- REFACTORING_PLAN.md: Migration documentation
- COMPATIBILITY_REPORT.md, SKILLS_COMPARISON.md: Analysis docs

## Removed
- Old skill directories (02-05, 10-14, 20-21 old numbering)
- Consolidated into new structure with _archive/ for reference

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-22 01:58:24 +09:00
parent 214247ace2
commit eea49f9f8c
251 changed files with 12308 additions and 102 deletions

View File

@@ -0,0 +1,207 @@
"""
Base Client - Shared async client utilities
===========================================
Purpose: Rate-limited async operations for API clients
Python: 3.10+
"""
import asyncio
import logging
import os
from asyncio import Semaphore
from datetime import datetime
from typing import Any, Callable, TypeVar
from dotenv import load_dotenv
from tenacity import (
retry,
stop_after_attempt,
wait_exponential,
retry_if_exception_type,
)
# Load environment variables from a local .env file, if present.
load_dotenv()

# Configure logging at import time.
# NOTE(review): basicConfig here mutates the embedding application's root
# logger as a side effect of importing this module — confirm this is intended.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Generic type variable for typed helpers.
# NOTE(review): T appears unused in the visible module — confirm before removing.
T = TypeVar("T")
class RateLimiter:
    """Async token-bucket rate limiter.

    Tokens refill continuously at ``rate / per`` tokens per second, capped
    at ``rate``. Each ``acquire()`` consumes one token, sleeping when the
    bucket is empty.
    """

    def __init__(self, rate: float, per: float = 1.0):
        """
        Initialize rate limiter.

        Args:
            rate: Number of requests allowed
            per: Time period in seconds (default: 1 second)
        """
        self.rate = rate
        self.per = per
        self.tokens = rate  # bucket starts full
        self.last_update = datetime.now()
        self._lock = asyncio.Lock()

    async def acquire(self) -> None:
        """Consume one token, sleeping until one has accrued if necessary."""
        async with self._lock:
            current = datetime.now()
            delta = (current - self.last_update).total_seconds()
            # Refill proportionally to elapsed time, never above capacity.
            self.tokens = min(self.rate, self.tokens + delta * (self.rate / self.per))
            self.last_update = current
            if self.tokens >= 1:
                self.tokens -= 1
                return
            # Not enough budget: sleep exactly long enough for one token to
            # accrue, then start the next window empty. The lock is held
            # while sleeping, which serializes waiting acquirers.
            delay = (1 - self.tokens) * (self.per / self.rate)
            await asyncio.sleep(delay)
            self.tokens = 0
class BaseAsyncClient:
    """Base class for async API clients with rate limiting.

    Combines a concurrency cap (semaphore), a token-bucket rate limiter and
    tenacity-driven retries. Subclasses wrap their API calls in
    ``_rate_limited_request`` or fan out with ``batch_requests``.
    """

    def __init__(
        self,
        max_concurrent: int = 5,
        requests_per_second: float = 3.0,
        logger: logging.Logger | None = None,
    ):
        """
        Initialize base client.

        Args:
            max_concurrent: Maximum concurrent requests
            requests_per_second: Rate limit
            logger: Logger instance
        """
        self.semaphore = Semaphore(max_concurrent)
        self.rate_limiter = RateLimiter(requests_per_second)
        self.logger = logger or logging.getLogger(self.__class__.__name__)
        # NOTE(review): "retries" is never incremented in this module —
        # confirm whether subclasses update it or it is vestigial.
        self.stats = {
            "requests": 0,
            "success": 0,
            "errors": 0,
            "retries": 0,
        }

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        retry=retry_if_exception_type(Exception),
    )
    async def _rate_limited_request(
        self,
        coro: Callable[[], Any],
    ) -> Any:
        """Run one request under semaphore + rate limit; tenacity retries
        any exception up to 3 attempts (each attempt re-counts in stats)."""
        async with self.semaphore:
            await self.rate_limiter.acquire()
            self.stats["requests"] += 1
            try:
                outcome = await coro()
            except Exception as e:
                self.stats["errors"] += 1
                self.logger.error(f"Request failed: {e}")
                raise
            self.stats["success"] += 1
            return outcome

    async def batch_requests(
        self,
        requests: list[Callable[[], Any]],
        desc: str = "Processing",
    ) -> list[Any]:
        """Execute multiple requests concurrently.

        Failures (after retries are exhausted) are returned in-place as
        ``{"error": str(e)}`` rather than raised. Shows a tqdm progress bar
        when tqdm is installed; results then arrive in completion order.
        """
        try:
            from tqdm.asyncio import tqdm
        except ImportError:
            progress_available = False
        else:
            progress_available = True

        async def run_one(factory: Callable) -> Any:
            try:
                return await self._rate_limited_request(factory)
            except Exception as e:
                return {"error": str(e)}

        pending = [run_one(factory) for factory in requests]
        if not progress_available:
            # run_one already swallows exceptions; return_exceptions is a
            # defensive extra.
            return await asyncio.gather(*pending, return_exceptions=True)

        collected = []
        for fut in tqdm.as_completed(pending, total=len(pending), desc=desc):
            collected.append(await fut)
        return collected

    def print_stats(self) -> None:
        """Print request statistics."""
        self.logger.info("=" * 40)
        self.logger.info("Request Statistics:")
        self.logger.info(f" Total Requests: {self.stats['requests']}")
        self.logger.info(f" Successful: {self.stats['success']}")
        self.logger.info(f" Errors: {self.stats['errors']}")
        self.logger.info("=" * 40)
class ConfigManager:
"""Manage API configuration and credentials."""
def __init__(self):
load_dotenv()
@property
def google_credentials_path(self) -> str | None:
"""Get Google service account credentials path."""
# Prefer SEO-specific credentials, fallback to general credentials
seo_creds = os.path.expanduser("~/.credential/ourdigital-seo-agent.json")
if os.path.exists(seo_creds):
return seo_creds
return os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
@property
def pagespeed_api_key(self) -> str | None:
"""Get PageSpeed Insights API key."""
return os.getenv("PAGESPEED_API_KEY")
@property
def custom_search_api_key(self) -> str | None:
"""Get Custom Search API key."""
return os.getenv("CUSTOM_SEARCH_API_KEY")
@property
def custom_search_engine_id(self) -> str | None:
"""Get Custom Search Engine ID."""
return os.getenv("CUSTOM_SEARCH_ENGINE_ID")
@property
def notion_token(self) -> str | None:
"""Get Notion API token."""
return os.getenv("NOTION_TOKEN") or os.getenv("NOTION_API_KEY")
def validate_google_credentials(self) -> bool:
"""Validate Google credentials are configured."""
creds_path = self.google_credentials_path
if not creds_path:
return False
return os.path.exists(creds_path)
def get_required(self, key: str) -> str:
"""Get required environment variable or raise error."""
value = os.getenv(key)
if not value:
raise ValueError(f"Missing required environment variable: {key}")
return value
# Singleton config instance shared across the package
# (imported elsewhere as: from base_client import config).
config = ConfigManager()

View File

@@ -0,0 +1,497 @@
"""
Full SEO Audit - Orchestration Script
=====================================
Purpose: Run comprehensive SEO audit combining all tools
Python: 3.10+
Usage:
python full_audit.py --url https://example.com --output notion --notion-page-id abc123
"""
import argparse
import json
import logging
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
from urllib.parse import urlparse
from robots_checker import RobotsChecker
from schema_validator import SchemaValidator
from sitemap_validator import SitemapValidator
from pagespeed_client import PageSpeedClient
from notion_reporter import NotionReporter, SEOFinding
# Module-level logging setup.
# NOTE(review): basicConfig at import time configures the process-wide root
# logger — confirm this is intended when this module is used as a library.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
@dataclass
class AuditResult:
    """Aggregated output of one complete SEO audit run."""

    url: str
    # ISO-8601 capture time, frozen at instantiation.
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
    # One dict per tool; a failed check stores {"error": ...} instead.
    robots: dict = field(default_factory=dict)
    sitemap: dict = field(default_factory=dict)
    schema: dict = field(default_factory=dict)
    performance: dict = field(default_factory=dict)
    findings: list[SEOFinding] = field(default_factory=list)
    summary: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serializable snapshot; findings are reduced to a count."""
        payload = {
            name: getattr(self, name)
            for name in (
                "url",
                "timestamp",
                "robots",
                "sitemap",
                "schema",
                "performance",
                "summary",
            )
        }
        payload["findings_count"] = len(self.findings)
        return payload
class SEOAuditor:
    """Orchestrate comprehensive SEO audit.

    Runs robots.txt analysis, sitemap validation, schema markup validation
    and PageSpeed performance analysis, converting each tool's issues into
    prioritized SEOFinding records on a single AuditResult.
    """

    # Explicit urgency order for report sorting; a plain alphabetical sort
    # would place "Low" before "Medium".
    _PRIORITY_RANK = {"Critical": 0, "High": 1, "Medium": 2, "Low": 3}

    def __init__(self):
        self.robots_checker = RobotsChecker()
        self.sitemap_validator = SitemapValidator()
        self.schema_validator = SchemaValidator()
        self.pagespeed_client = PageSpeedClient()

    def run_audit(
        self,
        url: str,
        include_robots: bool = True,
        include_sitemap: bool = True,
        include_schema: bool = True,
        include_performance: bool = True,
    ) -> AuditResult:
        """
        Run comprehensive SEO audit.

        Args:
            url: URL to audit
            include_robots: Check robots.txt
            include_sitemap: Validate sitemap
            include_schema: Validate schema markup
            include_performance: Run PageSpeed analysis

        Returns:
            AuditResult with per-check sections, findings and a summary.
            A failing check is recorded as {"error": ...} in its own section
            rather than aborting the whole audit.
        """
        result = AuditResult(url=url)
        parsed_url = urlparse(url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        logger.info(f"Starting SEO audit for {url}")

        # 1. Robots.txt analysis
        if include_robots:
            logger.info("Analyzing robots.txt...")
            try:
                robots_result = self.robots_checker.analyze(base_url)
                result.robots = robots_result.to_dict()
                self._process_robots_findings(robots_result, result)
            except Exception as e:
                logger.error(f"Robots.txt analysis failed: {e}")
                result.robots = {"error": str(e)}

        # 2. Sitemap validation
        if include_sitemap:
            logger.info("Validating sitemap...")
            sitemap_url = f"{base_url}/sitemap.xml"
            # Prefer a sitemap declared in robots.txt over the conventional path
            if result.robots.get("sitemaps"):
                sitemap_url = result.robots["sitemaps"][0]
            try:
                sitemap_result = self.sitemap_validator.validate(sitemap_url)
                result.sitemap = sitemap_result.to_dict()
                self._process_sitemap_findings(sitemap_result, result)
            except Exception as e:
                logger.error(f"Sitemap validation failed: {e}")
                result.sitemap = {"error": str(e)}

        # 3. Schema validation
        if include_schema:
            logger.info("Validating schema markup...")
            try:
                schema_result = self.schema_validator.validate(url=url)
                result.schema = schema_result.to_dict()
                self._process_schema_findings(schema_result, result)
            except Exception as e:
                logger.error(f"Schema validation failed: {e}")
                result.schema = {"error": str(e)}

        # 4. PageSpeed analysis
        if include_performance:
            logger.info("Running PageSpeed analysis...")
            try:
                perf_result = self.pagespeed_client.analyze(url, strategy="mobile")
                result.performance = perf_result.to_dict()
                self._process_performance_findings(perf_result, result)
            except Exception as e:
                logger.error(f"PageSpeed analysis failed: {e}")
                result.performance = {"error": str(e)}

        # Generate summary
        result.summary = self._generate_summary(result)
        logger.info(f"Audit complete. Found {len(result.findings)} issues.")
        return result

    def _process_robots_findings(self, robots_result, audit_result: AuditResult):
        """Convert robots.txt issues to findings (error→Critical, warning→High)."""
        for issue in robots_result.issues:
            priority = "Medium"
            if issue.severity == "error":
                priority = "Critical"
            elif issue.severity == "warning":
                priority = "High"
            audit_result.findings.append(SEOFinding(
                issue=issue.message,
                category="Robots.txt",
                priority=priority,
                description=issue.directive or "",
                recommendation=issue.suggestion or "",
            ))

    def _process_sitemap_findings(self, sitemap_result, audit_result: AuditResult):
        """Convert sitemap issues to findings (error→High, warning→Medium)."""
        for issue in sitemap_result.issues:
            priority = "Medium"
            if issue.severity == "error":
                priority = "High"
            elif issue.severity == "warning":
                priority = "Medium"
            audit_result.findings.append(SEOFinding(
                issue=issue.message,
                category="Sitemap",
                priority=priority,
                url=issue.url,
                recommendation=issue.suggestion or "",
            ))

    def _process_schema_findings(self, schema_result, audit_result: AuditResult):
        """Convert schema issues to findings (error→High, warning→Medium, else Low)."""
        for issue in schema_result.issues:
            priority = "Low"
            if issue.severity == "error":
                priority = "High"
            elif issue.severity == "warning":
                priority = "Medium"
            audit_result.findings.append(SEOFinding(
                issue=issue.message,
                category="Schema/Structured Data",
                priority=priority,
                description=f"Schema type: {issue.schema_type}" if issue.schema_type else "",
                recommendation=issue.suggestion or "",
            ))

    def _process_performance_findings(self, perf_result, audit_result: AuditResult):
        """Convert Core Web Vitals ratings, the overall score and the top
        PageSpeed opportunities into findings."""
        cwv = perf_result.core_web_vitals
        # Check Core Web Vitals
        if cwv.lcp_rating == "POOR":
            audit_result.findings.append(SEOFinding(
                issue=f"Poor LCP: {cwv.lcp / 1000:.2f}s (should be < 2.5s)",
                category="Performance",
                priority="Critical",
                impact="Users experience slow page loads, affecting bounce rate and rankings",
                recommendation="Optimize images, reduce server response time, use CDN",
            ))
        elif cwv.lcp_rating == "NEEDS_IMPROVEMENT":
            audit_result.findings.append(SEOFinding(
                issue=f"LCP needs improvement: {cwv.lcp / 1000:.2f}s (target < 2.5s)",
                category="Performance",
                priority="High",
                recommendation="Optimize largest content element loading",
            ))
        if cwv.cls_rating == "POOR":
            audit_result.findings.append(SEOFinding(
                issue=f"Poor CLS: {cwv.cls:.3f} (should be < 0.1)",
                category="Performance",
                priority="High",
                impact="Layout shifts frustrate users",
                recommendation="Set dimensions for images/embeds, avoid inserting content above existing content",
            ))
        if cwv.fid_rating == "POOR":
            audit_result.findings.append(SEOFinding(
                issue=f"Poor FID/TBT: {cwv.fid:.0f}ms (should be < 100ms)",
                category="Performance",
                priority="High",
                impact="Slow interactivity affects user experience",
                recommendation="Reduce JavaScript execution time, break up long tasks",
            ))
        # Check performance score
        if perf_result.performance_score and perf_result.performance_score < 50:
            audit_result.findings.append(SEOFinding(
                issue=f"Low performance score: {perf_result.performance_score:.0f}/100",
                category="Performance",
                priority="High",
                impact="Poor performance affects user experience and SEO",
                recommendation="Address top opportunities from PageSpeed Insights",
            ))
        # Add top opportunities as findings
        for opp in perf_result.opportunities[:3]:
            if opp["savings_ms"] > 500:  # Only significant savings
                audit_result.findings.append(SEOFinding(
                    issue=opp["title"],
                    category="Performance",
                    priority="Medium",
                    description=opp.get("description", ""),
                    impact=f"Potential savings: {opp['savings_ms'] / 1000:.1f}s",
                    recommendation="See PageSpeed Insights for details",
                ))

    def _generate_summary(self, result: AuditResult) -> dict:
        """Generate audit summary with counts, status flags, quick wins and
        critical issues."""
        findings_by_priority = {}
        findings_by_category = {}
        for finding in result.findings:
            # Count by priority
            findings_by_priority[finding.priority] = (
                findings_by_priority.get(finding.priority, 0) + 1
            )
            # Count by category
            findings_by_category[finding.category] = (
                findings_by_category.get(finding.category, 0) + 1
            )
        return {
            "total_findings": len(result.findings),
            "findings_by_priority": findings_by_priority,
            "findings_by_category": findings_by_category,
            "robots_accessible": result.robots.get("accessible", False),
            "sitemap_valid": result.sitemap.get("valid", False),
            "schema_valid": result.schema.get("valid", False),
            "performance_score": result.performance.get("scores", {}).get("performance"),
            "quick_wins": [
                f.issue for f in result.findings
                if f.priority in ("Medium", "Low")
            ][:5],
            "critical_issues": [
                f.issue for f in result.findings
                if f.priority == "Critical"
            ],
        }

    def export_to_notion(
        self,
        result: AuditResult,
        parent_page_id: str | None = None,
        use_default_db: bool = True,
    ) -> dict:
        """
        Export audit results to Notion.

        Args:
            result: AuditResult object
            parent_page_id: Parent page ID (for creating new database)
            use_default_db: If True, use OurDigital SEO Audit Log database

        Returns:
            Dict with database_id, summary_page_id, audit_id and
            findings_created. summary_page_id is None when writing to the
            default database (no summary page is created in that mode), so
            callers can always read the key.
        """
        reporter = NotionReporter()
        audit_id = f"{urlparse(result.url).netloc}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
        # Stamp every finding with the audited site and this session's id
        for finding in result.findings:
            finding.site = result.url
            finding.audit_id = audit_id

        if use_default_db:
            # Use the default OurDigital SEO Audit Log database
            page_ids = reporter.add_findings_batch(result.findings)
            return {
                # getattr keeps working whether or not the reporter exposes
                # DEFAULT_DATABASE_ID as an attribute.
                "database_id": getattr(
                    reporter,
                    "DEFAULT_DATABASE_ID",
                    "2c8581e5-8a1e-8035-880b-e38cefc2f3ef",
                ),
                "audit_id": audit_id,
                # Always present; no summary page exists in this mode.
                "summary_page_id": None,
                "findings_created": len(page_ids),
            }

        # Create new database under parent page
        if not parent_page_id:
            raise ValueError("parent_page_id required when not using default database")
        db_title = f"SEO Audit - {urlparse(result.url).netloc} - {datetime.now().strftime('%Y-%m-%d')}"
        database_id = reporter.create_findings_database(parent_page_id, db_title)
        page_ids = reporter.add_findings_batch(result.findings, database_id)
        # Create summary page
        summary_page_id = reporter.create_audit_summary_page(
            parent_page_id,
            result.url,
            result.summary,
        )
        return {
            "database_id": database_id,
            "summary_page_id": summary_page_id,
            "audit_id": audit_id,
            "findings_created": len(page_ids),
        }

    def generate_report(self, result: AuditResult) -> str:
        """Generate human-readable report."""
        lines = [
            "=" * 70,
            "SEO AUDIT REPORT",
            "=" * 70,
            f"URL: {result.url}",
            f"Date: {result.timestamp}",
            "",
            "-" * 70,
            "SUMMARY",
            "-" * 70,
            f"Total Issues Found: {result.summary.get('total_findings', 0)}",
            "",
        ]
        # Priority breakdown
        lines.append("Issues by Priority:")
        for priority in ["Critical", "High", "Medium", "Low"]:
            count = result.summary.get("findings_by_priority", {}).get(priority, 0)
            if count:
                lines.append(f" {priority}: {count}")
        lines.append("")
        # Category breakdown
        lines.append("Issues by Category:")
        for category, count in result.summary.get("findings_by_category", {}).items():
            lines.append(f" {category}: {count}")
        lines.append("")
        lines.append("-" * 70)
        lines.append("STATUS OVERVIEW")
        lines.append("-" * 70)
        # Status checks
        lines.append(f"Robots.txt: {'✓ Accessible' if result.robots.get('accessible') else '✗ Not accessible'}")
        lines.append(f"Sitemap: {'✓ Valid' if result.sitemap.get('valid') else '✗ Issues found'}")
        lines.append(f"Schema: {'✓ Valid' if result.schema.get('valid') else '✗ Issues found'}")
        perf_score = result.performance.get("scores", {}).get("performance")
        # "is not None" so a legitimate score of 0 is still reported.
        if perf_score is not None:
            status = "✓ Good" if perf_score >= 90 else "⚠ Needs work" if perf_score >= 50 else "✗ Poor"
            lines.append(f"Performance: {status} ({perf_score:.0f}/100)")
        # Critical issues
        critical = result.summary.get("critical_issues", [])
        if critical:
            lines.extend([
                "",
                "-" * 70,
                "CRITICAL ISSUES (Fix Immediately)",
                "-" * 70,
            ])
            for issue in critical:
                lines.append(f"{issue}")
        # Quick wins
        quick_wins = result.summary.get("quick_wins", [])
        if quick_wins:
            lines.extend([
                "",
                "-" * 70,
                "QUICK WINS",
                "-" * 70,
            ])
            for issue in quick_wins[:5]:
                lines.append(f"{issue}")
        # All findings, grouped by category and ordered by urgency
        if result.findings:
            lines.extend([
                "",
                "-" * 70,
                "ALL FINDINGS",
                "-" * 70,
            ])
            current_category = None
            # Rank priorities explicitly (Critical→Low); an alphabetical sort
            # would order Low before Medium.
            for finding in sorted(
                result.findings,
                key=lambda f: (f.category, self._PRIORITY_RANK.get(f.priority, 99)),
            ):
                if finding.category != current_category:
                    current_category = finding.category
                    lines.append(f"\n[{current_category}]")
                lines.append(f" [{finding.priority}] {finding.issue}")
                if finding.recommendation:
                    lines.append(f"{finding.recommendation}")
        lines.extend(["", "=" * 70])
        return "\n".join(lines)
def main():
    """CLI entry point: parse arguments, run the audit, emit the chosen output."""
    parser = argparse.ArgumentParser(
        description="Run comprehensive SEO audit",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run full audit and output to console
  python full_audit.py --url https://example.com
  # Export to Notion
  python full_audit.py --url https://example.com --output notion --notion-page-id abc123
  # Output as JSON
  python full_audit.py --url https://example.com --json
""",
    )
    parser.add_argument("--url", "-u", required=True, help="URL to audit")
    parser.add_argument("--output", "-o", choices=["console", "notion", "json"],
                        default="console", help="Output format")
    parser.add_argument("--notion-page-id", help="Notion parent page ID (required for notion output)")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument("--no-robots", action="store_true", help="Skip robots.txt check")
    parser.add_argument("--no-sitemap", action="store_true", help="Skip sitemap validation")
    parser.add_argument("--no-schema", action="store_true", help="Skip schema validation")
    parser.add_argument("--no-performance", action="store_true", help="Skip PageSpeed analysis")
    args = parser.parse_args()

    auditor = SEOAuditor()
    # Run audit with the checks the user did not explicitly disable
    result = auditor.run_audit(
        args.url,
        include_robots=not args.no_robots,
        include_sitemap=not args.no_sitemap,
        include_schema=not args.no_schema,
        include_performance=not args.no_performance,
    )

    # Output results
    if args.json or args.output == "json":
        print(json.dumps(result.to_dict(), indent=2, default=str))
    elif args.output == "notion":
        if not args.notion_page_id:
            parser.error("--notion-page-id required for notion output")
        notion_result = auditor.export_to_notion(result, args.notion_page_id)
        print("Exported to Notion:")
        print(f" Database ID: {notion_result['database_id']}")
        # .get(): the default-database export path may not create a summary
        # page, so the key can be absent or None.
        print(f" Summary Page: {notion_result.get('summary_page_id') or 'N/A'}")
        print(f" Findings Created: {notion_result['findings_created']}")
    else:
        print(auditor.generate_report(result))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,409 @@
"""
Google Search Console Client
============================
Purpose: Interact with Google Search Console API for SEO data
Python: 3.10+
Usage:
from gsc_client import SearchConsoleClient
client = SearchConsoleClient()
data = client.get_search_analytics("sc-domain:example.com")
"""
import logging
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import Any
from google.oauth2 import service_account
from googleapiclient.discovery import build
from base_client import config
# Module-level logging setup.
# NOTE(review): basicConfig at import time configures the process-wide root
# logger — confirm this is intended when this module is used as a library.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
@dataclass
class SearchAnalyticsResult:
    """Search-analytics query result plus aggregate statistics.

    ``rows`` holds the raw API rows; the totals/averages are computed by
    the client over those rows.
    """

    rows: list[dict] = field(default_factory=list)
    total_clicks: int = 0
    total_impressions: int = 0
    average_ctr: float = 0.0
    average_position: float = 0.0
@dataclass
class SitemapInfo:
"""Sitemap information from Search Console."""
path: str
last_submitted: str | None = None
last_downloaded: str | None = None
is_pending: bool = False
is_sitemaps_index: bool = False
warnings: int = 0
errors: int = 0
class SearchConsoleClient:
    """Client for Google Search Console API.

    Authenticates with a service-account key and exposes search analytics,
    sitemap and URL-inspection helpers on the searchconsole v1 service.
    """

    # NOTE(review): submit_sitemap performs a write; the read-only scope
    # declared here may cause it to be rejected — confirm the required scope.
    SCOPES = ["https://www.googleapis.com/auth/webmasters.readonly"]

    def __init__(self, credentials_path: str | None = None):
        """
        Initialize Search Console client.

        Args:
            credentials_path: Path to service account JSON key; defaults to
                the path discovered by the shared ConfigManager.
        """
        self.credentials_path = credentials_path or config.google_credentials_path
        self._service = None  # built lazily on first use

    @property
    def service(self):
        """Get or create the Search Console service (lazy, cached)."""
        if self._service is None:
            if not self.credentials_path:
                raise ValueError(
                    "Google credentials not configured. "
                    "Set GOOGLE_APPLICATION_CREDENTIALS environment variable."
                )
            credentials = service_account.Credentials.from_service_account_file(
                self.credentials_path,
                scopes=self.SCOPES,
            )
            self._service = build("searchconsole", "v1", credentials=credentials)
        return self._service

    def list_sites(self) -> list[dict]:
        """List all sites accessible to the service account."""
        response = self.service.sites().list().execute()
        return response.get("siteEntry", [])

    def get_search_analytics(
        self,
        site_url: str,
        start_date: str | None = None,
        end_date: str | None = None,
        dimensions: list[str] | None = None,
        row_limit: int = 25000,
        filters: list[dict] | None = None,
    ) -> SearchAnalyticsResult:
        """
        Get search analytics data.

        Args:
            site_url: Site URL (e.g., "sc-domain:example.com" or "https://example.com/")
            start_date: Start date (YYYY-MM-DD), defaults to 30 days ago
            end_date: End date (YYYY-MM-DD), defaults to yesterday
            dimensions: List of dimensions (query, page, country, device, date)
            row_limit: Maximum rows to return
            filters: Dimension filters

        Returns:
            SearchAnalyticsResult with rows and summary stats.

        Note:
            average_ctr and average_position are simple (unweighted) means
            over the returned rows, not impression-weighted aggregates.
        """
        # Default date range: last 30 days, ending yesterday (today's data
        # is typically incomplete in Search Console).
        if not end_date:
            end_date = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
        if not start_date:
            start_date = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d")
        # Default dimensions
        if dimensions is None:
            dimensions = ["query", "page"]

        request_body = {
            "startDate": start_date,
            "endDate": end_date,
            "dimensions": dimensions,
            "rowLimit": row_limit,
        }
        if filters:
            request_body["dimensionFilterGroups"] = [{"filters": filters}]

        try:
            response = self.service.searchanalytics().query(
                siteUrl=site_url,
                body=request_body,
            ).execute()
        except Exception as e:
            logger.error(f"Failed to query search analytics: {e}")
            raise

        rows = response.get("rows", [])
        # Aggregate totals across all returned rows
        total_clicks = sum(row.get("clicks", 0) for row in rows)
        total_impressions = sum(row.get("impressions", 0) for row in rows)
        total_ctr = sum(row.get("ctr", 0) for row in rows)
        total_position = sum(row.get("position", 0) for row in rows)
        avg_ctr = total_ctr / len(rows) if rows else 0
        avg_position = total_position / len(rows) if rows else 0
        return SearchAnalyticsResult(
            rows=rows,
            total_clicks=total_clicks,
            total_impressions=total_impressions,
            average_ctr=avg_ctr,
            average_position=avg_position,
        )

    def _get_top_by_dimension(
        self,
        site_url: str,
        dimension: str,
        limit: int,
        start_date: str | None,
        end_date: str | None,
    ) -> list[dict]:
        """Shared implementation for get_top_queries / get_top_pages.

        Fetches analytics for a single dimension, sorts rows by clicks
        (descending) and flattens them into dicts keyed by the dimension name.
        """
        result = self.get_search_analytics(
            site_url=site_url,
            dimensions=[dimension],
            row_limit=limit,
            start_date=start_date,
            end_date=end_date,
        )
        sorted_rows = sorted(
            result.rows,
            key=lambda x: x.get("clicks", 0),
            reverse=True,
        )
        return [
            {
                dimension: row["keys"][0],
                "clicks": row.get("clicks", 0),
                "impressions": row.get("impressions", 0),
                "ctr": row.get("ctr", 0),
                "position": row.get("position", 0),
            }
            for row in sorted_rows[:limit]
        ]

    def get_top_queries(
        self,
        site_url: str,
        limit: int = 100,
        start_date: str | None = None,
        end_date: str | None = None,
    ) -> list[dict]:
        """Get top search queries by clicks (dicts keyed by "query")."""
        return self._get_top_by_dimension(site_url, "query", limit, start_date, end_date)

    def get_top_pages(
        self,
        site_url: str,
        limit: int = 100,
        start_date: str | None = None,
        end_date: str | None = None,
    ) -> list[dict]:
        """Get top pages by clicks (dicts keyed by "page")."""
        return self._get_top_by_dimension(site_url, "page", limit, start_date, end_date)

    def get_sitemaps(self, site_url: str) -> list[SitemapInfo]:
        """Get list of sitemaps registered for a site."""
        try:
            response = self.service.sitemaps().list(siteUrl=site_url).execute()
        except Exception as e:
            logger.error(f"Failed to get sitemaps: {e}")
            raise
        sitemaps = []
        for sm in response.get("sitemap", []):
            sitemaps.append(SitemapInfo(
                path=sm.get("path", ""),
                last_submitted=sm.get("lastSubmitted"),
                last_downloaded=sm.get("lastDownloaded"),
                is_pending=sm.get("isPending", False),
                is_sitemaps_index=sm.get("isSitemapsIndex", False),
                warnings=sm.get("warnings", 0),
                errors=sm.get("errors", 0),
            ))
        return sitemaps

    def submit_sitemap(self, site_url: str, sitemap_url: str) -> bool:
        """Submit a sitemap for indexing.

        Returns:
            True on success, False on any failure (error is logged, not raised).
        """
        try:
            self.service.sitemaps().submit(
                siteUrl=site_url,
                feedpath=sitemap_url,
            ).execute()
            logger.info(f"Submitted sitemap: {sitemap_url}")
            return True
        except Exception as e:
            logger.error(f"Failed to submit sitemap: {e}")
            return False

    def inspect_url(self, site_url: str, inspection_url: str) -> dict:
        """
        Inspect a URL's indexing status.

        Note: This uses the URL Inspection API which may have different quotas.

        Returns:
            Flat dict with indexing state, crawl info, mobile usability and
            rich-results verdicts (missing fields default to "Unknown"/None).
        """
        try:
            response = self.service.urlInspection().index().inspect(
                body={
                    "inspectionUrl": inspection_url,
                    "siteUrl": site_url,
                }
            ).execute()
            result = response.get("inspectionResult", {})
            return {
                "url": inspection_url,
                "indexing_state": result.get("indexStatusResult", {}).get(
                    "coverageState", "Unknown"
                ),
                "last_crawl_time": result.get("indexStatusResult", {}).get(
                    "lastCrawlTime"
                ),
                "crawled_as": result.get("indexStatusResult", {}).get("crawledAs"),
                "robots_txt_state": result.get("indexStatusResult", {}).get(
                    "robotsTxtState"
                ),
                "mobile_usability": result.get("mobileUsabilityResult", {}).get(
                    "verdict", "Unknown"
                ),
                "rich_results": result.get("richResultsResult", {}).get(
                    "verdict", "Unknown"
                ),
            }
        except Exception as e:
            logger.error(f"Failed to inspect URL: {e}")
            raise

    def get_performance_summary(
        self,
        site_url: str,
        days: int = 30,
    ) -> dict:
        """Get a summary of search performance over the last ``days`` days.

        Combines overall totals, top queries/pages and a per-device breakdown
        into a single dict (four API calls).
        """
        end_date = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
        start_date = (datetime.now() - timedelta(days=days)).strftime("%Y-%m-%d")
        # Overall stats (no dimensions → one aggregate row)
        overall = self.get_search_analytics(
            site_url=site_url,
            dimensions=[],
            start_date=start_date,
            end_date=end_date,
        )
        # Top queries
        top_queries = self.get_top_queries(
            site_url=site_url,
            limit=10,
            start_date=start_date,
            end_date=end_date,
        )
        # Top pages
        top_pages = self.get_top_pages(
            site_url=site_url,
            limit=10,
            start_date=start_date,
            end_date=end_date,
        )
        # Per-device breakdown
        by_device = self.get_search_analytics(
            site_url=site_url,
            dimensions=["device"],
            start_date=start_date,
            end_date=end_date,
        )
        device_breakdown = {}
        for row in by_device.rows:
            device = row["keys"][0]
            device_breakdown[device] = {
                "clicks": row.get("clicks", 0),
                "impressions": row.get("impressions", 0),
                "ctr": row.get("ctr", 0),
                "position": row.get("position", 0),
            }
        return {
            "period": f"{start_date} to {end_date}",
            "total_clicks": overall.total_clicks,
            "total_impressions": overall.total_impressions,
            "average_ctr": overall.average_ctr,
            "average_position": overall.average_position,
            "top_queries": top_queries,
            "top_pages": top_pages,
            "by_device": device_breakdown,
        }
def main():
    """CLI entry point for ad-hoc Search Console queries."""
    import argparse
    import json  # hoisted: used by both "summary" and "inspect" actions

    parser = argparse.ArgumentParser(description="Google Search Console Client")
    parser.add_argument("--site", "-s", required=True, help="Site URL")
    parser.add_argument("--action", "-a", default="summary",
                        choices=["summary", "queries", "pages", "sitemaps", "inspect"],
                        help="Action to perform")
    parser.add_argument("--url", help="URL to inspect")
    parser.add_argument("--days", type=int, default=30, help="Days of data")
    args = parser.parse_args()

    client = SearchConsoleClient()
    if args.action == "summary":
        summary = client.get_performance_summary(args.site, args.days)
        print(json.dumps(summary, indent=2, default=str))
    elif args.action == "queries":
        queries = client.get_top_queries(args.site)
        for q in queries[:20]:
            print(f"{q['query']}: {q['clicks']} clicks, pos {q['position']:.1f}")
    elif args.action == "pages":
        pages = client.get_top_pages(args.site)
        for p in pages[:20]:
            print(f"{p['page']}: {p['clicks']} clicks, pos {p['position']:.1f}")
    elif args.action == "sitemaps":
        sitemaps = client.get_sitemaps(args.site)
        for sm in sitemaps:
            print(f"{sm.path}: errors={sm.errors}, warnings={sm.warnings}")
    elif args.action == "inspect":
        # Previously 'inspect' without --url fell through and printed nothing;
        # fail loudly instead.
        if not args.url:
            parser.error("--url is required for the 'inspect' action")
        result = client.inspect_url(args.site, args.url)
        print(json.dumps(result, indent=2))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,951 @@
"""
Notion Reporter - Create SEO audit findings in Notion
=====================================================
Purpose: Output SEO audit findings to Notion databases
Python: 3.10+
Usage:
from notion_reporter import NotionReporter, SEOFinding, AuditReport
reporter = NotionReporter()
# Create audit report with checklist table
report = AuditReport(site="https://example.com")
report.add_finding(SEOFinding(...))
reporter.create_audit_report(report)
"""
import json
import logging
import os
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any
from notion_client import Client
from base_client import config
# Module-level logging setup.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Template directory (sibling "templates/" folder one level above this file).
TEMPLATE_DIR = Path(__file__).parent.parent / "templates"

# Default OurDigital SEO Audit Log database
DEFAULT_DATABASE_ID = "2c8581e5-8a1e-8035-880b-e38cefc2f3ef"

# Default parent page for audit reports (OurDigital SEO Audit Log)
# NOTE(review): this id is identical to DEFAULT_DATABASE_ID — confirm the
# audit-reports parent page really shares the database's id.
DEFAULT_AUDIT_REPORTS_PAGE_ID = "2c8581e5-8a1e-8035-880b-e38cefc2f3ef"
@dataclass
class SEOFinding:
"""Represents an SEO audit finding."""
issue: str
category: str
priority: str
status: str = "To Fix"
url: str | None = None
description: str | None = None
impact: str | None = None
recommendation: str | None = None
site: str | None = None # The audited site URL
audit_id: str | None = None # Groups findings from same audit session
affected_urls: list[str] = field(default_factory=list) # List of all affected URLs
@dataclass
class AuditReport:
    """Represents a complete SEO audit report with checklist.

    Collects SEOFinding objects plus the outcome of the top-level audit
    checks; NotionReporter renders it into database pages.
    """

    site: str
    audit_id: str = field(default_factory=lambda: datetime.now().strftime("%Y%m%d-%H%M%S"))
    audit_date: datetime = field(default_factory=datetime.now)
    findings: list[SEOFinding] = field(default_factory=list)
    # Audit check results
    robots_txt_status: str = "Not checked"
    sitemap_status: str = "Not checked"
    schema_status: str = "Not checked"
    performance_status: str = "Not checked"
    # Summary statistics
    total_urls_checked: int = 0
    total_issues: int = 0

    def add_finding(self, finding: SEOFinding) -> None:
        """Attach *finding* to this report, stamping its site and audit id."""
        domain = self.site.replace('https://', '').replace('http://', '').split('/')[0]
        finding.site = self.site
        finding.audit_id = f"{domain}-{self.audit_id}"
        self.findings.append(finding)
        self.total_issues = len(self.findings)

    def get_findings_by_priority(self) -> dict[str, list[SEOFinding]]:
        """Group findings by priority; unknown priorities are dropped."""
        return {
            level: [f for f in self.findings if f.priority == level]
            for level in ("Critical", "High", "Medium", "Low")
        }

    def get_findings_by_category(self) -> dict[str, list[SEOFinding]]:
        """Group findings by their category string (insertion-ordered)."""
        grouped: dict[str, list[SEOFinding]] = {}
        for f in self.findings:
            grouped.setdefault(f.category, []).append(f)
        return grouped
class NotionReporter:
    """Create and manage SEO audit findings in Notion.

    Wraps the official Notion client to create findings databases,
    individual finding pages, and audit summary/checklist pages.
    """

    # Valid SEOFinding.category values; also used as database select options.
    CATEGORIES = [
        "Technical SEO",
        "On-page SEO",
        "Content",
        "Local SEO",
        "Performance",
        "Schema/Structured Data",
        "Sitemap",
        "Robots.txt",
    ]
    # Valid SEOFinding.priority values.
    PRIORITIES = ["Critical", "High", "Medium", "Low"]
    # Valid workflow states for a finding (enforced by update_finding_status).
    STATUSES = ["To Fix", "In Progress", "Fixed", "Monitoring"]
    # Notion select-option color per category.
    CATEGORY_COLORS = {
        "Technical SEO": "blue",
        "On-page SEO": "green",
        "Content": "purple",
        "Local SEO": "orange",
        "Performance": "red",
        "Schema/Structured Data": "yellow",
        "Sitemap": "pink",
        "Robots.txt": "gray",
    }
    # Notion select-option color per priority.
    PRIORITY_COLORS = {
        "Critical": "red",
        "High": "orange",
        "Medium": "yellow",
        "Low": "gray",
    }
def __init__(self, token: str | None = None):
"""
Initialize Notion reporter.
Args:
token: Notion API token
"""
self.token = token or config.notion_token
if not self.token:
raise ValueError(
"Notion token not configured. "
"Set NOTION_TOKEN or NOTION_API_KEY environment variable."
)
self.client = Client(auth=self.token)
def create_findings_database(
self,
parent_page_id: str,
title: str = "SEO Audit Findings",
) -> str:
"""
Create a new SEO findings database.
Args:
parent_page_id: Parent page ID for the database
title: Database title
Returns:
Database ID
"""
# Build database schema
properties = {
"Issue": {"title": {}},
"Category": {
"select": {
"options": [
{"name": cat, "color": self.CATEGORY_COLORS.get(cat, "default")}
for cat in self.CATEGORIES
]
}
},
"Priority": {
"select": {
"options": [
{"name": pri, "color": self.PRIORITY_COLORS.get(pri, "default")}
for pri in self.PRIORITIES
]
}
},
"Status": {
"status": {
"options": [
{"name": "To Fix", "color": "red"},
{"name": "In Progress", "color": "yellow"},
{"name": "Fixed", "color": "green"},
{"name": "Monitoring", "color": "blue"},
],
"groups": [
{"name": "To-do", "option_ids": [], "color": "gray"},
{"name": "In progress", "option_ids": [], "color": "blue"},
{"name": "Complete", "option_ids": [], "color": "green"},
],
}
},
"URL": {"url": {}},
"Description": {"rich_text": {}},
"Impact": {"rich_text": {}},
"Recommendation": {"rich_text": {}},
"Found Date": {"date": {}},
}
try:
response = self.client.databases.create(
parent={"page_id": parent_page_id},
title=[{"type": "text", "text": {"content": title}}],
properties=properties,
)
database_id = response["id"]
logger.info(f"Created database: {database_id}")
return database_id
except Exception as e:
logger.error(f"Failed to create database: {e}")
raise
def add_finding(
self,
finding: SEOFinding,
database_id: str | None = None,
) -> str:
"""
Add a finding to the database with page content.
Args:
finding: SEOFinding object
database_id: Target database ID (defaults to OurDigital SEO Audit Log)
Returns:
Page ID of created entry
"""
db_id = database_id or DEFAULT_DATABASE_ID
# Database properties (metadata)
properties = {
"Issue": {"title": [{"text": {"content": finding.issue}}]},
"Category": {"select": {"name": finding.category}},
"Priority": {"select": {"name": finding.priority}},
"Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
}
if finding.url:
properties["URL"] = {"url": finding.url}
if finding.site:
properties["Site"] = {"url": finding.site}
if finding.audit_id:
properties["Audit ID"] = {
"rich_text": [{"text": {"content": finding.audit_id}}]
}
# Page content blocks (Description, Impact, Recommendation)
children = []
if finding.description:
children.extend([
{
"object": "block",
"type": "heading_2",
"heading_2": {
"rich_text": [{"type": "text", "text": {"content": "Description"}}]
}
},
{
"object": "block",
"type": "paragraph",
"paragraph": {
"rich_text": [{"type": "text", "text": {"content": finding.description}}]
}
}
])
if finding.impact:
children.extend([
{
"object": "block",
"type": "heading_2",
"heading_2": {
"rich_text": [{"type": "text", "text": {"content": "Impact"}}]
}
},
{
"object": "block",
"type": "callout",
"callout": {
"rich_text": [{"type": "text", "text": {"content": finding.impact}}],
"icon": {"type": "emoji", "emoji": "⚠️"}
}
}
])
if finding.recommendation:
children.extend([
{
"object": "block",
"type": "heading_2",
"heading_2": {
"rich_text": [{"type": "text", "text": {"content": "Recommendation"}}]
}
},
{
"object": "block",
"type": "callout",
"callout": {
"rich_text": [{"type": "text", "text": {"content": finding.recommendation}}],
"icon": {"type": "emoji", "emoji": "💡"}
}
}
])
try:
response = self.client.pages.create(
parent={"database_id": db_id},
properties=properties,
children=children if children else None,
)
return response["id"]
except Exception as e:
logger.error(f"Failed to add finding: {e}")
raise
def add_findings_batch(
self,
findings: list[SEOFinding],
database_id: str | None = None,
) -> list[str]:
"""
Add multiple findings to the database.
Args:
findings: List of SEOFinding objects
database_id: Target database ID (defaults to OurDigital SEO Audit Log)
Returns:
List of created page IDs
"""
page_ids = []
for finding in findings:
try:
page_id = self.add_finding(finding, database_id)
page_ids.append(page_id)
except Exception as e:
logger.error(f"Failed to add finding '{finding.issue}': {e}")
return page_ids
def create_audit_summary_page(
self,
parent_page_id: str,
url: str,
summary: dict,
) -> str:
"""
Create a summary page for the audit.
Args:
parent_page_id: Parent page ID
url: Audited URL
summary: Audit summary data
Returns:
Page ID
"""
# Build page content
children = [
{
"object": "block",
"type": "heading_1",
"heading_1": {
"rich_text": [{"type": "text", "text": {"content": f"SEO Audit: {url}"}}]
},
},
{
"object": "block",
"type": "paragraph",
"paragraph": {
"rich_text": [
{
"type": "text",
"text": {"content": f"Audit Date: {datetime.now().strftime('%Y-%m-%d %H:%M')}"},
}
]
},
},
{
"object": "block",
"type": "divider",
"divider": {},
},
{
"object": "block",
"type": "heading_2",
"heading_2": {
"rich_text": [{"type": "text", "text": {"content": "Summary"}}]
},
},
]
# Add summary statistics
if "stats" in summary:
stats = summary["stats"]
stats_text = "\n".join([f"{k}: {v}" for k, v in stats.items()])
children.append({
"object": "block",
"type": "paragraph",
"paragraph": {
"rich_text": [{"type": "text", "text": {"content": stats_text}}]
},
})
# Add findings by priority
if "findings_by_priority" in summary:
children.append({
"object": "block",
"type": "heading_2",
"heading_2": {
"rich_text": [{"type": "text", "text": {"content": "Findings by Priority"}}]
},
})
for priority, count in summary["findings_by_priority"].items():
children.append({
"object": "block",
"type": "bulleted_list_item",
"bulleted_list_item": {
"rich_text": [{"type": "text", "text": {"content": f"{priority}: {count}"}}]
},
})
try:
response = self.client.pages.create(
parent={"page_id": parent_page_id},
properties={
"title": {"title": [{"text": {"content": f"SEO Audit - {url}"}}]}
},
children=children,
)
return response["id"]
except Exception as e:
logger.error(f"Failed to create summary page: {e}")
raise
def query_findings(
self,
database_id: str,
category: str | None = None,
priority: str | None = None,
status: str | None = None,
) -> list[dict]:
"""
Query findings from database.
Args:
database_id: Database ID
category: Filter by category
priority: Filter by priority
status: Filter by status
Returns:
List of finding records
"""
filters = []
if category:
filters.append({
"property": "Category",
"select": {"equals": category},
})
if priority:
filters.append({
"property": "Priority",
"select": {"equals": priority},
})
if status:
filters.append({
"property": "Status",
"status": {"equals": status},
})
query_params = {"database_id": database_id}
if filters:
if len(filters) == 1:
query_params["filter"] = filters[0]
else:
query_params["filter"] = {"and": filters}
try:
response = self.client.databases.query(**query_params)
return response.get("results", [])
except Exception as e:
logger.error(f"Failed to query findings: {e}")
raise
def update_finding_status(
self,
page_id: str,
status: str,
) -> None:
"""Update the status of a finding."""
if status not in self.STATUSES:
raise ValueError(f"Invalid status: {status}")
try:
self.client.pages.update(
page_id=page_id,
properties={"Status": {"status": {"name": status}}},
)
logger.info(f"Updated finding {page_id} to {status}")
except Exception as e:
logger.error(f"Failed to update status: {e}")
raise
def create_audit_report(
self,
report: "AuditReport",
database_id: str | None = None,
) -> dict:
"""
Create a comprehensive audit report page with checklist table.
This creates:
1. Individual finding pages in the database
2. A summary page with all findings in table format for checklist tracking
Args:
report: AuditReport object with all findings
database_id: Target database ID (defaults to OurDigital SEO Audit Log)
Returns:
Dict with summary_page_id and finding_page_ids
"""
db_id = database_id or DEFAULT_DATABASE_ID
# Generate full audit ID
site_domain = report.site.replace('https://', '').replace('http://', '').split('/')[0]
full_audit_id = f"{site_domain}-{report.audit_id}"
result = {
"audit_id": full_audit_id,
"site": report.site,
"summary_page_id": None,
"finding_page_ids": [],
}
# 1. Create individual finding pages in database
logger.info(f"Creating {len(report.findings)} finding pages...")
for finding in report.findings:
finding.audit_id = full_audit_id
finding.site = report.site
try:
page_id = self.add_finding(finding, db_id)
result["finding_page_ids"].append(page_id)
except Exception as e:
logger.error(f"Failed to add finding '{finding.issue}': {e}")
# 2. Create summary page with checklist table
logger.info("Creating audit summary page with checklist...")
summary_page_id = self._create_audit_summary_with_table(report, full_audit_id, db_id)
result["summary_page_id"] = summary_page_id
logger.info(f"Audit report created: {full_audit_id}")
return result
    def _create_audit_summary_with_table(
        self,
        report: "AuditReport",
        audit_id: str,
        database_id: str,
    ) -> str:
        """
        Create audit summary page with checklist table format.

        The page is stored as a regular entry in the findings database and
        contains, in order: an info callout, an audit-status table, a
        findings-overview table, and a per-priority to-do checklist with
        per-finding details.

        Args:
            report: AuditReport object
            audit_id: Full audit ID
            database_id: Parent database ID

        Returns:
            Summary page ID

        Raises:
            Exception: Propagated from the Notion API after logging.
        """
        site_domain = report.site.replace('https://', '').replace('http://', '').split('/')[0]
        # Build page content blocks
        children = []
        # Header with audit info
        children.append({
            "object": "block",
            "type": "callout",
            "callout": {
                "rich_text": [
                    {"type": "text", "text": {"content": f"Audit ID: {audit_id}\n"}},
                    {"type": "text", "text": {"content": f"Date: {report.audit_date.strftime('%Y-%m-%d %H:%M')}\n"}},
                    {"type": "text", "text": {"content": f"Total Issues: {report.total_issues}"}},
                ],
                "icon": {"type": "emoji", "emoji": "📋"},
                "color": "blue_background",
            }
        })
        # Audit Status Summary
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {
                "rich_text": [{"type": "text", "text": {"content": "Audit Status"}}]
            }
        })
        # Status table: one row per top-level audit check.
        status_table = {
            "object": "block",
            "type": "table",
            "table": {
                "table_width": 2,
                "has_column_header": True,
                "has_row_header": False,
                "children": [
                    {
                        "type": "table_row",
                        "table_row": {
                            "cells": [
                                [{"type": "text", "text": {"content": "Check"}}],
                                [{"type": "text", "text": {"content": "Status"}}],
                            ]
                        }
                    },
                    {
                        "type": "table_row",
                        "table_row": {
                            "cells": [
                                [{"type": "text", "text": {"content": "Robots.txt"}}],
                                [{"type": "text", "text": {"content": report.robots_txt_status}}],
                            ]
                        }
                    },
                    {
                        "type": "table_row",
                        "table_row": {
                            "cells": [
                                [{"type": "text", "text": {"content": "Sitemap"}}],
                                [{"type": "text", "text": {"content": report.sitemap_status}}],
                            ]
                        }
                    },
                    {
                        "type": "table_row",
                        "table_row": {
                            "cells": [
                                [{"type": "text", "text": {"content": "Schema Markup"}}],
                                [{"type": "text", "text": {"content": report.schema_status}}],
                            ]
                        }
                    },
                    {
                        "type": "table_row",
                        "table_row": {
                            "cells": [
                                [{"type": "text", "text": {"content": "Performance"}}],
                                [{"type": "text", "text": {"content": report.performance_status}}],
                            ]
                        }
                    },
                ]
            }
        }
        children.append(status_table)
        # Divider
        children.append({"object": "block", "type": "divider", "divider": {}})
        # Findings Checklist Header
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {
                "rich_text": [{"type": "text", "text": {"content": "Findings Checklist"}}]
            }
        })
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {
                "rich_text": [{"type": "text", "text": {"content": "Use this checklist to track fixes. Check off items as you complete them."}}]
            }
        })
        # Create findings table with checklist format
        if report.findings:
            # Build table rows - Header row (bold column labels)
            table_rows = [
                {
                    "type": "table_row",
                    "table_row": {
                        "cells": [
                            [{"type": "text", "text": {"content": "#"}, "annotations": {"bold": True}}],
                            [{"type": "text", "text": {"content": "Priority"}, "annotations": {"bold": True}}],
                            [{"type": "text", "text": {"content": "Category"}, "annotations": {"bold": True}}],
                            [{"type": "text", "text": {"content": "Issue"}, "annotations": {"bold": True}}],
                            [{"type": "text", "text": {"content": "URL"}, "annotations": {"bold": True}}],
                        ]
                    }
                }
            ]
            # Add finding rows
            for idx, finding in enumerate(report.findings, 1):
                # Truncate long text for table cells
                issue_text = finding.issue[:50] + "..." if len(finding.issue) > 50 else finding.issue
                url_text = finding.url[:40] + "..." if finding.url and len(finding.url) > 40 else (finding.url or "-")
                table_rows.append({
                    "type": "table_row",
                    "table_row": {
                        "cells": [
                            [{"type": "text", "text": {"content": str(idx)}}],
                            [{"type": "text", "text": {"content": finding.priority}}],
                            [{"type": "text", "text": {"content": finding.category}}],
                            [{"type": "text", "text": {"content": issue_text}}],
                            [{"type": "text", "text": {"content": url_text}}],
                        ]
                    }
                })
            findings_table = {
                "object": "block",
                "type": "table",
                "table": {
                    "table_width": 5,
                    "has_column_header": True,
                    "has_row_header": False,
                    "children": table_rows
                }
            }
            children.append(findings_table)
        # Divider
        children.append({"object": "block", "type": "divider", "divider": {}})
        # Detailed Findings with To-Do checkboxes
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {
                "rich_text": [{"type": "text", "text": {"content": "Detailed Findings & Actions"}}]
            }
        })
        # Group findings by priority and add as to-do items
        for priority in ["Critical", "High", "Medium", "Low"]:
            priority_findings = [f for f in report.findings if f.priority == priority]
            if not priority_findings:
                continue
            # Priority header with emoji
            # NOTE(review): "Low" maps to an empty string — possibly a lost
            # emoji glyph; confirm intended.
            priority_emoji = {"Critical": "🔴", "High": "🟠", "Medium": "🟡", "Low": ""}
            children.append({
                "object": "block",
                "type": "heading_3",
                "heading_3": {
                    "rich_text": [{"type": "text", "text": {"content": f"{priority_emoji.get(priority, '')} {priority} Priority ({len(priority_findings)})"}}]
                }
            })
            # Add each finding as a to-do item with details
            for finding in priority_findings:
                # Main to-do item
                children.append({
                    "object": "block",
                    "type": "to_do",
                    "to_do": {
                        "rich_text": [
                            {"type": "text", "text": {"content": f"[{finding.category}] "}, "annotations": {"bold": True}},
                            {"type": "text", "text": {"content": finding.issue}},
                        ],
                        "checked": False,
                    }
                })
                # URL if available
                if finding.url:
                    children.append({
                        "object": "block",
                        "type": "bulleted_list_item",
                        "bulleted_list_item": {
                            "rich_text": [
                                {"type": "text", "text": {"content": "URL: "}},
                                {"type": "text", "text": {"content": finding.url, "link": {"url": finding.url}}},
                            ]
                        }
                    })
                # Affected URLs list if available (collapsed into a toggle)
                if finding.affected_urls:
                    children.append({
                        "object": "block",
                        "type": "toggle",
                        "toggle": {
                            "rich_text": [{"type": "text", "text": {"content": f"Affected URLs ({len(finding.affected_urls)})"}}],
                            "children": [
                                {
                                    "object": "block",
                                    "type": "bulleted_list_item",
                                    "bulleted_list_item": {
                                        "rich_text": [{"type": "text", "text": {"content": url, "link": {"url": url} if url.startswith("http") else None}}]
                                    }
                                }
                                for url in finding.affected_urls[:20]  # Limit to 20 URLs
                            ] + ([{
                                "object": "block",
                                "type": "paragraph",
                                "paragraph": {
                                    "rich_text": [{"type": "text", "text": {"content": f"... and {len(finding.affected_urls) - 20} more URLs"}}]
                                }
                            }] if len(finding.affected_urls) > 20 else [])
                        }
                    })
                # Recommendation as sub-item
                if finding.recommendation:
                    children.append({
                        "object": "block",
                        "type": "bulleted_list_item",
                        "bulleted_list_item": {
                            "rich_text": [
                                {"type": "text", "text": {"content": "💡 "}, "annotations": {"bold": True}},
                                {"type": "text", "text": {"content": finding.recommendation}},
                            ]
                        }
                    })
        # Create the summary page
        try:
            response = self.client.pages.create(
                parent={"database_id": database_id},
                properties={
                    "Issue": {"title": [{"text": {"content": f"📊 Audit Report: {site_domain}"}}]},
                    "Category": {"select": {"name": "Technical SEO"}},
                    "Priority": {"select": {"name": "High"}},
                    "Site": {"url": report.site},
                    "Audit ID": {"rich_text": [{"text": {"content": audit_id}}]},
                    "Found Date": {"date": {"start": report.audit_date.strftime("%Y-%m-%d")}},
                },
                children=children,
            )
            logger.info(f"Created audit summary page: {response['id']}")
            return response["id"]
        except Exception as e:
            logger.error(f"Failed to create audit summary page: {e}")
            raise
def create_quick_audit_report(
self,
site: str,
findings: list[SEOFinding],
robots_status: str = "Not checked",
sitemap_status: str = "Not checked",
schema_status: str = "Not checked",
performance_status: str = "Not checked",
database_id: str | None = None,
) -> dict:
"""
Quick method to create audit report from a list of findings.
Args:
site: Site URL
findings: List of SEOFinding objects
robots_status: Robots.txt check result
sitemap_status: Sitemap check result
schema_status: Schema check result
performance_status: Performance check result
database_id: Target database ID
Returns:
Dict with audit results
"""
report = AuditReport(site=site)
report.robots_txt_status = robots_status
report.sitemap_status = sitemap_status
report.schema_status = schema_status
report.performance_status = performance_status
for finding in findings:
report.add_finding(finding)
return self.create_audit_report(report, database_id)
def main():
    """CLI entry point for testing.

    Actions:
        create-db: create a findings database under --parent-id
        add-finding: add a sample finding to --database-id
        query: list the first findings in --database-id
    """
    import argparse

    parser = argparse.ArgumentParser(description="Notion SEO Reporter")
    parser.add_argument("--action", "-a", required=True,
                        choices=["create-db", "add-finding", "query"],
                        help="Action to perform")
    parser.add_argument("--parent-id", "-p", help="Parent page ID")
    parser.add_argument("--database-id", "-d", help="Database ID")
    parser.add_argument("--title", "-t", default="SEO Audit Findings",
                        help="Database title")
    args = parser.parse_args()

    reporter = NotionReporter()
    if args.action == "create-db":
        if not args.parent_id:
            parser.error("--parent-id required for create-db")
        db_id = reporter.create_findings_database(args.parent_id, args.title)
        print(f"Created database: {db_id}")
    elif args.action == "add-finding":
        if not args.database_id:
            parser.error("--database-id required for add-finding")
        # Example finding
        finding = SEOFinding(
            issue="Missing meta description",
            category="On-page SEO",
            priority="Medium",
            url="https://example.com/page",
            description="Page is missing meta description tag",
            impact="May affect CTR in search results",
            recommendation="Add unique meta description under 160 characters",
        )
        # Fixed: arguments were swapped — the signature is
        # add_finding(finding, database_id=None).
        page_id = reporter.add_finding(finding, args.database_id)
        print(f"Created finding: {page_id}")
    elif args.action == "query":
        if not args.database_id:
            parser.error("--database-id required for query")
        findings = reporter.query_findings(args.database_id)
        print(f"Found {len(findings)} findings")
        for f in findings[:5]:
            title = f["properties"]["Issue"]["title"]
            if title:
                print(f" - {title[0]['plain_text']}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,569 @@
"""
Page Analyzer - Extract SEO metadata from web pages
===================================================
Purpose: Comprehensive page-level SEO data extraction
Python: 3.10+
Usage:
from page_analyzer import PageAnalyzer, PageMetadata
analyzer = PageAnalyzer()
metadata = analyzer.analyze_url("https://example.com/page")
"""
import json
import logging
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
# Module-wide logging: timestamped INFO-level messages.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
@dataclass
class LinkData:
    """Represents a link found on a page."""

    url: str          # Absolute URL (relative hrefs resolved by the extractor)
    anchor_text: str  # Visible anchor text (truncated by the extractor)
    is_internal: bool  # True when the link stays on the audited domain
    is_nofollow: bool = False  # True when rel contains "nofollow"
    link_type: str = "body"  # body, nav, footer, etc.
@dataclass
class HeadingData:
    """Represents a heading found on a page."""

    level: int  # 1-6
    text: str   # Heading text, whitespace-stripped
@dataclass
class SchemaData:
    """Represents schema.org structured data."""

    schema_type: str  # e.g. "Article", "Organization"
    properties: dict  # Raw node properties (empty for microdata detection)
    format: str = "json-ld"  # json-ld, microdata, rdfa
@dataclass
class OpenGraphData:
    """Represents Open Graph metadata (plus Twitter Card tags)."""

    # og:* tags (from <meta property="og:...">)
    og_title: str | None = None
    og_description: str | None = None
    og_image: str | None = None
    og_url: str | None = None
    og_type: str | None = None
    og_site_name: str | None = None
    og_locale: str | None = None
    # twitter:* tags (from <meta name="twitter:...">)
    twitter_card: str | None = None
    twitter_title: str | None = None
    twitter_description: str | None = None
    twitter_image: str | None = None
@dataclass
class PageMetadata:
    """Complete SEO metadata for a page, as filled in by PageAnalyzer."""

    # Basic info
    url: str
    status_code: int = 0
    content_type: str = ""
    response_time_ms: float = 0
    analyzed_at: datetime = field(default_factory=datetime.now)
    # Meta tags
    title: str | None = None
    title_length: int = 0
    meta_description: str | None = None
    meta_description_length: int = 0
    canonical_url: str | None = None
    robots_meta: str | None = None
    # Language
    html_lang: str | None = None
    hreflang_tags: list[dict] = field(default_factory=list)  # [{"lang": "en", "url": "..."}]
    # Headings
    headings: list[HeadingData] = field(default_factory=list)
    h1_count: int = 0
    h1_text: str | None = None  # Text of the first H1 on the page
    # Open Graph & Social
    open_graph: OpenGraphData = field(default_factory=OpenGraphData)
    # Schema/Structured Data
    schema_data: list[SchemaData] = field(default_factory=list)
    schema_types_found: list[str] = field(default_factory=list)
    # Links
    internal_links: list[LinkData] = field(default_factory=list)
    external_links: list[LinkData] = field(default_factory=list)
    internal_link_count: int = 0
    external_link_count: int = 0
    # Images
    images_total: int = 0
    images_without_alt: int = 0
    images_with_alt: int = 0
    # Content metrics
    word_count: int = 0
    # Issues found
    issues: list[str] = field(default_factory=list)    # problems / errors
    warnings: list[str] = field(default_factory=list)  # non-blocking advisories

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization.

        Returns a flat summary; nested dataclasses are reduced to counts
        except for the main Open Graph fields.
        """
        return {
            "url": self.url,
            "status_code": self.status_code,
            "content_type": self.content_type,
            "response_time_ms": self.response_time_ms,
            "analyzed_at": self.analyzed_at.isoformat(),
            "title": self.title,
            "title_length": self.title_length,
            "meta_description": self.meta_description,
            "meta_description_length": self.meta_description_length,
            "canonical_url": self.canonical_url,
            "robots_meta": self.robots_meta,
            "html_lang": self.html_lang,
            "hreflang_tags": self.hreflang_tags,
            "h1_count": self.h1_count,
            "h1_text": self.h1_text,
            "headings_count": len(self.headings),
            "schema_types_found": self.schema_types_found,
            "internal_link_count": self.internal_link_count,
            "external_link_count": self.external_link_count,
            "images_total": self.images_total,
            "images_without_alt": self.images_without_alt,
            "word_count": self.word_count,
            "issues": self.issues,
            "warnings": self.warnings,
            "open_graph": {
                "og_title": self.open_graph.og_title,
                "og_description": self.open_graph.og_description,
                "og_image": self.open_graph.og_image,
                "og_url": self.open_graph.og_url,
                "og_type": self.open_graph.og_type,
            },
        }

    def get_summary(self) -> str:
        """Get a brief summary of the page analysis (one item per line)."""
        lines = [
            f"URL: {self.url}",
            f"Status: {self.status_code}",
            f"Title: {self.title[:50] + '...' if self.title and len(self.title) > 50 else self.title}",
            # NOTE(review): the empty string below looks like a lost check
            # mark glyph (cf. '✗ Missing') — confirm intended.
            f"Description: {'' if self.meta_description else '✗ Missing'}",
            f"Canonical: {'' if self.canonical_url else '✗ Missing'}",
            f"H1: {self.h1_count} found",
            f"Schema: {', '.join(self.schema_types_found) if self.schema_types_found else 'None'}",
            f"Links: {self.internal_link_count} internal, {self.external_link_count} external",
            f"Images: {self.images_total} total, {self.images_without_alt} without alt",
        ]
        if self.issues:
            lines.append(f"Issues: {len(self.issues)}")
        return "\n".join(lines)
class PageAnalyzer:
    """Analyze web pages for SEO metadata."""

    # Identifies the crawler to servers; override via the constructor.
    DEFAULT_USER_AGENT = "Mozilla/5.0 (compatible; OurDigitalSEOBot/1.0; +https://ourdigital.org)"
def __init__(
self,
user_agent: str | None = None,
timeout: int = 30,
):
"""
Initialize page analyzer.
Args:
user_agent: Custom user agent string
timeout: Request timeout in seconds
"""
self.user_agent = user_agent or self.DEFAULT_USER_AGENT
self.timeout = timeout
self.session = requests.Session()
self.session.headers.update({
"User-Agent": self.user_agent,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9,ko;q=0.8",
})
    def analyze_url(self, url: str) -> PageMetadata:
        """
        Analyze a URL and extract SEO metadata.

        Never raises: network or parse failures are recorded in
        ``metadata.issues`` and a partially populated object is returned.

        Args:
            url: URL to analyze

        Returns:
            PageMetadata object with all extracted data
        """
        metadata = PageMetadata(url=url)
        try:
            # Fetch page
            start_time = datetime.now()
            response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
            metadata.response_time_ms = (datetime.now() - start_time).total_seconds() * 1000
            metadata.status_code = response.status_code
            metadata.content_type = response.headers.get("Content-Type", "")
            if response.status_code != 200:
                metadata.issues.append(f"HTTP {response.status_code} status")
                # Client/server errors have no useful body to analyze.
                if response.status_code >= 400:
                    return metadata
            # Parse HTML
            soup = BeautifulSoup(response.text, "html.parser")
            # NOTE(review): links are resolved against the requested URL, not
            # response.url — redirected pages may mis-resolve relative links.
            base_url = url
            # Extract all metadata
            self._extract_basic_meta(soup, metadata)
            self._extract_canonical(soup, metadata, base_url)
            self._extract_robots_meta(soup, metadata)
            self._extract_hreflang(soup, metadata)
            self._extract_headings(soup, metadata)
            self._extract_open_graph(soup, metadata)
            self._extract_schema(soup, metadata)
            self._extract_links(soup, metadata, base_url)
            self._extract_images(soup, metadata)
            # Runs last: it mutates the soup (removes script/style nodes).
            self._extract_content_metrics(soup, metadata)
            # Run SEO checks
            self._run_seo_checks(metadata)
        except requests.RequestException as e:
            metadata.issues.append(f"Request failed: {str(e)}")
            logger.error(f"Failed to analyze {url}: {e}")
        except Exception as e:
            metadata.issues.append(f"Analysis error: {str(e)}")
            logger.error(f"Error analyzing {url}: {e}")
        return metadata
def _extract_basic_meta(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
"""Extract title and meta description."""
# Title
title_tag = soup.find("title")
if title_tag and title_tag.string:
metadata.title = title_tag.string.strip()
metadata.title_length = len(metadata.title)
# Meta description
desc_tag = soup.find("meta", attrs={"name": re.compile(r"^description$", re.I)})
if desc_tag and desc_tag.get("content"):
metadata.meta_description = desc_tag["content"].strip()
metadata.meta_description_length = len(metadata.meta_description)
# HTML lang
html_tag = soup.find("html")
if html_tag and html_tag.get("lang"):
metadata.html_lang = html_tag["lang"]
def _extract_canonical(self, soup: BeautifulSoup, metadata: PageMetadata, base_url: str) -> None:
"""Extract canonical URL."""
canonical = soup.find("link", rel="canonical")
if canonical and canonical.get("href"):
metadata.canonical_url = urljoin(base_url, canonical["href"])
def _extract_robots_meta(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
"""Extract robots meta tag."""
robots = soup.find("meta", attrs={"name": re.compile(r"^robots$", re.I)})
if robots and robots.get("content"):
metadata.robots_meta = robots["content"]
# Also check for googlebot-specific
googlebot = soup.find("meta", attrs={"name": re.compile(r"^googlebot$", re.I)})
if googlebot and googlebot.get("content"):
if metadata.robots_meta:
metadata.robots_meta += f" | googlebot: {googlebot['content']}"
else:
metadata.robots_meta = f"googlebot: {googlebot['content']}"
def _extract_hreflang(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
"""Extract hreflang tags."""
hreflang_tags = soup.find_all("link", rel="alternate", hreflang=True)
for tag in hreflang_tags:
if tag.get("href") and tag.get("hreflang"):
metadata.hreflang_tags.append({
"lang": tag["hreflang"],
"url": tag["href"]
})
def _extract_headings(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
"""Extract all headings."""
for level in range(1, 7):
for heading in soup.find_all(f"h{level}"):
text = heading.get_text(strip=True)
if text:
metadata.headings.append(HeadingData(level=level, text=text))
# Count H1s specifically
h1_tags = soup.find_all("h1")
metadata.h1_count = len(h1_tags)
if h1_tags:
metadata.h1_text = h1_tags[0].get_text(strip=True)
def _extract_open_graph(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
"""Extract Open Graph and Twitter Card data."""
og = metadata.open_graph
# Open Graph tags
og_mappings = {
"og:title": "og_title",
"og:description": "og_description",
"og:image": "og_image",
"og:url": "og_url",
"og:type": "og_type",
"og:site_name": "og_site_name",
"og:locale": "og_locale",
}
for og_prop, attr_name in og_mappings.items():
tag = soup.find("meta", property=og_prop)
if tag and tag.get("content"):
setattr(og, attr_name, tag["content"])
# Twitter Card tags
twitter_mappings = {
"twitter:card": "twitter_card",
"twitter:title": "twitter_title",
"twitter:description": "twitter_description",
"twitter:image": "twitter_image",
}
for tw_name, attr_name in twitter_mappings.items():
tag = soup.find("meta", attrs={"name": tw_name})
if tag and tag.get("content"):
setattr(og, attr_name, tag["content"])
def _extract_schema(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
"""Extract schema.org structured data."""
# JSON-LD
for script in soup.find_all("script", type="application/ld+json"):
try:
data = json.loads(script.string)
if isinstance(data, list):
for item in data:
self._process_schema_item(item, metadata, "json-ld")
else:
self._process_schema_item(data, metadata, "json-ld")
except (json.JSONDecodeError, TypeError):
continue
# Microdata (basic detection)
for item in soup.find_all(itemscope=True):
itemtype = item.get("itemtype", "")
if itemtype:
schema_type = itemtype.split("/")[-1]
if schema_type not in metadata.schema_types_found:
metadata.schema_types_found.append(schema_type)
metadata.schema_data.append(SchemaData(
schema_type=schema_type,
properties={},
format="microdata"
))
def _process_schema_item(self, data: dict, metadata: PageMetadata, format_type: str) -> None:
"""Process a single schema.org item."""
if not isinstance(data, dict):
return
schema_type = data.get("@type", "Unknown")
if isinstance(schema_type, list):
schema_type = schema_type[0] if schema_type else "Unknown"
if schema_type not in metadata.schema_types_found:
metadata.schema_types_found.append(schema_type)
metadata.schema_data.append(SchemaData(
schema_type=schema_type,
properties=data,
format=format_type
))
# Process nested @graph items
if "@graph" in data:
for item in data["@graph"]:
self._process_schema_item(item, metadata, format_type)
def _extract_links(self, soup: BeautifulSoup, metadata: PageMetadata, base_url: str) -> None:
"""Extract internal and external links."""
parsed_base = urlparse(base_url)
base_domain = parsed_base.netloc.lower()
for a_tag in soup.find_all("a", href=True):
href = a_tag["href"]
# Skip non-http links
if href.startswith(("#", "javascript:", "mailto:", "tel:")):
continue
# Resolve relative URLs
full_url = urljoin(base_url, href)
parsed_url = urlparse(full_url)
# Get anchor text
anchor_text = a_tag.get_text(strip=True)[:100] # Limit length
# Check if nofollow
rel = a_tag.get("rel", [])
if isinstance(rel, str):
rel = rel.split()
is_nofollow = "nofollow" in rel
# Determine if internal or external
link_domain = parsed_url.netloc.lower()
is_internal = (
link_domain == base_domain or
link_domain.endswith(f".{base_domain}") or
base_domain.endswith(f".{link_domain}")
)
link_data = LinkData(
url=full_url,
anchor_text=anchor_text,
is_internal=is_internal,
is_nofollow=is_nofollow,
)
if is_internal:
metadata.internal_links.append(link_data)
else:
metadata.external_links.append(link_data)
metadata.internal_link_count = len(metadata.internal_links)
metadata.external_link_count = len(metadata.external_links)
def _extract_images(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
"""Extract image information."""
images = soup.find_all("img")
metadata.images_total = len(images)
for img in images:
alt = img.get("alt", "").strip()
if alt:
metadata.images_with_alt += 1
else:
metadata.images_without_alt += 1
def _extract_content_metrics(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
"""Extract content metrics like word count."""
# Remove script and style elements
for element in soup(["script", "style", "noscript"]):
element.decompose()
# Get text content
text = soup.get_text(separator=" ", strip=True)
words = text.split()
metadata.word_count = len(words)
def _run_seo_checks(self, metadata: PageMetadata) -> None:
"""Run SEO checks and add issues/warnings."""
# Title checks
if not metadata.title:
metadata.issues.append("Missing title tag")
elif metadata.title_length < 30:
metadata.warnings.append(f"Title too short ({metadata.title_length} chars, recommend 50-60)")
elif metadata.title_length > 60:
metadata.warnings.append(f"Title too long ({metadata.title_length} chars, recommend 50-60)")
# Meta description checks
if not metadata.meta_description:
metadata.issues.append("Missing meta description")
elif metadata.meta_description_length < 120:
metadata.warnings.append(f"Meta description too short ({metadata.meta_description_length} chars)")
elif metadata.meta_description_length > 160:
metadata.warnings.append(f"Meta description too long ({metadata.meta_description_length} chars)")
# Canonical check
if not metadata.canonical_url:
metadata.warnings.append("Missing canonical tag")
elif metadata.canonical_url != metadata.url:
metadata.warnings.append(f"Canonical points to different URL: {metadata.canonical_url}")
# H1 checks
if metadata.h1_count == 0:
metadata.issues.append("Missing H1 tag")
elif metadata.h1_count > 1:
metadata.warnings.append(f"Multiple H1 tags ({metadata.h1_count})")
# Image alt check
if metadata.images_without_alt > 0:
metadata.warnings.append(f"{metadata.images_without_alt} images missing alt text")
# Schema check
if not metadata.schema_types_found:
metadata.warnings.append("No structured data found")
# Open Graph check
if not metadata.open_graph.og_title:
metadata.warnings.append("Missing Open Graph tags")
# Robots meta check
if metadata.robots_meta:
robots_lower = metadata.robots_meta.lower()
if "noindex" in robots_lower:
metadata.issues.append("Page is set to noindex")
if "nofollow" in robots_lower:
metadata.warnings.append("Page is set to nofollow")
def main():
    """CLI entry point for testing."""
    import argparse
    arg_parser = argparse.ArgumentParser(description="Page SEO Analyzer")
    arg_parser.add_argument("url", help="URL to analyze")
    arg_parser.add_argument("--json", "-j", action="store_true", help="Output as JSON")
    options = arg_parser.parse_args()
    page_metadata = PageAnalyzer().analyze_url(options.url)
    if options.json:
        print(json.dumps(page_metadata.to_dict(), indent=2, ensure_ascii=False))
        return
    # Human-readable report.
    divider = "=" * 60
    print(divider)
    print("PAGE ANALYSIS REPORT")
    print(divider)
    print(page_metadata.get_summary())
    print()
    if page_metadata.issues:
        print("ISSUES:")
        for found_issue in page_metadata.issues:
            print(f"  {found_issue}")
    if page_metadata.warnings:
        print("\nWARNINGS:")
        for found_warning in page_metadata.warnings:
            print(f"  {found_warning}")
    if page_metadata.hreflang_tags:
        print(f"\nHREFLANG TAGS ({len(page_metadata.hreflang_tags)}):")
        for hreflang in page_metadata.hreflang_tags[:5]:
            print(f"  {hreflang['lang']}: {hreflang['url']}")
    if page_metadata.schema_types_found:
        print(f"\nSCHEMA TYPES:")
        for found_type in page_metadata.schema_types_found:
            print(f"  - {found_type}")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,452 @@
"""
PageSpeed Insights Client
=========================
Purpose: Get Core Web Vitals and performance data from PageSpeed Insights API
Python: 3.10+
Usage:
from pagespeed_client import PageSpeedClient
client = PageSpeedClient()
result = client.analyze("https://example.com")
"""
import argparse
import json
import logging
from dataclasses import dataclass, field
from typing import Any
import requests
from base_client import config
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
@dataclass
class CoreWebVitals:
"""Core Web Vitals metrics."""
lcp: float | None = None # Largest Contentful Paint (ms)
fid: float | None = None # First Input Delay (ms)
cls: float | None = None # Cumulative Layout Shift
inp: float | None = None # Interaction to Next Paint (ms)
ttfb: float | None = None # Time to First Byte (ms)
fcp: float | None = None # First Contentful Paint (ms)
# Assessment (GOOD, NEEDS_IMPROVEMENT, POOR)
lcp_rating: str | None = None
fid_rating: str | None = None
cls_rating: str | None = None
inp_rating: str | None = None
def to_dict(self) -> dict:
return {
"lcp": {"value": self.lcp, "rating": self.lcp_rating},
"fid": {"value": self.fid, "rating": self.fid_rating},
"cls": {"value": self.cls, "rating": self.cls_rating},
"inp": {"value": self.inp, "rating": self.inp_rating},
"ttfb": {"value": self.ttfb},
"fcp": {"value": self.fcp},
}
@dataclass
class PageSpeedResult:
    """PageSpeed analysis result for a single URL/strategy run.

    Populated by PageSpeedClient.analyze(); category scores are on a
    0-100 scale (None when the category was not requested or returned).
    """
    url: str
    strategy: str  # mobile or desktop
    performance_score: float | None = None
    seo_score: float | None = None
    accessibility_score: float | None = None
    best_practices_score: float | None = None
    core_web_vitals: CoreWebVitals = field(default_factory=CoreWebVitals)
    opportunities: list[dict] = field(default_factory=list)  # sorted by savings_ms, descending
    diagnostics: list[dict] = field(default_factory=list)  # failed non-opportunity audits
    passed_audits: list[str] = field(default_factory=list)  # titles of audits scoring 1
    raw_data: dict = field(default_factory=dict)  # full API response, kept for debugging
    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict; opportunities are capped at 10."""
        return {
            "url": self.url,
            "strategy": self.strategy,
            "scores": {
                "performance": self.performance_score,
                "seo": self.seo_score,
                "accessibility": self.accessibility_score,
                "best_practices": self.best_practices_score,
            },
            "core_web_vitals": self.core_web_vitals.to_dict(),
            "opportunities_count": len(self.opportunities),
            "opportunities": self.opportunities[:10],
            "diagnostics_count": len(self.diagnostics),
            "passed_audits_count": len(self.passed_audits),
        }
class PageSpeedClient:
    """Client for the PageSpeed Insights v5 API.

    Wraps the runPagespeed endpoint and normalizes Lighthouse lab data plus
    CrUX field data (when present) into PageSpeedResult / CoreWebVitals.
    """
    BASE_URL = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"
    # Core Web Vitals thresholds: value <= "good" rates GOOD,
    # value <= "poor" rates NEEDS_IMPROVEMENT, anything above rates POOR.
    # Units are milliseconds except the unitless CLS score.
    THRESHOLDS = {
        "lcp": {"good": 2500, "poor": 4000},
        "fid": {"good": 100, "poor": 300},
        "cls": {"good": 0.1, "poor": 0.25},
        "inp": {"good": 200, "poor": 500},
        "ttfb": {"good": 800, "poor": 1800},
        "fcp": {"good": 1800, "poor": 3000},
    }
    def __init__(self, api_key: str | None = None):
        """
        Initialize PageSpeed client.
        Args:
            api_key: PageSpeed API key (optional but recommended for higher quotas)
        """
        self.api_key = api_key or config.pagespeed_api_key
        self.session = requests.Session()
    def _rate_metric(self, metric: str, value: float | None) -> str | None:
        """Rate a metric against THRESHOLDS; None for unknown metric or value."""
        if value is None:
            return None
        thresholds = self.THRESHOLDS.get(metric)
        if not thresholds:
            return None
        if value <= thresholds["good"]:
            return "GOOD"
        elif value <= thresholds["poor"]:
            return "NEEDS_IMPROVEMENT"
        else:
            return "POOR"
    @staticmethod
    def _category_score(categories_data: dict, key: str) -> float | None:
        """Return a Lighthouse category score scaled to 0-100, or None.

        Uses an explicit None check: a raw score of 0 is a valid (terrible)
        score; the previous truthiness test silently turned it into None.
        """
        category = categories_data.get(key)
        if category is None:
            return None
        score = category.get("score")
        return score * 100 if score is not None else None
    def analyze(
        self,
        url: str,
        strategy: str = "mobile",
        categories: list[str] | None = None,
    ) -> PageSpeedResult:
        """
        Analyze a URL with PageSpeed Insights.
        Args:
            url: URL to analyze
            strategy: "mobile" or "desktop"
            categories: Categories to analyze (performance, seo, accessibility, best-practices)
        Returns:
            PageSpeedResult with scores and metrics
        Raises:
            requests.RequestException: if the API request fails.
        """
        if categories is None:
            categories = ["performance", "seo", "accessibility", "best-practices"]
        params = {
            "url": url,
            "strategy": strategy,
            "category": categories,  # sent as a repeated query parameter
        }
        if self.api_key:
            params["key"] = self.api_key
        try:
            response = self.session.get(self.BASE_URL, params=params, timeout=60)
            response.raise_for_status()
            data = response.json()
        except requests.RequestException as e:
            logger.error(f"PageSpeed API request failed: {e}")
            raise
        result = PageSpeedResult(url=url, strategy=strategy, raw_data=data)
        # Category scores (0-100).
        lighthouse = data.get("lighthouseResult", {})
        categories_data = lighthouse.get("categories", {})
        result.performance_score = self._category_score(categories_data, "performance")
        result.seo_score = self._category_score(categories_data, "seo")
        result.accessibility_score = self._category_score(categories_data, "accessibility")
        result.best_practices_score = self._category_score(categories_data, "best-practices")
        # Core Web Vitals from lab (Lighthouse) data.
        audits = lighthouse.get("audits", {})
        cwv = result.core_web_vitals
        if "largest-contentful-paint" in audits:
            cwv.lcp = audits["largest-contentful-paint"].get("numericValue")
            cwv.lcp_rating = self._rate_metric("lcp", cwv.lcp)
        if "total-blocking-time" in audits:
            # TBT is proxy for FID in lab data
            cwv.fid = audits["total-blocking-time"].get("numericValue")
            cwv.fid_rating = self._rate_metric("fid", cwv.fid)
        if "cumulative-layout-shift" in audits:
            cwv.cls = audits["cumulative-layout-shift"].get("numericValue")
            cwv.cls_rating = self._rate_metric("cls", cwv.cls)
        if "experimental-interaction-to-next-paint" in audits:
            cwv.inp = audits["experimental-interaction-to-next-paint"].get("numericValue")
            cwv.inp_rating = self._rate_metric("inp", cwv.inp)
        if "server-response-time" in audits:
            cwv.ttfb = audits["server-response-time"].get("numericValue")
        if "first-contentful-paint" in audits:
            cwv.fcp = audits["first-contentful-paint"].get("numericValue")
        # Field data (real user CrUX data) overrides lab values when present.
        loading_exp = data.get("loadingExperience", {})
        metrics = loading_exp.get("metrics", {})
        if "LARGEST_CONTENTFUL_PAINT_MS" in metrics:
            cwv.lcp = metrics["LARGEST_CONTENTFUL_PAINT_MS"].get("percentile")
            cwv.lcp_rating = metrics["LARGEST_CONTENTFUL_PAINT_MS"].get("category")
        if "FIRST_INPUT_DELAY_MS" in metrics:
            cwv.fid = metrics["FIRST_INPUT_DELAY_MS"].get("percentile")
            cwv.fid_rating = metrics["FIRST_INPUT_DELAY_MS"].get("category")
        if "CUMULATIVE_LAYOUT_SHIFT_SCORE" in metrics:
            # CrUX reports CLS scaled by 100. Guard against a missing
            # percentile (the previous code divided None and raised).
            cls_percentile = metrics["CUMULATIVE_LAYOUT_SHIFT_SCORE"].get("percentile")
            if cls_percentile is not None:
                cwv.cls = cls_percentile / 100
            cwv.cls_rating = metrics["CUMULATIVE_LAYOUT_SHIFT_SCORE"].get("category")
        if "INTERACTION_TO_NEXT_PAINT" in metrics:
            cwv.inp = metrics["INTERACTION_TO_NEXT_PAINT"].get("percentile")
            cwv.inp_rating = metrics["INTERACTION_TO_NEXT_PAINT"].get("category")
        # Classify audits in one pass. The three groups are independent
        # checks, so a single audit may land in more than one list.
        for audit_id, audit in audits.items():
            details = audit.get("details", {})
            score = audit.get("score")
            if details.get("type") == "opportunity":
                savings = details.get("overallSavingsMs", 0)
                if savings > 0:
                    result.opportunities.append({
                        "id": audit_id,
                        "title": audit.get("title", ""),
                        "description": audit.get("description", ""),
                        "savings_ms": savings,
                        "score": audit.get("score", 0),
                    })
            if score is not None and score < 1 and details:
                if details.get("type") not in ["opportunity", None]:
                    result.diagnostics.append({
                        "id": audit_id,
                        "title": audit.get("title", ""),
                        "description": audit.get("description", ""),
                        "score": score,
                    })
            if score == 1:
                result.passed_audits.append(audit.get("title", audit_id))
        # Biggest potential savings first.
        result.opportunities.sort(key=lambda x: x["savings_ms"], reverse=True)
        return result
    def analyze_both_strategies(self, url: str) -> dict:
        """Analyze URL for both mobile and desktop and compare the results."""
        mobile = self.analyze(url, strategy="mobile")
        desktop = self.analyze(url, strategy="desktop")
        return {
            "url": url,
            "mobile": mobile.to_dict(),
            "desktop": desktop.to_dict(),
            "comparison": {
                "performance_difference": (
                    (desktop.performance_score or 0) - (mobile.performance_score or 0)
                ),
                "mobile_first_issues": self._identify_mobile_issues(mobile, desktop),
            },
        }
    def _identify_mobile_issues(
        self,
        mobile: PageSpeedResult,
        desktop: PageSpeedResult,
    ) -> list[str]:
        """Identify issues that affect mobile more than desktop."""
        issues = []
        # Explicit None checks throughout: a score/metric of 0 is valid data
        # and must not cause a comparison to be skipped.
        if mobile.performance_score is not None and desktop.performance_score is not None:
            if desktop.performance_score - mobile.performance_score > 20:
                issues.append("Significant performance gap between mobile and desktop")
        m_cwv = mobile.core_web_vitals
        d_cwv = desktop.core_web_vitals
        if m_cwv.lcp is not None and d_cwv.lcp is not None and m_cwv.lcp > d_cwv.lcp * 1.5:
            issues.append("LCP significantly slower on mobile")
        if m_cwv.cls is not None and d_cwv.cls is not None and m_cwv.cls > d_cwv.cls * 2:
            issues.append("Layout shift issues more severe on mobile")
        return issues
    def get_cwv_summary(self, url: str) -> dict:
        """Get a summary focused on Core Web Vitals (mobile strategy)."""
        result = self.analyze(url, strategy="mobile")
        cwv = result.core_web_vitals
        return {
            "url": url,
            "overall_cwv_status": self._overall_cwv_status(cwv),
            "metrics": {
                "lcp": {
                    "value": f"{cwv.lcp / 1000:.2f}s" if cwv.lcp is not None else None,
                    "rating": cwv.lcp_rating,
                    "threshold": "≤ 2.5s good, > 4.0s poor",
                },
                "fid": {
                    "value": f"{cwv.fid:.0f}ms" if cwv.fid is not None else None,
                    "rating": cwv.fid_rating,
                    "threshold": "≤ 100ms good, > 300ms poor",
                },
                "cls": {
                    # is-not-None check: a perfect CLS of 0.0 must display.
                    "value": f"{cwv.cls:.3f}" if cwv.cls is not None else None,
                    "rating": cwv.cls_rating,
                    "threshold": "≤ 0.1 good, > 0.25 poor",
                },
                "inp": {
                    "value": f"{cwv.inp:.0f}ms" if cwv.inp is not None else None,
                    "rating": cwv.inp_rating,
                    "threshold": "≤ 200ms good, > 500ms poor",
                },
            },
            "top_opportunities": result.opportunities[:5],
        }
    def _overall_cwv_status(self, cwv: CoreWebVitals) -> str:
        """Determine overall Core Web Vitals status (worst rating wins)."""
        ratings = [cwv.lcp_rating, cwv.fid_rating, cwv.cls_rating]
        ratings = [r for r in ratings if r]
        if not ratings:
            return "UNKNOWN"
        if any(r == "POOR" for r in ratings):
            return "POOR"
        if any(r == "NEEDS_IMPROVEMENT" for r in ratings):
            return "NEEDS_IMPROVEMENT"
        return "GOOD"
    def generate_report(self, result: PageSpeedResult) -> str:
        """Generate human-readable performance report."""
        def score_line(label: str, score: float | None) -> str:
            # Explicit None check: a score of 0 is real and must not show N/A.
            if score is None:
                return f"  {label}: N/A"
            return f"  {label}: {score:.0f}/100"
        lines = [
            "=" * 60,
            "PageSpeed Insights Report",
            "=" * 60,
            f"URL: {result.url}",
            f"Strategy: {result.strategy}",
            "",
            "Scores:",
            score_line("Performance", result.performance_score),
            score_line("SEO", result.seo_score),
            score_line("Accessibility", result.accessibility_score),
            score_line("Best Practices", result.best_practices_score),
            "",
            "Core Web Vitals:",
        ]
        cwv = result.core_web_vitals
        def format_metric(name: str, value: Any, rating: str | None, unit: str) -> str:
            if value is None:
                return f"  {name}: N/A"
            rating_str = f" ({rating})" if rating else ""
            return f"  {name}: {value}{unit}{rating_str}"
        lines.append(format_metric("LCP", f"{cwv.lcp / 1000:.2f}" if cwv.lcp is not None else None, cwv.lcp_rating, "s"))
        lines.append(format_metric("FID/TBT", f"{cwv.fid:.0f}" if cwv.fid is not None else None, cwv.fid_rating, "ms"))
        lines.append(format_metric("CLS", f"{cwv.cls:.3f}" if cwv.cls is not None else None, cwv.cls_rating, ""))
        lines.append(format_metric("INP", f"{cwv.inp:.0f}" if cwv.inp is not None else None, cwv.inp_rating, "ms"))
        lines.append(format_metric("TTFB", f"{cwv.ttfb:.0f}" if cwv.ttfb is not None else None, None, "ms"))
        lines.append(format_metric("FCP", f"{cwv.fcp / 1000:.2f}" if cwv.fcp is not None else None, None, "s"))
        if result.opportunities:
            lines.extend([
                "",
                f"Top Opportunities ({len(result.opportunities)} total):",
            ])
            for opp in result.opportunities[:5]:
                savings = opp["savings_ms"]
                lines.append(f"  - {opp['title']}: -{savings / 1000:.1f}s potential savings")
        lines.extend(["", "=" * 60])
        return "\n".join(lines)
def main():
    """CLI entry point."""
    arg_parser = argparse.ArgumentParser(description="PageSpeed Insights Client")
    arg_parser.add_argument("--url", "-u", required=True, help="URL to analyze")
    arg_parser.add_argument("--strategy", "-s", default="mobile",
                            choices=["mobile", "desktop", "both"],
                            help="Analysis strategy")
    arg_parser.add_argument("--output", "-o", help="Output file for JSON")
    arg_parser.add_argument("--json", action="store_true", help="Output as JSON")
    arg_parser.add_argument("--cwv-only", action="store_true",
                            help="Show only Core Web Vitals summary")
    opts = arg_parser.parse_args()
    client = PageSpeedClient()
    if opts.cwv_only:
        # Quick Core Web Vitals summary (mobile strategy).
        print(json.dumps(client.get_cwv_summary(opts.url), indent=2))
        return
    if opts.strategy == "both":
        combined = client.analyze_both_strategies(opts.url)
        serialized = json.dumps(combined, indent=2)
        if opts.output:
            with open(opts.output, "w") as f:
                f.write(serialized)
        else:
            print(serialized)
        return
    analysis = client.analyze(opts.url, strategy=opts.strategy)
    if opts.json or opts.output:
        serialized = json.dumps(analysis.to_dict(), indent=2)
        if opts.output:
            with open(opts.output, "w") as f:
                f.write(serialized)
        else:
            print(serialized)
    else:
        print(client.generate_report(analysis))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,40 @@
# OurDigital SEO Audit - Python Dependencies
# Install with: pip install -r requirements.txt
# Google APIs
google-api-python-client>=2.100.0
google-auth>=2.23.0
google-auth-oauthlib>=1.1.0
google-auth-httplib2>=0.1.1
google-analytics-data>=0.18.0
# Notion API
notion-client>=2.0.0
# Web Scraping & Parsing
lxml>=5.1.0
beautifulsoup4>=4.12.0
extruct>=0.16.0
requests>=2.31.0
aiohttp>=3.9.0
# Schema Validation
jsonschema>=4.21.0
rdflib>=7.0.0
# Google Trends
pytrends>=4.9.2
# Data Processing
pandas>=2.1.0
# Async & Retry
tenacity>=8.2.0
tqdm>=4.66.0
# Environment
python-dotenv>=1.0.0
# Logging & CLI
rich>=13.7.0
typer>=0.9.0

View File

@@ -0,0 +1,540 @@
"""
Robots.txt Checker - Analyze robots.txt configuration
=====================================================
Purpose: Parse and analyze robots.txt for SEO compliance
Python: 3.10+
Usage:
python robots_checker.py --url https://example.com/robots.txt
python robots_checker.py --url https://example.com --test-url /admin/
"""
import argparse
import json
import logging
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
import requests
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
@dataclass
class RobotsIssue:
    """Represents a single robots.txt finding, graded by severity."""
    severity: str  # "error", "warning", "info"
    message: str  # human-readable description of the finding
    line_number: int | None = None  # 1-based line in robots.txt, when known
    directive: str | None = None  # offending directive text, e.g. "Disallow: /"
    suggestion: str | None = None  # recommended remediation, if any
@dataclass
class UserAgentRules:
    """Rules for a specific user-agent group in robots.txt."""
    user_agent: str  # value of the User-agent directive, e.g. "*" or "Googlebot"
    disallow: list[str] = field(default_factory=list)  # blocked path patterns
    allow: list[str] = field(default_factory=list)  # explicitly allowed path patterns
    crawl_delay: float | None = None  # seconds between requests, if declared
@dataclass
class RobotsResult:
    """Complete robots.txt analysis result."""
    url: str
    accessible: bool = True
    content: str = ""
    rules: list[UserAgentRules] = field(default_factory=list)
    sitemaps: list[str] = field(default_factory=list)
    issues: list[RobotsIssue] = field(default_factory=list)
    stats: dict = field(default_factory=dict)
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
    def to_dict(self) -> dict:
        """Convert to a JSON-serializable dict (raw content is omitted)."""
        def serialize_group(group: UserAgentRules) -> dict:
            return {
                "user_agent": group.user_agent,
                "disallow": group.disallow,
                "allow": group.allow,
                "crawl_delay": group.crawl_delay,
            }
        def serialize_issue(found: RobotsIssue) -> dict:
            return {
                "severity": found.severity,
                "message": found.message,
                "line_number": found.line_number,
                "directive": found.directive,
                "suggestion": found.suggestion,
            }
        return {
            "url": self.url,
            "accessible": self.accessible,
            "sitemaps": self.sitemaps,
            "rules": [serialize_group(g) for g in self.rules],
            "issues": [serialize_issue(i) for i in self.issues],
            "stats": self.stats,
            "timestamp": self.timestamp,
        }
class RobotsChecker:
    """Analyze robots.txt configuration for SEO compliance.

    Fetches, parses and inspects a site's robots.txt, producing a
    RobotsResult with per-user-agent rules, declared sitemaps, a list of
    graded issues, and summary statistics.
    """
    # Common user agents
    USER_AGENTS = {
        "*": "All bots",
        "Googlebot": "Google crawler",
        "Googlebot-Image": "Google Image crawler",
        "Googlebot-News": "Google News crawler",
        "Googlebot-Video": "Google Video crawler",
        "Bingbot": "Bing crawler",
        "Slurp": "Yahoo crawler",
        "DuckDuckBot": "DuckDuckGo crawler",
        "Baiduspider": "Baidu crawler",
        "Yandex": "Yandex crawler",
        "facebot": "Facebook crawler",
        "Twitterbot": "Twitter crawler",
        "LinkedInBot": "LinkedIn crawler",
    }
    # Paths that should generally not be blocked
    IMPORTANT_PATHS = [
        "/",
        "/*.css",
        "/*.js",
        "/*.jpg",
        "/*.jpeg",
        "/*.png",
        "/*.gif",
        "/*.svg",
        "/*.webp",
    ]
    # Paths commonly blocked
    COMMON_BLOCKED = [
        "/admin",
        "/wp-admin",
        "/login",
        "/private",
        "/api",
        "/cgi-bin",
        "/tmp",
        "/search",
    ]
    def __init__(self):
        """Create a session with an identifiable crawler user agent."""
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (compatible; SEOAuditBot/1.0)"
        })
    def fetch_robots(self, url: str) -> str | None:
        """Fetch robots.txt content.

        Returns the body text on HTTP 200, None on 404 (no robots.txt),
        and raises RuntimeError for any other status or network failure.
        """
        # Normalize any site URL to its /robots.txt location.
        parsed = urlparse(url)
        if not parsed.path.endswith("robots.txt"):
            robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        else:
            robots_url = url
        try:
            response = self.session.get(robots_url, timeout=10)
            if response.status_code == 200:
                return response.text
            elif response.status_code == 404:
                return None
            else:
                raise RuntimeError(f"HTTP {response.status_code}")
        except requests.RequestException as e:
            raise RuntimeError(f"Failed to fetch robots.txt: {e}")
    def parse_robots(self, content: str) -> tuple[list[UserAgentRules], list[str]]:
        """Parse robots.txt content into per-user-agent rules and sitemap URLs.

        NOTE: each User-agent line starts a fresh rule group, so consecutive
        User-agent lines do not share the directives that follow them — a
        simplification of the grouping described in RFC 9309.
        """
        rules: list[UserAgentRules] = []
        sitemaps: list[str] = []
        current_rules: UserAgentRules | None = None
        for line in content.split("\n"):
            line = line.strip()
            # Skip empty lines and comments
            if not line or line.startswith("#"):
                continue
            # Every directive is "name: value"; anything else is ignored here
            # (syntax problems are reported separately by _analyze_syntax).
            if ":" not in line:
                continue
            directive, value = line.split(":", 1)
            directive = directive.strip().lower()
            value = value.strip()
            if directive == "user-agent":
                # Close out the previous group before starting a new one.
                if current_rules:
                    rules.append(current_rules)
                current_rules = UserAgentRules(user_agent=value)
            elif directive == "disallow" and current_rules:
                if value:  # Empty disallow means allow all
                    current_rules.disallow.append(value)
            elif directive == "allow" and current_rules:
                if value:
                    current_rules.allow.append(value)
            elif directive == "crawl-delay" and current_rules:
                try:
                    current_rules.crawl_delay = float(value)
                except ValueError:
                    pass  # non-numeric delay: ignore rather than fail
            elif directive == "sitemap":
                if value:
                    sitemaps.append(value)
        # Flush the trailing group.
        if current_rules:
            rules.append(current_rules)
        return rules, sitemaps
    def analyze(self, url: str) -> RobotsResult:
        """Fetch and analyze robots.txt, returning a populated RobotsResult."""
        result = RobotsResult(url=url)
        # Fetch robots.txt
        try:
            content = self.fetch_robots(url)
            if content is None:
                # A missing robots.txt (404) is informational, not an error.
                result.accessible = False
                result.issues.append(RobotsIssue(
                    severity="info",
                    message="No robots.txt found (returns 404)",
                    suggestion="Consider creating a robots.txt file",
                ))
                return result
        except RuntimeError as e:
            result.accessible = False
            result.issues.append(RobotsIssue(
                severity="error",
                message=str(e),
            ))
            return result
        result.content = content
        result.rules, result.sitemaps = self.parse_robots(content)
        # Run the individual analyzers; each appends to result.issues.
        self._analyze_syntax(result)
        self._analyze_rules(result)
        self._analyze_sitemaps(result)
        # Calculate stats
        result.stats = {
            "user_agents_count": len(result.rules),
            "user_agents": [r.user_agent for r in result.rules],
            "total_disallow_rules": sum(len(r.disallow) for r in result.rules),
            "total_allow_rules": sum(len(r.allow) for r in result.rules),
            "sitemaps_count": len(result.sitemaps),
            # Explicit None check so "Crawl-delay: 0" still counts as declared
            # (the previous truthiness test treated a zero delay as absent).
            "has_crawl_delay": any(r.crawl_delay is not None for r in result.rules),
            "content_length": len(content),
        }
        return result
    def _analyze_syntax(self, result: RobotsResult) -> None:
        """Check for syntax issues (lines without colons, unknown directives)."""
        # Hoisted out of the loop: the valid directive set never changes.
        valid_directives = {
            "user-agent", "disallow", "allow",
            "crawl-delay", "sitemap", "host",
        }
        lines = result.content.split("\n")
        for line_num, line in enumerate(lines, 1):
            line = line.strip()
            # Skip empty lines and comments
            if not line or line.startswith("#"):
                continue
            # Check for valid directive
            if ":" not in line:
                result.issues.append(RobotsIssue(
                    severity="warning",
                    message=f"Invalid line (missing colon): {line[:50]}",
                    line_number=line_num,
                ))
                continue
            directive = line.split(":", 1)[0].strip().lower()
            if directive not in valid_directives:
                result.issues.append(RobotsIssue(
                    severity="info",
                    message=f"Unknown directive: {directive}",
                    line_number=line_num,
                    directive=directive,
                ))
    def _analyze_rules(self, result: RobotsResult) -> None:
        """Analyze blocking rules for SEO-impacting patterns."""
        # Check if there are any rules
        if not result.rules:
            result.issues.append(RobotsIssue(
                severity="info",
                message="No user-agent rules defined",
                suggestion="Add User-agent: * rules to control crawling",
            ))
            return
        # Check for wildcard rule
        has_wildcard = any(r.user_agent == "*" for r in result.rules)
        if not has_wildcard:
            result.issues.append(RobotsIssue(
                severity="info",
                message="No wildcard (*) user-agent defined",
                suggestion="Consider adding User-agent: * as fallback",
            ))
        # Check for blocking important resources
        for rules in result.rules:
            for disallow in rules.disallow:
                # Blocking the site root prevents all crawling for this agent.
                if disallow == "/":
                    result.issues.append(RobotsIssue(
                        severity="error",
                        message=f"Blocking entire site for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                        suggestion="This will prevent indexing. Is this intentional?",
                    ))
                # Blocked CSS/JS can stop crawlers from rendering the page.
                if any(ext in disallow.lower() for ext in [".css", ".js"]):
                    result.issues.append(RobotsIssue(
                        severity="warning",
                        message=f"Blocking CSS/JS files for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                        suggestion="May affect rendering and SEO",
                    ))
                # Check for blocking images
                if any(ext in disallow.lower() for ext in [".jpg", ".png", ".gif", ".webp"]):
                    result.issues.append(RobotsIssue(
                        severity="info",
                        message=f"Blocking image files for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                    ))
            # Crawl-delay slows crawling; large values can throttle indexing.
            if rules.crawl_delay:
                if rules.crawl_delay > 10:
                    result.issues.append(RobotsIssue(
                        severity="warning",
                        message=f"High crawl-delay ({rules.crawl_delay}s) for {rules.user_agent}",
                        directive=f"Crawl-delay: {rules.crawl_delay}",
                        suggestion="May significantly slow indexing",
                    ))
                elif rules.crawl_delay > 0:
                    result.issues.append(RobotsIssue(
                        severity="info",
                        message=f"Crawl-delay set to {rules.crawl_delay}s for {rules.user_agent}",
                    ))
    def _analyze_sitemaps(self, result: RobotsResult) -> None:
        """Analyze sitemap declarations (presence and absolute URLs)."""
        if not result.sitemaps:
            result.issues.append(RobotsIssue(
                severity="warning",
                message="No sitemap declared in robots.txt",
                suggestion="Add Sitemap: directive to help crawlers find your sitemap",
            ))
        else:
            for sitemap in result.sitemaps:
                if not sitemap.startswith("http"):
                    result.issues.append(RobotsIssue(
                        severity="warning",
                        message=f"Sitemap URL should be absolute: {sitemap}",
                        directive=f"Sitemap: {sitemap}",
                    ))
    def test_url(self, robots_url: str, test_path: str,
                 user_agent: str = "Googlebot") -> dict:
        """Test if a specific URL path is allowed for a user agent.

        Uses urllib.robotparser, which fetches robots.txt itself; the
        "allowed" key is None (with an "error" key) when that fails.
        """
        # Use Python's built-in parser
        rp = RobotFileParser()
        # Ensure robots.txt URL
        parsed = urlparse(robots_url)
        if not parsed.path.endswith("robots.txt"):
            robots_txt_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        else:
            robots_txt_url = robots_url
        rp.set_url(robots_txt_url)
        try:
            rp.read()
        except Exception as e:
            return {
                "path": test_path,
                "user_agent": user_agent,
                "allowed": None,
                "error": str(e),
            }
        # Build full URL for testing
        base_url = f"{parsed.scheme}://{parsed.netloc}"
        full_url = urljoin(base_url, test_path)
        allowed = rp.can_fetch(user_agent, full_url)
        return {
            "path": test_path,
            "user_agent": user_agent,
            "allowed": allowed,
            "full_url": full_url,
        }
    def generate_report(self, result: RobotsResult) -> str:
        """Generate human-readable analysis report."""
        lines = [
            "=" * 60,
            "Robots.txt Analysis Report",
            "=" * 60,
            f"URL: {result.url}",
            f"Accessible: {'Yes' if result.accessible else 'No'}",
            f"Timestamp: {result.timestamp}",
            "",
        ]
        if result.accessible:
            lines.append("Statistics:")
            for key, value in result.stats.items():
                if key == "user_agents":
                    lines.append(f"  {key}: {', '.join(value) if value else 'None'}")
                else:
                    lines.append(f"  {key}: {value}")
            lines.append("")
        if result.sitemaps:
            lines.append(f"Sitemaps ({len(result.sitemaps)}):")
            for sitemap in result.sitemaps:
                lines.append(f"  - {sitemap}")
            lines.append("")
        if result.rules:
            lines.append("Rules Summary:")
            for rules in result.rules:
                lines.append(f"\n  User-agent: {rules.user_agent}")
                if rules.disallow:
                    lines.append(f"    Disallow: {len(rules.disallow)} rules")
                    for d in rules.disallow[:5]:
                        lines.append(f"      - {d}")
                    if len(rules.disallow) > 5:
                        lines.append(f"      ... and {len(rules.disallow) - 5} more")
                if rules.allow:
                    lines.append(f"    Allow: {len(rules.allow)} rules")
                    for a in rules.allow[:3]:
                        lines.append(f"      - {a}")
                if rules.crawl_delay:
                    lines.append(f"    Crawl-delay: {rules.crawl_delay}s")
            lines.append("")
        if result.issues:
            lines.append("Issues Found:")
            errors = [i for i in result.issues if i.severity == "error"]
            warnings = [i for i in result.issues if i.severity == "warning"]
            infos = [i for i in result.issues if i.severity == "info"]
            if errors:
                lines.append(f"\n  ERRORS ({len(errors)}):")
                for issue in errors:
                    lines.append(f"    - {issue.message}")
                    if issue.directive:
                        lines.append(f"      Directive: {issue.directive}")
                    if issue.suggestion:
                        lines.append(f"      Suggestion: {issue.suggestion}")
            if warnings:
                lines.append(f"\n  WARNINGS ({len(warnings)}):")
                for issue in warnings:
                    lines.append(f"    - {issue.message}")
                    if issue.suggestion:
                        lines.append(f"      Suggestion: {issue.suggestion}")
            if infos:
                lines.append(f"\n  INFO ({len(infos)}):")
                for issue in infos:
                    lines.append(f"    - {issue.message}")
            lines.append("")
        lines.append("=" * 60)
        return "\n".join(lines)
def main():
    """Main entry point for CLI usage."""
    arg_parser = argparse.ArgumentParser(
        description="Analyze robots.txt configuration",
    )
    arg_parser.add_argument("--url", "-u", required=True,
                            help="URL to robots.txt or domain")
    arg_parser.add_argument("--test-url", "-t",
                            help="Test if specific URL path is allowed")
    arg_parser.add_argument("--user-agent", "-a", default="Googlebot",
                            help="User agent for testing (default: Googlebot)")
    arg_parser.add_argument("--output", "-o", help="Output file for JSON report")
    arg_parser.add_argument("--json", action="store_true", help="Output as JSON")
    opts = arg_parser.parse_args()
    checker = RobotsChecker()
    if opts.test_url:
        # Single-path permission check.
        verdict = checker.test_url(opts.url, opts.test_url, opts.user_agent)
        if opts.json:
            print(json.dumps(verdict, indent=2))
            return
        status = "ALLOWED" if verdict["allowed"] else "BLOCKED"
        print(f"URL: {verdict['path']}")
        print(f"User-Agent: {verdict['user_agent']}")
        print(f"Status: {status}")
        return
    # Full robots.txt analysis.
    analysis = checker.analyze(opts.url)
    if opts.json or opts.output:
        payload = json.dumps(analysis.to_dict(), ensure_ascii=False, indent=2)
        if opts.output:
            with open(opts.output, "w", encoding="utf-8") as f:
                f.write(payload)
            logger.info(f"Report written to {opts.output}")
        else:
            print(payload)
    else:
        print(checker.generate_report(analysis))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,490 @@
"""
Schema Generator - Generate JSON-LD structured data markup
==========================================================
Purpose: Generate schema.org structured data in JSON-LD format
Python: 3.10+
Usage:
python schema_generator.py --type organization --name "Company Name" --url "https://example.com"
"""
import argparse
import json
import logging
import os
import re
from datetime import datetime
from pathlib import Path
from typing import Any
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
# Template directory relative to this script
TEMPLATE_DIR = Path(__file__).parent.parent / "templates" / "schema_templates"
class SchemaGenerator:
"""Generate JSON-LD schema markup from templates."""
SCHEMA_TYPES = {
"organization": "organization.json",
"local_business": "local_business.json",
"product": "product.json",
"article": "article.json",
"faq": "faq.json",
"breadcrumb": "breadcrumb.json",
"website": "website.json",
}
# Business type mappings for LocalBusiness
BUSINESS_TYPES = {
"restaurant": "Restaurant",
"cafe": "CafeOrCoffeeShop",
"bar": "BarOrPub",
"hotel": "Hotel",
"store": "Store",
"medical": "MedicalBusiness",
"dental": "Dentist",
"legal": "LegalService",
"real_estate": "RealEstateAgent",
"auto": "AutoRepair",
"beauty": "BeautySalon",
"gym": "HealthClub",
"spa": "DaySpa",
}
# Article type mappings
ARTICLE_TYPES = {
"article": "Article",
"blog": "BlogPosting",
"news": "NewsArticle",
"tech": "TechArticle",
"scholarly": "ScholarlyArticle",
}
def __init__(self, template_dir: Path = TEMPLATE_DIR):
self.template_dir = template_dir
def load_template(self, schema_type: str) -> dict:
"""Load a schema template file."""
if schema_type not in self.SCHEMA_TYPES:
raise ValueError(f"Unknown schema type: {schema_type}. "
f"Available: {list(self.SCHEMA_TYPES.keys())}")
template_file = self.template_dir / self.SCHEMA_TYPES[schema_type]
if not template_file.exists():
raise FileNotFoundError(f"Template not found: {template_file}")
with open(template_file, "r", encoding="utf-8") as f:
return json.load(f)
def fill_template(self, template: dict, data: dict[str, Any]) -> dict:
"""Fill template placeholders with actual data."""
template_str = json.dumps(template, ensure_ascii=False)
# Replace placeholders {{key}} with values
for key, value in data.items():
placeholder = f"{{{{{key}}}}}"
if value is not None:
template_str = template_str.replace(placeholder, str(value))
# Remove unfilled placeholders and their parent objects if empty
result = json.loads(template_str)
return self._clean_empty_values(result)
def _clean_empty_values(self, obj: Any) -> Any:
"""Remove empty values and unfilled placeholders."""
if isinstance(obj, dict):
cleaned = {}
for key, value in obj.items():
cleaned_value = self._clean_empty_values(value)
# Skip if value is empty, None, or unfilled placeholder
if cleaned_value is None:
continue
if isinstance(cleaned_value, str) and cleaned_value.startswith("{{"):
continue
if isinstance(cleaned_value, (list, dict)) and not cleaned_value:
continue
cleaned[key] = cleaned_value
return cleaned if cleaned else None
elif isinstance(obj, list):
cleaned = []
for item in obj:
cleaned_item = self._clean_empty_values(item)
if cleaned_item is not None:
if isinstance(cleaned_item, str) and cleaned_item.startswith("{{"):
continue
cleaned.append(cleaned_item)
return cleaned if cleaned else None
elif isinstance(obj, str):
if obj.startswith("{{") and obj.endswith("}}"):
return None
return obj
return obj
def generate_organization(
self,
name: str,
url: str,
logo_url: str | None = None,
description: str | None = None,
founding_date: str | None = None,
phone: str | None = None,
address: dict | None = None,
social_links: list[str] | None = None,
) -> dict:
"""Generate Organization schema."""
template = self.load_template("organization")
data = {
"name": name,
"url": url,
"logo_url": logo_url,
"description": description,
"founding_date": founding_date,
"phone": phone,
}
if address:
data.update({
"street_address": address.get("street"),
"city": address.get("city"),
"region": address.get("region"),
"postal_code": address.get("postal_code"),
"country": address.get("country", "KR"),
})
if social_links:
# Handle social links specially
pass
return self.fill_template(template, data)
def generate_local_business(
self,
name: str,
business_type: str,
address: dict,
phone: str | None = None,
url: str | None = None,
description: str | None = None,
hours: dict | None = None,
geo: dict | None = None,
price_range: str | None = None,
rating: float | None = None,
review_count: int | None = None,
) -> dict:
"""Generate LocalBusiness schema."""
template = self.load_template("local_business")
schema_business_type = self.BUSINESS_TYPES.get(
business_type.lower(), "LocalBusiness"
)
data = {
"business_type": schema_business_type,
"name": name,
"url": url,
"description": description,
"phone": phone,
"price_range": price_range,
"street_address": address.get("street"),
"city": address.get("city"),
"region": address.get("region"),
"postal_code": address.get("postal_code"),
"country": address.get("country", "KR"),
}
if geo:
data["latitude"] = geo.get("lat")
data["longitude"] = geo.get("lng")
if hours:
data.update({
"weekday_opens": hours.get("weekday_opens", "09:00"),
"weekday_closes": hours.get("weekday_closes", "18:00"),
"weekend_opens": hours.get("weekend_opens"),
"weekend_closes": hours.get("weekend_closes"),
})
if rating is not None:
data["rating"] = str(rating)
data["review_count"] = str(review_count or 0)
return self.fill_template(template, data)
def generate_product(
self,
name: str,
description: str,
price: float,
currency: str = "KRW",
brand: str | None = None,
sku: str | None = None,
images: list[str] | None = None,
availability: str = "InStock",
condition: str = "NewCondition",
rating: float | None = None,
review_count: int | None = None,
url: str | None = None,
seller: str | None = None,
) -> dict:
"""Generate Product schema."""
template = self.load_template("product")
data = {
"name": name,
"description": description,
"price": str(int(price)),
"currency": currency,
"brand_name": brand,
"sku": sku,
"product_url": url,
"availability": availability,
"condition": condition,
"seller_name": seller,
}
if images:
for i, img in enumerate(images[:3], 1):
data[f"image_url_{i}"] = img
if rating is not None:
data["rating"] = str(rating)
data["review_count"] = str(review_count or 0)
return self.fill_template(template, data)
def generate_article(
self,
headline: str,
description: str,
author_name: str,
date_published: str,
publisher_name: str,
article_type: str = "article",
date_modified: str | None = None,
images: list[str] | None = None,
page_url: str | None = None,
publisher_logo: str | None = None,
author_url: str | None = None,
section: str | None = None,
word_count: int | None = None,
keywords: str | None = None,
) -> dict:
"""Generate Article schema."""
template = self.load_template("article")
schema_article_type = self.ARTICLE_TYPES.get(
article_type.lower(), "Article"
)
data = {
"article_type": schema_article_type,
"headline": headline,
"description": description,
"author_name": author_name,
"author_url": author_url,
"date_published": date_published,
"date_modified": date_modified or date_published,
"publisher_name": publisher_name,
"publisher_logo_url": publisher_logo,
"page_url": page_url,
"section": section,
"word_count": str(word_count) if word_count else None,
"keywords": keywords,
}
if images:
for i, img in enumerate(images[:2], 1):
data[f"image_url_{i}"] = img
return self.fill_template(template, data)
def generate_faq(self, questions: list[dict[str, str]]) -> dict:
"""Generate FAQPage schema."""
schema = {
"@context": "https://schema.org",
"@type": "FAQPage",
"mainEntity": [],
}
for qa in questions:
schema["mainEntity"].append({
"@type": "Question",
"name": qa["question"],
"acceptedAnswer": {
"@type": "Answer",
"text": qa["answer"],
},
})
return schema
def generate_breadcrumb(self, items: list[dict[str, str]]) -> dict:
"""Generate BreadcrumbList schema."""
schema = {
"@context": "https://schema.org",
"@type": "BreadcrumbList",
"itemListElement": [],
}
for i, item in enumerate(items, 1):
schema["itemListElement"].append({
"@type": "ListItem",
"position": i,
"name": item["name"],
"item": item["url"],
})
return schema
def generate_website(
self,
name: str,
url: str,
search_url_template: str | None = None,
description: str | None = None,
language: str = "ko-KR",
publisher_name: str | None = None,
logo_url: str | None = None,
alternate_name: str | None = None,
) -> dict:
"""Generate WebSite schema."""
template = self.load_template("website")
data = {
"site_name": name,
"url": url,
"description": description,
"language": language,
"search_url_template": search_url_template,
"publisher_name": publisher_name or name,
"logo_url": logo_url,
"alternate_name": alternate_name,
}
return self.fill_template(template, data)
def to_json_ld(self, schema: dict, pretty: bool = True) -> str:
"""Convert schema dict to JSON-LD string."""
indent = 2 if pretty else None
return json.dumps(schema, ensure_ascii=False, indent=indent)
def to_html_script(self, schema: dict) -> str:
"""Wrap schema in HTML script tag."""
json_ld = self.to_json_ld(schema)
return f'<script type="application/ld+json">\n{json_ld}\n</script>'
def _build_schema(generator: "SchemaGenerator", args) -> dict:
    """Build the schema selected by ``args.type`` from parsed CLI arguments."""
    kind = args.type
    if kind == "organization":
        return generator.generate_organization(
            name=args.name or "Organization Name",
            url=args.url or "https://example.com",
            description=args.description,
        )
    if kind == "product":
        return generator.generate_product(
            name=args.name or "Product Name",
            description=args.description or "Product description",
            price=args.price or 0,
            currency=args.currency,
        )
    if kind == "article":
        return generator.generate_article(
            headline=args.headline or args.name or "Article Title",
            description=args.description or "Article description",
            author_name=args.author or "Author",
            date_published=datetime.now().strftime("%Y-%m-%d"),
            publisher_name="Publisher",
        )
    if kind == "website":
        return generator.generate_website(
            name=args.name or "Website Name",
            url=args.url or "https://example.com",
            description=args.description,
        )
    if kind == "faq":
        # Example FAQ
        return generator.generate_faq([
            {"question": "Question 1?", "answer": "Answer 1"},
            {"question": "Question 2?", "answer": "Answer 2"},
        ])
    if kind == "breadcrumb":
        # Example breadcrumb
        return generator.generate_breadcrumb([
            {"name": "Home", "url": "https://example.com/"},
            {"name": "Category", "url": "https://example.com/category/"},
        ])
    if kind == "local_business":
        return generator.generate_local_business(
            name=args.name or "Business Name",
            business_type="store",
            address={"street": "123 Main St", "city": "Seoul", "country": "KR"},
            url=args.url,
            description=args.description,
        )
    # Unreachable for argparse-validated input; kept as a safety net.
    raise ValueError(f"Unsupported type: {args.type}")


def main():
    """CLI entry point: generate one schema and print or save it."""
    parser = argparse.ArgumentParser(
        description="Generate JSON-LD schema markup",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Generate Organization schema
  python schema_generator.py --type organization --name "My Company" --url "https://example.com"
  # Generate Product schema
  python schema_generator.py --type product --name "Widget" --price 29900 --currency KRW
  # Generate Article schema
  python schema_generator.py --type article --headline "Article Title" --author "John Doe"
""",
    )
    parser.add_argument(
        "--type", "-t",
        required=True,
        choices=SchemaGenerator.SCHEMA_TYPES.keys(),
        help="Schema type to generate",
    )
    parser.add_argument("--name", help="Name/title")
    parser.add_argument("--url", help="URL")
    parser.add_argument("--description", help="Description")
    parser.add_argument("--price", type=float, help="Price (for product)")
    parser.add_argument("--currency", default="KRW", help="Currency code")
    parser.add_argument("--headline", help="Headline (for article)")
    parser.add_argument("--author", help="Author name")
    parser.add_argument("--output", "-o", help="Output file path")
    parser.add_argument("--html", action="store_true", help="Output as HTML script tag")
    args = parser.parse_args()

    generator = SchemaGenerator()
    try:
        schema = _build_schema(generator, args)
        rendered = (
            generator.to_html_script(schema)
            if args.html
            else generator.to_json_ld(schema)
        )
        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(rendered)
            logger.info(f"Schema written to {args.output}")
        else:
            print(rendered)
    except Exception as e:
        logger.error(f"Error generating schema: {e}")
        raise


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,498 @@
"""
Schema Validator - Validate JSON-LD structured data markup
==========================================================
Purpose: Extract and validate schema.org structured data from URLs or files
Python: 3.10+
Usage:
python schema_validator.py --url https://example.com
python schema_validator.py --file schema.json
"""
import argparse
import json
import logging
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
try:
import extruct
HAS_EXTRUCT = True
except ImportError:
HAS_EXTRUCT = False
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
@dataclass
class ValidationIssue:
"""Represents a validation issue found in schema."""
severity: str # "error", "warning", "info"
message: str
schema_type: str | None = None
property_name: str | None = None
suggestion: str | None = None
@dataclass
class ValidationResult:
"""Complete validation result for a schema."""
url: str | None = None
schemas_found: list[dict] = field(default_factory=list)
issues: list[ValidationIssue] = field(default_factory=list)
valid: bool = True
rich_results_eligible: dict = field(default_factory=dict)
timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
def to_dict(self) -> dict:
"""Convert to dictionary for JSON output."""
return {
"url": self.url,
"schemas_found": len(self.schemas_found),
"schema_types": [s.get("@type", "Unknown") for s in self.schemas_found],
"valid": self.valid,
"issues": [
{
"severity": i.severity,
"message": i.message,
"schema_type": i.schema_type,
"property": i.property_name,
"suggestion": i.suggestion,
}
for i in self.issues
],
"rich_results_eligible": self.rich_results_eligible,
"timestamp": self.timestamp,
}
class SchemaValidator:
    """Validate schema.org structured data.

    Extracts structured data (JSON-LD always; Microdata/RDFa additionally
    when the optional ``extruct`` package is installed) from a URL, HTML
    string, or schema dict, then checks each schema object against
    required/recommended property tables and Google Rich Results rules.
    """

    # Required properties for common schema types.
    # A missing entry is reported with "error" severity and makes the
    # overall result invalid.
    REQUIRED_PROPERTIES = {
        "Organization": ["name", "url"],
        "LocalBusiness": ["name", "address"],
        "Product": ["name"],
        "Offer": ["price", "priceCurrency"],
        "Article": ["headline", "author", "datePublished", "publisher"],
        "BlogPosting": ["headline", "author", "datePublished", "publisher"],
        "NewsArticle": ["headline", "author", "datePublished", "publisher"],
        "FAQPage": ["mainEntity"],
        "Question": ["name", "acceptedAnswer"],
        "Answer": ["text"],
        "BreadcrumbList": ["itemListElement"],
        "ListItem": ["position", "name"],
        "WebSite": ["name", "url"],
        "WebPage": ["name"],
        "Person": ["name"],
        "Event": ["name", "startDate", "location"],
        "Review": ["reviewRating", "author"],
        "AggregateRating": ["ratingValue"],
        "ImageObject": ["url"],
    }

    # Recommended (but not required) properties.
    # A missing entry is reported with "info" severity only.
    RECOMMENDED_PROPERTIES = {
        "Organization": ["logo", "description", "contactPoint", "sameAs"],
        "LocalBusiness": ["telephone", "openingHoursSpecification", "geo", "image"],
        "Product": ["description", "image", "brand", "offers", "aggregateRating"],
        "Article": ["image", "dateModified", "description"],
        "FAQPage": [],
        "WebSite": ["potentialAction"],
        "BreadcrumbList": [],
    }

    # Google Rich Results eligible types; schemas of these types get an
    # additional eligibility check in _check_rich_results().
    RICH_RESULTS_TYPES = {
        "Article", "BlogPosting", "NewsArticle",
        "Product", "Review",
        "FAQPage", "HowTo",
        "LocalBusiness", "Restaurant",
        "Event",
        "Recipe",
        "JobPosting",
        "Course",
        "BreadcrumbList",
        "Organization",
        "WebSite",
        "VideoObject",
    }

    def __init__(self):
        """Create an HTTP session with a crawler-identifying User-Agent."""
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (compatible; SEOAuditBot/1.0)"
        })

    def extract_from_url(self, url: str) -> list[dict]:
        """Extract all structured data from a URL.

        Returns an empty list (after logging) on any network error.
        """
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return self.extract_from_html(response.text, url)
        except requests.RequestException as e:
            logger.error(f"Failed to fetch URL: {e}")
            return []

    def extract_from_html(self, html: str, base_url: str | None = None) -> list[dict]:
        """Extract structured data from HTML content.

        Combines extruct output (when installed) with a manual JSON-LD
        scan, then de-duplicates by serialized content.
        """
        schemas = []
        # Method 1: Use extruct if available (handles JSON-LD, Microdata, RDFa)
        if HAS_EXTRUCT:
            try:
                data = extruct.extract(html, base_url=base_url, uniform=True)
                schemas.extend(data.get("json-ld", []))
                schemas.extend(data.get("microdata", []))
                schemas.extend(data.get("rdfa", []))
            except Exception as e:
                logger.warning(f"extruct extraction failed: {e}")
        # Method 2: Manual JSON-LD extraction (fallback/additional)
        soup = BeautifulSoup(html, "html.parser")
        for script in soup.find_all("script", type="application/ld+json"):
            try:
                content = script.string
                if content:
                    data = json.loads(content)
                    # A single script tag may hold one object or a list.
                    if isinstance(data, list):
                        schemas.extend(data)
                    else:
                        schemas.append(data)
            except json.JSONDecodeError as e:
                logger.warning(f"Invalid JSON-LD: {e}")
        # Deduplicate schemas (extruct and the manual pass often both find
        # the same JSON-LD blocks); key on a canonical serialization.
        seen = set()
        unique_schemas = []
        for schema in schemas:
            schema_str = json.dumps(schema, sort_keys=True)
            if schema_str not in seen:
                seen.add(schema_str)
                unique_schemas.append(schema)
        return unique_schemas

    def validate(self, url: str | None = None, html: str | None = None,
                 schema: dict | None = None) -> ValidationResult:
        """Validate schema from URL, HTML, or direct schema dict.

        Exactly one of the three sources is used, checked in the order
        schema -> html -> url.

        Raises:
            ValueError: If none of url/html/schema is provided.
        """
        result = ValidationResult(url=url)
        # Extract schemas
        if schema:
            schemas = [schema]
        elif html:
            schemas = self.extract_from_html(html, url)
        elif url:
            schemas = self.extract_from_url(url)
        else:
            raise ValueError("Must provide url, html, or schema")
        result.schemas_found = schemas
        if not schemas:
            result.issues.append(ValidationIssue(
                severity="warning",
                message="No structured data found",
                suggestion="Add JSON-LD schema markup to improve SEO",
            ))
            # No structured data at all is treated as invalid even though
            # the issue severity is only "warning".
            result.valid = False
            return result
        # Validate each schema
        # NOTE(review): the loop variable shadows the `schema` parameter.
        for schema in schemas:
            self._validate_schema(schema, result)
        # Check for errors (warnings don't affect validity)
        result.valid = not any(i.severity == "error" for i in result.issues)
        return result

    def _validate_schema(self, schema: dict, result: ValidationResult,
                         parent_type: str | None = None) -> None:
        """Validate a single schema object, appending issues to *result*.

        Recurses into nested objects/lists that carry their own @type.
        """
        schema_type = schema.get("@type")
        if not schema_type:
            result.issues.append(ValidationIssue(
                severity="error",
                message="Missing @type property",
                schema_type=parent_type,
            ))
            return
        # Handle array of types; only the first listed type is validated.
        if isinstance(schema_type, list):
            schema_type = schema_type[0]
        # Check required properties
        required = self.REQUIRED_PROPERTIES.get(schema_type, [])
        for prop in required:
            if prop not in schema:
                result.issues.append(ValidationIssue(
                    severity="error",
                    message=f"Missing required property: {prop}",
                    schema_type=schema_type,
                    property_name=prop,
                    suggestion=f"Add '{prop}' property to {schema_type} schema",
                ))
        # Check recommended properties
        recommended = self.RECOMMENDED_PROPERTIES.get(schema_type, [])
        for prop in recommended:
            if prop not in schema:
                result.issues.append(ValidationIssue(
                    severity="info",
                    message=f"Missing recommended property: {prop}",
                    schema_type=schema_type,
                    property_name=prop,
                    suggestion=f"Consider adding '{prop}' for better rich results",
                ))
        # Check Rich Results eligibility
        # NOTE(review): keyed by type, so multiple schemas of the same type
        # overwrite each other's eligibility entry.
        if schema_type in self.RICH_RESULTS_TYPES:
            result.rich_results_eligible[schema_type] = self._check_rich_results(
                schema, schema_type
            )
        # Validate nested schemas (skip @context/@type/@id keys themselves).
        for key, value in schema.items():
            if key.startswith("@"):
                continue
            if isinstance(value, dict) and "@type" in value:
                self._validate_schema(value, result, schema_type)
            elif isinstance(value, list):
                for item in value:
                    if isinstance(item, dict) and "@type" in item:
                        self._validate_schema(item, result, schema_type)
        # Type-specific validations
        self._validate_type_specific(schema, schema_type, result)

    def _validate_type_specific(self, schema: dict, schema_type: str,
                                result: ValidationResult) -> None:
        """Type-specific validation rules for Article/Product/LocalBusiness/FAQ."""
        if schema_type in ("Article", "BlogPosting", "NewsArticle"):
            # Check image
            if "image" not in schema:
                result.issues.append(ValidationIssue(
                    severity="warning",
                    message="Article without image may not show in rich results",
                    schema_type=schema_type,
                    property_name="image",
                    suggestion="Add at least one image to the article",
                ))
            # Check headline length (110-char ceiling enforced here).
            headline = schema.get("headline", "")
            if len(headline) > 110:
                result.issues.append(ValidationIssue(
                    severity="warning",
                    message=f"Headline too long ({len(headline)} chars, max 110)",
                    schema_type=schema_type,
                    property_name="headline",
                ))
        elif schema_type == "Product":
            # NOTE(review): only a dict-valued "offers" is inspected; a list
            # of Offer objects is not validated here.
            offer = schema.get("offers", {})
            if isinstance(offer, dict):
                # Check price parses as a number.
                price = offer.get("price")
                if price is not None:
                    try:
                        float(price)
                    except (ValueError, TypeError):
                        result.issues.append(ValidationIssue(
                            severity="error",
                            message=f"Invalid price value: {price}",
                            schema_type="Offer",
                            property_name="price",
                        ))
                # Check availability against known schema.org values
                # (substring match, so full URLs also pass).
                availability = offer.get("availability", "")
                valid_availabilities = [
                    "InStock", "OutOfStock", "PreOrder", "Discontinued",
                    "https://schema.org/InStock", "https://schema.org/OutOfStock",
                ]
                if availability and not any(
                    a in availability for a in valid_availabilities
                ):
                    result.issues.append(ValidationIssue(
                        severity="warning",
                        message=f"Unknown availability value: {availability}",
                        schema_type="Offer",
                        property_name="availability",
                    ))
        elif schema_type == "LocalBusiness":
            # Check for geo coordinates
            if "geo" not in schema:
                result.issues.append(ValidationIssue(
                    severity="info",
                    message="Missing geo coordinates",
                    schema_type=schema_type,
                    property_name="geo",
                    suggestion="Add latitude/longitude for better local search",
                ))
        elif schema_type == "FAQPage":
            main_entity = schema.get("mainEntity", [])
            if not main_entity:
                result.issues.append(ValidationIssue(
                    severity="error",
                    message="FAQPage must have at least one question",
                    schema_type=schema_type,
                    property_name="mainEntity",
                ))
            elif len(main_entity) < 2:
                result.issues.append(ValidationIssue(
                    severity="info",
                    message="FAQPage has only one question",
                    schema_type=schema_type,
                    suggestion="Add more questions for better rich results",
                ))

    def _check_rich_results(self, schema: dict, schema_type: str) -> dict:
        """Check if schema is eligible for Google Rich Results.

        Returns a dict with "eligible" (bool) and
        "missing_for_rich_results" (list of missing property names).
        Types without specific rules here default to eligible.
        """
        result = {
            "eligible": True,
            "missing_for_rich_results": [],
        }
        if schema_type in ("Article", "BlogPosting", "NewsArticle"):
            required_for_rich = ["headline", "image", "datePublished", "author"]
            for prop in required_for_rich:
                if prop not in schema:
                    result["eligible"] = False
                    result["missing_for_rich_results"].append(prop)
        elif schema_type == "Product":
            if "name" not in schema:
                result["eligible"] = False
                result["missing_for_rich_results"].append("name")
            offer = schema.get("offers")
            if not offer:
                result["eligible"] = False
                result["missing_for_rich_results"].append("offers")
        elif schema_type == "FAQPage":
            if not schema.get("mainEntity"):
                result["eligible"] = False
                result["missing_for_rich_results"].append("mainEntity")
        return result

    def generate_report(self, result: ValidationResult) -> str:
        """Generate a human-readable, plain-text validation report.

        Issues are grouped by severity (errors, then warnings, then info).
        """
        lines = [
            "=" * 60,
            "Schema Validation Report",
            "=" * 60,
            f"URL: {result.url or 'N/A'}",
            f"Timestamp: {result.timestamp}",
            f"Valid: {'Yes' if result.valid else 'No'}",
            f"Schemas Found: {len(result.schemas_found)}",
            "",
        ]
        if result.schemas_found:
            lines.append("Schema Types:")
            for schema in result.schemas_found:
                schema_type = schema.get("@type", "Unknown")
                lines.append(f"  - {schema_type}")
            lines.append("")
        if result.rich_results_eligible:
            lines.append("Rich Results Eligibility:")
            for schema_type, status in result.rich_results_eligible.items():
                eligible = "Yes" if status["eligible"] else "No"
                lines.append(f"  - {schema_type}: {eligible}")
                if status["missing_for_rich_results"]:
                    missing = ", ".join(status["missing_for_rich_results"])
                    lines.append(f"    Missing: {missing}")
            lines.append("")
        if result.issues:
            lines.append("Issues Found:")
            errors = [i for i in result.issues if i.severity == "error"]
            warnings = [i for i in result.issues if i.severity == "warning"]
            infos = [i for i in result.issues if i.severity == "info"]
            if errors:
                lines.append(f"\n  ERRORS ({len(errors)}):")
                for issue in errors:
                    lines.append(f"  - [{issue.schema_type}] {issue.message}")
                    if issue.suggestion:
                        lines.append(f"    Suggestion: {issue.suggestion}")
            if warnings:
                lines.append(f"\n  WARNINGS ({len(warnings)}):")
                for issue in warnings:
                    lines.append(f"  - [{issue.schema_type}] {issue.message}")
                    if issue.suggestion:
                        lines.append(f"    Suggestion: {issue.suggestion}")
            if infos:
                lines.append(f"\n  INFO ({len(infos)}):")
                for issue in infos:
                    lines.append(f"  - [{issue.schema_type}] {issue.message}")
                    if issue.suggestion:
                        lines.append(f"    Suggestion: {issue.suggestion}")
        lines.append("")
        lines.append("=" * 60)
        return "\n".join(lines)
def main():
    """CLI entry point: validate structured data from a URL or a JSON-LD file."""
    parser = argparse.ArgumentParser(
        description="Validate schema.org structured data",
    )
    parser.add_argument("--url", "-u", help="URL to validate")
    parser.add_argument("--file", "-f", help="JSON-LD file to validate")
    parser.add_argument("--output", "-o", help="Output file for JSON report")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()

    if not args.url and not args.file:
        parser.error("Must provide --url or --file")

    validator = SchemaValidator()
    if args.file:
        # A file takes precedence over --url when both are given.
        with open(args.file, "r", encoding="utf-8") as f:
            result = validator.validate(schema=json.load(f))
    else:
        result = validator.validate(url=args.url)

    if not (args.json or args.output):
        print(validator.generate_report(result))
        return
    payload = json.dumps(result.to_dict(), ensure_ascii=False, indent=2)
    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(payload)
        logger.info(f"Report written to {args.output}")
    else:
        print(payload)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,969 @@
"""
Sitemap Crawler - Sequential page analysis from sitemap
=======================================================
Purpose: Crawl sitemap URLs one by one, analyze each page, save to Notion
Python: 3.10+
Usage:
from sitemap_crawler import SitemapCrawler
crawler = SitemapCrawler()
crawler.crawl_sitemap("https://example.com/sitemap.xml", delay=2.0)
"""
import json
import logging
import time
import xml.etree.ElementTree as ET
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Callable, Generator
from urllib.parse import urlparse
import requests
from notion_client import Client
from base_client import config
from page_analyzer import PageAnalyzer, PageMetadata
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
# Default database for page analysis data
DEFAULT_PAGES_DATABASE_ID = "2c8581e5-8a1e-8035-880b-e38cefc2f3ef"

# Default limits to prevent excessive resource usage
DEFAULT_MAX_PAGES = 500
DEFAULT_DELAY_SECONDS = 2.0

# Progress tracking directory
# NOTE: created eagerly here, so importing this module has the side effect
# of writing a directory under the user's home.
PROGRESS_DIR = Path.home() / ".claude" / "seo-audit-progress"
PROGRESS_DIR.mkdir(parents=True, exist_ok=True)
@dataclass
class CrawlProgress:
"""Track crawl progress."""
total_urls: int = 0
processed_urls: int = 0
successful_urls: int = 0
failed_urls: int = 0
skipped_urls: int = 0
start_time: datetime = field(default_factory=datetime.now)
current_url: str = ""
audit_id: str = ""
site: str = ""
status: str = "running" # running, completed, failed
error_message: str = ""
summary_page_id: str = ""
def get_progress_percent(self) -> float:
if self.total_urls == 0:
return 0.0
return (self.processed_urls / self.total_urls) * 100
def get_elapsed_time(self) -> str:
elapsed = datetime.now() - self.start_time
minutes = int(elapsed.total_seconds() // 60)
seconds = int(elapsed.total_seconds() % 60)
return f"{minutes}m {seconds}s"
def get_eta(self) -> str:
if self.processed_urls == 0:
return "calculating..."
elapsed = (datetime.now() - self.start_time).total_seconds()
avg_time_per_url = elapsed / self.processed_urls
remaining_urls = self.total_urls - self.processed_urls
eta_seconds = remaining_urls * avg_time_per_url
minutes = int(eta_seconds // 60)
seconds = int(eta_seconds % 60)
return f"{minutes}m {seconds}s"
def to_dict(self) -> dict:
"""Convert to dictionary for JSON serialization."""
return {
"audit_id": self.audit_id,
"site": self.site,
"status": self.status,
"total_urls": self.total_urls,
"processed_urls": self.processed_urls,
"successful_urls": self.successful_urls,
"failed_urls": self.failed_urls,
"progress_percent": round(self.get_progress_percent(), 1),
"elapsed_time": self.get_elapsed_time(),
"eta": self.get_eta(),
"current_url": self.current_url,
"start_time": self.start_time.isoformat(),
"error_message": self.error_message,
"summary_page_id": self.summary_page_id,
"updated_at": datetime.now().isoformat(),
}
def save_to_file(self, filepath: Path | None = None) -> Path:
"""Save progress to JSON file."""
if filepath is None:
filepath = PROGRESS_DIR / f"{self.audit_id}.json"
with open(filepath, "w") as f:
json.dump(self.to_dict(), f, indent=2)
return filepath
@classmethod
def load_from_file(cls, filepath: Path) -> "CrawlProgress":
"""Load progress from JSON file."""
with open(filepath, "r") as f:
data = json.load(f)
progress = cls()
progress.audit_id = data.get("audit_id", "")
progress.site = data.get("site", "")
progress.status = data.get("status", "unknown")
progress.total_urls = data.get("total_urls", 0)
progress.processed_urls = data.get("processed_urls", 0)
progress.successful_urls = data.get("successful_urls", 0)
progress.failed_urls = data.get("failed_urls", 0)
progress.current_url = data.get("current_url", "")
progress.error_message = data.get("error_message", "")
progress.summary_page_id = data.get("summary_page_id", "")
if data.get("start_time"):
progress.start_time = datetime.fromisoformat(data["start_time"])
return progress
def get_active_crawls() -> list[CrawlProgress]:
    """Return every crawl job on disk whose status is still "running".

    Unreadable or malformed progress files are silently skipped.
    """
    running_jobs = []
    for progress_file in PROGRESS_DIR.glob("*.json"):
        try:
            job = CrawlProgress.load_from_file(progress_file)
            if job.status == "running":
                running_jobs.append(job)
        except Exception:
            continue
    return running_jobs
def get_all_crawls() -> list[CrawlProgress]:
    """Return all crawl jobs (active and completed).

    Ordered by reverse-lexicographic file name; unreadable files are skipped.
    """
    jobs = []
    for progress_file in sorted(PROGRESS_DIR.glob("*.json"), reverse=True):
        try:
            jobs.append(CrawlProgress.load_from_file(progress_file))
        except Exception:
            continue
    return jobs
def get_crawl_status(audit_id: str) -> CrawlProgress | None:
    """Return the saved progress for *audit_id*, or None if no file exists."""
    progress_file = PROGRESS_DIR / f"{audit_id}.json"
    if not progress_file.exists():
        return None
    return CrawlProgress.load_from_file(progress_file)
@dataclass
class CrawlResult:
    """Summary of a completed sitemap crawl."""

    site: str
    sitemap_url: str
    audit_id: str
    total_pages: int
    successful_pages: int
    failed_pages: int
    start_time: datetime
    end_time: datetime
    pages_analyzed: list[PageMetadata] = field(default_factory=list)
    notion_page_ids: list[str] = field(default_factory=list)
    summary_page_id: str | None = None

    def get_duration(self) -> str:
        """Return the total crawl duration formatted as 'Xm Ys'."""
        total_seconds = int((self.end_time - self.start_time).total_seconds())
        minutes, seconds = divmod(total_seconds, 60)
        return f"{minutes}m {seconds}s"
class SitemapCrawler:
    """Crawl sitemap URLs and analyze each page.

    Fetches a sitemap (recursing into sitemap indexes), runs PageAnalyzer on
    each URL sequentially with a polite delay, persists progress through
    CrawlProgress JSON files, and optionally writes a per-page report plus a
    crawl summary into a Notion database.
    """
    def __init__(
        self,
        notion_token: str | None = None,
        database_id: str | None = None,
    ) -> None:
        """
        Initialize sitemap crawler.
        Args:
            notion_token: Notion API token (falls back to config.notion_token)
            database_id: Notion database ID for storing results
                (falls back to DEFAULT_PAGES_DATABASE_ID)
        """
        self.notion_token = notion_token or config.notion_token
        self.database_id = database_id or DEFAULT_PAGES_DATABASE_ID
        self.analyzer = PageAnalyzer()
        # Without a token the crawl still runs; results are simply not persisted.
        if self.notion_token:
            self.notion = Client(auth=self.notion_token)
        else:
            self.notion = None
            logger.warning("Notion token not configured, results will not be saved")
    def fetch_sitemap_urls(self, sitemap_url: str) -> list[str]:
        """
        Fetch and parse URLs from a sitemap.

        Handles both plain <urlset> sitemaps and <sitemapindex> files; child
        sitemaps are fetched recursively and their URLs merged.
        Args:
            sitemap_url: URL of the sitemap
        Returns:
            List of URLs found in the sitemap (deduplicated, order preserved)
        Raises:
            Exception: re-raised after logging if the fetch or XML parse fails
        """
        try:
            response = requests.get(sitemap_url, timeout=30)
            response.raise_for_status()
            # Parse XML
            root = ET.fromstring(response.content)
            # Namespace map for the standard sitemap schema
            namespaces = {
                "sm": "http://www.sitemaps.org/schemas/sitemap/0.9"
            }
            urls = []
            # Check if this is a sitemap index
            sitemap_tags = root.findall(".//sm:sitemap/sm:loc", namespaces)
            if sitemap_tags:
                # This is a sitemap index, recursively fetch child sitemaps
                logger.info(f"Found sitemap index with {len(sitemap_tags)} child sitemaps")
                for loc in sitemap_tags:
                    if loc.text:
                        child_urls = self.fetch_sitemap_urls(loc.text)
                        urls.extend(child_urls)
            else:
                # Regular sitemap, extract URLs
                url_tags = root.findall(".//sm:url/sm:loc", namespaces)
                if not url_tags:
                    # Try without namespace (some sitemaps omit the xmlns)
                    url_tags = root.findall(".//url/loc")
                for loc in url_tags:
                    if loc.text:
                        urls.append(loc.text)
            # Remove duplicates while preserving order
            seen = set()
            unique_urls = []
            for url in urls:
                if url not in seen:
                    seen.add(url)
                    unique_urls.append(url)
            logger.info(f"Found {len(unique_urls)} unique URLs in sitemap")
            return unique_urls
        except Exception as e:
            logger.error(f"Failed to fetch sitemap: {e}")
            raise
    def crawl_sitemap(
        self,
        sitemap_url: str,
        delay: float = DEFAULT_DELAY_SECONDS,
        max_pages: int = DEFAULT_MAX_PAGES,
        progress_callback: Callable[[CrawlProgress], None] | None = None,
        save_to_notion: bool = True,
        url_filter: Callable[[str], bool] | None = None,
    ) -> CrawlResult:
        """
        Crawl all URLs in a sitemap sequentially.
        Args:
            sitemap_url: URL of the sitemap
            delay: Seconds to wait between requests (default: 2.0s)
            max_pages: Maximum number of pages to process (default: 500)
            progress_callback: Function called with progress updates
            save_to_notion: Whether to save results to Notion
            url_filter: Optional function to filter URLs (return True to include)
        Returns:
            CrawlResult with all analyzed pages
        Raises:
            Exception: re-raised after the progress file is marked "failed"
        """
        # Parse site info
        parsed_sitemap = urlparse(sitemap_url)
        site = f"{parsed_sitemap.scheme}://{parsed_sitemap.netloc}"
        site_domain = parsed_sitemap.netloc
        # Generate audit ID (domain + timestamp keeps runs distinguishable)
        audit_id = f"{site_domain}-pages-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
        logger.info(f"Starting sitemap crawl: {sitemap_url}")
        logger.info(f"Audit ID: {audit_id}")
        logger.info(f"Delay between requests: {delay}s")
        # Initialize progress tracking
        progress = CrawlProgress(
            audit_id=audit_id,
            site=site,
            status="running",
        )
        # Fetch URLs
        urls = self.fetch_sitemap_urls(sitemap_url)
        # Apply URL filter if provided
        if url_filter:
            urls = [url for url in urls if url_filter(url)]
            logger.info(f"After filtering: {len(urls)} URLs")
        # Apply max pages limit (default: 500 to prevent excessive resource usage)
        if len(urls) > max_pages:
            logger.warning(f"Sitemap has {len(urls)} URLs, limiting to {max_pages} pages")
            logger.warning(f"Use max_pages parameter to adjust this limit")
            urls = urls[:max_pages]
        logger.info(f"Processing {len(urls)} pages (max: {max_pages})")
        # Update progress with total URLs
        progress.total_urls = len(urls)
        progress.save_to_file()
        # Initialize result
        result = CrawlResult(
            site=site,
            sitemap_url=sitemap_url,
            audit_id=audit_id,
            total_pages=len(urls),
            successful_pages=0,
            failed_pages=0,
            start_time=datetime.now(),
            end_time=datetime.now(),
        )
        # Process each URL
        try:
            for i, url in enumerate(urls):
                progress.current_url = url
                progress.processed_urls = i
                progress.save_to_file()  # Save progress to file
                if progress_callback:
                    progress_callback(progress)
                logger.info(f"[{i+1}/{len(urls)}] Analyzing: {url}")
                try:
                    # Analyze page
                    metadata = self.analyzer.analyze_url(url)
                    result.pages_analyzed.append(metadata)
                    if metadata.status_code == 200:
                        progress.successful_urls += 1
                        result.successful_pages += 1
                        # Save to Notion
                        if save_to_notion and self.notion:
                            page_id = self._save_page_to_notion(metadata, audit_id, site)
                            if page_id:
                                result.notion_page_ids.append(page_id)
                    else:
                        progress.failed_urls += 1
                        result.failed_pages += 1
                except Exception as e:
                    # A single bad page must not abort the whole crawl.
                    logger.error(f"Failed to analyze {url}: {e}")
                    progress.failed_urls += 1
                    result.failed_pages += 1
                # Wait before next request
                if i < len(urls) - 1:  # Don't wait after last URL
                    time.sleep(delay)
            # Final progress update
            progress.processed_urls = len(urls)
            progress.status = "completed"
            if progress_callback:
                progress_callback(progress)
        except Exception as e:
            # Fatal error: persist the failure so status commands can report it.
            progress.status = "failed"
            progress.error_message = str(e)
            progress.save_to_file()
            raise
        # Update result
        result.end_time = datetime.now()
        # Create summary page
        if save_to_notion and self.notion:
            summary_id = self._create_crawl_summary_page(result)
            result.summary_page_id = summary_id
            progress.summary_page_id = summary_id
        # Save final progress
        progress.save_to_file()
        logger.info(f"Crawl complete: {result.successful_pages}/{result.total_pages} pages analyzed")
        logger.info(f"Duration: {result.get_duration()}")
        return result
    def _save_page_to_notion(
        self,
        metadata: PageMetadata,
        audit_id: str,
        site: str,
    ) -> str | None:
        """Save page metadata to Notion database.

        Returns the created Notion page id, or None if the API call failed
        (failures are logged, never raised, so the crawl keeps going).
        """
        try:
            # Build properties for the database row
            properties = {
                "Issue": {"title": [{"text": {"content": f"📄 {metadata.url}"}}]},
                "Category": {"select": {"name": "On-page SEO"}},
                "Priority": {"select": {"name": self._determine_priority(metadata)}},
                "Site": {"url": site},
                "URL": {"url": metadata.url},
                "Audit ID": {"rich_text": [{"text": {"content": audit_id}}]},
                "Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
            }
            # Build page content
            children = self._build_page_content(metadata)
            response = self.notion.pages.create(
                parent={"database_id": self.database_id},
                properties=properties,
                children=children,
            )
            return response["id"]
        except Exception as e:
            logger.error(f"Failed to save to Notion: {e}")
            return None
    def _determine_priority(self, metadata: PageMetadata) -> str:
        """Map issue/warning counts to a Notion Priority value.

        3+ issues -> High; any issue or 3+ warnings -> Medium; else Low.
        """
        if len(metadata.issues) >= 3:
            return "High"
        elif len(metadata.issues) >= 1:
            return "Medium"
        elif len(metadata.warnings) >= 3:
            return "Medium"
        else:
            return "Low"
    def _build_page_content(self, metadata: PageMetadata) -> list[dict]:
        """Build Notion page content blocks from metadata.

        Emits a fixed sequence of sections: status callout, meta-tag table,
        headings, structured data, Open Graph, links, images, hreflang
        (when present), and an issues/warnings checklist.
        """
        children = []
        # Status summary callout; color escalates with the issue count.
        status_emoji = "" if not metadata.issues else "⚠️" if len(metadata.issues) < 3 else ""
        children.append({
            "object": "block",
            "type": "callout",
            "callout": {
                "rich_text": [
                    {"type": "text", "text": {"content": f"Status: {metadata.status_code} | "}},
                    {"type": "text", "text": {"content": f"Response: {metadata.response_time_ms:.0f}ms | "}},
                    {"type": "text", "text": {"content": f"Issues: {len(metadata.issues)} | "}},
                    {"type": "text", "text": {"content": f"Warnings: {len(metadata.warnings)}"}},
                ],
                "icon": {"type": "emoji", "emoji": status_emoji},
                "color": "gray_background" if not metadata.issues else "yellow_background" if len(metadata.issues) < 3 else "red_background",
            }
        })
        # Meta Tags Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Meta Tags"}}]}
        })
        # Meta tags table (values truncated to 50 chars to keep cells compact)
        meta_rows = [
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Tag"}, "annotations": {"bold": True}}],
                [{"type": "text", "text": {"content": "Value"}, "annotations": {"bold": True}}],
                [{"type": "text", "text": {"content": "Status"}, "annotations": {"bold": True}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Title"}}],
                [{"type": "text", "text": {"content": (metadata.title or "")[:50]}}],
                [{"type": "text", "text": {"content": f"{metadata.title_length} chars" if metadata.title else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Description"}}],
                [{"type": "text", "text": {"content": (metadata.meta_description or "")[:50]}}],
                [{"type": "text", "text": {"content": f"{metadata.meta_description_length} chars" if metadata.meta_description else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Canonical"}}],
                [{"type": "text", "text": {"content": (metadata.canonical_url or "")[:50]}}],
                [{"type": "text", "text": {"content": "" if metadata.canonical_url else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Robots"}}],
                [{"type": "text", "text": {"content": metadata.robots_meta or ""}}],
                [{"type": "text", "text": {"content": "" if metadata.robots_meta else ""}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Lang"}}],
                [{"type": "text", "text": {"content": metadata.html_lang or ""}}],
                [{"type": "text", "text": {"content": "" if metadata.html_lang else ""}}],
            ]}},
        ]
        children.append({
            "object": "block",
            "type": "table",
            "table": {
                "table_width": 3,
                "has_column_header": True,
                "has_row_header": False,
                "children": meta_rows
            }
        })
        # Headings Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Headings"}}]}
        })
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"H1: {metadata.h1_count} | "}},
                {"type": "text", "text": {"content": f"Total headings: {len(metadata.headings)}"}},
            ]}
        })
        # Quote the first H1 text (truncated to 200 chars) when one exists.
        if metadata.h1_text:
            children.append({
                "object": "block",
                "type": "quote",
                "quote": {"rich_text": [{"type": "text", "text": {"content": metadata.h1_text[:200]}}]}
            })
        # Schema Data Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Structured Data"}}]}
        })
        if metadata.schema_types_found:
            children.append({
                "object": "block",
                "type": "paragraph",
                "paragraph": {"rich_text": [
                    {"type": "text", "text": {"content": "Schema types found: "}},
                    {"type": "text", "text": {"content": ", ".join(metadata.schema_types_found)}, "annotations": {"code": True}},
                ]}
            })
        else:
            # No structured data is worth surfacing prominently as a warning.
            children.append({
                "object": "block",
                "type": "callout",
                "callout": {
                    "rich_text": [{"type": "text", "text": {"content": "No structured data found on this page"}}],
                    "icon": {"type": "emoji", "emoji": "⚠️"},
                    "color": "yellow_background",
                }
            })
        # Open Graph Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Open Graph"}}]}
        })
        og = metadata.open_graph
        # og:title presence is used as the proxy for "OG configured at all".
        og_status = "✓ Configured" if og.og_title else "✗ Missing"
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Status: {og_status}\n"}},
                {"type": "text", "text": {"content": f"og:title: {og.og_title or ''}\n"}},
                {"type": "text", "text": {"content": f"og:type: {og.og_type or ''}"}},
            ]}
        })
        # Links Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Links"}}]}
        })
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Internal links: {metadata.internal_link_count}\n"}},
                {"type": "text", "text": {"content": f"External links: {metadata.external_link_count}"}},
            ]}
        })
        # Images Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Images"}}]}
        })
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Total: {metadata.images_total} | "}},
                {"type": "text", "text": {"content": f"With alt: {metadata.images_with_alt} | "}},
                {"type": "text", "text": {"content": f"Without alt: {metadata.images_without_alt}"}},
            ]}
        })
        # Hreflang Section (if present); capped at 10 rows to keep the page short.
        if metadata.hreflang_tags:
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Hreflang Tags"}}]}
            })
            for tag in metadata.hreflang_tags[:10]:
                children.append({
                    "object": "block",
                    "type": "bulleted_list_item",
                    "bulleted_list_item": {"rich_text": [
                        {"type": "text", "text": {"content": f"{tag['lang']}: "}},
                        {"type": "text", "text": {"content": tag['url'], "link": {"url": tag['url']}}},
                    ]}
                })
        # Issues & Warnings Section: each item becomes an unchecked to-do.
        if metadata.issues or metadata.warnings:
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Issues & Warnings"}}]}
            })
            for issue in metadata.issues:
                children.append({
                    "object": "block",
                    "type": "to_do",
                    "to_do": {
                        "rich_text": [
                            {"type": "text", "text": {"content": ""}, "annotations": {"bold": True}},
                            {"type": "text", "text": {"content": issue}},
                        ],
                        "checked": False,
                    }
                })
            for warning in metadata.warnings:
                children.append({
                    "object": "block",
                    "type": "to_do",
                    "to_do": {
                        "rich_text": [
                            {"type": "text", "text": {"content": "⚠️ "}, "annotations": {"bold": True}},
                            {"type": "text", "text": {"content": warning}},
                        ],
                        "checked": False,
                    }
                })
        return children
    def _create_crawl_summary_page(self, result: CrawlResult) -> str | None:
        """Create a summary page for the crawl.

        Builds a header callout, an aggregate statistics table, and a pointer
        to the per-page rows (filterable by Audit ID). Returns the created
        page id, or None on failure (logged, not raised).
        """
        try:
            site_domain = urlparse(result.site).netloc
            # Calculate aggregate statistics over all analyzed pages
            total_issues = sum(len(p.issues) for p in result.pages_analyzed)
            total_warnings = sum(len(p.warnings) for p in result.pages_analyzed)
            pages_with_issues = sum(1 for p in result.pages_analyzed if p.issues)
            pages_without_schema = sum(1 for p in result.pages_analyzed if not p.schema_types_found)
            pages_without_description = sum(1 for p in result.pages_analyzed if not p.meta_description)
            children = []
            # Header callout
            children.append({
                "object": "block",
                "type": "callout",
                "callout": {
                    "rich_text": [
                        {"type": "text", "text": {"content": f"Sitemap Crawl Complete\n\n"}},
                        {"type": "text", "text": {"content": f"Audit ID: {result.audit_id}\n"}},
                        {"type": "text", "text": {"content": f"Duration: {result.get_duration()}\n"}},
                        {"type": "text", "text": {"content": f"Pages: {result.successful_pages}/{result.total_pages}"}},
                    ],
                    "icon": {"type": "emoji", "emoji": "📊"},
                    "color": "blue_background",
                }
            })
            # Statistics table
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Statistics"}}]}
            })
            stats_rows = [
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Metric"}, "annotations": {"bold": True}}],
                    [{"type": "text", "text": {"content": "Count"}, "annotations": {"bold": True}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Pages"}}],
                    [{"type": "text", "text": {"content": str(result.total_pages)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Successfully Analyzed"}}],
                    [{"type": "text", "text": {"content": str(result.successful_pages)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages with Issues"}}],
                    [{"type": "text", "text": {"content": str(pages_with_issues)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Issues"}}],
                    [{"type": "text", "text": {"content": str(total_issues)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Warnings"}}],
                    [{"type": "text", "text": {"content": str(total_warnings)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages without Schema"}}],
                    [{"type": "text", "text": {"content": str(pages_without_schema)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages without Description"}}],
                    [{"type": "text", "text": {"content": str(pages_without_description)}}],
                ]}},
            ]
            children.append({
                "object": "block",
                "type": "table",
                "table": {
                    "table_width": 2,
                    "has_column_header": True,
                    "has_row_header": False,
                    "children": stats_rows
                }
            })
            # Pages list
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Analyzed Pages"}}]}
            })
            children.append({
                "object": "block",
                "type": "paragraph",
                "paragraph": {"rich_text": [
                    {"type": "text", "text": {"content": f"Filter by Audit ID in the database to see all {result.successful_pages} page entries."}}
                ]}
            })
            # Create the summary page
            response = self.notion.pages.create(
                parent={"database_id": self.database_id},
                properties={
                    "Issue": {"title": [{"text": {"content": f"📊 Sitemap Crawl: {site_domain}"}}]},
                    "Category": {"select": {"name": "Technical SEO"}},
                    "Priority": {"select": {"name": "High"}},
                    "Site": {"url": result.site},
                    "Audit ID": {"rich_text": [{"text": {"content": result.audit_id}}]},
                    "Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
                },
                children=children,
            )
            logger.info(f"Created crawl summary page: {response['id']}")
            return response["id"]
        except Exception as e:
            logger.error(f"Failed to create summary page: {e}")
            return None
def print_progress_status(progress: CrawlProgress) -> None:
    """Print a formatted progress banner for one crawl job to stdout.

    Shows counters, elapsed time, ETA (only meaningful while running), the
    current URL (truncated to 60 chars), and a Notion summary link / error
    message when available.
    """
    status_emoji = {
        "running": "🔄",
        "completed": "",
        "failed": "",
    }.get(progress.status, "")
    print(f"""
{'=' * 60}
{status_emoji} SEO Page Analysis - {progress.status.upper()}
{'=' * 60}
Audit ID: {progress.audit_id}
Site: {progress.site}
Status: {progress.status}
Progress: {progress.processed_urls}/{progress.total_urls} pages ({progress.get_progress_percent():.1f}%)
Successful: {progress.successful_urls}
Failed: {progress.failed_urls}
Elapsed: {progress.get_elapsed_time()}
ETA: {progress.get_eta() if progress.status == 'running' else 'N/A'}
Current URL: {progress.current_url[:60] + '...' if len(progress.current_url) > 60 else progress.current_url}
""")
    # Notion page URLs drop the dashes from the page id.
    if progress.summary_page_id:
        print(f"Summary: https://www.notion.so/{progress.summary_page_id.replace('-', '')}")
    if progress.error_message:
        print(f"Error: {progress.error_message}")
    print("=" * 60)
def main() -> None:
    """CLI entry point.

    Subcommands:
        crawl <sitemap_url>  -- start a crawl (also the implicit default when
                                the first argv looks like a URL/.xml path)
        status [audit_id]    -- show one crawl, or active crawls (--all: all)
        list                 -- tabulate the most recent 20 crawl jobs
    """
    import argparse
    parser = argparse.ArgumentParser(description="Sitemap Crawler with Background Support")
    subparsers = parser.add_subparsers(dest="command", help="Commands")
    # Crawl command
    crawl_parser = subparsers.add_parser("crawl", help="Start crawling a sitemap")
    crawl_parser.add_argument("sitemap_url", help="URL of the sitemap to crawl")
    crawl_parser.add_argument("--delay", "-d", type=float, default=DEFAULT_DELAY_SECONDS,
                              help=f"Delay between requests in seconds (default: {DEFAULT_DELAY_SECONDS})")
    crawl_parser.add_argument("--max-pages", "-m", type=int, default=DEFAULT_MAX_PAGES,
                              help=f"Maximum pages to process (default: {DEFAULT_MAX_PAGES})")
    crawl_parser.add_argument("--no-notion", action="store_true",
                              help="Don't save to Notion")
    crawl_parser.add_argument("--no-limit", action="store_true",
                              help="Remove page limit (use with caution)")
    # Status command
    status_parser = subparsers.add_parser("status", help="Check crawl progress")
    status_parser.add_argument("audit_id", nargs="?", help="Specific audit ID to check (optional)")
    status_parser.add_argument("--all", "-a", action="store_true", help="Show all crawls (not just active)")
    # List command
    list_parser = subparsers.add_parser("list", help="List all crawl jobs")
    args = parser.parse_args()
    # Default to crawl if no command specified but URL provided
    if args.command is None:
        # Check if first positional arg looks like a URL; the crawl-command
        # attributes are filled in manually since the subparser never ran.
        import sys
        if len(sys.argv) > 1 and (sys.argv[1].startswith("http") or sys.argv[1].endswith(".xml")):
            args.command = "crawl"
            args.sitemap_url = sys.argv[1]
            args.delay = DEFAULT_DELAY_SECONDS
            args.max_pages = DEFAULT_MAX_PAGES
            args.no_notion = False
            args.no_limit = False
        else:
            parser.print_help()
            return
    if args.command == "status":
        if args.audit_id:
            # Show specific crawl status
            progress = get_crawl_status(args.audit_id)
            if progress:
                print_progress_status(progress)
            else:
                print(f"No crawl found with audit ID: {args.audit_id}")
        else:
            # Show active crawls (or every crawl with --all)
            if args.all:
                crawls = get_all_crawls()
                label = "All"
            else:
                crawls = get_active_crawls()
                label = "Active"
            if crawls:
                print(f"\n{label} Crawl Jobs ({len(crawls)}):")
                print("-" * 60)
                for p in crawls:
                    status_emoji = {"running": "🔄", "completed": "", "failed": ""}.get(p.status, "")
                    print(f"{status_emoji} {p.audit_id}")
                    print(f"  Site: {p.site}")
                    print(f"  Progress: {p.processed_urls}/{p.total_urls} ({p.get_progress_percent():.1f}%)")
                    print()
            else:
                print(f"No {label.lower()} crawl jobs found.")
        return
    if args.command == "list":
        crawls = get_all_crawls()
        if crawls:
            print(f"\nAll Crawl Jobs ({len(crawls)}):")
            print("-" * 80)
            print(f"{'Status':<10} {'Audit ID':<45} {'Progress':<15}")
            print("-" * 80)
            for p in crawls[:20]:  # Show last 20
                status_emoji = {"running": "🔄", "completed": "", "failed": ""}.get(p.status, "")
                progress_str = f"{p.processed_urls}/{p.total_urls}"
                print(f"{status_emoji} {p.status:<7} {p.audit_id:<45} {progress_str:<15}")
            if len(crawls) > 20:
                print(f"... and {len(crawls) - 20} more")
        else:
            print("No crawl jobs found.")
        return
    if args.command == "crawl":
        # Handle --no-limit option
        max_pages = args.max_pages
        if args.no_limit:
            max_pages = 999999  # Effectively unlimited
            print("⚠️  WARNING: Page limit disabled. This may take a very long time!")
        def progress_callback(progress: CrawlProgress) -> None:
            # Single-line, carriage-return progress display.
            pct = progress.get_progress_percent()
            print(f"\r[{pct:5.1f}%] {progress.processed_urls}/{progress.total_urls} pages | "
                  f"Success: {progress.successful_urls} | Failed: {progress.failed_urls} | "
                  f"ETA: {progress.get_eta()}", end="", flush=True)
        crawler = SitemapCrawler()
        result = crawler.crawl_sitemap(
            args.sitemap_url,
            delay=args.delay,
            max_pages=max_pages,
            progress_callback=progress_callback,
            save_to_notion=not args.no_notion,
        )
        print()  # New line after progress
        print()
        print("=" * 60)
        print("CRAWL COMPLETE")
        print("=" * 60)
        print(f"Audit ID: {result.audit_id}")
        print(f"Total Pages: {result.total_pages}")
        print(f"Successful: {result.successful_pages}")
        print(f"Failed: {result.failed_pages}")
        print(f"Duration: {result.get_duration()}")
        if result.summary_page_id:
            print(f"Summary Page: https://www.notion.so/{result.summary_page_id.replace('-', '')}")
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,467 @@
"""
Sitemap Validator - Validate XML sitemaps
==========================================
Purpose: Parse and validate XML sitemaps for SEO compliance
Python: 3.10+
Usage:
python sitemap_validator.py --url https://example.com/sitemap.xml
"""
import argparse
import asyncio
import gzip
import json
import logging
import re
from dataclasses import dataclass, field
from datetime import datetime
from io import BytesIO
from typing import Any
from urllib.parse import urljoin, urlparse
import aiohttp
import requests
from lxml import etree
# Module-level logging: INFO by default, timestamped "time - level - message" lines.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
@dataclass
class SitemapIssue:
    """Represents a sitemap validation issue."""
    # Severity level; "error" makes the whole sitemap invalid.
    severity: str  # "error", "warning", "info"
    # Human-readable description of the problem.
    message: str
    # URL the issue applies to, when it concerns a specific entry.
    url: str | None = None
    # Optional remediation hint shown in reports.
    suggestion: str | None = None
@dataclass
class SitemapEntry:
    """Represents a single URL entry in sitemap."""
    # Required <loc> value.
    loc: str
    # Optional sitemap fields, kept as parsed strings / float.
    lastmod: str | None = None
    changefreq: str | None = None
    priority: float | None = None
    # HTTP status from the optional URL check; 0 means the request failed.
    status_code: int | None = None
@dataclass
class SitemapResult:
    """Complete sitemap validation result.

    Collects parsed entries, child sitemaps (for index files), issues found,
    aggregate stats, and an overall validity flag.
    """
    url: str
    sitemap_type: str  # "urlset" or "sitemapindex"
    entries: list[SitemapEntry] = field(default_factory=list)
    child_sitemaps: list[str] = field(default_factory=list)
    issues: list[SitemapIssue] = field(default_factory=list)
    valid: bool = True
    stats: dict = field(default_factory=dict)
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
    def to_dict(self) -> dict:
        """Serialize the result to a JSON-friendly dictionary.

        Issues are flattened to plain dicts; entries are reported only as a
        count (the full entry list is intentionally omitted).
        """
        flattened_issues = [
            {
                "severity": issue.severity,
                "message": issue.message,
                "url": issue.url,
                "suggestion": issue.suggestion,
            }
            for issue in self.issues
        ]
        return {
            "url": self.url,
            "sitemap_type": self.sitemap_type,
            "valid": self.valid,
            "stats": self.stats,
            "issues": flattened_issues,
            "entries_count": len(self.entries),
            "child_sitemaps": self.child_sitemaps,
            "timestamp": self.timestamp,
        }
class SitemapValidator:
    """Validate XML sitemaps.

    Checks protocol limits (URL count, file size), per-entry fields
    (lastmod, changefreq, priority), duplicates, and optionally the HTTP
    status of listed URLs. Produces a SitemapResult plus a text report.
    """
    # Sitemap protocol constants (sitemaps.org).
    SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
    MAX_URLS = 50000
    MAX_SIZE_BYTES = 50 * 1024 * 1024  # 50MB
    VALID_CHANGEFREQ = {
        "always", "hourly", "daily", "weekly",
        "monthly", "yearly", "never"
    }
    def __init__(self, check_urls: bool = False, max_concurrent: int = 10) -> None:
        """
        Args:
            check_urls: If True, HEAD-check each listed URL (slower).
            max_concurrent: Concurrency cap for the URL status checks.
        """
        self.check_urls = check_urls
        self.max_concurrent = max_concurrent
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (compatible; SEOAuditBot/1.0)"
        })
    def fetch_sitemap(self, url: str) -> tuple[bytes, bool]:
        """Fetch sitemap content, handling gzip compression.

        Returns (content, is_gzipped). Raises RuntimeError on any request
        failure.
        """
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            content = response.content
            is_gzipped = False
            # Check if gzipped. NOTE(review): requests already transparently
            # decodes Content-Encoding: gzip, so decompress may fail here --
            # the BadGzipFile fall-through keeps the raw content in that case.
            if url.endswith(".gz") or response.headers.get(
                "Content-Encoding"
            ) == "gzip":
                try:
                    content = gzip.decompress(content)
                    is_gzipped = True
                except gzip.BadGzipFile:
                    pass
            return content, is_gzipped
        except requests.RequestException as e:
            raise RuntimeError(f"Failed to fetch sitemap: {e}")
    def parse_sitemap(self, content: bytes) -> tuple[str, list[dict]]:
        """Parse sitemap XML content.

        Returns (sitemap_type, entries) where sitemap_type is "sitemapindex"
        or "urlset" and each entry dict has at least a "loc" key. Raises
        ValueError for malformed XML or an unrecognized root element.
        """
        try:
            root = etree.fromstring(content)
        except etree.XMLSyntaxError as e:
            raise ValueError(f"Invalid XML: {e}")
        # Namespace map used for the element lookups below
        nsmap = {"sm": self.SITEMAP_NS}
        # Check if it's a sitemap index or urlset
        if root.tag == f"{{{self.SITEMAP_NS}}}sitemapindex":
            sitemap_type = "sitemapindex"
            entries = []
            for sitemap in root.findall("sm:sitemap", nsmap):
                entry = {}
                loc = sitemap.find("sm:loc", nsmap)
                if loc is not None and loc.text:
                    entry["loc"] = loc.text.strip()
                lastmod = sitemap.find("sm:lastmod", nsmap)
                if lastmod is not None and lastmod.text:
                    entry["lastmod"] = lastmod.text.strip()
                # Entries without a <loc> are silently dropped
                if entry.get("loc"):
                    entries.append(entry)
        elif root.tag == f"{{{self.SITEMAP_NS}}}urlset":
            sitemap_type = "urlset"
            entries = []
            for url in root.findall("sm:url", nsmap):
                entry = {}
                loc = url.find("sm:loc", nsmap)
                if loc is not None and loc.text:
                    entry["loc"] = loc.text.strip()
                lastmod = url.find("sm:lastmod", nsmap)
                if lastmod is not None and lastmod.text:
                    entry["lastmod"] = lastmod.text.strip()
                changefreq = url.find("sm:changefreq", nsmap)
                if changefreq is not None and changefreq.text:
                    entry["changefreq"] = changefreq.text.strip().lower()
                priority = url.find("sm:priority", nsmap)
                if priority is not None and priority.text:
                    try:
                        entry["priority"] = float(priority.text.strip())
                    except ValueError:
                        # Non-numeric priority is kept as None (flagged later)
                        entry["priority"] = None
                if entry.get("loc"):
                    entries.append(entry)
        else:
            raise ValueError(f"Unknown sitemap type: {root.tag}")
        return sitemap_type, entries
    def validate(self, url: str) -> SitemapResult:
        """Validate a sitemap URL.

        Fetch + parse the sitemap, apply protocol and per-entry checks, and
        optionally HEAD-check listed URLs. Never raises for fetch/parse
        problems; those become "error" issues and valid=False.
        """
        result = SitemapResult(url=url, sitemap_type="unknown")
        # Fetch sitemap
        try:
            content, is_gzipped = self.fetch_sitemap(url)
        except RuntimeError as e:
            result.issues.append(SitemapIssue(
                severity="error",
                message=str(e),
                url=url,
            ))
            result.valid = False
            return result
        # Check size against the 50MB protocol limit
        if len(content) > self.MAX_SIZE_BYTES:
            result.issues.append(SitemapIssue(
                severity="error",
                message=f"Sitemap exceeds 50MB limit ({len(content) / 1024 / 1024:.2f}MB)",
                url=url,
                suggestion="Split sitemap into smaller files using sitemap index",
            ))
        # Parse XML
        try:
            sitemap_type, entries = self.parse_sitemap(content)
        except ValueError as e:
            result.issues.append(SitemapIssue(
                severity="error",
                message=str(e),
                url=url,
            ))
            result.valid = False
            return result
        result.sitemap_type = sitemap_type
        # Process entries
        if sitemap_type == "sitemapindex":
            # Index files: just record child sitemaps (not validated here)
            result.child_sitemaps = [e["loc"] for e in entries]
            result.stats = {
                "child_sitemaps_count": len(entries),
            }
        else:
            # Validate URL entries
            url_count = len(entries)
            result.stats["url_count"] = url_count
            if url_count > self.MAX_URLS:
                result.issues.append(SitemapIssue(
                    severity="error",
                    message=f"Sitemap exceeds 50,000 URL limit ({url_count} URLs)",
                    url=url,
                    suggestion="Split into multiple sitemaps with sitemap index",
                ))
            if url_count == 0:
                result.issues.append(SitemapIssue(
                    severity="warning",
                    message="Sitemap is empty (no URLs)",
                    url=url,
                ))
            # Validate individual entries; field problems are tallied and
            # reported once in aggregate rather than per URL.
            seen_urls = set()
            invalid_lastmod = 0
            invalid_changefreq = 0
            invalid_priority = 0
            for entry in entries:
                loc = entry.get("loc", "")
                # Check for duplicates (one warning per repeat occurrence)
                if loc in seen_urls:
                    result.issues.append(SitemapIssue(
                        severity="warning",
                        message="Duplicate URL in sitemap",
                        url=loc,
                    ))
                seen_urls.add(loc)
                # Validate lastmod format
                lastmod = entry.get("lastmod")
                if lastmod:
                    if not self._validate_date(lastmod):
                        invalid_lastmod += 1
                # Validate changefreq
                changefreq = entry.get("changefreq")
                if changefreq and changefreq not in self.VALID_CHANGEFREQ:
                    invalid_changefreq += 1
                # Validate priority
                priority = entry.get("priority")
                if priority is not None:
                    if not (0.0 <= priority <= 1.0):
                        invalid_priority += 1
                # Create entry object
                result.entries.append(SitemapEntry(
                    loc=loc,
                    lastmod=lastmod,
                    changefreq=changefreq,
                    priority=priority,
                ))
            # Add summary issues
            if invalid_lastmod > 0:
                result.issues.append(SitemapIssue(
                    severity="warning",
                    message=f"{invalid_lastmod} URLs with invalid lastmod format",
                    suggestion="Use ISO 8601 format (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS+TZ)",
                ))
            if invalid_changefreq > 0:
                result.issues.append(SitemapIssue(
                    severity="info",
                    message=f"{invalid_changefreq} URLs with invalid changefreq",
                    suggestion="Use: always, hourly, daily, weekly, monthly, yearly, never",
                ))
            if invalid_priority > 0:
                result.issues.append(SitemapIssue(
                    severity="warning",
                    message=f"{invalid_priority} URLs with invalid priority (must be 0.0-1.0)",
                ))
            result.stats.update({
                "invalid_lastmod": invalid_lastmod,
                "invalid_changefreq": invalid_changefreq,
                "invalid_priority": invalid_priority,
                "has_lastmod": sum(1 for e in result.entries if e.lastmod),
                "has_changefreq": sum(1 for e in result.entries if e.changefreq),
                "has_priority": sum(1 for e in result.entries if e.priority is not None),
            })
        # Check URLs if requested (blocking; runs its own event loop)
        if self.check_urls and result.entries:
            asyncio.run(self._check_url_status(result))
        # Determine validity: any "error" issue invalidates the sitemap
        result.valid = not any(i.severity == "error" for i in result.issues)
        return result
    def _validate_date(self, date_str: str) -> bool:
        """Validate ISO 8601 date format (date-only or date-time prefix)."""
        patterns = [
            r"^\d{4}-\d{2}-\d{2}$",
            r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}",
        ]
        return any(re.match(p, date_str) for p in patterns)
    async def _check_url_status(self, result: SitemapResult) -> None:
        """Check HTTP status of URLs in sitemap.

        HEAD-requests at most the first 100 entries, bounded by the
        max_concurrent semaphore; a failed request records status_code=0.
        Populates result.stats["url_status_codes"] and adds a warning when
        any checked URL returns 4xx/5xx.
        """
        semaphore = asyncio.Semaphore(self.max_concurrent)
        async def check_url(entry: SitemapEntry) -> None:
            async with semaphore:
                try:
                    # NOTE(review): a new ClientSession per URL is wasteful;
                    # sharing one session across checks would be cheaper.
                    async with aiohttp.ClientSession() as session:
                        async with session.head(
                            entry.loc,
                            timeout=aiohttp.ClientTimeout(total=10),
                            allow_redirects=True,
                        ) as response:
                            entry.status_code = response.status
                except Exception:
                    entry.status_code = 0
        await asyncio.gather(*[check_url(e) for e in result.entries[:100]])
        # Count status codes
        status_counts = {}
        for entry in result.entries:
            if entry.status_code:
                status_counts[entry.status_code] = (
                    status_counts.get(entry.status_code, 0) + 1
                )
        result.stats["url_status_codes"] = status_counts
        # Add issues for non-200 URLs
        error_count = sum(
            1 for e in result.entries
            if e.status_code and e.status_code >= 400
        )
        if error_count > 0:
            result.issues.append(SitemapIssue(
                severity="warning",
                message=f"{error_count} URLs returning error status codes (4xx/5xx)",
                suggestion="Remove or fix broken URLs in sitemap",
            ))
    def generate_report(self, result: SitemapResult) -> str:
        """Generate human-readable validation report.

        Sections: header, stats, child sitemaps (first 10), then issues
        grouped by severity (errors, warnings, info).
        """
        lines = [
            "=" * 60,
            "Sitemap Validation Report",
            "=" * 60,
            f"URL: {result.url}",
            f"Type: {result.sitemap_type}",
            f"Valid: {'Yes' if result.valid else 'No'}",
            f"Timestamp: {result.timestamp}",
            "",
        ]
        lines.append("Statistics:")
        for key, value in result.stats.items():
            lines.append(f"  {key}: {value}")
        lines.append("")
        if result.child_sitemaps:
            lines.append(f"Child Sitemaps ({len(result.child_sitemaps)}):")
            for sitemap in result.child_sitemaps[:10]:
                lines.append(f"  - {sitemap}")
            if len(result.child_sitemaps) > 10:
                lines.append(f"  ... and {len(result.child_sitemaps) - 10} more")
            lines.append("")
        if result.issues:
            lines.append("Issues Found:")
            errors = [i for i in result.issues if i.severity == "error"]
            warnings = [i for i in result.issues if i.severity == "warning"]
            infos = [i for i in result.issues if i.severity == "info"]
            if errors:
                lines.append(f"\n  ERRORS ({len(errors)}):")
                for issue in errors:
                    lines.append(f"  - {issue.message}")
                    if issue.url:
                        lines.append(f"    URL: {issue.url}")
                    if issue.suggestion:
                        lines.append(f"    Suggestion: {issue.suggestion}")
            if warnings:
                lines.append(f"\n  WARNINGS ({len(warnings)}):")
                for issue in warnings:
                    lines.append(f"  - {issue.message}")
                    if issue.suggestion:
                        lines.append(f"    Suggestion: {issue.suggestion}")
            if infos:
                lines.append(f"\n  INFO ({len(infos)}):")
                for issue in infos:
                    lines.append(f"  - {issue.message}")
            lines.append("")
        lines.append("=" * 60)
        return "\n".join(lines)
def main():
    """CLI entry point: validate a sitemap, then print or save the report.

    Exits via argparse on missing/invalid arguments. Output is either a
    human-readable text report (default) or JSON (--json / --output).
    """
    parser = argparse.ArgumentParser(description="Validate XML sitemaps")
    parser.add_argument("--url", "-u", required=True, help="Sitemap URL to validate")
    parser.add_argument(
        "--check-urls",
        action="store_true",
        help="Check HTTP status of URLs (slower)",
    )
    parser.add_argument("--output", "-o", help="Output file for JSON report")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()

    validator = SitemapValidator(check_urls=args.check_urls)
    result = validator.validate(args.url)

    # Default path: human-readable report to stdout.
    if not (args.json or args.output):
        print(validator.generate_report(result))
        return

    payload = json.dumps(result.to_dict(), ensure_ascii=False, indent=2)
    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(payload)
        logger.info(f"Report written to {args.output}")
    else:
        print(payload)
# Script entry point: run the CLI only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()