our-claude-skills/custom-skills/99_archive/seo-audit-agent/scripts/page_analyzer.py

"""
Page Analyzer - Extract SEO metadata from web pages
===================================================
Purpose: Comprehensive page-level SEO data extraction
Python: 3.10+
Usage:
    from page_analyzer import PageAnalyzer, PageMetadata
    analyzer = PageAnalyzer()
    metadata = analyzer.analyze_url("https://example.com/page")
"""

import json
import logging
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


@dataclass
class LinkData:
    """Represents a link found on a page."""
    url: str
    anchor_text: str
    is_internal: bool
    is_nofollow: bool = False
    link_type: str = "body"  # body, nav, footer, etc.


@dataclass
class HeadingData:
    """Represents a heading found on a page."""
    level: int  # 1-6
    text: str


@dataclass
class SchemaData:
    """Represents schema.org structured data."""
    schema_type: str
    properties: dict
    format: str = "json-ld"  # json-ld, microdata, rdfa


@dataclass
class OpenGraphData:
    """Represents Open Graph metadata."""
    og_title: str | None = None
    og_description: str | None = None
    og_image: str | None = None
    og_url: str | None = None
    og_type: str | None = None
    og_site_name: str | None = None
    og_locale: str | None = None
    twitter_card: str | None = None
    twitter_title: str | None = None
    twitter_description: str | None = None
    twitter_image: str | None = None


@dataclass
class PageMetadata:
    """Complete SEO metadata for a page."""

    # Basic info
    url: str
    status_code: int = 0
    content_type: str = ""
    response_time_ms: float = 0
    analyzed_at: datetime = field(default_factory=datetime.now)

    # Meta tags
    title: str | None = None
    title_length: int = 0
    meta_description: str | None = None
    meta_description_length: int = 0
    canonical_url: str | None = None
    robots_meta: str | None = None

    # Language
    html_lang: str | None = None
    hreflang_tags: list[dict] = field(default_factory=list)  # [{"lang": "en", "url": "..."}]

    # Headings
    headings: list[HeadingData] = field(default_factory=list)
    h1_count: int = 0
    h1_text: str | None = None

    # Open Graph & Social
    open_graph: OpenGraphData = field(default_factory=OpenGraphData)

    # Schema/Structured Data
    schema_data: list[SchemaData] = field(default_factory=list)
    schema_types_found: list[str] = field(default_factory=list)

    # Links
    internal_links: list[LinkData] = field(default_factory=list)
    external_links: list[LinkData] = field(default_factory=list)
    internal_link_count: int = 0
    external_link_count: int = 0

    # Images
    images_total: int = 0
    images_without_alt: int = 0
    images_with_alt: int = 0

    # Content metrics
    word_count: int = 0

    # Issues found
    issues: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization."""
        return {
            "url": self.url,
            "status_code": self.status_code,
            "content_type": self.content_type,
            "response_time_ms": self.response_time_ms,
            "analyzed_at": self.analyzed_at.isoformat(),
            "title": self.title,
            "title_length": self.title_length,
            "meta_description": self.meta_description,
            "meta_description_length": self.meta_description_length,
            "canonical_url": self.canonical_url,
            "robots_meta": self.robots_meta,
            "html_lang": self.html_lang,
            "hreflang_tags": self.hreflang_tags,
            "h1_count": self.h1_count,
            "h1_text": self.h1_text,
            "headings_count": len(self.headings),
            "schema_types_found": self.schema_types_found,
            "internal_link_count": self.internal_link_count,
            "external_link_count": self.external_link_count,
            "images_total": self.images_total,
            "images_without_alt": self.images_without_alt,
            "word_count": self.word_count,
            "issues": self.issues,
            "warnings": self.warnings,
            "open_graph": {
                "og_title": self.open_graph.og_title,
                "og_description": self.open_graph.og_description,
                "og_image": self.open_graph.og_image,
                "og_url": self.open_graph.og_url,
                "og_type": self.open_graph.og_type,
            },
        }

    def get_summary(self) -> str:
        """Get a brief summary of the page analysis."""
        lines = [
            f"URL: {self.url}",
            f"Status: {self.status_code}",
            f"Title: {self.title[:50] + '...' if self.title and len(self.title) > 50 else self.title}",
            f"Description: {'✓' if self.meta_description else '✗ Missing'}",
            f"Canonical: {'✓' if self.canonical_url else '✗ Missing'}",
            f"H1: {self.h1_count} found",
            f"Schema: {', '.join(self.schema_types_found) if self.schema_types_found else 'None'}",
            f"Links: {self.internal_link_count} internal, {self.external_link_count} external",
            f"Images: {self.images_total} total, {self.images_without_alt} without alt",
        ]
        if self.issues:
            lines.append(f"Issues: {len(self.issues)}")
        return "\n".join(lines)


class PageAnalyzer:
    """Analyze web pages for SEO metadata."""

    DEFAULT_USER_AGENT = "Mozilla/5.0 (compatible; OurDigitalSEOBot/1.0; +https://ourdigital.org)"

    def __init__(
        self,
        user_agent: str | None = None,
        timeout: int = 30,
    ):
        """
        Initialize page analyzer.

        Args:
            user_agent: Custom user agent string
            timeout: Request timeout in seconds
        """
        self.user_agent = user_agent or self.DEFAULT_USER_AGENT
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": self.user_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9,ko;q=0.8",
        })

    def analyze_url(self, url: str) -> PageMetadata:
        """
        Analyze a URL and extract SEO metadata.

        Args:
            url: URL to analyze

        Returns:
            PageMetadata object with all extracted data
        """
        metadata = PageMetadata(url=url)

        try:
            # Fetch page
            start_time = datetime.now()
            response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
            metadata.response_time_ms = (datetime.now() - start_time).total_seconds() * 1000
            metadata.status_code = response.status_code
            metadata.content_type = response.headers.get("Content-Type", "")

            if response.status_code != 200:
                metadata.issues.append(f"HTTP {response.status_code} status")
                if response.status_code >= 400:
                    return metadata

            # Parse HTML
            soup = BeautifulSoup(response.text, "html.parser")
            base_url = url

            # Extract all metadata
            self._extract_basic_meta(soup, metadata)
            self._extract_canonical(soup, metadata, base_url)
            self._extract_robots_meta(soup, metadata)
            self._extract_hreflang(soup, metadata)
            self._extract_headings(soup, metadata)
            self._extract_open_graph(soup, metadata)
            self._extract_schema(soup, metadata)
            self._extract_links(soup, metadata, base_url)
            self._extract_images(soup, metadata)
            self._extract_content_metrics(soup, metadata)

            # Run SEO checks
            self._run_seo_checks(metadata)

        except requests.RequestException as e:
            metadata.issues.append(f"Request failed: {str(e)}")
            logger.error(f"Failed to analyze {url}: {e}")
        except Exception as e:
            metadata.issues.append(f"Analysis error: {str(e)}")
            logger.error(f"Error analyzing {url}: {e}")

        return metadata

    def _extract_basic_meta(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract title and meta description."""
        # Title
        title_tag = soup.find("title")
        if title_tag and title_tag.string:
            metadata.title = title_tag.string.strip()
            metadata.title_length = len(metadata.title)

        # Meta description
        desc_tag = soup.find("meta", attrs={"name": re.compile(r"^description$", re.I)})
        if desc_tag and desc_tag.get("content"):
            metadata.meta_description = desc_tag["content"].strip()
            metadata.meta_description_length = len(metadata.meta_description)

        # HTML lang
        html_tag = soup.find("html")
        if html_tag and html_tag.get("lang"):
            metadata.html_lang = html_tag["lang"]

    def _extract_canonical(self, soup: BeautifulSoup, metadata: PageMetadata, base_url: str) -> None:
        """Extract canonical URL."""
        canonical = soup.find("link", rel="canonical")
        if canonical and canonical.get("href"):
            metadata.canonical_url = urljoin(base_url, canonical["href"])

    def _extract_robots_meta(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract robots meta tag."""
        robots = soup.find("meta", attrs={"name": re.compile(r"^robots$", re.I)})
        if robots and robots.get("content"):
            metadata.robots_meta = robots["content"]

        # Also check for googlebot-specific
        googlebot = soup.find("meta", attrs={"name": re.compile(r"^googlebot$", re.I)})
        if googlebot and googlebot.get("content"):
            if metadata.robots_meta:
                metadata.robots_meta += f" | googlebot: {googlebot['content']}"
            else:
                metadata.robots_meta = f"googlebot: {googlebot['content']}"

    def _extract_hreflang(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract hreflang tags."""
        hreflang_tags = soup.find_all("link", rel="alternate", hreflang=True)
        for tag in hreflang_tags:
            if tag.get("href") and tag.get("hreflang"):
                metadata.hreflang_tags.append({
                    "lang": tag["hreflang"],
                    "url": tag["href"]
                })

    def _extract_headings(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract all headings."""
        for level in range(1, 7):
            for heading in soup.find_all(f"h{level}"):
                text = heading.get_text(strip=True)
                if text:
                    metadata.headings.append(HeadingData(level=level, text=text))

        # Count H1s specifically
        h1_tags = soup.find_all("h1")
        metadata.h1_count = len(h1_tags)
        if h1_tags:
            metadata.h1_text = h1_tags[0].get_text(strip=True)

    def _extract_open_graph(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract Open Graph and Twitter Card data."""
        og = metadata.open_graph

        # Open Graph tags
        og_mappings = {
            "og:title": "og_title",
            "og:description": "og_description",
            "og:image": "og_image",
            "og:url": "og_url",
            "og:type": "og_type",
            "og:site_name": "og_site_name",
            "og:locale": "og_locale",
        }

        for og_prop, attr_name in og_mappings.items():
            tag = soup.find("meta", property=og_prop)
            if tag and tag.get("content"):
                setattr(og, attr_name, tag["content"])

        # Twitter Card tags
        twitter_mappings = {
            "twitter:card": "twitter_card",
            "twitter:title": "twitter_title",
            "twitter:description": "twitter_description",
            "twitter:image": "twitter_image",
        }

        for tw_name, attr_name in twitter_mappings.items():
            tag = soup.find("meta", attrs={"name": tw_name})
            if tag and tag.get("content"):
                setattr(og, attr_name, tag["content"])

    def _extract_schema(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract schema.org structured data."""
        # JSON-LD
        for script in soup.find_all("script", type="application/ld+json"):
            try:
                data = json.loads(script.string)
                if isinstance(data, list):
                    for item in data:
                        self._process_schema_item(item, metadata, "json-ld")
                else:
                    self._process_schema_item(data, metadata, "json-ld")
            except (json.JSONDecodeError, TypeError):
                continue

        # Microdata (basic detection)
        for item in soup.find_all(itemscope=True):
            itemtype = item.get("itemtype", "")
            if itemtype:
                schema_type = itemtype.split("/")[-1]
                if schema_type not in metadata.schema_types_found:
                    metadata.schema_types_found.append(schema_type)
                    metadata.schema_data.append(SchemaData(
                        schema_type=schema_type,
                        properties={},
                        format="microdata"
                    ))

    def _process_schema_item(self, data: dict, metadata: PageMetadata, format_type: str) -> None:
        """Process a single schema.org item."""
        if not isinstance(data, dict):
            return

        schema_type = data.get("@type", "Unknown")
        if isinstance(schema_type, list):
            schema_type = schema_type[0] if schema_type else "Unknown"

        if schema_type not in metadata.schema_types_found:
            metadata.schema_types_found.append(schema_type)

        metadata.schema_data.append(SchemaData(
            schema_type=schema_type,
            properties=data,
            format=format_type
        ))

        # Process nested @graph items
        if "@graph" in data:
            for item in data["@graph"]:
                self._process_schema_item(item, metadata, format_type)

    def _extract_links(self, soup: BeautifulSoup, metadata: PageMetadata, base_url: str) -> None:
        """Extract internal and external links."""
        parsed_base = urlparse(base_url)
        base_domain = parsed_base.netloc.lower()

        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]

            # Skip non-http links
            if href.startswith(("#", "javascript:", "mailto:", "tel:")):
                continue

            # Resolve relative URLs
            full_url = urljoin(base_url, href)
            parsed_url = urlparse(full_url)

            # Get anchor text
            anchor_text = a_tag.get_text(strip=True)[:100]  # Limit length

            # Check if nofollow
            rel = a_tag.get("rel", [])
            if isinstance(rel, str):
                rel = rel.split()
            is_nofollow = "nofollow" in rel

            # Determine if internal or external
            link_domain = parsed_url.netloc.lower()
            is_internal = (
                link_domain == base_domain or
                link_domain.endswith(f".{base_domain}") or
                base_domain.endswith(f".{link_domain}")
            )

            link_data = LinkData(
                url=full_url,
                anchor_text=anchor_text,
                is_internal=is_internal,
                is_nofollow=is_nofollow,
            )

            if is_internal:
                metadata.internal_links.append(link_data)
            else:
                metadata.external_links.append(link_data)

        metadata.internal_link_count = len(metadata.internal_links)
        metadata.external_link_count = len(metadata.external_links)

    def _extract_images(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract image information."""
        images = soup.find_all("img")
        metadata.images_total = len(images)

        for img in images:
            alt = img.get("alt", "").strip()
            if alt:
                metadata.images_with_alt += 1
            else:
                metadata.images_without_alt += 1

    def _extract_content_metrics(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract content metrics like word count."""
        # Remove script and style elements
        for element in soup(["script", "style", "noscript"]):
            element.decompose()

        # Get text content
        text = soup.get_text(separator=" ", strip=True)
        words = text.split()
        metadata.word_count = len(words)

    def _run_seo_checks(self, metadata: PageMetadata) -> None:
        """Run SEO checks and add issues/warnings."""
        # Title checks
        if not metadata.title:
            metadata.issues.append("Missing title tag")
        elif metadata.title_length < 30:
            metadata.warnings.append(f"Title too short ({metadata.title_length} chars, recommend 50-60)")
        elif metadata.title_length > 60:
            metadata.warnings.append(f"Title too long ({metadata.title_length} chars, recommend 50-60)")

        # Meta description checks
        if not metadata.meta_description:
            metadata.issues.append("Missing meta description")
        elif metadata.meta_description_length < 120:
            metadata.warnings.append(f"Meta description too short ({metadata.meta_description_length} chars)")
        elif metadata.meta_description_length > 160:
            metadata.warnings.append(f"Meta description too long ({metadata.meta_description_length} chars)")

        # Canonical check
        if not metadata.canonical_url:
            metadata.warnings.append("Missing canonical tag")
        elif metadata.canonical_url != metadata.url:
            metadata.warnings.append(f"Canonical points to different URL: {metadata.canonical_url}")

        # H1 checks
        if metadata.h1_count == 0:
            metadata.issues.append("Missing H1 tag")
        elif metadata.h1_count > 1:
            metadata.warnings.append(f"Multiple H1 tags ({metadata.h1_count})")

        # Image alt check
        if metadata.images_without_alt > 0:
            metadata.warnings.append(f"{metadata.images_without_alt} images missing alt text")

        # Schema check
        if not metadata.schema_types_found:
            metadata.warnings.append("No structured data found")

        # Open Graph check
        if not metadata.open_graph.og_title:
            metadata.warnings.append("Missing Open Graph tags")

        # Robots meta check
        if metadata.robots_meta:
            robots_lower = metadata.robots_meta.lower()
            if "noindex" in robots_lower:
                metadata.issues.append("Page is set to noindex")
            if "nofollow" in robots_lower:
                metadata.warnings.append("Page is set to nofollow")


def main():
    """CLI entry point for testing."""
    import argparse

    parser = argparse.ArgumentParser(description="Page SEO Analyzer")
    parser.add_argument("url", help="URL to analyze")
    parser.add_argument("--json", "-j", action="store_true", help="Output as JSON")

    args = parser.parse_args()

    analyzer = PageAnalyzer()
    metadata = analyzer.analyze_url(args.url)

    if args.json:
        print(json.dumps(metadata.to_dict(), indent=2, ensure_ascii=False))
    else:
        print("=" * 60)
        print("PAGE ANALYSIS REPORT")
        print("=" * 60)
        print(metadata.get_summary())
        print()

        if metadata.issues:
            print("ISSUES:")
            for issue in metadata.issues:
                print(f"  ✗ {issue}")

        if metadata.warnings:
            print("\nWARNINGS:")
            for warning in metadata.warnings:
                print(f"  ⚠ {warning}")

        if metadata.hreflang_tags:
            print(f"\nHREFLANG TAGS ({len(metadata.hreflang_tags)}):")
            for tag in metadata.hreflang_tags[:5]:
                print(f"  {tag['lang']}: {tag['url']}")

        if metadata.schema_types_found:
            print(f"\nSCHEMA TYPES:")
            for schema_type in metadata.schema_types_found:
                print(f"  - {schema_type}")


if __name__ == "__main__":
    main()