"""
Page Analyzer - Extract SEO metadata from web pages
===================================================
Purpose: Comprehensive page-level SEO data extraction
Python: 3.10+
Usage:
    from page_analyzer import PageAnalyzer, PageMetadata
    analyzer = PageAnalyzer()
    metadata = analyzer.analyze_url("https://example.com/page")
"""

import json
import logging
import re
import time
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


@dataclass
class LinkData:
    """Represents a link found on a page."""

    url: str
    anchor_text: str
    is_internal: bool
    is_nofollow: bool = False
    link_type: str = "body"  # body, nav, footer, etc.


@dataclass
class HeadingData:
    """Represents a heading found on a page."""

    level: int  # 1-6
    text: str


@dataclass
class SchemaData:
    """Represents schema.org structured data."""

    schema_type: str
    properties: dict
    format: str = "json-ld"  # json-ld, microdata, rdfa


@dataclass
class OpenGraphData:
    """Represents Open Graph metadata."""

    og_title: str | None = None
    og_description: str | None = None
    og_image: str | None = None
    og_url: str | None = None
    og_type: str | None = None
    og_site_name: str | None = None
    og_locale: str | None = None
    twitter_card: str | None = None
    twitter_title: str | None = None
    twitter_description: str | None = None
    twitter_image: str | None = None


@dataclass
class PageMetadata:
    """Complete SEO metadata for a page."""

    # Basic info
    url: str
    status_code: int = 0
    content_type: str = ""
    response_time_ms: float = 0
    analyzed_at: datetime = field(default_factory=datetime.now)

    # Meta tags
    title: str | None = None
    title_length: int = 0
    meta_description: str | None = None
    meta_description_length: int = 0
    canonical_url: str | None = None
    robots_meta: str | None = None

    # Language
    html_lang: str | None = None
    hreflang_tags: list[dict] = field(default_factory=list)  # [{"lang": "en", "url": "..."}]

    # Headings
    headings: list[HeadingData] = field(default_factory=list)
    h1_count: int = 0
    h1_text: str | None = None

    # Open Graph & Social
    open_graph: OpenGraphData = field(default_factory=OpenGraphData)

    # Schema/Structured Data
    schema_data: list[SchemaData] = field(default_factory=list)
    schema_types_found: list[str] = field(default_factory=list)

    # Links
    internal_links: list[LinkData] = field(default_factory=list)
    external_links: list[LinkData] = field(default_factory=list)
    internal_link_count: int = 0
    external_link_count: int = 0

    # Images
    images_total: int = 0
    images_without_alt: int = 0
    images_with_alt: int = 0

    # Content metrics
    word_count: int = 0

    # Issues found
    issues: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization.

        Returns:
            A dict of JSON-friendly values. Link, heading and schema
            payloads are summarized (counts / type names) rather than
            serialized in full.
        """
        return {
            "url": self.url,
            "status_code": self.status_code,
            "content_type": self.content_type,
            "response_time_ms": self.response_time_ms,
            "analyzed_at": self.analyzed_at.isoformat(),
            "title": self.title,
            "title_length": self.title_length,
            "meta_description": self.meta_description,
            "meta_description_length": self.meta_description_length,
            "canonical_url": self.canonical_url,
            "robots_meta": self.robots_meta,
            "html_lang": self.html_lang,
            "hreflang_tags": self.hreflang_tags,
            "h1_count": self.h1_count,
            "h1_text": self.h1_text,
            "headings_count": len(self.headings),
            "schema_types_found": self.schema_types_found,
            "internal_link_count": self.internal_link_count,
            "external_link_count": self.external_link_count,
            "images_total": self.images_total,
            "images_without_alt": self.images_without_alt,
            "word_count": self.word_count,
            "issues": self.issues,
            "warnings": self.warnings,
            "open_graph": {
                "og_title": self.open_graph.og_title,
                "og_description": self.open_graph.og_description,
                "og_image": self.open_graph.og_image,
                "og_url": self.open_graph.og_url,
                "og_type": self.open_graph.og_type,
            },
        }

    def get_summary(self) -> str:
        """Get a brief summary of the page analysis.

        Returns:
            A newline-separated, human-readable report. Titles longer
            than 50 characters are truncated with an ellipsis.
        """
        title = self.title
        if title and len(title) > 50:
            title = title[:50] + "..."

        lines = [
            f"URL: {self.url}",
            f"Status: {self.status_code}",
            f"Title: {title}",
            f"Description: {'✓' if self.meta_description else '✗ Missing'}",
            f"Canonical: {'✓' if self.canonical_url else '✗ Missing'}",
            f"H1: {self.h1_count} found",
            f"Schema: {', '.join(self.schema_types_found) if self.schema_types_found else 'None'}",
            f"Links: {self.internal_link_count} internal, {self.external_link_count} external",
            f"Images: {self.images_total} total, {self.images_without_alt} without alt",
        ]
        if self.issues:
            lines.append(f"Issues: {len(self.issues)}")
        return "\n".join(lines)


class PageAnalyzer:
    """Analyze web pages for SEO metadata.

    The analyzer owns a ``requests.Session``. Call :meth:`close` (or use
    the instance as a context manager) to release its connections.
    """

    DEFAULT_USER_AGENT = "Mozilla/5.0 (compatible; OurDigitalSEOBot/1.0; +https://ourdigital.org)"

    def __init__(
        self,
        user_agent: str | None = None,
        timeout: int = 30,
    ):
        """
        Initialize page analyzer.

        Args:
            user_agent: Custom user agent string
            timeout: Request timeout in seconds
        """
        self.user_agent = user_agent or self.DEFAULT_USER_AGENT
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": self.user_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9,ko;q=0.8",
        })

    def close(self) -> None:
        """Close the underlying HTTP session."""
        self.session.close()

    def __enter__(self) -> "PageAnalyzer":
        """Return the analyzer itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the HTTP session when leaving the ``with`` block."""
        self.close()

    def analyze_url(self, url: str) -> PageMetadata:
        """
        Analyze a URL and extract SEO metadata.

        Request failures and unexpected analysis errors are not raised;
        they are logged and recorded in ``metadata.issues``.

        Args:
            url: URL to analyze

        Returns:
            PageMetadata object with all extracted data
        """
        metadata = PageMetadata(url=url)

        try:
            # Fetch page. perf_counter is monotonic, so the measured
            # duration is unaffected by wall-clock adjustments.
            started = time.perf_counter()
            response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
            metadata.response_time_ms = (time.perf_counter() - started) * 1000
            metadata.status_code = response.status_code
            metadata.content_type = response.headers.get("Content-Type", "")

            if response.status_code != 200:
                metadata.issues.append(f"HTTP {response.status_code} status")
                if response.status_code >= 400:
                    return metadata

            # Parse HTML
            soup = BeautifulSoup(response.text, "html.parser")

            # Redirects are followed, so relative URLs in the document
            # must be resolved against the final URL, not the requested
            # one. Fall back to the requested URL if none is available.
            final_url = getattr(response, "url", None)
            base_url = final_url if isinstance(final_url, str) and final_url else url

            # Extract all metadata
            self._extract_basic_meta(soup, metadata)
            self._extract_canonical(soup, metadata, base_url)
            self._extract_robots_meta(soup, metadata)
            self._extract_hreflang(soup, metadata)
            self._extract_headings(soup, metadata)
            self._extract_open_graph(soup, metadata)
            self._extract_schema(soup, metadata)
            self._extract_links(soup, metadata, base_url)
            self._extract_images(soup, metadata)
            # Must run last: it removes script/style nodes from the soup.
            self._extract_content_metrics(soup, metadata)

            # Run SEO checks
            self._run_seo_checks(metadata)

        except requests.RequestException as e:
            metadata.issues.append(f"Request failed: {e}")
            logger.error("Failed to analyze %s: %s", url, e)
        except Exception as e:
            # Top-level boundary: one bad page must not crash the caller.
            metadata.issues.append(f"Analysis error: {e}")
            logger.error("Error analyzing %s: %s", url, e)

        return metadata

    def _extract_basic_meta(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract title, meta description and the ``<html lang>`` value."""
        # Title. ``.string`` is None when <title> has more than one child
        # node (e.g. nested markup or a comment), so fall back to the
        # concatenated text in that case.
        title_tag = soup.find("title")
        if title_tag is not None:
            raw_title = title_tag.string or title_tag.get_text()
            if raw_title:
                metadata.title = raw_title.strip()
                metadata.title_length = len(metadata.title)

        # Meta description
        desc_tag = soup.find("meta", attrs={"name": re.compile(r"^description$", re.I)})
        if desc_tag and desc_tag.get("content"):
            metadata.meta_description = desc_tag["content"].strip()
            metadata.meta_description_length = len(metadata.meta_description)

        # HTML lang
        html_tag = soup.find("html")
        if html_tag and html_tag.get("lang"):
            metadata.html_lang = html_tag["lang"]

    def _extract_canonical(self, soup: BeautifulSoup, metadata: PageMetadata, base_url: str) -> None:
        """Extract canonical URL, resolved against ``base_url``."""
        canonical = soup.find("link", rel="canonical")
        if canonical and canonical.get("href"):
            metadata.canonical_url = urljoin(base_url, canonical["href"])

    def _extract_robots_meta(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract robots meta tag, appending any googlebot-specific directive."""
        robots = soup.find("meta", attrs={"name": re.compile(r"^robots$", re.I)})
        if robots and robots.get("content"):
            metadata.robots_meta = robots["content"]

        # Also check for googlebot-specific
        googlebot = soup.find("meta", attrs={"name": re.compile(r"^googlebot$", re.I)})
        if googlebot and googlebot.get("content"):
            if metadata.robots_meta:
                metadata.robots_meta += f" | googlebot: {googlebot['content']}"
            else:
                metadata.robots_meta = f"googlebot: {googlebot['content']}"

    def _extract_hreflang(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract hreflang tags as ``{"lang": ..., "url": ...}`` dicts."""
        hreflang_tags = soup.find_all("link", rel="alternate", hreflang=True)
        for tag in hreflang_tags:
            if tag.get("href") and tag.get("hreflang"):
                metadata.hreflang_tags.append({
                    "lang": tag["hreflang"],
                    "url": tag["href"],
                })

    def _extract_headings(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract all non-empty headings, grouped by level (h1 first)."""
        for level in range(1, 7):
            for heading in soup.find_all(f"h{level}"):
                text = heading.get_text(strip=True)
                if text:
                    metadata.headings.append(HeadingData(level=level, text=text))

        # Count H1s specifically (including empty ones)
        h1_tags = soup.find_all("h1")
        metadata.h1_count = len(h1_tags)
        if h1_tags:
            metadata.h1_text = h1_tags[0].get_text(strip=True)

    def _extract_open_graph(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract Open Graph and Twitter Card data."""
        og = metadata.open_graph

        # Open Graph tags
        og_mappings = {
            "og:title": "og_title",
            "og:description": "og_description",
            "og:image": "og_image",
            "og:url": "og_url",
            "og:type": "og_type",
            "og:site_name": "og_site_name",
            "og:locale": "og_locale",
        }
        for og_prop, attr_name in og_mappings.items():
            tag = soup.find("meta", property=og_prop)
            if tag and tag.get("content"):
                setattr(og, attr_name, tag["content"])

        # Twitter Card tags
        twitter_mappings = {
            "twitter:card": "twitter_card",
            "twitter:title": "twitter_title",
            "twitter:description": "twitter_description",
            "twitter:image": "twitter_image",
        }
        for tw_name, attr_name in twitter_mappings.items():
            tag = soup.find("meta", attrs={"name": tw_name})
            if tag and tag.get("content"):
                setattr(og, attr_name, tag["content"])

    def _extract_schema(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract schema.org structured data (JSON-LD and basic microdata)."""
        # JSON-LD
        for script in soup.find_all("script", type="application/ld+json"):
            # Only the parse step is guarded, so malformed JSON is skipped
            # without hiding errors raised while processing valid JSON.
            try:
                data = json.loads(script.string or script.get_text())
            except (json.JSONDecodeError, TypeError):
                continue

            items = data if isinstance(data, list) else [data]
            for item in items:
                self._process_schema_item(item, metadata, "json-ld")

        # Microdata (basic detection)
        for item in soup.find_all(itemscope=True):
            itemtype = item.get("itemtype", "")
            if itemtype:
                schema_type = itemtype.split("/")[-1]
                if schema_type not in metadata.schema_types_found:
                    metadata.schema_types_found.append(schema_type)
                metadata.schema_data.append(SchemaData(
                    schema_type=schema_type,
                    properties={},
                    format="microdata",
                ))

    def _process_schema_item(self, data: dict, metadata: PageMetadata, format_type: str) -> None:
        """Process a single schema.org item.

        Non-dict input is ignored. A node that is only a ``@graph``
        container (no ``@type`` of its own) is not recorded itself; its
        children are processed instead.

        Args:
            data: Decoded JSON-LD node
            metadata: Metadata object to update in place
            format_type: Value stored in ``SchemaData.format``
        """
        if not isinstance(data, dict):
            return

        graph = data.get("@graph")

        if "@type" in data or graph is None:
            schema_type = data.get("@type", "Unknown")
            if isinstance(schema_type, list):
                schema_type = schema_type[0] if schema_type else "Unknown"
            # schema_types_found is joined as text in get_summary(), so
            # keep every entry a string even for malformed @type values.
            if not isinstance(schema_type, str):
                schema_type = str(schema_type)

            if schema_type not in metadata.schema_types_found:
                metadata.schema_types_found.append(schema_type)
            metadata.schema_data.append(SchemaData(
                schema_type=schema_type,
                properties=data,
                format=format_type,
            ))

        # Process nested @graph items
        if isinstance(graph, list):
            for item in graph:
                self._process_schema_item(item, metadata, format_type)
        elif isinstance(graph, dict):
            self._process_schema_item(graph, metadata, format_type)

    def _extract_links(self, soup: BeautifulSoup, metadata: PageMetadata, base_url: str) -> None:
        """Extract internal and external links.

        Only links that resolve to an ``http``/``https`` URL are kept.
        A link is internal when its host equals the base host or one is
        a subdomain of the other.
        """
        parsed_base = urlparse(base_url)
        base_domain = parsed_base.netloc.lower()

        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"].strip()

            # Skip same-page fragment links
            if href.startswith("#"):
                continue

            # Resolve relative URLs; a malformed href must not abort the
            # analysis of the whole page.
            try:
                full_url = urljoin(base_url, href)
                parsed_url = urlparse(full_url)
            except ValueError:
                continue

            # Skip non-http links (javascript:, mailto:, tel:, data:, ...).
            # urlparse lowercases the scheme, so this is case-insensitive.
            if parsed_url.scheme not in ("http", "https"):
                continue

            # Get anchor text
            anchor_text = a_tag.get_text(strip=True)[:100]  # Limit length

            # Check if nofollow
            rel = a_tag.get("rel") or []
            if isinstance(rel, str):
                rel = rel.split()
            is_nofollow = any(token.lower() == "nofollow" for token in rel)

            # Determine if internal or external
            link_domain = parsed_url.netloc.lower()
            is_internal = (
                link_domain == base_domain
                or link_domain.endswith(f".{base_domain}")
                or base_domain.endswith(f".{link_domain}")
            )

            link_data = LinkData(
                url=full_url,
                anchor_text=anchor_text,
                is_internal=is_internal,
                is_nofollow=is_nofollow,
            )

            if is_internal:
                metadata.internal_links.append(link_data)
            else:
                metadata.external_links.append(link_data)

        metadata.internal_link_count = len(metadata.internal_links)
        metadata.external_link_count = len(metadata.external_links)

    def _extract_images(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract image information (total and alt-text coverage)."""
        images = soup.find_all("img")
        metadata.images_total = len(images)

        for img in images:
            alt = img.get("alt", "").strip()
            if alt:
                metadata.images_with_alt += 1
            else:
                metadata.images_without_alt += 1

    def _extract_content_metrics(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract content metrics like word count.

        Note: this removes ``script``, ``style`` and ``noscript``
        elements from ``soup`` in place.
        """
        # Remove script and style elements
        for element in soup(["script", "style", "noscript"]):
            element.decompose()

        # Get text content
        text = soup.get_text(separator=" ", strip=True)
        words = text.split()
        metadata.word_count = len(words)

    def _run_seo_checks(self, metadata: PageMetadata) -> None:
        """Run SEO checks and add issues/warnings."""
        # Title checks
        if not metadata.title:
            metadata.issues.append("Missing title tag")
        elif metadata.title_length < 30:
            metadata.warnings.append(f"Title too short ({metadata.title_length} chars, recommend 50-60)")
        elif metadata.title_length > 60:
            metadata.warnings.append(f"Title too long ({metadata.title_length} chars, recommend 50-60)")

        # Meta description checks
        if not metadata.meta_description:
            metadata.issues.append("Missing meta description")
        elif metadata.meta_description_length < 120:
            metadata.warnings.append(f"Meta description too short ({metadata.meta_description_length} chars)")
        elif metadata.meta_description_length > 160:
            metadata.warnings.append(f"Meta description too long ({metadata.meta_description_length} chars)")

        # Canonical check
        if not metadata.canonical_url:
            metadata.warnings.append("Missing canonical tag")
        elif metadata.canonical_url != metadata.url:
            metadata.warnings.append(f"Canonical points to different URL: {metadata.canonical_url}")

        # H1 checks
        if metadata.h1_count == 0:
            metadata.issues.append("Missing H1 tag")
        elif metadata.h1_count > 1:
            metadata.warnings.append(f"Multiple H1 tags ({metadata.h1_count})")

        # Image alt check
        if metadata.images_without_alt > 0:
            metadata.warnings.append(f"{metadata.images_without_alt} images missing alt text")

        # Schema check
        if not metadata.schema_types_found:
            metadata.warnings.append("No structured data found")

        # Open Graph check
        if not metadata.open_graph.og_title:
            metadata.warnings.append("Missing Open Graph tags")

        # Robots meta check
        if metadata.robots_meta:
            robots_lower = metadata.robots_meta.lower()
            if "noindex" in robots_lower:
                metadata.issues.append("Page is set to noindex")
            if "nofollow" in robots_lower:
                metadata.warnings.append("Page is set to nofollow")


def main():
    """CLI entry point for testing."""
    import argparse

    parser = argparse.ArgumentParser(description="Page SEO Analyzer")
    parser.add_argument("url", help="URL to analyze")
    parser.add_argument("--json", "-j", action="store_true", help="Output as JSON")
    args = parser.parse_args()

    # The context manager closes the HTTP session once analysis is done.
    with PageAnalyzer() as analyzer:
        metadata = analyzer.analyze_url(args.url)

    if args.json:
        print(json.dumps(metadata.to_dict(), indent=2, ensure_ascii=False))
    else:
        print("=" * 60)
        print("PAGE ANALYSIS REPORT")
        print("=" * 60)
        print(metadata.get_summary())
        print()

        if metadata.issues:
            print("ISSUES:")
            for issue in metadata.issues:
                print(f"  ✗ {issue}")

        if metadata.warnings:
            print("\nWARNINGS:")
            for warning in metadata.warnings:
                print(f"  ⚠ {warning}")

        if metadata.hreflang_tags:
            print(f"\nHREFLANG TAGS ({len(metadata.hreflang_tags)}):")
            for tag in metadata.hreflang_tags[:5]:
                print(f"  {tag['lang']}: {tag['url']}")

        if metadata.schema_types_found:
            print("\nSCHEMA TYPES:")
            for schema_type in metadata.schema_types_found:
                print(f"  - {schema_type}")


if __name__ == "__main__":
    main()