our-claude-skills/custom-skills/99_archive/seo-audit-agent/scripts/page_analyzer.py


"""
Page Analyzer - Extract SEO metadata from web pages
===================================================
Purpose: Comprehensive page-level SEO data extraction
Python: 3.10+
Usage:
from page_analyzer import PageAnalyzer, PageMetadata
analyzer = PageAnalyzer()
metadata = analyzer.analyze_url("https://example.com/page")
"""
import json
import logging
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
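
# Callers can dial down this module's log output if needed, e.g.:
#   logging.getLogger("page_analyzer").setLevel(logging.WARNING)
# (the name assumes the module is imported as page_analyzer)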


@dataclass
class LinkData:
    """Represents a link found on a page."""

    url: str
    anchor_text: str
    is_internal: bool
    is_nofollow: bool = False
    link_type: str = "body"  # body, nav, footer, etc.


@dataclass
class HeadingData:
    """Represents a heading found on a page."""

    level: int  # 1-6
    text: str


@dataclass
class SchemaData:
    """Represents schema.org structured data."""

    schema_type: str
    properties: dict
    format: str = "json-ld"  # json-ld, microdata, rdfa


@dataclass
class OpenGraphData:
    """Represents Open Graph metadata."""

    og_title: str | None = None
    og_description: str | None = None
    og_image: str | None = None
    og_url: str | None = None
    og_type: str | None = None
    og_site_name: str | None = None
    og_locale: str | None = None
    twitter_card: str | None = None
    twitter_title: str | None = None
    twitter_description: str | None = None
    twitter_image: str | None = None


@dataclass
class PageMetadata:
    """Complete SEO metadata for a page."""

    # Basic info
    url: str
    status_code: int = 0
    content_type: str = ""
    response_time_ms: float = 0
    analyzed_at: datetime = field(default_factory=datetime.now)
    # Meta tags
    title: str | None = None
    title_length: int = 0
    meta_description: str | None = None
    meta_description_length: int = 0
    canonical_url: str | None = None
    robots_meta: str | None = None
    # Language
    html_lang: str | None = None
    hreflang_tags: list[dict] = field(default_factory=list)  # [{"lang": "en", "url": "..."}]
    # Headings
    headings: list[HeadingData] = field(default_factory=list)
    h1_count: int = 0
    h1_text: str | None = None
    # Open Graph & Social
    open_graph: OpenGraphData = field(default_factory=OpenGraphData)
    # Schema/Structured Data
    schema_data: list[SchemaData] = field(default_factory=list)
    schema_types_found: list[str] = field(default_factory=list)
    # Links
    internal_links: list[LinkData] = field(default_factory=list)
    external_links: list[LinkData] = field(default_factory=list)
    internal_link_count: int = 0
    external_link_count: int = 0
    # Images
    images_total: int = 0
    images_without_alt: int = 0
    images_with_alt: int = 0
    # Content metrics
    word_count: int = 0
    # Issues found
    issues: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization."""
        return {
            "url": self.url,
            "status_code": self.status_code,
            "content_type": self.content_type,
            "response_time_ms": self.response_time_ms,
            "analyzed_at": self.analyzed_at.isoformat(),
            "title": self.title,
            "title_length": self.title_length,
            "meta_description": self.meta_description,
            "meta_description_length": self.meta_description_length,
            "canonical_url": self.canonical_url,
            "robots_meta": self.robots_meta,
            "html_lang": self.html_lang,
            "hreflang_tags": self.hreflang_tags,
            "h1_count": self.h1_count,
            "h1_text": self.h1_text,
            "headings_count": len(self.headings),
            "schema_types_found": self.schema_types_found,
            "internal_link_count": self.internal_link_count,
            "external_link_count": self.external_link_count,
            "images_total": self.images_total,
            "images_without_alt": self.images_without_alt,
            "word_count": self.word_count,
            "issues": self.issues,
            "warnings": self.warnings,
            "open_graph": {
                "og_title": self.open_graph.og_title,
                "og_description": self.open_graph.og_description,
                "og_image": self.open_graph.og_image,
                "og_url": self.open_graph.og_url,
                "og_type": self.open_graph.og_type,
            },
        }
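
    # Illustrative use of to_dict (file name is a placeholder):
    #   with open("page_report.json", "w", encoding="utf-8") as f:
    #       json.dump(metadata.to_dict(), f, indent=2, ensure_ascii=False)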

    def get_summary(self) -> str:
        """Get a brief summary of the page analysis."""
        lines = [
            f"URL: {self.url}",
            f"Status: {self.status_code}",
            f"Title: {self.title[:50] + '...' if self.title and len(self.title) > 50 else self.title}",
            f"Description: {'✓' if self.meta_description else '✗ Missing'}",
            f"Canonical: {'✓' if self.canonical_url else '✗ Missing'}",
            f"H1: {self.h1_count} found",
            f"Schema: {', '.join(self.schema_types_found) if self.schema_types_found else 'None'}",
            f"Links: {self.internal_link_count} internal, {self.external_link_count} external",
            f"Images: {self.images_total} total, {self.images_without_alt} without alt",
        ]
        if self.issues:
            lines.append(f"Issues: {len(self.issues)}")
        return "\n".join(lines)


class PageAnalyzer:
    """Analyze web pages for SEO metadata."""

    DEFAULT_USER_AGENT = "Mozilla/5.0 (compatible; OurDigitalSEOBot/1.0; +https://ourdigital.org)"

    def __init__(
        self,
        user_agent: str | None = None,
        timeout: int = 30,
    ):
        """
        Initialize page analyzer.

        Args:
            user_agent: Custom user agent string
            timeout: Request timeout in seconds
        """
        self.user_agent = user_agent or self.DEFAULT_USER_AGENT
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": self.user_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9,ko;q=0.8",
        })
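
    # Example construction (values are placeholders, not project conventions):
    #   analyzer = PageAnalyzer(user_agent="MyBot/1.0 (+https://example.com/bot)", timeout=10)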

    def analyze_url(self, url: str) -> PageMetadata:
        """
        Analyze a URL and extract SEO metadata.

        Args:
            url: URL to analyze

        Returns:
            PageMetadata object with all extracted data
        """
        metadata = PageMetadata(url=url)
        try:
            # Fetch page
            start_time = datetime.now()
            response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
            metadata.response_time_ms = (datetime.now() - start_time).total_seconds() * 1000
            metadata.status_code = response.status_code
            metadata.content_type = response.headers.get("Content-Type", "")
            if response.status_code != 200:
                metadata.issues.append(f"HTTP {response.status_code} status")
                if response.status_code >= 400:
                    return metadata
            # Parse HTML
            soup = BeautifulSoup(response.text, "html.parser")
            base_url = url
            # Extract all metadata
            self._extract_basic_meta(soup, metadata)
            self._extract_canonical(soup, metadata, base_url)
            self._extract_robots_meta(soup, metadata)
            self._extract_hreflang(soup, metadata)
            self._extract_headings(soup, metadata)
            self._extract_open_graph(soup, metadata)
            self._extract_schema(soup, metadata)
            self._extract_links(soup, metadata, base_url)
            self._extract_images(soup, metadata)
            self._extract_content_metrics(soup, metadata)
            # Run SEO checks
            self._run_seo_checks(metadata)
        except requests.RequestException as e:
            metadata.issues.append(f"Request failed: {str(e)}")
            logger.error(f"Failed to analyze {url}: {e}")
        except Exception as e:
            metadata.issues.append(f"Analysis error: {str(e)}")
            logger.error(f"Error analyzing {url}: {e}")
        return metadata
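
    # Minimal batch sketch (URLs are placeholders):
    #   analyzer = PageAnalyzer()
    #   for page_url in ["https://example.com/", "https://example.com/about"]:
    #       print(analyzer.analyze_url(page_url).get_summary())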

    def _extract_basic_meta(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract title and meta description."""
        # Title
        title_tag = soup.find("title")
        if title_tag and title_tag.string:
            metadata.title = title_tag.string.strip()
            metadata.title_length = len(metadata.title)
        # Meta description
        desc_tag = soup.find("meta", attrs={"name": re.compile(r"^description$", re.I)})
        if desc_tag and desc_tag.get("content"):
            metadata.meta_description = desc_tag["content"].strip()
            metadata.meta_description_length = len(metadata.meta_description)
        # HTML lang
        html_tag = soup.find("html")
        if html_tag and html_tag.get("lang"):
            metadata.html_lang = html_tag["lang"]

    def _extract_canonical(self, soup: BeautifulSoup, metadata: PageMetadata, base_url: str) -> None:
        """Extract canonical URL."""
        canonical = soup.find("link", rel="canonical")
        if canonical and canonical.get("href"):
            metadata.canonical_url = urljoin(base_url, canonical["href"])

    def _extract_robots_meta(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract robots meta tag."""
        robots = soup.find("meta", attrs={"name": re.compile(r"^robots$", re.I)})
        if robots and robots.get("content"):
            metadata.robots_meta = robots["content"]
        # Also check for googlebot-specific
        googlebot = soup.find("meta", attrs={"name": re.compile(r"^googlebot$", re.I)})
        if googlebot and googlebot.get("content"):
            if metadata.robots_meta:
                metadata.robots_meta += f" | googlebot: {googlebot['content']}"
            else:
                metadata.robots_meta = f"googlebot: {googlebot['content']}"

    def _extract_hreflang(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract hreflang tags."""
        hreflang_tags = soup.find_all("link", rel="alternate", hreflang=True)
        for tag in hreflang_tags:
            if tag.get("href") and tag.get("hreflang"):
                metadata.hreflang_tags.append({
                    "lang": tag["hreflang"],
                    "url": tag["href"],
                })
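
    # e.g. <link rel="alternate" hreflang="ko" href="https://example.com/ko/">
    #      is recorded as {"lang": "ko", "url": "https://example.com/ko/"}.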

    def _extract_headings(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract all headings."""
        for level in range(1, 7):
            for heading in soup.find_all(f"h{level}"):
                text = heading.get_text(strip=True)
                if text:
                    metadata.headings.append(HeadingData(level=level, text=text))
        # Count H1s specifically
        h1_tags = soup.find_all("h1")
        metadata.h1_count = len(h1_tags)
        if h1_tags:
            metadata.h1_text = h1_tags[0].get_text(strip=True)

    def _extract_open_graph(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract Open Graph and Twitter Card data."""
        og = metadata.open_graph
        # Open Graph tags
        og_mappings = {
            "og:title": "og_title",
            "og:description": "og_description",
            "og:image": "og_image",
            "og:url": "og_url",
            "og:type": "og_type",
            "og:site_name": "og_site_name",
            "og:locale": "og_locale",
        }
        for og_prop, attr_name in og_mappings.items():
            tag = soup.find("meta", property=og_prop)
            if tag and tag.get("content"):
                setattr(og, attr_name, tag["content"])
        # Twitter Card tags
        twitter_mappings = {
            "twitter:card": "twitter_card",
            "twitter:title": "twitter_title",
            "twitter:description": "twitter_description",
            "twitter:image": "twitter_image",
        }
        for tw_name, attr_name in twitter_mappings.items():
            tag = soup.find("meta", attrs={"name": tw_name})
            if tag and tag.get("content"):
                setattr(og, attr_name, tag["content"])
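
    # Matches tags of the form <meta property="og:title" content="..."> and
    # <meta name="twitter:card" content="summary_large_image">.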

    def _extract_schema(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract schema.org structured data."""
        # JSON-LD
        for script in soup.find_all("script", type="application/ld+json"):
            try:
                data = json.loads(script.string)
                if isinstance(data, list):
                    for item in data:
                        self._process_schema_item(item, metadata, "json-ld")
                else:
                    self._process_schema_item(data, metadata, "json-ld")
            except (json.JSONDecodeError, TypeError):
                continue
        # Microdata (basic detection)
        for item in soup.find_all(itemscope=True):
            itemtype = item.get("itemtype", "")
            if itemtype:
                schema_type = itemtype.split("/")[-1]
                if schema_type not in metadata.schema_types_found:
                    metadata.schema_types_found.append(schema_type)
                    metadata.schema_data.append(SchemaData(
                        schema_type=schema_type,
                        properties={},
                        format="microdata",
                    ))

    def _process_schema_item(self, data: dict, metadata: PageMetadata, format_type: str) -> None:
        """Process a single schema.org item."""
        if not isinstance(data, dict):
            return
        schema_type = data.get("@type", "Unknown")
        if isinstance(schema_type, list):
            schema_type = schema_type[0] if schema_type else "Unknown"
        if schema_type not in metadata.schema_types_found:
            metadata.schema_types_found.append(schema_type)
            metadata.schema_data.append(SchemaData(
                schema_type=schema_type,
                properties=data,
                format=format_type,
            ))
        # Process nested @graph items
        if "@graph" in data:
            for item in data["@graph"]:
                self._process_schema_item(item, metadata, format_type)
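
    # Handles both a bare JSON-LD object and the common @graph wrapper, e.g.:
    #   {"@context": "https://schema.org",
    #    "@graph": [{"@type": "Organization"}, {"@type": "WebSite"}]}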

    def _extract_links(self, soup: BeautifulSoup, metadata: PageMetadata, base_url: str) -> None:
        """Extract internal and external links."""
        parsed_base = urlparse(base_url)
        base_domain = parsed_base.netloc.lower()
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            # Skip non-http links
            if href.startswith(("#", "javascript:", "mailto:", "tel:")):
                continue
            # Resolve relative URLs
            full_url = urljoin(base_url, href)
            parsed_url = urlparse(full_url)
            # Get anchor text
            anchor_text = a_tag.get_text(strip=True)[:100]  # Limit length
            # Check if nofollow
            rel = a_tag.get("rel", [])
            if isinstance(rel, str):
                rel = rel.split()
            is_nofollow = "nofollow" in rel
            # Determine if internal or external
            link_domain = parsed_url.netloc.lower()
            is_internal = (
                link_domain == base_domain
                or link_domain.endswith(f".{base_domain}")
                or base_domain.endswith(f".{link_domain}")
            )
            link_data = LinkData(
                url=full_url,
                anchor_text=anchor_text,
                is_internal=is_internal,
                is_nofollow=is_nofollow,
            )
            if is_internal:
                metadata.internal_links.append(link_data)
            else:
                metadata.external_links.append(link_data)
        metadata.internal_link_count = len(metadata.internal_links)
        metadata.external_link_count = len(metadata.external_links)
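
    # With base https://example.com/a: "/b" resolves to example.com -> internal,
    # "https://blog.example.com/x" -> internal (subdomain),
    # "https://other.org/" -> external.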

    def _extract_images(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract image information."""
        images = soup.find_all("img")
        metadata.images_total = len(images)
        for img in images:
            alt = img.get("alt", "").strip()
            if alt:
                metadata.images_with_alt += 1
            else:
                metadata.images_without_alt += 1

    def _extract_content_metrics(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract content metrics like word count."""
        # Remove script and style elements
        for element in soup(["script", "style", "noscript"]):
            element.decompose()
        # Get text content
        text = soup.get_text(separator=" ", strip=True)
        words = text.split()
        metadata.word_count = len(words)

    def _run_seo_checks(self, metadata: PageMetadata) -> None:
        """Run SEO checks and add issues/warnings."""
        # Title checks
        if not metadata.title:
            metadata.issues.append("Missing title tag")
        elif metadata.title_length < 30:
            metadata.warnings.append(f"Title too short ({metadata.title_length} chars, recommend 50-60)")
        elif metadata.title_length > 60:
            metadata.warnings.append(f"Title too long ({metadata.title_length} chars, recommend 50-60)")
        # Meta description checks
        if not metadata.meta_description:
            metadata.issues.append("Missing meta description")
        elif metadata.meta_description_length < 120:
            metadata.warnings.append(f"Meta description too short ({metadata.meta_description_length} chars)")
        elif metadata.meta_description_length > 160:
            metadata.warnings.append(f"Meta description too long ({metadata.meta_description_length} chars)")
        # Canonical check
        if not metadata.canonical_url:
            metadata.warnings.append("Missing canonical tag")
        elif metadata.canonical_url != metadata.url:
            metadata.warnings.append(f"Canonical points to different URL: {metadata.canonical_url}")
        # H1 checks
        if metadata.h1_count == 0:
            metadata.issues.append("Missing H1 tag")
        elif metadata.h1_count > 1:
            metadata.warnings.append(f"Multiple H1 tags ({metadata.h1_count})")
        # Image alt check
        if metadata.images_without_alt > 0:
            metadata.warnings.append(f"{metadata.images_without_alt} images missing alt text")
        # Schema check
        if not metadata.schema_types_found:
            metadata.warnings.append("No structured data found")
        # Open Graph check
        if not metadata.open_graph.og_title:
            metadata.warnings.append("Missing Open Graph tags")
        # Robots meta check
        if metadata.robots_meta:
            robots_lower = metadata.robots_meta.lower()
            if "noindex" in robots_lower:
                metadata.issues.append("Page is set to noindex")
            if "nofollow" in robots_lower:
                metadata.warnings.append("Page is set to nofollow")


def main():
    """CLI entry point for testing."""
    import argparse

    parser = argparse.ArgumentParser(description="Page SEO Analyzer")
    parser.add_argument("url", help="URL to analyze")
    parser.add_argument("--json", "-j", action="store_true", help="Output as JSON")
    args = parser.parse_args()
    analyzer = PageAnalyzer()
    metadata = analyzer.analyze_url(args.url)
    if args.json:
        print(json.dumps(metadata.to_dict(), indent=2, ensure_ascii=False))
    else:
        print("=" * 60)
        print("PAGE ANALYSIS REPORT")
        print("=" * 60)
        print(metadata.get_summary())
        print()
        if metadata.issues:
            print("ISSUES:")
            for issue in metadata.issues:
                print(f"  ✗ {issue}")
        if metadata.warnings:
            print("\nWARNINGS:")
            for warning in metadata.warnings:
                print(f"  ⚠ {warning}")
        if metadata.hreflang_tags:
            print(f"\nHREFLANG TAGS ({len(metadata.hreflang_tags)}):")
            for tag in metadata.hreflang_tags[:5]:
                print(f"  {tag['lang']}: {tag['url']}")
        if metadata.schema_types_found:
            print("\nSCHEMA TYPES:")
            for schema_type in metadata.schema_types_found:
                print(f"  - {schema_type}")


if __name__ == "__main__":
    main()
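
# Example invocation (URL is a placeholder):
#   python page_analyzer.py https://example.com --json > page_report.json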