refactor(skills): Restructure skills to dual-platform architecture

Major refactoring of ourdigital-custom-skills with new numbering system:

## Structure Changes
- Each skill now has code/ (Claude Code) and desktop/ (Claude Desktop) versions
- New progressive numbering: 01-09 General, 10-19 SEO, 20-29 GTM, 30-39 OurDigital, 40-49 Jamie

## Skill Reorganization
- 01-notion-organizer (from 02)
- 10-18: SEO tools split into focused skills (technical, on-page, local, schema, vitals, gsc, gateway)
- 20-21: GTM audit and manager
- 30-32: OurDigital designer, research, presentation
- 40-41: Jamie brand editor and audit

## New Files
- .claude/commands/: Slash command definitions for all skills
- CLAUDE.md: Updated with new skill structure documentation
- REFACTORING_PLAN.md: Migration documentation
- COMPATIBILITY_REPORT.md, SKILLS_COMPARISON.md: Analysis docs

## Removed
- Old skill directories (02-05, 10-14, 20-21 old numbering)
- Consolidated into new structure with _archive/ for reference

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-22 01:58:24 +09:00
parent 214247ace2
commit eea49f9f8c
251 changed files with 12308 additions and 102 deletions

View File

@@ -0,0 +1,207 @@
"""
Base Client - Shared async client utilities
===========================================
Purpose: Rate-limited async operations for API clients
Python: 3.10+
"""
import asyncio
import logging
import os
from asyncio import Semaphore
from datetime import datetime
from typing import Any, Callable, TypeVar
from dotenv import load_dotenv
from tenacity import (
retry,
stop_after_attempt,
wait_exponential,
retry_if_exception_type,
)
# Load environment variables from a local .env file, if present.
load_dotenv()

# Configure logging at import time.
# NOTE(review): basicConfig here mutates the embedding application's root
# logger as a side effect of importing this module — confirm this is intended.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Generic type variable for typed helpers.
# NOTE(review): T appears unused in the visible module — confirm before removing.
T = TypeVar("T")
class RateLimiter:
    """Async token-bucket rate limiter.

    Tokens refill continuously at ``rate / per`` tokens per second, capped
    at ``rate``. Each ``acquire()`` consumes one token, sleeping when the
    bucket is empty.
    """

    def __init__(self, rate: float, per: float = 1.0):
        """
        Initialize rate limiter.

        Args:
            rate: Number of requests allowed
            per: Time period in seconds (default: 1 second)
        """
        self.rate = rate
        self.per = per
        self.tokens = rate  # bucket starts full
        self.last_update = datetime.now()
        self._lock = asyncio.Lock()

    async def acquire(self) -> None:
        """Consume one token, sleeping until one has accrued if necessary."""
        async with self._lock:
            current = datetime.now()
            delta = (current - self.last_update).total_seconds()
            # Refill proportionally to elapsed time, never above capacity.
            self.tokens = min(self.rate, self.tokens + delta * (self.rate / self.per))
            self.last_update = current
            if self.tokens >= 1:
                self.tokens -= 1
                return
            # Not enough budget: sleep exactly long enough for one token to
            # accrue, then start the next window empty. The lock is held
            # while sleeping, which serializes waiting acquirers.
            delay = (1 - self.tokens) * (self.per / self.rate)
            await asyncio.sleep(delay)
            self.tokens = 0
class BaseAsyncClient:
    """Base class for async API clients with rate limiting.

    Combines a concurrency cap (semaphore), a token-bucket rate limiter and
    tenacity-driven retries. Subclasses wrap their API calls in
    ``_rate_limited_request`` or fan out with ``batch_requests``.
    """

    def __init__(
        self,
        max_concurrent: int = 5,
        requests_per_second: float = 3.0,
        logger: logging.Logger | None = None,
    ):
        """
        Initialize base client.

        Args:
            max_concurrent: Maximum concurrent requests
            requests_per_second: Rate limit
            logger: Logger instance
        """
        self.semaphore = Semaphore(max_concurrent)
        self.rate_limiter = RateLimiter(requests_per_second)
        self.logger = logger or logging.getLogger(self.__class__.__name__)
        # NOTE(review): "retries" is never incremented in this module —
        # confirm whether subclasses update it or it is vestigial.
        self.stats = {
            "requests": 0,
            "success": 0,
            "errors": 0,
            "retries": 0,
        }

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        retry=retry_if_exception_type(Exception),
    )
    async def _rate_limited_request(
        self,
        coro: Callable[[], Any],
    ) -> Any:
        """Run one request under semaphore + rate limit; tenacity retries
        any exception up to 3 attempts (each attempt re-counts in stats)."""
        async with self.semaphore:
            await self.rate_limiter.acquire()
            self.stats["requests"] += 1
            try:
                outcome = await coro()
            except Exception as e:
                self.stats["errors"] += 1
                self.logger.error(f"Request failed: {e}")
                raise
            self.stats["success"] += 1
            return outcome

    async def batch_requests(
        self,
        requests: list[Callable[[], Any]],
        desc: str = "Processing",
    ) -> list[Any]:
        """Execute multiple requests concurrently.

        Failures (after retries are exhausted) are returned in-place as
        ``{"error": str(e)}`` rather than raised. Shows a tqdm progress bar
        when tqdm is installed; results then arrive in completion order.
        """
        try:
            from tqdm.asyncio import tqdm
        except ImportError:
            progress_available = False
        else:
            progress_available = True

        async def run_one(factory: Callable) -> Any:
            try:
                return await self._rate_limited_request(factory)
            except Exception as e:
                return {"error": str(e)}

        pending = [run_one(factory) for factory in requests]
        if not progress_available:
            # run_one already swallows exceptions; return_exceptions is a
            # defensive extra.
            return await asyncio.gather(*pending, return_exceptions=True)

        collected = []
        for fut in tqdm.as_completed(pending, total=len(pending), desc=desc):
            collected.append(await fut)
        return collected

    def print_stats(self) -> None:
        """Print request statistics."""
        self.logger.info("=" * 40)
        self.logger.info("Request Statistics:")
        self.logger.info(f" Total Requests: {self.stats['requests']}")
        self.logger.info(f" Successful: {self.stats['success']}")
        self.logger.info(f" Errors: {self.stats['errors']}")
        self.logger.info("=" * 40)
class ConfigManager:
"""Manage API configuration and credentials."""
def __init__(self):
load_dotenv()
@property
def google_credentials_path(self) -> str | None:
"""Get Google service account credentials path."""
# Prefer SEO-specific credentials, fallback to general credentials
seo_creds = os.path.expanduser("~/.credential/ourdigital-seo-agent.json")
if os.path.exists(seo_creds):
return seo_creds
return os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
@property
def pagespeed_api_key(self) -> str | None:
"""Get PageSpeed Insights API key."""
return os.getenv("PAGESPEED_API_KEY")
@property
def custom_search_api_key(self) -> str | None:
"""Get Custom Search API key."""
return os.getenv("CUSTOM_SEARCH_API_KEY")
@property
def custom_search_engine_id(self) -> str | None:
"""Get Custom Search Engine ID."""
return os.getenv("CUSTOM_SEARCH_ENGINE_ID")
@property
def notion_token(self) -> str | None:
"""Get Notion API token."""
return os.getenv("NOTION_TOKEN") or os.getenv("NOTION_API_KEY")
def validate_google_credentials(self) -> bool:
"""Validate Google credentials are configured."""
creds_path = self.google_credentials_path
if not creds_path:
return False
return os.path.exists(creds_path)
def get_required(self, key: str) -> str:
"""Get required environment variable or raise error."""
value = os.getenv(key)
if not value:
raise ValueError(f"Missing required environment variable: {key}")
return value
# Singleton config instance shared across the package
# (imported elsewhere as: from base_client import config).
config = ConfigManager()

View File

@@ -0,0 +1,497 @@
"""
Full SEO Audit - Orchestration Script
=====================================
Purpose: Run comprehensive SEO audit combining all tools
Python: 3.10+
Usage:
python full_audit.py --url https://example.com --output notion --notion-page-id abc123
"""
import argparse
import json
import logging
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
from urllib.parse import urlparse
from robots_checker import RobotsChecker
from schema_validator import SchemaValidator
from sitemap_validator import SitemapValidator
from pagespeed_client import PageSpeedClient
from notion_reporter import NotionReporter, SEOFinding
# Module-level logging setup.
# NOTE(review): basicConfig at import time configures the process-wide root
# logger — confirm this is intended when this module is used as a library.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
@dataclass
class AuditResult:
    """Aggregated output of one complete SEO audit run."""

    url: str
    # ISO-8601 capture time, frozen at instantiation.
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
    # One dict per tool; a failed check stores {"error": ...} instead.
    robots: dict = field(default_factory=dict)
    sitemap: dict = field(default_factory=dict)
    schema: dict = field(default_factory=dict)
    performance: dict = field(default_factory=dict)
    findings: list[SEOFinding] = field(default_factory=list)
    summary: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serializable snapshot; findings are reduced to a count."""
        payload = {
            name: getattr(self, name)
            for name in (
                "url",
                "timestamp",
                "robots",
                "sitemap",
                "schema",
                "performance",
                "summary",
            )
        }
        payload["findings_count"] = len(self.findings)
        return payload
class SEOAuditor:
    """Orchestrate comprehensive SEO audit.

    Runs robots.txt analysis, sitemap validation, schema markup validation
    and PageSpeed performance analysis, converting each tool's issues into
    prioritized SEOFinding records on a single AuditResult.
    """

    # Explicit urgency order for report sorting; a plain alphabetical sort
    # would place "Low" before "Medium".
    _PRIORITY_RANK = {"Critical": 0, "High": 1, "Medium": 2, "Low": 3}

    def __init__(self):
        self.robots_checker = RobotsChecker()
        self.sitemap_validator = SitemapValidator()
        self.schema_validator = SchemaValidator()
        self.pagespeed_client = PageSpeedClient()

    def run_audit(
        self,
        url: str,
        include_robots: bool = True,
        include_sitemap: bool = True,
        include_schema: bool = True,
        include_performance: bool = True,
    ) -> AuditResult:
        """
        Run comprehensive SEO audit.

        Args:
            url: URL to audit
            include_robots: Check robots.txt
            include_sitemap: Validate sitemap
            include_schema: Validate schema markup
            include_performance: Run PageSpeed analysis

        Returns:
            AuditResult with per-check sections, findings and a summary.
            A failing check is recorded as {"error": ...} in its own section
            rather than aborting the whole audit.
        """
        result = AuditResult(url=url)
        parsed_url = urlparse(url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        logger.info(f"Starting SEO audit for {url}")

        # 1. Robots.txt analysis
        if include_robots:
            logger.info("Analyzing robots.txt...")
            try:
                robots_result = self.robots_checker.analyze(base_url)
                result.robots = robots_result.to_dict()
                self._process_robots_findings(robots_result, result)
            except Exception as e:
                logger.error(f"Robots.txt analysis failed: {e}")
                result.robots = {"error": str(e)}

        # 2. Sitemap validation
        if include_sitemap:
            logger.info("Validating sitemap...")
            sitemap_url = f"{base_url}/sitemap.xml"
            # Prefer a sitemap declared in robots.txt over the conventional path
            if result.robots.get("sitemaps"):
                sitemap_url = result.robots["sitemaps"][0]
            try:
                sitemap_result = self.sitemap_validator.validate(sitemap_url)
                result.sitemap = sitemap_result.to_dict()
                self._process_sitemap_findings(sitemap_result, result)
            except Exception as e:
                logger.error(f"Sitemap validation failed: {e}")
                result.sitemap = {"error": str(e)}

        # 3. Schema validation
        if include_schema:
            logger.info("Validating schema markup...")
            try:
                schema_result = self.schema_validator.validate(url=url)
                result.schema = schema_result.to_dict()
                self._process_schema_findings(schema_result, result)
            except Exception as e:
                logger.error(f"Schema validation failed: {e}")
                result.schema = {"error": str(e)}

        # 4. PageSpeed analysis
        if include_performance:
            logger.info("Running PageSpeed analysis...")
            try:
                perf_result = self.pagespeed_client.analyze(url, strategy="mobile")
                result.performance = perf_result.to_dict()
                self._process_performance_findings(perf_result, result)
            except Exception as e:
                logger.error(f"PageSpeed analysis failed: {e}")
                result.performance = {"error": str(e)}

        # Generate summary
        result.summary = self._generate_summary(result)
        logger.info(f"Audit complete. Found {len(result.findings)} issues.")
        return result

    def _process_robots_findings(self, robots_result, audit_result: AuditResult):
        """Convert robots.txt issues to findings (error→Critical, warning→High)."""
        for issue in robots_result.issues:
            priority = "Medium"
            if issue.severity == "error":
                priority = "Critical"
            elif issue.severity == "warning":
                priority = "High"
            audit_result.findings.append(SEOFinding(
                issue=issue.message,
                category="Robots.txt",
                priority=priority,
                description=issue.directive or "",
                recommendation=issue.suggestion or "",
            ))

    def _process_sitemap_findings(self, sitemap_result, audit_result: AuditResult):
        """Convert sitemap issues to findings (error→High, warning→Medium)."""
        for issue in sitemap_result.issues:
            priority = "Medium"
            if issue.severity == "error":
                priority = "High"
            elif issue.severity == "warning":
                priority = "Medium"
            audit_result.findings.append(SEOFinding(
                issue=issue.message,
                category="Sitemap",
                priority=priority,
                url=issue.url,
                recommendation=issue.suggestion or "",
            ))

    def _process_schema_findings(self, schema_result, audit_result: AuditResult):
        """Convert schema issues to findings (error→High, warning→Medium, else Low)."""
        for issue in schema_result.issues:
            priority = "Low"
            if issue.severity == "error":
                priority = "High"
            elif issue.severity == "warning":
                priority = "Medium"
            audit_result.findings.append(SEOFinding(
                issue=issue.message,
                category="Schema/Structured Data",
                priority=priority,
                description=f"Schema type: {issue.schema_type}" if issue.schema_type else "",
                recommendation=issue.suggestion or "",
            ))

    def _process_performance_findings(self, perf_result, audit_result: AuditResult):
        """Convert Core Web Vitals ratings, the overall score and the top
        PageSpeed opportunities into findings."""
        cwv = perf_result.core_web_vitals
        # Check Core Web Vitals
        if cwv.lcp_rating == "POOR":
            audit_result.findings.append(SEOFinding(
                issue=f"Poor LCP: {cwv.lcp / 1000:.2f}s (should be < 2.5s)",
                category="Performance",
                priority="Critical",
                impact="Users experience slow page loads, affecting bounce rate and rankings",
                recommendation="Optimize images, reduce server response time, use CDN",
            ))
        elif cwv.lcp_rating == "NEEDS_IMPROVEMENT":
            audit_result.findings.append(SEOFinding(
                issue=f"LCP needs improvement: {cwv.lcp / 1000:.2f}s (target < 2.5s)",
                category="Performance",
                priority="High",
                recommendation="Optimize largest content element loading",
            ))
        if cwv.cls_rating == "POOR":
            audit_result.findings.append(SEOFinding(
                issue=f"Poor CLS: {cwv.cls:.3f} (should be < 0.1)",
                category="Performance",
                priority="High",
                impact="Layout shifts frustrate users",
                recommendation="Set dimensions for images/embeds, avoid inserting content above existing content",
            ))
        if cwv.fid_rating == "POOR":
            audit_result.findings.append(SEOFinding(
                issue=f"Poor FID/TBT: {cwv.fid:.0f}ms (should be < 100ms)",
                category="Performance",
                priority="High",
                impact="Slow interactivity affects user experience",
                recommendation="Reduce JavaScript execution time, break up long tasks",
            ))
        # Check performance score
        if perf_result.performance_score and perf_result.performance_score < 50:
            audit_result.findings.append(SEOFinding(
                issue=f"Low performance score: {perf_result.performance_score:.0f}/100",
                category="Performance",
                priority="High",
                impact="Poor performance affects user experience and SEO",
                recommendation="Address top opportunities from PageSpeed Insights",
            ))
        # Add top opportunities as findings
        for opp in perf_result.opportunities[:3]:
            if opp["savings_ms"] > 500:  # Only significant savings
                audit_result.findings.append(SEOFinding(
                    issue=opp["title"],
                    category="Performance",
                    priority="Medium",
                    description=opp.get("description", ""),
                    impact=f"Potential savings: {opp['savings_ms'] / 1000:.1f}s",
                    recommendation="See PageSpeed Insights for details",
                ))

    def _generate_summary(self, result: AuditResult) -> dict:
        """Generate audit summary with counts, status flags, quick wins and
        critical issues."""
        findings_by_priority = {}
        findings_by_category = {}
        for finding in result.findings:
            # Count by priority
            findings_by_priority[finding.priority] = (
                findings_by_priority.get(finding.priority, 0) + 1
            )
            # Count by category
            findings_by_category[finding.category] = (
                findings_by_category.get(finding.category, 0) + 1
            )
        return {
            "total_findings": len(result.findings),
            "findings_by_priority": findings_by_priority,
            "findings_by_category": findings_by_category,
            "robots_accessible": result.robots.get("accessible", False),
            "sitemap_valid": result.sitemap.get("valid", False),
            "schema_valid": result.schema.get("valid", False),
            "performance_score": result.performance.get("scores", {}).get("performance"),
            "quick_wins": [
                f.issue for f in result.findings
                if f.priority in ("Medium", "Low")
            ][:5],
            "critical_issues": [
                f.issue for f in result.findings
                if f.priority == "Critical"
            ],
        }

    def export_to_notion(
        self,
        result: AuditResult,
        parent_page_id: str | None = None,
        use_default_db: bool = True,
    ) -> dict:
        """
        Export audit results to Notion.

        Args:
            result: AuditResult object
            parent_page_id: Parent page ID (for creating new database)
            use_default_db: If True, use OurDigital SEO Audit Log database

        Returns:
            Dict with database_id, summary_page_id, audit_id and
            findings_created. summary_page_id is None when writing to the
            default database (no summary page is created in that mode), so
            callers can always read the key.
        """
        reporter = NotionReporter()
        audit_id = f"{urlparse(result.url).netloc}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
        # Stamp every finding with the audited site and this session's id
        for finding in result.findings:
            finding.site = result.url
            finding.audit_id = audit_id

        if use_default_db:
            # Use the default OurDigital SEO Audit Log database
            page_ids = reporter.add_findings_batch(result.findings)
            return {
                # getattr keeps working whether or not the reporter exposes
                # DEFAULT_DATABASE_ID as an attribute.
                "database_id": getattr(
                    reporter,
                    "DEFAULT_DATABASE_ID",
                    "2c8581e5-8a1e-8035-880b-e38cefc2f3ef",
                ),
                "audit_id": audit_id,
                # Always present; no summary page exists in this mode.
                "summary_page_id": None,
                "findings_created": len(page_ids),
            }

        # Create new database under parent page
        if not parent_page_id:
            raise ValueError("parent_page_id required when not using default database")
        db_title = f"SEO Audit - {urlparse(result.url).netloc} - {datetime.now().strftime('%Y-%m-%d')}"
        database_id = reporter.create_findings_database(parent_page_id, db_title)
        page_ids = reporter.add_findings_batch(result.findings, database_id)
        # Create summary page
        summary_page_id = reporter.create_audit_summary_page(
            parent_page_id,
            result.url,
            result.summary,
        )
        return {
            "database_id": database_id,
            "summary_page_id": summary_page_id,
            "audit_id": audit_id,
            "findings_created": len(page_ids),
        }

    def generate_report(self, result: AuditResult) -> str:
        """Generate human-readable report."""
        lines = [
            "=" * 70,
            "SEO AUDIT REPORT",
            "=" * 70,
            f"URL: {result.url}",
            f"Date: {result.timestamp}",
            "",
            "-" * 70,
            "SUMMARY",
            "-" * 70,
            f"Total Issues Found: {result.summary.get('total_findings', 0)}",
            "",
        ]
        # Priority breakdown
        lines.append("Issues by Priority:")
        for priority in ["Critical", "High", "Medium", "Low"]:
            count = result.summary.get("findings_by_priority", {}).get(priority, 0)
            if count:
                lines.append(f" {priority}: {count}")
        lines.append("")
        # Category breakdown
        lines.append("Issues by Category:")
        for category, count in result.summary.get("findings_by_category", {}).items():
            lines.append(f" {category}: {count}")
        lines.append("")
        lines.append("-" * 70)
        lines.append("STATUS OVERVIEW")
        lines.append("-" * 70)
        # Status checks
        lines.append(f"Robots.txt: {'✓ Accessible' if result.robots.get('accessible') else '✗ Not accessible'}")
        lines.append(f"Sitemap: {'✓ Valid' if result.sitemap.get('valid') else '✗ Issues found'}")
        lines.append(f"Schema: {'✓ Valid' if result.schema.get('valid') else '✗ Issues found'}")
        perf_score = result.performance.get("scores", {}).get("performance")
        # "is not None" so a legitimate score of 0 is still reported.
        if perf_score is not None:
            status = "✓ Good" if perf_score >= 90 else "⚠ Needs work" if perf_score >= 50 else "✗ Poor"
            lines.append(f"Performance: {status} ({perf_score:.0f}/100)")
        # Critical issues
        critical = result.summary.get("critical_issues", [])
        if critical:
            lines.extend([
                "",
                "-" * 70,
                "CRITICAL ISSUES (Fix Immediately)",
                "-" * 70,
            ])
            for issue in critical:
                lines.append(f"{issue}")
        # Quick wins
        quick_wins = result.summary.get("quick_wins", [])
        if quick_wins:
            lines.extend([
                "",
                "-" * 70,
                "QUICK WINS",
                "-" * 70,
            ])
            for issue in quick_wins[:5]:
                lines.append(f"{issue}")
        # All findings, grouped by category and ordered by urgency
        if result.findings:
            lines.extend([
                "",
                "-" * 70,
                "ALL FINDINGS",
                "-" * 70,
            ])
            current_category = None
            # Rank priorities explicitly (Critical→Low); an alphabetical sort
            # would order Low before Medium.
            for finding in sorted(
                result.findings,
                key=lambda f: (f.category, self._PRIORITY_RANK.get(f.priority, 99)),
            ):
                if finding.category != current_category:
                    current_category = finding.category
                    lines.append(f"\n[{current_category}]")
                lines.append(f" [{finding.priority}] {finding.issue}")
                if finding.recommendation:
                    lines.append(f"{finding.recommendation}")
        lines.extend(["", "=" * 70])
        return "\n".join(lines)
def main():
    """CLI entry point: parse arguments, run the audit, emit the chosen output."""
    parser = argparse.ArgumentParser(
        description="Run comprehensive SEO audit",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run full audit and output to console
  python full_audit.py --url https://example.com
  # Export to Notion
  python full_audit.py --url https://example.com --output notion --notion-page-id abc123
  # Output as JSON
  python full_audit.py --url https://example.com --json
""",
    )
    parser.add_argument("--url", "-u", required=True, help="URL to audit")
    parser.add_argument("--output", "-o", choices=["console", "notion", "json"],
                        default="console", help="Output format")
    parser.add_argument("--notion-page-id", help="Notion parent page ID (required for notion output)")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument("--no-robots", action="store_true", help="Skip robots.txt check")
    parser.add_argument("--no-sitemap", action="store_true", help="Skip sitemap validation")
    parser.add_argument("--no-schema", action="store_true", help="Skip schema validation")
    parser.add_argument("--no-performance", action="store_true", help="Skip PageSpeed analysis")
    args = parser.parse_args()

    auditor = SEOAuditor()
    # Run audit with the checks the user did not explicitly disable
    result = auditor.run_audit(
        args.url,
        include_robots=not args.no_robots,
        include_sitemap=not args.no_sitemap,
        include_schema=not args.no_schema,
        include_performance=not args.no_performance,
    )

    # Output results
    if args.json or args.output == "json":
        print(json.dumps(result.to_dict(), indent=2, default=str))
    elif args.output == "notion":
        if not args.notion_page_id:
            parser.error("--notion-page-id required for notion output")
        notion_result = auditor.export_to_notion(result, args.notion_page_id)
        print("Exported to Notion:")
        print(f" Database ID: {notion_result['database_id']}")
        # .get(): the default-database export path may not create a summary
        # page, so the key can be absent or None.
        print(f" Summary Page: {notion_result.get('summary_page_id') or 'N/A'}")
        print(f" Findings Created: {notion_result['findings_created']}")
    else:
        print(auditor.generate_report(result))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,409 @@
"""
Google Search Console Client
============================
Purpose: Interact with Google Search Console API for SEO data
Python: 3.10+
Usage:
from gsc_client import SearchConsoleClient
client = SearchConsoleClient()
data = client.get_search_analytics("sc-domain:example.com")
"""
import logging
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import Any
from google.oauth2 import service_account
from googleapiclient.discovery import build
from base_client import config
# Module-level logging setup.
# NOTE(review): basicConfig at import time configures the process-wide root
# logger — confirm this is intended when this module is used as a library.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
@dataclass
class SearchAnalyticsResult:
    """Search-analytics query result plus aggregate statistics.

    ``rows`` holds the raw API rows; the totals/averages are computed by
    the client over those rows.
    """

    rows: list[dict] = field(default_factory=list)
    total_clicks: int = 0
    total_impressions: int = 0
    average_ctr: float = 0.0
    average_position: float = 0.0
@dataclass
class SitemapInfo:
"""Sitemap information from Search Console."""
path: str
last_submitted: str | None = None
last_downloaded: str | None = None
is_pending: bool = False
is_sitemaps_index: bool = False
warnings: int = 0
errors: int = 0
class SearchConsoleClient:
    """Client for Google Search Console API.

    Authenticates with a service-account key and exposes search analytics,
    sitemap and URL-inspection helpers on the searchconsole v1 service.
    """

    # NOTE(review): submit_sitemap performs a write; the read-only scope
    # declared here may cause it to be rejected — confirm the required scope.
    SCOPES = ["https://www.googleapis.com/auth/webmasters.readonly"]

    def __init__(self, credentials_path: str | None = None):
        """
        Initialize Search Console client.

        Args:
            credentials_path: Path to service account JSON key; defaults to
                the path discovered by the shared ConfigManager.
        """
        self.credentials_path = credentials_path or config.google_credentials_path
        self._service = None  # built lazily on first use

    @property
    def service(self):
        """Get or create the Search Console service (lazy, cached)."""
        if self._service is None:
            if not self.credentials_path:
                raise ValueError(
                    "Google credentials not configured. "
                    "Set GOOGLE_APPLICATION_CREDENTIALS environment variable."
                )
            credentials = service_account.Credentials.from_service_account_file(
                self.credentials_path,
                scopes=self.SCOPES,
            )
            self._service = build("searchconsole", "v1", credentials=credentials)
        return self._service

    def list_sites(self) -> list[dict]:
        """List all sites accessible to the service account."""
        response = self.service.sites().list().execute()
        return response.get("siteEntry", [])

    def get_search_analytics(
        self,
        site_url: str,
        start_date: str | None = None,
        end_date: str | None = None,
        dimensions: list[str] | None = None,
        row_limit: int = 25000,
        filters: list[dict] | None = None,
    ) -> SearchAnalyticsResult:
        """
        Get search analytics data.

        Args:
            site_url: Site URL (e.g., "sc-domain:example.com" or "https://example.com/")
            start_date: Start date (YYYY-MM-DD), defaults to 30 days ago
            end_date: End date (YYYY-MM-DD), defaults to yesterday
            dimensions: List of dimensions (query, page, country, device, date)
            row_limit: Maximum rows to return
            filters: Dimension filters

        Returns:
            SearchAnalyticsResult with rows and summary stats.

        Note:
            average_ctr and average_position are simple (unweighted) means
            over the returned rows, not impression-weighted aggregates.
        """
        # Default date range: last 30 days, ending yesterday (today's data
        # is typically incomplete in Search Console).
        if not end_date:
            end_date = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
        if not start_date:
            start_date = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d")
        # Default dimensions
        if dimensions is None:
            dimensions = ["query", "page"]

        request_body = {
            "startDate": start_date,
            "endDate": end_date,
            "dimensions": dimensions,
            "rowLimit": row_limit,
        }
        if filters:
            request_body["dimensionFilterGroups"] = [{"filters": filters}]

        try:
            response = self.service.searchanalytics().query(
                siteUrl=site_url,
                body=request_body,
            ).execute()
        except Exception as e:
            logger.error(f"Failed to query search analytics: {e}")
            raise

        rows = response.get("rows", [])
        # Aggregate totals across all returned rows
        total_clicks = sum(row.get("clicks", 0) for row in rows)
        total_impressions = sum(row.get("impressions", 0) for row in rows)
        total_ctr = sum(row.get("ctr", 0) for row in rows)
        total_position = sum(row.get("position", 0) for row in rows)
        avg_ctr = total_ctr / len(rows) if rows else 0
        avg_position = total_position / len(rows) if rows else 0
        return SearchAnalyticsResult(
            rows=rows,
            total_clicks=total_clicks,
            total_impressions=total_impressions,
            average_ctr=avg_ctr,
            average_position=avg_position,
        )

    def _get_top_by_dimension(
        self,
        site_url: str,
        dimension: str,
        limit: int,
        start_date: str | None,
        end_date: str | None,
    ) -> list[dict]:
        """Shared implementation for get_top_queries / get_top_pages.

        Fetches analytics for a single dimension, sorts rows by clicks
        (descending) and flattens them into dicts keyed by the dimension name.
        """
        result = self.get_search_analytics(
            site_url=site_url,
            dimensions=[dimension],
            row_limit=limit,
            start_date=start_date,
            end_date=end_date,
        )
        sorted_rows = sorted(
            result.rows,
            key=lambda x: x.get("clicks", 0),
            reverse=True,
        )
        return [
            {
                dimension: row["keys"][0],
                "clicks": row.get("clicks", 0),
                "impressions": row.get("impressions", 0),
                "ctr": row.get("ctr", 0),
                "position": row.get("position", 0),
            }
            for row in sorted_rows[:limit]
        ]

    def get_top_queries(
        self,
        site_url: str,
        limit: int = 100,
        start_date: str | None = None,
        end_date: str | None = None,
    ) -> list[dict]:
        """Get top search queries by clicks (dicts keyed by "query")."""
        return self._get_top_by_dimension(site_url, "query", limit, start_date, end_date)

    def get_top_pages(
        self,
        site_url: str,
        limit: int = 100,
        start_date: str | None = None,
        end_date: str | None = None,
    ) -> list[dict]:
        """Get top pages by clicks (dicts keyed by "page")."""
        return self._get_top_by_dimension(site_url, "page", limit, start_date, end_date)

    def get_sitemaps(self, site_url: str) -> list[SitemapInfo]:
        """Get list of sitemaps registered for a site."""
        try:
            response = self.service.sitemaps().list(siteUrl=site_url).execute()
        except Exception as e:
            logger.error(f"Failed to get sitemaps: {e}")
            raise
        sitemaps = []
        for sm in response.get("sitemap", []):
            sitemaps.append(SitemapInfo(
                path=sm.get("path", ""),
                last_submitted=sm.get("lastSubmitted"),
                last_downloaded=sm.get("lastDownloaded"),
                is_pending=sm.get("isPending", False),
                is_sitemaps_index=sm.get("isSitemapsIndex", False),
                warnings=sm.get("warnings", 0),
                errors=sm.get("errors", 0),
            ))
        return sitemaps

    def submit_sitemap(self, site_url: str, sitemap_url: str) -> bool:
        """Submit a sitemap for indexing.

        Returns:
            True on success, False on any failure (error is logged, not raised).
        """
        try:
            self.service.sitemaps().submit(
                siteUrl=site_url,
                feedpath=sitemap_url,
            ).execute()
            logger.info(f"Submitted sitemap: {sitemap_url}")
            return True
        except Exception as e:
            logger.error(f"Failed to submit sitemap: {e}")
            return False

    def inspect_url(self, site_url: str, inspection_url: str) -> dict:
        """
        Inspect a URL's indexing status.

        Note: This uses the URL Inspection API which may have different quotas.

        Returns:
            Flat dict with indexing state, crawl info, mobile usability and
            rich-results verdicts (missing fields default to "Unknown"/None).
        """
        try:
            response = self.service.urlInspection().index().inspect(
                body={
                    "inspectionUrl": inspection_url,
                    "siteUrl": site_url,
                }
            ).execute()
            result = response.get("inspectionResult", {})
            return {
                "url": inspection_url,
                "indexing_state": result.get("indexStatusResult", {}).get(
                    "coverageState", "Unknown"
                ),
                "last_crawl_time": result.get("indexStatusResult", {}).get(
                    "lastCrawlTime"
                ),
                "crawled_as": result.get("indexStatusResult", {}).get("crawledAs"),
                "robots_txt_state": result.get("indexStatusResult", {}).get(
                    "robotsTxtState"
                ),
                "mobile_usability": result.get("mobileUsabilityResult", {}).get(
                    "verdict", "Unknown"
                ),
                "rich_results": result.get("richResultsResult", {}).get(
                    "verdict", "Unknown"
                ),
            }
        except Exception as e:
            logger.error(f"Failed to inspect URL: {e}")
            raise

    def get_performance_summary(
        self,
        site_url: str,
        days: int = 30,
    ) -> dict:
        """Get a summary of search performance over the last ``days`` days.

        Combines overall totals, top queries/pages and a per-device breakdown
        into a single dict (four API calls).
        """
        end_date = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
        start_date = (datetime.now() - timedelta(days=days)).strftime("%Y-%m-%d")
        # Overall stats (no dimensions → one aggregate row)
        overall = self.get_search_analytics(
            site_url=site_url,
            dimensions=[],
            start_date=start_date,
            end_date=end_date,
        )
        # Top queries
        top_queries = self.get_top_queries(
            site_url=site_url,
            limit=10,
            start_date=start_date,
            end_date=end_date,
        )
        # Top pages
        top_pages = self.get_top_pages(
            site_url=site_url,
            limit=10,
            start_date=start_date,
            end_date=end_date,
        )
        # Per-device breakdown
        by_device = self.get_search_analytics(
            site_url=site_url,
            dimensions=["device"],
            start_date=start_date,
            end_date=end_date,
        )
        device_breakdown = {}
        for row in by_device.rows:
            device = row["keys"][0]
            device_breakdown[device] = {
                "clicks": row.get("clicks", 0),
                "impressions": row.get("impressions", 0),
                "ctr": row.get("ctr", 0),
                "position": row.get("position", 0),
            }
        return {
            "period": f"{start_date} to {end_date}",
            "total_clicks": overall.total_clicks,
            "total_impressions": overall.total_impressions,
            "average_ctr": overall.average_ctr,
            "average_position": overall.average_position,
            "top_queries": top_queries,
            "top_pages": top_pages,
            "by_device": device_breakdown,
        }
def main():
    """CLI entry point for ad-hoc Search Console queries."""
    import argparse
    import json  # hoisted: used by both "summary" and "inspect" actions

    parser = argparse.ArgumentParser(description="Google Search Console Client")
    parser.add_argument("--site", "-s", required=True, help="Site URL")
    parser.add_argument("--action", "-a", default="summary",
                        choices=["summary", "queries", "pages", "sitemaps", "inspect"],
                        help="Action to perform")
    parser.add_argument("--url", help="URL to inspect")
    parser.add_argument("--days", type=int, default=30, help="Days of data")
    args = parser.parse_args()

    client = SearchConsoleClient()
    if args.action == "summary":
        summary = client.get_performance_summary(args.site, args.days)
        print(json.dumps(summary, indent=2, default=str))
    elif args.action == "queries":
        queries = client.get_top_queries(args.site)
        for q in queries[:20]:
            print(f"{q['query']}: {q['clicks']} clicks, pos {q['position']:.1f}")
    elif args.action == "pages":
        pages = client.get_top_pages(args.site)
        for p in pages[:20]:
            print(f"{p['page']}: {p['clicks']} clicks, pos {p['position']:.1f}")
    elif args.action == "sitemaps":
        sitemaps = client.get_sitemaps(args.site)
        for sm in sitemaps:
            print(f"{sm.path}: errors={sm.errors}, warnings={sm.warnings}")
    elif args.action == "inspect":
        # Previously 'inspect' without --url fell through and printed nothing;
        # fail loudly instead.
        if not args.url:
            parser.error("--url is required for the 'inspect' action")
        result = client.inspect_url(args.site, args.url)
        print(json.dumps(result, indent=2))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,951 @@
"""
Notion Reporter - Create SEO audit findings in Notion
=====================================================
Purpose: Output SEO audit findings to Notion databases
Python: 3.10+
Usage:
from notion_reporter import NotionReporter, SEOFinding, AuditReport
reporter = NotionReporter()
# Create audit report with checklist table
report = AuditReport(site="https://example.com")
report.add_finding(SEOFinding(...))
reporter.create_audit_report(report)
"""
import json
import logging
import os
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any
from notion_client import Client
from base_client import config
# Module-level logging setup.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Template directory (sibling "templates/" folder one level above this file).
TEMPLATE_DIR = Path(__file__).parent.parent / "templates"

# Default OurDigital SEO Audit Log database
DEFAULT_DATABASE_ID = "2c8581e5-8a1e-8035-880b-e38cefc2f3ef"

# Default parent page for audit reports (OurDigital SEO Audit Log)
# NOTE(review): this id is identical to DEFAULT_DATABASE_ID — confirm the
# audit-reports parent page really shares the database's id.
DEFAULT_AUDIT_REPORTS_PAGE_ID = "2c8581e5-8a1e-8035-880b-e38cefc2f3ef"
@dataclass
class SEOFinding:
"""Represents an SEO audit finding."""
issue: str
category: str
priority: str
status: str = "To Fix"
url: str | None = None
description: str | None = None
impact: str | None = None
recommendation: str | None = None
site: str | None = None # The audited site URL
audit_id: str | None = None # Groups findings from same audit session
affected_urls: list[str] = field(default_factory=list) # List of all affected URLs
@dataclass
class AuditReport:
    """Represents a complete SEO audit report with checklist.

    Collects SEOFinding objects plus the outcome of the top-level audit
    checks; NotionReporter renders it into database pages.
    """

    site: str
    audit_id: str = field(default_factory=lambda: datetime.now().strftime("%Y%m%d-%H%M%S"))
    audit_date: datetime = field(default_factory=datetime.now)
    findings: list[SEOFinding] = field(default_factory=list)
    # Audit check results
    robots_txt_status: str = "Not checked"
    sitemap_status: str = "Not checked"
    schema_status: str = "Not checked"
    performance_status: str = "Not checked"
    # Summary statistics
    total_urls_checked: int = 0
    total_issues: int = 0

    def add_finding(self, finding: SEOFinding) -> None:
        """Attach *finding* to this report, stamping its site and audit id."""
        domain = self.site.replace('https://', '').replace('http://', '').split('/')[0]
        finding.site = self.site
        finding.audit_id = f"{domain}-{self.audit_id}"
        self.findings.append(finding)
        self.total_issues = len(self.findings)

    def get_findings_by_priority(self) -> dict[str, list[SEOFinding]]:
        """Group findings by priority; unknown priorities are dropped."""
        return {
            level: [f for f in self.findings if f.priority == level]
            for level in ("Critical", "High", "Medium", "Low")
        }

    def get_findings_by_category(self) -> dict[str, list[SEOFinding]]:
        """Group findings by their category string (insertion-ordered)."""
        grouped: dict[str, list[SEOFinding]] = {}
        for f in self.findings:
            grouped.setdefault(f.category, []).append(f)
        return grouped
class NotionReporter:
    """Create and manage SEO audit findings in Notion.

    Wraps the official Notion client to create findings databases,
    individual finding pages, and audit summary/checklist pages.
    """

    # Valid SEOFinding.category values; also used as database select options.
    CATEGORIES = [
        "Technical SEO",
        "On-page SEO",
        "Content",
        "Local SEO",
        "Performance",
        "Schema/Structured Data",
        "Sitemap",
        "Robots.txt",
    ]
    # Valid SEOFinding.priority values.
    PRIORITIES = ["Critical", "High", "Medium", "Low"]
    # Valid workflow states for a finding (enforced by update_finding_status).
    STATUSES = ["To Fix", "In Progress", "Fixed", "Monitoring"]
    # Notion select-option color per category.
    CATEGORY_COLORS = {
        "Technical SEO": "blue",
        "On-page SEO": "green",
        "Content": "purple",
        "Local SEO": "orange",
        "Performance": "red",
        "Schema/Structured Data": "yellow",
        "Sitemap": "pink",
        "Robots.txt": "gray",
    }
    # Notion select-option color per priority.
    PRIORITY_COLORS = {
        "Critical": "red",
        "High": "orange",
        "Medium": "yellow",
        "Low": "gray",
    }
def __init__(self, token: str | None = None):
"""
Initialize Notion reporter.
Args:
token: Notion API token
"""
self.token = token or config.notion_token
if not self.token:
raise ValueError(
"Notion token not configured. "
"Set NOTION_TOKEN or NOTION_API_KEY environment variable."
)
self.client = Client(auth=self.token)
def create_findings_database(
self,
parent_page_id: str,
title: str = "SEO Audit Findings",
) -> str:
"""
Create a new SEO findings database.
Args:
parent_page_id: Parent page ID for the database
title: Database title
Returns:
Database ID
"""
# Build database schema
properties = {
"Issue": {"title": {}},
"Category": {
"select": {
"options": [
{"name": cat, "color": self.CATEGORY_COLORS.get(cat, "default")}
for cat in self.CATEGORIES
]
}
},
"Priority": {
"select": {
"options": [
{"name": pri, "color": self.PRIORITY_COLORS.get(pri, "default")}
for pri in self.PRIORITIES
]
}
},
"Status": {
"status": {
"options": [
{"name": "To Fix", "color": "red"},
{"name": "In Progress", "color": "yellow"},
{"name": "Fixed", "color": "green"},
{"name": "Monitoring", "color": "blue"},
],
"groups": [
{"name": "To-do", "option_ids": [], "color": "gray"},
{"name": "In progress", "option_ids": [], "color": "blue"},
{"name": "Complete", "option_ids": [], "color": "green"},
],
}
},
"URL": {"url": {}},
"Description": {"rich_text": {}},
"Impact": {"rich_text": {}},
"Recommendation": {"rich_text": {}},
"Found Date": {"date": {}},
}
try:
response = self.client.databases.create(
parent={"page_id": parent_page_id},
title=[{"type": "text", "text": {"content": title}}],
properties=properties,
)
database_id = response["id"]
logger.info(f"Created database: {database_id}")
return database_id
except Exception as e:
logger.error(f"Failed to create database: {e}")
raise
def add_finding(
self,
finding: SEOFinding,
database_id: str | None = None,
) -> str:
"""
Add a finding to the database with page content.
Args:
finding: SEOFinding object
database_id: Target database ID (defaults to OurDigital SEO Audit Log)
Returns:
Page ID of created entry
"""
db_id = database_id or DEFAULT_DATABASE_ID
# Database properties (metadata)
properties = {
"Issue": {"title": [{"text": {"content": finding.issue}}]},
"Category": {"select": {"name": finding.category}},
"Priority": {"select": {"name": finding.priority}},
"Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
}
if finding.url:
properties["URL"] = {"url": finding.url}
if finding.site:
properties["Site"] = {"url": finding.site}
if finding.audit_id:
properties["Audit ID"] = {
"rich_text": [{"text": {"content": finding.audit_id}}]
}
# Page content blocks (Description, Impact, Recommendation)
children = []
if finding.description:
children.extend([
{
"object": "block",
"type": "heading_2",
"heading_2": {
"rich_text": [{"type": "text", "text": {"content": "Description"}}]
}
},
{
"object": "block",
"type": "paragraph",
"paragraph": {
"rich_text": [{"type": "text", "text": {"content": finding.description}}]
}
}
])
if finding.impact:
children.extend([
{
"object": "block",
"type": "heading_2",
"heading_2": {
"rich_text": [{"type": "text", "text": {"content": "Impact"}}]
}
},
{
"object": "block",
"type": "callout",
"callout": {
"rich_text": [{"type": "text", "text": {"content": finding.impact}}],
"icon": {"type": "emoji", "emoji": "⚠️"}
}
}
])
if finding.recommendation:
children.extend([
{
"object": "block",
"type": "heading_2",
"heading_2": {
"rich_text": [{"type": "text", "text": {"content": "Recommendation"}}]
}
},
{
"object": "block",
"type": "callout",
"callout": {
"rich_text": [{"type": "text", "text": {"content": finding.recommendation}}],
"icon": {"type": "emoji", "emoji": "💡"}
}
}
])
try:
response = self.client.pages.create(
parent={"database_id": db_id},
properties=properties,
children=children if children else None,
)
return response["id"]
except Exception as e:
logger.error(f"Failed to add finding: {e}")
raise
def add_findings_batch(
self,
findings: list[SEOFinding],
database_id: str | None = None,
) -> list[str]:
"""
Add multiple findings to the database.
Args:
findings: List of SEOFinding objects
database_id: Target database ID (defaults to OurDigital SEO Audit Log)
Returns:
List of created page IDs
"""
page_ids = []
for finding in findings:
try:
page_id = self.add_finding(finding, database_id)
page_ids.append(page_id)
except Exception as e:
logger.error(f"Failed to add finding '{finding.issue}': {e}")
return page_ids
def create_audit_summary_page(
self,
parent_page_id: str,
url: str,
summary: dict,
) -> str:
"""
Create a summary page for the audit.
Args:
parent_page_id: Parent page ID
url: Audited URL
summary: Audit summary data
Returns:
Page ID
"""
# Build page content
children = [
{
"object": "block",
"type": "heading_1",
"heading_1": {
"rich_text": [{"type": "text", "text": {"content": f"SEO Audit: {url}"}}]
},
},
{
"object": "block",
"type": "paragraph",
"paragraph": {
"rich_text": [
{
"type": "text",
"text": {"content": f"Audit Date: {datetime.now().strftime('%Y-%m-%d %H:%M')}"},
}
]
},
},
{
"object": "block",
"type": "divider",
"divider": {},
},
{
"object": "block",
"type": "heading_2",
"heading_2": {
"rich_text": [{"type": "text", "text": {"content": "Summary"}}]
},
},
]
# Add summary statistics
if "stats" in summary:
stats = summary["stats"]
stats_text = "\n".join([f"{k}: {v}" for k, v in stats.items()])
children.append({
"object": "block",
"type": "paragraph",
"paragraph": {
"rich_text": [{"type": "text", "text": {"content": stats_text}}]
},
})
# Add findings by priority
if "findings_by_priority" in summary:
children.append({
"object": "block",
"type": "heading_2",
"heading_2": {
"rich_text": [{"type": "text", "text": {"content": "Findings by Priority"}}]
},
})
for priority, count in summary["findings_by_priority"].items():
children.append({
"object": "block",
"type": "bulleted_list_item",
"bulleted_list_item": {
"rich_text": [{"type": "text", "text": {"content": f"{priority}: {count}"}}]
},
})
try:
response = self.client.pages.create(
parent={"page_id": parent_page_id},
properties={
"title": {"title": [{"text": {"content": f"SEO Audit - {url}"}}]}
},
children=children,
)
return response["id"]
except Exception as e:
logger.error(f"Failed to create summary page: {e}")
raise
def query_findings(
self,
database_id: str,
category: str | None = None,
priority: str | None = None,
status: str | None = None,
) -> list[dict]:
"""
Query findings from database.
Args:
database_id: Database ID
category: Filter by category
priority: Filter by priority
status: Filter by status
Returns:
List of finding records
"""
filters = []
if category:
filters.append({
"property": "Category",
"select": {"equals": category},
})
if priority:
filters.append({
"property": "Priority",
"select": {"equals": priority},
})
if status:
filters.append({
"property": "Status",
"status": {"equals": status},
})
query_params = {"database_id": database_id}
if filters:
if len(filters) == 1:
query_params["filter"] = filters[0]
else:
query_params["filter"] = {"and": filters}
try:
response = self.client.databases.query(**query_params)
return response.get("results", [])
except Exception as e:
logger.error(f"Failed to query findings: {e}")
raise
def update_finding_status(
self,
page_id: str,
status: str,
) -> None:
"""Update the status of a finding."""
if status not in self.STATUSES:
raise ValueError(f"Invalid status: {status}")
try:
self.client.pages.update(
page_id=page_id,
properties={"Status": {"status": {"name": status}}},
)
logger.info(f"Updated finding {page_id} to {status}")
except Exception as e:
logger.error(f"Failed to update status: {e}")
raise
def create_audit_report(
self,
report: "AuditReport",
database_id: str | None = None,
) -> dict:
"""
Create a comprehensive audit report page with checklist table.
This creates:
1. Individual finding pages in the database
2. A summary page with all findings in table format for checklist tracking
Args:
report: AuditReport object with all findings
database_id: Target database ID (defaults to OurDigital SEO Audit Log)
Returns:
Dict with summary_page_id and finding_page_ids
"""
db_id = database_id or DEFAULT_DATABASE_ID
# Generate full audit ID
site_domain = report.site.replace('https://', '').replace('http://', '').split('/')[0]
full_audit_id = f"{site_domain}-{report.audit_id}"
result = {
"audit_id": full_audit_id,
"site": report.site,
"summary_page_id": None,
"finding_page_ids": [],
}
# 1. Create individual finding pages in database
logger.info(f"Creating {len(report.findings)} finding pages...")
for finding in report.findings:
finding.audit_id = full_audit_id
finding.site = report.site
try:
page_id = self.add_finding(finding, db_id)
result["finding_page_ids"].append(page_id)
except Exception as e:
logger.error(f"Failed to add finding '{finding.issue}': {e}")
# 2. Create summary page with checklist table
logger.info("Creating audit summary page with checklist...")
summary_page_id = self._create_audit_summary_with_table(report, full_audit_id, db_id)
result["summary_page_id"] = summary_page_id
logger.info(f"Audit report created: {full_audit_id}")
return result
    def _create_audit_summary_with_table(
        self,
        report: "AuditReport",
        audit_id: str,
        database_id: str,
    ) -> str:
        """
        Create audit summary page with checklist table format.

        The page is stored as a regular entry in the findings database and
        contains, in order: an info callout, an audit-status table, a
        findings-overview table, and a per-priority to-do checklist with
        per-finding details.

        Args:
            report: AuditReport object
            audit_id: Full audit ID
            database_id: Parent database ID

        Returns:
            Summary page ID

        Raises:
            Exception: Propagated from the Notion API after logging.
        """
        site_domain = report.site.replace('https://', '').replace('http://', '').split('/')[0]
        # Build page content blocks
        children = []
        # Header with audit info
        children.append({
            "object": "block",
            "type": "callout",
            "callout": {
                "rich_text": [
                    {"type": "text", "text": {"content": f"Audit ID: {audit_id}\n"}},
                    {"type": "text", "text": {"content": f"Date: {report.audit_date.strftime('%Y-%m-%d %H:%M')}\n"}},
                    {"type": "text", "text": {"content": f"Total Issues: {report.total_issues}"}},
                ],
                "icon": {"type": "emoji", "emoji": "📋"},
                "color": "blue_background",
            }
        })
        # Audit Status Summary
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {
                "rich_text": [{"type": "text", "text": {"content": "Audit Status"}}]
            }
        })
        # Status table: one row per top-level audit check.
        status_table = {
            "object": "block",
            "type": "table",
            "table": {
                "table_width": 2,
                "has_column_header": True,
                "has_row_header": False,
                "children": [
                    {
                        "type": "table_row",
                        "table_row": {
                            "cells": [
                                [{"type": "text", "text": {"content": "Check"}}],
                                [{"type": "text", "text": {"content": "Status"}}],
                            ]
                        }
                    },
                    {
                        "type": "table_row",
                        "table_row": {
                            "cells": [
                                [{"type": "text", "text": {"content": "Robots.txt"}}],
                                [{"type": "text", "text": {"content": report.robots_txt_status}}],
                            ]
                        }
                    },
                    {
                        "type": "table_row",
                        "table_row": {
                            "cells": [
                                [{"type": "text", "text": {"content": "Sitemap"}}],
                                [{"type": "text", "text": {"content": report.sitemap_status}}],
                            ]
                        }
                    },
                    {
                        "type": "table_row",
                        "table_row": {
                            "cells": [
                                [{"type": "text", "text": {"content": "Schema Markup"}}],
                                [{"type": "text", "text": {"content": report.schema_status}}],
                            ]
                        }
                    },
                    {
                        "type": "table_row",
                        "table_row": {
                            "cells": [
                                [{"type": "text", "text": {"content": "Performance"}}],
                                [{"type": "text", "text": {"content": report.performance_status}}],
                            ]
                        }
                    },
                ]
            }
        }
        children.append(status_table)
        # Divider
        children.append({"object": "block", "type": "divider", "divider": {}})
        # Findings Checklist Header
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {
                "rich_text": [{"type": "text", "text": {"content": "Findings Checklist"}}]
            }
        })
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {
                "rich_text": [{"type": "text", "text": {"content": "Use this checklist to track fixes. Check off items as you complete them."}}]
            }
        })
        # Create findings table with checklist format
        if report.findings:
            # Build table rows - Header row (bold column labels)
            table_rows = [
                {
                    "type": "table_row",
                    "table_row": {
                        "cells": [
                            [{"type": "text", "text": {"content": "#"}, "annotations": {"bold": True}}],
                            [{"type": "text", "text": {"content": "Priority"}, "annotations": {"bold": True}}],
                            [{"type": "text", "text": {"content": "Category"}, "annotations": {"bold": True}}],
                            [{"type": "text", "text": {"content": "Issue"}, "annotations": {"bold": True}}],
                            [{"type": "text", "text": {"content": "URL"}, "annotations": {"bold": True}}],
                        ]
                    }
                }
            ]
            # Add finding rows
            for idx, finding in enumerate(report.findings, 1):
                # Truncate long text for table cells
                issue_text = finding.issue[:50] + "..." if len(finding.issue) > 50 else finding.issue
                url_text = finding.url[:40] + "..." if finding.url and len(finding.url) > 40 else (finding.url or "-")
                table_rows.append({
                    "type": "table_row",
                    "table_row": {
                        "cells": [
                            [{"type": "text", "text": {"content": str(idx)}}],
                            [{"type": "text", "text": {"content": finding.priority}}],
                            [{"type": "text", "text": {"content": finding.category}}],
                            [{"type": "text", "text": {"content": issue_text}}],
                            [{"type": "text", "text": {"content": url_text}}],
                        ]
                    }
                })
            findings_table = {
                "object": "block",
                "type": "table",
                "table": {
                    "table_width": 5,
                    "has_column_header": True,
                    "has_row_header": False,
                    "children": table_rows
                }
            }
            children.append(findings_table)
        # Divider
        children.append({"object": "block", "type": "divider", "divider": {}})
        # Detailed Findings with To-Do checkboxes
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {
                "rich_text": [{"type": "text", "text": {"content": "Detailed Findings & Actions"}}]
            }
        })
        # Group findings by priority and add as to-do items
        for priority in ["Critical", "High", "Medium", "Low"]:
            priority_findings = [f for f in report.findings if f.priority == priority]
            if not priority_findings:
                continue
            # Priority header with emoji
            # NOTE(review): "Low" maps to an empty string — possibly a lost
            # emoji glyph; confirm intended.
            priority_emoji = {"Critical": "🔴", "High": "🟠", "Medium": "🟡", "Low": ""}
            children.append({
                "object": "block",
                "type": "heading_3",
                "heading_3": {
                    "rich_text": [{"type": "text", "text": {"content": f"{priority_emoji.get(priority, '')} {priority} Priority ({len(priority_findings)})"}}]
                }
            })
            # Add each finding as a to-do item with details
            for finding in priority_findings:
                # Main to-do item
                children.append({
                    "object": "block",
                    "type": "to_do",
                    "to_do": {
                        "rich_text": [
                            {"type": "text", "text": {"content": f"[{finding.category}] "}, "annotations": {"bold": True}},
                            {"type": "text", "text": {"content": finding.issue}},
                        ],
                        "checked": False,
                    }
                })
                # URL if available
                if finding.url:
                    children.append({
                        "object": "block",
                        "type": "bulleted_list_item",
                        "bulleted_list_item": {
                            "rich_text": [
                                {"type": "text", "text": {"content": "URL: "}},
                                {"type": "text", "text": {"content": finding.url, "link": {"url": finding.url}}},
                            ]
                        }
                    })
                # Affected URLs list if available (collapsed into a toggle)
                if finding.affected_urls:
                    children.append({
                        "object": "block",
                        "type": "toggle",
                        "toggle": {
                            "rich_text": [{"type": "text", "text": {"content": f"Affected URLs ({len(finding.affected_urls)})"}}],
                            "children": [
                                {
                                    "object": "block",
                                    "type": "bulleted_list_item",
                                    "bulleted_list_item": {
                                        "rich_text": [{"type": "text", "text": {"content": url, "link": {"url": url} if url.startswith("http") else None}}]
                                    }
                                }
                                for url in finding.affected_urls[:20]  # Limit to 20 URLs
                            ] + ([{
                                "object": "block",
                                "type": "paragraph",
                                "paragraph": {
                                    "rich_text": [{"type": "text", "text": {"content": f"... and {len(finding.affected_urls) - 20} more URLs"}}]
                                }
                            }] if len(finding.affected_urls) > 20 else [])
                        }
                    })
                # Recommendation as sub-item
                if finding.recommendation:
                    children.append({
                        "object": "block",
                        "type": "bulleted_list_item",
                        "bulleted_list_item": {
                            "rich_text": [
                                {"type": "text", "text": {"content": "💡 "}, "annotations": {"bold": True}},
                                {"type": "text", "text": {"content": finding.recommendation}},
                            ]
                        }
                    })
        # Create the summary page
        try:
            response = self.client.pages.create(
                parent={"database_id": database_id},
                properties={
                    "Issue": {"title": [{"text": {"content": f"📊 Audit Report: {site_domain}"}}]},
                    "Category": {"select": {"name": "Technical SEO"}},
                    "Priority": {"select": {"name": "High"}},
                    "Site": {"url": report.site},
                    "Audit ID": {"rich_text": [{"text": {"content": audit_id}}]},
                    "Found Date": {"date": {"start": report.audit_date.strftime("%Y-%m-%d")}},
                },
                children=children,
            )
            logger.info(f"Created audit summary page: {response['id']}")
            return response["id"]
        except Exception as e:
            logger.error(f"Failed to create audit summary page: {e}")
            raise
def create_quick_audit_report(
self,
site: str,
findings: list[SEOFinding],
robots_status: str = "Not checked",
sitemap_status: str = "Not checked",
schema_status: str = "Not checked",
performance_status: str = "Not checked",
database_id: str | None = None,
) -> dict:
"""
Quick method to create audit report from a list of findings.
Args:
site: Site URL
findings: List of SEOFinding objects
robots_status: Robots.txt check result
sitemap_status: Sitemap check result
schema_status: Schema check result
performance_status: Performance check result
database_id: Target database ID
Returns:
Dict with audit results
"""
report = AuditReport(site=site)
report.robots_txt_status = robots_status
report.sitemap_status = sitemap_status
report.schema_status = schema_status
report.performance_status = performance_status
for finding in findings:
report.add_finding(finding)
return self.create_audit_report(report, database_id)
def main():
    """CLI entry point for testing.

    Actions:
        create-db: create a findings database under --parent-id
        add-finding: add a sample finding to --database-id
        query: list the first findings in --database-id
    """
    import argparse

    parser = argparse.ArgumentParser(description="Notion SEO Reporter")
    parser.add_argument("--action", "-a", required=True,
                        choices=["create-db", "add-finding", "query"],
                        help="Action to perform")
    parser.add_argument("--parent-id", "-p", help="Parent page ID")
    parser.add_argument("--database-id", "-d", help="Database ID")
    parser.add_argument("--title", "-t", default="SEO Audit Findings",
                        help="Database title")
    args = parser.parse_args()

    reporter = NotionReporter()
    if args.action == "create-db":
        if not args.parent_id:
            parser.error("--parent-id required for create-db")
        db_id = reporter.create_findings_database(args.parent_id, args.title)
        print(f"Created database: {db_id}")
    elif args.action == "add-finding":
        if not args.database_id:
            parser.error("--database-id required for add-finding")
        # Example finding
        finding = SEOFinding(
            issue="Missing meta description",
            category="On-page SEO",
            priority="Medium",
            url="https://example.com/page",
            description="Page is missing meta description tag",
            impact="May affect CTR in search results",
            recommendation="Add unique meta description under 160 characters",
        )
        # Fixed: arguments were swapped — the signature is
        # add_finding(finding, database_id=None).
        page_id = reporter.add_finding(finding, args.database_id)
        print(f"Created finding: {page_id}")
    elif args.action == "query":
        if not args.database_id:
            parser.error("--database-id required for query")
        findings = reporter.query_findings(args.database_id)
        print(f"Found {len(findings)} findings")
        for f in findings[:5]:
            title = f["properties"]["Issue"]["title"]
            if title:
                print(f" - {title[0]['plain_text']}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,569 @@
"""
Page Analyzer - Extract SEO metadata from web pages
===================================================
Purpose: Comprehensive page-level SEO data extraction
Python: 3.10+
Usage:
from page_analyzer import PageAnalyzer, PageMetadata
analyzer = PageAnalyzer()
metadata = analyzer.analyze_url("https://example.com/page")
"""
import json
import logging
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
# Module-wide logging: timestamped INFO-level messages.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
@dataclass
class LinkData:
    """Represents a link found on a page."""

    url: str          # Absolute URL (relative hrefs resolved by the extractor)
    anchor_text: str  # Visible anchor text (truncated by the extractor)
    is_internal: bool  # True when the link stays on the audited domain
    is_nofollow: bool = False  # True when rel contains "nofollow"
    link_type: str = "body"  # body, nav, footer, etc.
@dataclass
class HeadingData:
    """Represents a heading found on a page."""

    level: int  # 1-6
    text: str   # Heading text, whitespace-stripped
@dataclass
class SchemaData:
    """Represents schema.org structured data."""

    schema_type: str  # e.g. "Article", "Organization"
    properties: dict  # Raw node properties (empty for microdata detection)
    format: str = "json-ld"  # json-ld, microdata, rdfa
@dataclass
class OpenGraphData:
    """Represents Open Graph metadata (plus Twitter Card tags)."""

    # og:* tags (from <meta property="og:...">)
    og_title: str | None = None
    og_description: str | None = None
    og_image: str | None = None
    og_url: str | None = None
    og_type: str | None = None
    og_site_name: str | None = None
    og_locale: str | None = None
    # twitter:* tags (from <meta name="twitter:...">)
    twitter_card: str | None = None
    twitter_title: str | None = None
    twitter_description: str | None = None
    twitter_image: str | None = None
@dataclass
class PageMetadata:
    """Complete SEO metadata for a page, as filled in by PageAnalyzer."""

    # Basic info
    url: str
    status_code: int = 0
    content_type: str = ""
    response_time_ms: float = 0
    analyzed_at: datetime = field(default_factory=datetime.now)
    # Meta tags
    title: str | None = None
    title_length: int = 0
    meta_description: str | None = None
    meta_description_length: int = 0
    canonical_url: str | None = None
    robots_meta: str | None = None
    # Language
    html_lang: str | None = None
    hreflang_tags: list[dict] = field(default_factory=list)  # [{"lang": "en", "url": "..."}]
    # Headings
    headings: list[HeadingData] = field(default_factory=list)
    h1_count: int = 0
    h1_text: str | None = None  # Text of the first H1 on the page
    # Open Graph & Social
    open_graph: OpenGraphData = field(default_factory=OpenGraphData)
    # Schema/Structured Data
    schema_data: list[SchemaData] = field(default_factory=list)
    schema_types_found: list[str] = field(default_factory=list)
    # Links
    internal_links: list[LinkData] = field(default_factory=list)
    external_links: list[LinkData] = field(default_factory=list)
    internal_link_count: int = 0
    external_link_count: int = 0
    # Images
    images_total: int = 0
    images_without_alt: int = 0
    images_with_alt: int = 0
    # Content metrics
    word_count: int = 0
    # Issues found
    issues: list[str] = field(default_factory=list)    # problems / errors
    warnings: list[str] = field(default_factory=list)  # non-blocking advisories

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization.

        Returns a flat summary; nested dataclasses are reduced to counts
        except for the main Open Graph fields.
        """
        return {
            "url": self.url,
            "status_code": self.status_code,
            "content_type": self.content_type,
            "response_time_ms": self.response_time_ms,
            "analyzed_at": self.analyzed_at.isoformat(),
            "title": self.title,
            "title_length": self.title_length,
            "meta_description": self.meta_description,
            "meta_description_length": self.meta_description_length,
            "canonical_url": self.canonical_url,
            "robots_meta": self.robots_meta,
            "html_lang": self.html_lang,
            "hreflang_tags": self.hreflang_tags,
            "h1_count": self.h1_count,
            "h1_text": self.h1_text,
            "headings_count": len(self.headings),
            "schema_types_found": self.schema_types_found,
            "internal_link_count": self.internal_link_count,
            "external_link_count": self.external_link_count,
            "images_total": self.images_total,
            "images_without_alt": self.images_without_alt,
            "word_count": self.word_count,
            "issues": self.issues,
            "warnings": self.warnings,
            "open_graph": {
                "og_title": self.open_graph.og_title,
                "og_description": self.open_graph.og_description,
                "og_image": self.open_graph.og_image,
                "og_url": self.open_graph.og_url,
                "og_type": self.open_graph.og_type,
            },
        }

    def get_summary(self) -> str:
        """Get a brief summary of the page analysis (one item per line)."""
        lines = [
            f"URL: {self.url}",
            f"Status: {self.status_code}",
            f"Title: {self.title[:50] + '...' if self.title and len(self.title) > 50 else self.title}",
            # NOTE(review): the empty string below looks like a lost check
            # mark glyph (cf. '✗ Missing') — confirm intended.
            f"Description: {'' if self.meta_description else '✗ Missing'}",
            f"Canonical: {'' if self.canonical_url else '✗ Missing'}",
            f"H1: {self.h1_count} found",
            f"Schema: {', '.join(self.schema_types_found) if self.schema_types_found else 'None'}",
            f"Links: {self.internal_link_count} internal, {self.external_link_count} external",
            f"Images: {self.images_total} total, {self.images_without_alt} without alt",
        ]
        if self.issues:
            lines.append(f"Issues: {len(self.issues)}")
        return "\n".join(lines)
class PageAnalyzer:
    """Analyze web pages for SEO metadata."""

    # Identifies the crawler to servers; override via the constructor.
    DEFAULT_USER_AGENT = "Mozilla/5.0 (compatible; OurDigitalSEOBot/1.0; +https://ourdigital.org)"
def __init__(
self,
user_agent: str | None = None,
timeout: int = 30,
):
"""
Initialize page analyzer.
Args:
user_agent: Custom user agent string
timeout: Request timeout in seconds
"""
self.user_agent = user_agent or self.DEFAULT_USER_AGENT
self.timeout = timeout
self.session = requests.Session()
self.session.headers.update({
"User-Agent": self.user_agent,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9,ko;q=0.8",
})
    def analyze_url(self, url: str) -> PageMetadata:
        """
        Analyze a URL and extract SEO metadata.

        Never raises: network or parse failures are recorded in
        ``metadata.issues`` and a partially populated object is returned.

        Args:
            url: URL to analyze

        Returns:
            PageMetadata object with all extracted data
        """
        metadata = PageMetadata(url=url)
        try:
            # Fetch page
            start_time = datetime.now()
            response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
            metadata.response_time_ms = (datetime.now() - start_time).total_seconds() * 1000
            metadata.status_code = response.status_code
            metadata.content_type = response.headers.get("Content-Type", "")
            if response.status_code != 200:
                metadata.issues.append(f"HTTP {response.status_code} status")
                # Client/server errors have no useful body to analyze.
                if response.status_code >= 400:
                    return metadata
            # Parse HTML
            soup = BeautifulSoup(response.text, "html.parser")
            # NOTE(review): links are resolved against the requested URL, not
            # response.url — redirected pages may mis-resolve relative links.
            base_url = url
            # Extract all metadata
            self._extract_basic_meta(soup, metadata)
            self._extract_canonical(soup, metadata, base_url)
            self._extract_robots_meta(soup, metadata)
            self._extract_hreflang(soup, metadata)
            self._extract_headings(soup, metadata)
            self._extract_open_graph(soup, metadata)
            self._extract_schema(soup, metadata)
            self._extract_links(soup, metadata, base_url)
            self._extract_images(soup, metadata)
            # Runs last: it mutates the soup (removes script/style nodes).
            self._extract_content_metrics(soup, metadata)
            # Run SEO checks
            self._run_seo_checks(metadata)
        except requests.RequestException as e:
            metadata.issues.append(f"Request failed: {str(e)}")
            logger.error(f"Failed to analyze {url}: {e}")
        except Exception as e:
            metadata.issues.append(f"Analysis error: {str(e)}")
            logger.error(f"Error analyzing {url}: {e}")
        return metadata
def _extract_basic_meta(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
"""Extract title and meta description."""
# Title
title_tag = soup.find("title")
if title_tag and title_tag.string:
metadata.title = title_tag.string.strip()
metadata.title_length = len(metadata.title)
# Meta description
desc_tag = soup.find("meta", attrs={"name": re.compile(r"^description$", re.I)})
if desc_tag and desc_tag.get("content"):
metadata.meta_description = desc_tag["content"].strip()
metadata.meta_description_length = len(metadata.meta_description)
# HTML lang
html_tag = soup.find("html")
if html_tag and html_tag.get("lang"):
metadata.html_lang = html_tag["lang"]
def _extract_canonical(self, soup: BeautifulSoup, metadata: PageMetadata, base_url: str) -> None:
"""Extract canonical URL."""
canonical = soup.find("link", rel="canonical")
if canonical and canonical.get("href"):
metadata.canonical_url = urljoin(base_url, canonical["href"])
def _extract_robots_meta(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
"""Extract robots meta tag."""
robots = soup.find("meta", attrs={"name": re.compile(r"^robots$", re.I)})
if robots and robots.get("content"):
metadata.robots_meta = robots["content"]
# Also check for googlebot-specific
googlebot = soup.find("meta", attrs={"name": re.compile(r"^googlebot$", re.I)})
if googlebot and googlebot.get("content"):
if metadata.robots_meta:
metadata.robots_meta += f" | googlebot: {googlebot['content']}"
else:
metadata.robots_meta = f"googlebot: {googlebot['content']}"
def _extract_hreflang(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
"""Extract hreflang tags."""
hreflang_tags = soup.find_all("link", rel="alternate", hreflang=True)
for tag in hreflang_tags:
if tag.get("href") and tag.get("hreflang"):
metadata.hreflang_tags.append({
"lang": tag["hreflang"],
"url": tag["href"]
})
def _extract_headings(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
"""Extract all headings."""
for level in range(1, 7):
for heading in soup.find_all(f"h{level}"):
text = heading.get_text(strip=True)
if text:
metadata.headings.append(HeadingData(level=level, text=text))
# Count H1s specifically
h1_tags = soup.find_all("h1")
metadata.h1_count = len(h1_tags)
if h1_tags:
metadata.h1_text = h1_tags[0].get_text(strip=True)
def _extract_open_graph(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
"""Extract Open Graph and Twitter Card data."""
og = metadata.open_graph
# Open Graph tags
og_mappings = {
"og:title": "og_title",
"og:description": "og_description",
"og:image": "og_image",
"og:url": "og_url",
"og:type": "og_type",
"og:site_name": "og_site_name",
"og:locale": "og_locale",
}
for og_prop, attr_name in og_mappings.items():
tag = soup.find("meta", property=og_prop)
if tag and tag.get("content"):
setattr(og, attr_name, tag["content"])
# Twitter Card tags
twitter_mappings = {
"twitter:card": "twitter_card",
"twitter:title": "twitter_title",
"twitter:description": "twitter_description",
"twitter:image": "twitter_image",
}
for tw_name, attr_name in twitter_mappings.items():
tag = soup.find("meta", attrs={"name": tw_name})
if tag and tag.get("content"):
setattr(og, attr_name, tag["content"])
def _extract_schema(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
"""Extract schema.org structured data."""
# JSON-LD
for script in soup.find_all("script", type="application/ld+json"):
try:
data = json.loads(script.string)
if isinstance(data, list):
for item in data:
self._process_schema_item(item, metadata, "json-ld")
else:
self._process_schema_item(data, metadata, "json-ld")
except (json.JSONDecodeError, TypeError):
continue
# Microdata (basic detection)
for item in soup.find_all(itemscope=True):
itemtype = item.get("itemtype", "")
if itemtype:
schema_type = itemtype.split("/")[-1]
if schema_type not in metadata.schema_types_found:
metadata.schema_types_found.append(schema_type)
metadata.schema_data.append(SchemaData(
schema_type=schema_type,
properties={},
format="microdata"
))
def _process_schema_item(self, data: dict, metadata: PageMetadata, format_type: str) -> None:
"""Process a single schema.org item."""
if not isinstance(data, dict):
return
schema_type = data.get("@type", "Unknown")
if isinstance(schema_type, list):
schema_type = schema_type[0] if schema_type else "Unknown"
if schema_type not in metadata.schema_types_found:
metadata.schema_types_found.append(schema_type)
metadata.schema_data.append(SchemaData(
schema_type=schema_type,
properties=data,
format=format_type
))
# Process nested @graph items
if "@graph" in data:
for item in data["@graph"]:
self._process_schema_item(item, metadata, format_type)
def _extract_links(self, soup: BeautifulSoup, metadata: PageMetadata, base_url: str) -> None:
"""Extract internal and external links."""
parsed_base = urlparse(base_url)
base_domain = parsed_base.netloc.lower()
for a_tag in soup.find_all("a", href=True):
href = a_tag["href"]
# Skip non-http links
if href.startswith(("#", "javascript:", "mailto:", "tel:")):
continue
# Resolve relative URLs
full_url = urljoin(base_url, href)
parsed_url = urlparse(full_url)
# Get anchor text
anchor_text = a_tag.get_text(strip=True)[:100] # Limit length
# Check if nofollow
rel = a_tag.get("rel", [])
if isinstance(rel, str):
rel = rel.split()
is_nofollow = "nofollow" in rel
# Determine if internal or external
link_domain = parsed_url.netloc.lower()
is_internal = (
link_domain == base_domain or
link_domain.endswith(f".{base_domain}") or
base_domain.endswith(f".{link_domain}")
)
link_data = LinkData(
url=full_url,
anchor_text=anchor_text,
is_internal=is_internal,
is_nofollow=is_nofollow,
)
if is_internal:
metadata.internal_links.append(link_data)
else:
metadata.external_links.append(link_data)
metadata.internal_link_count = len(metadata.internal_links)
metadata.external_link_count = len(metadata.external_links)
def _extract_images(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
"""Extract image information."""
images = soup.find_all("img")
metadata.images_total = len(images)
for img in images:
alt = img.get("alt", "").strip()
if alt:
metadata.images_with_alt += 1
else:
metadata.images_without_alt += 1
def _extract_content_metrics(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
"""Extract content metrics like word count."""
# Remove script and style elements
for element in soup(["script", "style", "noscript"]):
element.decompose()
# Get text content
text = soup.get_text(separator=" ", strip=True)
words = text.split()
metadata.word_count = len(words)
def _run_seo_checks(self, metadata: PageMetadata) -> None:
"""Run SEO checks and add issues/warnings."""
# Title checks
if not metadata.title:
metadata.issues.append("Missing title tag")
elif metadata.title_length < 30:
metadata.warnings.append(f"Title too short ({metadata.title_length} chars, recommend 50-60)")
elif metadata.title_length > 60:
metadata.warnings.append(f"Title too long ({metadata.title_length} chars, recommend 50-60)")
# Meta description checks
if not metadata.meta_description:
metadata.issues.append("Missing meta description")
elif metadata.meta_description_length < 120:
metadata.warnings.append(f"Meta description too short ({metadata.meta_description_length} chars)")
elif metadata.meta_description_length > 160:
metadata.warnings.append(f"Meta description too long ({metadata.meta_description_length} chars)")
# Canonical check
if not metadata.canonical_url:
metadata.warnings.append("Missing canonical tag")
elif metadata.canonical_url != metadata.url:
metadata.warnings.append(f"Canonical points to different URL: {metadata.canonical_url}")
# H1 checks
if metadata.h1_count == 0:
metadata.issues.append("Missing H1 tag")
elif metadata.h1_count > 1:
metadata.warnings.append(f"Multiple H1 tags ({metadata.h1_count})")
# Image alt check
if metadata.images_without_alt > 0:
metadata.warnings.append(f"{metadata.images_without_alt} images missing alt text")
# Schema check
if not metadata.schema_types_found:
metadata.warnings.append("No structured data found")
# Open Graph check
if not metadata.open_graph.og_title:
metadata.warnings.append("Missing Open Graph tags")
# Robots meta check
if metadata.robots_meta:
robots_lower = metadata.robots_meta.lower()
if "noindex" in robots_lower:
metadata.issues.append("Page is set to noindex")
if "nofollow" in robots_lower:
metadata.warnings.append("Page is set to nofollow")
def main():
    """CLI entry point for testing."""
    import argparse
    arg_parser = argparse.ArgumentParser(description="Page SEO Analyzer")
    arg_parser.add_argument("url", help="URL to analyze")
    arg_parser.add_argument("--json", "-j", action="store_true", help="Output as JSON")
    options = arg_parser.parse_args()
    page_metadata = PageAnalyzer().analyze_url(options.url)
    if options.json:
        print(json.dumps(page_metadata.to_dict(), indent=2, ensure_ascii=False))
        return
    # Human-readable report.
    divider = "=" * 60
    print(divider)
    print("PAGE ANALYSIS REPORT")
    print(divider)
    print(page_metadata.get_summary())
    print()
    if page_metadata.issues:
        print("ISSUES:")
        for found_issue in page_metadata.issues:
            print(f"  {found_issue}")
    if page_metadata.warnings:
        print("\nWARNINGS:")
        for found_warning in page_metadata.warnings:
            print(f"  {found_warning}")
    if page_metadata.hreflang_tags:
        print(f"\nHREFLANG TAGS ({len(page_metadata.hreflang_tags)}):")
        for hreflang in page_metadata.hreflang_tags[:5]:
            print(f"  {hreflang['lang']}: {hreflang['url']}")
    if page_metadata.schema_types_found:
        print(f"\nSCHEMA TYPES:")
        for found_type in page_metadata.schema_types_found:
            print(f"  - {found_type}")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,452 @@
"""
PageSpeed Insights Client
=========================
Purpose: Get Core Web Vitals and performance data from PageSpeed Insights API
Python: 3.10+
Usage:
from pagespeed_client import PageSpeedClient
client = PageSpeedClient()
result = client.analyze("https://example.com")
"""
import argparse
import json
import logging
from dataclasses import dataclass, field
from typing import Any
import requests
from base_client import config
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
@dataclass
class CoreWebVitals:
"""Core Web Vitals metrics."""
lcp: float | None = None # Largest Contentful Paint (ms)
fid: float | None = None # First Input Delay (ms)
cls: float | None = None # Cumulative Layout Shift
inp: float | None = None # Interaction to Next Paint (ms)
ttfb: float | None = None # Time to First Byte (ms)
fcp: float | None = None # First Contentful Paint (ms)
# Assessment (GOOD, NEEDS_IMPROVEMENT, POOR)
lcp_rating: str | None = None
fid_rating: str | None = None
cls_rating: str | None = None
inp_rating: str | None = None
def to_dict(self) -> dict:
return {
"lcp": {"value": self.lcp, "rating": self.lcp_rating},
"fid": {"value": self.fid, "rating": self.fid_rating},
"cls": {"value": self.cls, "rating": self.cls_rating},
"inp": {"value": self.inp, "rating": self.inp_rating},
"ttfb": {"value": self.ttfb},
"fcp": {"value": self.fcp},
}
@dataclass
class PageSpeedResult:
    """PageSpeed analysis result for a single URL/strategy run.

    Populated by PageSpeedClient.analyze(); category scores are on a
    0-100 scale (None when the category was not requested or returned).
    """
    url: str
    strategy: str  # mobile or desktop
    performance_score: float | None = None
    seo_score: float | None = None
    accessibility_score: float | None = None
    best_practices_score: float | None = None
    core_web_vitals: CoreWebVitals = field(default_factory=CoreWebVitals)
    opportunities: list[dict] = field(default_factory=list)  # sorted by savings_ms, descending
    diagnostics: list[dict] = field(default_factory=list)  # failed non-opportunity audits
    passed_audits: list[str] = field(default_factory=list)  # titles of audits scoring 1
    raw_data: dict = field(default_factory=dict)  # full API response, kept for debugging
    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict; opportunities are capped at 10."""
        return {
            "url": self.url,
            "strategy": self.strategy,
            "scores": {
                "performance": self.performance_score,
                "seo": self.seo_score,
                "accessibility": self.accessibility_score,
                "best_practices": self.best_practices_score,
            },
            "core_web_vitals": self.core_web_vitals.to_dict(),
            "opportunities_count": len(self.opportunities),
            "opportunities": self.opportunities[:10],
            "diagnostics_count": len(self.diagnostics),
            "passed_audits_count": len(self.passed_audits),
        }
class PageSpeedClient:
    """Client for the PageSpeed Insights v5 API.

    Wraps the runPagespeed endpoint and normalizes Lighthouse lab data plus
    CrUX field data (when present) into PageSpeedResult / CoreWebVitals.
    """
    BASE_URL = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"
    # Core Web Vitals thresholds: value <= "good" rates GOOD,
    # value <= "poor" rates NEEDS_IMPROVEMENT, anything above rates POOR.
    # Units are milliseconds except the unitless CLS score.
    THRESHOLDS = {
        "lcp": {"good": 2500, "poor": 4000},
        "fid": {"good": 100, "poor": 300},
        "cls": {"good": 0.1, "poor": 0.25},
        "inp": {"good": 200, "poor": 500},
        "ttfb": {"good": 800, "poor": 1800},
        "fcp": {"good": 1800, "poor": 3000},
    }
    def __init__(self, api_key: str | None = None):
        """
        Initialize PageSpeed client.
        Args:
            api_key: PageSpeed API key (optional but recommended for higher quotas)
        """
        self.api_key = api_key or config.pagespeed_api_key
        self.session = requests.Session()
    def _rate_metric(self, metric: str, value: float | None) -> str | None:
        """Rate a metric against THRESHOLDS; None for unknown metric or value."""
        if value is None:
            return None
        thresholds = self.THRESHOLDS.get(metric)
        if not thresholds:
            return None
        if value <= thresholds["good"]:
            return "GOOD"
        elif value <= thresholds["poor"]:
            return "NEEDS_IMPROVEMENT"
        else:
            return "POOR"
    @staticmethod
    def _category_score(categories_data: dict, key: str) -> float | None:
        """Return a Lighthouse category score scaled to 0-100, or None.

        Uses an explicit None check: a raw score of 0 is a valid (terrible)
        score; the previous truthiness test silently turned it into None.
        """
        category = categories_data.get(key)
        if category is None:
            return None
        score = category.get("score")
        return score * 100 if score is not None else None
    def analyze(
        self,
        url: str,
        strategy: str = "mobile",
        categories: list[str] | None = None,
    ) -> PageSpeedResult:
        """
        Analyze a URL with PageSpeed Insights.
        Args:
            url: URL to analyze
            strategy: "mobile" or "desktop"
            categories: Categories to analyze (performance, seo, accessibility, best-practices)
        Returns:
            PageSpeedResult with scores and metrics
        Raises:
            requests.RequestException: if the API request fails.
        """
        if categories is None:
            categories = ["performance", "seo", "accessibility", "best-practices"]
        params = {
            "url": url,
            "strategy": strategy,
            "category": categories,  # sent as a repeated query parameter
        }
        if self.api_key:
            params["key"] = self.api_key
        try:
            response = self.session.get(self.BASE_URL, params=params, timeout=60)
            response.raise_for_status()
            data = response.json()
        except requests.RequestException as e:
            logger.error(f"PageSpeed API request failed: {e}")
            raise
        result = PageSpeedResult(url=url, strategy=strategy, raw_data=data)
        # Category scores (0-100).
        lighthouse = data.get("lighthouseResult", {})
        categories_data = lighthouse.get("categories", {})
        result.performance_score = self._category_score(categories_data, "performance")
        result.seo_score = self._category_score(categories_data, "seo")
        result.accessibility_score = self._category_score(categories_data, "accessibility")
        result.best_practices_score = self._category_score(categories_data, "best-practices")
        # Core Web Vitals from lab (Lighthouse) data.
        audits = lighthouse.get("audits", {})
        cwv = result.core_web_vitals
        if "largest-contentful-paint" in audits:
            cwv.lcp = audits["largest-contentful-paint"].get("numericValue")
            cwv.lcp_rating = self._rate_metric("lcp", cwv.lcp)
        if "total-blocking-time" in audits:
            # TBT is proxy for FID in lab data
            cwv.fid = audits["total-blocking-time"].get("numericValue")
            cwv.fid_rating = self._rate_metric("fid", cwv.fid)
        if "cumulative-layout-shift" in audits:
            cwv.cls = audits["cumulative-layout-shift"].get("numericValue")
            cwv.cls_rating = self._rate_metric("cls", cwv.cls)
        if "experimental-interaction-to-next-paint" in audits:
            cwv.inp = audits["experimental-interaction-to-next-paint"].get("numericValue")
            cwv.inp_rating = self._rate_metric("inp", cwv.inp)
        if "server-response-time" in audits:
            cwv.ttfb = audits["server-response-time"].get("numericValue")
        if "first-contentful-paint" in audits:
            cwv.fcp = audits["first-contentful-paint"].get("numericValue")
        # Field data (real user CrUX data) overrides lab values when present.
        loading_exp = data.get("loadingExperience", {})
        metrics = loading_exp.get("metrics", {})
        if "LARGEST_CONTENTFUL_PAINT_MS" in metrics:
            cwv.lcp = metrics["LARGEST_CONTENTFUL_PAINT_MS"].get("percentile")
            cwv.lcp_rating = metrics["LARGEST_CONTENTFUL_PAINT_MS"].get("category")
        if "FIRST_INPUT_DELAY_MS" in metrics:
            cwv.fid = metrics["FIRST_INPUT_DELAY_MS"].get("percentile")
            cwv.fid_rating = metrics["FIRST_INPUT_DELAY_MS"].get("category")
        if "CUMULATIVE_LAYOUT_SHIFT_SCORE" in metrics:
            # CrUX reports CLS scaled by 100. Guard against a missing
            # percentile (the previous code divided None and raised).
            cls_percentile = metrics["CUMULATIVE_LAYOUT_SHIFT_SCORE"].get("percentile")
            if cls_percentile is not None:
                cwv.cls = cls_percentile / 100
            cwv.cls_rating = metrics["CUMULATIVE_LAYOUT_SHIFT_SCORE"].get("category")
        if "INTERACTION_TO_NEXT_PAINT" in metrics:
            cwv.inp = metrics["INTERACTION_TO_NEXT_PAINT"].get("percentile")
            cwv.inp_rating = metrics["INTERACTION_TO_NEXT_PAINT"].get("category")
        # Classify audits in one pass. The three groups are independent
        # checks, so a single audit may land in more than one list.
        for audit_id, audit in audits.items():
            details = audit.get("details", {})
            score = audit.get("score")
            if details.get("type") == "opportunity":
                savings = details.get("overallSavingsMs", 0)
                if savings > 0:
                    result.opportunities.append({
                        "id": audit_id,
                        "title": audit.get("title", ""),
                        "description": audit.get("description", ""),
                        "savings_ms": savings,
                        "score": audit.get("score", 0),
                    })
            if score is not None and score < 1 and details:
                if details.get("type") not in ["opportunity", None]:
                    result.diagnostics.append({
                        "id": audit_id,
                        "title": audit.get("title", ""),
                        "description": audit.get("description", ""),
                        "score": score,
                    })
            if score == 1:
                result.passed_audits.append(audit.get("title", audit_id))
        # Biggest potential savings first.
        result.opportunities.sort(key=lambda x: x["savings_ms"], reverse=True)
        return result
    def analyze_both_strategies(self, url: str) -> dict:
        """Analyze URL for both mobile and desktop and compare the results."""
        mobile = self.analyze(url, strategy="mobile")
        desktop = self.analyze(url, strategy="desktop")
        return {
            "url": url,
            "mobile": mobile.to_dict(),
            "desktop": desktop.to_dict(),
            "comparison": {
                "performance_difference": (
                    (desktop.performance_score or 0) - (mobile.performance_score or 0)
                ),
                "mobile_first_issues": self._identify_mobile_issues(mobile, desktop),
            },
        }
    def _identify_mobile_issues(
        self,
        mobile: PageSpeedResult,
        desktop: PageSpeedResult,
    ) -> list[str]:
        """Identify issues that affect mobile more than desktop."""
        issues = []
        # Explicit None checks throughout: a score/metric of 0 is valid data
        # and must not cause a comparison to be skipped.
        if mobile.performance_score is not None and desktop.performance_score is not None:
            if desktop.performance_score - mobile.performance_score > 20:
                issues.append("Significant performance gap between mobile and desktop")
        m_cwv = mobile.core_web_vitals
        d_cwv = desktop.core_web_vitals
        if m_cwv.lcp is not None and d_cwv.lcp is not None and m_cwv.lcp > d_cwv.lcp * 1.5:
            issues.append("LCP significantly slower on mobile")
        if m_cwv.cls is not None and d_cwv.cls is not None and m_cwv.cls > d_cwv.cls * 2:
            issues.append("Layout shift issues more severe on mobile")
        return issues
    def get_cwv_summary(self, url: str) -> dict:
        """Get a summary focused on Core Web Vitals (mobile strategy)."""
        result = self.analyze(url, strategy="mobile")
        cwv = result.core_web_vitals
        return {
            "url": url,
            "overall_cwv_status": self._overall_cwv_status(cwv),
            "metrics": {
                "lcp": {
                    "value": f"{cwv.lcp / 1000:.2f}s" if cwv.lcp is not None else None,
                    "rating": cwv.lcp_rating,
                    "threshold": "≤ 2.5s good, > 4.0s poor",
                },
                "fid": {
                    "value": f"{cwv.fid:.0f}ms" if cwv.fid is not None else None,
                    "rating": cwv.fid_rating,
                    "threshold": "≤ 100ms good, > 300ms poor",
                },
                "cls": {
                    # is-not-None check: a perfect CLS of 0.0 must display.
                    "value": f"{cwv.cls:.3f}" if cwv.cls is not None else None,
                    "rating": cwv.cls_rating,
                    "threshold": "≤ 0.1 good, > 0.25 poor",
                },
                "inp": {
                    "value": f"{cwv.inp:.0f}ms" if cwv.inp is not None else None,
                    "rating": cwv.inp_rating,
                    "threshold": "≤ 200ms good, > 500ms poor",
                },
            },
            "top_opportunities": result.opportunities[:5],
        }
    def _overall_cwv_status(self, cwv: CoreWebVitals) -> str:
        """Determine overall Core Web Vitals status (worst rating wins)."""
        ratings = [cwv.lcp_rating, cwv.fid_rating, cwv.cls_rating]
        ratings = [r for r in ratings if r]
        if not ratings:
            return "UNKNOWN"
        if any(r == "POOR" for r in ratings):
            return "POOR"
        if any(r == "NEEDS_IMPROVEMENT" for r in ratings):
            return "NEEDS_IMPROVEMENT"
        return "GOOD"
    def generate_report(self, result: PageSpeedResult) -> str:
        """Generate human-readable performance report."""
        def score_line(label: str, score: float | None) -> str:
            # Explicit None check: a score of 0 is real and must not show N/A.
            if score is None:
                return f"  {label}: N/A"
            return f"  {label}: {score:.0f}/100"
        lines = [
            "=" * 60,
            "PageSpeed Insights Report",
            "=" * 60,
            f"URL: {result.url}",
            f"Strategy: {result.strategy}",
            "",
            "Scores:",
            score_line("Performance", result.performance_score),
            score_line("SEO", result.seo_score),
            score_line("Accessibility", result.accessibility_score),
            score_line("Best Practices", result.best_practices_score),
            "",
            "Core Web Vitals:",
        ]
        cwv = result.core_web_vitals
        def format_metric(name: str, value: Any, rating: str | None, unit: str) -> str:
            if value is None:
                return f"  {name}: N/A"
            rating_str = f" ({rating})" if rating else ""
            return f"  {name}: {value}{unit}{rating_str}"
        lines.append(format_metric("LCP", f"{cwv.lcp / 1000:.2f}" if cwv.lcp is not None else None, cwv.lcp_rating, "s"))
        lines.append(format_metric("FID/TBT", f"{cwv.fid:.0f}" if cwv.fid is not None else None, cwv.fid_rating, "ms"))
        lines.append(format_metric("CLS", f"{cwv.cls:.3f}" if cwv.cls is not None else None, cwv.cls_rating, ""))
        lines.append(format_metric("INP", f"{cwv.inp:.0f}" if cwv.inp is not None else None, cwv.inp_rating, "ms"))
        lines.append(format_metric("TTFB", f"{cwv.ttfb:.0f}" if cwv.ttfb is not None else None, None, "ms"))
        lines.append(format_metric("FCP", f"{cwv.fcp / 1000:.2f}" if cwv.fcp is not None else None, None, "s"))
        if result.opportunities:
            lines.extend([
                "",
                f"Top Opportunities ({len(result.opportunities)} total):",
            ])
            for opp in result.opportunities[:5]:
                savings = opp["savings_ms"]
                lines.append(f"  - {opp['title']}: -{savings / 1000:.1f}s potential savings")
        lines.extend(["", "=" * 60])
        return "\n".join(lines)
def main():
    """CLI entry point."""
    arg_parser = argparse.ArgumentParser(description="PageSpeed Insights Client")
    arg_parser.add_argument("--url", "-u", required=True, help="URL to analyze")
    arg_parser.add_argument("--strategy", "-s", default="mobile",
                            choices=["mobile", "desktop", "both"],
                            help="Analysis strategy")
    arg_parser.add_argument("--output", "-o", help="Output file for JSON")
    arg_parser.add_argument("--json", action="store_true", help="Output as JSON")
    arg_parser.add_argument("--cwv-only", action="store_true",
                            help="Show only Core Web Vitals summary")
    opts = arg_parser.parse_args()
    client = PageSpeedClient()
    if opts.cwv_only:
        # Quick Core Web Vitals summary (mobile strategy).
        print(json.dumps(client.get_cwv_summary(opts.url), indent=2))
        return
    if opts.strategy == "both":
        combined = client.analyze_both_strategies(opts.url)
        serialized = json.dumps(combined, indent=2)
        if opts.output:
            with open(opts.output, "w") as f:
                f.write(serialized)
        else:
            print(serialized)
        return
    analysis = client.analyze(opts.url, strategy=opts.strategy)
    if opts.json or opts.output:
        serialized = json.dumps(analysis.to_dict(), indent=2)
        if opts.output:
            with open(opts.output, "w") as f:
                f.write(serialized)
        else:
            print(serialized)
    else:
        print(client.generate_report(analysis))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,40 @@
# OurDigital SEO Audit - Python Dependencies
# Install with: pip install -r requirements.txt
# Google APIs
google-api-python-client>=2.100.0
google-auth>=2.23.0
google-auth-oauthlib>=1.1.0
google-auth-httplib2>=0.1.1
google-analytics-data>=0.18.0
# Notion API
notion-client>=2.0.0
# Web Scraping & Parsing
lxml>=5.1.0
beautifulsoup4>=4.12.0
extruct>=0.16.0
requests>=2.31.0
aiohttp>=3.9.0
# Schema Validation
jsonschema>=4.21.0
rdflib>=7.0.0
# Google Trends
pytrends>=4.9.2
# Data Processing
pandas>=2.1.0
# Async & Retry
tenacity>=8.2.0
tqdm>=4.66.0
# Environment
python-dotenv>=1.0.0
# Logging & CLI
rich>=13.7.0
typer>=0.9.0

View File

@@ -0,0 +1,540 @@
"""
Robots.txt Checker - Analyze robots.txt configuration
=====================================================
Purpose: Parse and analyze robots.txt for SEO compliance
Python: 3.10+
Usage:
python robots_checker.py --url https://example.com/robots.txt
python robots_checker.py --url https://example.com --test-url /admin/
"""
import argparse
import json
import logging
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
import requests
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
@dataclass
class RobotsIssue:
    """Represents a single robots.txt finding, graded by severity."""
    severity: str  # "error", "warning", "info"
    message: str  # human-readable description of the finding
    line_number: int | None = None  # 1-based line in robots.txt, when known
    directive: str | None = None  # offending directive text, e.g. "Disallow: /"
    suggestion: str | None = None  # recommended remediation, if any
@dataclass
class UserAgentRules:
    """Rules for a specific user-agent group in robots.txt."""
    user_agent: str  # value of the User-agent directive, e.g. "*" or "Googlebot"
    disallow: list[str] = field(default_factory=list)  # blocked path patterns
    allow: list[str] = field(default_factory=list)  # explicitly allowed path patterns
    crawl_delay: float | None = None  # seconds between requests, if declared
@dataclass
class RobotsResult:
    """Complete robots.txt analysis result."""
    url: str
    accessible: bool = True
    content: str = ""
    rules: list[UserAgentRules] = field(default_factory=list)
    sitemaps: list[str] = field(default_factory=list)
    issues: list[RobotsIssue] = field(default_factory=list)
    stats: dict = field(default_factory=dict)
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
    def to_dict(self) -> dict:
        """Convert to a JSON-serializable dict (raw content is omitted)."""
        def serialize_group(group: UserAgentRules) -> dict:
            return {
                "user_agent": group.user_agent,
                "disallow": group.disallow,
                "allow": group.allow,
                "crawl_delay": group.crawl_delay,
            }
        def serialize_issue(found: RobotsIssue) -> dict:
            return {
                "severity": found.severity,
                "message": found.message,
                "line_number": found.line_number,
                "directive": found.directive,
                "suggestion": found.suggestion,
            }
        return {
            "url": self.url,
            "accessible": self.accessible,
            "sitemaps": self.sitemaps,
            "rules": [serialize_group(g) for g in self.rules],
            "issues": [serialize_issue(i) for i in self.issues],
            "stats": self.stats,
            "timestamp": self.timestamp,
        }
class RobotsChecker:
    """Analyze robots.txt configuration for SEO compliance.

    Fetches, parses and inspects a site's robots.txt, producing a
    RobotsResult with per-user-agent rules, declared sitemaps, a list of
    graded issues, and summary statistics.
    """
    # Common user agents
    USER_AGENTS = {
        "*": "All bots",
        "Googlebot": "Google crawler",
        "Googlebot-Image": "Google Image crawler",
        "Googlebot-News": "Google News crawler",
        "Googlebot-Video": "Google Video crawler",
        "Bingbot": "Bing crawler",
        "Slurp": "Yahoo crawler",
        "DuckDuckBot": "DuckDuckGo crawler",
        "Baiduspider": "Baidu crawler",
        "Yandex": "Yandex crawler",
        "facebot": "Facebook crawler",
        "Twitterbot": "Twitter crawler",
        "LinkedInBot": "LinkedIn crawler",
    }
    # Paths that should generally not be blocked
    IMPORTANT_PATHS = [
        "/",
        "/*.css",
        "/*.js",
        "/*.jpg",
        "/*.jpeg",
        "/*.png",
        "/*.gif",
        "/*.svg",
        "/*.webp",
    ]
    # Paths commonly blocked
    COMMON_BLOCKED = [
        "/admin",
        "/wp-admin",
        "/login",
        "/private",
        "/api",
        "/cgi-bin",
        "/tmp",
        "/search",
    ]
    def __init__(self):
        """Create a session with an identifiable crawler user agent."""
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (compatible; SEOAuditBot/1.0)"
        })
    def fetch_robots(self, url: str) -> str | None:
        """Fetch robots.txt content.

        Returns the body text on HTTP 200, None on 404 (no robots.txt),
        and raises RuntimeError for any other status or network failure.
        """
        # Normalize any site URL to its /robots.txt location.
        parsed = urlparse(url)
        if not parsed.path.endswith("robots.txt"):
            robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        else:
            robots_url = url
        try:
            response = self.session.get(robots_url, timeout=10)
            if response.status_code == 200:
                return response.text
            elif response.status_code == 404:
                return None
            else:
                raise RuntimeError(f"HTTP {response.status_code}")
        except requests.RequestException as e:
            raise RuntimeError(f"Failed to fetch robots.txt: {e}")
    def parse_robots(self, content: str) -> tuple[list[UserAgentRules], list[str]]:
        """Parse robots.txt content into per-user-agent rules and sitemap URLs.

        NOTE: each User-agent line starts a fresh rule group, so consecutive
        User-agent lines do not share the directives that follow them — a
        simplification of the grouping described in RFC 9309.
        """
        rules: list[UserAgentRules] = []
        sitemaps: list[str] = []
        current_rules: UserAgentRules | None = None
        for line in content.split("\n"):
            line = line.strip()
            # Skip empty lines and comments
            if not line or line.startswith("#"):
                continue
            # Every directive is "name: value"; anything else is ignored here
            # (syntax problems are reported separately by _analyze_syntax).
            if ":" not in line:
                continue
            directive, value = line.split(":", 1)
            directive = directive.strip().lower()
            value = value.strip()
            if directive == "user-agent":
                # Close out the previous group before starting a new one.
                if current_rules:
                    rules.append(current_rules)
                current_rules = UserAgentRules(user_agent=value)
            elif directive == "disallow" and current_rules:
                if value:  # Empty disallow means allow all
                    current_rules.disallow.append(value)
            elif directive == "allow" and current_rules:
                if value:
                    current_rules.allow.append(value)
            elif directive == "crawl-delay" and current_rules:
                try:
                    current_rules.crawl_delay = float(value)
                except ValueError:
                    pass  # non-numeric delay: ignore rather than fail
            elif directive == "sitemap":
                if value:
                    sitemaps.append(value)
        # Flush the trailing group.
        if current_rules:
            rules.append(current_rules)
        return rules, sitemaps
    def analyze(self, url: str) -> RobotsResult:
        """Fetch and analyze robots.txt, returning a populated RobotsResult."""
        result = RobotsResult(url=url)
        # Fetch robots.txt
        try:
            content = self.fetch_robots(url)
            if content is None:
                # A missing robots.txt (404) is informational, not an error.
                result.accessible = False
                result.issues.append(RobotsIssue(
                    severity="info",
                    message="No robots.txt found (returns 404)",
                    suggestion="Consider creating a robots.txt file",
                ))
                return result
        except RuntimeError as e:
            result.accessible = False
            result.issues.append(RobotsIssue(
                severity="error",
                message=str(e),
            ))
            return result
        result.content = content
        result.rules, result.sitemaps = self.parse_robots(content)
        # Run the individual analyzers; each appends to result.issues.
        self._analyze_syntax(result)
        self._analyze_rules(result)
        self._analyze_sitemaps(result)
        # Calculate stats
        result.stats = {
            "user_agents_count": len(result.rules),
            "user_agents": [r.user_agent for r in result.rules],
            "total_disallow_rules": sum(len(r.disallow) for r in result.rules),
            "total_allow_rules": sum(len(r.allow) for r in result.rules),
            "sitemaps_count": len(result.sitemaps),
            # Explicit None check so "Crawl-delay: 0" still counts as declared
            # (the previous truthiness test treated a zero delay as absent).
            "has_crawl_delay": any(r.crawl_delay is not None for r in result.rules),
            "content_length": len(content),
        }
        return result
    def _analyze_syntax(self, result: RobotsResult) -> None:
        """Check for syntax issues (lines without colons, unknown directives)."""
        # Hoisted out of the loop: the valid directive set never changes.
        valid_directives = {
            "user-agent", "disallow", "allow",
            "crawl-delay", "sitemap", "host",
        }
        lines = result.content.split("\n")
        for line_num, line in enumerate(lines, 1):
            line = line.strip()
            # Skip empty lines and comments
            if not line or line.startswith("#"):
                continue
            # Check for valid directive
            if ":" not in line:
                result.issues.append(RobotsIssue(
                    severity="warning",
                    message=f"Invalid line (missing colon): {line[:50]}",
                    line_number=line_num,
                ))
                continue
            directive = line.split(":", 1)[0].strip().lower()
            if directive not in valid_directives:
                result.issues.append(RobotsIssue(
                    severity="info",
                    message=f"Unknown directive: {directive}",
                    line_number=line_num,
                    directive=directive,
                ))
    def _analyze_rules(self, result: RobotsResult) -> None:
        """Analyze blocking rules for SEO-impacting patterns."""
        # Check if there are any rules
        if not result.rules:
            result.issues.append(RobotsIssue(
                severity="info",
                message="No user-agent rules defined",
                suggestion="Add User-agent: * rules to control crawling",
            ))
            return
        # Check for wildcard rule
        has_wildcard = any(r.user_agent == "*" for r in result.rules)
        if not has_wildcard:
            result.issues.append(RobotsIssue(
                severity="info",
                message="No wildcard (*) user-agent defined",
                suggestion="Consider adding User-agent: * as fallback",
            ))
        # Check for blocking important resources
        for rules in result.rules:
            for disallow in rules.disallow:
                # Blocking the site root prevents all crawling for this agent.
                if disallow == "/":
                    result.issues.append(RobotsIssue(
                        severity="error",
                        message=f"Blocking entire site for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                        suggestion="This will prevent indexing. Is this intentional?",
                    ))
                # Blocked CSS/JS can stop crawlers from rendering the page.
                if any(ext in disallow.lower() for ext in [".css", ".js"]):
                    result.issues.append(RobotsIssue(
                        severity="warning",
                        message=f"Blocking CSS/JS files for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                        suggestion="May affect rendering and SEO",
                    ))
                # Check for blocking images
                if any(ext in disallow.lower() for ext in [".jpg", ".png", ".gif", ".webp"]):
                    result.issues.append(RobotsIssue(
                        severity="info",
                        message=f"Blocking image files for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                    ))
            # Crawl-delay slows crawling; large values can throttle indexing.
            if rules.crawl_delay:
                if rules.crawl_delay > 10:
                    result.issues.append(RobotsIssue(
                        severity="warning",
                        message=f"High crawl-delay ({rules.crawl_delay}s) for {rules.user_agent}",
                        directive=f"Crawl-delay: {rules.crawl_delay}",
                        suggestion="May significantly slow indexing",
                    ))
                elif rules.crawl_delay > 0:
                    result.issues.append(RobotsIssue(
                        severity="info",
                        message=f"Crawl-delay set to {rules.crawl_delay}s for {rules.user_agent}",
                    ))
    def _analyze_sitemaps(self, result: RobotsResult) -> None:
        """Analyze sitemap declarations (presence and absolute URLs)."""
        if not result.sitemaps:
            result.issues.append(RobotsIssue(
                severity="warning",
                message="No sitemap declared in robots.txt",
                suggestion="Add Sitemap: directive to help crawlers find your sitemap",
            ))
        else:
            for sitemap in result.sitemaps:
                if not sitemap.startswith("http"):
                    result.issues.append(RobotsIssue(
                        severity="warning",
                        message=f"Sitemap URL should be absolute: {sitemap}",
                        directive=f"Sitemap: {sitemap}",
                    ))
    def test_url(self, robots_url: str, test_path: str,
                 user_agent: str = "Googlebot") -> dict:
        """Test if a specific URL path is allowed for a user agent.

        Uses urllib.robotparser, which fetches robots.txt itself; the
        "allowed" key is None (with an "error" key) when that fails.
        """
        # Use Python's built-in parser
        rp = RobotFileParser()
        # Ensure robots.txt URL
        parsed = urlparse(robots_url)
        if not parsed.path.endswith("robots.txt"):
            robots_txt_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        else:
            robots_txt_url = robots_url
        rp.set_url(robots_txt_url)
        try:
            rp.read()
        except Exception as e:
            return {
                "path": test_path,
                "user_agent": user_agent,
                "allowed": None,
                "error": str(e),
            }
        # Build full URL for testing
        base_url = f"{parsed.scheme}://{parsed.netloc}"
        full_url = urljoin(base_url, test_path)
        allowed = rp.can_fetch(user_agent, full_url)
        return {
            "path": test_path,
            "user_agent": user_agent,
            "allowed": allowed,
            "full_url": full_url,
        }
    def generate_report(self, result: RobotsResult) -> str:
        """Generate human-readable analysis report."""
        lines = [
            "=" * 60,
            "Robots.txt Analysis Report",
            "=" * 60,
            f"URL: {result.url}",
            f"Accessible: {'Yes' if result.accessible else 'No'}",
            f"Timestamp: {result.timestamp}",
            "",
        ]
        if result.accessible:
            lines.append("Statistics:")
            for key, value in result.stats.items():
                if key == "user_agents":
                    lines.append(f"  {key}: {', '.join(value) if value else 'None'}")
                else:
                    lines.append(f"  {key}: {value}")
            lines.append("")
        if result.sitemaps:
            lines.append(f"Sitemaps ({len(result.sitemaps)}):")
            for sitemap in result.sitemaps:
                lines.append(f"  - {sitemap}")
            lines.append("")
        if result.rules:
            lines.append("Rules Summary:")
            for rules in result.rules:
                lines.append(f"\n  User-agent: {rules.user_agent}")
                if rules.disallow:
                    lines.append(f"    Disallow: {len(rules.disallow)} rules")
                    for d in rules.disallow[:5]:
                        lines.append(f"      - {d}")
                    if len(rules.disallow) > 5:
                        lines.append(f"      ... and {len(rules.disallow) - 5} more")
                if rules.allow:
                    lines.append(f"    Allow: {len(rules.allow)} rules")
                    for a in rules.allow[:3]:
                        lines.append(f"      - {a}")
                if rules.crawl_delay:
                    lines.append(f"    Crawl-delay: {rules.crawl_delay}s")
            lines.append("")
        if result.issues:
            lines.append("Issues Found:")
            errors = [i for i in result.issues if i.severity == "error"]
            warnings = [i for i in result.issues if i.severity == "warning"]
            infos = [i for i in result.issues if i.severity == "info"]
            if errors:
                lines.append(f"\n  ERRORS ({len(errors)}):")
                for issue in errors:
                    lines.append(f"    - {issue.message}")
                    if issue.directive:
                        lines.append(f"      Directive: {issue.directive}")
                    if issue.suggestion:
                        lines.append(f"      Suggestion: {issue.suggestion}")
            if warnings:
                lines.append(f"\n  WARNINGS ({len(warnings)}):")
                for issue in warnings:
                    lines.append(f"    - {issue.message}")
                    if issue.suggestion:
                        lines.append(f"      Suggestion: {issue.suggestion}")
            if infos:
                lines.append(f"\n  INFO ({len(infos)}):")
                for issue in infos:
                    lines.append(f"    - {issue.message}")
            lines.append("")
        lines.append("=" * 60)
        return "\n".join(lines)
def main():
    """Main entry point for CLI usage."""
    arg_parser = argparse.ArgumentParser(
        description="Analyze robots.txt configuration",
    )
    arg_parser.add_argument("--url", "-u", required=True,
                            help="URL to robots.txt or domain")
    arg_parser.add_argument("--test-url", "-t",
                            help="Test if specific URL path is allowed")
    arg_parser.add_argument("--user-agent", "-a", default="Googlebot",
                            help="User agent for testing (default: Googlebot)")
    arg_parser.add_argument("--output", "-o", help="Output file for JSON report")
    arg_parser.add_argument("--json", action="store_true", help="Output as JSON")
    opts = arg_parser.parse_args()
    checker = RobotsChecker()
    if opts.test_url:
        # Single-path permission check.
        verdict = checker.test_url(opts.url, opts.test_url, opts.user_agent)
        if opts.json:
            print(json.dumps(verdict, indent=2))
            return
        status = "ALLOWED" if verdict["allowed"] else "BLOCKED"
        print(f"URL: {verdict['path']}")
        print(f"User-Agent: {verdict['user_agent']}")
        print(f"Status: {status}")
        return
    # Full robots.txt analysis.
    analysis = checker.analyze(opts.url)
    if opts.json or opts.output:
        payload = json.dumps(analysis.to_dict(), ensure_ascii=False, indent=2)
        if opts.output:
            with open(opts.output, "w", encoding="utf-8") as f:
                f.write(payload)
            logger.info(f"Report written to {opts.output}")
        else:
            print(payload)
    else:
        print(checker.generate_report(analysis))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,490 @@
"""
Schema Generator - Generate JSON-LD structured data markup
==========================================================
Purpose: Generate schema.org structured data in JSON-LD format
Python: 3.10+
Usage:
python schema_generator.py --type organization --name "Company Name" --url "https://example.com"
"""
import argparse
import json
import logging
import os
import re
from datetime import datetime
from pathlib import Path
from typing import Any
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
# Template directory relative to this script
TEMPLATE_DIR = Path(__file__).parent.parent / "templates" / "schema_templates"
class SchemaGenerator:
"""Generate JSON-LD schema markup from templates."""
SCHEMA_TYPES = {
"organization": "organization.json",
"local_business": "local_business.json",
"product": "product.json",
"article": "article.json",
"faq": "faq.json",
"breadcrumb": "breadcrumb.json",
"website": "website.json",
}
# Business type mappings for LocalBusiness
BUSINESS_TYPES = {
"restaurant": "Restaurant",
"cafe": "CafeOrCoffeeShop",
"bar": "BarOrPub",
"hotel": "Hotel",
"store": "Store",
"medical": "MedicalBusiness",
"dental": "Dentist",
"legal": "LegalService",
"real_estate": "RealEstateAgent",
"auto": "AutoRepair",
"beauty": "BeautySalon",
"gym": "HealthClub",
"spa": "DaySpa",
}
# Article type mappings
ARTICLE_TYPES = {
"article": "Article",
"blog": "BlogPosting",
"news": "NewsArticle",
"tech": "TechArticle",
"scholarly": "ScholarlyArticle",
}
def __init__(self, template_dir: Path = TEMPLATE_DIR):
self.template_dir = template_dir
def load_template(self, schema_type: str) -> dict:
"""Load a schema template file."""
if schema_type not in self.SCHEMA_TYPES:
raise ValueError(f"Unknown schema type: {schema_type}. "
f"Available: {list(self.SCHEMA_TYPES.keys())}")
template_file = self.template_dir / self.SCHEMA_TYPES[schema_type]
if not template_file.exists():
raise FileNotFoundError(f"Template not found: {template_file}")
with open(template_file, "r", encoding="utf-8") as f:
return json.load(f)
def fill_template(self, template: dict, data: dict[str, Any]) -> dict:
"""Fill template placeholders with actual data."""
template_str = json.dumps(template, ensure_ascii=False)
# Replace placeholders {{key}} with values
for key, value in data.items():
placeholder = f"{{{{{key}}}}}"
if value is not None:
template_str = template_str.replace(placeholder, str(value))
# Remove unfilled placeholders and their parent objects if empty
result = json.loads(template_str)
return self._clean_empty_values(result)
def _clean_empty_values(self, obj: Any) -> Any:
"""Remove empty values and unfilled placeholders."""
if isinstance(obj, dict):
cleaned = {}
for key, value in obj.items():
cleaned_value = self._clean_empty_values(value)
# Skip if value is empty, None, or unfilled placeholder
if cleaned_value is None:
continue
if isinstance(cleaned_value, str) and cleaned_value.startswith("{{"):
continue
if isinstance(cleaned_value, (list, dict)) and not cleaned_value:
continue
cleaned[key] = cleaned_value
return cleaned if cleaned else None
elif isinstance(obj, list):
cleaned = []
for item in obj:
cleaned_item = self._clean_empty_values(item)
if cleaned_item is not None:
if isinstance(cleaned_item, str) and cleaned_item.startswith("{{"):
continue
cleaned.append(cleaned_item)
return cleaned if cleaned else None
elif isinstance(obj, str):
if obj.startswith("{{") and obj.endswith("}}"):
return None
return obj
return obj
def generate_organization(
self,
name: str,
url: str,
logo_url: str | None = None,
description: str | None = None,
founding_date: str | None = None,
phone: str | None = None,
address: dict | None = None,
social_links: list[str] | None = None,
) -> dict:
"""Generate Organization schema."""
template = self.load_template("organization")
data = {
"name": name,
"url": url,
"logo_url": logo_url,
"description": description,
"founding_date": founding_date,
"phone": phone,
}
if address:
data.update({
"street_address": address.get("street"),
"city": address.get("city"),
"region": address.get("region"),
"postal_code": address.get("postal_code"),
"country": address.get("country", "KR"),
})
if social_links:
# Handle social links specially
pass
return self.fill_template(template, data)
def generate_local_business(
self,
name: str,
business_type: str,
address: dict,
phone: str | None = None,
url: str | None = None,
description: str | None = None,
hours: dict | None = None,
geo: dict | None = None,
price_range: str | None = None,
rating: float | None = None,
review_count: int | None = None,
) -> dict:
"""Generate LocalBusiness schema."""
template = self.load_template("local_business")
schema_business_type = self.BUSINESS_TYPES.get(
business_type.lower(), "LocalBusiness"
)
data = {
"business_type": schema_business_type,
"name": name,
"url": url,
"description": description,
"phone": phone,
"price_range": price_range,
"street_address": address.get("street"),
"city": address.get("city"),
"region": address.get("region"),
"postal_code": address.get("postal_code"),
"country": address.get("country", "KR"),
}
if geo:
data["latitude"] = geo.get("lat")
data["longitude"] = geo.get("lng")
if hours:
data.update({
"weekday_opens": hours.get("weekday_opens", "09:00"),
"weekday_closes": hours.get("weekday_closes", "18:00"),
"weekend_opens": hours.get("weekend_opens"),
"weekend_closes": hours.get("weekend_closes"),
})
if rating is not None:
data["rating"] = str(rating)
data["review_count"] = str(review_count or 0)
return self.fill_template(template, data)
def generate_product(
self,
name: str,
description: str,
price: float,
currency: str = "KRW",
brand: str | None = None,
sku: str | None = None,
images: list[str] | None = None,
availability: str = "InStock",
condition: str = "NewCondition",
rating: float | None = None,
review_count: int | None = None,
url: str | None = None,
seller: str | None = None,
) -> dict:
"""Generate Product schema."""
template = self.load_template("product")
data = {
"name": name,
"description": description,
"price": str(int(price)),
"currency": currency,
"brand_name": brand,
"sku": sku,
"product_url": url,
"availability": availability,
"condition": condition,
"seller_name": seller,
}
if images:
for i, img in enumerate(images[:3], 1):
data[f"image_url_{i}"] = img
if rating is not None:
data["rating"] = str(rating)
data["review_count"] = str(review_count or 0)
return self.fill_template(template, data)
def generate_article(
self,
headline: str,
description: str,
author_name: str,
date_published: str,
publisher_name: str,
article_type: str = "article",
date_modified: str | None = None,
images: list[str] | None = None,
page_url: str | None = None,
publisher_logo: str | None = None,
author_url: str | None = None,
section: str | None = None,
word_count: int | None = None,
keywords: str | None = None,
) -> dict:
"""Generate Article schema."""
template = self.load_template("article")
schema_article_type = self.ARTICLE_TYPES.get(
article_type.lower(), "Article"
)
data = {
"article_type": schema_article_type,
"headline": headline,
"description": description,
"author_name": author_name,
"author_url": author_url,
"date_published": date_published,
"date_modified": date_modified or date_published,
"publisher_name": publisher_name,
"publisher_logo_url": publisher_logo,
"page_url": page_url,
"section": section,
"word_count": str(word_count) if word_count else None,
"keywords": keywords,
}
if images:
for i, img in enumerate(images[:2], 1):
data[f"image_url_{i}"] = img
return self.fill_template(template, data)
def generate_faq(self, questions: list[dict[str, str]]) -> dict:
"""Generate FAQPage schema."""
schema = {
"@context": "https://schema.org",
"@type": "FAQPage",
"mainEntity": [],
}
for qa in questions:
schema["mainEntity"].append({
"@type": "Question",
"name": qa["question"],
"acceptedAnswer": {
"@type": "Answer",
"text": qa["answer"],
},
})
return schema
def generate_breadcrumb(self, items: list[dict[str, str]]) -> dict:
"""Generate BreadcrumbList schema."""
schema = {
"@context": "https://schema.org",
"@type": "BreadcrumbList",
"itemListElement": [],
}
for i, item in enumerate(items, 1):
schema["itemListElement"].append({
"@type": "ListItem",
"position": i,
"name": item["name"],
"item": item["url"],
})
return schema
def generate_website(
self,
name: str,
url: str,
search_url_template: str | None = None,
description: str | None = None,
language: str = "ko-KR",
publisher_name: str | None = None,
logo_url: str | None = None,
alternate_name: str | None = None,
) -> dict:
"""Generate WebSite schema."""
template = self.load_template("website")
data = {
"site_name": name,
"url": url,
"description": description,
"language": language,
"search_url_template": search_url_template,
"publisher_name": publisher_name or name,
"logo_url": logo_url,
"alternate_name": alternate_name,
}
return self.fill_template(template, data)
def to_json_ld(self, schema: dict, pretty: bool = True) -> str:
"""Convert schema dict to JSON-LD string."""
indent = 2 if pretty else None
return json.dumps(schema, ensure_ascii=False, indent=indent)
def to_html_script(self, schema: dict) -> str:
"""Wrap schema in HTML script tag."""
json_ld = self.to_json_ld(schema)
return f'<script type="application/ld+json">\n{json_ld}\n</script>'
def _build_schema(generator: "SchemaGenerator", args) -> dict:
    """Build the schema selected by ``args.type`` from parsed CLI arguments."""
    kind = args.type
    if kind == "organization":
        return generator.generate_organization(
            name=args.name or "Organization Name",
            url=args.url or "https://example.com",
            description=args.description,
        )
    if kind == "product":
        return generator.generate_product(
            name=args.name or "Product Name",
            description=args.description or "Product description",
            price=args.price or 0,
            currency=args.currency,
        )
    if kind == "article":
        return generator.generate_article(
            headline=args.headline or args.name or "Article Title",
            description=args.description or "Article description",
            author_name=args.author or "Author",
            date_published=datetime.now().strftime("%Y-%m-%d"),
            publisher_name="Publisher",
        )
    if kind == "website":
        return generator.generate_website(
            name=args.name or "Website Name",
            url=args.url or "https://example.com",
            description=args.description,
        )
    if kind == "faq":
        # Example FAQ
        return generator.generate_faq([
            {"question": "Question 1?", "answer": "Answer 1"},
            {"question": "Question 2?", "answer": "Answer 2"},
        ])
    if kind == "breadcrumb":
        # Example breadcrumb
        return generator.generate_breadcrumb([
            {"name": "Home", "url": "https://example.com/"},
            {"name": "Category", "url": "https://example.com/category/"},
        ])
    if kind == "local_business":
        return generator.generate_local_business(
            name=args.name or "Business Name",
            business_type="store",
            address={"street": "123 Main St", "city": "Seoul", "country": "KR"},
            url=args.url,
            description=args.description,
        )
    # Unreachable for argparse-validated input; kept as a safety net.
    raise ValueError(f"Unsupported type: {args.type}")


def main():
    """CLI entry point: generate one schema and print or save it."""
    parser = argparse.ArgumentParser(
        description="Generate JSON-LD schema markup",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Generate Organization schema
  python schema_generator.py --type organization --name "My Company" --url "https://example.com"
  # Generate Product schema
  python schema_generator.py --type product --name "Widget" --price 29900 --currency KRW
  # Generate Article schema
  python schema_generator.py --type article --headline "Article Title" --author "John Doe"
""",
    )
    parser.add_argument(
        "--type", "-t",
        required=True,
        choices=SchemaGenerator.SCHEMA_TYPES.keys(),
        help="Schema type to generate",
    )
    parser.add_argument("--name", help="Name/title")
    parser.add_argument("--url", help="URL")
    parser.add_argument("--description", help="Description")
    parser.add_argument("--price", type=float, help="Price (for product)")
    parser.add_argument("--currency", default="KRW", help="Currency code")
    parser.add_argument("--headline", help="Headline (for article)")
    parser.add_argument("--author", help="Author name")
    parser.add_argument("--output", "-o", help="Output file path")
    parser.add_argument("--html", action="store_true", help="Output as HTML script tag")
    args = parser.parse_args()

    generator = SchemaGenerator()
    try:
        schema = _build_schema(generator, args)
        rendered = (
            generator.to_html_script(schema)
            if args.html
            else generator.to_json_ld(schema)
        )
        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(rendered)
            logger.info(f"Schema written to {args.output}")
        else:
            print(rendered)
    except Exception as e:
        logger.error(f"Error generating schema: {e}")
        raise


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,498 @@
"""
Schema Validator - Validate JSON-LD structured data markup
==========================================================
Purpose: Extract and validate schema.org structured data from URLs or files
Python: 3.10+
Usage:
python schema_validator.py --url https://example.com
python schema_validator.py --file schema.json
"""
import argparse
import json
import logging
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
try:
import extruct
HAS_EXTRUCT = True
except ImportError:
HAS_EXTRUCT = False
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
@dataclass
class ValidationIssue:
"""Represents a validation issue found in schema."""
severity: str # "error", "warning", "info"
message: str
schema_type: str | None = None
property_name: str | None = None
suggestion: str | None = None
@dataclass
class ValidationResult:
"""Complete validation result for a schema."""
url: str | None = None
schemas_found: list[dict] = field(default_factory=list)
issues: list[ValidationIssue] = field(default_factory=list)
valid: bool = True
rich_results_eligible: dict = field(default_factory=dict)
timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
def to_dict(self) -> dict:
"""Convert to dictionary for JSON output."""
return {
"url": self.url,
"schemas_found": len(self.schemas_found),
"schema_types": [s.get("@type", "Unknown") for s in self.schemas_found],
"valid": self.valid,
"issues": [
{
"severity": i.severity,
"message": i.message,
"schema_type": i.schema_type,
"property": i.property_name,
"suggestion": i.suggestion,
}
for i in self.issues
],
"rich_results_eligible": self.rich_results_eligible,
"timestamp": self.timestamp,
}
class SchemaValidator:
    """Validate schema.org structured data.

    Extracts structured data (JSON-LD always; Microdata/RDFa additionally
    when the optional ``extruct`` package is installed) from a URL, HTML
    string, or schema dict, then checks each schema object against
    required/recommended property tables and Google Rich Results rules.
    """

    # Required properties for common schema types.
    # A missing entry is reported with "error" severity and makes the
    # overall result invalid.
    REQUIRED_PROPERTIES = {
        "Organization": ["name", "url"],
        "LocalBusiness": ["name", "address"],
        "Product": ["name"],
        "Offer": ["price", "priceCurrency"],
        "Article": ["headline", "author", "datePublished", "publisher"],
        "BlogPosting": ["headline", "author", "datePublished", "publisher"],
        "NewsArticle": ["headline", "author", "datePublished", "publisher"],
        "FAQPage": ["mainEntity"],
        "Question": ["name", "acceptedAnswer"],
        "Answer": ["text"],
        "BreadcrumbList": ["itemListElement"],
        "ListItem": ["position", "name"],
        "WebSite": ["name", "url"],
        "WebPage": ["name"],
        "Person": ["name"],
        "Event": ["name", "startDate", "location"],
        "Review": ["reviewRating", "author"],
        "AggregateRating": ["ratingValue"],
        "ImageObject": ["url"],
    }

    # Recommended (but not required) properties.
    # A missing entry is reported with "info" severity only.
    RECOMMENDED_PROPERTIES = {
        "Organization": ["logo", "description", "contactPoint", "sameAs"],
        "LocalBusiness": ["telephone", "openingHoursSpecification", "geo", "image"],
        "Product": ["description", "image", "brand", "offers", "aggregateRating"],
        "Article": ["image", "dateModified", "description"],
        "FAQPage": [],
        "WebSite": ["potentialAction"],
        "BreadcrumbList": [],
    }

    # Google Rich Results eligible types; schemas of these types get an
    # additional eligibility check in _check_rich_results().
    RICH_RESULTS_TYPES = {
        "Article", "BlogPosting", "NewsArticle",
        "Product", "Review",
        "FAQPage", "HowTo",
        "LocalBusiness", "Restaurant",
        "Event",
        "Recipe",
        "JobPosting",
        "Course",
        "BreadcrumbList",
        "Organization",
        "WebSite",
        "VideoObject",
    }

    def __init__(self):
        """Create an HTTP session with a crawler-identifying User-Agent."""
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (compatible; SEOAuditBot/1.0)"
        })

    def extract_from_url(self, url: str) -> list[dict]:
        """Extract all structured data from a URL.

        Returns an empty list (after logging) on any network error.
        """
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return self.extract_from_html(response.text, url)
        except requests.RequestException as e:
            logger.error(f"Failed to fetch URL: {e}")
            return []

    def extract_from_html(self, html: str, base_url: str | None = None) -> list[dict]:
        """Extract structured data from HTML content.

        Combines extruct output (when installed) with a manual JSON-LD
        scan, then de-duplicates by serialized content.
        """
        schemas = []
        # Method 1: Use extruct if available (handles JSON-LD, Microdata, RDFa)
        if HAS_EXTRUCT:
            try:
                data = extruct.extract(html, base_url=base_url, uniform=True)
                schemas.extend(data.get("json-ld", []))
                schemas.extend(data.get("microdata", []))
                schemas.extend(data.get("rdfa", []))
            except Exception as e:
                logger.warning(f"extruct extraction failed: {e}")
        # Method 2: Manual JSON-LD extraction (fallback/additional)
        soup = BeautifulSoup(html, "html.parser")
        for script in soup.find_all("script", type="application/ld+json"):
            try:
                content = script.string
                if content:
                    data = json.loads(content)
                    # A single script tag may hold one object or a list.
                    if isinstance(data, list):
                        schemas.extend(data)
                    else:
                        schemas.append(data)
            except json.JSONDecodeError as e:
                logger.warning(f"Invalid JSON-LD: {e}")
        # Deduplicate schemas (extruct and the manual pass often both find
        # the same JSON-LD blocks); key on a canonical serialization.
        seen = set()
        unique_schemas = []
        for schema in schemas:
            schema_str = json.dumps(schema, sort_keys=True)
            if schema_str not in seen:
                seen.add(schema_str)
                unique_schemas.append(schema)
        return unique_schemas

    def validate(self, url: str | None = None, html: str | None = None,
                 schema: dict | None = None) -> ValidationResult:
        """Validate schema from URL, HTML, or direct schema dict.

        Exactly one of the three sources is used, checked in the order
        schema -> html -> url.

        Raises:
            ValueError: If none of url/html/schema is provided.
        """
        result = ValidationResult(url=url)
        # Extract schemas
        if schema:
            schemas = [schema]
        elif html:
            schemas = self.extract_from_html(html, url)
        elif url:
            schemas = self.extract_from_url(url)
        else:
            raise ValueError("Must provide url, html, or schema")
        result.schemas_found = schemas
        if not schemas:
            result.issues.append(ValidationIssue(
                severity="warning",
                message="No structured data found",
                suggestion="Add JSON-LD schema markup to improve SEO",
            ))
            # No structured data at all is treated as invalid even though
            # the issue severity is only "warning".
            result.valid = False
            return result
        # Validate each schema
        # NOTE(review): the loop variable shadows the `schema` parameter.
        for schema in schemas:
            self._validate_schema(schema, result)
        # Check for errors (warnings don't affect validity)
        result.valid = not any(i.severity == "error" for i in result.issues)
        return result

    def _validate_schema(self, schema: dict, result: ValidationResult,
                         parent_type: str | None = None) -> None:
        """Validate a single schema object, appending issues to *result*.

        Recurses into nested objects/lists that carry their own @type.
        """
        schema_type = schema.get("@type")
        if not schema_type:
            result.issues.append(ValidationIssue(
                severity="error",
                message="Missing @type property",
                schema_type=parent_type,
            ))
            return
        # Handle array of types; only the first listed type is validated.
        if isinstance(schema_type, list):
            schema_type = schema_type[0]
        # Check required properties
        required = self.REQUIRED_PROPERTIES.get(schema_type, [])
        for prop in required:
            if prop not in schema:
                result.issues.append(ValidationIssue(
                    severity="error",
                    message=f"Missing required property: {prop}",
                    schema_type=schema_type,
                    property_name=prop,
                    suggestion=f"Add '{prop}' property to {schema_type} schema",
                ))
        # Check recommended properties
        recommended = self.RECOMMENDED_PROPERTIES.get(schema_type, [])
        for prop in recommended:
            if prop not in schema:
                result.issues.append(ValidationIssue(
                    severity="info",
                    message=f"Missing recommended property: {prop}",
                    schema_type=schema_type,
                    property_name=prop,
                    suggestion=f"Consider adding '{prop}' for better rich results",
                ))
        # Check Rich Results eligibility
        # NOTE(review): keyed by type, so multiple schemas of the same type
        # overwrite each other's eligibility entry.
        if schema_type in self.RICH_RESULTS_TYPES:
            result.rich_results_eligible[schema_type] = self._check_rich_results(
                schema, schema_type
            )
        # Validate nested schemas (skip @context/@type/@id keys themselves).
        for key, value in schema.items():
            if key.startswith("@"):
                continue
            if isinstance(value, dict) and "@type" in value:
                self._validate_schema(value, result, schema_type)
            elif isinstance(value, list):
                for item in value:
                    if isinstance(item, dict) and "@type" in item:
                        self._validate_schema(item, result, schema_type)
        # Type-specific validations
        self._validate_type_specific(schema, schema_type, result)

    def _validate_type_specific(self, schema: dict, schema_type: str,
                                result: ValidationResult) -> None:
        """Type-specific validation rules for Article/Product/LocalBusiness/FAQ."""
        if schema_type in ("Article", "BlogPosting", "NewsArticle"):
            # Check image
            if "image" not in schema:
                result.issues.append(ValidationIssue(
                    severity="warning",
                    message="Article without image may not show in rich results",
                    schema_type=schema_type,
                    property_name="image",
                    suggestion="Add at least one image to the article",
                ))
            # Check headline length (110-char ceiling enforced here).
            headline = schema.get("headline", "")
            if len(headline) > 110:
                result.issues.append(ValidationIssue(
                    severity="warning",
                    message=f"Headline too long ({len(headline)} chars, max 110)",
                    schema_type=schema_type,
                    property_name="headline",
                ))
        elif schema_type == "Product":
            # NOTE(review): only a dict-valued "offers" is inspected; a list
            # of Offer objects is not validated here.
            offer = schema.get("offers", {})
            if isinstance(offer, dict):
                # Check price parses as a number.
                price = offer.get("price")
                if price is not None:
                    try:
                        float(price)
                    except (ValueError, TypeError):
                        result.issues.append(ValidationIssue(
                            severity="error",
                            message=f"Invalid price value: {price}",
                            schema_type="Offer",
                            property_name="price",
                        ))
                # Check availability against known schema.org values
                # (substring match, so full URLs also pass).
                availability = offer.get("availability", "")
                valid_availabilities = [
                    "InStock", "OutOfStock", "PreOrder", "Discontinued",
                    "https://schema.org/InStock", "https://schema.org/OutOfStock",
                ]
                if availability and not any(
                    a in availability for a in valid_availabilities
                ):
                    result.issues.append(ValidationIssue(
                        severity="warning",
                        message=f"Unknown availability value: {availability}",
                        schema_type="Offer",
                        property_name="availability",
                    ))
        elif schema_type == "LocalBusiness":
            # Check for geo coordinates
            if "geo" not in schema:
                result.issues.append(ValidationIssue(
                    severity="info",
                    message="Missing geo coordinates",
                    schema_type=schema_type,
                    property_name="geo",
                    suggestion="Add latitude/longitude for better local search",
                ))
        elif schema_type == "FAQPage":
            main_entity = schema.get("mainEntity", [])
            if not main_entity:
                result.issues.append(ValidationIssue(
                    severity="error",
                    message="FAQPage must have at least one question",
                    schema_type=schema_type,
                    property_name="mainEntity",
                ))
            elif len(main_entity) < 2:
                result.issues.append(ValidationIssue(
                    severity="info",
                    message="FAQPage has only one question",
                    schema_type=schema_type,
                    suggestion="Add more questions for better rich results",
                ))

    def _check_rich_results(self, schema: dict, schema_type: str) -> dict:
        """Check if schema is eligible for Google Rich Results.

        Returns a dict with "eligible" (bool) and
        "missing_for_rich_results" (list of missing property names).
        Types without specific rules here default to eligible.
        """
        result = {
            "eligible": True,
            "missing_for_rich_results": [],
        }
        if schema_type in ("Article", "BlogPosting", "NewsArticle"):
            required_for_rich = ["headline", "image", "datePublished", "author"]
            for prop in required_for_rich:
                if prop not in schema:
                    result["eligible"] = False
                    result["missing_for_rich_results"].append(prop)
        elif schema_type == "Product":
            if "name" not in schema:
                result["eligible"] = False
                result["missing_for_rich_results"].append("name")
            offer = schema.get("offers")
            if not offer:
                result["eligible"] = False
                result["missing_for_rich_results"].append("offers")
        elif schema_type == "FAQPage":
            if not schema.get("mainEntity"):
                result["eligible"] = False
                result["missing_for_rich_results"].append("mainEntity")
        return result

    def generate_report(self, result: ValidationResult) -> str:
        """Generate a human-readable, plain-text validation report.

        Issues are grouped by severity (errors, then warnings, then info).
        """
        lines = [
            "=" * 60,
            "Schema Validation Report",
            "=" * 60,
            f"URL: {result.url or 'N/A'}",
            f"Timestamp: {result.timestamp}",
            f"Valid: {'Yes' if result.valid else 'No'}",
            f"Schemas Found: {len(result.schemas_found)}",
            "",
        ]
        if result.schemas_found:
            lines.append("Schema Types:")
            for schema in result.schemas_found:
                schema_type = schema.get("@type", "Unknown")
                lines.append(f"  - {schema_type}")
            lines.append("")
        if result.rich_results_eligible:
            lines.append("Rich Results Eligibility:")
            for schema_type, status in result.rich_results_eligible.items():
                eligible = "Yes" if status["eligible"] else "No"
                lines.append(f"  - {schema_type}: {eligible}")
                if status["missing_for_rich_results"]:
                    missing = ", ".join(status["missing_for_rich_results"])
                    lines.append(f"    Missing: {missing}")
            lines.append("")
        if result.issues:
            lines.append("Issues Found:")
            errors = [i for i in result.issues if i.severity == "error"]
            warnings = [i for i in result.issues if i.severity == "warning"]
            infos = [i for i in result.issues if i.severity == "info"]
            if errors:
                lines.append(f"\n  ERRORS ({len(errors)}):")
                for issue in errors:
                    lines.append(f"  - [{issue.schema_type}] {issue.message}")
                    if issue.suggestion:
                        lines.append(f"    Suggestion: {issue.suggestion}")
            if warnings:
                lines.append(f"\n  WARNINGS ({len(warnings)}):")
                for issue in warnings:
                    lines.append(f"  - [{issue.schema_type}] {issue.message}")
                    if issue.suggestion:
                        lines.append(f"    Suggestion: {issue.suggestion}")
            if infos:
                lines.append(f"\n  INFO ({len(infos)}):")
                for issue in infos:
                    lines.append(f"  - [{issue.schema_type}] {issue.message}")
                    if issue.suggestion:
                        lines.append(f"    Suggestion: {issue.suggestion}")
        lines.append("")
        lines.append("=" * 60)
        return "\n".join(lines)
def main():
    """CLI entry point: validate structured data from a URL or a JSON-LD file."""
    parser = argparse.ArgumentParser(
        description="Validate schema.org structured data",
    )
    parser.add_argument("--url", "-u", help="URL to validate")
    parser.add_argument("--file", "-f", help="JSON-LD file to validate")
    parser.add_argument("--output", "-o", help="Output file for JSON report")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()

    if not args.url and not args.file:
        parser.error("Must provide --url or --file")

    validator = SchemaValidator()
    if args.file:
        # A file takes precedence over --url when both are given.
        with open(args.file, "r", encoding="utf-8") as f:
            result = validator.validate(schema=json.load(f))
    else:
        result = validator.validate(url=args.url)

    if not (args.json or args.output):
        print(validator.generate_report(result))
        return
    payload = json.dumps(result.to_dict(), ensure_ascii=False, indent=2)
    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(payload)
        logger.info(f"Report written to {args.output}")
    else:
        print(payload)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,969 @@
"""
Sitemap Crawler - Sequential page analysis from sitemap
=======================================================
Purpose: Crawl sitemap URLs one by one, analyze each page, save to Notion
Python: 3.10+
Usage:
from sitemap_crawler import SitemapCrawler
crawler = SitemapCrawler()
crawler.crawl_sitemap("https://example.com/sitemap.xml", delay=2.0)
"""
import json
import logging
import time
import xml.etree.ElementTree as ET
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Callable, Generator
from urllib.parse import urlparse
import requests
from notion_client import Client
from base_client import config
from page_analyzer import PageAnalyzer, PageMetadata
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
# Default database for page analysis data
DEFAULT_PAGES_DATABASE_ID = "2c8581e5-8a1e-8035-880b-e38cefc2f3ef"

# Default limits to prevent excessive resource usage
DEFAULT_MAX_PAGES = 500
DEFAULT_DELAY_SECONDS = 2.0

# Progress tracking directory
# NOTE: created eagerly here, so importing this module has the side effect
# of writing a directory under the user's home.
PROGRESS_DIR = Path.home() / ".claude" / "seo-audit-progress"
PROGRESS_DIR.mkdir(parents=True, exist_ok=True)
@dataclass
class CrawlProgress:
"""Track crawl progress."""
total_urls: int = 0
processed_urls: int = 0
successful_urls: int = 0
failed_urls: int = 0
skipped_urls: int = 0
start_time: datetime = field(default_factory=datetime.now)
current_url: str = ""
audit_id: str = ""
site: str = ""
status: str = "running" # running, completed, failed
error_message: str = ""
summary_page_id: str = ""
def get_progress_percent(self) -> float:
if self.total_urls == 0:
return 0.0
return (self.processed_urls / self.total_urls) * 100
def get_elapsed_time(self) -> str:
elapsed = datetime.now() - self.start_time
minutes = int(elapsed.total_seconds() // 60)
seconds = int(elapsed.total_seconds() % 60)
return f"{minutes}m {seconds}s"
def get_eta(self) -> str:
if self.processed_urls == 0:
return "calculating..."
elapsed = (datetime.now() - self.start_time).total_seconds()
avg_time_per_url = elapsed / self.processed_urls
remaining_urls = self.total_urls - self.processed_urls
eta_seconds = remaining_urls * avg_time_per_url
minutes = int(eta_seconds // 60)
seconds = int(eta_seconds % 60)
return f"{minutes}m {seconds}s"
def to_dict(self) -> dict:
"""Convert to dictionary for JSON serialization."""
return {
"audit_id": self.audit_id,
"site": self.site,
"status": self.status,
"total_urls": self.total_urls,
"processed_urls": self.processed_urls,
"successful_urls": self.successful_urls,
"failed_urls": self.failed_urls,
"progress_percent": round(self.get_progress_percent(), 1),
"elapsed_time": self.get_elapsed_time(),
"eta": self.get_eta(),
"current_url": self.current_url,
"start_time": self.start_time.isoformat(),
"error_message": self.error_message,
"summary_page_id": self.summary_page_id,
"updated_at": datetime.now().isoformat(),
}
def save_to_file(self, filepath: Path | None = None) -> Path:
"""Save progress to JSON file."""
if filepath is None:
filepath = PROGRESS_DIR / f"{self.audit_id}.json"
with open(filepath, "w") as f:
json.dump(self.to_dict(), f, indent=2)
return filepath
@classmethod
def load_from_file(cls, filepath: Path) -> "CrawlProgress":
"""Load progress from JSON file."""
with open(filepath, "r") as f:
data = json.load(f)
progress = cls()
progress.audit_id = data.get("audit_id", "")
progress.site = data.get("site", "")
progress.status = data.get("status", "unknown")
progress.total_urls = data.get("total_urls", 0)
progress.processed_urls = data.get("processed_urls", 0)
progress.successful_urls = data.get("successful_urls", 0)
progress.failed_urls = data.get("failed_urls", 0)
progress.current_url = data.get("current_url", "")
progress.error_message = data.get("error_message", "")
progress.summary_page_id = data.get("summary_page_id", "")
if data.get("start_time"):
progress.start_time = datetime.fromisoformat(data["start_time"])
return progress
def get_active_crawls() -> list[CrawlProgress]:
    """Return every crawl job on disk whose status is still "running".

    Unreadable or malformed progress files are silently skipped.
    """
    running_jobs = []
    for progress_file in PROGRESS_DIR.glob("*.json"):
        try:
            job = CrawlProgress.load_from_file(progress_file)
            if job.status == "running":
                running_jobs.append(job)
        except Exception:
            continue
    return running_jobs
def get_all_crawls() -> list[CrawlProgress]:
    """Return all crawl jobs (active and completed).

    Ordered by reverse-lexicographic file name; unreadable files are skipped.
    """
    jobs = []
    for progress_file in sorted(PROGRESS_DIR.glob("*.json"), reverse=True):
        try:
            jobs.append(CrawlProgress.load_from_file(progress_file))
        except Exception:
            continue
    return jobs
def get_crawl_status(audit_id: str) -> CrawlProgress | None:
    """Return the saved progress for *audit_id*, or None if no file exists."""
    progress_file = PROGRESS_DIR / f"{audit_id}.json"
    if not progress_file.exists():
        return None
    return CrawlProgress.load_from_file(progress_file)
@dataclass
class CrawlResult:
    """Summary of a completed sitemap crawl."""

    site: str
    sitemap_url: str
    audit_id: str
    total_pages: int
    successful_pages: int
    failed_pages: int
    start_time: datetime
    end_time: datetime
    pages_analyzed: list[PageMetadata] = field(default_factory=list)
    notion_page_ids: list[str] = field(default_factory=list)
    summary_page_id: str | None = None

    def get_duration(self) -> str:
        """Return the total crawl duration formatted as 'Xm Ys'."""
        total_seconds = int((self.end_time - self.start_time).total_seconds())
        minutes, seconds = divmod(total_seconds, 60)
        return f"{minutes}m {seconds}s"
class SitemapCrawler:
    """Crawl sitemap URLs and analyze each page.

    Fetches a sitemap (recursing into sitemap indexes), runs PageAnalyzer on
    each URL sequentially with a polite delay, persists progress through
    CrawlProgress JSON files, and optionally writes a per-page report plus a
    crawl summary into a Notion database.
    """
    def __init__(
        self,
        notion_token: str | None = None,
        database_id: str | None = None,
    ) -> None:
        """
        Initialize sitemap crawler.
        Args:
            notion_token: Notion API token (falls back to config.notion_token)
            database_id: Notion database ID for storing results
                (falls back to DEFAULT_PAGES_DATABASE_ID)
        """
        self.notion_token = notion_token or config.notion_token
        self.database_id = database_id or DEFAULT_PAGES_DATABASE_ID
        self.analyzer = PageAnalyzer()
        # Without a token the crawl still runs; results are simply not persisted.
        if self.notion_token:
            self.notion = Client(auth=self.notion_token)
        else:
            self.notion = None
            logger.warning("Notion token not configured, results will not be saved")
    def fetch_sitemap_urls(self, sitemap_url: str) -> list[str]:
        """
        Fetch and parse URLs from a sitemap.

        Handles both plain <urlset> sitemaps and <sitemapindex> files; child
        sitemaps are fetched recursively and their URLs merged.
        Args:
            sitemap_url: URL of the sitemap
        Returns:
            List of URLs found in the sitemap (deduplicated, order preserved)
        Raises:
            Exception: re-raised after logging if the fetch or XML parse fails
        """
        try:
            response = requests.get(sitemap_url, timeout=30)
            response.raise_for_status()
            # Parse XML
            root = ET.fromstring(response.content)
            # Namespace map for the standard sitemap schema
            namespaces = {
                "sm": "http://www.sitemaps.org/schemas/sitemap/0.9"
            }
            urls = []
            # Check if this is a sitemap index
            sitemap_tags = root.findall(".//sm:sitemap/sm:loc", namespaces)
            if sitemap_tags:
                # This is a sitemap index, recursively fetch child sitemaps
                logger.info(f"Found sitemap index with {len(sitemap_tags)} child sitemaps")
                for loc in sitemap_tags:
                    if loc.text:
                        child_urls = self.fetch_sitemap_urls(loc.text)
                        urls.extend(child_urls)
            else:
                # Regular sitemap, extract URLs
                url_tags = root.findall(".//sm:url/sm:loc", namespaces)
                if not url_tags:
                    # Try without namespace (some sitemaps omit the xmlns)
                    url_tags = root.findall(".//url/loc")
                for loc in url_tags:
                    if loc.text:
                        urls.append(loc.text)
            # Remove duplicates while preserving order
            seen = set()
            unique_urls = []
            for url in urls:
                if url not in seen:
                    seen.add(url)
                    unique_urls.append(url)
            logger.info(f"Found {len(unique_urls)} unique URLs in sitemap")
            return unique_urls
        except Exception as e:
            logger.error(f"Failed to fetch sitemap: {e}")
            raise
    def crawl_sitemap(
        self,
        sitemap_url: str,
        delay: float = DEFAULT_DELAY_SECONDS,
        max_pages: int = DEFAULT_MAX_PAGES,
        progress_callback: Callable[[CrawlProgress], None] | None = None,
        save_to_notion: bool = True,
        url_filter: Callable[[str], bool] | None = None,
    ) -> CrawlResult:
        """
        Crawl all URLs in a sitemap sequentially.
        Args:
            sitemap_url: URL of the sitemap
            delay: Seconds to wait between requests (default: 2.0s)
            max_pages: Maximum number of pages to process (default: 500)
            progress_callback: Function called with progress updates
            save_to_notion: Whether to save results to Notion
            url_filter: Optional function to filter URLs (return True to include)
        Returns:
            CrawlResult with all analyzed pages
        Raises:
            Exception: re-raised after the progress file is marked "failed"
        """
        # Parse site info
        parsed_sitemap = urlparse(sitemap_url)
        site = f"{parsed_sitemap.scheme}://{parsed_sitemap.netloc}"
        site_domain = parsed_sitemap.netloc
        # Generate audit ID (domain + timestamp keeps runs distinguishable)
        audit_id = f"{site_domain}-pages-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
        logger.info(f"Starting sitemap crawl: {sitemap_url}")
        logger.info(f"Audit ID: {audit_id}")
        logger.info(f"Delay between requests: {delay}s")
        # Initialize progress tracking
        progress = CrawlProgress(
            audit_id=audit_id,
            site=site,
            status="running",
        )
        # Fetch URLs
        urls = self.fetch_sitemap_urls(sitemap_url)
        # Apply URL filter if provided
        if url_filter:
            urls = [url for url in urls if url_filter(url)]
            logger.info(f"After filtering: {len(urls)} URLs")
        # Apply max pages limit (default: 500 to prevent excessive resource usage)
        if len(urls) > max_pages:
            logger.warning(f"Sitemap has {len(urls)} URLs, limiting to {max_pages} pages")
            logger.warning(f"Use max_pages parameter to adjust this limit")
            urls = urls[:max_pages]
        logger.info(f"Processing {len(urls)} pages (max: {max_pages})")
        # Update progress with total URLs
        progress.total_urls = len(urls)
        progress.save_to_file()
        # Initialize result
        result = CrawlResult(
            site=site,
            sitemap_url=sitemap_url,
            audit_id=audit_id,
            total_pages=len(urls),
            successful_pages=0,
            failed_pages=0,
            start_time=datetime.now(),
            end_time=datetime.now(),
        )
        # Process each URL
        try:
            for i, url in enumerate(urls):
                progress.current_url = url
                progress.processed_urls = i
                progress.save_to_file()  # Save progress to file
                if progress_callback:
                    progress_callback(progress)
                logger.info(f"[{i+1}/{len(urls)}] Analyzing: {url}")
                try:
                    # Analyze page
                    metadata = self.analyzer.analyze_url(url)
                    result.pages_analyzed.append(metadata)
                    if metadata.status_code == 200:
                        progress.successful_urls += 1
                        result.successful_pages += 1
                        # Save to Notion
                        if save_to_notion and self.notion:
                            page_id = self._save_page_to_notion(metadata, audit_id, site)
                            if page_id:
                                result.notion_page_ids.append(page_id)
                    else:
                        progress.failed_urls += 1
                        result.failed_pages += 1
                except Exception as e:
                    # A single bad page must not abort the whole crawl.
                    logger.error(f"Failed to analyze {url}: {e}")
                    progress.failed_urls += 1
                    result.failed_pages += 1
                # Wait before next request
                if i < len(urls) - 1:  # Don't wait after last URL
                    time.sleep(delay)
            # Final progress update
            progress.processed_urls = len(urls)
            progress.status = "completed"
            if progress_callback:
                progress_callback(progress)
        except Exception as e:
            # Fatal error: persist the failure so status commands can report it.
            progress.status = "failed"
            progress.error_message = str(e)
            progress.save_to_file()
            raise
        # Update result
        result.end_time = datetime.now()
        # Create summary page
        if save_to_notion and self.notion:
            summary_id = self._create_crawl_summary_page(result)
            result.summary_page_id = summary_id
            progress.summary_page_id = summary_id
        # Save final progress
        progress.save_to_file()
        logger.info(f"Crawl complete: {result.successful_pages}/{result.total_pages} pages analyzed")
        logger.info(f"Duration: {result.get_duration()}")
        return result
    def _save_page_to_notion(
        self,
        metadata: PageMetadata,
        audit_id: str,
        site: str,
    ) -> str | None:
        """Save page metadata to Notion database.

        Returns the created Notion page id, or None if the API call failed
        (failures are logged, never raised, so the crawl keeps going).
        """
        try:
            # Build properties for the database row
            properties = {
                "Issue": {"title": [{"text": {"content": f"📄 {metadata.url}"}}]},
                "Category": {"select": {"name": "On-page SEO"}},
                "Priority": {"select": {"name": self._determine_priority(metadata)}},
                "Site": {"url": site},
                "URL": {"url": metadata.url},
                "Audit ID": {"rich_text": [{"text": {"content": audit_id}}]},
                "Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
            }
            # Build page content
            children = self._build_page_content(metadata)
            response = self.notion.pages.create(
                parent={"database_id": self.database_id},
                properties=properties,
                children=children,
            )
            return response["id"]
        except Exception as e:
            logger.error(f"Failed to save to Notion: {e}")
            return None
    def _determine_priority(self, metadata: PageMetadata) -> str:
        """Map issue/warning counts to a Notion Priority value.

        3+ issues -> High; any issue or 3+ warnings -> Medium; else Low.
        """
        if len(metadata.issues) >= 3:
            return "High"
        elif len(metadata.issues) >= 1:
            return "Medium"
        elif len(metadata.warnings) >= 3:
            return "Medium"
        else:
            return "Low"
    def _build_page_content(self, metadata: PageMetadata) -> list[dict]:
        """Build Notion page content blocks from metadata.

        Emits a fixed sequence of sections: status callout, meta-tag table,
        headings, structured data, Open Graph, links, images, hreflang
        (when present), and an issues/warnings checklist.
        """
        children = []
        # Status summary callout; color escalates with the issue count.
        status_emoji = "" if not metadata.issues else "⚠️" if len(metadata.issues) < 3 else ""
        children.append({
            "object": "block",
            "type": "callout",
            "callout": {
                "rich_text": [
                    {"type": "text", "text": {"content": f"Status: {metadata.status_code} | "}},
                    {"type": "text", "text": {"content": f"Response: {metadata.response_time_ms:.0f}ms | "}},
                    {"type": "text", "text": {"content": f"Issues: {len(metadata.issues)} | "}},
                    {"type": "text", "text": {"content": f"Warnings: {len(metadata.warnings)}"}},
                ],
                "icon": {"type": "emoji", "emoji": status_emoji},
                "color": "gray_background" if not metadata.issues else "yellow_background" if len(metadata.issues) < 3 else "red_background",
            }
        })
        # Meta Tags Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Meta Tags"}}]}
        })
        # Meta tags table (values truncated to 50 chars to keep cells compact)
        meta_rows = [
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Tag"}, "annotations": {"bold": True}}],
                [{"type": "text", "text": {"content": "Value"}, "annotations": {"bold": True}}],
                [{"type": "text", "text": {"content": "Status"}, "annotations": {"bold": True}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Title"}}],
                [{"type": "text", "text": {"content": (metadata.title or "")[:50]}}],
                [{"type": "text", "text": {"content": f"{metadata.title_length} chars" if metadata.title else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Description"}}],
                [{"type": "text", "text": {"content": (metadata.meta_description or "")[:50]}}],
                [{"type": "text", "text": {"content": f"{metadata.meta_description_length} chars" if metadata.meta_description else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Canonical"}}],
                [{"type": "text", "text": {"content": (metadata.canonical_url or "")[:50]}}],
                [{"type": "text", "text": {"content": "" if metadata.canonical_url else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Robots"}}],
                [{"type": "text", "text": {"content": metadata.robots_meta or ""}}],
                [{"type": "text", "text": {"content": "" if metadata.robots_meta else ""}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Lang"}}],
                [{"type": "text", "text": {"content": metadata.html_lang or ""}}],
                [{"type": "text", "text": {"content": "" if metadata.html_lang else ""}}],
            ]}},
        ]
        children.append({
            "object": "block",
            "type": "table",
            "table": {
                "table_width": 3,
                "has_column_header": True,
                "has_row_header": False,
                "children": meta_rows
            }
        })
        # Headings Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Headings"}}]}
        })
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"H1: {metadata.h1_count} | "}},
                {"type": "text", "text": {"content": f"Total headings: {len(metadata.headings)}"}},
            ]}
        })
        # Quote the first H1 text (truncated to 200 chars) when one exists.
        if metadata.h1_text:
            children.append({
                "object": "block",
                "type": "quote",
                "quote": {"rich_text": [{"type": "text", "text": {"content": metadata.h1_text[:200]}}]}
            })
        # Schema Data Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Structured Data"}}]}
        })
        if metadata.schema_types_found:
            children.append({
                "object": "block",
                "type": "paragraph",
                "paragraph": {"rich_text": [
                    {"type": "text", "text": {"content": "Schema types found: "}},
                    {"type": "text", "text": {"content": ", ".join(metadata.schema_types_found)}, "annotations": {"code": True}},
                ]}
            })
        else:
            # No structured data is worth surfacing prominently as a warning.
            children.append({
                "object": "block",
                "type": "callout",
                "callout": {
                    "rich_text": [{"type": "text", "text": {"content": "No structured data found on this page"}}],
                    "icon": {"type": "emoji", "emoji": "⚠️"},
                    "color": "yellow_background",
                }
            })
        # Open Graph Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Open Graph"}}]}
        })
        og = metadata.open_graph
        # og:title presence is used as the proxy for "OG configured at all".
        og_status = "✓ Configured" if og.og_title else "✗ Missing"
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Status: {og_status}\n"}},
                {"type": "text", "text": {"content": f"og:title: {og.og_title or ''}\n"}},
                {"type": "text", "text": {"content": f"og:type: {og.og_type or ''}"}},
            ]}
        })
        # Links Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Links"}}]}
        })
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Internal links: {metadata.internal_link_count}\n"}},
                {"type": "text", "text": {"content": f"External links: {metadata.external_link_count}"}},
            ]}
        })
        # Images Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Images"}}]}
        })
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Total: {metadata.images_total} | "}},
                {"type": "text", "text": {"content": f"With alt: {metadata.images_with_alt} | "}},
                {"type": "text", "text": {"content": f"Without alt: {metadata.images_without_alt}"}},
            ]}
        })
        # Hreflang Section (if present); capped at 10 rows to keep the page short.
        if metadata.hreflang_tags:
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Hreflang Tags"}}]}
            })
            for tag in metadata.hreflang_tags[:10]:
                children.append({
                    "object": "block",
                    "type": "bulleted_list_item",
                    "bulleted_list_item": {"rich_text": [
                        {"type": "text", "text": {"content": f"{tag['lang']}: "}},
                        {"type": "text", "text": {"content": tag['url'], "link": {"url": tag['url']}}},
                    ]}
                })
        # Issues & Warnings Section: each item becomes an unchecked to-do.
        if metadata.issues or metadata.warnings:
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Issues & Warnings"}}]}
            })
            for issue in metadata.issues:
                children.append({
                    "object": "block",
                    "type": "to_do",
                    "to_do": {
                        "rich_text": [
                            {"type": "text", "text": {"content": ""}, "annotations": {"bold": True}},
                            {"type": "text", "text": {"content": issue}},
                        ],
                        "checked": False,
                    }
                })
            for warning in metadata.warnings:
                children.append({
                    "object": "block",
                    "type": "to_do",
                    "to_do": {
                        "rich_text": [
                            {"type": "text", "text": {"content": "⚠️ "}, "annotations": {"bold": True}},
                            {"type": "text", "text": {"content": warning}},
                        ],
                        "checked": False,
                    }
                })
        return children
    def _create_crawl_summary_page(self, result: CrawlResult) -> str | None:
        """Create a summary page for the crawl.

        Builds a header callout, an aggregate statistics table, and a pointer
        to the per-page rows (filterable by Audit ID). Returns the created
        page id, or None on failure (logged, not raised).
        """
        try:
            site_domain = urlparse(result.site).netloc
            # Calculate aggregate statistics over all analyzed pages
            total_issues = sum(len(p.issues) for p in result.pages_analyzed)
            total_warnings = sum(len(p.warnings) for p in result.pages_analyzed)
            pages_with_issues = sum(1 for p in result.pages_analyzed if p.issues)
            pages_without_schema = sum(1 for p in result.pages_analyzed if not p.schema_types_found)
            pages_without_description = sum(1 for p in result.pages_analyzed if not p.meta_description)
            children = []
            # Header callout
            children.append({
                "object": "block",
                "type": "callout",
                "callout": {
                    "rich_text": [
                        {"type": "text", "text": {"content": f"Sitemap Crawl Complete\n\n"}},
                        {"type": "text", "text": {"content": f"Audit ID: {result.audit_id}\n"}},
                        {"type": "text", "text": {"content": f"Duration: {result.get_duration()}\n"}},
                        {"type": "text", "text": {"content": f"Pages: {result.successful_pages}/{result.total_pages}"}},
                    ],
                    "icon": {"type": "emoji", "emoji": "📊"},
                    "color": "blue_background",
                }
            })
            # Statistics table
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Statistics"}}]}
            })
            stats_rows = [
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Metric"}, "annotations": {"bold": True}}],
                    [{"type": "text", "text": {"content": "Count"}, "annotations": {"bold": True}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Pages"}}],
                    [{"type": "text", "text": {"content": str(result.total_pages)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Successfully Analyzed"}}],
                    [{"type": "text", "text": {"content": str(result.successful_pages)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages with Issues"}}],
                    [{"type": "text", "text": {"content": str(pages_with_issues)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Issues"}}],
                    [{"type": "text", "text": {"content": str(total_issues)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Warnings"}}],
                    [{"type": "text", "text": {"content": str(total_warnings)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages without Schema"}}],
                    [{"type": "text", "text": {"content": str(pages_without_schema)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages without Description"}}],
                    [{"type": "text", "text": {"content": str(pages_without_description)}}],
                ]}},
            ]
            children.append({
                "object": "block",
                "type": "table",
                "table": {
                    "table_width": 2,
                    "has_column_header": True,
                    "has_row_header": False,
                    "children": stats_rows
                }
            })
            # Pages list
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Analyzed Pages"}}]}
            })
            children.append({
                "object": "block",
                "type": "paragraph",
                "paragraph": {"rich_text": [
                    {"type": "text", "text": {"content": f"Filter by Audit ID in the database to see all {result.successful_pages} page entries."}}
                ]}
            })
            # Create the summary page
            response = self.notion.pages.create(
                parent={"database_id": self.database_id},
                properties={
                    "Issue": {"title": [{"text": {"content": f"📊 Sitemap Crawl: {site_domain}"}}]},
                    "Category": {"select": {"name": "Technical SEO"}},
                    "Priority": {"select": {"name": "High"}},
                    "Site": {"url": result.site},
                    "Audit ID": {"rich_text": [{"text": {"content": result.audit_id}}]},
                    "Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
                },
                children=children,
            )
            logger.info(f"Created crawl summary page: {response['id']}")
            return response["id"]
        except Exception as e:
            logger.error(f"Failed to create summary page: {e}")
            return None
def print_progress_status(progress: CrawlProgress) -> None:
    """Print a formatted progress banner for one crawl job to stdout.

    Shows counters, elapsed time, ETA (only meaningful while running), the
    current URL (truncated to 60 chars), and a Notion summary link / error
    message when available.
    """
    status_emoji = {
        "running": "🔄",
        "completed": "",
        "failed": "",
    }.get(progress.status, "")
    print(f"""
{'=' * 60}
{status_emoji} SEO Page Analysis - {progress.status.upper()}
{'=' * 60}
Audit ID: {progress.audit_id}
Site: {progress.site}
Status: {progress.status}
Progress: {progress.processed_urls}/{progress.total_urls} pages ({progress.get_progress_percent():.1f}%)
Successful: {progress.successful_urls}
Failed: {progress.failed_urls}
Elapsed: {progress.get_elapsed_time()}
ETA: {progress.get_eta() if progress.status == 'running' else 'N/A'}
Current URL: {progress.current_url[:60] + '...' if len(progress.current_url) > 60 else progress.current_url}
""")
    # Notion page URLs drop the dashes from the page id.
    if progress.summary_page_id:
        print(f"Summary: https://www.notion.so/{progress.summary_page_id.replace('-', '')}")
    if progress.error_message:
        print(f"Error: {progress.error_message}")
    print("=" * 60)
def main() -> None:
    """CLI entry point.

    Subcommands:
        crawl <sitemap_url>  -- start a crawl (also the implicit default when
                                the first argv looks like a URL/.xml path)
        status [audit_id]    -- show one crawl, or active crawls (--all: all)
        list                 -- tabulate the most recent 20 crawl jobs
    """
    import argparse
    parser = argparse.ArgumentParser(description="Sitemap Crawler with Background Support")
    subparsers = parser.add_subparsers(dest="command", help="Commands")
    # Crawl command
    crawl_parser = subparsers.add_parser("crawl", help="Start crawling a sitemap")
    crawl_parser.add_argument("sitemap_url", help="URL of the sitemap to crawl")
    crawl_parser.add_argument("--delay", "-d", type=float, default=DEFAULT_DELAY_SECONDS,
                              help=f"Delay between requests in seconds (default: {DEFAULT_DELAY_SECONDS})")
    crawl_parser.add_argument("--max-pages", "-m", type=int, default=DEFAULT_MAX_PAGES,
                              help=f"Maximum pages to process (default: {DEFAULT_MAX_PAGES})")
    crawl_parser.add_argument("--no-notion", action="store_true",
                              help="Don't save to Notion")
    crawl_parser.add_argument("--no-limit", action="store_true",
                              help="Remove page limit (use with caution)")
    # Status command
    status_parser = subparsers.add_parser("status", help="Check crawl progress")
    status_parser.add_argument("audit_id", nargs="?", help="Specific audit ID to check (optional)")
    status_parser.add_argument("--all", "-a", action="store_true", help="Show all crawls (not just active)")
    # List command
    list_parser = subparsers.add_parser("list", help="List all crawl jobs")
    args = parser.parse_args()
    # Default to crawl if no command specified but URL provided
    if args.command is None:
        # Check if first positional arg looks like a URL; the crawl-command
        # attributes are filled in manually since the subparser never ran.
        import sys
        if len(sys.argv) > 1 and (sys.argv[1].startswith("http") or sys.argv[1].endswith(".xml")):
            args.command = "crawl"
            args.sitemap_url = sys.argv[1]
            args.delay = DEFAULT_DELAY_SECONDS
            args.max_pages = DEFAULT_MAX_PAGES
            args.no_notion = False
            args.no_limit = False
        else:
            parser.print_help()
            return
    if args.command == "status":
        if args.audit_id:
            # Show specific crawl status
            progress = get_crawl_status(args.audit_id)
            if progress:
                print_progress_status(progress)
            else:
                print(f"No crawl found with audit ID: {args.audit_id}")
        else:
            # Show active crawls (or every crawl with --all)
            if args.all:
                crawls = get_all_crawls()
                label = "All"
            else:
                crawls = get_active_crawls()
                label = "Active"
            if crawls:
                print(f"\n{label} Crawl Jobs ({len(crawls)}):")
                print("-" * 60)
                for p in crawls:
                    status_emoji = {"running": "🔄", "completed": "", "failed": ""}.get(p.status, "")
                    print(f"{status_emoji} {p.audit_id}")
                    print(f"  Site: {p.site}")
                    print(f"  Progress: {p.processed_urls}/{p.total_urls} ({p.get_progress_percent():.1f}%)")
                    print()
            else:
                print(f"No {label.lower()} crawl jobs found.")
        return
    if args.command == "list":
        crawls = get_all_crawls()
        if crawls:
            print(f"\nAll Crawl Jobs ({len(crawls)}):")
            print("-" * 80)
            print(f"{'Status':<10} {'Audit ID':<45} {'Progress':<15}")
            print("-" * 80)
            for p in crawls[:20]:  # Show last 20
                status_emoji = {"running": "🔄", "completed": "", "failed": ""}.get(p.status, "")
                progress_str = f"{p.processed_urls}/{p.total_urls}"
                print(f"{status_emoji} {p.status:<7} {p.audit_id:<45} {progress_str:<15}")
            if len(crawls) > 20:
                print(f"... and {len(crawls) - 20} more")
        else:
            print("No crawl jobs found.")
        return
    if args.command == "crawl":
        # Handle --no-limit option
        max_pages = args.max_pages
        if args.no_limit:
            max_pages = 999999  # Effectively unlimited
            print("⚠️  WARNING: Page limit disabled. This may take a very long time!")
        def progress_callback(progress: CrawlProgress) -> None:
            # Single-line, carriage-return progress display.
            pct = progress.get_progress_percent()
            print(f"\r[{pct:5.1f}%] {progress.processed_urls}/{progress.total_urls} pages | "
                  f"Success: {progress.successful_urls} | Failed: {progress.failed_urls} | "
                  f"ETA: {progress.get_eta()}", end="", flush=True)
        crawler = SitemapCrawler()
        result = crawler.crawl_sitemap(
            args.sitemap_url,
            delay=args.delay,
            max_pages=max_pages,
            progress_callback=progress_callback,
            save_to_notion=not args.no_notion,
        )
        print()  # New line after progress
        print()
        print("=" * 60)
        print("CRAWL COMPLETE")
        print("=" * 60)
        print(f"Audit ID: {result.audit_id}")
        print(f"Total Pages: {result.total_pages}")
        print(f"Successful: {result.successful_pages}")
        print(f"Failed: {result.failed_pages}")
        print(f"Duration: {result.get_duration()}")
        if result.summary_page_id:
            print(f"Summary Page: https://www.notion.so/{result.summary_page_id.replace('-', '')}")
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,467 @@
"""
Sitemap Validator - Validate XML sitemaps
==========================================
Purpose: Parse and validate XML sitemaps for SEO compliance
Python: 3.10+
Usage:
python sitemap_validator.py --url https://example.com/sitemap.xml
"""
import argparse
import asyncio
import gzip
import json
import logging
import re
from dataclasses import dataclass, field
from datetime import datetime
from io import BytesIO
from typing import Any
from urllib.parse import urljoin, urlparse
import aiohttp
import requests
from lxml import etree
# Module-level logging: INFO by default, timestamped "time - level - message" lines.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
@dataclass
class SitemapIssue:
    """Represents a sitemap validation issue."""
    # Severity level; "error" makes the whole sitemap invalid.
    severity: str  # "error", "warning", "info"
    # Human-readable description of the problem.
    message: str
    # URL the issue applies to, when it concerns a specific entry.
    url: str | None = None
    # Optional remediation hint shown in reports.
    suggestion: str | None = None
@dataclass
class SitemapEntry:
    """Represents a single URL entry in sitemap."""
    # Required <loc> value.
    loc: str
    # Optional sitemap fields, kept as parsed strings / float.
    lastmod: str | None = None
    changefreq: str | None = None
    priority: float | None = None
    # HTTP status from the optional URL check; 0 means the request failed.
    status_code: int | None = None
@dataclass
class SitemapResult:
    """Complete sitemap validation result.

    Collects parsed entries, child sitemaps (for index files), issues found,
    aggregate stats, and an overall validity flag.
    """
    url: str
    sitemap_type: str  # "urlset" or "sitemapindex"
    entries: list[SitemapEntry] = field(default_factory=list)
    child_sitemaps: list[str] = field(default_factory=list)
    issues: list[SitemapIssue] = field(default_factory=list)
    valid: bool = True
    stats: dict = field(default_factory=dict)
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
    def to_dict(self) -> dict:
        """Serialize the result to a JSON-friendly dictionary.

        Issues are flattened to plain dicts; entries are reported only as a
        count (the full entry list is intentionally omitted).
        """
        flattened_issues = [
            {
                "severity": issue.severity,
                "message": issue.message,
                "url": issue.url,
                "suggestion": issue.suggestion,
            }
            for issue in self.issues
        ]
        return {
            "url": self.url,
            "sitemap_type": self.sitemap_type,
            "valid": self.valid,
            "stats": self.stats,
            "issues": flattened_issues,
            "entries_count": len(self.entries),
            "child_sitemaps": self.child_sitemaps,
            "timestamp": self.timestamp,
        }
class SitemapValidator:
    """Validate XML sitemaps.

    Checks protocol limits (URL count, file size), per-entry fields
    (lastmod, changefreq, priority), duplicates, and optionally the HTTP
    status of listed URLs. Produces a SitemapResult plus a text report.
    """
    # Sitemap protocol constants (sitemaps.org).
    SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
    MAX_URLS = 50000
    MAX_SIZE_BYTES = 50 * 1024 * 1024  # 50MB
    VALID_CHANGEFREQ = {
        "always", "hourly", "daily", "weekly",
        "monthly", "yearly", "never"
    }
    def __init__(self, check_urls: bool = False, max_concurrent: int = 10) -> None:
        """
        Args:
            check_urls: If True, HEAD-check each listed URL (slower).
            max_concurrent: Concurrency cap for the URL status checks.
        """
        self.check_urls = check_urls
        self.max_concurrent = max_concurrent
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (compatible; SEOAuditBot/1.0)"
        })
    def fetch_sitemap(self, url: str) -> tuple[bytes, bool]:
        """Fetch sitemap content, handling gzip compression.

        Returns (content, is_gzipped). Raises RuntimeError on any request
        failure.
        """
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            content = response.content
            is_gzipped = False
            # Check if gzipped. NOTE(review): requests already transparently
            # decodes Content-Encoding: gzip, so decompress may fail here --
            # the BadGzipFile fall-through keeps the raw content in that case.
            if url.endswith(".gz") or response.headers.get(
                "Content-Encoding"
            ) == "gzip":
                try:
                    content = gzip.decompress(content)
                    is_gzipped = True
                except gzip.BadGzipFile:
                    pass
            return content, is_gzipped
        except requests.RequestException as e:
            raise RuntimeError(f"Failed to fetch sitemap: {e}")
    def parse_sitemap(self, content: bytes) -> tuple[str, list[dict]]:
        """Parse sitemap XML content.

        Returns (sitemap_type, entries) where sitemap_type is "sitemapindex"
        or "urlset" and each entry dict has at least a "loc" key. Raises
        ValueError for malformed XML or an unrecognized root element.
        """
        try:
            root = etree.fromstring(content)
        except etree.XMLSyntaxError as e:
            raise ValueError(f"Invalid XML: {e}")
        # Namespace map used for the element lookups below
        nsmap = {"sm": self.SITEMAP_NS}
        # Check if it's a sitemap index or urlset
        if root.tag == f"{{{self.SITEMAP_NS}}}sitemapindex":
            sitemap_type = "sitemapindex"
            entries = []
            for sitemap in root.findall("sm:sitemap", nsmap):
                entry = {}
                loc = sitemap.find("sm:loc", nsmap)
                if loc is not None and loc.text:
                    entry["loc"] = loc.text.strip()
                lastmod = sitemap.find("sm:lastmod", nsmap)
                if lastmod is not None and lastmod.text:
                    entry["lastmod"] = lastmod.text.strip()
                # Entries without a <loc> are silently dropped
                if entry.get("loc"):
                    entries.append(entry)
        elif root.tag == f"{{{self.SITEMAP_NS}}}urlset":
            sitemap_type = "urlset"
            entries = []
            for url in root.findall("sm:url", nsmap):
                entry = {}
                loc = url.find("sm:loc", nsmap)
                if loc is not None and loc.text:
                    entry["loc"] = loc.text.strip()
                lastmod = url.find("sm:lastmod", nsmap)
                if lastmod is not None and lastmod.text:
                    entry["lastmod"] = lastmod.text.strip()
                changefreq = url.find("sm:changefreq", nsmap)
                if changefreq is not None and changefreq.text:
                    entry["changefreq"] = changefreq.text.strip().lower()
                priority = url.find("sm:priority", nsmap)
                if priority is not None and priority.text:
                    try:
                        entry["priority"] = float(priority.text.strip())
                    except ValueError:
                        # Non-numeric priority is kept as None (flagged later)
                        entry["priority"] = None
                if entry.get("loc"):
                    entries.append(entry)
        else:
            raise ValueError(f"Unknown sitemap type: {root.tag}")
        return sitemap_type, entries
    def validate(self, url: str) -> SitemapResult:
        """Validate a sitemap URL.

        Fetch + parse the sitemap, apply protocol and per-entry checks, and
        optionally HEAD-check listed URLs. Never raises for fetch/parse
        problems; those become "error" issues and valid=False.
        """
        result = SitemapResult(url=url, sitemap_type="unknown")
        # Fetch sitemap
        try:
            content, is_gzipped = self.fetch_sitemap(url)
        except RuntimeError as e:
            result.issues.append(SitemapIssue(
                severity="error",
                message=str(e),
                url=url,
            ))
            result.valid = False
            return result
        # Check size against the 50MB protocol limit
        if len(content) > self.MAX_SIZE_BYTES:
            result.issues.append(SitemapIssue(
                severity="error",
                message=f"Sitemap exceeds 50MB limit ({len(content) / 1024 / 1024:.2f}MB)",
                url=url,
                suggestion="Split sitemap into smaller files using sitemap index",
            ))
        # Parse XML
        try:
            sitemap_type, entries = self.parse_sitemap(content)
        except ValueError as e:
            result.issues.append(SitemapIssue(
                severity="error",
                message=str(e),
                url=url,
            ))
            result.valid = False
            return result
        result.sitemap_type = sitemap_type
        # Process entries
        if sitemap_type == "sitemapindex":
            # Index files: just record child sitemaps (not validated here)
            result.child_sitemaps = [e["loc"] for e in entries]
            result.stats = {
                "child_sitemaps_count": len(entries),
            }
        else:
            # Validate URL entries
            url_count = len(entries)
            result.stats["url_count"] = url_count
            if url_count > self.MAX_URLS:
                result.issues.append(SitemapIssue(
                    severity="error",
                    message=f"Sitemap exceeds 50,000 URL limit ({url_count} URLs)",
                    url=url,
                    suggestion="Split into multiple sitemaps with sitemap index",
                ))
            if url_count == 0:
                result.issues.append(SitemapIssue(
                    severity="warning",
                    message="Sitemap is empty (no URLs)",
                    url=url,
                ))
            # Validate individual entries; field problems are tallied and
            # reported once in aggregate rather than per URL.
            seen_urls = set()
            invalid_lastmod = 0
            invalid_changefreq = 0
            invalid_priority = 0
            for entry in entries:
                loc = entry.get("loc", "")
                # Check for duplicates (one warning per repeat occurrence)
                if loc in seen_urls:
                    result.issues.append(SitemapIssue(
                        severity="warning",
                        message="Duplicate URL in sitemap",
                        url=loc,
                    ))
                seen_urls.add(loc)
                # Validate lastmod format
                lastmod = entry.get("lastmod")
                if lastmod:
                    if not self._validate_date(lastmod):
                        invalid_lastmod += 1
                # Validate changefreq
                changefreq = entry.get("changefreq")
                if changefreq and changefreq not in self.VALID_CHANGEFREQ:
                    invalid_changefreq += 1
                # Validate priority
                priority = entry.get("priority")
                if priority is not None:
                    if not (0.0 <= priority <= 1.0):
                        invalid_priority += 1
                # Create entry object
                result.entries.append(SitemapEntry(
                    loc=loc,
                    lastmod=lastmod,
                    changefreq=changefreq,
                    priority=priority,
                ))
            # Add summary issues
            if invalid_lastmod > 0:
                result.issues.append(SitemapIssue(
                    severity="warning",
                    message=f"{invalid_lastmod} URLs with invalid lastmod format",
                    suggestion="Use ISO 8601 format (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS+TZ)",
                ))
            if invalid_changefreq > 0:
                result.issues.append(SitemapIssue(
                    severity="info",
                    message=f"{invalid_changefreq} URLs with invalid changefreq",
                    suggestion="Use: always, hourly, daily, weekly, monthly, yearly, never",
                ))
            if invalid_priority > 0:
                result.issues.append(SitemapIssue(
                    severity="warning",
                    message=f"{invalid_priority} URLs with invalid priority (must be 0.0-1.0)",
                ))
            result.stats.update({
                "invalid_lastmod": invalid_lastmod,
                "invalid_changefreq": invalid_changefreq,
                "invalid_priority": invalid_priority,
                "has_lastmod": sum(1 for e in result.entries if e.lastmod),
                "has_changefreq": sum(1 for e in result.entries if e.changefreq),
                "has_priority": sum(1 for e in result.entries if e.priority is not None),
            })
        # Check URLs if requested (blocking; runs its own event loop)
        if self.check_urls and result.entries:
            asyncio.run(self._check_url_status(result))
        # Determine validity: any "error" issue invalidates the sitemap
        result.valid = not any(i.severity == "error" for i in result.issues)
        return result
    def _validate_date(self, date_str: str) -> bool:
        """Validate ISO 8601 date format (date-only or date-time prefix)."""
        patterns = [
            r"^\d{4}-\d{2}-\d{2}$",
            r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}",
        ]
        return any(re.match(p, date_str) for p in patterns)
    async def _check_url_status(self, result: SitemapResult) -> None:
        """Check HTTP status of URLs in sitemap.

        HEAD-requests at most the first 100 entries, bounded by the
        max_concurrent semaphore; a failed request records status_code=0.
        Populates result.stats["url_status_codes"] and adds a warning when
        any checked URL returns 4xx/5xx.
        """
        semaphore = asyncio.Semaphore(self.max_concurrent)
        async def check_url(entry: SitemapEntry) -> None:
            async with semaphore:
                try:
                    # NOTE(review): a new ClientSession per URL is wasteful;
                    # sharing one session across checks would be cheaper.
                    async with aiohttp.ClientSession() as session:
                        async with session.head(
                            entry.loc,
                            timeout=aiohttp.ClientTimeout(total=10),
                            allow_redirects=True,
                        ) as response:
                            entry.status_code = response.status
                except Exception:
                    entry.status_code = 0
        await asyncio.gather(*[check_url(e) for e in result.entries[:100]])
        # Count status codes
        status_counts = {}
        for entry in result.entries:
            if entry.status_code:
                status_counts[entry.status_code] = (
                    status_counts.get(entry.status_code, 0) + 1
                )
        result.stats["url_status_codes"] = status_counts
        # Add issues for non-200 URLs
        error_count = sum(
            1 for e in result.entries
            if e.status_code and e.status_code >= 400
        )
        if error_count > 0:
            result.issues.append(SitemapIssue(
                severity="warning",
                message=f"{error_count} URLs returning error status codes (4xx/5xx)",
                suggestion="Remove or fix broken URLs in sitemap",
            ))
    def generate_report(self, result: SitemapResult) -> str:
        """Generate human-readable validation report.

        Sections: header, stats, child sitemaps (first 10), then issues
        grouped by severity (errors, warnings, info).
        """
        lines = [
            "=" * 60,
            "Sitemap Validation Report",
            "=" * 60,
            f"URL: {result.url}",
            f"Type: {result.sitemap_type}",
            f"Valid: {'Yes' if result.valid else 'No'}",
            f"Timestamp: {result.timestamp}",
            "",
        ]
        lines.append("Statistics:")
        for key, value in result.stats.items():
            lines.append(f"  {key}: {value}")
        lines.append("")
        if result.child_sitemaps:
            lines.append(f"Child Sitemaps ({len(result.child_sitemaps)}):")
            for sitemap in result.child_sitemaps[:10]:
                lines.append(f"  - {sitemap}")
            if len(result.child_sitemaps) > 10:
                lines.append(f"  ... and {len(result.child_sitemaps) - 10} more")
            lines.append("")
        if result.issues:
            lines.append("Issues Found:")
            errors = [i for i in result.issues if i.severity == "error"]
            warnings = [i for i in result.issues if i.severity == "warning"]
            infos = [i for i in result.issues if i.severity == "info"]
            if errors:
                lines.append(f"\n  ERRORS ({len(errors)}):")
                for issue in errors:
                    lines.append(f"  - {issue.message}")
                    if issue.url:
                        lines.append(f"    URL: {issue.url}")
                    if issue.suggestion:
                        lines.append(f"    Suggestion: {issue.suggestion}")
            if warnings:
                lines.append(f"\n  WARNINGS ({len(warnings)}):")
                for issue in warnings:
                    lines.append(f"  - {issue.message}")
                    if issue.suggestion:
                        lines.append(f"    Suggestion: {issue.suggestion}")
            if infos:
                lines.append(f"\n  INFO ({len(infos)}):")
                for issue in infos:
                    lines.append(f"  - {issue.message}")
            lines.append("")
        lines.append("=" * 60)
        return "\n".join(lines)
def main():
    """CLI entry point: validate a sitemap, then print or save the report.

    Exits via argparse on missing/invalid arguments. Output is either a
    human-readable text report (default) or JSON (--json / --output).
    """
    parser = argparse.ArgumentParser(description="Validate XML sitemaps")
    parser.add_argument("--url", "-u", required=True, help="Sitemap URL to validate")
    parser.add_argument(
        "--check-urls",
        action="store_true",
        help="Check HTTP status of URLs (slower)",
    )
    parser.add_argument("--output", "-o", help="Output file for JSON report")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()

    validator = SitemapValidator(check_urls=args.check_urls)
    result = validator.validate(args.url)

    # Default path: human-readable report to stdout.
    if not (args.json or args.output):
        print(validator.generate_report(result))
        return

    payload = json.dumps(result.to_dict(), ensure_ascii=False, indent=2)
    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(payload)
        logger.info(f"Report written to {args.output}")
    else:
        print(payload)
# Script entry point: run the CLI only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()