refactor: Reorganize skill numbering and update documentation
Skill Numbering Changes: - 01-03: OurDigital core (was 30-32) - 31-32: Notion tools (was 01-02) - 99_archive: Renamed from _archive for sorting New Files: - AGENTS.md: Claude Code agent routing guide - requirements.txt for 00-claude-code-setting, 32-notion-writer, 43-jamie-youtube-manager Documentation Updates: - CLAUDE.md: Updated skill inventory (23 skills) - AUDIT_REPORT.md: Current completion status (91%) - Archived REFACTORING_PLAN.md (most tasks complete) Removed: - ga-agent-skills/ (moved to separate repo ~/Project/dintel-ga4-agent) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
207
custom-skills/99_archive/seo-audit-agent/scripts/base_client.py
Normal file
207
custom-skills/99_archive/seo-audit-agent/scripts/base_client.py
Normal file
@@ -0,0 +1,207 @@
|
||||
"""
|
||||
Base Client - Shared async client utilities
|
||||
===========================================
|
||||
Purpose: Rate-limited async operations for API clients
|
||||
Python: 3.10+
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
from asyncio import Semaphore
|
||||
from datetime import datetime
|
||||
from typing import Any, Callable, TypeVar
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from tenacity import (
|
||||
retry,
|
||||
stop_after_attempt,
|
||||
wait_exponential,
|
||||
retry_if_exception_type,
|
||||
)
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv()
|
||||
|
||||
# Logging setup
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
class RateLimiter:
    """Token-bucket rate limiter for pacing async API calls.

    Permits ``rate`` acquisitions per ``per`` seconds; ``acquire`` blocks
    just long enough for a token to accrue when the bucket is empty.
    """

    def __init__(self, rate: float, per: float = 1.0):
        """
        Initialize rate limiter.

        Args:
            rate: Number of requests allowed
            per: Time period in seconds (default: 1 second)
        """
        self.rate = rate
        self.per = per
        self.tokens = rate  # bucket starts full
        self.last_update = datetime.now()
        self._lock = asyncio.Lock()

    async def acquire(self) -> None:
        """Acquire a token, waiting if necessary."""
        async with self._lock:
            current = datetime.now()
            seconds_passed = (current - self.last_update).total_seconds()
            # Refill proportionally to elapsed time, capped at bucket size.
            refill = seconds_passed * (self.rate / self.per)
            self.tokens = min(self.rate, self.tokens + refill)
            self.last_update = current

            if self.tokens >= 1:
                self.tokens -= 1
                return

            # Bucket empty: sleep exactly long enough for one token to
            # accrue, then consume it (leaving the bucket at zero).
            shortfall = 1 - self.tokens
            await asyncio.sleep(shortfall * (self.per / self.rate))
            self.tokens = 0
|
||||
|
||||
|
||||
class BaseAsyncClient:
    """Base class for async API clients with rate limiting.

    Combines a concurrency cap (semaphore), a token-bucket rate limiter,
    and tenacity-driven retries, and keeps simple request counters.
    """

    def __init__(
        self,
        max_concurrent: int = 5,
        requests_per_second: float = 3.0,
        logger: logging.Logger | None = None,
    ):
        """
        Initialize base client.

        Args:
            max_concurrent: Maximum concurrent requests
            requests_per_second: Rate limit
            logger: Logger instance
        """
        self.semaphore = Semaphore(max_concurrent)
        self.rate_limiter = RateLimiter(requests_per_second)
        self.logger = logger or logging.getLogger(self.__class__.__name__)
        # NOTE(review): "retries" is never incremented anywhere in this
        # class; also, because tenacity re-runs _rate_limited_request on
        # retry, "requests" and "errors" count attempts, not unique calls.
        self.stats = {
            "requests": 0,
            "success": 0,
            "errors": 0,
            "retries": 0,
        }

    # Up to 3 attempts with exponential backoff between 2s and 10s.
    # retry_if_exception_type(Exception) means *any* exception is retried.
    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        retry=retry_if_exception_type(Exception),
    )
    async def _rate_limited_request(
        self,
        coro: Callable[[], Any],
    ) -> Any:
        """Execute a request with rate limiting and retry.

        Args:
            coro: Zero-argument callable returning an awaitable.

        Returns:
            Whatever the awaited callable returns.

        Raises:
            Exception: Re-raised (after logging) when the request fails;
                tenacity then retries until attempts are exhausted.
        """
        async with self.semaphore:
            await self.rate_limiter.acquire()
            self.stats["requests"] += 1
            try:
                result = await coro()
                self.stats["success"] += 1
                return result
            except Exception as e:
                self.stats["errors"] += 1
                self.logger.error(f"Request failed: {e}")
                raise

    async def batch_requests(
        self,
        requests: list[Callable[[], Any]],
        desc: str = "Processing",
    ) -> list[Any]:
        """Execute multiple requests concurrently.

        A request that still fails after retries is reported as an
        {"error": str} entry instead of aborting the whole batch.
        """
        # tqdm is optional; fall back to plain asyncio.gather without it.
        try:
            from tqdm.asyncio import tqdm
            has_tqdm = True
        except ImportError:
            has_tqdm = False

        async def execute(req: Callable) -> Any:
            try:
                return await self._rate_limited_request(req)
            except Exception as e:
                return {"error": str(e)}

        tasks = [execute(req) for req in requests]

        if has_tqdm:
            # NOTE: as_completed yields in completion order, so results may
            # not line up with the order of `requests`.
            results = []
            for coro in tqdm.as_completed(tasks, total=len(tasks), desc=desc):
                result = await coro
                results.append(result)
            return results
        else:
            # gather preserves input order; return_exceptions is redundant
            # here since execute() already swallows exceptions.
            return await asyncio.gather(*tasks, return_exceptions=True)

    def print_stats(self) -> None:
        """Print request statistics."""
        self.logger.info("=" * 40)
        self.logger.info("Request Statistics:")
        self.logger.info(f"  Total Requests: {self.stats['requests']}")
        self.logger.info(f"  Successful: {self.stats['success']}")
        self.logger.info(f"  Errors: {self.stats['errors']}")
        self.logger.info("=" * 40)
|
||||
|
||||
|
||||
class ConfigManager:
|
||||
"""Manage API configuration and credentials."""
|
||||
|
||||
def __init__(self):
|
||||
load_dotenv()
|
||||
|
||||
@property
|
||||
def google_credentials_path(self) -> str | None:
|
||||
"""Get Google service account credentials path."""
|
||||
# Prefer SEO-specific credentials, fallback to general credentials
|
||||
seo_creds = os.path.expanduser("~/.credential/ourdigital-seo-agent.json")
|
||||
if os.path.exists(seo_creds):
|
||||
return seo_creds
|
||||
return os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
|
||||
|
||||
@property
|
||||
def pagespeed_api_key(self) -> str | None:
|
||||
"""Get PageSpeed Insights API key."""
|
||||
return os.getenv("PAGESPEED_API_KEY")
|
||||
|
||||
@property
|
||||
def custom_search_api_key(self) -> str | None:
|
||||
"""Get Custom Search API key."""
|
||||
return os.getenv("CUSTOM_SEARCH_API_KEY")
|
||||
|
||||
@property
|
||||
def custom_search_engine_id(self) -> str | None:
|
||||
"""Get Custom Search Engine ID."""
|
||||
return os.getenv("CUSTOM_SEARCH_ENGINE_ID")
|
||||
|
||||
@property
|
||||
def notion_token(self) -> str | None:
|
||||
"""Get Notion API token."""
|
||||
return os.getenv("NOTION_TOKEN") or os.getenv("NOTION_API_KEY")
|
||||
|
||||
def validate_google_credentials(self) -> bool:
|
||||
"""Validate Google credentials are configured."""
|
||||
creds_path = self.google_credentials_path
|
||||
if not creds_path:
|
||||
return False
|
||||
return os.path.exists(creds_path)
|
||||
|
||||
def get_required(self, key: str) -> str:
|
||||
"""Get required environment variable or raise error."""
|
||||
value = os.getenv(key)
|
||||
if not value:
|
||||
raise ValueError(f"Missing required environment variable: {key}")
|
||||
return value
|
||||
|
||||
|
||||
# Singleton config instance
# Module-level shared instance: importers do `from base_client import config`
# rather than constructing their own ConfigManager.
config = ConfigManager()
|
||||
497
custom-skills/99_archive/seo-audit-agent/scripts/full_audit.py
Normal file
497
custom-skills/99_archive/seo-audit-agent/scripts/full_audit.py
Normal file
@@ -0,0 +1,497 @@
|
||||
"""
|
||||
Full SEO Audit - Orchestration Script
|
||||
=====================================
|
||||
Purpose: Run comprehensive SEO audit combining all tools
|
||||
Python: 3.10+
|
||||
Usage:
|
||||
python full_audit.py --url https://example.com --output notion --notion-page-id abc123
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from robots_checker import RobotsChecker
|
||||
from schema_validator import SchemaValidator
|
||||
from sitemap_validator import SitemapValidator
|
||||
from pagespeed_client import PageSpeedClient
|
||||
from notion_reporter import NotionReporter, SEOFinding
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class AuditResult:
    """Complete SEO audit result.

    One instance aggregates the output of every enabled check plus the
    derived findings and summary.
    """

    url: str
    # ISO-8601 creation time of this result object
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
    robots: dict = field(default_factory=dict)       # robots.txt analysis section
    sitemap: dict = field(default_factory=dict)      # sitemap validation section
    schema: dict = field(default_factory=dict)       # structured-data section
    performance: dict = field(default_factory=dict)  # PageSpeed section
    findings: list[SEOFinding] = field(default_factory=list)
    summary: dict = field(default_factory=dict)      # filled by _generate_summary

    def to_dict(self) -> dict:
        """Serialize for JSON output; findings are collapsed to a count."""
        return {
            "url": self.url,
            "timestamp": self.timestamp,
            "robots": self.robots,
            "sitemap": self.sitemap,
            "schema": self.schema,
            "performance": self.performance,
            "summary": self.summary,
            "findings_count": len(self.findings),
        }
|
||||
|
||||
|
||||
class SEOAuditor:
|
||||
"""Orchestrate comprehensive SEO audit."""
|
||||
|
||||
    def __init__(self):
        """Instantiate one checker/client per audit area."""
        self.robots_checker = RobotsChecker()
        self.sitemap_validator = SitemapValidator()
        self.schema_validator = SchemaValidator()
        self.pagespeed_client = PageSpeedClient()
|
||||
|
||||
    def run_audit(
        self,
        url: str,
        include_robots: bool = True,
        include_sitemap: bool = True,
        include_schema: bool = True,
        include_performance: bool = True,
    ) -> AuditResult:
        """
        Run comprehensive SEO audit.

        Args:
            url: URL to audit
            include_robots: Check robots.txt
            include_sitemap: Validate sitemap
            include_schema: Validate schema markup
            include_performance: Run PageSpeed analysis

        Returns:
            AuditResult; a failed check stores {"error": ...} in its own
            section instead of aborting the rest of the audit.
        """
        result = AuditResult(url=url)
        parsed_url = urlparse(url)
        # Scheme + host only: robots.txt and the default sitemap are site-wide.
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

        logger.info(f"Starting SEO audit for {url}")

        # 1. Robots.txt analysis
        if include_robots:
            logger.info("Analyzing robots.txt...")
            try:
                robots_result = self.robots_checker.analyze(base_url)
                result.robots = robots_result.to_dict()
                self._process_robots_findings(robots_result, result)
            except Exception as e:
                logger.error(f"Robots.txt analysis failed: {e}")
                result.robots = {"error": str(e)}

        # 2. Sitemap validation
        if include_sitemap:
            logger.info("Validating sitemap...")
            sitemap_url = f"{base_url}/sitemap.xml"
            # Try to get sitemap URL from robots.txt
            # (only the first declared sitemap is validated)
            if result.robots.get("sitemaps"):
                sitemap_url = result.robots["sitemaps"][0]
            try:
                sitemap_result = self.sitemap_validator.validate(sitemap_url)
                result.sitemap = sitemap_result.to_dict()
                self._process_sitemap_findings(sitemap_result, result)
            except Exception as e:
                logger.error(f"Sitemap validation failed: {e}")
                result.sitemap = {"error": str(e)}

        # 3. Schema validation
        if include_schema:
            logger.info("Validating schema markup...")
            try:
                schema_result = self.schema_validator.validate(url=url)
                result.schema = schema_result.to_dict()
                self._process_schema_findings(schema_result, result)
            except Exception as e:
                logger.error(f"Schema validation failed: {e}")
                result.schema = {"error": str(e)}

        # 4. PageSpeed analysis
        if include_performance:
            logger.info("Running PageSpeed analysis...")
            try:
                perf_result = self.pagespeed_client.analyze(url, strategy="mobile")
                result.performance = perf_result.to_dict()
                self._process_performance_findings(perf_result, result)
            except Exception as e:
                logger.error(f"PageSpeed analysis failed: {e}")
                result.performance = {"error": str(e)}

        # Generate summary
        result.summary = self._generate_summary(result)

        logger.info(f"Audit complete. Found {len(result.findings)} issues.")
        return result
|
||||
|
||||
def _process_robots_findings(self, robots_result, audit_result: AuditResult):
|
||||
"""Convert robots.txt issues to findings."""
|
||||
for issue in robots_result.issues:
|
||||
priority = "Medium"
|
||||
if issue.severity == "error":
|
||||
priority = "Critical"
|
||||
elif issue.severity == "warning":
|
||||
priority = "High"
|
||||
|
||||
audit_result.findings.append(SEOFinding(
|
||||
issue=issue.message,
|
||||
category="Robots.txt",
|
||||
priority=priority,
|
||||
description=issue.directive or "",
|
||||
recommendation=issue.suggestion or "",
|
||||
))
|
||||
|
||||
def _process_sitemap_findings(self, sitemap_result, audit_result: AuditResult):
|
||||
"""Convert sitemap issues to findings."""
|
||||
for issue in sitemap_result.issues:
|
||||
priority = "Medium"
|
||||
if issue.severity == "error":
|
||||
priority = "High"
|
||||
elif issue.severity == "warning":
|
||||
priority = "Medium"
|
||||
|
||||
audit_result.findings.append(SEOFinding(
|
||||
issue=issue.message,
|
||||
category="Sitemap",
|
||||
priority=priority,
|
||||
url=issue.url,
|
||||
recommendation=issue.suggestion or "",
|
||||
))
|
||||
|
||||
def _process_schema_findings(self, schema_result, audit_result: AuditResult):
|
||||
"""Convert schema issues to findings."""
|
||||
for issue in schema_result.issues:
|
||||
priority = "Low"
|
||||
if issue.severity == "error":
|
||||
priority = "High"
|
||||
elif issue.severity == "warning":
|
||||
priority = "Medium"
|
||||
|
||||
audit_result.findings.append(SEOFinding(
|
||||
issue=issue.message,
|
||||
category="Schema/Structured Data",
|
||||
priority=priority,
|
||||
description=f"Schema type: {issue.schema_type}" if issue.schema_type else "",
|
||||
recommendation=issue.suggestion or "",
|
||||
))
|
||||
|
||||
    def _process_performance_findings(self, perf_result, audit_result: AuditResult):
        """Convert performance issues to findings.

        Reads Core Web Vitals ratings, the overall performance score, and
        the top improvement opportunities from a PageSpeed result.
        """
        cwv = perf_result.core_web_vitals

        # Check Core Web Vitals
        # Ratings look like PageSpeed categories ("POOR",
        # "NEEDS_IMPROVEMENT") and cwv.lcp appears to be in milliseconds
        # (divided by 1000 for display) — TODO confirm in pagespeed_client.
        if cwv.lcp_rating == "POOR":
            audit_result.findings.append(SEOFinding(
                issue=f"Poor LCP: {cwv.lcp / 1000:.2f}s (should be < 2.5s)",
                category="Performance",
                priority="Critical",
                impact="Users experience slow page loads, affecting bounce rate and rankings",
                recommendation="Optimize images, reduce server response time, use CDN",
            ))
        elif cwv.lcp_rating == "NEEDS_IMPROVEMENT":
            audit_result.findings.append(SEOFinding(
                issue=f"LCP needs improvement: {cwv.lcp / 1000:.2f}s (target < 2.5s)",
                category="Performance",
                priority="High",
                recommendation="Optimize largest content element loading",
            ))

        if cwv.cls_rating == "POOR":
            audit_result.findings.append(SEOFinding(
                issue=f"Poor CLS: {cwv.cls:.3f} (should be < 0.1)",
                category="Performance",
                priority="High",
                impact="Layout shifts frustrate users",
                recommendation="Set dimensions for images/embeds, avoid inserting content above existing content",
            ))

        if cwv.fid_rating == "POOR":
            audit_result.findings.append(SEOFinding(
                issue=f"Poor FID/TBT: {cwv.fid:.0f}ms (should be < 100ms)",
                category="Performance",
                priority="High",
                impact="Slow interactivity affects user experience",
                recommendation="Reduce JavaScript execution time, break up long tasks",
            ))

        # Check performance score
        # (falsy check also skips a score of exactly 0)
        if perf_result.performance_score and perf_result.performance_score < 50:
            audit_result.findings.append(SEOFinding(
                issue=f"Low performance score: {perf_result.performance_score:.0f}/100",
                category="Performance",
                priority="High",
                impact="Poor performance affects user experience and SEO",
                recommendation="Address top opportunities from PageSpeed Insights",
            ))

        # Add top opportunities as findings
        for opp in perf_result.opportunities[:3]:
            if opp["savings_ms"] > 500:  # Only significant savings
                audit_result.findings.append(SEOFinding(
                    issue=opp["title"],
                    category="Performance",
                    priority="Medium",
                    description=opp.get("description", ""),
                    impact=f"Potential savings: {opp['savings_ms'] / 1000:.1f}s",
                    recommendation="See PageSpeed Insights for details",
                ))
|
||||
|
||||
def _generate_summary(self, result: AuditResult) -> dict:
|
||||
"""Generate audit summary."""
|
||||
findings_by_priority = {}
|
||||
findings_by_category = {}
|
||||
|
||||
for finding in result.findings:
|
||||
# Count by priority
|
||||
findings_by_priority[finding.priority] = (
|
||||
findings_by_priority.get(finding.priority, 0) + 1
|
||||
)
|
||||
# Count by category
|
||||
findings_by_category[finding.category] = (
|
||||
findings_by_category.get(finding.category, 0) + 1
|
||||
)
|
||||
|
||||
return {
|
||||
"total_findings": len(result.findings),
|
||||
"findings_by_priority": findings_by_priority,
|
||||
"findings_by_category": findings_by_category,
|
||||
"robots_accessible": result.robots.get("accessible", False),
|
||||
"sitemap_valid": result.sitemap.get("valid", False),
|
||||
"schema_valid": result.schema.get("valid", False),
|
||||
"performance_score": result.performance.get("scores", {}).get("performance"),
|
||||
"quick_wins": [
|
||||
f.issue for f in result.findings
|
||||
if f.priority in ("Medium", "Low")
|
||||
][:5],
|
||||
"critical_issues": [
|
||||
f.issue for f in result.findings
|
||||
if f.priority == "Critical"
|
||||
],
|
||||
}
|
||||
|
||||
def export_to_notion(
|
||||
self,
|
||||
result: AuditResult,
|
||||
parent_page_id: str | None = None,
|
||||
use_default_db: bool = True,
|
||||
) -> dict:
|
||||
"""
|
||||
Export audit results to Notion.
|
||||
|
||||
Args:
|
||||
result: AuditResult object
|
||||
parent_page_id: Parent page ID (for creating new database)
|
||||
use_default_db: If True, use OurDigital SEO Audit Log database
|
||||
|
||||
Returns:
|
||||
Dict with database_id, summary_page_id, findings_created
|
||||
"""
|
||||
reporter = NotionReporter()
|
||||
audit_id = f"{urlparse(result.url).netloc}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
|
||||
|
||||
# Add site and audit_id to all findings
|
||||
for finding in result.findings:
|
||||
finding.site = result.url
|
||||
finding.audit_id = audit_id
|
||||
|
||||
if use_default_db:
|
||||
# Use the default OurDigital SEO Audit Log database
|
||||
page_ids = reporter.add_findings_batch(result.findings)
|
||||
return {
|
||||
"database_id": reporter.DEFAULT_DATABASE_ID if hasattr(reporter, 'DEFAULT_DATABASE_ID') else "2c8581e5-8a1e-8035-880b-e38cefc2f3ef",
|
||||
"audit_id": audit_id,
|
||||
"findings_created": len(page_ids),
|
||||
}
|
||||
else:
|
||||
# Create new database under parent page
|
||||
if not parent_page_id:
|
||||
raise ValueError("parent_page_id required when not using default database")
|
||||
|
||||
db_title = f"SEO Audit - {urlparse(result.url).netloc} - {datetime.now().strftime('%Y-%m-%d')}"
|
||||
database_id = reporter.create_findings_database(parent_page_id, db_title)
|
||||
page_ids = reporter.add_findings_batch(result.findings, database_id)
|
||||
|
||||
# Create summary page
|
||||
summary_page_id = reporter.create_audit_summary_page(
|
||||
parent_page_id,
|
||||
result.url,
|
||||
result.summary,
|
||||
)
|
||||
|
||||
return {
|
||||
"database_id": database_id,
|
||||
"summary_page_id": summary_page_id,
|
||||
"audit_id": audit_id,
|
||||
"findings_created": len(page_ids),
|
||||
}
|
||||
|
||||
    def generate_report(self, result: AuditResult) -> str:
        """Generate human-readable report.

        Returns:
            Multi-line plain-text report: summary, status overview,
            critical issues, quick wins, then all findings by category.
        """
        lines = [
            "=" * 70,
            "SEO AUDIT REPORT",
            "=" * 70,
            f"URL: {result.url}",
            f"Date: {result.timestamp}",
            "",
            "-" * 70,
            "SUMMARY",
            "-" * 70,
            f"Total Issues Found: {result.summary.get('total_findings', 0)}",
            "",
        ]

        # Priority breakdown
        lines.append("Issues by Priority:")
        for priority in ["Critical", "High", "Medium", "Low"]:
            count = result.summary.get("findings_by_priority", {}).get(priority, 0)
            if count:
                lines.append(f"  {priority}: {count}")

        lines.append("")

        # Category breakdown
        lines.append("Issues by Category:")
        for category, count in result.summary.get("findings_by_category", {}).items():
            lines.append(f"  {category}: {count}")

        lines.append("")
        lines.append("-" * 70)
        lines.append("STATUS OVERVIEW")
        lines.append("-" * 70)

        # Status checks
        lines.append(f"Robots.txt: {'✓ Accessible' if result.robots.get('accessible') else '✗ Not accessible'}")
        lines.append(f"Sitemap: {'✓ Valid' if result.sitemap.get('valid') else '✗ Issues found'}")
        lines.append(f"Schema: {'✓ Valid' if result.schema.get('valid') else '✗ Issues found'}")

        perf_score = result.performance.get("scores", {}).get("performance")
        # NOTE(review): a true 0 score is falsy and would be skipped here.
        if perf_score:
            status = "✓ Good" if perf_score >= 90 else "⚠ Needs work" if perf_score >= 50 else "✗ Poor"
            lines.append(f"Performance: {status} ({perf_score:.0f}/100)")

        # Critical issues
        critical = result.summary.get("critical_issues", [])
        if critical:
            lines.extend([
                "",
                "-" * 70,
                "CRITICAL ISSUES (Fix Immediately)",
                "-" * 70,
            ])
            for issue in critical:
                lines.append(f"  • {issue}")

        # Quick wins
        quick_wins = result.summary.get("quick_wins", [])
        if quick_wins:
            lines.extend([
                "",
                "-" * 70,
                "QUICK WINS",
                "-" * 70,
            ])
            for issue in quick_wins[:5]:
                lines.append(f"  • {issue}")

        # All findings
        if result.findings:
            lines.extend([
                "",
                "-" * 70,
                "ALL FINDINGS",
                "-" * 70,
            ])

            current_category = None
            # Group findings by category; within a category, priorities sort
            # alphabetically (Critical, High, Low, Medium), not by severity.
            for finding in sorted(result.findings, key=lambda x: (x.category, x.priority)):
                if finding.category != current_category:
                    current_category = finding.category
                    lines.append(f"\n[{current_category}]")

                lines.append(f"  [{finding.priority}] {finding.issue}")
                if finding.recommendation:
                    lines.append(f"    → {finding.recommendation}")

        lines.extend(["", "=" * 70])

        return "\n".join(lines)
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, run the audit, emit results."""
    parser = argparse.ArgumentParser(
        description="Run comprehensive SEO audit",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Run full audit and output to console
    python full_audit.py --url https://example.com

    # Export to Notion
    python full_audit.py --url https://example.com --output notion --notion-page-id abc123

    # Output as JSON
    python full_audit.py --url https://example.com --json
""",
    )

    parser.add_argument("--url", "-u", required=True, help="URL to audit")
    parser.add_argument("--output", "-o", choices=["console", "notion", "json"],
                        default="console", help="Output format")
    parser.add_argument("--notion-page-id", help="Notion parent page ID (required for notion output)")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument("--no-robots", action="store_true", help="Skip robots.txt check")
    parser.add_argument("--no-sitemap", action="store_true", help="Skip sitemap validation")
    parser.add_argument("--no-schema", action="store_true", help="Skip schema validation")
    parser.add_argument("--no-performance", action="store_true", help="Skip PageSpeed analysis")

    args = parser.parse_args()

    auditor = SEOAuditor()

    # Run audit (each --no-* flag disables the corresponding check)
    result = auditor.run_audit(
        args.url,
        include_robots=not args.no_robots,
        include_sitemap=not args.no_sitemap,
        include_schema=not args.no_schema,
        include_performance=not args.no_performance,
    )

    # Output results
    if args.json or args.output == "json":
        # default=str keeps non-JSON-native values serializable
        print(json.dumps(result.to_dict(), indent=2, default=str))

    elif args.output == "notion":
        if not args.notion_page_id:
            parser.error("--notion-page-id required for notion output")
        notion_result = auditor.export_to_notion(result, args.notion_page_id)
        print("Exported to Notion:")  # plain string; was a no-placeholder f-string
        print(f"  Database ID: {notion_result['database_id']}")
        # BUG FIX: the default-database export path returns no summary page,
        # so reading the key unconditionally raised KeyError here.
        print(f"  Summary Page: {notion_result.get('summary_page_id', 'N/A')}")
        print(f"  Findings Created: {notion_result['findings_created']}")

    else:
        print(auditor.generate_report(result))


if __name__ == "__main__":
    main()
|
||||
409
custom-skills/99_archive/seo-audit-agent/scripts/gsc_client.py
Normal file
409
custom-skills/99_archive/seo-audit-agent/scripts/gsc_client.py
Normal file
@@ -0,0 +1,409 @@
|
||||
"""
|
||||
Google Search Console Client
|
||||
============================
|
||||
Purpose: Interact with Google Search Console API for SEO data
|
||||
Python: 3.10+
|
||||
Usage:
|
||||
from gsc_client import SearchConsoleClient
|
||||
client = SearchConsoleClient()
|
||||
data = client.get_search_analytics("sc-domain:example.com")
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any
|
||||
|
||||
from google.oauth2 import service_account
|
||||
from googleapiclient.discovery import build
|
||||
|
||||
from base_client import config
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class SearchAnalyticsResult:
    """Search analytics query result.

    Totals are summed over the returned rows; averages are simple per-row
    means computed in get_search_analytics.
    """

    rows: list[dict] = field(default_factory=list)  # raw API response rows
    total_clicks: int = 0
    total_impressions: int = 0
    average_ctr: float = 0.0       # unweighted mean of per-row CTRs
    average_position: float = 0.0  # unweighted mean of per-row positions
|
||||
|
||||
|
||||
@dataclass
class SitemapInfo:
    """Sitemap information from Search Console.

    Mirrors the fields of the Search Console sitemaps.list resource.
    """

    path: str                            # sitemap URL as registered in GSC
    last_submitted: str | None = None    # timestamp string from the API
    last_downloaded: str | None = None   # timestamp string from the API
    is_pending: bool = False             # not yet processed by Google
    is_sitemaps_index: bool = False      # True if this is a sitemap index
    warnings: int = 0
    errors: int = 0
|
||||
|
||||
|
||||
class SearchConsoleClient:
|
||||
"""Client for Google Search Console API."""
|
||||
|
||||
SCOPES = ["https://www.googleapis.com/auth/webmasters.readonly"]
|
||||
|
||||
    def __init__(self, credentials_path: str | None = None):
        """
        Initialize Search Console client.

        Args:
            credentials_path: Path to service account JSON key
                (defaults to the shared ConfigManager lookup)
        """
        self.credentials_path = credentials_path or config.google_credentials_path
        # Built lazily by the `service` property on first use.
        self._service = None
|
||||
|
||||
    @property
    def service(self):
        """Get or create Search Console service (lazy, cached per instance).

        Raises:
            ValueError: If no credentials path is configured.
        """
        if self._service is None:
            if not self.credentials_path:
                raise ValueError(
                    "Google credentials not configured. "
                    "Set GOOGLE_APPLICATION_CREDENTIALS environment variable."
                )

            credentials = service_account.Credentials.from_service_account_file(
                self.credentials_path,
                scopes=self.SCOPES,
            )
            self._service = build("searchconsole", "v1", credentials=credentials)

        return self._service
|
||||
|
||||
def list_sites(self) -> list[dict]:
|
||||
"""List all sites accessible to the service account."""
|
||||
response = self.service.sites().list().execute()
|
||||
return response.get("siteEntry", [])
|
||||
|
||||
    def get_search_analytics(
        self,
        site_url: str,
        start_date: str | None = None,
        end_date: str | None = None,
        dimensions: list[str] | None = None,
        row_limit: int = 25000,
        filters: list[dict] | None = None,
    ) -> SearchAnalyticsResult:
        """
        Get search analytics data.

        Args:
            site_url: Site URL (e.g., "sc-domain:example.com" or "https://example.com/")
            start_date: Start date (YYYY-MM-DD), defaults to 30 days ago
            end_date: End date (YYYY-MM-DD), defaults to yesterday
            dimensions: List of dimensions (query, page, country, device, date)
            row_limit: Maximum rows to return
            filters: Dimension filters

        Returns:
            SearchAnalyticsResult with rows and summary stats

        Note:
            average_ctr / average_position are simple per-row means, not
            impression-weighted averages, so they can differ from the
            Search Console UI totals.
        """
        # Default date range: last 30 days
        if not end_date:
            end_date = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
        if not start_date:
            start_date = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d")

        # Default dimensions
        if dimensions is None:
            dimensions = ["query", "page"]

        request_body = {
            "startDate": start_date,
            "endDate": end_date,
            "dimensions": dimensions,
            "rowLimit": row_limit,
        }

        if filters:
            # All filters placed in a single group are combined by the API.
            request_body["dimensionFilterGroups"] = [{"filters": filters}]

        try:
            response = self.service.searchanalytics().query(
                siteUrl=site_url,
                body=request_body,
            ).execute()
        except Exception as e:
            logger.error(f"Failed to query search analytics: {e}")
            raise

        rows = response.get("rows", [])

        # Calculate totals
        total_clicks = sum(row.get("clicks", 0) for row in rows)
        total_impressions = sum(row.get("impressions", 0) for row in rows)
        total_ctr = sum(row.get("ctr", 0) for row in rows)
        total_position = sum(row.get("position", 0) for row in rows)

        # Per-row (unweighted) means; 0 when the result set is empty.
        avg_ctr = total_ctr / len(rows) if rows else 0
        avg_position = total_position / len(rows) if rows else 0

        return SearchAnalyticsResult(
            rows=rows,
            total_clicks=total_clicks,
            total_impressions=total_impressions,
            average_ctr=avg_ctr,
            average_position=avg_position,
        )
|
||||
|
||||
def get_top_queries(
|
||||
self,
|
||||
site_url: str,
|
||||
limit: int = 100,
|
||||
start_date: str | None = None,
|
||||
end_date: str | None = None,
|
||||
) -> list[dict]:
|
||||
"""Get top search queries by clicks."""
|
||||
result = self.get_search_analytics(
|
||||
site_url=site_url,
|
||||
dimensions=["query"],
|
||||
row_limit=limit,
|
||||
start_date=start_date,
|
||||
end_date=end_date,
|
||||
)
|
||||
|
||||
# Sort by clicks
|
||||
sorted_rows = sorted(
|
||||
result.rows,
|
||||
key=lambda x: x.get("clicks", 0),
|
||||
reverse=True,
|
||||
)
|
||||
|
||||
return [
|
||||
{
|
||||
"query": row["keys"][0],
|
||||
"clicks": row.get("clicks", 0),
|
||||
"impressions": row.get("impressions", 0),
|
||||
"ctr": row.get("ctr", 0),
|
||||
"position": row.get("position", 0),
|
||||
}
|
||||
for row in sorted_rows[:limit]
|
||||
]
|
||||
|
||||
def get_top_pages(
|
||||
self,
|
||||
site_url: str,
|
||||
limit: int = 100,
|
||||
start_date: str | None = None,
|
||||
end_date: str | None = None,
|
||||
) -> list[dict]:
|
||||
"""Get top pages by clicks."""
|
||||
result = self.get_search_analytics(
|
||||
site_url=site_url,
|
||||
dimensions=["page"],
|
||||
row_limit=limit,
|
||||
start_date=start_date,
|
||||
end_date=end_date,
|
||||
)
|
||||
|
||||
sorted_rows = sorted(
|
||||
result.rows,
|
||||
key=lambda x: x.get("clicks", 0),
|
||||
reverse=True,
|
||||
)
|
||||
|
||||
return [
|
||||
{
|
||||
"page": row["keys"][0],
|
||||
"clicks": row.get("clicks", 0),
|
||||
"impressions": row.get("impressions", 0),
|
||||
"ctr": row.get("ctr", 0),
|
||||
"position": row.get("position", 0),
|
||||
}
|
||||
for row in sorted_rows[:limit]
|
||||
]
|
||||
|
||||
    def get_sitemaps(self, site_url: str) -> list[SitemapInfo]:
        """Get list of sitemaps for a site.

        Args:
            site_url: Property URL as registered in Search Console.

        Returns:
            One SitemapInfo per submitted sitemap; empty list if none.

        Raises:
            Propagates API errors after logging them.
        """
        try:
            response = self.service.sitemaps().list(siteUrl=site_url).execute()
        except Exception as e:
            logger.error(f"Failed to get sitemaps: {e}")
            raise

        # The API lists entries under the singular "sitemap" key.
        sitemaps = []
        for sm in response.get("sitemap", []):
            sitemaps.append(SitemapInfo(
                path=sm.get("path", ""),
                last_submitted=sm.get("lastSubmitted"),
                last_downloaded=sm.get("lastDownloaded"),
                is_pending=sm.get("isPending", False),
                is_sitemaps_index=sm.get("isSitemapsIndex", False),
                warnings=sm.get("warnings", 0),
                errors=sm.get("errors", 0),
            ))

        return sitemaps
|
||||
|
||||
def submit_sitemap(self, site_url: str, sitemap_url: str) -> bool:
|
||||
"""Submit a sitemap for indexing."""
|
||||
try:
|
||||
self.service.sitemaps().submit(
|
||||
siteUrl=site_url,
|
||||
feedpath=sitemap_url,
|
||||
).execute()
|
||||
logger.info(f"Submitted sitemap: {sitemap_url}")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to submit sitemap: {e}")
|
||||
return False
|
||||
|
||||
def inspect_url(self, site_url: str, inspection_url: str) -> dict:
|
||||
"""
|
||||
Inspect a URL's indexing status.
|
||||
|
||||
Note: This uses the URL Inspection API which may have different quotas.
|
||||
"""
|
||||
try:
|
||||
response = self.service.urlInspection().index().inspect(
|
||||
body={
|
||||
"inspectionUrl": inspection_url,
|
||||
"siteUrl": site_url,
|
||||
}
|
||||
).execute()
|
||||
|
||||
result = response.get("inspectionResult", {})
|
||||
|
||||
return {
|
||||
"url": inspection_url,
|
||||
"indexing_state": result.get("indexStatusResult", {}).get(
|
||||
"coverageState", "Unknown"
|
||||
),
|
||||
"last_crawl_time": result.get("indexStatusResult", {}).get(
|
||||
"lastCrawlTime"
|
||||
),
|
||||
"crawled_as": result.get("indexStatusResult", {}).get("crawledAs"),
|
||||
"robots_txt_state": result.get("indexStatusResult", {}).get(
|
||||
"robotsTxtState"
|
||||
),
|
||||
"mobile_usability": result.get("mobileUsabilityResult", {}).get(
|
||||
"verdict", "Unknown"
|
||||
),
|
||||
"rich_results": result.get("richResultsResult", {}).get(
|
||||
"verdict", "Unknown"
|
||||
),
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to inspect URL: {e}")
|
||||
raise
|
||||
|
||||
    def get_performance_summary(
        self,
        site_url: str,
        days: int = 30,
    ) -> dict:
        """Get a summary of search performance.

        Aggregates site-wide totals, top 10 queries, top 10 pages and a
        per-device breakdown over the trailing *days* window.

        Args:
            site_url: Property URL as registered in Search Console.
            days: Size of the reporting window in days.

        Returns:
            Dict with period, total_clicks, total_impressions, average_ctr,
            average_position, top_queries, top_pages and by_device keys.
        """
        # Window ends yesterday — presumably because fresh Search Console
        # data lags by about a day; confirm against the API's data freshness.
        end_date = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
        start_date = (datetime.now() - timedelta(days=days)).strftime("%Y-%m-%d")

        # Get overall stats (no dimensions -> site-wide totals)
        overall = self.get_search_analytics(
            site_url=site_url,
            dimensions=[],
            start_date=start_date,
            end_date=end_date,
        )

        # Get top queries
        top_queries = self.get_top_queries(
            site_url=site_url,
            limit=10,
            start_date=start_date,
            end_date=end_date,
        )

        # Get top pages
        top_pages = self.get_top_pages(
            site_url=site_url,
            limit=10,
            start_date=start_date,
            end_date=end_date,
        )

        # Get by device
        by_device = self.get_search_analytics(
            site_url=site_url,
            dimensions=["device"],
            start_date=start_date,
            end_date=end_date,
        )

        device_breakdown = {}
        for row in by_device.rows:
            # First (and only) dimension key is the device name.
            device = row["keys"][0]
            device_breakdown[device] = {
                "clicks": row.get("clicks", 0),
                "impressions": row.get("impressions", 0),
                "ctr": row.get("ctr", 0),
                "position": row.get("position", 0),
            }

        return {
            "period": f"{start_date} to {end_date}",
            "total_clicks": overall.total_clicks,
            "total_impressions": overall.total_impressions,
            "average_ctr": overall.average_ctr,
            "average_position": overall.average_position,
            "top_queries": top_queries,
            "top_pages": top_pages,
            "by_device": device_breakdown,
        }
|
||||
|
||||
|
||||
def main():
    """Test the Search Console client from the command line.

    Actions: summary (JSON performance summary), queries / pages (top-20
    listings), sitemaps (status per sitemap), inspect (URL inspection,
    requires --url).
    """
    import argparse

    parser = argparse.ArgumentParser(description="Google Search Console Client")
    parser.add_argument("--site", "-s", required=True, help="Site URL")
    parser.add_argument("--action", "-a", default="summary",
                        choices=["summary", "queries", "pages", "sitemaps", "inspect"],
                        help="Action to perform")
    parser.add_argument("--url", help="URL to inspect")
    parser.add_argument("--days", type=int, default=30, help="Days of data")

    args = parser.parse_args()

    # Fail fast instead of silently doing nothing (the previous behavior)
    # when "inspect" is requested without a target URL.
    if args.action == "inspect" and not args.url:
        parser.error("--url is required for the inspect action")

    client = SearchConsoleClient()

    if args.action == "summary":
        import json
        summary = client.get_performance_summary(args.site, args.days)
        print(json.dumps(summary, indent=2, default=str))

    elif args.action == "queries":
        for q in client.get_top_queries(args.site)[:20]:
            print(f"{q['query']}: {q['clicks']} clicks, pos {q['position']:.1f}")

    elif args.action == "pages":
        for p in client.get_top_pages(args.site)[:20]:
            print(f"{p['page']}: {p['clicks']} clicks, pos {p['position']:.1f}")

    elif args.action == "sitemaps":
        for sm in client.get_sitemaps(args.site):
            print(f"{sm.path}: errors={sm.errors}, warnings={sm.warnings}")

    elif args.action == "inspect":
        import json
        result = client.inspect_url(args.site, args.url)
        print(json.dumps(result, indent=2))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,951 @@
|
||||
"""
|
||||
Notion Reporter - Create SEO audit findings in Notion
|
||||
=====================================================
|
||||
Purpose: Output SEO audit findings to Notion databases
|
||||
Python: 3.10+
|
||||
Usage:
|
||||
from notion_reporter import NotionReporter, SEOFinding, AuditReport
|
||||
reporter = NotionReporter()
|
||||
|
||||
# Create audit report with checklist table
|
||||
report = AuditReport(site="https://example.com")
|
||||
report.add_finding(SEOFinding(...))
|
||||
reporter.create_audit_report(report)
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from notion_client import Client
|
||||
|
||||
from base_client import config
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Template directory
|
||||
TEMPLATE_DIR = Path(__file__).parent.parent / "templates"
|
||||
|
||||
# Default OurDigital SEO Audit Log database
|
||||
DEFAULT_DATABASE_ID = "2c8581e5-8a1e-8035-880b-e38cefc2f3ef"
|
||||
|
||||
# Default parent page for audit reports (OurDigital SEO Audit Log)
|
||||
DEFAULT_AUDIT_REPORTS_PAGE_ID = "2c8581e5-8a1e-8035-880b-e38cefc2f3ef"
|
||||
|
||||
|
||||
@dataclass
class SEOFinding:
    """Represents an SEO audit finding.

    Carries both the database metadata (issue, category, priority, status)
    and the longer page-body texts (description, impact, recommendation).
    """

    issue: str  # Short title of the problem; becomes the Notion page title
    category: str  # Presumably one of NotionReporter.CATEGORIES — used as a select value
    priority: str  # Presumably "Critical" | "High" | "Medium" | "Low"
    status: str = "To Fix"  # Workflow status; see NotionReporter.STATUSES
    url: str | None = None  # Primary affected URL, if a single one applies
    description: str | None = None  # Long-form explanation (rendered as page body)
    impact: str | None = None  # Why the issue matters (rendered as a callout)
    recommendation: str | None = None  # Suggested fix (rendered as a callout)
    site: str | None = None  # The audited site URL
    audit_id: str | None = None  # Groups findings from same audit session
    affected_urls: list[str] = field(default_factory=list)  # List of all affected URLs
|
||||
|
||||
|
||||
@dataclass
class AuditReport:
    """Represents a complete SEO audit report with checklist.

    Collects SEOFinding objects plus per-check status strings; consumed by
    NotionReporter.create_audit_report.
    """

    site: str  # Audited site URL (including scheme)
    # Timestamp-based session id; combined with the domain in add_finding.
    audit_id: str = field(default_factory=lambda: datetime.now().strftime("%Y%m%d-%H%M%S"))
    audit_date: datetime = field(default_factory=datetime.now)
    findings: list[SEOFinding] = field(default_factory=list)

    # Audit check results (free-form status strings shown in the report)
    robots_txt_status: str = "Not checked"
    sitemap_status: str = "Not checked"
    schema_status: str = "Not checked"
    performance_status: str = "Not checked"

    # Summary statistics
    total_urls_checked: int = 0
    total_issues: int = 0  # Kept in sync with len(findings) by add_finding

    def add_finding(self, finding: SEOFinding) -> None:
        """Add a finding to the report.

        Stamps the finding with this report's site and a
        "<domain>-<session id>" audit id so findings group together.
        """
        finding.site = self.site
        finding.audit_id = f"{self.site.replace('https://', '').replace('http://', '').split('/')[0]}-{self.audit_id}"
        self.findings.append(finding)
        self.total_issues = len(self.findings)

    def get_findings_by_priority(self) -> dict[str, list[SEOFinding]]:
        """Group findings by priority.

        Note: findings with a priority outside the four known levels are
        silently dropped from the result.
        """
        result = {"Critical": [], "High": [], "Medium": [], "Low": []}
        for f in self.findings:
            if f.priority in result:
                result[f.priority].append(f)
        return result

    def get_findings_by_category(self) -> dict[str, list[SEOFinding]]:
        """Group findings by category (keys appear in insertion order)."""
        result = {}
        for f in self.findings:
            if f.category not in result:
                result[f.category] = []
            result[f.category].append(f)
        return result
|
||||
|
||||
|
||||
class NotionReporter:
    """Create and manage SEO audit findings in Notion.

    Wraps the official notion-client SDK: creates finding databases,
    writes individual finding pages, and builds full audit report pages
    with checklist tables.
    """

    # Select options offered for the "Category" database property.
    CATEGORIES = [
        "Technical SEO",
        "On-page SEO",
        "Content",
        "Local SEO",
        "Performance",
        "Schema/Structured Data",
        "Sitemap",
        "Robots.txt",
    ]

    # Allowed "Priority" select options, highest severity first.
    PRIORITIES = ["Critical", "High", "Medium", "Low"]

    # Allowed "Status" values; validated by update_finding_status.
    STATUSES = ["To Fix", "In Progress", "Fixed", "Monitoring"]

    # Notion select-option colors keyed by category name.
    CATEGORY_COLORS = {
        "Technical SEO": "blue",
        "On-page SEO": "green",
        "Content": "purple",
        "Local SEO": "orange",
        "Performance": "red",
        "Schema/Structured Data": "yellow",
        "Sitemap": "pink",
        "Robots.txt": "gray",
    }

    # Notion select-option colors keyed by priority name.
    PRIORITY_COLORS = {
        "Critical": "red",
        "High": "orange",
        "Medium": "yellow",
        "Low": "gray",
    }
|
||||
|
||||
    def __init__(self, token: str | None = None):
        """
        Initialize Notion reporter.

        Args:
            token: Notion API token; falls back to config.notion_token
                (populated from NOTION_TOKEN / NOTION_API_KEY per the
                error message below — confirm against base_client).

        Raises:
            ValueError: If no token is available from either source.
        """
        self.token = token or config.notion_token
        if not self.token:
            raise ValueError(
                "Notion token not configured. "
                "Set NOTION_TOKEN or NOTION_API_KEY environment variable."
            )
        # Authenticated Notion SDK client used by all methods.
        self.client = Client(auth=self.token)
|
||||
|
||||
    def create_findings_database(
        self,
        parent_page_id: str,
        title: str = "SEO Audit Findings",
    ) -> str:
        """
        Create a new SEO findings database.

        The schema mirrors what add_finding writes: Issue (title),
        Category/Priority selects colored via the class constants, a
        Status property, URL, free-text fields and a Found Date.

        Args:
            parent_page_id: Parent page ID for the database
            title: Database title

        Returns:
            Database ID

        Raises:
            Propagates Notion API errors after logging.
        """
        # Build database schema
        properties = {
            "Issue": {"title": {}},
            "Category": {
                "select": {
                    "options": [
                        {"name": cat, "color": self.CATEGORY_COLORS.get(cat, "default")}
                        for cat in self.CATEGORIES
                    ]
                }
            },
            "Priority": {
                "select": {
                    "options": [
                        {"name": pri, "color": self.PRIORITY_COLORS.get(pri, "default")}
                        for pri in self.PRIORITIES
                    ]
                }
            },
            # NOTE(review): the public Notion API has historically rejected
            # creating/configuring "status" properties via databases.create —
            # verify this schema is actually accepted.
            "Status": {
                "status": {
                    "options": [
                        {"name": "To Fix", "color": "red"},
                        {"name": "In Progress", "color": "yellow"},
                        {"name": "Fixed", "color": "green"},
                        {"name": "Monitoring", "color": "blue"},
                    ],
                    "groups": [
                        {"name": "To-do", "option_ids": [], "color": "gray"},
                        {"name": "In progress", "option_ids": [], "color": "blue"},
                        {"name": "Complete", "option_ids": [], "color": "green"},
                    ],
                }
            },
            "URL": {"url": {}},
            "Description": {"rich_text": {}},
            "Impact": {"rich_text": {}},
            "Recommendation": {"rich_text": {}},
            "Found Date": {"date": {}},
        }

        try:
            response = self.client.databases.create(
                parent={"page_id": parent_page_id},
                title=[{"type": "text", "text": {"content": title}}],
                properties=properties,
            )
            database_id = response["id"]
            logger.info(f"Created database: {database_id}")
            return database_id
        except Exception as e:
            logger.error(f"Failed to create database: {e}")
            raise
|
||||
|
||||
def add_finding(
|
||||
self,
|
||||
finding: SEOFinding,
|
||||
database_id: str | None = None,
|
||||
) -> str:
|
||||
"""
|
||||
Add a finding to the database with page content.
|
||||
|
||||
Args:
|
||||
finding: SEOFinding object
|
||||
database_id: Target database ID (defaults to OurDigital SEO Audit Log)
|
||||
|
||||
Returns:
|
||||
Page ID of created entry
|
||||
"""
|
||||
db_id = database_id or DEFAULT_DATABASE_ID
|
||||
|
||||
# Database properties (metadata)
|
||||
properties = {
|
||||
"Issue": {"title": [{"text": {"content": finding.issue}}]},
|
||||
"Category": {"select": {"name": finding.category}},
|
||||
"Priority": {"select": {"name": finding.priority}},
|
||||
"Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
|
||||
}
|
||||
|
||||
if finding.url:
|
||||
properties["URL"] = {"url": finding.url}
|
||||
|
||||
if finding.site:
|
||||
properties["Site"] = {"url": finding.site}
|
||||
|
||||
if finding.audit_id:
|
||||
properties["Audit ID"] = {
|
||||
"rich_text": [{"text": {"content": finding.audit_id}}]
|
||||
}
|
||||
|
||||
# Page content blocks (Description, Impact, Recommendation)
|
||||
children = []
|
||||
|
||||
if finding.description:
|
||||
children.extend([
|
||||
{
|
||||
"object": "block",
|
||||
"type": "heading_2",
|
||||
"heading_2": {
|
||||
"rich_text": [{"type": "text", "text": {"content": "Description"}}]
|
||||
}
|
||||
},
|
||||
{
|
||||
"object": "block",
|
||||
"type": "paragraph",
|
||||
"paragraph": {
|
||||
"rich_text": [{"type": "text", "text": {"content": finding.description}}]
|
||||
}
|
||||
}
|
||||
])
|
||||
|
||||
if finding.impact:
|
||||
children.extend([
|
||||
{
|
||||
"object": "block",
|
||||
"type": "heading_2",
|
||||
"heading_2": {
|
||||
"rich_text": [{"type": "text", "text": {"content": "Impact"}}]
|
||||
}
|
||||
},
|
||||
{
|
||||
"object": "block",
|
||||
"type": "callout",
|
||||
"callout": {
|
||||
"rich_text": [{"type": "text", "text": {"content": finding.impact}}],
|
||||
"icon": {"type": "emoji", "emoji": "⚠️"}
|
||||
}
|
||||
}
|
||||
])
|
||||
|
||||
if finding.recommendation:
|
||||
children.extend([
|
||||
{
|
||||
"object": "block",
|
||||
"type": "heading_2",
|
||||
"heading_2": {
|
||||
"rich_text": [{"type": "text", "text": {"content": "Recommendation"}}]
|
||||
}
|
||||
},
|
||||
{
|
||||
"object": "block",
|
||||
"type": "callout",
|
||||
"callout": {
|
||||
"rich_text": [{"type": "text", "text": {"content": finding.recommendation}}],
|
||||
"icon": {"type": "emoji", "emoji": "💡"}
|
||||
}
|
||||
}
|
||||
])
|
||||
|
||||
try:
|
||||
response = self.client.pages.create(
|
||||
parent={"database_id": db_id},
|
||||
properties=properties,
|
||||
children=children if children else None,
|
||||
)
|
||||
return response["id"]
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to add finding: {e}")
|
||||
raise
|
||||
|
||||
def add_findings_batch(
|
||||
self,
|
||||
findings: list[SEOFinding],
|
||||
database_id: str | None = None,
|
||||
) -> list[str]:
|
||||
"""
|
||||
Add multiple findings to the database.
|
||||
|
||||
Args:
|
||||
findings: List of SEOFinding objects
|
||||
database_id: Target database ID (defaults to OurDigital SEO Audit Log)
|
||||
|
||||
Returns:
|
||||
List of created page IDs
|
||||
"""
|
||||
page_ids = []
|
||||
for finding in findings:
|
||||
try:
|
||||
page_id = self.add_finding(finding, database_id)
|
||||
page_ids.append(page_id)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to add finding '{finding.issue}': {e}")
|
||||
return page_ids
|
||||
|
||||
    def create_audit_summary_page(
        self,
        parent_page_id: str,
        url: str,
        summary: dict,
    ) -> str:
        """
        Create a summary page for the audit.

        Args:
            parent_page_id: Parent page ID
            url: Audited URL
            summary: Audit summary data; recognized keys are "stats"
                (dict rendered as a bullet-style paragraph) and
                "findings_by_priority" (dict of priority -> count).

        Returns:
            Page ID

        Raises:
            Propagates Notion API errors after logging.
        """
        # Build page content: title heading, audit date, divider, summary.
        children = [
            {
                "object": "block",
                "type": "heading_1",
                "heading_1": {
                    "rich_text": [{"type": "text", "text": {"content": f"SEO Audit: {url}"}}]
                },
            },
            {
                "object": "block",
                "type": "paragraph",
                "paragraph": {
                    "rich_text": [
                        {
                            "type": "text",
                            "text": {"content": f"Audit Date: {datetime.now().strftime('%Y-%m-%d %H:%M')}"},
                        }
                    ]
                },
            },
            {
                "object": "block",
                "type": "divider",
                "divider": {},
            },
            {
                "object": "block",
                "type": "heading_2",
                "heading_2": {
                    "rich_text": [{"type": "text", "text": {"content": "Summary"}}]
                },
            },
        ]

        # Add summary statistics as a single bullet-formatted paragraph.
        if "stats" in summary:
            stats = summary["stats"]
            stats_text = "\n".join([f"• {k}: {v}" for k, v in stats.items()])
            children.append({
                "object": "block",
                "type": "paragraph",
                "paragraph": {
                    "rich_text": [{"type": "text", "text": {"content": stats_text}}]
                },
            })

        # Add findings by priority as a bulleted list.
        if "findings_by_priority" in summary:
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {
                    "rich_text": [{"type": "text", "text": {"content": "Findings by Priority"}}]
                },
            })

            for priority, count in summary["findings_by_priority"].items():
                children.append({
                    "object": "block",
                    "type": "bulleted_list_item",
                    "bulleted_list_item": {
                        "rich_text": [{"type": "text", "text": {"content": f"{priority}: {count}"}}]
                    },
                })

        try:
            response = self.client.pages.create(
                parent={"page_id": parent_page_id},
                properties={
                    "title": {"title": [{"text": {"content": f"SEO Audit - {url}"}}]}
                },
                children=children,
            )
            return response["id"]
        except Exception as e:
            logger.error(f"Failed to create summary page: {e}")
            raise
|
||||
|
||||
def query_findings(
|
||||
self,
|
||||
database_id: str,
|
||||
category: str | None = None,
|
||||
priority: str | None = None,
|
||||
status: str | None = None,
|
||||
) -> list[dict]:
|
||||
"""
|
||||
Query findings from database.
|
||||
|
||||
Args:
|
||||
database_id: Database ID
|
||||
category: Filter by category
|
||||
priority: Filter by priority
|
||||
status: Filter by status
|
||||
|
||||
Returns:
|
||||
List of finding records
|
||||
"""
|
||||
filters = []
|
||||
|
||||
if category:
|
||||
filters.append({
|
||||
"property": "Category",
|
||||
"select": {"equals": category},
|
||||
})
|
||||
|
||||
if priority:
|
||||
filters.append({
|
||||
"property": "Priority",
|
||||
"select": {"equals": priority},
|
||||
})
|
||||
|
||||
if status:
|
||||
filters.append({
|
||||
"property": "Status",
|
||||
"status": {"equals": status},
|
||||
})
|
||||
|
||||
query_params = {"database_id": database_id}
|
||||
if filters:
|
||||
if len(filters) == 1:
|
||||
query_params["filter"] = filters[0]
|
||||
else:
|
||||
query_params["filter"] = {"and": filters}
|
||||
|
||||
try:
|
||||
response = self.client.databases.query(**query_params)
|
||||
return response.get("results", [])
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to query findings: {e}")
|
||||
raise
|
||||
|
||||
def update_finding_status(
|
||||
self,
|
||||
page_id: str,
|
||||
status: str,
|
||||
) -> None:
|
||||
"""Update the status of a finding."""
|
||||
if status not in self.STATUSES:
|
||||
raise ValueError(f"Invalid status: {status}")
|
||||
|
||||
try:
|
||||
self.client.pages.update(
|
||||
page_id=page_id,
|
||||
properties={"Status": {"status": {"name": status}}},
|
||||
)
|
||||
logger.info(f"Updated finding {page_id} to {status}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to update status: {e}")
|
||||
raise
|
||||
|
||||
    def create_audit_report(
        self,
        report: "AuditReport",
        database_id: str | None = None,
    ) -> dict:
        """
        Create a comprehensive audit report page with checklist table.

        This creates:
        1. Individual finding pages in the database
        2. A summary page with all findings in table format for checklist tracking

        Args:
            report: AuditReport object with all findings
            database_id: Target database ID (defaults to OurDigital SEO Audit Log)

        Returns:
            Dict with summary_page_id and finding_page_ids
        """
        db_id = database_id or DEFAULT_DATABASE_ID

        # Generate full audit ID: "<domain>-<session id>", the same format
        # AuditReport.add_finding stamps on findings.
        site_domain = report.site.replace('https://', '').replace('http://', '').split('/')[0]
        full_audit_id = f"{site_domain}-{report.audit_id}"

        result = {
            "audit_id": full_audit_id,
            "site": report.site,
            "summary_page_id": None,
            "finding_page_ids": [],
        }

        # 1. Create individual finding pages in database.  Failures are
        #    logged and skipped so one bad finding doesn't abort the report.
        logger.info(f"Creating {len(report.findings)} finding pages...")
        for finding in report.findings:
            finding.audit_id = full_audit_id
            finding.site = report.site
            try:
                page_id = self.add_finding(finding, db_id)
                result["finding_page_ids"].append(page_id)
            except Exception as e:
                logger.error(f"Failed to add finding '{finding.issue}': {e}")

        # 2. Create summary page with checklist table
        logger.info("Creating audit summary page with checklist...")
        summary_page_id = self._create_audit_summary_with_table(report, full_audit_id, db_id)
        result["summary_page_id"] = summary_page_id

        logger.info(f"Audit report created: {full_audit_id}")
        return result
|
||||
|
||||
def _create_audit_summary_with_table(
|
||||
self,
|
||||
report: "AuditReport",
|
||||
audit_id: str,
|
||||
database_id: str,
|
||||
) -> str:
|
||||
"""
|
||||
Create audit summary page with checklist table format.
|
||||
|
||||
Args:
|
||||
report: AuditReport object
|
||||
audit_id: Full audit ID
|
||||
database_id: Parent database ID
|
||||
|
||||
Returns:
|
||||
Summary page ID
|
||||
"""
|
||||
site_domain = report.site.replace('https://', '').replace('http://', '').split('/')[0]
|
||||
|
||||
# Build page content blocks
|
||||
children = []
|
||||
|
||||
# Header with audit info
|
||||
children.append({
|
||||
"object": "block",
|
||||
"type": "callout",
|
||||
"callout": {
|
||||
"rich_text": [
|
||||
{"type": "text", "text": {"content": f"Audit ID: {audit_id}\n"}},
|
||||
{"type": "text", "text": {"content": f"Date: {report.audit_date.strftime('%Y-%m-%d %H:%M')}\n"}},
|
||||
{"type": "text", "text": {"content": f"Total Issues: {report.total_issues}"}},
|
||||
],
|
||||
"icon": {"type": "emoji", "emoji": "📋"},
|
||||
"color": "blue_background",
|
||||
}
|
||||
})
|
||||
|
||||
# Audit Status Summary
|
||||
children.append({
|
||||
"object": "block",
|
||||
"type": "heading_2",
|
||||
"heading_2": {
|
||||
"rich_text": [{"type": "text", "text": {"content": "Audit Status"}}]
|
||||
}
|
||||
})
|
||||
|
||||
# Status table
|
||||
status_table = {
|
||||
"object": "block",
|
||||
"type": "table",
|
||||
"table": {
|
||||
"table_width": 2,
|
||||
"has_column_header": True,
|
||||
"has_row_header": False,
|
||||
"children": [
|
||||
{
|
||||
"type": "table_row",
|
||||
"table_row": {
|
||||
"cells": [
|
||||
[{"type": "text", "text": {"content": "Check"}}],
|
||||
[{"type": "text", "text": {"content": "Status"}}],
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "table_row",
|
||||
"table_row": {
|
||||
"cells": [
|
||||
[{"type": "text", "text": {"content": "Robots.txt"}}],
|
||||
[{"type": "text", "text": {"content": report.robots_txt_status}}],
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "table_row",
|
||||
"table_row": {
|
||||
"cells": [
|
||||
[{"type": "text", "text": {"content": "Sitemap"}}],
|
||||
[{"type": "text", "text": {"content": report.sitemap_status}}],
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "table_row",
|
||||
"table_row": {
|
||||
"cells": [
|
||||
[{"type": "text", "text": {"content": "Schema Markup"}}],
|
||||
[{"type": "text", "text": {"content": report.schema_status}}],
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "table_row",
|
||||
"table_row": {
|
||||
"cells": [
|
||||
[{"type": "text", "text": {"content": "Performance"}}],
|
||||
[{"type": "text", "text": {"content": report.performance_status}}],
|
||||
]
|
||||
}
|
||||
},
|
||||
]
|
||||
}
|
||||
}
|
||||
children.append(status_table)
|
||||
|
||||
# Divider
|
||||
children.append({"object": "block", "type": "divider", "divider": {}})
|
||||
|
||||
# Findings Checklist Header
|
||||
children.append({
|
||||
"object": "block",
|
||||
"type": "heading_2",
|
||||
"heading_2": {
|
||||
"rich_text": [{"type": "text", "text": {"content": "Findings Checklist"}}]
|
||||
}
|
||||
})
|
||||
|
||||
children.append({
|
||||
"object": "block",
|
||||
"type": "paragraph",
|
||||
"paragraph": {
|
||||
"rich_text": [{"type": "text", "text": {"content": "Use this checklist to track fixes. Check off items as you complete them."}}]
|
||||
}
|
||||
})
|
||||
|
||||
# Create findings table with checklist format
|
||||
if report.findings:
|
||||
# Build table rows - Header row
|
||||
table_rows = [
|
||||
{
|
||||
"type": "table_row",
|
||||
"table_row": {
|
||||
"cells": [
|
||||
[{"type": "text", "text": {"content": "#"}, "annotations": {"bold": True}}],
|
||||
[{"type": "text", "text": {"content": "Priority"}, "annotations": {"bold": True}}],
|
||||
[{"type": "text", "text": {"content": "Category"}, "annotations": {"bold": True}}],
|
||||
[{"type": "text", "text": {"content": "Issue"}, "annotations": {"bold": True}}],
|
||||
[{"type": "text", "text": {"content": "URL"}, "annotations": {"bold": True}}],
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
# Add finding rows
|
||||
for idx, finding in enumerate(report.findings, 1):
|
||||
# Truncate long text for table cells
|
||||
issue_text = finding.issue[:50] + "..." if len(finding.issue) > 50 else finding.issue
|
||||
url_text = finding.url[:40] + "..." if finding.url and len(finding.url) > 40 else (finding.url or "-")
|
||||
|
||||
table_rows.append({
|
||||
"type": "table_row",
|
||||
"table_row": {
|
||||
"cells": [
|
||||
[{"type": "text", "text": {"content": str(idx)}}],
|
||||
[{"type": "text", "text": {"content": finding.priority}}],
|
||||
[{"type": "text", "text": {"content": finding.category}}],
|
||||
[{"type": "text", "text": {"content": issue_text}}],
|
||||
[{"type": "text", "text": {"content": url_text}}],
|
||||
]
|
||||
}
|
||||
})
|
||||
|
||||
findings_table = {
|
||||
"object": "block",
|
||||
"type": "table",
|
||||
"table": {
|
||||
"table_width": 5,
|
||||
"has_column_header": True,
|
||||
"has_row_header": False,
|
||||
"children": table_rows
|
||||
}
|
||||
}
|
||||
children.append(findings_table)
|
||||
|
||||
# Divider
|
||||
children.append({"object": "block", "type": "divider", "divider": {}})
|
||||
|
||||
# Detailed Findings with To-Do checkboxes
|
||||
children.append({
|
||||
"object": "block",
|
||||
"type": "heading_2",
|
||||
"heading_2": {
|
||||
"rich_text": [{"type": "text", "text": {"content": "Detailed Findings & Actions"}}]
|
||||
}
|
||||
})
|
||||
|
||||
# Group findings by priority and add as to-do items
|
||||
for priority in ["Critical", "High", "Medium", "Low"]:
|
||||
priority_findings = [f for f in report.findings if f.priority == priority]
|
||||
if not priority_findings:
|
||||
continue
|
||||
|
||||
# Priority header with emoji
|
||||
priority_emoji = {"Critical": "🔴", "High": "🟠", "Medium": "🟡", "Low": "⚪"}
|
||||
children.append({
|
||||
"object": "block",
|
||||
"type": "heading_3",
|
||||
"heading_3": {
|
||||
"rich_text": [{"type": "text", "text": {"content": f"{priority_emoji.get(priority, '')} {priority} Priority ({len(priority_findings)})"}}]
|
||||
}
|
||||
})
|
||||
|
||||
# Add each finding as a to-do item with details
|
||||
for finding in priority_findings:
|
||||
# Main to-do item
|
||||
children.append({
|
||||
"object": "block",
|
||||
"type": "to_do",
|
||||
"to_do": {
|
||||
"rich_text": [
|
||||
{"type": "text", "text": {"content": f"[{finding.category}] "}, "annotations": {"bold": True}},
|
||||
{"type": "text", "text": {"content": finding.issue}},
|
||||
],
|
||||
"checked": False,
|
||||
}
|
||||
})
|
||||
|
||||
# URL if available
|
||||
if finding.url:
|
||||
children.append({
|
||||
"object": "block",
|
||||
"type": "bulleted_list_item",
|
||||
"bulleted_list_item": {
|
||||
"rich_text": [
|
||||
{"type": "text", "text": {"content": "URL: "}},
|
||||
{"type": "text", "text": {"content": finding.url, "link": {"url": finding.url}}},
|
||||
]
|
||||
}
|
||||
})
|
||||
|
||||
# Affected URLs list if available
|
||||
if finding.affected_urls:
|
||||
children.append({
|
||||
"object": "block",
|
||||
"type": "toggle",
|
||||
"toggle": {
|
||||
"rich_text": [{"type": "text", "text": {"content": f"Affected URLs ({len(finding.affected_urls)})"}}],
|
||||
"children": [
|
||||
{
|
||||
"object": "block",
|
||||
"type": "bulleted_list_item",
|
||||
"bulleted_list_item": {
|
||||
"rich_text": [{"type": "text", "text": {"content": url, "link": {"url": url} if url.startswith("http") else None}}]
|
||||
}
|
||||
}
|
||||
for url in finding.affected_urls[:20] # Limit to 20 URLs
|
||||
] + ([{
|
||||
"object": "block",
|
||||
"type": "paragraph",
|
||||
"paragraph": {
|
||||
"rich_text": [{"type": "text", "text": {"content": f"... and {len(finding.affected_urls) - 20} more URLs"}}]
|
||||
}
|
||||
}] if len(finding.affected_urls) > 20 else [])
|
||||
}
|
||||
})
|
||||
|
||||
# Recommendation as sub-item
|
||||
if finding.recommendation:
|
||||
children.append({
|
||||
"object": "block",
|
||||
"type": "bulleted_list_item",
|
||||
"bulleted_list_item": {
|
||||
"rich_text": [
|
||||
{"type": "text", "text": {"content": "💡 "}, "annotations": {"bold": True}},
|
||||
{"type": "text", "text": {"content": finding.recommendation}},
|
||||
]
|
||||
}
|
||||
})
|
||||
|
||||
# Create the summary page
|
||||
try:
|
||||
response = self.client.pages.create(
|
||||
parent={"database_id": database_id},
|
||||
properties={
|
||||
"Issue": {"title": [{"text": {"content": f"📊 Audit Report: {site_domain}"}}]},
|
||||
"Category": {"select": {"name": "Technical SEO"}},
|
||||
"Priority": {"select": {"name": "High"}},
|
||||
"Site": {"url": report.site},
|
||||
"Audit ID": {"rich_text": [{"text": {"content": audit_id}}]},
|
||||
"Found Date": {"date": {"start": report.audit_date.strftime("%Y-%m-%d")}},
|
||||
},
|
||||
children=children,
|
||||
)
|
||||
logger.info(f"Created audit summary page: {response['id']}")
|
||||
return response["id"]
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create audit summary page: {e}")
|
||||
raise
|
||||
|
||||
def create_quick_audit_report(
|
||||
self,
|
||||
site: str,
|
||||
findings: list[SEOFinding],
|
||||
robots_status: str = "Not checked",
|
||||
sitemap_status: str = "Not checked",
|
||||
schema_status: str = "Not checked",
|
||||
performance_status: str = "Not checked",
|
||||
database_id: str | None = None,
|
||||
) -> dict:
|
||||
"""
|
||||
Quick method to create audit report from a list of findings.
|
||||
|
||||
Args:
|
||||
site: Site URL
|
||||
findings: List of SEOFinding objects
|
||||
robots_status: Robots.txt check result
|
||||
sitemap_status: Sitemap check result
|
||||
schema_status: Schema check result
|
||||
performance_status: Performance check result
|
||||
database_id: Target database ID
|
||||
|
||||
Returns:
|
||||
Dict with audit results
|
||||
"""
|
||||
report = AuditReport(site=site)
|
||||
report.robots_txt_status = robots_status
|
||||
report.sitemap_status = sitemap_status
|
||||
report.schema_status = schema_status
|
||||
report.performance_status = performance_status
|
||||
|
||||
for finding in findings:
|
||||
report.add_finding(finding)
|
||||
|
||||
return self.create_audit_report(report, database_id)
|
||||
|
||||
|
||||
def main():
    """CLI entry point for testing.

    Supported actions:
      create-db    -- create a findings database under --parent-id
      add-finding  -- insert one hard-coded example finding (smoke test)
      query        -- print up to five finding titles from --database-id
    """
    import argparse

    parser = argparse.ArgumentParser(description="Notion SEO Reporter")
    parser.add_argument("--action", "-a", required=True,
                        choices=["create-db", "add-finding", "query"],
                        help="Action to perform")
    parser.add_argument("--parent-id", "-p", help="Parent page ID")
    parser.add_argument("--database-id", "-d", help="Database ID")
    parser.add_argument("--title", "-t", default="SEO Audit Findings",
                        help="Database title")

    args = parser.parse_args()

    # NOTE(review): NotionReporter presumably reads its API token from the
    # environment -- confirm against its constructor.
    reporter = NotionReporter()

    if args.action == "create-db":
        # --parent-id is only required for this action, so it is validated
        # here rather than at the argparse level.
        if not args.parent_id:
            parser.error("--parent-id required for create-db")
        db_id = reporter.create_findings_database(args.parent_id, args.title)
        print(f"Created database: {db_id}")

    elif args.action == "add-finding":
        if not args.database_id:
            parser.error("--database-id required for add-finding")
        # Example finding
        finding = SEOFinding(
            issue="Missing meta description",
            category="On-page SEO",
            priority="Medium",
            url="https://example.com/page",
            description="Page is missing meta description tag",
            impact="May affect CTR in search results",
            recommendation="Add unique meta description under 160 characters",
        )
        page_id = reporter.add_finding(args.database_id, finding)
        print(f"Created finding: {page_id}")

    elif args.action == "query":
        if not args.database_id:
            parser.error("--database-id required for query")
        findings = reporter.query_findings(args.database_id)
        print(f"Found {len(findings)} findings")
        # Show only the first five titles; empty title arrays are skipped.
        for f in findings[:5]:
            title = f["properties"]["Issue"]["title"]
            if title:
                print(f" - {title[0]['plain_text']}")
|
||||
@@ -0,0 +1,569 @@
|
||||
"""
|
||||
Page Analyzer - Extract SEO metadata from web pages
|
||||
===================================================
|
||||
Purpose: Comprehensive page-level SEO data extraction
|
||||
Python: 3.10+
|
||||
Usage:
|
||||
from page_analyzer import PageAnalyzer, PageMetadata
|
||||
analyzer = PageAnalyzer()
|
||||
metadata = analyzer.analyze_url("https://example.com/page")
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class LinkData:
    """Represents a link found on a page."""
    url: str  # absolute URL (relative hrefs are resolved with urljoin by the analyzer)
    anchor_text: str  # visible text, truncated to 100 chars at extraction time
    is_internal: bool  # True when the link's host matches (or is a subdomain of) the page's host
    is_nofollow: bool = False  # True when rel contains "nofollow"
    link_type: str = "body"  # body, nav, footer, etc.
|
||||
|
||||
|
||||
@dataclass
class HeadingData:
    """Represents a heading found on a page."""
    level: int  # 1-6, matching the h1..h6 tag
    text: str  # whitespace-stripped heading text
||||
|
||||
|
||||
@dataclass
class SchemaData:
    """Represents schema.org structured data."""
    schema_type: str  # e.g. "Article", "Organization" (last path segment for microdata)
    properties: dict  # full parsed object for JSON-LD; empty dict for microdata
    format: str = "json-ld"  # json-ld, microdata, rdfa
|
||||
|
||||
|
||||
@dataclass
class OpenGraphData:
    """Represents Open Graph metadata.

    Also carries Twitter Card tags, since both are social-preview metadata
    extracted from the same <meta> scan. All fields default to None when
    the corresponding tag is absent.
    """
    og_title: str | None = None
    og_description: str | None = None
    og_image: str | None = None
    og_url: str | None = None
    og_type: str | None = None
    og_site_name: str | None = None
    og_locale: str | None = None
    twitter_card: str | None = None
    twitter_title: str | None = None
    twitter_description: str | None = None
    twitter_image: str | None = None
|
||||
|
||||
|
||||
@dataclass
class PageMetadata:
    """Complete SEO metadata for a page.

    Populated by PageAnalyzer.analyze_url(). Fields default to "empty"
    values so a partially analyzed page (e.g. after an HTTP error) is
    still representable; check ``issues`` for hard failures.
    """

    # Basic info
    url: str
    status_code: int = 0  # 0 until a response is received
    content_type: str = ""
    response_time_ms: float = 0
    analyzed_at: datetime = field(default_factory=datetime.now)

    # Meta tags
    title: str | None = None
    title_length: int = 0  # cached len(title)
    meta_description: str | None = None
    meta_description_length: int = 0  # cached len(meta_description)
    canonical_url: str | None = None  # resolved to an absolute URL
    robots_meta: str | None = None  # may also carry a "googlebot: ..." suffix

    # Language
    html_lang: str | None = None
    hreflang_tags: list[dict] = field(default_factory=list)  # [{"lang": "en", "url": "..."}]

    # Headings
    headings: list[HeadingData] = field(default_factory=list)
    h1_count: int = 0
    h1_text: str | None = None  # text of the first H1 only

    # Open Graph & Social
    open_graph: OpenGraphData = field(default_factory=OpenGraphData)

    # Schema/Structured Data
    schema_data: list[SchemaData] = field(default_factory=list)
    schema_types_found: list[str] = field(default_factory=list)  # de-duplicated type names

    # Links
    internal_links: list[LinkData] = field(default_factory=list)
    external_links: list[LinkData] = field(default_factory=list)
    internal_link_count: int = 0
    external_link_count: int = 0

    # Images
    images_total: int = 0
    images_without_alt: int = 0
    images_with_alt: int = 0

    # Content metrics
    word_count: int = 0  # whitespace-split words after stripping script/style

    # Issues found
    issues: list[str] = field(default_factory=list)  # hard failures (missing title, noindex, ...)
    warnings: list[str] = field(default_factory=list)  # softer advisories

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization.

        Link/heading/schema details are reduced to counts, and only a
        subset of Open Graph fields is included; use the dataclass itself
        for full detail.
        """
        return {
            "url": self.url,
            "status_code": self.status_code,
            "content_type": self.content_type,
            "response_time_ms": self.response_time_ms,
            "analyzed_at": self.analyzed_at.isoformat(),
            "title": self.title,
            "title_length": self.title_length,
            "meta_description": self.meta_description,
            "meta_description_length": self.meta_description_length,
            "canonical_url": self.canonical_url,
            "robots_meta": self.robots_meta,
            "html_lang": self.html_lang,
            "hreflang_tags": self.hreflang_tags,
            "h1_count": self.h1_count,
            "h1_text": self.h1_text,
            "headings_count": len(self.headings),
            "schema_types_found": self.schema_types_found,
            "internal_link_count": self.internal_link_count,
            "external_link_count": self.external_link_count,
            "images_total": self.images_total,
            "images_without_alt": self.images_without_alt,
            "word_count": self.word_count,
            "issues": self.issues,
            "warnings": self.warnings,
            "open_graph": {
                "og_title": self.open_graph.og_title,
                "og_description": self.open_graph.og_description,
                "og_image": self.open_graph.og_image,
                "og_url": self.open_graph.og_url,
                "og_type": self.open_graph.og_type,
            },
        }

    def get_summary(self) -> str:
        """Get a brief summary of the page analysis.

        Returns a multi-line human-readable string; the title is truncated
        to 50 characters with an ellipsis.
        """
        lines = [
            f"URL: {self.url}",
            f"Status: {self.status_code}",
            f"Title: {self.title[:50] + '...' if self.title and len(self.title) > 50 else self.title}",
            f"Description: {'✓' if self.meta_description else '✗ Missing'}",
            f"Canonical: {'✓' if self.canonical_url else '✗ Missing'}",
            f"H1: {self.h1_count} found",
            f"Schema: {', '.join(self.schema_types_found) if self.schema_types_found else 'None'}",
            f"Links: {self.internal_link_count} internal, {self.external_link_count} external",
            f"Images: {self.images_total} total, {self.images_without_alt} without alt",
        ]
        if self.issues:
            lines.append(f"Issues: {len(self.issues)}")
        return "\n".join(lines)
||||
|
||||
|
||||
class PageAnalyzer:
    """Analyze web pages for SEO metadata.

    Fetches a URL through a persistent requests.Session and runs a fixed
    pipeline of extractors (meta tags, headings, Open Graph, schema.org,
    links, images, content metrics), then applies rule-based SEO checks.
    """

    # Identifies this crawler to servers; can be overridden per instance.
    DEFAULT_USER_AGENT = "Mozilla/5.0 (compatible; OurDigitalSEOBot/1.0; +https://ourdigital.org)"

    def __init__(
        self,
        user_agent: str | None = None,
        timeout: int = 30,
    ):
        """
        Initialize page analyzer.

        Args:
            user_agent: Custom user agent string
            timeout: Request timeout in seconds
        """
        self.user_agent = user_agent or self.DEFAULT_USER_AGENT
        self.timeout = timeout
        # One session for all requests: connection reuse + shared headers.
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": self.user_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9,ko;q=0.8",
        })

    def analyze_url(self, url: str) -> PageMetadata:
        """
        Analyze a URL and extract SEO metadata.

        Never raises: request or parse failures are appended to
        ``metadata.issues`` and a partially filled object is returned.

        Args:
            url: URL to analyze

        Returns:
            PageMetadata object with all extracted data
        """
        metadata = PageMetadata(url=url)

        try:
            # Fetch page
            # NOTE(review): wall-clock timing via datetime; time.perf_counter()
            # would be immune to system clock adjustments.
            start_time = datetime.now()
            response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
            metadata.response_time_ms = (datetime.now() - start_time).total_seconds() * 1000
            metadata.status_code = response.status_code
            metadata.content_type = response.headers.get("Content-Type", "")

            if response.status_code != 200:
                metadata.issues.append(f"HTTP {response.status_code} status")
                # 4xx/5xx: nothing meaningful to parse. Non-200 2xx continues.
                if response.status_code >= 400:
                    return metadata

            # Parse HTML
            soup = BeautifulSoup(response.text, "html.parser")
            # NOTE(review): uses the *requested* URL as the base for resolving
            # relative links; after redirects response.url may differ -- confirm
            # whether the final URL should be used instead.
            base_url = url

            # Extract all metadata
            self._extract_basic_meta(soup, metadata)
            self._extract_canonical(soup, metadata, base_url)
            self._extract_robots_meta(soup, metadata)
            self._extract_hreflang(soup, metadata)
            self._extract_headings(soup, metadata)
            self._extract_open_graph(soup, metadata)
            self._extract_schema(soup, metadata)
            self._extract_links(soup, metadata, base_url)
            self._extract_images(soup, metadata)
            # Must stay last: it mutates the soup (decomposes script/style).
            self._extract_content_metrics(soup, metadata)

            # Run SEO checks
            self._run_seo_checks(metadata)

        except requests.RequestException as e:
            metadata.issues.append(f"Request failed: {str(e)}")
            logger.error(f"Failed to analyze {url}: {e}")
        except Exception as e:
            # Catch-all so one bad page never crashes a batch crawl;
            # the error is preserved on the result.
            metadata.issues.append(f"Analysis error: {str(e)}")
            logger.error(f"Error analyzing {url}: {e}")

        return metadata

    def _extract_basic_meta(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract title and meta description.

        Also captures the <html lang> attribute.
        """
        # Title
        # NOTE(review): .string is None when <title> contains nested markup;
        # such titles are treated as missing -- confirm this is acceptable.
        title_tag = soup.find("title")
        if title_tag and title_tag.string:
            metadata.title = title_tag.string.strip()
            metadata.title_length = len(metadata.title)

        # Meta description
        # Case-insensitive name match tolerates name="Description" variants.
        desc_tag = soup.find("meta", attrs={"name": re.compile(r"^description$", re.I)})
        if desc_tag and desc_tag.get("content"):
            metadata.meta_description = desc_tag["content"].strip()
            metadata.meta_description_length = len(metadata.meta_description)

        # HTML lang
        html_tag = soup.find("html")
        if html_tag and html_tag.get("lang"):
            metadata.html_lang = html_tag["lang"]

    def _extract_canonical(self, soup: BeautifulSoup, metadata: PageMetadata, base_url: str) -> None:
        """Extract canonical URL, resolved to absolute against base_url."""
        canonical = soup.find("link", rel="canonical")
        if canonical and canonical.get("href"):
            metadata.canonical_url = urljoin(base_url, canonical["href"])

    def _extract_robots_meta(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract robots meta tag.

        A googlebot-specific tag, if present, is appended to (or used as)
        the stored robots string with a "googlebot:" prefix.
        """
        robots = soup.find("meta", attrs={"name": re.compile(r"^robots$", re.I)})
        if robots and robots.get("content"):
            metadata.robots_meta = robots["content"]

        # Also check for googlebot-specific
        googlebot = soup.find("meta", attrs={"name": re.compile(r"^googlebot$", re.I)})
        if googlebot and googlebot.get("content"):
            if metadata.robots_meta:
                metadata.robots_meta += f" | googlebot: {googlebot['content']}"
            else:
                metadata.robots_meta = f"googlebot: {googlebot['content']}"

    def _extract_hreflang(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract hreflang tags into metadata.hreflang_tags."""
        hreflang_tags = soup.find_all("link", rel="alternate", hreflang=True)
        for tag in hreflang_tags:
            if tag.get("href") and tag.get("hreflang"):
                metadata.hreflang_tags.append({
                    "lang": tag["hreflang"],
                    "url": tag["href"]
                })

    def _extract_headings(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract all headings (h1-h6) and record H1 stats.

        Empty headings are skipped; h1_text holds only the first H1.
        """
        for level in range(1, 7):
            for heading in soup.find_all(f"h{level}"):
                text = heading.get_text(strip=True)
                if text:
                    metadata.headings.append(HeadingData(level=level, text=text))

        # Count H1s specifically
        h1_tags = soup.find_all("h1")
        metadata.h1_count = len(h1_tags)
        if h1_tags:
            metadata.h1_text = h1_tags[0].get_text(strip=True)

    def _extract_open_graph(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract Open Graph and Twitter Card data.

        OG tags use the ``property`` attribute; Twitter tags use ``name``.
        Both are written onto metadata.open_graph via setattr.
        """
        og = metadata.open_graph

        # Open Graph tags
        og_mappings = {
            "og:title": "og_title",
            "og:description": "og_description",
            "og:image": "og_image",
            "og:url": "og_url",
            "og:type": "og_type",
            "og:site_name": "og_site_name",
            "og:locale": "og_locale",
        }

        for og_prop, attr_name in og_mappings.items():
            tag = soup.find("meta", property=og_prop)
            if tag and tag.get("content"):
                setattr(og, attr_name, tag["content"])

        # Twitter Card tags
        twitter_mappings = {
            "twitter:card": "twitter_card",
            "twitter:title": "twitter_title",
            "twitter:description": "twitter_description",
            "twitter:image": "twitter_image",
        }

        for tw_name, attr_name in twitter_mappings.items():
            tag = soup.find("meta", attrs={"name": tw_name})
            if tag and tag.get("content"):
                setattr(og, attr_name, tag["content"])

    def _extract_schema(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract schema.org structured data.

        Fully parses JSON-LD blocks (malformed JSON is skipped silently);
        microdata gets type-name detection only, with empty properties.
        """
        # JSON-LD
        for script in soup.find_all("script", type="application/ld+json"):
            try:
                data = json.loads(script.string)
                if isinstance(data, list):
                    for item in data:
                        self._process_schema_item(item, metadata, "json-ld")
                else:
                    self._process_schema_item(data, metadata, "json-ld")
            except (json.JSONDecodeError, TypeError):
                # TypeError covers script.string being None (empty tag).
                continue

        # Microdata (basic detection)
        for item in soup.find_all(itemscope=True):
            itemtype = item.get("itemtype", "")
            if itemtype:
                # itemtype is a URL; keep only the trailing type name.
                schema_type = itemtype.split("/")[-1]
                if schema_type not in metadata.schema_types_found:
                    metadata.schema_types_found.append(schema_type)
                metadata.schema_data.append(SchemaData(
                    schema_type=schema_type,
                    properties={},
                    format="microdata"
                ))

    def _process_schema_item(self, data: dict, metadata: PageMetadata, format_type: str) -> None:
        """Process a single schema.org item, recursing into @graph.

        Non-dict inputs are ignored; list-valued @type uses its first entry.
        """
        if not isinstance(data, dict):
            return

        schema_type = data.get("@type", "Unknown")
        if isinstance(schema_type, list):
            schema_type = schema_type[0] if schema_type else "Unknown"

        if schema_type not in metadata.schema_types_found:
            metadata.schema_types_found.append(schema_type)

        metadata.schema_data.append(SchemaData(
            schema_type=schema_type,
            properties=data,
            format=format_type
        ))

        # Process nested @graph items
        if "@graph" in data:
            for item in data["@graph"]:
                self._process_schema_item(item, metadata, format_type)

    def _extract_links(self, soup: BeautifulSoup, metadata: PageMetadata, base_url: str) -> None:
        """Extract internal and external links.

        Fragment, javascript:, mailto: and tel: links are skipped.
        A link counts as internal when its host equals the page host or
        either host is a subdomain of the other.
        """
        parsed_base = urlparse(base_url)
        base_domain = parsed_base.netloc.lower()

        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]

            # Skip non-http links
            if href.startswith(("#", "javascript:", "mailto:", "tel:")):
                continue

            # Resolve relative URLs
            full_url = urljoin(base_url, href)
            parsed_url = urlparse(full_url)

            # Get anchor text
            anchor_text = a_tag.get_text(strip=True)[:100]  # Limit length

            # Check if nofollow
            # rel may be a list (default bs4) or a string; normalize to list.
            rel = a_tag.get("rel", [])
            if isinstance(rel, str):
                rel = rel.split()
            is_nofollow = "nofollow" in rel

            # Determine if internal or external
            link_domain = parsed_url.netloc.lower()
            is_internal = (
                link_domain == base_domain or
                link_domain.endswith(f".{base_domain}") or
                base_domain.endswith(f".{link_domain}")
            )

            link_data = LinkData(
                url=full_url,
                anchor_text=anchor_text,
                is_internal=is_internal,
                is_nofollow=is_nofollow,
            )

            if is_internal:
                metadata.internal_links.append(link_data)
            else:
                metadata.external_links.append(link_data)

        metadata.internal_link_count = len(metadata.internal_links)
        metadata.external_link_count = len(metadata.external_links)

    def _extract_images(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract image information (total and alt-text coverage).

        A whitespace-only alt attribute counts as missing.
        """
        images = soup.find_all("img")
        metadata.images_total = len(images)

        for img in images:
            alt = img.get("alt", "").strip()
            if alt:
                metadata.images_with_alt += 1
            else:
                metadata.images_without_alt += 1

    def _extract_content_metrics(self, soup: BeautifulSoup, metadata: PageMetadata) -> None:
        """Extract content metrics like word count.

        DESTRUCTIVE: decomposes script/style/noscript nodes in the soup,
        so callers must run this after all other extractors.
        """
        # Remove script and style elements
        for element in soup(["script", "style", "noscript"]):
            element.decompose()

        # Get text content
        text = soup.get_text(separator=" ", strip=True)
        words = text.split()
        metadata.word_count = len(words)

    def _run_seo_checks(self, metadata: PageMetadata) -> None:
        """Run SEO checks and add issues/warnings.

        Hard failures (missing title/description/H1, noindex) go to
        ``issues``; advisory findings go to ``warnings``.
        """
        # Title checks
        if not metadata.title:
            metadata.issues.append("Missing title tag")
        elif metadata.title_length < 30:
            metadata.warnings.append(f"Title too short ({metadata.title_length} chars, recommend 50-60)")
        elif metadata.title_length > 60:
            metadata.warnings.append(f"Title too long ({metadata.title_length} chars, recommend 50-60)")

        # Meta description checks
        if not metadata.meta_description:
            metadata.issues.append("Missing meta description")
        elif metadata.meta_description_length < 120:
            metadata.warnings.append(f"Meta description too short ({metadata.meta_description_length} chars)")
        elif metadata.meta_description_length > 160:
            metadata.warnings.append(f"Meta description too long ({metadata.meta_description_length} chars)")

        # Canonical check
        if not metadata.canonical_url:
            metadata.warnings.append("Missing canonical tag")
        elif metadata.canonical_url != metadata.url:
            # Exact string comparison: trailing-slash or scheme differences
            # will be flagged even when they point at the same resource.
            metadata.warnings.append(f"Canonical points to different URL: {metadata.canonical_url}")

        # H1 checks
        if metadata.h1_count == 0:
            metadata.issues.append("Missing H1 tag")
        elif metadata.h1_count > 1:
            metadata.warnings.append(f"Multiple H1 tags ({metadata.h1_count})")

        # Image alt check
        if metadata.images_without_alt > 0:
            metadata.warnings.append(f"{metadata.images_without_alt} images missing alt text")

        # Schema check
        if not metadata.schema_types_found:
            metadata.warnings.append("No structured data found")

        # Open Graph check
        if not metadata.open_graph.og_title:
            metadata.warnings.append("Missing Open Graph tags")

        # Robots meta check
        if metadata.robots_meta:
            robots_lower = metadata.robots_meta.lower()
            if "noindex" in robots_lower:
                metadata.issues.append("Page is set to noindex")
            if "nofollow" in robots_lower:
                metadata.warnings.append("Page is set to nofollow")
|
||||
|
||||
|
||||
def main():
    """CLI entry point for testing.

    Usage: python page_analyzer.py <url> [--json]
    Prints a human-readable report by default, or the full metadata
    dict as JSON with --json.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Page SEO Analyzer")
    parser.add_argument("url", help="URL to analyze")
    parser.add_argument("--json", "-j", action="store_true", help="Output as JSON")

    args = parser.parse_args()

    analyzer = PageAnalyzer()
    metadata = analyzer.analyze_url(args.url)

    if args.json:
        # ensure_ascii=False keeps non-Latin (e.g. Korean) text readable.
        print(json.dumps(metadata.to_dict(), indent=2, ensure_ascii=False))
    else:
        print("=" * 60)
        print("PAGE ANALYSIS REPORT")
        print("=" * 60)
        print(metadata.get_summary())
        print()

        if metadata.issues:
            print("ISSUES:")
            for issue in metadata.issues:
                print(f" ✗ {issue}")

        if metadata.warnings:
            print("\nWARNINGS:")
            for warning in metadata.warnings:
                print(f" ⚠ {warning}")

        # Only the first five hreflang tags are shown to keep output short.
        if metadata.hreflang_tags:
            print(f"\nHREFLANG TAGS ({len(metadata.hreflang_tags)}):")
            for tag in metadata.hreflang_tags[:5]:
                print(f" {tag['lang']}: {tag['url']}")

        if metadata.schema_types_found:
            print(f"\nSCHEMA TYPES:")
            for schema_type in metadata.schema_types_found:
                print(f" - {schema_type}")
|
||||
@@ -0,0 +1,452 @@
|
||||
"""
|
||||
PageSpeed Insights Client
|
||||
=========================
|
||||
Purpose: Get Core Web Vitals and performance data from PageSpeed Insights API
|
||||
Python: 3.10+
|
||||
Usage:
|
||||
from pagespeed_client import PageSpeedClient
|
||||
client = PageSpeedClient()
|
||||
result = client.analyze("https://example.com")
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
|
||||
from base_client import config
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class CoreWebVitals:
|
||||
"""Core Web Vitals metrics."""
|
||||
|
||||
lcp: float | None = None # Largest Contentful Paint (ms)
|
||||
fid: float | None = None # First Input Delay (ms)
|
||||
cls: float | None = None # Cumulative Layout Shift
|
||||
inp: float | None = None # Interaction to Next Paint (ms)
|
||||
ttfb: float | None = None # Time to First Byte (ms)
|
||||
fcp: float | None = None # First Contentful Paint (ms)
|
||||
|
||||
# Assessment (GOOD, NEEDS_IMPROVEMENT, POOR)
|
||||
lcp_rating: str | None = None
|
||||
fid_rating: str | None = None
|
||||
cls_rating: str | None = None
|
||||
inp_rating: str | None = None
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"lcp": {"value": self.lcp, "rating": self.lcp_rating},
|
||||
"fid": {"value": self.fid, "rating": self.fid_rating},
|
||||
"cls": {"value": self.cls, "rating": self.cls_rating},
|
||||
"inp": {"value": self.inp, "rating": self.inp_rating},
|
||||
"ttfb": {"value": self.ttfb},
|
||||
"fcp": {"value": self.fcp},
|
||||
}
|
||||
|
||||
|
||||
@dataclass
class PageSpeedResult:
    """PageSpeed analysis result for one URL/strategy pair."""

    url: str
    strategy: str  # mobile or desktop
    performance_score: float | None = None  # 0-100, None when category absent
    seo_score: float | None = None  # 0-100
    accessibility_score: float | None = None  # 0-100
    best_practices_score: float | None = None  # 0-100
    core_web_vitals: CoreWebVitals = field(default_factory=CoreWebVitals)
    opportunities: list[dict] = field(default_factory=list)  # sorted by savings_ms desc
    diagnostics: list[dict] = field(default_factory=list)
    passed_audits: list[str] = field(default_factory=list)  # audit titles with score == 1
    raw_data: dict = field(default_factory=dict)  # full API response, kept for debugging

    def to_dict(self) -> dict:
        """Serialize a summary view.

        Opportunities are truncated to the top 10; diagnostics and passed
        audits are reduced to counts. raw_data is deliberately excluded.
        """
        return {
            "url": self.url,
            "strategy": self.strategy,
            "scores": {
                "performance": self.performance_score,
                "seo": self.seo_score,
                "accessibility": self.accessibility_score,
                "best_practices": self.best_practices_score,
            },
            "core_web_vitals": self.core_web_vitals.to_dict(),
            "opportunities_count": len(self.opportunities),
            "opportunities": self.opportunities[:10],
            "diagnostics_count": len(self.diagnostics),
            "passed_audits_count": len(self.passed_audits),
        }
|
||||
|
||||
|
||||
class PageSpeedClient:
|
||||
"""Client for PageSpeed Insights API."""
|
||||
|
||||
BASE_URL = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"
|
||||
|
||||
# Core Web Vitals thresholds
|
||||
THRESHOLDS = {
|
||||
"lcp": {"good": 2500, "poor": 4000},
|
||||
"fid": {"good": 100, "poor": 300},
|
||||
"cls": {"good": 0.1, "poor": 0.25},
|
||||
"inp": {"good": 200, "poor": 500},
|
||||
"ttfb": {"good": 800, "poor": 1800},
|
||||
"fcp": {"good": 1800, "poor": 3000},
|
||||
}
|
||||
|
||||
    def __init__(self, api_key: str | None = None):
        """
        Initialize PageSpeed client.

        Args:
            api_key: PageSpeed API key (optional but recommended for higher quotas)
        """
        # Fall back to the shared config object from base_client when no
        # key is passed explicitly. The API works unauthenticated, just
        # with a much lower quota.
        self.api_key = api_key or config.pagespeed_api_key
        # Persistent session for connection reuse across analyze() calls.
        self.session = requests.Session()
||||
|
||||
def _rate_metric(self, metric: str, value: float | None) -> str | None:
|
||||
"""Rate a metric against thresholds."""
|
||||
if value is None:
|
||||
return None
|
||||
|
||||
thresholds = self.THRESHOLDS.get(metric)
|
||||
if not thresholds:
|
||||
return None
|
||||
|
||||
if value <= thresholds["good"]:
|
||||
return "GOOD"
|
||||
elif value <= thresholds["poor"]:
|
||||
return "NEEDS_IMPROVEMENT"
|
||||
else:
|
||||
return "POOR"
|
||||
|
||||
def analyze(
|
||||
self,
|
||||
url: str,
|
||||
strategy: str = "mobile",
|
||||
categories: list[str] | None = None,
|
||||
) -> PageSpeedResult:
|
||||
"""
|
||||
Analyze a URL with PageSpeed Insights.
|
||||
|
||||
Args:
|
||||
url: URL to analyze
|
||||
strategy: "mobile" or "desktop"
|
||||
categories: Categories to analyze (performance, seo, accessibility, best-practices)
|
||||
|
||||
Returns:
|
||||
PageSpeedResult with scores and metrics
|
||||
"""
|
||||
if categories is None:
|
||||
categories = ["performance", "seo", "accessibility", "best-practices"]
|
||||
|
||||
params = {
|
||||
"url": url,
|
||||
"strategy": strategy,
|
||||
"category": categories,
|
||||
}
|
||||
|
||||
if self.api_key:
|
||||
params["key"] = self.api_key
|
||||
|
||||
try:
|
||||
response = self.session.get(self.BASE_URL, params=params, timeout=60)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except requests.RequestException as e:
|
||||
logger.error(f"PageSpeed API request failed: {e}")
|
||||
raise
|
||||
|
||||
result = PageSpeedResult(url=url, strategy=strategy, raw_data=data)
|
||||
|
||||
# Extract scores
|
||||
lighthouse = data.get("lighthouseResult", {})
|
||||
categories_data = lighthouse.get("categories", {})
|
||||
|
||||
if "performance" in categories_data:
|
||||
score = categories_data["performance"].get("score")
|
||||
result.performance_score = score * 100 if score else None
|
||||
|
||||
if "seo" in categories_data:
|
||||
score = categories_data["seo"].get("score")
|
||||
result.seo_score = score * 100 if score else None
|
||||
|
||||
if "accessibility" in categories_data:
|
||||
score = categories_data["accessibility"].get("score")
|
||||
result.accessibility_score = score * 100 if score else None
|
||||
|
||||
if "best-practices" in categories_data:
|
||||
score = categories_data["best-practices"].get("score")
|
||||
result.best_practices_score = score * 100 if score else None
|
||||
|
||||
# Extract Core Web Vitals
|
||||
audits = lighthouse.get("audits", {})
|
||||
|
||||
# Lab data
|
||||
cwv = result.core_web_vitals
|
||||
|
||||
if "largest-contentful-paint" in audits:
|
||||
cwv.lcp = audits["largest-contentful-paint"].get("numericValue")
|
||||
cwv.lcp_rating = self._rate_metric("lcp", cwv.lcp)
|
||||
|
||||
if "total-blocking-time" in audits:
|
||||
# TBT is proxy for FID in lab data
|
||||
cwv.fid = audits["total-blocking-time"].get("numericValue")
|
||||
cwv.fid_rating = self._rate_metric("fid", cwv.fid)
|
||||
|
||||
if "cumulative-layout-shift" in audits:
|
||||
cwv.cls = audits["cumulative-layout-shift"].get("numericValue")
|
||||
cwv.cls_rating = self._rate_metric("cls", cwv.cls)
|
||||
|
||||
if "experimental-interaction-to-next-paint" in audits:
|
||||
cwv.inp = audits["experimental-interaction-to-next-paint"].get("numericValue")
|
||||
cwv.inp_rating = self._rate_metric("inp", cwv.inp)
|
||||
|
||||
if "server-response-time" in audits:
|
||||
cwv.ttfb = audits["server-response-time"].get("numericValue")
|
||||
|
||||
if "first-contentful-paint" in audits:
|
||||
cwv.fcp = audits["first-contentful-paint"].get("numericValue")
|
||||
|
||||
# Field data (real user data) if available
|
||||
loading_exp = data.get("loadingExperience", {})
|
||||
metrics = loading_exp.get("metrics", {})
|
||||
|
||||
if "LARGEST_CONTENTFUL_PAINT_MS" in metrics:
|
||||
cwv.lcp = metrics["LARGEST_CONTENTFUL_PAINT_MS"].get("percentile")
|
||||
cwv.lcp_rating = metrics["LARGEST_CONTENTFUL_PAINT_MS"].get("category")
|
||||
|
||||
if "FIRST_INPUT_DELAY_MS" in metrics:
|
||||
cwv.fid = metrics["FIRST_INPUT_DELAY_MS"].get("percentile")
|
||||
cwv.fid_rating = metrics["FIRST_INPUT_DELAY_MS"].get("category")
|
||||
|
||||
if "CUMULATIVE_LAYOUT_SHIFT_SCORE" in metrics:
|
||||
cwv.cls = metrics["CUMULATIVE_LAYOUT_SHIFT_SCORE"].get("percentile") / 100
|
||||
cwv.cls_rating = metrics["CUMULATIVE_LAYOUT_SHIFT_SCORE"].get("category")
|
||||
|
||||
if "INTERACTION_TO_NEXT_PAINT" in metrics:
|
||||
cwv.inp = metrics["INTERACTION_TO_NEXT_PAINT"].get("percentile")
|
||||
cwv.inp_rating = metrics["INTERACTION_TO_NEXT_PAINT"].get("category")
|
||||
|
||||
# Extract opportunities
|
||||
for audit_id, audit in audits.items():
|
||||
if audit.get("details", {}).get("type") == "opportunity":
|
||||
savings = audit.get("details", {}).get("overallSavingsMs", 0)
|
||||
if savings > 0:
|
||||
result.opportunities.append({
|
||||
"id": audit_id,
|
||||
"title": audit.get("title", ""),
|
||||
"description": audit.get("description", ""),
|
||||
"savings_ms": savings,
|
||||
"score": audit.get("score", 0),
|
||||
})
|
||||
|
||||
# Sort opportunities by savings
|
||||
result.opportunities.sort(key=lambda x: x["savings_ms"], reverse=True)
|
||||
|
||||
# Extract diagnostics
|
||||
for audit_id, audit in audits.items():
|
||||
score = audit.get("score")
|
||||
if score is not None and score < 1 and audit.get("details"):
|
||||
if audit.get("details", {}).get("type") not in ["opportunity", None]:
|
||||
result.diagnostics.append({
|
||||
"id": audit_id,
|
||||
"title": audit.get("title", ""),
|
||||
"description": audit.get("description", ""),
|
||||
"score": score,
|
||||
})
|
||||
|
||||
# Extract passed audits
|
||||
for audit_id, audit in audits.items():
|
||||
if audit.get("score") == 1:
|
||||
result.passed_audits.append(audit.get("title", audit_id))
|
||||
|
||||
return result
|
||||
|
||||
def analyze_both_strategies(self, url: str) -> dict:
    """Analyze *url* with both the mobile and desktop strategies.

    Returns a dict holding the two raw reports plus a small comparison
    section: the desktop-minus-mobile performance delta and any
    mobile-specific issues flagged by ``_identify_mobile_issues``.
    """
    # Mobile first, then desktop — same order as two sequential calls.
    results = {
        strategy: self.analyze(url, strategy=strategy)
        for strategy in ("mobile", "desktop")
    }
    mobile_result = results["mobile"]
    desktop_result = results["desktop"]

    # Missing scores count as 0 for the delta computation.
    performance_delta = (desktop_result.performance_score or 0) - (
        mobile_result.performance_score or 0
    )

    return {
        "url": url,
        "mobile": mobile_result.to_dict(),
        "desktop": desktop_result.to_dict(),
        "comparison": {
            "performance_difference": performance_delta,
            "mobile_first_issues": self._identify_mobile_issues(
                mobile_result, desktop_result
            ),
        },
    }
|
||||
|
||||
def _identify_mobile_issues(
|
||||
self,
|
||||
mobile: PageSpeedResult,
|
||||
desktop: PageSpeedResult,
|
||||
) -> list[str]:
|
||||
"""Identify issues that affect mobile more than desktop."""
|
||||
issues = []
|
||||
|
||||
if mobile.performance_score and desktop.performance_score:
|
||||
if desktop.performance_score - mobile.performance_score > 20:
|
||||
issues.append("Significant performance gap between mobile and desktop")
|
||||
|
||||
m_cwv = mobile.core_web_vitals
|
||||
d_cwv = desktop.core_web_vitals
|
||||
|
||||
if m_cwv.lcp and d_cwv.lcp and m_cwv.lcp > d_cwv.lcp * 1.5:
|
||||
issues.append("LCP significantly slower on mobile")
|
||||
|
||||
if m_cwv.cls and d_cwv.cls and m_cwv.cls > d_cwv.cls * 2:
|
||||
issues.append("Layout shift issues more severe on mobile")
|
||||
|
||||
return issues
|
||||
|
||||
def get_cwv_summary(self, url: str) -> dict:
    """Get a summary focused on Core Web Vitals.

    Runs a mobile-strategy analysis and condenses it to the four CWV
    metrics (LCP/FID/CLS/INP) with their field ratings, Google's
    published good/poor thresholds, and the top five opportunities.

    Bug fix: metric values of exactly 0 (e.g. a perfect CLS of 0.0 or an
    FID of 0 ms) were falsy and rendered as None; checks now use
    ``is not None`` so 0 is reported as a real value.
    """
    result = self.analyze(url, strategy="mobile")

    cwv = result.core_web_vitals

    return {
        "url": url,
        "overall_cwv_status": self._overall_cwv_status(cwv),
        "metrics": {
            "lcp": {
                # LCP arrives in milliseconds; display in seconds.
                "value": f"{cwv.lcp / 1000:.2f}s" if cwv.lcp is not None else None,
                "rating": cwv.lcp_rating,
                "threshold": "≤ 2.5s good, > 4.0s poor",
            },
            "fid": {
                "value": f"{cwv.fid:.0f}ms" if cwv.fid is not None else None,
                "rating": cwv.fid_rating,
                "threshold": "≤ 100ms good, > 300ms poor",
            },
            "cls": {
                "value": f"{cwv.cls:.3f}" if cwv.cls is not None else None,
                "rating": cwv.cls_rating,
                "threshold": "≤ 0.1 good, > 0.25 poor",
            },
            "inp": {
                "value": f"{cwv.inp:.0f}ms" if cwv.inp is not None else None,
                "rating": cwv.inp_rating,
                "threshold": "≤ 200ms good, > 500ms poor",
            },
        },
        "top_opportunities": result.opportunities[:5],
    }
|
||||
|
||||
def _overall_cwv_status(self, cwv: CoreWebVitals) -> str:
|
||||
"""Determine overall Core Web Vitals status."""
|
||||
ratings = [cwv.lcp_rating, cwv.fid_rating, cwv.cls_rating]
|
||||
ratings = [r for r in ratings if r]
|
||||
|
||||
if not ratings:
|
||||
return "UNKNOWN"
|
||||
|
||||
if any(r == "POOR" for r in ratings):
|
||||
return "POOR"
|
||||
if any(r == "NEEDS_IMPROVEMENT" for r in ratings):
|
||||
return "NEEDS_IMPROVEMENT"
|
||||
return "GOOD"
|
||||
|
||||
def generate_report(self, result: PageSpeedResult) -> str:
|
||||
"""Generate human-readable performance report."""
|
||||
lines = [
|
||||
"=" * 60,
|
||||
"PageSpeed Insights Report",
|
||||
"=" * 60,
|
||||
f"URL: {result.url}",
|
||||
f"Strategy: {result.strategy}",
|
||||
"",
|
||||
"Scores:",
|
||||
f" Performance: {result.performance_score:.0f}/100" if result.performance_score else " Performance: N/A",
|
||||
f" SEO: {result.seo_score:.0f}/100" if result.seo_score else " SEO: N/A",
|
||||
f" Accessibility: {result.accessibility_score:.0f}/100" if result.accessibility_score else " Accessibility: N/A",
|
||||
f" Best Practices: {result.best_practices_score:.0f}/100" if result.best_practices_score else " Best Practices: N/A",
|
||||
"",
|
||||
"Core Web Vitals:",
|
||||
]
|
||||
|
||||
cwv = result.core_web_vitals
|
||||
|
||||
def format_metric(name: str, value: Any, rating: str | None, unit: str) -> str:
|
||||
if value is None:
|
||||
return f" {name}: N/A"
|
||||
rating_str = f" ({rating})" if rating else ""
|
||||
return f" {name}: {value}{unit}{rating_str}"
|
||||
|
||||
lines.append(format_metric("LCP", f"{cwv.lcp / 1000:.2f}" if cwv.lcp else None, cwv.lcp_rating, "s"))
|
||||
lines.append(format_metric("FID/TBT", f"{cwv.fid:.0f}" if cwv.fid else None, cwv.fid_rating, "ms"))
|
||||
lines.append(format_metric("CLS", f"{cwv.cls:.3f}" if cwv.cls else None, cwv.cls_rating, ""))
|
||||
lines.append(format_metric("INP", f"{cwv.inp:.0f}" if cwv.inp else None, cwv.inp_rating, "ms"))
|
||||
lines.append(format_metric("TTFB", f"{cwv.ttfb:.0f}" if cwv.ttfb else None, None, "ms"))
|
||||
lines.append(format_metric("FCP", f"{cwv.fcp / 1000:.2f}" if cwv.fcp else None, None, "s"))
|
||||
|
||||
if result.opportunities:
|
||||
lines.extend([
|
||||
"",
|
||||
f"Top Opportunities ({len(result.opportunities)} total):",
|
||||
])
|
||||
for opp in result.opportunities[:5]:
|
||||
savings = opp["savings_ms"]
|
||||
lines.append(f" - {opp['title']}: -{savings / 1000:.1f}s potential savings")
|
||||
|
||||
lines.extend(["", "=" * 60])
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def main():
    """CLI entry point for the PageSpeed client.

    Modes:
      --cwv-only          print a Core Web Vitals JSON summary
      --strategy both     mobile + desktop comparison as JSON
      otherwise           single-strategy analysis; JSON with --json or
                          --output, else a human-readable report.
    """
    parser = argparse.ArgumentParser(description="PageSpeed Insights Client")
    parser.add_argument("--url", "-u", required=True, help="URL to analyze")
    parser.add_argument("--strategy", "-s", default="mobile",
                        choices=["mobile", "desktop", "both"],
                        help="Analysis strategy")
    parser.add_argument("--output", "-o", help="Output file for JSON")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument("--cwv-only", action="store_true",
                        help="Show only Core Web Vitals summary")

    args = parser.parse_args()

    client = PageSpeedClient()

    if args.cwv_only:
        summary = client.get_cwv_summary(args.url)
        print(json.dumps(summary, indent=2))
    elif args.strategy == "both":
        _emit_json(client.analyze_both_strategies(args.url), args.output)
    else:
        result = client.analyze(args.url, strategy=args.strategy)

        if args.json or args.output:
            _emit_json(result.to_dict(), args.output)
        else:
            print(client.generate_report(result))


def _emit_json(payload: dict, output_path) -> None:
    """Write *payload* as indented JSON to *output_path*, or stdout when None."""
    output = json.dumps(payload, indent=2)
    if output_path:
        # Bug fix: encoding pinned to UTF-8 so reports are portable across
        # platforms (previously the locale default encoding was used).
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(output)
    else:
        print(output)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,40 @@
|
||||
# OurDigital SEO Audit - Python Dependencies
|
||||
# Install with: pip install -r requirements.txt
|
||||
|
||||
# Google APIs
|
||||
google-api-python-client>=2.100.0
|
||||
google-auth>=2.23.0
|
||||
google-auth-oauthlib>=1.1.0
|
||||
google-auth-httplib2>=0.1.1
|
||||
google-analytics-data>=0.18.0
|
||||
|
||||
# Notion API
|
||||
notion-client>=2.0.0
|
||||
|
||||
# Web Scraping & Parsing
|
||||
lxml>=5.1.0
|
||||
beautifulsoup4>=4.12.0
|
||||
extruct>=0.16.0
|
||||
requests>=2.31.0
|
||||
aiohttp>=3.9.0
|
||||
|
||||
# Schema Validation
|
||||
jsonschema>=4.21.0
|
||||
rdflib>=7.0.0
|
||||
|
||||
# Google Trends
|
||||
pytrends>=4.9.2
|
||||
|
||||
# Data Processing
|
||||
pandas>=2.1.0
|
||||
|
||||
# Async & Retry
|
||||
tenacity>=8.2.0
|
||||
tqdm>=4.66.0
|
||||
|
||||
# Environment
|
||||
python-dotenv>=1.0.0
|
||||
|
||||
# Logging & CLI
|
||||
rich>=13.7.0
|
||||
typer>=0.9.0
|
||||
@@ -0,0 +1,540 @@
|
||||
"""
|
||||
Robots.txt Checker - Analyze robots.txt configuration
|
||||
=====================================================
|
||||
Purpose: Parse and analyze robots.txt for SEO compliance
|
||||
Python: 3.10+
|
||||
Usage:
|
||||
python robots_checker.py --url https://example.com/robots.txt
|
||||
python robots_checker.py --url https://example.com --test-url /admin/
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from urllib.robotparser import RobotFileParser
|
||||
|
||||
import requests
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class RobotsIssue:
    """Represents a robots.txt issue found during analysis."""

    severity: str  # "error", "warning", "info"
    message: str  # human-readable description of the problem
    line_number: int | None = None  # 1-based line in robots.txt, when known
    directive: str | None = None  # offending directive text, when applicable
    suggestion: str | None = None  # recommended remediation, when applicable
|
||||
|
||||
|
||||
@dataclass
class UserAgentRules:
    """Rules for a specific user-agent group in robots.txt."""

    user_agent: str  # the User-agent token this group applies to
    disallow: list[str] = field(default_factory=list)  # non-empty Disallow: paths
    allow: list[str] = field(default_factory=list)  # non-empty Allow: paths
    crawl_delay: float | None = None  # Crawl-delay in seconds, when declared
|
||||
|
||||
|
||||
@dataclass
class RobotsResult:
    """Complete robots.txt analysis result."""

    url: str  # URL (or domain) that was analyzed
    accessible: bool = True  # False when the fetch failed or returned 404
    content: str = ""  # raw robots.txt body
    rules: "list[UserAgentRules]" = field(default_factory=list)
    sitemaps: list[str] = field(default_factory=list)  # Sitemap: URLs found
    issues: "list[RobotsIssue]" = field(default_factory=list)
    stats: dict = field(default_factory=dict)  # summary counters
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON output (content is omitted)."""

        def rule_entry(rule) -> dict:
            return {
                "user_agent": rule.user_agent,
                "disallow": rule.disallow,
                "allow": rule.allow,
                "crawl_delay": rule.crawl_delay,
            }

        def issue_entry(issue) -> dict:
            return {
                "severity": issue.severity,
                "message": issue.message,
                "line_number": issue.line_number,
                "directive": issue.directive,
                "suggestion": issue.suggestion,
            }

        return {
            "url": self.url,
            "accessible": self.accessible,
            "sitemaps": self.sitemaps,
            "rules": [rule_entry(r) for r in self.rules],
            "issues": [issue_entry(i) for i in self.issues],
            "stats": self.stats,
            "timestamp": self.timestamp,
        }
|
||||
|
||||
|
||||
class RobotsChecker:
    """Analyze robots.txt configuration.

    Fetches a site's robots.txt over HTTP, parses it into per-user-agent
    rule groups, flags syntax and SEO issues, and can test whether a
    specific path is crawlable for a given user agent.
    """

    # Common user agents and what they are.
    # NOTE(review): this table is not referenced by the methods below —
    # it appears to be informational / reserved for future use.
    USER_AGENTS = {
        "*": "All bots",
        "Googlebot": "Google crawler",
        "Googlebot-Image": "Google Image crawler",
        "Googlebot-News": "Google News crawler",
        "Googlebot-Video": "Google Video crawler",
        "Bingbot": "Bing crawler",
        "Slurp": "Yahoo crawler",
        "DuckDuckBot": "DuckDuckGo crawler",
        "Baiduspider": "Baidu crawler",
        "Yandex": "Yandex crawler",
        "facebot": "Facebook crawler",
        "Twitterbot": "Twitter crawler",
        "LinkedInBot": "LinkedIn crawler",
    }

    # Paths that should generally not be blocked.
    # NOTE(review): unused by the visible methods; candidates for a
    # future "blocked important path" check.
    IMPORTANT_PATHS = [
        "/",
        "/*.css",
        "/*.js",
        "/*.jpg",
        "/*.jpeg",
        "/*.png",
        "/*.gif",
        "/*.svg",
        "/*.webp",
    ]

    # Paths commonly blocked. NOTE(review): also unused by the visible methods.
    COMMON_BLOCKED = [
        "/admin",
        "/wp-admin",
        "/login",
        "/private",
        "/api",
        "/cgi-bin",
        "/tmp",
        "/search",
    ]

    def __init__(self):
        # One shared HTTP session (connection reuse) with a bot UA string.
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (compatible; SEOAuditBot/1.0)"
        })

    def fetch_robots(self, url: str) -> str | None:
        """Fetch robots.txt content.

        Accepts either a robots.txt URL or any URL on the site (the path
        is normalized to /robots.txt).

        Returns:
            The body text, or None when the file does not exist (404).

        Raises:
            RuntimeError: on network failure or any other HTTP status.
        """
        # Ensure we're fetching robots.txt
        parsed = urlparse(url)
        if not parsed.path.endswith("robots.txt"):
            robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        else:
            robots_url = url

        try:
            response = self.session.get(robots_url, timeout=10)
            if response.status_code == 200:
                return response.text
            elif response.status_code == 404:
                # Missing robots.txt is a legitimate (reportable) state.
                return None
            else:
                raise RuntimeError(f"HTTP {response.status_code}")
        except requests.RequestException as e:
            raise RuntimeError(f"Failed to fetch robots.txt: {e}")

    def parse_robots(self, content: str) -> tuple[list[UserAgentRules], list[str]]:
        """Parse robots.txt content.

        Returns:
            (rules, sitemaps): one UserAgentRules per User-agent group in
            file order, plus every Sitemap: URL found anywhere in the file.
        """
        rules = []
        sitemaps = []
        current_ua = None  # NOTE(review): tracked but never read; only current_rules is used
        current_rules = None

        for line_num, line in enumerate(content.split("\n"), 1):
            line = line.strip()

            # Skip empty lines and comments
            if not line or line.startswith("#"):
                continue

            # Parse directive — lines without a colon are silently skipped
            # here (they are reported by _analyze_syntax instead).
            if ":" not in line:
                continue

            directive, value = line.split(":", 1)
            directive = directive.strip().lower()
            value = value.strip()

            if directive == "user-agent":
                # Save previous user-agent rules
                if current_rules:
                    rules.append(current_rules)

                current_ua = value
                current_rules = UserAgentRules(user_agent=value)

            elif directive == "disallow" and current_rules:
                if value:  # Empty disallow means allow all
                    current_rules.disallow.append(value)

            elif directive == "allow" and current_rules:
                if value:
                    current_rules.allow.append(value)

            elif directive == "crawl-delay" and current_rules:
                try:
                    current_rules.crawl_delay = float(value)
                except ValueError:
                    # Malformed crawl-delay values are ignored.
                    pass

            elif directive == "sitemap":
                if value:
                    sitemaps.append(value)

        # Don't forget last user-agent
        if current_rules:
            rules.append(current_rules)

        return rules, sitemaps

    def analyze(self, url: str) -> RobotsResult:
        """Analyze robots.txt.

        Fetches, parses and runs all checks; on fetch failure returns an
        inaccessible result carrying the error as an issue instead of
        raising.
        """
        result = RobotsResult(url=url)

        # Fetch robots.txt
        try:
            content = self.fetch_robots(url)
            if content is None:
                result.accessible = False
                result.issues.append(RobotsIssue(
                    severity="info",
                    message="No robots.txt found (returns 404)",
                    suggestion="Consider creating a robots.txt file",
                ))
                return result
        except RuntimeError as e:
            result.accessible = False
            result.issues.append(RobotsIssue(
                severity="error",
                message=str(e),
            ))
            return result

        result.content = content
        result.rules, result.sitemaps = self.parse_robots(content)

        # Analyze content
        self._analyze_syntax(result)
        self._analyze_rules(result)
        self._analyze_sitemaps(result)

        # Calculate stats
        result.stats = {
            "user_agents_count": len(result.rules),
            "user_agents": [r.user_agent for r in result.rules],
            "total_disallow_rules": sum(len(r.disallow) for r in result.rules),
            "total_allow_rules": sum(len(r.allow) for r in result.rules),
            "sitemaps_count": len(result.sitemaps),
            "has_crawl_delay": any(r.crawl_delay for r in result.rules),
            "content_length": len(content),
        }

        return result

    def _analyze_syntax(self, result: RobotsResult) -> None:
        """Check for syntax issues (missing colons, unknown directives)."""
        lines = result.content.split("\n")

        for line_num, line in enumerate(lines, 1):
            line = line.strip()

            # Skip empty lines and comments
            if not line or line.startswith("#"):
                continue

            # Check for valid directive
            if ":" not in line:
                result.issues.append(RobotsIssue(
                    severity="warning",
                    message=f"Invalid line (missing colon): {line[:50]}",
                    line_number=line_num,
                ))
                continue

            directive, value = line.split(":", 1)
            directive = directive.strip().lower()

            valid_directives = {
                "user-agent", "disallow", "allow",
                "crawl-delay", "sitemap", "host",
            }

            if directive not in valid_directives:
                result.issues.append(RobotsIssue(
                    severity="info",
                    message=f"Unknown directive: {directive}",
                    line_number=line_num,
                    directive=directive,
                ))

    def _analyze_rules(self, result: RobotsResult) -> None:
        """Analyze blocking rules for SEO-relevant problems."""
        # Check if there are any rules
        if not result.rules:
            result.issues.append(RobotsIssue(
                severity="info",
                message="No user-agent rules defined",
                suggestion="Add User-agent: * rules to control crawling",
            ))
            return

        # Check for wildcard rule
        has_wildcard = any(r.user_agent == "*" for r in result.rules)
        if not has_wildcard:
            result.issues.append(RobotsIssue(
                severity="info",
                message="No wildcard (*) user-agent defined",
                suggestion="Consider adding User-agent: * as fallback",
            ))

        # Check for blocking important resources
        for rules in result.rules:
            for disallow in rules.disallow:
                # Check if blocking root
                if disallow == "/":
                    result.issues.append(RobotsIssue(
                        severity="error",
                        message=f"Blocking entire site for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                        suggestion="This will prevent indexing. Is this intentional?",
                    ))

                # Check if blocking CSS/JS (substring match, so e.g. ".json"
                # paths also trigger this — acceptable for a heuristic).
                if any(ext in disallow.lower() for ext in [".css", ".js"]):
                    result.issues.append(RobotsIssue(
                        severity="warning",
                        message=f"Blocking CSS/JS files for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                        suggestion="May affect rendering and SEO",
                    ))

                # Check for blocking images
                if any(ext in disallow.lower() for ext in [".jpg", ".png", ".gif", ".webp"]):
                    result.issues.append(RobotsIssue(
                        severity="info",
                        message=f"Blocking image files for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                    ))

            # Check crawl delay
            if rules.crawl_delay:
                if rules.crawl_delay > 10:
                    result.issues.append(RobotsIssue(
                        severity="warning",
                        message=f"High crawl-delay ({rules.crawl_delay}s) for {rules.user_agent}",
                        directive=f"Crawl-delay: {rules.crawl_delay}",
                        suggestion="May significantly slow indexing",
                    ))
                elif rules.crawl_delay > 0:
                    result.issues.append(RobotsIssue(
                        severity="info",
                        message=f"Crawl-delay set to {rules.crawl_delay}s for {rules.user_agent}",
                    ))

    def _analyze_sitemaps(self, result: RobotsResult) -> None:
        """Analyze sitemap declarations (presence and absolute URLs)."""
        if not result.sitemaps:
            result.issues.append(RobotsIssue(
                severity="warning",
                message="No sitemap declared in robots.txt",
                suggestion="Add Sitemap: directive to help crawlers find your sitemap",
            ))
        else:
            for sitemap in result.sitemaps:
                if not sitemap.startswith("http"):
                    result.issues.append(RobotsIssue(
                        severity="warning",
                        message=f"Sitemap URL should be absolute: {sitemap}",
                        directive=f"Sitemap: {sitemap}",
                    ))

    def test_url(self, robots_url: str, test_path: str,
                 user_agent: str = "Googlebot") -> dict:
        """Test if a specific URL path is allowed for *user_agent*.

        Returns a dict with "allowed" True/False, or None plus an "error"
        key when robots.txt could not be read.
        """
        # Use Python's built-in parser
        rp = RobotFileParser()

        # Ensure robots.txt URL
        parsed = urlparse(robots_url)
        if not parsed.path.endswith("robots.txt"):
            robots_txt_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        else:
            robots_txt_url = robots_url

        rp.set_url(robots_txt_url)
        try:
            rp.read()
        except Exception as e:
            return {
                "path": test_path,
                "user_agent": user_agent,
                "allowed": None,
                "error": str(e),
            }

        # Build full URL for testing
        base_url = f"{parsed.scheme}://{parsed.netloc}"
        full_url = urljoin(base_url, test_path)

        allowed = rp.can_fetch(user_agent, full_url)

        return {
            "path": test_path,
            "user_agent": user_agent,
            "allowed": allowed,
            "full_url": full_url,
        }

    def generate_report(self, result: RobotsResult) -> str:
        """Generate human-readable analysis report."""
        lines = [
            "=" * 60,
            "Robots.txt Analysis Report",
            "=" * 60,
            f"URL: {result.url}",
            f"Accessible: {'Yes' if result.accessible else 'No'}",
            f"Timestamp: {result.timestamp}",
            "",
        ]

        if result.accessible:
            lines.append("Statistics:")
            for key, value in result.stats.items():
                if key == "user_agents":
                    lines.append(f" {key}: {', '.join(value) if value else 'None'}")
                else:
                    lines.append(f" {key}: {value}")
            lines.append("")

        if result.sitemaps:
            lines.append(f"Sitemaps ({len(result.sitemaps)}):")
            for sitemap in result.sitemaps:
                lines.append(f" - {sitemap}")
            lines.append("")

        if result.rules:
            lines.append("Rules Summary:")
            for rules in result.rules:
                lines.append(f"\n User-agent: {rules.user_agent}")
                if rules.disallow:
                    lines.append(f" Disallow: {len(rules.disallow)} rules")
                    # Show only the first few rules to keep the report short.
                    for d in rules.disallow[:5]:
                        lines.append(f" - {d}")
                    if len(rules.disallow) > 5:
                        lines.append(f" ... and {len(rules.disallow) - 5} more")
                if rules.allow:
                    lines.append(f" Allow: {len(rules.allow)} rules")
                    for a in rules.allow[:3]:
                        lines.append(f" - {a}")
                if rules.crawl_delay:
                    lines.append(f" Crawl-delay: {rules.crawl_delay}s")
            lines.append("")

        if result.issues:
            lines.append("Issues Found:")
            errors = [i for i in result.issues if i.severity == "error"]
            warnings = [i for i in result.issues if i.severity == "warning"]
            infos = [i for i in result.issues if i.severity == "info"]

            if errors:
                lines.append(f"\n ERRORS ({len(errors)}):")
                for issue in errors:
                    lines.append(f" - {issue.message}")
                    if issue.directive:
                        lines.append(f" Directive: {issue.directive}")
                    if issue.suggestion:
                        lines.append(f" Suggestion: {issue.suggestion}")

            if warnings:
                lines.append(f"\n WARNINGS ({len(warnings)}):")
                for issue in warnings:
                    lines.append(f" - {issue.message}")
                    if issue.suggestion:
                        lines.append(f" Suggestion: {issue.suggestion}")

            if infos:
                lines.append(f"\n INFO ({len(infos)}):")
                for issue in infos:
                    lines.append(f" - {issue.message}")

        lines.append("")
        lines.append("=" * 60)

        return "\n".join(lines)
|
||||
|
||||
|
||||
def main():
    """Main entry point for CLI usage.

    Two modes: --test-url checks whether a single path is crawlable for
    the given user agent; otherwise a full robots.txt analysis is run and
    printed as a report or JSON (optionally written to --output).
    """
    parser = argparse.ArgumentParser(
        description="Analyze robots.txt configuration",
    )
    parser.add_argument("--url", "-u", required=True,
                        help="URL to robots.txt or domain")
    parser.add_argument("--test-url", "-t",
                        help="Test if specific URL path is allowed")
    parser.add_argument("--user-agent", "-a", default="Googlebot",
                        help="User agent for testing (default: Googlebot)")
    parser.add_argument("--output", "-o", help="Output file for JSON report")
    parser.add_argument("--json", action="store_true", help="Output as JSON")

    args = parser.parse_args()

    checker = RobotsChecker()

    if args.test_url:
        # Test specific URL
        test_result = checker.test_url(args.url, args.test_url, args.user_agent)
        if args.json:
            print(json.dumps(test_result, indent=2))
        else:
            # Bug fix: test_url returns allowed=None when robots.txt could
            # not be read; that previously printed a misleading "BLOCKED".
            if test_result["allowed"] is None:
                status = f"UNKNOWN (error: {test_result.get('error', 'fetch failed')})"
            elif test_result["allowed"]:
                status = "ALLOWED"
            else:
                status = "BLOCKED"
            print(f"URL: {test_result['path']}")
            print(f"User-Agent: {test_result['user_agent']}")
            print(f"Status: {status}")
    else:
        # Full analysis
        result = checker.analyze(args.url)

        if args.json or args.output:
            output = json.dumps(result.to_dict(), ensure_ascii=False, indent=2)
            if args.output:
                with open(args.output, "w", encoding="utf-8") as f:
                    f.write(output)
                logger.info(f"Report written to {args.output}")
            else:
                print(output)
        else:
            print(checker.generate_report(result))


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,490 @@
|
||||
"""
|
||||
Schema Generator - Generate JSON-LD structured data markup
|
||||
==========================================================
|
||||
Purpose: Generate schema.org structured data in JSON-LD format
|
||||
Python: 3.10+
|
||||
Usage:
|
||||
python schema_generator.py --type organization --name "Company Name" --url "https://example.com"
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Template directory relative to this script
|
||||
TEMPLATE_DIR = Path(__file__).parent.parent / "templates" / "schema_templates"
|
||||
|
||||
|
||||
class SchemaGenerator:
    """Generate JSON-LD schema markup from templates."""

    # Maps the public schema type name (CLI --type value) to the JSON
    # template filename expected under template_dir.
    SCHEMA_TYPES = {
        "organization": "organization.json",
        "local_business": "local_business.json",
        "product": "product.json",
        "article": "article.json",
        "faq": "faq.json",
        "breadcrumb": "breadcrumb.json",
        "website": "website.json",
    }

    # Business type mappings for LocalBusiness: friendly name -> schema.org
    # @type; unknown names fall back to the generic "LocalBusiness".
    BUSINESS_TYPES = {
        "restaurant": "Restaurant",
        "cafe": "CafeOrCoffeeShop",
        "bar": "BarOrPub",
        "hotel": "Hotel",
        "store": "Store",
        "medical": "MedicalBusiness",
        "dental": "Dentist",
        "legal": "LegalService",
        "real_estate": "RealEstateAgent",
        "auto": "AutoRepair",
        "beauty": "BeautySalon",
        "gym": "HealthClub",
        "spa": "DaySpa",
    }

    # Article type mappings: friendly name -> schema.org article subtype;
    # unknown names fall back to plain "Article".
    ARTICLE_TYPES = {
        "article": "Article",
        "blog": "BlogPosting",
        "news": "NewsArticle",
        "tech": "TechArticle",
        "scholarly": "ScholarlyArticle",
    }

    def __init__(self, template_dir: Path = TEMPLATE_DIR):
        # Directory holding the JSON template files (overridable for tests).
        self.template_dir = template_dir
|
||||
|
||||
def load_template(self, schema_type: str) -> dict:
    """Load the JSON template for *schema_type* and return it as a dict.

    Raises:
        ValueError: *schema_type* is not a known template key.
        FileNotFoundError: the mapped template file is absent on disk.
    """
    template_name = self.SCHEMA_TYPES.get(schema_type)
    if template_name is None:
        raise ValueError(f"Unknown schema type: {schema_type}. "
                         f"Available: {list(self.SCHEMA_TYPES.keys())}")

    template_path = self.template_dir / template_name
    if not template_path.exists():
        raise FileNotFoundError(f"Template not found: {template_path}")

    with open(template_path, "r", encoding="utf-8") as f:
        return json.load(f)
|
||||
|
||||
def fill_template(self, template: dict, data: dict[str, Any]) -> dict:
    """Substitute {{key}} placeholders in *template* with values from *data*.

    Works on the JSON-serialized text of the template: each placeholder
    whose value is not None is replaced, then the text is reparsed and
    unfilled placeholders / empty containers are stripped via
    _clean_empty_values.
    """
    rendered = json.dumps(template, ensure_ascii=False)

    for key, value in data.items():
        if value is None:
            continue  # leave the placeholder for the cleanup pass
        rendered = rendered.replace(f"{{{{{key}}}}}", str(value))

    return self._clean_empty_values(json.loads(rendered))
|
||||
|
||||
def _clean_empty_values(self, obj: Any) -> Any:
|
||||
"""Remove empty values and unfilled placeholders."""
|
||||
if isinstance(obj, dict):
|
||||
cleaned = {}
|
||||
for key, value in obj.items():
|
||||
cleaned_value = self._clean_empty_values(value)
|
||||
# Skip if value is empty, None, or unfilled placeholder
|
||||
if cleaned_value is None:
|
||||
continue
|
||||
if isinstance(cleaned_value, str) and cleaned_value.startswith("{{"):
|
||||
continue
|
||||
if isinstance(cleaned_value, (list, dict)) and not cleaned_value:
|
||||
continue
|
||||
cleaned[key] = cleaned_value
|
||||
return cleaned if cleaned else None
|
||||
elif isinstance(obj, list):
|
||||
cleaned = []
|
||||
for item in obj:
|
||||
cleaned_item = self._clean_empty_values(item)
|
||||
if cleaned_item is not None:
|
||||
if isinstance(cleaned_item, str) and cleaned_item.startswith("{{"):
|
||||
continue
|
||||
cleaned.append(cleaned_item)
|
||||
return cleaned if cleaned else None
|
||||
elif isinstance(obj, str):
|
||||
if obj.startswith("{{") and obj.endswith("}}"):
|
||||
return None
|
||||
return obj
|
||||
return obj
|
||||
|
||||
def generate_organization(
|
||||
self,
|
||||
name: str,
|
||||
url: str,
|
||||
logo_url: str | None = None,
|
||||
description: str | None = None,
|
||||
founding_date: str | None = None,
|
||||
phone: str | None = None,
|
||||
address: dict | None = None,
|
||||
social_links: list[str] | None = None,
|
||||
) -> dict:
|
||||
"""Generate Organization schema."""
|
||||
template = self.load_template("organization")
|
||||
|
||||
data = {
|
||||
"name": name,
|
||||
"url": url,
|
||||
"logo_url": logo_url,
|
||||
"description": description,
|
||||
"founding_date": founding_date,
|
||||
"phone": phone,
|
||||
}
|
||||
|
||||
if address:
|
||||
data.update({
|
||||
"street_address": address.get("street"),
|
||||
"city": address.get("city"),
|
||||
"region": address.get("region"),
|
||||
"postal_code": address.get("postal_code"),
|
||||
"country": address.get("country", "KR"),
|
||||
})
|
||||
|
||||
if social_links:
|
||||
# Handle social links specially
|
||||
pass
|
||||
|
||||
return self.fill_template(template, data)
|
||||
|
||||
def generate_local_business(
|
||||
self,
|
||||
name: str,
|
||||
business_type: str,
|
||||
address: dict,
|
||||
phone: str | None = None,
|
||||
url: str | None = None,
|
||||
description: str | None = None,
|
||||
hours: dict | None = None,
|
||||
geo: dict | None = None,
|
||||
price_range: str | None = None,
|
||||
rating: float | None = None,
|
||||
review_count: int | None = None,
|
||||
) -> dict:
|
||||
"""Generate LocalBusiness schema."""
|
||||
template = self.load_template("local_business")
|
||||
|
||||
schema_business_type = self.BUSINESS_TYPES.get(
|
||||
business_type.lower(), "LocalBusiness"
|
||||
)
|
||||
|
||||
data = {
|
||||
"business_type": schema_business_type,
|
||||
"name": name,
|
||||
"url": url,
|
||||
"description": description,
|
||||
"phone": phone,
|
||||
"price_range": price_range,
|
||||
"street_address": address.get("street"),
|
||||
"city": address.get("city"),
|
||||
"region": address.get("region"),
|
||||
"postal_code": address.get("postal_code"),
|
||||
"country": address.get("country", "KR"),
|
||||
}
|
||||
|
||||
if geo:
|
||||
data["latitude"] = geo.get("lat")
|
||||
data["longitude"] = geo.get("lng")
|
||||
|
||||
if hours:
|
||||
data.update({
|
||||
"weekday_opens": hours.get("weekday_opens", "09:00"),
|
||||
"weekday_closes": hours.get("weekday_closes", "18:00"),
|
||||
"weekend_opens": hours.get("weekend_opens"),
|
||||
"weekend_closes": hours.get("weekend_closes"),
|
||||
})
|
||||
|
||||
if rating is not None:
|
||||
data["rating"] = str(rating)
|
||||
data["review_count"] = str(review_count or 0)
|
||||
|
||||
return self.fill_template(template, data)
|
||||
|
||||
def generate_product(
|
||||
self,
|
||||
name: str,
|
||||
description: str,
|
||||
price: float,
|
||||
currency: str = "KRW",
|
||||
brand: str | None = None,
|
||||
sku: str | None = None,
|
||||
images: list[str] | None = None,
|
||||
availability: str = "InStock",
|
||||
condition: str = "NewCondition",
|
||||
rating: float | None = None,
|
||||
review_count: int | None = None,
|
||||
url: str | None = None,
|
||||
seller: str | None = None,
|
||||
) -> dict:
|
||||
"""Generate Product schema."""
|
||||
template = self.load_template("product")
|
||||
|
||||
data = {
|
||||
"name": name,
|
||||
"description": description,
|
||||
"price": str(int(price)),
|
||||
"currency": currency,
|
||||
"brand_name": brand,
|
||||
"sku": sku,
|
||||
"product_url": url,
|
||||
"availability": availability,
|
||||
"condition": condition,
|
||||
"seller_name": seller,
|
||||
}
|
||||
|
||||
if images:
|
||||
for i, img in enumerate(images[:3], 1):
|
||||
data[f"image_url_{i}"] = img
|
||||
|
||||
if rating is not None:
|
||||
data["rating"] = str(rating)
|
||||
data["review_count"] = str(review_count or 0)
|
||||
|
||||
return self.fill_template(template, data)
|
||||
|
||||
def generate_article(
|
||||
self,
|
||||
headline: str,
|
||||
description: str,
|
||||
author_name: str,
|
||||
date_published: str,
|
||||
publisher_name: str,
|
||||
article_type: str = "article",
|
||||
date_modified: str | None = None,
|
||||
images: list[str] | None = None,
|
||||
page_url: str | None = None,
|
||||
publisher_logo: str | None = None,
|
||||
author_url: str | None = None,
|
||||
section: str | None = None,
|
||||
word_count: int | None = None,
|
||||
keywords: str | None = None,
|
||||
) -> dict:
|
||||
"""Generate Article schema."""
|
||||
template = self.load_template("article")
|
||||
|
||||
schema_article_type = self.ARTICLE_TYPES.get(
|
||||
article_type.lower(), "Article"
|
||||
)
|
||||
|
||||
data = {
|
||||
"article_type": schema_article_type,
|
||||
"headline": headline,
|
||||
"description": description,
|
||||
"author_name": author_name,
|
||||
"author_url": author_url,
|
||||
"date_published": date_published,
|
||||
"date_modified": date_modified or date_published,
|
||||
"publisher_name": publisher_name,
|
||||
"publisher_logo_url": publisher_logo,
|
||||
"page_url": page_url,
|
||||
"section": section,
|
||||
"word_count": str(word_count) if word_count else None,
|
||||
"keywords": keywords,
|
||||
}
|
||||
|
||||
if images:
|
||||
for i, img in enumerate(images[:2], 1):
|
||||
data[f"image_url_{i}"] = img
|
||||
|
||||
return self.fill_template(template, data)
|
||||
|
||||
def generate_faq(self, questions: list[dict[str, str]]) -> dict:
|
||||
"""Generate FAQPage schema."""
|
||||
schema = {
|
||||
"@context": "https://schema.org",
|
||||
"@type": "FAQPage",
|
||||
"mainEntity": [],
|
||||
}
|
||||
|
||||
for qa in questions:
|
||||
schema["mainEntity"].append({
|
||||
"@type": "Question",
|
||||
"name": qa["question"],
|
||||
"acceptedAnswer": {
|
||||
"@type": "Answer",
|
||||
"text": qa["answer"],
|
||||
},
|
||||
})
|
||||
|
||||
return schema
|
||||
|
||||
def generate_breadcrumb(self, items: list[dict[str, str]]) -> dict:
|
||||
"""Generate BreadcrumbList schema."""
|
||||
schema = {
|
||||
"@context": "https://schema.org",
|
||||
"@type": "BreadcrumbList",
|
||||
"itemListElement": [],
|
||||
}
|
||||
|
||||
for i, item in enumerate(items, 1):
|
||||
schema["itemListElement"].append({
|
||||
"@type": "ListItem",
|
||||
"position": i,
|
||||
"name": item["name"],
|
||||
"item": item["url"],
|
||||
})
|
||||
|
||||
return schema
|
||||
|
||||
def generate_website(
|
||||
self,
|
||||
name: str,
|
||||
url: str,
|
||||
search_url_template: str | None = None,
|
||||
description: str | None = None,
|
||||
language: str = "ko-KR",
|
||||
publisher_name: str | None = None,
|
||||
logo_url: str | None = None,
|
||||
alternate_name: str | None = None,
|
||||
) -> dict:
|
||||
"""Generate WebSite schema."""
|
||||
template = self.load_template("website")
|
||||
|
||||
data = {
|
||||
"site_name": name,
|
||||
"url": url,
|
||||
"description": description,
|
||||
"language": language,
|
||||
"search_url_template": search_url_template,
|
||||
"publisher_name": publisher_name or name,
|
||||
"logo_url": logo_url,
|
||||
"alternate_name": alternate_name,
|
||||
}
|
||||
|
||||
return self.fill_template(template, data)
|
||||
|
||||
def to_json_ld(self, schema: dict, pretty: bool = True) -> str:
|
||||
"""Convert schema dict to JSON-LD string."""
|
||||
indent = 2 if pretty else None
|
||||
return json.dumps(schema, ensure_ascii=False, indent=indent)
|
||||
|
||||
def to_html_script(self, schema: dict) -> str:
|
||||
"""Wrap schema in HTML script tag."""
|
||||
json_ld = self.to_json_ld(schema)
|
||||
return f'<script type="application/ld+json">\n{json_ld}\n</script>'
|
||||
|
||||
|
||||
def main():
    """Main entry point for CLI usage.

    Parses the command line, builds the requested schema type via
    SchemaGenerator, and either prints the result or writes it to a file.
    Missing option values fall back to placeholder defaults so the command
    always emits syntactically valid markup.
    """
    # RawDescriptionHelpFormatter keeps the epilog examples formatted as-is.
    parser = argparse.ArgumentParser(
        description="Generate JSON-LD schema markup",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Generate Organization schema
  python schema_generator.py --type organization --name "My Company" --url "https://example.com"

  # Generate Product schema
  python schema_generator.py --type product --name "Widget" --price 29900 --currency KRW

  # Generate Article schema
  python schema_generator.py --type article --headline "Article Title" --author "John Doe"
""",
    )

    # Valid --type choices come from the generator's registered schema types.
    parser.add_argument(
        "--type", "-t",
        required=True,
        choices=SchemaGenerator.SCHEMA_TYPES.keys(),
        help="Schema type to generate",
    )
    parser.add_argument("--name", help="Name/title")
    parser.add_argument("--url", help="URL")
    parser.add_argument("--description", help="Description")
    parser.add_argument("--price", type=float, help="Price (for product)")
    parser.add_argument("--currency", default="KRW", help="Currency code")
    parser.add_argument("--headline", help="Headline (for article)")
    parser.add_argument("--author", help="Author name")
    parser.add_argument("--output", "-o", help="Output file path")
    parser.add_argument("--html", action="store_true", help="Output as HTML script tag")

    args = parser.parse_args()

    generator = SchemaGenerator()

    try:
        # Dispatch on the requested schema type. Each branch substitutes
        # placeholder defaults for any CLI value that was not provided.
        if args.type == "organization":
            schema = generator.generate_organization(
                name=args.name or "Organization Name",
                url=args.url or "https://example.com",
                description=args.description,
            )
        elif args.type == "product":
            schema = generator.generate_product(
                name=args.name or "Product Name",
                description=args.description or "Product description",
                price=args.price or 0,
                currency=args.currency,
            )
        elif args.type == "article":
            schema = generator.generate_article(
                headline=args.headline or args.name or "Article Title",
                description=args.description or "Article description",
                author_name=args.author or "Author",
                # Publish date defaults to today (local time).
                date_published=datetime.now().strftime("%Y-%m-%d"),
                publisher_name="Publisher",
            )
        elif args.type == "website":
            schema = generator.generate_website(
                name=args.name or "Website Name",
                url=args.url or "https://example.com",
                description=args.description,
            )
        elif args.type == "faq":
            # Example FAQ (the CLI has no per-question options).
            schema = generator.generate_faq([
                {"question": "Question 1?", "answer": "Answer 1"},
                {"question": "Question 2?", "answer": "Answer 2"},
            ])
        elif args.type == "breadcrumb":
            # Example breadcrumb (the CLI has no per-item options).
            schema = generator.generate_breadcrumb([
                {"name": "Home", "url": "https://example.com/"},
                {"name": "Category", "url": "https://example.com/category/"},
            ])
        elif args.type == "local_business":
            schema = generator.generate_local_business(
                name=args.name or "Business Name",
                business_type="store",
                address={"street": "123 Main St", "city": "Seoul", "country": "KR"},
                url=args.url,
                description=args.description,
            )
        else:
            # Defensive: argparse choices should make this unreachable.
            raise ValueError(f"Unsupported type: {args.type}")

        # Render either as a bare JSON-LD document or an HTML <script> tag.
        if args.html:
            output = generator.to_html_script(schema)
        else:
            output = generator.to_json_ld(schema)

        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(output)
            logger.info(f"Schema written to {args.output}")
        else:
            print(output)

    except Exception as e:
        # Log, then re-raise so the process exits non-zero.
        logger.error(f"Error generating schema: {e}")
        raise
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,498 @@
|
||||
"""
|
||||
Schema Validator - Validate JSON-LD structured data markup
|
||||
==========================================================
|
||||
Purpose: Extract and validate schema.org structured data from URLs or files
|
||||
Python: 3.10+
|
||||
Usage:
|
||||
python schema_validator.py --url https://example.com
|
||||
python schema_validator.py --file schema.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
try:
|
||||
import extruct
|
||||
HAS_EXTRUCT = True
|
||||
except ImportError:
|
||||
HAS_EXTRUCT = False
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ValidationIssue:
    """Represents a validation issue found in schema."""

    # Severity level: "error" makes the overall result invalid;
    # "warning" and "info" are advisory only.
    severity: str  # "error", "warning", "info"
    # Human-readable description of the problem.
    message: str
    # schema.org @type the issue was found on (None when not applicable).
    schema_type: str | None = None
    # The missing/offending property, when the issue concerns one property.
    property_name: str | None = None
    # Optional remediation hint shown in reports.
    suggestion: str | None = None
|
||||
|
||||
|
||||
@dataclass
class ValidationResult:
    """Complete validation result for a schema."""

    url: str | None = None
    schemas_found: list[dict] = field(default_factory=list)
    issues: list[ValidationIssue] = field(default_factory=list)
    valid: bool = True
    rich_results_eligible: dict = field(default_factory=dict)
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON output."""
        # Flatten each issue into a plain dict (note the key rename:
        # property_name -> "property").
        issue_dicts = []
        for issue in self.issues:
            issue_dicts.append({
                "severity": issue.severity,
                "message": issue.message,
                "schema_type": issue.schema_type,
                "property": issue.property_name,
                "suggestion": issue.suggestion,
            })

        return {
            "url": self.url,
            # Summary count plus the @type of each schema that was found.
            "schemas_found": len(self.schemas_found),
            "schema_types": [s.get("@type", "Unknown") for s in self.schemas_found],
            "valid": self.valid,
            "issues": issue_dicts,
            "rich_results_eligible": self.rich_results_eligible,
            "timestamp": self.timestamp,
        }
|
||||
|
||||
|
||||
class SchemaValidator:
    """Validate schema.org structured data.

    Extracts JSON-LD (plus Microdata/RDFa when ``extruct`` is installed)
    from URLs or HTML and checks it against per-type property tables.
    """

    # Required properties for common schema types.
    # A missing entry here is reported with "error" severity.
    REQUIRED_PROPERTIES = {
        "Organization": ["name", "url"],
        "LocalBusiness": ["name", "address"],
        "Product": ["name"],
        "Offer": ["price", "priceCurrency"],
        "Article": ["headline", "author", "datePublished", "publisher"],
        "BlogPosting": ["headline", "author", "datePublished", "publisher"],
        "NewsArticle": ["headline", "author", "datePublished", "publisher"],
        "FAQPage": ["mainEntity"],
        "Question": ["name", "acceptedAnswer"],
        "Answer": ["text"],
        "BreadcrumbList": ["itemListElement"],
        "ListItem": ["position", "name"],
        "WebSite": ["name", "url"],
        "WebPage": ["name"],
        "Person": ["name"],
        "Event": ["name", "startDate", "location"],
        "Review": ["reviewRating", "author"],
        "AggregateRating": ["ratingValue"],
        "ImageObject": ["url"],
    }

    # Recommended (but not required) properties.
    # A missing entry here is reported with "info" severity.
    RECOMMENDED_PROPERTIES = {
        "Organization": ["logo", "description", "contactPoint", "sameAs"],
        "LocalBusiness": ["telephone", "openingHoursSpecification", "geo", "image"],
        "Product": ["description", "image", "brand", "offers", "aggregateRating"],
        "Article": ["image", "dateModified", "description"],
        "FAQPage": [],
        "WebSite": ["potentialAction"],
        "BreadcrumbList": [],
    }

    # Google Rich Results eligible types: schemas of these types get an
    # extra eligibility check (see _check_rich_results).
    RICH_RESULTS_TYPES = {
        "Article", "BlogPosting", "NewsArticle",
        "Product", "Review",
        "FAQPage", "HowTo",
        "LocalBusiness", "Restaurant",
        "Event",
        "Recipe",
        "JobPosting",
        "Course",
        "BreadcrumbList",
        "Organization",
        "WebSite",
        "VideoObject",
    }

    def __init__(self):
        """Create the validator with a reusable HTTP session."""
        # One session gives connection pooling across fetches; the custom
        # User-Agent identifies the tool to audited sites.
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (compatible; SEOAuditBot/1.0)"
        })
|
||||
|
||||
def extract_from_url(self, url: str) -> list[dict]:
|
||||
"""Extract all structured data from a URL."""
|
||||
try:
|
||||
response = self.session.get(url, timeout=30)
|
||||
response.raise_for_status()
|
||||
return self.extract_from_html(response.text, url)
|
||||
except requests.RequestException as e:
|
||||
logger.error(f"Failed to fetch URL: {e}")
|
||||
return []
|
||||
|
||||
    def extract_from_html(self, html: str, base_url: str | None = None) -> list[dict]:
        """Extract structured data from HTML content.

        Args:
            html: Raw HTML document text.
            base_url: Optional page URL, passed to extruct for resolving
                relative references.

        Returns:
            De-duplicated list of structured-data dicts found in the page.
        """
        schemas = []

        # Method 1: Use extruct if available (handles JSON-LD, Microdata, RDFa)
        if HAS_EXTRUCT:
            try:
                data = extruct.extract(html, base_url=base_url, uniform=True)
                schemas.extend(data.get("json-ld", []))
                schemas.extend(data.get("microdata", []))
                schemas.extend(data.get("rdfa", []))
            except Exception as e:
                # extruct failures are non-fatal; the manual pass below
                # still covers JSON-LD.
                logger.warning(f"extruct extraction failed: {e}")

        # Method 2: Manual JSON-LD extraction (fallback/additional)
        soup = BeautifulSoup(html, "html.parser")
        for script in soup.find_all("script", type="application/ld+json"):
            try:
                content = script.string
                if content:
                    data = json.loads(content)
                    # A single <script> may hold either one object or a
                    # list of objects.
                    if isinstance(data, list):
                        schemas.extend(data)
                    else:
                        schemas.append(data)
            except json.JSONDecodeError as e:
                logger.warning(f"Invalid JSON-LD: {e}")

        # Deduplicate schemas: both methods may return the same JSON-LD
        # block, so compare canonical (sorted-key) serializations.
        seen = set()
        unique_schemas = []
        for schema in schemas:
            schema_str = json.dumps(schema, sort_keys=True)
            if schema_str not in seen:
                seen.add(schema_str)
                unique_schemas.append(schema)

        return unique_schemas
|
||||
|
||||
def validate(self, url: str | None = None, html: str | None = None,
|
||||
schema: dict | None = None) -> ValidationResult:
|
||||
"""Validate schema from URL, HTML, or direct schema dict."""
|
||||
result = ValidationResult(url=url)
|
||||
|
||||
# Extract schemas
|
||||
if schema:
|
||||
schemas = [schema]
|
||||
elif html:
|
||||
schemas = self.extract_from_html(html, url)
|
||||
elif url:
|
||||
schemas = self.extract_from_url(url)
|
||||
else:
|
||||
raise ValueError("Must provide url, html, or schema")
|
||||
|
||||
result.schemas_found = schemas
|
||||
|
||||
if not schemas:
|
||||
result.issues.append(ValidationIssue(
|
||||
severity="warning",
|
||||
message="No structured data found",
|
||||
suggestion="Add JSON-LD schema markup to improve SEO",
|
||||
))
|
||||
result.valid = False
|
||||
return result
|
||||
|
||||
# Validate each schema
|
||||
for schema in schemas:
|
||||
self._validate_schema(schema, result)
|
||||
|
||||
# Check for errors (warnings don't affect validity)
|
||||
result.valid = not any(i.severity == "error" for i in result.issues)
|
||||
|
||||
return result
|
||||
|
||||
    def _validate_schema(self, schema: dict, result: ValidationResult,
                         parent_type: str | None = None) -> None:
        """Validate a single schema object.

        Appends issues to *result* in place and recurses into nested
        objects that carry their own "@type".

        Args:
            schema: The schema.org object being checked.
            result: Accumulator for issues and rich-results eligibility.
            parent_type: @type of the enclosing schema, used for context
                when this object has no @type of its own.
        """
        schema_type = schema.get("@type")

        # Every schema object must declare a type; without it no further
        # checks are meaningful.
        if not schema_type:
            result.issues.append(ValidationIssue(
                severity="error",
                message="Missing @type property",
                schema_type=parent_type,
            ))
            return

        # Handle array of types: only the first entry drives validation.
        if isinstance(schema_type, list):
            schema_type = schema_type[0]

        # Check required properties -> "error" severity.
        required = self.REQUIRED_PROPERTIES.get(schema_type, [])
        for prop in required:
            if prop not in schema:
                result.issues.append(ValidationIssue(
                    severity="error",
                    message=f"Missing required property: {prop}",
                    schema_type=schema_type,
                    property_name=prop,
                    suggestion=f"Add '{prop}' property to {schema_type} schema",
                ))

        # Check recommended properties -> "info" severity.
        recommended = self.RECOMMENDED_PROPERTIES.get(schema_type, [])
        for prop in recommended:
            if prop not in schema:
                result.issues.append(ValidationIssue(
                    severity="info",
                    message=f"Missing recommended property: {prop}",
                    schema_type=schema_type,
                    property_name=prop,
                    suggestion=f"Consider adding '{prop}' for better rich results",
                ))

        # Check Rich Results eligibility (keyed by type; a later schema of
        # the same type overwrites an earlier entry).
        if schema_type in self.RICH_RESULTS_TYPES:
            result.rich_results_eligible[schema_type] = self._check_rich_results(
                schema, schema_type
            )

        # Validate nested schemas: any dict (or dict inside a list) that
        # declares its own @type is validated recursively.
        for key, value in schema.items():
            if key.startswith("@"):
                # Skip JSON-LD keywords like @context/@id/@type.
                continue
            if isinstance(value, dict) and "@type" in value:
                self._validate_schema(value, result, schema_type)
            elif isinstance(value, list):
                for item in value:
                    if isinstance(item, dict) and "@type" in item:
                        self._validate_schema(item, result, schema_type)

        # Type-specific validations (articles, products, etc.).
        self._validate_type_specific(schema, schema_type, result)
|
||||
|
||||
    def _validate_type_specific(self, schema: dict, schema_type: str,
                                result: ValidationResult) -> None:
        """Type-specific validation rules.

        Applies extra checks for articles, products, local businesses and
        FAQ pages; all findings are appended to *result* in place.
        """
        if schema_type in ("Article", "BlogPosting", "NewsArticle"):
            # Check image: articles without one are demoted in rich results.
            if "image" not in schema:
                result.issues.append(ValidationIssue(
                    severity="warning",
                    message="Article without image may not show in rich results",
                    schema_type=schema_type,
                    property_name="image",
                    suggestion="Add at least one image to the article",
                ))

            # Check headline length (110-char limit enforced here).
            headline = schema.get("headline", "")
            if len(headline) > 110:
                result.issues.append(ValidationIssue(
                    severity="warning",
                    message=f"Headline too long ({len(headline)} chars, max 110)",
                    schema_type=schema_type,
                    property_name="headline",
                ))

        elif schema_type == "Product":
            # NOTE: only a dict-valued "offers" is inspected; a list of
            # offers is silently skipped by this check.
            offer = schema.get("offers", {})
            if isinstance(offer, dict):
                # Check price: must parse as a number when present.
                price = offer.get("price")
                if price is not None:
                    try:
                        float(price)
                    except (ValueError, TypeError):
                        result.issues.append(ValidationIssue(
                            severity="error",
                            message=f"Invalid price value: {price}",
                            schema_type="Offer",
                            property_name="price",
                        ))

                # Check availability: substring match accepts both bare
                # values ("InStock") and schema.org URL forms.
                availability = offer.get("availability", "")
                valid_availabilities = [
                    "InStock", "OutOfStock", "PreOrder", "Discontinued",
                    "https://schema.org/InStock", "https://schema.org/OutOfStock",
                ]
                if availability and not any(
                    a in availability for a in valid_availabilities
                ):
                    result.issues.append(ValidationIssue(
                        severity="warning",
                        message=f"Unknown availability value: {availability}",
                        schema_type="Offer",
                        property_name="availability",
                    ))

        elif schema_type == "LocalBusiness":
            # Check for geo coordinates — advisory only.
            if "geo" not in schema:
                result.issues.append(ValidationIssue(
                    severity="info",
                    message="Missing geo coordinates",
                    schema_type=schema_type,
                    property_name="geo",
                    suggestion="Add latitude/longitude for better local search",
                ))

        elif schema_type == "FAQPage":
            main_entity = schema.get("mainEntity", [])
            if not main_entity:
                # An empty FAQ is a hard error …
                result.issues.append(ValidationIssue(
                    severity="error",
                    message="FAQPage must have at least one question",
                    schema_type=schema_type,
                    property_name="mainEntity",
                ))
            elif len(main_entity) < 2:
                # … while a single-question FAQ is merely advisory.
                result.issues.append(ValidationIssue(
                    severity="info",
                    message="FAQPage has only one question",
                    schema_type=schema_type,
                    suggestion="Add more questions for better rich results",
                ))
|
||||
|
||||
def _check_rich_results(self, schema: dict, schema_type: str) -> dict:
|
||||
"""Check if schema is eligible for Google Rich Results."""
|
||||
result = {
|
||||
"eligible": True,
|
||||
"missing_for_rich_results": [],
|
||||
}
|
||||
|
||||
if schema_type in ("Article", "BlogPosting", "NewsArticle"):
|
||||
required_for_rich = ["headline", "image", "datePublished", "author"]
|
||||
for prop in required_for_rich:
|
||||
if prop not in schema:
|
||||
result["eligible"] = False
|
||||
result["missing_for_rich_results"].append(prop)
|
||||
|
||||
elif schema_type == "Product":
|
||||
if "name" not in schema:
|
||||
result["eligible"] = False
|
||||
result["missing_for_rich_results"].append("name")
|
||||
offer = schema.get("offers")
|
||||
if not offer:
|
||||
result["eligible"] = False
|
||||
result["missing_for_rich_results"].append("offers")
|
||||
|
||||
elif schema_type == "FAQPage":
|
||||
if not schema.get("mainEntity"):
|
||||
result["eligible"] = False
|
||||
result["missing_for_rich_results"].append("mainEntity")
|
||||
|
||||
return result
|
||||
|
||||
    def generate_report(self, result: ValidationResult) -> str:
        """Generate human-readable validation report.

        Renders *result* as a plain-text report: summary header, schema
        types found, rich-results eligibility, then issues grouped by
        severity (errors, warnings, info).
        """
        lines = [
            "=" * 60,
            "Schema Validation Report",
            "=" * 60,
            f"URL: {result.url or 'N/A'}",
            f"Timestamp: {result.timestamp}",
            f"Valid: {'Yes' if result.valid else 'No'}",
            f"Schemas Found: {len(result.schemas_found)}",
            "",
        ]

        if result.schemas_found:
            lines.append("Schema Types:")
            for schema in result.schemas_found:
                schema_type = schema.get("@type", "Unknown")
                lines.append(f"  - {schema_type}")
            lines.append("")

        if result.rich_results_eligible:
            lines.append("Rich Results Eligibility:")
            for schema_type, status in result.rich_results_eligible.items():
                eligible = "Yes" if status["eligible"] else "No"
                lines.append(f"  - {schema_type}: {eligible}")
                # Show which properties still block eligibility.
                if status["missing_for_rich_results"]:
                    missing = ", ".join(status["missing_for_rich_results"])
                    lines.append(f"    Missing: {missing}")
            lines.append("")

        if result.issues:
            lines.append("Issues Found:")
            # Group issues by severity for readability.
            errors = [i for i in result.issues if i.severity == "error"]
            warnings = [i for i in result.issues if i.severity == "warning"]
            infos = [i for i in result.issues if i.severity == "info"]

            if errors:
                lines.append(f"\n  ERRORS ({len(errors)}):")
                for issue in errors:
                    lines.append(f"  - [{issue.schema_type}] {issue.message}")
                    if issue.suggestion:
                        lines.append(f"    Suggestion: {issue.suggestion}")

            if warnings:
                lines.append(f"\n  WARNINGS ({len(warnings)}):")
                for issue in warnings:
                    lines.append(f"  - [{issue.schema_type}] {issue.message}")
                    if issue.suggestion:
                        lines.append(f"    Suggestion: {issue.suggestion}")

            if infos:
                lines.append(f"\n  INFO ({len(infos)}):")
                for issue in infos:
                    lines.append(f"  - [{issue.schema_type}] {issue.message}")
                    if issue.suggestion:
                        lines.append(f"    Suggestion: {issue.suggestion}")

        lines.append("")
        lines.append("=" * 60)

        return "\n".join(lines)
|
||||
|
||||
|
||||
def main():
    """Main entry point for CLI usage.

    Validates structured data from either a live URL (--url) or a local
    JSON-LD file (--file) and prints a text report, or JSON when --json
    or --output is given.
    """
    parser = argparse.ArgumentParser(
        description="Validate schema.org structured data",
    )
    parser.add_argument("--url", "-u", help="URL to validate")
    parser.add_argument("--file", "-f", help="JSON-LD file to validate")
    parser.add_argument("--output", "-o", help="Output file for JSON report")
    parser.add_argument("--json", action="store_true", help="Output as JSON")

    args = parser.parse_args()

    # Exactly one input source is needed; --file wins when both are given.
    if not args.url and not args.file:
        parser.error("Must provide --url or --file")

    validator = SchemaValidator()

    if args.file:
        with open(args.file, "r", encoding="utf-8") as f:
            schema = json.load(f)
        result = validator.validate(schema=schema)
    else:
        result = validator.validate(url=args.url)

    # --output implies JSON format even without --json.
    if args.json or args.output:
        output = json.dumps(result.to_dict(), ensure_ascii=False, indent=2)
        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(output)
            logger.info(f"Report written to {args.output}")
        else:
            print(output)
    else:
        print(validator.generate_report(result))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,969 @@
|
||||
"""
|
||||
Sitemap Crawler - Sequential page analysis from sitemap
|
||||
=======================================================
|
||||
Purpose: Crawl sitemap URLs one by one, analyze each page, save to Notion
|
||||
Python: 3.10+
|
||||
Usage:
|
||||
from sitemap_crawler import SitemapCrawler
|
||||
crawler = SitemapCrawler()
|
||||
crawler.crawl_sitemap("https://example.com/sitemap.xml", delay=2.0)
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
import xml.etree.ElementTree as ET
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Callable, Generator
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
from notion_client import Client
|
||||
|
||||
from base_client import config
|
||||
from page_analyzer import PageAnalyzer, PageMetadata
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default database for page analysis data
|
||||
DEFAULT_PAGES_DATABASE_ID = "2c8581e5-8a1e-8035-880b-e38cefc2f3ef"
|
||||
|
||||
# Default limits to prevent excessive resource usage
|
||||
DEFAULT_MAX_PAGES = 500
|
||||
DEFAULT_DELAY_SECONDS = 2.0
|
||||
|
||||
# Progress tracking directory
|
||||
PROGRESS_DIR = Path.home() / ".claude" / "seo-audit-progress"
|
||||
PROGRESS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
@dataclass
|
||||
class CrawlProgress:
|
||||
"""Track crawl progress."""
|
||||
total_urls: int = 0
|
||||
processed_urls: int = 0
|
||||
successful_urls: int = 0
|
||||
failed_urls: int = 0
|
||||
skipped_urls: int = 0
|
||||
start_time: datetime = field(default_factory=datetime.now)
|
||||
current_url: str = ""
|
||||
audit_id: str = ""
|
||||
site: str = ""
|
||||
status: str = "running" # running, completed, failed
|
||||
error_message: str = ""
|
||||
summary_page_id: str = ""
|
||||
|
||||
def get_progress_percent(self) -> float:
|
||||
if self.total_urls == 0:
|
||||
return 0.0
|
||||
return (self.processed_urls / self.total_urls) * 100
|
||||
|
||||
def get_elapsed_time(self) -> str:
|
||||
elapsed = datetime.now() - self.start_time
|
||||
minutes = int(elapsed.total_seconds() // 60)
|
||||
seconds = int(elapsed.total_seconds() % 60)
|
||||
return f"{minutes}m {seconds}s"
|
||||
|
||||
def get_eta(self) -> str:
|
||||
if self.processed_urls == 0:
|
||||
return "calculating..."
|
||||
elapsed = (datetime.now() - self.start_time).total_seconds()
|
||||
avg_time_per_url = elapsed / self.processed_urls
|
||||
remaining_urls = self.total_urls - self.processed_urls
|
||||
eta_seconds = remaining_urls * avg_time_per_url
|
||||
minutes = int(eta_seconds // 60)
|
||||
seconds = int(eta_seconds % 60)
|
||||
return f"{minutes}m {seconds}s"
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Convert to dictionary for JSON serialization."""
|
||||
return {
|
||||
"audit_id": self.audit_id,
|
||||
"site": self.site,
|
||||
"status": self.status,
|
||||
"total_urls": self.total_urls,
|
||||
"processed_urls": self.processed_urls,
|
||||
"successful_urls": self.successful_urls,
|
||||
"failed_urls": self.failed_urls,
|
||||
"progress_percent": round(self.get_progress_percent(), 1),
|
||||
"elapsed_time": self.get_elapsed_time(),
|
||||
"eta": self.get_eta(),
|
||||
"current_url": self.current_url,
|
||||
"start_time": self.start_time.isoformat(),
|
||||
"error_message": self.error_message,
|
||||
"summary_page_id": self.summary_page_id,
|
||||
"updated_at": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
def save_to_file(self, filepath: Path | None = None) -> Path:
|
||||
"""Save progress to JSON file."""
|
||||
if filepath is None:
|
||||
filepath = PROGRESS_DIR / f"{self.audit_id}.json"
|
||||
with open(filepath, "w") as f:
|
||||
json.dump(self.to_dict(), f, indent=2)
|
||||
return filepath
|
||||
|
||||
@classmethod
|
||||
def load_from_file(cls, filepath: Path) -> "CrawlProgress":
|
||||
"""Load progress from JSON file."""
|
||||
with open(filepath, "r") as f:
|
||||
data = json.load(f)
|
||||
progress = cls()
|
||||
progress.audit_id = data.get("audit_id", "")
|
||||
progress.site = data.get("site", "")
|
||||
progress.status = data.get("status", "unknown")
|
||||
progress.total_urls = data.get("total_urls", 0)
|
||||
progress.processed_urls = data.get("processed_urls", 0)
|
||||
progress.successful_urls = data.get("successful_urls", 0)
|
||||
progress.failed_urls = data.get("failed_urls", 0)
|
||||
progress.current_url = data.get("current_url", "")
|
||||
progress.error_message = data.get("error_message", "")
|
||||
progress.summary_page_id = data.get("summary_page_id", "")
|
||||
if data.get("start_time"):
|
||||
progress.start_time = datetime.fromisoformat(data["start_time"])
|
||||
return progress
|
||||
|
||||
|
||||
def get_active_crawls() -> list[CrawlProgress]:
    """Get all active (running) crawl jobs."""
    running = []
    for progress_file in PROGRESS_DIR.glob("*.json"):
        try:
            job = CrawlProgress.load_from_file(progress_file)
        except Exception:
            # Unreadable or corrupt progress files are ignored.
            continue
        if job.status == "running":
            running.append(job)
    return running
|
||||
|
||||
|
||||
def get_all_crawls() -> list[CrawlProgress]:
    """Get all crawl jobs (active and completed), newest filename first."""
    jobs = []
    for progress_file in sorted(PROGRESS_DIR.glob("*.json"), reverse=True):
        try:
            jobs.append(CrawlProgress.load_from_file(progress_file))
        except Exception:
            # Skip files that cannot be parsed.
            continue
    return jobs
|
||||
|
||||
|
||||
def get_crawl_status(audit_id: str) -> CrawlProgress | None:
    """Get status of a specific crawl by audit ID, or None if unknown."""
    progress_file = PROGRESS_DIR / f"{audit_id}.json"
    if not progress_file.exists():
        return None
    return CrawlProgress.load_from_file(progress_file)
|
||||
|
||||
|
||||
@dataclass
class CrawlResult:
    """Result of a complete sitemap crawl."""

    site: str
    sitemap_url: str
    audit_id: str
    total_pages: int
    successful_pages: int
    failed_pages: int
    start_time: datetime
    end_time: datetime
    pages_analyzed: list[PageMetadata] = field(default_factory=list)
    notion_page_ids: list[str] = field(default_factory=list)
    summary_page_id: str | None = None

    def get_duration(self) -> str:
        """Total crawl duration formatted as 'Xm Ys'."""
        total_seconds = (self.end_time - self.start_time).total_seconds()
        return f"{int(total_seconds // 60)}m {int(total_seconds % 60)}s"
|
||||
|
||||
|
||||
class SitemapCrawler:
|
||||
"""Crawl sitemap URLs and analyze each page."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
notion_token: str | None = None,
|
||||
database_id: str | None = None,
|
||||
):
|
||||
"""
|
||||
Initialize sitemap crawler.
|
||||
|
||||
Args:
|
||||
notion_token: Notion API token
|
||||
database_id: Notion database ID for storing results
|
||||
"""
|
||||
self.notion_token = notion_token or config.notion_token
|
||||
self.database_id = database_id or DEFAULT_PAGES_DATABASE_ID
|
||||
self.analyzer = PageAnalyzer()
|
||||
|
||||
if self.notion_token:
|
||||
self.notion = Client(auth=self.notion_token)
|
||||
else:
|
||||
self.notion = None
|
||||
logger.warning("Notion token not configured, results will not be saved")
|
||||
|
||||
def fetch_sitemap_urls(self, sitemap_url: str) -> list[str]:
|
||||
"""
|
||||
Fetch and parse URLs from a sitemap.
|
||||
|
||||
Args:
|
||||
sitemap_url: URL of the sitemap
|
||||
|
||||
Returns:
|
||||
List of URLs found in the sitemap
|
||||
"""
|
||||
try:
|
||||
response = requests.get(sitemap_url, timeout=30)
|
||||
response.raise_for_status()
|
||||
|
||||
# Parse XML
|
||||
root = ET.fromstring(response.content)
|
||||
|
||||
# Handle namespace
|
||||
namespaces = {
|
||||
"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"
|
||||
}
|
||||
|
||||
urls = []
|
||||
|
||||
# Check if this is a sitemap index
|
||||
sitemap_tags = root.findall(".//sm:sitemap/sm:loc", namespaces)
|
||||
if sitemap_tags:
|
||||
# This is a sitemap index, recursively fetch child sitemaps
|
||||
logger.info(f"Found sitemap index with {len(sitemap_tags)} child sitemaps")
|
||||
for loc in sitemap_tags:
|
||||
if loc.text:
|
||||
child_urls = self.fetch_sitemap_urls(loc.text)
|
||||
urls.extend(child_urls)
|
||||
else:
|
||||
# Regular sitemap, extract URLs
|
||||
url_tags = root.findall(".//sm:url/sm:loc", namespaces)
|
||||
if not url_tags:
|
||||
# Try without namespace
|
||||
url_tags = root.findall(".//url/loc")
|
||||
|
||||
for loc in url_tags:
|
||||
if loc.text:
|
||||
urls.append(loc.text)
|
||||
|
||||
# Remove duplicates while preserving order
|
||||
seen = set()
|
||||
unique_urls = []
|
||||
for url in urls:
|
||||
if url not in seen:
|
||||
seen.add(url)
|
||||
unique_urls.append(url)
|
||||
|
||||
logger.info(f"Found {len(unique_urls)} unique URLs in sitemap")
|
||||
return unique_urls
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to fetch sitemap: {e}")
|
||||
raise
|
||||
|
||||
    def crawl_sitemap(
        self,
        sitemap_url: str,
        delay: float = DEFAULT_DELAY_SECONDS,
        max_pages: int = DEFAULT_MAX_PAGES,
        progress_callback: Callable[[CrawlProgress], None] | None = None,
        save_to_notion: bool = True,
        url_filter: Callable[[str], bool] | None = None,
    ) -> CrawlResult:
        """
        Crawl all URLs in a sitemap sequentially.

        Progress is persisted to a JSON file after every URL so a separate
        `status` invocation (or the optional callback) can observe the run.
        Per-URL analysis failures are counted but never abort the crawl; any
        other exception marks the job "failed" and is re-raised.

        Args:
            sitemap_url: URL of the sitemap
            delay: Seconds to wait between requests (default: 2.0s)
            max_pages: Maximum number of pages to process (default: 500)
            progress_callback: Function called with progress updates
            save_to_notion: Whether to save results to Notion
            url_filter: Optional function to filter URLs (return True to include)

        Returns:
            CrawlResult with all analyzed pages
        """
        # Parse site info
        parsed_sitemap = urlparse(sitemap_url)
        site = f"{parsed_sitemap.scheme}://{parsed_sitemap.netloc}"
        site_domain = parsed_sitemap.netloc

        # Generate audit ID (timestamped so repeated runs stay distinct)
        audit_id = f"{site_domain}-pages-{datetime.now().strftime('%Y%m%d-%H%M%S')}"

        logger.info(f"Starting sitemap crawl: {sitemap_url}")
        logger.info(f"Audit ID: {audit_id}")
        logger.info(f"Delay between requests: {delay}s")

        # Initialize progress tracking
        progress = CrawlProgress(
            audit_id=audit_id,
            site=site,
            status="running",
        )

        # Fetch URLs
        urls = self.fetch_sitemap_urls(sitemap_url)

        # Apply URL filter if provided
        if url_filter:
            urls = [url for url in urls if url_filter(url)]
            logger.info(f"After filtering: {len(urls)} URLs")

        # Apply max pages limit (default: 500 to prevent excessive resource usage)
        if len(urls) > max_pages:
            logger.warning(f"Sitemap has {len(urls)} URLs, limiting to {max_pages} pages")
            logger.warning(f"Use max_pages parameter to adjust this limit")
            urls = urls[:max_pages]
        logger.info(f"Processing {len(urls)} pages (max: {max_pages})")

        # Update progress with total URLs
        progress.total_urls = len(urls)
        progress.save_to_file()

        # Initialize result (end_time is provisional; overwritten after the loop)
        result = CrawlResult(
            site=site,
            sitemap_url=sitemap_url,
            audit_id=audit_id,
            total_pages=len(urls),
            successful_pages=0,
            failed_pages=0,
            start_time=datetime.now(),
            end_time=datetime.now(),
        )

        # Process each URL
        try:
            for i, url in enumerate(urls):
                progress.current_url = url
                progress.processed_urls = i
                progress.save_to_file()  # Persist so external `status` calls see it

                if progress_callback:
                    progress_callback(progress)

                logger.info(f"[{i+1}/{len(urls)}] Analyzing: {url}")

                try:
                    # Analyze page
                    metadata = self.analyzer.analyze_url(url)
                    result.pages_analyzed.append(metadata)

                    # Only HTTP 200 responses count as successful analyses
                    if metadata.status_code == 200:
                        progress.successful_urls += 1
                        result.successful_pages += 1

                        # Save to Notion (best-effort; helper returns None on failure)
                        if save_to_notion and self.notion:
                            page_id = self._save_page_to_notion(metadata, audit_id, site)
                            if page_id:
                                result.notion_page_ids.append(page_id)
                    else:
                        progress.failed_urls += 1
                        result.failed_pages += 1

                except Exception as e:
                    # Per-URL failures are recorded but do not abort the crawl
                    logger.error(f"Failed to analyze {url}: {e}")
                    progress.failed_urls += 1
                    result.failed_pages += 1

                # Wait before next request (polite crawl delay)
                if i < len(urls) - 1:  # Don't wait after last URL
                    time.sleep(delay)

            # Final progress update
            progress.processed_urls = len(urls)
            progress.status = "completed"
            if progress_callback:
                progress_callback(progress)

        except Exception as e:
            # Anything escaping the loop (e.g. KeyboardInterrupt is NOT caught
            # here — only Exception subclasses) marks the job failed on disk.
            progress.status = "failed"
            progress.error_message = str(e)
            progress.save_to_file()
            raise

        # Update result
        result.end_time = datetime.now()

        # Create summary page
        if save_to_notion and self.notion:
            summary_id = self._create_crawl_summary_page(result)
            result.summary_page_id = summary_id
            progress.summary_page_id = summary_id

        # Save final progress
        progress.save_to_file()

        logger.info(f"Crawl complete: {result.successful_pages}/{result.total_pages} pages analyzed")
        logger.info(f"Duration: {result.get_duration()}")

        return result
|
||||
|
||||
    def _save_page_to_notion(
        self,
        metadata: PageMetadata,
        audit_id: str,
        site: str,
    ) -> str | None:
        """Save page metadata to the Notion database.

        Creates one database entry per analyzed URL; entries of the same run
        share the ``Audit ID`` property so they can be filtered together.

        Args:
            metadata: Analysis result for a single page.
            audit_id: Identifier tying this entry to the current crawl.
            site: Root URL (scheme://host) of the audited site.

        Returns:
            The created Notion page ID, or None when the API call fails
            (failures are logged; they never abort the crawl loop).
        """
        try:
            # Build properties
            properties = {
                "Issue": {"title": [{"text": {"content": f"📄 {metadata.url}"}}]},
                "Category": {"select": {"name": "On-page SEO"}},
                "Priority": {"select": {"name": self._determine_priority(metadata)}},
                "Site": {"url": site},
                "URL": {"url": metadata.url},
                "Audit ID": {"rich_text": [{"text": {"content": audit_id}}]},
                "Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
            }

            # Build page content (body blocks: tables, callouts, to-dos)
            children = self._build_page_content(metadata)

            response = self.notion.pages.create(
                parent={"database_id": self.database_id},
                properties=properties,
                children=children,
            )

            return response["id"]

        except Exception as e:
            # Best-effort persistence: log and continue crawling
            logger.error(f"Failed to save to Notion: {e}")
            return None
|
||||
|
||||
def _determine_priority(self, metadata: PageMetadata) -> str:
|
||||
"""Determine priority based on issues found."""
|
||||
if len(metadata.issues) >= 3:
|
||||
return "High"
|
||||
elif len(metadata.issues) >= 1:
|
||||
return "Medium"
|
||||
elif len(metadata.warnings) >= 3:
|
||||
return "Medium"
|
||||
else:
|
||||
return "Low"
|
||||
|
||||
    def _build_page_content(self, metadata: PageMetadata) -> list[dict]:
        """Build Notion page content blocks from metadata.

        Assembles the body of a per-page entry in fixed section order:
        status callout, meta-tag table, headings, structured data, Open
        Graph, links, images, hreflang (if present), and issues/warnings
        as unchecked to-do items.

        Args:
            metadata: Analysis result for a single page.

        Returns:
            List of Notion block objects suitable for ``pages.create(children=...)``.
        """
        children = []

        # Status summary callout — emoji/color reflect issue severity:
        # none -> gray ✅, 1-2 issues -> yellow ⚠️, 3+ -> red ❌
        status_emoji = "✅" if not metadata.issues else "⚠️" if len(metadata.issues) < 3 else "❌"
        children.append({
            "object": "block",
            "type": "callout",
            "callout": {
                "rich_text": [
                    {"type": "text", "text": {"content": f"Status: {metadata.status_code} | "}},
                    {"type": "text", "text": {"content": f"Response: {metadata.response_time_ms:.0f}ms | "}},
                    {"type": "text", "text": {"content": f"Issues: {len(metadata.issues)} | "}},
                    {"type": "text", "text": {"content": f"Warnings: {len(metadata.warnings)}"}},
                ],
                "icon": {"type": "emoji", "emoji": status_emoji},
                "color": "gray_background" if not metadata.issues else "yellow_background" if len(metadata.issues) < 3 else "red_background",
            }
        })

        # Meta Tags Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Meta Tags"}}]}
        })

        # Meta tags table (values truncated to 50 chars to keep cells compact)
        meta_rows = [
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Tag"}, "annotations": {"bold": True}}],
                [{"type": "text", "text": {"content": "Value"}, "annotations": {"bold": True}}],
                [{"type": "text", "text": {"content": "Status"}, "annotations": {"bold": True}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Title"}}],
                [{"type": "text", "text": {"content": (metadata.title or "—")[:50]}}],
                [{"type": "text", "text": {"content": f"✓ {metadata.title_length} chars" if metadata.title else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Description"}}],
                [{"type": "text", "text": {"content": (metadata.meta_description or "—")[:50]}}],
                [{"type": "text", "text": {"content": f"✓ {metadata.meta_description_length} chars" if metadata.meta_description else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Canonical"}}],
                [{"type": "text", "text": {"content": (metadata.canonical_url or "—")[:50]}}],
                [{"type": "text", "text": {"content": "✓" if metadata.canonical_url else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Robots"}}],
                [{"type": "text", "text": {"content": metadata.robots_meta or "—"}}],
                [{"type": "text", "text": {"content": "✓" if metadata.robots_meta else "—"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Lang"}}],
                [{"type": "text", "text": {"content": metadata.html_lang or "—"}}],
                [{"type": "text", "text": {"content": "✓" if metadata.html_lang else "—"}}],
            ]}},
        ]

        children.append({
            "object": "block",
            "type": "table",
            "table": {
                "table_width": 3,
                "has_column_header": True,
                "has_row_header": False,
                "children": meta_rows
            }
        })

        # Headings Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Headings"}}]}
        })

        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"H1: {metadata.h1_count} | "}},
                {"type": "text", "text": {"content": f"Total headings: {len(metadata.headings)}"}},
            ]}
        })

        # The first H1's text (truncated) shown as a quote, when present
        if metadata.h1_text:
            children.append({
                "object": "block",
                "type": "quote",
                "quote": {"rich_text": [{"type": "text", "text": {"content": metadata.h1_text[:200]}}]}
            })

        # Schema Data Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Structured Data"}}]}
        })

        if metadata.schema_types_found:
            children.append({
                "object": "block",
                "type": "paragraph",
                "paragraph": {"rich_text": [
                    {"type": "text", "text": {"content": "Schema types found: "}},
                    {"type": "text", "text": {"content": ", ".join(metadata.schema_types_found)}, "annotations": {"code": True}},
                ]}
            })
        else:
            # Missing structured data is surfaced as a warning callout
            children.append({
                "object": "block",
                "type": "callout",
                "callout": {
                    "rich_text": [{"type": "text", "text": {"content": "No structured data found on this page"}}],
                    "icon": {"type": "emoji", "emoji": "⚠️"},
                    "color": "yellow_background",
                }
            })

        # Open Graph Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Open Graph"}}]}
        })

        og = metadata.open_graph
        # og:title presence is used as the proxy for "OG configured at all"
        og_status = "✓ Configured" if og.og_title else "✗ Missing"
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Status: {og_status}\n"}},
                {"type": "text", "text": {"content": f"og:title: {og.og_title or '—'}\n"}},
                {"type": "text", "text": {"content": f"og:type: {og.og_type or '—'}"}},
            ]}
        })

        # Links Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Links"}}]}
        })

        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Internal links: {metadata.internal_link_count}\n"}},
                {"type": "text", "text": {"content": f"External links: {metadata.external_link_count}"}},
            ]}
        })

        # Images Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Images"}}]}
        })

        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Total: {metadata.images_total} | "}},
                {"type": "text", "text": {"content": f"With alt: {metadata.images_with_alt} | "}},
                {"type": "text", "text": {"content": f"Without alt: {metadata.images_without_alt}"}},
            ]}
        })

        # Hreflang Section (if present) — capped at 10 entries to limit block count
        if metadata.hreflang_tags:
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Hreflang Tags"}}]}
            })

            for tag in metadata.hreflang_tags[:10]:
                children.append({
                    "object": "block",
                    "type": "bulleted_list_item",
                    "bulleted_list_item": {"rich_text": [
                        {"type": "text", "text": {"content": f"{tag['lang']}: "}},
                        {"type": "text", "text": {"content": tag['url'], "link": {"url": tag['url']}}},
                    ]}
                })

        # Issues & Warnings Section — rendered as unchecked to-do items so
        # they can be ticked off as they are fixed
        if metadata.issues or metadata.warnings:
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Issues & Warnings"}}]}
            })

            for issue in metadata.issues:
                children.append({
                    "object": "block",
                    "type": "to_do",
                    "to_do": {
                        "rich_text": [
                            {"type": "text", "text": {"content": "❌ "}, "annotations": {"bold": True}},
                            {"type": "text", "text": {"content": issue}},
                        ],
                        "checked": False,
                    }
                })

            for warning in metadata.warnings:
                children.append({
                    "object": "block",
                    "type": "to_do",
                    "to_do": {
                        "rich_text": [
                            {"type": "text", "text": {"content": "⚠️ "}, "annotations": {"bold": True}},
                            {"type": "text", "text": {"content": warning}},
                        ],
                        "checked": False,
                    }
                })

        return children
|
||||
|
||||
    def _create_crawl_summary_page(self, result: CrawlResult) -> str | None:
        """Create a summary page for the crawl.

        Aggregates per-page statistics into a single "📊 Sitemap Crawl"
        entry in the same database, sharing the crawl's ``Audit ID``.

        Args:
            result: Completed crawl result with all analyzed pages.

        Returns:
            The created Notion page ID, or None if the API call failed.
        """
        try:
            site_domain = urlparse(result.site).netloc

            # Calculate statistics across all analyzed pages
            total_issues = sum(len(p.issues) for p in result.pages_analyzed)
            total_warnings = sum(len(p.warnings) for p in result.pages_analyzed)
            pages_with_issues = sum(1 for p in result.pages_analyzed if p.issues)
            pages_without_schema = sum(1 for p in result.pages_analyzed if not p.schema_types_found)
            pages_without_description = sum(1 for p in result.pages_analyzed if not p.meta_description)

            children = []

            # Header callout
            children.append({
                "object": "block",
                "type": "callout",
                "callout": {
                    "rich_text": [
                        {"type": "text", "text": {"content": f"Sitemap Crawl Complete\n\n"}},
                        {"type": "text", "text": {"content": f"Audit ID: {result.audit_id}\n"}},
                        {"type": "text", "text": {"content": f"Duration: {result.get_duration()}\n"}},
                        {"type": "text", "text": {"content": f"Pages: {result.successful_pages}/{result.total_pages}"}},
                    ],
                    "icon": {"type": "emoji", "emoji": "📊"},
                    "color": "blue_background",
                }
            })

            # Statistics table
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Statistics"}}]}
            })

            stats_rows = [
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Metric"}, "annotations": {"bold": True}}],
                    [{"type": "text", "text": {"content": "Count"}, "annotations": {"bold": True}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Pages"}}],
                    [{"type": "text", "text": {"content": str(result.total_pages)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Successfully Analyzed"}}],
                    [{"type": "text", "text": {"content": str(result.successful_pages)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages with Issues"}}],
                    [{"type": "text", "text": {"content": str(pages_with_issues)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Issues"}}],
                    [{"type": "text", "text": {"content": str(total_issues)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Warnings"}}],
                    [{"type": "text", "text": {"content": str(total_warnings)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages without Schema"}}],
                    [{"type": "text", "text": {"content": str(pages_without_schema)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages without Description"}}],
                    [{"type": "text", "text": {"content": str(pages_without_description)}}],
                ]}},
            ]

            children.append({
                "object": "block",
                "type": "table",
                "table": {
                    "table_width": 2,
                    "has_column_header": True,
                    "has_row_header": False,
                    "children": stats_rows
                }
            })

            # Pages list — pointer only; the per-page entries live in the
            # same database keyed by Audit ID
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Analyzed Pages"}}]}
            })

            children.append({
                "object": "block",
                "type": "paragraph",
                "paragraph": {"rich_text": [
                    {"type": "text", "text": {"content": f"Filter by Audit ID in the database to see all {result.successful_pages} page entries."}}
                ]}
            })

            # Create the summary page
            response = self.notion.pages.create(
                parent={"database_id": self.database_id},
                properties={
                    "Issue": {"title": [{"text": {"content": f"📊 Sitemap Crawl: {site_domain}"}}]},
                    "Category": {"select": {"name": "Technical SEO"}},
                    "Priority": {"select": {"name": "High"}},
                    "Site": {"url": result.site},
                    "Audit ID": {"rich_text": [{"text": {"content": result.audit_id}}]},
                    "Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
                },
                children=children,
            )

            logger.info(f"Created crawl summary page: {response['id']}")
            return response["id"]

        except Exception as e:
            # Best-effort: the crawl result is still returned without a summary
            logger.error(f"Failed to create summary page: {e}")
            return None
|
||||
|
||||
|
||||
def print_progress_status(progress: CrawlProgress) -> None:
    """Print formatted progress status for one crawl job to stdout.

    Used by the ``status`` CLI subcommand. ETA is only meaningful while the
    job is still running, so it is shown as 'N/A' otherwise.
    """
    # Emoji badge per job state; unknown states fall back to ❓
    status_emoji = {
        "running": "🔄",
        "completed": "✅",
        "failed": "❌",
    }.get(progress.status, "❓")

    print(f"""
{'=' * 60}
{status_emoji} SEO Page Analysis - {progress.status.upper()}
{'=' * 60}
Audit ID: {progress.audit_id}
Site: {progress.site}
Status: {progress.status}

Progress: {progress.processed_urls}/{progress.total_urls} pages ({progress.get_progress_percent():.1f}%)
Successful: {progress.successful_urls}
Failed: {progress.failed_urls}
Elapsed: {progress.get_elapsed_time()}
ETA: {progress.get_eta() if progress.status == 'running' else 'N/A'}

Current URL: {progress.current_url[:60] + '...' if len(progress.current_url) > 60 else progress.current_url}
""")

    # Dashes are stripped to form the canonical Notion page URL
    if progress.summary_page_id:
        print(f"Summary: https://www.notion.so/{progress.summary_page_id.replace('-', '')}")

    if progress.error_message:
        print(f"Error: {progress.error_message}")

    print("=" * 60)
|
||||
|
||||
|
||||
def main():
    """CLI entry point.

    Subcommands:
        crawl <sitemap_url>   Start crawling a sitemap.
        status [audit_id]     Show one crawl's progress, or list active ones
                              (``--all`` lists every crawl).
        list                  Tabular listing of all known crawl jobs.

    Convenience: invoking the script with a bare URL (starting with "http"
    or ending in ".xml") and no subcommand is treated as ``crawl <url>``
    with default options.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Sitemap Crawler with Background Support")
    subparsers = parser.add_subparsers(dest="command", help="Commands")

    # Crawl command
    crawl_parser = subparsers.add_parser("crawl", help="Start crawling a sitemap")
    crawl_parser.add_argument("sitemap_url", help="URL of the sitemap to crawl")
    crawl_parser.add_argument("--delay", "-d", type=float, default=DEFAULT_DELAY_SECONDS,
                              help=f"Delay between requests in seconds (default: {DEFAULT_DELAY_SECONDS})")
    crawl_parser.add_argument("--max-pages", "-m", type=int, default=DEFAULT_MAX_PAGES,
                              help=f"Maximum pages to process (default: {DEFAULT_MAX_PAGES})")
    crawl_parser.add_argument("--no-notion", action="store_true",
                              help="Don't save to Notion")
    crawl_parser.add_argument("--no-limit", action="store_true",
                              help="Remove page limit (use with caution)")

    # Status command
    status_parser = subparsers.add_parser("status", help="Check crawl progress")
    status_parser.add_argument("audit_id", nargs="?", help="Specific audit ID to check (optional)")
    status_parser.add_argument("--all", "-a", action="store_true", help="Show all crawls (not just active)")

    # List command (takes no extra arguments, so the parser object is not kept)
    subparsers.add_parser("list", help="List all crawl jobs")

    args = parser.parse_args()

    # Default to crawl if no command specified but a URL-looking positional
    # argument was provided; otherwise show help.
    if args.command is None:
        import sys
        if len(sys.argv) > 1 and (sys.argv[1].startswith("http") or sys.argv[1].endswith(".xml")):
            args.command = "crawl"
            args.sitemap_url = sys.argv[1]
            # Fill in the defaults the crawl subparser would have supplied
            args.delay = DEFAULT_DELAY_SECONDS
            args.max_pages = DEFAULT_MAX_PAGES
            args.no_notion = False
            args.no_limit = False
        else:
            parser.print_help()
            return

    if args.command == "status":
        if args.audit_id:
            # Show specific crawl status
            progress = get_crawl_status(args.audit_id)
            if progress:
                print_progress_status(progress)
            else:
                print(f"No crawl found with audit ID: {args.audit_id}")
        else:
            # Show active crawls (or every crawl with --all)
            if args.all:
                crawls = get_all_crawls()
                label = "All"
            else:
                crawls = get_active_crawls()
                label = "Active"

            if crawls:
                print(f"\n{label} Crawl Jobs ({len(crawls)}):")
                print("-" * 60)
                for p in crawls:
                    status_emoji = {"running": "🔄", "completed": "✅", "failed": "❌"}.get(p.status, "❓")
                    print(f"{status_emoji} {p.audit_id}")
                    print(f"  Site: {p.site}")
                    print(f"  Progress: {p.processed_urls}/{p.total_urls} ({p.get_progress_percent():.1f}%)")
                    print()
            else:
                print(f"No {label.lower()} crawl jobs found.")
        return

    if args.command == "list":
        crawls = get_all_crawls()
        if crawls:
            print(f"\nAll Crawl Jobs ({len(crawls)}):")
            print("-" * 80)
            print(f"{'Status':<10} {'Audit ID':<45} {'Progress':<15}")
            print("-" * 80)
            for p in crawls[:20]:  # Show last 20
                status_emoji = {"running": "🔄", "completed": "✅", "failed": "❌"}.get(p.status, "❓")
                progress_str = f"{p.processed_urls}/{p.total_urls}"
                print(f"{status_emoji} {p.status:<7} {p.audit_id:<45} {progress_str:<15}")
            if len(crawls) > 20:
                print(f"... and {len(crawls) - 20} more")
        else:
            print("No crawl jobs found.")
        return

    if args.command == "crawl":
        # Handle --no-limit option
        max_pages = args.max_pages
        if args.no_limit:
            max_pages = 999999  # Effectively unlimited
            print("⚠️ WARNING: Page limit disabled. This may take a very long time!")

        def progress_callback(progress: CrawlProgress):
            # Single-line, carriage-return-updated progress display
            pct = progress.get_progress_percent()
            print(f"\r[{pct:5.1f}%] {progress.processed_urls}/{progress.total_urls} pages | "
                  f"Success: {progress.successful_urls} | Failed: {progress.failed_urls} | "
                  f"ETA: {progress.get_eta()}", end="", flush=True)

        crawler = SitemapCrawler()
        result = crawler.crawl_sitemap(
            args.sitemap_url,
            delay=args.delay,
            max_pages=max_pages,
            progress_callback=progress_callback,
            save_to_notion=not args.no_notion,
        )

        print()  # New line after progress
        print()
        print("=" * 60)
        print("CRAWL COMPLETE")
        print("=" * 60)
        print(f"Audit ID: {result.audit_id}")
        print(f"Total Pages: {result.total_pages}")
        print(f"Successful: {result.successful_pages}")
        print(f"Failed: {result.failed_pages}")
        print(f"Duration: {result.get_duration()}")
        if result.summary_page_id:
            print(f"Summary Page: https://www.notion.so/{result.summary_page_id.replace('-', '')}")
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,467 @@
|
||||
"""
|
||||
Sitemap Validator - Validate XML sitemaps
|
||||
==========================================
|
||||
Purpose: Parse and validate XML sitemaps for SEO compliance
|
||||
Python: 3.10+
|
||||
Usage:
|
||||
python sitemap_validator.py --url https://example.com/sitemap.xml
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import gzip
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from io import BytesIO
|
||||
from typing import Any
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import aiohttp
|
||||
import requests
|
||||
from lxml import etree
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class SitemapIssue:
    """Represents a single sitemap validation issue."""

    severity: str  # one of "error", "warning", "info"
    message: str  # human-readable description of the finding
    url: str | None = None  # offending URL, when the issue is entry-specific
    suggestion: str | None = None  # optional remediation hint
|
||||
|
||||
|
||||
@dataclass
class SitemapEntry:
    """Represents a single <url> entry in a sitemap."""

    loc: str  # the URL itself (<loc>)
    lastmod: str | None = None  # raw <lastmod> text, not parsed to a date
    changefreq: str | None = None  # lowercased <changefreq> value
    priority: float | None = None  # parsed <priority>; None when unparseable
    status_code: int | None = None  # filled in only if URLs are HTTP-checked
|
||||
|
||||
@dataclass
class SitemapResult:
    """Complete sitemap validation result."""

    url: str  # sitemap URL that was validated
    sitemap_type: str  # "urlset" or "sitemapindex"
    entries: list[SitemapEntry] = field(default_factory=list)
    child_sitemaps: list[str] = field(default_factory=list)
    issues: list[SitemapIssue] = field(default_factory=list)
    valid: bool = True
    stats: dict = field(default_factory=dict)
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())

    def to_dict(self) -> dict:
        """Serialize the result into a JSON-friendly dictionary."""
        issue_dicts = []
        for issue in self.issues:
            issue_dicts.append({
                "severity": issue.severity,
                "message": issue.message,
                "url": issue.url,
                "suggestion": issue.suggestion,
            })
        return {
            "url": self.url,
            "sitemap_type": self.sitemap_type,
            "valid": self.valid,
            "stats": self.stats,
            "issues": issue_dicts,
            "entries_count": len(self.entries),
            "child_sitemaps": self.child_sitemaps,
            "timestamp": self.timestamp,
        }
|
||||
|
||||
|
||||
class SitemapValidator:
    """Validate XML sitemaps for structural and protocol compliance."""

    SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
    # Size/count caps — these match the limits in the sitemaps.org protocol
    MAX_URLS = 50000
    MAX_SIZE_BYTES = 50 * 1024 * 1024  # 50MB

    # Allowed <changefreq> values per the sitemap protocol
    VALID_CHANGEFREQ = {
        "always", "hourly", "daily", "weekly",
        "monthly", "yearly", "never"
    }
|
||||
def __init__(self, check_urls: bool = False, max_concurrent: int = 10):
|
||||
self.check_urls = check_urls
|
||||
self.max_concurrent = max_concurrent
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update({
|
||||
"User-Agent": "Mozilla/5.0 (compatible; SEOAuditBot/1.0)"
|
||||
})
|
||||
|
||||
    def fetch_sitemap(self, url: str) -> tuple[bytes, bool]:
        """Fetch sitemap content, handling gzip compression.

        Args:
            url: Sitemap URL (may point at a ``.gz`` archive).

        Returns:
            Tuple of (XML bytes, whether gzip decompression was applied).

        Raises:
            RuntimeError: wraps any underlying ``requests`` failure.
        """
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()

            content = response.content
            is_gzipped = False

            # Check if gzipped: the URL advertises .gz, or the server reports
            # gzip encoding. NOTE(review): requests usually decodes
            # Content-Encoding: gzip transparently already, so this branch is
            # a defensive fallback for raw .gz payloads.
            if url.endswith(".gz") or response.headers.get(
                "Content-Encoding"
            ) == "gzip":
                try:
                    content = gzip.decompress(content)
                    is_gzipped = True
                except gzip.BadGzipFile:
                    # Deliberate best-effort: keep the original bytes as-is.
                    pass

            return content, is_gzipped
        except requests.RequestException as e:
            raise RuntimeError(f"Failed to fetch sitemap: {e}")
|
||||
|
||||
def parse_sitemap(self, content: bytes) -> tuple[str, list[dict]]:
|
||||
"""Parse sitemap XML content."""
|
||||
try:
|
||||
root = etree.fromstring(content)
|
||||
except etree.XMLSyntaxError as e:
|
||||
raise ValueError(f"Invalid XML: {e}")
|
||||
|
||||
# Remove namespace for easier parsing
|
||||
nsmap = {"sm": self.SITEMAP_NS}
|
||||
|
||||
# Check if it's a sitemap index or urlset
|
||||
if root.tag == f"{{{self.SITEMAP_NS}}}sitemapindex":
|
||||
sitemap_type = "sitemapindex"
|
||||
entries = []
|
||||
for sitemap in root.findall("sm:sitemap", nsmap):
|
||||
entry = {}
|
||||
loc = sitemap.find("sm:loc", nsmap)
|
||||
if loc is not None and loc.text:
|
||||
entry["loc"] = loc.text.strip()
|
||||
lastmod = sitemap.find("sm:lastmod", nsmap)
|
||||
if lastmod is not None and lastmod.text:
|
||||
entry["lastmod"] = lastmod.text.strip()
|
||||
if entry.get("loc"):
|
||||
entries.append(entry)
|
||||
elif root.tag == f"{{{self.SITEMAP_NS}}}urlset":
|
||||
sitemap_type = "urlset"
|
||||
entries = []
|
||||
for url in root.findall("sm:url", nsmap):
|
||||
entry = {}
|
||||
loc = url.find("sm:loc", nsmap)
|
||||
if loc is not None and loc.text:
|
||||
entry["loc"] = loc.text.strip()
|
||||
lastmod = url.find("sm:lastmod", nsmap)
|
||||
if lastmod is not None and lastmod.text:
|
||||
entry["lastmod"] = lastmod.text.strip()
|
||||
changefreq = url.find("sm:changefreq", nsmap)
|
||||
if changefreq is not None and changefreq.text:
|
||||
entry["changefreq"] = changefreq.text.strip().lower()
|
||||
priority = url.find("sm:priority", nsmap)
|
||||
if priority is not None and priority.text:
|
||||
try:
|
||||
entry["priority"] = float(priority.text.strip())
|
||||
except ValueError:
|
||||
entry["priority"] = None
|
||||
if entry.get("loc"):
|
||||
entries.append(entry)
|
||||
else:
|
||||
raise ValueError(f"Unknown sitemap type: {root.tag}")
|
||||
|
||||
return sitemap_type, entries
|
||||
|
||||
def validate(self, url: str) -> SitemapResult:
|
||||
"""Validate a sitemap URL."""
|
||||
result = SitemapResult(url=url, sitemap_type="unknown")
|
||||
|
||||
# Fetch sitemap
|
||||
try:
|
||||
content, is_gzipped = self.fetch_sitemap(url)
|
||||
except RuntimeError as e:
|
||||
result.issues.append(SitemapIssue(
|
||||
severity="error",
|
||||
message=str(e),
|
||||
url=url,
|
||||
))
|
||||
result.valid = False
|
||||
return result
|
||||
|
||||
# Check size
|
||||
if len(content) > self.MAX_SIZE_BYTES:
|
||||
result.issues.append(SitemapIssue(
|
||||
severity="error",
|
||||
message=f"Sitemap exceeds 50MB limit ({len(content) / 1024 / 1024:.2f}MB)",
|
||||
url=url,
|
||||
suggestion="Split sitemap into smaller files using sitemap index",
|
||||
))
|
||||
|
||||
# Parse XML
|
||||
try:
|
||||
sitemap_type, entries = self.parse_sitemap(content)
|
||||
except ValueError as e:
|
||||
result.issues.append(SitemapIssue(
|
||||
severity="error",
|
||||
message=str(e),
|
||||
url=url,
|
||||
))
|
||||
result.valid = False
|
||||
return result
|
||||
|
||||
result.sitemap_type = sitemap_type
|
||||
|
||||
# Process entries
|
||||
if sitemap_type == "sitemapindex":
|
||||
result.child_sitemaps = [e["loc"] for e in entries]
|
||||
result.stats = {
|
||||
"child_sitemaps_count": len(entries),
|
||||
}
|
||||
else:
|
||||
# Validate URL entries
|
||||
url_count = len(entries)
|
||||
result.stats["url_count"] = url_count
|
||||
|
||||
if url_count > self.MAX_URLS:
|
||||
result.issues.append(SitemapIssue(
|
||||
severity="error",
|
||||
message=f"Sitemap exceeds 50,000 URL limit ({url_count} URLs)",
|
||||
url=url,
|
||||
suggestion="Split into multiple sitemaps with sitemap index",
|
||||
))
|
||||
|
||||
if url_count == 0:
|
||||
result.issues.append(SitemapIssue(
|
||||
severity="warning",
|
||||
message="Sitemap is empty (no URLs)",
|
||||
url=url,
|
||||
))
|
||||
|
||||
# Validate individual entries
|
||||
seen_urls = set()
|
||||
invalid_lastmod = 0
|
||||
invalid_changefreq = 0
|
||||
invalid_priority = 0
|
||||
|
||||
for entry in entries:
|
||||
loc = entry.get("loc", "")
|
||||
|
||||
# Check for duplicates
|
||||
if loc in seen_urls:
|
||||
result.issues.append(SitemapIssue(
|
||||
severity="warning",
|
||||
message="Duplicate URL in sitemap",
|
||||
url=loc,
|
||||
))
|
||||
seen_urls.add(loc)
|
||||
|
||||
# Validate lastmod format
|
||||
lastmod = entry.get("lastmod")
|
||||
if lastmod:
|
||||
if not self._validate_date(lastmod):
|
||||
invalid_lastmod += 1
|
||||
|
||||
# Validate changefreq
|
||||
changefreq = entry.get("changefreq")
|
||||
if changefreq and changefreq not in self.VALID_CHANGEFREQ:
|
||||
invalid_changefreq += 1
|
||||
|
||||
# Validate priority
|
||||
priority = entry.get("priority")
|
||||
if priority is not None:
|
||||
if not (0.0 <= priority <= 1.0):
|
||||
invalid_priority += 1
|
||||
|
||||
# Create entry object
|
||||
result.entries.append(SitemapEntry(
|
||||
loc=loc,
|
||||
lastmod=lastmod,
|
||||
changefreq=changefreq,
|
||||
priority=priority,
|
||||
))
|
||||
|
||||
# Add summary issues
|
||||
if invalid_lastmod > 0:
|
||||
result.issues.append(SitemapIssue(
|
||||
severity="warning",
|
||||
message=f"{invalid_lastmod} URLs with invalid lastmod format",
|
||||
suggestion="Use ISO 8601 format (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS+TZ)",
|
||||
))
|
||||
|
||||
if invalid_changefreq > 0:
|
||||
result.issues.append(SitemapIssue(
|
||||
severity="info",
|
||||
message=f"{invalid_changefreq} URLs with invalid changefreq",
|
||||
suggestion="Use: always, hourly, daily, weekly, monthly, yearly, never",
|
||||
))
|
||||
|
||||
if invalid_priority > 0:
|
||||
result.issues.append(SitemapIssue(
|
||||
severity="warning",
|
||||
message=f"{invalid_priority} URLs with invalid priority (must be 0.0-1.0)",
|
||||
))
|
||||
|
||||
result.stats.update({
|
||||
"invalid_lastmod": invalid_lastmod,
|
||||
"invalid_changefreq": invalid_changefreq,
|
||||
"invalid_priority": invalid_priority,
|
||||
"has_lastmod": sum(1 for e in result.entries if e.lastmod),
|
||||
"has_changefreq": sum(1 for e in result.entries if e.changefreq),
|
||||
"has_priority": sum(1 for e in result.entries if e.priority is not None),
|
||||
})
|
||||
|
||||
# Check URLs if requested
|
||||
if self.check_urls and result.entries:
|
||||
asyncio.run(self._check_url_status(result))
|
||||
|
||||
# Determine validity
|
||||
result.valid = not any(i.severity == "error" for i in result.issues)
|
||||
|
||||
return result
|
||||
|
||||
def _validate_date(self, date_str: str) -> bool:
|
||||
"""Validate ISO 8601 date format."""
|
||||
patterns = [
|
||||
r"^\d{4}-\d{2}-\d{2}$",
|
||||
r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}",
|
||||
]
|
||||
return any(re.match(p, date_str) for p in patterns)
|
||||
|
||||
async def _check_url_status(self, result: SitemapResult) -> None:
|
||||
"""Check HTTP status of URLs in sitemap."""
|
||||
semaphore = asyncio.Semaphore(self.max_concurrent)
|
||||
|
||||
async def check_url(entry: SitemapEntry) -> None:
|
||||
async with semaphore:
|
||||
try:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.head(
|
||||
entry.loc,
|
||||
timeout=aiohttp.ClientTimeout(total=10),
|
||||
allow_redirects=True,
|
||||
) as response:
|
||||
entry.status_code = response.status
|
||||
except Exception:
|
||||
entry.status_code = 0
|
||||
|
||||
await asyncio.gather(*[check_url(e) for e in result.entries[:100]])
|
||||
|
||||
# Count status codes
|
||||
status_counts = {}
|
||||
for entry in result.entries:
|
||||
if entry.status_code:
|
||||
status_counts[entry.status_code] = (
|
||||
status_counts.get(entry.status_code, 0) + 1
|
||||
)
|
||||
|
||||
result.stats["url_status_codes"] = status_counts
|
||||
|
||||
# Add issues for non-200 URLs
|
||||
error_count = sum(
|
||||
1 for e in result.entries
|
||||
if e.status_code and e.status_code >= 400
|
||||
)
|
||||
if error_count > 0:
|
||||
result.issues.append(SitemapIssue(
|
||||
severity="warning",
|
||||
message=f"{error_count} URLs returning error status codes (4xx/5xx)",
|
||||
suggestion="Remove or fix broken URLs in sitemap",
|
||||
))
|
||||
|
||||
def generate_report(self, result: SitemapResult) -> str:
|
||||
"""Generate human-readable validation report."""
|
||||
lines = [
|
||||
"=" * 60,
|
||||
"Sitemap Validation Report",
|
||||
"=" * 60,
|
||||
f"URL: {result.url}",
|
||||
f"Type: {result.sitemap_type}",
|
||||
f"Valid: {'Yes' if result.valid else 'No'}",
|
||||
f"Timestamp: {result.timestamp}",
|
||||
"",
|
||||
]
|
||||
|
||||
lines.append("Statistics:")
|
||||
for key, value in result.stats.items():
|
||||
lines.append(f" {key}: {value}")
|
||||
lines.append("")
|
||||
|
||||
if result.child_sitemaps:
|
||||
lines.append(f"Child Sitemaps ({len(result.child_sitemaps)}):")
|
||||
for sitemap in result.child_sitemaps[:10]:
|
||||
lines.append(f" - {sitemap}")
|
||||
if len(result.child_sitemaps) > 10:
|
||||
lines.append(f" ... and {len(result.child_sitemaps) - 10} more")
|
||||
lines.append("")
|
||||
|
||||
if result.issues:
|
||||
lines.append("Issues Found:")
|
||||
errors = [i for i in result.issues if i.severity == "error"]
|
||||
warnings = [i for i in result.issues if i.severity == "warning"]
|
||||
infos = [i for i in result.issues if i.severity == "info"]
|
||||
|
||||
if errors:
|
||||
lines.append(f"\n ERRORS ({len(errors)}):")
|
||||
for issue in errors:
|
||||
lines.append(f" - {issue.message}")
|
||||
if issue.url:
|
||||
lines.append(f" URL: {issue.url}")
|
||||
if issue.suggestion:
|
||||
lines.append(f" Suggestion: {issue.suggestion}")
|
||||
|
||||
if warnings:
|
||||
lines.append(f"\n WARNINGS ({len(warnings)}):")
|
||||
for issue in warnings:
|
||||
lines.append(f" - {issue.message}")
|
||||
if issue.suggestion:
|
||||
lines.append(f" Suggestion: {issue.suggestion}")
|
||||
|
||||
if infos:
|
||||
lines.append(f"\n INFO ({len(infos)}):")
|
||||
for issue in infos:
|
||||
lines.append(f" - {issue.message}")
|
||||
|
||||
lines.append("")
|
||||
lines.append("=" * 60)
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def main():
    """CLI entry point: validate one sitemap and emit a text or JSON report."""
    parser = argparse.ArgumentParser(
        description="Validate XML sitemaps",
    )
    parser.add_argument("--url", "-u", required=True, help="Sitemap URL to validate")
    parser.add_argument("--check-urls", action="store_true",
                        help="Check HTTP status of URLs (slower)")
    parser.add_argument("--output", "-o", help="Output file for JSON report")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    opts = parser.parse_args()

    validator = SitemapValidator(check_urls=opts.check_urls)
    result = validator.validate(opts.url)

    # Human-readable report unless JSON was requested (explicitly or via -o).
    if not (opts.json or opts.output):
        print(validator.generate_report(result))
        return

    payload = json.dumps(result.to_dict(), ensure_ascii=False, indent=2)
    if opts.output:
        with open(opts.output, "w", encoding="utf-8") as fh:
            fh.write(payload)
        logger.info(f"Report written to {opts.output}")
    else:
        print(payload)
|
||||
|
||||
|
||||
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user