refactor(skills): Restructure skills to dual-platform architecture
Major refactoring of ourdigital-custom-skills with new numbering system: ## Structure Changes - Each skill now has code/ (Claude Code) and desktop/ (Claude Desktop) versions - New progressive numbering: 01-09 General, 10-19 SEO, 20-29 GTM, 30-39 OurDigital, 40-49 Jamie ## Skill Reorganization - 01-notion-organizer (from 02) - 10-18: SEO tools split into focused skills (technical, on-page, local, schema, vitals, gsc, gateway) - 20-21: GTM audit and manager - 30-32: OurDigital designer, research, presentation - 40-41: Jamie brand editor and audit ## New Files - .claude/commands/: Slash command definitions for all skills - CLAUDE.md: Updated with new skill structure documentation - REFACTORING_PLAN.md: Migration documentation - COMPATIBILITY_REPORT.md, SKILLS_COMPARISON.md: Analysis docs ## Removed - Old skill directories (02-05, 10-14, 20-21 old numbering) - Consolidated into new structure with _archive/ for reference 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,969 @@
|
||||
"""
|
||||
Sitemap Crawler - Sequential page analysis from sitemap
|
||||
=======================================================
|
||||
Purpose: Crawl sitemap URLs one by one, analyze each page, save to Notion
|
||||
Python: 3.10+
|
||||
Usage:
|
||||
from sitemap_crawler import SitemapCrawler
|
||||
crawler = SitemapCrawler()
|
||||
crawler.crawl_sitemap("https://example.com/sitemap.xml", delay=2.0)
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
import xml.etree.ElementTree as ET
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Callable, Generator
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
from notion_client import Client
|
||||
|
||||
from base_client import config
|
||||
from page_analyzer import PageAnalyzer, PageMetadata
|
||||
|
||||
# Module-level logging: timestamped INFO-level messages to stderr.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Default database for page analysis data
# NOTE(review): hard-coded Notion database UUID — presumably the shared
# "SEO audit" database; confirm it matches the target workspace.
DEFAULT_PAGES_DATABASE_ID = "2c8581e5-8a1e-8035-880b-e38cefc2f3ef"

# Default limits to prevent excessive resource usage
DEFAULT_MAX_PAGES = 500          # cap on pages crawled per run
DEFAULT_DELAY_SECONDS = 2.0      # politeness delay between page fetches

# Progress tracking directory.
# Progress JSON files (one per audit ID) live here so that a separate
# process can poll crawl status. The directory is created eagerly at
# import time — a deliberate side effect so later writes cannot fail on
# a missing directory.
PROGRESS_DIR = Path.home() / ".claude" / "seo-audit-progress"
PROGRESS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
@dataclass
|
||||
class CrawlProgress:
|
||||
"""Track crawl progress."""
|
||||
total_urls: int = 0
|
||||
processed_urls: int = 0
|
||||
successful_urls: int = 0
|
||||
failed_urls: int = 0
|
||||
skipped_urls: int = 0
|
||||
start_time: datetime = field(default_factory=datetime.now)
|
||||
current_url: str = ""
|
||||
audit_id: str = ""
|
||||
site: str = ""
|
||||
status: str = "running" # running, completed, failed
|
||||
error_message: str = ""
|
||||
summary_page_id: str = ""
|
||||
|
||||
def get_progress_percent(self) -> float:
|
||||
if self.total_urls == 0:
|
||||
return 0.0
|
||||
return (self.processed_urls / self.total_urls) * 100
|
||||
|
||||
def get_elapsed_time(self) -> str:
|
||||
elapsed = datetime.now() - self.start_time
|
||||
minutes = int(elapsed.total_seconds() // 60)
|
||||
seconds = int(elapsed.total_seconds() % 60)
|
||||
return f"{minutes}m {seconds}s"
|
||||
|
||||
def get_eta(self) -> str:
|
||||
if self.processed_urls == 0:
|
||||
return "calculating..."
|
||||
elapsed = (datetime.now() - self.start_time).total_seconds()
|
||||
avg_time_per_url = elapsed / self.processed_urls
|
||||
remaining_urls = self.total_urls - self.processed_urls
|
||||
eta_seconds = remaining_urls * avg_time_per_url
|
||||
minutes = int(eta_seconds // 60)
|
||||
seconds = int(eta_seconds % 60)
|
||||
return f"{minutes}m {seconds}s"
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Convert to dictionary for JSON serialization."""
|
||||
return {
|
||||
"audit_id": self.audit_id,
|
||||
"site": self.site,
|
||||
"status": self.status,
|
||||
"total_urls": self.total_urls,
|
||||
"processed_urls": self.processed_urls,
|
||||
"successful_urls": self.successful_urls,
|
||||
"failed_urls": self.failed_urls,
|
||||
"progress_percent": round(self.get_progress_percent(), 1),
|
||||
"elapsed_time": self.get_elapsed_time(),
|
||||
"eta": self.get_eta(),
|
||||
"current_url": self.current_url,
|
||||
"start_time": self.start_time.isoformat(),
|
||||
"error_message": self.error_message,
|
||||
"summary_page_id": self.summary_page_id,
|
||||
"updated_at": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
def save_to_file(self, filepath: Path | None = None) -> Path:
|
||||
"""Save progress to JSON file."""
|
||||
if filepath is None:
|
||||
filepath = PROGRESS_DIR / f"{self.audit_id}.json"
|
||||
with open(filepath, "w") as f:
|
||||
json.dump(self.to_dict(), f, indent=2)
|
||||
return filepath
|
||||
|
||||
@classmethod
|
||||
def load_from_file(cls, filepath: Path) -> "CrawlProgress":
|
||||
"""Load progress from JSON file."""
|
||||
with open(filepath, "r") as f:
|
||||
data = json.load(f)
|
||||
progress = cls()
|
||||
progress.audit_id = data.get("audit_id", "")
|
||||
progress.site = data.get("site", "")
|
||||
progress.status = data.get("status", "unknown")
|
||||
progress.total_urls = data.get("total_urls", 0)
|
||||
progress.processed_urls = data.get("processed_urls", 0)
|
||||
progress.successful_urls = data.get("successful_urls", 0)
|
||||
progress.failed_urls = data.get("failed_urls", 0)
|
||||
progress.current_url = data.get("current_url", "")
|
||||
progress.error_message = data.get("error_message", "")
|
||||
progress.summary_page_id = data.get("summary_page_id", "")
|
||||
if data.get("start_time"):
|
||||
progress.start_time = datetime.fromisoformat(data["start_time"])
|
||||
return progress
|
||||
|
||||
|
||||
def get_active_crawls() -> list[CrawlProgress]:
    """Return every crawl job whose persisted status is "running"."""
    running: list[CrawlProgress] = []
    for progress_file in PROGRESS_DIR.glob("*.json"):
        try:
            job = CrawlProgress.load_from_file(progress_file)
        except Exception:
            # Unreadable or corrupt progress files are silently skipped.
            continue
        if job.status == "running":
            running.append(job)
    return running
|
||||
|
||||
|
||||
def get_all_crawls() -> list[CrawlProgress]:
    """Return every recorded crawl job (any status), newest file name first."""
    jobs: list[CrawlProgress] = []
    # File names start with "<domain>-pages-<timestamp>", so a reverse
    # lexicographic sort puts the most recent audits first.
    for progress_file in sorted(PROGRESS_DIR.glob("*.json"), reverse=True):
        try:
            jobs.append(CrawlProgress.load_from_file(progress_file))
        except Exception:
            # Skip files that cannot be parsed.
            continue
    return jobs
|
||||
|
||||
|
||||
def get_crawl_status(audit_id: str) -> CrawlProgress | None:
    """Look up one crawl's progress by audit ID, or None if unknown."""
    progress_file = PROGRESS_DIR / f"{audit_id}.json"
    if not progress_file.exists():
        return None
    return CrawlProgress.load_from_file(progress_file)
|
||||
|
||||
|
||||
@dataclass
class CrawlResult:
    """Outcome of one complete sitemap crawl."""

    # Identity of the crawl
    site: str
    sitemap_url: str
    audit_id: str
    # Aggregate counts
    total_pages: int
    successful_pages: int
    failed_pages: int
    # Timing
    start_time: datetime
    end_time: datetime
    # Collected artifacts
    pages_analyzed: list[PageMetadata] = field(default_factory=list)
    notion_page_ids: list[str] = field(default_factory=list)
    summary_page_id: str | None = None

    def get_duration(self) -> str:
        """Return the crawl's wall-clock duration formatted as "Xm Ys"."""
        total = (self.end_time - self.start_time).total_seconds()
        return f"{int(total // 60)}m {int(total % 60)}s"
|
||||
|
||||
|
||||
class SitemapCrawler:
    """Crawl sitemap URLs and analyze each page.

    Pages are fetched sequentially (with a politeness delay), analyzed by
    PageAnalyzer, and — when a Notion client is configured — each page plus
    a final summary is written to a Notion database.
    """

    def __init__(
        self,
        notion_token: str | None = None,
        database_id: str | None = None,
    ):
        """
        Initialize sitemap crawler.

        Args:
            notion_token: Notion API token (falls back to config.notion_token)
            database_id: Notion database ID for storing results
                (falls back to DEFAULT_PAGES_DATABASE_ID)
        """
        self.notion_token = notion_token or config.notion_token
        self.database_id = database_id or DEFAULT_PAGES_DATABASE_ID
        self.analyzer = PageAnalyzer()

        if self.notion_token:
            self.notion = Client(auth=self.notion_token)
        else:
            # No token: self.notion stays None and every Notion write
            # downstream is skipped (crawling itself still works).
            self.notion = None
            logger.warning("Notion token not configured, results will not be saved")

    def fetch_sitemap_urls(self, sitemap_url: str) -> list[str]:
        """
        Fetch and parse URLs from a sitemap.

        Sitemap indexes are followed recursively; duplicate URLs are
        removed while preserving first-seen order.

        Args:
            sitemap_url: URL of the sitemap

        Returns:
            List of URLs found in the sitemap

        Raises:
            Exception: re-raises any fetch/parse failure after logging it.
        """
        try:
            response = requests.get(sitemap_url, timeout=30)
            response.raise_for_status()

            # Parse XML
            root = ET.fromstring(response.content)

            # Handle namespace (standard sitemaps.org schema)
            namespaces = {
                "sm": "http://www.sitemaps.org/schemas/sitemap/0.9"
            }

            urls = []

            # Check if this is a sitemap index
            sitemap_tags = root.findall(".//sm:sitemap/sm:loc", namespaces)
            if sitemap_tags:
                # This is a sitemap index, recursively fetch child sitemaps
                logger.info(f"Found sitemap index with {len(sitemap_tags)} child sitemaps")
                for loc in sitemap_tags:
                    if loc.text:
                        child_urls = self.fetch_sitemap_urls(loc.text)
                        urls.extend(child_urls)
            else:
                # Regular sitemap, extract URLs
                url_tags = root.findall(".//sm:url/sm:loc", namespaces)
                if not url_tags:
                    # Try without namespace (some generators omit it)
                    url_tags = root.findall(".//url/loc")

                for loc in url_tags:
                    if loc.text:
                        urls.append(loc.text)

            # Remove duplicates while preserving order
            seen = set()
            unique_urls = []
            for url in urls:
                if url not in seen:
                    seen.add(url)
                    unique_urls.append(url)

            logger.info(f"Found {len(unique_urls)} unique URLs in sitemap")
            return unique_urls

        except Exception as e:
            logger.error(f"Failed to fetch sitemap: {e}")
            raise

    def crawl_sitemap(
        self,
        sitemap_url: str,
        delay: float = DEFAULT_DELAY_SECONDS,
        max_pages: int = DEFAULT_MAX_PAGES,
        progress_callback: Callable[[CrawlProgress], None] | None = None,
        save_to_notion: bool = True,
        url_filter: Callable[[str], bool] | None = None,
    ) -> CrawlResult:
        """
        Crawl all URLs in a sitemap sequentially.

        Progress is persisted to a JSON file after every page so other
        processes can poll it (see get_crawl_status).

        Args:
            sitemap_url: URL of the sitemap
            delay: Seconds to wait between requests (default: 2.0s)
            max_pages: Maximum number of pages to process (default: 500)
            progress_callback: Function called with progress updates
            save_to_notion: Whether to save results to Notion
            url_filter: Optional function to filter URLs (return True to include)

        Returns:
            CrawlResult with all analyzed pages

        Raises:
            Exception: re-raised after marking the progress file "failed".
        """
        # Parse site info
        parsed_sitemap = urlparse(sitemap_url)
        site = f"{parsed_sitemap.scheme}://{parsed_sitemap.netloc}"
        site_domain = parsed_sitemap.netloc

        # Generate audit ID (unique per run: domain + timestamp)
        audit_id = f"{site_domain}-pages-{datetime.now().strftime('%Y%m%d-%H%M%S')}"

        logger.info(f"Starting sitemap crawl: {sitemap_url}")
        logger.info(f"Audit ID: {audit_id}")
        logger.info(f"Delay between requests: {delay}s")

        # Initialize progress tracking
        progress = CrawlProgress(
            audit_id=audit_id,
            site=site,
            status="running",
        )

        # Fetch URLs
        urls = self.fetch_sitemap_urls(sitemap_url)

        # Apply URL filter if provided
        if url_filter:
            urls = [url for url in urls if url_filter(url)]
            logger.info(f"After filtering: {len(urls)} URLs")

        # Apply max pages limit (default: 500 to prevent excessive resource usage)
        if len(urls) > max_pages:
            logger.warning(f"Sitemap has {len(urls)} URLs, limiting to {max_pages} pages")
            logger.warning(f"Use max_pages parameter to adjust this limit")
            urls = urls[:max_pages]
            logger.info(f"Processing {len(urls)} pages (max: {max_pages})")

        # Update progress with total URLs
        progress.total_urls = len(urls)
        progress.save_to_file()

        # Initialize result (end_time is provisional; overwritten after the loop)
        result = CrawlResult(
            site=site,
            sitemap_url=sitemap_url,
            audit_id=audit_id,
            total_pages=len(urls),
            successful_pages=0,
            failed_pages=0,
            start_time=datetime.now(),
            end_time=datetime.now(),
        )

        # Process each URL
        try:
            for i, url in enumerate(urls):
                progress.current_url = url
                # NOTE: processed_urls lags by one inside the loop (set to i
                # before page i is analyzed); the final update below corrects it.
                progress.processed_urls = i
                progress.save_to_file()  # Save progress to file

                if progress_callback:
                    progress_callback(progress)

                logger.info(f"[{i+1}/{len(urls)}] Analyzing: {url}")

                try:
                    # Analyze page
                    metadata = self.analyzer.analyze_url(url)
                    result.pages_analyzed.append(metadata)

                    if metadata.status_code == 200:
                        progress.successful_urls += 1
                        result.successful_pages += 1

                        # Save to Notion (only successful pages are persisted)
                        if save_to_notion and self.notion:
                            page_id = self._save_page_to_notion(metadata, audit_id, site)
                            if page_id:
                                result.notion_page_ids.append(page_id)
                    else:
                        # Non-200 responses count as failures.
                        progress.failed_urls += 1
                        result.failed_pages += 1

                except Exception as e:
                    # A single bad page must not abort the whole crawl.
                    logger.error(f"Failed to analyze {url}: {e}")
                    progress.failed_urls += 1
                    result.failed_pages += 1

                # Wait before next request (politeness delay)
                if i < len(urls) - 1:  # Don't wait after last URL
                    time.sleep(delay)

            # Final progress update
            progress.processed_urls = len(urls)
            progress.status = "completed"
            if progress_callback:
                progress_callback(progress)

        except Exception as e:
            # Unexpected failure outside per-page handling (e.g. in
            # progress persistence): record it, then propagate.
            progress.status = "failed"
            progress.error_message = str(e)
            progress.save_to_file()
            raise

        # Update result
        result.end_time = datetime.now()

        # Create summary page
        if save_to_notion and self.notion:
            summary_id = self._create_crawl_summary_page(result)
            result.summary_page_id = summary_id
            progress.summary_page_id = summary_id

        # Save final progress
        progress.save_to_file()

        logger.info(f"Crawl complete: {result.successful_pages}/{result.total_pages} pages analyzed")
        logger.info(f"Duration: {result.get_duration()}")

        return result

    def _save_page_to_notion(
        self,
        metadata: PageMetadata,
        audit_id: str,
        site: str,
    ) -> str | None:
        """Save page metadata to Notion database.

        Returns the created Notion page ID, or None on failure (errors are
        logged, never raised, so one bad write cannot abort the crawl).
        """
        try:
            # Build properties — names must match the target database schema
            # ("Issue" is the title property).
            properties = {
                "Issue": {"title": [{"text": {"content": f"📄 {metadata.url}"}}]},
                "Category": {"select": {"name": "On-page SEO"}},
                "Priority": {"select": {"name": self._determine_priority(metadata)}},
                "Site": {"url": site},
                "URL": {"url": metadata.url},
                "Audit ID": {"rich_text": [{"text": {"content": audit_id}}]},
                "Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
            }

            # Build page content
            children = self._build_page_content(metadata)

            response = self.notion.pages.create(
                parent={"database_id": self.database_id},
                properties=properties,
                children=children,
            )

            return response["id"]

        except Exception as e:
            logger.error(f"Failed to save to Notion: {e}")
            return None

    def _determine_priority(self, metadata: PageMetadata) -> str:
        """Determine priority based on issues found.

        3+ issues -> High; any issue or 3+ warnings -> Medium; else Low.
        """
        if len(metadata.issues) >= 3:
            return "High"
        elif len(metadata.issues) >= 1:
            return "Medium"
        elif len(metadata.warnings) >= 3:
            return "Medium"
        else:
            return "Low"

    def _build_page_content(self, metadata: PageMetadata) -> list[dict]:
        """Build Notion page content blocks from metadata.

        Returns a list of Notion block objects: a status callout, then
        sections for meta tags, headings, structured data, Open Graph,
        links, images, optional hreflang tags, and issues/warnings.
        """
        children = []

        # Status summary callout: green when clean, yellow for <3 issues,
        # red for 3 or more.
        status_emoji = "✅" if not metadata.issues else "⚠️" if len(metadata.issues) < 3 else "❌"
        children.append({
            "object": "block",
            "type": "callout",
            "callout": {
                "rich_text": [
                    {"type": "text", "text": {"content": f"Status: {metadata.status_code} | "}},
                    {"type": "text", "text": {"content": f"Response: {metadata.response_time_ms:.0f}ms | "}},
                    {"type": "text", "text": {"content": f"Issues: {len(metadata.issues)} | "}},
                    {"type": "text", "text": {"content": f"Warnings: {len(metadata.warnings)}"}},
                ],
                "icon": {"type": "emoji", "emoji": status_emoji},
                "color": "gray_background" if not metadata.issues else "yellow_background" if len(metadata.issues) < 3 else "red_background",
            }
        })

        # Meta Tags Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Meta Tags"}}]}
        })

        # Meta tags table — first row is the bold header; long values are
        # truncated to 50 chars to keep the table readable.
        meta_rows = [
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Tag"}, "annotations": {"bold": True}}],
                [{"type": "text", "text": {"content": "Value"}, "annotations": {"bold": True}}],
                [{"type": "text", "text": {"content": "Status"}, "annotations": {"bold": True}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Title"}}],
                [{"type": "text", "text": {"content": (metadata.title or "—")[:50]}}],
                [{"type": "text", "text": {"content": f"✓ {metadata.title_length} chars" if metadata.title else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Description"}}],
                [{"type": "text", "text": {"content": (metadata.meta_description or "—")[:50]}}],
                [{"type": "text", "text": {"content": f"✓ {metadata.meta_description_length} chars" if metadata.meta_description else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Canonical"}}],
                [{"type": "text", "text": {"content": (metadata.canonical_url or "—")[:50]}}],
                [{"type": "text", "text": {"content": "✓" if metadata.canonical_url else "✗ Missing"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Robots"}}],
                [{"type": "text", "text": {"content": metadata.robots_meta or "—"}}],
                [{"type": "text", "text": {"content": "✓" if metadata.robots_meta else "—"}}],
            ]}},
            {"type": "table_row", "table_row": {"cells": [
                [{"type": "text", "text": {"content": "Lang"}}],
                [{"type": "text", "text": {"content": metadata.html_lang or "—"}}],
                [{"type": "text", "text": {"content": "✓" if metadata.html_lang else "—"}}],
            ]}},
        ]

        children.append({
            "object": "block",
            "type": "table",
            "table": {
                "table_width": 3,
                "has_column_header": True,
                "has_row_header": False,
                "children": meta_rows
            }
        })

        # Headings Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Headings"}}]}
        })

        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"H1: {metadata.h1_count} | "}},
                {"type": "text", "text": {"content": f"Total headings: {len(metadata.headings)}"}},
            ]}
        })

        if metadata.h1_text:
            # Show the (truncated) H1 text as a quote block.
            children.append({
                "object": "block",
                "type": "quote",
                "quote": {"rich_text": [{"type": "text", "text": {"content": metadata.h1_text[:200]}}]}
            })

        # Schema Data Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Structured Data"}}]}
        })

        if metadata.schema_types_found:
            children.append({
                "object": "block",
                "type": "paragraph",
                "paragraph": {"rich_text": [
                    {"type": "text", "text": {"content": "Schema types found: "}},
                    {"type": "text", "text": {"content": ", ".join(metadata.schema_types_found)}, "annotations": {"code": True}},
                ]}
            })
        else:
            # Missing structured data is surfaced as a warning callout.
            children.append({
                "object": "block",
                "type": "callout",
                "callout": {
                    "rich_text": [{"type": "text", "text": {"content": "No structured data found on this page"}}],
                    "icon": {"type": "emoji", "emoji": "⚠️"},
                    "color": "yellow_background",
                }
            })

        # Open Graph Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Open Graph"}}]}
        })

        og = metadata.open_graph
        # og:title presence is used as the proxy for "OG configured at all".
        og_status = "✓ Configured" if og.og_title else "✗ Missing"
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Status: {og_status}\n"}},
                {"type": "text", "text": {"content": f"og:title: {og.og_title or '—'}\n"}},
                {"type": "text", "text": {"content": f"og:type: {og.og_type or '—'}"}},
            ]}
        })

        # Links Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Links"}}]}
        })

        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Internal links: {metadata.internal_link_count}\n"}},
                {"type": "text", "text": {"content": f"External links: {metadata.external_link_count}"}},
            ]}
        })

        # Images Section
        children.append({
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Images"}}]}
        })

        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [
                {"type": "text", "text": {"content": f"Total: {metadata.images_total} | "}},
                {"type": "text", "text": {"content": f"With alt: {metadata.images_with_alt} | "}},
                {"type": "text", "text": {"content": f"Without alt: {metadata.images_without_alt}"}},
            ]}
        })

        # Hreflang Section (if present)
        if metadata.hreflang_tags:
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Hreflang Tags"}}]}
            })

            # Cap at 10 entries to keep the page (and API payload) small.
            for tag in metadata.hreflang_tags[:10]:
                children.append({
                    "object": "block",
                    "type": "bulleted_list_item",
                    "bulleted_list_item": {"rich_text": [
                        {"type": "text", "text": {"content": f"{tag['lang']}: "}},
                        {"type": "text", "text": {"content": tag['url'], "link": {"url": tag['url']}}},
                    ]}
                })

        # Issues & Warnings Section — rendered as unchecked to-dos so they
        # can be ticked off in Notion as they are fixed.
        if metadata.issues or metadata.warnings:
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Issues & Warnings"}}]}
            })

            for issue in metadata.issues:
                children.append({
                    "object": "block",
                    "type": "to_do",
                    "to_do": {
                        "rich_text": [
                            {"type": "text", "text": {"content": "❌ "}, "annotations": {"bold": True}},
                            {"type": "text", "text": {"content": issue}},
                        ],
                        "checked": False,
                    }
                })

            for warning in metadata.warnings:
                children.append({
                    "object": "block",
                    "type": "to_do",
                    "to_do": {
                        "rich_text": [
                            {"type": "text", "text": {"content": "⚠️ "}, "annotations": {"bold": True}},
                            {"type": "text", "text": {"content": warning}},
                        ],
                        "checked": False,
                    }
                })

        return children

    def _create_crawl_summary_page(self, result: CrawlResult) -> str | None:
        """Create a summary page for the crawl.

        Writes one Notion page with aggregate statistics for the whole run.
        Returns the page ID, or None on failure (errors are logged, not raised).
        """
        try:
            site_domain = urlparse(result.site).netloc

            # Calculate statistics across all analyzed pages
            total_issues = sum(len(p.issues) for p in result.pages_analyzed)
            total_warnings = sum(len(p.warnings) for p in result.pages_analyzed)
            pages_with_issues = sum(1 for p in result.pages_analyzed if p.issues)
            pages_without_schema = sum(1 for p in result.pages_analyzed if not p.schema_types_found)
            pages_without_description = sum(1 for p in result.pages_analyzed if not p.meta_description)

            children = []

            # Header callout
            children.append({
                "object": "block",
                "type": "callout",
                "callout": {
                    "rich_text": [
                        {"type": "text", "text": {"content": f"Sitemap Crawl Complete\n\n"}},
                        {"type": "text", "text": {"content": f"Audit ID: {result.audit_id}\n"}},
                        {"type": "text", "text": {"content": f"Duration: {result.get_duration()}\n"}},
                        {"type": "text", "text": {"content": f"Pages: {result.successful_pages}/{result.total_pages}"}},
                    ],
                    "icon": {"type": "emoji", "emoji": "📊"},
                    "color": "blue_background",
                }
            })

            # Statistics table
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Statistics"}}]}
            })

            stats_rows = [
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Metric"}, "annotations": {"bold": True}}],
                    [{"type": "text", "text": {"content": "Count"}, "annotations": {"bold": True}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Pages"}}],
                    [{"type": "text", "text": {"content": str(result.total_pages)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Successfully Analyzed"}}],
                    [{"type": "text", "text": {"content": str(result.successful_pages)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages with Issues"}}],
                    [{"type": "text", "text": {"content": str(pages_with_issues)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Issues"}}],
                    [{"type": "text", "text": {"content": str(total_issues)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Total Warnings"}}],
                    [{"type": "text", "text": {"content": str(total_warnings)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages without Schema"}}],
                    [{"type": "text", "text": {"content": str(pages_without_schema)}}],
                ]}},
                {"type": "table_row", "table_row": {"cells": [
                    [{"type": "text", "text": {"content": "Pages without Description"}}],
                    [{"type": "text", "text": {"content": str(pages_without_description)}}],
                ]}},
            ]

            children.append({
                "object": "block",
                "type": "table",
                "table": {
                    "table_width": 2,
                    "has_column_header": True,
                    "has_row_header": False,
                    "children": stats_rows
                }
            })

            # Pages list — individual page entries live in the same database;
            # point the reader at the Audit ID filter instead of duplicating them.
            children.append({
                "object": "block",
                "type": "heading_2",
                "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Analyzed Pages"}}]}
            })

            children.append({
                "object": "block",
                "type": "paragraph",
                "paragraph": {"rich_text": [
                    {"type": "text", "text": {"content": f"Filter by Audit ID in the database to see all {result.successful_pages} page entries."}}
                ]}
            })

            # Create the summary page
            response = self.notion.pages.create(
                parent={"database_id": self.database_id},
                properties={
                    "Issue": {"title": [{"text": {"content": f"📊 Sitemap Crawl: {site_domain}"}}]},
                    "Category": {"select": {"name": "Technical SEO"}},
                    "Priority": {"select": {"name": "High"}},
                    "Site": {"url": result.site},
                    "Audit ID": {"rich_text": [{"text": {"content": result.audit_id}}]},
                    "Found Date": {"date": {"start": datetime.now().strftime("%Y-%m-%d")}},
                },
                children=children,
            )

            logger.info(f"Created crawl summary page: {response['id']}")
            return response["id"]

        except Exception as e:
            logger.error(f"Failed to create summary page: {e}")
            return None
|
||||
|
||||
|
||||
def print_progress_status(progress: CrawlProgress) -> None:
    """Print a human-readable status report for one crawl job."""
    icons = {
        "running": "🔄",
        "completed": "✅",
        "failed": "❌",
    }
    status_emoji = icons.get(progress.status, "❓")

    # Pre-compute the variable pieces so the report template stays readable.
    eta_display = progress.get_eta() if progress.status == 'running' else 'N/A'
    if len(progress.current_url) > 60:
        url_display = progress.current_url[:60] + '...'
    else:
        url_display = progress.current_url
    divider = '=' * 60

    print(f"""
{divider}
{status_emoji} SEO Page Analysis - {progress.status.upper()}
{divider}
Audit ID: {progress.audit_id}
Site: {progress.site}
Status: {progress.status}

Progress: {progress.processed_urls}/{progress.total_urls} pages ({progress.get_progress_percent():.1f}%)
Successful: {progress.successful_urls}
Failed: {progress.failed_urls}
Elapsed: {progress.get_elapsed_time()}
ETA: {eta_display}

Current URL: {url_display}
""")

    if progress.summary_page_id:
        # Notion page URLs use the ID without dashes.
        print(f"Summary: https://www.notion.so/{progress.summary_page_id.replace('-', '')}")

    if progress.error_message:
        print(f"Error: {progress.error_message}")

    print("=" * 60)
|
||||
|
||||
|
||||
def main():
    """CLI entry point for the sitemap crawler.

    Subcommands:
        crawl   Start crawling a sitemap (a bare URL argument also works).
        status  Show progress for one audit ID, or all active crawls.
        list    List all known crawl jobs (most recent 20 shown).

    Reads arguments from ``sys.argv``; prints human-readable progress and
    results to stdout. Returns None.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Sitemap Crawler with Background Support")
    subparsers = parser.add_subparsers(dest="command", help="Commands")

    # Crawl command
    crawl_parser = subparsers.add_parser("crawl", help="Start crawling a sitemap")
    crawl_parser.add_argument("sitemap_url", help="URL of the sitemap to crawl")
    crawl_parser.add_argument("--delay", "-d", type=float, default=DEFAULT_DELAY_SECONDS,
                              help=f"Delay between requests in seconds (default: {DEFAULT_DELAY_SECONDS})")
    crawl_parser.add_argument("--max-pages", "-m", type=int, default=DEFAULT_MAX_PAGES,
                              help=f"Maximum pages to process (default: {DEFAULT_MAX_PAGES})")
    crawl_parser.add_argument("--no-notion", action="store_true",
                              help="Don't save to Notion")
    crawl_parser.add_argument("--no-limit", action="store_true",
                              help="Remove page limit (use with caution)")

    # Status command
    status_parser = subparsers.add_parser("status", help="Check crawl progress")
    status_parser.add_argument("audit_id", nargs="?", help="Specific audit ID to check (optional)")
    status_parser.add_argument("--all", "-a", action="store_true", help="Show all crawls (not just active)")

    # List command (takes no extra arguments, so the parser object is not kept)
    subparsers.add_parser("list", help="List all crawl jobs")

    args = parser.parse_args()

    # Convenience: a bare URL/xml-path argument is treated as an implicit
    # "crawl" command so `script.py https://site/sitemap.xml` just works.
    if args.command is None:
        if len(sys.argv) > 1 and (sys.argv[1].startswith("http") or sys.argv[1].endswith(".xml")):
            args.command = "crawl"
            args.sitemap_url = sys.argv[1]
            args.delay = DEFAULT_DELAY_SECONDS
            args.max_pages = DEFAULT_MAX_PAGES
            args.no_notion = False
            args.no_limit = False
        else:
            parser.print_help()
            return

    if args.command == "status":
        if args.audit_id:
            # Show one specific crawl's status.
            progress = get_crawl_status(args.audit_id)
            if progress:
                print_progress_status(progress)
            else:
                print(f"No crawl found with audit ID: {args.audit_id}")
        else:
            # Show active crawls by default; --all widens to every crawl.
            if args.all:
                crawls = get_all_crawls()
                label = "All"
            else:
                crawls = get_active_crawls()
                label = "Active"

            if crawls:
                print(f"\n{label} Crawl Jobs ({len(crawls)}):")
                print("-" * 60)
                for p in crawls:
                    status_emoji = {"running": "🔄", "completed": "✅", "failed": "❌"}.get(p.status, "❓")
                    print(f"{status_emoji} {p.audit_id}")
                    print(f"   Site: {p.site}")
                    print(f"   Progress: {p.processed_urls}/{p.total_urls} ({p.get_progress_percent():.1f}%)")
                    print()
            else:
                print(f"No {label.lower()} crawl jobs found.")
        return

    if args.command == "list":
        crawls = get_all_crawls()
        if crawls:
            print(f"\nAll Crawl Jobs ({len(crawls)}):")
            print("-" * 80)
            print(f"{'Status':<10} {'Audit ID':<45} {'Progress':<15}")
            print("-" * 80)
            for p in crawls[:20]:  # Show last 20
                status_emoji = {"running": "🔄", "completed": "✅", "failed": "❌"}.get(p.status, "❓")
                progress_str = f"{p.processed_urls}/{p.total_urls}"
                print(f"{status_emoji} {p.status:<7} {p.audit_id:<45} {progress_str:<15}")
            if len(crawls) > 20:
                print(f"... and {len(crawls) - 20} more")
        else:
            print("No crawl jobs found.")
        return

    if args.command == "crawl":
        # Handle --no-limit option
        max_pages = args.max_pages
        if args.no_limit:
            max_pages = 999999  # Effectively unlimited
            print("⚠️  WARNING: Page limit disabled. This may take a very long time!")

        def progress_callback(progress: CrawlProgress):
            # Single-line, carriage-return progress display with ETA.
            pct = progress.get_progress_percent()
            print(f"\r[{pct:5.1f}%] {progress.processed_urls}/{progress.total_urls} pages | "
                  f"Success: {progress.successful_urls} | Failed: {progress.failed_urls} | "
                  f"ETA: {progress.get_eta()}", end="", flush=True)

        crawler = SitemapCrawler()
        result = crawler.crawl_sitemap(
            args.sitemap_url,
            delay=args.delay,
            max_pages=max_pages,
            progress_callback=progress_callback,
            save_to_notion=not args.no_notion,
        )

        print()  # New line after progress
        print()
        print("=" * 60)
        print("CRAWL COMPLETE")
        print("=" * 60)
        print(f"Audit ID: {result.audit_id}")
        print(f"Total Pages: {result.total_pages}")
        print(f"Successful: {result.successful_pages}")
        print(f"Failed: {result.failed_pages}")
        print(f"Duration: {result.get_duration()}")
        if result.summary_page_id:
            print(f"Summary Page: https://www.notion.so/{result.summary_page_id.replace('-', '')}")
||||
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user