""" Report Aggregator - Collect and normalize outputs from all SEO skills ===================================================================== Purpose: Scan for recent audit outputs from skills 11-33, normalize data formats, merge findings by domain/date, compute cross-skill health scores, and identify top-priority issues across all audits. Python: 3.10+ Usage: python report_aggregator.py --domain https://example.com --json python report_aggregator.py --domain https://example.com --output-dir ./audit_outputs --json python report_aggregator.py --domain https://example.com --from 2025-01-01 --to 2025-03-31 --json python report_aggregator.py --domain https://example.com --json --output report.json """ import argparse import asyncio import json import logging import os import sys from dataclasses import dataclass, field, asdict from datetime import datetime, date from pathlib import Path from typing import Any from urllib.parse import urlparse from base_client import BaseAsyncClient, config logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Skill registry # --------------------------------------------------------------------------- SKILL_REGISTRY = { 11: {"name": "comprehensive-audit", "category": "comprehensive", "weight": 1.0}, 12: {"name": "technical-audit", "category": "technical", "weight": 0.20}, 13: {"name": "on-page-audit", "category": "on_page", "weight": 0.20}, 14: {"name": "core-web-vitals", "category": "performance", "weight": 0.25}, 15: {"name": "search-console", "category": "search_console", "weight": 0.10}, 16: {"name": "schema-validator", "category": "schema", "weight": 0.15}, 17: {"name": "schema-generator", "category": "schema", "weight": 0.10}, 18: {"name": "local-audit", "category": "local", "weight": 0.10}, 19: {"name": "keyword-strategy", "category": "keywords", "weight": 0.15}, 20: {"name": "serp-analysis", "category": "keywords", "weight": 0.10}, 21: {"name": "position-tracking", "category": "keywords", "weight": 0.15}, 22: {"name": "link-building", "category": "links", "weight": 0.15}, 23: {"name": "content-strategy", "category": "content", "weight": 0.15}, 24: {"name": "ecommerce-seo", "category": "ecommerce", "weight": 0.10}, 25: {"name": "kpi-framework", "category": "kpi", "weight": 0.20}, 26: {"name": "international-seo", "category": "international", "weight": 0.10}, 27: {"name": "ai-visibility", "category": "ai_search", "weight": 0.10}, 28: {"name": "knowledge-graph", "category": "entity_seo", "weight": 0.10}, 31: {"name": "competitor-intel", "category": "competitor", "weight": 0.15}, 32: {"name": "crawl-budget", "category": "technical", "weight": 0.10}, 33: {"name": "migration-planner", "category": "migration", "weight": 0.10}, } CATEGORY_WEIGHTS = { "technical": 0.20, "on_page": 0.15, "performance": 0.15, "content": 0.10, "links": 0.10, "local": 0.05, "keywords": 0.10, "competitor": 0.05, "schema": 0.05, "kpi": 0.05, "comprehensive": 1.0, "search_console": 0.05, "ecommerce": 0.05, "international": 0.05, "ai_search": 0.05, "entity_seo": 0.05, "migration": 0.05, } # --------------------------------------------------------------------------- # Data classes # --------------------------------------------------------------------------- @dataclass class SkillOutput: """Normalized output from a single SEO skill.""" skill_id: int = 0 skill_name: str = "" domain: str = "" audit_date: str = "" category: str = "" data: dict[str, Any] = field(default_factory=dict) health_score: float = 0.0 issues: list[dict[str, Any]] = field(default_factory=list) wins: list[dict[str, Any]] = field(default_factory=list) source_file: str = "" @dataclass class AggregatedReport: """Full aggregated report from all SEO skill outputs.""" domain: str = "" report_date: str = "" skills_included: list[dict[str, Any]] = field(default_factory=list) overall_health: float = 0.0 health_trend: str = "stable" category_scores: dict[str, float] = field(default_factory=dict) top_issues: list[dict[str, Any]] = field(default_factory=list) top_wins: list[dict[str, Any]] = field(default_factory=list) timeline: list[dict[str, Any]] = field(default_factory=list) raw_outputs: list[dict[str, Any]] = field(default_factory=list) audit_id: str = "" timestamp: str = "" errors: list[str] = field(default_factory=list) # --------------------------------------------------------------------------- # Aggregator # --------------------------------------------------------------------------- class ReportAggregator(BaseAsyncClient): """Aggregate outputs from all SEO skills into unified reports.""" NOTION_DB_ID = "2c8581e5-8a1e-8035-880b-e38cefc2f3ef" def __init__(self): super().__init__(max_concurrent=5, requests_per_second=2.0) @staticmethod def _extract_domain(url: str) -> str: """Extract bare domain from URL or return as-is if already bare.""" if "://" in url: parsed = urlparse(url) return parsed.netloc.lower().replace("www.", "") return url.lower().replace("www.", "") @staticmethod def _generate_audit_id() -> str: """Generate audit ID in DASH-YYYYMMDD-NNN format.""" now = datetime.now() return f"DASH-{now.strftime('%Y%m%d')}-001" def scan_local_outputs( self, output_dir: str, domain: str | None = None, date_from: str | None = None, date_to: str | None = None, ) -> list[SkillOutput]: """Find JSON output files from other SEO skills in a directory. Scans for files matching patterns from skills 11-33 and parses them into normalized SkillOutput objects. """ outputs: list[SkillOutput] = [] output_path = Path(output_dir) if not output_path.exists(): self.logger.warning(f"Output directory not found: {output_dir}") return outputs # Scan for JSON files matching skill output patterns json_files = list(output_path.rglob("*.json")) self.logger.info(f"Found {len(json_files)} JSON files in {output_dir}") for json_file in json_files: try: data = json.loads(json_file.read_text(encoding="utf-8")) # Attempt to identify which skill produced this output skill_output = self._identify_and_parse(data, str(json_file)) if skill_output is None: continue # Filter by domain if specified (supports subdomains) if domain: target_domain = self._extract_domain(domain) if skill_output.domain: file_domain = skill_output.domain # Match exact domain OR subdomains (e.g., blog.example.com matches example.com) if file_domain != target_domain and not file_domain.endswith("." + target_domain): continue # Filter by date range if date_from and skill_output.audit_date < date_from: continue if date_to and skill_output.audit_date > date_to: continue outputs.append(skill_output) self.logger.info( f"Parsed output from skill {skill_output.skill_id} " f"({skill_output.skill_name}): {json_file.name}" ) except (json.JSONDecodeError, KeyError, TypeError) as e: self.logger.warning(f"Could not parse {json_file}: {e}") self.logger.info(f"Successfully parsed {len(outputs)} skill outputs") return outputs def _identify_and_parse( self, data: dict[str, Any], source_file: str ) -> SkillOutput | None: """Identify which skill produced the output and parse it.""" skill_output = SkillOutput(source_file=source_file) # Strategy 1: Parse skill from audit_id prefix (e.g., KPI-20250115-001) audit_id = data.get("audit_id", "") if isinstance(audit_id, str): prefix_map = { "COMP": 11, "TECH": 12, "PAGE": 13, "CWV": 14, "GSC": 15, "SCHEMA": 16, "LOCAL": 18, "KW": 19, "SERP": 20, "RANK": 21, "LINK": 22, "CONTENT": 23, "ECOM": 24, "KPI": 25, "INTL": 26, "AI": 27, "KG": 28, "COMPET": 31, "CRAWL": 32, "MIGR": 33, "DASH": None, # Skip self-referencing dashboard reports } for prefix, skill_id in prefix_map.items(): if audit_id.startswith(prefix): if skill_id is None: return None # Skip aggregated reports skill_info = SKILL_REGISTRY.get(skill_id, {}) skill_output.skill_id = skill_id skill_output.skill_name = skill_info.get("name", "unknown") skill_output.category = skill_info.get("category", "unknown") break # Strategy 2: Fallback to audit_type field (used by our-seo-agent outputs) if not skill_output.skill_id: audit_type = data.get("audit_type", "") if isinstance(audit_type, str) and audit_type: type_map = { "comprehensive": 11, "technical": 12, "onpage": 13, "cwv": 14, "core-web-vitals": 14, "gsc": 15, "search-console": 15, "schema": 16, "local": 18, "keyword": 19, "serp": 20, "position": 21, "link": 22, "backlink": 22, "content": 23, "ecommerce": 24, "kpi": 25, "international": 26, "hreflang": 26, "ai-visibility": 27, "knowledge-graph": 28, "entity": 28, "competitor": 31, "crawl-budget": 32, "crawl": 32, "migration": 33, } for type_key, skill_id in type_map.items(): if audit_type.lower() == type_key: skill_info = SKILL_REGISTRY.get(skill_id, {}) skill_output.skill_id = skill_id skill_output.skill_name = skill_info.get("name", "unknown") skill_output.category = skill_info.get("category", "unknown") break # Extract domain for key in ("url", "target", "domain", "site"): if key in data: skill_output.domain = self._extract_domain(str(data[key])) break # Extract health score — check top-level first, then nested data dict score_found = False for key in ("health_score", "overall_health", "overall_score", "score", "technical_score", "efficiency_score", "onpage_score"): if key in data: try: skill_output.health_score = float(data[key]) score_found = True break except (ValueError, TypeError): continue if not score_found: nested = data.get("data", {}) if isinstance(nested, dict): for key in ("technical_score", "onpage_score", "schema_score", "local_seo_score", "cwv_score", "performance_score", "content_score", "link_score", "keyword_score", "competitor_score", "efficiency_score", "health_score", "overall_score", "score"): val = nested.get(key) if val is not None: try: skill_output.health_score = float(val) break except (ValueError, TypeError): continue # Extract audit date for key in ("audit_date", "report_date", "timestamp", "found_date"): if key in data: date_str = str(data[key])[:10] skill_output.audit_date = date_str break if not skill_output.audit_date: skill_output.audit_date = date.today().isoformat() # Extract issues issues_raw = data.get("issues", data.get("critical_issues", [])) if isinstance(issues_raw, list): for issue in issues_raw: if isinstance(issue, dict): skill_output.issues.append(issue) elif isinstance(issue, str): skill_output.issues.append({"description": issue, "severity": "medium"}) # Extract wins / recommendations wins_raw = data.get("wins", data.get("top_wins", [])) if isinstance(wins_raw, list): for win in wins_raw: if isinstance(win, dict): skill_output.wins.append(win) elif isinstance(win, str): skill_output.wins.append({"description": win}) # Store full data skill_output.data = data # Skip if no useful data was extracted if not skill_output.skill_id and not skill_output.domain: return None return skill_output async def query_notion_audits( self, domain: str, date_from: str | None = None, date_to: str | None = None, ) -> list[SkillOutput]: """Fetch past audit entries from Notion SEO Audit Log database. In production, this uses the Notion MCP tools to query the database. Returns normalized SkillOutput objects. """ outputs: list[SkillOutput] = [] self.logger.info( f"Querying Notion audits for {domain} " f"(db: {self.NOTION_DB_ID}, from={date_from}, to={date_to})" ) # In production, this would call: # mcp__notion__query-database with filters for Site URL and Found Date # For now, return empty list as placeholder self.logger.info( "Notion query is a placeholder; use MCP tools in Claude Desktop " "or manually provide JSON files via --output-dir." ) return outputs def normalize_output(self, skill_output: SkillOutput) -> dict[str, Any]: """Normalize a skill output into a unified format.""" return { "skill_id": skill_output.skill_id, "skill_name": skill_output.skill_name, "domain": skill_output.domain, "audit_date": skill_output.audit_date, "category": skill_output.category, "health_score": skill_output.health_score, "issues_count": len(skill_output.issues), "wins_count": len(skill_output.wins), "issues": skill_output.issues[:10], "wins": skill_output.wins[:10], } def compute_cross_skill_health( self, outputs: list[SkillOutput] ) -> tuple[float, dict[str, float]]: """Compute weighted overall health score across all skills. Returns (overall_score, category_scores_dict). """ category_scores: dict[str, list[float]] = {} for output in outputs: cat = output.category if cat and output.health_score > 0: category_scores.setdefault(cat, []).append(output.health_score) # Average scores per category avg_category: dict[str, float] = {} for cat, scores in category_scores.items(): avg_category[cat] = round(sum(scores) / len(scores), 1) # Weighted overall score total_weight = 0.0 weighted_sum = 0.0 for cat, avg_score in avg_category.items(): weight = CATEGORY_WEIGHTS.get(cat, 0.05) weighted_sum += avg_score * weight total_weight += weight overall = round(weighted_sum / total_weight, 1) if total_weight > 0 else 0.0 return overall, avg_category def identify_priorities( self, outputs: list[SkillOutput] ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: """Identify top issues and wins across all skill outputs. Returns (top_issues, top_wins). """ all_issues: list[dict[str, Any]] = [] all_wins: list[dict[str, Any]] = [] severity_order = {"critical": 0, "high": 1, "medium": 2, "low": 3} for output in outputs: for issue in output.issues: enriched = { **issue, "source_skill": output.skill_name, "source_skill_id": output.skill_id, "category": output.category, } all_issues.append(enriched) for win in output.wins: enriched = { **win, "source_skill": output.skill_name, "source_skill_id": output.skill_id, "category": output.category, } all_wins.append(enriched) # Sort issues by severity all_issues.sort( key=lambda i: severity_order.get( i.get("severity", "medium"), 2 ) ) return all_issues[:20], all_wins[:20] def build_timeline(self, outputs: list[SkillOutput]) -> list[dict[str, Any]]: """Build an audit history timeline from all skill outputs.""" timeline: list[dict[str, Any]] = [] for output in outputs: entry = { "date": output.audit_date, "skill": output.skill_name, "skill_id": output.skill_id, "health_score": output.health_score, "category": output.category, "issues_count": len(output.issues), } timeline.append(entry) # Sort by date descending timeline.sort(key=lambda e: e.get("date", ""), reverse=True) return timeline async def run( self, domain: str, output_dir: str | None = None, date_from: str | None = None, date_to: str | None = None, ) -> AggregatedReport: """Orchestrate the full report aggregation pipeline.""" target_domain = self._extract_domain(domain) report = AggregatedReport( domain=target_domain, report_date=date.today().isoformat(), audit_id=self._generate_audit_id(), timestamp=datetime.now().isoformat(), ) all_outputs: list[SkillOutput] = [] # Step 1: Scan local outputs if output_dir: self.logger.info(f"Step 1/5: Scanning local outputs in {output_dir}...") local_outputs = self.scan_local_outputs( output_dir, domain=target_domain, date_from=date_from, date_to=date_to, ) all_outputs.extend(local_outputs) else: self.logger.info("Step 1/5: No output directory specified, skipping local scan.") # Step 2: Query Notion for past audits self.logger.info("Step 2/5: Querying Notion for past audits...") try: notion_outputs = await self.query_notion_audits( domain=target_domain, date_from=date_from, date_to=date_to, ) all_outputs.extend(notion_outputs) except Exception as e: msg = f"Notion query error: {e}" self.logger.error(msg) report.errors.append(msg) if not all_outputs: self.logger.warning( "No skill outputs found. Provide --output-dir with JSON files " "from SEO skills 11-33, or ensure Notion audit log has entries." ) report.errors.append("No skill outputs found to aggregate.") return report # Step 3: Normalize and compute health scores self.logger.info( f"Step 3/5: Normalizing {len(all_outputs)} skill outputs..." ) report.skills_included = [ { "skill_id": o.skill_id, "skill_name": o.skill_name, "audit_date": o.audit_date, } for o in all_outputs ] report.raw_outputs = [self.normalize_output(o) for o in all_outputs] overall_health, category_scores = self.compute_cross_skill_health(all_outputs) report.overall_health = overall_health report.category_scores = category_scores # Determine health trend from timeline scores_by_date = sorted( [(o.audit_date, o.health_score) for o in all_outputs if o.health_score > 0], key=lambda x: x[0], ) if len(scores_by_date) >= 2: older_avg = sum(s for _, s in scores_by_date[:len(scores_by_date)//2]) / max(len(scores_by_date)//2, 1) newer_avg = sum(s for _, s in scores_by_date[len(scores_by_date)//2:]) / max(len(scores_by_date) - len(scores_by_date)//2, 1) if newer_avg > older_avg + 3: report.health_trend = "improving" elif newer_avg < older_avg - 3: report.health_trend = "declining" else: report.health_trend = "stable" # Step 4: Identify priorities self.logger.info("Step 4/5: Identifying top issues and wins...") top_issues, top_wins = self.identify_priorities(all_outputs) report.top_issues = top_issues report.top_wins = top_wins # Step 5: Build timeline self.logger.info("Step 5/5: Building audit history timeline...") report.timeline = self.build_timeline(all_outputs) self.logger.info( f"Aggregation complete: {len(all_outputs)} skills, " f"health={report.overall_health}/100, " f"trend={report.health_trend}, " f"issues={len(report.top_issues)}, wins={len(report.top_wins)}" ) return report # --------------------------------------------------------------------------- # Output formatting # --------------------------------------------------------------------------- def _format_text_report(report: AggregatedReport) -> str: """Format aggregated report as human-readable text.""" lines: list[str] = [] lines.append("=" * 70) lines.append(" SEO REPORTING DASHBOARD - AGGREGATED REPORT") lines.append(f" Domain: {report.domain}") lines.append(f" Report Date: {report.report_date}") lines.append(f" Audit ID: {report.audit_id}") lines.append("=" * 70) # Health score lines.append("") lines.append(f" Overall Health: {report.overall_health}/100 ({report.health_trend})") lines.append("-" * 50) # Category scores if report.category_scores: lines.append("") lines.append("--- CATEGORY SCORES ---") for cat, score in sorted( report.category_scores.items(), key=lambda x: x[1], reverse=True ): bar = "#" * int(score / 5) + "." * (20 - int(score / 5)) lines.append(f" {cat:<20} [{bar}] {score:.1f}/100") # Skills included if report.skills_included: lines.append("") lines.append("--- SKILLS INCLUDED ---") for skill in report.skills_included: lines.append( f" [{skill['skill_id']:>2}] {skill['skill_name']:<30} " f"({skill['audit_date']})" ) # Top issues if report.top_issues: lines.append("") lines.append("--- TOP ISSUES ---") for i, issue in enumerate(report.top_issues[:10], 1): severity = issue.get("severity", "medium").upper() desc = issue.get("description", "No description") cat = issue.get("category", "") lines.append(f" {i:>2}. [{severity}] ({cat}) {desc}") # Top wins if report.top_wins: lines.append("") lines.append("--- TOP WINS ---") for i, win in enumerate(report.top_wins[:10], 1): desc = win.get("description", "No description") cat = win.get("category", "") lines.append(f" {i:>2}. ({cat}) {desc}") # Timeline if report.timeline: lines.append("") lines.append("--- AUDIT TIMELINE ---") lines.append(f" {'Date':<12} {'Skill':<25} {'Score':>8} {'Issues':>8}") lines.append(" " + "-" * 55) for entry in report.timeline[:15]: lines.append( f" {entry['date']:<12} {entry['skill']:<25} " f"{entry['health_score']:>7.1f} {entry['issues_count']:>7}" ) # Errors if report.errors: lines.append("") lines.append("--- ERRORS ---") for err in report.errors: lines.append(f" - {err}") lines.append("") lines.append("=" * 70) return "\n".join(lines) def _serialize_report(report: AggregatedReport) -> dict: """Convert report to JSON-serializable dict.""" return { "domain": report.domain, "report_date": report.report_date, "overall_health": report.overall_health, "health_trend": report.health_trend, "skills_included": report.skills_included, "category_scores": report.category_scores, "top_issues": report.top_issues, "top_wins": report.top_wins, "timeline": report.timeline, "raw_outputs": report.raw_outputs, "audit_id": report.audit_id, "timestamp": report.timestamp, "errors": report.errors if report.errors else None, } # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- def parse_args(argv: list[str] | None = None) -> argparse.Namespace: parser = argparse.ArgumentParser( description="SEO Report Aggregator - Collect and normalize outputs from all SEO skills", formatter_class=argparse.RawDescriptionHelpFormatter, epilog="""\ Examples: python report_aggregator.py --domain https://example.com --json python report_aggregator.py --domain https://example.com --output-dir ./audit_outputs --json python report_aggregator.py --domain https://example.com --from 2025-01-01 --to 2025-03-31 --json """, ) parser.add_argument( "--domain", required=True, help="Target domain to aggregate reports for", ) parser.add_argument( "--output-dir", type=str, default=None, help="Directory containing JSON outputs from SEO skills", ) parser.add_argument( "--from", type=str, default=None, dest="date_from", help="Start date for filtering (YYYY-MM-DD)", ) parser.add_argument( "--to", type=str, default=None, dest="date_to", help="End date for filtering (YYYY-MM-DD)", ) parser.add_argument( "--json", action="store_true", default=False, help="Output in JSON format", ) parser.add_argument( "--output", type=str, default=None, help="Save output to file path", ) return parser.parse_args(argv) async def async_main(args: argparse.Namespace) -> None: aggregator = ReportAggregator() report = await aggregator.run( domain=args.domain, output_dir=args.output_dir, date_from=args.date_from, date_to=args.date_to, ) if args.json: output_str = json.dumps( _serialize_report(report), indent=2, ensure_ascii=False ) else: output_str = _format_text_report(report) if args.output: Path(args.output).write_text(output_str, encoding="utf-8") logger.info(f"Report saved to {args.output}") else: print(output_str) aggregator.print_stats() def main() -> None: args = parse_args() asyncio.run(async_main(args)) if __name__ == "__main__": main()