Add SEO skills 33-34 and fix bugs in skills 19-34

New skills:
- Skill 33: Site migration planner with redirect mapping and monitoring
- Skill 34: Reporting dashboard with HTML charts and Korean executive reports

Bug fixes (Skill 34 - report_aggregator.py; example input below):
- Add audit_type fallback for skill identification (was only using audit_id prefix)
- Extract health scores from nested data dict (technical_score, onpage_score, etc.)
- Support subdomain matching in domain filter (blog.ourdigital.org matches ourdigital.org)
- Skip self-referencing DASH- aggregated reports
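
A minimal hypothetical output that exercises the first three fixes (field names
match report_aggregator.py; the values are invented):

    sample = {
        "audit_id": "run-20260213-007",        # no known prefix, so prefix matching fails
        "audit_type": "technical",             # fallback now resolves this to skill 12
        "url": "https://blog.ourdigital.org",  # subdomain passes the ourdigital.org filter
        "data": {"technical_score": 78.5},     # score is now read from the nested dict
    }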

Bug fixes (Skill 20 - naver_serp_analyzer.py; detector sketch below):
- Remove VIEW tab selectors (removed by Naver in 2026)
- Add new section detectors: books (도서), shortform (숏폼), influencer (인플루언서)
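
A sketch of the new detectors (the dict shape and selector strings are
illustrative placeholders, not Naver's actual markup):

    SECTION_DETECTORS = {
        "books":      {"label": "도서",       "selector": "section.sp_book"},
        "shortform":  {"label": "숏폼",       "selector": "section.sp_shortform"},
        "influencer": {"label": "인플루언서", "selector": "section.sp_influencer"},
    }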

Improvements (Skill 34 - dashboard/executive report; mapping sketch below):
- Add Korean category labels for Chart.js charts (기술 SEO, 온페이지, etc.)
- Add Korean trend labels (개선 중 ↑, 안정 →, 하락 중 ↓)
- Add English→Korean issue description translation layer (20 common patterns)
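
Sketch of the three mapping layers (abridged; the names are assumed, and the
real translation table holds the full 20 patterns):

    CATEGORY_LABELS_KO = {"technical": "기술 SEO", "on_page": "온페이지"}
    TREND_LABELS_KO = {"improving": "개선 중 ↑", "stable": "안정 →", "declining": "하락 중 ↓"}
    ISSUE_PATTERNS_KO = [
        ("missing meta description", "메타 설명 누락"),
        ("broken internal link", "내부 링크 깨짐"),
    ]

    def translate_issue(desc: str) -> str:
        # First matching English pattern wins; unmatched text passes through unchanged.
        for pattern, ko in ISSUE_PATTERNS_KO:
            if pattern in desc.lower():
                return ko
        return desc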

Documentation improvements:
- Add Korean triggers to 4 skill descriptions (19, 25, 28, 31)
- Expand Skill 32 SKILL.md from 40→143 lines (quality score was 6/10; added workflow, output format, and limitations sections)
- Add output format examples to Skills 27 and 28 SKILL.md
- Add limitations sections to Skills 27 and 28
- Update README.md, CLAUDE.md, AGENTS.md for skills 33-34

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
commit d2d0a2d460
parent dbfaa883cd
2026-02-14 00:01:00 +09:00
37 changed files with 5462 additions and 56 deletions


@@ -0,0 +1,744 @@
"""
Report Aggregator - Collect and normalize outputs from all SEO skills
=====================================================================
Purpose: Scan for recent audit outputs from skills 11-33, normalize data
formats, merge findings by domain/date, compute cross-skill health
scores, and identify top-priority issues across all audits.
Python: 3.10+
Usage:
python report_aggregator.py --domain https://example.com --json
python report_aggregator.py --domain https://example.com --output-dir ./audit_outputs --json
python report_aggregator.py --domain https://example.com --from 2025-01-01 --to 2025-03-31 --json
python report_aggregator.py --domain https://example.com --json --output report.json
"""
import argparse
import asyncio
import json
import logging
from dataclasses import dataclass, field
from datetime import datetime, date
from pathlib import Path
from typing import Any
from urllib.parse import urlparse
from base_client import BaseAsyncClient
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Skill registry
# ---------------------------------------------------------------------------
SKILL_REGISTRY = {
11: {"name": "comprehensive-audit", "category": "comprehensive", "weight": 1.0},
12: {"name": "technical-audit", "category": "technical", "weight": 0.20},
13: {"name": "on-page-audit", "category": "on_page", "weight": 0.20},
14: {"name": "core-web-vitals", "category": "performance", "weight": 0.25},
15: {"name": "search-console", "category": "search_console", "weight": 0.10},
16: {"name": "schema-validator", "category": "schema", "weight": 0.15},
17: {"name": "schema-generator", "category": "schema", "weight": 0.10},
18: {"name": "local-audit", "category": "local", "weight": 0.10},
19: {"name": "keyword-strategy", "category": "keywords", "weight": 0.15},
20: {"name": "serp-analysis", "category": "keywords", "weight": 0.10},
21: {"name": "position-tracking", "category": "keywords", "weight": 0.15},
22: {"name": "link-building", "category": "links", "weight": 0.15},
23: {"name": "content-strategy", "category": "content", "weight": 0.15},
24: {"name": "ecommerce-seo", "category": "ecommerce", "weight": 0.10},
25: {"name": "kpi-framework", "category": "kpi", "weight": 0.20},
26: {"name": "international-seo", "category": "international", "weight": 0.10},
27: {"name": "ai-visibility", "category": "ai_search", "weight": 0.10},
28: {"name": "knowledge-graph", "category": "entity_seo", "weight": 0.10},
31: {"name": "competitor-intel", "category": "competitor", "weight": 0.15},
32: {"name": "crawl-budget", "category": "technical", "weight": 0.10},
    33: {"name": "site-migration", "category": "technical", "weight": 0.10},
}
CATEGORY_WEIGHTS = {
"technical": 0.20,
"on_page": 0.15,
"performance": 0.15,
"content": 0.10,
"links": 0.10,
"local": 0.05,
"keywords": 0.10,
"competitor": 0.05,
"schema": 0.05,
"kpi": 0.05,
}
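# Categories absent from this table (e.g. ai_search, entity_seo) fall back to a
# default 0.05 weight in compute_cross_skill_health, which also normalizes by the
# sum of weights actually present, so the values need not sum to 1.0.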
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass
class SkillOutput:
"""Normalized output from a single SEO skill."""
skill_id: int = 0
skill_name: str = ""
domain: str = ""
audit_date: str = ""
category: str = ""
data: dict[str, Any] = field(default_factory=dict)
health_score: float = 0.0
issues: list[dict[str, Any]] = field(default_factory=list)
wins: list[dict[str, Any]] = field(default_factory=list)
source_file: str = ""
@dataclass
class AggregatedReport:
"""Full aggregated report from all SEO skill outputs."""
domain: str = ""
report_date: str = ""
skills_included: list[dict[str, Any]] = field(default_factory=list)
overall_health: float = 0.0
health_trend: str = "stable"
category_scores: dict[str, float] = field(default_factory=dict)
top_issues: list[dict[str, Any]] = field(default_factory=list)
top_wins: list[dict[str, Any]] = field(default_factory=list)
timeline: list[dict[str, Any]] = field(default_factory=list)
raw_outputs: list[dict[str, Any]] = field(default_factory=list)
audit_id: str = ""
timestamp: str = ""
errors: list[str] = field(default_factory=list)
# ---------------------------------------------------------------------------
# Aggregator
# ---------------------------------------------------------------------------
class ReportAggregator(BaseAsyncClient):
"""Aggregate outputs from all SEO skills into unified reports."""
NOTION_DB_ID = "2c8581e5-8a1e-8035-880b-e38cefc2f3ef"
def __init__(self):
super().__init__(max_concurrent=5, requests_per_second=2.0)
@staticmethod
def _extract_domain(url: str) -> str:
"""Extract bare domain from URL or return as-is if already bare."""
if "://" in url:
parsed = urlparse(url)
return parsed.netloc.lower().replace("www.", "")
return url.lower().replace("www.", "")
@staticmethod
def _generate_audit_id() -> str:
"""Generate audit ID in DASH-YYYYMMDD-NNN format."""
now = datetime.now()
return f"DASH-{now.strftime('%Y%m%d')}-001"
def scan_local_outputs(
self,
output_dir: str,
domain: str | None = None,
date_from: str | None = None,
date_to: str | None = None,
) -> list[SkillOutput]:
"""Find JSON output files from other SEO skills in a directory.
Scans for files matching patterns from skills 11-33 and parses
them into normalized SkillOutput objects.
"""
outputs: list[SkillOutput] = []
output_path = Path(output_dir)
if not output_path.exists():
self.logger.warning(f"Output directory not found: {output_dir}")
return outputs
# Scan for JSON files matching skill output patterns
json_files = list(output_path.rglob("*.json"))
self.logger.info(f"Found {len(json_files)} JSON files in {output_dir}")
for json_file in json_files:
try:
data = json.loads(json_file.read_text(encoding="utf-8"))
# Attempt to identify which skill produced this output
skill_output = self._identify_and_parse(data, str(json_file))
if skill_output is None:
continue
# Filter by domain if specified (supports subdomains)
if domain:
target_domain = self._extract_domain(domain)
if skill_output.domain:
file_domain = skill_output.domain
# Match exact domain OR subdomains (e.g., blog.example.com matches example.com)
if file_domain != target_domain and not file_domain.endswith("." + target_domain):
continue
# Filter by date range
if date_from and skill_output.audit_date < date_from:
continue
if date_to and skill_output.audit_date > date_to:
continue
outputs.append(skill_output)
self.logger.info(
f"Parsed output from skill {skill_output.skill_id} "
f"({skill_output.skill_name}): {json_file.name}"
)
except (json.JSONDecodeError, KeyError, TypeError) as e:
self.logger.warning(f"Could not parse {json_file}: {e}")
self.logger.info(f"Successfully parsed {len(outputs)} skill outputs")
return outputs
def _identify_and_parse(
self, data: dict[str, Any], source_file: str
) -> SkillOutput | None:
"""Identify which skill produced the output and parse it."""
skill_output = SkillOutput(source_file=source_file)
# Strategy 1: Parse skill from audit_id prefix (e.g., KPI-20250115-001)
audit_id = data.get("audit_id", "")
if isinstance(audit_id, str):
prefix_map = {
"COMP": 11, "TECH": 12, "PAGE": 13, "CWV": 14,
"GSC": 15, "SCHEMA": 16, "LOCAL": 18, "KW": 19,
"SERP": 20, "RANK": 21, "LINK": 22, "CONTENT": 23,
"ECOM": 24, "KPI": 25, "INTL": 26, "AI": 27,
"KG": 28, "COMPET": 31, "CRAWL": 32, "MIGR": 33,
"DASH": None, # Skip self-referencing dashboard reports
}
            # Check longer prefixes first so COMPET- is not mis-matched as COMP-
            for prefix, skill_id in sorted(prefix_map.items(), key=lambda kv: len(kv[0]), reverse=True):
if audit_id.startswith(prefix):
if skill_id is None:
return None # Skip aggregated reports
skill_info = SKILL_REGISTRY.get(skill_id, {})
skill_output.skill_id = skill_id
skill_output.skill_name = skill_info.get("name", "unknown")
skill_output.category = skill_info.get("category", "unknown")
break
# Strategy 2: Fallback to audit_type field (used by our-seo-agent outputs)
if not skill_output.skill_id:
audit_type = data.get("audit_type", "")
if isinstance(audit_type, str) and audit_type:
type_map = {
"comprehensive": 11, "technical": 12, "onpage": 13,
"cwv": 14, "core-web-vitals": 14,
"gsc": 15, "search-console": 15,
"schema": 16, "local": 18,
"keyword": 19, "serp": 20, "position": 21,
"link": 22, "backlink": 22,
"content": 23, "ecommerce": 24, "kpi": 25,
"international": 26, "hreflang": 26,
"ai-visibility": 27, "knowledge-graph": 28, "entity": 28,
"competitor": 31, "crawl-budget": 32, "crawl": 32,
"migration": 33,
}
for type_key, skill_id in type_map.items():
if audit_type.lower() == type_key:
skill_info = SKILL_REGISTRY.get(skill_id, {})
skill_output.skill_id = skill_id
skill_output.skill_name = skill_info.get("name", "unknown")
skill_output.category = skill_info.get("category", "unknown")
break
# Extract domain
for key in ("url", "target", "domain", "site"):
if key in data:
skill_output.domain = self._extract_domain(str(data[key]))
break
# Extract health score — check top-level first, then nested data dict
score_found = False
for key in ("health_score", "overall_health", "score"):
if key in data:
try:
skill_output.health_score = float(data[key])
score_found = True
except (ValueError, TypeError):
pass
break
if not score_found:
nested = data.get("data", {})
if isinstance(nested, dict):
for key in ("technical_score", "onpage_score", "schema_score",
"local_seo_score", "cwv_score", "performance_score",
"content_score", "link_score", "keyword_score",
"competitor_score", "efficiency_score",
"health_score", "overall_score", "score"):
                    val = nested.get(key)
                    if val is not None:
                        try:
                            skill_output.health_score = float(val)
                            break
                        except (ValueError, TypeError):
                            continue  # non-numeric value, try the next score key
# Extract audit date
for key in ("audit_date", "report_date", "timestamp", "found_date"):
if key in data:
date_str = str(data[key])[:10]
skill_output.audit_date = date_str
break
if not skill_output.audit_date:
skill_output.audit_date = date.today().isoformat()
# Extract issues
issues_raw = data.get("issues", data.get("critical_issues", []))
if isinstance(issues_raw, list):
for issue in issues_raw:
if isinstance(issue, dict):
skill_output.issues.append(issue)
elif isinstance(issue, str):
skill_output.issues.append({"description": issue, "severity": "medium"})
        # Extract wins (accepts "wins" or "top_wins")
wins_raw = data.get("wins", data.get("top_wins", []))
if isinstance(wins_raw, list):
for win in wins_raw:
if isinstance(win, dict):
skill_output.wins.append(win)
elif isinstance(win, str):
skill_output.wins.append({"description": win})
# Store full data
skill_output.data = data
# Skip if no useful data was extracted
if not skill_output.skill_id and not skill_output.domain:
return None
return skill_output
async def query_notion_audits(
self,
domain: str,
date_from: str | None = None,
date_to: str | None = None,
) -> list[SkillOutput]:
"""Fetch past audit entries from Notion SEO Audit Log database.
In production, this uses the Notion MCP tools to query the database.
Returns normalized SkillOutput objects.
"""
outputs: list[SkillOutput] = []
self.logger.info(
f"Querying Notion audits for {domain} "
f"(db: {self.NOTION_DB_ID}, from={date_from}, to={date_to})"
)
# In production, this would call:
# mcp__notion__query-database with filters for Site URL and Found Date
# For now, return empty list as placeholder
self.logger.info(
"Notion query is a placeholder; use MCP tools in Claude Desktop "
"or manually provide JSON files via --output-dir."
)
return outputs
def normalize_output(self, skill_output: SkillOutput) -> dict[str, Any]:
"""Normalize a skill output into a unified format."""
return {
"skill_id": skill_output.skill_id,
"skill_name": skill_output.skill_name,
"domain": skill_output.domain,
"audit_date": skill_output.audit_date,
"category": skill_output.category,
"health_score": skill_output.health_score,
"issues_count": len(skill_output.issues),
"wins_count": len(skill_output.wins),
"issues": skill_output.issues[:10],
"wins": skill_output.wins[:10],
}
def compute_cross_skill_health(
self, outputs: list[SkillOutput]
) -> tuple[float, dict[str, float]]:
"""Compute weighted overall health score across all skills.
Returns (overall_score, category_scores_dict).
"""
category_scores: dict[str, list[float]] = {}
for output in outputs:
cat = output.category
if cat and output.health_score > 0:
category_scores.setdefault(cat, []).append(output.health_score)
# Average scores per category
avg_category: dict[str, float] = {}
for cat, scores in category_scores.items():
avg_category[cat] = round(sum(scores) / len(scores), 1)
# Weighted overall score
total_weight = 0.0
weighted_sum = 0.0
for cat, avg_score in avg_category.items():
weight = CATEGORY_WEIGHTS.get(cat, 0.05)
weighted_sum += avg_score * weight
total_weight += weight
overall = round(weighted_sum / total_weight, 1) if total_weight > 0 else 0.0
return overall, avg_category
def identify_priorities(
self, outputs: list[SkillOutput]
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
"""Identify top issues and wins across all skill outputs.
Returns (top_issues, top_wins).
"""
all_issues: list[dict[str, Any]] = []
all_wins: list[dict[str, Any]] = []
severity_order = {"critical": 0, "high": 1, "medium": 2, "low": 3}
for output in outputs:
for issue in output.issues:
enriched = {
**issue,
"source_skill": output.skill_name,
"source_skill_id": output.skill_id,
"category": output.category,
}
all_issues.append(enriched)
for win in output.wins:
enriched = {
**win,
"source_skill": output.skill_name,
"source_skill_id": output.skill_id,
"category": output.category,
}
all_wins.append(enriched)
# Sort issues by severity
all_issues.sort(
key=lambda i: severity_order.get(
i.get("severity", "medium"), 2
)
)
return all_issues[:20], all_wins[:20]
def build_timeline(self, outputs: list[SkillOutput]) -> list[dict[str, Any]]:
"""Build an audit history timeline from all skill outputs."""
timeline: list[dict[str, Any]] = []
for output in outputs:
entry = {
"date": output.audit_date,
"skill": output.skill_name,
"skill_id": output.skill_id,
"health_score": output.health_score,
"category": output.category,
"issues_count": len(output.issues),
}
timeline.append(entry)
# Sort by date descending
timeline.sort(key=lambda e: e.get("date", ""), reverse=True)
return timeline
async def run(
self,
domain: str,
output_dir: str | None = None,
date_from: str | None = None,
date_to: str | None = None,
) -> AggregatedReport:
"""Orchestrate the full report aggregation pipeline."""
target_domain = self._extract_domain(domain)
report = AggregatedReport(
domain=target_domain,
report_date=date.today().isoformat(),
audit_id=self._generate_audit_id(),
timestamp=datetime.now().isoformat(),
)
all_outputs: list[SkillOutput] = []
# Step 1: Scan local outputs
if output_dir:
self.logger.info(f"Step 1/5: Scanning local outputs in {output_dir}...")
local_outputs = self.scan_local_outputs(
output_dir, domain=target_domain,
date_from=date_from, date_to=date_to,
)
all_outputs.extend(local_outputs)
else:
self.logger.info("Step 1/5: No output directory specified, skipping local scan.")
# Step 2: Query Notion for past audits
self.logger.info("Step 2/5: Querying Notion for past audits...")
try:
notion_outputs = await self.query_notion_audits(
domain=target_domain,
date_from=date_from,
date_to=date_to,
)
all_outputs.extend(notion_outputs)
except Exception as e:
msg = f"Notion query error: {e}"
self.logger.error(msg)
report.errors.append(msg)
if not all_outputs:
self.logger.warning(
"No skill outputs found. Provide --output-dir with JSON files "
"from SEO skills 11-33, or ensure Notion audit log has entries."
)
report.errors.append("No skill outputs found to aggregate.")
return report
# Step 3: Normalize and compute health scores
self.logger.info(
f"Step 3/5: Normalizing {len(all_outputs)} skill outputs..."
)
report.skills_included = [
{
"skill_id": o.skill_id,
"skill_name": o.skill_name,
"audit_date": o.audit_date,
}
for o in all_outputs
]
report.raw_outputs = [self.normalize_output(o) for o in all_outputs]
overall_health, category_scores = self.compute_cross_skill_health(all_outputs)
report.overall_health = overall_health
report.category_scores = category_scores
# Determine health trend from timeline
scores_by_date = sorted(
[(o.audit_date, o.health_score) for o in all_outputs if o.health_score > 0],
key=lambda x: x[0],
)
if len(scores_by_date) >= 2:
            half = len(scores_by_date) // 2
            older_avg = sum(s for _, s in scores_by_date[:half]) / max(half, 1)
            newer_avg = sum(s for _, s in scores_by_date[half:]) / max(len(scores_by_date) - half, 1)
if newer_avg > older_avg + 3:
report.health_trend = "improving"
elif newer_avg < older_avg - 3:
report.health_trend = "declining"
else:
report.health_trend = "stable"
# Step 4: Identify priorities
self.logger.info("Step 4/5: Identifying top issues and wins...")
top_issues, top_wins = self.identify_priorities(all_outputs)
report.top_issues = top_issues
report.top_wins = top_wins
# Step 5: Build timeline
self.logger.info("Step 5/5: Building audit history timeline...")
report.timeline = self.build_timeline(all_outputs)
self.logger.info(
f"Aggregation complete: {len(all_outputs)} skills, "
f"health={report.overall_health}/100, "
f"trend={report.health_trend}, "
f"issues={len(report.top_issues)}, wins={len(report.top_wins)}"
)
return report
# ---------------------------------------------------------------------------
# Output formatting
# ---------------------------------------------------------------------------
def _format_text_report(report: AggregatedReport) -> str:
"""Format aggregated report as human-readable text."""
lines: list[str] = []
lines.append("=" * 70)
lines.append(" SEO REPORTING DASHBOARD - AGGREGATED REPORT")
lines.append(f" Domain: {report.domain}")
lines.append(f" Report Date: {report.report_date}")
lines.append(f" Audit ID: {report.audit_id}")
lines.append("=" * 70)
# Health score
lines.append("")
lines.append(f" Overall Health: {report.overall_health}/100 ({report.health_trend})")
lines.append("-" * 50)
# Category scores
if report.category_scores:
lines.append("")
lines.append("--- CATEGORY SCORES ---")
for cat, score in sorted(
report.category_scores.items(), key=lambda x: x[1], reverse=True
):
bar = "#" * int(score / 5) + "." * (20 - int(score / 5))
lines.append(f" {cat:<20} [{bar}] {score:.1f}/100")
# Skills included
if report.skills_included:
lines.append("")
lines.append("--- SKILLS INCLUDED ---")
for skill in report.skills_included:
lines.append(
f" [{skill['skill_id']:>2}] {skill['skill_name']:<30} "
f"({skill['audit_date']})"
)
# Top issues
if report.top_issues:
lines.append("")
lines.append("--- TOP ISSUES ---")
for i, issue in enumerate(report.top_issues[:10], 1):
severity = issue.get("severity", "medium").upper()
desc = issue.get("description", "No description")
cat = issue.get("category", "")
lines.append(f" {i:>2}. [{severity}] ({cat}) {desc}")
# Top wins
if report.top_wins:
lines.append("")
lines.append("--- TOP WINS ---")
for i, win in enumerate(report.top_wins[:10], 1):
desc = win.get("description", "No description")
cat = win.get("category", "")
lines.append(f" {i:>2}. ({cat}) {desc}")
# Timeline
if report.timeline:
lines.append("")
lines.append("--- AUDIT TIMELINE ---")
lines.append(f" {'Date':<12} {'Skill':<25} {'Score':>8} {'Issues':>8}")
lines.append(" " + "-" * 55)
for entry in report.timeline[:15]:
lines.append(
f" {entry['date']:<12} {entry['skill']:<25} "
f"{entry['health_score']:>7.1f} {entry['issues_count']:>7}"
)
# Errors
if report.errors:
lines.append("")
lines.append("--- ERRORS ---")
for err in report.errors:
lines.append(f" - {err}")
lines.append("")
lines.append("=" * 70)
return "\n".join(lines)
def _serialize_report(report: AggregatedReport) -> dict:
"""Convert report to JSON-serializable dict."""
return {
"domain": report.domain,
"report_date": report.report_date,
"overall_health": report.overall_health,
"health_trend": report.health_trend,
"skills_included": report.skills_included,
"category_scores": report.category_scores,
"top_issues": report.top_issues,
"top_wins": report.top_wins,
"timeline": report.timeline,
"raw_outputs": report.raw_outputs,
"audit_id": report.audit_id,
"timestamp": report.timestamp,
"errors": report.errors if report.errors else None,
}
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="SEO Report Aggregator - Collect and normalize outputs from all SEO skills",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""\
Examples:
python report_aggregator.py --domain https://example.com --json
python report_aggregator.py --domain https://example.com --output-dir ./audit_outputs --json
python report_aggregator.py --domain https://example.com --from 2025-01-01 --to 2025-03-31 --json
""",
)
parser.add_argument(
"--domain",
required=True,
help="Target domain to aggregate reports for",
)
parser.add_argument(
"--output-dir",
type=str,
default=None,
help="Directory containing JSON outputs from SEO skills",
)
parser.add_argument(
"--from",
type=str,
default=None,
dest="date_from",
help="Start date for filtering (YYYY-MM-DD)",
)
parser.add_argument(
"--to",
type=str,
default=None,
dest="date_to",
help="End date for filtering (YYYY-MM-DD)",
)
parser.add_argument(
"--json",
action="store_true",
default=False,
help="Output in JSON format",
)
parser.add_argument(
"--output",
type=str,
default=None,
help="Save output to file path",
)
return parser.parse_args(argv)
async def async_main(args: argparse.Namespace) -> None:
aggregator = ReportAggregator()
report = await aggregator.run(
domain=args.domain,
output_dir=args.output_dir,
date_from=args.date_from,
date_to=args.date_to,
)
if args.json:
output_str = json.dumps(
_serialize_report(report), indent=2, ensure_ascii=False
)
else:
output_str = _format_text_report(report)
if args.output:
Path(args.output).write_text(output_str, encoding="utf-8")
logger.info(f"Report saved to {args.output}")
else:
print(output_str)
aggregator.print_stats()
def main() -> None:
args = parse_args()
asyncio.run(async_main(args))
if __name__ == "__main__":
main()