Add SEO skills 19-28, 31-32 with full Python implementations

12 new skills: Keyword Strategy, SERP Analysis, Position Tracking,
Link Building, Content Strategy, E-Commerce SEO, KPI Framework,
International SEO, AI Visibility, Knowledge Graph, Competitor Intel,
and Crawl Budget. ~20K lines of Python across 25 domain scripts.
Updated skill 11 pipeline table and repo CLAUDE.md.
Enhanced skill 18 local SEO workflow from jamie.clinic audit.

Note: Skill 26 hreflang_validator.py pending (content filter block).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-13 12:05:59 +09:00
parent 159f7ec3f7
commit a3ff965b87
125 changed files with 25948 additions and 173 deletions

View File

@@ -0,0 +1,716 @@
"""
Content Auditor - SEO Content Inventory & Performance Analysis
==============================================================
Purpose: Build content inventory, score performance, detect decay,
classify content types, and analyze Korean content patterns.
Python: 3.10+
"""
import argparse
import asyncio
import json
import logging
import re
import sys
from dataclasses import asdict, dataclass, field
from datetime import datetime, timedelta
from typing import Any
from urllib.parse import urlparse
import aiohttp
import requests
from bs4 import BeautifulSoup
from base_client import BaseAsyncClient, config
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass
class ContentPage:
"""Single content page with performance metrics."""
url: str
title: str = ""
content_type: str = "other"
word_count: int = 0
traffic: int = 0
keywords_count: int = 0
backlinks: int = 0
performance_score: float = 0.0
last_modified: str = ""
is_decaying: bool = False
decay_rate: float = 0.0
korean_pattern: str = ""
topics: list[str] = field(default_factory=list)
@dataclass
class ContentInventory:
"""Aggregated content inventory summary."""
total_pages: int = 0
by_type: dict[str, int] = field(default_factory=dict)
avg_performance_score: float = 0.0
avg_word_count: float = 0.0
pages: list[ContentPage] = field(default_factory=list)
freshness_distribution: dict[str, int] = field(default_factory=dict)
@dataclass
class ContentAuditResult:
"""Full content audit result."""
url: str
timestamp: str = ""
content_inventory: ContentInventory = field(default_factory=ContentInventory)
top_performers: list[ContentPage] = field(default_factory=list)
decaying_content: list[ContentPage] = field(default_factory=list)
korean_content_analysis: dict[str, Any] = field(default_factory=dict)
recommendations: list[str] = field(default_factory=list)
errors: list[str] = field(default_factory=list)
# ---------------------------------------------------------------------------
# URL pattern rules for content type classification
# ---------------------------------------------------------------------------
CONTENT_TYPE_PATTERNS = {
"blog": [
r"/blog/", r"/post/", r"/posts/", r"/article/", r"/articles/",
r"/news/", r"/magazine/", r"/stories/", r"/insights/",
r"/블로그/", r"/소식/", r"/뉴스/",
],
"product": [
r"/product/", r"/products/", r"/shop/", r"/store/",
r"/item/", r"/goods/", r"/catalog/",
r"/제품/", r"/상품/", r"/쇼핑/",
],
"service": [
r"/service/", r"/services/", r"/solutions/", r"/offering/",
r"/진료/", r"/서비스/", r"/시술/", r"/치료/",
],
"landing": [
r"/lp/", r"/landing/", r"/campaign/", r"/promo/",
r"/event/", r"/이벤트/", r"/프로모션/",
],
"resource": [
r"/resource/", r"/resources/", r"/guide/", r"/guides/",
r"/whitepaper/", r"/ebook/", r"/download/", r"/faq/",
r"/help/", r"/support/", r"/가이드/", r"/자료/",
],
}
KOREAN_CONTENT_PATTERNS = {
"naver_blog_style": [
r"후기", r"리뷰", r"체험", r"솔직후기", r"방문후기",
r"사용후기", r"이용후기",
],
"listicle": [
r"추천", r"베스트", r"TOP\s*\d+", r"\d+선", r"\d+가지",
r"모음", r"정리", r"비교",
],
"how_to": [
r"방법", r"하는\s*법", r"하는\s*방법", r"가이드",
r"따라하기", r"시작하기", r"알아보기",
],
"informational": [
r"이란", r"", r"의미", r"차이", r"비교",
r"장단점", r"효과", r"부작용", r"비용", r"가격",
],
}
# ---------------------------------------------------------------------------
# ContentAuditor
# ---------------------------------------------------------------------------
class ContentAuditor(BaseAsyncClient):
"""Content auditor using Ahrefs API and sitemap crawling."""
def __init__(self, max_concurrent: int = 5, requests_per_second: float = 2.0):
super().__init__(max_concurrent=max_concurrent, requests_per_second=requests_per_second)
self.session: aiohttp.ClientSession | None = None
async def _ensure_session(self) -> aiohttp.ClientSession:
if self.session is None or self.session.closed:
timeout = aiohttp.ClientTimeout(total=30)
self.session = aiohttp.ClientSession(timeout=timeout)
return self.session
async def close(self) -> None:
if self.session and not self.session.closed:
await self.session.close()
# ------------------------------------------------------------------
# Ahrefs data retrieval
# ------------------------------------------------------------------
async def get_top_pages(self, url: str, limit: int = 100) -> list[dict]:
"""
Retrieve top pages via Ahrefs site-explorer-top-pages.
Returns list of dicts with keys: url, traffic, keywords, value, top_keyword.
"""
self.logger.info(f"Fetching top pages from Ahrefs for {url}")
target = urlparse(url).netloc or url
try:
# Ahrefs MCP call: site-explorer-top-pages
# In MCP context this would be called by the agent.
# Standalone fallback: use REST API if AHREFS_API_KEY is set.
api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None
if not api_key:
self.logger.warning("AHREFS_API_KEY not set; returning empty top pages")
return []
resp = requests.get(
"https://api.ahrefs.com/v3/site-explorer/top-pages",
params={"target": target, "limit": limit, "select": "url,traffic,keywords,value,top_keyword"},
headers={"Authorization": f"Bearer {api_key}"},
timeout=30,
)
resp.raise_for_status()
data = resp.json()
pages = data.get("pages", data.get("items", []))
self.logger.info(f"Retrieved {len(pages)} top pages")
return pages
except Exception as exc:
self.logger.warning(f"Ahrefs top-pages lookup failed: {exc}")
return []
async def get_pages_by_traffic(self, url: str, limit: int = 100) -> list[dict]:
"""
Retrieve pages sorted by organic traffic via Ahrefs site-explorer-pages-by-traffic.
Returns list of dicts with keys: url, traffic, keywords, top_keyword.
"""
self.logger.info(f"Fetching pages-by-traffic from Ahrefs for {url}")
target = urlparse(url).netloc or url
try:
api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None
if not api_key:
self.logger.warning("AHREFS_API_KEY not set; returning empty traffic pages")
return []
resp = requests.get(
"https://api.ahrefs.com/v3/site-explorer/pages-by-traffic",
params={"target": target, "limit": limit, "select": "url,traffic,keywords,top_keyword"},
headers={"Authorization": f"Bearer {api_key}"},
timeout=30,
)
resp.raise_for_status()
data = resp.json()
pages = data.get("pages", data.get("items", []))
self.logger.info(f"Retrieved {len(pages)} pages by traffic")
return pages
except Exception as exc:
self.logger.warning(f"Ahrefs pages-by-traffic lookup failed: {exc}")
return []
# ------------------------------------------------------------------
# Sitemap crawling
# ------------------------------------------------------------------
async def crawl_sitemap(self, url: str) -> list[str]:
"""Discover URLs from sitemap.xml."""
sitemap_urls_to_try = [
f"{url.rstrip('/')}/sitemap.xml",
f"{url.rstrip('/')}/sitemap_index.xml",
f"{url.rstrip('/')}/post-sitemap.xml",
]
discovered: list[str] = []
session = await self._ensure_session()
for sitemap_url in sitemap_urls_to_try:
try:
async with session.get(sitemap_url) as resp:
if resp.status != 200:
continue
text = await resp.text()
soup = BeautifulSoup(text, "lxml-xml")
# Sitemap index
sitemaps = soup.find_all("sitemap")
if sitemaps:
for sm in sitemaps:
loc = sm.find("loc")
if loc:
child_urls = await self._parse_sitemap(session, loc.text.strip())
discovered.extend(child_urls)
else:
urls = soup.find_all("url")
for u in urls:
loc = u.find("loc")
if loc:
discovered.append(loc.text.strip())
if discovered:
self.logger.info(f"Discovered {len(discovered)} URLs from {sitemap_url}")
break
except Exception as exc:
self.logger.debug(f"Failed to fetch {sitemap_url}: {exc}")
return list(set(discovered))
async def _parse_sitemap(self, session: aiohttp.ClientSession, sitemap_url: str) -> list[str]:
"""Parse a single sitemap XML and return URLs."""
urls: list[str] = []
try:
async with session.get(sitemap_url) as resp:
if resp.status != 200:
return urls
text = await resp.text()
soup = BeautifulSoup(text, "lxml-xml")
for u in soup.find_all("url"):
loc = u.find("loc")
if loc:
urls.append(loc.text.strip())
except Exception as exc:
self.logger.debug(f"Failed to parse sitemap {sitemap_url}: {exc}")
return urls
# ------------------------------------------------------------------
# Content type classification
# ------------------------------------------------------------------
@staticmethod
def classify_content_type(url: str, title: str = "") -> str:
"""
Classify content type based on URL path patterns and title.
Returns one of: blog, product, service, landing, resource, other.
"""
combined = f"{url.lower()} {title.lower()}"
scores: dict[str, int] = {}
for ctype, patterns in CONTENT_TYPE_PATTERNS.items():
score = 0
for pattern in patterns:
if re.search(pattern, combined, re.IGNORECASE):
score += 1
if score > 0:
scores[ctype] = score
if not scores:
return "other"
return max(scores, key=scores.get)
# ------------------------------------------------------------------
# Performance scoring
# ------------------------------------------------------------------
@staticmethod
def score_performance(page: ContentPage) -> float:
"""
Compute composite performance score (0-100) from traffic, keywords, backlinks.
Weights:
- Traffic: 50% (log-scaled, 10k+ traffic = max)
- Keywords count: 30% (log-scaled, 500+ = max)
- Backlinks: 20% (log-scaled, 100+ = max)
"""
import math
traffic_score = min(100, (math.log10(max(page.traffic, 1)) / math.log10(10000)) * 100)
keywords_score = min(100, (math.log10(max(page.keywords_count, 1)) / math.log10(500)) * 100)
backlinks_score = min(100, (math.log10(max(page.backlinks, 1)) / math.log10(100)) * 100)
composite = (traffic_score * 0.50) + (keywords_score * 0.30) + (backlinks_score * 0.20)
return round(min(100, max(0, composite)), 1)
# ------------------------------------------------------------------
# Content decay detection
# ------------------------------------------------------------------
@staticmethod
def detect_decay(pages: list[ContentPage], threshold: float = -20.0) -> list[ContentPage]:
"""
Flag pages with declining traffic trend.
Uses a simple heuristic: pages with low performance score relative to
their keyword count indicate potential decay. In production, historical
traffic data from Ahrefs metrics-history would be used.
Args:
pages: List of content pages with metrics.
threshold: Decay rate threshold (percentage decline).
Returns:
List of pages flagged as decaying.
"""
decaying: list[ContentPage] = []
for page in pages:
# Heuristic: high keyword count but low traffic suggests decay
if page.keywords_count > 10 and page.traffic < 50:
page.is_decaying = True
page.decay_rate = -50.0 if page.traffic == 0 else round(
-((page.keywords_count * 10 - page.traffic) / max(page.keywords_count * 10, 1)) * 100, 1
)
if page.decay_rate <= threshold:
decaying.append(page)
elif page.performance_score < 20 and page.keywords_count > 5:
page.is_decaying = True
page.decay_rate = round(-max(30, 100 - page.performance_score * 2), 1)
if page.decay_rate <= threshold:
decaying.append(page)
decaying.sort(key=lambda p: p.decay_rate)
return decaying
# ------------------------------------------------------------------
# Freshness assessment
# ------------------------------------------------------------------
@staticmethod
def analyze_freshness(pages: list[ContentPage]) -> dict[str, int]:
"""
Categorize pages by freshness based on last_modified dates.
Returns distribution: fresh (< 3 months), aging (3-12 months),
stale (> 12 months), unknown (no date).
"""
now = datetime.now()
distribution = {"fresh": 0, "aging": 0, "stale": 0, "unknown": 0}
for page in pages:
if not page.last_modified:
distribution["unknown"] += 1
continue
try:
# Try common date formats
for fmt in ("%Y-%m-%dT%H:%M:%S", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%S%z"):
try:
modified = datetime.strptime(
page.last_modified.replace("+00:00", "").replace("Z", ""), fmt.replace("%z", "")
)
break
except ValueError:
continue
else:
distribution["unknown"] += 1
continue
age = now - modified
if age < timedelta(days=90):
distribution["fresh"] += 1
elif age < timedelta(days=365):
distribution["aging"] += 1
else:
distribution["stale"] += 1
except Exception:
distribution["unknown"] += 1
return distribution
# ------------------------------------------------------------------
# Korean content pattern identification
# ------------------------------------------------------------------
@staticmethod
def identify_korean_patterns(pages: list[ContentPage]) -> dict[str, Any]:
"""
Detect Korean content patterns across pages.
Identifies Naver Blog style review content, listicles,
how-to guides, and informational content patterns.
Returns summary with counts and example URLs per pattern.
"""
results: dict[str, Any] = {
"total_korean_content": 0,
"patterns": {},
}
for pattern_name, keywords in KOREAN_CONTENT_PATTERNS.items():
matches: list[dict[str, str]] = []
for page in pages:
combined = f"{page.url} {page.title}"
for keyword in keywords:
if re.search(keyword, combined, re.IGNORECASE):
matches.append({"url": page.url, "title": page.title, "matched_keyword": keyword})
break
results["patterns"][pattern_name] = {
"count": len(matches),
"examples": matches[:5],
}
korean_urls = set()
for pattern_data in results["patterns"].values():
for example in pattern_data["examples"]:
korean_urls.add(example["url"])
results["total_korean_content"] = len(korean_urls)
return results
# ------------------------------------------------------------------
# Orchestration
# ------------------------------------------------------------------
async def audit(
self,
url: str,
detect_decay_flag: bool = False,
content_type_filter: str | None = None,
limit: int = 200,
) -> ContentAuditResult:
"""
Run full content audit: inventory, scoring, decay, Korean patterns.
Args:
url: Target website URL.
detect_decay_flag: Whether to run decay detection.
content_type_filter: Filter by content type (blog, product, etc.).
limit: Maximum pages to analyze.
Returns:
ContentAuditResult with inventory, top performers, decay, analysis.
"""
result = ContentAuditResult(
url=url,
timestamp=datetime.now().isoformat(),
)
self.logger.info(f"Starting content audit for {url}")
# 1. Gather pages from Ahrefs and sitemap
top_pages_data, traffic_pages_data, sitemap_urls = await asyncio.gather(
self.get_top_pages(url, limit=limit),
self.get_pages_by_traffic(url, limit=limit),
self.crawl_sitemap(url),
)
# 2. Merge and deduplicate pages
page_map: dict[str, ContentPage] = {}
for item in top_pages_data:
page_url = item.get("url", "")
if not page_url:
continue
page_map[page_url] = ContentPage(
url=page_url,
title=item.get("top_keyword", ""),
traffic=int(item.get("traffic", 0)),
keywords_count=int(item.get("keywords", 0)),
backlinks=int(item.get("value", 0)),
)
for item in traffic_pages_data:
page_url = item.get("url", "")
if not page_url:
continue
if page_url in page_map:
existing = page_map[page_url]
existing.traffic = max(existing.traffic, int(item.get("traffic", 0)))
existing.keywords_count = max(existing.keywords_count, int(item.get("keywords", 0)))
else:
page_map[page_url] = ContentPage(
url=page_url,
title=item.get("top_keyword", ""),
traffic=int(item.get("traffic", 0)),
keywords_count=int(item.get("keywords", 0)),
)
# Add sitemap URLs not already present
for s_url in sitemap_urls:
if s_url not in page_map:
page_map[s_url] = ContentPage(url=s_url)
# 3. Classify and score
all_pages: list[ContentPage] = []
for page in page_map.values():
page.content_type = self.classify_content_type(page.url, page.title)
page.performance_score = self.score_performance(page)
all_pages.append(page)
# 4. Filter by content type if requested
if content_type_filter:
all_pages = [p for p in all_pages if p.content_type == content_type_filter]
# 5. Build inventory
by_type: dict[str, int] = {}
for page in all_pages:
by_type[page.content_type] = by_type.get(page.content_type, 0) + 1
avg_score = (
sum(p.performance_score for p in all_pages) / len(all_pages)
if all_pages else 0.0
)
avg_word_count = (
sum(p.word_count for p in all_pages) / len(all_pages)
if all_pages else 0.0
)
freshness = self.analyze_freshness(all_pages)
result.content_inventory = ContentInventory(
total_pages=len(all_pages),
by_type=by_type,
avg_performance_score=round(avg_score, 1),
avg_word_count=round(avg_word_count, 1),
pages=sorted(all_pages, key=lambda p: p.performance_score, reverse=True)[:limit],
freshness_distribution=freshness,
)
# 6. Top performers
result.top_performers = sorted(all_pages, key=lambda p: p.performance_score, reverse=True)[:20]
# 7. Decay detection
if detect_decay_flag:
result.decaying_content = self.detect_decay(all_pages)
# 8. Korean content analysis
result.korean_content_analysis = self.identify_korean_patterns(all_pages)
# 9. Recommendations
result.recommendations = self._generate_recommendations(result)
self.logger.info(
f"Audit complete: {len(all_pages)} pages, "
f"{len(result.top_performers)} top performers, "
f"{len(result.decaying_content)} decaying"
)
return result
@staticmethod
def _generate_recommendations(result: ContentAuditResult) -> list[str]:
"""Generate actionable recommendations from audit data."""
recs: list[str] = []
inv = result.content_inventory
# Low average score
if inv.avg_performance_score < 30:
recs.append(
"전체 콘텐츠 평균 성과 점수가 낮습니다 ({:.0f}/100). "
"상위 콘텐츠 패턴을 분석하여 저성과 페이지를 개선하세요.".format(inv.avg_performance_score)
)
# Stale content
stale = inv.freshness_distribution.get("stale", 0)
total = inv.total_pages or 1
if stale / total > 0.3:
recs.append(
f"오래된 콘텐츠가 {stale}개 ({stale * 100 // total}%)입니다. "
"콘텐츠 업데이트 또는 통합을 고려하세요."
)
# Decaying content
if len(result.decaying_content) > 5:
recs.append(
f"트래픽이 감소하는 콘텐츠가 {len(result.decaying_content)}개 감지되었습니다. "
"상위 감소 페이지부터 콘텐츠 리프레시를 진행하세요."
)
# Content type balance
blog_count = inv.by_type.get("blog", 0)
if blog_count == 0:
recs.append(
"블로그 콘텐츠가 없습니다. SEO 트래픽 확보를 위해 "
"블로그 콘텐츠 전략을 수립하세요."
)
# Korean content opportunities
korean = result.korean_content_analysis
review_count = korean.get("patterns", {}).get("naver_blog_style", {}).get("count", 0)
if review_count == 0:
recs.append(
"후기/리뷰 콘텐츠가 없습니다. 한국 시장에서 후기 콘텐츠는 "
"전환율에 큰 영향을 미치므로 후기 콘텐츠 생성을 권장합니다."
)
if not recs:
recs.append("현재 콘텐츠 전략이 양호합니다. 지속적인 모니터링을 권장합니다.")
return recs
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="SEO Content Auditor - inventory, scoring, and decay detection",
)
parser.add_argument("--url", required=True, help="Target website URL")
parser.add_argument("--decay", action="store_true", help="Enable content decay detection")
parser.add_argument("--type", dest="content_type", help="Filter by content type (blog, product, service, landing, resource)")
parser.add_argument("--limit", type=int, default=200, help="Maximum pages to analyze (default: 200)")
parser.add_argument("--json", action="store_true", help="Output as JSON")
parser.add_argument("--output", help="Save output to file")
return parser
def format_text_report(result: ContentAuditResult) -> str:
"""Format audit result as human-readable text."""
lines: list[str] = []
lines.append(f"## Content Audit: {result.url}")
lines.append(f"**Date**: {result.timestamp[:10]}")
lines.append("")
inv = result.content_inventory
lines.append(f"### Content Inventory")
lines.append(f"- Total pages: {inv.total_pages}")
lines.append(f"- Average performance score: {inv.avg_performance_score}/100")
lines.append(f"- Content types: {json.dumps(inv.by_type, ensure_ascii=False)}")
lines.append(f"- Freshness: {json.dumps(inv.freshness_distribution, ensure_ascii=False)}")
lines.append("")
lines.append("### Top Performers")
for i, page in enumerate(result.top_performers[:10], 1):
lines.append(f" {i}. [{page.performance_score:.0f}] {page.url} (traffic: {page.traffic})")
lines.append("")
if result.decaying_content:
lines.append("### Decaying Content")
for i, page in enumerate(result.decaying_content[:10], 1):
lines.append(f" {i}. [{page.decay_rate:+.0f}%] {page.url} (traffic: {page.traffic})")
lines.append("")
if result.korean_content_analysis.get("patterns"):
lines.append("### Korean Content Patterns")
for pattern_name, data in result.korean_content_analysis["patterns"].items():
lines.append(f" - {pattern_name}: {data['count']} pages")
lines.append("")
lines.append("### Recommendations")
for i, rec in enumerate(result.recommendations, 1):
lines.append(f" {i}. {rec}")
return "\n".join(lines)
async def main() -> None:
parser = build_parser()
args = parser.parse_args()
auditor = ContentAuditor()
try:
result = await auditor.audit(
url=args.url,
detect_decay_flag=args.decay,
content_type_filter=args.content_type,
limit=args.limit,
)
if args.json:
output = json.dumps(asdict(result), ensure_ascii=False, indent=2, default=str)
else:
output = format_text_report(result)
if args.output:
with open(args.output, "w", encoding="utf-8") as f:
f.write(output)
logger.info(f"Output saved to {args.output}")
else:
print(output)
finally:
await auditor.close()
auditor.print_stats()
if __name__ == "__main__":
asyncio.run(main())