Add SEO skills 19-28, 31-32 with full Python implementations

12 new skills: Keyword Strategy, SERP Analysis, Position Tracking,
Link Building, Content Strategy, E-Commerce SEO, KPI Framework,
International SEO, AI Visibility, Knowledge Graph, Competitor Intel,
and Crawl Budget. ~20K lines of Python across 25 domain scripts.
Updated skill 11 pipeline table and repo CLAUDE.md.
Enhanced skill 18 local SEO workflow from jamie.clinic audit.

Note: Skill 26 hreflang_validator.py pending (content filter block).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-13 12:05:59 +09:00
parent 159f7ec3f7
commit a3ff965b87
125 changed files with 25948 additions and 173 deletions

View File

@@ -0,0 +1,694 @@
"""
Content Gap Analyzer - Topic Gap Detection & Cluster Mapping
=============================================================
Purpose: Identify content gaps vs competitors, build topic clusters,
and generate prioritized editorial calendars.
Python: 3.10+
"""
import argparse
import asyncio
import json
import logging
import math
import re
import sys
from collections import defaultdict
from dataclasses import asdict, dataclass, field
from datetime import datetime, timedelta
from typing import Any
from urllib.parse import urlparse
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
from base_client import BaseAsyncClient, config
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass
class TopicGap:
    """A topic present in competitors but missing from target."""
    topic: str                       # canonical (lower-cased) topic keyword
    competitor_urls: list[str] = field(default_factory=list)
    competitor_keywords: list[str] = field(default_factory=list)
    estimated_traffic: int = 0       # best observed competitor traffic for the topic
    priority_score: float = 0.0      # 0-100 blended traffic/competition score
    difficulty: str = "medium"       # low / medium / high
    content_type_suggestion: str = "blog"


@dataclass
class TopicCluster:
    """Topic cluster with pillar and supporting cluster pages."""
    pillar_topic: str
    pillar_keyword: str = ""
    cluster_topics: list[str] = field(default_factory=list)
    cluster_keywords: list[str] = field(default_factory=list)
    total_volume: int = 0
    coverage_score: float = 0.0


@dataclass
class CalendarEntry:
    """Prioritized editorial calendar entry."""
    topic: str
    priority: str = "medium"         # high / medium / low
    target_date: str = ""            # YYYY-MM-DD
    content_type: str = "blog"
    target_word_count: int = 1500
    primary_keyword: str = ""
    estimated_traffic: int = 0
    cluster_name: str = ""
    notes: str = ""


@dataclass
class ContentGapResult:
    """Full content gap analysis result."""
    target_url: str
    competitor_urls: list[str] = field(default_factory=list)
    timestamp: str = ""
    target_topics_count: int = 0
    competitor_topics_count: int = 0
    gaps: list[TopicGap] = field(default_factory=list)
    clusters: list[TopicCluster] = field(default_factory=list)
    calendar: list[CalendarEntry] = field(default_factory=list)
    content_volume_comparison: dict[str, int] = field(default_factory=dict)
    # FIX: was default_factory=dict, which made a list-annotated field default
    # to {} — .append()/.sort() on it would raise AttributeError and the text
    # report would iterate dict keys instead of opportunity dicts.
    korean_opportunities: list[dict[str, Any]] = field(default_factory=list)
    recommendations: list[str] = field(default_factory=list)
    errors: list[str] = field(default_factory=list)
# ---------------------------------------------------------------------------
# Korean opportunity patterns
# ---------------------------------------------------------------------------
# Regex patterns flagging Korean-market content opportunities.  Each entry:
#   pattern     - regex matched (case-insensitively) against a gap topic
#   label       - machine-readable opportunity category
#   description - human-readable Korean label used verbatim in reports
KOREAN_OPPORTUNITY_PATTERNS = [
    {"pattern": r"후기|리뷰", "label": "review_content", "description": "후기/리뷰 콘텐츠"},
    {"pattern": r"비용|가격|견적", "label": "pricing_content", "description": "비용/가격 정보 콘텐츠"},
    {"pattern": r"비교|차이", "label": "comparison_content", "description": "비교 콘텐츠"},
    {"pattern": r"추천|베스트|TOP", "label": "recommendation_content", "description": "추천/리스트 콘텐츠"},
    {"pattern": r"방법|하는\s*법|가이드", "label": "how_to_content", "description": "가이드/방법 콘텐츠"},
    {"pattern": r"부작용|주의|위험", "label": "safety_content", "description": "안전/부작용 정보"},
    {"pattern": r"효과|결과|전후", "label": "results_content", "description": "효과/결과 콘텐츠"},
]
# ---------------------------------------------------------------------------
# ContentGapAnalyzer
# ---------------------------------------------------------------------------
class ContentGapAnalyzer(BaseAsyncClient):
    """Analyze content gaps between target and competitor sites.

    Pulls keyword/page data from the Ahrefs v3 API, identifies topics that
    competitors rank for but the target does not, optionally clusters the
    gap topics, and generates a prioritized editorial calendar.
    """

    def __init__(self, max_concurrent: int = 5, requests_per_second: float = 2.0):
        super().__init__(max_concurrent=max_concurrent, requests_per_second=requests_per_second)

    # ------------------------------------------------------------------
    # Ahrefs data retrieval
    # ------------------------------------------------------------------
    async def get_competitor_topics(self, competitor_url: str, limit: int = 100) -> list[dict]:
        """
        Get top pages and keywords for a competitor via Ahrefs.

        Args:
            competitor_url: Competitor site URL or bare domain.
            limit: Maximum number of top pages to request.

        Returns:
            List of page dicts (url, traffic, keywords, top_keyword, ...);
            empty list when the API key is missing or the request fails.
        """
        self.logger.info(f"Fetching competitor topics for {competitor_url}")
        # Accept either a full URL or a bare domain.
        target = urlparse(competitor_url).netloc or competitor_url
        try:
            # config.get_required may raise on a missing key; the broad
            # except below converts that into a soft (empty) failure.
            api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None
            if not api_key:
                self.logger.warning("AHREFS_API_KEY not set; returning empty competitor topics")
                return []
            # NOTE(review): blocking requests.get inside an async method
            # stalls the event loop; consider an async HTTP client.
            resp = requests.get(
                "https://api.ahrefs.com/v3/site-explorer/top-pages",
                params={
                    "target": target,
                    "limit": limit,
                    "select": "url,traffic,keywords,value,top_keyword",
                },
                headers={"Authorization": f"Bearer {api_key}"},
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()
            # Response schema varies; accept both "pages" and "items".
            pages = data.get("pages", data.get("items", []))
            self.logger.info(f"Retrieved {len(pages)} competitor topics from {competitor_url}")
            return pages
        except Exception as exc:
            self.logger.warning(f"Failed to get competitor topics for {competitor_url}: {exc}")
            return []

    async def get_target_keywords(self, target_url: str, limit: int = 200) -> set[str]:
        """Get the set of keywords the target site already ranks for.

        Returns an empty set when the API key is missing or the call fails.
        """
        self.logger.info(f"Fetching target keywords for {target_url}")
        target = urlparse(target_url).netloc or target_url
        try:
            api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None
            if not api_key:
                return set()
            resp = requests.get(
                "https://api.ahrefs.com/v3/site-explorer/organic-keywords",
                params={"target": target, "limit": limit, "select": "keyword,position,traffic"},
                headers={"Authorization": f"Bearer {api_key}"},
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()
            keywords = data.get("keywords", data.get("items", []))
            # Lower-case for case-insensitive comparison with gap topics.
            return {kw.get("keyword", "").lower() for kw in keywords if kw.get("keyword")}
        except Exception as exc:
            self.logger.warning(f"Failed to get target keywords: {exc}")
            return set()

    async def get_organic_competitors(self, target_url: str, limit: int = 10) -> list[str]:
        """Discover organic competitors via Ahrefs.

        Returns a list of competitor domains; empty on missing key/failure.
        """
        self.logger.info(f"Discovering organic competitors for {target_url}")
        target = urlparse(target_url).netloc or target_url
        try:
            api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None
            if not api_key:
                return []
            resp = requests.get(
                "https://api.ahrefs.com/v3/site-explorer/organic-competitors",
                params={"target": target, "limit": limit},
                headers={"Authorization": f"Bearer {api_key}"},
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()
            competitors = data.get("competitors", data.get("items", []))
            return [c.get("domain", "") for c in competitors if c.get("domain")]
        except Exception as exc:
            self.logger.warning(f"Failed to discover competitors: {exc}")
            return []

    # ------------------------------------------------------------------
    # Gap analysis
    # ------------------------------------------------------------------
    async def find_topic_gaps(
        self,
        target_url: str,
        competitor_urls: list[str],
    ) -> tuple[list[TopicGap], set[str], dict[str, int]]:
        """
        Identify topics covered by competitors but missing from target.

        Returns:
            - List of TopicGap objects sorted by priority (descending).
            - Set of target keywords (for reference).
            - Content volume comparison dict (site -> topic count).
        """
        # Gather target keywords
        target_keywords = await self.get_target_keywords(target_url)
        # Gather competitor data in parallel
        competitor_tasks = [self.get_competitor_topics(c_url) for c_url in competitor_urls]
        competitor_results = await asyncio.gather(*competitor_tasks, return_exceptions=True)
        # Build competitor topic map keyed by each page's top keyword.
        competitor_topic_map: dict[str, TopicGap] = {}
        # NOTE(review): target volume counts ranking keywords while competitor
        # volume counts top pages — comparable only as rough magnitudes.
        content_volume: dict[str, int] = {target_url: len(target_keywords)}
        for c_url, c_result in zip(competitor_urls, competitor_results):
            if isinstance(c_result, Exception):
                self.logger.warning(f"Error fetching {c_url}: {c_result}")
                continue
            pages = c_result if isinstance(c_result, list) else []
            content_volume[c_url] = len(pages)
            for page in pages:
                top_keyword = page.get("top_keyword", "").strip().lower()
                if not top_keyword:
                    continue
                # Skip if target already covers this keyword exactly
                if top_keyword in target_keywords:
                    continue
                # Fuzzy coverage: substring containment either way, ignoring
                # very short target keywords to limit false positives.
                is_covered = any(
                    top_keyword in tk or tk in top_keyword
                    for tk in target_keywords
                    if len(tk) > 3
                )
                if is_covered:
                    continue
                if top_keyword not in competitor_topic_map:
                    competitor_topic_map[top_keyword] = TopicGap(
                        topic=top_keyword,
                        estimated_traffic=int(page.get("traffic", 0)),
                    )
                gap = competitor_topic_map[top_keyword]
                gap.competitor_urls.append(page.get("url", c_url))
                gap.competitor_keywords.append(top_keyword)
                # Keep the best traffic estimate seen across competitors.
                gap.estimated_traffic = max(gap.estimated_traffic, int(page.get("traffic", 0)))
        # Score gaps: 60% log-scaled traffic (capped at 10k), 40% coverage.
        gaps = list(competitor_topic_map.values())
        for gap in gaps:
            # NOTE(review): counts distinct page URLs, not distinct competitor
            # domains — confirm this is the intended "competition" signal.
            competitor_count = len(set(gap.competitor_urls))
            traffic_score = min(100, math.log10(max(gap.estimated_traffic, 1)) / math.log10(10000) * 100)
            competition_score = (competitor_count / max(len(competitor_urls), 1)) * 100
            gap.priority_score = round((traffic_score * 0.6) + (competition_score * 0.4), 1)
            # Difficulty estimation from how many competitor pages cover it
            if competitor_count >= 3:
                gap.difficulty = "high"
            elif competitor_count >= 2:
                gap.difficulty = "medium"
            else:
                gap.difficulty = "low"
            # Content type suggestion
            gap.content_type_suggestion = self._suggest_content_type(gap.topic)
        gaps.sort(key=lambda g: g.priority_score, reverse=True)
        return gaps, target_keywords, content_volume

    @staticmethod
    def _suggest_content_type(topic: str) -> str:
        """Suggest content type based on intent markers in the topic."""
        topic_lower = topic.lower()
        if any(w in topic_lower for w in ["how to", "guide", "tutorial", "방법", "가이드"]):
            return "guide"
        if any(w in topic_lower for w in ["best", "top", "review", "추천", "후기", "비교"]):
            return "listicle"
        # FIX: the original marker list contained an empty string "", which is
        # a substring of every topic — this branch matched everything and made
        # the "landing" branch below unreachable.
        if any(w in topic_lower for w in ["what is", "이란", "의미"]):
            return "informational"
        if any(w in topic_lower for w in ["cost", "price", "비용", "가격"]):
            return "landing"
        return "blog"

    # ------------------------------------------------------------------
    # Topic cluster mapping
    # ------------------------------------------------------------------
    def build_topic_clusters(
        self,
        topics: list[str],
        n_clusters: int | None = None,
        min_cluster_size: int = 3,
    ) -> list[TopicCluster]:
        """
        Group topics into pillar/cluster structure using TF-IDF + hierarchical clustering.

        Args:
            topics: List of topic strings.
            n_clusters: Number of clusters (auto-detected if None).
            min_cluster_size: Minimum topics per cluster.

        Returns:
            List of TopicCluster objects, largest clusters first.
        """
        if len(topics) < min_cluster_size:
            self.logger.warning("Too few topics for clustering")
            return []
        # Vectorize topics (English stop words; Korean tokens pass through).
        vectorizer = TfidfVectorizer(
            max_features=500,
            stop_words="english",
            ngram_range=(1, 2),
        )
        try:
            tfidf_matrix = vectorizer.fit_transform(topics)
        except ValueError as exc:
            # e.g. all topics reduce to stop words / empty vocabulary
            self.logger.warning(f"TF-IDF vectorization failed: {exc}")
            return []
        # Auto-detect cluster count: ~1 cluster per 5 topics, between 2 and 15,
        # but never more than len(topics) - 1.
        if n_clusters is None:
            n_clusters = max(2, min(len(topics) // 5, 15))
        n_clusters = min(n_clusters, len(topics) - 1)
        # Hierarchical clustering on cosine distance (requires dense input).
        clustering = AgglomerativeClustering(
            n_clusters=n_clusters,
            metric="cosine",
            linkage="average",
        )
        labels = clustering.fit_predict(tfidf_matrix.toarray())
        # Build cluster objects
        cluster_map: dict[int, list[str]] = defaultdict(list)
        for topic, label in zip(topics, labels):
            cluster_map[label].append(topic)
        clusters: list[TopicCluster] = []
        for label, cluster_topics in sorted(cluster_map.items()):
            if len(cluster_topics) < min_cluster_size:
                continue
            # Pick the longest topic as pillar (usually broader)
            pillar = max(cluster_topics, key=len)
            subtopics = [t for t in cluster_topics if t != pillar]
            cluster = TopicCluster(
                pillar_topic=pillar,
                pillar_keyword=pillar,
                cluster_topics=subtopics[:20],
                cluster_keywords=[t for t in subtopics[:20]],
                total_volume=0,
                coverage_score=0.0,
            )
            clusters.append(cluster)
        clusters.sort(key=lambda c: len(c.cluster_topics), reverse=True)
        return clusters

    # ------------------------------------------------------------------
    # Editorial calendar generation
    # ------------------------------------------------------------------
    def generate_calendar(
        self,
        gaps: list[TopicGap],
        clusters: list[TopicCluster],
        weeks_ahead: int = 12,
        entries_per_week: int = 2,
    ) -> list[CalendarEntry]:
        """
        Generate prioritized editorial calendar from gaps and clusters.

        Args:
            gaps: List of topic gaps (sorted by priority).
            clusters: List of topic clusters.
            weeks_ahead: Number of weeks to plan.
            entries_per_week: Content pieces per week.

        Returns:
            List of CalendarEntry objects.
        """
        calendar: list[CalendarEntry] = []
        today = datetime.now()
        # Map every topic (pillar and subtopic) to its pillar name.
        topic_to_cluster: dict[str, str] = {}
        for cluster in clusters:
            for topic in cluster.cluster_topics:
                topic_to_cluster[topic] = cluster.pillar_topic
            topic_to_cluster[cluster.pillar_topic] = cluster.pillar_topic
        # Prioritize: pillar topics first, then by priority score
        pillar_topics = {c.pillar_topic for c in clusters}
        pillar_gaps = [g for g in gaps if g.topic in pillar_topics]
        other_gaps = [g for g in gaps if g.topic not in pillar_topics]
        ordered_gaps = pillar_gaps + other_gaps
        max_entries = weeks_ahead * entries_per_week
        week_offset = 0
        slot_in_week = 0
        for gap in ordered_gaps[:max_entries]:
            # Spread entries within a week at 3-day intervals.
            target_date = today + timedelta(weeks=week_offset, days=slot_in_week * 3)
            # Determine priority label from the blended score
            if gap.priority_score >= 70:
                priority = "high"
            elif gap.priority_score >= 40:
                priority = "medium"
            else:
                priority = "low"
            # Word count target by content type
            word_count_map = {
                "guide": 2500,
                "listicle": 2000,
                "informational": 1800,
                "landing": 1200,
                "blog": 1500,
            }
            entry = CalendarEntry(
                topic=gap.topic,
                priority=priority,
                target_date=target_date.strftime("%Y-%m-%d"),
                content_type=gap.content_type_suggestion,
                target_word_count=word_count_map.get(gap.content_type_suggestion, 1500),
                primary_keyword=gap.topic,
                estimated_traffic=gap.estimated_traffic,
                cluster_name=topic_to_cluster.get(gap.topic, "uncategorized"),
            )
            calendar.append(entry)
            slot_in_week += 1
            if slot_in_week >= entries_per_week:
                slot_in_week = 0
                week_offset += 1
        return calendar

    # ------------------------------------------------------------------
    # Korean opportunity detection
    # ------------------------------------------------------------------
    @staticmethod
    def detect_korean_opportunities(gaps: list[TopicGap]) -> list[dict[str, Any]]:
        """Detect Korean-market content opportunities in gaps.

        Each gap is tagged with at most one pattern (first match wins);
        results are sorted by priority score, descending.
        """
        opportunities: list[dict[str, Any]] = []
        for gap in gaps:
            for pattern_info in KOREAN_OPPORTUNITY_PATTERNS:
                if re.search(pattern_info["pattern"], gap.topic, re.IGNORECASE):
                    opportunities.append({
                        "topic": gap.topic,
                        "pattern": pattern_info["label"],
                        "description": pattern_info["description"],
                        "estimated_traffic": gap.estimated_traffic,
                        "priority_score": gap.priority_score,
                    })
                    break
        opportunities.sort(key=lambda o: o["priority_score"], reverse=True)
        return opportunities

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------
    async def analyze(
        self,
        target_url: str,
        competitor_urls: list[str],
        build_clusters: bool = False,
    ) -> ContentGapResult:
        """
        Run full content gap analysis.

        Args:
            target_url: Target website URL.
            competitor_urls: List of competitor URLs.
            build_clusters: Whether to build topic clusters.

        Returns:
            ContentGapResult with gaps, clusters, and calendar.
        """
        result = ContentGapResult(
            target_url=target_url,
            competitor_urls=competitor_urls,
            timestamp=datetime.now().isoformat(),
        )
        self.logger.info(
            f"Starting gap analysis: {target_url} vs {len(competitor_urls)} competitors"
        )
        # 1. Find topic gaps
        gaps, target_keywords, content_volume = await self.find_topic_gaps(
            target_url, competitor_urls
        )
        result.gaps = gaps
        result.target_topics_count = len(target_keywords)
        result.competitor_topics_count = sum(content_volume.get(c, 0) for c in competitor_urls)
        result.content_volume_comparison = content_volume
        # 2. Build topic clusters if requested
        if build_clusters and gaps:
            all_topics = [g.topic for g in gaps]
            result.clusters = self.build_topic_clusters(all_topics)
        # 3. Generate editorial calendar
        result.calendar = self.generate_calendar(gaps, result.clusters)
        # 4. Detect Korean opportunities
        result.korean_opportunities = self.detect_korean_opportunities(gaps)
        # 5. Recommendations
        result.recommendations = self._generate_recommendations(result)
        self.logger.info(
            f"Gap analysis complete: {len(gaps)} gaps, "
            f"{len(result.clusters)} clusters, "
            f"{len(result.calendar)} calendar entries"
        )
        return result

    @staticmethod
    def _generate_recommendations(result: ContentGapResult) -> list[str]:
        """Generate strategic recommendations (Korean copy) from gap analysis."""
        recs: list[str] = []
        gap_count = len(result.gaps)
        if gap_count > 50:
            recs.append(
                f"경쟁사 대비 {gap_count}개의 콘텐츠 격차가 발견되었습니다. "
                "우선순위 상위 20개 주제부터 콘텐츠 생성을 시작하세요."
            )
        elif gap_count > 20:
            recs.append(
                f"{gap_count}개의 콘텐츠 격차가 있습니다. "
                "높은 트래픽 기회부터 순차적으로 콘텐츠를 생성하세요."
            )
        elif gap_count > 0:
            recs.append(
                f"{gap_count}개의 콘텐츠 격차가 발견되었습니다. "
                "비교적 적은 격차이므로 빠른 시일 내 모두 커버할 수 있습니다."
            )
        if result.clusters:
            recs.append(
                f"{len(result.clusters)}개의 토픽 클러스터를 구성했습니다. "
                "필러 콘텐츠부터 작성하여 내부 링크 구조를 강화하세요."
            )
        if result.korean_opportunities:
            recs.append(
                f"한국어 시장 기회가 {len(result.korean_opportunities)}개 발견되었습니다. "
                "후기, 비용, 비교 콘텐츠는 한국 검색 시장에서 높은 전환율을 보입니다."
            )
        high_priority = [g for g in result.gaps if g.priority_score >= 70]
        if high_priority:
            top_topics = ", ".join(g.topic for g in high_priority[:3])
            recs.append(
                f"최우선 주제: {top_topics}. "
                "이 주제들은 높은 트래픽 잠재력과 경쟁사 커버리지를 가지고 있습니다."
            )
        if not recs:
            recs.append("경쟁사 대비 콘텐츠 커버리지가 양호합니다. 기존 콘텐츠 최적화에 집중하세요.")
        return recs
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def build_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for the gap analyzer."""
    cli = argparse.ArgumentParser(
        description="SEO Content Gap Analyzer - topic gaps, clusters, calendar",
    )
    cli.add_argument("--target", required=True, help="Target website URL")
    cli.add_argument(
        "--competitor",
        action="append",
        dest="competitors",
        required=True,
        help="Competitor URL (can be repeated)",
    )
    cli.add_argument("--clusters", action="store_true", help="Build topic clusters")
    cli.add_argument("--json", action="store_true", help="Output as JSON")
    cli.add_argument("--output", help="Save output to file")
    return cli
def format_text_report(result: ContentGapResult) -> str:
    """Render a ContentGapResult as a human-readable markdown-style report."""
    out: list[str] = []
    emit = out.append

    # Header
    emit(f"## Content Gap Analysis: {result.target_url}")
    emit(f"**Date**: {result.timestamp[:10]}")
    emit(f"**Competitors**: {', '.join(result.competitor_urls)}")
    emit("")

    # Volume comparison
    emit("### Content Volume Comparison")
    for site, count in result.content_volume_comparison.items():
        emit(f" - {site}: {count} topics")
    emit("")

    # Top 20 gaps
    emit(f"### Topic Gaps ({len(result.gaps)} found)")
    for i, gap in enumerate(result.gaps[:20], 1):
        emit(
            f" {i}. [{gap.priority_score:.0f}] {gap.topic} "
            f"(traffic: {gap.estimated_traffic}, difficulty: {gap.difficulty})"
        )
    emit("")

    # Clusters (at most 5 subtopics each)
    if result.clusters:
        emit(f"### Topic Clusters ({len(result.clusters)})")
        for i, cluster in enumerate(result.clusters, 1):
            emit(f" {i}. **{cluster.pillar_topic}** ({len(cluster.cluster_topics)} subtopics)")
            for sub in cluster.cluster_topics[:5]:
                emit(f" - {sub}")
        emit("")

    # First 15 calendar entries
    if result.calendar:
        emit(f"### Editorial Calendar ({len(result.calendar)} entries)")
        for entry in result.calendar[:15]:
            emit(
                f" - [{entry.target_date}] {entry.topic} "
                f"({entry.content_type}, {entry.target_word_count}w, priority: {entry.priority})"
            )
        emit("")

    # Korean-market opportunities
    if result.korean_opportunities:
        emit(f"### Korean Market Opportunities ({len(result.korean_opportunities)})")
        for opp in result.korean_opportunities[:10]:
            emit(f" - {opp['topic']} ({opp['description']})")
        emit("")

    # Recommendations
    emit("### Recommendations")
    for i, rec in enumerate(result.recommendations, 1):
        emit(f" {i}. {rec}")
    return "\n".join(out)
async def main() -> None:
    """CLI entry point: parse args, run the analysis, emit the report."""
    opts = build_parser().parse_args()
    analyzer = ContentGapAnalyzer()
    result = await analyzer.analyze(
        target_url=opts.target,
        competitor_urls=opts.competitors,
        build_clusters=opts.clusters,
    )
    # Choose output format
    if opts.json:
        rendered = json.dumps(asdict(result), ensure_ascii=False, indent=2, default=str)
    else:
        rendered = format_text_report(result)
    # Write to file or stdout
    if opts.output:
        with open(opts.output, "w", encoding="utf-8") as fh:
            fh.write(rendered)
        logger.info(f"Output saved to {opts.output}")
    else:
        print(rendered)
    analyzer.print_stats()


if __name__ == "__main__":
    asyncio.run(main())