""" Content Gap Analyzer - Topic Gap Detection & Cluster Mapping ============================================================= Purpose: Identify content gaps vs competitors, build topic clusters, and generate prioritized editorial calendars. Python: 3.10+ """ import argparse import asyncio import json import logging import math import re import sys from collections import defaultdict from dataclasses import asdict, dataclass, field from datetime import datetime, timedelta from typing import Any from urllib.parse import urlparse import requests from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.cluster import AgglomerativeClustering from base_client import BaseAsyncClient, config logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Data classes # --------------------------------------------------------------------------- @dataclass class TopicGap: """A topic present in competitors but missing from target.""" topic: str competitor_urls: list[str] = field(default_factory=list) competitor_keywords: list[str] = field(default_factory=list) estimated_traffic: int = 0 priority_score: float = 0.0 difficulty: str = "medium" content_type_suggestion: str = "blog" @dataclass class TopicCluster: """Topic cluster with pillar and supporting cluster pages.""" pillar_topic: str pillar_keyword: str = "" cluster_topics: list[str] = field(default_factory=list) cluster_keywords: list[str] = field(default_factory=list) total_volume: int = 0 coverage_score: float = 0.0 @dataclass class CalendarEntry: """Prioritized editorial calendar entry.""" topic: str priority: str = "medium" target_date: str = "" content_type: str = "blog" target_word_count: int = 1500 primary_keyword: str = "" estimated_traffic: int = 0 cluster_name: str = "" notes: str = "" @dataclass class ContentGapResult: """Full content gap analysis result.""" target_url: str competitor_urls: list[str] = field(default_factory=list) timestamp: str = "" target_topics_count: int = 0 competitor_topics_count: int = 0 gaps: list[TopicGap] = field(default_factory=list) clusters: list[TopicCluster] = field(default_factory=list) calendar: list[CalendarEntry] = field(default_factory=list) content_volume_comparison: dict[str, int] = field(default_factory=dict) korean_opportunities: list[dict[str, Any]] = field(default_factory=dict) recommendations: list[str] = field(default_factory=list) errors: list[str] = field(default_factory=list) # --------------------------------------------------------------------------- # Korean opportunity patterns # --------------------------------------------------------------------------- KOREAN_OPPORTUNITY_PATTERNS = [ {"pattern": r"후기|리뷰", "label": "review_content", "description": "후기/리뷰 콘텐츠"}, {"pattern": r"비용|가격|견적", "label": "pricing_content", "description": "비용/가격 정보 콘텐츠"}, {"pattern": r"비교|차이", "label": "comparison_content", "description": "비교 콘텐츠"}, {"pattern": r"추천|베스트|TOP", "label": "recommendation_content", "description": "추천/리스트 콘텐츠"}, {"pattern": r"방법|하는\s*법|가이드", "label": "how_to_content", "description": "가이드/방법 콘텐츠"}, {"pattern": r"부작용|주의|위험", "label": "safety_content", "description": "안전/부작용 정보"}, {"pattern": r"효과|결과|전후", "label": "results_content", "description": "효과/결과 콘텐츠"}, ] # --------------------------------------------------------------------------- # ContentGapAnalyzer # --------------------------------------------------------------------------- class ContentGapAnalyzer(BaseAsyncClient): """Analyze content gaps between target and competitor sites.""" def __init__(self, max_concurrent: int = 5, requests_per_second: float = 2.0): super().__init__(max_concurrent=max_concurrent, requests_per_second=requests_per_second) # ------------------------------------------------------------------ # Ahrefs data retrieval # ------------------------------------------------------------------ async def get_competitor_topics(self, competitor_url: str, limit: int = 100) -> list[dict]: """ Get top pages and keywords for a competitor via Ahrefs. Returns list of dicts: url, traffic, keywords, top_keyword, title. """ self.logger.info(f"Fetching competitor topics for {competitor_url}") target = urlparse(competitor_url).netloc or competitor_url try: api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None if not api_key: self.logger.warning("AHREFS_API_KEY not set; returning empty competitor topics") return [] resp = requests.get( "https://api.ahrefs.com/v3/site-explorer/top-pages", params={ "target": target, "limit": limit, "select": "url,traffic,keywords,value,top_keyword", }, headers={"Authorization": f"Bearer {api_key}"}, timeout=30, ) resp.raise_for_status() data = resp.json() pages = data.get("pages", data.get("items", [])) self.logger.info(f"Retrieved {len(pages)} competitor topics from {competitor_url}") return pages except Exception as exc: self.logger.warning(f"Failed to get competitor topics for {competitor_url}: {exc}") return [] async def get_target_keywords(self, target_url: str, limit: int = 200) -> set[str]: """Get the set of keywords the target site already ranks for.""" self.logger.info(f"Fetching target keywords for {target_url}") target = urlparse(target_url).netloc or target_url try: api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None if not api_key: return set() resp = requests.get( "https://api.ahrefs.com/v3/site-explorer/organic-keywords", params={"target": target, "limit": limit, "select": "keyword,position,traffic"}, headers={"Authorization": f"Bearer {api_key}"}, timeout=30, ) resp.raise_for_status() data = resp.json() keywords = data.get("keywords", data.get("items", [])) return {kw.get("keyword", "").lower() for kw in keywords if kw.get("keyword")} except Exception as exc: self.logger.warning(f"Failed to get target keywords: {exc}") return set() async def get_organic_competitors(self, target_url: str, limit: int = 10) -> list[str]: """Discover organic competitors via Ahrefs.""" self.logger.info(f"Discovering organic competitors for {target_url}") target = urlparse(target_url).netloc or target_url try: api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None if not api_key: return [] resp = requests.get( "https://api.ahrefs.com/v3/site-explorer/organic-competitors", params={"target": target, "limit": limit}, headers={"Authorization": f"Bearer {api_key}"}, timeout=30, ) resp.raise_for_status() data = resp.json() competitors = data.get("competitors", data.get("items", [])) return [c.get("domain", "") for c in competitors if c.get("domain")] except Exception as exc: self.logger.warning(f"Failed to discover competitors: {exc}") return [] # ------------------------------------------------------------------ # Gap analysis # ------------------------------------------------------------------ async def find_topic_gaps( self, target_url: str, competitor_urls: list[str], ) -> tuple[list[TopicGap], set[str], dict[str, int]]: """ Identify topics covered by competitors but missing from target. Returns: - List of TopicGap objects. - Set of target keywords (for reference). - Content volume comparison dict. """ # Gather target keywords target_keywords = await self.get_target_keywords(target_url) # Gather competitor data in parallel competitor_tasks = [self.get_competitor_topics(c_url) for c_url in competitor_urls] competitor_results = await asyncio.gather(*competitor_tasks, return_exceptions=True) # Build competitor topic map competitor_topic_map: dict[str, TopicGap] = {} content_volume: dict[str, int] = {target_url: len(target_keywords)} for c_url, c_result in zip(competitor_urls, competitor_results): if isinstance(c_result, Exception): self.logger.warning(f"Error fetching {c_url}: {c_result}") continue pages = c_result if isinstance(c_result, list) else [] content_volume[c_url] = len(pages) for page in pages: top_keyword = page.get("top_keyword", "").strip().lower() if not top_keyword: continue # Skip if target already covers this keyword if top_keyword in target_keywords: continue # Check for fuzzy matches (keyword contained in target set) is_covered = any( top_keyword in tk or tk in top_keyword for tk in target_keywords if len(tk) > 3 ) if is_covered: continue if top_keyword not in competitor_topic_map: competitor_topic_map[top_keyword] = TopicGap( topic=top_keyword, estimated_traffic=int(page.get("traffic", 0)), ) gap = competitor_topic_map[top_keyword] gap.competitor_urls.append(page.get("url", c_url)) gap.competitor_keywords.append(top_keyword) gap.estimated_traffic = max(gap.estimated_traffic, int(page.get("traffic", 0))) # Score gaps gaps = list(competitor_topic_map.values()) for gap in gaps: competitor_count = len(set(gap.competitor_urls)) traffic_score = min(100, math.log10(max(gap.estimated_traffic, 1)) / math.log10(10000) * 100) competition_score = (competitor_count / max(len(competitor_urls), 1)) * 100 gap.priority_score = round((traffic_score * 0.6) + (competition_score * 0.4), 1) # Difficulty estimation if competitor_count >= 3: gap.difficulty = "high" elif competitor_count >= 2: gap.difficulty = "medium" else: gap.difficulty = "low" # Content type suggestion gap.content_type_suggestion = self._suggest_content_type(gap.topic) gaps.sort(key=lambda g: g.priority_score, reverse=True) return gaps, target_keywords, content_volume @staticmethod def _suggest_content_type(topic: str) -> str: """Suggest content type based on topic keywords.""" topic_lower = topic.lower() if any(w in topic_lower for w in ["how to", "guide", "tutorial", "방법", "가이드"]): return "guide" if any(w in topic_lower for w in ["best", "top", "review", "추천", "후기", "비교"]): return "listicle" if any(w in topic_lower for w in ["what is", "이란", "뜻", "의미"]): return "informational" if any(w in topic_lower for w in ["cost", "price", "비용", "가격"]): return "landing" return "blog" # ------------------------------------------------------------------ # Topic cluster mapping # ------------------------------------------------------------------ def build_topic_clusters( self, topics: list[str], n_clusters: int | None = None, min_cluster_size: int = 3, ) -> list[TopicCluster]: """ Group topics into pillar/cluster structure using TF-IDF + hierarchical clustering. Args: topics: List of topic strings. n_clusters: Number of clusters (auto-detected if None). min_cluster_size: Minimum topics per cluster. Returns: List of TopicCluster objects. """ if len(topics) < min_cluster_size: self.logger.warning("Too few topics for clustering") return [] # Vectorize topics vectorizer = TfidfVectorizer( max_features=500, stop_words="english", ngram_range=(1, 2), ) try: tfidf_matrix = vectorizer.fit_transform(topics) except ValueError as exc: self.logger.warning(f"TF-IDF vectorization failed: {exc}") return [] # Auto-detect cluster count if n_clusters is None: n_clusters = max(2, min(len(topics) // 5, 15)) n_clusters = min(n_clusters, len(topics) - 1) # Hierarchical clustering clustering = AgglomerativeClustering( n_clusters=n_clusters, metric="cosine", linkage="average", ) labels = clustering.fit_predict(tfidf_matrix.toarray()) # Build cluster objects cluster_map: dict[int, list[str]] = defaultdict(list) for topic, label in zip(topics, labels): cluster_map[label].append(topic) clusters: list[TopicCluster] = [] for label, cluster_topics in sorted(cluster_map.items()): if len(cluster_topics) < min_cluster_size: continue # Pick the longest topic as pillar (usually broader) pillar = max(cluster_topics, key=len) subtopics = [t for t in cluster_topics if t != pillar] cluster = TopicCluster( pillar_topic=pillar, pillar_keyword=pillar, cluster_topics=subtopics[:20], cluster_keywords=[t for t in subtopics[:20]], total_volume=0, coverage_score=0.0, ) clusters.append(cluster) clusters.sort(key=lambda c: len(c.cluster_topics), reverse=True) return clusters # ------------------------------------------------------------------ # Editorial calendar generation # ------------------------------------------------------------------ def generate_calendar( self, gaps: list[TopicGap], clusters: list[TopicCluster], weeks_ahead: int = 12, entries_per_week: int = 2, ) -> list[CalendarEntry]: """ Generate prioritized editorial calendar from gaps and clusters. Args: gaps: List of topic gaps (sorted by priority). clusters: List of topic clusters. weeks_ahead: Number of weeks to plan. entries_per_week: Content pieces per week. Returns: List of CalendarEntry objects. """ calendar: list[CalendarEntry] = [] today = datetime.now() # Build cluster lookup topic_to_cluster: dict[str, str] = {} for cluster in clusters: for topic in cluster.cluster_topics: topic_to_cluster[topic] = cluster.pillar_topic topic_to_cluster[cluster.pillar_topic] = cluster.pillar_topic # Prioritize: pillar topics first, then by priority score pillar_topics = {c.pillar_topic for c in clusters} pillar_gaps = [g for g in gaps if g.topic in pillar_topics] other_gaps = [g for g in gaps if g.topic not in pillar_topics] ordered_gaps = pillar_gaps + other_gaps max_entries = weeks_ahead * entries_per_week week_offset = 0 slot_in_week = 0 for gap in ordered_gaps[:max_entries]: target_date = today + timedelta(weeks=week_offset, days=slot_in_week * 3) # Determine priority label if gap.priority_score >= 70: priority = "high" elif gap.priority_score >= 40: priority = "medium" else: priority = "low" # Word count based on content type word_count_map = { "guide": 2500, "listicle": 2000, "informational": 1800, "landing": 1200, "blog": 1500, } entry = CalendarEntry( topic=gap.topic, priority=priority, target_date=target_date.strftime("%Y-%m-%d"), content_type=gap.content_type_suggestion, target_word_count=word_count_map.get(gap.content_type_suggestion, 1500), primary_keyword=gap.topic, estimated_traffic=gap.estimated_traffic, cluster_name=topic_to_cluster.get(gap.topic, "uncategorized"), ) calendar.append(entry) slot_in_week += 1 if slot_in_week >= entries_per_week: slot_in_week = 0 week_offset += 1 return calendar # ------------------------------------------------------------------ # Korean opportunity detection # ------------------------------------------------------------------ @staticmethod def detect_korean_opportunities(gaps: list[TopicGap]) -> list[dict[str, Any]]: """Detect Korean-market content opportunities in gaps.""" opportunities: list[dict[str, Any]] = [] for gap in gaps: for pattern_info in KOREAN_OPPORTUNITY_PATTERNS: if re.search(pattern_info["pattern"], gap.topic, re.IGNORECASE): opportunities.append({ "topic": gap.topic, "pattern": pattern_info["label"], "description": pattern_info["description"], "estimated_traffic": gap.estimated_traffic, "priority_score": gap.priority_score, }) break opportunities.sort(key=lambda o: o["priority_score"], reverse=True) return opportunities # ------------------------------------------------------------------ # Orchestration # ------------------------------------------------------------------ async def analyze( self, target_url: str, competitor_urls: list[str], build_clusters: bool = False, ) -> ContentGapResult: """ Run full content gap analysis. Args: target_url: Target website URL. competitor_urls: List of competitor URLs. build_clusters: Whether to build topic clusters. Returns: ContentGapResult with gaps, clusters, and calendar. """ result = ContentGapResult( target_url=target_url, competitor_urls=competitor_urls, timestamp=datetime.now().isoformat(), ) self.logger.info( f"Starting gap analysis: {target_url} vs {len(competitor_urls)} competitors" ) # 1. Find topic gaps gaps, target_keywords, content_volume = await self.find_topic_gaps( target_url, competitor_urls ) result.gaps = gaps result.target_topics_count = len(target_keywords) result.competitor_topics_count = sum(content_volume.get(c, 0) for c in competitor_urls) result.content_volume_comparison = content_volume # 2. Build topic clusters if requested if build_clusters and gaps: all_topics = [g.topic for g in gaps] result.clusters = self.build_topic_clusters(all_topics) # 3. Generate editorial calendar result.calendar = self.generate_calendar(gaps, result.clusters) # 4. Detect Korean opportunities result.korean_opportunities = self.detect_korean_opportunities(gaps) # 5. Recommendations result.recommendations = self._generate_recommendations(result) self.logger.info( f"Gap analysis complete: {len(gaps)} gaps, " f"{len(result.clusters)} clusters, " f"{len(result.calendar)} calendar entries" ) return result @staticmethod def _generate_recommendations(result: ContentGapResult) -> list[str]: """Generate strategic recommendations from gap analysis.""" recs: list[str] = [] gap_count = len(result.gaps) if gap_count > 50: recs.append( f"경쟁사 대비 {gap_count}개의 콘텐츠 격차가 발견되었습니다. " "우선순위 상위 20개 주제부터 콘텐츠 생성을 시작하세요." ) elif gap_count > 20: recs.append( f"{gap_count}개의 콘텐츠 격차가 있습니다. " "높은 트래픽 기회부터 순차적으로 콘텐츠를 생성하세요." ) elif gap_count > 0: recs.append( f"{gap_count}개의 콘텐츠 격차가 발견되었습니다. " "비교적 적은 격차이므로 빠른 시일 내 모두 커버할 수 있습니다." ) if result.clusters: recs.append( f"{len(result.clusters)}개의 토픽 클러스터를 구성했습니다. " "필러 콘텐츠부터 작성하여 내부 링크 구조를 강화하세요." ) if result.korean_opportunities: recs.append( f"한국어 시장 기회가 {len(result.korean_opportunities)}개 발견되었습니다. " "후기, 비용, 비교 콘텐츠는 한국 검색 시장에서 높은 전환율을 보입니다." ) high_priority = [g for g in result.gaps if g.priority_score >= 70] if high_priority: top_topics = ", ".join(g.topic for g in high_priority[:3]) recs.append( f"최우선 주제: {top_topics}. " "이 주제들은 높은 트래픽 잠재력과 경쟁사 커버리지를 가지고 있습니다." ) if not recs: recs.append("경쟁사 대비 콘텐츠 커버리지가 양호합니다. 기존 콘텐츠 최적화에 집중하세요.") return recs # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- def build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser( description="SEO Content Gap Analyzer - topic gaps, clusters, calendar", ) parser.add_argument("--target", required=True, help="Target website URL") parser.add_argument( "--competitor", action="append", dest="competitors", required=True, help="Competitor URL (can be repeated)", ) parser.add_argument("--clusters", action="store_true", help="Build topic clusters") parser.add_argument("--json", action="store_true", help="Output as JSON") parser.add_argument("--output", help="Save output to file") return parser def format_text_report(result: ContentGapResult) -> str: """Format gap analysis result as human-readable text.""" lines: list[str] = [] lines.append(f"## Content Gap Analysis: {result.target_url}") lines.append(f"**Date**: {result.timestamp[:10]}") lines.append(f"**Competitors**: {', '.join(result.competitor_urls)}") lines.append("") lines.append("### Content Volume Comparison") for site, count in result.content_volume_comparison.items(): lines.append(f" - {site}: {count} topics") lines.append("") lines.append(f"### Topic Gaps ({len(result.gaps)} found)") for i, gap in enumerate(result.gaps[:20], 1): lines.append( f" {i}. [{gap.priority_score:.0f}] {gap.topic} " f"(traffic: {gap.estimated_traffic}, difficulty: {gap.difficulty})" ) lines.append("") if result.clusters: lines.append(f"### Topic Clusters ({len(result.clusters)})") for i, cluster in enumerate(result.clusters, 1): lines.append(f" {i}. **{cluster.pillar_topic}** ({len(cluster.cluster_topics)} subtopics)") for sub in cluster.cluster_topics[:5]: lines.append(f" - {sub}") lines.append("") if result.calendar: lines.append(f"### Editorial Calendar ({len(result.calendar)} entries)") for entry in result.calendar[:15]: lines.append( f" - [{entry.target_date}] {entry.topic} " f"({entry.content_type}, {entry.target_word_count}w, priority: {entry.priority})" ) lines.append("") if result.korean_opportunities: lines.append(f"### Korean Market Opportunities ({len(result.korean_opportunities)})") for opp in result.korean_opportunities[:10]: lines.append(f" - {opp['topic']} ({opp['description']})") lines.append("") lines.append("### Recommendations") for i, rec in enumerate(result.recommendations, 1): lines.append(f" {i}. {rec}") return "\n".join(lines) async def main() -> None: parser = build_parser() args = parser.parse_args() analyzer = ContentGapAnalyzer() result = await analyzer.analyze( target_url=args.target, competitor_urls=args.competitors, build_clusters=args.clusters, ) if args.json: output = json.dumps(asdict(result), ensure_ascii=False, indent=2, default=str) else: output = format_text_report(result) if args.output: with open(args.output, "w", encoding="utf-8") as f: f.write(output) logger.info(f"Output saved to {args.output}") else: print(output) analyzer.print_stats() if __name__ == "__main__": asyncio.run(main())