""" Content Auditor - SEO Content Inventory & Performance Analysis ============================================================== Purpose: Build content inventory, score performance, detect decay, classify content types, and analyze Korean content patterns. Python: 3.10+ """ import argparse import asyncio import json import logging import re import sys from dataclasses import asdict, dataclass, field from datetime import datetime, timedelta from typing import Any from urllib.parse import urlparse import aiohttp import requests from bs4 import BeautifulSoup from base_client import BaseAsyncClient, config logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Data classes # --------------------------------------------------------------------------- @dataclass class ContentPage: """Single content page with performance metrics.""" url: str title: str = "" content_type: str = "other" word_count: int = 0 traffic: int = 0 keywords_count: int = 0 backlinks: int = 0 performance_score: float = 0.0 last_modified: str = "" is_decaying: bool = False decay_rate: float = 0.0 korean_pattern: str = "" topics: list[str] = field(default_factory=list) @dataclass class ContentInventory: """Aggregated content inventory summary.""" total_pages: int = 0 by_type: dict[str, int] = field(default_factory=dict) avg_performance_score: float = 0.0 avg_word_count: float = 0.0 pages: list[ContentPage] = field(default_factory=list) freshness_distribution: dict[str, int] = field(default_factory=dict) @dataclass class ContentAuditResult: """Full content audit result.""" url: str timestamp: str = "" content_inventory: ContentInventory = field(default_factory=ContentInventory) top_performers: list[ContentPage] = field(default_factory=list) decaying_content: list[ContentPage] = field(default_factory=list) korean_content_analysis: dict[str, Any] = field(default_factory=dict) recommendations: list[str] = field(default_factory=list) errors: list[str] = field(default_factory=list) # --------------------------------------------------------------------------- # URL pattern rules for content type classification # --------------------------------------------------------------------------- CONTENT_TYPE_PATTERNS = { "blog": [ r"/blog/", r"/post/", r"/posts/", r"/article/", r"/articles/", r"/news/", r"/magazine/", r"/stories/", r"/insights/", r"/블로그/", r"/소식/", r"/뉴스/", ], "product": [ r"/product/", r"/products/", r"/shop/", r"/store/", r"/item/", r"/goods/", r"/catalog/", r"/제품/", r"/상품/", r"/쇼핑/", ], "service": [ r"/service/", r"/services/", r"/solutions/", r"/offering/", r"/진료/", r"/서비스/", r"/시술/", r"/치료/", ], "landing": [ r"/lp/", r"/landing/", r"/campaign/", r"/promo/", r"/event/", r"/이벤트/", r"/프로모션/", ], "resource": [ r"/resource/", r"/resources/", r"/guide/", r"/guides/", r"/whitepaper/", r"/ebook/", r"/download/", r"/faq/", r"/help/", r"/support/", r"/가이드/", r"/자료/", ], } KOREAN_CONTENT_PATTERNS = { "naver_blog_style": [ r"후기", r"리뷰", r"체험", r"솔직후기", r"방문후기", r"사용후기", r"이용후기", ], "listicle": [ r"추천", r"베스트", r"TOP\s*\d+", r"\d+선", r"\d+가지", r"모음", r"정리", r"비교", ], "how_to": [ r"방법", r"하는\s*법", r"하는\s*방법", r"가이드", r"따라하기", r"시작하기", r"알아보기", ], "informational": [ r"이란", r"뜻", r"의미", r"차이", r"비교", r"장단점", r"효과", r"부작용", r"비용", r"가격", ], } # --------------------------------------------------------------------------- # ContentAuditor # --------------------------------------------------------------------------- class ContentAuditor(BaseAsyncClient): """Content auditor using Ahrefs API and sitemap crawling.""" def __init__(self, max_concurrent: int = 5, requests_per_second: float = 2.0): super().__init__(max_concurrent=max_concurrent, requests_per_second=requests_per_second) self.session: aiohttp.ClientSession | None = None async def _ensure_session(self) -> aiohttp.ClientSession: if self.session is None or self.session.closed: timeout = aiohttp.ClientTimeout(total=30) self.session = aiohttp.ClientSession(timeout=timeout) return self.session async def close(self) -> None: if self.session and not self.session.closed: await self.session.close() # ------------------------------------------------------------------ # Ahrefs data retrieval # ------------------------------------------------------------------ async def get_top_pages(self, url: str, limit: int = 100) -> list[dict]: """ Retrieve top pages via Ahrefs site-explorer-top-pages. Returns list of dicts with keys: url, traffic, keywords, value, top_keyword. """ self.logger.info(f"Fetching top pages from Ahrefs for {url}") target = urlparse(url).netloc or url try: # Ahrefs MCP call: site-explorer-top-pages # In MCP context this would be called by the agent. # Standalone fallback: use REST API if AHREFS_API_KEY is set. api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None if not api_key: self.logger.warning("AHREFS_API_KEY not set; returning empty top pages") return [] resp = requests.get( "https://api.ahrefs.com/v3/site-explorer/top-pages", params={"target": target, "limit": limit, "select": "url,traffic,keywords,value,top_keyword"}, headers={"Authorization": f"Bearer {api_key}"}, timeout=30, ) resp.raise_for_status() data = resp.json() pages = data.get("pages", data.get("items", [])) self.logger.info(f"Retrieved {len(pages)} top pages") return pages except Exception as exc: self.logger.warning(f"Ahrefs top-pages lookup failed: {exc}") return [] async def get_pages_by_traffic(self, url: str, limit: int = 100) -> list[dict]: """ Retrieve pages sorted by organic traffic via Ahrefs site-explorer-pages-by-traffic. Returns list of dicts with keys: url, traffic, keywords, top_keyword. """ self.logger.info(f"Fetching pages-by-traffic from Ahrefs for {url}") target = urlparse(url).netloc or url try: api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None if not api_key: self.logger.warning("AHREFS_API_KEY not set; returning empty traffic pages") return [] resp = requests.get( "https://api.ahrefs.com/v3/site-explorer/pages-by-traffic", params={"target": target, "limit": limit, "select": "url,traffic,keywords,top_keyword"}, headers={"Authorization": f"Bearer {api_key}"}, timeout=30, ) resp.raise_for_status() data = resp.json() pages = data.get("pages", data.get("items", [])) self.logger.info(f"Retrieved {len(pages)} pages by traffic") return pages except Exception as exc: self.logger.warning(f"Ahrefs pages-by-traffic lookup failed: {exc}") return [] # ------------------------------------------------------------------ # Sitemap crawling # ------------------------------------------------------------------ async def crawl_sitemap(self, url: str) -> list[str]: """Discover URLs from sitemap.xml.""" sitemap_urls_to_try = [ f"{url.rstrip('/')}/sitemap.xml", f"{url.rstrip('/')}/sitemap_index.xml", f"{url.rstrip('/')}/post-sitemap.xml", ] discovered: list[str] = [] session = await self._ensure_session() for sitemap_url in sitemap_urls_to_try: try: async with session.get(sitemap_url) as resp: if resp.status != 200: continue text = await resp.text() soup = BeautifulSoup(text, "lxml-xml") # Sitemap index sitemaps = soup.find_all("sitemap") if sitemaps: for sm in sitemaps: loc = sm.find("loc") if loc: child_urls = await self._parse_sitemap(session, loc.text.strip()) discovered.extend(child_urls) else: urls = soup.find_all("url") for u in urls: loc = u.find("loc") if loc: discovered.append(loc.text.strip()) if discovered: self.logger.info(f"Discovered {len(discovered)} URLs from {sitemap_url}") break except Exception as exc: self.logger.debug(f"Failed to fetch {sitemap_url}: {exc}") return list(set(discovered)) async def _parse_sitemap(self, session: aiohttp.ClientSession, sitemap_url: str) -> list[str]: """Parse a single sitemap XML and return URLs.""" urls: list[str] = [] try: async with session.get(sitemap_url) as resp: if resp.status != 200: return urls text = await resp.text() soup = BeautifulSoup(text, "lxml-xml") for u in soup.find_all("url"): loc = u.find("loc") if loc: urls.append(loc.text.strip()) except Exception as exc: self.logger.debug(f"Failed to parse sitemap {sitemap_url}: {exc}") return urls # ------------------------------------------------------------------ # Content type classification # ------------------------------------------------------------------ @staticmethod def classify_content_type(url: str, title: str = "") -> str: """ Classify content type based on URL path patterns and title. Returns one of: blog, product, service, landing, resource, other. """ combined = f"{url.lower()} {title.lower()}" scores: dict[str, int] = {} for ctype, patterns in CONTENT_TYPE_PATTERNS.items(): score = 0 for pattern in patterns: if re.search(pattern, combined, re.IGNORECASE): score += 1 if score > 0: scores[ctype] = score if not scores: return "other" return max(scores, key=scores.get) # ------------------------------------------------------------------ # Performance scoring # ------------------------------------------------------------------ @staticmethod def score_performance(page: ContentPage) -> float: """ Compute composite performance score (0-100) from traffic, keywords, backlinks. Weights: - Traffic: 50% (log-scaled, 10k+ traffic = max) - Keywords count: 30% (log-scaled, 500+ = max) - Backlinks: 20% (log-scaled, 100+ = max) """ import math traffic_score = min(100, (math.log10(max(page.traffic, 1)) / math.log10(10000)) * 100) keywords_score = min(100, (math.log10(max(page.keywords_count, 1)) / math.log10(500)) * 100) backlinks_score = min(100, (math.log10(max(page.backlinks, 1)) / math.log10(100)) * 100) composite = (traffic_score * 0.50) + (keywords_score * 0.30) + (backlinks_score * 0.20) return round(min(100, max(0, composite)), 1) # ------------------------------------------------------------------ # Content decay detection # ------------------------------------------------------------------ @staticmethod def detect_decay(pages: list[ContentPage], threshold: float = -20.0) -> list[ContentPage]: """ Flag pages with declining traffic trend. Uses a simple heuristic: pages with low performance score relative to their keyword count indicate potential decay. In production, historical traffic data from Ahrefs metrics-history would be used. Args: pages: List of content pages with metrics. threshold: Decay rate threshold (percentage decline). Returns: List of pages flagged as decaying. """ decaying: list[ContentPage] = [] for page in pages: # Heuristic: high keyword count but low traffic suggests decay if page.keywords_count > 10 and page.traffic < 50: page.is_decaying = True page.decay_rate = -50.0 if page.traffic == 0 else round( -((page.keywords_count * 10 - page.traffic) / max(page.keywords_count * 10, 1)) * 100, 1 ) if page.decay_rate <= threshold: decaying.append(page) elif page.performance_score < 20 and page.keywords_count > 5: page.is_decaying = True page.decay_rate = round(-max(30, 100 - page.performance_score * 2), 1) if page.decay_rate <= threshold: decaying.append(page) decaying.sort(key=lambda p: p.decay_rate) return decaying # ------------------------------------------------------------------ # Freshness assessment # ------------------------------------------------------------------ @staticmethod def analyze_freshness(pages: list[ContentPage]) -> dict[str, int]: """ Categorize pages by freshness based on last_modified dates. Returns distribution: fresh (< 3 months), aging (3-12 months), stale (> 12 months), unknown (no date). """ now = datetime.now() distribution = {"fresh": 0, "aging": 0, "stale": 0, "unknown": 0} for page in pages: if not page.last_modified: distribution["unknown"] += 1 continue try: # Try common date formats for fmt in ("%Y-%m-%dT%H:%M:%S", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%S%z"): try: modified = datetime.strptime( page.last_modified.replace("+00:00", "").replace("Z", ""), fmt.replace("%z", "") ) break except ValueError: continue else: distribution["unknown"] += 1 continue age = now - modified if age < timedelta(days=90): distribution["fresh"] += 1 elif age < timedelta(days=365): distribution["aging"] += 1 else: distribution["stale"] += 1 except Exception: distribution["unknown"] += 1 return distribution # ------------------------------------------------------------------ # Korean content pattern identification # ------------------------------------------------------------------ @staticmethod def identify_korean_patterns(pages: list[ContentPage]) -> dict[str, Any]: """ Detect Korean content patterns across pages. Identifies Naver Blog style review content, listicles, how-to guides, and informational content patterns. Returns summary with counts and example URLs per pattern. """ results: dict[str, Any] = { "total_korean_content": 0, "patterns": {}, } for pattern_name, keywords in KOREAN_CONTENT_PATTERNS.items(): matches: list[dict[str, str]] = [] for page in pages: combined = f"{page.url} {page.title}" for keyword in keywords: if re.search(keyword, combined, re.IGNORECASE): matches.append({"url": page.url, "title": page.title, "matched_keyword": keyword}) break results["patterns"][pattern_name] = { "count": len(matches), "examples": matches[:5], } korean_urls = set() for pattern_data in results["patterns"].values(): for example in pattern_data["examples"]: korean_urls.add(example["url"]) results["total_korean_content"] = len(korean_urls) return results # ------------------------------------------------------------------ # Orchestration # ------------------------------------------------------------------ async def audit( self, url: str, detect_decay_flag: bool = False, content_type_filter: str | None = None, limit: int = 200, ) -> ContentAuditResult: """ Run full content audit: inventory, scoring, decay, Korean patterns. Args: url: Target website URL. detect_decay_flag: Whether to run decay detection. content_type_filter: Filter by content type (blog, product, etc.). limit: Maximum pages to analyze. Returns: ContentAuditResult with inventory, top performers, decay, analysis. """ result = ContentAuditResult( url=url, timestamp=datetime.now().isoformat(), ) self.logger.info(f"Starting content audit for {url}") # 1. Gather pages from Ahrefs and sitemap top_pages_data, traffic_pages_data, sitemap_urls = await asyncio.gather( self.get_top_pages(url, limit=limit), self.get_pages_by_traffic(url, limit=limit), self.crawl_sitemap(url), ) # 2. Merge and deduplicate pages page_map: dict[str, ContentPage] = {} for item in top_pages_data: page_url = item.get("url", "") if not page_url: continue page_map[page_url] = ContentPage( url=page_url, title=item.get("top_keyword", ""), traffic=int(item.get("traffic", 0)), keywords_count=int(item.get("keywords", 0)), backlinks=int(item.get("value", 0)), ) for item in traffic_pages_data: page_url = item.get("url", "") if not page_url: continue if page_url in page_map: existing = page_map[page_url] existing.traffic = max(existing.traffic, int(item.get("traffic", 0))) existing.keywords_count = max(existing.keywords_count, int(item.get("keywords", 0))) else: page_map[page_url] = ContentPage( url=page_url, title=item.get("top_keyword", ""), traffic=int(item.get("traffic", 0)), keywords_count=int(item.get("keywords", 0)), ) # Add sitemap URLs not already present for s_url in sitemap_urls: if s_url not in page_map: page_map[s_url] = ContentPage(url=s_url) # 3. Classify and score all_pages: list[ContentPage] = [] for page in page_map.values(): page.content_type = self.classify_content_type(page.url, page.title) page.performance_score = self.score_performance(page) all_pages.append(page) # 4. Filter by content type if requested if content_type_filter: all_pages = [p for p in all_pages if p.content_type == content_type_filter] # 5. Build inventory by_type: dict[str, int] = {} for page in all_pages: by_type[page.content_type] = by_type.get(page.content_type, 0) + 1 avg_score = ( sum(p.performance_score for p in all_pages) / len(all_pages) if all_pages else 0.0 ) avg_word_count = ( sum(p.word_count for p in all_pages) / len(all_pages) if all_pages else 0.0 ) freshness = self.analyze_freshness(all_pages) result.content_inventory = ContentInventory( total_pages=len(all_pages), by_type=by_type, avg_performance_score=round(avg_score, 1), avg_word_count=round(avg_word_count, 1), pages=sorted(all_pages, key=lambda p: p.performance_score, reverse=True)[:limit], freshness_distribution=freshness, ) # 6. Top performers result.top_performers = sorted(all_pages, key=lambda p: p.performance_score, reverse=True)[:20] # 7. Decay detection if detect_decay_flag: result.decaying_content = self.detect_decay(all_pages) # 8. Korean content analysis result.korean_content_analysis = self.identify_korean_patterns(all_pages) # 9. Recommendations result.recommendations = self._generate_recommendations(result) self.logger.info( f"Audit complete: {len(all_pages)} pages, " f"{len(result.top_performers)} top performers, " f"{len(result.decaying_content)} decaying" ) return result @staticmethod def _generate_recommendations(result: ContentAuditResult) -> list[str]: """Generate actionable recommendations from audit data.""" recs: list[str] = [] inv = result.content_inventory # Low average score if inv.avg_performance_score < 30: recs.append( "전체 콘텐츠 평균 성과 점수가 낮습니다 ({:.0f}/100). " "상위 콘텐츠 패턴을 분석하여 저성과 페이지를 개선하세요.".format(inv.avg_performance_score) ) # Stale content stale = inv.freshness_distribution.get("stale", 0) total = inv.total_pages or 1 if stale / total > 0.3: recs.append( f"오래된 콘텐츠가 {stale}개 ({stale * 100 // total}%)입니다. " "콘텐츠 업데이트 또는 통합을 고려하세요." ) # Decaying content if len(result.decaying_content) > 5: recs.append( f"트래픽이 감소하는 콘텐츠가 {len(result.decaying_content)}개 감지되었습니다. " "상위 감소 페이지부터 콘텐츠 리프레시를 진행하세요." ) # Content type balance blog_count = inv.by_type.get("blog", 0) if blog_count == 0: recs.append( "블로그 콘텐츠가 없습니다. SEO 트래픽 확보를 위해 " "블로그 콘텐츠 전략을 수립하세요." ) # Korean content opportunities korean = result.korean_content_analysis review_count = korean.get("patterns", {}).get("naver_blog_style", {}).get("count", 0) if review_count == 0: recs.append( "후기/리뷰 콘텐츠가 없습니다. 한국 시장에서 후기 콘텐츠는 " "전환율에 큰 영향을 미치므로 후기 콘텐츠 생성을 권장합니다." ) if not recs: recs.append("현재 콘텐츠 전략이 양호합니다. 지속적인 모니터링을 권장합니다.") return recs # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- def build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser( description="SEO Content Auditor - inventory, scoring, and decay detection", ) parser.add_argument("--url", required=True, help="Target website URL") parser.add_argument("--decay", action="store_true", help="Enable content decay detection") parser.add_argument("--type", dest="content_type", help="Filter by content type (blog, product, service, landing, resource)") parser.add_argument("--limit", type=int, default=200, help="Maximum pages to analyze (default: 200)") parser.add_argument("--json", action="store_true", help="Output as JSON") parser.add_argument("--output", help="Save output to file") return parser def format_text_report(result: ContentAuditResult) -> str: """Format audit result as human-readable text.""" lines: list[str] = [] lines.append(f"## Content Audit: {result.url}") lines.append(f"**Date**: {result.timestamp[:10]}") lines.append("") inv = result.content_inventory lines.append(f"### Content Inventory") lines.append(f"- Total pages: {inv.total_pages}") lines.append(f"- Average performance score: {inv.avg_performance_score}/100") lines.append(f"- Content types: {json.dumps(inv.by_type, ensure_ascii=False)}") lines.append(f"- Freshness: {json.dumps(inv.freshness_distribution, ensure_ascii=False)}") lines.append("") lines.append("### Top Performers") for i, page in enumerate(result.top_performers[:10], 1): lines.append(f" {i}. [{page.performance_score:.0f}] {page.url} (traffic: {page.traffic})") lines.append("") if result.decaying_content: lines.append("### Decaying Content") for i, page in enumerate(result.decaying_content[:10], 1): lines.append(f" {i}. [{page.decay_rate:+.0f}%] {page.url} (traffic: {page.traffic})") lines.append("") if result.korean_content_analysis.get("patterns"): lines.append("### Korean Content Patterns") for pattern_name, data in result.korean_content_analysis["patterns"].items(): lines.append(f" - {pattern_name}: {data['count']} pages") lines.append("") lines.append("### Recommendations") for i, rec in enumerate(result.recommendations, 1): lines.append(f" {i}. {rec}") return "\n".join(lines) async def main() -> None: parser = build_parser() args = parser.parse_args() auditor = ContentAuditor() try: result = await auditor.audit( url=args.url, detect_decay_flag=args.decay, content_type_filter=args.content_type, limit=args.limit, ) if args.json: output = json.dumps(asdict(result), ensure_ascii=False, indent=2, default=str) else: output = format_text_report(result) if args.output: with open(args.output, "w", encoding="utf-8") as f: f.write(output) logger.info(f"Output saved to {args.output}") else: print(output) finally: await auditor.close() auditor.print_stats() if __name__ == "__main__": asyncio.run(main())