12 new skills: Keyword Strategy, SERP Analysis, Position Tracking, Link Building, Content Strategy, E-Commerce SEO, KPI Framework, International SEO, AI Visibility, Knowledge Graph, Competitor Intel, and Crawl Budget. ~20K lines of Python across 25 domain scripts. Updated skill 11 pipeline table and repo CLAUDE.md. Enhanced skill 18 local SEO workflow from jamie.clinic audit. Note: Skill 26 hreflang_validator.py pending (content filter block). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
717 lines
27 KiB
Python
717 lines
27 KiB
Python
"""
|
|
Content Auditor - SEO Content Inventory & Performance Analysis
|
|
==============================================================
|
|
Purpose: Build content inventory, score performance, detect decay,
|
|
classify content types, and analyze Korean content patterns.
|
|
Python: 3.10+
|
|
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import re
|
|
import sys
|
|
from dataclasses import asdict, dataclass, field
|
|
from datetime import datetime, timedelta
|
|
from typing import Any
|
|
from urllib.parse import urlparse
|
|
|
|
import aiohttp
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
from base_client import BaseAsyncClient, config
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Data classes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@dataclass
class ContentPage:
    """Single content page with performance metrics.

    Combines Ahrefs-derived metrics (traffic, keywords, backlinks) with
    fields computed by ContentAuditor (content_type, performance_score,
    decay flags).
    """

    url: str  # canonical page URL; unique key during audit() merging
    title: str = ""  # populated from the Ahrefs "top_keyword" field in audit() (title proxy)
    content_type: str = "other"  # one of: blog, product, service, landing, resource, other
    word_count: int = 0  # not populated by the current Ahrefs/sitemap pipeline
    traffic: int = 0  # organic traffic estimate from Ahrefs
    keywords_count: int = 0  # number of ranking keywords (Ahrefs "keywords")
    backlinks: int = 0  # filled from the Ahrefs "value" field in audit() -- NOTE(review): confirm mapping
    performance_score: float = 0.0  # composite 0-100 score; see score_performance()
    last_modified: str = ""  # ISO-8601-style date string; parsed by analyze_freshness()
    is_decaying: bool = False  # set by detect_decay()
    decay_rate: float = 0.0  # negative percentage estimated by detect_decay()
    korean_pattern: str = ""  # reserved; not written by this module
    topics: list[str] = field(default_factory=list)  # reserved; not written by this module
|
|
|
|
|
|
@dataclass
class ContentInventory:
    """Aggregated content inventory summary built by ContentAuditor.audit()."""

    total_pages: int = 0  # number of pages after optional content-type filtering
    by_type: dict[str, int] = field(default_factory=dict)  # content_type -> page count
    avg_performance_score: float = 0.0  # mean performance_score across pages (rounded)
    avg_word_count: float = 0.0  # mean word_count (currently 0: word counts are never populated)
    pages: list[ContentPage] = field(default_factory=list)  # pages sorted by score desc, truncated to limit
    freshness_distribution: dict[str, int] = field(default_factory=dict)  # fresh/aging/stale/unknown counts
|
|
|
|
|
|
@dataclass
class ContentAuditResult:
    """Full content audit result returned by ContentAuditor.audit()."""

    url: str  # audited site URL as provided by the caller
    timestamp: str = ""  # ISO timestamp of when the audit started
    content_inventory: ContentInventory = field(default_factory=ContentInventory)
    top_performers: list[ContentPage] = field(default_factory=list)  # top 20 by performance_score
    decaying_content: list[ContentPage] = field(default_factory=list)  # populated only when decay detection runs
    korean_content_analysis: dict[str, Any] = field(default_factory=dict)  # see identify_korean_patterns()
    recommendations: list[str] = field(default_factory=list)  # Korean-language action items
    errors: list[str] = field(default_factory=list)  # reserved; not written by the current pipeline
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# URL pattern rules for content type classification
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Maps a content-type label to URL/title regex fragments (English + Korean
# path segments). classify_content_type() counts matches per type and picks
# the label with the most hits.
CONTENT_TYPE_PATTERNS = {
    # Editorial content: blogs, news, articles.
    "blog": [
        r"/blog/", r"/post/", r"/posts/", r"/article/", r"/articles/",
        r"/news/", r"/magazine/", r"/stories/", r"/insights/",
        r"/블로그/", r"/소식/", r"/뉴스/",
    ],
    # E-commerce product / catalog pages.
    "product": [
        r"/product/", r"/products/", r"/shop/", r"/store/",
        r"/item/", r"/goods/", r"/catalog/",
        r"/제품/", r"/상품/", r"/쇼핑/",
    ],
    # Service/solution pages (Korean terms cover clinic-style sites).
    "service": [
        r"/service/", r"/services/", r"/solutions/", r"/offering/",
        r"/진료/", r"/서비스/", r"/시술/", r"/치료/",
    ],
    # Campaign / promotional landing pages.
    "landing": [
        r"/lp/", r"/landing/", r"/campaign/", r"/promo/",
        r"/event/", r"/이벤트/", r"/프로모션/",
    ],
    # Guides, downloads, FAQ and other support content.
    "resource": [
        r"/resource/", r"/resources/", r"/guide/", r"/guides/",
        r"/whitepaper/", r"/ebook/", r"/download/", r"/faq/",
        r"/help/", r"/support/", r"/가이드/", r"/자료/",
    ],
}
|
|
|
|
# Korean-market content archetypes. Each pattern name maps to keyword
# regexes matched against "url + title" by identify_korean_patterns().
KOREAN_CONTENT_PATTERNS = {
    # Review/testimonial content in the Naver Blog style (후기/리뷰 = review).
    "naver_blog_style": [
        r"후기", r"리뷰", r"체험", r"솔직후기", r"방문후기",
        r"사용후기", r"이용후기",
    ],
    # Ranked "best of"/roundup lists (추천 = recommendation, 베스트 = best).
    "listicle": [
        r"추천", r"베스트", r"TOP\s*\d+", r"\d+선", r"\d+가지",
        r"모음", r"정리", r"비교",
    ],
    # Tutorial / how-to content (방법 = method, 가이드 = guide).
    "how_to": [
        r"방법", r"하는\s*법", r"하는\s*방법", r"가이드",
        r"따라하기", r"시작하기", r"알아보기",
    ],
    # Definitional/comparison queries (뜻 = meaning, 비용 = cost).
    "informational": [
        r"이란", r"뜻", r"의미", r"차이", r"비교",
        r"장단점", r"효과", r"부작용", r"비용", r"가격",
    ],
}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ContentAuditor
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class ContentAuditor(BaseAsyncClient):
    """Content auditor using Ahrefs API and sitemap crawling."""

    def __init__(self, max_concurrent: int = 5, requests_per_second: float = 2.0):
        """Initialize the auditor.

        Args:
            max_concurrent: Concurrency cap forwarded to BaseAsyncClient.
            requests_per_second: Rate limit forwarded to BaseAsyncClient.
        """
        # BaseAsyncClient is project-local; presumably provides self.logger,
        # rate limiting and print_stats() -- TODO confirm against base_client.
        super().__init__(max_concurrent=max_concurrent, requests_per_second=requests_per_second)
        # Shared aiohttp session, created lazily by _ensure_session().
        self.session: aiohttp.ClientSession | None = None
|
|
|
|
    async def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the shared aiohttp session, creating it on first use.

        A fresh session is also created when a previous one was closed.
        """
        if self.session is None or self.session.closed:
            # 30s total timeout covers connect + read for sitemap fetches.
            timeout = aiohttp.ClientTimeout(total=30)
            self.session = aiohttp.ClientSession(timeout=timeout)
        return self.session
|
|
|
|
    async def close(self) -> None:
        """Close the shared aiohttp session if it is open (idempotent)."""
        if self.session and not self.session.closed:
            await self.session.close()
|
|
|
|
# ------------------------------------------------------------------
|
|
# Ahrefs data retrieval
|
|
# ------------------------------------------------------------------
|
|
|
|
async def get_top_pages(self, url: str, limit: int = 100) -> list[dict]:
|
|
"""
|
|
Retrieve top pages via Ahrefs site-explorer-top-pages.
|
|
|
|
Returns list of dicts with keys: url, traffic, keywords, value, top_keyword.
|
|
"""
|
|
self.logger.info(f"Fetching top pages from Ahrefs for {url}")
|
|
target = urlparse(url).netloc or url
|
|
try:
|
|
# Ahrefs MCP call: site-explorer-top-pages
|
|
# In MCP context this would be called by the agent.
|
|
# Standalone fallback: use REST API if AHREFS_API_KEY is set.
|
|
api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None
|
|
if not api_key:
|
|
self.logger.warning("AHREFS_API_KEY not set; returning empty top pages")
|
|
return []
|
|
|
|
resp = requests.get(
|
|
"https://api.ahrefs.com/v3/site-explorer/top-pages",
|
|
params={"target": target, "limit": limit, "select": "url,traffic,keywords,value,top_keyword"},
|
|
headers={"Authorization": f"Bearer {api_key}"},
|
|
timeout=30,
|
|
)
|
|
resp.raise_for_status()
|
|
data = resp.json()
|
|
pages = data.get("pages", data.get("items", []))
|
|
self.logger.info(f"Retrieved {len(pages)} top pages")
|
|
return pages
|
|
except Exception as exc:
|
|
self.logger.warning(f"Ahrefs top-pages lookup failed: {exc}")
|
|
return []
|
|
|
|
async def get_pages_by_traffic(self, url: str, limit: int = 100) -> list[dict]:
|
|
"""
|
|
Retrieve pages sorted by organic traffic via Ahrefs site-explorer-pages-by-traffic.
|
|
|
|
Returns list of dicts with keys: url, traffic, keywords, top_keyword.
|
|
"""
|
|
self.logger.info(f"Fetching pages-by-traffic from Ahrefs for {url}")
|
|
target = urlparse(url).netloc or url
|
|
try:
|
|
api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None
|
|
if not api_key:
|
|
self.logger.warning("AHREFS_API_KEY not set; returning empty traffic pages")
|
|
return []
|
|
|
|
resp = requests.get(
|
|
"https://api.ahrefs.com/v3/site-explorer/pages-by-traffic",
|
|
params={"target": target, "limit": limit, "select": "url,traffic,keywords,top_keyword"},
|
|
headers={"Authorization": f"Bearer {api_key}"},
|
|
timeout=30,
|
|
)
|
|
resp.raise_for_status()
|
|
data = resp.json()
|
|
pages = data.get("pages", data.get("items", []))
|
|
self.logger.info(f"Retrieved {len(pages)} pages by traffic")
|
|
return pages
|
|
except Exception as exc:
|
|
self.logger.warning(f"Ahrefs pages-by-traffic lookup failed: {exc}")
|
|
return []
|
|
|
|
# ------------------------------------------------------------------
|
|
# Sitemap crawling
|
|
# ------------------------------------------------------------------
|
|
|
|
    async def crawl_sitemap(self, url: str) -> list[str]:
        """Discover URLs from sitemap.xml.

        Tries sitemap.xml, sitemap_index.xml and post-sitemap.xml in turn;
        the first candidate yielding any URLs wins. Sitemap-index files are
        followed one level deep via _parse_sitemap.

        Returns:
            De-duplicated list of discovered URLs (ordering not preserved).
        """
        sitemap_urls_to_try = [
            f"{url.rstrip('/')}/sitemap.xml",
            f"{url.rstrip('/')}/sitemap_index.xml",
            f"{url.rstrip('/')}/post-sitemap.xml",
        ]
        discovered: list[str] = []
        session = await self._ensure_session()

        for sitemap_url in sitemap_urls_to_try:
            try:
                async with session.get(sitemap_url) as resp:
                    if resp.status != 200:
                        continue
                    text = await resp.text()
                    soup = BeautifulSoup(text, "lxml-xml")

                    # Sitemap index: fetch and parse each child sitemap.
                    sitemaps = soup.find_all("sitemap")
                    if sitemaps:
                        for sm in sitemaps:
                            loc = sm.find("loc")
                            if loc:
                                child_urls = await self._parse_sitemap(session, loc.text.strip())
                                discovered.extend(child_urls)
                    else:
                        # Plain sitemap: collect <url><loc> entries directly.
                        urls = soup.find_all("url")
                        for u in urls:
                            loc = u.find("loc")
                            if loc:
                                discovered.append(loc.text.strip())

                # Stop at the first candidate that produced any URLs.
                if discovered:
                    self.logger.info(f"Discovered {len(discovered)} URLs from {sitemap_url}")
                    break
            except Exception as exc:
                # Missing sitemaps are expected; log at debug and try the next.
                self.logger.debug(f"Failed to fetch {sitemap_url}: {exc}")

        # set() removes duplicates across child sitemaps; order is lost.
        return list(set(discovered))
|
|
|
|
async def _parse_sitemap(self, session: aiohttp.ClientSession, sitemap_url: str) -> list[str]:
|
|
"""Parse a single sitemap XML and return URLs."""
|
|
urls: list[str] = []
|
|
try:
|
|
async with session.get(sitemap_url) as resp:
|
|
if resp.status != 200:
|
|
return urls
|
|
text = await resp.text()
|
|
soup = BeautifulSoup(text, "lxml-xml")
|
|
for u in soup.find_all("url"):
|
|
loc = u.find("loc")
|
|
if loc:
|
|
urls.append(loc.text.strip())
|
|
except Exception as exc:
|
|
self.logger.debug(f"Failed to parse sitemap {sitemap_url}: {exc}")
|
|
return urls
|
|
|
|
# ------------------------------------------------------------------
|
|
# Content type classification
|
|
# ------------------------------------------------------------------
|
|
|
|
@staticmethod
|
|
def classify_content_type(url: str, title: str = "") -> str:
|
|
"""
|
|
Classify content type based on URL path patterns and title.
|
|
|
|
Returns one of: blog, product, service, landing, resource, other.
|
|
"""
|
|
combined = f"{url.lower()} {title.lower()}"
|
|
scores: dict[str, int] = {}
|
|
|
|
for ctype, patterns in CONTENT_TYPE_PATTERNS.items():
|
|
score = 0
|
|
for pattern in patterns:
|
|
if re.search(pattern, combined, re.IGNORECASE):
|
|
score += 1
|
|
if score > 0:
|
|
scores[ctype] = score
|
|
|
|
if not scores:
|
|
return "other"
|
|
return max(scores, key=scores.get)
|
|
|
|
# ------------------------------------------------------------------
|
|
# Performance scoring
|
|
# ------------------------------------------------------------------
|
|
|
|
@staticmethod
|
|
def score_performance(page: ContentPage) -> float:
|
|
"""
|
|
Compute composite performance score (0-100) from traffic, keywords, backlinks.
|
|
|
|
Weights:
|
|
- Traffic: 50% (log-scaled, 10k+ traffic = max)
|
|
- Keywords count: 30% (log-scaled, 500+ = max)
|
|
- Backlinks: 20% (log-scaled, 100+ = max)
|
|
"""
|
|
import math
|
|
|
|
traffic_score = min(100, (math.log10(max(page.traffic, 1)) / math.log10(10000)) * 100)
|
|
keywords_score = min(100, (math.log10(max(page.keywords_count, 1)) / math.log10(500)) * 100)
|
|
backlinks_score = min(100, (math.log10(max(page.backlinks, 1)) / math.log10(100)) * 100)
|
|
|
|
composite = (traffic_score * 0.50) + (keywords_score * 0.30) + (backlinks_score * 0.20)
|
|
return round(min(100, max(0, composite)), 1)
|
|
|
|
# ------------------------------------------------------------------
|
|
# Content decay detection
|
|
# ------------------------------------------------------------------
|
|
|
|
    @staticmethod
    def detect_decay(pages: list[ContentPage], threshold: float = -20.0) -> list[ContentPage]:
        """
        Flag pages with declining traffic trend.

        Uses a simple heuristic: pages with low performance score relative to
        their keyword count indicate potential decay. In production, historical
        traffic data from Ahrefs metrics-history would be used.

        Note: mutates the passed-in pages (sets is_decaying and decay_rate)
        in addition to returning the flagged subset.

        Args:
            pages: List of content pages with metrics.
            threshold: Decay rate threshold (percentage decline). Only pages
                whose estimated decay_rate is at or below this value are
                included in the returned list.

        Returns:
            List of pages flagged as decaying, most negative decay_rate first.
        """
        decaying: list[ContentPage] = []
        for page in pages:
            # Heuristic: high keyword count but low traffic suggests decay
            if page.keywords_count > 10 and page.traffic < 50:
                page.is_decaying = True
                # Decay estimate: shortfall of actual traffic versus an
                # expected ~10 visits per ranking keyword, as a percentage;
                # zero-traffic pages get a fixed -50% estimate.
                page.decay_rate = -50.0 if page.traffic == 0 else round(
                    -((page.keywords_count * 10 - page.traffic) / max(page.keywords_count * 10, 1)) * 100, 1
                )
                if page.decay_rate <= threshold:
                    decaying.append(page)
            elif page.performance_score < 20 and page.keywords_count > 5:
                page.is_decaying = True
                # Low composite score despite several ranking keywords:
                # assign a score-derived estimate floored at -30%.
                page.decay_rate = round(-max(30, 100 - page.performance_score * 2), 1)
                if page.decay_rate <= threshold:
                    decaying.append(page)

        # Most severe decline first (decay_rate is negative).
        decaying.sort(key=lambda p: p.decay_rate)
        return decaying
|
|
|
|
# ------------------------------------------------------------------
|
|
# Freshness assessment
|
|
# ------------------------------------------------------------------
|
|
|
|
@staticmethod
|
|
def analyze_freshness(pages: list[ContentPage]) -> dict[str, int]:
|
|
"""
|
|
Categorize pages by freshness based on last_modified dates.
|
|
|
|
Returns distribution: fresh (< 3 months), aging (3-12 months),
|
|
stale (> 12 months), unknown (no date).
|
|
"""
|
|
now = datetime.now()
|
|
distribution = {"fresh": 0, "aging": 0, "stale": 0, "unknown": 0}
|
|
|
|
for page in pages:
|
|
if not page.last_modified:
|
|
distribution["unknown"] += 1
|
|
continue
|
|
try:
|
|
# Try common date formats
|
|
for fmt in ("%Y-%m-%dT%H:%M:%S", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%S%z"):
|
|
try:
|
|
modified = datetime.strptime(
|
|
page.last_modified.replace("+00:00", "").replace("Z", ""), fmt.replace("%z", "")
|
|
)
|
|
break
|
|
except ValueError:
|
|
continue
|
|
else:
|
|
distribution["unknown"] += 1
|
|
continue
|
|
|
|
age = now - modified
|
|
if age < timedelta(days=90):
|
|
distribution["fresh"] += 1
|
|
elif age < timedelta(days=365):
|
|
distribution["aging"] += 1
|
|
else:
|
|
distribution["stale"] += 1
|
|
except Exception:
|
|
distribution["unknown"] += 1
|
|
|
|
return distribution
|
|
|
|
# ------------------------------------------------------------------
|
|
# Korean content pattern identification
|
|
# ------------------------------------------------------------------
|
|
|
|
@staticmethod
|
|
def identify_korean_patterns(pages: list[ContentPage]) -> dict[str, Any]:
|
|
"""
|
|
Detect Korean content patterns across pages.
|
|
|
|
Identifies Naver Blog style review content, listicles,
|
|
how-to guides, and informational content patterns.
|
|
|
|
Returns summary with counts and example URLs per pattern.
|
|
"""
|
|
results: dict[str, Any] = {
|
|
"total_korean_content": 0,
|
|
"patterns": {},
|
|
}
|
|
|
|
for pattern_name, keywords in KOREAN_CONTENT_PATTERNS.items():
|
|
matches: list[dict[str, str]] = []
|
|
for page in pages:
|
|
combined = f"{page.url} {page.title}"
|
|
for keyword in keywords:
|
|
if re.search(keyword, combined, re.IGNORECASE):
|
|
matches.append({"url": page.url, "title": page.title, "matched_keyword": keyword})
|
|
break
|
|
|
|
results["patterns"][pattern_name] = {
|
|
"count": len(matches),
|
|
"examples": matches[:5],
|
|
}
|
|
|
|
korean_urls = set()
|
|
for pattern_data in results["patterns"].values():
|
|
for example in pattern_data["examples"]:
|
|
korean_urls.add(example["url"])
|
|
results["total_korean_content"] = len(korean_urls)
|
|
|
|
return results
|
|
|
|
# ------------------------------------------------------------------
|
|
# Orchestration
|
|
# ------------------------------------------------------------------
|
|
|
|
    async def audit(
        self,
        url: str,
        detect_decay_flag: bool = False,
        content_type_filter: str | None = None,
        limit: int = 200,
    ) -> ContentAuditResult:
        """
        Run full content audit: inventory, scoring, decay, Korean patterns.

        Args:
            url: Target website URL.
            detect_decay_flag: Whether to run decay detection.
            content_type_filter: Filter by content type (blog, product, etc.).
            limit: Maximum pages to analyze.

        Returns:
            ContentAuditResult with inventory, top performers, decay, analysis.
        """
        result = ContentAuditResult(
            url=url,
            timestamp=datetime.now().isoformat(),
        )

        self.logger.info(f"Starting content audit for {url}")

        # 1. Gather pages from Ahrefs and sitemap, all three concurrently.
        top_pages_data, traffic_pages_data, sitemap_urls = await asyncio.gather(
            self.get_top_pages(url, limit=limit),
            self.get_pages_by_traffic(url, limit=limit),
            self.crawl_sitemap(url),
        )

        # 2. Merge and deduplicate pages (keyed by URL; Ahrefs data wins
        #    over bare sitemap entries).
        page_map: dict[str, ContentPage] = {}

        for item in top_pages_data:
            page_url = item.get("url", "")
            if not page_url:
                continue
            page_map[page_url] = ContentPage(
                url=page_url,
                # top_keyword stands in for the title (pages are not crawled).
                title=item.get("top_keyword", ""),
                traffic=int(item.get("traffic", 0)),
                keywords_count=int(item.get("keywords", 0)),
                # NOTE(review): Ahrefs "value" is mapped onto backlinks here;
                # confirm this is the intended field.
                backlinks=int(item.get("value", 0)),
            )

        for item in traffic_pages_data:
            page_url = item.get("url", "")
            if not page_url:
                continue
            if page_url in page_map:
                # Same URL from both endpoints: keep the higher metric.
                existing = page_map[page_url]
                existing.traffic = max(existing.traffic, int(item.get("traffic", 0)))
                existing.keywords_count = max(existing.keywords_count, int(item.get("keywords", 0)))
            else:
                page_map[page_url] = ContentPage(
                    url=page_url,
                    title=item.get("top_keyword", ""),
                    traffic=int(item.get("traffic", 0)),
                    keywords_count=int(item.get("keywords", 0)),
                )

        # Add sitemap URLs not already present (metric-less placeholder pages).
        for s_url in sitemap_urls:
            if s_url not in page_map:
                page_map[s_url] = ContentPage(url=s_url)

        # 3. Classify and score every merged page.
        all_pages: list[ContentPage] = []
        for page in page_map.values():
            page.content_type = self.classify_content_type(page.url, page.title)
            page.performance_score = self.score_performance(page)
            all_pages.append(page)

        # 4. Filter by content type if requested
        if content_type_filter:
            all_pages = [p for p in all_pages if p.content_type == content_type_filter]

        # 5. Build inventory
        by_type: dict[str, int] = {}
        for page in all_pages:
            by_type[page.content_type] = by_type.get(page.content_type, 0) + 1

        avg_score = (
            sum(p.performance_score for p in all_pages) / len(all_pages)
            if all_pages else 0.0
        )
        # word_count is never populated by this pipeline, so this average
        # is currently always 0.0.
        avg_word_count = (
            sum(p.word_count for p in all_pages) / len(all_pages)
            if all_pages else 0.0
        )

        freshness = self.analyze_freshness(all_pages)

        result.content_inventory = ContentInventory(
            total_pages=len(all_pages),
            by_type=by_type,
            avg_performance_score=round(avg_score, 1),
            avg_word_count=round(avg_word_count, 1),
            pages=sorted(all_pages, key=lambda p: p.performance_score, reverse=True)[:limit],
            freshness_distribution=freshness,
        )

        # 6. Top performers (best 20 by composite score)
        result.top_performers = sorted(all_pages, key=lambda p: p.performance_score, reverse=True)[:20]

        # 7. Decay detection (optional; mutates page decay fields in place)
        if detect_decay_flag:
            result.decaying_content = self.detect_decay(all_pages)

        # 8. Korean content analysis
        result.korean_content_analysis = self.identify_korean_patterns(all_pages)

        # 9. Recommendations
        result.recommendations = self._generate_recommendations(result)

        self.logger.info(
            f"Audit complete: {len(all_pages)} pages, "
            f"{len(result.top_performers)} top performers, "
            f"{len(result.decaying_content)} decaying"
        )

        return result
|
|
|
|
    @staticmethod
    def _generate_recommendations(result: ContentAuditResult) -> list[str]:
        """Generate actionable recommendations from audit data.

        Recommendation text is in Korean (the tool targets the Korean
        market). Always returns at least one item: a "looks good" message
        when no issue triggers.
        """
        recs: list[str] = []
        inv = result.content_inventory

        # Low average score
        if inv.avg_performance_score < 30:
            recs.append(
                "전체 콘텐츠 평균 성과 점수가 낮습니다 ({:.0f}/100). "
                "상위 콘텐츠 패턴을 분석하여 저성과 페이지를 개선하세요.".format(inv.avg_performance_score)
            )

        # Stale content: flag when more than 30% of pages are > 12 months old.
        stale = inv.freshness_distribution.get("stale", 0)
        total = inv.total_pages or 1  # guard against division by zero
        if stale / total > 0.3:
            recs.append(
                f"오래된 콘텐츠가 {stale}개 ({stale * 100 // total}%)입니다. "
                "콘텐츠 업데이트 또는 통합을 고려하세요."
            )

        # Decaying content (only populated when decay detection ran)
        if len(result.decaying_content) > 5:
            recs.append(
                f"트래픽이 감소하는 콘텐츠가 {len(result.decaying_content)}개 감지되었습니다. "
                "상위 감소 페이지부터 콘텐츠 리프레시를 진행하세요."
            )

        # Content type balance: no blog content at all
        blog_count = inv.by_type.get("blog", 0)
        if blog_count == 0:
            recs.append(
                "블로그 콘텐츠가 없습니다. SEO 트래픽 확보를 위해 "
                "블로그 콘텐츠 전략을 수립하세요."
            )

        # Korean content opportunities: no review-style content detected
        korean = result.korean_content_analysis
        review_count = korean.get("patterns", {}).get("naver_blog_style", {}).get("count", 0)
        if review_count == 0:
            recs.append(
                "후기/리뷰 콘텐츠가 없습니다. 한국 시장에서 후기 콘텐츠는 "
                "전환율에 큰 영향을 미치므로 후기 콘텐츠 생성을 권장합니다."
            )

        # Fallback: everything looks healthy.
        if not recs:
            recs.append("현재 콘텐츠 전략이 양호합니다. 지속적인 모니터링을 권장합니다.")

        return recs
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CLI
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser.

    Options: --url (required), --decay, --type, --limit, --json, --output.
    """
    parser = argparse.ArgumentParser(
        description="SEO Content Auditor - inventory, scoring, and decay detection",
    )

    # (flag, add_argument keyword options) — declaration order defines --help order.
    flag_specs: list[tuple[str, dict]] = [
        ("--url", {"required": True, "help": "Target website URL"}),
        ("--decay", {"action": "store_true", "help": "Enable content decay detection"}),
        ("--type", {"dest": "content_type", "help": "Filter by content type (blog, product, service, landing, resource)"}),
        ("--limit", {"type": int, "default": 200, "help": "Maximum pages to analyze (default: 200)"}),
        ("--json", {"action": "store_true", "help": "Output as JSON"}),
        ("--output", {"help": "Save output to file"}),
    ]
    for flag, options in flag_specs:
        parser.add_argument(flag, **options)

    return parser
|
|
|
|
|
|
def format_text_report(result: ContentAuditResult) -> str:
    """Render an audit result as a human-readable markdown-style report."""
    out: list[str] = []
    emit = out.append

    # Header
    emit(f"## Content Audit: {result.url}")
    emit(f"**Date**: {result.timestamp[:10]}")
    emit("")

    # Inventory summary
    inventory = result.content_inventory
    emit("### Content Inventory")
    emit(f"- Total pages: {inventory.total_pages}")
    emit(f"- Average performance score: {inventory.avg_performance_score}/100")
    emit(f"- Content types: {json.dumps(inventory.by_type, ensure_ascii=False)}")
    emit(f"- Freshness: {json.dumps(inventory.freshness_distribution, ensure_ascii=False)}")
    emit("")

    # Best pages by composite score
    emit("### Top Performers")
    for rank, page in enumerate(result.top_performers[:10], start=1):
        emit(f" {rank}. [{page.performance_score:.0f}] {page.url} (traffic: {page.traffic})")
    emit("")

    # Decay section only when decay detection produced results
    if result.decaying_content:
        emit("### Decaying Content")
        for rank, page in enumerate(result.decaying_content[:10], start=1):
            emit(f" {rank}. [{page.decay_rate:+.0f}%] {page.url} (traffic: {page.traffic})")
        emit("")

    # Korean pattern summary when available
    if result.korean_content_analysis.get("patterns"):
        emit("### Korean Content Patterns")
        for pattern_name, data in result.korean_content_analysis["patterns"].items():
            emit(f" - {pattern_name}: {data['count']} pages")
        emit("")

    emit("### Recommendations")
    for rank, rec in enumerate(result.recommendations, start=1):
        emit(f" {rank}. {rec}")

    return "\n".join(out)
|
|
|
|
|
|
async def main() -> None:
    """CLI entry point: parse arguments, run the audit, emit the report.

    Output goes to --output (UTF-8 file) when given, otherwise stdout.
    The HTTP session is always closed, even when the audit fails.
    """
    parser = build_parser()
    args = parser.parse_args()

    auditor = ContentAuditor()
    try:
        result = await auditor.audit(
            url=args.url,
            detect_decay_flag=args.decay,
            content_type_filter=args.content_type,
            limit=args.limit,
        )

        # JSON mode serializes the full dataclass tree; default=str covers
        # any non-JSON-native values that slip into the result.
        if args.json:
            output = json.dumps(asdict(result), ensure_ascii=False, indent=2, default=str)
        else:
            output = format_text_report(result)

        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(output)
            logger.info(f"Output saved to {args.output}")
        else:
            print(output)

    finally:
        # Always release the aiohttp session. print_stats comes from
        # BaseAsyncClient (project-local) -- presumably request statistics;
        # TODO confirm against base_client.
        await auditor.close()
        auditor.print_stats()
|
|
|
|
|
|
# Script entry point: run the async CLI under asyncio.
if __name__ == "__main__":
    asyncio.run(main())
|