Files
Andrew Yim a3ff965b87 Add SEO skills 19-28, 31-32 with full Python implementations
12 new skills: Keyword Strategy, SERP Analysis, Position Tracking,
Link Building, Content Strategy, E-Commerce SEO, KPI Framework,
International SEO, AI Visibility, Knowledge Graph, Competitor Intel,
and Crawl Budget. ~20K lines of Python across 25 domain scripts.
Updated skill 11 pipeline table and repo CLAUDE.md.
Enhanced skill 18 local SEO workflow from jamie.clinic audit.

Note: Skill 26 hreflang_validator.py pending (content filter block).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 12:05:59 +09:00

720 lines
27 KiB
Python

"""
Competitor Profiler - SEO Competitive Intelligence
===================================================
Purpose: Auto-discover competitors, build profile cards, comparison matrices,
keyword overlap analysis, and competitive threat scoring.
Python: 3.10+
Usage:
python competitor_profiler.py --target https://example.com --json
python competitor_profiler.py --target https://example.com --competitor https://comp1.com --json
python competitor_profiler.py --target https://example.com --max-competitors 10 --korean-market --json
"""
import argparse
import asyncio
import json
import logging
import sys
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Any
from urllib.parse import urlparse
from base_client import BaseAsyncClient, config
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass
class CompetitorProfile:
    """Full profile card for a single domain.

    All fields default to zero/False so a profile can be filled in
    incrementally as each Ahrefs response arrives; an empty response
    yields an all-zero profile rather than an error.
    """

    domain: str  # bare lowercase domain, e.g. "example.com"
    domain_rating: float = 0.0  # Ahrefs Domain Rating (0-100 scale)
    organic_traffic: int = 0  # estimated organic traffic (Ahrefs metric)
    organic_keywords: int = 0  # number of ranking organic keywords
    referring_domains: int = 0  # unique referring domains (backlink stat)
    top_pages_count: int = 0  # pages returned by the top-pages report (capped by request limit)
    traffic_value_usd: float = 0.0  # estimated traffic value in USD
    content_volume: int = 0  # indexed page count from the latest pages-history point
    naver_blog_presence: bool = False  # Korean-market heuristic (TLD or Korean keywords)
    naver_cafe_presence: bool = False  # Korean-market heuristic (TLD only)
@dataclass
class KeywordOverlap:
    """Keyword overlap analysis between target and a competitor."""

    shared: int = 0  # keywords both domains rank for
    unique_target: int = 0  # keywords only the target ranks for
    unique_competitor: int = 0  # keywords only the competitor ranks for
    gap_keywords: int = 0  # competitor-only keywords (same value as unique_competitor)
    overlap_percentage: float = 0.0  # shared / union of both keyword sets, in percent
@dataclass
class ThreatAssessment:
    """Competitive threat score and breakdown for one competitor."""

    domain: str = ""  # competitor domain being assessed
    threat_score: float = 0.0  # weighted 0-100 score (see CompetitorProfiler.score_threat)
    growth_rate: float = 0.0  # NOTE(review): never populated anywhere in this module
    dr_gap: float = 0.0  # competitor DR minus target DR (positive = competitor stronger)
    keyword_overlap_pct: float = 0.0  # copied from KeywordOverlap.overlap_percentage
    traffic_ratio: float = 0.0  # competitor traffic / target traffic
    strengths: list[str] = field(default_factory=list)  # human-readable advantages over target
    weaknesses: list[str] = field(default_factory=list)  # human-readable disadvantages
@dataclass
class ComparisonMatrix:
    """Multi-dimensional comparison matrix across SEO dimensions."""

    dimensions: list[str] = field(default_factory=list)  # dimension names (CompetitorProfiler.DIMENSIONS)
    target_scores: dict[str, float] = field(default_factory=dict)  # dimension -> normalized 0-100 score
    competitor_scores: dict[str, dict[str, float]] = field(default_factory=dict)  # domain -> {dimension -> score}
@dataclass
class CompetitorProfilingResult:
    """Full profiling result with all competitor data."""

    target: str = ""  # bare target domain
    target_profile: CompetitorProfile | None = None  # None if target profiling failed
    competitors: list[dict[str, Any]] = field(default_factory=list)  # entries sorted by threat_score desc
    comparison_matrix: ComparisonMatrix | None = None
    market_position: str = "unknown"  # leader / challenger / follower / niche / unknown
    timestamp: str = ""  # ISO-8601 string from datetime.now() (naive local time)
    errors: list[str] = field(default_factory=list)  # non-fatal errors collected during the run
# ---------------------------------------------------------------------------
# Profiler
# ---------------------------------------------------------------------------
class CompetitorProfiler(BaseAsyncClient):
    """Builds competitor profiles using Ahrefs MCP tools."""

    # SEO dimensions reported in the comparison matrix (order fixed for output).
    DIMENSIONS = ["traffic", "domain_rating", "keywords", "backlinks", "content"]

    def __init__(self, korean_market: bool = False):
        """Initialize with modest concurrency/rate limits for MCP calls.

        Args:
            korean_market: when True, the profiling pipeline also checks
                Naver Blog/Cafe presence for each competitor.
        """
        super().__init__(max_concurrent=5, requests_per_second=2.0)
        self.korean_market = korean_market
@staticmethod
def _extract_domain(url: str) -> str:
"""Extract bare domain from URL or return as-is if already bare."""
if "://" in url:
parsed = urlparse(url)
return parsed.netloc.lower().replace("www.", "")
return url.lower().replace("www.", "")
    # ------------------------------------------------------------------
    # Ahrefs MCP wrappers (return dicts; Claude MCP bridge fills these)
    # ------------------------------------------------------------------
    async def _call_ahrefs(self, tool: str, params: dict[str, Any]) -> dict:
        """Simulate Ahrefs MCP call. In production, routed via MCP bridge.

        Returns a stub payload {"tool", "params", "data"}. "data" is
        always an empty dict here, so every downstream metric defaults
        to zero until the MCP bridge substitutes real responses.
        """
        self.logger.info(f"Ahrefs MCP call: {tool} | params={params}")
        return {"tool": tool, "params": params, "data": {}}
async def discover_competitors(
self, target: str, limit: int = 20
) -> list[str]:
"""Discover organic competitors via site-explorer-organic-competitors."""
domain = self._extract_domain(target)
self.logger.info(f"Discovering competitors for {domain} (limit={limit})")
resp = await self._call_ahrefs(
"site-explorer-organic-competitors",
{"target": domain, "limit": limit, "country": "kr"},
)
competitors_raw: list[dict] = resp.get("data", {}).get("competitors", [])
discovered = []
for entry in competitors_raw:
comp_domain = entry.get("domain", "")
if comp_domain and comp_domain != domain:
discovered.append(comp_domain)
if not discovered:
self.logger.warning(
"No competitors returned from Ahrefs; "
"check that the target domain has organic traffic."
)
else:
self.logger.info(f"Discovered {len(discovered)} competitors")
return discovered[:limit]
    async def build_profile(self, domain: str) -> CompetitorProfile:
        """Build a complete profile card for a single domain.

        Issues five Ahrefs MCP calls (metrics, domain rating, backlink
        stats, top pages, pages history) and folds the responses into a
        CompetitorProfile. Missing response keys default to zero, so an
        empty MCP payload produces an all-zero profile, not an error.
        """
        domain = self._extract_domain(domain)
        profile = CompetitorProfile(domain=domain)
        # --- Metrics: organic traffic, keyword count, traffic value ---
        metrics_resp = await self._call_ahrefs(
            "site-explorer-metrics", {"target": domain}
        )
        metrics = metrics_resp.get("data", {})
        profile.organic_traffic = int(metrics.get("organic_traffic", 0))
        profile.organic_keywords = int(metrics.get("organic_keywords", 0))
        profile.traffic_value_usd = float(metrics.get("traffic_value", 0.0))
        # --- Domain Rating (already on a 0-100 scale) ---
        dr_resp = await self._call_ahrefs(
            "site-explorer-domain-rating", {"target": domain}
        )
        dr_data = dr_resp.get("data", {})
        profile.domain_rating = float(dr_data.get("domain_rating", 0.0))
        # --- Referring Domains ---
        bl_resp = await self._call_ahrefs(
            "site-explorer-backlinks-stats", {"target": domain}
        )
        bl_data = bl_resp.get("data", {})
        profile.referring_domains = int(bl_data.get("referring_domains", 0))
        # --- Top Pages (count capped by the limit=1000 request) ---
        pages_resp = await self._call_ahrefs(
            "site-explorer-top-pages", {"target": domain, "limit": 1000}
        )
        pages_data = pages_resp.get("data", {})
        profile.top_pages_count = len(pages_data.get("pages", []))
        # --- Content Volume (pages indexed) ---
        history_resp = await self._call_ahrefs(
            "site-explorer-pages-history", {"target": domain}
        )
        history_data = history_resp.get("data", {})
        data_points = history_data.get("data_points", [])
        if data_points:
            # Assumes history is chronological; takes the most recent sample.
            latest = data_points[-1]
            profile.content_volume = int(latest.get("pages", 0))
        self.logger.info(
            f"Profile built for {domain}: DR={profile.domain_rating}, "
            f"traffic={profile.organic_traffic}, keywords={profile.organic_keywords}"
        )
        return profile
async def analyze_keyword_overlap(
self, target: str, competitor: str, limit: int = 1000
) -> KeywordOverlap:
"""Analyze keyword overlap between target and a single competitor."""
target_domain = self._extract_domain(target)
comp_domain = self._extract_domain(competitor)
# Fetch keyword sets for both domains
target_resp = await self._call_ahrefs(
"site-explorer-organic-keywords",
{"target": target_domain, "limit": limit},
)
comp_resp = await self._call_ahrefs(
"site-explorer-organic-keywords",
{"target": comp_domain, "limit": limit},
)
target_kws: set[str] = set()
for kw in target_resp.get("data", {}).get("keywords", []):
keyword = kw.get("keyword", "")
if keyword:
target_kws.add(keyword.lower())
comp_kws: set[str] = set()
for kw in comp_resp.get("data", {}).get("keywords", []):
keyword = kw.get("keyword", "")
if keyword:
comp_kws.add(keyword.lower())
shared = target_kws & comp_kws
unique_target = target_kws - comp_kws
unique_comp = comp_kws - target_kws
gap = unique_comp # keywords the competitor ranks for but target does not
total_union = len(target_kws | comp_kws) or 1
overlap_pct = (len(shared) / total_union) * 100.0
overlap = KeywordOverlap(
shared=len(shared),
unique_target=len(unique_target),
unique_competitor=len(unique_comp),
gap_keywords=len(gap),
overlap_percentage=round(overlap_pct, 2),
)
self.logger.info(
f"Keyword overlap {target_domain} vs {comp_domain}: "
f"shared={overlap.shared}, gap={overlap.gap_keywords}"
)
return overlap
def build_comparison_matrix(
self,
target_profile: CompetitorProfile,
competitor_profiles: list[CompetitorProfile],
) -> ComparisonMatrix:
"""Create a multi-dimensional comparison matrix."""
matrix = ComparisonMatrix(dimensions=list(self.DIMENSIONS))
# Normalize scores to 0-100 scale relative to max in competitive set
all_profiles = [target_profile] + competitor_profiles
def _max_val(attr: str) -> float:
return max(getattr(p, attr, 0) for p in all_profiles) or 1
max_traffic = _max_val("organic_traffic")
max_dr = 100.0 # DR is already 0-100
max_kw = _max_val("organic_keywords")
max_rd = _max_val("referring_domains")
max_content = _max_val("content_volume")
def _norm(profile: CompetitorProfile) -> dict[str, float]:
return {
"traffic": round((profile.organic_traffic / max_traffic) * 100, 1),
"domain_rating": round(profile.domain_rating, 1),
"keywords": round((profile.organic_keywords / max_kw) * 100, 1),
"backlinks": round((profile.referring_domains / max_rd) * 100, 1),
"content": round((profile.content_volume / max_content) * 100, 1)
if max_content > 0
else 0.0,
}
matrix.target_scores = _norm(target_profile)
for cp in competitor_profiles:
matrix.competitor_scores[cp.domain] = _norm(cp)
return matrix
    def score_threat(
        self,
        target_profile: CompetitorProfile,
        competitor_profile: CompetitorProfile,
        overlap: KeywordOverlap,
    ) -> ThreatAssessment:
        """Score competitive threat 0-100 based on multiple factors.

        Weighted blend: DR gap 20%, traffic ratio 30%, keyword overlap
        25%, gap keywords 25%; each component is clamped to 0-100 before
        weighting. Also records human-readable strength/weakness notes
        relative to the target.
        """
        assessment = ThreatAssessment(domain=competitor_profile.domain)
        # --- DR gap (positive = competitor stronger) ---
        dr_gap = competitor_profile.domain_rating - target_profile.domain_rating
        assessment.dr_gap = round(dr_gap, 1)
        dr_score = min(max((dr_gap + 30) / 60 * 100, 0), 100)  # scale -30..+30 -> 0-100
        # --- Traffic ratio (max(..., 1) guards against a zero-traffic target) ---
        target_traffic = max(target_profile.organic_traffic, 1)
        traffic_ratio = competitor_profile.organic_traffic / target_traffic
        assessment.traffic_ratio = round(traffic_ratio, 2)
        traffic_score = min(traffic_ratio * 50, 100)  # 2x traffic = 100
        # --- Keyword overlap percentage ---
        assessment.keyword_overlap_pct = overlap.overlap_percentage
        overlap_score = min(overlap.overlap_percentage * 2, 100)  # 50% overlap = 100
        # --- Gap keywords (competitor ranks, target doesn't) ---
        # Ratio is relative to the target's own keyword count, clamped at 100.
        total_target_kw = max(overlap.shared + overlap.unique_target, 1)
        gap_ratio = overlap.gap_keywords / total_target_kw
        gap_score = min(gap_ratio * 100, 100)
        # --- Weighted threat score (weights sum to 1.0) ---
        threat = (
            dr_score * 0.20
            + traffic_score * 0.30
            + overlap_score * 0.25
            + gap_score * 0.25
        )
        assessment.threat_score = round(min(max(threat, 0), 100), 1)
        # --- Identify strengths & weaknesses (threshold-based notes) ---
        if dr_gap > 5:
            assessment.strengths.append(f"Higher DR by {dr_gap:.0f} points")
        elif dr_gap < -5:
            assessment.weaknesses.append(f"Lower DR by {abs(dr_gap):.0f} points")
        if traffic_ratio > 1.5:
            assessment.strengths.append(
                f"Traffic {traffic_ratio:.1f}x higher than target"
            )
        elif traffic_ratio < 0.5:
            assessment.weaknesses.append(
                f"Traffic only {traffic_ratio:.1f}x of target"
            )
        if overlap.gap_keywords > overlap.shared:
            assessment.strengths.append(
                f"{overlap.gap_keywords} keywords target is missing"
            )
        # Referring-domain and content-volume notes use a 1.5x / 0.5x band.
        if competitor_profile.referring_domains > target_profile.referring_domains * 1.5:
            assessment.strengths.append("Significantly more referring domains")
        elif competitor_profile.referring_domains < target_profile.referring_domains * 0.5:
            assessment.weaknesses.append("Fewer referring domains")
        if competitor_profile.content_volume > target_profile.content_volume * 1.5:
            assessment.strengths.append("Larger content volume")
        elif competitor_profile.content_volume < target_profile.content_volume * 0.5:
            assessment.weaknesses.append("Smaller content library")
        self.logger.info(
            f"Threat score for {competitor_profile.domain}: "
            f"{assessment.threat_score}/100"
        )
        return assessment
async def detect_korean_presence(self, domain: str) -> dict[str, bool]:
"""Check Naver Blog/Cafe presence for a domain (heuristic)."""
domain = self._extract_domain(domain)
self.logger.info(f"Checking Korean market presence for {domain}")
# In production, this would use WebSearch MCP to query Naver
# Heuristic: check if domain has .co.kr or .kr TLD,
# or has Korean-language top pages
is_korean_tld = domain.endswith(".kr") or domain.endswith(".co.kr")
# Check top pages for Korean content signals
pages_resp = await self._call_ahrefs(
"site-explorer-organic-keywords",
{"target": domain, "limit": 50, "country": "kr"},
)
kr_keywords = pages_resp.get("data", {}).get("keywords", [])
has_kr_keywords = len(kr_keywords) > 0
return {
"naver_blog_presence": is_korean_tld or has_kr_keywords,
"naver_cafe_presence": is_korean_tld,
"korean_tld": is_korean_tld,
"korean_keyword_count": len(kr_keywords),
}
    def determine_market_position(
        self,
        target_profile: CompetitorProfile,
        competitor_profiles: list[CompetitorProfile],
    ) -> str:
        """Classify target as leader / challenger / follower / niche.

        Combines the target's traffic-rank percentile within the set,
        its DR advantage over the competitor average, and its share of
        the set's maximum traffic. Returns "unknown" when there are no
        competitors to compare against.
        """
        if not competitor_profiles:
            return "unknown"
        all_profiles = [target_profile] + competitor_profiles
        # Sorting the freshly-built combined list; competitor_profiles is untouched.
        all_profiles.sort(key=lambda p: p.organic_traffic, reverse=True)
        # Rank of the target by traffic (0 = highest), matched by domain name.
        target_rank = next(
            (i for i, p in enumerate(all_profiles) if p.domain == target_profile.domain),
            len(all_profiles),
        )
        total = len(all_profiles)
        percentile = target_rank / total  # 0.0 = top of the set
        # DR comparison against the competitor average
        avg_competitor_dr = (
            sum(p.domain_rating for p in competitor_profiles) / len(competitor_profiles)
            if competitor_profiles
            else 0
        )
        dr_advantage = target_profile.domain_rating - avg_competitor_dr
        # Traffic leader check (`or 1` avoids division by zero on an all-zero set)
        max_traffic = max(p.organic_traffic for p in all_profiles) or 1
        traffic_share = target_profile.organic_traffic / max_traffic
        if percentile <= 0.1 and traffic_share >= 0.8:
            return "leader"
        elif percentile <= 0.33 or (dr_advantage > 10 and traffic_share > 0.5):
            return "challenger"
        elif percentile <= 0.66:
            return "follower"
        else:
            # Check if niche player (high DR but low traffic = niche authority)
            if target_profile.domain_rating > avg_competitor_dr:
                return "niche"
            return "follower"
    async def profile(
        self,
        target: str,
        competitors: list[str] | None = None,
        max_competitors: int = 10,
    ) -> CompetitorProfilingResult:
        """Orchestrate the full competitor profiling pipeline.

        Steps: build target profile, discover/validate competitors,
        profile each competitor (optionally with Korean-market checks),
        analyze keyword overlaps, build the comparison matrix, score
        threats, and classify market position. Per-competitor failures
        are recorded in ``result.errors`` and do not abort the run.

        Args:
            target: target URL or bare domain.
            competitors: explicit competitor list; when falsy, competitors
                are auto-discovered via Ahrefs.
            max_competitors: cap on the number of competitors profiled.
        """
        timestamp = datetime.now().isoformat()
        result = CompetitorProfilingResult(
            target=self._extract_domain(target),
            timestamp=timestamp,
        )
        try:
            # Step 1: Build target profile
            self.logger.info("Step 1/6: Building target profile...")
            target_profile = await self.build_profile(target)
            result.target_profile = target_profile
            # Step 2: Discover or validate competitors
            self.logger.info("Step 2/6: Discovering competitors...")
            if competitors:
                comp_domains = [self._extract_domain(c) for c in competitors]
            else:
                comp_domains = await self.discover_competitors(
                    target, limit=max_competitors
                )
            if not comp_domains:
                # Nothing to compare against; return the partial result early.
                result.errors.append("No competitors found or provided.")
                return result
            comp_domains = comp_domains[:max_competitors]
            # Step 3: Build competitor profiles
            self.logger.info(
                f"Step 3/6: Profiling {len(comp_domains)} competitors..."
            )
            competitor_profiles: list[CompetitorProfile] = []
            for domain in comp_domains:
                try:
                    cp = await self.build_profile(domain)
                    if self.korean_market:
                        kr_presence = await self.detect_korean_presence(domain)
                        cp.naver_blog_presence = kr_presence.get(
                            "naver_blog_presence", False
                        )
                        cp.naver_cafe_presence = kr_presence.get(
                            "naver_cafe_presence", False
                        )
                    competitor_profiles.append(cp)
                except Exception as e:
                    # Best-effort: skip this competitor but keep the run alive.
                    msg = f"Failed to profile {domain}: {e}"
                    self.logger.error(msg)
                    result.errors.append(msg)
            # Step 4: Keyword overlap analysis
            self.logger.info("Step 4/6: Analyzing keyword overlaps...")
            overlaps: dict[str, KeywordOverlap] = {}
            for cp in competitor_profiles:
                try:
                    overlap = await self.analyze_keyword_overlap(target, cp.domain)
                    overlaps[cp.domain] = overlap
                except Exception as e:
                    msg = f"Keyword overlap failed for {cp.domain}: {e}"
                    self.logger.error(msg)
                    result.errors.append(msg)
                    # Fall back to an empty overlap so threat scoring still runs.
                    overlaps[cp.domain] = KeywordOverlap()
            # Step 5: Build comparison matrix
            self.logger.info("Step 5/6: Building comparison matrix...")
            matrix = self.build_comparison_matrix(target_profile, competitor_profiles)
            result.comparison_matrix = matrix
            # Step 6: Score threats and assemble output
            self.logger.info("Step 6/6: Scoring competitive threats...")
            for cp in competitor_profiles:
                overlap = overlaps.get(cp.domain, KeywordOverlap())
                threat = self.score_threat(target_profile, cp, overlap)
                competitor_entry = {
                    "domain": cp.domain,
                    "profile": asdict(cp),
                    "threat_score": threat.threat_score,
                    "threat_detail": asdict(threat),
                    "keyword_overlap": asdict(overlap),
                }
                result.competitors.append(competitor_entry)
            # Sort by threat score descending
            result.competitors.sort(
                key=lambda c: c.get("threat_score", 0), reverse=True
            )
            # Determine market position
            result.market_position = self.determine_market_position(
                target_profile, competitor_profiles
            )
            self.logger.info(
                f"Profiling complete: {len(result.competitors)} competitors analyzed. "
                f"Market position: {result.market_position}"
            )
        except Exception as e:
            # Top-level boundary: record the failure and return a partial result.
            msg = f"Profiling pipeline error: {e}"
            self.logger.error(msg)
            result.errors.append(msg)
        return result
# ---------------------------------------------------------------------------
# Output helpers
# ---------------------------------------------------------------------------
def _format_text_report(result: CompetitorProfilingResult) -> str:
    """Format profiling result as a human-readable text report.

    Sections: header, target profile, competitors (in the order the
    pipeline produced, i.e. sorted by threat score), comparison matrix,
    and any errors. Output is fixed-width text with aligned columns.
    """
    lines: list[str] = []
    lines.append("=" * 70)
    lines.append(f" COMPETITOR INTELLIGENCE REPORT")
    lines.append(f" Target: {result.target}")
    lines.append(f" Generated: {result.timestamp}")
    lines.append(f" Market Position: {result.market_position.upper()}")
    lines.append("=" * 70)
    if result.target_profile:
        tp = result.target_profile
        lines.append("")
        lines.append("--- TARGET PROFILE ---")
        lines.append(f" Domain Rating: {tp.domain_rating}")
        lines.append(f" Organic Traffic: {tp.organic_traffic:,}")
        lines.append(f" Organic Keywords: {tp.organic_keywords:,}")
        lines.append(f" Referring Domains: {tp.referring_domains:,}")
        lines.append(f" Top Pages: {tp.top_pages_count:,}")
        lines.append(f" Content Volume: {tp.content_volume:,}")
        lines.append(f" Traffic Value: ${tp.traffic_value_usd:,.2f}")
    if result.competitors:
        lines.append("")
        lines.append("--- COMPETITORS (sorted by threat score) ---")
        for i, comp in enumerate(result.competitors, 1):
            # Each entry was assembled in profile(): profile/threat/overlap dicts.
            p = comp["profile"]
            t = comp["threat_detail"]
            o = comp["keyword_overlap"]
            lines.append("")
            lines.append(f" #{i} {comp['domain']}")
            lines.append(f" Threat Score: {comp['threat_score']}/100")
            lines.append(f" Domain Rating: {p['domain_rating']}")
            lines.append(f" Organic Traffic: {p['organic_traffic']:,}")
            lines.append(f" Keywords: {p['organic_keywords']:,}")
            lines.append(f" Referring Doms: {p['referring_domains']:,}")
            lines.append(f" Keyword Overlap: {o['shared']} shared, {o['gap_keywords']} gap")
            if t.get("strengths"):
                lines.append(f" Strengths: {'; '.join(t['strengths'])}")
            if t.get("weaknesses"):
                lines.append(f" Weaknesses: {'; '.join(t['weaknesses'])}")
    if result.comparison_matrix:
        m = result.comparison_matrix
        lines.append("")
        lines.append("--- COMPARISON MATRIX ---")
        header = f" {'Dimension':<20} {'Target':>10}"
        for domain in m.competitor_scores:
            # Domains truncated to 15 chars to keep columns aligned.
            short = domain[:15]
            header += f" {short:>15}"
        lines.append(header)
        lines.append(" " + "-" * (len(header) - 2))
        for dim in m.dimensions:
            row = f" {dim:<20} {m.target_scores.get(dim, 0):>10.1f}"
            for domain, scores in m.competitor_scores.items():
                row += f" {scores.get(dim, 0):>15.1f}"
            lines.append(row)
    if result.errors:
        lines.append("")
        lines.append("--- ERRORS ---")
        for err in result.errors:
            lines.append(f" - {err}")
    lines.append("")
    lines.append("=" * 70)
    return "\n".join(lines)
def _serialize_result(result: CompetitorProfilingResult) -> dict:
    """Convert result to a JSON-serializable dict.

    Dataclass members are expanded via asdict; the "errors" key is only
    included when at least one error was recorded.
    """
    payload: dict = {
        "target": result.target,
        "target_profile": None,
        "competitors": result.competitors,
        "comparison_matrix": None,
        "market_position": result.market_position,
        "timestamp": result.timestamp,
    }
    if result.target_profile is not None:
        payload["target_profile"] = asdict(result.target_profile)
    if result.comparison_matrix is not None:
        payload["comparison_matrix"] = asdict(result.comparison_matrix)
    if result.errors:
        payload["errors"] = result.errors
    return payload
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="SEO Competitor Profiler - Build competitive intelligence reports",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""\
Examples:
python competitor_profiler.py --target https://example.com --json
python competitor_profiler.py --target https://example.com --competitor https://comp1.com --json
python competitor_profiler.py --target https://example.com --max-competitors 10 --korean-market --json
""",
)
parser.add_argument(
"--target",
required=True,
help="Target website URL or domain to analyze",
)
parser.add_argument(
"--competitor",
action="append",
dest="competitors",
default=[],
help="Competitor URL/domain (repeatable; omit for auto-discovery)",
)
parser.add_argument(
"--max-competitors",
type=int,
default=10,
help="Maximum competitors to profile (default: 10)",
)
parser.add_argument(
"--korean-market",
action="store_true",
default=False,
help="Include Korean market analysis (Naver Blog/Cafe presence)",
)
parser.add_argument(
"--json",
action="store_true",
default=False,
help="Output in JSON format",
)
parser.add_argument(
"--output",
type=str,
default=None,
help="Save output to file path",
)
return parser.parse_args(argv)
async def async_main(args: argparse.Namespace) -> None:
    """Run the profiler with parsed CLI args and emit the report.

    Writes the report to args.output when given, otherwise prints it,
    then dumps client statistics.
    """
    profiler = CompetitorProfiler(korean_market=args.korean_market)
    result = await profiler.profile(
        target=args.target,
        competitors=args.competitors or None,
        max_competitors=args.max_competitors,
    )
    report = (
        json.dumps(_serialize_result(result), indent=2, ensure_ascii=False)
        if args.json
        else _format_text_report(result)
    )
    if args.output:
        with open(args.output, "w", encoding="utf-8") as fh:
            fh.write(report)
        logger.info(f"Report saved to {args.output}")
    else:
        print(report)
    profiler.print_stats()
def main() -> None:
    """CLI entry point: parse arguments and run the async pipeline."""
    asyncio.run(async_main(parse_args()))


if __name__ == "__main__":
    main()