Files
our-claude-skills/custom-skills/19-seo-keyword-strategy/code/scripts/keyword_gap_analyzer.py
Andrew Yim a3ff965b87 Add SEO skills 19-28, 31-32 with full Python implementations
12 new skills: Keyword Strategy, SERP Analysis, Position Tracking,
Link Building, Content Strategy, E-Commerce SEO, KPI Framework,
International SEO, AI Visibility, Knowledge Graph, Competitor Intel,
and Crawl Budget. ~20K lines of Python across 25 domain scripts.
Updated skill 11 pipeline table and repo CLAUDE.md.
Enhanced skill 18 local SEO workflow from jamie.clinic audit.

Note: Skill 26 hreflang_validator.py pending (content filter block).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 12:05:59 +09:00

585 lines
21 KiB
Python

"""
Keyword Gap Analyzer - Competitor keyword gap analysis with opportunity scoring
===============================================================================
Purpose: Identify keywords competitors rank for but target site doesn't,
score opportunities, and prioritize by volume/difficulty ratio.
Python: 3.10+
"""
import argparse
import json
import logging
import re
import subprocess
import sys
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Optional
from urllib.parse import urlparse
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Configure root logging once at import time; main() raises the root logger
# to DEBUG when --verbose is passed.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
# Module-level logger shared by every helper in this script.
logger = logging.getLogger("keyword_gap_analyzer")
# ---------------------------------------------------------------------------
# Intent classification patterns (shared with keyword_researcher)
# ---------------------------------------------------------------------------
# Regex patterns used by classify_intent() to bucket a keyword into one
# search intent. Intents are checked in insertion order (transactional
# first), so the first matching bucket wins; keywords matching nothing
# default to "informational". Patterns mix Korean and English terms.
INTENT_PATTERNS: dict[str, list[str]] = {
    "transactional": [
        # Purchase / order / discount vocabulary (ko + en).
        r"구매|구입|주문|buy|order|purchase|shop|deal|discount|coupon|할인|쿠폰",
        # Booking and sign-up actions.
        r"예약|booking|reserve|sign\s?up|register|등록|신청",
    ],
    "commercial": [
        # Price / cost research.
        r"가격|비용|얼마|price|cost|pricing|fee|요금",
        # Comparison and review queries.
        r"추천|best|top\s?\d|review|비교|compare|vs|versus|후기|리뷰|평점|평가",
        # Korean "best place / recommended clinic" style queries.
        r"잘하는곳|잘하는|맛집|업체|병원|추천\s?병원",
    ],
    "navigational": [
        # Looks like a URL fragment (anchored at string start).
        r"^(www\.|http|\.com|\.co\.kr|\.net)",
        # Brand / official-site lookups.
        r"공식|official|login|로그인|홈페이지|사이트|website",
        # Customer-service lookups.
        r"고객센터|contact|support|customer\s?service",
    ],
    "informational": [
        # How-to and question words.
        r"방법|how\s?to|what\s?is|why|when|where|who|which",
        # Definition / guide queries.
        r"뜻|의미|정의|definition|meaning|guide|tutorial",
        # Symptom / cause / type research (medical-ish vocabulary).
        r"효과|부작용|증상|원인|차이|종류|type|cause|symptom|effect",
        # Before/after and results queries.
        r"전후|before\s?and\s?after|결과|result",
    ],
}
# ---------------------------------------------------------------------------
# Dataclasses
# ---------------------------------------------------------------------------
@dataclass
class OrganicKeyword:
    """A keyword that a domain ranks for organically.

    One row of the Ahrefs organic-keywords report; numeric fields default
    to 0 when the API omits them.
    """

    keyword: str        # the search query itself
    position: int = 0   # organic ranking position (0 = unknown/missing)
    volume: int = 0     # monthly search volume
    kd: float = 0.0     # keyword difficulty score
    cpc: float = 0.0    # estimated cost-per-click
    url: str = ""       # URL that ranks for this keyword
    traffic: int = 0    # estimated monthly traffic from this keyword
@dataclass
class GapKeyword:
    """A keyword gap between target and competitor(s).

    Represents one keyword that at least one competitor ranks for while the
    target does not. Positions and URLs are tracked per competitor domain.
    """

    keyword: str                      # the gap keyword (first-seen casing)
    volume: int = 0                   # highest volume seen across competitors
    kd: float = 0.0                   # lowest non-zero KD seen across competitors
    cpc: float = 0.0
    intent: str = "informational"     # bucket from classify_intent()
    opportunity_score: float = 0.0    # filled in by score_opportunities()
    competitor_positions: dict[str, int] = field(default_factory=dict)
    competitor_urls: dict[str, str] = field(default_factory=dict)
    avg_competitor_position: float = 0.0

    def to_dict(self) -> dict:
        """Serialize this gap keyword to a plain dict (field order preserved)."""
        return asdict(self)


@dataclass
class GapAnalysisResult:
    """Complete gap analysis result produced by KeywordGapAnalyzer.analyze()."""

    target: str                       # normalized target domain
    competitors: list[str] = field(default_factory=list)
    country: str = "kr"
    total_gaps: int = 0
    total_opportunity_volume: int = 0
    gaps_by_intent: dict[str, int] = field(default_factory=dict)
    top_opportunities: list[GapKeyword] = field(default_factory=list)  # best 50 by score
    all_gaps: list[GapKeyword] = field(default_factory=list)
    target_keyword_count: int = 0
    competitor_keyword_counts: dict[str, int] = field(default_factory=dict)
    timestamp: str = ""               # ISO-8601 analysis time

    def to_dict(self) -> dict:
        """Serialize the full result to a plain dict.

        dataclasses.asdict recursively converts the nested GapKeyword
        entries, yielding the same structure and key order as the fields
        above.
        """
        return asdict(self)
# ---------------------------------------------------------------------------
# MCP Helper
# ---------------------------------------------------------------------------
def call_mcp_tool(tool_name: str, params: dict) -> dict:
    """
    Call an Ahrefs MCP tool and return its parsed JSON response.

    In production this delegates to the MCP bridge. For standalone usage
    it invokes the Claude CLI with the appropriate tool call.

    Args:
        tool_name: MCP tool name (the suffix after ``mcp__claude_ai_Ahrefs__``).
        params: JSON-serializable parameters forwarded to the tool.

    Returns:
        A dict. On success, the tool's JSON object. On any failure path
        (non-zero exit, timeout, missing CLI, unparseable output, or output
        that is valid JSON but not an object) a fallback dict that always
        carries empty "keywords"/"items" lists so callers can iterate
        without further type checks.
    """
    # Lazy %-style args: the params dump is only built if INFO is enabled.
    logger.info(
        "Calling MCP tool: %s with params: %s",
        tool_name,
        json.dumps(params, ensure_ascii=False),
    )
    empty = {"keywords": [], "items": []}
    try:
        cmd = [
            "claude",
            "--print",
            "--output-format", "json",
            "-p",
            (
                f"Call the tool mcp__claude_ai_Ahrefs__{tool_name} with these parameters: "
                f"{json.dumps(params, ensure_ascii=False)}. Return ONLY the raw JSON result."
            ),
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
        if result.returncode != 0:
            logger.warning(
                "MCP tool %s returned non-zero exit code: %s",
                tool_name, result.returncode,
            )
            logger.debug("stderr: %s", result.stderr)
            return {"error": result.stderr, **empty}
        try:
            parsed = json.loads(result.stdout)
        except json.JSONDecodeError:
            return {"raw": result.stdout, **empty}
        # Fix: the CLI may emit valid JSON that is not an object (e.g. a bare
        # list or string); callers immediately use dict.get(), so wrap
        # anything non-dict in the fallback shape instead of returning it.
        if not isinstance(parsed, dict):
            return {"raw": result.stdout, **empty}
        return parsed
    except subprocess.TimeoutExpired:
        logger.error("MCP tool %s timed out", tool_name)
        return {"error": "timeout", **empty}
    except FileNotFoundError:
        # No Claude CLI on PATH: degrade gracefully for offline/unit testing.
        logger.warning("Claude CLI not found - returning empty result for standalone testing")
        return dict(empty)
# ---------------------------------------------------------------------------
# Utility functions
# ---------------------------------------------------------------------------
def extract_domain(url: str) -> str:
    """Normalize a URL or bare host into a clean, lowercase domain.

    A scheme is prepended when absent so that urlparse populates ``netloc``;
    the result is lowercased, slash-trimmed, and stripped of any leading
    ``www.`` prefix.
    """
    target = url if url.startswith(("http://", "https://")) else f"https://{url}"
    parts = urlparse(target)
    # Fall back to the path component for inputs urlparse treats as path-only.
    host = (parts.netloc or parts.path).lower().strip("/")
    return host.removeprefix("www.")
def classify_intent(keyword: str) -> str:
    """Return the search-intent bucket for *keyword*.

    Intents are tested in INTENT_PATTERNS insertion order; the first intent
    with any matching pattern wins, and unmatched keywords fall back to
    "informational".
    """
    normalized = keyword.lower().strip()
    for intent, patterns in INTENT_PATTERNS.items():
        if any(re.search(p, normalized, re.IGNORECASE) for p in patterns):
            return intent
    return "informational"
# ---------------------------------------------------------------------------
# KeywordGapAnalyzer
# ---------------------------------------------------------------------------
class KeywordGapAnalyzer:
    """Analyze keyword gaps between a target site and its competitors.

    Workflow: fetch organic keywords for the target and each competitor via
    the Ahrefs MCP bridge (call_mcp_tool), diff the keyword sets to find
    gaps, then score and rank the gaps by opportunity.
    """

    def __init__(self, country: str = "kr", min_volume: int = 0) -> None:
        # country: Ahrefs country code applied to every keyword lookup.
        self.country = country
        # min_volume: gap keywords below this monthly volume are discarded.
        self.min_volume = min_volume

    def get_organic_keywords(self, domain: str, limit: int = 1000) -> list[OrganicKeyword]:
        """
        Fetch organic keywords for a domain via Ahrefs site-explorer-organic-keywords.
        Returns a list of OrganicKeyword entries.

        Args:
            domain: URL or bare domain; normalized through extract_domain().
            limit: maximum number of keyword rows requested from the API.
        """
        clean_domain = extract_domain(domain)
        logger.info(f"Fetching organic keywords for: {clean_domain} (limit={limit})")
        result = call_mcp_tool("site-explorer-organic-keywords", {
            "target": clean_domain,
            "country": self.country,
            "limit": limit,
            "mode": "domain",
        })
        keywords: list[OrganicKeyword] = []
        # The MCP response schema varies: rows may live under "keywords" or
        # "items", and each field has alternate names. Tolerate missing or
        # null numeric values via the `or 0` fallback before conversion.
        for item in result.get("keywords", result.get("items", [])):
            if not isinstance(item, dict):
                continue
            kw = OrganicKeyword(
                keyword=item.get("keyword", item.get("term", "")),
                position=int(item.get("position", item.get("rank", 0)) or 0),
                volume=int(item.get("volume", item.get("search_volume", 0)) or 0),
                kd=float(item.get("keyword_difficulty", item.get("kd", 0)) or 0),
                cpc=float(item.get("cpc", item.get("cost_per_click", 0)) or 0),
                url=item.get("url", item.get("best_position_url", "")),
                traffic=int(item.get("traffic", item.get("estimated_traffic", 0)) or 0),
            )
            # Drop rows with no keyword text at all.
            if kw.keyword:
                keywords.append(kw)
        logger.info(f"Found {len(keywords)} organic keywords for {clean_domain}")
        return keywords

    def find_gaps(
        self,
        target_keywords: list[OrganicKeyword],
        competitor_keyword_sets: dict[str, list[OrganicKeyword]],
    ) -> list[GapKeyword]:
        """
        Identify keywords that competitors rank for but the target doesn't.
        A gap keyword is one that appears in at least one competitor's keyword
        set but not in the target's keyword set.

        Keywords are deduplicated case-insensitively; per-competitor positions
        and URLs are accumulated on a single GapKeyword entry.
        """
        # Build target keyword set (lowercased) for O(1) membership tests.
        target_kw_set: set[str] = {kw.keyword.lower().strip() for kw in target_keywords}
        # Collect all competitor keywords with their positions, keyed by the
        # normalized keyword text.
        gap_map: dict[str, GapKeyword] = {}
        for comp_domain, comp_keywords in competitor_keyword_sets.items():
            for ckw in comp_keywords:
                kw_lower = ckw.keyword.lower().strip()
                # Skip if target already ranks for this keyword.
                if kw_lower in target_kw_set:
                    continue
                # Skip below minimum volume.
                if ckw.volume < self.min_volume:
                    continue
                if kw_lower not in gap_map:
                    # First competitor seen for this keyword seeds the entry
                    # (keeps that competitor's casing and metrics).
                    gap_map[kw_lower] = GapKeyword(
                        keyword=ckw.keyword,
                        volume=ckw.volume,
                        kd=ckw.kd,
                        cpc=ckw.cpc,
                        intent=classify_intent(ckw.keyword),
                        competitor_positions={},
                        competitor_urls={},
                    )
                gap_map[kw_lower].competitor_positions[comp_domain] = ckw.position
                gap_map[kw_lower].competitor_urls[comp_domain] = ckw.url
                # Merge rule: keep the highest volume seen across competitors,
                # and the lowest non-zero KD (most optimistic difficulty).
                if ckw.volume > gap_map[kw_lower].volume:
                    gap_map[kw_lower].volume = ckw.volume
                if ckw.kd > 0 and (gap_map[kw_lower].kd == 0 or ckw.kd < gap_map[kw_lower].kd):
                    gap_map[kw_lower].kd = ckw.kd
        gaps = list(gap_map.values())
        # Calculate average competitor position for each gap (0.0 when no
        # positions were recorded).
        for gap in gaps:
            positions = list(gap.competitor_positions.values())
            gap.avg_competitor_position = round(
                sum(positions) / len(positions), 1
            ) if positions else 0.0
        logger.info(f"Found {len(gaps)} keyword gaps")
        return gaps

    def score_opportunities(self, gaps: list[GapKeyword]) -> list[GapKeyword]:
        """
        Score each gap keyword by opportunity potential.

        Mutates the given GapKeyword entries in place (sets
        opportunity_score), sorts the list by score descending, and returns
        the same list.

        Formula:
            opportunity_score = (volume_score * 0.4) + (kd_score * 0.3) +
                                (position_score * 0.2) + (intent_score * 0.1)
        Where:
            - volume_score: normalized 0-100 based on max volume in set
            - kd_score: inverted (lower KD = higher score), normalized 0-100
            - position_score: based on avg competitor position (lower = easier to compete)
            - intent_score: commercial/transactional get higher scores
        """
        if not gaps:
            return gaps
        # Find max volume for normalization (floor of 1 avoids divide-by-zero
        # when every gap has zero volume).
        max_volume = max(g.volume for g in gaps) if gaps else 1
        max_volume = max(max_volume, 1)
        # Fixed intent weighting: buying intent outranks research intent.
        intent_scores = {
            "transactional": 100,
            "commercial": 80,
            "informational": 40,
            "navigational": 20,
        }
        for gap in gaps:
            # Volume score (0-100, relative to the largest gap in this set).
            volume_score = (gap.volume / max_volume) * 100
            # KD score (inverted: low KD = high score; clamped at 0).
            kd_score = max(0, 100 - gap.kd)
            # Position score (competitors ranking 1-10 means realistic opportunity).
            if gap.avg_competitor_position <= 10:
                position_score = 90
            elif gap.avg_competitor_position <= 20:
                position_score = 70
            elif gap.avg_competitor_position <= 50:
                position_score = 50
            else:
                position_score = 30
            # Intent score (unknown intents default to the informational weight).
            intent_score = intent_scores.get(gap.intent, 40)
            # Combined weighted score, rounded to one decimal.
            gap.opportunity_score = round(
                (volume_score * 0.4) +
                (kd_score * 0.3) +
                (position_score * 0.2) +
                (intent_score * 0.1),
                1,
            )
        # Sort by opportunity score descending (in place).
        gaps.sort(key=lambda g: g.opportunity_score, reverse=True)
        logger.info(f"Scored {len(gaps)} gap keywords by opportunity")
        return gaps

    def analyze(self, target_url: str, competitor_urls: list[str]) -> GapAnalysisResult:
        """
        Orchestrate full keyword gap analysis:
        1. Fetch organic keywords for target
        2. Fetch organic keywords for each competitor
        3. Identify gaps
        4. Score opportunities
        5. Compile results

        Args:
            target_url: target site URL or domain.
            competitor_urls: one or more competitor URLs/domains.

        Returns:
            A fully populated GapAnalysisResult (top_opportunities holds the
            50 highest-scored gaps; all_gaps holds everything).
        """
        target_domain = extract_domain(target_url)
        competitor_domains = [extract_domain(url) for url in competitor_urls]
        logger.info(
            f"Starting gap analysis: {target_domain} vs {', '.join(competitor_domains)}"
        )
        # Step 1: Fetch target keywords.
        target_keywords = self.get_organic_keywords(target_domain)
        # Step 2: Fetch competitor keywords (one MCP call per competitor).
        competitor_keyword_sets: dict[str, list[OrganicKeyword]] = {}
        competitor_keyword_counts: dict[str, int] = {}
        for comp_domain in competitor_domains:
            comp_keywords = self.get_organic_keywords(comp_domain)
            competitor_keyword_sets[comp_domain] = comp_keywords
            competitor_keyword_counts[comp_domain] = len(comp_keywords)
        # Step 3: Find gaps.
        gaps = self.find_gaps(target_keywords, competitor_keyword_sets)
        # Step 4: Score opportunities (sorted best-first).
        scored_gaps = self.score_opportunities(gaps)
        # Step 5: Calculate intent distribution across all gaps.
        gaps_by_intent: dict[str, int] = {}
        for gap in scored_gaps:
            gaps_by_intent[gap.intent] = gaps_by_intent.get(gap.intent, 0) + 1
        # Step 6: Compile result.
        result = GapAnalysisResult(
            target=target_domain,
            competitors=competitor_domains,
            country=self.country,
            total_gaps=len(scored_gaps),
            total_opportunity_volume=sum(g.volume for g in scored_gaps),
            gaps_by_intent=gaps_by_intent,
            top_opportunities=scored_gaps[:50],
            all_gaps=scored_gaps,
            target_keyword_count=len(target_keywords),
            competitor_keyword_counts=competitor_keyword_counts,
            timestamp=datetime.now().isoformat(),
        )
        logger.info(
            f"Gap analysis complete: {result.total_gaps} gaps found, "
            f"total opportunity volume {result.total_opportunity_volume:,}"
        )
        return result
# ---------------------------------------------------------------------------
# Plain-text report formatter
# ---------------------------------------------------------------------------
def format_text_report(result: GapAnalysisResult) -> str:
    """Format gap analysis result as a human-readable text report.

    Sections: header, overview counts, intent distribution, top-scored
    opportunities (up to 30 rows), and "quick wins" (KD <= 30 and
    volume >= 100, up to 20 rows).

    Args:
        result: a populated GapAnalysisResult.

    Returns:
        The report as a single newline-joined string.
    """
    lines: list[str] = []
    lines.append("=" * 75)
    # Fix: dropped an extraneous f-prefix on a placeholder-free string (F541).
    lines.append("Keyword Gap Analysis Report")
    lines.append(f"Target: {result.target}")
    lines.append(f"Competitors: {', '.join(result.competitors)}")
    lines.append(f"Country: {result.country.upper()} | Date: {result.timestamp[:10]}")
    lines.append("=" * 75)
    lines.append("")
    # Overview
    lines.append("## Overview")
    lines.append(f" Target keywords: {result.target_keyword_count:,}")
    for comp, count in result.competitor_keyword_counts.items():
        lines.append(f" {comp} keywords: {count:,}")
    lines.append(f" Keyword gaps found: {result.total_gaps:,}")
    lines.append(f" Total opportunity volume: {result.total_opportunity_volume:,}")
    lines.append("")
    # Intent distribution (most common intent first)
    if result.gaps_by_intent:
        lines.append("## Gaps by Intent")
        for intent, count in sorted(result.gaps_by_intent.items(), key=lambda x: x[1], reverse=True):
            pct = (count / result.total_gaps) * 100 if result.total_gaps else 0
            lines.append(f" {intent:<15}: {count:>5} ({pct:.1f}%)")
        lines.append("")
    # Top opportunities (already sorted by score descending)
    if result.top_opportunities:
        lines.append("## Top Opportunities (by score)")
        header = f" {'Keyword':<35} {'Vol':>8} {'KD':>6} {'Score':>7} {'Intent':<15} {'Competitors'}"
        lines.append(header)
        lines.append(" " + "-" * 90)
        for gap in result.top_opportunities[:30]:
            # Slicing already leaves shorter strings unchanged, so the
            # previous length checks were redundant.
            kw_display = gap.keyword[:33]
            comp_positions = ", ".join(
                f"{d}:#{p}" for d, p in gap.competitor_positions.items()
            )
            comp_display = comp_positions[:30]
            lines.append(
                f" {kw_display:<35} {gap.volume:>8,} {gap.kd:>6.1f} "
                f"{gap.opportunity_score:>7.1f} {gap.intent:<15} {comp_display}"
            )
        lines.append("")
    # Quick wins (low difficulty, meaningful volume), highest volume first
    quick_wins = [g for g in result.all_gaps if g.kd <= 30 and g.volume >= 100]
    quick_wins.sort(key=lambda g: g.volume, reverse=True)
    if quick_wins:
        lines.append("## Quick Wins (KD <= 30, Volume >= 100)")
        lines.append(f" {'Keyword':<35} {'Vol':>8} {'KD':>6} {'Intent':<15}")
        lines.append(" " + "-" * 64)
        for gap in quick_wins[:20]:
            kw_display = gap.keyword[:33]
            lines.append(
                f" {kw_display:<35} {gap.volume:>8,} {gap.kd:>6.1f} {gap.intent:<15}"
            )
        lines.append("")
    return "\n".join(lines)
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def _build_arg_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for the gap analyzer."""
    parser = argparse.ArgumentParser(
        description="Keyword Gap Analyzer - Find competitor keyword opportunities",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python keyword_gap_analyzer.py --target https://example.com --competitor https://comp.com --json
python keyword_gap_analyzer.py --target example.com --competitor comp1.com --competitor comp2.com --min-volume 100 --json
python keyword_gap_analyzer.py --target example.com --competitor comp.com --country us --output gaps.json
""",
    )
    parser.add_argument(
        "--target",
        required=True,
        help="Target website URL or domain",
    )
    parser.add_argument(
        "--competitor",
        action="append",
        required=True,
        dest="competitors",
        help="Competitor URL or domain (can be repeated)",
    )
    parser.add_argument(
        "--country",
        default="kr",
        help="Target country code (default: kr)",
    )
    parser.add_argument(
        "--min-volume",
        type=int,
        default=0,
        help="Minimum search volume filter (default: 0)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        dest="output_json",
        help="Output results as JSON",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help="Write output to file (path)",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose/debug logging",
    )
    return parser


def main():
    """CLI entry point: parse arguments, run the gap analysis, emit the report.

    Returns 0 on success (propagated to the process exit code).
    """
    args = _build_arg_parser().parse_args()
    if args.verbose:
        # Raise the root logger so logger.debug output becomes visible.
        logging.getLogger().setLevel(logging.DEBUG)

    # Run the analysis with CLI-supplied settings.
    analyzer = KeywordGapAnalyzer(
        country=args.country,
        min_volume=args.min_volume,
    )
    result = analyzer.analyze(args.target, args.competitors)

    # Render either machine-readable JSON or the plain-text report.
    output = (
        json.dumps(result.to_dict(), ensure_ascii=False, indent=2)
        if args.output_json
        else format_text_report(result)
    )

    # Deliver to the requested destination: file when --output given, stdout otherwise.
    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(output)
        logger.info(f"Output written to: {args.output}")
    else:
        print(output)
    return 0
# Script entry point: propagate main()'s return value as the exit code.
if __name__ == "__main__":
    sys.exit(main())