our-claude-skills/custom-skills/19-seo-keyword-strategy/code/scripts/keyword_researcher.py

"""
Keyword Researcher - Seed keyword expansion, intent classification, and topic clustering
========================================================================================
Purpose: Expand seed keywords via Ahrefs APIs, classify search intent,
         cluster topics, and support Korean market keyword discovery.
Python: 3.10+
"""

import argparse
import json
import logging
import re
import subprocess
import sys
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Optional

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger("keyword_researcher")

# ---------------------------------------------------------------------------
# Constants - Korean suffix expansion
# ---------------------------------------------------------------------------
KOREAN_SUFFIXES: list[str] = [
    "추천",
    "가격",
    "후기",
    "잘하는곳",
    "부작용",
    "전후",
    "비용",
    "추천 병원",
    "후기 블로그",
    "방법",
    "종류",
    "비교",
    "효과",
    "주의사항",
    "장단점",
]

# ---------------------------------------------------------------------------
# Intent classification patterns
# ---------------------------------------------------------------------------
INTENT_PATTERNS: dict[str, list[str]] = {
    "transactional": [
        r"구매|구입|주문|buy|order|purchase|shop|deal|discount|coupon|할인|쿠폰",
        r"예약|booking|reserve|sign\s?up|register|등록|신청",
    ],
    "commercial": [
        r"가격|비용|얼마|price|cost|pricing|fee|요금",
        r"추천|best|top\s?\d|review|비교|compare|vs|versus|후기|리뷰|평점|평가",
        r"잘하는곳|잘하는|맛집|업체|병원|추천\s?병원",
    ],
    "navigational": [
        r"^(www\.|http|\.com|\.co\.kr|\.net)",
        r"공식|official|login|로그인|홈페이지|사이트|website",
        r"고객센터|contact|support|customer\s?service",
    ],
    "informational": [
        r"방법|how\s?to|what\s?is|why|when|where|who|which",
        r"뜻|의미|정의|definition|meaning|guide|tutorial",
        r"효과|부작용|증상|원인|차이|종류|type|cause|symptom|effect",
        r"전후|before\s?and\s?after|결과|result",
    ],
}

# ---------------------------------------------------------------------------
# Dataclasses
# ---------------------------------------------------------------------------


@dataclass
class KeywordEntry:
    """Single keyword with its metrics and classification."""

    keyword: str
    volume: int = 0
    kd: float = 0.0
    cpc: float = 0.0
    intent: str = "informational"
    cluster: str = ""
    source: str = ""
    country_volumes: dict[str, int] = field(default_factory=dict)

    def to_dict(self) -> dict:
        data = asdict(self)
        if not data["country_volumes"]:
            del data["country_volumes"]
        return data


@dataclass
class KeywordCluster:
    """Group of semantically related keywords."""

    topic: str
    keywords: list[str] = field(default_factory=list)
    total_volume: int = 0
    avg_kd: float = 0.0
    primary_intent: str = "informational"

    def to_dict(self) -> dict:
        return asdict(self)


@dataclass
class ResearchResult:
    """Full research result container."""

    seed_keyword: str
    country: str
    total_keywords: int = 0
    total_volume: int = 0
    clusters: list[KeywordCluster] = field(default_factory=list)
    keywords: list[KeywordEntry] = field(default_factory=list)
    timestamp: str = ""

    def to_dict(self) -> dict:
        return {
            "seed_keyword": self.seed_keyword,
            "country": self.country,
            "total_keywords": self.total_keywords,
            "total_volume": self.total_volume,
            "clusters": [c.to_dict() for c in self.clusters],
            "keywords": [k.to_dict() for k in self.keywords],
            "timestamp": self.timestamp,
        }


# ---------------------------------------------------------------------------
# MCP Helper - calls Ahrefs MCP tools via subprocess
# ---------------------------------------------------------------------------


def call_mcp_tool(tool_name: str, params: dict) -> dict:
    """
    Call an Ahrefs MCP tool and return parsed JSON response.

    In production this delegates to the MCP bridge. For standalone usage
    it invokes the Claude CLI with the appropriate tool call.
    """
    logger.info(f"Calling MCP tool: {tool_name} with params: {json.dumps(params, ensure_ascii=False)}")

    try:
        cmd = [
            "claude",
            "--print",
            "--output-format", "json",
            "-p",
            f"Call the tool mcp__claude_ai_Ahrefs__{tool_name} with these parameters: {json.dumps(params, ensure_ascii=False)}. Return ONLY the raw JSON result.",
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)

        if result.returncode != 0:
            logger.warning(f"MCP tool {tool_name} returned non-zero exit code: {result.returncode}")
            logger.debug(f"stderr: {result.stderr}")
            return {"error": result.stderr, "keywords": [], "items": []}

        try:
            return json.loads(result.stdout)
        except json.JSONDecodeError:
            return {"raw": result.stdout, "keywords": [], "items": []}

    except subprocess.TimeoutExpired:
        logger.error(f"MCP tool {tool_name} timed out")
        return {"error": "timeout", "keywords": [], "items": []}
    except FileNotFoundError:
        logger.warning("Claude CLI not found - returning empty result for standalone testing")
        return {"keywords": [], "items": []}


# ---------------------------------------------------------------------------
# KeywordResearcher
# ---------------------------------------------------------------------------


class KeywordResearcher:
    """Expand seed keywords, classify intent, and cluster topics."""

    def __init__(self, country: str = "kr", korean_suffixes: bool = False, compare_global: bool = False):
        self.country = country
        self.korean_suffixes = korean_suffixes
        self.compare_global = compare_global
        self._seen: set[str] = set()

    # ---- Keyword expansion via Ahrefs MCP ----

    def expand_keywords(self, seed: str) -> list[KeywordEntry]:
        """
        Expand a seed keyword using Ahrefs matching-terms, related-terms,
        and search-suggestions endpoints.
        """
        all_keywords: list[KeywordEntry] = []

        # 1. Matching terms
        logger.info(f"Fetching matching terms for: {seed}")
        matching = call_mcp_tool("keywords-explorer-matching-terms", {
            "keyword": seed,
            "country": self.country,
            "limit": 100,
        })
        for item in matching.get("keywords", matching.get("items", [])):
            kw = self._parse_keyword_item(item, source="matching-terms")
            if kw and kw.keyword not in self._seen:
                self._seen.add(kw.keyword)
                all_keywords.append(kw)

        # 2. Related terms
        logger.info(f"Fetching related terms for: {seed}")
        related = call_mcp_tool("keywords-explorer-related-terms", {
            "keyword": seed,
            "country": self.country,
            "limit": 100,
        })
        for item in related.get("keywords", related.get("items", [])):
            kw = self._parse_keyword_item(item, source="related-terms")
            if kw and kw.keyword not in self._seen:
                self._seen.add(kw.keyword)
                all_keywords.append(kw)

        # 3. Search suggestions
        logger.info(f"Fetching search suggestions for: {seed}")
        suggestions = call_mcp_tool("keywords-explorer-search-suggestions", {
            "keyword": seed,
            "country": self.country,
            "limit": 50,
        })
        for item in suggestions.get("keywords", suggestions.get("items", [])):
            kw = self._parse_keyword_item(item, source="search-suggestions")
            if kw and kw.keyword not in self._seen:
                self._seen.add(kw.keyword)
                all_keywords.append(kw)

        # 4. Add the seed itself if not already present
        if seed not in self._seen:
            self._seen.add(seed)
            overview = call_mcp_tool("keywords-explorer-overview", {
                "keyword": seed,
                "country": self.country,
            })
            seed_entry = self._parse_keyword_item(overview, source="seed")
            if seed_entry:
                seed_entry.keyword = seed
                all_keywords.insert(0, seed_entry)

        logger.info(f"Expanded to {len(all_keywords)} keywords from Ahrefs APIs")
        return all_keywords

    def expand_korean_suffixes(self, seed: str) -> list[KeywordEntry]:
        """
        Generate keyword variations by appending common Korean suffixes.
        Each variation is checked against Ahrefs for volume data.
        """
        suffix_keywords: list[KeywordEntry] = []

        for suffix in KOREAN_SUFFIXES:
            variation = f"{seed} {suffix}"
            if variation in self._seen:
                continue

            logger.info(f"Checking Korean suffix variation: {variation}")
            overview = call_mcp_tool("keywords-explorer-overview", {
                "keyword": variation,
                "country": self.country,
            })
            kw = self._parse_keyword_item(overview, source="korean-suffix")
            if kw:
                kw.keyword = variation
                if kw.volume > 0:
                    self._seen.add(variation)
                    suffix_keywords.append(kw)
            else:
                # Even if no data, include as zero-volume for completeness
                entry = KeywordEntry(
                    keyword=variation,
                    volume=0,
                    kd=0.0,
                    cpc=0.0,
                    intent=self.classify_intent(variation),
                    source="korean-suffix",
                )
                self._seen.add(variation)
                suffix_keywords.append(entry)

        logger.info(f"Korean suffix expansion yielded {len(suffix_keywords)} variations")
        return suffix_keywords

    def get_volume_by_country(self, keyword: str) -> dict[str, int]:
        """
        Get search volume breakdown by country for a keyword.
        Useful for comparing Korean vs global demand.
        """
        logger.info(f"Fetching volume-by-country for: {keyword}")
        result = call_mcp_tool("keywords-explorer-volume-by-country", {
            "keyword": keyword,
        })

        volumes: dict[str, int] = {}
        for item in result.get("countries", result.get("items", [])):
            if isinstance(item, dict):
                country_code = item.get("country", item.get("code", ""))
                volume = item.get("volume", item.get("search_volume", 0))
                if country_code and volume:
                    volumes[country_code.lower()] = int(volume)

        return volumes

    # ---- Intent classification ----

    def classify_intent(self, keyword: str) -> str:
        """
        Classify search intent based on keyword patterns.
        Priority: transactional > commercial > navigational > informational
        """
        keyword_lower = keyword.lower().strip()

        for intent, patterns in INTENT_PATTERNS.items():
            for pattern in patterns:
                if re.search(pattern, keyword_lower, re.IGNORECASE):
                    return intent

        return "informational"

    # ---- Keyword clustering ----

    def cluster_keywords(self, keywords: list[KeywordEntry]) -> list[KeywordCluster]:
        """
        Group keywords into topic clusters using shared n-gram tokens.
        Uses a simple token overlap approach: keywords sharing significant
        tokens (2+ character words) are grouped together.
        """
        if not keywords:
            return []

        # Extract meaningful tokens from each keyword
        def tokenize(text: str) -> set[str]:
            tokens = set()
            for word in re.split(r"\s+", text.strip().lower()):
                if len(word) >= 2:
                    tokens.add(word)
            return tokens

        # Build token-to-keyword mapping
        token_map: dict[str, list[int]] = {}
        kw_tokens: list[set[str]] = []

        for i, kw in enumerate(keywords):
            tokens = tokenize(kw.keyword)
            kw_tokens.append(tokens)
            for token in tokens:
                if token not in token_map:
                    token_map[token] = []
                token_map[token].append(i)

        # Find the most common significant tokens (cluster anchors)
        token_freq = sorted(token_map.items(), key=lambda x: len(x[1]), reverse=True)

        assigned: set[int] = set()
        clusters: list[KeywordCluster] = []

        for token, indices in token_freq:
            # Skip single-occurrence tokens or very common stop-like tokens
            if len(indices) < 2:
                continue

            # Gather unassigned keywords that share this token
            cluster_indices = [i for i in indices if i not in assigned]
            if len(cluster_indices) < 2:
                continue

            # Create the cluster
            cluster_kws = [keywords[i].keyword for i in cluster_indices]
            cluster_volumes = [keywords[i].volume for i in cluster_indices]
            cluster_kds = [keywords[i].kd for i in cluster_indices]
            cluster_intents = [keywords[i].intent for i in cluster_indices]

            # Determine primary intent by frequency
            intent_counts: dict[str, int] = {}
            for intent in cluster_intents:
                intent_counts[intent] = intent_counts.get(intent, 0) + 1
            primary_intent = max(intent_counts, key=intent_counts.get)

            cluster = KeywordCluster(
                topic=token,
                keywords=cluster_kws,
                total_volume=sum(cluster_volumes),
                avg_kd=round(sum(cluster_kds) / len(cluster_kds), 1) if cluster_kds else 0.0,
                primary_intent=primary_intent,
            )
            clusters.append(cluster)

            for i in cluster_indices:
                assigned.add(i)
                keywords[i].cluster = token

        # Assign unclustered keywords to an "other" cluster
        unclustered = [i for i in range(len(keywords)) if i not in assigned]
        if unclustered:
            other_kws = [keywords[i].keyword for i in unclustered]
            other_volumes = [keywords[i].volume for i in unclustered]
            other_kds = [keywords[i].kd for i in unclustered]

            other_cluster = KeywordCluster(
                topic="(unclustered)",
                keywords=other_kws,
                total_volume=sum(other_volumes),
                avg_kd=round(sum(other_kds) / len(other_kds), 1) if other_kds else 0.0,
                primary_intent="informational",
            )
            clusters.append(other_cluster)

            for i in unclustered:
                keywords[i].cluster = "(unclustered)"

        # Sort clusters by total volume descending
        clusters.sort(key=lambda c: c.total_volume, reverse=True)

        logger.info(f"Clustered {len(keywords)} keywords into {len(clusters)} clusters")
        return clusters

    # ---- Full analysis orchestration ----

    def analyze(self, seed_keyword: str) -> ResearchResult:
        """
        Orchestrate a full keyword research analysis:
        1. Expand seed via Ahrefs
        2. Optionally expand Korean suffixes
        3. Classify intent for all keywords
        4. Optionally fetch volume-by-country
        5. Cluster keywords into topics
        6. Compile results
        """
        logger.info(f"Starting keyword research for: {seed_keyword} (country={self.country})")

        # Step 1: Expand keywords
        keywords = self.expand_keywords(seed_keyword)

        # Step 2: Korean suffix expansion
        if self.korean_suffixes:
            suffix_keywords = self.expand_korean_suffixes(seed_keyword)
            keywords.extend(suffix_keywords)

        # Step 3: Classify intent for all keywords
        for kw in keywords:
            if not kw.intent or kw.intent == "informational":
                kw.intent = self.classify_intent(kw.keyword)

        # Step 4: Volume-by-country comparison
        if self.compare_global and keywords:
            # Fetch for the seed and top volume keywords
            top_keywords = sorted(keywords, key=lambda k: k.volume, reverse=True)[:10]
            for kw in top_keywords:
                volumes = self.get_volume_by_country(kw.keyword)
                kw.country_volumes = volumes

        # Step 5: Cluster keywords
        clusters = self.cluster_keywords(keywords)

        # Step 6: Compile result
        result = ResearchResult(
            seed_keyword=seed_keyword,
            country=self.country,
            total_keywords=len(keywords),
            total_volume=sum(kw.volume for kw in keywords),
            clusters=clusters,
            keywords=sorted(keywords, key=lambda k: k.volume, reverse=True),
            timestamp=datetime.now().isoformat(),
        )

        logger.info(
            f"Research complete: {result.total_keywords} keywords, "
            f"{len(result.clusters)} clusters, "
            f"total volume {result.total_volume}"
        )
        return result

    # ---- Internal helpers ----

    def _parse_keyword_item(self, item: dict, source: str = "") -> Optional[KeywordEntry]:
        """Parse an Ahrefs API response item into a KeywordEntry."""
        if not item or "error" in item:
            return None

        keyword = item.get("keyword", item.get("term", item.get("query", "")))
        if not keyword:
            return None

        volume = int(item.get("volume", item.get("search_volume", 0)) or 0)
        kd = float(item.get("keyword_difficulty", item.get("kd", 0)) or 0)
        cpc = float(item.get("cpc", item.get("cost_per_click", 0)) or 0)

        return KeywordEntry(
            keyword=keyword,
            volume=volume,
            kd=round(kd, 1),
            cpc=round(cpc, 2),
            intent=self.classify_intent(keyword),
            source=source,
        )


# ---------------------------------------------------------------------------
# Plain-text report formatter
# ---------------------------------------------------------------------------


def format_text_report(result: ResearchResult) -> str:
    """Format research result as a human-readable text report."""
    lines: list[str] = []
    lines.append("=" * 70)
    lines.append(f"Keyword Strategy Report: {result.seed_keyword}")
    lines.append(f"Country: {result.country.upper()} | Date: {result.timestamp[:10]}")
    lines.append("=" * 70)
    lines.append("")

    lines.append("## Overview")
    lines.append(f"  Total keywords discovered: {result.total_keywords}")
    lines.append(f"  Topic clusters: {len(result.clusters)}")
    lines.append(f"  Total search volume: {result.total_volume:,}")
    lines.append("")

    # Clusters summary
    if result.clusters:
        lines.append("## Top Clusters")
        lines.append(f"  {'Cluster':<25} {'Keywords':>8} {'Volume':>10} {'Avg KD':>8} {'Intent':<15}")
        lines.append("  " + "-" * 66)
        for cluster in result.clusters[:15]:
            lines.append(
                f"  {cluster.topic:<25} {len(cluster.keywords):>8} "
                f"{cluster.total_volume:>10,} {cluster.avg_kd:>8.1f} "
                f"{cluster.primary_intent:<15}"
            )
        lines.append("")

    # Top keywords
    if result.keywords:
        lines.append("## Top Keywords (by volume)")
        lines.append(f"  {'Keyword':<40} {'Vol':>8} {'KD':>6} {'CPC':>7} {'Intent':<15} {'Cluster':<15}")
        lines.append("  " + "-" * 91)
        for kw in result.keywords[:30]:
            kw_display = kw.keyword[:38] if len(kw.keyword) > 38 else kw.keyword
            cluster_display = kw.cluster[:13] if len(kw.cluster) > 13 else kw.cluster
            lines.append(
                f"  {kw_display:<40} {kw.volume:>8,} {kw.kd:>6.1f} "
                f"{kw.cpc:>7.2f} {kw.intent:<15} {cluster_display:<15}"
            )
        lines.append("")

    # Intent distribution
    intent_dist: dict[str, int] = {}
    for kw in result.keywords:
        intent_dist[kw.intent] = intent_dist.get(kw.intent, 0) + 1
    if intent_dist:
        lines.append("## Intent Distribution")
        for intent, count in sorted(intent_dist.items(), key=lambda x: x[1], reverse=True):
            pct = (count / len(result.keywords)) * 100 if result.keywords else 0
            lines.append(f"  {intent:<15}: {count:>5} ({pct:.1f}%)")
        lines.append("")

    return "\n".join(lines)


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def main():
    parser = argparse.ArgumentParser(
        description="Keyword Researcher - Expand, classify, and cluster keywords",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python keyword_researcher.py --keyword "치과 임플란트" --country kr --json
  python keyword_researcher.py --keyword "dental implant" --compare-global --json
  python keyword_researcher.py --keyword "치과 임플란트" --korean-suffixes --output report.json
        """,
    )
    parser.add_argument(
        "--keyword",
        required=True,
        help="Seed keyword to expand and research",
    )
    parser.add_argument(
        "--country",
        default="kr",
        help="Target country code (default: kr)",
    )
    parser.add_argument(
        "--korean-suffixes",
        action="store_true",
        help="Enable Korean suffix expansion (추천, 가격, 후기, etc.)",
    )
    parser.add_argument(
        "--compare-global",
        action="store_true",
        help="Fetch volume-by-country comparison for top keywords",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        dest="output_json",
        help="Output results as JSON",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help="Write output to file (path)",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose/debug logging",
    )

    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Run analysis
    researcher = KeywordResearcher(
        country=args.country,
        korean_suffixes=args.korean_suffixes,
        compare_global=args.compare_global,
    )
    result = researcher.analyze(args.keyword)

    # Format output
    if args.output_json:
        output = json.dumps(result.to_dict(), ensure_ascii=False, indent=2)
    else:
        output = format_text_report(result)

    # Write or print
    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(output)
        logger.info(f"Output written to: {args.output}")
    else:
        print(output)

    return 0


if __name__ == "__main__":
    sys.exit(main())