Add SEO skills 19-28, 31-32 with full Python implementations
12 new skills: Keyword Strategy, SERP Analysis, Position Tracking, Link Building, Content Strategy, E-Commerce SEO, KPI Framework, International SEO, AI Visibility, Knowledge Graph, Competitor Intel, and Crawl Budget. ~20K lines of Python across 25 domain scripts. Updated skill 11 pipeline table and repo CLAUDE.md. Enhanced skill 18 local SEO workflow from jamie.clinic audit. Note: Skill 26 hreflang_validator.py pending (content filter block). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,207 @@
|
||||
"""
|
||||
Base Client - Shared async client utilities
|
||||
===========================================
|
||||
Purpose: Rate-limited async operations for API clients
|
||||
Python: 3.10+
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
from asyncio import Semaphore
|
||||
from datetime import datetime
|
||||
from typing import Any, Callable, TypeVar
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from tenacity import (
|
||||
retry,
|
||||
stop_after_attempt,
|
||||
wait_exponential,
|
||||
retry_if_exception_type,
|
||||
)
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv()
|
||||
|
||||
# Logging setup
|
||||
# Configures the root logger when this module is imported.
# NOTE(review): this is a shared client module; importing it changes the
# process-wide logging configuration if no handlers are installed yet —
# confirm that is intended for library use.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
|
||||
|
||||
# Generic type variable. NOTE(review): not referenced in the visible part of this module.
T = TypeVar("T")
|
||||
|
||||
|
||||
class RateLimiter:
    """Rate limiter using token bucket algorithm.

    The bucket holds at most ``rate`` tokens and refills continuously at
    ``rate / per`` tokens per second. Each :meth:`acquire` spends one token,
    sleeping first when less than one whole token is available.
    """

    def __init__(self, rate: float, per: float = 1.0):
        """
        Initialize rate limiter.

        Args:
            rate: Number of requests allowed per period (also the bucket capacity)
            per: Time period in seconds (default: 1 second)
        """
        self.rate = rate
        self.per = per
        self.tokens = rate  # start with a full bucket
        self.last_update = datetime.now()
        self._lock = asyncio.Lock()

    async def acquire(self) -> None:
        """Acquire a token, waiting if necessary.

        The lock stays held while sleeping, so concurrent callers are served
        one at a time.
        """
        async with self._lock:
            now = datetime.now()
            # Clamp at zero so a backwards wall-clock step cannot drain the bucket.
            elapsed = max(0.0, (now - self.last_update).total_seconds())
            self.tokens = min(self.rate, self.tokens + elapsed * (self.rate / self.per))
            self.last_update = now

            if self.tokens < 1:
                # Sleep exactly long enough for the missing fraction of a token to refill.
                wait_time = (1 - self.tokens) * (self.per / self.rate)
                await asyncio.sleep(wait_time)
                # The token earned during the sleep is spent by this caller, so
                # the refill clock must restart *after* the sleep. Leaving the
                # timestamp at the pre-sleep value would credit the same
                # interval again on the next call and exceed the limit.
                self.last_update = datetime.now()
                self.tokens = 0
            else:
                self.tokens -= 1
|
||||
|
||||
|
||||
def _record_retry(retry_state: Any) -> None:
    """Tenacity ``before_sleep`` hook that counts a retry on the owning client.

    For a decorated method, the first positional argument of the wrapped call
    is the client instance, whose ``stats`` dict is updated here.
    """
    call_args = getattr(retry_state, "args", ())
    stats = getattr(call_args[0], "stats", None) if call_args else None
    if isinstance(stats, dict):
        stats["retries"] = stats.get("retries", 0) + 1


class BaseAsyncClient:
    """Base class for async API clients with rate limiting."""

    def __init__(
        self,
        max_concurrent: int = 5,
        requests_per_second: float = 3.0,
        logger: logging.Logger | None = None,
    ):
        """
        Initialize base client.

        Args:
            max_concurrent: Maximum concurrent requests
            requests_per_second: Rate limit
            logger: Logger instance (defaults to a logger named after the class)
        """
        self.semaphore = Semaphore(max_concurrent)
        self.rate_limiter = RateLimiter(requests_per_second)
        self.logger = logger or logging.getLogger(self.__class__.__name__)
        self.stats = {
            "requests": 0,
            "success": 0,
            "errors": 0,
            "retries": 0,
        }

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        retry=retry_if_exception_type(Exception),
        before_sleep=_record_retry,
    )
    async def _rate_limited_request(
        self,
        coro: Callable[[], Any],
    ) -> Any:
        """Execute a request with rate limiting and retry.

        Args:
            coro: Zero-argument callable returning an awaitable. It is called
                once per attempt, so every retry awaits a fresh awaitable.

        Returns:
            Whatever the awaited call returns.

        Raises:
            Exception: Any failure of the awaited call is logged, counted in
                ``stats["errors"]`` and re-raised so the retry policy sees it.
                NOTE(review): ``reraise=True`` is not set, so after the final
                attempt tenacity presumably raises its own ``RetryError``
                rather than the original exception — confirm this is intended.
        """
        # The decorator wraps the whole body, so each attempt re-acquires both
        # the concurrency slot and a rate-limit token.
        async with self.semaphore:
            await self.rate_limiter.acquire()
            self.stats["requests"] += 1
            try:
                result = await coro()
                self.stats["success"] += 1
                return result
            except Exception as e:
                self.stats["errors"] += 1
                self.logger.error(f"Request failed: {e}")
                raise

    async def batch_requests(
        self,
        requests: list[Callable[[], Any]],
        desc: str = "Processing",
    ) -> list[Any]:
        """Execute multiple requests concurrently.

        Args:
            requests: Zero-argument callables, each returning an awaitable.
            desc: Progress-bar label (used only when tqdm is installed).

        Returns:
            One entry per request, in the same order as ``requests``. A failed
            request yields ``{"error": "<message>"}`` instead of raising.
        """
        try:
            from tqdm.asyncio import tqdm
        except ImportError:
            tqdm = None

        async def execute(req: Callable[[], Any]) -> Any:
            try:
                return await self._rate_limited_request(req)
            except Exception as e:
                return {"error": str(e)}

        if tqdm is None:
            return await asyncio.gather(
                *(execute(req) for req in requests),
                return_exceptions=True,
            )

        async def execute_indexed(index: int, req: Callable[[], Any]) -> tuple[int, Any]:
            return index, await execute(req)

        # as_completed yields in completion order; carry the original index so
        # results line up with `requests` exactly as in the gather() path.
        tasks = [execute_indexed(i, req) for i, req in enumerate(requests)]
        results: list[Any] = [None] * len(tasks)
        for finished in tqdm.as_completed(tasks, total=len(tasks), desc=desc):
            index, outcome = await finished
            results[index] = outcome
        return results

    def print_stats(self) -> None:
        """Print request statistics."""
        self.logger.info("=" * 40)
        self.logger.info("Request Statistics:")
        self.logger.info(f" Total Requests: {self.stats['requests']}")
        self.logger.info(f" Successful: {self.stats['success']}")
        self.logger.info(f" Errors: {self.stats['errors']}")
        self.logger.info(f" Retries: {self.stats['retries']}")
        self.logger.info("=" * 40)
|
||||
|
||||
|
||||
class ConfigManager:
|
||||
"""Manage API configuration and credentials."""
|
||||
|
||||
def __init__(self):
|
||||
load_dotenv()
|
||||
|
||||
@property
|
||||
def google_credentials_path(self) -> str | None:
|
||||
"""Get Google service account credentials path."""
|
||||
# Prefer SEO-specific credentials, fallback to general credentials
|
||||
seo_creds = os.path.expanduser("~/.credential/ourdigital-seo-agent.json")
|
||||
if os.path.exists(seo_creds):
|
||||
return seo_creds
|
||||
return os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
|
||||
|
||||
@property
|
||||
def pagespeed_api_key(self) -> str | None:
|
||||
"""Get PageSpeed Insights API key."""
|
||||
return os.getenv("PAGESPEED_API_KEY")
|
||||
|
||||
@property
|
||||
def custom_search_api_key(self) -> str | None:
|
||||
"""Get Custom Search API key."""
|
||||
return os.getenv("CUSTOM_SEARCH_API_KEY")
|
||||
|
||||
@property
|
||||
def custom_search_engine_id(self) -> str | None:
|
||||
"""Get Custom Search Engine ID."""
|
||||
return os.getenv("CUSTOM_SEARCH_ENGINE_ID")
|
||||
|
||||
@property
|
||||
def notion_token(self) -> str | None:
|
||||
"""Get Notion API token."""
|
||||
return os.getenv("NOTION_TOKEN") or os.getenv("NOTION_API_KEY")
|
||||
|
||||
def validate_google_credentials(self) -> bool:
|
||||
"""Validate Google credentials are configured."""
|
||||
creds_path = self.google_credentials_path
|
||||
if not creds_path:
|
||||
return False
|
||||
return os.path.exists(creds_path)
|
||||
|
||||
def get_required(self, key: str) -> str:
|
||||
"""Get required environment variable or raise error."""
|
||||
value = os.getenv(key)
|
||||
if not value:
|
||||
raise ValueError(f"Missing required environment variable: {key}")
|
||||
return value
|
||||
|
||||
|
||||
# Singleton config instance
|
||||
# Shared module-level instance; constructing it calls load_dotenv() at import time.
config = ConfigManager()
|
||||
@@ -0,0 +1,584 @@
|
||||
"""
|
||||
Keyword Gap Analyzer - Competitor keyword gap analysis with opportunity scoring
|
||||
===============================================================================
|
||||
Purpose: Identify keywords competitors rank for but target site doesn't,
|
||||
score opportunities, and prioritize by volume/difficulty ratio.
|
||||
Python: 3.10+
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Logging
|
||||
# ---------------------------------------------------------------------------
|
||||
# Configure the root logger at import time; main() raises the root level to
# DEBUG when --verbose is passed.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
# Module logger used by every function in this script.
logger = logging.getLogger("keyword_gap_analyzer")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Intent classification patterns (shared with keyword_researcher)
|
||||
# ---------------------------------------------------------------------------
|
||||
# Maps an intent label to regex alternations (Korean and English terms).
# classify_intent() walks this dict in insertion order and returns the first
# intent with any matching pattern, so the key order is the priority order.
# NOTE(review): the English alternatives are unanchored substrings, so e.g.
# `fee` also matches "coffee" and `vs` matches "canvas" — confirm whether
# word boundaries are wanted.
INTENT_PATTERNS: dict[str, list[str]] = {
    "transactional": [
        r"구매|구입|주문|buy|order|purchase|shop|deal|discount|coupon|할인|쿠폰",
        r"예약|booking|reserve|sign\s?up|register|등록|신청",
    ],
    "commercial": [
        r"가격|비용|얼마|price|cost|pricing|fee|요금",
        r"추천|best|top\s?\d|review|비교|compare|vs|versus|후기|리뷰|평점|평가",
        r"잘하는곳|잘하는|맛집|업체|병원|추천\s?병원",
    ],
    "navigational": [
        # NOTE(review): `^` applies to the whole group, so `\.com`, `\.co\.kr`
        # and `\.net` only match at the very start of the keyword — confirm
        # whether a domain suffix anywhere in the keyword was intended.
        r"^(www\.|http|\.com|\.co\.kr|\.net)",
        r"공식|official|login|로그인|홈페이지|사이트|website",
        r"고객센터|contact|support|customer\s?service",
    ],
    "informational": [
        r"방법|how\s?to|what\s?is|why|when|where|who|which",
        r"뜻|의미|정의|definition|meaning|guide|tutorial",
        r"효과|부작용|증상|원인|차이|종류|type|cause|symptom|effect",
        r"전후|before\s?and\s?after|결과|result",
    ],
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Dataclasses
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
class OrganicKeyword:
    """A keyword that a domain ranks for organically.

    Instances are built by KeywordGapAnalyzer.get_organic_keywords(), which
    substitutes 0 for any numeric metric missing from the API item.
    """

    keyword: str
    position: int = 0  # ranking position; 0 when the API item carries none
    volume: int = 0  # search volume
    kd: float = 0.0  # keyword difficulty
    cpc: float = 0.0  # cost per click
    url: str = ""  # ranking URL reported for the keyword
    traffic: int = 0  # estimated traffic
|
||||
|
||||
|
||||
@dataclass
class GapKeyword:
    """A keyword gap between target and competitor(s)."""

    keyword: str
    volume: int = 0
    kd: float = 0.0
    cpc: float = 0.0
    intent: str = "informational"  # one of the INTENT_PATTERNS labels
    opportunity_score: float = 0.0  # filled in by score_opportunities()
    # Both dicts are keyed by competitor domain.
    competitor_positions: dict[str, int] = field(default_factory=dict)
    competitor_urls: dict[str, str] = field(default_factory=dict)
    avg_competitor_position: float = 0.0  # filled in by find_gaps()

    def to_dict(self) -> dict:
        """Return a plain-dict copy of every field (via dataclasses.asdict)."""
        return asdict(self)
|
||||
|
||||
|
||||
@dataclass
class GapAnalysisResult:
    """Complete gap analysis result."""

    target: str
    competitors: list[str] = field(default_factory=list)
    country: str = "kr"
    total_gaps: int = 0
    total_opportunity_volume: int = 0
    gaps_by_intent: dict[str, int] = field(default_factory=dict)
    top_opportunities: list[GapKeyword] = field(default_factory=list)
    all_gaps: list[GapKeyword] = field(default_factory=list)
    target_keyword_count: int = 0
    competitor_keyword_counts: dict[str, int] = field(default_factory=dict)
    timestamp: str = ""

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict; gap entries are expanded via to_dict()."""
        top_rows = [entry.to_dict() for entry in self.top_opportunities]
        all_rows = [entry.to_dict() for entry in self.all_gaps]
        # dict() keeps keyword order, so keys appear in field order.
        return dict(
            target=self.target,
            competitors=self.competitors,
            country=self.country,
            total_gaps=self.total_gaps,
            total_opportunity_volume=self.total_opportunity_volume,
            gaps_by_intent=self.gaps_by_intent,
            top_opportunities=top_rows,
            all_gaps=all_rows,
            target_keyword_count=self.target_keyword_count,
            competitor_keyword_counts=self.competitor_keyword_counts,
            timestamp=self.timestamp,
        )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# MCP Helper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _coerce_mcp_payload(payload: object) -> dict:
    """Normalize decoded CLI output to the dict shape callers read with ``.get``."""
    if isinstance(payload, list):
        # A bare JSON array is taken to be the item list itself.
        return {"items": payload}
    if not isinstance(payload, dict):
        return {"raw": payload, "keywords": [], "items": []}
    if "keywords" in payload or "items" in payload:
        return payload
    # NOTE(review): the CLI's JSON output format is understood to wrap the
    # reply text in an envelope with a string "result" field — confirm against
    # the CLI version in use. Unwrap only when that string is itself JSON.
    inner = payload.get("result")
    if isinstance(inner, str):
        try:
            decoded = json.loads(inner)
        except json.JSONDecodeError:
            return payload
        if isinstance(decoded, (dict, list)):
            return _coerce_mcp_payload(decoded)
    return payload


def call_mcp_tool(tool_name: str, params: dict) -> dict:
    """
    Call an Ahrefs MCP tool and return parsed JSON response.

    In production this delegates to the MCP bridge. For standalone usage
    it invokes the Claude CLI with the appropriate tool call.

    Args:
        tool_name: Tool name without the ``mcp__claude_ai_Ahrefs__`` prefix.
        params: Tool parameters; serialized to JSON inside the prompt.

    Returns:
        Always a dict. Failures are reported in-band rather than raised:
        non-zero exit -> ``{"error": stderr, ...}``, timeout ->
        ``{"error": "timeout", ...}``, undecodable stdout -> ``{"raw": stdout, ...}``,
        missing CLI -> empty result. Each of those carries empty ``keywords``
        and ``items`` lists.
    """
    logger.info(f"Calling MCP tool: {tool_name} with params: {json.dumps(params, ensure_ascii=False)}")

    cmd = [
        "claude",
        "--print",
        "--output-format", "json",
        "-p",
        (
            f"Call the tool mcp__claude_ai_Ahrefs__{tool_name} with these parameters: "
            f"{json.dumps(params, ensure_ascii=False)}. Return ONLY the raw JSON result."
        ),
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    except subprocess.TimeoutExpired:
        logger.error(f"MCP tool {tool_name} timed out")
        return {"error": "timeout", "keywords": [], "items": []}
    except FileNotFoundError:
        logger.warning("Claude CLI not found - returning empty result for standalone testing")
        return {"keywords": [], "items": []}

    if result.returncode != 0:
        logger.warning(f"MCP tool {tool_name} returned non-zero exit code: {result.returncode}")
        logger.debug(f"stderr: {result.stderr}")
        return {"error": result.stderr, "keywords": [], "items": []}

    try:
        payload = json.loads(result.stdout)
    except json.JSONDecodeError:
        return {"raw": result.stdout, "keywords": [], "items": []}

    # json.loads may legally yield a list/str/number; callers rely on dict.get.
    return _coerce_mcp_payload(payload)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Utility functions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def extract_domain(url: str) -> str:
    """Extract clean domain from URL.

    Accepts a full URL or a bare domain. The result is lower-cased and has
    no scheme, credentials, port, path or leading ``www.``.

    Args:
        url: URL or domain, e.g. ``"https://www.Example.com/page"`` or ``"example.com"``.

    Returns:
        The bare host name, e.g. ``"example.com"``; ``""`` for empty input.
    """
    candidate = url.strip()
    # Compare case-insensitively: "HTTP://..." must not get a second scheme prepended.
    if not candidate.lower().startswith(("http://", "https://")):
        candidate = f"https://{candidate}"
    parsed = urlparse(candidate)
    # hostname drops "user:pass@" and ":port"; fall back for inputs with no host part.
    domain = parsed.hostname or parsed.netloc or parsed.path
    domain = domain.lower().strip("/")
    if domain.startswith("www."):
        domain = domain[4:]
    return domain
|
||||
|
||||
|
||||
def classify_intent(keyword: str) -> str:
    """Return the search-intent label for ``keyword``.

    Intents are tried in INTENT_PATTERNS order and the first one with any
    matching pattern wins. Keywords matching nothing are "informational".
    """
    text = keyword.lower().strip()
    for label, regexes in INTENT_PATTERNS.items():
        if any(re.search(rx, text, re.IGNORECASE) for rx in regexes):
            return label
    return "informational"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# KeywordGapAnalyzer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class KeywordGapAnalyzer:
    """Analyze keyword gaps between a target site and its competitors."""

    def __init__(self, country: str = "kr", min_volume: int = 0):
        """
        Args:
            country: Country code passed to the Ahrefs tool.
            min_volume: Competitor keywords below this search volume are ignored.
        """
        self.country = country
        self.min_volume = min_volume

    def get_organic_keywords(self, domain: str, limit: int = 1000) -> list[OrganicKeyword]:
        """
        Fetch organic keywords for a domain via Ahrefs site-explorer-organic-keywords.

        Args:
            domain: URL or domain; normalized with extract_domain().
            limit: Maximum number of keywords requested from the tool.

        Returns:
            A list of OrganicKeyword entries. Items that are not dicts or have
            no keyword text are skipped; missing numeric metrics become 0.
        """
        clean_domain = extract_domain(domain)
        logger.info(f"Fetching organic keywords for: {clean_domain} (limit={limit})")

        result = call_mcp_tool("site-explorer-organic-keywords", {
            "target": clean_domain,
            "country": self.country,
            "limit": limit,
            "mode": "domain",
        })

        # Use `or` rather than a .get() default: an empty or null "keywords"
        # entry must not hide a populated "items" list.
        raw_items = result.get("keywords") or result.get("items") or []

        keywords: list[OrganicKeyword] = []
        for item in raw_items:
            if not isinstance(item, dict):
                continue
            kw = OrganicKeyword(
                keyword=item.get("keyword", item.get("term", "")),
                position=int(item.get("position", item.get("rank", 0)) or 0),
                volume=int(item.get("volume", item.get("search_volume", 0)) or 0),
                kd=float(item.get("keyword_difficulty", item.get("kd", 0)) or 0),
                cpc=float(item.get("cpc", item.get("cost_per_click", 0)) or 0),
                url=item.get("url", item.get("best_position_url", "")),
                traffic=int(item.get("traffic", item.get("estimated_traffic", 0)) or 0),
            )
            if kw.keyword:
                keywords.append(kw)

        logger.info(f"Found {len(keywords)} organic keywords for {clean_domain}")
        return keywords

    def find_gaps(
        self,
        target_keywords: list[OrganicKeyword],
        competitor_keyword_sets: dict[str, list[OrganicKeyword]],
    ) -> list[GapKeyword]:
        """
        Identify keywords that competitors rank for but the target doesn't.

        A gap keyword is one that appears in at least one competitor's keyword
        set but not in the target's keyword set. Matching is case-insensitive
        and ignores surrounding whitespace.

        Args:
            target_keywords: Keywords the target already ranks for.
            competitor_keyword_sets: Competitor domain -> its organic keywords.

        Returns:
            One GapKeyword per distinct gap, with per-competitor positions/URLs
            and ``avg_competitor_position`` filled in (0.0 when no competitor
            reported a position).
        """
        # Build target keyword set for fast lookup
        target_kw_set: set[str] = {kw.keyword.lower().strip() for kw in target_keywords}

        gap_map: dict[str, GapKeyword] = {}

        for comp_domain, comp_keywords in competitor_keyword_sets.items():
            for ckw in comp_keywords:
                kw_lower = ckw.keyword.lower().strip()

                # Skip whitespace-only keywords and ones the target already ranks for
                if not kw_lower or kw_lower in target_kw_set:
                    continue

                # Skip below minimum volume
                if ckw.volume < self.min_volume:
                    continue

                gap = gap_map.get(kw_lower)
                if gap is None:
                    gap = gap_map[kw_lower] = GapKeyword(
                        keyword=ckw.keyword,
                        volume=ckw.volume,
                        kd=ckw.kd,
                        cpc=ckw.cpc,
                        intent=classify_intent(ckw.keyword),
                        competitor_positions={},
                        competitor_urls={},
                    )

                gap.competitor_positions[comp_domain] = ckw.position
                gap.competitor_urls[comp_domain] = ckw.url

                # Across competitors keep the highest volume and the lowest
                # non-zero KD (0 means "not reported").
                if ckw.volume > gap.volume:
                    gap.volume = ckw.volume
                if ckw.kd > 0 and (gap.kd == 0 or ckw.kd < gap.kd):
                    gap.kd = ckw.kd

        gaps = list(gap_map.values())

        # Average only real rankings: position 0 is the "missing" sentinel set
        # by get_organic_keywords and would drag the mean toward a top rank.
        for gap in gaps:
            ranked = [pos for pos in gap.competitor_positions.values() if pos > 0]
            gap.avg_competitor_position = (
                round(sum(ranked) / len(ranked), 1) if ranked else 0.0
            )

        logger.info(f"Found {len(gaps)} keyword gaps")
        return gaps

    def score_opportunities(self, gaps: list[GapKeyword]) -> list[GapKeyword]:
        """
        Score each gap keyword by opportunity potential.

        Formula:
            opportunity_score = (volume_score * 0.4) + (kd_score * 0.3) +
                                (position_score * 0.2) + (intent_score * 0.1)

        Where:
            - volume_score: normalized 0-100 based on max volume in set
            - kd_score: inverted (lower KD = higher score), floored at 0
            - position_score: tiered on avg competitor position
              (1-10 -> 90, 11-20 -> 70, 21-50 -> 50, beyond -> 30);
              an unknown position (0) gets the lowest tier
            - intent_score: commercial/transactional get higher scores

        The list is scored and sorted in place (highest score first) and the
        same list object is returned.
        """
        if not gaps:
            return gaps

        # Floor at 1 so an all-zero-volume set cannot divide by zero.
        max_volume = max(1, max(g.volume for g in gaps))

        intent_scores = {
            "transactional": 100,
            "commercial": 80,
            "informational": 40,
            "navigational": 20,
        }

        for gap in gaps:
            # Volume score (0-100)
            volume_score = (gap.volume / max_volume) * 100

            # KD score (inverted: low KD = high score)
            kd_score = max(0, 100 - gap.kd)

            # Position score (competitors ranking 1-10 means realistic opportunity)
            avg_position = gap.avg_competitor_position
            if avg_position <= 0:
                # No known competitor ranking: no evidence, so use the weakest tier.
                position_score = 30
            elif avg_position <= 10:
                position_score = 90
            elif avg_position <= 20:
                position_score = 70
            elif avg_position <= 50:
                position_score = 50
            else:
                position_score = 30

            # Intent score
            intent_score = intent_scores.get(gap.intent, 40)

            # Combined score
            gap.opportunity_score = round(
                (volume_score * 0.4) +
                (kd_score * 0.3) +
                (position_score * 0.2) +
                (intent_score * 0.1),
                1,
            )

        # Sort by opportunity score descending
        gaps.sort(key=lambda g: g.opportunity_score, reverse=True)

        logger.info(f"Scored {len(gaps)} gap keywords by opportunity")
        return gaps

    def analyze(self, target_url: str, competitor_urls: list[str]) -> GapAnalysisResult:
        """
        Orchestrate full keyword gap analysis:
        1. Fetch organic keywords for target
        2. Fetch organic keywords for each competitor
        3. Identify gaps
        4. Score opportunities
        5. Compile results

        Args:
            target_url: Target site URL or domain.
            competitor_urls: Competitor URLs or domains; entries that normalize
                to the same domain are analyzed once.

        Returns:
            A GapAnalysisResult whose ``top_opportunities`` holds the 50
            best-scored gaps and whose ``all_gaps`` holds every gap.
        """
        target_domain = extract_domain(target_url)
        # dict.fromkeys de-duplicates while keeping order, so each competitor
        # costs exactly one (slow) tool call.
        competitor_domains = list(
            dict.fromkeys(extract_domain(url) for url in competitor_urls)
        )

        logger.info(
            f"Starting gap analysis: {target_domain} vs {', '.join(competitor_domains)}"
        )

        # Step 1: Fetch target keywords
        target_keywords = self.get_organic_keywords(target_domain)

        # Step 2: Fetch competitor keywords
        competitor_keyword_sets: dict[str, list[OrganicKeyword]] = {}
        competitor_keyword_counts: dict[str, int] = {}

        for comp_domain in competitor_domains:
            comp_keywords = self.get_organic_keywords(comp_domain)
            competitor_keyword_sets[comp_domain] = comp_keywords
            competitor_keyword_counts[comp_domain] = len(comp_keywords)

        # Step 3: Find gaps
        gaps = self.find_gaps(target_keywords, competitor_keyword_sets)

        # Step 4: Score opportunities
        scored_gaps = self.score_opportunities(gaps)

        # Step 5: Calculate intent distribution
        gaps_by_intent: dict[str, int] = {}
        for gap in scored_gaps:
            gaps_by_intent[gap.intent] = gaps_by_intent.get(gap.intent, 0) + 1

        # Step 6: Compile result
        result = GapAnalysisResult(
            target=target_domain,
            competitors=competitor_domains,
            country=self.country,
            total_gaps=len(scored_gaps),
            total_opportunity_volume=sum(g.volume for g in scored_gaps),
            gaps_by_intent=gaps_by_intent,
            top_opportunities=scored_gaps[:50],
            all_gaps=scored_gaps,
            target_keyword_count=len(target_keywords),
            competitor_keyword_counts=competitor_keyword_counts,
            timestamp=datetime.now().isoformat(),
        )

        logger.info(
            f"Gap analysis complete: {result.total_gaps} gaps found, "
            f"total opportunity volume {result.total_opportunity_volume:,}"
        )
        return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Plain-text report formatter
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def format_text_report(result: GapAnalysisResult) -> str:
    """Render a gap analysis result as a plain-text report.

    Sections, in order: header, overview, gaps by intent (if any), top 30
    opportunities (if any), and up to 20 "quick wins" (KD <= 30 and
    volume >= 100, highest volume first).

    Returns:
        The report as one newline-joined string.
    """
    rule = "=" * 75
    out: list[str] = [
        rule,
        "Keyword Gap Analysis Report",
        f"Target: {result.target}",
        f"Competitors: {', '.join(result.competitors)}",
        f"Country: {result.country.upper()} | Date: {result.timestamp[:10]}",
        rule,
        "",
        "## Overview",
        f" Target keywords: {result.target_keyword_count:,}",
    ]
    out.extend(
        f" {comp} keywords: {count:,}"
        for comp, count in result.competitor_keyword_counts.items()
    )
    out += [
        f" Keyword gaps found: {result.total_gaps:,}",
        f" Total opportunity volume: {result.total_opportunity_volume:,}",
        "",
    ]

    # Intent distribution, most frequent intent first
    if result.gaps_by_intent:
        out.append("## Gaps by Intent")
        by_count = sorted(
            result.gaps_by_intent.items(), key=lambda pair: pair[1], reverse=True
        )
        for intent, count in by_count:
            share = (count / result.total_gaps) * 100 if result.total_gaps else 0
            out.append(f" {intent:<15}: {count:>5} ({share:.1f}%)")
        out.append("")

    # Top opportunities (slicing already caps the text, no length check needed)
    if result.top_opportunities:
        out.append("## Top Opportunities (by score)")
        out.append(
            f" {'Keyword':<35} {'Vol':>8} {'KD':>6} {'Score':>7} {'Intent':<15} {'Competitors'}"
        )
        out.append(" " + "-" * 90)
        for gap in result.top_opportunities[:30]:
            rivals = ", ".join(
                f"{d}:#{p}" for d, p in gap.competitor_positions.items()
            )
            out.append(
                f" {gap.keyword[:33]:<35} {gap.volume:>8,} {gap.kd:>6.1f} "
                f"{gap.opportunity_score:>7.1f} {gap.intent:<15} {rivals[:30]}"
            )
        out.append("")

    # Quick wins (low KD, high volume)
    easy = sorted(
        (g for g in result.all_gaps if g.kd <= 30 and g.volume >= 100),
        key=lambda g: g.volume,
        reverse=True,
    )
    if easy:
        out.append("## Quick Wins (KD <= 30, Volume >= 100)")
        out.append(f" {'Keyword':<35} {'Vol':>8} {'KD':>6} {'Intent':<15}")
        out.append(" " + "-" * 64)
        out.extend(
            f" {gap.keyword[:33]:<35} {gap.volume:>8,} {gap.kd:>6.1f} {gap.intent:<15}"
            for gap in easy[:20]
        )
        out.append("")

    return "\n".join(out)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, run the analysis, emit the report.

    The report is JSON with --json, plain text otherwise, and goes to the
    --output file when given, else to stdout.

    Returns:
        Exit status; always 0 when the run completes.
    """
    cli = argparse.ArgumentParser(
        description="Keyword Gap Analyzer - Find competitor keyword opportunities",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python keyword_gap_analyzer.py --target https://example.com --competitor https://comp.com --json
python keyword_gap_analyzer.py --target example.com --competitor comp1.com --competitor comp2.com --min-volume 100 --json
python keyword_gap_analyzer.py --target example.com --competitor comp.com --country us --output gaps.json
""",
    )

    # (flag, add_argument keyword arguments), registered in this order.
    option_specs = [
        ("--target", dict(required=True, help="Target website URL or domain")),
        ("--competitor", dict(
            action="append",
            required=True,
            dest="competitors",
            help="Competitor URL or domain (can be repeated)",
        )),
        ("--country", dict(default="kr", help="Target country code (default: kr)")),
        ("--min-volume", dict(
            type=int,
            default=0,
            help="Minimum search volume filter (default: 0)",
        )),
        ("--json", dict(
            action="store_true",
            dest="output_json",
            help="Output results as JSON",
        )),
        ("--output", dict(type=str, default=None, help="Write output to file (path)")),
        ("--verbose", dict(action="store_true", help="Enable verbose/debug logging")),
    ]
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    opts = cli.parse_args()

    if opts.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Run analysis
    report = KeywordGapAnalyzer(
        country=opts.country,
        min_volume=opts.min_volume,
    ).analyze(opts.target, opts.competitors)

    # Format output
    rendered = (
        json.dumps(report.to_dict(), ensure_ascii=False, indent=2)
        if opts.output_json
        else format_text_report(report)
    )

    # Write or print
    if not opts.output:
        print(rendered)
        return 0

    with open(opts.output, "w", encoding="utf-8") as handle:
        handle.write(rendered)
    logger.info(f"Output written to: {opts.output}")
    return 0
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script; main()'s return value becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -0,0 +1,656 @@
|
||||
"""
|
||||
Keyword Researcher - Seed keyword expansion, intent classification, and topic clustering
|
||||
========================================================================================
|
||||
Purpose: Expand seed keywords via Ahrefs APIs, classify search intent,
|
||||
cluster topics, and support Korean market keyword discovery.
|
||||
Python: 3.10+
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Logging
|
||||
# ---------------------------------------------------------------------------
|
||||
# Configure the root logger at import time.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
# Module logger for this script.
logger = logging.getLogger("keyword_researcher")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Constants - Korean suffix expansion
|
||||
# ---------------------------------------------------------------------------
|
||||
# Korean search modifiers (recommendation, price, review, side effects, ...).
# NOTE(review): presumably appended to a seed keyword to generate expansion
# candidates — the code that consumes this list is outside the visible chunk.
KOREAN_SUFFIXES: list[str] = [
    "추천",
    "가격",
    "후기",
    "잘하는곳",
    "부작용",
    "전후",
    "비용",
    "추천 병원",
    "후기 블로그",
    "방법",
    "종류",
    "비교",
    "효과",
    "주의사항",
    "장단점",
]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Intent classification patterns
|
||||
# ---------------------------------------------------------------------------
|
||||
# Maps an intent label to regex alternations (Korean and English terms).
# An identical table exists in keyword_gap_analyzer.py; keep the two in sync.
# NOTE(review): presumably consumed in insertion order with first match
# winning, as in keyword_gap_analyzer.classify_intent — confirm here.
# NOTE(review): the English alternatives are unanchored substrings, so e.g.
# `fee` also matches "coffee" and `vs` matches "canvas" — confirm whether
# word boundaries are wanted.
INTENT_PATTERNS: dict[str, list[str]] = {
    "transactional": [
        r"구매|구입|주문|buy|order|purchase|shop|deal|discount|coupon|할인|쿠폰",
        r"예약|booking|reserve|sign\s?up|register|등록|신청",
    ],
    "commercial": [
        r"가격|비용|얼마|price|cost|pricing|fee|요금",
        r"추천|best|top\s?\d|review|비교|compare|vs|versus|후기|리뷰|평점|평가",
        r"잘하는곳|잘하는|맛집|업체|병원|추천\s?병원",
    ],
    "navigational": [
        # NOTE(review): `^` applies to the whole group, so `\.com`, `\.co\.kr`
        # and `\.net` only match at the very start of the keyword — confirm
        # whether a domain suffix anywhere in the keyword was intended.
        r"^(www\.|http|\.com|\.co\.kr|\.net)",
        r"공식|official|login|로그인|홈페이지|사이트|website",
        r"고객센터|contact|support|customer\s?service",
    ],
    "informational": [
        r"방법|how\s?to|what\s?is|why|when|where|who|which",
        r"뜻|의미|정의|definition|meaning|guide|tutorial",
        r"효과|부작용|증상|원인|차이|종류|type|cause|symptom|effect",
        r"전후|before\s?and\s?after|결과|result",
    ],
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Dataclasses
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
class KeywordEntry:
    """One keyword together with its metrics and classification."""

    keyword: str
    volume: int = 0
    kd: float = 0.0
    cpc: float = 0.0
    intent: str = "informational"
    cluster: str = ""
    source: str = ""
    country_volumes: dict[str, int] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return the fields as a dict, omitting ``country_volumes`` when it is empty."""
        payload = asdict(self)
        if payload["country_volumes"]:
            return payload
        payload.pop("country_volumes")
        return payload
|
||||
|
||||
|
||||
@dataclass
class KeywordCluster:
    """Group of semantically related keywords."""

    topic: str
    keywords: list[str] = field(default_factory=list)
    total_volume: int = 0
    avg_kd: float = 0.0
    primary_intent: str = "informational"

    def to_dict(self) -> dict:
        """Return a plain-dict copy of every field (via dataclasses.asdict)."""
        return asdict(self)
|
||||
|
||||
|
||||
@dataclass
class ResearchResult:
    """Container for a complete keyword research run."""

    seed_keyword: str
    country: str
    total_keywords: int = 0
    total_volume: int = 0
    clusters: list[KeywordCluster] = field(default_factory=list)
    keywords: list[KeywordEntry] = field(default_factory=list)
    timestamp: str = ""

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict; clusters and keywords are expanded via to_dict()."""
        cluster_rows = [cluster.to_dict() for cluster in self.clusters]
        keyword_rows = [entry.to_dict() for entry in self.keywords]
        # dict() keeps keyword order, so keys appear in field order.
        return dict(
            seed_keyword=self.seed_keyword,
            country=self.country,
            total_keywords=self.total_keywords,
            total_volume=self.total_volume,
            clusters=cluster_rows,
            keywords=keyword_rows,
            timestamp=self.timestamp,
        )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# MCP Helper - calls Ahrefs MCP tools via subprocess
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _unwrap_cli_envelope(payload: dict) -> dict:
    """Return the tool JSON embedded in a CLI result envelope, if there is one.

    NOTE(review): `claude --print --output-format json` is understood to wrap
    the model's reply in an envelope whose text lives under "result" -- confirm
    against the installed CLI version. The envelope is only unwrapped when it
    carries no "keywords"/"items" of its own and its "result" text is itself a
    JSON object; in every other case the payload is returned untouched.
    """
    text = payload.get("result")
    if not isinstance(text, str) or "keywords" in payload or "items" in payload:
        return payload
    try:
        inner = json.loads(text)
    except json.JSONDecodeError:
        return payload
    return inner if isinstance(inner, dict) else payload


def call_mcp_tool(tool_name: str, params: dict) -> dict:
    """
    Call an Ahrefs MCP tool and return parsed JSON response.

    In production this delegates to the MCP bridge. For standalone usage
    it invokes the Claude CLI with the appropriate tool call.

    Args:
        tool_name: Ahrefs tool name without the ``mcp__claude_ai_Ahrefs__`` prefix.
        params: Tool parameters; serialised to JSON and embedded in the prompt.

    Returns:
        Always a dict. Every failure path (non-zero exit, timeout, missing
        CLI, output that is not a JSON object) yields a dict with empty
        ``keywords`` and ``items`` lists, plus ``error`` or ``raw`` where
        applicable, so callers can use ``.get`` unconditionally.
    """
    params_json = json.dumps(params, ensure_ascii=False)
    logger.info(f"Calling MCP tool: {tool_name} with params: {params_json}")

    # Passed as an argument list (no shell), so the prompt needs no quoting.
    cmd = [
        "claude",
        "--print",
        "--output-format", "json",
        "-p",
        f"Call the tool mcp__claude_ai_Ahrefs__{tool_name} with these parameters: {params_json}. Return ONLY the raw JSON result.",
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    except subprocess.TimeoutExpired:
        logger.error(f"MCP tool {tool_name} timed out")
        return {"error": "timeout", "keywords": [], "items": []}
    except FileNotFoundError:
        logger.warning("Claude CLI not found - returning empty result for standalone testing")
        return {"keywords": [], "items": []}

    if result.returncode != 0:
        logger.warning(f"MCP tool {tool_name} returned non-zero exit code: {result.returncode}")
        logger.debug(f"stderr: {result.stderr}")
        return {"error": result.stderr, "keywords": [], "items": []}

    try:
        payload = json.loads(result.stdout)
    except json.JSONDecodeError:
        return {"raw": result.stdout, "keywords": [], "items": []}

    # Valid JSON is not necessarily an object (it may be a list, string or
    # number); callers rely on dict methods, so treat anything else as raw.
    if not isinstance(payload, dict):
        return {"raw": result.stdout, "keywords": [], "items": []}

    return _unwrap_cli_envelope(payload)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# KeywordResearcher
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class KeywordResearcher:
    """Expand seed keywords, classify intent, and cluster topics.

    Keyword data is fetched through ``call_mcp_tool``. An instance remembers
    every keyword it has already collected (``_seen``), so repeated expansions
    on the same instance never return the same keyword twice.
    """

    def __init__(self, country: str = "kr", korean_suffixes: bool = False, compare_global: bool = False):
        """
        Args:
            country: Country code sent with every country-specific lookup.
            korean_suffixes: When True, ``analyze`` also runs
                ``expand_korean_suffixes``.
            compare_global: When True, ``analyze`` fetches per-country volumes
                for the ten highest-volume keywords.
        """
        self.country = country
        self.korean_suffixes = korean_suffixes
        self.compare_global = compare_global
        # Keywords collected so far; compared case-sensitively.
        self._seen: set[str] = set()

    # ---- Keyword expansion via Ahrefs MCP ----

    def expand_keywords(self, seed: str) -> list[KeywordEntry]:
        """
        Expand a seed keyword using Ahrefs matching-terms, related-terms,
        and search-suggestions endpoints.

        The seed itself is looked up last and, when data comes back, placed
        first in the returned list.
        """
        all_keywords: list[KeywordEntry] = []

        # (log label, endpoint suffix which doubles as the source tag, limit)
        endpoints = (
            ("matching terms", "matching-terms", 100),
            ("related terms", "related-terms", 100),
            ("search suggestions", "search-suggestions", 50),
        )
        for label, endpoint, limit in endpoints:
            logger.info(f"Fetching {label} for: {seed}")
            response = call_mcp_tool(f"keywords-explorer-{endpoint}", {
                "keyword": seed,
                "country": self.country,
                "limit": limit,
            })
            all_keywords.extend(self._collect_new(response, source=endpoint))

        # Add the seed itself if none of the endpoints returned it.
        if seed not in self._seen:
            self._seen.add(seed)
            overview = call_mcp_tool("keywords-explorer-overview", {
                "keyword": seed,
                "country": self.country,
            })
            seed_entry = self._parse_keyword_item(overview, source="seed")
            if seed_entry is not None:
                # The overview may echo a normalised spelling; keep the user's.
                seed_entry.keyword = seed
                all_keywords.insert(0, seed_entry)

        logger.info(f"Expanded to {len(all_keywords)} keywords from Ahrefs APIs")
        return all_keywords

    def expand_korean_suffixes(self, seed: str) -> list[KeywordEntry]:
        """
        Generate keyword variations by appending common Korean suffixes.

        Each variation is checked against Ahrefs for volume data. A variation
        whose lookup yields an entry is kept only when its volume is positive;
        a variation whose lookup yields no entry at all is kept as a
        zero-volume placeholder.
        """
        suffix_keywords: list[KeywordEntry] = []

        for suffix in KOREAN_SUFFIXES:
            variation = f"{seed} {suffix}"
            if variation in self._seen:
                continue

            logger.info(f"Checking Korean suffix variation: {variation}")
            overview = call_mcp_tool("keywords-explorer-overview", {
                "keyword": variation,
                "country": self.country,
            })
            entry = self._parse_keyword_item(overview, source="korean-suffix")

            if entry is None:
                # Even if no data, include as zero-volume for completeness.
                entry = KeywordEntry(
                    keyword=variation,
                    volume=0,
                    kd=0.0,
                    cpc=0.0,
                    intent=self.classify_intent(variation),
                    source="korean-suffix",
                )
            else:
                entry.keyword = variation
                if entry.volume <= 0:
                    # Data came back but shows no volume: skip the variation.
                    continue

            self._seen.add(variation)
            suffix_keywords.append(entry)

        logger.info(f"Korean suffix expansion yielded {len(suffix_keywords)} variations")
        return suffix_keywords

    def get_volume_by_country(self, keyword: str) -> dict[str, int]:
        """
        Get search volume breakdown by country for a keyword.

        Useful for comparing Korean vs global demand.

        Returns:
            Mapping of lower-cased country code to volume; entries with a
            missing code or a zero/missing volume are omitted.
        """
        logger.info(f"Fetching volume-by-country for: {keyword}")
        result = call_mcp_tool("keywords-explorer-volume-by-country", {
            "keyword": keyword,
        })

        volumes: dict[str, int] = {}
        for item in result.get("countries", result.get("items", [])) or []:
            if not isinstance(item, dict):
                continue
            country_code = item.get("country", item.get("code", ""))
            volume = item.get("volume", item.get("search_volume", 0))
            if country_code and volume:
                volumes[country_code.lower()] = int(volume)

        return volumes

    # ---- Intent classification ----

    def classify_intent(self, keyword: str) -> str:
        """
        Classify search intent based on keyword patterns.

        Priority: transactional > commercial > navigational > informational
        (the iteration order of ``INTENT_PATTERNS``). Keywords matching no
        pattern are "informational".
        """
        keyword_lower = keyword.lower().strip()

        for intent, patterns in INTENT_PATTERNS.items():
            if any(re.search(pattern, keyword_lower, re.IGNORECASE) for pattern in patterns):
                return intent

        return "informational"

    # ---- Keyword clustering ----

    @staticmethod
    def _tokenize(text: str) -> list[str]:
        """Return the unique whitespace-separated tokens of 2+ characters, lower-cased.

        Tokens are kept in first-appearance order. A ``set`` must not be used
        here: string hashing is randomised per process, so iterating a set
        would let equally frequent tokens swap places between runs and change
        which token names a cluster.
        """
        words = re.split(r"\s+", text.strip().lower())
        return list(dict.fromkeys(word for word in words if len(word) >= 2))

    @staticmethod
    def _make_cluster(topic: str, members: list[KeywordEntry], primary_intent: str) -> KeywordCluster:
        """Build a cluster named ``topic`` with aggregate metrics over ``members``."""
        kds = [member.kd for member in members]
        return KeywordCluster(
            topic=topic,
            keywords=[member.keyword for member in members],
            total_volume=sum(member.volume for member in members),
            avg_kd=round(sum(kds) / len(kds), 1) if kds else 0.0,
            primary_intent=primary_intent,
        )

    def cluster_keywords(self, keywords: list[KeywordEntry]) -> list[KeywordCluster]:
        """
        Group keywords into topic clusters by shared tokens.

        Tokens are processed from most to least frequent. For each token, the
        still-unassigned keywords containing it form a cluster named after the
        token, provided at least two remain. Leftover keywords go into a
        single "(unclustered)" cluster. Ties between equally frequent tokens
        are resolved by first appearance, so the result is deterministic.

        Side effect: sets ``cluster`` on every entry in ``keywords``.

        Returns:
            Clusters sorted by total volume, highest first.
        """
        if not keywords:
            return []

        # token -> indices of the keywords containing it
        token_map: dict[str, list[int]] = {}
        for index, entry in enumerate(keywords):
            for token in self._tokenize(entry.keyword):
                token_map.setdefault(token, []).append(index)

        # Stable sort: equally frequent tokens keep first-appearance order.
        token_freq = sorted(token_map.items(), key=lambda pair: len(pair[1]), reverse=True)

        assigned: set[int] = set()
        clusters: list[KeywordCluster] = []

        for token, indices in token_freq:
            if len(indices) < 2:
                # Sorted by frequency, so every remaining token is a singleton.
                break

            cluster_indices = [i for i in indices if i not in assigned]
            if len(cluster_indices) < 2:
                continue

            members = [keywords[i] for i in cluster_indices]

            # Primary intent is the most frequent one among the members.
            intent_counts: dict[str, int] = {}
            for member in members:
                intent_counts[member.intent] = intent_counts.get(member.intent, 0) + 1
            primary_intent = max(intent_counts, key=intent_counts.get)

            clusters.append(self._make_cluster(token, members, primary_intent))
            assigned.update(cluster_indices)
            for member in members:
                member.cluster = token

        # Assign unclustered keywords to an "other" cluster.
        leftovers = [entry for i, entry in enumerate(keywords) if i not in assigned]
        if leftovers:
            clusters.append(self._make_cluster("(unclustered)", leftovers, "informational"))
            for entry in leftovers:
                entry.cluster = "(unclustered)"

        clusters.sort(key=lambda c: c.total_volume, reverse=True)

        logger.info(f"Clustered {len(keywords)} keywords into {len(clusters)} clusters")
        return clusters

    # ---- Full analysis orchestration ----

    def analyze(self, seed_keyword: str) -> ResearchResult:
        """
        Orchestrate a full keyword research analysis:
        1. Expand seed via Ahrefs
        2. Optionally expand Korean suffixes
        3. Classify intent for all keywords
        4. Optionally fetch volume-by-country
        5. Cluster keywords into topics
        6. Compile results
        """
        logger.info(f"Starting keyword research for: {seed_keyword} (country={self.country})")

        # Step 1: Expand keywords
        keywords = self.expand_keywords(seed_keyword)

        # Step 2: Korean suffix expansion
        if self.korean_suffixes:
            keywords.extend(self.expand_korean_suffixes(seed_keyword))

        # Step 3: Re-classify entries still carrying the default intent
        # (their keyword text may have been overwritten after parsing).
        for kw in keywords:
            if not kw.intent or kw.intent == "informational":
                kw.intent = self.classify_intent(kw.keyword)

        # Step 4: Volume-by-country comparison for the top 10 by volume
        if self.compare_global and keywords:
            top_keywords = sorted(keywords, key=lambda k: k.volume, reverse=True)[:10]
            for kw in top_keywords:
                kw.country_volumes = self.get_volume_by_country(kw.keyword)

        # Step 5: Cluster keywords
        clusters = self.cluster_keywords(keywords)

        # Step 6: Compile result
        result = ResearchResult(
            seed_keyword=seed_keyword,
            country=self.country,
            total_keywords=len(keywords),
            total_volume=sum(kw.volume for kw in keywords),
            clusters=clusters,
            keywords=sorted(keywords, key=lambda k: k.volume, reverse=True),
            timestamp=datetime.now().isoformat(),
        )

        logger.info(
            f"Research complete: {result.total_keywords} keywords, "
            f"{len(result.clusters)} clusters, "
            f"total volume {result.total_volume}"
        )
        return result

    # ---- Internal helpers ----

    def _collect_new(self, response: dict, source: str) -> list[KeywordEntry]:
        """Parse a list response and return only entries not seen before, marking them seen."""
        fresh: list[KeywordEntry] = []
        for item in response.get("keywords", response.get("items", [])) or []:
            entry = self._parse_keyword_item(item, source=source)
            if entry is not None and entry.keyword not in self._seen:
                self._seen.add(entry.keyword)
                fresh.append(entry)
        return fresh

    def _parse_keyword_item(self, item: dict, source: str = "") -> Optional[KeywordEntry]:
        """Parse an Ahrefs API response item into a KeywordEntry.

        Returns None when the item is empty, is not a dict, carries an
        "error" key, or has no keyword text under "keyword", "term" or
        "query". Missing or null metrics default to zero.
        """
        if not item or not isinstance(item, dict) or "error" in item:
            return None

        keyword = item.get("keyword", item.get("term", item.get("query", "")))
        if not keyword:
            return None

        volume = int(item.get("volume", item.get("search_volume", 0)) or 0)
        kd = float(item.get("keyword_difficulty", item.get("kd", 0)) or 0)
        cpc = float(item.get("cpc", item.get("cost_per_click", 0)) or 0)

        return KeywordEntry(
            keyword=keyword,
            volume=volume,
            kd=round(kd, 1),
            cpc=round(cpc, 2),
            intent=self.classify_intent(keyword),
            source=source,
        )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Plain-text report formatter
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def format_text_report(result: ResearchResult) -> str:
    """Render a research result as a plain-text report.

    Sections, in order: a banner with seed keyword, country and date; an
    overview; up to 15 clusters; up to 30 keywords; and the intent
    distribution over all keywords. Cluster, keyword and intent sections are
    omitted when there is nothing to show.
    """
    banner = "=" * 70
    report: list[str] = [
        banner,
        f"Keyword Strategy Report: {result.seed_keyword}",
        # The first 10 characters of the timestamp are shown as the date.
        f"Country: {result.country.upper()} | Date: {result.timestamp[:10]}",
        banner,
        "",
        "## Overview",
        f" Total keywords discovered: {result.total_keywords}",
        f" Topic clusters: {len(result.clusters)}",
        f" Total search volume: {result.total_volume:,}",
        "",
    ]

    # Clusters summary
    if result.clusters:
        report.append("## Top Clusters")
        report.append(f" {'Cluster':<25} {'Keywords':>8} {'Volume':>10} {'Avg KD':>8} {'Intent':<15}")
        report.append(" " + "-" * 66)
        report.extend(
            f" {c.topic:<25} {len(c.keywords):>8} {c.total_volume:>10,} {c.avg_kd:>8.1f} {c.primary_intent:<15}"
            for c in result.clusters[:15]
        )
        report.append("")

    # Top keywords
    if result.keywords:
        report.append("## Top Keywords (by volume)")
        report.append(f" {'Keyword':<40} {'Vol':>8} {'KD':>6} {'CPC':>7} {'Intent':<15} {'Cluster':<15}")
        report.append(" " + "-" * 91)
        for entry in result.keywords[:30]:
            # Truncate long values so they stay inside their columns.
            shown_keyword = entry.keyword[:38]
            shown_cluster = entry.cluster[:13]
            report.append(
                f" {shown_keyword:<40} {entry.volume:>8,} {entry.kd:>6.1f} "
                f"{entry.cpc:>7.2f} {entry.intent:<15} {shown_cluster:<15}"
            )
        report.append("")

    # Intent distribution
    tally: dict[str, int] = {}
    for entry in result.keywords:
        tally[entry.intent] = tally.get(entry.intent, 0) + 1
    if tally:
        keyword_count = len(result.keywords)
        report.append("## Intent Distribution")
        for intent, count in sorted(tally.items(), key=lambda x: x[1], reverse=True):
            share = (count / keyword_count) * 100 if keyword_count else 0
            report.append(f" {intent:<15}: {count:>5} ({share:.1f}%)")
        report.append("")

    return "\n".join(report)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    Parses the arguments, runs a ``KeywordResearcher`` analysis for the seed
    keyword, renders it as JSON (``--json``) or as a text report, and writes
    it to ``--output`` when given, otherwise to stdout. Returns 0.
    """
    usage_examples = """
Examples:
python keyword_researcher.py --keyword "치과 임플란트" --country kr --json
python keyword_researcher.py --keyword "dental implant" --compare-global --json
python keyword_researcher.py --keyword "치과 임플란트" --korean-suffixes --output report.json
"""
    cli = argparse.ArgumentParser(
        description="Keyword Researcher - Expand, classify, and cluster keywords",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    cli.add_argument("--keyword", required=True, help="Seed keyword to expand and research")
    cli.add_argument("--country", default="kr", help="Target country code (default: kr)")
    cli.add_argument(
        "--korean-suffixes",
        action="store_true",
        help="Enable Korean suffix expansion (추천, 가격, 후기, etc.)",
    )
    cli.add_argument(
        "--compare-global",
        action="store_true",
        help="Fetch volume-by-country comparison for top keywords",
    )
    cli.add_argument("--json", action="store_true", dest="output_json", help="Output results as JSON")
    cli.add_argument("--output", type=str, default=None, help="Write output to file (path)")
    cli.add_argument("--verbose", action="store_true", help="Enable verbose/debug logging")

    options = cli.parse_args()

    if options.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Run analysis
    result = KeywordResearcher(
        country=options.country,
        korean_suffixes=options.korean_suffixes,
        compare_global=options.compare_global,
    ).analyze(options.keyword)

    # Format output
    output = (
        json.dumps(result.to_dict(), ensure_ascii=False, indent=2)
        if options.output_json
        else format_text_report(result)
    )

    # Write or print
    if not options.output:
        print(output)
        return 0

    with open(options.output, "w", encoding="utf-8") as f:
        f.write(output)
    logger.info(f"Output written to: {options.output}")
    return 0
|
||||
|
||||
|
||||
# Script entry point: run the CLI and use its return value as the exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
|
||||
@@ -0,0 +1,20 @@
|
||||
# 19-seo-keyword-strategy dependencies
|
||||
# Install: pip install -r requirements.txt
|
||||
|
||||
# HTTP & Async
|
||||
requests>=2.31.0
|
||||
aiohttp>=3.9.0
|
||||
|
||||
# Data Processing
|
||||
pandas>=2.1.0
|
||||
|
||||
# NLP / Text Similarity
|
||||
scikit-learn>=1.3.0
|
||||
|
||||
# Async & Retry
|
||||
tenacity>=8.2.0
|
||||
tqdm>=4.66.0
|
||||
|
||||
# Environment & CLI
|
||||
python-dotenv>=1.0.0
|
||||
rich>=13.7.0
|
||||
Reference in New Issue
Block a user