12 new skills: Keyword Strategy, SERP Analysis, Position Tracking, Link Building, Content Strategy, E-Commerce SEO, KPI Framework, International SEO, AI Visibility, Knowledge Graph, Competitor Intel, and Crawl Budget. ~20K lines of Python across 25 domain scripts. Updated skill 11 pipeline table and repo CLAUDE.md. Enhanced skill 18 local SEO workflow from jamie.clinic audit. Note: Skill 26 hreflang_validator.py pending (content filter block). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
585 lines
21 KiB
Python
585 lines
21 KiB
Python
"""
|
|
Keyword Gap Analyzer - Competitor keyword gap analysis with opportunity scoring
|
|
===============================================================================
|
|
Purpose: Identify keywords competitors rank for but target site doesn't,
|
|
score opportunities, and prioritize by volume/difficulty ratio.
|
|
Python: 3.10+
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from dataclasses import dataclass, field, asdict
|
|
from datetime import datetime
|
|
from typing import Optional
|
|
from urllib.parse import urlparse
|
|
|
|
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Configured at import time so standalone CLI runs get timestamped output.
# --verbose in main() later raises the root logger to DEBUG.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger("keyword_gap_analyzer")
|
# ---------------------------------------------------------------------------
# Intent classification patterns (shared with keyword_researcher)
# ---------------------------------------------------------------------------
# Bilingual (Korean/English) regexes. classify_intent() walks this dict in
# declaration order and returns the first intent with a matching pattern, so
# "transactional" outranks "commercial", which outranks "navigational", etc.
INTENT_PATTERNS: dict[str, list[str]] = {
    # Purchase / booking / sign-up language: the user is ready to act.
    "transactional": [
        r"구매|구입|주문|buy|order|purchase|shop|deal|discount|coupon|할인|쿠폰",
        r"예약|booking|reserve|sign\s?up|register|등록|신청",
    ],
    # Price / comparison / review language: researching before buying.
    "commercial": [
        r"가격|비용|얼마|price|cost|pricing|fee|요금",
        r"추천|best|top\s?\d|review|비교|compare|vs|versus|후기|리뷰|평점|평가",
        r"잘하는곳|잘하는|맛집|업체|병원|추천\s?병원",
    ],
    # Brand / specific-site seeking language (also bare URL fragments).
    "navigational": [
        r"^(www\.|http|\.com|\.co\.kr|\.net)",
        r"공식|official|login|로그인|홈페이지|사이트|website",
        r"고객센터|contact|support|customer\s?service",
    ],
    # How / what / why language; this is also classify_intent()'s default
    # when nothing matches.
    "informational": [
        r"방법|how\s?to|what\s?is|why|when|where|who|which",
        r"뜻|의미|정의|definition|meaning|guide|tutorial",
        r"효과|부작용|증상|원인|차이|종류|type|cause|symptom|effect",
        r"전후|before\s?and\s?after|결과|result",
    ],
}
|
|
# ---------------------------------------------------------------------------
# Dataclasses
# ---------------------------------------------------------------------------


@dataclass
class OrganicKeyword:
    """A keyword that a domain ranks for organically (one Ahrefs result row)."""

    keyword: str       # the search term itself
    position: int = 0  # organic ranking position (1 = top result; 0 = unknown)
    volume: int = 0    # monthly search volume
    kd: float = 0.0    # keyword difficulty (Ahrefs KD; lower = easier)
    cpc: float = 0.0   # cost per click
    url: str = ""      # URL that ranks for this keyword
    traffic: int = 0   # estimated organic traffic from this keyword
|
@dataclass
class GapKeyword:
    """A keyword gap between target and competitor(s).

    Produced by KeywordGapAnalyzer.find_gaps(): the target does not rank for
    this keyword while at least one competitor does.
    """

    keyword: str
    volume: int = 0  # highest volume reported across competitors
    kd: float = 0.0  # lowest non-zero difficulty reported across competitors
    cpc: float = 0.0
    intent: str = "informational"   # see classify_intent()
    opportunity_score: float = 0.0  # filled in by score_opportunities()
    # Per-competitor data, keyed by competitor domain.
    competitor_positions: dict[str, int] = field(default_factory=dict)
    competitor_urls: dict[str, str] = field(default_factory=dict)
    avg_competitor_position: float = 0.0  # mean of competitor_positions values

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict of all fields."""
        return asdict(self)
|
|
@dataclass
class GapAnalysisResult:
    """Complete gap analysis result for one target / competitor set."""

    target: str  # target domain analyzed
    competitors: list[str] = field(default_factory=list)
    country: str = "kr"  # Ahrefs country code used for the lookups
    total_gaps: int = 0
    total_opportunity_volume: int = 0  # summed volume across all gaps
    gaps_by_intent: dict[str, int] = field(default_factory=dict)
    # String annotations are deliberate forward references so this dataclass
    # does not depend on GapKeyword being defined first.
    top_opportunities: list["GapKeyword"] = field(default_factory=list)
    all_gaps: list["GapKeyword"] = field(default_factory=list)
    target_keyword_count: int = 0
    competitor_keyword_counts: dict[str, int] = field(default_factory=dict)
    timestamp: str = ""  # ISO-8601 creation time

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict of the full result.

        dataclasses.asdict recurses into the nested GapKeyword dataclasses,
        so this can never drift out of sync with the field list (the
        previous hand-written mapping had to be edited for every new field).
        """
        return asdict(self)
|
|
# ---------------------------------------------------------------------------
# MCP Helper
# ---------------------------------------------------------------------------


def call_mcp_tool(tool_name: str, params: dict) -> dict:
    """
    Invoke an Ahrefs MCP tool and return its parsed JSON payload.

    In production this delegates to the MCP bridge; standalone, it shells out
    to the Claude CLI and asks it to perform the tool call. Every failure
    path returns a dict that still carries empty "keywords"/"items" lists so
    callers can iterate without extra checks.
    """
    logger.info(f"Calling MCP tool: {tool_name} with params: {json.dumps(params, ensure_ascii=False)}")

    prompt = (
        f"Call the tool mcp__claude_ai_Ahrefs__{tool_name} with these parameters: "
        f"{json.dumps(params, ensure_ascii=False)}. Return ONLY the raw JSON result."
    )
    try:
        result = subprocess.run(
            ["claude", "--print", "--output-format", "json", "-p", prompt],
            capture_output=True,
            text=True,
            timeout=120,
        )
    except subprocess.TimeoutExpired:
        logger.error(f"MCP tool {tool_name} timed out")
        return {"error": "timeout", "keywords": [], "items": []}
    except FileNotFoundError:
        # No Claude CLI on this machine (e.g. standalone testing).
        logger.warning("Claude CLI not found - returning empty result for standalone testing")
        return {"keywords": [], "items": []}

    if result.returncode != 0:
        logger.warning(f"MCP tool {tool_name} returned non-zero exit code: {result.returncode}")
        logger.debug(f"stderr: {result.stderr}")
        return {"error": result.stderr, "keywords": [], "items": []}

    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError:
        # CLI succeeded but did not emit JSON; hand back the raw text.
        return {"raw": result.stdout, "keywords": [], "items": []}
|
|
|
# ---------------------------------------------------------------------------
# Utility functions
# ---------------------------------------------------------------------------


def extract_domain(url: str) -> str:
    """Extract a clean, lowercase domain from a URL or bare host string.

    Args:
        url: Full URL ("https://www.foo.com/bar") or bare domain ("foo.com").
            Surrounding whitespace is tolerated.

    Returns:
        Lowercased domain without scheme, leading "www.", or slashes.
    """
    url = url.strip()  # tolerate accidental surrounding whitespace
    if not url.startswith(("http://", "https://")):
        # urlparse needs a scheme to populate netloc reliably.
        url = f"https://{url}"
    parsed = urlparse(url)
    # Fall back to .path for malformed inputs where netloc came out empty.
    domain = (parsed.netloc or parsed.path).lower().strip("/")
    return domain.removeprefix("www.")
|
|
def classify_intent(keyword: str) -> str:
    """Classify the search intent of *keyword* using INTENT_PATTERNS.

    Intents are checked in dict declaration order and the first intent with
    any matching pattern wins; keywords matching nothing default to
    "informational".
    """
    needle = keyword.lower().strip()
    for intent, patterns in INTENT_PATTERNS.items():
        if any(re.search(p, needle, re.IGNORECASE) for p in patterns):
            return intent
    return "informational"
|
# ---------------------------------------------------------------------------
# KeywordGapAnalyzer
# ---------------------------------------------------------------------------


class KeywordGapAnalyzer:
    """Analyze keyword gaps between a target site and its competitors.

    Workflow (see analyze()): fetch organic keywords for the target and each
    competitor via the Ahrefs MCP bridge, diff the keyword sets, then score
    and rank the resulting gap keywords by opportunity.
    """

    def __init__(self, country: str = "kr", min_volume: int = 0):
        # Ahrefs country code used for every keyword lookup.
        self.country = country
        # Gap keywords below this monthly search volume are discarded.
        self.min_volume = min_volume

    def get_organic_keywords(self, domain: str, limit: int = 1000) -> list[OrganicKeyword]:
        """
        Fetch organic keywords for a domain via Ahrefs site-explorer-organic-keywords.
        Returns a list of OrganicKeyword entries.

        Entries with no keyword text are dropped; numeric fields tolerate
        alternate key names and null values in the MCP response.
        """
        clean_domain = extract_domain(domain)
        logger.info(f"Fetching organic keywords for: {clean_domain} (limit={limit})")

        result = call_mcp_tool("site-explorer-organic-keywords", {
            "target": clean_domain,
            "country": self.country,
            "limit": limit,
            "mode": "domain",
        })

        keywords: list[OrganicKeyword] = []
        # Response shape varies: rows may live under "keywords" or "items",
        # and each field has a primary and an alternate key. The trailing
        # "or 0" guards against explicit nulls before int()/float().
        for item in result.get("keywords", result.get("items", [])):
            if not isinstance(item, dict):
                continue
            kw = OrganicKeyword(
                keyword=item.get("keyword", item.get("term", "")),
                position=int(item.get("position", item.get("rank", 0)) or 0),
                volume=int(item.get("volume", item.get("search_volume", 0)) or 0),
                kd=float(item.get("keyword_difficulty", item.get("kd", 0)) or 0),
                cpc=float(item.get("cpc", item.get("cost_per_click", 0)) or 0),
                url=item.get("url", item.get("best_position_url", "")),
                traffic=int(item.get("traffic", item.get("estimated_traffic", 0)) or 0),
            )
            if kw.keyword:  # skip rows with empty keyword text
                keywords.append(kw)

        logger.info(f"Found {len(keywords)} organic keywords for {clean_domain}")
        return keywords

    def find_gaps(
        self,
        target_keywords: list[OrganicKeyword],
        competitor_keyword_sets: dict[str, list[OrganicKeyword]],
    ) -> list[GapKeyword]:
        """
        Identify keywords that competitors rank for but the target doesn't.

        A gap keyword is one that appears in at least one competitor's keyword
        set but not in the target's keyword set.

        Gaps are deduplicated case-insensitively across competitors: the
        first competitor seen fixes the keyword text, cpc, and intent;
        volume is raised to the highest value and kd lowered to the smallest
        non-zero value reported by any competitor.
        """
        # Build target keyword set for fast lookup
        target_kw_set: set[str] = {kw.keyword.lower().strip() for kw in target_keywords}

        # Collect all competitor keywords with their positions,
        # keyed by normalized (lowercased/stripped) keyword text.
        gap_map: dict[str, GapKeyword] = {}

        for comp_domain, comp_keywords in competitor_keyword_sets.items():
            for ckw in comp_keywords:
                kw_lower = ckw.keyword.lower().strip()

                # Skip if target already ranks for this keyword
                if kw_lower in target_kw_set:
                    continue

                # Skip below minimum volume
                if ckw.volume < self.min_volume:
                    continue

                if kw_lower not in gap_map:
                    gap_map[kw_lower] = GapKeyword(
                        keyword=ckw.keyword,
                        volume=ckw.volume,
                        kd=ckw.kd,
                        cpc=ckw.cpc,
                        intent=classify_intent(ckw.keyword),
                        competitor_positions={},
                        competitor_urls={},
                    )

                gap_map[kw_lower].competitor_positions[comp_domain] = ckw.position
                gap_map[kw_lower].competitor_urls[comp_domain] = ckw.url

                # Update volume/kd if higher from another competitor
                if ckw.volume > gap_map[kw_lower].volume:
                    gap_map[kw_lower].volume = ckw.volume
                if ckw.kd > 0 and (gap_map[kw_lower].kd == 0 or ckw.kd < gap_map[kw_lower].kd):
                    gap_map[kw_lower].kd = ckw.kd

        gaps = list(gap_map.values())

        # Calculate average competitor position for each gap
        for gap in gaps:
            positions = list(gap.competitor_positions.values())
            gap.avg_competitor_position = round(
                sum(positions) / len(positions), 1
            ) if positions else 0.0

        logger.info(f"Found {len(gaps)} keyword gaps")
        return gaps

    def score_opportunities(self, gaps: list[GapKeyword]) -> list[GapKeyword]:
        """
        Score each gap keyword by opportunity potential.

        Formula:
            opportunity_score = (volume_score * 0.4) + (kd_score * 0.3) +
                                (position_score * 0.2) + (intent_score * 0.1)

        Where:
            - volume_score: normalized 0-100 based on max volume in set
            - kd_score: inverted (lower KD = higher score), normalized 0-100
            - position_score: based on avg competitor position (lower = easier to compete)
            - intent_score: commercial/transactional get higher scores

        Mutates each GapKeyword in place and returns the list sorted by
        opportunity_score, highest first.
        """
        if not gaps:
            return gaps

        # Find max volume for normalization (floor of 1 avoids divide-by-zero
        # when every gap has zero volume).
        max_volume = max(g.volume for g in gaps) if gaps else 1
        max_volume = max(max_volume, 1)

        # Buying-intent keywords outrank purely informational ones.
        intent_scores = {
            "transactional": 100,
            "commercial": 80,
            "informational": 40,
            "navigational": 20,
        }

        for gap in gaps:
            # Volume score (0-100)
            volume_score = (gap.volume / max_volume) * 100

            # KD score (inverted: low KD = high score)
            kd_score = max(0, 100 - gap.kd)

            # Position score (competitors ranking 1-10 means realistic opportunity)
            if gap.avg_competitor_position <= 10:
                position_score = 90
            elif gap.avg_competitor_position <= 20:
                position_score = 70
            elif gap.avg_competitor_position <= 50:
                position_score = 50
            else:
                position_score = 30

            # Intent score (unknown intents fall back to the informational tier)
            intent_score = intent_scores.get(gap.intent, 40)

            # Combined score
            gap.opportunity_score = round(
                (volume_score * 0.4) +
                (kd_score * 0.3) +
                (position_score * 0.2) +
                (intent_score * 0.1),
                1,
            )

        # Sort by opportunity score descending
        gaps.sort(key=lambda g: g.opportunity_score, reverse=True)

        logger.info(f"Scored {len(gaps)} gap keywords by opportunity")
        return gaps

    def analyze(self, target_url: str, competitor_urls: list[str]) -> GapAnalysisResult:
        """
        Orchestrate full keyword gap analysis:
        1. Fetch organic keywords for target
        2. Fetch organic keywords for each competitor
        3. Identify gaps
        4. Score opportunities
        5. Compile results

        Returns a GapAnalysisResult; top_opportunities holds the 50
        highest-scoring gaps, all_gaps the full scored list.
        """
        target_domain = extract_domain(target_url)
        competitor_domains = [extract_domain(url) for url in competitor_urls]

        logger.info(
            f"Starting gap analysis: {target_domain} vs {', '.join(competitor_domains)}"
        )

        # Step 1: Fetch target keywords
        target_keywords = self.get_organic_keywords(target_domain)

        # Step 2: Fetch competitor keywords
        competitor_keyword_sets: dict[str, list[OrganicKeyword]] = {}
        competitor_keyword_counts: dict[str, int] = {}

        for comp_domain in competitor_domains:
            comp_keywords = self.get_organic_keywords(comp_domain)
            competitor_keyword_sets[comp_domain] = comp_keywords
            competitor_keyword_counts[comp_domain] = len(comp_keywords)

        # Step 3: Find gaps
        gaps = self.find_gaps(target_keywords, competitor_keyword_sets)

        # Step 4: Score opportunities
        scored_gaps = self.score_opportunities(gaps)

        # Step 5: Calculate intent distribution
        gaps_by_intent: dict[str, int] = {}
        for gap in scored_gaps:
            gaps_by_intent[gap.intent] = gaps_by_intent.get(gap.intent, 0) + 1

        # Step 6: Compile result
        result = GapAnalysisResult(
            target=target_domain,
            competitors=competitor_domains,
            country=self.country,
            total_gaps=len(scored_gaps),
            total_opportunity_volume=sum(g.volume for g in scored_gaps),
            gaps_by_intent=gaps_by_intent,
            top_opportunities=scored_gaps[:50],
            all_gaps=scored_gaps,
            target_keyword_count=len(target_keywords),
            competitor_keyword_counts=competitor_keyword_counts,
            timestamp=datetime.now().isoformat(),
        )

        logger.info(
            f"Gap analysis complete: {result.total_gaps} gaps found, "
            f"total opportunity volume {result.total_opportunity_volume:,}"
        )
        return result
|
|
|
# ---------------------------------------------------------------------------
# Plain-text report formatter
# ---------------------------------------------------------------------------


def format_text_report(result: "GapAnalysisResult") -> str:
    """Format a gap analysis result as a human-readable text report.

    Sections: header, overview counts, intent distribution, top
    opportunities (first 30), and quick wins (KD <= 30, volume >= 100,
    first 20, sorted by volume).

    Args:
        result: Completed gap analysis.

    Returns:
        Multi-line report string (no trailing newline).
    """
    lines: list[str] = []
    lines.append("=" * 75)
    # Plain string: the original f-string had no placeholders (ruff F541).
    lines.append("Keyword Gap Analysis Report")
    lines.append(f"Target: {result.target}")
    lines.append(f"Competitors: {', '.join(result.competitors)}")
    lines.append(f"Country: {result.country.upper()} | Date: {result.timestamp[:10]}")
    lines.append("=" * 75)
    lines.append("")

    # Overview
    lines.append("## Overview")
    lines.append(f" Target keywords: {result.target_keyword_count:,}")
    for comp, count in result.competitor_keyword_counts.items():
        lines.append(f" {comp} keywords: {count:,}")
    lines.append(f" Keyword gaps found: {result.total_gaps:,}")
    lines.append(f" Total opportunity volume: {result.total_opportunity_volume:,}")
    lines.append("")

    # Intent distribution (most common intent first)
    if result.gaps_by_intent:
        lines.append("## Gaps by Intent")
        for intent, count in sorted(result.gaps_by_intent.items(), key=lambda x: x[1], reverse=True):
            pct = (count / result.total_gaps) * 100 if result.total_gaps else 0
            lines.append(f" {intent:<15}: {count:>5} ({pct:.1f}%)")
        lines.append("")

    # Top opportunities
    if result.top_opportunities:
        lines.append("## Top Opportunities (by score)")
        header = f" {'Keyword':<35} {'Vol':>8} {'KD':>6} {'Score':>7} {'Intent':<15} {'Competitors'}"
        lines.append(header)
        lines.append(" " + "-" * 90)

        for gap in result.top_opportunities[:30]:
            # Slicing is a no-op for short values, so no length check needed.
            kw_display = gap.keyword[:33]
            comp_positions = ", ".join(
                f"{d}:#{p}" for d, p in gap.competitor_positions.items()
            )
            comp_display = comp_positions[:30]

            lines.append(
                f" {kw_display:<35} {gap.volume:>8,} {gap.kd:>6.1f} "
                f"{gap.opportunity_score:>7.1f} {gap.intent:<15} {comp_display}"
            )
        lines.append("")

    # Quick wins (low KD, high volume)
    quick_wins = [g for g in result.all_gaps if g.kd <= 30 and g.volume >= 100]
    quick_wins.sort(key=lambda g: g.volume, reverse=True)
    if quick_wins:
        lines.append("## Quick Wins (KD <= 30, Volume >= 100)")
        lines.append(f" {'Keyword':<35} {'Vol':>8} {'KD':>6} {'Intent':<15}")
        lines.append(" " + "-" * 64)
        for gap in quick_wins[:20]:
            kw_display = gap.keyword[:33]
            lines.append(
                f" {kw_display:<35} {gap.volume:>8,} {gap.kd:>6.1f} {gap.intent:<15}"
            )
        lines.append("")

    return "\n".join(lines)
|
|
|
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def main():
    """CLI entry point: parse arguments, run the analysis, emit the report.

    Output goes to --output (file) when given, otherwise stdout; --json
    selects JSON over the plain-text report. Returns 0 (the exit status
    passed to sys.exit by the __main__ guard).
    """
    parser = argparse.ArgumentParser(
        description="Keyword Gap Analyzer - Find competitor keyword opportunities",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python keyword_gap_analyzer.py --target https://example.com --competitor https://comp.com --json
  python keyword_gap_analyzer.py --target example.com --competitor comp1.com --competitor comp2.com --min-volume 100 --json
  python keyword_gap_analyzer.py --target example.com --competitor comp.com --country us --output gaps.json
""",
    )
    parser.add_argument(
        "--target",
        required=True,
        help="Target website URL or domain",
    )
    # Repeatable flag: argparse appends each occurrence into args.competitors.
    parser.add_argument(
        "--competitor",
        action="append",
        required=True,
        dest="competitors",
        help="Competitor URL or domain (can be repeated)",
    )
    parser.add_argument(
        "--country",
        default="kr",
        help="Target country code (default: kr)",
    )
    parser.add_argument(
        "--min-volume",
        type=int,
        default=0,
        help="Minimum search volume filter (default: 0)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        dest="output_json",
        help="Output results as JSON",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help="Write output to file (path)",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose/debug logging",
    )

    args = parser.parse_args()

    if args.verbose:
        # Raise the root logger so logger.debug() output becomes visible.
        logging.getLogger().setLevel(logging.DEBUG)

    # Run analysis
    analyzer = KeywordGapAnalyzer(
        country=args.country,
        min_volume=args.min_volume,
    )
    result = analyzer.analyze(args.target, args.competitors)

    # Format output: machine-readable JSON or the plain-text report.
    if args.output_json:
        output = json.dumps(result.to_dict(), ensure_ascii=False, indent=2)
    else:
        output = format_text_report(result)

    # Write or print
    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(output)
        logger.info(f"Output written to: {args.output}")
    else:
        print(output)

    return 0
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())