Add SEO skills 19-28, 31-32 with full Python implementations
12 new skills: Keyword Strategy, SERP Analysis, Position Tracking, Link Building, Content Strategy, E-Commerce SEO, KPI Framework, International SEO, AI Visibility, Knowledge Graph, Competitor Intel, and Crawl Budget. ~20K lines of Python across 25 domain scripts. Updated skill 11 pipeline table and repo CLAUDE.md. Enhanced skill 18 local SEO workflow from jamie.clinic audit. Note: Skill 26 hreflang_validator.py pending (content filter block). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
207
custom-skills/20-seo-serp-analysis/code/scripts/base_client.py
Normal file
207
custom-skills/20-seo-serp-analysis/code/scripts/base_client.py
Normal file
@@ -0,0 +1,207 @@
|
||||
"""
|
||||
Base Client - Shared async client utilities
|
||||
===========================================
|
||||
Purpose: Rate-limited async operations for API clients
|
||||
Python: 3.10+
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
from asyncio import Semaphore
|
||||
from datetime import datetime
|
||||
from typing import Any, Callable, TypeVar
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from tenacity import (
|
||||
retry,
|
||||
stop_after_attempt,
|
||||
wait_exponential,
|
||||
retry_if_exception_type,
|
||||
)
|
||||
|
||||
# Load environment variables
# Read .env at import time so os.getenv() below sees file-based settings.
load_dotenv()

# Logging setup
# Configure the root logger once: timestamp + level on every record.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Generic type variable for typed helpers.
# NOTE(review): T is not referenced anywhere in this module — confirm
# importers use it, or remove it.
T = TypeVar("T")
|
||||
|
||||
|
||||
class RateLimiter:
|
||||
"""Rate limiter using token bucket algorithm."""
|
||||
|
||||
def __init__(self, rate: float, per: float = 1.0):
|
||||
"""
|
||||
Initialize rate limiter.
|
||||
|
||||
Args:
|
||||
rate: Number of requests allowed
|
||||
per: Time period in seconds (default: 1 second)
|
||||
"""
|
||||
self.rate = rate
|
||||
self.per = per
|
||||
self.tokens = rate
|
||||
self.last_update = datetime.now()
|
||||
self._lock = asyncio.Lock()
|
||||
|
||||
async def acquire(self) -> None:
|
||||
"""Acquire a token, waiting if necessary."""
|
||||
async with self._lock:
|
||||
now = datetime.now()
|
||||
elapsed = (now - self.last_update).total_seconds()
|
||||
self.tokens = min(self.rate, self.tokens + elapsed * (self.rate / self.per))
|
||||
self.last_update = now
|
||||
|
||||
if self.tokens < 1:
|
||||
wait_time = (1 - self.tokens) * (self.per / self.rate)
|
||||
await asyncio.sleep(wait_time)
|
||||
self.tokens = 0
|
||||
else:
|
||||
self.tokens -= 1
|
||||
|
||||
|
||||
def _count_retry(retry_state: Any) -> None:
    """tenacity ``before_sleep`` hook: record one retry on the owning client.

    The decorated coroutine is a method, so ``retry_state.args[0]`` is the
    ``BaseAsyncClient`` instance whose stats dict we update.
    """
    retry_state.args[0].stats["retries"] += 1


class BaseAsyncClient:
    """Base class for async API clients with rate limiting.

    Combines a concurrency cap (Semaphore), a token-bucket rate limiter,
    and tenacity-driven retries, while tracking per-client request stats.
    """

    def __init__(
        self,
        max_concurrent: int = 5,
        requests_per_second: float = 3.0,
        logger: logging.Logger | None = None,
    ):
        """
        Initialize base client.

        Args:
            max_concurrent: Maximum concurrent requests
            requests_per_second: Rate limit
            logger: Logger instance (defaults to one named after the class)
        """
        self.semaphore = Semaphore(max_concurrent)
        self.rate_limiter = RateLimiter(requests_per_second)
        self.logger = logger or logging.getLogger(self.__class__.__name__)
        # "requests" counts attempts (retries included); "retries" is
        # incremented by the tenacity before_sleep hook above.
        self.stats = {
            "requests": 0,
            "success": 0,
            "errors": 0,
            "retries": 0,
        }

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        # NOTE(review): retrying on bare Exception also retries programming
        # errors (TypeError, KeyError, ...); consider narrowing this to
        # transport-level exceptions if callers can tolerate it.
        retry=retry_if_exception_type(Exception),
        # Fix: the "retries" stat was initialized but never incremented.
        before_sleep=_count_retry,
    )
    async def _rate_limited_request(
        self,
        coro: Callable[[], Any],
    ) -> Any:
        """Execute a request with rate limiting and retry.

        Args:
            coro: Zero-argument callable returning an awaitable.

        Returns:
            Whatever the awaited request returns.

        Raises:
            Exception: re-raised after stats/logging so tenacity can retry.
        """
        async with self.semaphore:
            await self.rate_limiter.acquire()
            self.stats["requests"] += 1
            try:
                result = await coro()
                self.stats["success"] += 1
                return result
            except Exception as e:
                self.stats["errors"] += 1
                self.logger.error(f"Request failed: {e}")
                raise

    async def batch_requests(
        self,
        requests: list[Callable[[], Any]],
        desc: str = "Processing",
    ) -> list[Any]:
        """Execute multiple requests concurrently.

        A request that still fails after all retries is mapped to
        ``{"error": str(e)}`` instead of raising, so one bad request
        cannot sink the whole batch.

        Args:
            requests: Zero-argument callables, each returning an awaitable.
            desc: Progress-bar label (used only when tqdm is installed).

        Returns:
            One result per request. NOTE: the tqdm path yields results in
            completion order, not input order; the gather fallback
            preserves input order.
        """
        # tqdm is an optional nicety — fall back to plain gather.
        try:
            from tqdm.asyncio import tqdm
            has_tqdm = True
        except ImportError:
            has_tqdm = False

        async def execute(req: Callable) -> Any:
            # Shield the batch from individual failures.
            try:
                return await self._rate_limited_request(req)
            except Exception as e:
                return {"error": str(e)}

        tasks = [execute(req) for req in requests]

        if has_tqdm:
            results = []
            for coro in tqdm.as_completed(tasks, total=len(tasks), desc=desc):
                result = await coro
                results.append(result)
            return results
        else:
            return await asyncio.gather(*tasks, return_exceptions=True)

    def print_stats(self) -> None:
        """Log a summary of request statistics."""
        self.logger.info("=" * 40)
        self.logger.info("Request Statistics:")
        self.logger.info(f"  Total Requests: {self.stats['requests']}")
        self.logger.info(f"  Successful: {self.stats['success']}")
        self.logger.info(f"  Errors: {self.stats['errors']}")
        self.logger.info(f"  Retries: {self.stats['retries']}")
        self.logger.info("=" * 40)
|
||||
|
||||
|
||||
class ConfigManager:
|
||||
"""Manage API configuration and credentials."""
|
||||
|
||||
def __init__(self):
|
||||
load_dotenv()
|
||||
|
||||
@property
|
||||
def google_credentials_path(self) -> str | None:
|
||||
"""Get Google service account credentials path."""
|
||||
# Prefer SEO-specific credentials, fallback to general credentials
|
||||
seo_creds = os.path.expanduser("~/.credential/ourdigital-seo-agent.json")
|
||||
if os.path.exists(seo_creds):
|
||||
return seo_creds
|
||||
return os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
|
||||
|
||||
@property
|
||||
def pagespeed_api_key(self) -> str | None:
|
||||
"""Get PageSpeed Insights API key."""
|
||||
return os.getenv("PAGESPEED_API_KEY")
|
||||
|
||||
@property
|
||||
def custom_search_api_key(self) -> str | None:
|
||||
"""Get Custom Search API key."""
|
||||
return os.getenv("CUSTOM_SEARCH_API_KEY")
|
||||
|
||||
@property
|
||||
def custom_search_engine_id(self) -> str | None:
|
||||
"""Get Custom Search Engine ID."""
|
||||
return os.getenv("CUSTOM_SEARCH_ENGINE_ID")
|
||||
|
||||
@property
|
||||
def notion_token(self) -> str | None:
|
||||
"""Get Notion API token."""
|
||||
return os.getenv("NOTION_TOKEN") or os.getenv("NOTION_API_KEY")
|
||||
|
||||
def validate_google_credentials(self) -> bool:
|
||||
"""Validate Google credentials are configured."""
|
||||
creds_path = self.google_credentials_path
|
||||
if not creds_path:
|
||||
return False
|
||||
return os.path.exists(creds_path)
|
||||
|
||||
def get_required(self, key: str) -> str:
|
||||
"""Get required environment variable or raise error."""
|
||||
value = os.getenv(key)
|
||||
if not value:
|
||||
raise ValueError(f"Missing required environment variable: {key}")
|
||||
return value
|
||||
|
||||
|
||||
# Singleton config instance
# Shared module-level instance: importers should use `config` rather than
# constructing their own ConfigManager.
config = ConfigManager()
|
||||
@@ -0,0 +1,682 @@
|
||||
"""
|
||||
Naver SERP Analyzer - Naver search result composition analysis
|
||||
==============================================================
|
||||
Purpose: Analyze Naver SERP section distribution, content type mapping,
|
||||
brand zone detection, and VIEW tab content analysis.
|
||||
Python: 3.10+
|
||||
|
||||
Usage:
|
||||
python naver_serp_analyzer.py --keyword "치과 임플란트" --json
|
||||
python naver_serp_analyzer.py --keywords-file keywords.txt --json
|
||||
python naver_serp_analyzer.py --keyword "치과 임플란트" --output naver_report.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Module-wide logging defaults: timestamp + level on every record.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
# Shared rich console used for all human-readable CLI output.
console = Console()
|
||||
|
||||
# ---------------------------------------------------------------------------
# Constants - Naver SERP Section Identifiers
# ---------------------------------------------------------------------------

# CSS class / id patterns used to detect Naver SERP sections.
# Matching is substring-based and case-insensitive (see
# NaverSerpAnalyzer.detect_sections), so these are markers, not strict
# CSS selectors. NOTE(review): derived from observed Naver markup, which
# changes over time — re-verify against live SERPs periodically.
NAVER_SECTION_SELECTORS: dict[str, list[str]] = {
    "blog": [
        "sp_blog",
        "blog_widget",
        "sc_new.sp_blog",
        "api_subject_blog",
        "type_blog",
        "blog_exact",
    ],
    "cafe": [
        "sp_cafe",
        "cafe_widget",
        "sc_new.sp_cafe",
        "api_subject_cafe",
        "type_cafe",
    ],
    "knowledge_in": [
        "sp_kin",
        "kin_widget",
        "sc_new.sp_kin",
        "api_subject_kin",
        "type_kin",
        "nx_kin",
    ],
    "smart_store": [
        "sp_nshop",
        "shopping_widget",
        "sc_new.sp_nshop",
        "api_subject_shopping",
        "type_shopping",
        "smartstore",
    ],
    "brand_zone": [
        "sp_brand",
        "brand_area",
        "brand_zone",
        "type_brand",
        "sc_new.sp_brand",
    ],
    "view_tab": [
        "sp_view",
        "view_widget",
        "sc_new.sp_view",
        "type_view",
        "api_subject_view",
    ],
    "news": [
        "sp_nnews",
        "news_widget",
        "sc_new.sp_nnews",
        "api_subject_news",
        "type_news",
        "group_news",
    ],
    "encyclopedia": [
        "sp_encyclopedia",
        "sc_new.sp_encyclopedia",
        "api_subject_encyclopedia",
        "type_encyclopedia",
        "nx_encyclopedia",
    ],
    "image": [
        "sp_image",
        "image_widget",
        "sc_new.sp_image",
        "api_subject_image",
        "type_image",
    ],
    "video": [
        "sp_video",
        "video_widget",
        "sc_new.sp_video",
        "api_subject_video",
        "type_video",
    ],
    "place": [
        "sp_local",
        "local_widget",
        "sc_new.sp_local",
        "type_place",
        "place_section",
        "loc_map",
    ],
    "ad": [
        "sp_nad",
        "sp_tad",
        "ad_section",
        "type_powerlink",
        "type_ad",
        "nx_ad",
    ],
}

# Section display names in Korean (used for report rendering only).
SECTION_DISPLAY_NAMES: dict[str, str] = {
    "blog": "블로그",
    "cafe": "카페",
    "knowledge_in": "지식iN",
    "smart_store": "스마트스토어",
    "brand_zone": "브랜드존",
    "view_tab": "VIEW",
    "news": "뉴스",
    "encyclopedia": "백과사전",
    "image": "이미지",
    "video": "동영상",
    "place": "플레이스",
    "ad": "광고",
}

# Default headers for Naver requests.
# A desktop-browser User-Agent and Korean Accept-Language are sent so the
# response resembles what a Korean desktop user would see.
NAVER_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data Classes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
class NaverSection:
    """One section detected within a Naver SERP.

    Created by NaverSerpAnalyzer.detect_sections; the human-readable
    label is filled in automatically when the caller omits it.
    """

    section_type: str  # blog, cafe, knowledge_in, smart_store, etc.
    display_name: str = ""
    position: int = 0  # Order of appearance (1-based)
    item_count: int = 0  # Number of items in the section
    is_above_fold: bool = False  # Appears within first ~3 sections
    has_more_link: bool = False  # Section has "more results" link
    raw_html_snippet: str = ""  # Short HTML snippet for debugging

    def __post_init__(self):
        # Keep an explicitly provided label; otherwise derive it from the
        # lookup table, falling back to the raw section key.
        if self.display_name:
            return
        self.display_name = SECTION_DISPLAY_NAMES.get(
            self.section_type, self.section_type
        )
|
||||
|
||||
|
||||
@dataclass
class NaverSerpResult:
    """Aggregated Naver SERP analysis for a single keyword.

    Produced by NaverSerpAnalyzer.analyze; ``timestamp`` defaults to the
    construction time in ISO-8601 format when not supplied.
    """

    keyword: str
    sections: list[NaverSection] = field(default_factory=list)
    section_order: list[str] = field(default_factory=list)
    brand_zone_present: bool = False
    brand_zone_brand: str = ""
    total_sections: int = 0
    above_fold_sections: list[str] = field(default_factory=list)
    ad_count: int = 0
    dominant_section: str = ""
    has_view_tab: bool = False
    has_place_section: bool = False
    timestamp: str = ""

    def __post_init__(self):
        # Stamp the analysis time unless the caller provided one.
        self.timestamp = self.timestamp or datetime.now().isoformat()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Naver SERP Analyzer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class NaverSerpAnalyzer:
    """Analyzes Naver search result page composition.

    Fetches the Naver integrated-search ("nexearch") HTML for a keyword
    and detects which SERP sections appear, in what order, and with how
    many items, using the class/id markers in NAVER_SECTION_SELECTORS.
    """

    # Naver integrated-search endpoint (scraped HTML, not an official API).
    NAVER_SEARCH_URL = "https://search.naver.com/search.naver"

    def __init__(self, timeout: int = 30):
        # Per-request timeout, in seconds.
        self.timeout = timeout
        self.logger = logging.getLogger(self.__class__.__name__)
        # One shared session so headers and connections persist across calls.
        self.session = requests.Session()
        self.session.headers.update(NAVER_HEADERS)

    # ----- Data Fetching -----

    def fetch_serp(self, keyword: str) -> str:
        """
        Fetch Naver search results HTML for a given keyword.

        Returns the raw HTML string of the search results page, or an
        empty string on any network/HTTP error (error is logged, not
        raised — callers treat "" as "no data").
        """
        self.logger.info(f"Fetching Naver SERP for '{keyword}'")

        # Query parameters mimic a normal integrated-search request.
        params = {
            "where": "nexearch",
            "sm": "top_hty",
            "fbm": "0",
            "ie": "utf8",
            "query": keyword,
        }

        try:
            response = self.session.get(
                self.NAVER_SEARCH_URL,
                params=params,
                timeout=self.timeout,
            )
            response.raise_for_status()
            self.logger.info(
                f"Fetched {len(response.text):,} bytes "
                f"(status={response.status_code})"
            )
            return response.text

        except requests.RequestException as exc:
            self.logger.error(f"Failed to fetch Naver SERP: {exc}")
            return ""

    # ----- Section Detection -----

    def detect_sections(self, html: str) -> list[NaverSection]:
        """
        Identify Naver SERP sections from HTML structure.

        Scans the HTML for known CSS class names and IDs that correspond
        to Naver's SERP section types. Each section type is recorded at
        most once, at its first occurrence; if DOM parsing finds nothing,
        a raw-text scan is used as a fallback.
        """
        if not html:
            return []

        soup = BeautifulSoup(html, "lxml")
        sections: list[NaverSection] = []
        position = 0

        # Strategy 1: Look for section containers with known class names
        # Naver uses <div class="sc_new sp_XXX"> and <section> elements
        all_sections = soup.find_all(
            ["div", "section"],
            class_=re.compile(
                r"(sc_new|api_subject|sp_|type_|_widget|group_|nx_)"
            ),
        )

        # Each section type is kept only once (first occurrence wins).
        seen_types: set[str] = set()

        for element in all_sections:
            # Build one lowercase haystack from class list + id for
            # substring matching against the selector markers.
            classes = " ".join(element.get("class", []))
            element_id = element.get("id", "")
            search_text = f"{classes} {element_id}".lower()

            for section_type, selectors in NAVER_SECTION_SELECTORS.items():
                if section_type in seen_types:
                    continue

                matched = False
                for selector in selectors:
                    if selector.lower() in search_text:
                        matched = True
                        break

                if matched:
                    # Position is the order of first appearance (1-based).
                    position += 1
                    seen_types.add(section_type)

                    # Count items within the section
                    item_count = self._count_section_items(element, section_type)

                    # Check for "more" link (class-based or Korean link text)
                    has_more = bool(
                        element.find("a", class_=re.compile(r"(more|_more|btn_more)"))
                        or element.find("a", string=re.compile(r"(더보기|전체보기)"))
                    )

                    # Get short HTML snippet for debugging
                    snippet = str(element)[:200] if element else ""

                    section = NaverSection(
                        section_type=section_type,
                        position=position,
                        item_count=item_count,
                        # "Above the fold" is approximated as the first
                        # three detected sections.
                        is_above_fold=(position <= 3),
                        has_more_link=has_more,
                        raw_html_snippet=snippet,
                    )
                    sections.append(section)

        # Strategy 2: Fallback - scan entire HTML text for section markers
        if not sections:
            self.logger.warning(
                "No sections found via DOM parsing; "
                "falling back to text pattern matching"
            )
            sections = self._fallback_text_detection(html)

        return sections

    def _count_section_items(self, element: Any, section_type: str) -> int:
        """Count the number of result items within a section element.

        Tries a list of common item-container selectors in order and
        returns the first non-empty count; otherwise falls back to
        counting outbound result links.
        """
        # Common item container patterns
        item_selectors = [
            "li",
            ".api_txt_lines",
            ".total_tit",
            ".detail_box",
            ".item",
            ".lst_total > li",
        ]

        for selector in item_selectors:
            items = element.select(selector)
            if items and len(items) > 0:
                return len(items)

        # Fallback: count links that look like results
        # (absolute URLs that are not Naver's own search pages).
        links = element.find_all("a", href=True)
        result_links = [
            a
            for a in links
            if a.get("href", "").startswith("http")
            and "naver.com/search" not in a.get("href", "")
        ]
        return len(result_links) if result_links else 0

    def _fallback_text_detection(self, html: str) -> list[NaverSection]:
        """Detect sections by scanning raw HTML text for known markers.

        Positions reflect dictionary iteration order, not page order, and
        item counts are unknown (0) in this mode.
        """
        sections: list[NaverSection] = []
        position = 0
        html_lower = html.lower()

        for section_type, selectors in NAVER_SECTION_SELECTORS.items():
            for selector in selectors:
                if selector.lower() in html_lower:
                    position += 1
                    sections.append(
                        NaverSection(
                            section_type=section_type,
                            position=position,
                            item_count=0,
                            is_above_fold=(position <= 3),
                        )
                    )
                    # One hit per section type is enough.
                    break

        return sections

    # ----- Section Priority Analysis -----

    def analyze_section_priority(
        self, sections: list[NaverSection]
    ) -> list[str]:
        """
        Determine above-fold section order.

        Returns ordered list of section types that appear in the first
        visible area of the SERP (approximately top 3 sections).
        """
        sorted_sections = sorted(sections, key=lambda s: s.position)
        above_fold = [s.section_type for s in sorted_sections if s.is_above_fold]
        return above_fold

    # ----- Brand Zone Detection -----

    def check_brand_zone(self, html: str) -> tuple[bool, str]:
        """
        Detect brand zone presence and extract brand name if available.

        Returns (is_present, brand_name); brand_name is "" when the zone
        exists but no title element could be located.
        """
        if not html:
            return False, ""

        soup = BeautifulSoup(html, "lxml")

        # Look for brand zone container
        brand_selectors = [
            "sp_brand",
            "brand_area",
            "brand_zone",
            "type_brand",
        ]

        for selector in brand_selectors:
            brand_el = soup.find(
                ["div", "section"],
                class_=re.compile(selector, re.IGNORECASE),
            )
            if brand_el:
                # Try to extract brand name from the section
                brand_name = ""
                title_el = brand_el.find(
                    ["h2", "h3", "strong", "a"],
                    class_=re.compile(r"(tit|title|name|brand)", re.IGNORECASE),
                )
                if title_el:
                    brand_name = title_el.get_text(strip=True)

                return True, brand_name

        # Text-based fallback (marker present but not found as a class)
        if "brand_zone" in html.lower() or "sp_brand" in html.lower():
            return True, ""

        return False, ""

    # ----- Dominant Section -----

    def _find_dominant_section(self, sections: list[NaverSection]) -> str:
        """Find the section with the most items (excluding ads)."""
        non_ad = [s for s in sections if s.section_type != "ad"]
        if not non_ad:
            return ""
        return max(non_ad, key=lambda s: s.item_count).section_type

    # ----- Main Analysis Orchestrator -----

    def analyze(self, keyword: str) -> NaverSerpResult:
        """
        Orchestrate full Naver SERP analysis for a single keyword.

        Steps:
            1. Fetch Naver search results page
            2. Detect SERP sections
            3. Analyze section priority
            4. Check brand zone presence
            5. Compile results

        Returns a mostly-empty NaverSerpResult when fetching fails.
        """
        html = self.fetch_serp(keyword)

        if not html:
            self.logger.error(f"No HTML content for keyword '{keyword}'")
            return NaverSerpResult(keyword=keyword)

        sections = self.detect_sections(html)
        above_fold = self.analyze_section_priority(sections)
        brand_present, brand_name = self.check_brand_zone(html)

        # Build section order
        section_order = [s.section_type for s in sorted(sections, key=lambda x: x.position)]

        # Count ads
        # NOTE(review): detect_sections keeps only the FIRST occurrence of
        # each section type, so items in any additional ad blocks beyond
        # the first are not included in this count — confirm intended.
        ad_sections = [s for s in sections if s.section_type == "ad"]
        ad_count = sum(s.item_count for s in ad_sections) if ad_sections else 0

        # Check special sections
        has_view = any(s.section_type == "view_tab" for s in sections)
        has_place = any(s.section_type == "place" for s in sections)
        dominant = self._find_dominant_section(sections)

        result = NaverSerpResult(
            keyword=keyword,
            sections=sections,
            section_order=section_order,
            brand_zone_present=brand_present,
            brand_zone_brand=brand_name,
            total_sections=len(sections),
            above_fold_sections=above_fold,
            ad_count=ad_count,
            dominant_section=dominant,
            has_view_tab=has_view,
            has_place_section=has_place,
        )
        return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Output Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def result_to_dict(result: NaverSerpResult) -> dict[str, Any]:
    """Serialize a NaverSerpResult into a plain, JSON-ready dict.

    Raw HTML snippets are stripped from each section entry so the JSON
    output stays compact and readable.
    """
    payload = asdict(result)
    for entry in payload.get("sections", []):
        entry.pop("raw_html_snippet", None)
    return payload
|
||||
|
||||
|
||||
def print_rich_report(result: NaverSerpResult) -> None:
    """Render a human-readable Naver SERP report to the console via rich."""
    console.rule(f"[bold blue]Naver SERP Analysis: {result.keyword}")
    console.print(f"[dim]Timestamp: {result.timestamp}[/dim]")
    console.print()

    # --- Summary table ---
    summary_rows = [
        ("Total Sections", str(result.total_sections)),
        ("Ad Count", str(result.ad_count)),
        ("Brand Zone", "Yes" if result.brand_zone_present else "No"),
    ]
    if result.brand_zone_brand:
        summary_rows.append(("Brand Name", result.brand_zone_brand))
    summary_rows += [
        ("VIEW Tab", "Yes" if result.has_view_tab else "No"),
        ("Place Section", "Yes" if result.has_place_section else "No"),
        ("Dominant Section", result.dominant_section or "N/A"),
    ]

    summary_table = Table(title="Summary", show_lines=True)
    summary_table.add_column("Metric", style="cyan")
    summary_table.add_column("Value", style="green")
    for metric, value in summary_rows:
        summary_table.add_row(metric, value)
    console.print(summary_table)
    console.print()

    # --- Per-section detail table ---
    if result.sections:
        section_table = Table(title="Detected Sections", show_lines=True)
        for header, style in (
            ("#", "bold"),
            ("Section", "cyan"),
            ("Display Name", "magenta"),
            ("Items", "green"),
            ("Above Fold", "yellow"),
            ("More Link", "dim"),
        ):
            section_table.add_column(header, style=style)

        for sec in sorted(result.sections, key=lambda x: x.position):
            section_table.add_row(
                str(sec.position),
                sec.section_type,
                sec.display_name,
                str(sec.item_count),
                "Yes" if sec.is_above_fold else "No",
                "Yes" if sec.has_more_link else "No",
            )
        console.print(section_table)
        console.print()

    # --- Above-fold ordering ---
    if result.above_fold_sections:
        console.print("[bold]Above-Fold Section Order:[/bold]")
        for idx, sec_type in enumerate(result.above_fold_sections, 1):
            label = SECTION_DISPLAY_NAMES.get(sec_type, sec_type)
            console.print(f"  {idx}. {label} ({sec_type})")
        console.print()

    # --- Full ordering ---
    if result.section_order:
        console.print("[bold]Full Section Order:[/bold]")
        order_str = " -> ".join(
            SECTION_DISPLAY_NAMES.get(s, s) for s in result.section_order
        )
        console.print(f"  {order_str}")

    console.rule()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser: exactly one keyword source, plus output options."""
    parser = argparse.ArgumentParser(
        description="Naver SERP composition analysis",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    python naver_serp_analyzer.py --keyword "치과 임플란트" --json
    python naver_serp_analyzer.py --keywords-file keywords.txt --json
    python naver_serp_analyzer.py --keyword "치과 임플란트" --output report.json
    """,
    )

    # Either a single keyword or a keywords file is required, never both.
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument(
        "--keyword",
        type=str,
        help="Single keyword to analyze",
    )
    source.add_argument(
        "--keywords-file",
        type=str,
        help="Path to file with one keyword per line",
    )

    parser.add_argument(
        "--json",
        action="store_true",
        dest="json_output",
        help="Output results as JSON",
    )
    parser.add_argument(
        "--output",
        type=str,
        help="Write JSON results to file",
    )
    return parser
|
||||
|
||||
|
||||
def load_keywords(filepath: str) -> list[str]:
    """Read keywords from a text file, one per line.

    Blank lines and lines starting with '#' are skipped. Exits the
    process when the file does not exist (CLI behavior).
    """
    source = Path(filepath)
    if not source.exists():
        logger.error(f"Keywords file not found: {filepath}")
        sys.exit(1)

    collected: list[str] = []
    with open(source, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            candidate = raw_line.strip()
            # Ignore blanks and comment lines.
            if candidate and not candidate.startswith("#"):
                collected.append(candidate)

    logger.info(f"Loaded {len(collected)} keywords from {filepath}")
    return collected
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: parse arguments, analyze keywords, emit reports."""
    args = build_parser().parse_args()

    # Resolve the keyword list from whichever source was given.
    if args.keyword:
        keywords = [args.keyword]
    else:
        keywords = load_keywords(args.keywords_file)

    if not keywords:
        logger.error("No keywords to analyze")
        sys.exit(1)

    analyzer = NaverSerpAnalyzer()
    # JSON results are collected when either machine output is requested;
    # otherwise each keyword gets an immediate rich report.
    collect_json = args.json_output or args.output
    results: list[dict[str, Any]] = []

    for kw in keywords:
        console.print(f"\n[bold]Analyzing Naver SERP:[/bold] {kw}")
        outcome = analyzer.analyze(kw)
        if collect_json:
            results.append(result_to_dict(outcome))
        else:
            print_rich_report(outcome)

    # A single keyword serializes as one object, several as a list.
    if args.json_output:
        output_data = results[0] if len(results) == 1 else results
        print(json.dumps(output_data, ensure_ascii=False, indent=2))

    if args.output:
        output_data = results[0] if len(results) == 1 else results
        output_path = Path(args.output)
        with open(output_path, "w", encoding="utf-8") as fh:
            json.dump(output_data, fh, ensure_ascii=False, indent=2)
        logger.info(f"Results written to {output_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,9 @@
|
||||
# 20-seo-serp-analysis dependencies
|
||||
requests>=2.31.0
|
||||
aiohttp>=3.9.0
|
||||
beautifulsoup4>=4.12.0
|
||||
lxml>=5.1.0
|
||||
tenacity>=8.2.0
|
||||
tqdm>=4.66.0
|
||||
python-dotenv>=1.0.0
|
||||
rich>=13.7.0
|
||||
891
custom-skills/20-seo-serp-analysis/code/scripts/serp_analyzer.py
Normal file
891
custom-skills/20-seo-serp-analysis/code/scripts/serp_analyzer.py
Normal file
@@ -0,0 +1,891 @@
|
||||
"""
|
||||
SERP Analyzer - Google SERP feature detection and competitor mapping
|
||||
====================================================================
|
||||
Purpose: Analyze Google SERP features, map competitor positions,
|
||||
classify content types, and score SERP opportunities.
|
||||
Python: 3.10+
|
||||
|
||||
Usage:
|
||||
python serp_analyzer.py --keyword "치과 임플란트" --country kr --json
|
||||
python serp_analyzer.py --keywords-file keywords.txt --country kr --json
|
||||
python serp_analyzer.py --keyword "dental implant" --output serp_report.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Module-wide logging defaults: timestamp + level on every record.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
# Shared rich console used for all human-readable CLI output.
console = Console()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data Classes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
class SerpFeatures:
    """Tracks presence and count of Google SERP features.

    Boolean flags record feature presence; ``ads_top`` / ``ads_bottom``
    carry ad counts rather than flags.
    """

    featured_snippet: bool = False
    people_also_ask: bool = False
    local_pack: bool = False
    knowledge_panel: bool = False
    video_carousel: bool = False
    image_pack: bool = False
    site_links: bool = False
    ads_top: int = 0
    ads_bottom: int = 0
    shopping: bool = False

    @property
    def feature_count(self) -> int:
        """Count of boolean features that are present."""
        # Idiomatic truthiness count: sum(bool(...)) replaces the manual
        # accumulator loop; ad counters intentionally excluded.
        return sum(
            bool(f)
            for f in (
                self.featured_snippet,
                self.people_also_ask,
                self.local_pack,
                self.knowledge_panel,
                self.video_carousel,
                self.image_pack,
                self.site_links,
                self.shopping,
            )
        )

    @property
    def has_ads(self) -> bool:
        """True when any top or bottom ads were detected."""
        return self.ads_top > 0 or self.ads_bottom > 0
|
||||
|
||||
|
||||
@dataclass
class CompetitorPosition:
    """A single competitor entry in the SERP."""

    position: int  # rank in the result list (presumably 1-based — confirm against producer)
    url: str  # result URL
    domain: str  # domain portion of the URL
    title: str = ""  # result title text
    content_type: str = "unknown"  # classified type, e.g. blog/product/service/news/video/forum
    is_featured: bool = False  # result occupies the featured snippet
    has_sitelinks: bool = False  # result shows sitelinks
    estimated_traffic_share: float = 0.0  # estimated share of clicks; model/range not defined here — TODO confirm
|
||||
|
||||
|
||||
@dataclass
class SerpResult:
    """Complete SERP analysis result for a keyword."""

    keyword: str
    # Country code used for the SERP lookup (e.g. "us", "kr").
    country: str = "us"
    search_volume: int = 0
    # Ahrefs keyword difficulty score.
    keyword_difficulty: float = 0.0
    # Average cost-per-click (printed as USD in the report).
    cpc: float = 0.0
    serp_features: SerpFeatures = field(default_factory=SerpFeatures)
    competitors: list[CompetitorPosition] = field(default_factory=list)
    # 0-100; higher means better ranking opportunity (see calculate_opportunity_score).
    opportunity_score: int = 0
    # Dominant inferred intent: informational / navigational / commercial /
    # transactional / local (see validate_intent).
    intent_signals: str = "informational"
    # content_type label -> count among the top 10 organic results.
    content_type_distribution: dict[str, int] = field(default_factory=dict)
    # One of "stable", "moderate", "volatile" (see _assess_volatility).
    volatility: str = "stable"
    timestamp: str = ""

    def __post_init__(self) -> None:
        # Stamp creation time when the caller did not supply one.
        if not self.timestamp:
            self.timestamp = datetime.now().isoformat()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Content Type Classifiers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# URL path patterns that hint at content type
|
||||
# Regexes matched with re.search against the lowercased result URL in
# SerpAnalyzer.classify_content_type; the first hit per content type adds +2
# to that type's score.
# NOTE(review): "/news/" appears under both "blog" and "news", so such URLs
# score +2 for each type — confirm whether that overlap is intended.
URL_CONTENT_PATTERNS: dict[str, list[str]] = {
    "blog": [
        r"/blog/",
        r"/post/",
        r"/article/",
        r"/news/",
        r"/magazine/",
        r"/journal/",
        r"/column/",
        r"/story/",
        # Date-style permalink segment, e.g. "2024/03/".
        r"\d{4}/\d{2}/",
    ],
    "product": [
        r"/product/",
        r"/item/",
        r"/shop/",
        r"/store/",
        r"/buy/",
        r"/p/",
        r"/goods/",
        r"/catalog/",
    ],
    "service": [
        r"/service",
        r"/solution",
        r"/treatment",
        r"/procedure",
        r"/pricing",
        r"/consultation",
    ],
    "news": [
        r"/news/",
        r"/press/",
        r"/media/",
        r"/release/",
        # Subdomain prefixes such as news.example.com.
        r"news\.",
        r"press\.",
    ],
    "video": [
        r"youtube\.com/watch",
        r"youtu\.be/",
        r"vimeo\.com/",
        r"/video/",
        r"/watch/",
    ],
    "forum": [
        r"/forum/",
        r"/community/",
        r"/discuss",
        r"/thread/",
        r"/question/",
        r"/answers/",
    ],
    "wiki": [
        r"wikipedia\.org",
        r"/wiki/",
        r"namu\.wiki",
    ],
}
|
||||
|
||||
# Title keywords that hint at content type
|
||||
# Substrings matched against the lowercased result title in
# classify_content_type; each hit adds +1 to that type's score.
# Mixed Korean/English keywords cover both markets.
TITLE_CONTENT_PATTERNS: dict[str, list[str]] = {
    "blog": ["블로그", "후기", "리뷰", "review", "guide", "가이드", "팁", "tips"],
    "product": ["구매", "가격", "buy", "price", "shop", "할인", "sale", "최저가"],
    "service": ["상담", "치료", "진료", "병원", "클리닉", "clinic", "treatment"],
    "news": ["뉴스", "속보", "보도", "news", "기사", "report"],
    "video": ["영상", "동영상", "video", "youtube"],
    # "comparison" has no URL-pattern counterpart; the label feeds the
    # commercial-intent score in validate_intent.
    "comparison": ["비교", "vs", "versus", "compare", "차이", "best"],
}
|
||||
|
||||
# CTR distribution by position (approximate click-through rates)
|
||||
# Approximate organic click-through share by SERP position (the top 10 sum
# to ~0.90). Positions beyond 10 fall back to 0.01 in map_competitors.
CTR_BY_POSITION: dict[int, float] = {
    1: 0.316,
    2: 0.158,
    3: 0.110,
    4: 0.080,
    5: 0.062,
    6: 0.049,
    7: 0.040,
    8: 0.034,
    9: 0.029,
    10: 0.025,
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SERP Analyzer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class SerpAnalyzer:
    """Analyzes Google SERP features, competitor positions, and opportunities."""

    def __init__(self):
        # Class-named logger keeps log lines attributable to this analyzer.
        self.logger = logging.getLogger(type(self).__name__)
|
||||
|
||||
# ----- Data Fetching -----
|
||||
|
||||
def get_serp_data(self, keyword: str, country: str = "us") -> dict[str, Any]:
|
||||
"""
|
||||
Fetch SERP data via Ahrefs serp-overview MCP tool.
|
||||
|
||||
Uses subprocess to invoke the Ahrefs MCP tool. Falls back to a
|
||||
structured placeholder when the MCP tool is unavailable (e.g., in
|
||||
standalone / CI environments).
|
||||
"""
|
||||
self.logger.info(f"Fetching SERP data for '{keyword}' (country={country})")
|
||||
|
||||
try:
|
||||
# Attempt MCP tool call via subprocess
|
||||
cmd = [
|
||||
"claude",
|
||||
"mcp",
|
||||
"call",
|
||||
"ahrefs",
|
||||
"serp-overview",
|
||||
json.dumps({"keyword": keyword, "country": country}),
|
||||
]
|
||||
result = subprocess.run(
|
||||
cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout.strip():
|
||||
data = json.loads(result.stdout)
|
||||
self.logger.info("Successfully fetched SERP data via MCP")
|
||||
return data
|
||||
except (subprocess.TimeoutExpired, FileNotFoundError, json.JSONDecodeError) as exc:
|
||||
self.logger.warning(f"MCP call unavailable ({exc}), using keyword metrics fallback")
|
||||
|
||||
# Fallback: try Ahrefs keywords-explorer-overview
|
||||
try:
|
||||
cmd_kw = [
|
||||
"claude",
|
||||
"mcp",
|
||||
"call",
|
||||
"ahrefs",
|
||||
"keywords-explorer-overview",
|
||||
json.dumps({"keyword": keyword, "country": country}),
|
||||
]
|
||||
result_kw = subprocess.run(
|
||||
cmd_kw,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
)
|
||||
if result_kw.returncode == 0 and result_kw.stdout.strip():
|
||||
data = json.loads(result_kw.stdout)
|
||||
self.logger.info("Fetched keyword overview via MCP")
|
||||
return data
|
||||
except (subprocess.TimeoutExpired, FileNotFoundError, json.JSONDecodeError) as exc:
|
||||
self.logger.warning(f"Keywords-explorer MCP also unavailable ({exc})")
|
||||
|
||||
# Return empty structure when no MCP tools available
|
||||
self.logger.warning(
|
||||
"No MCP data source available. Run inside Claude Desktop "
|
||||
"or provide data via --input flag."
|
||||
)
|
||||
return {
|
||||
"keyword": keyword,
|
||||
"country": country,
|
||||
"serp": [],
|
||||
"serp_features": {},
|
||||
"metrics": {},
|
||||
}
|
||||
|
||||
# ----- Feature Detection -----
|
||||
|
||||
def detect_features(self, serp_data: dict[str, Any]) -> SerpFeatures:
|
||||
"""
|
||||
Identify SERP features from Ahrefs response data.
|
||||
|
||||
Handles both the structured 'serp_features' dict returned by
|
||||
keywords-explorer-overview and the raw SERP items list from
|
||||
serp-overview.
|
||||
"""
|
||||
features = SerpFeatures()
|
||||
|
||||
# -- Method 1: structured serp_features from Ahrefs --
|
||||
sf = serp_data.get("serp_features", {})
|
||||
if isinstance(sf, dict):
|
||||
features.featured_snippet = sf.get("featured_snippet", False)
|
||||
features.people_also_ask = sf.get("people_also_ask", False)
|
||||
features.local_pack = sf.get("local_pack", False)
|
||||
features.knowledge_panel = sf.get("knowledge_panel", False) or sf.get(
|
||||
"knowledge_graph", False
|
||||
)
|
||||
features.video_carousel = sf.get("video", False) or sf.get(
|
||||
"video_carousel", False
|
||||
)
|
||||
features.image_pack = sf.get("image_pack", False) or sf.get(
|
||||
"images", False
|
||||
)
|
||||
features.site_links = sf.get("sitelinks", False) or sf.get(
|
||||
"site_links", False
|
||||
)
|
||||
features.shopping = sf.get("shopping_results", False) or sf.get(
|
||||
"shopping", False
|
||||
)
|
||||
features.ads_top = int(sf.get("ads_top", 0) or 0)
|
||||
features.ads_bottom = int(sf.get("ads_bottom", 0) or 0)
|
||||
|
||||
# -- Method 2: infer from raw SERP items list --
|
||||
serp_items = serp_data.get("serp", [])
|
||||
if isinstance(serp_items, list):
|
||||
for item in serp_items:
|
||||
item_type = str(item.get("type", "")).lower()
|
||||
if "featured_snippet" in item_type or item.get("is_featured"):
|
||||
features.featured_snippet = True
|
||||
if "people_also_ask" in item_type or "paa" in item_type:
|
||||
features.people_also_ask = True
|
||||
if "local" in item_type or "map" in item_type:
|
||||
features.local_pack = True
|
||||
if "knowledge" in item_type:
|
||||
features.knowledge_panel = True
|
||||
if "video" in item_type:
|
||||
features.video_carousel = True
|
||||
if "image" in item_type:
|
||||
features.image_pack = True
|
||||
if item.get("sitelinks"):
|
||||
features.site_links = True
|
||||
if "shopping" in item_type:
|
||||
features.shopping = True
|
||||
if "ad" in item_type:
|
||||
pos = item.get("position", 0)
|
||||
if pos <= 4:
|
||||
features.ads_top += 1
|
||||
else:
|
||||
features.ads_bottom += 1
|
||||
|
||||
return features
|
||||
|
||||
# ----- Competitor Mapping -----
|
||||
|
||||
def map_competitors(self, serp_data: dict[str, Any]) -> list[CompetitorPosition]:
|
||||
"""Extract competitor positions and domains from SERP data."""
|
||||
competitors: list[CompetitorPosition] = []
|
||||
serp_items = serp_data.get("serp", [])
|
||||
|
||||
if not isinstance(serp_items, list):
|
||||
return competitors
|
||||
|
||||
for item in serp_items:
|
||||
url = item.get("url", "")
|
||||
if not url:
|
||||
continue
|
||||
|
||||
# Skip ads for organic mapping
|
||||
item_type = str(item.get("type", "")).lower()
|
||||
if "ad" in item_type:
|
||||
continue
|
||||
|
||||
parsed = urlparse(url)
|
||||
domain = parsed.netloc.replace("www.", "")
|
||||
position = int(item.get("position", len(competitors) + 1))
|
||||
title = item.get("title", "")
|
||||
|
||||
content_type = self.classify_content_type(item)
|
||||
traffic_share = CTR_BY_POSITION.get(position, 0.01)
|
||||
|
||||
comp = CompetitorPosition(
|
||||
position=position,
|
||||
url=url,
|
||||
domain=domain,
|
||||
title=title,
|
||||
content_type=content_type,
|
||||
is_featured=bool(item.get("is_featured")),
|
||||
has_sitelinks=bool(item.get("sitelinks")),
|
||||
estimated_traffic_share=round(traffic_share, 4),
|
||||
)
|
||||
competitors.append(comp)
|
||||
|
||||
# Sort by position
|
||||
competitors.sort(key=lambda c: c.position)
|
||||
return competitors
|
||||
|
||||
# ----- Content Type Classification -----
|
||||
|
||||
def classify_content_type(self, result: dict[str, Any]) -> str:
|
||||
"""
|
||||
Classify a SERP result as blog/product/service/news/video/forum/wiki
|
||||
based on URL patterns and title keywords.
|
||||
"""
|
||||
url = result.get("url", "").lower()
|
||||
title = result.get("title", "").lower()
|
||||
|
||||
scores: dict[str, int] = {}
|
||||
|
||||
# Score from URL patterns
|
||||
for ctype, patterns in URL_CONTENT_PATTERNS.items():
|
||||
for pattern in patterns:
|
||||
if re.search(pattern, url):
|
||||
scores[ctype] = scores.get(ctype, 0) + 2
|
||||
break
|
||||
|
||||
# Score from title patterns
|
||||
for ctype, keywords in TITLE_CONTENT_PATTERNS.items():
|
||||
for kw in keywords:
|
||||
if kw.lower() in title:
|
||||
scores[ctype] = scores.get(ctype, 0) + 1
|
||||
|
||||
if not scores:
|
||||
# Heuristic: if domain is a known authority site
|
||||
parsed = urlparse(url)
|
||||
domain = parsed.netloc.lower()
|
||||
if any(d in domain for d in ["wikipedia", "namu.wiki", "나무위키"]):
|
||||
return "wiki"
|
||||
if any(d in domain for d in ["youtube", "vimeo"]):
|
||||
return "video"
|
||||
if any(d in domain for d in ["naver.com", "tistory.com", "brunch.co.kr"]):
|
||||
return "blog"
|
||||
return "service_page"
|
||||
|
||||
# Return highest scoring type
|
||||
return max(scores, key=scores.get) # type: ignore[arg-type]
|
||||
|
||||
# ----- Opportunity Scoring -----
|
||||
|
||||
def calculate_opportunity_score(
|
||||
self,
|
||||
features: SerpFeatures,
|
||||
positions: list[CompetitorPosition],
|
||||
) -> int:
|
||||
"""
|
||||
Score SERP opportunity from 0-100.
|
||||
|
||||
Higher scores indicate better opportunity to rank or gain features.
|
||||
|
||||
Factors (additive):
|
||||
- Featured snippet available but could be captured +15
|
||||
- PAA present (related question opportunity) +10
|
||||
- No knowledge panel (less SERP real-estate taken) +10
|
||||
- Low ad count (more organic visibility) +10
|
||||
- Few sitelinks in top results +5
|
||||
- Content diversity (various domains in top 10) +10
|
||||
- No video carousel (opportunity to add video) +5
|
||||
- Top results are blogs (easier to outrank) +10
|
||||
- Image pack absent (image SEO opportunity) +5
|
||||
- Shopping absent for commercial keywords +5
|
||||
- Top positions lacking schema/rich results +5
|
||||
|
||||
Penalty factors (subtractive):
|
||||
- Knowledge panel dominates -15
|
||||
- Heavy ad presence (4+ top ads) -10
|
||||
- Single domain dominates top 5 -10
|
||||
"""
|
||||
score = 50 # Base score
|
||||
|
||||
# -- Positive signals --
|
||||
if features.featured_snippet:
|
||||
score += 15
|
||||
if features.people_also_ask:
|
||||
score += 10
|
||||
if not features.knowledge_panel:
|
||||
score += 10
|
||||
if features.ads_top <= 1:
|
||||
score += 10
|
||||
elif features.ads_top <= 2:
|
||||
score += 5
|
||||
if not features.video_carousel:
|
||||
score += 5
|
||||
if not features.image_pack:
|
||||
score += 5
|
||||
if not features.shopping:
|
||||
score += 5
|
||||
|
||||
# Domain diversity in top 10
|
||||
if positions:
|
||||
top10_domains = {p.domain for p in positions[:10]}
|
||||
if len(top10_domains) >= 8:
|
||||
score += 10
|
||||
elif len(top10_domains) >= 5:
|
||||
score += 5
|
||||
|
||||
# Blog-heavy top results (easier to compete)
|
||||
blog_count = sum(
|
||||
1 for p in positions[:5] if p.content_type == "blog"
|
||||
)
|
||||
if blog_count >= 3:
|
||||
score += 10
|
||||
elif blog_count >= 2:
|
||||
score += 5
|
||||
|
||||
# Sitelinks reduce available space
|
||||
sitelink_count = sum(1 for p in positions[:5] if p.has_sitelinks)
|
||||
if sitelink_count <= 1:
|
||||
score += 5
|
||||
|
||||
# Single domain dominance penalty
|
||||
domain_counts: dict[str, int] = {}
|
||||
for p in positions[:5]:
|
||||
domain_counts[p.domain] = domain_counts.get(p.domain, 0) + 1
|
||||
if any(c >= 3 for c in domain_counts.values()):
|
||||
score -= 10
|
||||
|
||||
# -- Negative signals --
|
||||
if features.knowledge_panel:
|
||||
score -= 15
|
||||
if features.ads_top >= 4:
|
||||
score -= 10
|
||||
elif features.ads_top >= 3:
|
||||
score -= 5
|
||||
|
||||
# Clamp to 0-100
|
||||
return max(0, min(100, score))
|
||||
|
||||
# ----- Intent Validation -----
|
||||
|
||||
def validate_intent(
|
||||
self,
|
||||
features: SerpFeatures,
|
||||
positions: list[CompetitorPosition],
|
||||
) -> str:
|
||||
"""
|
||||
Infer search intent from SERP composition.
|
||||
|
||||
Returns one of: informational, navigational, commercial, transactional, local
|
||||
"""
|
||||
signals: dict[str, int] = {
|
||||
"informational": 0,
|
||||
"navigational": 0,
|
||||
"commercial": 0,
|
||||
"transactional": 0,
|
||||
"local": 0,
|
||||
}
|
||||
|
||||
# Feature-based signals
|
||||
if features.featured_snippet:
|
||||
signals["informational"] += 3
|
||||
if features.people_also_ask:
|
||||
signals["informational"] += 2
|
||||
if features.knowledge_panel:
|
||||
signals["informational"] += 2
|
||||
signals["navigational"] += 2
|
||||
if features.local_pack:
|
||||
signals["local"] += 5
|
||||
if features.shopping:
|
||||
signals["transactional"] += 4
|
||||
if features.has_ads:
|
||||
signals["commercial"] += 2
|
||||
signals["transactional"] += 1
|
||||
if features.ads_top >= 3:
|
||||
signals["transactional"] += 2
|
||||
if features.image_pack:
|
||||
signals["informational"] += 1
|
||||
if features.video_carousel:
|
||||
signals["informational"] += 1
|
||||
|
||||
# Content type signals from top results
|
||||
for pos in positions[:10]:
|
||||
ct = pos.content_type
|
||||
if ct == "blog":
|
||||
signals["informational"] += 1
|
||||
elif ct == "product":
|
||||
signals["transactional"] += 2
|
||||
elif ct == "service":
|
||||
signals["commercial"] += 1
|
||||
elif ct == "news":
|
||||
signals["informational"] += 1
|
||||
elif ct == "video":
|
||||
signals["informational"] += 1
|
||||
elif ct == "wiki":
|
||||
signals["informational"] += 2
|
||||
elif ct == "forum":
|
||||
signals["informational"] += 1
|
||||
elif ct == "comparison":
|
||||
signals["commercial"] += 2
|
||||
|
||||
# Navigational: single domain dominates top 3
|
||||
if positions:
|
||||
top3_domains = [p.domain for p in positions[:3]]
|
||||
if len(set(top3_domains)) == 1:
|
||||
signals["navigational"] += 5
|
||||
|
||||
# Return highest signal
|
||||
return max(signals, key=signals.get) # type: ignore[arg-type]
|
||||
|
||||
# ----- Content Type Distribution -----
|
||||
|
||||
def _content_type_distribution(
|
||||
self, positions: list[CompetitorPosition]
|
||||
) -> dict[str, int]:
|
||||
"""Count content types across top organic results."""
|
||||
dist: dict[str, int] = {}
|
||||
for p in positions[:10]:
|
||||
dist[p.content_type] = dist.get(p.content_type, 0) + 1
|
||||
return dict(sorted(dist.items(), key=lambda x: x[1], reverse=True))
|
||||
|
||||
# ----- Volatility Assessment -----
|
||||
|
||||
def _assess_volatility(self, serp_data: dict[str, Any]) -> str:
|
||||
"""
|
||||
Assess SERP volatility based on available signals.
|
||||
|
||||
Returns: stable, moderate, volatile
|
||||
"""
|
||||
# Check if Ahrefs provides a volatility/movement score
|
||||
metrics = serp_data.get("metrics", {})
|
||||
if isinstance(metrics, dict):
|
||||
volatility_score = metrics.get("serp_volatility", None)
|
||||
if volatility_score is not None:
|
||||
if volatility_score < 3:
|
||||
return "stable"
|
||||
elif volatility_score < 7:
|
||||
return "moderate"
|
||||
else:
|
||||
return "volatile"
|
||||
|
||||
# Heuristic: if many results have recent dates, SERP is more volatile
|
||||
serp_items = serp_data.get("serp", [])
|
||||
if isinstance(serp_items, list) and serp_items:
|
||||
recent_count = 0
|
||||
for item in serp_items[:10]:
|
||||
last_seen = item.get("last_seen", "")
|
||||
if last_seen:
|
||||
try:
|
||||
dt = datetime.fromisoformat(last_seen.replace("Z", "+00:00"))
|
||||
if (datetime.now(dt.tzinfo) - dt).days < 30:
|
||||
recent_count += 1
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
if recent_count >= 5:
|
||||
return "volatile"
|
||||
elif recent_count >= 3:
|
||||
return "moderate"
|
||||
|
||||
return "stable"
|
||||
|
||||
# ----- Main Analysis Orchestrator -----
|
||||
|
||||
def analyze(self, keyword: str, country: str = "us") -> SerpResult:
|
||||
"""
|
||||
Orchestrate full SERP analysis for a single keyword.
|
||||
|
||||
Steps:
|
||||
1. Fetch SERP data from Ahrefs MCP
|
||||
2. Detect SERP features
|
||||
3. Map competitor positions
|
||||
4. Classify content types
|
||||
5. Calculate opportunity score
|
||||
6. Validate search intent
|
||||
7. Assess volatility
|
||||
"""
|
||||
serp_data = self.get_serp_data(keyword, country)
|
||||
|
||||
features = self.detect_features(serp_data)
|
||||
positions = self.map_competitors(serp_data)
|
||||
opportunity = self.calculate_opportunity_score(features, positions)
|
||||
intent = self.validate_intent(features, positions)
|
||||
content_dist = self._content_type_distribution(positions)
|
||||
volatility = self._assess_volatility(serp_data)
|
||||
|
||||
# Extract keyword metrics if available
|
||||
metrics = serp_data.get("metrics", {})
|
||||
search_volume = int(metrics.get("search_volume", 0) or 0)
|
||||
keyword_difficulty = float(metrics.get("keyword_difficulty", 0) or 0)
|
||||
cpc = float(metrics.get("cpc", 0) or 0)
|
||||
|
||||
result = SerpResult(
|
||||
keyword=keyword,
|
||||
country=country,
|
||||
search_volume=search_volume,
|
||||
keyword_difficulty=keyword_difficulty,
|
||||
cpc=cpc,
|
||||
serp_features=features,
|
||||
competitors=positions,
|
||||
opportunity_score=opportunity,
|
||||
intent_signals=intent,
|
||||
content_type_distribution=content_dist,
|
||||
volatility=volatility,
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Output Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def result_to_dict(result: SerpResult) -> dict[str, Any]:
    """Convert a SerpResult (recursively, including nested dataclasses) to a dict."""
    return asdict(result)
|
||||
|
||||
|
||||
def print_rich_report(result: SerpResult) -> None:
    """Print a human-readable SERP analysis report to the shared rich console."""
    console.rule(f"[bold blue]SERP Analysis: {result.keyword}")
    console.print(f"[dim]Country: {result.country} | Timestamp: {result.timestamp}[/dim]")
    console.print()

    # Keyword metrics table (only when any metric is populated).
    if result.search_volume or result.keyword_difficulty:
        metrics_table = Table(title="Keyword Metrics", show_lines=True)
        metrics_table.add_column("Metric", style="cyan")
        metrics_table.add_column("Value", style="green")
        for label, value in (
            ("Search Volume", f"{result.search_volume:,}"),
            ("Keyword Difficulty", f"{result.keyword_difficulty:.1f}"),
            ("CPC", f"${result.cpc:.2f}"),
        ):
            metrics_table.add_row(label, value)
        console.print(metrics_table)
        console.print()

    # SERP feature presence.
    feat = result.serp_features
    feat_table = Table(title="SERP Features", show_lines=True)
    feat_table.add_column("Feature", style="cyan")
    feat_table.add_column("Present", style="green")
    for label, value in (
        ("Featured Snippet", _bool_icon(feat.featured_snippet)),
        ("People Also Ask", _bool_icon(feat.people_also_ask)),
        ("Local Pack", _bool_icon(feat.local_pack)),
        ("Knowledge Panel", _bool_icon(feat.knowledge_panel)),
        ("Video Carousel", _bool_icon(feat.video_carousel)),
        ("Image Pack", _bool_icon(feat.image_pack)),
        ("Site Links", _bool_icon(feat.site_links)),
        ("Shopping", _bool_icon(feat.shopping)),
        ("Ads (top)", str(feat.ads_top)),
        ("Ads (bottom)", str(feat.ads_bottom)),
    ):
        feat_table.add_row(label, value)
    console.print(feat_table)
    console.print()

    # Top competitor table (first 10 entries).
    if result.competitors:
        comp_table = Table(title="Top Competitors", show_lines=True)
        comp_table.add_column("#", style="bold")
        comp_table.add_column("Domain", style="cyan")
        comp_table.add_column("Type", style="magenta")
        comp_table.add_column("CTR Share", style="green")
        comp_table.add_column("Featured", style="yellow")
        for c in result.competitors[:10]:
            comp_table.add_row(
                str(c.position),
                c.domain,
                c.content_type,
                f"{c.estimated_traffic_share:.1%}",
                _bool_icon(c.is_featured),
            )
        console.print(comp_table)
        console.print()

    # Content-type distribution across the top 10 results.
    if result.content_type_distribution:
        dist_table = Table(title="Content Type Distribution (Top 10)", show_lines=True)
        dist_table.add_column("Content Type", style="cyan")
        dist_table.add_column("Count", style="green")
        for ctype, count in result.content_type_distribution.items():
            dist_table.add_row(ctype, str(count))
        console.print(dist_table)
        console.print()

    # Summary lines, colored by opportunity band (>=60 green, >=40 yellow).
    if result.opportunity_score >= 60:
        opp_color = "green"
    elif result.opportunity_score >= 40:
        opp_color = "yellow"
    else:
        opp_color = "red"
    console.print(f"Opportunity Score: [{opp_color}]{result.opportunity_score}/100[/{opp_color}]")
    console.print(f"Search Intent: [bold]{result.intent_signals}[/bold]")
    console.print(f"SERP Volatility: [bold]{result.volatility}[/bold]")
    console.rule()
|
||||
|
||||
|
||||
def _bool_icon(val: bool) -> str:
|
||||
"""Return Yes/No string for boolean values."""
|
||||
return "Yes" if val else "No"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser: keyword XOR keywords-file, plus country/output options."""
    parser = argparse.ArgumentParser(
        description="Google SERP feature detection and competitor mapping",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python serp_analyzer.py --keyword "치과 임플란트" --country kr --json
python serp_analyzer.py --keywords-file keywords.txt --country kr --output report.json
""",
    )
    # Exactly one keyword source is required.
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("--keyword", type=str, help="Single keyword to analyze")
    source.add_argument(
        "--keywords-file",
        type=str,
        help="Path to file with one keyword per line",
    )
    parser.add_argument(
        "--country",
        type=str,
        default="us",
        help="Country code for SERP (default: us)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        dest="json_output",
        help="Output results as JSON",
    )
    parser.add_argument("--output", type=str, help="Write JSON results to file")
    return parser
|
||||
|
||||
|
||||
def load_keywords(filepath: str) -> list[str]:
    """
    Load keywords from a text file, one per line.

    Blank lines and '#'-prefixed comment lines are skipped. Exits the
    process with status 1 when the file does not exist.
    """
    path = Path(filepath)
    if not path.exists():
        logger.error(f"Keywords file not found: {filepath}")
        sys.exit(1)
    with open(path, "r", encoding="utf-8") as fh:
        keywords = [
            stripped
            for line in fh
            if (stripped := line.strip()) and not stripped.startswith("#")
        ]
    logger.info(f"Loaded {len(keywords)} keywords from {filepath}")
    return keywords
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: analyze one or more keywords and emit reports."""
    args = build_parser().parse_args()

    analyzer = SerpAnalyzer()

    # Resolve the keyword list from either CLI source.
    keywords = [args.keyword] if args.keyword else load_keywords(args.keywords_file)
    if not keywords:
        logger.error("No keywords to analyze")
        sys.exit(1)

    collect_json = bool(args.json_output or args.output)
    collected: list[dict[str, Any]] = []

    for kw in keywords:
        console.print(f"\n[bold]Analyzing:[/bold] {kw}")
        analysis = analyzer.analyze(kw, args.country)
        if collect_json:
            collected.append(result_to_dict(analysis))
        else:
            print_rich_report(analysis)

    # A single result is emitted unwrapped; multiple results as a list.
    payload = collected[0] if len(collected) == 1 else collected

    if args.json_output:
        print(json.dumps(payload, ensure_ascii=False, indent=2))

    if args.output:
        target = Path(args.output)
        with open(target, "w", encoding="utf-8") as fh:
            json.dump(payload, fh, ensure_ascii=False, indent=2)
        logger.info(f"Results written to {target}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user