# Source listing metadata (from repository browser):
#   our-claude-skills/custom-skills/20-seo-serp-analysis/code/scripts/naver_serp_analyzer.py
#   694 lines, 21 KiB, Python
"""
Naver SERP Analyzer - Naver search result composition analysis
==============================================================
Purpose: Analyze Naver SERP section distribution, content type mapping,
brand zone detection, and section priority analysis.
Python: 3.10+
Usage:
python naver_serp_analyzer.py --keyword "치과 임플란트" --json
python naver_serp_analyzer.py --keywords-file keywords.txt --json
python naver_serp_analyzer.py --keyword "치과 임플란트" --output naver_report.json
"""
import argparse
import json
import logging
import re
import sys
from dataclasses import asdict, dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any
import requests
from bs4 import BeautifulSoup
from rich.console import Console
from rich.table import Table
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Configure root logging once at import time: timestamped INFO-level messages.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)  # module-level logger for CLI helpers
console = Console()  # shared rich console for all human-readable output
# ---------------------------------------------------------------------------
# Constants - Naver SERP Section Identifiers
# ---------------------------------------------------------------------------
# CSS class / id patterns used to detect Naver SERP sections
# CSS class / id fragments used to detect Naver SERP sections.
# Each key is a canonical section type; each value lists class-name fragments
# that are substring-matched (case-insensitively) against an element's
# space-joined class list and id by NaverSerpAnalyzer.detect_sections(),
# and against the raw HTML in the text-based fallback.
# NOTE(review): dotted entries such as "sc_new.sp_blog" are compared as
# literal substrings against a space-joined class string ("sc_new sp_blog"),
# so they never match on their own — the plain "sp_XXX" fragments in the
# same list cover those cases; confirm whether the dotted forms are intended
# as compound selectors.
NAVER_SECTION_SELECTORS: dict[str, list[str]] = {
    "blog": [
        "sp_blog",
        "blog_widget",
        "sc_new.sp_blog",
        "api_subject_blog",
        "type_blog",
        "blog_exact",
    ],
    "cafe": [
        "sp_cafe",
        "cafe_widget",
        "sc_new.sp_cafe",
        "api_subject_cafe",
        "type_cafe",
    ],
    "knowledge_in": [
        "sp_kin",
        "kin_widget",
        "sc_new.sp_kin",
        "api_subject_kin",
        "type_kin",
        "nx_kin",
    ],
    "smart_store": [
        "sp_nshop",
        "shopping_widget",
        "sc_new.sp_nshop",
        "api_subject_shopping",
        "type_shopping",
        "smartstore",
    ],
    "brand_zone": [
        "sp_brand",
        "brand_area",
        "brand_zone",
        "type_brand",
        "sc_new.sp_brand",
    ],
    "news": [
        "sp_nnews",
        "news_widget",
        "sc_new.sp_nnews",
        "api_subject_news",
        "type_news",
        "group_news",
    ],
    "encyclopedia": [
        "sp_encyclopedia",
        "sc_new.sp_encyclopedia",
        "api_subject_encyclopedia",
        "type_encyclopedia",
        "nx_encyclopedia",
    ],
    "image": [
        "sp_image",
        "image_widget",
        "sc_new.sp_image",
        "api_subject_image",
        "type_image",
    ],
    "video": [
        "sp_video",
        "video_widget",
        "sc_new.sp_video",
        "api_subject_video",
        "type_video",
    ],
    "place": [
        "sp_local",
        "local_widget",
        "sc_new.sp_local",
        "type_place",
        "place_section",
        "loc_map",
    ],
    # Paid placements (power links etc.); counted separately from organic sections.
    "ad": [
        "sp_nad",
        "sp_tad",
        "ad_section",
        "type_powerlink",
        "type_ad",
        "nx_ad",
    ],
    "books": [
        "sp_book",
        "sc_new.sp_book",
        "type_book",
        "api_subject_book",
        "nx_book",
    ],
    "shortform": [
        "sp_shortform",
        "sc_new.sp_shortform",
        "type_shortform",
        "sp_shorts",
        "type_shorts",
    ],
    "influencer": [
        "sp_influencer",
        "sc_new.sp_influencer",
        "type_influencer",
        "api_subject_influencer",
    ],
}
# Section display names in Korean
# Korean display labels for each canonical section type; used by
# NaverSection.__post_init__ and the rich report. Unknown types fall back
# to the raw key at the call sites.
SECTION_DISPLAY_NAMES: dict[str, str] = {
    "blog": "블로그",
    "cafe": "카페",
    "knowledge_in": "지식iN",
    "smart_store": "스마트스토어",
    "brand_zone": "브랜드존",
    "news": "뉴스",
    "encyclopedia": "백과사전",
    "image": "이미지",
    "video": "동영상",
    "place": "플레이스",
    "ad": "광고",
    "books": "도서",
    "shortform": "숏폼",
    "influencer": "인플루언서",
}
# Default headers for Naver requests
# Default headers for Naver requests: a desktop-Chrome User-Agent and Korean
# Accept-Language so Naver serves the standard Korean desktop SERP layout.
NAVER_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
# ---------------------------------------------------------------------------
# Data Classes
# ---------------------------------------------------------------------------
@dataclass
class NaverSection:
    """One detected section (blog, cafe, news, ...) inside a Naver SERP."""

    section_type: str  # canonical key, e.g. "blog", "cafe", "knowledge_in"
    display_name: str = ""  # Korean label; auto-filled from SECTION_DISPLAY_NAMES
    position: int = 0  # 1-based order of appearance on the page
    item_count: int = 0  # number of result items inside the section
    is_above_fold: bool = False  # True when within the first ~3 sections
    has_more_link: bool = False  # section exposes a "more results" link
    raw_html_snippet: str = ""  # short HTML excerpt kept for debugging

    def __post_init__(self):
        # Fill the label lazily; fall back to the raw key for unknown types.
        self.display_name = self.display_name or SECTION_DISPLAY_NAMES.get(
            self.section_type, self.section_type
        )


@dataclass
class NaverSerpResult:
    """Aggregated Naver SERP analysis for a single keyword."""

    keyword: str
    sections: list[NaverSection] = field(default_factory=list)
    section_order: list[str] = field(default_factory=list)
    brand_zone_present: bool = False
    brand_zone_brand: str = ""  # may stay "" even when a brand zone exists
    total_sections: int = 0
    above_fold_sections: list[str] = field(default_factory=list)
    ad_count: int = 0
    dominant_section: str = ""  # section type with the most items (non-ad)
    has_place_section: bool = False
    timestamp: str = ""  # ISO-8601; auto-filled at construction time

    def __post_init__(self):
        # Stamp the analysis time unless the caller supplied one explicitly.
        self.timestamp = self.timestamp or datetime.now().isoformat()
# ---------------------------------------------------------------------------
# Naver SERP Analyzer
# ---------------------------------------------------------------------------
class NaverSerpAnalyzer:
    """Analyzes Naver search result page composition.

    Fetches the public search page for a keyword, detects which SERP
    sections are present (blog, cafe, ads, ...) via known CSS class
    fragments, and summarizes their order, item counts, and brand-zone
    presence into a NaverSerpResult.
    """

    NAVER_SEARCH_URL = "https://search.naver.com/search.naver"

    def __init__(self, timeout: int = 30):
        # timeout: per-request timeout in seconds for page fetches.
        self.timeout = timeout
        self.logger = logging.getLogger(self.__class__.__name__)
        # One shared session so keep-alive and headers carry across requests.
        self.session = requests.Session()
        self.session.headers.update(NAVER_HEADERS)

    # ----- Data Fetching -----
    def fetch_serp(self, keyword: str) -> str:
        """
        Fetch Naver search results HTML for a given keyword.

        Returns the raw HTML string of the search results page, or an
        empty string on any request failure (callers treat "" as "no data").
        """
        self.logger.info(f"Fetching Naver SERP for '{keyword}'")
        params = {
            "where": "nexearch",  # integrated-search vertical
            "sm": "top_hty",
            "fbm": "0",
            "ie": "utf8",
            "query": keyword,
        }
        try:
            response = self.session.get(
                self.NAVER_SEARCH_URL,
                params=params,
                timeout=self.timeout,
            )
            response.raise_for_status()
            self.logger.info(
                f"Fetched {len(response.text):,} bytes "
                f"(status={response.status_code})"
            )
            return response.text
        except requests.RequestException as exc:
            # Network errors are non-fatal: analyze() handles the empty string.
            self.logger.error(f"Failed to fetch Naver SERP: {exc}")
            return ""

    # ----- Section Detection -----
    def detect_sections(self, html: str) -> "list[NaverSection]":
        """
        Identify Naver SERP sections from HTML structure.

        Scans the DOM for container elements whose class/id carries a known
        section marker (see NAVER_SECTION_SELECTORS); falls back to raw-text
        scanning when DOM parsing finds nothing. Each section type is
        recorded at most once, at the position of its first container.
        """
        if not html:
            return []
        soup = BeautifulSoup(html, "lxml")
        sections: list[NaverSection] = []
        position = 0
        # Strategy 1: Naver wraps verticals in <div class="sc_new sp_XXX">
        # and <section> containers; match any element whose classes carry
        # one of the known marker prefixes.
        all_sections = soup.find_all(
            ["div", "section"],
            class_=re.compile(
                r"(sc_new|api_subject|sp_|type_|_widget|group_|nx_)"
            ),
        )
        seen_types: set[str] = set()
        for element in all_sections:
            classes = " ".join(element.get("class", []))
            element_id = element.get("id", "")
            search_text = f"{classes} {element_id}".lower()
            # One element may match several different section types.
            for section_type, selectors in NAVER_SECTION_SELECTORS.items():
                if section_type in seen_types:
                    continue
                if not any(sel.lower() in search_text for sel in selectors):
                    continue
                position += 1
                seen_types.add(section_type)
                item_count = self._count_section_items(element, section_type)
                # "More results" link detection: class-based, then link text.
                has_more = bool(
                    element.find("a", class_=re.compile(r"(more|_more|btn_more)"))
                    or element.find("a", string=re.compile(r"(더보기|전체보기)"))
                )
                # A childless Tag is falsy, hence the explicit guard.
                snippet = str(element)[:200] if element else ""
                sections.append(
                    NaverSection(
                        section_type=section_type,
                        position=position,
                        item_count=item_count,
                        is_above_fold=(position <= 3),
                        has_more_link=has_more,
                        raw_html_snippet=snippet,
                    )
                )
        # Strategy 2: fallback for markup drift — scan raw HTML text.
        if not sections:
            self.logger.warning(
                "No sections found via DOM parsing; "
                "falling back to text pattern matching"
            )
            sections = self._fallback_text_detection(html)
        return sections

    def _count_section_items(self, element: Any, section_type: str) -> int:
        """Count the number of result items within a section element.

        Best-effort: tries common item-container selectors in order, then
        falls back to counting outbound result links. Returns 0 when
        nothing countable is found.
        """
        # NOTE(review): "li" is tried first and matches most list-style
        # sections, so the later selectors rarely fire — confirm the
        # ordering is intentional.
        item_selectors = [
            "li",
            ".api_txt_lines",
            ".total_tit",
            ".detail_box",
            ".item",
            ".lst_total > li",
        ]
        for selector in item_selectors:
            items = element.select(selector)
            if items:  # simplified: a non-empty list is already truthy
                return len(items)
        # Fallback: absolute links that are not Naver's own search links.
        links = element.find_all("a", href=True)
        result_links = [
            a
            for a in links
            if a.get("href", "").startswith("http")
            and "naver.com/search" not in a.get("href", "")
        ]
        return len(result_links)  # len([]) == 0, no conditional needed

    def _fallback_text_detection(self, html: str) -> "list[NaverSection]":
        """Detect sections by scanning raw HTML text for known markers.

        Positions reflect NAVER_SECTION_SELECTORS dictionary order, not
        on-page order — raw text offers no reliable layout information —
        and item counts cannot be derived, so they stay 0.
        """
        sections: list[NaverSection] = []
        position = 0
        html_lower = html.lower()
        for section_type, selectors in NAVER_SECTION_SELECTORS.items():
            if any(sel.lower() in html_lower for sel in selectors):
                position += 1
                sections.append(
                    NaverSection(
                        section_type=section_type,
                        position=position,
                        item_count=0,
                        is_above_fold=(position <= 3),
                    )
                )
        return sections

    # ----- Section Priority Analysis -----
    def analyze_section_priority(
        self, sections: "list[NaverSection]"
    ) -> list[str]:
        """
        Determine above-fold section order.

        Returns the section types flagged above-fold (approximately the
        top 3 sections), ordered by their on-page position.
        """
        ordered = sorted(sections, key=lambda s: s.position)
        return [s.section_type for s in ordered if s.is_above_fold]

    # ----- Brand Zone Detection -----
    def check_brand_zone(self, html: str) -> tuple[bool, str]:
        """
        Detect brand zone presence and extract brand name if available.

        Returns (is_present, brand_name); brand_name may be "" even when a
        brand zone is present but no title element could be located.
        """
        if not html:
            return False, ""
        soup = BeautifulSoup(html, "lxml")
        # Known brand-zone container class fragments.
        brand_selectors = [
            "sp_brand",
            "brand_area",
            "brand_zone",
            "type_brand",
        ]
        for selector in brand_selectors:
            brand_el = soup.find(
                ["div", "section"],
                class_=re.compile(selector, re.IGNORECASE),
            )
            if brand_el:
                # Try to extract the brand name from a title-like child.
                brand_name = ""
                title_el = brand_el.find(
                    ["h2", "h3", "strong", "a"],
                    class_=re.compile(r"(tit|title|name|brand)", re.IGNORECASE),
                )
                if title_el:
                    brand_name = title_el.get_text(strip=True)
                return True, brand_name
        # Text-based fallback when the container uses unexpected markup.
        html_lower = html.lower()  # hoisted: was computed twice
        if "brand_zone" in html_lower or "sp_brand" in html_lower:
            return True, ""
        return False, ""

    # ----- Dominant Section -----
    def _find_dominant_section(self, sections: "list[NaverSection]") -> str:
        """Return the section type with the most items, excluding ads.

        Returns "" when there is no non-ad section; ties resolve to the
        earliest section because max() keeps the first maximum.
        """
        non_ad = [s for s in sections if s.section_type != "ad"]
        if not non_ad:
            return ""
        return max(non_ad, key=lambda s: s.item_count).section_type

    # ----- Main Analysis Orchestrator -----
    def analyze(self, keyword: str) -> "NaverSerpResult":
        """
        Orchestrate full Naver SERP analysis for a single keyword.

        Steps:
        1. Fetch Naver search results page
        2. Detect SERP sections
        3. Analyze section priority
        4. Check brand zone presence
        5. Compile results

        Returns a bare NaverSerpResult (keyword only) when the page could
        not be fetched.
        """
        html = self.fetch_serp(keyword)
        if not html:
            self.logger.error(f"No HTML content for keyword '{keyword}'")
            return NaverSerpResult(keyword=keyword)
        sections = self.detect_sections(html)
        above_fold = self.analyze_section_priority(sections)
        brand_present, brand_name = self.check_brand_zone(html)
        # Full section order by on-page position.
        section_order = [
            s.section_type for s in sorted(sections, key=lambda x: x.position)
        ]
        # sum() over an empty generator is 0, so no ad-section guard needed.
        ad_count = sum(s.item_count for s in sections if s.section_type == "ad")
        has_place = any(s.section_type == "place" for s in sections)
        dominant = self._find_dominant_section(sections)
        return NaverSerpResult(
            keyword=keyword,
            sections=sections,
            section_order=section_order,
            brand_zone_present=brand_present,
            brand_zone_brand=brand_name,
            total_sections=len(sections),
            above_fold_sections=above_fold,
            ad_count=ad_count,
            dominant_section=dominant,
            has_place_section=has_place,
        )
# ---------------------------------------------------------------------------
# Output Helpers
# ---------------------------------------------------------------------------
def result_to_dict(result: NaverSerpResult) -> dict[str, Any]:
    """Convert NaverSerpResult to a JSON-serializable dictionary.

    Strips the raw_html_snippet debug field from every section so the
    serialized output stays compact.
    """
    payload = asdict(result)
    for section_dict in payload.get("sections", []):
        section_dict.pop("raw_html_snippet", None)
    return payload
def print_rich_report(result: NaverSerpResult) -> None:
    """Render a human-readable Naver SERP analysis report via rich."""

    def _yes_no(flag: bool) -> str:
        # Shared Yes/No rendering for boolean metrics.
        return "Yes" if flag else "No"

    console.rule(f"[bold blue]Naver SERP Analysis: {result.keyword}")
    console.print(f"[dim]Timestamp: {result.timestamp}[/dim]")
    console.print()
    # Summary metrics table.
    summary = Table(title="Summary", show_lines=True)
    summary.add_column("Metric", style="cyan")
    summary.add_column("Value", style="green")
    summary.add_row("Total Sections", str(result.total_sections))
    summary.add_row("Ad Count", str(result.ad_count))
    summary.add_row("Brand Zone", _yes_no(result.brand_zone_present))
    if result.brand_zone_brand:
        summary.add_row("Brand Name", result.brand_zone_brand)
    summary.add_row("Place Section", _yes_no(result.has_place_section))
    summary.add_row("Dominant Section", result.dominant_section or "N/A")
    console.print(summary)
    console.print()
    # Per-section details, ordered by on-page position.
    if result.sections:
        details = Table(title="Detected Sections", show_lines=True)
        for header, style in (
            ("#", "bold"),
            ("Section", "cyan"),
            ("Display Name", "magenta"),
            ("Items", "green"),
            ("Above Fold", "yellow"),
            ("More Link", "dim"),
        ):
            details.add_column(header, style=style)
        for sec in sorted(result.sections, key=lambda x: x.position):
            details.add_row(
                str(sec.position),
                sec.section_type,
                sec.display_name,
                str(sec.item_count),
                _yes_no(sec.is_above_fold),
                _yes_no(sec.has_more_link),
            )
        console.print(details)
        console.print()
    # Above-fold ordering, numbered.
    if result.above_fold_sections:
        console.print("[bold]Above-Fold Section Order:[/bold]")
        for i, sec in enumerate(result.above_fold_sections, 1):
            display = SECTION_DISPLAY_NAMES.get(sec, sec)
            console.print(f" {i}. {display} ({sec})")
        console.print()
    # Full section order as one arrow-joined line.
    if result.section_order:
        console.print("[bold]Full Section Order:[/bold]")
        order_str = " -> ".join(
            SECTION_DISPLAY_NAMES.get(s, s) for s in result.section_order
        )
        console.print(f" {order_str}")
    console.rule()
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser: one keyword source (flag or file), JSON output options."""
    parser = argparse.ArgumentParser(
        description="Naver SERP composition analysis",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python naver_serp_analyzer.py --keyword "치과 임플란트" --json
python naver_serp_analyzer.py --keywords-file keywords.txt --json
python naver_serp_analyzer.py --keyword "치과 임플란트" --output report.json
""",
    )
    # Exactly one keyword source must be supplied.
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument(
        "--keyword",
        type=str,
        help="Single keyword to analyze",
    )
    source.add_argument(
        "--keywords-file",
        type=str,
        help="Path to file with one keyword per line",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        dest="json_output",
        help="Output results as JSON",
    )
    parser.add_argument(
        "--output",
        type=str,
        help="Write JSON results to file",
    )
    return parser
def load_keywords(filepath: str) -> list[str]:
    """Read keywords from a text file, one per line.

    Blank lines and lines starting with "#" are skipped. Exits the
    process with status 1 when the file does not exist.
    """
    path = Path(filepath)
    if not path.exists():
        logger.error(f"Keywords file not found: {filepath}")
        sys.exit(1)
    with open(path, "r", encoding="utf-8") as fh:
        keywords = [
            stripped
            for line in fh
            if (stripped := line.strip()) and not stripped.startswith("#")
        ]
    logger.info(f"Loaded {len(keywords)} keywords from {filepath}")
    return keywords
def main() -> None:
    """CLI entry point: parse arguments, analyze each keyword, emit reports.

    With --json and/or --output the results are collected as dicts and
    serialized; otherwise a rich report is printed per keyword.
    """
    parser = build_parser()
    args = parser.parse_args()
    analyzer = NaverSerpAnalyzer()
    # Collect keywords from either --keyword or --keywords-file.
    if args.keyword:
        keywords = [args.keyword]
    else:
        keywords = load_keywords(args.keywords_file)
    if not keywords:
        logger.error("No keywords to analyze")
        sys.exit(1)
    results: list[dict[str, Any]] = []
    machine_output = bool(args.json_output or args.output)
    for kw in keywords:
        console.print(f"\n[bold]Analyzing Naver SERP:[/bold] {kw}")
        result = analyzer.analyze(kw)
        if machine_output:
            results.append(result_to_dict(result))
        else:
            print_rich_report(result)
    if machine_output:
        # Single keyword -> bare object; multiple -> list.
        # Computed once for both sinks (was duplicated for --json and --output).
        output_data = results[0] if len(results) == 1 else results
        if args.json_output:
            print(json.dumps(output_data, ensure_ascii=False, indent=2))
        if args.output:
            output_path = Path(args.output)
            with open(output_path, "w", encoding="utf-8") as fh:
                json.dump(output_data, fh, ensure_ascii=False, indent=2)
            logger.info(f"Results written to {output_path}")


if __name__ == "__main__":
    main()