"""
|
|
Naver SERP Analyzer - Naver search result composition analysis
|
|
==============================================================
|
|
Purpose: Analyze Naver SERP section distribution, content type mapping,
|
|
brand zone detection, and section priority analysis.
|
|
Python: 3.10+
|
|
|
|
Usage:
|
|
python naver_serp_analyzer.py --keyword "치과 임플란트" --json
|
|
python naver_serp_analyzer.py --keywords-file keywords.txt --json
|
|
python naver_serp_analyzer.py --keyword "치과 임플란트" --output naver_report.json
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import re
|
|
import sys
|
|
from dataclasses import asdict, dataclass, field
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from rich.console import Console
|
|
from rich.table import Table
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Logging
|
|
# ---------------------------------------------------------------------------
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format="%(asctime)s - %(levelname)s - %(message)s",
|
|
)
|
|
logger = logging.getLogger(__name__)
|
|
console = Console()
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Constants - Naver SERP Section Identifiers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# CSS class / id patterns used to detect Naver SERP sections
|
|
NAVER_SECTION_SELECTORS: dict[str, list[str]] = {
|
|
"blog": [
|
|
"sp_blog",
|
|
"blog_widget",
|
|
"sc_new.sp_blog",
|
|
"api_subject_blog",
|
|
"type_blog",
|
|
"blog_exact",
|
|
],
|
|
"cafe": [
|
|
"sp_cafe",
|
|
"cafe_widget",
|
|
"sc_new.sp_cafe",
|
|
"api_subject_cafe",
|
|
"type_cafe",
|
|
],
|
|
"knowledge_in": [
|
|
"sp_kin",
|
|
"kin_widget",
|
|
"sc_new.sp_kin",
|
|
"api_subject_kin",
|
|
"type_kin",
|
|
"nx_kin",
|
|
],
|
|
"smart_store": [
|
|
"sp_nshop",
|
|
"shopping_widget",
|
|
"sc_new.sp_nshop",
|
|
"api_subject_shopping",
|
|
"type_shopping",
|
|
"smartstore",
|
|
],
|
|
"brand_zone": [
|
|
"sp_brand",
|
|
"brand_area",
|
|
"brand_zone",
|
|
"type_brand",
|
|
"sc_new.sp_brand",
|
|
],
|
|
"news": [
|
|
"sp_nnews",
|
|
"news_widget",
|
|
"sc_new.sp_nnews",
|
|
"api_subject_news",
|
|
"type_news",
|
|
"group_news",
|
|
],
|
|
"encyclopedia": [
|
|
"sp_encyclopedia",
|
|
"sc_new.sp_encyclopedia",
|
|
"api_subject_encyclopedia",
|
|
"type_encyclopedia",
|
|
"nx_encyclopedia",
|
|
],
|
|
"image": [
|
|
"sp_image",
|
|
"image_widget",
|
|
"sc_new.sp_image",
|
|
"api_subject_image",
|
|
"type_image",
|
|
],
|
|
"video": [
|
|
"sp_video",
|
|
"video_widget",
|
|
"sc_new.sp_video",
|
|
"api_subject_video",
|
|
"type_video",
|
|
],
|
|
"place": [
|
|
"sp_local",
|
|
"local_widget",
|
|
"sc_new.sp_local",
|
|
"type_place",
|
|
"place_section",
|
|
"loc_map",
|
|
],
|
|
"ad": [
|
|
"sp_nad",
|
|
"sp_tad",
|
|
"ad_section",
|
|
"type_powerlink",
|
|
"type_ad",
|
|
"nx_ad",
|
|
],
|
|
"books": [
|
|
"sp_book",
|
|
"sc_new.sp_book",
|
|
"type_book",
|
|
"api_subject_book",
|
|
"nx_book",
|
|
],
|
|
"shortform": [
|
|
"sp_shortform",
|
|
"sc_new.sp_shortform",
|
|
"type_shortform",
|
|
"sp_shorts",
|
|
"type_shorts",
|
|
],
|
|
"influencer": [
|
|
"sp_influencer",
|
|
"sc_new.sp_influencer",
|
|
"type_influencer",
|
|
"api_subject_influencer",
|
|
],
|
|
}
|
|
|
|
# Section display names in Korean
|
|
SECTION_DISPLAY_NAMES: dict[str, str] = {
|
|
"blog": "블로그",
|
|
"cafe": "카페",
|
|
"knowledge_in": "지식iN",
|
|
"smart_store": "스마트스토어",
|
|
"brand_zone": "브랜드존",
|
|
"news": "뉴스",
|
|
"encyclopedia": "백과사전",
|
|
"image": "이미지",
|
|
"video": "동영상",
|
|
"place": "플레이스",
|
|
"ad": "광고",
|
|
"books": "도서",
|
|
"shortform": "숏폼",
|
|
"influencer": "인플루언서",
|
|
}
|
|
|
|
# Default headers for Naver requests
|
|
NAVER_HEADERS = {
|
|
"User-Agent": (
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
|
"Chrome/120.0.0.0 Safari/537.36"
|
|
),
|
|
"Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
|
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Data Classes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@dataclass
|
|
class NaverSection:
|
|
"""A detected section within Naver SERP."""
|
|
|
|
section_type: str # blog, cafe, knowledge_in, smart_store, etc.
|
|
display_name: str = ""
|
|
position: int = 0 # Order of appearance (1-based)
|
|
item_count: int = 0 # Number of items in the section
|
|
is_above_fold: bool = False # Appears within first ~3 sections
|
|
has_more_link: bool = False # Section has "more results" link
|
|
raw_html_snippet: str = "" # Short HTML snippet for debugging
|
|
|
|
def __post_init__(self):
|
|
if not self.display_name:
|
|
self.display_name = SECTION_DISPLAY_NAMES.get(
|
|
self.section_type, self.section_type
|
|
)
|
|
|
|
|
|
@dataclass
|
|
class NaverSerpResult:
|
|
"""Complete Naver SERP analysis result for a keyword."""
|
|
|
|
keyword: str
|
|
sections: list[NaverSection] = field(default_factory=list)
|
|
section_order: list[str] = field(default_factory=list)
|
|
brand_zone_present: bool = False
|
|
brand_zone_brand: str = ""
|
|
total_sections: int = 0
|
|
above_fold_sections: list[str] = field(default_factory=list)
|
|
ad_count: int = 0
|
|
dominant_section: str = ""
|
|
has_place_section: bool = False
|
|
timestamp: str = ""
|
|
|
|
def __post_init__(self):
|
|
if not self.timestamp:
|
|
self.timestamp = datetime.now().isoformat()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Naver SERP Analyzer
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class NaverSerpAnalyzer:
|
|
"""Analyzes Naver search result page composition."""
|
|
|
|
NAVER_SEARCH_URL = "https://search.naver.com/search.naver"
|
|
|
|
def __init__(self, timeout: int = 30):
|
|
self.timeout = timeout
|
|
self.logger = logging.getLogger(self.__class__.__name__)
|
|
self.session = requests.Session()
|
|
self.session.headers.update(NAVER_HEADERS)
|
|
|
|
# ----- Data Fetching -----
|
|
|
|
def fetch_serp(self, keyword: str) -> str:
|
|
"""
|
|
Fetch Naver search results HTML for a given keyword.
|
|
|
|
Returns the raw HTML string of the search results page.
|
|
"""
|
|
self.logger.info(f"Fetching Naver SERP for '{keyword}'")
|
|
|
|
params = {
|
|
"where": "nexearch",
|
|
"sm": "top_hty",
|
|
"fbm": "0",
|
|
"ie": "utf8",
|
|
"query": keyword,
|
|
}
|
|
|
|
try:
|
|
response = self.session.get(
|
|
self.NAVER_SEARCH_URL,
|
|
params=params,
|
|
timeout=self.timeout,
|
|
)
|
|
response.raise_for_status()
|
|
self.logger.info(
|
|
f"Fetched {len(response.text):,} bytes "
|
|
f"(status={response.status_code})"
|
|
)
|
|
return response.text
|
|
|
|
except requests.RequestException as exc:
|
|
self.logger.error(f"Failed to fetch Naver SERP: {exc}")
|
|
return ""
|
|
|
|
# ----- Section Detection -----
|
|
|
|
def detect_sections(self, html: str) -> list[NaverSection]:
|
|
"""
|
|
Identify Naver SERP sections from HTML structure.
|
|
|
|
Scans the HTML for known CSS class names and IDs that correspond
|
|
to Naver's SERP section types.
|
|
"""
|
|
if not html:
|
|
return []
|
|
|
|
soup = BeautifulSoup(html, "lxml")
|
|
sections: list[NaverSection] = []
|
|
position = 0
|
|
|
|
# Strategy 1: Look for section containers with known class names
|
|
# Naver uses <div class="sc_new sp_XXX"> and <section> elements
|
|
all_sections = soup.find_all(
|
|
["div", "section"],
|
|
class_=re.compile(
|
|
r"(sc_new|api_subject|sp_|type_|_widget|group_|nx_)"
|
|
),
|
|
)
|
|
|
|
seen_types: set[str] = set()
|
|
|
|
for element in all_sections:
|
|
classes = " ".join(element.get("class", []))
|
|
element_id = element.get("id", "")
|
|
search_text = f"{classes} {element_id}".lower()
|
|
|
|
for section_type, selectors in NAVER_SECTION_SELECTORS.items():
|
|
if section_type in seen_types:
|
|
continue
|
|
|
|
matched = False
|
|
for selector in selectors:
|
|
if selector.lower() in search_text:
|
|
matched = True
|
|
break
|
|
|
|
if matched:
|
|
position += 1
|
|
seen_types.add(section_type)
|
|
|
|
# Count items within the section
|
|
item_count = self._count_section_items(element, section_type)
|
|
|
|
# Check for "more" link
|
|
has_more = bool(
|
|
element.find("a", class_=re.compile(r"(more|_more|btn_more)"))
|
|
or element.find("a", string=re.compile(r"(더보기|전체보기)"))
|
|
)
|
|
|
|
# Get short HTML snippet for debugging
|
|
snippet = str(element)[:200] if element else ""
|
|
|
|
section = NaverSection(
|
|
section_type=section_type,
|
|
position=position,
|
|
item_count=item_count,
|
|
is_above_fold=(position <= 3),
|
|
has_more_link=has_more,
|
|
raw_html_snippet=snippet,
|
|
)
|
|
sections.append(section)
|
|
|
|
# Strategy 2: Fallback - scan entire HTML text for section markers
|
|
if not sections:
|
|
self.logger.warning(
|
|
"No sections found via DOM parsing; "
|
|
"falling back to text pattern matching"
|
|
)
|
|
sections = self._fallback_text_detection(html)
|
|
|
|
return sections
|
|
|
|
def _count_section_items(self, element: Any, section_type: str) -> int:
|
|
"""Count the number of result items within a section element."""
|
|
# Common item container patterns
|
|
item_selectors = [
|
|
"li",
|
|
".api_txt_lines",
|
|
".total_tit",
|
|
".detail_box",
|
|
".item",
|
|
".lst_total > li",
|
|
]
|
|
|
|
for selector in item_selectors:
|
|
items = element.select(selector)
|
|
if items and len(items) > 0:
|
|
return len(items)
|
|
|
|
# Fallback: count links that look like results
|
|
links = element.find_all("a", href=True)
|
|
result_links = [
|
|
a
|
|
for a in links
|
|
if a.get("href", "").startswith("http")
|
|
and "naver.com/search" not in a.get("href", "")
|
|
]
|
|
return len(result_links) if result_links else 0
|
|
|
|
def _fallback_text_detection(self, html: str) -> list[NaverSection]:
|
|
"""Detect sections by scanning raw HTML text for known markers."""
|
|
sections: list[NaverSection] = []
|
|
position = 0
|
|
html_lower = html.lower()
|
|
|
|
for section_type, selectors in NAVER_SECTION_SELECTORS.items():
|
|
for selector in selectors:
|
|
if selector.lower() in html_lower:
|
|
position += 1
|
|
sections.append(
|
|
NaverSection(
|
|
section_type=section_type,
|
|
position=position,
|
|
item_count=0,
|
|
is_above_fold=(position <= 3),
|
|
)
|
|
)
|
|
break
|
|
|
|
return sections
|
|
|
|
# ----- Section Priority Analysis -----
|
|
|
|
def analyze_section_priority(
|
|
self, sections: list[NaverSection]
|
|
) -> list[str]:
|
|
"""
|
|
Determine above-fold section order.
|
|
|
|
Returns ordered list of section types that appear in the first
|
|
visible area of the SERP (approximately top 3 sections).
|
|
"""
|
|
sorted_sections = sorted(sections, key=lambda s: s.position)
|
|
above_fold = [s.section_type for s in sorted_sections if s.is_above_fold]
|
|
return above_fold
|
|
|
|
# ----- Brand Zone Detection -----
|
|
|
|
def check_brand_zone(self, html: str) -> tuple[bool, str]:
|
|
"""
|
|
Detect brand zone presence and extract brand name if available.
|
|
|
|
Returns (is_present, brand_name).
|
|
"""
|
|
if not html:
|
|
return False, ""
|
|
|
|
soup = BeautifulSoup(html, "lxml")
|
|
|
|
# Look for brand zone container
|
|
brand_selectors = [
|
|
"sp_brand",
|
|
"brand_area",
|
|
"brand_zone",
|
|
"type_brand",
|
|
]
|
|
|
|
for selector in brand_selectors:
|
|
brand_el = soup.find(
|
|
["div", "section"],
|
|
class_=re.compile(selector, re.IGNORECASE),
|
|
)
|
|
if brand_el:
|
|
# Try to extract brand name from the section
|
|
brand_name = ""
|
|
title_el = brand_el.find(
|
|
["h2", "h3", "strong", "a"],
|
|
class_=re.compile(r"(tit|title|name|brand)", re.IGNORECASE),
|
|
)
|
|
if title_el:
|
|
brand_name = title_el.get_text(strip=True)
|
|
|
|
return True, brand_name
|
|
|
|
# Text-based fallback
|
|
if "brand_zone" in html.lower() or "sp_brand" in html.lower():
|
|
return True, ""
|
|
|
|
return False, ""
|
|
|
|
# ----- Dominant Section -----
|
|
|
|
def _find_dominant_section(self, sections: list[NaverSection]) -> str:
|
|
"""Find the section with the most items (excluding ads)."""
|
|
non_ad = [s for s in sections if s.section_type != "ad"]
|
|
if not non_ad:
|
|
return ""
|
|
return max(non_ad, key=lambda s: s.item_count).section_type
|
|
|
|
# ----- Main Analysis Orchestrator -----
|
|
|
|
def analyze(self, keyword: str) -> NaverSerpResult:
|
|
"""
|
|
Orchestrate full Naver SERP analysis for a single keyword.
|
|
|
|
Steps:
|
|
1. Fetch Naver search results page
|
|
2. Detect SERP sections
|
|
3. Analyze section priority
|
|
4. Check brand zone presence
|
|
5. Compile results
|
|
"""
|
|
html = self.fetch_serp(keyword)
|
|
|
|
if not html:
|
|
self.logger.error(f"No HTML content for keyword '{keyword}'")
|
|
return NaverSerpResult(keyword=keyword)
|
|
|
|
sections = self.detect_sections(html)
|
|
above_fold = self.analyze_section_priority(sections)
|
|
brand_present, brand_name = self.check_brand_zone(html)
|
|
|
|
# Build section order
|
|
section_order = [s.section_type for s in sorted(sections, key=lambda x: x.position)]
|
|
|
|
# Count ads
|
|
ad_sections = [s for s in sections if s.section_type == "ad"]
|
|
ad_count = sum(s.item_count for s in ad_sections) if ad_sections else 0
|
|
|
|
# Check special sections
|
|
has_place = any(s.section_type == "place" for s in sections)
|
|
dominant = self._find_dominant_section(sections)
|
|
|
|
result = NaverSerpResult(
|
|
keyword=keyword,
|
|
sections=sections,
|
|
section_order=section_order,
|
|
brand_zone_present=brand_present,
|
|
brand_zone_brand=brand_name,
|
|
total_sections=len(sections),
|
|
above_fold_sections=above_fold,
|
|
ad_count=ad_count,
|
|
dominant_section=dominant,
|
|
has_place_section=has_place,
|
|
)
|
|
return result
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Output Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def result_to_dict(result: NaverSerpResult) -> dict[str, Any]:
|
|
"""Convert NaverSerpResult to a JSON-serializable dictionary."""
|
|
d = asdict(result)
|
|
# Remove raw HTML snippets from JSON output to keep it clean
|
|
for section in d.get("sections", []):
|
|
section.pop("raw_html_snippet", None)
|
|
return d
|
|
|
|
|
|
def print_rich_report(result: NaverSerpResult) -> None:
|
|
"""Print a human-readable report using rich."""
|
|
console.rule(f"[bold blue]Naver SERP Analysis: {result.keyword}")
|
|
console.print(f"[dim]Timestamp: {result.timestamp}[/dim]")
|
|
console.print()
|
|
|
|
# Summary
|
|
summary_table = Table(title="Summary", show_lines=True)
|
|
summary_table.add_column("Metric", style="cyan")
|
|
summary_table.add_column("Value", style="green")
|
|
summary_table.add_row("Total Sections", str(result.total_sections))
|
|
summary_table.add_row("Ad Count", str(result.ad_count))
|
|
summary_table.add_row("Brand Zone", "Yes" if result.brand_zone_present else "No")
|
|
if result.brand_zone_brand:
|
|
summary_table.add_row("Brand Name", result.brand_zone_brand)
|
|
summary_table.add_row("Place Section", "Yes" if result.has_place_section else "No")
|
|
summary_table.add_row("Dominant Section", result.dominant_section or "N/A")
|
|
console.print(summary_table)
|
|
console.print()
|
|
|
|
# Section Details
|
|
if result.sections:
|
|
section_table = Table(title="Detected Sections", show_lines=True)
|
|
section_table.add_column("#", style="bold")
|
|
section_table.add_column("Section", style="cyan")
|
|
section_table.add_column("Display Name", style="magenta")
|
|
section_table.add_column("Items", style="green")
|
|
section_table.add_column("Above Fold", style="yellow")
|
|
section_table.add_column("More Link", style="dim")
|
|
|
|
for s in sorted(result.sections, key=lambda x: x.position):
|
|
section_table.add_row(
|
|
str(s.position),
|
|
s.section_type,
|
|
s.display_name,
|
|
str(s.item_count),
|
|
"Yes" if s.is_above_fold else "No",
|
|
"Yes" if s.has_more_link else "No",
|
|
)
|
|
console.print(section_table)
|
|
console.print()
|
|
|
|
# Above-Fold Sections
|
|
if result.above_fold_sections:
|
|
console.print("[bold]Above-Fold Section Order:[/bold]")
|
|
for i, sec in enumerate(result.above_fold_sections, 1):
|
|
display = SECTION_DISPLAY_NAMES.get(sec, sec)
|
|
console.print(f" {i}. {display} ({sec})")
|
|
console.print()
|
|
|
|
# Section Order
|
|
if result.section_order:
|
|
console.print("[bold]Full Section Order:[/bold]")
|
|
order_str = " -> ".join(
|
|
SECTION_DISPLAY_NAMES.get(s, s) for s in result.section_order
|
|
)
|
|
console.print(f" {order_str}")
|
|
|
|
console.rule()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CLI
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def build_parser() -> argparse.ArgumentParser:
|
|
parser = argparse.ArgumentParser(
|
|
description="Naver SERP composition analysis",
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
epilog="""
|
|
Examples:
|
|
python naver_serp_analyzer.py --keyword "치과 임플란트" --json
|
|
python naver_serp_analyzer.py --keywords-file keywords.txt --json
|
|
python naver_serp_analyzer.py --keyword "치과 임플란트" --output report.json
|
|
""",
|
|
)
|
|
group = parser.add_mutually_exclusive_group(required=True)
|
|
group.add_argument(
|
|
"--keyword",
|
|
type=str,
|
|
help="Single keyword to analyze",
|
|
)
|
|
group.add_argument(
|
|
"--keywords-file",
|
|
type=str,
|
|
help="Path to file with one keyword per line",
|
|
)
|
|
parser.add_argument(
|
|
"--json",
|
|
action="store_true",
|
|
dest="json_output",
|
|
help="Output results as JSON",
|
|
)
|
|
parser.add_argument(
|
|
"--output",
|
|
type=str,
|
|
help="Write JSON results to file",
|
|
)
|
|
return parser
|
|
|
|
|
|
def load_keywords(filepath: str) -> list[str]:
|
|
"""Load keywords from a text file, one per line."""
|
|
path = Path(filepath)
|
|
if not path.exists():
|
|
logger.error(f"Keywords file not found: {filepath}")
|
|
sys.exit(1)
|
|
keywords = []
|
|
with open(path, "r", encoding="utf-8") as fh:
|
|
for line in fh:
|
|
kw = line.strip()
|
|
if kw and not kw.startswith("#"):
|
|
keywords.append(kw)
|
|
logger.info(f"Loaded {len(keywords)} keywords from {filepath}")
|
|
return keywords
|
|
|
|
|
|
def main() -> None:
|
|
parser = build_parser()
|
|
args = parser.parse_args()
|
|
|
|
analyzer = NaverSerpAnalyzer()
|
|
|
|
# Collect keywords
|
|
if args.keyword:
|
|
keywords = [args.keyword]
|
|
else:
|
|
keywords = load_keywords(args.keywords_file)
|
|
|
|
if not keywords:
|
|
logger.error("No keywords to analyze")
|
|
sys.exit(1)
|
|
|
|
results: list[dict[str, Any]] = []
|
|
|
|
for kw in keywords:
|
|
console.print(f"\n[bold]Analyzing Naver SERP:[/bold] {kw}")
|
|
result = analyzer.analyze(kw)
|
|
|
|
if args.json_output or args.output:
|
|
results.append(result_to_dict(result))
|
|
else:
|
|
print_rich_report(result)
|
|
|
|
# JSON output
|
|
if args.json_output:
|
|
output_data = results[0] if len(results) == 1 else results
|
|
print(json.dumps(output_data, ensure_ascii=False, indent=2))
|
|
|
|
if args.output:
|
|
output_data = results[0] if len(results) == 1 else results
|
|
output_path = Path(args.output)
|
|
with open(output_path, "w", encoding="utf-8") as fh:
|
|
json.dump(output_data, fh, ensure_ascii=False, indent=2)
|
|
logger.info(f"Results written to {output_path}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|