""" Naver SERP Analyzer - Naver search result composition analysis ============================================================== Purpose: Analyze Naver SERP section distribution, content type mapping, brand zone detection, and VIEW tab content analysis. Python: 3.10+ Usage: python naver_serp_analyzer.py --keyword "치과 임플란트" --json python naver_serp_analyzer.py --keywords-file keywords.txt --json python naver_serp_analyzer.py --keyword "치과 임플란트" --output naver_report.json """ import argparse import json import logging import re import sys from dataclasses import asdict, dataclass, field from datetime import datetime from pathlib import Path from typing import Any import requests from bs4 import BeautifulSoup from rich.console import Console from rich.table import Table # --------------------------------------------------------------------------- # Logging # --------------------------------------------------------------------------- logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", ) logger = logging.getLogger(__name__) console = Console() # --------------------------------------------------------------------------- # Constants - Naver SERP Section Identifiers # --------------------------------------------------------------------------- # CSS class / id patterns used to detect Naver SERP sections NAVER_SECTION_SELECTORS: dict[str, list[str]] = { "blog": [ "sp_blog", "blog_widget", "sc_new.sp_blog", "api_subject_blog", "type_blog", "blog_exact", ], "cafe": [ "sp_cafe", "cafe_widget", "sc_new.sp_cafe", "api_subject_cafe", "type_cafe", ], "knowledge_in": [ "sp_kin", "kin_widget", "sc_new.sp_kin", "api_subject_kin", "type_kin", "nx_kin", ], "smart_store": [ "sp_nshop", "shopping_widget", "sc_new.sp_nshop", "api_subject_shopping", "type_shopping", "smartstore", ], "brand_zone": [ "sp_brand", "brand_area", "brand_zone", "type_brand", "sc_new.sp_brand", ], "news": [ "sp_nnews", "news_widget", "sc_new.sp_nnews", "api_subject_news", "type_news", "group_news", ], "encyclopedia": [ "sp_encyclopedia", "sc_new.sp_encyclopedia", "api_subject_encyclopedia", "type_encyclopedia", "nx_encyclopedia", ], "image": [ "sp_image", "image_widget", "sc_new.sp_image", "api_subject_image", "type_image", ], "video": [ "sp_video", "video_widget", "sc_new.sp_video", "api_subject_video", "type_video", ], "place": [ "sp_local", "local_widget", "sc_new.sp_local", "type_place", "place_section", "loc_map", ], "ad": [ "sp_nad", "sp_tad", "ad_section", "type_powerlink", "type_ad", "nx_ad", ], "books": [ "sp_book", "sc_new.sp_book", "type_book", "api_subject_book", "nx_book", ], "shortform": [ "sp_shortform", "sc_new.sp_shortform", "type_shortform", "sp_shorts", "type_shorts", ], "influencer": [ "sp_influencer", "sc_new.sp_influencer", "type_influencer", "api_subject_influencer", ], } # Section display names in Korean SECTION_DISPLAY_NAMES: dict[str, str] = { "blog": "블로그", "cafe": "카페", "knowledge_in": "지식iN", "smart_store": "스마트스토어", "brand_zone": "브랜드존", "news": "뉴스", "encyclopedia": "백과사전", "image": "이미지", "video": "동영상", "place": "플레이스", "ad": "광고", "books": "도서", "shortform": "숏폼", "influencer": "인플루언서", } # Default headers for Naver requests NAVER_HEADERS = { "User-Agent": ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/120.0.0.0 Safari/537.36" ), "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", } # --------------------------------------------------------------------------- # Data Classes # --------------------------------------------------------------------------- @dataclass class NaverSection: """A detected section within Naver SERP.""" section_type: str # blog, cafe, knowledge_in, smart_store, etc. display_name: str = "" position: int = 0 # Order of appearance (1-based) item_count: int = 0 # Number of items in the section is_above_fold: bool = False # Appears within first ~3 sections has_more_link: bool = False # Section has "more results" link raw_html_snippet: str = "" # Short HTML snippet for debugging def __post_init__(self): if not self.display_name: self.display_name = SECTION_DISPLAY_NAMES.get( self.section_type, self.section_type ) @dataclass class NaverSerpResult: """Complete Naver SERP analysis result for a keyword.""" keyword: str sections: list[NaverSection] = field(default_factory=list) section_order: list[str] = field(default_factory=list) brand_zone_present: bool = False brand_zone_brand: str = "" total_sections: int = 0 above_fold_sections: list[str] = field(default_factory=list) ad_count: int = 0 dominant_section: str = "" has_place_section: bool = False timestamp: str = "" def __post_init__(self): if not self.timestamp: self.timestamp = datetime.now().isoformat() # --------------------------------------------------------------------------- # Naver SERP Analyzer # --------------------------------------------------------------------------- class NaverSerpAnalyzer: """Analyzes Naver search result page composition.""" NAVER_SEARCH_URL = "https://search.naver.com/search.naver" def __init__(self, timeout: int = 30): self.timeout = timeout self.logger = logging.getLogger(self.__class__.__name__) self.session = requests.Session() self.session.headers.update(NAVER_HEADERS) # ----- Data Fetching ----- def fetch_serp(self, keyword: str) -> str: """ Fetch Naver search results HTML for a given keyword. Returns the raw HTML string of the search results page. """ self.logger.info(f"Fetching Naver SERP for '{keyword}'") params = { "where": "nexearch", "sm": "top_hty", "fbm": "0", "ie": "utf8", "query": keyword, } try: response = self.session.get( self.NAVER_SEARCH_URL, params=params, timeout=self.timeout, ) response.raise_for_status() self.logger.info( f"Fetched {len(response.text):,} bytes " f"(status={response.status_code})" ) return response.text except requests.RequestException as exc: self.logger.error(f"Failed to fetch Naver SERP: {exc}") return "" # ----- Section Detection ----- def detect_sections(self, html: str) -> list[NaverSection]: """ Identify Naver SERP sections from HTML structure. Scans the HTML for known CSS class names and IDs that correspond to Naver's SERP section types. """ if not html: return [] soup = BeautifulSoup(html, "lxml") sections: list[NaverSection] = [] position = 0 # Strategy 1: Look for section containers with known class names # Naver uses
and
elements all_sections = soup.find_all( ["div", "section"], class_=re.compile( r"(sc_new|api_subject|sp_|type_|_widget|group_|nx_)" ), ) seen_types: set[str] = set() for element in all_sections: classes = " ".join(element.get("class", [])) element_id = element.get("id", "") search_text = f"{classes} {element_id}".lower() for section_type, selectors in NAVER_SECTION_SELECTORS.items(): if section_type in seen_types: continue matched = False for selector in selectors: if selector.lower() in search_text: matched = True break if matched: position += 1 seen_types.add(section_type) # Count items within the section item_count = self._count_section_items(element, section_type) # Check for "more" link has_more = bool( element.find("a", class_=re.compile(r"(more|_more|btn_more)")) or element.find("a", string=re.compile(r"(더보기|전체보기)")) ) # Get short HTML snippet for debugging snippet = str(element)[:200] if element else "" section = NaverSection( section_type=section_type, position=position, item_count=item_count, is_above_fold=(position <= 3), has_more_link=has_more, raw_html_snippet=snippet, ) sections.append(section) # Strategy 2: Fallback - scan entire HTML text for section markers if not sections: self.logger.warning( "No sections found via DOM parsing; " "falling back to text pattern matching" ) sections = self._fallback_text_detection(html) return sections def _count_section_items(self, element: Any, section_type: str) -> int: """Count the number of result items within a section element.""" # Common item container patterns item_selectors = [ "li", ".api_txt_lines", ".total_tit", ".detail_box", ".item", ".lst_total > li", ] for selector in item_selectors: items = element.select(selector) if items and len(items) > 0: return len(items) # Fallback: count links that look like results links = element.find_all("a", href=True) result_links = [ a for a in links if a.get("href", "").startswith("http") and "naver.com/search" not in a.get("href", "") ] return len(result_links) if result_links else 0 def _fallback_text_detection(self, html: str) -> list[NaverSection]: """Detect sections by scanning raw HTML text for known markers.""" sections: list[NaverSection] = [] position = 0 html_lower = html.lower() for section_type, selectors in NAVER_SECTION_SELECTORS.items(): for selector in selectors: if selector.lower() in html_lower: position += 1 sections.append( NaverSection( section_type=section_type, position=position, item_count=0, is_above_fold=(position <= 3), ) ) break return sections # ----- Section Priority Analysis ----- def analyze_section_priority( self, sections: list[NaverSection] ) -> list[str]: """ Determine above-fold section order. Returns ordered list of section types that appear in the first visible area of the SERP (approximately top 3 sections). """ sorted_sections = sorted(sections, key=lambda s: s.position) above_fold = [s.section_type for s in sorted_sections if s.is_above_fold] return above_fold # ----- Brand Zone Detection ----- def check_brand_zone(self, html: str) -> tuple[bool, str]: """ Detect brand zone presence and extract brand name if available. Returns (is_present, brand_name). """ if not html: return False, "" soup = BeautifulSoup(html, "lxml") # Look for brand zone container brand_selectors = [ "sp_brand", "brand_area", "brand_zone", "type_brand", ] for selector in brand_selectors: brand_el = soup.find( ["div", "section"], class_=re.compile(selector, re.IGNORECASE), ) if brand_el: # Try to extract brand name from the section brand_name = "" title_el = brand_el.find( ["h2", "h3", "strong", "a"], class_=re.compile(r"(tit|title|name|brand)", re.IGNORECASE), ) if title_el: brand_name = title_el.get_text(strip=True) return True, brand_name # Text-based fallback if "brand_zone" in html.lower() or "sp_brand" in html.lower(): return True, "" return False, "" # ----- Dominant Section ----- def _find_dominant_section(self, sections: list[NaverSection]) -> str: """Find the section with the most items (excluding ads).""" non_ad = [s for s in sections if s.section_type != "ad"] if not non_ad: return "" return max(non_ad, key=lambda s: s.item_count).section_type # ----- Main Analysis Orchestrator ----- def analyze(self, keyword: str) -> NaverSerpResult: """ Orchestrate full Naver SERP analysis for a single keyword. Steps: 1. Fetch Naver search results page 2. Detect SERP sections 3. Analyze section priority 4. Check brand zone presence 5. Compile results """ html = self.fetch_serp(keyword) if not html: self.logger.error(f"No HTML content for keyword '{keyword}'") return NaverSerpResult(keyword=keyword) sections = self.detect_sections(html) above_fold = self.analyze_section_priority(sections) brand_present, brand_name = self.check_brand_zone(html) # Build section order section_order = [s.section_type for s in sorted(sections, key=lambda x: x.position)] # Count ads ad_sections = [s for s in sections if s.section_type == "ad"] ad_count = sum(s.item_count for s in ad_sections) if ad_sections else 0 # Check special sections has_place = any(s.section_type == "place" for s in sections) dominant = self._find_dominant_section(sections) result = NaverSerpResult( keyword=keyword, sections=sections, section_order=section_order, brand_zone_present=brand_present, brand_zone_brand=brand_name, total_sections=len(sections), above_fold_sections=above_fold, ad_count=ad_count, dominant_section=dominant, has_place_section=has_place, ) return result # --------------------------------------------------------------------------- # Output Helpers # --------------------------------------------------------------------------- def result_to_dict(result: NaverSerpResult) -> dict[str, Any]: """Convert NaverSerpResult to a JSON-serializable dictionary.""" d = asdict(result) # Remove raw HTML snippets from JSON output to keep it clean for section in d.get("sections", []): section.pop("raw_html_snippet", None) return d def print_rich_report(result: NaverSerpResult) -> None: """Print a human-readable report using rich.""" console.rule(f"[bold blue]Naver SERP Analysis: {result.keyword}") console.print(f"[dim]Timestamp: {result.timestamp}[/dim]") console.print() # Summary summary_table = Table(title="Summary", show_lines=True) summary_table.add_column("Metric", style="cyan") summary_table.add_column("Value", style="green") summary_table.add_row("Total Sections", str(result.total_sections)) summary_table.add_row("Ad Count", str(result.ad_count)) summary_table.add_row("Brand Zone", "Yes" if result.brand_zone_present else "No") if result.brand_zone_brand: summary_table.add_row("Brand Name", result.brand_zone_brand) summary_table.add_row("Place Section", "Yes" if result.has_place_section else "No") summary_table.add_row("Dominant Section", result.dominant_section or "N/A") console.print(summary_table) console.print() # Section Details if result.sections: section_table = Table(title="Detected Sections", show_lines=True) section_table.add_column("#", style="bold") section_table.add_column("Section", style="cyan") section_table.add_column("Display Name", style="magenta") section_table.add_column("Items", style="green") section_table.add_column("Above Fold", style="yellow") section_table.add_column("More Link", style="dim") for s in sorted(result.sections, key=lambda x: x.position): section_table.add_row( str(s.position), s.section_type, s.display_name, str(s.item_count), "Yes" if s.is_above_fold else "No", "Yes" if s.has_more_link else "No", ) console.print(section_table) console.print() # Above-Fold Sections if result.above_fold_sections: console.print("[bold]Above-Fold Section Order:[/bold]") for i, sec in enumerate(result.above_fold_sections, 1): display = SECTION_DISPLAY_NAMES.get(sec, sec) console.print(f" {i}. {display} ({sec})") console.print() # Section Order if result.section_order: console.print("[bold]Full Section Order:[/bold]") order_str = " -> ".join( SECTION_DISPLAY_NAMES.get(s, s) for s in result.section_order ) console.print(f" {order_str}") console.rule() # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- def build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser( description="Naver SERP composition analysis", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: python naver_serp_analyzer.py --keyword "치과 임플란트" --json python naver_serp_analyzer.py --keywords-file keywords.txt --json python naver_serp_analyzer.py --keyword "치과 임플란트" --output report.json """, ) group = parser.add_mutually_exclusive_group(required=True) group.add_argument( "--keyword", type=str, help="Single keyword to analyze", ) group.add_argument( "--keywords-file", type=str, help="Path to file with one keyword per line", ) parser.add_argument( "--json", action="store_true", dest="json_output", help="Output results as JSON", ) parser.add_argument( "--output", type=str, help="Write JSON results to file", ) return parser def load_keywords(filepath: str) -> list[str]: """Load keywords from a text file, one per line.""" path = Path(filepath) if not path.exists(): logger.error(f"Keywords file not found: {filepath}") sys.exit(1) keywords = [] with open(path, "r", encoding="utf-8") as fh: for line in fh: kw = line.strip() if kw and not kw.startswith("#"): keywords.append(kw) logger.info(f"Loaded {len(keywords)} keywords from {filepath}") return keywords def main() -> None: parser = build_parser() args = parser.parse_args() analyzer = NaverSerpAnalyzer() # Collect keywords if args.keyword: keywords = [args.keyword] else: keywords = load_keywords(args.keywords_file) if not keywords: logger.error("No keywords to analyze") sys.exit(1) results: list[dict[str, Any]] = [] for kw in keywords: console.print(f"\n[bold]Analyzing Naver SERP:[/bold] {kw}") result = analyzer.analyze(kw) if args.json_output or args.output: results.append(result_to_dict(result)) else: print_rich_report(result) # JSON output if args.json_output: output_data = results[0] if len(results) == 1 else results print(json.dumps(output_data, ensure_ascii=False, indent=2)) if args.output: output_data = results[0] if len(results) == 1 else results output_path = Path(args.output) with open(output_path, "w", encoding="utf-8") as fh: json.dump(output_data, fh, ensure_ascii=False, indent=2) logger.info(f"Results written to {output_path}") if __name__ == "__main__": main()