"""
|
|
Naver SERP Analyzer - Naver search result composition analysis
|
|
==============================================================
|
|
Purpose: Analyze Naver SERP section distribution, content type mapping,
|
|
brand zone detection, and section priority analysis.
|
|
Python: 3.10+
|
|
|
|
Usage:
|
|
python naver_serp_analyzer.py --keyword "치과 임플란트" --json
|
|
python naver_serp_analyzer.py --keywords-file keywords.txt --json
|
|
python naver_serp_analyzer.py --keyword "치과 임플란트" --output naver_report.json
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import re
|
|
import sys
|
|
from dataclasses import asdict, dataclass, field
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from rich.console import Console
|
|
from rich.table import Table
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Logging
|
|
# ---------------------------------------------------------------------------
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format="%(asctime)s - %(levelname)s - %(message)s",
|
|
)
|
|
logger = logging.getLogger(__name__)
|
|
console = Console()
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Constants - Naver SERP Section Identifiers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# CSS class / id patterns used to detect Naver SERP sections
|
|
NAVER_SECTION_SELECTORS: dict[str, list[str]] = {
|
|
"blog": [
|
|
"sp_blog",
|
|
"blog_widget",
|
|
"sc_new.sp_blog",
|
|
"api_subject_blog",
|
|
"type_blog",
|
|
"blog_exact",
|
|
],
|
|
"cafe": [
|
|
"sp_cafe",
|
|
"cafe_widget",
|
|
"sc_new.sp_cafe",
|
|
"api_subject_cafe",
|
|
"type_cafe",
|
|
],
|
|
"knowledge_in": [
|
|
"sp_kin",
|
|
"kin_widget",
|
|
"sc_new.sp_kin",
|
|
"api_subject_kin",
|
|
"type_kin",
|
|
"nx_kin",
|
|
],
|
|
"smart_store": [
|
|
"sp_nshop",
|
|
"shopping_widget",
|
|
"sc_new.sp_nshop",
|
|
"api_subject_shopping",
|
|
"type_shopping",
|
|
"smartstore",
|
|
],
|
|
"brand_zone": [
|
|
"sp_brand",
|
|
"brand_area",
|
|
"brand_zone",
|
|
"type_brand",
|
|
"sc_new.sp_brand",
|
|
],
|
|
"news": [
|
|
"sp_nnews",
|
|
"news_widget",
|
|
"sc_new.sp_nnews",
|
|
"api_subject_news",
|
|
"type_news",
|
|
"group_news",
|
|
],
|
|
"encyclopedia": [
|
|
"sp_encyclopedia",
|
|
"sc_new.sp_encyclopedia",
|
|
"api_subject_encyclopedia",
|
|
"type_encyclopedia",
|
|
"nx_encyclopedia",
|
|
],
|
|
"image": [
|
|
"sp_image",
|
|
"image_widget",
|
|
"sc_new.sp_image",
|
|
"api_subject_image",
|
|
"type_image",
|
|
],
|
|
"video": [
|
|
"sp_video",
|
|
"video_widget",
|
|
"sc_new.sp_video",
|
|
"api_subject_video",
|
|
"type_video",
|
|
],
|
|
"place": [
|
|
"sp_local",
|
|
"local_widget",
|
|
"sc_new.sp_local",
|
|
"type_place",
|
|
"place_section",
|
|
"loc_map",
|
|
],
|
|
"ad": [
|
|
"sp_nad",
|
|
"sp_tad",
|
|
"ad_section",
|
|
"type_powerlink",
|
|
"type_ad",
|
|
"nx_ad",
|
|
],
|
|
"books": [
|
|
"sp_book",
|
|
"sc_new.sp_book",
|
|
"type_book",
|
|
"api_subject_book",
|
|
"nx_book",
|
|
],
|
|
"shortform": [
|
|
"sp_shortform",
|
|
"sc_new.sp_shortform",
|
|
"type_shortform",
|
|
"sp_shorts",
|
|
"type_shorts",
|
|
],
|
|
"influencer": [
|
|
"sp_influencer",
|
|
"sc_new.sp_influencer",
|
|
"type_influencer",
|
|
"api_subject_influencer",
|
|
],
|
|
}
|
|
|
|
# Section display names in Korean
|
|
SECTION_DISPLAY_NAMES: dict[str, str] = {
|
|
"blog": "블로그",
|
|
"cafe": "카페",
|
|
"knowledge_in": "지식iN",
|
|
"smart_store": "스마트스토어",
|
|
"brand_zone": "브랜드존",
|
|
"news": "뉴스",
|
|
"encyclopedia": "백과사전",
|
|
"image": "이미지",
|
|
"video": "동영상",
|
|
"place": "플레이스",
|
|
"ad": "광고",
|
|
"books": "도서",
|
|
"shortform": "숏폼",
|
|
"influencer": "인플루언서",
|
|
}
|
|
|
|
# Default headers for Naver requests
|
|
NAVER_HEADERS = {
|
|
"User-Agent": (
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
|
"Chrome/120.0.0.0 Safari/537.36"
|
|
),
|
|
"Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
|
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Data Classes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@dataclass
|
|
class NaverSection:
|
|
"""A detected section within Naver SERP."""
|
|
|
|
section_type: str # blog, cafe, knowledge_in, smart_store, etc.
|
|
display_name: str = ""
|
|
position: int = 0 # Order of appearance (1-based)
|
|
item_count: int = 0 # Number of items in the section
|
|
is_above_fold: bool = False # Appears within first ~3 sections
|
|
has_more_link: bool = False # Section has "more results" link
|
|
raw_html_snippet: str = "" # Short HTML snippet for debugging
|
|
|
|
def __post_init__(self):
|
|
if not self.display_name:
|
|
self.display_name = SECTION_DISPLAY_NAMES.get(
|
|
self.section_type, self.section_type
|
|
)
|
|
|
|
|
|
@dataclass
|
|
class NaverSerpResult:
|
|
"""Complete Naver SERP analysis result for a keyword."""
|
|
|
|
keyword: str
|
|
sections: list[NaverSection] = field(default_factory=list)
|
|
section_order: list[str] = field(default_factory=list)
|
|
brand_zone_present: bool = False
|
|
brand_zone_brand: str = ""
|
|
total_sections: int = 0
|
|
above_fold_sections: list[str] = field(default_factory=list)
|
|
ad_count: int = 0
|
|
dominant_section: str = ""
|
|
has_place_section: bool = False
|
|
timestamp: str = ""
|
|
|
|
def __post_init__(self):
|
|
if not self.timestamp:
|
|
self.timestamp = datetime.now().isoformat()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Naver SERP Analyzer
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class NaverSerpAnalyzer:
|
|
"""Analyzes Naver search result page composition."""
|
|
|
|
NAVER_SEARCH_URL = "https://search.naver.com/search.naver"
|
|
|
|
def __init__(self, timeout: int = 30):
|
|
self.timeout = timeout
|
|
self.logger = logging.getLogger(self.__class__.__name__)
|
|
self.session = requests.Session()
|
|
self.session.headers.update(NAVER_HEADERS)
|
|
|
|
# ----- Data Fetching -----
|
|
|
|
def fetch_serp(self, keyword: str) -> str:
|
|
"""
|
|
Fetch Naver search results HTML for a given keyword.
|
|
|
|
Returns the raw HTML string of the search results page.
|
|
"""
|
|
self.logger.info(f"Fetching Naver SERP for '{keyword}'")
|
|
|
|
params = {
|
|
"where": "nexearch",
|
|
"sm": "top_hty",
|
|
"fbm": "0",
|
|
"ie": "utf8",
|
|
"query": keyword,
|
|
}
|
|
|
|
try:
|
|
response = self.session.get(
|
|
self.NAVER_SEARCH_URL,
|
|
params=params,
|
|
timeout=self.timeout,
|
|
)
|
|
response.raise_for_status()
|
|
self.logger.info(
|
|
f"Fetched {len(response.text):,} bytes "
|
|
f"(status={response.status_code})"
|
|
)
|
|
return response.text
|
|
|
|
except requests.RequestException as exc:
|
|
self.logger.error(f"Failed to fetch Naver SERP: {exc}")
|
|
return ""
|
|
|
|
# ----- Section Detection -----
|
|
|
|
def detect_sections(self, html: str) -> list[NaverSection]:
|
|
"""
|
|
Identify Naver SERP sections from HTML structure.
|
|
|
|
Scans the HTML for known CSS class names and IDs that correspond
|
|
to Naver's SERP section types.
|
|
"""
|
|
if not html:
|
|
return []
|
|
|
|
soup = BeautifulSoup(html, "lxml")
|
|
sections: list[NaverSection] = []
|
|
position = 0
|
|
|
|
# Strategy 1: Look for section containers with known class names
|
|
# Naver uses <div class="sc_new sp_XXX"> and <section> elements
|
|
all_sections = soup.find_all(
|
|
["div", "section"],
|
|
class_=re.compile(
|
|
r"(sc_new|api_subject|sp_|type_|_widget|group_|nx_)"
|
|
),
|
|
)
|
|
|
|
seen_types: set[str] = set()
|
|
|
|
for element in all_sections:
|
|
classes = " ".join(element.get("class", []))
|
|
element_id = element.get("id", "")
|
|
search_text = f"{classes} {element_id}".lower()
|
|
|
|
for section_type, selectors in NAVER_SECTION_SELECTORS.items():
|
|
if section_type in seen_types:
|
|
continue
|
|
|
|
matched = False
|
|
for selector in selectors:
|
|
if selector.lower() in search_text:
|
|
matched = True
|
|
break
|
|
|
|
if matched:
|
|
position += 1
|
|
seen_types.add(section_type)
|
|
|
|
# Count items within the section
|
|
item_count = self._count_section_items(element, section_type)
|
|
|
|
# Check for "more" link
|
|
has_more = bool(
|
|
element.find("a", class_=re.compile(r"(more|_more|btn_more)"))
|
|
or element.find("a", string=re.compile(r"(더보기|전체보기)"))
|
|
)
|
|
|
|
# Get short HTML snippet for debugging
|
|
snippet = str(element)[:200] if element else ""
|
|
|
|
section = NaverSection(
|
|
section_type=section_type,
|
|
position=position,
|
|
item_count=item_count,
|
|
is_above_fold=(position <= 3),
|
|
has_more_link=has_more,
|
|
raw_html_snippet=snippet,
|
|
)
|
|
sections.append(section)
|
|
|
|
# Strategy 2: Fallback - scan entire HTML text for section markers
|
|
if not sections:
|
|
self.logger.warning(
|
|
"No sections found via DOM parsing; "
|
|
"falling back to text pattern matching"
|
|
)
|
|
sections = self._fallback_text_detection(html)
|
|
|
|
return sections
|
|
|
|
def _count_section_items(self, element: Any, section_type: str) -> int:
|
|
"""Count the number of result items within a section element."""
|
|
# Common item container patterns
|
|
item_selectors = [
|
|
"li",
|
|
".api_txt_lines",
|
|
".total_tit",
|
|
".detail_box",
|
|
".item",
|
|
".lst_total > li",
|
|
]
|
|
|
|
for selector in item_selectors:
|
|
items = element.select(selector)
|
|
if items and len(items) > 0:
|
|
return len(items)
|
|
|
|
# Fallback: count links that look like results
|
|
links = element.find_all("a", href=True)
|
|
result_links = [
|
|
a
|
|
for a in links
|
|
if a.get("href", "").startswith("http")
|
|
and "naver.com/search" not in a.get("href", "")
|
|
]
|
|
return len(result_links) if result_links else 0
|
|
|
|
def _fallback_text_detection(self, html: str) -> list[NaverSection]:
|
|
"""Detect sections by scanning raw HTML text for known markers."""
|
|
sections: list[NaverSection] = []
|
|
position = 0
|
|
html_lower = html.lower()
|
|
|
|
for section_type, selectors in NAVER_SECTION_SELECTORS.items():
|
|
for selector in selectors:
|
|
if selector.lower() in html_lower:
|
|
position += 1
|
|
sections.append(
|
|
NaverSection(
|
|
section_type=section_type,
|
|
position=position,
|
|
item_count=0,
|
|
is_above_fold=(position <= 3),
|
|
)
|
|
)
|
|
break
|
|
|
|
return sections
|
|
|
|
# ----- Section Priority Analysis -----
|
|
|
|
def analyze_section_priority(
|
|
self, sections: list[NaverSection]
|
|
) -> list[str]:
|
|
"""
|
|
Determine above-fold section order.
|
|
|
|
Returns ordered list of section types that appear in the first
|
|
visible area of the SERP (approximately top 3 sections).
|
|
"""
|
|
sorted_sections = sorted(sections, key=lambda s: s.position)
|
|
above_fold = [s.section_type for s in sorted_sections if s.is_above_fold]
|
|
return above_fold
|
|
|
|
# ----- Brand Zone Detection -----
|
|
|
|
def check_brand_zone(self, html: str) -> tuple[bool, str]:
|
|
"""
|
|
Detect brand zone presence and extract brand name if available.
|
|
|
|
Returns (is_present, brand_name).
|
|
"""
|
|
if not html:
|
|
return False, ""
|
|
|
|
soup = BeautifulSoup(html, "lxml")
|
|
|
|
# Look for brand zone container
|
|
brand_selectors = [
|
|
"sp_brand",
|
|
"brand_area",
|
|
"brand_zone",
|
|
"type_brand",
|
|
]
|
|
|
|
for selector in brand_selectors:
|
|
brand_el = soup.find(
|
|
["div", "section"],
|
|
class_=re.compile(selector, re.IGNORECASE),
|
|
)
|
|
if brand_el:
|
|
# Try to extract brand name from the section
|
|
brand_name = ""
|
|
title_el = brand_el.find(
|
|
["h2", "h3", "strong", "a"],
|
|
class_=re.compile(r"(tit|title|name|brand)", re.IGNORECASE),
|
|
)
|
|
if title_el:
|
|
brand_name = title_el.get_text(strip=True)
|
|
|
|
return True, brand_name
|
|
|
|
# Text-based fallback
|
|
if "brand_zone" in html.lower() or "sp_brand" in html.lower():
|
|
return True, ""
|
|
|
|
return False, ""
|
|
|
|
# ----- Dominant Section -----
|
|
|
|
def _find_dominant_section(self, sections: list[NaverSection]) -> str:
|
|
"""Find the section with the most items (excluding ads)."""
|
|
non_ad = [s for s in sections if s.section_type != "ad"]
|
|
if not non_ad:
|
|
return ""
|
|
return max(non_ad, key=lambda s: s.item_count).section_type
|
|
|
|
# ----- Main Analysis Orchestrator -----
|
|
|
|
def analyze(self, keyword: str) -> NaverSerpResult:
|
|
"""
|
|
Orchestrate full Naver SERP analysis for a single keyword.
|
|
|
|
Steps:
|
|
1. Fetch Naver search results page
|
|
2. Detect SERP sections
|
|
3. Analyze section priority
|
|
4. Check brand zone presence
|
|
5. Compile results
|
|
"""
|
|
html = self.fetch_serp(keyword)
|
|
|
|
if not html:
|
|
self.logger.error(f"No HTML content for keyword '{keyword}'")
|
|
return NaverSerpResult(keyword=keyword)
|
|
|
|
sections = self.detect_sections(html)
|
|
above_fold = self.analyze_section_priority(sections)
|
|
brand_present, brand_name = self.check_brand_zone(html)
|
|
|
|
# Build section order
|
|
section_order = [s.section_type for s in sorted(sections, key=lambda x: x.position)]
|
|
|
|
# Count ads
|
|
ad_sections = [s for s in sections if s.section_type == "ad"]
|
|
ad_count = sum(s.item_count for s in ad_sections) if ad_sections else 0
|
|
|
|
# Check special sections
|
|
has_place = any(s.section_type == "place" for s in sections)
|
|
dominant = self._find_dominant_section(sections)
|
|
|
|
result = NaverSerpResult(
|
|
keyword=keyword,
|
|
sections=sections,
|
|
section_order=section_order,
|
|
brand_zone_present=brand_present,
|
|
brand_zone_brand=brand_name,
|
|
total_sections=len(sections),
|
|
above_fold_sections=above_fold,
|
|
ad_count=ad_count,
|
|
dominant_section=dominant,
|
|
has_place_section=has_place,
|
|
)
|
|
return result
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Output Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def result_to_dict(result: NaverSerpResult) -> dict[str, Any]:
|
|
"""Convert NaverSerpResult to a JSON-serializable dictionary."""
|
|
d = asdict(result)
|
|
# Remove raw HTML snippets from JSON output to keep it clean
|
|
for section in d.get("sections", []):
|
|
section.pop("raw_html_snippet", None)
|
|
return d
|
|
|
|
|
|
def print_rich_report(result: NaverSerpResult) -> None:
|
|
"""Print a human-readable report using rich."""
|
|
console.rule(f"[bold blue]Naver SERP Analysis: {result.keyword}")
|
|
console.print(f"[dim]Timestamp: {result.timestamp}[/dim]")
|
|
console.print()
|
|
|
|
# Summary
|
|
summary_table = Table(title="Summary", show_lines=True)
|
|
summary_table.add_column("Metric", style="cyan")
|
|
summary_table.add_column("Value", style="green")
|
|
summary_table.add_row("Total Sections", str(result.total_sections))
|
|
summary_table.add_row("Ad Count", str(result.ad_count))
|
|
summary_table.add_row("Brand Zone", "Yes" if result.brand_zone_present else "No")
|
|
if result.brand_zone_brand:
|
|
summary_table.add_row("Brand Name", result.brand_zone_brand)
|
|
summary_table.add_row("Place Section", "Yes" if result.has_place_section else "No")
|
|
summary_table.add_row("Dominant Section", result.dominant_section or "N/A")
|
|
console.print(summary_table)
|
|
console.print()
|
|
|
|
# Section Details
|
|
if result.sections:
|
|
section_table = Table(title="Detected Sections", show_lines=True)
|
|
section_table.add_column("#", style="bold")
|
|
section_table.add_column("Section", style="cyan")
|
|
section_table.add_column("Display Name", style="magenta")
|
|
section_table.add_column("Items", style="green")
|
|
section_table.add_column("Above Fold", style="yellow")
|
|
section_table.add_column("More Link", style="dim")
|
|
|
|
for s in sorted(result.sections, key=lambda x: x.position):
|
|
section_table.add_row(
|
|
str(s.position),
|
|
s.section_type,
|
|
s.display_name,
|
|
str(s.item_count),
|
|
"Yes" if s.is_above_fold else "No",
|
|
"Yes" if s.has_more_link else "No",
|
|
)
|
|
console.print(section_table)
|
|
console.print()
|
|
|
|
# Above-Fold Sections
|
|
if result.above_fold_sections:
|
|
console.print("[bold]Above-Fold Section Order:[/bold]")
|
|
for i, sec in enumerate(result.above_fold_sections, 1):
|
|
display = SECTION_DISPLAY_NAMES.get(sec, sec)
|
|
console.print(f" {i}. {display} ({sec})")
|
|
console.print()
|
|
|
|
# Section Order
|
|
if result.section_order:
|
|
console.print("[bold]Full Section Order:[/bold]")
|
|
order_str = " -> ".join(
|
|
SECTION_DISPLAY_NAMES.get(s, s) for s in result.section_order
|
|
)
|
|
console.print(f" {order_str}")
|
|
|
|
console.rule()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CLI
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def build_parser() -> argparse.ArgumentParser:
|
|
parser = argparse.ArgumentParser(
|
|
description="Naver SERP composition analysis",
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
epilog="""
|
|
Examples:
|
|
python naver_serp_analyzer.py --keyword "치과 임플란트" --json
|
|
python naver_serp_analyzer.py --keywords-file keywords.txt --json
|
|
python naver_serp_analyzer.py --keyword "치과 임플란트" --output report.json
|
|
""",
|
|
)
|
|
group = parser.add_mutually_exclusive_group(required=True)
|
|
group.add_argument(
|
|
"--keyword",
|
|
type=str,
|
|
help="Single keyword to analyze",
|
|
)
|
|
group.add_argument(
|
|
"--keywords-file",
|
|
type=str,
|
|
help="Path to file with one keyword per line",
|
|
)
|
|
parser.add_argument(
|
|
"--json",
|
|
action="store_true",
|
|
dest="json_output",
|
|
help="Output results as JSON",
|
|
)
|
|
parser.add_argument(
|
|
"--output",
|
|
type=str,
|
|
help="Write JSON results to file",
|
|
)
|
|
return parser
|
|
|
|
|
|
def load_keywords(filepath: str) -> list[str]:
|
|
"""Load keywords from a text file, one per line."""
|
|
path = Path(filepath)
|
|
if not path.exists():
|
|
logger.error(f"Keywords file not found: {filepath}")
|
|
sys.exit(1)
|
|
keywords = []
|
|
with open(path, "r", encoding="utf-8") as fh:
|
|
for line in fh:
|
|
kw = line.strip()
|
|
if kw and not kw.startswith("#"):
|
|
keywords.append(kw)
|
|
logger.info(f"Loaded {len(keywords)} keywords from {filepath}")
|
|
return keywords
|
|
|
|
|
|
def main() -> None:
|
|
parser = build_parser()
|
|
args = parser.parse_args()
|
|
|
|
analyzer = NaverSerpAnalyzer()
|
|
|
|
# Collect keywords
|
|
if args.keyword:
|
|
keywords = [args.keyword]
|
|
else:
|
|
keywords = load_keywords(args.keywords_file)
|
|
|
|
if not keywords:
|
|
logger.error("No keywords to analyze")
|
|
sys.exit(1)
|
|
|
|
results: list[dict[str, Any]] = []
|
|
|
|
for kw in keywords:
|
|
console.print(f"\n[bold]Analyzing Naver SERP:[/bold] {kw}")
|
|
result = analyzer.analyze(kw)
|
|
|
|
if args.json_output or args.output:
|
|
results.append(result_to_dict(result))
|
|
else:
|
|
print_rich_report(result)
|
|
|
|
# JSON output
|
|
if args.json_output:
|
|
output_data = results[0] if len(results) == 1 else results
|
|
print(json.dumps(output_data, ensure_ascii=False, indent=2))
|
|
|
|
if args.output:
|
|
output_data = results[0] if len(results) == 1 else results
|
|
output_path = Path(args.output)
|
|
with open(output_path, "w", encoding="utf-8") as fh:
|
|
json.dump(output_data, fh, ensure_ascii=False, indent=2)
|
|
logger.info(f"Results written to {output_path}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|