12 new skills: Keyword Strategy, SERP Analysis, Position Tracking, Link Building, Content Strategy, E-Commerce SEO, KPI Framework, International SEO, AI Visibility, Knowledge Graph, Competitor Intel, and Crawl Budget. ~20K lines of Python across 25 domain scripts. Updated skill 11 pipeline table and repo CLAUDE.md. Enhanced skill 18 local SEO workflow from jamie.clinic audit. Note: Skill 26 hreflang_validator.py pending (content filter block). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
806 lines
31 KiB
Python
806 lines
31 KiB
Python
"""
|
|
Crawl Budget Analyzer - Identify crawl waste and generate recommendations
|
|
=========================================================================
|
|
Purpose: Analyze server access logs for crawl budget efficiency, detect waste
|
|
(parameter URLs, redirect chains, soft 404s, duplicates), find orphan
|
|
pages, profile per-bot behavior, and produce prioritized recommendations.
|
|
Python: 3.10+
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import re
|
|
import sys
|
|
from collections import Counter, defaultdict
|
|
from dataclasses import asdict, dataclass, field
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Any
|
|
from urllib.parse import parse_qs, urlparse
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
from log_parser import BotIdentification, LogEntry, LogParser
|
|
|
|
# Module-wide logging: timestamped "<time> - <LEVEL> - <message>" lines at
# INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Query parameter names whose presence marks a URL as a crawl-wasting variant
# (faceting/sorting parameters that multiply URL permutations).
WASTE_PARAMS = {"sort", "filter", "order", "orderby", "dir", "direction"}
# Matches analytics tracking parameters such as utm_source / UTM_CAMPAIGN.
TRACKING_PARAMS_RE = re.compile(r"^utm_", re.IGNORECASE)
# Primary pagination parameter name ("p" is also checked as a fallback).
PAGINATION_PARAM = "page"
# Page numbers above this threshold are treated as deep-pagination waste.
HIGH_PAGE_THRESHOLD = 5
SOFT_404_MAX_SIZE = 1024  # bytes - pages smaller than this may be soft 404s
# HTTP status codes counted as redirects when detecting redirect chains.
REDIRECT_STATUSES = {301, 302, 303, 307, 308}
# Cap on top-URL lists kept in per-bot profiles and duplicate samples.
TOP_N_URLS = 50
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Data classes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@dataclass
class CrawlWaste:
    """A category of crawl budget waste."""

    waste_type: str
    urls: list[str]
    count: int
    pct_of_total: float
    recommendation: str

    def to_dict(self) -> dict:
        """Serialize for JSON output.

        Rounds the percentage to two decimals and caps the URL sample
        at 20 entries to keep reports compact.
        """
        return dict(
            waste_type=self.waste_type,
            count=self.count,
            pct_of_total=round(self.pct_of_total, 2),
            recommendation=self.recommendation,
            sample_urls=self.urls[:20],
        )
|
|
|
|
|
|
@dataclass
|
|
class OrphanPage:
|
|
"""A page that is either in the sitemap but uncrawled, or crawled but not in sitemap."""
|
|
url: str
|
|
in_sitemap: bool
|
|
crawled: bool
|
|
last_crawl_date: str | None = None
|
|
|
|
def to_dict(self) -> dict:
|
|
return asdict(self)
|
|
|
|
|
|
@dataclass
class BotProfile:
    """Per-bot crawl behavior profile."""

    name: str
    total_requests: int = 0
    requests_per_day: float = 0.0
    crawl_depth_distribution: dict[int, int] = field(default_factory=dict)
    peak_hours: list[int] = field(default_factory=list)
    status_breakdown: dict[str, int] = field(default_factory=dict)
    top_crawled_urls: list[tuple[str, int]] = field(default_factory=list)
    unique_urls: int = 0
    days_active: int = 0

    def to_dict(self) -> dict:
        """Serialize for JSON output.

        Rounds the request rate to one decimal and expands each
        (url, count) tuple into a {"url": ..., "count": ...} mapping.
        """
        url_entries = [{"url": u, "count": c} for u, c in self.top_crawled_urls]
        return dict(
            name=self.name,
            total_requests=self.total_requests,
            requests_per_day=round(self.requests_per_day, 1),
            crawl_depth_distribution=self.crawl_depth_distribution,
            peak_hours=self.peak_hours,
            status_breakdown=self.status_breakdown,
            top_crawled_urls=url_entries,
            unique_urls=self.unique_urls,
            days_active=self.days_active,
        )
|
|
|
|
|
|
@dataclass
class CrawlRecommendation:
    """A single optimization recommendation."""

    category: str
    priority: str  # one of: critical, high, medium, low
    action: str
    impact: str
    details: str

    def to_dict(self) -> dict:
        """Serialize every declared field into a plain dict."""
        return {f.name: getattr(self, f.name) for f in fields(self)}
|
|
|
|
|
|
@dataclass
class CrawlBudgetResult:
    """Complete crawl budget analysis result."""

    log_file: str
    analysis_period: dict[str, str]
    total_bot_requests: int
    bots: dict[str, BotProfile]
    waste: list[CrawlWaste]
    total_waste_pct: float
    orphan_pages: dict[str, list[OrphanPage]]
    recommendations: list[CrawlRecommendation]
    efficiency_score: int
    timestamp: str

    def to_dict(self) -> dict:
        """Serialize the whole result tree for JSON output.

        Nested dataclasses are serialized via their own to_dict methods;
        the waste list is re-keyed by waste_type.
        """
        bot_map = {bot_name: profile.to_dict() for bot_name, profile in self.bots.items()}
        waste_map = {item.waste_type: item.to_dict() for item in self.waste}
        orphan_map = {
            bucket: [page.to_dict() for page in pages]
            for bucket, pages in self.orphan_pages.items()
        }
        return {
            "log_file": self.log_file,
            "analysis_period": self.analysis_period,
            "total_bot_requests": self.total_bot_requests,
            "bots": bot_map,
            "waste": waste_map,
            "total_waste_pct": round(self.total_waste_pct, 2),
            "orphan_pages": orphan_map,
            "recommendations": [rec.to_dict() for rec in self.recommendations],
            "efficiency_score": self.efficiency_score,
            "timestamp": self.timestamp,
        }
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CrawlBudgetAnalyzer
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class CrawlBudgetAnalyzer:
    """Analyze crawl budget efficiency from server access logs.

    Pipeline: load bot-attributed log entries via ``LogParser``, optionally
    load an XML sitemap, classify wasted crawl requests (parameter URLs,
    redirect chains, soft 404s, duplicate URL variants), profile per-bot
    behavior, detect orphan pages, and produce prioritized recommendations
    plus a 0-100 efficiency score.
    """

    def __init__(
        self,
        log_file: str,
        sitemap_url: str | None = None,
        target_url: str | None = None,
    ):
        """Store configuration; no I/O happens until analyze() is called.

        Args:
            log_file: Path to the server access log.
            sitemap_url: Optional XML sitemap URL for orphan detection.
            target_url: Optional site base URL used to rebuild absolute URLs
                from log paths when comparing against the sitemap.
        """
        self.log_file = log_file
        self.sitemap_url = sitemap_url
        self.target_url = target_url
        # Caches populated by load_log_data() / load_sitemap_urls().
        self._bot_entries: list[tuple[LogEntry, BotIdentification]] = []
        self._sitemap_urls: set[str] = set()

    # -- data loading ---------------------------------------------------------

    def load_log_data(self, log_file: str) -> list[tuple[LogEntry, BotIdentification]]:
        """Use LogParser to load all bot requests from the log file."""
        parser = LogParser(log_file=log_file, fmt="auto")
        entries = parser.parse()
        logger.info(f"Loaded {len(entries):,} bot entries from {log_file}")
        self._bot_entries = entries
        return entries

    def load_sitemap_urls(self, sitemap_url: str) -> set[str]:
        """Fetch and parse an XML sitemap, returning the set of normalized URLs.

        Handles both plain sitemaps (<url> entries) and sitemap indexes
        (<sitemap> entries pointing at child sitemaps, fetched one level
        deep). Failures are logged and yield an empty set rather than
        raising, so sitemap problems never abort the log analysis.
        """
        urls: set[str] = set()
        try:
            resp = requests.get(sitemap_url, timeout=30, headers={
                "User-Agent": "CrawlBudgetAnalyzer/1.0",
            })
            resp.raise_for_status()
            soup = BeautifulSoup(resp.content, "lxml-xml")

            # A sitemap index nests <sitemap><loc> references; recurse into
            # each child. Otherwise read <url><loc> entries directly.
            sitemap_tags = soup.find_all("sitemap")
            if sitemap_tags:
                for st in sitemap_tags:
                    loc = st.find("loc")
                    if loc and loc.text:
                        child_urls = self._fetch_sitemap_child(loc.text.strip())
                        urls.update(child_urls)
            else:
                for url_tag in soup.find_all("url"):
                    loc = url_tag.find("loc")
                    if loc and loc.text:
                        urls.add(self._normalize_url(loc.text.strip()))

            logger.info(f"Loaded {len(urls):,} URLs from sitemap: {sitemap_url}")
        except Exception as e:
            # Best-effort: report and continue with whatever was collected.
            logger.error(f"Failed to load sitemap {sitemap_url}: {e}")

        self._sitemap_urls = urls
        return urls

    def _fetch_sitemap_child(self, url: str) -> set[str]:
        """Fetch a child sitemap from a sitemap index; empty set on failure."""
        urls: set[str] = set()
        try:
            resp = requests.get(url, timeout=30, headers={
                "User-Agent": "CrawlBudgetAnalyzer/1.0",
            })
            resp.raise_for_status()
            soup = BeautifulSoup(resp.content, "lxml-xml")
            for url_tag in soup.find_all("url"):
                loc = url_tag.find("loc")
                if loc and loc.text:
                    urls.add(self._normalize_url(loc.text.strip()))
        except Exception as e:
            logger.warning(f"Failed to fetch child sitemap {url}: {e}")
        return urls

    @staticmethod
    def _normalize_url(url: str) -> str:
        """Normalize a URL by removing trailing slash and lowercasing the scheme/host."""
        parsed = urlparse(url)
        path = parsed.path.rstrip("/") or "/"
        # FIX: scheme and host are case-insensitive (RFC 3986); the previous
        # version kept the netloc's original case despite the docstring,
        # which broke sitemap-vs-log URL matching. Path case is significant
        # and is preserved.
        return f"{parsed.scheme.lower()}://{parsed.netloc.lower()}{path}"

    # -- waste identification -------------------------------------------------

    def identify_parameter_waste(
        self,
        bot_requests: list[tuple[LogEntry, BotIdentification]],
    ) -> CrawlWaste:
        """Find URLs with unnecessary query parameters wasting crawl budget.

        A request is flagged when its query string contains a known waste
        parameter (WASTE_PARAMS), a utm_* tracking parameter, or a
        pagination parameter beyond HIGH_PAGE_THRESHOLD.
        """
        waste_urls: list[str] = []
        for entry, _ in bot_requests:
            parsed = urlparse(entry.url)
            if not parsed.query:
                continue
            params = parse_qs(parsed.query)
            param_keys = {k.lower() for k in params}
            # Faceting/sorting parameters that multiply URL variants.
            has_waste = bool(param_keys & WASTE_PARAMS)
            # Analytics tracking parameters (utm_*).
            has_tracking = any(TRACKING_PARAMS_RE.match(k) for k in param_keys)
            # Deep pagination: "page" with "p" as a fallback name.
            page_val = params.get(PAGINATION_PARAM, params.get("p", [None]))
            has_deep_page = False
            if page_val and page_val[0]:
                try:
                    if int(page_val[0]) > HIGH_PAGE_THRESHOLD:
                        has_deep_page = True
                except (ValueError, TypeError):
                    # Non-numeric page values are simply ignored.
                    pass
            if has_waste or has_tracking or has_deep_page:
                waste_urls.append(entry.url)

        total = len(bot_requests)
        count = len(waste_urls)
        pct = (count / total * 100) if total else 0.0
        return CrawlWaste(
            waste_type="parameter_urls",
            urls=list(set(waste_urls)),
            count=count,
            pct_of_total=pct,
            recommendation=(
                "robots.txt에 불필요한 parameter URL 패턴을 Disallow로 추가하거나, "
                "Google Search Console의 URL Parameters 설정을 활용하세요. "
                "UTM 파라미터가 포함된 URL은 canonical 태그로 처리하세요."
            ),
        )

    def identify_redirect_chains(
        self,
        bot_requests: list[tuple[LogEntry, BotIdentification]],
    ) -> CrawlWaste:
        """Find URLs that repeatedly return redirect status codes.

        ``count`` reflects ALL redirect responses seen; the sample ``urls``
        list only URLs redirected at least twice (chain candidates).
        """
        redirect_urls: list[str] = []
        redirect_counter: Counter = Counter()
        for entry, _ in bot_requests:
            if entry.status_code in REDIRECT_STATUSES:
                redirect_counter[entry.url] += 1
                redirect_urls.append(entry.url)

        # URLs redirected more than once are chain candidates
        chain_urls = [url for url, cnt in redirect_counter.items() if cnt >= 2]
        total = len(bot_requests)
        count = len(redirect_urls)
        pct = (count / total * 100) if total else 0.0
        return CrawlWaste(
            waste_type="redirect_chains",
            urls=chain_urls,
            count=count,
            pct_of_total=pct,
            recommendation=(
                "301/302 리다이렉트가 반복적으로 크롤링되고 있습니다. "
                "내부 링크를 최종 목적지 URL로 직접 업데이트하고, "
                "리다이렉트 체인을 단일 리다이렉트로 단축하세요."
            ),
        )

    def identify_soft_404s(
        self,
        bot_requests: list[tuple[LogEntry, BotIdentification]],
    ) -> CrawlWaste:
        """Find 200-status pages with suspiciously small response sizes.

        Zero-byte responses are excluded (size 0 typically means the size
        was not logged rather than an empty page).
        """
        soft_404_urls: list[str] = []
        for entry, _ in bot_requests:
            if entry.status_code == 200 and entry.response_size < SOFT_404_MAX_SIZE:
                if entry.response_size > 0:
                    soft_404_urls.append(entry.url)

        total = len(bot_requests)
        count = len(soft_404_urls)
        pct = (count / total * 100) if total else 0.0
        return CrawlWaste(
            waste_type="soft_404s",
            urls=list(set(soft_404_urls)),
            count=count,
            pct_of_total=pct,
            recommendation=(
                "200 상태 코드를 반환하지만 콘텐츠가 거의 없는 Soft 404 페이지입니다. "
                "실제 404 상태 코드를 반환하거나, 해당 페이지에 noindex 태그를 추가하세요."
            ),
        )

    def identify_duplicate_crawls(
        self,
        bot_requests: list[tuple[LogEntry, BotIdentification]],
    ) -> CrawlWaste:
        """Find duplicate URL variants: www/non-www, trailing slash, etc.

        URLs are grouped under a canonical key (lowercased host without a
        "www." prefix, path without trailing slash); any key with more than
        one distinct variant counts as duplicated.
        """
        url_variants: dict[str, set[str]] = defaultdict(set)
        for entry, _ in bot_requests:
            parsed = urlparse(entry.url)
            # FIX: removeprefix() strips a literal leading "www." only;
            # the previous lstrip("www.") stripped any run of 'w'/'.' chars,
            # mangling hosts such as "web.example.com" -> "eb.example.com".
            host = parsed.netloc.lower().removeprefix("www.")
            path = parsed.path.rstrip("/") or "/"
            canonical = f"{host}{path}"
            full_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
            url_variants[canonical].add(full_url)

        # Identify canonicals with multiple variants
        duplicate_urls: list[str] = []
        for canonical, variants in url_variants.items():
            if len(variants) > 1:
                duplicate_urls.extend(variants)

        total = len(bot_requests)
        # Count how many requests hit duplicate variant URLs.
        # FIX: parse each URL once instead of three times per entry.
        dup_set = set(duplicate_urls)
        dup_request_count = 0
        for e, _ in bot_requests:
            p = urlparse(e.url)
            if f"{p.scheme}://{p.netloc}{p.path}" in dup_set:
                dup_request_count += 1
        pct = (dup_request_count / total * 100) if total else 0.0
        return CrawlWaste(
            waste_type="duplicate_urls",
            urls=duplicate_urls[:TOP_N_URLS],
            count=dup_request_count,
            pct_of_total=pct,
            recommendation=(
                "www/non-www, trailing slash 유무 등 중복 URL 변형이 크롤링되고 있습니다. "
                "301 리다이렉트로 canonical URL로 통합하고, "
                "rel=canonical 태그를 정확히 설정하세요."
            ),
        )

    # -- bot profiling --------------------------------------------------------

    def profile_bots(
        self,
        bot_requests: list[tuple[LogEntry, BotIdentification]],
    ) -> dict[str, BotProfile]:
        """Generate per-bot behavior profiles.

        Aggregates per bot: request volume, URL frequency, HTTP status
        breakdown, path-depth distribution, active days, and the three
        busiest hours of the day.
        """
        bot_data: dict[str, dict] = defaultdict(lambda: {
            "urls": Counter(),
            "statuses": Counter(),
            "hours": Counter(),
            "days": set(),
            "depths": Counter(),
            "count": 0,
        })

        for entry, bot in bot_requests:
            bd = bot_data[bot.name]
            bd["count"] += 1
            bd["urls"][entry.url] += 1
            bd["statuses"][str(entry.status_code)] += 1
            # URL depth = number of non-empty path segments
            depth = len([s for s in urlparse(entry.url).path.split("/") if s])
            bd["depths"][depth] += 1
            if entry.timestamp:
                bd["hours"][entry.timestamp.hour] += 1
                bd["days"].add(entry.timestamp.strftime("%Y-%m-%d"))

        profiles: dict[str, BotProfile] = {}
        for name, bd in bot_data.items():
            # Guard against logs with no parseable timestamps (0 days seen).
            days_active = len(bd["days"]) or 1
            rpd = bd["count"] / days_active
            # Top 3 peak hours by request count
            top_hours = sorted(bd["hours"].items(), key=lambda x: -x[1])[:3]
            peak = [h for h, _ in top_hours]
            profiles[name] = BotProfile(
                name=name,
                total_requests=bd["count"],
                requests_per_day=rpd,
                crawl_depth_distribution=dict(sorted(bd["depths"].items())),
                peak_hours=peak,
                status_breakdown=dict(bd["statuses"]),
                top_crawled_urls=bd["urls"].most_common(TOP_N_URLS),
                unique_urls=len(bd["urls"]),
                days_active=days_active,
            )
        return profiles

    # -- orphan detection -----------------------------------------------------

    def detect_orphan_pages(
        self,
        crawled_urls: set[str],
        sitemap_urls: set[str],
    ) -> dict[str, list[OrphanPage]]:
        """Compare crawled URLs with sitemap URLs to find orphans.

        Both inputs are expected to be normalized the same way (see
        _normalize_url) for the set differences to be meaningful.
        """
        in_sitemap_not_crawled = sitemap_urls - crawled_urls
        crawled_not_in_sitemap = crawled_urls - sitemap_urls

        return {
            "in_sitemap_not_crawled": [
                OrphanPage(url=u, in_sitemap=True, crawled=False)
                for u in sorted(in_sitemap_not_crawled)
            ],
            "crawled_not_in_sitemap": [
                OrphanPage(url=u, in_sitemap=False, crawled=True)
                for u in sorted(crawled_not_in_sitemap)
            ],
        }

    # -- efficiency score -----------------------------------------------------

    @staticmethod
    def calculate_efficiency_score(total_waste_pct: float) -> int:
        """Calculate crawl efficiency score: 100 - waste%, capped at [0, 100]."""
        score = int(100 - total_waste_pct)
        return max(0, min(100, score))

    # -- recommendations ------------------------------------------------------

    def generate_recommendations(
        self,
        waste: list[CrawlWaste],
        orphans: dict[str, list[OrphanPage]],
        bot_profiles: dict[str, BotProfile],
    ) -> list[CrawlRecommendation]:
        """Generate prioritized crawl budget optimization recommendations.

        Emits one recommendation per non-empty waste category (priority
        scaled by its share of total requests), orphan-page advice, and a
        per-bot alert when a bot's 4xx/5xx error rate exceeds 10%.
        Results are sorted critical -> high -> medium -> low.
        """
        recs: list[CrawlRecommendation] = []

        # Waste-based recommendations: priority scales with % of requests.
        for w in waste:
            if w.pct_of_total > 5.0:
                priority = "critical"
            elif w.pct_of_total > 2.0:
                priority = "high"
            elif w.pct_of_total > 0.5:
                priority = "medium"
            else:
                priority = "low"

            if w.waste_type == "parameter_urls" and w.count > 0:
                recs.append(CrawlRecommendation(
                    category="URL Parameters",
                    priority=priority,
                    action="robots.txt에 parameter URL 패턴 Disallow 규칙 추가",
                    impact=f"크롤 요청 {w.pct_of_total:.1f}% 절감 가능",
                    details=(
                        f"총 {w.count:,}건의 parameter URL이 크롤링되었습니다. "
                        f"sort, filter, utm_* 등 불필요한 파라미터를 차단하세요."
                    ),
                ))
            elif w.waste_type == "redirect_chains" and w.count > 0:
                recs.append(CrawlRecommendation(
                    category="Redirect Chains",
                    priority=priority,
                    action="리다이렉트 체인을 단축하고 내부 링크 업데이트",
                    impact=f"크롤 요청 {w.pct_of_total:.1f}% 절감 가능",
                    details=(
                        f"총 {w.count:,}건의 리다이렉트 요청이 발생했습니다. "
                        f"내부 링크를 최종 URL로 직접 연결하세요."
                    ),
                ))
            elif w.waste_type == "soft_404s" and w.count > 0:
                recs.append(CrawlRecommendation(
                    category="Soft 404s",
                    priority=priority,
                    action="Soft 404 페이지에 적절한 HTTP 상태 코드 또는 noindex 적용",
                    impact=f"크롤 요청 {w.pct_of_total:.1f}% 절감 가능",
                    details=(
                        f"총 {w.count:,}건의 Soft 404가 감지되었습니다. "
                        f"적절한 404 응답 또는 noindex meta 태그를 설정하세요."
                    ),
                ))
            elif w.waste_type == "duplicate_urls" and w.count > 0:
                recs.append(CrawlRecommendation(
                    category="Duplicate URLs",
                    priority=priority,
                    action="URL 정규화 및 canonical 태그 설정",
                    impact=f"크롤 요청 {w.pct_of_total:.1f}% 절감 가능",
                    details=(
                        f"총 {w.count:,}건의 중복 URL 변형이 크롤링되었습니다. "
                        f"www/non-www, trailing slash 통합을 진행하세요."
                    ),
                ))

        # Orphan page recommendations
        not_crawled = orphans.get("in_sitemap_not_crawled", [])
        not_in_sitemap = orphans.get("crawled_not_in_sitemap", [])

        if len(not_crawled) > 0:
            # Priority scales with the uncrawled share of the sitemap.
            pct = len(not_crawled) / max(len(self._sitemap_urls), 1) * 100
            priority = "critical" if pct > 30 else "high" if pct > 10 else "medium"
            recs.append(CrawlRecommendation(
                category="Orphan Pages (Uncrawled)",
                priority=priority,
                action="사이트맵에 있으나 크롤링되지 않은 페이지의 내부 링크 강화",
                impact=f"사이트맵 URL의 {pct:.1f}%가 미크롤 상태",
                details=(
                    f"총 {len(not_crawled):,}개 URL이 사이트맵에 있지만 "
                    f"봇이 크롤링하지 않았습니다. 내부 링크를 추가하세요."
                ),
            ))

        if len(not_in_sitemap) > 0:
            recs.append(CrawlRecommendation(
                category="Orphan Pages (Unlisted)",
                priority="medium",
                action="크롤링되었으나 사이트맵에 없는 페이지를 사이트맵에 추가 또는 차단",
                impact=f"{len(not_in_sitemap):,}개 URL이 사이트맵에 미등록",
                details=(
                    f"봇이 크롤링한 {len(not_in_sitemap):,}개 URL이 "
                    f"사이트맵에 포함되어 있지 않습니다. 유효한 페이지는 "
                    f"사이트맵에 추가하고, 불필요한 페이지는 robots.txt로 차단하세요."
                ),
            ))

        # Bot-specific recommendations: flag high 4xx/5xx error rates.
        for name, profile in bot_profiles.items():
            error_count = sum(
                v for k, v in profile.status_breakdown.items()
                if k.startswith(("4", "5"))
            )
            error_pct = (error_count / profile.total_requests * 100) if profile.total_requests else 0
            if error_pct > 10:
                recs.append(CrawlRecommendation(
                    category=f"Bot Errors ({name})",
                    priority="high" if error_pct > 20 else "medium",
                    action=f"{name}의 4xx/5xx 오류율 {error_pct:.1f}% 개선 필요",
                    impact=f"{name} 크롤 예산의 {error_pct:.1f}%가 오류에 소비",
                    details=(
                        f"{name}이(가) {error_count:,}건의 오류 응답을 받았습니다. "
                        f"깨진 링크를 수정하고 서버 안정성을 개선하세요."
                    ),
                ))

        # Sort by priority (stable, so insertion order breaks ties).
        priority_order = {"critical": 0, "high": 1, "medium": 2, "low": 3}
        recs.sort(key=lambda r: priority_order.get(r.priority, 4))
        return recs

    # -- orchestrator ---------------------------------------------------------

    def analyze(self, scope: str = "all") -> CrawlBudgetResult:
        """Orchestrate the full crawl budget analysis.

        Args:
            scope: "all", "waste", "orphans", or "bots" — limits which
                analysis stages run; recommendations and the score are
                derived from whatever stages produced data.
        """
        # Load log data
        entries = self.load_log_data(self.log_file)
        if not entries:
            logger.warning("No bot entries found in log file.")

        # Load sitemap if provided
        if self.sitemap_url:
            self.load_sitemap_urls(self.sitemap_url)

        # Profile bots
        bot_profiles: dict[str, BotProfile] = {}
        if scope in ("all", "bots"):
            bot_profiles = self.profile_bots(entries)

        # Identify waste
        waste: list[CrawlWaste] = []
        if scope in ("all", "waste"):
            waste.append(self.identify_parameter_waste(entries))
            waste.append(self.identify_redirect_chains(entries))
            waste.append(self.identify_soft_404s(entries))
            waste.append(self.identify_duplicate_crawls(entries))

        # Categories can overlap, so this is an upper bound on waste.
        total_waste_pct = sum(w.pct_of_total for w in waste)

        # Detect orphan pages (requires a loaded sitemap)
        orphans: dict[str, list[OrphanPage]] = {
            "in_sitemap_not_crawled": [],
            "crawled_not_in_sitemap": [],
        }
        if scope in ("all", "orphans") and self._sitemap_urls:
            crawled_urls: set[str] = set()
            for entry, _ in entries:
                # Rebuild absolute URLs from logged paths for comparison.
                if self.target_url:
                    parsed_target = urlparse(self.target_url)
                    full = f"{parsed_target.scheme}://{parsed_target.netloc}{entry.url}"
                    crawled_urls.add(self._normalize_url(full))
                else:
                    # NOTE(review): without target_url the logged URL is
                    # compared as-is; if the log stores bare paths this will
                    # never match absolute sitemap URLs — confirm target_url
                    # is supplied in that case.
                    crawled_urls.add(entry.url)
            orphans = self.detect_orphan_pages(crawled_urls, self._sitemap_urls)

        # Efficiency score
        efficiency_score = self.calculate_efficiency_score(total_waste_pct)

        # Recommendations
        recommendations = self.generate_recommendations(waste, orphans, bot_profiles)

        # Date range from entries
        timestamps = [e.timestamp for e, _ in entries if e.timestamp]
        analysis_period = {}
        if timestamps:
            analysis_period = {
                "from": min(timestamps).strftime("%Y-%m-%d"),
                "to": max(timestamps).strftime("%Y-%m-%d"),
            }

        return CrawlBudgetResult(
            log_file=self.log_file,
            analysis_period=analysis_period,
            total_bot_requests=len(entries),
            bots=bot_profiles,
            waste=waste,
            total_waste_pct=total_waste_pct,
            orphan_pages=orphans,
            recommendations=recommendations,
            efficiency_score=efficiency_score,
            timestamp=datetime.now().isoformat(),
        )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CLI
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def main() -> None:
    """CLI entry point: parse arguments, run the analysis, emit the report.

    Exits with status 1 when the log file does not exist. Output goes to
    stdout unless --output names a file; --json switches the report from
    plain text to JSON.
    """
    cli = argparse.ArgumentParser(
        description="Analyze crawl budget efficiency and generate optimization recommendations.",
    )
    cli.add_argument("--log-file", required=True, help="Path to server access log file")
    cli.add_argument("--sitemap", default=None, help="URL of XML sitemap for orphan page detection")
    cli.add_argument("--url", default=None, help="Target website URL (used for URL normalization and Ahrefs)")
    cli.add_argument(
        "--scope",
        choices=["all", "waste", "orphans", "bots"],
        default="all",
        help="Analysis scope (default: all)",
    )
    cli.add_argument(
        "--ahrefs",
        action="store_true",
        help="Include Ahrefs page history comparison (requires MCP tool)",
    )
    cli.add_argument("--json", action="store_true", help="Output in JSON format")
    cli.add_argument("--output", default=None, help="Write output to file instead of stdout")
    args = cli.parse_args()

    # Fail fast on a missing log file.
    if not Path(args.log_file).exists():
        logger.error(f"Log file not found: {args.log_file}")
        sys.exit(1)

    analyzer = CrawlBudgetAnalyzer(
        log_file=args.log_file,
        sitemap_url=args.sitemap,
        target_url=args.url,
    )
    result = analyzer.analyze(scope=args.scope)

    # Render either JSON or the plain-text report.
    if args.json:
        rendered = json.dumps(result.to_dict(), indent=2, ensure_ascii=False)
    else:
        rendered = "\n".join(_format_text_report(result))

    if args.output:
        Path(args.output).write_text(rendered, encoding="utf-8")
        logger.info(f"Output written to {args.output}")
    else:
        print(rendered)
|
|
|
|
|
|
def _format_text_report(result: CrawlBudgetResult) -> list[str]:
|
|
"""Format the analysis result as a human-readable text report."""
|
|
lines = [
|
|
"=" * 70,
|
|
"Crawl Budget Analysis Report",
|
|
"=" * 70,
|
|
f"Log File: {result.log_file}",
|
|
f"Total Bot Requests: {result.total_bot_requests:,}",
|
|
f"Efficiency Score: {result.efficiency_score}/100",
|
|
f"Total Waste: {result.total_waste_pct:.1f}%",
|
|
]
|
|
if result.analysis_period:
|
|
lines.append(
|
|
f"Period: {result.analysis_period.get('from', 'N/A')} ~ "
|
|
f"{result.analysis_period.get('to', 'N/A')}"
|
|
)
|
|
lines.append("")
|
|
|
|
# Bot profiles
|
|
if result.bots:
|
|
lines.append("-" * 60)
|
|
lines.append("Bot Profiles")
|
|
lines.append("-" * 60)
|
|
for name, profile in sorted(result.bots.items(), key=lambda x: -x[1].total_requests):
|
|
lines.append(f"\n [{name.upper()}]")
|
|
lines.append(f" Requests: {profile.total_requests:,}")
|
|
lines.append(f" Unique URLs: {profile.unique_urls:,}")
|
|
lines.append(f" Requests/Day: {profile.requests_per_day:,.1f}")
|
|
lines.append(f" Days Active: {profile.days_active}")
|
|
lines.append(f" Peak Hours: {profile.peak_hours}")
|
|
lines.append(f" Status: {profile.status_breakdown}")
|
|
lines.append("")
|
|
|
|
# Waste breakdown
|
|
if result.waste:
|
|
lines.append("-" * 60)
|
|
lines.append("Crawl Waste Breakdown")
|
|
lines.append("-" * 60)
|
|
for w in result.waste:
|
|
if w.count > 0:
|
|
lines.append(f"\n [{w.waste_type}]")
|
|
lines.append(f" Count: {w.count:,} ({w.pct_of_total:.1f}%)")
|
|
lines.append(f" Recommendation: {w.recommendation}")
|
|
if w.urls:
|
|
lines.append(f" Sample URLs:")
|
|
for u in w.urls[:5]:
|
|
lines.append(f" - {u}")
|
|
lines.append("")
|
|
|
|
# Orphan pages
|
|
not_crawled = result.orphan_pages.get("in_sitemap_not_crawled", [])
|
|
not_in_sitemap = result.orphan_pages.get("crawled_not_in_sitemap", [])
|
|
if not_crawled or not_in_sitemap:
|
|
lines.append("-" * 60)
|
|
lines.append("Orphan Pages")
|
|
lines.append("-" * 60)
|
|
if not_crawled:
|
|
lines.append(f"\n In Sitemap but Not Crawled: {len(not_crawled):,}")
|
|
for op in not_crawled[:10]:
|
|
lines.append(f" - {op.url}")
|
|
if not_in_sitemap:
|
|
lines.append(f"\n Crawled but Not in Sitemap: {len(not_in_sitemap):,}")
|
|
for op in not_in_sitemap[:10]:
|
|
lines.append(f" - {op.url}")
|
|
lines.append("")
|
|
|
|
# Recommendations
|
|
if result.recommendations:
|
|
lines.append("-" * 60)
|
|
lines.append("Recommendations")
|
|
lines.append("-" * 60)
|
|
for i, rec in enumerate(result.recommendations, 1):
|
|
lines.append(f"\n {i}. [{rec.priority.upper()}] {rec.category}")
|
|
lines.append(f" Action: {rec.action}")
|
|
lines.append(f" Impact: {rec.impact}")
|
|
lines.append(f" Details: {rec.details}")
|
|
|
|
lines.append("")
|
|
lines.append(f"Generated: {result.timestamp}")
|
|
return lines
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|