Files
our-claude-skills/custom-skills/32-seo-crawl-budget/code/scripts/log_parser.py
Andrew Yim a3ff965b87 Add SEO skills 19-28, 31-32 with full Python implementations
12 new skills: Keyword Strategy, SERP Analysis, Position Tracking,
Link Building, Content Strategy, E-Commerce SEO, KPI Framework,
International SEO, AI Visibility, Knowledge Graph, Competitor Intel,
and Crawl Budget. ~20K lines of Python across 25 domain scripts.
Updated skill 11 pipeline table and repo CLAUDE.md.
Enhanced skill 18 local SEO workflow from jamie.clinic audit.

Note: Skill 26 hreflang_validator.py pending (content filter block).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 12:05:59 +09:00

614 lines
21 KiB
Python

"""
Log Parser - Server access log parser with bot identification
=============================================================
Purpose: Parse Apache/Nginx/CloudFront access logs, identify search engine
bots, extract crawl data, and generate per-bot statistics.
Python: 3.10+
"""
import argparse
import bz2
import gzip
import json
import logging
import re
import sys
from collections import Counter, defaultdict
from dataclasses import asdict, dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Generator, TextIO
# Module-wide logging: timestamped INFO-level messages (stderr by default).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Constants: bot user-agent patterns
# ---------------------------------------------------------------------------
# Known crawler user-agent patterns, matched case-insensitively with
# re.search. First match wins, so more specific patterns should come first.
BOT_PATTERNS: list[tuple[str, str, str]] = [
    # (canonical name, regex pattern, category)
    # Search-engine crawlers (Google, Naver Yeti, Bing, Daum, Apple,
    # DuckDuckGo, Baidu, Yandex, Sogou, Seznam).
    ("googlebot", r"Googlebot(?:-Image|-News|-Video)?/", "search_engine"),
    ("googlebot-adsbot", r"AdsBot-Google", "search_engine"),
    ("googlebot-mediapartners", r"Mediapartners-Google", "search_engine"),
    ("yeti", r"Yeti/", "search_engine"),
    ("bingbot", r"bingbot/", "search_engine"),
    ("daumoa", r"Daumoa", "search_engine"),
    ("applebot", r"Applebot/", "search_engine"),
    ("duckduckbot", r"DuckDuckBot/", "search_engine"),
    ("baiduspider", r"Baiduspider", "search_engine"),
    ("yandexbot", r"YandexBot/", "search_engine"),
    ("sogou", r"Sogou", "search_engine"),
    ("seznambot", r"SeznamBot/", "search_engine"),
    # SEO-tool crawlers (Ahrefs, Semrush, Majestic, Moz, Screaming Frog).
    ("ahrefsbot", r"AhrefsBot/", "seo_tool"),
    ("semrushbot", r"SemrushBot/", "seo_tool"),
    ("mj12bot", r"MJ12bot/", "seo_tool"),
    ("dotbot", r"DotBot/", "seo_tool"),
    ("rogerbot", r"rogerbot/", "seo_tool"),
    ("screaming frog", r"Screaming Frog SEO Spider", "seo_tool"),
]
# Patterns pre-compiled once at import time so per-line matching in the
# parser hot loop does not pay regex-compilation cost.
COMPILED_BOT_PATTERNS: list[tuple[str, re.Pattern, str]] = [
    (name, re.compile(pattern, re.IGNORECASE), category)
    for name, pattern, category in BOT_PATTERNS
]
# ---------------------------------------------------------------------------
# Regex patterns for each log format
# ---------------------------------------------------------------------------
# Nginx "combined" log format: ip - user [timestamp] "METHOD url proto"
# status size "referer" "user-agent".
NGINX_COMBINED_RE = re.compile(
    r'(?P<ip>[\d.:a-fA-F]+)\s+-\s+(?P<user>\S+)\s+'
    r'\[(?P<timestamp>[^\]]+)\]\s+'
    r'"(?P<method>\S+)\s+(?P<url>\S+)\s+(?P<protocol>[^"]+)"\s+'
    r'(?P<status>\d{3})\s+(?P<size>\d+|-)\s+'
    r'"(?P<referer>[^"]*)"\s+'
    r'"(?P<user_agent>[^"]*)"'
)
# Apache "combined" differs only in the second field (ident, any token
# instead of a literal "-").
APACHE_COMBINED_RE = re.compile(
    r'(?P<ip>[\d.:a-fA-F]+)\s+\S+\s+(?P<user>\S+)\s+'
    r'\[(?P<timestamp>[^\]]+)\]\s+'
    r'"(?P<method>\S+)\s+(?P<url>\S+)\s+(?P<protocol>[^"]+)"\s+'
    r'(?P<status>\d{3})\s+(?P<size>\d+|-)\s+'
    r'"(?P<referer>[^"]*)"\s+'
    r'"(?P<user_agent>[^"]*)"'
)
# Column order of CloudFront standard (access) log files; tab-separated.
# Used for positional indexing in _parse_cloudfront (e.g. 4=c_ip,
# 8=sc_status, 10=cs_user_agent).
CLOUDFRONT_FIELDS = [
    "date", "time", "x_edge_location", "sc_bytes", "c_ip",
    "cs_method", "cs_host", "cs_uri_stem", "sc_status",
    "cs_referer", "cs_user_agent", "cs_uri_query",
    "cs_cookie", "x_edge_result_type", "x_edge_request_id",
    "x_host_header", "cs_protocol", "cs_bytes",
    "time_taken", "x_forwarded_for", "ssl_protocol",
    "ssl_cipher", "x_edge_response_result_type", "cs_protocol_version",
]
# Timestamp formats: both Nginx and Apache use e.g. "10/Oct/2024:13:55:36 +0900".
# Note %z makes the parsed datetimes timezone-aware.
NGINX_TS_FORMAT = "%d/%b/%Y:%H:%M:%S %z"
APACHE_TS_FORMAT = "%d/%b/%Y:%H:%M:%S %z"
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass
class LogEntry:
"""A single parsed log entry."""
timestamp: datetime | None
ip: str
method: str
url: str
status_code: int
response_size: int
user_agent: str
referer: str
def to_dict(self) -> dict:
d = asdict(self)
if self.timestamp:
d["timestamp"] = self.timestamp.isoformat()
return d
@dataclass
class BotIdentification:
    """Bot identification result."""
    name: str                 # canonical bot name (e.g. "googlebot") or "other"
    user_agent_pattern: str   # regex pattern (or keyword) that matched the UA
    category: str             # "search_engine", "seo_tool", or "other"
@dataclass
class BotStats:
    """Aggregated crawl statistics for a single bot."""
    name: str
    total_requests: int = 0
    unique_urls: int = 0
    status_distribution: dict[str, int] = field(default_factory=dict)
    top_urls: list[tuple[str, int]] = field(default_factory=list)
    hourly_distribution: dict[int, int] = field(default_factory=dict)
    daily_distribution: dict[str, int] = field(default_factory=dict)
    avg_response_size: float = 0.0

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict.

        Top URLs become ``{"url", "count"}`` objects and the average
        response size is rounded to one decimal place.
        """
        payload = {
            "name": self.name,
            "total_requests": self.total_requests,
            "unique_urls": self.unique_urls,
            "status_distribution": self.status_distribution,
        }
        payload["top_urls"] = [
            {"url": url, "count": count} for url, count in self.top_urls
        ]
        payload["hourly_distribution"] = self.hourly_distribution
        payload["daily_distribution"] = self.daily_distribution
        payload["avg_response_size"] = round(self.avg_response_size, 1)
        return payload
@dataclass
class LogParseResult:
    """Complete result of one log-parsing run."""
    log_file: str
    format_detected: str
    total_lines: int
    parsed_lines: int
    bot_entries: int
    date_range: dict[str, str]
    bots: dict[str, BotStats]
    errors: int

    def to_dict(self) -> dict:
        """Serialize the whole result, expanding each BotStats via its to_dict()."""
        serialized_bots = {
            bot_name: bot_stats.to_dict()
            for bot_name, bot_stats in self.bots.items()
        }
        return {
            "log_file": self.log_file,
            "format_detected": self.format_detected,
            "total_lines": self.total_lines,
            "parsed_lines": self.parsed_lines,
            "bot_entries": self.bot_entries,
            "date_range": self.date_range,
            "bots": serialized_bots,
            "errors": self.errors,
        }
# ---------------------------------------------------------------------------
# LogParser class
# ---------------------------------------------------------------------------
class LogParser:
    """Parse server access logs and identify search engine bot traffic.

    Supports Nginx/Apache "combined" format and CloudFront tab-separated
    access logs, read from plain text, .gz, or .bz2 files. Entries whose
    user-agent matches a known pattern (or a generic crawler keyword)
    are yielded for per-bot statistics.
    """

    # Generic substrings that mark an otherwise-unknown crawler UA.
    # Hoisted to a class constant so identify_bot() does not rebuild the
    # list on every call.
    _GENERIC_BOT_KEYWORDS = ("bot", "spider", "crawler", "scraper", "fetch")

    def __init__(
        self,
        log_file: str,
        fmt: str = "auto",
        streaming: bool = False,
    ):
        """
        Args:
            log_file: Path to the access log (plain, .gz, or .bz2).
            fmt: "auto" (detect from the first non-empty line), "nginx",
                "apache", or "cloudfront".
            streaming: Kept for API compatibility; parsing is always lazy
                via parse_streaming().
        """
        self.log_file = log_file
        self.fmt = fmt
        self.streaming = streaming
        self._detected_format: str | None = None
        self._parse_errors = 0
        # Physical line count from the most recent parse pass, so
        # parse_and_analyze() does not need to re-read the file.
        self._total_lines = 0

    # -- format detection -----------------------------------------------------

    def _detect_format(self, line: str) -> str:
        """Auto-detect log format from a sample (first non-empty) line."""
        if line.startswith("#"):
            # CloudFront log files start with "#Version" / "#Fields" headers.
            return "cloudfront"
        # Try nginx before apache: the patterns differ only in the second
        # field and nginx is the stricter match.
        if NGINX_COMBINED_RE.match(line):
            return "nginx"
        if APACHE_COMBINED_RE.match(line):
            return "apache"
        # Fallback: heavily tab-separated lines look like CloudFront data
        # rows without the leading header comments.
        if "\t" in line and line.count("\t") >= 10:
            return "cloudfront"
        return "nginx"

    # -- line parsers ---------------------------------------------------------

    @staticmethod
    def _entry_from_combined(m: re.Match, ts_format: str) -> LogEntry:
        """Build a LogEntry from a combined-format regex match.

        Shared by the nginx and apache parsers, whose bodies were
        previously duplicated line-for-line.
        """
        ts = None
        try:
            ts = datetime.strptime(m.group("timestamp"), ts_format)
        except (ValueError, TypeError):
            # Keep the entry even when the timestamp is malformed.
            pass
        size_raw = m.group("size")
        # "-" means no body was sent (e.g. 304); treat as zero bytes.
        size = int(size_raw) if size_raw != "-" else 0
        return LogEntry(
            timestamp=ts,
            ip=m.group("ip"),
            method=m.group("method"),
            url=m.group("url"),
            status_code=int(m.group("status")),
            response_size=size,
            user_agent=m.group("user_agent"),
            referer=m.group("referer"),
        )

    def _parse_nginx_combined(self, line: str) -> LogEntry | None:
        """Parse a single Nginx combined format log line (None on mismatch)."""
        m = NGINX_COMBINED_RE.match(line)
        return self._entry_from_combined(m, NGINX_TS_FORMAT) if m else None

    def _parse_apache_combined(self, line: str) -> LogEntry | None:
        """Parse a single Apache combined format log line (None on mismatch)."""
        m = APACHE_COMBINED_RE.match(line)
        return self._entry_from_combined(m, APACHE_TS_FORMAT) if m else None

    def _parse_cloudfront(self, line: str) -> LogEntry | None:
        """Parse a CloudFront tab-separated log line.

        Header lines ("#...") and rows with too few columns yield None.
        Field positions follow CLOUDFRONT_FIELDS.
        """
        if line.startswith("#"):
            return None
        parts = line.strip().split("\t")
        if len(parts) < 13:
            return None
        ts = None
        try:
            # Date and time are split across the first two columns.
            # NOTE(review): CloudFront logs these in UTC, but the parsed
            # datetime is naive — confirm before mixing with other formats.
            ts = datetime.strptime(f"{parts[0]} {parts[1]}", "%Y-%m-%d %H:%M:%S")
        except (ValueError, IndexError):
            pass
        try:
            status = int(parts[8])
        except (ValueError, IndexError):
            status = 0
        try:
            size = int(parts[3])
        except (ValueError, IndexError):
            size = 0
        url = parts[7] if len(parts) > 7 else ""
        query = parts[11] if len(parts) > 11 else ""
        if query and query != "-":
            url = f"{url}?{query}"
        ua = parts[10] if len(parts) > 10 else ""
        # CloudFront percent-encodes spaces in the user-agent field
        # (sometimes double-encoded as %2520).
        ua = ua.replace("%20", " ").replace("%2520", " ")
        referer = parts[9] if len(parts) > 9 else ""
        return LogEntry(
            timestamp=ts,
            ip=parts[4] if len(parts) > 4 else "",
            method=parts[5] if len(parts) > 5 else "",
            url=url,
            status_code=status,
            response_size=size,
            user_agent=ua,
            referer=referer,
        )

    def _parse_line(self, line: str, fmt: str) -> LogEntry | None:
        """Dispatch to the parser for *fmt*; unknown formats fall back to nginx."""
        parsers = {
            "nginx": self._parse_nginx_combined,
            "apache": self._parse_apache_combined,
            "cloudfront": self._parse_cloudfront,
        }
        return parsers.get(fmt, self._parse_nginx_combined)(line)

    # -- bot identification ---------------------------------------------------

    @staticmethod
    def identify_bot(user_agent: str) -> BotIdentification | None:
        """Match user-agent against known bot patterns.

        Returns a BotIdentification for known bots, a generic "other"
        identification when a common crawler keyword appears in the UA,
        or None for (presumed) human traffic.
        """
        if not user_agent or user_agent == "-":
            return None
        for name, pattern, category in COMPILED_BOT_PATTERNS:
            if pattern.search(user_agent):
                return BotIdentification(
                    name=name,
                    user_agent_pattern=pattern.pattern,
                    category=category,
                )
        # Heuristic: generic bot detection via common keywords.
        ua_lower = user_agent.lower()
        for kw in LogParser._GENERIC_BOT_KEYWORDS:
            if kw in ua_lower:
                return BotIdentification(
                    name="other",
                    user_agent_pattern=kw,
                    category="other",
                )
        return None

    # -- file handling --------------------------------------------------------

    @staticmethod
    def _open_file(path: str) -> TextIO:
        """Open plain text, .gz, or .bz2 log files in text mode.

        Undecodable bytes are replaced rather than raising, since access
        logs frequently contain arbitrary client-supplied bytes.
        """
        p = Path(path)
        if p.suffix == ".gz":
            return gzip.open(path, "rt", encoding="utf-8", errors="replace")
        if p.suffix == ".bz2":
            return bz2.open(path, "rt", encoding="utf-8", errors="replace")
        return open(path, "r", encoding="utf-8", errors="replace")

    # -- streaming parser -----------------------------------------------------

    def parse_streaming(
        self,
        filter_bot: str | None = None,
    ) -> Generator[tuple[LogEntry, BotIdentification], None, None]:
        """Lazily yield (entry, bot) pairs for bot traffic in the log.

        Resets the per-run line/error counters, auto-detects the format
        from the first non-empty line when fmt == "auto", and skips lines
        that fail to parse (counted in self._parse_errors).

        Args:
            filter_bot: When given, only entries for this canonical bot
                name (case-insensitive) are yielded.
        """
        fmt = self.fmt
        format_known = fmt != "auto"
        # Reset per-run counters so repeated calls do not accumulate
        # stale error/line totals (previously a cross-call leak).
        self._parse_errors = 0
        self._total_lines = 0
        # Hoist case-normalization out of the per-line loop.
        wanted = filter_bot.lower() if filter_bot else None
        fh = self._open_file(self.log_file)
        try:
            for raw_line in fh:
                self._total_lines += 1
                line = raw_line.strip()
                if not line:
                    continue
                if not format_known:
                    fmt = self._detect_format(line)
                    self._detected_format = fmt
                    format_known = True
                entry = self._parse_line(line, fmt)
                if entry is None:
                    self._parse_errors += 1
                    continue
                bot = self.identify_bot(entry.user_agent)
                if bot is None:
                    continue
                if wanted and bot.name != wanted:
                    continue
                yield entry, bot
        finally:
            fh.close()

    # -- full parse -----------------------------------------------------------

    @staticmethod
    def _naive(ts: datetime) -> datetime:
        """Strip tzinfo so aware log timestamps can be compared with naive bounds.

        Nginx/Apache timestamps are timezone-aware (%z) while CLI date
        bounds are naive; comparing them directly raises TypeError, so
        both sides are normalized to wall-clock time.
        """
        return ts.replace(tzinfo=None) if ts.tzinfo is not None else ts

    def parse(
        self,
        filter_bot: str | None = None,
        date_from: datetime | None = None,
        date_to: datetime | None = None,
    ) -> list[tuple[LogEntry, BotIdentification]]:
        """Fully materialize bot entries, with optional bot and date filters.

        Entries without a parseable timestamp always pass the date filter.
        """
        results: list[tuple[LogEntry, BotIdentification]] = []
        for entry, bot in self.parse_streaming(filter_bot):
            if entry.timestamp is not None:
                ts = self._naive(entry.timestamp)
                if date_from and ts < self._naive(date_from):
                    continue
                if date_to and ts > self._naive(date_to):
                    continue
            results.append((entry, bot))
        return results

    # -- statistics -----------------------------------------------------------

    @staticmethod
    def get_bot_stats(
        entries: list[tuple[LogEntry, BotIdentification]],
    ) -> dict[str, BotStats]:
        """Aggregate per-bot statistics from parsed entries.

        Returns a mapping of bot name to BotStats with request counts,
        status/hourly/daily distributions, the 20 most-requested URLs,
        and the mean response size.
        """
        bot_data: dict[str, dict] = defaultdict(lambda: {
            "urls": Counter(),
            "statuses": Counter(),
            "hours": Counter(),
            "days": Counter(),
            "sizes": [],
            "count": 0,
        })
        for entry, bot in entries:
            bd = bot_data[bot.name]
            bd["count"] += 1
            bd["urls"][entry.url] += 1
            bd["statuses"][str(entry.status_code)] += 1
            bd["sizes"].append(entry.response_size)
            if entry.timestamp:
                bd["hours"][entry.timestamp.hour] += 1
                day_key = entry.timestamp.strftime("%Y-%m-%d")
                bd["days"][day_key] += 1
        stats: dict[str, BotStats] = {}
        for name, bd in bot_data.items():
            avg_size = sum(bd["sizes"]) / len(bd["sizes"]) if bd["sizes"] else 0.0
            stats[name] = BotStats(
                name=name,
                total_requests=bd["count"],
                unique_urls=len(bd["urls"]),
                status_distribution=dict(bd["statuses"]),
                top_urls=bd["urls"].most_common(20),
                hourly_distribution=dict(sorted(bd["hours"].items())),
                daily_distribution=dict(sorted(bd["days"].items())),
                avg_response_size=avg_size,
            )
        return stats

    # -- orchestrator ---------------------------------------------------------

    def parse_and_analyze(
        self,
        filter_bot: str | None = None,
        date_from: datetime | None = None,
        date_to: datetime | None = None,
    ) -> LogParseResult:
        """Parse the log once and aggregate per-bot statistics.

        The line totals are accumulated during the parse pass itself, so
        the file is no longer read a second time just to count lines.
        """
        entries = self.parse(filter_bot, date_from, date_to)
        bot_stats = self.get_bot_stats(entries)
        # Determine overall date range of matched bot entries.
        timestamps = [e.timestamp for e, _ in entries if e.timestamp]
        date_range: dict[str, str] = {}
        if timestamps:
            date_range = {
                "from": min(timestamps).isoformat(),
                "to": max(timestamps).isoformat(),
            }
        total_lines = self._total_lines
        return LogParseResult(
            log_file=self.log_file,
            format_detected=self._detected_format or self.fmt,
            total_lines=total_lines,
            # Blank lines are counted as "parsed" (no error recorded),
            # matching the original accounting.
            parsed_lines=total_lines - self._parse_errors,
            bot_entries=len(entries),
            date_range=date_range,
            bots=bot_stats,
            errors=self._parse_errors,
        )
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def _parse_date(val: str) -> datetime:
"""Parse a date string in YYYY-MM-DD format."""
return datetime.strptime(val, "%Y-%m-%d")
def main() -> None:
    """CLI entry point: parse an access log and report bot crawl statistics.

    Modes:
      * --streaming (without --json): print matching entries as parsed.
      * default: full analysis, human-readable or --json, optionally
        written to --output.
    """
    parser = argparse.ArgumentParser(
        description="Parse server access logs and identify search engine bot traffic.",
    )
    parser.add_argument(
        "--log-file",
        required=True,
        help="Path to access log file (plain, .gz, .bz2)",
    )
    parser.add_argument(
        "--format",
        dest="fmt",
        choices=["auto", "nginx", "apache", "cloudfront"],
        default="auto",
        help="Log format (default: auto-detect)",
    )
    parser.add_argument(
        "--bot",
        default=None,
        help="Filter results to a specific bot (e.g., googlebot, yeti, bingbot, daumoa)",
    )
    parser.add_argument(
        "--streaming",
        action="store_true",
        help="Use streaming parser for large files (prints entries incrementally)",
    )
    parser.add_argument(
        "--date-from",
        default=None,
        help="Filter entries from date (YYYY-MM-DD)",
    )
    parser.add_argument(
        "--date-to",
        default=None,
        help="Filter entries to date (YYYY-MM-DD)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output in JSON format",
    )
    parser.add_argument(
        "--output",
        default=None,
        help="Write output to file instead of stdout",
    )
    args = parser.parse_args()
    # Validate file exists before doing any work.
    if not Path(args.log_file).exists():
        logger.error(f"Log file not found: {args.log_file}")
        sys.exit(1)
    date_from = _parse_date(args.date_from) if args.date_from else None
    date_to = _parse_date(args.date_to) if args.date_to else None

    def _naive(ts: datetime) -> datetime:
        # Nginx/Apache timestamps are timezone-aware while the CLI bounds
        # are naive; comparing them directly raises TypeError, so filter
        # on wall-clock time.
        return ts.replace(tzinfo=None) if ts.tzinfo is not None else ts

    lp = LogParser(log_file=args.log_file, fmt=args.fmt, streaming=args.streaming)
    if args.streaming and not args.json:
        # Streaming mode: print entries as they are parsed.
        count = 0
        for entry, bot in lp.parse_streaming(args.bot):
            if entry.timestamp is not None:
                ts = _naive(entry.timestamp)
                if date_from and ts < date_from:
                    continue
                if date_to and ts > date_to:
                    continue
            ts_str = entry.timestamp.isoformat() if entry.timestamp else "N/A"
            print(
                f"[{bot.name}] {ts_str} {entry.status_code} "
                f"{entry.method} {entry.url} ({entry.response_size}B)"
            )
            count += 1
        print(f"\n--- Total bot requests: {count} ---")
        return
    # Full analysis mode.
    result = lp.parse_and_analyze(
        filter_bot=args.bot,
        date_from=date_from,
        date_to=date_to,
    )
    if args.json:
        output_data = result.to_dict()
        output_str = json.dumps(output_data, indent=2, ensure_ascii=False)
    else:
        lines = [
            f"Log File: {result.log_file}",
            f"Format: {result.format_detected}",
            f"Total Lines: {result.total_lines:,}",
            f"Parsed Lines: {result.parsed_lines:,}",
            f"Bot Entries: {result.bot_entries:,}",
            f"Parse Errors: {result.errors:,}",
        ]
        if result.date_range:
            lines.append(f"Date Range: {result.date_range.get('from', 'N/A')} to {result.date_range.get('to', 'N/A')}")
        lines.append("")
        lines.append("=" * 60)
        lines.append("Bot Statistics")
        lines.append("=" * 60)
        # Busiest bots first.
        for name, stats in sorted(result.bots.items(), key=lambda x: -x[1].total_requests):
            lines.append(f"\n--- {name.upper()} ---")
            lines.append(f"  Requests: {stats.total_requests:,}")
            lines.append(f"  Unique URLs: {stats.unique_urls:,}")
            lines.append(f"  Avg Response Size: {stats.avg_response_size:,.0f} bytes")
            lines.append(f"  Status Distribution: {stats.status_distribution}")
            lines.append(f"  Top 10 URLs:")
            for url, cnt in stats.top_urls[:10]:
                lines.append(f"    {cnt:>6,} | {url}")
            if stats.hourly_distribution:
                peak_hour = max(stats.hourly_distribution, key=stats.hourly_distribution.get)
                lines.append(f"  Peak Hour: {peak_hour}:00 ({stats.hourly_distribution[peak_hour]:,} reqs)")
        output_str = "\n".join(lines)
    if args.output:
        Path(args.output).write_text(output_str, encoding="utf-8")
        logger.info(f"Output written to {args.output}")
    else:
        print(output_str)


if __name__ == "__main__":
    main()