""" Log Parser - Server access log parser with bot identification ============================================================= Purpose: Parse Apache/Nginx/CloudFront access logs, identify search engine bots, extract crawl data, and generate per-bot statistics. Python: 3.10+ """ import argparse import bz2 import gzip import json import logging import re import sys from collections import Counter, defaultdict from dataclasses import asdict, dataclass, field from datetime import datetime from pathlib import Path from typing import Generator, TextIO logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", ) logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Constants: bot user-agent patterns # --------------------------------------------------------------------------- BOT_PATTERNS: list[tuple[str, str, str]] = [ # (canonical name, regex pattern, category) ("googlebot", r"Googlebot(?:-Image|-News|-Video)?/", "search_engine"), ("googlebot-adsbot", r"AdsBot-Google", "search_engine"), ("googlebot-mediapartners", r"Mediapartners-Google", "search_engine"), ("yeti", r"Yeti/", "search_engine"), ("bingbot", r"bingbot/", "search_engine"), ("daumoa", r"Daumoa", "search_engine"), ("applebot", r"Applebot/", "search_engine"), ("duckduckbot", r"DuckDuckBot/", "search_engine"), ("baiduspider", r"Baiduspider", "search_engine"), ("yandexbot", r"YandexBot/", "search_engine"), ("sogou", r"Sogou", "search_engine"), ("seznambot", r"SeznamBot/", "search_engine"), ("ahrefsbot", r"AhrefsBot/", "seo_tool"), ("semrushbot", r"SemrushBot/", "seo_tool"), ("mj12bot", r"MJ12bot/", "seo_tool"), ("dotbot", r"DotBot/", "seo_tool"), ("rogerbot", r"rogerbot/", "seo_tool"), ("screaming frog", r"Screaming Frog SEO Spider", "seo_tool"), ] COMPILED_BOT_PATTERNS: list[tuple[str, re.Pattern, str]] = [ (name, re.compile(pattern, re.IGNORECASE), category) for name, pattern, category in BOT_PATTERNS ] # --------------------------------------------------------------------------- # Regex patterns for each log format # --------------------------------------------------------------------------- NGINX_COMBINED_RE = re.compile( r'(?P[\d.:a-fA-F]+)\s+-\s+(?P\S+)\s+' r'\[(?P[^\]]+)\]\s+' r'"(?P\S+)\s+(?P\S+)\s+(?P[^"]+)"\s+' r'(?P\d{3})\s+(?P\d+|-)\s+' r'"(?P[^"]*)"\s+' r'"(?P[^"]*)"' ) APACHE_COMBINED_RE = re.compile( r'(?P[\d.:a-fA-F]+)\s+\S+\s+(?P\S+)\s+' r'\[(?P[^\]]+)\]\s+' r'"(?P\S+)\s+(?P\S+)\s+(?P[^"]+)"\s+' r'(?P\d{3})\s+(?P\d+|-)\s+' r'"(?P[^"]*)"\s+' r'"(?P[^"]*)"' ) CLOUDFRONT_FIELDS = [ "date", "time", "x_edge_location", "sc_bytes", "c_ip", "cs_method", "cs_host", "cs_uri_stem", "sc_status", "cs_referer", "cs_user_agent", "cs_uri_query", "cs_cookie", "x_edge_result_type", "x_edge_request_id", "x_host_header", "cs_protocol", "cs_bytes", "time_taken", "x_forwarded_for", "ssl_protocol", "ssl_cipher", "x_edge_response_result_type", "cs_protocol_version", ] # Timestamp formats NGINX_TS_FORMAT = "%d/%b/%Y:%H:%M:%S %z" APACHE_TS_FORMAT = "%d/%b/%Y:%H:%M:%S %z" # --------------------------------------------------------------------------- # Data classes # --------------------------------------------------------------------------- @dataclass class LogEntry: """A single parsed log entry.""" timestamp: datetime | None ip: str method: str url: str status_code: int response_size: int user_agent: str referer: str def to_dict(self) -> dict: d = asdict(self) if self.timestamp: d["timestamp"] = self.timestamp.isoformat() return d @dataclass class BotIdentification: """Bot identification result.""" name: str user_agent_pattern: str category: str @dataclass class BotStats: """Aggregated statistics for a single bot.""" name: str total_requests: int = 0 unique_urls: int = 0 status_distribution: dict[str, int] = field(default_factory=dict) top_urls: list[tuple[str, int]] = field(default_factory=list) hourly_distribution: dict[int, int] = field(default_factory=dict) daily_distribution: dict[str, int] = field(default_factory=dict) avg_response_size: float = 0.0 def to_dict(self) -> dict: return { "name": self.name, "total_requests": self.total_requests, "unique_urls": self.unique_urls, "status_distribution": self.status_distribution, "top_urls": [{"url": u, "count": c} for u, c in self.top_urls], "hourly_distribution": self.hourly_distribution, "daily_distribution": self.daily_distribution, "avg_response_size": round(self.avg_response_size, 1), } @dataclass class LogParseResult: """Complete log parsing result.""" log_file: str format_detected: str total_lines: int parsed_lines: int bot_entries: int date_range: dict[str, str] bots: dict[str, BotStats] errors: int def to_dict(self) -> dict: return { "log_file": self.log_file, "format_detected": self.format_detected, "total_lines": self.total_lines, "parsed_lines": self.parsed_lines, "bot_entries": self.bot_entries, "date_range": self.date_range, "bots": {name: stats.to_dict() for name, stats in self.bots.items()}, "errors": self.errors, } # --------------------------------------------------------------------------- # LogParser class # --------------------------------------------------------------------------- class LogParser: """Parse server access logs and identify search engine bot traffic.""" def __init__( self, log_file: str, fmt: str = "auto", streaming: bool = False, ): self.log_file = log_file self.fmt = fmt self.streaming = streaming self._detected_format: str | None = None self._parse_errors = 0 # -- format detection ----------------------------------------------------- def _detect_format(self, line: str) -> str: """Auto-detect log format from a sample line.""" if line.startswith("#"): return "cloudfront" if NGINX_COMBINED_RE.match(line): return "nginx" if APACHE_COMBINED_RE.match(line): return "apache" # Fallback: try tab-separated (CloudFront without header) if "\t" in line and line.count("\t") >= 10: return "cloudfront" return "nginx" # -- line parsers --------------------------------------------------------- def _parse_nginx_combined(self, line: str) -> LogEntry | None: """Parse a single Nginx combined format log line.""" m = NGINX_COMBINED_RE.match(line) if not m: return None ts = None try: ts = datetime.strptime(m.group("timestamp"), NGINX_TS_FORMAT) except (ValueError, TypeError): pass size_raw = m.group("size") size = int(size_raw) if size_raw != "-" else 0 return LogEntry( timestamp=ts, ip=m.group("ip"), method=m.group("method"), url=m.group("url"), status_code=int(m.group("status")), response_size=size, user_agent=m.group("user_agent"), referer=m.group("referer"), ) def _parse_apache_combined(self, line: str) -> LogEntry | None: """Parse a single Apache combined format log line.""" m = APACHE_COMBINED_RE.match(line) if not m: return None ts = None try: ts = datetime.strptime(m.group("timestamp"), APACHE_TS_FORMAT) except (ValueError, TypeError): pass size_raw = m.group("size") size = int(size_raw) if size_raw != "-" else 0 return LogEntry( timestamp=ts, ip=m.group("ip"), method=m.group("method"), url=m.group("url"), status_code=int(m.group("status")), response_size=size, user_agent=m.group("user_agent"), referer=m.group("referer"), ) def _parse_cloudfront(self, line: str) -> LogEntry | None: """Parse a CloudFront tab-separated log line.""" if line.startswith("#"): return None parts = line.strip().split("\t") if len(parts) < 13: return None ts = None try: ts = datetime.strptime(f"{parts[0]} {parts[1]}", "%Y-%m-%d %H:%M:%S") except (ValueError, IndexError): pass try: status = int(parts[8]) except (ValueError, IndexError): status = 0 try: size = int(parts[3]) except (ValueError, IndexError): size = 0 url = parts[7] if len(parts) > 7 else "" query = parts[11] if len(parts) > 11 else "" if query and query != "-": url = f"{url}?{query}" ua = parts[10] if len(parts) > 10 else "" ua = ua.replace("%20", " ").replace("%2520", " ") referer = parts[9] if len(parts) > 9 else "" return LogEntry( timestamp=ts, ip=parts[4] if len(parts) > 4 else "", method=parts[5] if len(parts) > 5 else "", url=url, status_code=status, response_size=size, user_agent=ua, referer=referer, ) def _parse_line(self, line: str, fmt: str) -> LogEntry | None: """Route to the correct parser based on format.""" parsers = { "nginx": self._parse_nginx_combined, "apache": self._parse_apache_combined, "cloudfront": self._parse_cloudfront, } parser = parsers.get(fmt, self._parse_nginx_combined) return parser(line) # -- bot identification --------------------------------------------------- @staticmethod def identify_bot(user_agent: str) -> BotIdentification | None: """Match user-agent against known bot patterns.""" if not user_agent or user_agent == "-": return None for name, pattern, category in COMPILED_BOT_PATTERNS: if pattern.search(user_agent): return BotIdentification( name=name, user_agent_pattern=pattern.pattern, category=category, ) # Heuristic: generic bot detection via common keywords ua_lower = user_agent.lower() bot_keywords = ["bot", "spider", "crawler", "scraper", "fetch"] for kw in bot_keywords: if kw in ua_lower: return BotIdentification( name="other", user_agent_pattern=kw, category="other", ) return None # -- file handling -------------------------------------------------------- @staticmethod def _open_file(path: str) -> TextIO: """Open plain text, .gz, or .bz2 log files.""" p = Path(path) if p.suffix == ".gz": return gzip.open(path, "rt", encoding="utf-8", errors="replace") if p.suffix == ".bz2": return bz2.open(path, "rt", encoding="utf-8", errors="replace") return open(path, "r", encoding="utf-8", errors="replace") # -- streaming parser ----------------------------------------------------- def parse_streaming( self, filter_bot: str | None = None, ) -> Generator[tuple[LogEntry, BotIdentification], None, None]: """Generator-based streaming parser for large files.""" fmt = self.fmt first_line_checked = False fh = self._open_file(self.log_file) try: for line in fh: line = line.strip() if not line: continue if not first_line_checked and fmt == "auto": fmt = self._detect_format(line) self._detected_format = fmt first_line_checked = True entry = self._parse_line(line, fmt) if entry is None: self._parse_errors += 1 continue bot = self.identify_bot(entry.user_agent) if bot is None: continue if filter_bot and bot.name != filter_bot.lower(): continue yield entry, bot finally: fh.close() # -- full parse ----------------------------------------------------------- def parse( self, filter_bot: str | None = None, date_from: datetime | None = None, date_to: datetime | None = None, ) -> list[tuple[LogEntry, BotIdentification]]: """Full parse with optional date and bot filters.""" results: list[tuple[LogEntry, BotIdentification]] = [] for entry, bot in self.parse_streaming(filter_bot): if date_from and entry.timestamp and entry.timestamp < date_from: continue if date_to and entry.timestamp and entry.timestamp > date_to: continue results.append((entry, bot)) return results # -- statistics ----------------------------------------------------------- @staticmethod def get_bot_stats( entries: list[tuple[LogEntry, BotIdentification]], ) -> dict[str, BotStats]: """Aggregate per-bot statistics from parsed entries.""" bot_data: dict[str, dict] = defaultdict(lambda: { "urls": Counter(), "statuses": Counter(), "hours": Counter(), "days": Counter(), "sizes": [], "count": 0, }) for entry, bot in entries: bd = bot_data[bot.name] bd["count"] += 1 bd["urls"][entry.url] += 1 bd["statuses"][str(entry.status_code)] += 1 bd["sizes"].append(entry.response_size) if entry.timestamp: bd["hours"][entry.timestamp.hour] += 1 day_key = entry.timestamp.strftime("%Y-%m-%d") bd["days"][day_key] += 1 stats: dict[str, BotStats] = {} for name, bd in bot_data.items(): avg_size = sum(bd["sizes"]) / len(bd["sizes"]) if bd["sizes"] else 0.0 top_20 = bd["urls"].most_common(20) stats[name] = BotStats( name=name, total_requests=bd["count"], unique_urls=len(bd["urls"]), status_distribution=dict(bd["statuses"]), top_urls=top_20, hourly_distribution=dict(sorted(bd["hours"].items())), daily_distribution=dict(sorted(bd["days"].items())), avg_response_size=avg_size, ) return stats # -- orchestrator --------------------------------------------------------- def parse_and_analyze( self, filter_bot: str | None = None, date_from: datetime | None = None, date_to: datetime | None = None, ) -> LogParseResult: """Orchestrate parsing and statistics generation.""" entries = self.parse(filter_bot, date_from, date_to) bot_stats = self.get_bot_stats(entries) # Determine date range timestamps = [e.timestamp for e, _ in entries if e.timestamp] date_range = {} if timestamps: date_range = { "from": min(timestamps).isoformat(), "to": max(timestamps).isoformat(), } # Count total lines for context total_lines = 0 fh = self._open_file(self.log_file) try: for _ in fh: total_lines += 1 finally: fh.close() return LogParseResult( log_file=self.log_file, format_detected=self._detected_format or self.fmt, total_lines=total_lines, parsed_lines=total_lines - self._parse_errors, bot_entries=len(entries), date_range=date_range, bots=bot_stats, errors=self._parse_errors, ) # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- def _parse_date(val: str) -> datetime: """Parse a date string in YYYY-MM-DD format.""" return datetime.strptime(val, "%Y-%m-%d") def main() -> None: parser = argparse.ArgumentParser( description="Parse server access logs and identify search engine bot traffic.", ) parser.add_argument( "--log-file", required=True, help="Path to access log file (plain, .gz, .bz2)", ) parser.add_argument( "--format", dest="fmt", choices=["auto", "nginx", "apache", "cloudfront"], default="auto", help="Log format (default: auto-detect)", ) parser.add_argument( "--bot", default=None, help="Filter results to a specific bot (e.g., googlebot, yeti, bingbot, daumoa)", ) parser.add_argument( "--streaming", action="store_true", help="Use streaming parser for large files (prints entries incrementally)", ) parser.add_argument( "--date-from", default=None, help="Filter entries from date (YYYY-MM-DD)", ) parser.add_argument( "--date-to", default=None, help="Filter entries to date (YYYY-MM-DD)", ) parser.add_argument( "--json", action="store_true", help="Output in JSON format", ) parser.add_argument( "--output", default=None, help="Write output to file instead of stdout", ) args = parser.parse_args() # Validate file exists if not Path(args.log_file).exists(): logger.error(f"Log file not found: {args.log_file}") sys.exit(1) date_from = _parse_date(args.date_from) if args.date_from else None date_to = _parse_date(args.date_to) if args.date_to else None lp = LogParser(log_file=args.log_file, fmt=args.fmt, streaming=args.streaming) if args.streaming and not args.json: # Streaming mode: print entries as they are parsed count = 0 for entry, bot in lp.parse_streaming(args.bot): if date_from and entry.timestamp and entry.timestamp < date_from: continue if date_to and entry.timestamp and entry.timestamp > date_to: continue ts_str = entry.timestamp.isoformat() if entry.timestamp else "N/A" print( f"[{bot.name}] {ts_str} {entry.status_code} " f"{entry.method} {entry.url} ({entry.response_size}B)" ) count += 1 print(f"\n--- Total bot requests: {count} ---") return # Full analysis mode result = lp.parse_and_analyze( filter_bot=args.bot, date_from=date_from, date_to=date_to, ) if args.json: output_data = result.to_dict() output_str = json.dumps(output_data, indent=2, ensure_ascii=False) else: lines = [ f"Log File: {result.log_file}", f"Format: {result.format_detected}", f"Total Lines: {result.total_lines:,}", f"Parsed Lines: {result.parsed_lines:,}", f"Bot Entries: {result.bot_entries:,}", f"Parse Errors: {result.errors:,}", ] if result.date_range: lines.append(f"Date Range: {result.date_range.get('from', 'N/A')} to {result.date_range.get('to', 'N/A')}") lines.append("") lines.append("=" * 60) lines.append("Bot Statistics") lines.append("=" * 60) for name, stats in sorted(result.bots.items(), key=lambda x: -x[1].total_requests): lines.append(f"\n--- {name.upper()} ---") lines.append(f" Requests: {stats.total_requests:,}") lines.append(f" Unique URLs: {stats.unique_urls:,}") lines.append(f" Avg Response Size: {stats.avg_response_size:,.0f} bytes") lines.append(f" Status Distribution: {stats.status_distribution}") lines.append(f" Top 10 URLs:") for url, cnt in stats.top_urls[:10]: lines.append(f" {cnt:>6,} | {url}") if stats.hourly_distribution: peak_hour = max(stats.hourly_distribution, key=stats.hourly_distribution.get) lines.append(f" Peak Hour: {peak_hour}:00 ({stats.hourly_distribution[peak_hour]:,} reqs)") output_str = "\n".join(lines) if args.output: Path(args.output).write_text(output_str, encoding="utf-8") logger.info(f"Output written to {args.output}") else: print(output_str) if __name__ == "__main__": main()