Files
our-claude-skills/custom-skills/32-seo-crawl-budget/code/scripts/log_parser.py
Andrew Yim a3ff965b87 Add SEO skills 19-28, 31-32 with full Python implementations
12 new skills: Keyword Strategy, SERP Analysis, Position Tracking,
Link Building, Content Strategy, E-Commerce SEO, KPI Framework,
International SEO, AI Visibility, Knowledge Graph, Competitor Intel,
and Crawl Budget. ~20K lines of Python across 25 domain scripts.
Updated skill 11 pipeline table and repo CLAUDE.md.
Enhanced skill 18 local SEO workflow from jamie.clinic audit.

Note: Skill 26 hreflang_validator.py pending (content filter block).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 12:05:59 +09:00

614 lines
21 KiB
Python

"""
Log Parser - Server access log parser with bot identification
=============================================================
Purpose: Parse Apache/Nginx/CloudFront access logs, identify search engine
bots, extract crawl data, and generate per-bot statistics.
Python: 3.10+
"""
import argparse
import bz2
import gzip
import json
import logging
import re
import sys
from collections import Counter, defaultdict
from dataclasses import asdict, dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Generator, TextIO
# Module-wide logging: timestamped INFO-level messages (stderr by default).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Constants: bot user-agent patterns
# ---------------------------------------------------------------------------
# Known crawler user-agent patterns, matched case-insensitively with
# re.search. First match wins, so more specific patterns should come first.
BOT_PATTERNS: list[tuple[str, str, str]] = [
    # (canonical name, regex pattern, category)
    # Search-engine crawlers (Google, Naver Yeti, Bing, Daum, Apple,
    # DuckDuckGo, Baidu, Yandex, Sogou, Seznam).
    ("googlebot", r"Googlebot(?:-Image|-News|-Video)?/", "search_engine"),
    ("googlebot-adsbot", r"AdsBot-Google", "search_engine"),
    ("googlebot-mediapartners", r"Mediapartners-Google", "search_engine"),
    ("yeti", r"Yeti/", "search_engine"),
    ("bingbot", r"bingbot/", "search_engine"),
    ("daumoa", r"Daumoa", "search_engine"),
    ("applebot", r"Applebot/", "search_engine"),
    ("duckduckbot", r"DuckDuckBot/", "search_engine"),
    ("baiduspider", r"Baiduspider", "search_engine"),
    ("yandexbot", r"YandexBot/", "search_engine"),
    ("sogou", r"Sogou", "search_engine"),
    ("seznambot", r"SeznamBot/", "search_engine"),
    # SEO-tool crawlers (Ahrefs, Semrush, Majestic, Moz, Screaming Frog).
    ("ahrefsbot", r"AhrefsBot/", "seo_tool"),
    ("semrushbot", r"SemrushBot/", "seo_tool"),
    ("mj12bot", r"MJ12bot/", "seo_tool"),
    ("dotbot", r"DotBot/", "seo_tool"),
    ("rogerbot", r"rogerbot/", "seo_tool"),
    ("screaming frog", r"Screaming Frog SEO Spider", "seo_tool"),
]
# Patterns pre-compiled once at import time so per-line matching in the
# parser hot loop does not pay regex-compilation cost.
COMPILED_BOT_PATTERNS: list[tuple[str, re.Pattern, str]] = [
    (name, re.compile(pattern, re.IGNORECASE), category)
    for name, pattern, category in BOT_PATTERNS
]
# ---------------------------------------------------------------------------
# Regex patterns for each log format
# ---------------------------------------------------------------------------
# Nginx "combined" log format: ip - user [timestamp] "METHOD url proto"
# status size "referer" "user-agent".
NGINX_COMBINED_RE = re.compile(
    r'(?P<ip>[\d.:a-fA-F]+)\s+-\s+(?P<user>\S+)\s+'
    r'\[(?P<timestamp>[^\]]+)\]\s+'
    r'"(?P<method>\S+)\s+(?P<url>\S+)\s+(?P<protocol>[^"]+)"\s+'
    r'(?P<status>\d{3})\s+(?P<size>\d+|-)\s+'
    r'"(?P<referer>[^"]*)"\s+'
    r'"(?P<user_agent>[^"]*)"'
)
# Apache "combined" differs only in the second field (ident, any token
# instead of a literal "-").
APACHE_COMBINED_RE = re.compile(
    r'(?P<ip>[\d.:a-fA-F]+)\s+\S+\s+(?P<user>\S+)\s+'
    r'\[(?P<timestamp>[^\]]+)\]\s+'
    r'"(?P<method>\S+)\s+(?P<url>\S+)\s+(?P<protocol>[^"]+)"\s+'
    r'(?P<status>\d{3})\s+(?P<size>\d+|-)\s+'
    r'"(?P<referer>[^"]*)"\s+'
    r'"(?P<user_agent>[^"]*)"'
)
# Column order of CloudFront standard (access) log files; tab-separated.
# Used for positional indexing in _parse_cloudfront (e.g. 4=c_ip,
# 8=sc_status, 10=cs_user_agent).
CLOUDFRONT_FIELDS = [
    "date", "time", "x_edge_location", "sc_bytes", "c_ip",
    "cs_method", "cs_host", "cs_uri_stem", "sc_status",
    "cs_referer", "cs_user_agent", "cs_uri_query",
    "cs_cookie", "x_edge_result_type", "x_edge_request_id",
    "x_host_header", "cs_protocol", "cs_bytes",
    "time_taken", "x_forwarded_for", "ssl_protocol",
    "ssl_cipher", "x_edge_response_result_type", "cs_protocol_version",
]
# Timestamp formats: both Nginx and Apache use e.g. "10/Oct/2024:13:55:36 +0900".
# Note %z makes the parsed datetimes timezone-aware.
NGINX_TS_FORMAT = "%d/%b/%Y:%H:%M:%S %z"
APACHE_TS_FORMAT = "%d/%b/%Y:%H:%M:%S %z"
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass
class LogEntry:
"""A single parsed log entry."""
timestamp: datetime | None
ip: str
method: str
url: str
status_code: int
response_size: int
user_agent: str
referer: str
def to_dict(self) -> dict:
d = asdict(self)
if self.timestamp:
d["timestamp"] = self.timestamp.isoformat()
return d
@dataclass
class BotIdentification:
    """Bot identification result."""
    name: str                 # canonical bot name (e.g. "googlebot") or "other"
    user_agent_pattern: str   # regex pattern (or keyword) that matched the UA
    category: str             # "search_engine", "seo_tool", or "other"
@dataclass
class BotStats:
    """Aggregated crawl statistics for a single bot."""
    name: str
    total_requests: int = 0
    unique_urls: int = 0
    status_distribution: dict[str, int] = field(default_factory=dict)
    top_urls: list[tuple[str, int]] = field(default_factory=list)
    hourly_distribution: dict[int, int] = field(default_factory=dict)
    daily_distribution: dict[str, int] = field(default_factory=dict)
    avg_response_size: float = 0.0

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict.

        Top URLs become ``{"url", "count"}`` objects and the average
        response size is rounded to one decimal place.
        """
        payload = {
            "name": self.name,
            "total_requests": self.total_requests,
            "unique_urls": self.unique_urls,
            "status_distribution": self.status_distribution,
        }
        payload["top_urls"] = [
            {"url": url, "count": count} for url, count in self.top_urls
        ]
        payload["hourly_distribution"] = self.hourly_distribution
        payload["daily_distribution"] = self.daily_distribution
        payload["avg_response_size"] = round(self.avg_response_size, 1)
        return payload
@dataclass
class LogParseResult:
    """Complete result of one log-parsing run."""
    log_file: str
    format_detected: str
    total_lines: int
    parsed_lines: int
    bot_entries: int
    date_range: dict[str, str]
    bots: dict[str, BotStats]
    errors: int

    def to_dict(self) -> dict:
        """Serialize the whole result, expanding each BotStats via its to_dict()."""
        serialized_bots = {
            bot_name: bot_stats.to_dict()
            for bot_name, bot_stats in self.bots.items()
        }
        return {
            "log_file": self.log_file,
            "format_detected": self.format_detected,
            "total_lines": self.total_lines,
            "parsed_lines": self.parsed_lines,
            "bot_entries": self.bot_entries,
            "date_range": self.date_range,
            "bots": serialized_bots,
            "errors": self.errors,
        }
# ---------------------------------------------------------------------------
# LogParser class
# ---------------------------------------------------------------------------
class LogParser:
    """Parse server access logs and identify search engine bot traffic.

    Supports Nginx/Apache "combined" format and CloudFront tab-separated
    access logs, read from plain text, .gz, or .bz2 files. Entries whose
    user-agent matches a known pattern (or a generic crawler keyword)
    are yielded for per-bot statistics.
    """

    # Generic substrings that mark an otherwise-unknown crawler UA.
    # Hoisted to a class constant so identify_bot() does not rebuild the
    # list on every call.
    _GENERIC_BOT_KEYWORDS = ("bot", "spider", "crawler", "scraper", "fetch")

    def __init__(
        self,
        log_file: str,
        fmt: str = "auto",
        streaming: bool = False,
    ):
        """
        Args:
            log_file: Path to the access log (plain, .gz, or .bz2).
            fmt: "auto" (detect from the first non-empty line), "nginx",
                "apache", or "cloudfront".
            streaming: Kept for API compatibility; parsing is always lazy
                via parse_streaming().
        """
        self.log_file = log_file
        self.fmt = fmt
        self.streaming = streaming
        self._detected_format: str | None = None
        self._parse_errors = 0
        # Physical line count from the most recent parse pass, so
        # parse_and_analyze() does not need to re-read the file.
        self._total_lines = 0

    # -- format detection -----------------------------------------------------

    def _detect_format(self, line: str) -> str:
        """Auto-detect log format from a sample (first non-empty) line."""
        if line.startswith("#"):
            # CloudFront log files start with "#Version" / "#Fields" headers.
            return "cloudfront"
        # Try nginx before apache: the patterns differ only in the second
        # field and nginx is the stricter match.
        if NGINX_COMBINED_RE.match(line):
            return "nginx"
        if APACHE_COMBINED_RE.match(line):
            return "apache"
        # Fallback: heavily tab-separated lines look like CloudFront data
        # rows without the leading header comments.
        if "\t" in line and line.count("\t") >= 10:
            return "cloudfront"
        return "nginx"

    # -- line parsers ---------------------------------------------------------

    @staticmethod
    def _entry_from_combined(m: re.Match, ts_format: str) -> LogEntry:
        """Build a LogEntry from a combined-format regex match.

        Shared by the nginx and apache parsers, whose bodies were
        previously duplicated line-for-line.
        """
        ts = None
        try:
            ts = datetime.strptime(m.group("timestamp"), ts_format)
        except (ValueError, TypeError):
            # Keep the entry even when the timestamp is malformed.
            pass
        size_raw = m.group("size")
        # "-" means no body was sent (e.g. 304); treat as zero bytes.
        size = int(size_raw) if size_raw != "-" else 0
        return LogEntry(
            timestamp=ts,
            ip=m.group("ip"),
            method=m.group("method"),
            url=m.group("url"),
            status_code=int(m.group("status")),
            response_size=size,
            user_agent=m.group("user_agent"),
            referer=m.group("referer"),
        )

    def _parse_nginx_combined(self, line: str) -> LogEntry | None:
        """Parse a single Nginx combined format log line (None on mismatch)."""
        m = NGINX_COMBINED_RE.match(line)
        return self._entry_from_combined(m, NGINX_TS_FORMAT) if m else None

    def _parse_apache_combined(self, line: str) -> LogEntry | None:
        """Parse a single Apache combined format log line (None on mismatch)."""
        m = APACHE_COMBINED_RE.match(line)
        return self._entry_from_combined(m, APACHE_TS_FORMAT) if m else None

    def _parse_cloudfront(self, line: str) -> LogEntry | None:
        """Parse a CloudFront tab-separated log line.

        Header lines ("#...") and rows with too few columns yield None.
        Field positions follow CLOUDFRONT_FIELDS.
        """
        if line.startswith("#"):
            return None
        parts = line.strip().split("\t")
        if len(parts) < 13:
            return None
        ts = None
        try:
            # Date and time are split across the first two columns.
            # NOTE(review): CloudFront logs these in UTC, but the parsed
            # datetime is naive — confirm before mixing with other formats.
            ts = datetime.strptime(f"{parts[0]} {parts[1]}", "%Y-%m-%d %H:%M:%S")
        except (ValueError, IndexError):
            pass
        try:
            status = int(parts[8])
        except (ValueError, IndexError):
            status = 0
        try:
            size = int(parts[3])
        except (ValueError, IndexError):
            size = 0
        url = parts[7] if len(parts) > 7 else ""
        query = parts[11] if len(parts) > 11 else ""
        if query and query != "-":
            url = f"{url}?{query}"
        ua = parts[10] if len(parts) > 10 else ""
        # CloudFront percent-encodes spaces in the user-agent field
        # (sometimes double-encoded as %2520).
        ua = ua.replace("%20", " ").replace("%2520", " ")
        referer = parts[9] if len(parts) > 9 else ""
        return LogEntry(
            timestamp=ts,
            ip=parts[4] if len(parts) > 4 else "",
            method=parts[5] if len(parts) > 5 else "",
            url=url,
            status_code=status,
            response_size=size,
            user_agent=ua,
            referer=referer,
        )

    def _parse_line(self, line: str, fmt: str) -> LogEntry | None:
        """Dispatch to the parser for *fmt*; unknown formats fall back to nginx."""
        parsers = {
            "nginx": self._parse_nginx_combined,
            "apache": self._parse_apache_combined,
            "cloudfront": self._parse_cloudfront,
        }
        return parsers.get(fmt, self._parse_nginx_combined)(line)

    # -- bot identification ---------------------------------------------------

    @staticmethod
    def identify_bot(user_agent: str) -> BotIdentification | None:
        """Match user-agent against known bot patterns.

        Returns a BotIdentification for known bots, a generic "other"
        identification when a common crawler keyword appears in the UA,
        or None for (presumed) human traffic.
        """
        if not user_agent or user_agent == "-":
            return None
        for name, pattern, category in COMPILED_BOT_PATTERNS:
            if pattern.search(user_agent):
                return BotIdentification(
                    name=name,
                    user_agent_pattern=pattern.pattern,
                    category=category,
                )
        # Heuristic: generic bot detection via common keywords.
        ua_lower = user_agent.lower()
        for kw in LogParser._GENERIC_BOT_KEYWORDS:
            if kw in ua_lower:
                return BotIdentification(
                    name="other",
                    user_agent_pattern=kw,
                    category="other",
                )
        return None

    # -- file handling --------------------------------------------------------

    @staticmethod
    def _open_file(path: str) -> TextIO:
        """Open plain text, .gz, or .bz2 log files in text mode.

        Undecodable bytes are replaced rather than raising, since access
        logs frequently contain arbitrary client-supplied bytes.
        """
        p = Path(path)
        if p.suffix == ".gz":
            return gzip.open(path, "rt", encoding="utf-8", errors="replace")
        if p.suffix == ".bz2":
            return bz2.open(path, "rt", encoding="utf-8", errors="replace")
        return open(path, "r", encoding="utf-8", errors="replace")

    # -- streaming parser -----------------------------------------------------

    def parse_streaming(
        self,
        filter_bot: str | None = None,
    ) -> Generator[tuple[LogEntry, BotIdentification], None, None]:
        """Lazily yield (entry, bot) pairs for bot traffic in the log.

        Resets the per-run line/error counters, auto-detects the format
        from the first non-empty line when fmt == "auto", and skips lines
        that fail to parse (counted in self._parse_errors).

        Args:
            filter_bot: When given, only entries for this canonical bot
                name (case-insensitive) are yielded.
        """
        fmt = self.fmt
        format_known = fmt != "auto"
        # Reset per-run counters so repeated calls do not accumulate
        # stale error/line totals (previously a cross-call leak).
        self._parse_errors = 0
        self._total_lines = 0
        # Hoist case-normalization out of the per-line loop.
        wanted = filter_bot.lower() if filter_bot else None
        fh = self._open_file(self.log_file)
        try:
            for raw_line in fh:
                self._total_lines += 1
                line = raw_line.strip()
                if not line:
                    continue
                if not format_known:
                    fmt = self._detect_format(line)
                    self._detected_format = fmt
                    format_known = True
                entry = self._parse_line(line, fmt)
                if entry is None:
                    self._parse_errors += 1
                    continue
                bot = self.identify_bot(entry.user_agent)
                if bot is None:
                    continue
                if wanted and bot.name != wanted:
                    continue
                yield entry, bot
        finally:
            fh.close()

    # -- full parse -----------------------------------------------------------

    @staticmethod
    def _naive(ts: datetime) -> datetime:
        """Strip tzinfo so aware log timestamps can be compared with naive bounds.

        Nginx/Apache timestamps are timezone-aware (%z) while CLI date
        bounds are naive; comparing them directly raises TypeError, so
        both sides are normalized to wall-clock time.
        """
        return ts.replace(tzinfo=None) if ts.tzinfo is not None else ts

    def parse(
        self,
        filter_bot: str | None = None,
        date_from: datetime | None = None,
        date_to: datetime | None = None,
    ) -> list[tuple[LogEntry, BotIdentification]]:
        """Fully materialize bot entries, with optional bot and date filters.

        Entries without a parseable timestamp always pass the date filter.
        """
        results: list[tuple[LogEntry, BotIdentification]] = []
        for entry, bot in self.parse_streaming(filter_bot):
            if entry.timestamp is not None:
                ts = self._naive(entry.timestamp)
                if date_from and ts < self._naive(date_from):
                    continue
                if date_to and ts > self._naive(date_to):
                    continue
            results.append((entry, bot))
        return results

    # -- statistics -----------------------------------------------------------

    @staticmethod
    def get_bot_stats(
        entries: list[tuple[LogEntry, BotIdentification]],
    ) -> dict[str, BotStats]:
        """Aggregate per-bot statistics from parsed entries.

        Returns a mapping of bot name to BotStats with request counts,
        status/hourly/daily distributions, the 20 most-requested URLs,
        and the mean response size.
        """
        bot_data: dict[str, dict] = defaultdict(lambda: {
            "urls": Counter(),
            "statuses": Counter(),
            "hours": Counter(),
            "days": Counter(),
            "sizes": [],
            "count": 0,
        })
        for entry, bot in entries:
            bd = bot_data[bot.name]
            bd["count"] += 1
            bd["urls"][entry.url] += 1
            bd["statuses"][str(entry.status_code)] += 1
            bd["sizes"].append(entry.response_size)
            if entry.timestamp:
                bd["hours"][entry.timestamp.hour] += 1
                day_key = entry.timestamp.strftime("%Y-%m-%d")
                bd["days"][day_key] += 1
        stats: dict[str, BotStats] = {}
        for name, bd in bot_data.items():
            avg_size = sum(bd["sizes"]) / len(bd["sizes"]) if bd["sizes"] else 0.0
            stats[name] = BotStats(
                name=name,
                total_requests=bd["count"],
                unique_urls=len(bd["urls"]),
                status_distribution=dict(bd["statuses"]),
                top_urls=bd["urls"].most_common(20),
                hourly_distribution=dict(sorted(bd["hours"].items())),
                daily_distribution=dict(sorted(bd["days"].items())),
                avg_response_size=avg_size,
            )
        return stats

    # -- orchestrator ---------------------------------------------------------

    def parse_and_analyze(
        self,
        filter_bot: str | None = None,
        date_from: datetime | None = None,
        date_to: datetime | None = None,
    ) -> LogParseResult:
        """Parse the log once and aggregate per-bot statistics.

        The line totals are accumulated during the parse pass itself, so
        the file is no longer read a second time just to count lines.
        """
        entries = self.parse(filter_bot, date_from, date_to)
        bot_stats = self.get_bot_stats(entries)
        # Determine overall date range of matched bot entries.
        timestamps = [e.timestamp for e, _ in entries if e.timestamp]
        date_range: dict[str, str] = {}
        if timestamps:
            date_range = {
                "from": min(timestamps).isoformat(),
                "to": max(timestamps).isoformat(),
            }
        total_lines = self._total_lines
        return LogParseResult(
            log_file=self.log_file,
            format_detected=self._detected_format or self.fmt,
            total_lines=total_lines,
            # Blank lines are counted as "parsed" (no error recorded),
            # matching the original accounting.
            parsed_lines=total_lines - self._parse_errors,
            bot_entries=len(entries),
            date_range=date_range,
            bots=bot_stats,
            errors=self._parse_errors,
        )
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def _parse_date(val: str) -> datetime:
"""Parse a date string in YYYY-MM-DD format."""
return datetime.strptime(val, "%Y-%m-%d")
def main() -> None:
    """CLI entry point: parse an access log and report bot crawl statistics.

    Modes:
      * --streaming (without --json): print matching entries as parsed.
      * default: full analysis, human-readable or --json, optionally
        written to --output.
    """
    parser = argparse.ArgumentParser(
        description="Parse server access logs and identify search engine bot traffic.",
    )
    parser.add_argument(
        "--log-file",
        required=True,
        help="Path to access log file (plain, .gz, .bz2)",
    )
    parser.add_argument(
        "--format",
        dest="fmt",
        choices=["auto", "nginx", "apache", "cloudfront"],
        default="auto",
        help="Log format (default: auto-detect)",
    )
    parser.add_argument(
        "--bot",
        default=None,
        help="Filter results to a specific bot (e.g., googlebot, yeti, bingbot, daumoa)",
    )
    parser.add_argument(
        "--streaming",
        action="store_true",
        help="Use streaming parser for large files (prints entries incrementally)",
    )
    parser.add_argument(
        "--date-from",
        default=None,
        help="Filter entries from date (YYYY-MM-DD)",
    )
    parser.add_argument(
        "--date-to",
        default=None,
        help="Filter entries to date (YYYY-MM-DD)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output in JSON format",
    )
    parser.add_argument(
        "--output",
        default=None,
        help="Write output to file instead of stdout",
    )
    args = parser.parse_args()
    # Validate file exists before doing any work.
    if not Path(args.log_file).exists():
        logger.error(f"Log file not found: {args.log_file}")
        sys.exit(1)
    date_from = _parse_date(args.date_from) if args.date_from else None
    date_to = _parse_date(args.date_to) if args.date_to else None

    def _naive(ts: datetime) -> datetime:
        # Nginx/Apache timestamps are timezone-aware while the CLI bounds
        # are naive; comparing them directly raises TypeError, so filter
        # on wall-clock time.
        return ts.replace(tzinfo=None) if ts.tzinfo is not None else ts

    lp = LogParser(log_file=args.log_file, fmt=args.fmt, streaming=args.streaming)
    if args.streaming and not args.json:
        # Streaming mode: print entries as they are parsed.
        count = 0
        for entry, bot in lp.parse_streaming(args.bot):
            if entry.timestamp is not None:
                ts = _naive(entry.timestamp)
                if date_from and ts < date_from:
                    continue
                if date_to and ts > date_to:
                    continue
            ts_str = entry.timestamp.isoformat() if entry.timestamp else "N/A"
            print(
                f"[{bot.name}] {ts_str} {entry.status_code} "
                f"{entry.method} {entry.url} ({entry.response_size}B)"
            )
            count += 1
        print(f"\n--- Total bot requests: {count} ---")
        return
    # Full analysis mode.
    result = lp.parse_and_analyze(
        filter_bot=args.bot,
        date_from=date_from,
        date_to=date_to,
    )
    if args.json:
        output_data = result.to_dict()
        output_str = json.dumps(output_data, indent=2, ensure_ascii=False)
    else:
        lines = [
            f"Log File: {result.log_file}",
            f"Format: {result.format_detected}",
            f"Total Lines: {result.total_lines:,}",
            f"Parsed Lines: {result.parsed_lines:,}",
            f"Bot Entries: {result.bot_entries:,}",
            f"Parse Errors: {result.errors:,}",
        ]
        if result.date_range:
            lines.append(f"Date Range: {result.date_range.get('from', 'N/A')} to {result.date_range.get('to', 'N/A')}")
        lines.append("")
        lines.append("=" * 60)
        lines.append("Bot Statistics")
        lines.append("=" * 60)
        # Busiest bots first.
        for name, stats in sorted(result.bots.items(), key=lambda x: -x[1].total_requests):
            lines.append(f"\n--- {name.upper()} ---")
            lines.append(f"  Requests: {stats.total_requests:,}")
            lines.append(f"  Unique URLs: {stats.unique_urls:,}")
            lines.append(f"  Avg Response Size: {stats.avg_response_size:,.0f} bytes")
            lines.append(f"  Status Distribution: {stats.status_distribution}")
            lines.append(f"  Top 10 URLs:")
            for url, cnt in stats.top_urls[:10]:
                lines.append(f"    {cnt:>6,} | {url}")
            if stats.hourly_distribution:
                peak_hour = max(stats.hourly_distribution, key=stats.hourly_distribution.get)
                lines.append(f"  Peak Hour: {peak_hour}:00 ({stats.hourly_distribution[peak_hour]:,} reqs)")
        output_str = "\n".join(lines)
    if args.output:
        Path(args.output).write_text(output_str, encoding="utf-8")
        logger.info(f"Output written to {args.output}")
    else:
        print(output_str)


if __name__ == "__main__":
    main()