# Source: custom-skills/32-seo-crawl-budget/code/scripts/log_parser.py (new file, 613 lines)
#
# Commit: Add SEO skills 19-28, 31-32 with full Python implementations.
# 12 new skills: Keyword Strategy, SERP Analysis, Position Tracking,
# Link Building, Content Strategy, E-Commerce SEO, KPI Framework,
# International SEO, AI Visibility, Knowledge Graph, Competitor Intel,
# and Crawl Budget. ~20K lines of Python across 25 domain scripts.
# Updated skill 11 pipeline table and repo CLAUDE.md. Enhanced skill 18
# local SEO workflow from the jamie.clinic audit. Note: skill 26
# hreflang_validator.py pending (content-filter block).
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
"""
|
||||
Log Parser - Server access log parser with bot identification
|
||||
=============================================================
|
||||
Purpose: Parse Apache/Nginx/CloudFront access logs, identify search engine
|
||||
bots, extract crawl data, and generate per-bot statistics.
|
||||
Python: 3.10+
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import bz2
|
||||
import gzip
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
from collections import Counter, defaultdict
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Generator, TextIO
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Constants: bot user-agent patterns
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
BOT_PATTERNS: list[tuple[str, str, str]] = [
|
||||
# (canonical name, regex pattern, category)
|
||||
("googlebot", r"Googlebot(?:-Image|-News|-Video)?/", "search_engine"),
|
||||
("googlebot-adsbot", r"AdsBot-Google", "search_engine"),
|
||||
("googlebot-mediapartners", r"Mediapartners-Google", "search_engine"),
|
||||
("yeti", r"Yeti/", "search_engine"),
|
||||
("bingbot", r"bingbot/", "search_engine"),
|
||||
("daumoa", r"Daumoa", "search_engine"),
|
||||
("applebot", r"Applebot/", "search_engine"),
|
||||
("duckduckbot", r"DuckDuckBot/", "search_engine"),
|
||||
("baiduspider", r"Baiduspider", "search_engine"),
|
||||
("yandexbot", r"YandexBot/", "search_engine"),
|
||||
("sogou", r"Sogou", "search_engine"),
|
||||
("seznambot", r"SeznamBot/", "search_engine"),
|
||||
("ahrefsbot", r"AhrefsBot/", "seo_tool"),
|
||||
("semrushbot", r"SemrushBot/", "seo_tool"),
|
||||
("mj12bot", r"MJ12bot/", "seo_tool"),
|
||||
("dotbot", r"DotBot/", "seo_tool"),
|
||||
("rogerbot", r"rogerbot/", "seo_tool"),
|
||||
("screaming frog", r"Screaming Frog SEO Spider", "seo_tool"),
|
||||
]
|
||||
|
||||
COMPILED_BOT_PATTERNS: list[tuple[str, re.Pattern, str]] = [
|
||||
(name, re.compile(pattern, re.IGNORECASE), category)
|
||||
for name, pattern, category in BOT_PATTERNS
|
||||
]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Regex patterns for each log format
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
NGINX_COMBINED_RE = re.compile(
|
||||
r'(?P<ip>[\d.:a-fA-F]+)\s+-\s+(?P<user>\S+)\s+'
|
||||
r'\[(?P<timestamp>[^\]]+)\]\s+'
|
||||
r'"(?P<method>\S+)\s+(?P<url>\S+)\s+(?P<protocol>[^"]+)"\s+'
|
||||
r'(?P<status>\d{3})\s+(?P<size>\d+|-)\s+'
|
||||
r'"(?P<referer>[^"]*)"\s+'
|
||||
r'"(?P<user_agent>[^"]*)"'
|
||||
)
|
||||
|
||||
APACHE_COMBINED_RE = re.compile(
|
||||
r'(?P<ip>[\d.:a-fA-F]+)\s+\S+\s+(?P<user>\S+)\s+'
|
||||
r'\[(?P<timestamp>[^\]]+)\]\s+'
|
||||
r'"(?P<method>\S+)\s+(?P<url>\S+)\s+(?P<protocol>[^"]+)"\s+'
|
||||
r'(?P<status>\d{3})\s+(?P<size>\d+|-)\s+'
|
||||
r'"(?P<referer>[^"]*)"\s+'
|
||||
r'"(?P<user_agent>[^"]*)"'
|
||||
)
|
||||
|
||||
CLOUDFRONT_FIELDS = [
|
||||
"date", "time", "x_edge_location", "sc_bytes", "c_ip",
|
||||
"cs_method", "cs_host", "cs_uri_stem", "sc_status",
|
||||
"cs_referer", "cs_user_agent", "cs_uri_query",
|
||||
"cs_cookie", "x_edge_result_type", "x_edge_request_id",
|
||||
"x_host_header", "cs_protocol", "cs_bytes",
|
||||
"time_taken", "x_forwarded_for", "ssl_protocol",
|
||||
"ssl_cipher", "x_edge_response_result_type", "cs_protocol_version",
|
||||
]
|
||||
|
||||
# Timestamp formats
|
||||
NGINX_TS_FORMAT = "%d/%b/%Y:%H:%M:%S %z"
|
||||
APACHE_TS_FORMAT = "%d/%b/%Y:%H:%M:%S %z"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data classes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
|
||||
class LogEntry:
|
||||
"""A single parsed log entry."""
|
||||
timestamp: datetime | None
|
||||
ip: str
|
||||
method: str
|
||||
url: str
|
||||
status_code: int
|
||||
response_size: int
|
||||
user_agent: str
|
||||
referer: str
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
d = asdict(self)
|
||||
if self.timestamp:
|
||||
d["timestamp"] = self.timestamp.isoformat()
|
||||
return d
|
||||
|
||||
|
||||
@dataclass
|
||||
class BotIdentification:
|
||||
"""Bot identification result."""
|
||||
name: str
|
||||
user_agent_pattern: str
|
||||
category: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class BotStats:
|
||||
"""Aggregated statistics for a single bot."""
|
||||
name: str
|
||||
total_requests: int = 0
|
||||
unique_urls: int = 0
|
||||
status_distribution: dict[str, int] = field(default_factory=dict)
|
||||
top_urls: list[tuple[str, int]] = field(default_factory=list)
|
||||
hourly_distribution: dict[int, int] = field(default_factory=dict)
|
||||
daily_distribution: dict[str, int] = field(default_factory=dict)
|
||||
avg_response_size: float = 0.0
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"name": self.name,
|
||||
"total_requests": self.total_requests,
|
||||
"unique_urls": self.unique_urls,
|
||||
"status_distribution": self.status_distribution,
|
||||
"top_urls": [{"url": u, "count": c} for u, c in self.top_urls],
|
||||
"hourly_distribution": self.hourly_distribution,
|
||||
"daily_distribution": self.daily_distribution,
|
||||
"avg_response_size": round(self.avg_response_size, 1),
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class LogParseResult:
|
||||
"""Complete log parsing result."""
|
||||
log_file: str
|
||||
format_detected: str
|
||||
total_lines: int
|
||||
parsed_lines: int
|
||||
bot_entries: int
|
||||
date_range: dict[str, str]
|
||||
bots: dict[str, BotStats]
|
||||
errors: int
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"log_file": self.log_file,
|
||||
"format_detected": self.format_detected,
|
||||
"total_lines": self.total_lines,
|
||||
"parsed_lines": self.parsed_lines,
|
||||
"bot_entries": self.bot_entries,
|
||||
"date_range": self.date_range,
|
||||
"bots": {name: stats.to_dict() for name, stats in self.bots.items()},
|
||||
"errors": self.errors,
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# LogParser class
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class LogParser:
|
||||
"""Parse server access logs and identify search engine bot traffic."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
log_file: str,
|
||||
fmt: str = "auto",
|
||||
streaming: bool = False,
|
||||
):
|
||||
self.log_file = log_file
|
||||
self.fmt = fmt
|
||||
self.streaming = streaming
|
||||
self._detected_format: str | None = None
|
||||
self._parse_errors = 0
|
||||
|
||||
# -- format detection -----------------------------------------------------
|
||||
|
||||
def _detect_format(self, line: str) -> str:
|
||||
"""Auto-detect log format from a sample line."""
|
||||
if line.startswith("#"):
|
||||
return "cloudfront"
|
||||
if NGINX_COMBINED_RE.match(line):
|
||||
return "nginx"
|
||||
if APACHE_COMBINED_RE.match(line):
|
||||
return "apache"
|
||||
# Fallback: try tab-separated (CloudFront without header)
|
||||
if "\t" in line and line.count("\t") >= 10:
|
||||
return "cloudfront"
|
||||
return "nginx"
|
||||
|
||||
# -- line parsers ---------------------------------------------------------
|
||||
|
||||
def _parse_nginx_combined(self, line: str) -> LogEntry | None:
|
||||
"""Parse a single Nginx combined format log line."""
|
||||
m = NGINX_COMBINED_RE.match(line)
|
||||
if not m:
|
||||
return None
|
||||
ts = None
|
||||
try:
|
||||
ts = datetime.strptime(m.group("timestamp"), NGINX_TS_FORMAT)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
size_raw = m.group("size")
|
||||
size = int(size_raw) if size_raw != "-" else 0
|
||||
return LogEntry(
|
||||
timestamp=ts,
|
||||
ip=m.group("ip"),
|
||||
method=m.group("method"),
|
||||
url=m.group("url"),
|
||||
status_code=int(m.group("status")),
|
||||
response_size=size,
|
||||
user_agent=m.group("user_agent"),
|
||||
referer=m.group("referer"),
|
||||
)
|
||||
|
||||
def _parse_apache_combined(self, line: str) -> LogEntry | None:
|
||||
"""Parse a single Apache combined format log line."""
|
||||
m = APACHE_COMBINED_RE.match(line)
|
||||
if not m:
|
||||
return None
|
||||
ts = None
|
||||
try:
|
||||
ts = datetime.strptime(m.group("timestamp"), APACHE_TS_FORMAT)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
size_raw = m.group("size")
|
||||
size = int(size_raw) if size_raw != "-" else 0
|
||||
return LogEntry(
|
||||
timestamp=ts,
|
||||
ip=m.group("ip"),
|
||||
method=m.group("method"),
|
||||
url=m.group("url"),
|
||||
status_code=int(m.group("status")),
|
||||
response_size=size,
|
||||
user_agent=m.group("user_agent"),
|
||||
referer=m.group("referer"),
|
||||
)
|
||||
|
||||
def _parse_cloudfront(self, line: str) -> LogEntry | None:
|
||||
"""Parse a CloudFront tab-separated log line."""
|
||||
if line.startswith("#"):
|
||||
return None
|
||||
parts = line.strip().split("\t")
|
||||
if len(parts) < 13:
|
||||
return None
|
||||
ts = None
|
||||
try:
|
||||
ts = datetime.strptime(f"{parts[0]} {parts[1]}", "%Y-%m-%d %H:%M:%S")
|
||||
except (ValueError, IndexError):
|
||||
pass
|
||||
try:
|
||||
status = int(parts[8])
|
||||
except (ValueError, IndexError):
|
||||
status = 0
|
||||
try:
|
||||
size = int(parts[3])
|
||||
except (ValueError, IndexError):
|
||||
size = 0
|
||||
url = parts[7] if len(parts) > 7 else ""
|
||||
query = parts[11] if len(parts) > 11 else ""
|
||||
if query and query != "-":
|
||||
url = f"{url}?{query}"
|
||||
ua = parts[10] if len(parts) > 10 else ""
|
||||
ua = ua.replace("%20", " ").replace("%2520", " ")
|
||||
referer = parts[9] if len(parts) > 9 else ""
|
||||
return LogEntry(
|
||||
timestamp=ts,
|
||||
ip=parts[4] if len(parts) > 4 else "",
|
||||
method=parts[5] if len(parts) > 5 else "",
|
||||
url=url,
|
||||
status_code=status,
|
||||
response_size=size,
|
||||
user_agent=ua,
|
||||
referer=referer,
|
||||
)
|
||||
|
||||
def _parse_line(self, line: str, fmt: str) -> LogEntry | None:
|
||||
"""Route to the correct parser based on format."""
|
||||
parsers = {
|
||||
"nginx": self._parse_nginx_combined,
|
||||
"apache": self._parse_apache_combined,
|
||||
"cloudfront": self._parse_cloudfront,
|
||||
}
|
||||
parser = parsers.get(fmt, self._parse_nginx_combined)
|
||||
return parser(line)
|
||||
|
||||
# -- bot identification ---------------------------------------------------
|
||||
|
||||
@staticmethod
|
||||
def identify_bot(user_agent: str) -> BotIdentification | None:
|
||||
"""Match user-agent against known bot patterns."""
|
||||
if not user_agent or user_agent == "-":
|
||||
return None
|
||||
for name, pattern, category in COMPILED_BOT_PATTERNS:
|
||||
if pattern.search(user_agent):
|
||||
return BotIdentification(
|
||||
name=name,
|
||||
user_agent_pattern=pattern.pattern,
|
||||
category=category,
|
||||
)
|
||||
# Heuristic: generic bot detection via common keywords
|
||||
ua_lower = user_agent.lower()
|
||||
bot_keywords = ["bot", "spider", "crawler", "scraper", "fetch"]
|
||||
for kw in bot_keywords:
|
||||
if kw in ua_lower:
|
||||
return BotIdentification(
|
||||
name="other",
|
||||
user_agent_pattern=kw,
|
||||
category="other",
|
||||
)
|
||||
return None
|
||||
|
||||
# -- file handling --------------------------------------------------------
|
||||
|
||||
@staticmethod
|
||||
def _open_file(path: str) -> TextIO:
|
||||
"""Open plain text, .gz, or .bz2 log files."""
|
||||
p = Path(path)
|
||||
if p.suffix == ".gz":
|
||||
return gzip.open(path, "rt", encoding="utf-8", errors="replace")
|
||||
if p.suffix == ".bz2":
|
||||
return bz2.open(path, "rt", encoding="utf-8", errors="replace")
|
||||
return open(path, "r", encoding="utf-8", errors="replace")
|
||||
|
||||
# -- streaming parser -----------------------------------------------------
|
||||
|
||||
def parse_streaming(
|
||||
self,
|
||||
filter_bot: str | None = None,
|
||||
) -> Generator[tuple[LogEntry, BotIdentification], None, None]:
|
||||
"""Generator-based streaming parser for large files."""
|
||||
fmt = self.fmt
|
||||
first_line_checked = False
|
||||
|
||||
fh = self._open_file(self.log_file)
|
||||
try:
|
||||
for line in fh:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
if not first_line_checked and fmt == "auto":
|
||||
fmt = self._detect_format(line)
|
||||
self._detected_format = fmt
|
||||
first_line_checked = True
|
||||
entry = self._parse_line(line, fmt)
|
||||
if entry is None:
|
||||
self._parse_errors += 1
|
||||
continue
|
||||
bot = self.identify_bot(entry.user_agent)
|
||||
if bot is None:
|
||||
continue
|
||||
if filter_bot and bot.name != filter_bot.lower():
|
||||
continue
|
||||
yield entry, bot
|
||||
finally:
|
||||
fh.close()
|
||||
|
||||
# -- full parse -----------------------------------------------------------
|
||||
|
||||
def parse(
|
||||
self,
|
||||
filter_bot: str | None = None,
|
||||
date_from: datetime | None = None,
|
||||
date_to: datetime | None = None,
|
||||
) -> list[tuple[LogEntry, BotIdentification]]:
|
||||
"""Full parse with optional date and bot filters."""
|
||||
results: list[tuple[LogEntry, BotIdentification]] = []
|
||||
for entry, bot in self.parse_streaming(filter_bot):
|
||||
if date_from and entry.timestamp and entry.timestamp < date_from:
|
||||
continue
|
||||
if date_to and entry.timestamp and entry.timestamp > date_to:
|
||||
continue
|
||||
results.append((entry, bot))
|
||||
return results
|
||||
|
||||
# -- statistics -----------------------------------------------------------
|
||||
|
||||
@staticmethod
|
||||
def get_bot_stats(
|
||||
entries: list[tuple[LogEntry, BotIdentification]],
|
||||
) -> dict[str, BotStats]:
|
||||
"""Aggregate per-bot statistics from parsed entries."""
|
||||
bot_data: dict[str, dict] = defaultdict(lambda: {
|
||||
"urls": Counter(),
|
||||
"statuses": Counter(),
|
||||
"hours": Counter(),
|
||||
"days": Counter(),
|
||||
"sizes": [],
|
||||
"count": 0,
|
||||
})
|
||||
|
||||
for entry, bot in entries:
|
||||
bd = bot_data[bot.name]
|
||||
bd["count"] += 1
|
||||
bd["urls"][entry.url] += 1
|
||||
bd["statuses"][str(entry.status_code)] += 1
|
||||
bd["sizes"].append(entry.response_size)
|
||||
if entry.timestamp:
|
||||
bd["hours"][entry.timestamp.hour] += 1
|
||||
day_key = entry.timestamp.strftime("%Y-%m-%d")
|
||||
bd["days"][day_key] += 1
|
||||
|
||||
stats: dict[str, BotStats] = {}
|
||||
for name, bd in bot_data.items():
|
||||
avg_size = sum(bd["sizes"]) / len(bd["sizes"]) if bd["sizes"] else 0.0
|
||||
top_20 = bd["urls"].most_common(20)
|
||||
stats[name] = BotStats(
|
||||
name=name,
|
||||
total_requests=bd["count"],
|
||||
unique_urls=len(bd["urls"]),
|
||||
status_distribution=dict(bd["statuses"]),
|
||||
top_urls=top_20,
|
||||
hourly_distribution=dict(sorted(bd["hours"].items())),
|
||||
daily_distribution=dict(sorted(bd["days"].items())),
|
||||
avg_response_size=avg_size,
|
||||
)
|
||||
return stats
|
||||
|
||||
# -- orchestrator ---------------------------------------------------------
|
||||
|
||||
def parse_and_analyze(
|
||||
self,
|
||||
filter_bot: str | None = None,
|
||||
date_from: datetime | None = None,
|
||||
date_to: datetime | None = None,
|
||||
) -> LogParseResult:
|
||||
"""Orchestrate parsing and statistics generation."""
|
||||
entries = self.parse(filter_bot, date_from, date_to)
|
||||
bot_stats = self.get_bot_stats(entries)
|
||||
|
||||
# Determine date range
|
||||
timestamps = [e.timestamp for e, _ in entries if e.timestamp]
|
||||
date_range = {}
|
||||
if timestamps:
|
||||
date_range = {
|
||||
"from": min(timestamps).isoformat(),
|
||||
"to": max(timestamps).isoformat(),
|
||||
}
|
||||
|
||||
# Count total lines for context
|
||||
total_lines = 0
|
||||
fh = self._open_file(self.log_file)
|
||||
try:
|
||||
for _ in fh:
|
||||
total_lines += 1
|
||||
finally:
|
||||
fh.close()
|
||||
|
||||
return LogParseResult(
|
||||
log_file=self.log_file,
|
||||
format_detected=self._detected_format or self.fmt,
|
||||
total_lines=total_lines,
|
||||
parsed_lines=total_lines - self._parse_errors,
|
||||
bot_entries=len(entries),
|
||||
date_range=date_range,
|
||||
bots=bot_stats,
|
||||
errors=self._parse_errors,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _parse_date(val: str) -> datetime:
|
||||
"""Parse a date string in YYYY-MM-DD format."""
|
||||
return datetime.strptime(val, "%Y-%m-%d")
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Parse server access logs and identify search engine bot traffic.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--log-file",
|
||||
required=True,
|
||||
help="Path to access log file (plain, .gz, .bz2)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--format",
|
||||
dest="fmt",
|
||||
choices=["auto", "nginx", "apache", "cloudfront"],
|
||||
default="auto",
|
||||
help="Log format (default: auto-detect)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bot",
|
||||
default=None,
|
||||
help="Filter results to a specific bot (e.g., googlebot, yeti, bingbot, daumoa)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--streaming",
|
||||
action="store_true",
|
||||
help="Use streaming parser for large files (prints entries incrementally)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--date-from",
|
||||
default=None,
|
||||
help="Filter entries from date (YYYY-MM-DD)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--date-to",
|
||||
default=None,
|
||||
help="Filter entries to date (YYYY-MM-DD)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--json",
|
||||
action="store_true",
|
||||
help="Output in JSON format",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
default=None,
|
||||
help="Write output to file instead of stdout",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Validate file exists
|
||||
if not Path(args.log_file).exists():
|
||||
logger.error(f"Log file not found: {args.log_file}")
|
||||
sys.exit(1)
|
||||
|
||||
date_from = _parse_date(args.date_from) if args.date_from else None
|
||||
date_to = _parse_date(args.date_to) if args.date_to else None
|
||||
|
||||
lp = LogParser(log_file=args.log_file, fmt=args.fmt, streaming=args.streaming)
|
||||
|
||||
if args.streaming and not args.json:
|
||||
# Streaming mode: print entries as they are parsed
|
||||
count = 0
|
||||
for entry, bot in lp.parse_streaming(args.bot):
|
||||
if date_from and entry.timestamp and entry.timestamp < date_from:
|
||||
continue
|
||||
if date_to and entry.timestamp and entry.timestamp > date_to:
|
||||
continue
|
||||
ts_str = entry.timestamp.isoformat() if entry.timestamp else "N/A"
|
||||
print(
|
||||
f"[{bot.name}] {ts_str} {entry.status_code} "
|
||||
f"{entry.method} {entry.url} ({entry.response_size}B)"
|
||||
)
|
||||
count += 1
|
||||
print(f"\n--- Total bot requests: {count} ---")
|
||||
return
|
||||
|
||||
# Full analysis mode
|
||||
result = lp.parse_and_analyze(
|
||||
filter_bot=args.bot,
|
||||
date_from=date_from,
|
||||
date_to=date_to,
|
||||
)
|
||||
|
||||
if args.json:
|
||||
output_data = result.to_dict()
|
||||
output_str = json.dumps(output_data, indent=2, ensure_ascii=False)
|
||||
else:
|
||||
lines = [
|
||||
f"Log File: {result.log_file}",
|
||||
f"Format: {result.format_detected}",
|
||||
f"Total Lines: {result.total_lines:,}",
|
||||
f"Parsed Lines: {result.parsed_lines:,}",
|
||||
f"Bot Entries: {result.bot_entries:,}",
|
||||
f"Parse Errors: {result.errors:,}",
|
||||
]
|
||||
if result.date_range:
|
||||
lines.append(f"Date Range: {result.date_range.get('from', 'N/A')} to {result.date_range.get('to', 'N/A')}")
|
||||
lines.append("")
|
||||
lines.append("=" * 60)
|
||||
lines.append("Bot Statistics")
|
||||
lines.append("=" * 60)
|
||||
for name, stats in sorted(result.bots.items(), key=lambda x: -x[1].total_requests):
|
||||
lines.append(f"\n--- {name.upper()} ---")
|
||||
lines.append(f" Requests: {stats.total_requests:,}")
|
||||
lines.append(f" Unique URLs: {stats.unique_urls:,}")
|
||||
lines.append(f" Avg Response Size: {stats.avg_response_size:,.0f} bytes")
|
||||
lines.append(f" Status Distribution: {stats.status_distribution}")
|
||||
lines.append(f" Top 10 URLs:")
|
||||
for url, cnt in stats.top_urls[:10]:
|
||||
lines.append(f" {cnt:>6,} | {url}")
|
||||
if stats.hourly_distribution:
|
||||
peak_hour = max(stats.hourly_distribution, key=stats.hourly_distribution.get)
|
||||
lines.append(f" Peak Hour: {peak_hour}:00 ({stats.hourly_distribution[peak_hour]:,} reqs)")
|
||||
output_str = "\n".join(lines)
|
||||
|
||||
if args.output:
|
||||
Path(args.output).write_text(output_str, encoding="utf-8")
|
||||
logger.info(f"Output written to {args.output}")
|
||||
else:
|
||||
print(output_str)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||