# Source: custom-skills/32-seo-crawl-budget/code/scripts/log_parser.py (new file, 613 lines)
#
# Commit: Add SEO skills 19-28, 31-32 with full Python implementations.
# 12 new skills: Keyword Strategy, SERP Analysis, Position Tracking,
# Link Building, Content Strategy, E-Commerce SEO, KPI Framework,
# International SEO, AI Visibility, Knowledge Graph, Competitor Intel,
# and Crawl Budget. ~20K lines of Python across 25 domain scripts.
# Updated skill 11 pipeline table and repo CLAUDE.md. Enhanced skill 18
# local SEO workflow from the jamie.clinic audit. Note: skill 26
# hreflang_validator.py pending (content-filter block).
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
"""
|
||||
Log Parser - Server access log parser with bot identification
|
||||
=============================================================
|
||||
Purpose: Parse Apache/Nginx/CloudFront access logs, identify search engine
|
||||
bots, extract crawl data, and generate per-bot statistics.
|
||||
Python: 3.10+
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import bz2
|
||||
import gzip
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
from collections import Counter, defaultdict
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Generator, TextIO
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Constants: bot user-agent patterns
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
BOT_PATTERNS: list[tuple[str, str, str]] = [
|
||||
# (canonical name, regex pattern, category)
|
||||
("googlebot", r"Googlebot(?:-Image|-News|-Video)?/", "search_engine"),
|
||||
("googlebot-adsbot", r"AdsBot-Google", "search_engine"),
|
||||
("googlebot-mediapartners", r"Mediapartners-Google", "search_engine"),
|
||||
("yeti", r"Yeti/", "search_engine"),
|
||||
("bingbot", r"bingbot/", "search_engine"),
|
||||
("daumoa", r"Daumoa", "search_engine"),
|
||||
("applebot", r"Applebot/", "search_engine"),
|
||||
("duckduckbot", r"DuckDuckBot/", "search_engine"),
|
||||
("baiduspider", r"Baiduspider", "search_engine"),
|
||||
("yandexbot", r"YandexBot/", "search_engine"),
|
||||
("sogou", r"Sogou", "search_engine"),
|
||||
("seznambot", r"SeznamBot/", "search_engine"),
|
||||
("ahrefsbot", r"AhrefsBot/", "seo_tool"),
|
||||
("semrushbot", r"SemrushBot/", "seo_tool"),
|
||||
("mj12bot", r"MJ12bot/", "seo_tool"),
|
||||
("dotbot", r"DotBot/", "seo_tool"),
|
||||
("rogerbot", r"rogerbot/", "seo_tool"),
|
||||
("screaming frog", r"Screaming Frog SEO Spider", "seo_tool"),
|
||||
]
|
||||
|
||||
COMPILED_BOT_PATTERNS: list[tuple[str, re.Pattern, str]] = [
|
||||
(name, re.compile(pattern, re.IGNORECASE), category)
|
||||
for name, pattern, category in BOT_PATTERNS
|
||||
]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Regex patterns for each log format
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
NGINX_COMBINED_RE = re.compile(
|
||||
r'(?P<ip>[\d.:a-fA-F]+)\s+-\s+(?P<user>\S+)\s+'
|
||||
r'\[(?P<timestamp>[^\]]+)\]\s+'
|
||||
r'"(?P<method>\S+)\s+(?P<url>\S+)\s+(?P<protocol>[^"]+)"\s+'
|
||||
r'(?P<status>\d{3})\s+(?P<size>\d+|-)\s+'
|
||||
r'"(?P<referer>[^"]*)"\s+'
|
||||
r'"(?P<user_agent>[^"]*)"'
|
||||
)
|
||||
|
||||
APACHE_COMBINED_RE = re.compile(
|
||||
r'(?P<ip>[\d.:a-fA-F]+)\s+\S+\s+(?P<user>\S+)\s+'
|
||||
r'\[(?P<timestamp>[^\]]+)\]\s+'
|
||||
r'"(?P<method>\S+)\s+(?P<url>\S+)\s+(?P<protocol>[^"]+)"\s+'
|
||||
r'(?P<status>\d{3})\s+(?P<size>\d+|-)\s+'
|
||||
r'"(?P<referer>[^"]*)"\s+'
|
||||
r'"(?P<user_agent>[^"]*)"'
|
||||
)
|
||||
|
||||
CLOUDFRONT_FIELDS = [
|
||||
"date", "time", "x_edge_location", "sc_bytes", "c_ip",
|
||||
"cs_method", "cs_host", "cs_uri_stem", "sc_status",
|
||||
"cs_referer", "cs_user_agent", "cs_uri_query",
|
||||
"cs_cookie", "x_edge_result_type", "x_edge_request_id",
|
||||
"x_host_header", "cs_protocol", "cs_bytes",
|
||||
"time_taken", "x_forwarded_for", "ssl_protocol",
|
||||
"ssl_cipher", "x_edge_response_result_type", "cs_protocol_version",
|
||||
]
|
||||
|
||||
# Timestamp formats
|
||||
NGINX_TS_FORMAT = "%d/%b/%Y:%H:%M:%S %z"
|
||||
APACHE_TS_FORMAT = "%d/%b/%Y:%H:%M:%S %z"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data classes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
|
||||
class LogEntry:
|
||||
"""A single parsed log entry."""
|
||||
timestamp: datetime | None
|
||||
ip: str
|
||||
method: str
|
||||
url: str
|
||||
status_code: int
|
||||
response_size: int
|
||||
user_agent: str
|
||||
referer: str
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
d = asdict(self)
|
||||
if self.timestamp:
|
||||
d["timestamp"] = self.timestamp.isoformat()
|
||||
return d
|
||||
|
||||
|
||||
@dataclass
|
||||
class BotIdentification:
|
||||
"""Bot identification result."""
|
||||
name: str
|
||||
user_agent_pattern: str
|
||||
category: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class BotStats:
|
||||
"""Aggregated statistics for a single bot."""
|
||||
name: str
|
||||
total_requests: int = 0
|
||||
unique_urls: int = 0
|
||||
status_distribution: dict[str, int] = field(default_factory=dict)
|
||||
top_urls: list[tuple[str, int]] = field(default_factory=list)
|
||||
hourly_distribution: dict[int, int] = field(default_factory=dict)
|
||||
daily_distribution: dict[str, int] = field(default_factory=dict)
|
||||
avg_response_size: float = 0.0
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"name": self.name,
|
||||
"total_requests": self.total_requests,
|
||||
"unique_urls": self.unique_urls,
|
||||
"status_distribution": self.status_distribution,
|
||||
"top_urls": [{"url": u, "count": c} for u, c in self.top_urls],
|
||||
"hourly_distribution": self.hourly_distribution,
|
||||
"daily_distribution": self.daily_distribution,
|
||||
"avg_response_size": round(self.avg_response_size, 1),
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class LogParseResult:
|
||||
"""Complete log parsing result."""
|
||||
log_file: str
|
||||
format_detected: str
|
||||
total_lines: int
|
||||
parsed_lines: int
|
||||
bot_entries: int
|
||||
date_range: dict[str, str]
|
||||
bots: dict[str, BotStats]
|
||||
errors: int
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"log_file": self.log_file,
|
||||
"format_detected": self.format_detected,
|
||||
"total_lines": self.total_lines,
|
||||
"parsed_lines": self.parsed_lines,
|
||||
"bot_entries": self.bot_entries,
|
||||
"date_range": self.date_range,
|
||||
"bots": {name: stats.to_dict() for name, stats in self.bots.items()},
|
||||
"errors": self.errors,
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# LogParser class
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class LogParser:
|
||||
"""Parse server access logs and identify search engine bot traffic."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
log_file: str,
|
||||
fmt: str = "auto",
|
||||
streaming: bool = False,
|
||||
):
|
||||
self.log_file = log_file
|
||||
self.fmt = fmt
|
||||
self.streaming = streaming
|
||||
self._detected_format: str | None = None
|
||||
self._parse_errors = 0
|
||||
|
||||
# -- format detection -----------------------------------------------------
|
||||
|
||||
def _detect_format(self, line: str) -> str:
|
||||
"""Auto-detect log format from a sample line."""
|
||||
if line.startswith("#"):
|
||||
return "cloudfront"
|
||||
if NGINX_COMBINED_RE.match(line):
|
||||
return "nginx"
|
||||
if APACHE_COMBINED_RE.match(line):
|
||||
return "apache"
|
||||
# Fallback: try tab-separated (CloudFront without header)
|
||||
if "\t" in line and line.count("\t") >= 10:
|
||||
return "cloudfront"
|
||||
return "nginx"
|
||||
|
||||
# -- line parsers ---------------------------------------------------------
|
||||
|
||||
def _parse_nginx_combined(self, line: str) -> LogEntry | None:
|
||||
"""Parse a single Nginx combined format log line."""
|
||||
m = NGINX_COMBINED_RE.match(line)
|
||||
if not m:
|
||||
return None
|
||||
ts = None
|
||||
try:
|
||||
ts = datetime.strptime(m.group("timestamp"), NGINX_TS_FORMAT)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
size_raw = m.group("size")
|
||||
size = int(size_raw) if size_raw != "-" else 0
|
||||
return LogEntry(
|
||||
timestamp=ts,
|
||||
ip=m.group("ip"),
|
||||
method=m.group("method"),
|
||||
url=m.group("url"),
|
||||
status_code=int(m.group("status")),
|
||||
response_size=size,
|
||||
user_agent=m.group("user_agent"),
|
||||
referer=m.group("referer"),
|
||||
)
|
||||
|
||||
def _parse_apache_combined(self, line: str) -> LogEntry | None:
|
||||
"""Parse a single Apache combined format log line."""
|
||||
m = APACHE_COMBINED_RE.match(line)
|
||||
if not m:
|
||||
return None
|
||||
ts = None
|
||||
try:
|
||||
ts = datetime.strptime(m.group("timestamp"), APACHE_TS_FORMAT)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
size_raw = m.group("size")
|
||||
size = int(size_raw) if size_raw != "-" else 0
|
||||
return LogEntry(
|
||||
timestamp=ts,
|
||||
ip=m.group("ip"),
|
||||
method=m.group("method"),
|
||||
url=m.group("url"),
|
||||
status_code=int(m.group("status")),
|
||||
response_size=size,
|
||||
user_agent=m.group("user_agent"),
|
||||
referer=m.group("referer"),
|
||||
)
|
||||
|
||||
def _parse_cloudfront(self, line: str) -> LogEntry | None:
|
||||
"""Parse a CloudFront tab-separated log line."""
|
||||
if line.startswith("#"):
|
||||
return None
|
||||
parts = line.strip().split("\t")
|
||||
if len(parts) < 13:
|
||||
return None
|
||||
ts = None
|
||||
try:
|
||||
ts = datetime.strptime(f"{parts[0]} {parts[1]}", "%Y-%m-%d %H:%M:%S")
|
||||
except (ValueError, IndexError):
|
||||
pass
|
||||
try:
|
||||
status = int(parts[8])
|
||||
except (ValueError, IndexError):
|
||||
status = 0
|
||||
try:
|
||||
size = int(parts[3])
|
||||
except (ValueError, IndexError):
|
||||
size = 0
|
||||
url = parts[7] if len(parts) > 7 else ""
|
||||
query = parts[11] if len(parts) > 11 else ""
|
||||
if query and query != "-":
|
||||
url = f"{url}?{query}"
|
||||
ua = parts[10] if len(parts) > 10 else ""
|
||||
ua = ua.replace("%20", " ").replace("%2520", " ")
|
||||
referer = parts[9] if len(parts) > 9 else ""
|
||||
return LogEntry(
|
||||
timestamp=ts,
|
||||
ip=parts[4] if len(parts) > 4 else "",
|
||||
method=parts[5] if len(parts) > 5 else "",
|
||||
url=url,
|
||||
status_code=status,
|
||||
response_size=size,
|
||||
user_agent=ua,
|
||||
referer=referer,
|
||||
)
|
||||
|
||||
def _parse_line(self, line: str, fmt: str) -> LogEntry | None:
|
||||
"""Route to the correct parser based on format."""
|
||||
parsers = {
|
||||
"nginx": self._parse_nginx_combined,
|
||||
"apache": self._parse_apache_combined,
|
||||
"cloudfront": self._parse_cloudfront,
|
||||
}
|
||||
parser = parsers.get(fmt, self._parse_nginx_combined)
|
||||
return parser(line)
|
||||
|
||||
# -- bot identification ---------------------------------------------------
|
||||
|
||||
@staticmethod
|
||||
def identify_bot(user_agent: str) -> BotIdentification | None:
|
||||
"""Match user-agent against known bot patterns."""
|
||||
if not user_agent or user_agent == "-":
|
||||
return None
|
||||
for name, pattern, category in COMPILED_BOT_PATTERNS:
|
||||
if pattern.search(user_agent):
|
||||
return BotIdentification(
|
||||
name=name,
|
||||
user_agent_pattern=pattern.pattern,
|
||||
category=category,
|
||||
)
|
||||
# Heuristic: generic bot detection via common keywords
|
||||
ua_lower = user_agent.lower()
|
||||
bot_keywords = ["bot", "spider", "crawler", "scraper", "fetch"]
|
||||
for kw in bot_keywords:
|
||||
if kw in ua_lower:
|
||||
return BotIdentification(
|
||||
name="other",
|
||||
user_agent_pattern=kw,
|
||||
category="other",
|
||||
)
|
||||
return None
|
||||
|
||||
# -- file handling --------------------------------------------------------
|
||||
|
||||
@staticmethod
|
||||
def _open_file(path: str) -> TextIO:
|
||||
"""Open plain text, .gz, or .bz2 log files."""
|
||||
p = Path(path)
|
||||
if p.suffix == ".gz":
|
||||
return gzip.open(path, "rt", encoding="utf-8", errors="replace")
|
||||
if p.suffix == ".bz2":
|
||||
return bz2.open(path, "rt", encoding="utf-8", errors="replace")
|
||||
return open(path, "r", encoding="utf-8", errors="replace")
|
||||
|
||||
# -- streaming parser -----------------------------------------------------
|
||||
|
||||
def parse_streaming(
|
||||
self,
|
||||
filter_bot: str | None = None,
|
||||
) -> Generator[tuple[LogEntry, BotIdentification], None, None]:
|
||||
"""Generator-based streaming parser for large files."""
|
||||
fmt = self.fmt
|
||||
first_line_checked = False
|
||||
|
||||
fh = self._open_file(self.log_file)
|
||||
try:
|
||||
for line in fh:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
if not first_line_checked and fmt == "auto":
|
||||
fmt = self._detect_format(line)
|
||||
self._detected_format = fmt
|
||||
first_line_checked = True
|
||||
entry = self._parse_line(line, fmt)
|
||||
if entry is None:
|
||||
self._parse_errors += 1
|
||||
continue
|
||||
bot = self.identify_bot(entry.user_agent)
|
||||
if bot is None:
|
||||
continue
|
||||
if filter_bot and bot.name != filter_bot.lower():
|
||||
continue
|
||||
yield entry, bot
|
||||
finally:
|
||||
fh.close()
|
||||
|
||||
# -- full parse -----------------------------------------------------------
|
||||
|
||||
def parse(
|
||||
self,
|
||||
filter_bot: str | None = None,
|
||||
date_from: datetime | None = None,
|
||||
date_to: datetime | None = None,
|
||||
) -> list[tuple[LogEntry, BotIdentification]]:
|
||||
"""Full parse with optional date and bot filters."""
|
||||
results: list[tuple[LogEntry, BotIdentification]] = []
|
||||
for entry, bot in self.parse_streaming(filter_bot):
|
||||
if date_from and entry.timestamp and entry.timestamp < date_from:
|
||||
continue
|
||||
if date_to and entry.timestamp and entry.timestamp > date_to:
|
||||
continue
|
||||
results.append((entry, bot))
|
||||
return results
|
||||
|
||||
# -- statistics -----------------------------------------------------------
|
||||
|
||||
@staticmethod
|
||||
def get_bot_stats(
|
||||
entries: list[tuple[LogEntry, BotIdentification]],
|
||||
) -> dict[str, BotStats]:
|
||||
"""Aggregate per-bot statistics from parsed entries."""
|
||||
bot_data: dict[str, dict] = defaultdict(lambda: {
|
||||
"urls": Counter(),
|
||||
"statuses": Counter(),
|
||||
"hours": Counter(),
|
||||
"days": Counter(),
|
||||
"sizes": [],
|
||||
"count": 0,
|
||||
})
|
||||
|
||||
for entry, bot in entries:
|
||||
bd = bot_data[bot.name]
|
||||
bd["count"] += 1
|
||||
bd["urls"][entry.url] += 1
|
||||
bd["statuses"][str(entry.status_code)] += 1
|
||||
bd["sizes"].append(entry.response_size)
|
||||
if entry.timestamp:
|
||||
bd["hours"][entry.timestamp.hour] += 1
|
||||
day_key = entry.timestamp.strftime("%Y-%m-%d")
|
||||
bd["days"][day_key] += 1
|
||||
|
||||
stats: dict[str, BotStats] = {}
|
||||
for name, bd in bot_data.items():
|
||||
avg_size = sum(bd["sizes"]) / len(bd["sizes"]) if bd["sizes"] else 0.0
|
||||
top_20 = bd["urls"].most_common(20)
|
||||
stats[name] = BotStats(
|
||||
name=name,
|
||||
total_requests=bd["count"],
|
||||
unique_urls=len(bd["urls"]),
|
||||
status_distribution=dict(bd["statuses"]),
|
||||
top_urls=top_20,
|
||||
hourly_distribution=dict(sorted(bd["hours"].items())),
|
||||
daily_distribution=dict(sorted(bd["days"].items())),
|
||||
avg_response_size=avg_size,
|
||||
)
|
||||
return stats
|
||||
|
||||
# -- orchestrator ---------------------------------------------------------
|
||||
|
||||
def parse_and_analyze(
|
||||
self,
|
||||
filter_bot: str | None = None,
|
||||
date_from: datetime | None = None,
|
||||
date_to: datetime | None = None,
|
||||
) -> LogParseResult:
|
||||
"""Orchestrate parsing and statistics generation."""
|
||||
entries = self.parse(filter_bot, date_from, date_to)
|
||||
bot_stats = self.get_bot_stats(entries)
|
||||
|
||||
# Determine date range
|
||||
timestamps = [e.timestamp for e, _ in entries if e.timestamp]
|
||||
date_range = {}
|
||||
if timestamps:
|
||||
date_range = {
|
||||
"from": min(timestamps).isoformat(),
|
||||
"to": max(timestamps).isoformat(),
|
||||
}
|
||||
|
||||
# Count total lines for context
|
||||
total_lines = 0
|
||||
fh = self._open_file(self.log_file)
|
||||
try:
|
||||
for _ in fh:
|
||||
total_lines += 1
|
||||
finally:
|
||||
fh.close()
|
||||
|
||||
return LogParseResult(
|
||||
log_file=self.log_file,
|
||||
format_detected=self._detected_format or self.fmt,
|
||||
total_lines=total_lines,
|
||||
parsed_lines=total_lines - self._parse_errors,
|
||||
bot_entries=len(entries),
|
||||
date_range=date_range,
|
||||
bots=bot_stats,
|
||||
errors=self._parse_errors,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _parse_date(val: str) -> datetime:
|
||||
"""Parse a date string in YYYY-MM-DD format."""
|
||||
return datetime.strptime(val, "%Y-%m-%d")
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Parse server access logs and identify search engine bot traffic.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--log-file",
|
||||
required=True,
|
||||
help="Path to access log file (plain, .gz, .bz2)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--format",
|
||||
dest="fmt",
|
||||
choices=["auto", "nginx", "apache", "cloudfront"],
|
||||
default="auto",
|
||||
help="Log format (default: auto-detect)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bot",
|
||||
default=None,
|
||||
help="Filter results to a specific bot (e.g., googlebot, yeti, bingbot, daumoa)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--streaming",
|
||||
action="store_true",
|
||||
help="Use streaming parser for large files (prints entries incrementally)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--date-from",
|
||||
default=None,
|
||||
help="Filter entries from date (YYYY-MM-DD)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--date-to",
|
||||
default=None,
|
||||
help="Filter entries to date (YYYY-MM-DD)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--json",
|
||||
action="store_true",
|
||||
help="Output in JSON format",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
default=None,
|
||||
help="Write output to file instead of stdout",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Validate file exists
|
||||
if not Path(args.log_file).exists():
|
||||
logger.error(f"Log file not found: {args.log_file}")
|
||||
sys.exit(1)
|
||||
|
||||
date_from = _parse_date(args.date_from) if args.date_from else None
|
||||
date_to = _parse_date(args.date_to) if args.date_to else None
|
||||
|
||||
lp = LogParser(log_file=args.log_file, fmt=args.fmt, streaming=args.streaming)
|
||||
|
||||
if args.streaming and not args.json:
|
||||
# Streaming mode: print entries as they are parsed
|
||||
count = 0
|
||||
for entry, bot in lp.parse_streaming(args.bot):
|
||||
if date_from and entry.timestamp and entry.timestamp < date_from:
|
||||
continue
|
||||
if date_to and entry.timestamp and entry.timestamp > date_to:
|
||||
continue
|
||||
ts_str = entry.timestamp.isoformat() if entry.timestamp else "N/A"
|
||||
print(
|
||||
f"[{bot.name}] {ts_str} {entry.status_code} "
|
||||
f"{entry.method} {entry.url} ({entry.response_size}B)"
|
||||
)
|
||||
count += 1
|
||||
print(f"\n--- Total bot requests: {count} ---")
|
||||
return
|
||||
|
||||
# Full analysis mode
|
||||
result = lp.parse_and_analyze(
|
||||
filter_bot=args.bot,
|
||||
date_from=date_from,
|
||||
date_to=date_to,
|
||||
)
|
||||
|
||||
if args.json:
|
||||
output_data = result.to_dict()
|
||||
output_str = json.dumps(output_data, indent=2, ensure_ascii=False)
|
||||
else:
|
||||
lines = [
|
||||
f"Log File: {result.log_file}",
|
||||
f"Format: {result.format_detected}",
|
||||
f"Total Lines: {result.total_lines:,}",
|
||||
f"Parsed Lines: {result.parsed_lines:,}",
|
||||
f"Bot Entries: {result.bot_entries:,}",
|
||||
f"Parse Errors: {result.errors:,}",
|
||||
]
|
||||
if result.date_range:
|
||||
lines.append(f"Date Range: {result.date_range.get('from', 'N/A')} to {result.date_range.get('to', 'N/A')}")
|
||||
lines.append("")
|
||||
lines.append("=" * 60)
|
||||
lines.append("Bot Statistics")
|
||||
lines.append("=" * 60)
|
||||
for name, stats in sorted(result.bots.items(), key=lambda x: -x[1].total_requests):
|
||||
lines.append(f"\n--- {name.upper()} ---")
|
||||
lines.append(f" Requests: {stats.total_requests:,}")
|
||||
lines.append(f" Unique URLs: {stats.unique_urls:,}")
|
||||
lines.append(f" Avg Response Size: {stats.avg_response_size:,.0f} bytes")
|
||||
lines.append(f" Status Distribution: {stats.status_distribution}")
|
||||
lines.append(f" Top 10 URLs:")
|
||||
for url, cnt in stats.top_urls[:10]:
|
||||
lines.append(f" {cnt:>6,} | {url}")
|
||||
if stats.hourly_distribution:
|
||||
peak_hour = max(stats.hourly_distribution, key=stats.hourly_distribution.get)
|
||||
lines.append(f" Peak Hour: {peak_hour}:00 ({stats.hourly_distribution[peak_hour]:,} reqs)")
|
||||
output_str = "\n".join(lines)
|
||||
|
||||
if args.output:
|
||||
Path(args.output).write_text(output_str, encoding="utf-8")
|
||||
logger.info(f"Output written to {args.output}")
|
||||
else:
|
||||
print(output_str)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||