refactor: Reorganize skill numbering and update documentation

Skill Numbering Changes:
- 01-03: OurDigital core (was 30-32)
- 31-32: Notion tools (was 01-02)
- 99_archive: Renamed from _archive for sorting

New Files:
- AGENTS.md: Claude Code agent routing guide
- requirements.txt for 00-claude-code-setting, 32-notion-writer, 43-jamie-youtube-manager

Documentation Updates:
- CLAUDE.md: Updated skill inventory (23 skills)
- AUDIT_REPORT.md: Current completion status (91%)
- Archived REFACTORING_PLAN.md (most tasks complete)

Removed:
- ga-agent-skills/ (moved to separate repo ~/Project/dintel-ga4-agent)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 18:42:39 +07:00
parent ae193d5e08
commit b69e4b6f3a
100 changed files with 655 additions and 1812 deletions

"""
Robots.txt Checker - Analyze robots.txt configuration
=====================================================
Purpose: Parse and analyze robots.txt for SEO compliance
Python: 3.10+
Usage:
python robots_checker.py --url https://example.com/robots.txt
python robots_checker.py --url https://example.com --test-url /admin/
"""
import argparse
import json
import logging
from dataclasses import dataclass, field
from datetime import datetime
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
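# Third-party dependency (pip install requests); all other imports are stdlib.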
import requests
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
@dataclass
class RobotsIssue:
"""Represents a robots.txt issue."""
severity: str # "error", "warning", "info"
message: str
line_number: int | None = None
directive: str | None = None
suggestion: str | None = None
@dataclass
class UserAgentRules:
"""Rules for a specific user-agent."""
user_agent: str
disallow: list[str] = field(default_factory=list)
allow: list[str] = field(default_factory=list)
crawl_delay: float | None = None
@dataclass
class RobotsResult:
"""Complete robots.txt analysis result."""
url: str
accessible: bool = True
content: str = ""
rules: list[UserAgentRules] = field(default_factory=list)
sitemaps: list[str] = field(default_factory=list)
issues: list[RobotsIssue] = field(default_factory=list)
stats: dict = field(default_factory=dict)
timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
def to_dict(self) -> dict:
"""Convert to dictionary for JSON output."""
return {
"url": self.url,
"accessible": self.accessible,
"sitemaps": self.sitemaps,
"rules": [
{
"user_agent": r.user_agent,
"disallow": r.disallow,
"allow": r.allow,
"crawl_delay": r.crawl_delay,
}
for r in self.rules
],
"issues": [
{
"severity": i.severity,
"message": i.message,
"line_number": i.line_number,
"directive": i.directive,
"suggestion": i.suggestion,
}
for i in self.issues
],
"stats": self.stats,
"timestamp": self.timestamp,
}
class RobotsChecker:
"""Analyze robots.txt configuration."""
# Common user agents
USER_AGENTS = {
"*": "All bots",
"Googlebot": "Google crawler",
"Googlebot-Image": "Google Image crawler",
"Googlebot-News": "Google News crawler",
"Googlebot-Video": "Google Video crawler",
"Bingbot": "Bing crawler",
"Slurp": "Yahoo crawler",
"DuckDuckBot": "DuckDuckGo crawler",
"Baiduspider": "Baidu crawler",
"Yandex": "Yandex crawler",
"facebot": "Facebook crawler",
"Twitterbot": "Twitter crawler",
"LinkedInBot": "LinkedIn crawler",
}
# Paths that should generally not be blocked
IMPORTANT_PATHS = [
"/",
"/*.css",
"/*.js",
"/*.jpg",
"/*.jpeg",
"/*.png",
"/*.gif",
"/*.svg",
"/*.webp",
]
# Paths commonly blocked
COMMON_BLOCKED = [
"/admin",
"/wp-admin",
"/login",
"/private",
"/api",
"/cgi-bin",
"/tmp",
"/search",
]
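    # NOTE: USER_AGENTS, IMPORTANT_PATHS, and COMMON_BLOCKED are reference
    # data; the analysis methods below do not currently consult them.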
def __init__(self):
self.session = requests.Session()
self.session.headers.update({
"User-Agent": "Mozilla/5.0 (compatible; SEOAuditBot/1.0)"
})
def fetch_robots(self, url: str) -> str | None:
"""Fetch robots.txt content."""
# Ensure we're fetching robots.txt
parsed = urlparse(url)
if not parsed.path.endswith("robots.txt"):
robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
else:
robots_url = url
try:
response = self.session.get(robots_url, timeout=10)
if response.status_code == 200:
return response.text
elif response.status_code == 404:
return None
else:
raise RuntimeError(f"HTTP {response.status_code}")
        except requests.RequestException as e:
            raise RuntimeError(f"Failed to fetch robots.txt: {e}") from e
def parse_robots(self, content: str) -> tuple[list[UserAgentRules], list[str]]:
"""Parse robots.txt content."""
rules = []
sitemaps = []
        current_rules = None
for line_num, line in enumerate(content.split("\n"), 1):
line = line.strip()
# Skip empty lines and comments
if not line or line.startswith("#"):
continue
# Parse directive
if ":" not in line:
continue
directive, value = line.split(":", 1)
directive = directive.strip().lower()
value = value.strip()
            if directive == "user-agent":
                # Close out the previous group before starting a new one.
                # (Consecutive User-agent lines are treated as separate groups
                # here; the robots.txt spec allows them to share one rule block.)
                if current_rules:
                    rules.append(current_rules)
                current_rules = UserAgentRules(user_agent=value)
elif directive == "disallow" and current_rules:
if value: # Empty disallow means allow all
current_rules.disallow.append(value)
elif directive == "allow" and current_rules:
if value:
current_rules.allow.append(value)
            elif directive == "crawl-delay" and current_rules:
                try:
                    current_rules.crawl_delay = float(value)
                except ValueError:
                    pass  # ignore malformed crawl-delay values
elif directive == "sitemap":
if value:
sitemaps.append(value)
# Don't forget last user-agent
if current_rules:
rules.append(current_rules)
return rules, sitemaps
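    # Worked example (hypothetical input): given the content
    #     User-agent: *
    #     Disallow: /admin
    #     Sitemap: https://example.com/sitemap.xml
    # parse_robots returns
    #     ([UserAgentRules(user_agent="*", disallow=["/admin"])],
    #      ["https://example.com/sitemap.xml"])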
def analyze(self, url: str) -> RobotsResult:
"""Analyze robots.txt."""
result = RobotsResult(url=url)
# Fetch robots.txt
try:
content = self.fetch_robots(url)
if content is None:
result.accessible = False
result.issues.append(RobotsIssue(
severity="info",
message="No robots.txt found (returns 404)",
suggestion="Consider creating a robots.txt file",
))
return result
except RuntimeError as e:
result.accessible = False
result.issues.append(RobotsIssue(
severity="error",
message=str(e),
))
return result
result.content = content
result.rules, result.sitemaps = self.parse_robots(content)
# Analyze content
self._analyze_syntax(result)
self._analyze_rules(result)
self._analyze_sitemaps(result)
# Calculate stats
result.stats = {
"user_agents_count": len(result.rules),
"user_agents": [r.user_agent for r in result.rules],
"total_disallow_rules": sum(len(r.disallow) for r in result.rules),
"total_allow_rules": sum(len(r.allow) for r in result.rules),
"sitemaps_count": len(result.sitemaps),
"has_crawl_delay": any(r.crawl_delay for r in result.rules),
"content_length": len(content),
}
return result
def _analyze_syntax(self, result: RobotsResult) -> None:
"""Check for syntax issues."""
        lines = result.content.split("\n")
        # Directives this checker recognizes (hoisted out of the loop)
        valid_directives = {
            "user-agent", "disallow", "allow",
            "crawl-delay", "sitemap", "host",
        }
        for line_num, line in enumerate(lines, 1):
line = line.strip()
# Skip empty lines and comments
if not line or line.startswith("#"):
continue
# Check for valid directive
if ":" not in line:
result.issues.append(RobotsIssue(
severity="warning",
message=f"Invalid line (missing colon): {line[:50]}",
line_number=line_num,
))
continue
directive, value = line.split(":", 1)
directive = directive.strip().lower()
if directive not in valid_directives:
result.issues.append(RobotsIssue(
severity="info",
message=f"Unknown directive: {directive}",
line_number=line_num,
directive=directive,
))
def _analyze_rules(self, result: RobotsResult) -> None:
"""Analyze blocking rules."""
# Check if there are any rules
if not result.rules:
result.issues.append(RobotsIssue(
severity="info",
message="No user-agent rules defined",
suggestion="Add User-agent: * rules to control crawling",
))
return
# Check for wildcard rule
has_wildcard = any(r.user_agent == "*" for r in result.rules)
if not has_wildcard:
result.issues.append(RobotsIssue(
severity="info",
message="No wildcard (*) user-agent defined",
suggestion="Consider adding User-agent: * as fallback",
))
# Check for blocking important resources
for rules in result.rules:
for disallow in rules.disallow:
# Check if blocking root
if disallow == "/":
result.issues.append(RobotsIssue(
severity="error",
message=f"Blocking entire site for {rules.user_agent}",
directive=f"Disallow: {disallow}",
suggestion="This will prevent indexing. Is this intentional?",
))
                # Check if blocking CSS/JS; match at the end of the pattern
                # to avoid false positives such as ".json"
                if disallow.lower().rstrip("$").endswith((".css", ".js")):
                    result.issues.append(RobotsIssue(
                        severity="warning",
                        message=f"Blocking CSS/JS files for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                        suggestion="May affect rendering and SEO",
                    ))
                # Check for blocking image files (same end-of-pattern match)
                if disallow.lower().rstrip("$").endswith(
                        (".jpg", ".jpeg", ".png", ".gif", ".webp")):
                    result.issues.append(RobotsIssue(
                        severity="info",
                        message=f"Blocking image files for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                    ))
# Check crawl delay
if rules.crawl_delay:
if rules.crawl_delay > 10:
result.issues.append(RobotsIssue(
severity="warning",
message=f"High crawl-delay ({rules.crawl_delay}s) for {rules.user_agent}",
directive=f"Crawl-delay: {rules.crawl_delay}",
suggestion="May significantly slow indexing",
))
elif rules.crawl_delay > 0:
result.issues.append(RobotsIssue(
severity="info",
message=f"Crawl-delay set to {rules.crawl_delay}s for {rules.user_agent}",
))
def _analyze_sitemaps(self, result: RobotsResult) -> None:
"""Analyze sitemap declarations."""
if not result.sitemaps:
result.issues.append(RobotsIssue(
severity="warning",
message="No sitemap declared in robots.txt",
suggestion="Add Sitemap: directive to help crawlers find your sitemap",
))
else:
for sitemap in result.sitemaps:
if not sitemap.startswith("http"):
result.issues.append(RobotsIssue(
severity="warning",
message=f"Sitemap URL should be absolute: {sitemap}",
directive=f"Sitemap: {sitemap}",
))
def test_url(self, robots_url: str, test_path: str,
user_agent: str = "Googlebot") -> dict:
"""Test if a specific URL is allowed."""
# Use Python's built-in parser
rp = RobotFileParser()
# Ensure robots.txt URL
parsed = urlparse(robots_url)
if not parsed.path.endswith("robots.txt"):
robots_txt_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
else:
robots_txt_url = robots_url
rp.set_url(robots_txt_url)
try:
rp.read()
except Exception as e:
return {
"path": test_path,
"user_agent": user_agent,
"allowed": None,
"error": str(e),
}
# Build full URL for testing
base_url = f"{parsed.scheme}://{parsed.netloc}"
full_url = urljoin(base_url, test_path)
allowed = rp.can_fetch(user_agent, full_url)
return {
"path": test_path,
"user_agent": user_agent,
"allowed": allowed,
"full_url": full_url,
}
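    # Example (programmatic use; example.com is a placeholder):
    #     checker = RobotsChecker()
    #     verdict = checker.test_url("https://example.com", "/admin/")
    #     # -> {"path": "/admin/", "user_agent": "Googlebot",
    #     #     "allowed": <bool>, "full_url": "https://example.com/admin/"}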
def generate_report(self, result: RobotsResult) -> str:
"""Generate human-readable analysis report."""
lines = [
"=" * 60,
"Robots.txt Analysis Report",
"=" * 60,
f"URL: {result.url}",
f"Accessible: {'Yes' if result.accessible else 'No'}",
f"Timestamp: {result.timestamp}",
"",
]
if result.accessible:
lines.append("Statistics:")
for key, value in result.stats.items():
if key == "user_agents":
lines.append(f" {key}: {', '.join(value) if value else 'None'}")
else:
lines.append(f" {key}: {value}")
lines.append("")
if result.sitemaps:
lines.append(f"Sitemaps ({len(result.sitemaps)}):")
for sitemap in result.sitemaps:
lines.append(f" - {sitemap}")
lines.append("")
if result.rules:
lines.append("Rules Summary:")
for rules in result.rules:
lines.append(f"\n User-agent: {rules.user_agent}")
if rules.disallow:
lines.append(f" Disallow: {len(rules.disallow)} rules")
for d in rules.disallow[:5]:
lines.append(f" - {d}")
if len(rules.disallow) > 5:
lines.append(f" ... and {len(rules.disallow) - 5} more")
if rules.allow:
lines.append(f" Allow: {len(rules.allow)} rules")
for a in rules.allow[:3]:
lines.append(f" - {a}")
if rules.crawl_delay:
lines.append(f" Crawl-delay: {rules.crawl_delay}s")
lines.append("")
if result.issues:
lines.append("Issues Found:")
errors = [i for i in result.issues if i.severity == "error"]
warnings = [i for i in result.issues if i.severity == "warning"]
infos = [i for i in result.issues if i.severity == "info"]
if errors:
lines.append(f"\n ERRORS ({len(errors)}):")
for issue in errors:
lines.append(f" - {issue.message}")
if issue.directive:
lines.append(f" Directive: {issue.directive}")
if issue.suggestion:
lines.append(f" Suggestion: {issue.suggestion}")
if warnings:
lines.append(f"\n WARNINGS ({len(warnings)}):")
for issue in warnings:
lines.append(f" - {issue.message}")
if issue.suggestion:
lines.append(f" Suggestion: {issue.suggestion}")
if infos:
lines.append(f"\n INFO ({len(infos)}):")
for issue in infos:
lines.append(f" - {issue.message}")
lines.append("")
lines.append("=" * 60)
return "\n".join(lines)
def main():
"""Main entry point for CLI usage."""
parser = argparse.ArgumentParser(
description="Analyze robots.txt configuration",
)
parser.add_argument("--url", "-u", required=True,
help="URL to robots.txt or domain")
parser.add_argument("--test-url", "-t",
help="Test if specific URL path is allowed")
parser.add_argument("--user-agent", "-a", default="Googlebot",
help="User agent for testing (default: Googlebot)")
parser.add_argument("--output", "-o", help="Output file for JSON report")
parser.add_argument("--json", action="store_true", help="Output as JSON")
args = parser.parse_args()
checker = RobotsChecker()
if args.test_url:
# Test specific URL
test_result = checker.test_url(args.url, args.test_url, args.user_agent)
if args.json:
print(json.dumps(test_result, indent=2))
        else:
            if test_result["allowed"] is None:
                status = f"UNKNOWN ({test_result['error']})"
            else:
                status = "ALLOWED" if test_result["allowed"] else "BLOCKED"
            print(f"URL: {test_result['path']}")
            print(f"User-Agent: {test_result['user_agent']}")
            print(f"Status: {status}")
else:
# Full analysis
result = checker.analyze(args.url)
if args.json or args.output:
output = json.dumps(result.to_dict(), ensure_ascii=False, indent=2)
if args.output:
with open(args.output, "w", encoding="utf-8") as f:
f.write(output)
logger.info(f"Report written to {args.output}")
else:
print(output)
else:
print(checker.generate_report(result))
if __name__ == "__main__":
main()