""" Robots.txt Checker - Analyze robots.txt configuration ===================================================== Purpose: Parse and analyze robots.txt for SEO compliance Python: 3.10+ Usage: python robots_checker.py --url https://example.com/robots.txt python robots_checker.py --url https://example.com --test-url /admin/ """ import argparse import json import logging import re from dataclasses import dataclass, field from datetime import datetime from typing import Any from urllib.parse import urljoin, urlparse from urllib.robotparser import RobotFileParser import requests logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", ) logger = logging.getLogger(__name__) @dataclass class RobotsIssue: """Represents a robots.txt issue.""" severity: str # "error", "warning", "info" message: str line_number: int | None = None directive: str | None = None suggestion: str | None = None @dataclass class UserAgentRules: """Rules for a specific user-agent.""" user_agent: str disallow: list[str] = field(default_factory=list) allow: list[str] = field(default_factory=list) crawl_delay: float | None = None @dataclass class RobotsResult: """Complete robots.txt analysis result.""" url: str accessible: bool = True content: str = "" rules: list[UserAgentRules] = field(default_factory=list) sitemaps: list[str] = field(default_factory=list) issues: list[RobotsIssue] = field(default_factory=list) stats: dict = field(default_factory=dict) timestamp: str = field(default_factory=lambda: datetime.now().isoformat()) def to_dict(self) -> dict: """Convert to dictionary for JSON output.""" return { "url": self.url, "accessible": self.accessible, "sitemaps": self.sitemaps, "rules": [ { "user_agent": r.user_agent, "disallow": r.disallow, "allow": r.allow, "crawl_delay": r.crawl_delay, } for r in self.rules ], "issues": [ { "severity": i.severity, "message": i.message, "line_number": i.line_number, "directive": i.directive, "suggestion": i.suggestion, } for i in self.issues ], "stats": self.stats, "timestamp": self.timestamp, } class RobotsChecker: """Analyze robots.txt configuration.""" # Common user agents USER_AGENTS = { "*": "All bots", "Googlebot": "Google crawler", "Googlebot-Image": "Google Image crawler", "Googlebot-News": "Google News crawler", "Googlebot-Video": "Google Video crawler", "Bingbot": "Bing crawler", "Slurp": "Yahoo crawler", "DuckDuckBot": "DuckDuckGo crawler", "Baiduspider": "Baidu crawler", "Yandex": "Yandex crawler", "facebot": "Facebook crawler", "Twitterbot": "Twitter crawler", "LinkedInBot": "LinkedIn crawler", } # Paths that should generally not be blocked IMPORTANT_PATHS = [ "/", "/*.css", "/*.js", "/*.jpg", "/*.jpeg", "/*.png", "/*.gif", "/*.svg", "/*.webp", ] # Paths commonly blocked COMMON_BLOCKED = [ "/admin", "/wp-admin", "/login", "/private", "/api", "/cgi-bin", "/tmp", "/search", ] def __init__(self): self.session = requests.Session() self.session.headers.update({ "User-Agent": "Mozilla/5.0 (compatible; SEOAuditBot/1.0)" }) def fetch_robots(self, url: str) -> str | None: """Fetch robots.txt content.""" # Ensure we're fetching robots.txt parsed = urlparse(url) if not parsed.path.endswith("robots.txt"): robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt" else: robots_url = url try: response = self.session.get(robots_url, timeout=10) if response.status_code == 200: return response.text elif response.status_code == 404: return None else: raise RuntimeError(f"HTTP {response.status_code}") except requests.RequestException as e: raise RuntimeError(f"Failed to fetch robots.txt: {e}") def parse_robots(self, content: str) -> tuple[list[UserAgentRules], list[str]]: """Parse robots.txt content.""" rules = [] sitemaps = [] current_ua = None current_rules = None for line_num, line in enumerate(content.split("\n"), 1): line = line.strip() # Skip empty lines and comments if not line or line.startswith("#"): continue # Parse directive if ":" not in line: continue directive, value = line.split(":", 1) directive = directive.strip().lower() value = value.strip() if directive == "user-agent": # Save previous user-agent rules if current_rules: rules.append(current_rules) current_ua = value current_rules = UserAgentRules(user_agent=value) elif directive == "disallow" and current_rules: if value: # Empty disallow means allow all current_rules.disallow.append(value) elif directive == "allow" and current_rules: if value: current_rules.allow.append(value) elif directive == "crawl-delay" and current_rules: try: current_rules.crawl_delay = float(value) except ValueError: pass elif directive == "sitemap": if value: sitemaps.append(value) # Don't forget last user-agent if current_rules: rules.append(current_rules) return rules, sitemaps def analyze(self, url: str) -> RobotsResult: """Analyze robots.txt.""" result = RobotsResult(url=url) # Fetch robots.txt try: content = self.fetch_robots(url) if content is None: result.accessible = False result.issues.append(RobotsIssue( severity="info", message="No robots.txt found (returns 404)", suggestion="Consider creating a robots.txt file", )) return result except RuntimeError as e: result.accessible = False result.issues.append(RobotsIssue( severity="error", message=str(e), )) return result result.content = content result.rules, result.sitemaps = self.parse_robots(content) # Analyze content self._analyze_syntax(result) self._analyze_rules(result) self._analyze_sitemaps(result) # Calculate stats result.stats = { "user_agents_count": len(result.rules), "user_agents": [r.user_agent for r in result.rules], "total_disallow_rules": sum(len(r.disallow) for r in result.rules), "total_allow_rules": sum(len(r.allow) for r in result.rules), "sitemaps_count": len(result.sitemaps), "has_crawl_delay": any(r.crawl_delay for r in result.rules), "content_length": len(content), } return result def _analyze_syntax(self, result: RobotsResult) -> None: """Check for syntax issues.""" lines = result.content.split("\n") for line_num, line in enumerate(lines, 1): line = line.strip() # Skip empty lines and comments if not line or line.startswith("#"): continue # Check for valid directive if ":" not in line: result.issues.append(RobotsIssue( severity="warning", message=f"Invalid line (missing colon): {line[:50]}", line_number=line_num, )) continue directive, value = line.split(":", 1) directive = directive.strip().lower() valid_directives = { "user-agent", "disallow", "allow", "crawl-delay", "sitemap", "host", } if directive not in valid_directives: result.issues.append(RobotsIssue( severity="info", message=f"Unknown directive: {directive}", line_number=line_num, directive=directive, )) def _analyze_rules(self, result: RobotsResult) -> None: """Analyze blocking rules.""" # Check if there are any rules if not result.rules: result.issues.append(RobotsIssue( severity="info", message="No user-agent rules defined", suggestion="Add User-agent: * rules to control crawling", )) return # Check for wildcard rule has_wildcard = any(r.user_agent == "*" for r in result.rules) if not has_wildcard: result.issues.append(RobotsIssue( severity="info", message="No wildcard (*) user-agent defined", suggestion="Consider adding User-agent: * as fallback", )) # Check for blocking important resources for rules in result.rules: for disallow in rules.disallow: # Check if blocking root if disallow == "/": result.issues.append(RobotsIssue( severity="error", message=f"Blocking entire site for {rules.user_agent}", directive=f"Disallow: {disallow}", suggestion="This will prevent indexing. Is this intentional?", )) # Check if blocking CSS/JS if any(ext in disallow.lower() for ext in [".css", ".js"]): result.issues.append(RobotsIssue( severity="warning", message=f"Blocking CSS/JS files for {rules.user_agent}", directive=f"Disallow: {disallow}", suggestion="May affect rendering and SEO", )) # Check for blocking images if any(ext in disallow.lower() for ext in [".jpg", ".png", ".gif", ".webp"]): result.issues.append(RobotsIssue( severity="info", message=f"Blocking image files for {rules.user_agent}", directive=f"Disallow: {disallow}", )) # Check crawl delay if rules.crawl_delay: if rules.crawl_delay > 10: result.issues.append(RobotsIssue( severity="warning", message=f"High crawl-delay ({rules.crawl_delay}s) for {rules.user_agent}", directive=f"Crawl-delay: {rules.crawl_delay}", suggestion="May significantly slow indexing", )) elif rules.crawl_delay > 0: result.issues.append(RobotsIssue( severity="info", message=f"Crawl-delay set to {rules.crawl_delay}s for {rules.user_agent}", )) def _analyze_sitemaps(self, result: RobotsResult) -> None: """Analyze sitemap declarations.""" if not result.sitemaps: result.issues.append(RobotsIssue( severity="warning", message="No sitemap declared in robots.txt", suggestion="Add Sitemap: directive to help crawlers find your sitemap", )) else: for sitemap in result.sitemaps: if not sitemap.startswith("http"): result.issues.append(RobotsIssue( severity="warning", message=f"Sitemap URL should be absolute: {sitemap}", directive=f"Sitemap: {sitemap}", )) def test_url(self, robots_url: str, test_path: str, user_agent: str = "Googlebot") -> dict: """Test if a specific URL is allowed.""" # Use Python's built-in parser rp = RobotFileParser() # Ensure robots.txt URL parsed = urlparse(robots_url) if not parsed.path.endswith("robots.txt"): robots_txt_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt" else: robots_txt_url = robots_url rp.set_url(robots_txt_url) try: rp.read() except Exception as e: return { "path": test_path, "user_agent": user_agent, "allowed": None, "error": str(e), } # Build full URL for testing base_url = f"{parsed.scheme}://{parsed.netloc}" full_url = urljoin(base_url, test_path) allowed = rp.can_fetch(user_agent, full_url) return { "path": test_path, "user_agent": user_agent, "allowed": allowed, "full_url": full_url, } def generate_report(self, result: RobotsResult) -> str: """Generate human-readable analysis report.""" lines = [ "=" * 60, "Robots.txt Analysis Report", "=" * 60, f"URL: {result.url}", f"Accessible: {'Yes' if result.accessible else 'No'}", f"Timestamp: {result.timestamp}", "", ] if result.accessible: lines.append("Statistics:") for key, value in result.stats.items(): if key == "user_agents": lines.append(f" {key}: {', '.join(value) if value else 'None'}") else: lines.append(f" {key}: {value}") lines.append("") if result.sitemaps: lines.append(f"Sitemaps ({len(result.sitemaps)}):") for sitemap in result.sitemaps: lines.append(f" - {sitemap}") lines.append("") if result.rules: lines.append("Rules Summary:") for rules in result.rules: lines.append(f"\n User-agent: {rules.user_agent}") if rules.disallow: lines.append(f" Disallow: {len(rules.disallow)} rules") for d in rules.disallow[:5]: lines.append(f" - {d}") if len(rules.disallow) > 5: lines.append(f" ... and {len(rules.disallow) - 5} more") if rules.allow: lines.append(f" Allow: {len(rules.allow)} rules") for a in rules.allow[:3]: lines.append(f" - {a}") if rules.crawl_delay: lines.append(f" Crawl-delay: {rules.crawl_delay}s") lines.append("") if result.issues: lines.append("Issues Found:") errors = [i for i in result.issues if i.severity == "error"] warnings = [i for i in result.issues if i.severity == "warning"] infos = [i for i in result.issues if i.severity == "info"] if errors: lines.append(f"\n ERRORS ({len(errors)}):") for issue in errors: lines.append(f" - {issue.message}") if issue.directive: lines.append(f" Directive: {issue.directive}") if issue.suggestion: lines.append(f" Suggestion: {issue.suggestion}") if warnings: lines.append(f"\n WARNINGS ({len(warnings)}):") for issue in warnings: lines.append(f" - {issue.message}") if issue.suggestion: lines.append(f" Suggestion: {issue.suggestion}") if infos: lines.append(f"\n INFO ({len(infos)}):") for issue in infos: lines.append(f" - {issue.message}") lines.append("") lines.append("=" * 60) return "\n".join(lines) def main(): """Main entry point for CLI usage.""" parser = argparse.ArgumentParser( description="Analyze robots.txt configuration", ) parser.add_argument("--url", "-u", required=True, help="URL to robots.txt or domain") parser.add_argument("--test-url", "-t", help="Test if specific URL path is allowed") parser.add_argument("--user-agent", "-a", default="Googlebot", help="User agent for testing (default: Googlebot)") parser.add_argument("--output", "-o", help="Output file for JSON report") parser.add_argument("--json", action="store_true", help="Output as JSON") args = parser.parse_args() checker = RobotsChecker() if args.test_url: # Test specific URL test_result = checker.test_url(args.url, args.test_url, args.user_agent) if args.json: print(json.dumps(test_result, indent=2)) else: status = "ALLOWED" if test_result["allowed"] else "BLOCKED" print(f"URL: {test_result['path']}") print(f"User-Agent: {test_result['user_agent']}") print(f"Status: {status}") else: # Full analysis result = checker.analyze(args.url) if args.json or args.output: output = json.dumps(result.to_dict(), ensure_ascii=False, indent=2) if args.output: with open(args.output, "w", encoding="utf-8") as f: f.write(output) logger.info(f"Report written to {args.output}") else: print(output) else: print(checker.generate_report(result)) if __name__ == "__main__": main()