"""
|
|
Robots.txt Checker - Analyze robots.txt configuration
|
|
=====================================================
|
|
Purpose: Parse and analyze robots.txt for SEO compliance
|
|
Python: 3.10+
|
|
Usage:
|
|
python robots_checker.py --url https://example.com/robots.txt
|
|
python robots_checker.py --url https://example.com --test-url /admin/
|
|
"""
|
|
|
|
import argparse
import json
import logging
from dataclasses import dataclass, field
from datetime import datetime
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser

import requests

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


@dataclass
class RobotsIssue:
    """Represents a robots.txt issue."""

    severity: str  # "error", "warning", "info"
    message: str
    line_number: int | None = None
    directive: str | None = None
    suggestion: str | None = None


@dataclass
class UserAgentRules:
    """Rules for a specific user-agent."""

    user_agent: str
    disallow: list[str] = field(default_factory=list)
    allow: list[str] = field(default_factory=list)
    crawl_delay: float | None = None


@dataclass
class RobotsResult:
    """Complete robots.txt analysis result."""

    url: str
    accessible: bool = True
    content: str = ""
    rules: list[UserAgentRules] = field(default_factory=list)
    sitemaps: list[str] = field(default_factory=list)
    issues: list[RobotsIssue] = field(default_factory=list)
    stats: dict = field(default_factory=dict)
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON output."""
        return {
            "url": self.url,
            "accessible": self.accessible,
            "sitemaps": self.sitemaps,
            "rules": [
                {
                    "user_agent": r.user_agent,
                    "disallow": r.disallow,
                    "allow": r.allow,
                    "crawl_delay": r.crawl_delay,
                }
                for r in self.rules
            ],
            "issues": [
                {
                    "severity": i.severity,
                    "message": i.message,
                    "line_number": i.line_number,
                    "directive": i.directive,
                    "suggestion": i.suggestion,
                }
                for i in self.issues
            ],
            "stats": self.stats,
            "timestamp": self.timestamp,
        }


class RobotsChecker:
    """Analyze robots.txt configuration."""

    # Common user agents
    USER_AGENTS = {
        "*": "All bots",
        "Googlebot": "Google crawler",
        "Googlebot-Image": "Google Image crawler",
        "Googlebot-News": "Google News crawler",
        "Googlebot-Video": "Google Video crawler",
        "Bingbot": "Bing crawler",
        "Slurp": "Yahoo crawler",
        "DuckDuckBot": "DuckDuckGo crawler",
        "Baiduspider": "Baidu crawler",
        "Yandex": "Yandex crawler",
        "facebot": "Facebook crawler",
        "Twitterbot": "Twitter crawler",
        "LinkedInBot": "LinkedIn crawler",
    }

    # Paths that should generally not be blocked
    IMPORTANT_PATHS = [
        "/",
        "/*.css",
        "/*.js",
        "/*.jpg",
        "/*.jpeg",
        "/*.png",
        "/*.gif",
        "/*.svg",
        "/*.webp",
    ]

    # Paths commonly blocked
    COMMON_BLOCKED = [
        "/admin",
        "/wp-admin",
        "/login",
        "/private",
        "/api",
        "/cgi-bin",
        "/tmp",
        "/search",
    ]

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (compatible; SEOAuditBot/1.0)"
        })

    def fetch_robots(self, url: str) -> str | None:
        """Fetch robots.txt content."""
        # Ensure we're fetching robots.txt
        parsed = urlparse(url)
        if not parsed.path.endswith("robots.txt"):
            robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        else:
            robots_url = url

        try:
            response = self.session.get(robots_url, timeout=10)
            if response.status_code == 200:
                return response.text
            elif response.status_code == 404:
                return None
            else:
                raise RuntimeError(f"HTTP {response.status_code}")
        except requests.RequestException as e:
            raise RuntimeError(f"Failed to fetch robots.txt: {e}") from e

    def parse_robots(self, content: str) -> tuple[list[UserAgentRules], list[str]]:
        """Parse robots.txt content."""
        rules = []
        sitemaps = []
        current_rules = None

        for line in content.split("\n"):
            line = line.strip()

            # Skip empty lines and comments
            if not line or line.startswith("#"):
                continue

            # Skip lines without a directive
            if ":" not in line:
                continue

            directive, value = line.split(":", 1)
            directive = directive.strip().lower()
            value = value.strip()

            if directive == "user-agent":
                # Save the previous user-agent's rules before starting a new group
                if current_rules:
                    rules.append(current_rules)
                current_rules = UserAgentRules(user_agent=value)

            elif directive == "disallow" and current_rules:
                if value:  # Empty disallow means allow all
                    current_rules.disallow.append(value)

            elif directive == "allow" and current_rules:
                if value:
                    current_rules.allow.append(value)

            elif directive == "crawl-delay" and current_rules:
                try:
                    current_rules.crawl_delay = float(value)
                except ValueError:
                    pass

            elif directive == "sitemap":
                if value:
                    sitemaps.append(value)

        # Don't forget the last user-agent group
        if current_rules:
            rules.append(current_rules)

        return rules, sitemaps

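    # Illustrative parse example (assumed input, shown as a comment only, not
    # executed): for a robots.txt containing
    #
    #     User-agent: *
    #     Disallow: /admin
    #     Crawl-delay: 5
    #     Sitemap: https://example.com/sitemap.xml
    #
    # parse_robots() would return a single
    # UserAgentRules(user_agent="*", disallow=["/admin"], allow=[], crawl_delay=5.0)
    # together with the sitemap list ["https://example.com/sitemap.xml"].
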
    def analyze(self, url: str) -> RobotsResult:
        """Analyze robots.txt."""
        result = RobotsResult(url=url)

        # Fetch robots.txt
        try:
            content = self.fetch_robots(url)
            if content is None:
                result.accessible = False
                result.issues.append(RobotsIssue(
                    severity="info",
                    message="No robots.txt found (returns 404)",
                    suggestion="Consider creating a robots.txt file",
                ))
                return result
        except RuntimeError as e:
            result.accessible = False
            result.issues.append(RobotsIssue(
                severity="error",
                message=str(e),
            ))
            return result

        result.content = content
        result.rules, result.sitemaps = self.parse_robots(content)

        # Analyze content
        self._analyze_syntax(result)
        self._analyze_rules(result)
        self._analyze_sitemaps(result)

        # Calculate stats
        result.stats = {
            "user_agents_count": len(result.rules),
            "user_agents": [r.user_agent for r in result.rules],
            "total_disallow_rules": sum(len(r.disallow) for r in result.rules),
            "total_allow_rules": sum(len(r.allow) for r in result.rules),
            "sitemaps_count": len(result.sitemaps),
            "has_crawl_delay": any(r.crawl_delay for r in result.rules),
            "content_length": len(content),
        }

        return result

    def _analyze_syntax(self, result: RobotsResult) -> None:
        """Check for syntax issues."""
        lines = result.content.split("\n")

        valid_directives = {
            "user-agent", "disallow", "allow",
            "crawl-delay", "sitemap", "host",
        }

        for line_num, line in enumerate(lines, 1):
            line = line.strip()

            # Skip empty lines and comments
            if not line or line.startswith("#"):
                continue

            # Check for valid directive
            if ":" not in line:
                result.issues.append(RobotsIssue(
                    severity="warning",
                    message=f"Invalid line (missing colon): {line[:50]}",
                    line_number=line_num,
                ))
                continue

            directive = line.split(":", 1)[0].strip().lower()

            if directive not in valid_directives:
                result.issues.append(RobotsIssue(
                    severity="info",
                    message=f"Unknown directive: {directive}",
                    line_number=line_num,
                    directive=directive,
                ))

    def _analyze_rules(self, result: RobotsResult) -> None:
        """Analyze blocking rules."""
        # Check if there are any rules
        if not result.rules:
            result.issues.append(RobotsIssue(
                severity="info",
                message="No user-agent rules defined",
                suggestion="Add User-agent: * rules to control crawling",
            ))
            return

        # Check for wildcard rule
        has_wildcard = any(r.user_agent == "*" for r in result.rules)
        if not has_wildcard:
            result.issues.append(RobotsIssue(
                severity="info",
                message="No wildcard (*) user-agent defined",
                suggestion="Consider adding User-agent: * as fallback",
            ))

        # Check for blocking important resources
        for rules in result.rules:
            for disallow in rules.disallow:
                # Check if blocking root
                if disallow == "/":
                    result.issues.append(RobotsIssue(
                        severity="error",
                        message=f"Blocking entire site for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                        suggestion="This will prevent indexing. Is this intentional?",
                    ))

                # Check if blocking CSS/JS
                if any(ext in disallow.lower() for ext in [".css", ".js"]):
                    result.issues.append(RobotsIssue(
                        severity="warning",
                        message=f"Blocking CSS/JS files for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                        suggestion="May affect rendering and SEO",
                    ))

                # Check for blocking images
                if any(ext in disallow.lower() for ext in [".jpg", ".png", ".gif", ".webp"]):
                    result.issues.append(RobotsIssue(
                        severity="info",
                        message=f"Blocking image files for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                    ))

            # Check crawl delay
            if rules.crawl_delay:
                if rules.crawl_delay > 10:
                    result.issues.append(RobotsIssue(
                        severity="warning",
                        message=f"High crawl-delay ({rules.crawl_delay}s) for {rules.user_agent}",
                        directive=f"Crawl-delay: {rules.crawl_delay}",
                        suggestion="May significantly slow indexing",
                    ))
                elif rules.crawl_delay > 0:
                    result.issues.append(RobotsIssue(
                        severity="info",
                        message=f"Crawl-delay set to {rules.crawl_delay}s for {rules.user_agent}",
                    ))

    def _analyze_sitemaps(self, result: RobotsResult) -> None:
        """Analyze sitemap declarations."""
        if not result.sitemaps:
            result.issues.append(RobotsIssue(
                severity="warning",
                message="No sitemap declared in robots.txt",
                suggestion="Add Sitemap: directive to help crawlers find your sitemap",
            ))
        else:
            for sitemap in result.sitemaps:
                if not sitemap.startswith("http"):
                    result.issues.append(RobotsIssue(
                        severity="warning",
                        message=f"Sitemap URL should be absolute: {sitemap}",
                        directive=f"Sitemap: {sitemap}",
                    ))

    def test_url(self, robots_url: str, test_path: str,
                 user_agent: str = "Googlebot") -> dict:
        """Test if a specific URL is allowed."""
        # Use Python's built-in parser
        rp = RobotFileParser()

        # Ensure robots.txt URL
        parsed = urlparse(robots_url)
        if not parsed.path.endswith("robots.txt"):
            robots_txt_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        else:
            robots_txt_url = robots_url

        rp.set_url(robots_txt_url)
        try:
            rp.read()
        except Exception as e:
            return {
                "path": test_path,
                "user_agent": user_agent,
                "allowed": None,
                "error": str(e),
            }

        # Build full URL for testing
        base_url = f"{parsed.scheme}://{parsed.netloc}"
        full_url = urljoin(base_url, test_path)

        allowed = rp.can_fetch(user_agent, full_url)

        return {
            "path": test_path,
            "user_agent": user_agent,
            "allowed": allowed,
            "full_url": full_url,
        }

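    # Usage sketch (illustrative; the verdict depends on the live robots.txt
    # that RobotFileParser fetches):
    #
    #     checker = RobotsChecker()
    #     verdict = checker.test_url("https://example.com", "/admin/", "Googlebot")
    #     # e.g. {"path": "/admin/", "user_agent": "Googlebot",
    #     #       "allowed": False, "full_url": "https://example.com/admin/"}
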
    def generate_report(self, result: RobotsResult) -> str:
        """Generate human-readable analysis report."""
        lines = [
            "=" * 60,
            "Robots.txt Analysis Report",
            "=" * 60,
            f"URL: {result.url}",
            f"Accessible: {'Yes' if result.accessible else 'No'}",
            f"Timestamp: {result.timestamp}",
            "",
        ]

        if result.accessible:
            lines.append("Statistics:")
            for key, value in result.stats.items():
                if key == "user_agents":
                    lines.append(f"  {key}: {', '.join(value) if value else 'None'}")
                else:
                    lines.append(f"  {key}: {value}")
            lines.append("")

        if result.sitemaps:
            lines.append(f"Sitemaps ({len(result.sitemaps)}):")
            for sitemap in result.sitemaps:
                lines.append(f"  - {sitemap}")
            lines.append("")

        if result.rules:
            lines.append("Rules Summary:")
            for rules in result.rules:
                lines.append(f"\n  User-agent: {rules.user_agent}")
                if rules.disallow:
                    lines.append(f"    Disallow: {len(rules.disallow)} rules")
                    for d in rules.disallow[:5]:
                        lines.append(f"      - {d}")
                    if len(rules.disallow) > 5:
                        lines.append(f"      ... and {len(rules.disallow) - 5} more")
                if rules.allow:
                    lines.append(f"    Allow: {len(rules.allow)} rules")
                    for a in rules.allow[:3]:
                        lines.append(f"      - {a}")
                if rules.crawl_delay:
                    lines.append(f"    Crawl-delay: {rules.crawl_delay}s")
            lines.append("")

        if result.issues:
            lines.append("Issues Found:")
            errors = [i for i in result.issues if i.severity == "error"]
            warnings = [i for i in result.issues if i.severity == "warning"]
            infos = [i for i in result.issues if i.severity == "info"]

            if errors:
                lines.append(f"\n  ERRORS ({len(errors)}):")
                for issue in errors:
                    lines.append(f"    - {issue.message}")
                    if issue.directive:
                        lines.append(f"      Directive: {issue.directive}")
                    if issue.suggestion:
                        lines.append(f"      Suggestion: {issue.suggestion}")

            if warnings:
                lines.append(f"\n  WARNINGS ({len(warnings)}):")
                for issue in warnings:
                    lines.append(f"    - {issue.message}")
                    if issue.suggestion:
                        lines.append(f"      Suggestion: {issue.suggestion}")

            if infos:
                lines.append(f"\n  INFO ({len(infos)}):")
                for issue in infos:
                    lines.append(f"    - {issue.message}")

        lines.append("")
        lines.append("=" * 60)

        return "\n".join(lines)


def main():
    """Main entry point for CLI usage."""
    parser = argparse.ArgumentParser(
        description="Analyze robots.txt configuration",
    )
    parser.add_argument("--url", "-u", required=True,
                        help="URL to robots.txt or domain")
    parser.add_argument("--test-url", "-t",
                        help="Test if specific URL path is allowed")
    parser.add_argument("--user-agent", "-a", default="Googlebot",
                        help="User agent for testing (default: Googlebot)")
    parser.add_argument("--output", "-o", help="Output file for JSON report")
    parser.add_argument("--json", action="store_true", help="Output as JSON")

    args = parser.parse_args()

    checker = RobotsChecker()

    if args.test_url:
        # Test specific URL
        test_result = checker.test_url(args.url, args.test_url, args.user_agent)
        if args.json:
            print(json.dumps(test_result, indent=2))
        else:
            if test_result["allowed"] is None:
                status = f"UNKNOWN ({test_result.get('error', 'no result')})"
            elif test_result["allowed"]:
                status = "ALLOWED"
            else:
                status = "BLOCKED"
            print(f"URL: {test_result['path']}")
            print(f"User-Agent: {test_result['user_agent']}")
            print(f"Status: {status}")
    else:
        # Full analysis
        result = checker.analyze(args.url)

        if args.json or args.output:
            output = json.dumps(result.to_dict(), ensure_ascii=False, indent=2)
            if args.output:
                with open(args.output, "w", encoding="utf-8") as f:
                    f.write(output)
                logger.info(f"Report written to {args.output}")
            else:
                print(output)
        else:
            print(checker.generate_report(result))


if __name__ == "__main__":
    main()
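
# Example CLI invocation (illustrative only; the actual verdict depends on the
# target site's live robots.txt):
#
#     $ python robots_checker.py --url https://example.com --test-url /admin/
#     URL: /admin/
#     User-Agent: Googlebot
#     Status: BLOCKED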