Restructure skill numbering: SEO 11-30, GTM 60-69, reserve 19-28 for future skills
Renumber 12 existing skills to new ranges — SEO: 11→13, 12→18, 13→16, 14→17, 15→14, 16→15, 17→29, 18→30, 19→12; GTM: 20→60, 21→61, 22→62. Update cross-references in the gateway architect/builder skills, the GTM guardian README, CLAUDE.md (skill tables and directory layout), and AGENTS.md (domain routing ranges). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,540 @@
|
||||
"""
|
||||
Robots.txt Checker - Analyze robots.txt configuration
|
||||
=====================================================
|
||||
Purpose: Parse and analyze robots.txt for SEO compliance
|
||||
Python: 3.10+
|
||||
Usage:
|
||||
python robots_checker.py --url https://example.com/robots.txt
|
||||
python robots_checker.py --url https://example.com --test-url /admin/
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from urllib.robotparser import RobotFileParser
|
||||
|
||||
import requests
|
||||
|
||||
# Module-level logging: timestamped INFO output for CLI runs.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class RobotsIssue:
    """A single finding from robots.txt analysis.

    Severity levels used by the checker: "error", "warning", "info".
    """

    severity: str  # "error", "warning", "info"
    message: str  # human-readable description of the finding
    line_number: int | None = None  # 1-based line in robots.txt, if known
    directive: str | None = None  # offending directive text, e.g. "Disallow: /"
    suggestion: str | None = None  # optional remediation hint
|
||||
|
||||
|
||||
@dataclass
class UserAgentRules:
    """Directives collected for one User-agent entry in robots.txt."""

    user_agent: str  # value of the User-agent line (e.g. "*", "Googlebot")
    disallow: list[str] = field(default_factory=list)  # Disallow path patterns
    allow: list[str] = field(default_factory=list)  # Allow path patterns
    crawl_delay: float | None = None  # seconds, from Crawl-delay (non-standard)
|
||||
|
||||
|
||||
@dataclass
class RobotsResult:
    """Complete robots.txt analysis result."""

    url: str
    accessible: bool = True
    content: str = ""
    rules: list[UserAgentRules] = field(default_factory=list)
    sitemaps: list[str] = field(default_factory=list)
    issues: list[RobotsIssue] = field(default_factory=list)
    stats: dict = field(default_factory=dict)
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON output.

        The raw file body (``content``) is not serialized.
        """
        serialized_rules = [
            {
                "user_agent": group.user_agent,
                "disallow": group.disallow,
                "allow": group.allow,
                "crawl_delay": group.crawl_delay,
            }
            for group in self.rules
        ]
        serialized_issues = [
            {
                "severity": issue.severity,
                "message": issue.message,
                "line_number": issue.line_number,
                "directive": issue.directive,
                "suggestion": issue.suggestion,
            }
            for issue in self.issues
        ]
        return {
            "url": self.url,
            "accessible": self.accessible,
            "sitemaps": self.sitemaps,
            "rules": serialized_rules,
            "issues": serialized_issues,
            "stats": self.stats,
            "timestamp": self.timestamp,
        }
|
||||
|
||||
|
||||
class RobotsChecker:
    """Analyze a site's robots.txt for syntax, rule, and sitemap issues.

    Typical use::

        checker = RobotsChecker()
        result = checker.analyze("https://example.com")
        print(checker.generate_report(result))
    """

    # Common user agents (reference table; currently informational only).
    USER_AGENTS = {
        "*": "All bots",
        "Googlebot": "Google crawler",
        "Googlebot-Image": "Google Image crawler",
        "Googlebot-News": "Google News crawler",
        "Googlebot-Video": "Google Video crawler",
        "Bingbot": "Bing crawler",
        "Slurp": "Yahoo crawler",
        "DuckDuckBot": "DuckDuckGo crawler",
        "Baiduspider": "Baidu crawler",
        "Yandex": "Yandex crawler",
        "facebot": "Facebook crawler",
        "Twitterbot": "Twitter crawler",
        "LinkedInBot": "LinkedIn crawler",
    }

    # Paths that should generally not be blocked (informational; the
    # analyzers currently check CSS/JS/image extensions directly).
    IMPORTANT_PATHS = [
        "/",
        "/*.css",
        "/*.js",
        "/*.jpg",
        "/*.jpeg",
        "/*.png",
        "/*.gif",
        "/*.svg",
        "/*.webp",
    ]

    # Paths commonly blocked on real sites (informational).
    COMMON_BLOCKED = [
        "/admin",
        "/wp-admin",
        "/login",
        "/private",
        "/api",
        "/cgi-bin",
        "/tmp",
        "/search",
    ]

    # Directives recognized by major crawlers ("host" is Yandex-specific).
    # Hoisted to a class constant so _analyze_syntax does not rebuild the
    # set on every line.
    VALID_DIRECTIVES = frozenset({
        "user-agent", "disallow", "allow",
        "crawl-delay", "sitemap", "host",
    })

    def __init__(self):
        # One shared session gives connection reuse across requests.
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (compatible; SEOAuditBot/1.0)"
        })

    def fetch_robots(self, url: str) -> str | None:
        """Fetch robots.txt content.

        Accepts either a full robots.txt URL or any URL on the target host;
        anything else is normalized to ``<scheme>://<host>/robots.txt``.

        Returns:
            The file body as text, or None when the server answers 404.

        Raises:
            RuntimeError: for network errors or unexpected status codes.
        """
        parsed = urlparse(url)
        if not parsed.path.endswith("robots.txt"):
            robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        else:
            robots_url = url

        try:
            response = self.session.get(robots_url, timeout=10)
            if response.status_code == 200:
                return response.text
            elif response.status_code == 404:
                # A missing robots.txt is not an error; the caller reports it.
                return None
            else:
                raise RuntimeError(f"HTTP {response.status_code}")
        except requests.RequestException as e:
            # Chain the original exception so the network cause is preserved.
            raise RuntimeError(f"Failed to fetch robots.txt: {e}") from e

    def parse_robots(self, content: str) -> tuple[list[UserAgentRules], list[str]]:
        """Parse robots.txt content into per-agent rules and sitemap URLs.

        Per RFC 9309 (section 2.2.1), consecutive User-agent lines form one
        group, and the rule lines that follow apply to every agent listed.
        Each agent still gets its own UserAgentRules entry (rules are copied
        into each), so the return shape is unchanged.
        """
        rules: list[UserAgentRules] = []
        sitemaps: list[str] = []
        active_group: list[UserAgentRules] = []  # agents receiving new rules
        collecting_agents = False  # True while reading consecutive UA lines

        for raw_line in content.split("\n"):
            line = raw_line.strip()

            # Skip blanks, comments, and lines without a directive separator.
            if not line or line.startswith("#") or ":" not in line:
                continue

            directive, value = line.split(":", 1)
            directive = directive.strip().lower()
            value = value.strip()

            if directive == "user-agent":
                if not collecting_agents:
                    # A User-agent after rule lines starts a fresh group.
                    active_group = []
                    collecting_agents = True
                entry = UserAgentRules(user_agent=value)
                rules.append(entry)
                active_group.append(entry)

            elif directive == "disallow" and active_group:
                collecting_agents = False
                if value:  # an empty Disallow means "allow everything"
                    for entry in active_group:
                        entry.disallow.append(value)

            elif directive == "allow" and active_group:
                collecting_agents = False
                if value:
                    for entry in active_group:
                        entry.allow.append(value)

            elif directive == "crawl-delay" and active_group:
                collecting_agents = False
                try:
                    delay = float(value)
                except ValueError:
                    pass  # ignore malformed delay values (previous behavior)
                else:
                    for entry in active_group:
                        entry.crawl_delay = delay

            elif directive == "sitemap":
                if value:
                    sitemaps.append(value)

        return rules, sitemaps

    def analyze(self, url: str) -> RobotsResult:
        """Run the full robots.txt analysis pipeline for *url*."""
        result = RobotsResult(url=url)

        # Fetch robots.txt; fetch problems become issues, not exceptions.
        try:
            content = self.fetch_robots(url)
            if content is None:
                result.accessible = False
                result.issues.append(RobotsIssue(
                    severity="info",
                    message="No robots.txt found (returns 404)",
                    suggestion="Consider creating a robots.txt file",
                ))
                return result
        except RuntimeError as e:
            result.accessible = False
            result.issues.append(RobotsIssue(
                severity="error",
                message=str(e),
            ))
            return result

        result.content = content
        result.rules, result.sitemaps = self.parse_robots(content)

        # Run the individual analyzers; each appends to result.issues.
        self._analyze_syntax(result)
        self._analyze_rules(result)
        self._analyze_sitemaps(result)

        # Summary statistics for reports and JSON output.
        result.stats = {
            "user_agents_count": len(result.rules),
            "user_agents": [r.user_agent for r in result.rules],
            "total_disallow_rules": sum(len(r.disallow) for r in result.rules),
            "total_allow_rules": sum(len(r.allow) for r in result.rules),
            "sitemaps_count": len(result.sitemaps),
            "has_crawl_delay": any(r.crawl_delay for r in result.rules),
            "content_length": len(content),
        }

        return result

    def _analyze_syntax(self, result: RobotsResult) -> None:
        """Flag malformed lines and unknown directives."""
        for line_num, raw_line in enumerate(result.content.split("\n"), 1):
            line = raw_line.strip()

            # Blank lines and comments are always valid.
            if not line or line.startswith("#"):
                continue

            if ":" not in line:
                result.issues.append(RobotsIssue(
                    severity="warning",
                    message=f"Invalid line (missing colon): {line[:50]}",
                    line_number=line_num,
                ))
                continue

            directive = line.split(":", 1)[0].strip().lower()

            if directive not in self.VALID_DIRECTIVES:
                result.issues.append(RobotsIssue(
                    severity="info",
                    message=f"Unknown directive: {directive}",
                    line_number=line_num,
                    directive=directive,
                ))

    def _analyze_rules(self, result: RobotsResult) -> None:
        """Flag risky blocking rules: site-wide blocks, CSS/JS/image blocks,
        and crawl delays."""
        if not result.rules:
            result.issues.append(RobotsIssue(
                severity="info",
                message="No user-agent rules defined",
                suggestion="Add User-agent: * rules to control crawling",
            ))
            return

        # A wildcard group is the fallback for crawlers not listed explicitly.
        if not any(r.user_agent == "*" for r in result.rules):
            result.issues.append(RobotsIssue(
                severity="info",
                message="No wildcard (*) user-agent defined",
                suggestion="Consider adding User-agent: * as fallback",
            ))

        for rules in result.rules:
            for disallow in rules.disallow:
                # "Disallow: /" blocks the whole site for this agent.
                if disallow == "/":
                    result.issues.append(RobotsIssue(
                        severity="error",
                        message=f"Blocking entire site for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                        suggestion="This will prevent indexing. Is this intentional?",
                    ))

                # Blocked CSS/JS can keep crawlers from rendering pages.
                if any(ext in disallow.lower() for ext in [".css", ".js"]):
                    result.issues.append(RobotsIssue(
                        severity="warning",
                        message=f"Blocking CSS/JS files for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                        suggestion="May affect rendering and SEO",
                    ))

                # Blocked images are usually deliberate, so info only.
                if any(ext in disallow.lower() for ext in [".jpg", ".png", ".gif", ".webp"]):
                    result.issues.append(RobotsIssue(
                        severity="info",
                        message=f"Blocking image files for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                    ))

            if rules.crawl_delay:
                if rules.crawl_delay > 10:
                    result.issues.append(RobotsIssue(
                        severity="warning",
                        message=f"High crawl-delay ({rules.crawl_delay}s) for {rules.user_agent}",
                        directive=f"Crawl-delay: {rules.crawl_delay}",
                        suggestion="May significantly slow indexing",
                    ))
                elif rules.crawl_delay > 0:
                    result.issues.append(RobotsIssue(
                        severity="info",
                        message=f"Crawl-delay set to {rules.crawl_delay}s for {rules.user_agent}",
                    ))

    def _analyze_sitemaps(self, result: RobotsResult) -> None:
        """Flag missing sitemap declarations and relative sitemap URLs."""
        if not result.sitemaps:
            result.issues.append(RobotsIssue(
                severity="warning",
                message="No sitemap declared in robots.txt",
                suggestion="Add Sitemap: directive to help crawlers find your sitemap",
            ))
        else:
            for sitemap in result.sitemaps:
                # The Sitemap directive requires an absolute URL.
                if not sitemap.startswith("http"):
                    result.issues.append(RobotsIssue(
                        severity="warning",
                        message=f"Sitemap URL should be absolute: {sitemap}",
                        directive=f"Sitemap: {sitemap}",
                    ))

    def test_url(self, robots_url: str, test_path: str,
                 user_agent: str = "Googlebot") -> dict:
        """Test if a specific URL path is allowed for *user_agent*.

        Uses Python's built-in urllib.robotparser, which fetches robots.txt
        itself. Returns a dict with "allowed" set to True/False, or None
        (plus an "error" key) when the robots.txt could not be read.
        """
        rp = RobotFileParser()

        # Normalize to the host's robots.txt URL.
        parsed = urlparse(robots_url)
        if not parsed.path.endswith("robots.txt"):
            robots_txt_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        else:
            robots_txt_url = robots_url

        rp.set_url(robots_txt_url)
        try:
            rp.read()
        except Exception as e:
            # Boundary: report the failure instead of raising to the CLI.
            return {
                "path": test_path,
                "user_agent": user_agent,
                "allowed": None,
                "error": str(e),
            }

        # Build the full URL the parser should evaluate.
        base_url = f"{parsed.scheme}://{parsed.netloc}"
        full_url = urljoin(base_url, test_path)

        allowed = rp.can_fetch(user_agent, full_url)

        return {
            "path": test_path,
            "user_agent": user_agent,
            "allowed": allowed,
            "full_url": full_url,
        }

    def generate_report(self, result: RobotsResult) -> str:
        """Generate a human-readable analysis report from *result*."""
        lines = [
            "=" * 60,
            "Robots.txt Analysis Report",
            "=" * 60,
            f"URL: {result.url}",
            f"Accessible: {'Yes' if result.accessible else 'No'}",
            f"Timestamp: {result.timestamp}",
            "",
        ]

        if result.accessible:
            lines.append("Statistics:")
            for key, value in result.stats.items():
                if key == "user_agents":
                    lines.append(f"  {key}: {', '.join(value) if value else 'None'}")
                else:
                    lines.append(f"  {key}: {value}")
            lines.append("")

        if result.sitemaps:
            lines.append(f"Sitemaps ({len(result.sitemaps)}):")
            for sitemap in result.sitemaps:
                lines.append(f"  - {sitemap}")
            lines.append("")

        if result.rules:
            lines.append("Rules Summary:")
            for rules in result.rules:
                lines.append(f"\n  User-agent: {rules.user_agent}")
                if rules.disallow:
                    lines.append(f"    Disallow: {len(rules.disallow)} rules")
                    # Only the first few rules are listed to keep reports short.
                    for d in rules.disallow[:5]:
                        lines.append(f"      - {d}")
                    if len(rules.disallow) > 5:
                        lines.append(f"      ... and {len(rules.disallow) - 5} more")
                if rules.allow:
                    lines.append(f"    Allow: {len(rules.allow)} rules")
                    for a in rules.allow[:3]:
                        lines.append(f"      - {a}")
                if rules.crawl_delay:
                    lines.append(f"    Crawl-delay: {rules.crawl_delay}s")
            lines.append("")

        if result.issues:
            lines.append("Issues Found:")
            errors = [i for i in result.issues if i.severity == "error"]
            warnings = [i for i in result.issues if i.severity == "warning"]
            infos = [i for i in result.issues if i.severity == "info"]

            if errors:
                lines.append(f"\n  ERRORS ({len(errors)}):")
                for issue in errors:
                    lines.append(f"    - {issue.message}")
                    if issue.directive:
                        lines.append(f"      Directive: {issue.directive}")
                    if issue.suggestion:
                        lines.append(f"      Suggestion: {issue.suggestion}")

            if warnings:
                lines.append(f"\n  WARNINGS ({len(warnings)}):")
                for issue in warnings:
                    lines.append(f"    - {issue.message}")
                    if issue.suggestion:
                        lines.append(f"      Suggestion: {issue.suggestion}")

            if infos:
                lines.append(f"\n  INFO ({len(infos)}):")
                for issue in infos:
                    lines.append(f"    - {issue.message}")

        lines.append("")
        lines.append("=" * 60)

        return "\n".join(lines)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: full analysis, or test one URL path."""
    parser = argparse.ArgumentParser(
        description="Analyze robots.txt configuration",
    )
    parser.add_argument("--url", "-u", required=True,
                        help="URL to robots.txt or domain")
    parser.add_argument("--test-url", "-t",
                        help="Test if specific URL path is allowed")
    parser.add_argument("--user-agent", "-a", default="Googlebot",
                        help="User agent for testing (default: Googlebot)")
    parser.add_argument("--output", "-o", help="Output file for JSON report")
    parser.add_argument("--json", action="store_true", help="Output as JSON")

    args = parser.parse_args()

    checker = RobotsChecker()

    if args.test_url:
        # Test a single URL path against the site's robots.txt.
        test_result = checker.test_url(args.url, args.test_url, args.user_agent)
        if args.json:
            print(json.dumps(test_result, indent=2))
        elif test_result.get("error"):
            # "allowed" is None on fetch failure; don't misreport as BLOCKED.
            print(f"URL: {test_result['path']}")
            print(f"User-Agent: {test_result['user_agent']}")
            print(f"Error: {test_result['error']}")
        else:
            status = "ALLOWED" if test_result["allowed"] else "BLOCKED"
            print(f"URL: {test_result['path']}")
            print(f"User-Agent: {test_result['user_agent']}")
            print(f"Status: {status}")
    else:
        # Full analysis of the site's robots.txt.
        result = checker.analyze(args.url)

        if args.json or args.output:
            output = json.dumps(result.to_dict(), ensure_ascii=False, indent=2)
            if args.output:
                with open(args.output, "w", encoding="utf-8") as f:
                    f.write(output)
                # Lazy %-style args avoid formatting when INFO is disabled.
                logger.info("Report written to %s", args.output)
            else:
                print(output)
        else:
            print(checker.generate_report(result))


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user