refactor: Reorganize skill numbering and update documentation

Skill Numbering Changes:
- 01-03: OurDigital core (was 30-32)
- 31-32: Notion tools (was 01-02)
- 99_archive: Renamed from _archive for sorting

New Files:
- AGENTS.md: Claude Code agent routing guide
- requirements.txt for 00-claude-code-setting, 32-notion-writer, 43-jamie-youtube-manager

Documentation Updates:
- CLAUDE.md: Updated skill inventory (23 skills)
- AUDIT_REPORT.md: Current completion status (91%)
- Archived REFACTORING_PLAN.md (most tasks complete)

Removed:
- ga-agent-skills/ (moved to separate repo ~/Project/dintel-ga4-agent)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 18:42:39 +07:00
parent ae193d5e08
commit b69e4b6f3a
100 changed files with 655 additions and 1812 deletions

"""
Robots.txt Checker - Analyze robots.txt configuration
=====================================================
Purpose: Parse and analyze robots.txt for SEO compliance
Python: 3.10+
Usage:
python robots_checker.py --url https://example.com/robots.txt
python robots_checker.py --url https://example.com --test-url /admin/
"""
import argparse
import json
import logging
from dataclasses import dataclass, field
from datetime import datetime
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
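# Third-party dependency (pip install requests); all other imports are stdlib.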
import requests
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
@dataclass
class RobotsIssue:
"""Represents a robots.txt issue."""
severity: str # "error", "warning", "info"
message: str
line_number: int | None = None
directive: str | None = None
suggestion: str | None = None
@dataclass
class UserAgentRules:
"""Rules for a specific user-agent."""
user_agent: str
disallow: list[str] = field(default_factory=list)
allow: list[str] = field(default_factory=list)
crawl_delay: float | None = None
@dataclass
class RobotsResult:
"""Complete robots.txt analysis result."""
url: str
accessible: bool = True
content: str = ""
rules: list[UserAgentRules] = field(default_factory=list)
sitemaps: list[str] = field(default_factory=list)
issues: list[RobotsIssue] = field(default_factory=list)
stats: dict = field(default_factory=dict)
timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
def to_dict(self) -> dict:
"""Convert to dictionary for JSON output."""
return {
"url": self.url,
"accessible": self.accessible,
"sitemaps": self.sitemaps,
"rules": [
{
"user_agent": r.user_agent,
"disallow": r.disallow,
"allow": r.allow,
"crawl_delay": r.crawl_delay,
}
for r in self.rules
],
"issues": [
{
"severity": i.severity,
"message": i.message,
"line_number": i.line_number,
"directive": i.directive,
"suggestion": i.suggestion,
}
for i in self.issues
],
"stats": self.stats,
"timestamp": self.timestamp,
}
class RobotsChecker:
"""Analyze robots.txt configuration."""
# Common user agents
USER_AGENTS = {
"*": "All bots",
"Googlebot": "Google crawler",
"Googlebot-Image": "Google Image crawler",
"Googlebot-News": "Google News crawler",
"Googlebot-Video": "Google Video crawler",
"Bingbot": "Bing crawler",
"Slurp": "Yahoo crawler",
"DuckDuckBot": "DuckDuckGo crawler",
"Baiduspider": "Baidu crawler",
"Yandex": "Yandex crawler",
"facebot": "Facebook crawler",
"Twitterbot": "Twitter crawler",
"LinkedInBot": "LinkedIn crawler",
}
# Paths that should generally not be blocked
IMPORTANT_PATHS = [
"/",
"/*.css",
"/*.js",
"/*.jpg",
"/*.jpeg",
"/*.png",
"/*.gif",
"/*.svg",
"/*.webp",
]
# Paths commonly blocked
COMMON_BLOCKED = [
"/admin",
"/wp-admin",
"/login",
"/private",
"/api",
"/cgi-bin",
"/tmp",
"/search",
]
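    # NOTE: USER_AGENTS, IMPORTANT_PATHS, and COMMON_BLOCKED are reference
    # data; the analysis methods below do not currently consult them.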
def __init__(self):
self.session = requests.Session()
self.session.headers.update({
"User-Agent": "Mozilla/5.0 (compatible; SEOAuditBot/1.0)"
})
def fetch_robots(self, url: str) -> str | None:
"""Fetch robots.txt content."""
# Ensure we're fetching robots.txt
parsed = urlparse(url)
if not parsed.path.endswith("robots.txt"):
robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
else:
robots_url = url
try:
response = self.session.get(robots_url, timeout=10)
if response.status_code == 200:
return response.text
elif response.status_code == 404:
return None
else:
raise RuntimeError(f"HTTP {response.status_code}")
        except requests.RequestException as e:
            raise RuntimeError(f"Failed to fetch robots.txt: {e}") from e
def parse_robots(self, content: str) -> tuple[list[UserAgentRules], list[str]]:
"""Parse robots.txt content."""
rules = []
sitemaps = []
        current_rules = None
for line_num, line in enumerate(content.split("\n"), 1):
line = line.strip()
# Skip empty lines and comments
if not line or line.startswith("#"):
continue
# Parse directive
if ":" not in line:
continue
directive, value = line.split(":", 1)
directive = directive.strip().lower()
value = value.strip()
            if directive == "user-agent":
                # Close out the previous group before starting a new one.
                # (Consecutive User-agent lines are treated as separate groups
                # here; the robots.txt spec allows them to share one rule block.)
                if current_rules:
                    rules.append(current_rules)
                current_rules = UserAgentRules(user_agent=value)
elif directive == "disallow" and current_rules:
if value: # Empty disallow means allow all
current_rules.disallow.append(value)
elif directive == "allow" and current_rules:
if value:
current_rules.allow.append(value)
            elif directive == "crawl-delay" and current_rules:
                try:
                    current_rules.crawl_delay = float(value)
                except ValueError:
                    pass  # ignore malformed crawl-delay values
elif directive == "sitemap":
if value:
sitemaps.append(value)
# Don't forget last user-agent
if current_rules:
rules.append(current_rules)
return rules, sitemaps
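    # Worked example (hypothetical input): given the content
    #     User-agent: *
    #     Disallow: /admin
    #     Sitemap: https://example.com/sitemap.xml
    # parse_robots returns
    #     ([UserAgentRules(user_agent="*", disallow=["/admin"])],
    #      ["https://example.com/sitemap.xml"])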
def analyze(self, url: str) -> RobotsResult:
"""Analyze robots.txt."""
result = RobotsResult(url=url)
# Fetch robots.txt
try:
content = self.fetch_robots(url)
if content is None:
result.accessible = False
result.issues.append(RobotsIssue(
severity="info",
message="No robots.txt found (returns 404)",
suggestion="Consider creating a robots.txt file",
))
return result
except RuntimeError as e:
result.accessible = False
result.issues.append(RobotsIssue(
severity="error",
message=str(e),
))
return result
result.content = content
result.rules, result.sitemaps = self.parse_robots(content)
# Analyze content
self._analyze_syntax(result)
self._analyze_rules(result)
self._analyze_sitemaps(result)
# Calculate stats
result.stats = {
"user_agents_count": len(result.rules),
"user_agents": [r.user_agent for r in result.rules],
"total_disallow_rules": sum(len(r.disallow) for r in result.rules),
"total_allow_rules": sum(len(r.allow) for r in result.rules),
"sitemaps_count": len(result.sitemaps),
"has_crawl_delay": any(r.crawl_delay for r in result.rules),
"content_length": len(content),
}
return result
def _analyze_syntax(self, result: RobotsResult) -> None:
"""Check for syntax issues."""
        lines = result.content.split("\n")
        # Directives this checker recognizes (hoisted out of the loop)
        valid_directives = {
            "user-agent", "disallow", "allow",
            "crawl-delay", "sitemap", "host",
        }
        for line_num, line in enumerate(lines, 1):
line = line.strip()
# Skip empty lines and comments
if not line or line.startswith("#"):
continue
# Check for valid directive
if ":" not in line:
result.issues.append(RobotsIssue(
severity="warning",
message=f"Invalid line (missing colon): {line[:50]}",
line_number=line_num,
))
continue
directive, value = line.split(":", 1)
directive = directive.strip().lower()
if directive not in valid_directives:
result.issues.append(RobotsIssue(
severity="info",
message=f"Unknown directive: {directive}",
line_number=line_num,
directive=directive,
))
def _analyze_rules(self, result: RobotsResult) -> None:
"""Analyze blocking rules."""
# Check if there are any rules
if not result.rules:
result.issues.append(RobotsIssue(
severity="info",
message="No user-agent rules defined",
suggestion="Add User-agent: * rules to control crawling",
))
return
# Check for wildcard rule
has_wildcard = any(r.user_agent == "*" for r in result.rules)
if not has_wildcard:
result.issues.append(RobotsIssue(
severity="info",
message="No wildcard (*) user-agent defined",
suggestion="Consider adding User-agent: * as fallback",
))
# Check for blocking important resources
for rules in result.rules:
for disallow in rules.disallow:
# Check if blocking root
if disallow == "/":
result.issues.append(RobotsIssue(
severity="error",
message=f"Blocking entire site for {rules.user_agent}",
directive=f"Disallow: {disallow}",
suggestion="This will prevent indexing. Is this intentional?",
))
                # Check if blocking CSS/JS; match at the end of the pattern
                # to avoid false positives such as ".json"
                if disallow.lower().rstrip("$").endswith((".css", ".js")):
                    result.issues.append(RobotsIssue(
                        severity="warning",
                        message=f"Blocking CSS/JS files for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                        suggestion="May affect rendering and SEO",
                    ))
                # Check for blocking image files (same end-of-pattern match)
                if disallow.lower().rstrip("$").endswith(
                        (".jpg", ".jpeg", ".png", ".gif", ".webp")):
                    result.issues.append(RobotsIssue(
                        severity="info",
                        message=f"Blocking image files for {rules.user_agent}",
                        directive=f"Disallow: {disallow}",
                    ))
# Check crawl delay
if rules.crawl_delay:
if rules.crawl_delay > 10:
result.issues.append(RobotsIssue(
severity="warning",
message=f"High crawl-delay ({rules.crawl_delay}s) for {rules.user_agent}",
directive=f"Crawl-delay: {rules.crawl_delay}",
suggestion="May significantly slow indexing",
))
elif rules.crawl_delay > 0:
result.issues.append(RobotsIssue(
severity="info",
message=f"Crawl-delay set to {rules.crawl_delay}s for {rules.user_agent}",
))
def _analyze_sitemaps(self, result: RobotsResult) -> None:
"""Analyze sitemap declarations."""
if not result.sitemaps:
result.issues.append(RobotsIssue(
severity="warning",
message="No sitemap declared in robots.txt",
suggestion="Add Sitemap: directive to help crawlers find your sitemap",
))
else:
for sitemap in result.sitemaps:
if not sitemap.startswith("http"):
result.issues.append(RobotsIssue(
severity="warning",
message=f"Sitemap URL should be absolute: {sitemap}",
directive=f"Sitemap: {sitemap}",
))
def test_url(self, robots_url: str, test_path: str,
user_agent: str = "Googlebot") -> dict:
"""Test if a specific URL is allowed."""
# Use Python's built-in parser
rp = RobotFileParser()
# Ensure robots.txt URL
parsed = urlparse(robots_url)
if not parsed.path.endswith("robots.txt"):
robots_txt_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
else:
robots_txt_url = robots_url
rp.set_url(robots_txt_url)
try:
rp.read()
except Exception as e:
return {
"path": test_path,
"user_agent": user_agent,
"allowed": None,
"error": str(e),
}
# Build full URL for testing
base_url = f"{parsed.scheme}://{parsed.netloc}"
full_url = urljoin(base_url, test_path)
allowed = rp.can_fetch(user_agent, full_url)
return {
"path": test_path,
"user_agent": user_agent,
"allowed": allowed,
"full_url": full_url,
}
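    # Example (programmatic use; example.com is a placeholder):
    #     checker = RobotsChecker()
    #     verdict = checker.test_url("https://example.com", "/admin/")
    #     # -> {"path": "/admin/", "user_agent": "Googlebot",
    #     #     "allowed": <bool>, "full_url": "https://example.com/admin/"}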
def generate_report(self, result: RobotsResult) -> str:
"""Generate human-readable analysis report."""
lines = [
"=" * 60,
"Robots.txt Analysis Report",
"=" * 60,
f"URL: {result.url}",
f"Accessible: {'Yes' if result.accessible else 'No'}",
f"Timestamp: {result.timestamp}",
"",
]
if result.accessible:
lines.append("Statistics:")
for key, value in result.stats.items():
if key == "user_agents":
lines.append(f" {key}: {', '.join(value) if value else 'None'}")
else:
lines.append(f" {key}: {value}")
lines.append("")
if result.sitemaps:
lines.append(f"Sitemaps ({len(result.sitemaps)}):")
for sitemap in result.sitemaps:
lines.append(f" - {sitemap}")
lines.append("")
if result.rules:
lines.append("Rules Summary:")
for rules in result.rules:
lines.append(f"\n User-agent: {rules.user_agent}")
if rules.disallow:
lines.append(f" Disallow: {len(rules.disallow)} rules")
for d in rules.disallow[:5]:
lines.append(f" - {d}")
if len(rules.disallow) > 5:
lines.append(f" ... and {len(rules.disallow) - 5} more")
if rules.allow:
lines.append(f" Allow: {len(rules.allow)} rules")
for a in rules.allow[:3]:
lines.append(f" - {a}")
if rules.crawl_delay:
lines.append(f" Crawl-delay: {rules.crawl_delay}s")
lines.append("")
if result.issues:
lines.append("Issues Found:")
errors = [i for i in result.issues if i.severity == "error"]
warnings = [i for i in result.issues if i.severity == "warning"]
infos = [i for i in result.issues if i.severity == "info"]
if errors:
lines.append(f"\n ERRORS ({len(errors)}):")
for issue in errors:
lines.append(f" - {issue.message}")
if issue.directive:
lines.append(f" Directive: {issue.directive}")
if issue.suggestion:
lines.append(f" Suggestion: {issue.suggestion}")
if warnings:
lines.append(f"\n WARNINGS ({len(warnings)}):")
for issue in warnings:
lines.append(f" - {issue.message}")
if issue.suggestion:
lines.append(f" Suggestion: {issue.suggestion}")
if infos:
lines.append(f"\n INFO ({len(infos)}):")
for issue in infos:
lines.append(f" - {issue.message}")
lines.append("")
lines.append("=" * 60)
return "\n".join(lines)
def main():
"""Main entry point for CLI usage."""
parser = argparse.ArgumentParser(
description="Analyze robots.txt configuration",
)
parser.add_argument("--url", "-u", required=True,
help="URL to robots.txt or domain")
parser.add_argument("--test-url", "-t",
help="Test if specific URL path is allowed")
parser.add_argument("--user-agent", "-a", default="Googlebot",
help="User agent for testing (default: Googlebot)")
parser.add_argument("--output", "-o", help="Output file for JSON report")
parser.add_argument("--json", action="store_true", help="Output as JSON")
args = parser.parse_args()
checker = RobotsChecker()
if args.test_url:
# Test specific URL
test_result = checker.test_url(args.url, args.test_url, args.user_agent)
if args.json:
print(json.dumps(test_result, indent=2))
        else:
            if test_result["allowed"] is None:
                status = f"UNKNOWN ({test_result['error']})"
            else:
                status = "ALLOWED" if test_result["allowed"] else "BLOCKED"
            print(f"URL: {test_result['path']}")
            print(f"User-Agent: {test_result['user_agent']}")
            print(f"Status: {status}")
else:
# Full analysis
result = checker.analyze(args.url)
if args.json or args.output:
output = json.dumps(result.to_dict(), ensure_ascii=False, indent=2)
if args.output:
with open(args.output, "w", encoding="utf-8") as f:
f.write(output)
logger.info(f"Report written to {args.output}")
else:
print(output)
else:
print(checker.generate_report(result))
if __name__ == "__main__":
main()