Add SEO skills 19-28, 31-32 with full Python implementations

12 new skills: Keyword Strategy, SERP Analysis, Position Tracking,
Link Building, Content Strategy, E-Commerce SEO, KPI Framework,
International SEO, AI Visibility, Knowledge Graph, Competitor Intel,
and Crawl Budget. ~20K lines of Python across 25 domain scripts.
Updated skill 11 pipeline table and repo CLAUDE.md.
Enhanced skill 18 local SEO workflow from jamie.clinic audit.

Note: Skill 26 hreflang_validator.py pending (content filter block).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-13 12:05:59 +09:00
parent 159f7ec3f7
commit a3ff965b87
125 changed files with 25948 additions and 173 deletions

View File

@@ -0,0 +1,805 @@
"""
Crawl Budget Analyzer - Identify crawl waste and generate recommendations
=========================================================================
Purpose: Analyze server access logs for crawl budget efficiency, detect waste
(parameter URLs, redirect chains, soft 404s, duplicates), find orphan
pages, profile per-bot behavior, and produce prioritized recommendations.
Python: 3.10+
"""
import argparse
import json
import logging
import re
import sys
from collections import Counter, defaultdict
from dataclasses import asdict, dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any
from urllib.parse import parse_qs, urlparse
import requests
from bs4 import BeautifulSoup
from log_parser import BotIdentification, LogEntry, LogParser
# Module-wide logging: timestamped INFO-level messages (stderr by default).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
# Module-level logger used by the analyzer and the CLI below.
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Query-parameter names whose presence marks a URL as likely crawl waste.
WASTE_PARAMS = {"sort", "filter", "order", "orderby", "dir", "direction"}
# Matches analytics/tracking parameters such as utm_source, UTM_campaign.
TRACKING_PARAMS_RE = re.compile(r"^utm_", re.IGNORECASE)
# Primary pagination query key ("p" is also checked as a fallback).
PAGINATION_PARAM = "page"
# Page number beyond which pagination crawling is considered wasteful.
HIGH_PAGE_THRESHOLD = 5
SOFT_404_MAX_SIZE = 1024  # bytes - pages smaller than this may be soft 404s
# HTTP status codes treated as redirects for redirect-chain detection.
REDIRECT_STATUSES = {301, 302, 303, 307, 308}
# Cap on URLs kept per list in profiles/reports.
TOP_N_URLS = 50
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass
class CrawlWaste:
    """One category of crawl budget waste (e.g. parameter URLs, soft 404s)."""

    waste_type: str
    urls: list[str]
    count: int
    pct_of_total: float
    recommendation: str

    def to_dict(self) -> dict:
        """Serialize for reporting; at most 20 sample URLs are included."""
        summary = {
            "waste_type": self.waste_type,
            "count": self.count,
            "pct_of_total": round(self.pct_of_total, 2),
            "recommendation": self.recommendation,
        }
        summary["sample_urls"] = self.urls[:20]
        return summary
@dataclass
class OrphanPage:
"""A page that is either in the sitemap but uncrawled, or crawled but not in sitemap."""
url: str
in_sitemap: bool
crawled: bool
last_crawl_date: str | None = None
def to_dict(self) -> dict:
return asdict(self)
@dataclass
class BotProfile:
    """Crawl-behavior profile aggregated from log entries for a single bot."""

    name: str
    total_requests: int = 0
    requests_per_day: float = 0.0
    crawl_depth_distribution: dict[int, int] = field(default_factory=dict)
    peak_hours: list[int] = field(default_factory=list)
    status_breakdown: dict[str, int] = field(default_factory=dict)
    top_crawled_urls: list[tuple[str, int]] = field(default_factory=list)
    unique_urls: int = 0
    days_active: int = 0

    def to_dict(self) -> dict:
        """Serialize for reporting: rounds the daily rate and expands URL tuples."""
        url_entries = [
            {"url": crawled_url, "count": hits}
            for crawled_url, hits in self.top_crawled_urls
        ]
        return {
            "name": self.name,
            "total_requests": self.total_requests,
            "requests_per_day": round(self.requests_per_day, 1),
            "crawl_depth_distribution": self.crawl_depth_distribution,
            "peak_hours": self.peak_hours,
            "status_breakdown": self.status_breakdown,
            "top_crawled_urls": url_entries,
            "unique_urls": self.unique_urls,
            "days_active": self.days_active,
        }
@dataclass
class CrawlRecommendation:
    """One prioritized optimization recommendation."""

    category: str
    priority: str  # one of: critical, high, medium, low
    action: str
    impact: str
    details: str

    def to_dict(self) -> dict:
        """Serialize every field into a plain JSON-ready dict."""
        return {
            "category": self.category,
            "priority": self.priority,
            "action": self.action,
            "impact": self.impact,
            "details": self.details,
        }
@dataclass
class CrawlBudgetResult:
    """Complete crawl budget analysis result produced by the analyzer."""

    log_file: str
    analysis_period: dict[str, str]
    total_bot_requests: int
    bots: dict[str, BotProfile]
    waste: list[CrawlWaste]
    total_waste_pct: float
    orphan_pages: dict[str, list[OrphanPage]]
    recommendations: list[CrawlRecommendation]
    efficiency_score: int
    timestamp: str

    def to_dict(self) -> dict:
        """Serialize the full result tree into JSON-ready primitives.

        Note: the ``waste`` list is re-keyed by ``waste_type`` in the output,
        and all nested dataclasses are serialized via their own ``to_dict``.
        """
        serialized_bots = {bot: prof.to_dict() for bot, prof in self.bots.items()}
        serialized_waste = {item.waste_type: item.to_dict() for item in self.waste}
        serialized_orphans = {
            bucket: [page.to_dict() for page in pages]
            for bucket, pages in self.orphan_pages.items()
        }
        return {
            "log_file": self.log_file,
            "analysis_period": self.analysis_period,
            "total_bot_requests": self.total_bot_requests,
            "bots": serialized_bots,
            "waste": serialized_waste,
            "total_waste_pct": round(self.total_waste_pct, 2),
            "orphan_pages": serialized_orphans,
            "recommendations": [rec.to_dict() for rec in self.recommendations],
            "efficiency_score": self.efficiency_score,
            "timestamp": self.timestamp,
        }
# ---------------------------------------------------------------------------
# CrawlBudgetAnalyzer
# ---------------------------------------------------------------------------
class CrawlBudgetAnalyzer:
    """Analyze crawl budget efficiency from server access logs.

    Pipeline (see :meth:`analyze`): load bot requests via ``LogParser``,
    optionally fetch an XML sitemap, then — depending on ``scope`` — profile
    per-bot behavior, quantify crawl waste, detect orphan pages, compute an
    efficiency score, and build prioritized recommendations.
    """

    def __init__(
        self,
        log_file: str,
        sitemap_url: str | None = None,
        target_url: str | None = None,
    ):
        """
        Args:
            log_file: Path to the server access log to analyze.
            sitemap_url: Optional XML sitemap URL enabling orphan detection.
            target_url: Optional site root used to turn log paths into
                absolute URLs when comparing against sitemap URLs.
        """
        self.log_file = log_file
        self.sitemap_url = sitemap_url
        self.target_url = target_url
        # Parsed (entry, bot) pairs; populated by load_log_data().
        self._bot_entries: list[tuple[LogEntry, BotIdentification]] = []
        # Normalized sitemap URLs; populated by load_sitemap_urls().
        self._sitemap_urls: set[str] = set()

    # -- data loading ---------------------------------------------------------
    def load_log_data(self, log_file: str) -> list[tuple[LogEntry, BotIdentification]]:
        """Use LogParser to load all bot requests from the log file."""
        parser = LogParser(log_file=log_file, fmt="auto")
        entries = parser.parse()
        logger.info(f"Loaded {len(entries):,} bot entries from {log_file}")
        self._bot_entries = entries
        return entries

    def load_sitemap_urls(self, sitemap_url: str) -> set[str]:
        """Fetch and parse an XML sitemap, returning the set of URLs.

        Handles both plain sitemaps (<url> entries) and sitemap indexes
        (<sitemap> entries pointing at child sitemaps). Failures are logged
        and yield an empty set rather than raising, so the rest of the
        analysis can proceed without orphan detection.
        """
        urls: set[str] = set()
        try:
            resp = requests.get(sitemap_url, timeout=30, headers={
                "User-Agent": "CrawlBudgetAnalyzer/1.0",
            })
            resp.raise_for_status()
            soup = BeautifulSoup(resp.content, "lxml-xml")
            # A sitemap index contains <sitemap> tags; recurse into children.
            sitemap_tags = soup.find_all("sitemap")
            if sitemap_tags:
                for st in sitemap_tags:
                    loc = st.find("loc")
                    if loc and loc.text:
                        child_urls = self._fetch_sitemap_child(loc.text.strip())
                        urls.update(child_urls)
            else:
                for url_tag in soup.find_all("url"):
                    loc = url_tag.find("loc")
                    if loc and loc.text:
                        urls.add(self._normalize_url(loc.text.strip()))
            logger.info(f"Loaded {len(urls):,} URLs from sitemap: {sitemap_url}")
        except Exception as e:
            logger.error(f"Failed to load sitemap {sitemap_url}: {e}")
        self._sitemap_urls = urls
        return urls

    def _fetch_sitemap_child(self, url: str) -> set[str]:
        """Fetch a child sitemap from a sitemap index (best-effort)."""
        urls: set[str] = set()
        try:
            resp = requests.get(url, timeout=30, headers={
                "User-Agent": "CrawlBudgetAnalyzer/1.0",
            })
            resp.raise_for_status()
            soup = BeautifulSoup(resp.content, "lxml-xml")
            for url_tag in soup.find_all("url"):
                loc = url_tag.find("loc")
                if loc and loc.text:
                    urls.add(self._normalize_url(loc.text.strip()))
        except Exception as e:
            # A broken child sitemap should not abort the whole analysis.
            logger.warning(f"Failed to fetch child sitemap {url}: {e}")
        return urls

    @staticmethod
    def _normalize_url(url: str) -> str:
        """Normalize a URL: lowercase scheme/host and drop the trailing slash.

        Scheme and host are case-insensitive per RFC 3986, so they are
        lowercased; the path is case-sensitive and left untouched.
        """
        parsed = urlparse(url)
        path = parsed.path.rstrip("/") or "/"
        return f"{parsed.scheme.lower()}://{parsed.netloc.lower()}{path}"

    # -- waste identification -------------------------------------------------
    def identify_parameter_waste(
        self,
        bot_requests: list[tuple[LogEntry, BotIdentification]],
    ) -> CrawlWaste:
        """Find URLs with unnecessary query parameters wasting crawl budget.

        Flags URLs with known waste params (sort/filter/...), utm_* tracking
        params, or deep pagination (page/p beyond HIGH_PAGE_THRESHOLD).
        """
        waste_urls: list[str] = []
        for entry, _ in bot_requests:
            parsed = urlparse(entry.url)
            if not parsed.query:
                continue
            # Lowercase keys once so all checks below are case-insensitive.
            params = {k.lower(): v for k, v in parse_qs(parsed.query).items()}
            # Known waste parameters (sort, filter, ...).
            has_waste = bool(params.keys() & WASTE_PARAMS)
            # Analytics tracking parameters (utm_*).
            has_tracking = any(TRACKING_PARAMS_RE.match(k) for k in params)
            # Deep pagination: "page" (or fallback "p") beyond the threshold.
            page_val = params.get(PAGINATION_PARAM, params.get("p", [None]))
            has_deep_page = False
            if page_val and page_val[0]:
                try:
                    if int(page_val[0]) > HIGH_PAGE_THRESHOLD:
                        has_deep_page = True
                except (ValueError, TypeError):
                    pass
            if has_waste or has_tracking or has_deep_page:
                waste_urls.append(entry.url)
        total = len(bot_requests)
        count = len(waste_urls)
        pct = (count / total * 100) if total else 0.0
        return CrawlWaste(
            waste_type="parameter_urls",
            urls=list(set(waste_urls)),
            count=count,
            pct_of_total=pct,
            recommendation=(
                "robots.txt에 불필요한 parameter URL 패턴을 Disallow로 추가하거나, "
                "Google Search Console의 URL Parameters 설정을 활용하세요. "
                "UTM 파라미터가 포함된 URL은 canonical 태그로 처리하세요."
            ),
        )

    def identify_redirect_chains(
        self,
        bot_requests: list[tuple[LogEntry, BotIdentification]],
    ) -> CrawlWaste:
        """Find URLs that repeatedly return redirect status codes.

        ``count``/``pct_of_total`` cover ALL redirect responses; the sample
        URL list keeps only URLs redirected at least twice (chain candidates).
        """
        redirect_urls: list[str] = []
        redirect_counter: Counter = Counter()
        for entry, _ in bot_requests:
            if entry.status_code in REDIRECT_STATUSES:
                redirect_counter[entry.url] += 1
                redirect_urls.append(entry.url)
        # URLs redirected more than once are chain candidates
        chain_urls = [url for url, cnt in redirect_counter.items() if cnt >= 2]
        total = len(bot_requests)
        count = len(redirect_urls)
        pct = (count / total * 100) if total else 0.0
        return CrawlWaste(
            waste_type="redirect_chains",
            urls=chain_urls,
            count=count,
            pct_of_total=pct,
            recommendation=(
                "301/302 리다이렉트가 반복적으로 크롤링되고 있습니다. "
                "내부 링크를 최종 목적지 URL로 직접 업데이트하고, "
                "리다이렉트 체인을 단일 리다이렉트로 단축하세요."
            ),
        )

    def identify_soft_404s(
        self,
        bot_requests: list[tuple[LogEntry, BotIdentification]],
    ) -> CrawlWaste:
        """Find 200-status pages with suspiciously small response sizes.

        Zero-size responses are skipped (HEAD requests / 304-style bodies
        would otherwise flood this bucket).
        """
        soft_404_urls: list[str] = []
        for entry, _ in bot_requests:
            if entry.status_code == 200 and entry.response_size < SOFT_404_MAX_SIZE:
                if entry.response_size > 0:
                    soft_404_urls.append(entry.url)
        total = len(bot_requests)
        count = len(soft_404_urls)
        pct = (count / total * 100) if total else 0.0
        return CrawlWaste(
            waste_type="soft_404s",
            urls=list(set(soft_404_urls)),
            count=count,
            pct_of_total=pct,
            recommendation=(
                "200 상태 코드를 반환하지만 콘텐츠가 거의 없는 Soft 404 페이지입니다. "
                "실제 404 상태 코드를 반환하거나, 해당 페이지에 noindex 태그를 추가하세요."
            ),
        )

    def identify_duplicate_crawls(
        self,
        bot_requests: list[tuple[LogEntry, BotIdentification]],
    ) -> CrawlWaste:
        """Find duplicate URL variants: www/non-www, trailing slash, etc."""
        url_variants: dict[str, set[str]] = defaultdict(set)
        # Remember each request's variant URL so duplicate-hitting requests
        # can be counted later without re-parsing every URL.
        request_variants: list[str] = []
        for entry, _ in bot_requests:
            parsed = urlparse(entry.url)
            # Canonical key: lowercase host without a leading "www." prefix,
            # path without trailing slash. str.removeprefix is required here:
            # lstrip("www.") would strip arbitrary leading 'w'/'.' characters
            # and corrupt hosts like "web.example.com".
            host = parsed.netloc.lower().removeprefix("www.")
            path = parsed.path.rstrip("/") or "/"
            canonical = f"{host}{path}"
            full_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
            url_variants[canonical].add(full_url)
            request_variants.append(full_url)
        # Canonical keys mapped to more than one concrete URL are duplicates.
        duplicate_urls: list[str] = []
        for variants in url_variants.values():
            if len(variants) > 1:
                duplicate_urls.extend(variants)
        total = len(bot_requests)
        # Count how many requests hit duplicate variant URLs
        dup_set = set(duplicate_urls)
        dup_request_count = sum(1 for v in request_variants if v in dup_set)
        pct = (dup_request_count / total * 100) if total else 0.0
        return CrawlWaste(
            waste_type="duplicate_urls",
            urls=duplicate_urls[:TOP_N_URLS],
            count=dup_request_count,
            pct_of_total=pct,
            recommendation=(
                "www/non-www, trailing slash 유무 등 중복 URL 변형이 크롤링되고 있습니다. "
                "301 리다이렉트로 canonical URL로 통합하고, "
                "rel=canonical 태그를 정확히 설정하세요."
            ),
        )

    # -- bot profiling --------------------------------------------------------
    def profile_bots(
        self,
        bot_requests: list[tuple[LogEntry, BotIdentification]],
    ) -> dict[str, BotProfile]:
        """Generate per-bot behavior profiles (volume, depth, hours, statuses)."""
        bot_data: dict[str, dict] = defaultdict(lambda: {
            "urls": Counter(),
            "statuses": Counter(),
            "hours": Counter(),
            "days": set(),
            "depths": Counter(),
            "count": 0,
        })
        for entry, bot in bot_requests:
            bd = bot_data[bot.name]
            bd["count"] += 1
            bd["urls"][entry.url] += 1
            bd["statuses"][str(entry.status_code)] += 1
            # URL depth = number of non-empty path segments
            depth = len([s for s in urlparse(entry.url).path.split("/") if s])
            bd["depths"][depth] += 1
            if entry.timestamp:
                bd["hours"][entry.timestamp.hour] += 1
                bd["days"].add(entry.timestamp.strftime("%Y-%m-%d"))
        profiles: dict[str, BotProfile] = {}
        for name, bd in bot_data.items():
            # Guard against logs without timestamps (no days recorded).
            days_active = len(bd["days"]) or 1
            rpd = bd["count"] / days_active
            # Top 3 peak hours by request volume
            top_hours = sorted(bd["hours"].items(), key=lambda x: -x[1])[:3]
            peak = [h for h, _ in top_hours]
            profiles[name] = BotProfile(
                name=name,
                total_requests=bd["count"],
                requests_per_day=rpd,
                crawl_depth_distribution=dict(sorted(bd["depths"].items())),
                peak_hours=peak,
                status_breakdown=dict(bd["statuses"]),
                top_crawled_urls=bd["urls"].most_common(TOP_N_URLS),
                unique_urls=len(bd["urls"]),
                days_active=days_active,
            )
        return profiles

    # -- orphan detection -----------------------------------------------------
    def detect_orphan_pages(
        self,
        crawled_urls: set[str],
        sitemap_urls: set[str],
    ) -> dict[str, list[OrphanPage]]:
        """Compare crawled URLs with sitemap URLs to find orphans.

        Both sets must use the same normalization for the set difference to
        be meaningful (see _normalize_url).
        """
        in_sitemap_not_crawled = sitemap_urls - crawled_urls
        crawled_not_in_sitemap = crawled_urls - sitemap_urls
        return {
            "in_sitemap_not_crawled": [
                OrphanPage(url=u, in_sitemap=True, crawled=False)
                for u in sorted(in_sitemap_not_crawled)
            ],
            "crawled_not_in_sitemap": [
                OrphanPage(url=u, in_sitemap=False, crawled=True)
                for u in sorted(crawled_not_in_sitemap)
            ],
        }

    # -- efficiency score -----------------------------------------------------
    @staticmethod
    def calculate_efficiency_score(total_waste_pct: float) -> int:
        """Calculate crawl efficiency score: 100 - waste%, capped at [0, 100]."""
        score = int(100 - total_waste_pct)
        return max(0, min(100, score))

    # -- recommendations ------------------------------------------------------
    def generate_recommendations(
        self,
        waste: list[CrawlWaste],
        orphans: dict[str, list[OrphanPage]],
        bot_profiles: dict[str, BotProfile],
    ) -> list[CrawlRecommendation]:
        """Generate prioritized crawl budget optimization recommendations.

        Priority is derived from each waste category's share of total
        requests, orphan counts relative to sitemap size, and per-bot
        4xx/5xx error rates. Output is sorted critical -> low.
        """
        recs: list[CrawlRecommendation] = []
        # Waste-based recommendations
        for w in waste:
            if w.pct_of_total > 5.0:
                priority = "critical"
            elif w.pct_of_total > 2.0:
                priority = "high"
            elif w.pct_of_total > 0.5:
                priority = "medium"
            else:
                priority = "low"
            if w.waste_type == "parameter_urls" and w.count > 0:
                recs.append(CrawlRecommendation(
                    category="URL Parameters",
                    priority=priority,
                    action="robots.txt에 parameter URL 패턴 Disallow 규칙 추가",
                    impact=f"크롤 요청 {w.pct_of_total:.1f}% 절감 가능",
                    details=(
                        f"{w.count:,}건의 parameter URL이 크롤링되었습니다. "
                        f"sort, filter, utm_* 등 불필요한 파라미터를 차단하세요."
                    ),
                ))
            elif w.waste_type == "redirect_chains" and w.count > 0:
                recs.append(CrawlRecommendation(
                    category="Redirect Chains",
                    priority=priority,
                    action="리다이렉트 체인을 단축하고 내부 링크 업데이트",
                    impact=f"크롤 요청 {w.pct_of_total:.1f}% 절감 가능",
                    details=(
                        f"{w.count:,}건의 리다이렉트 요청이 발생했습니다. "
                        f"내부 링크를 최종 URL로 직접 연결하세요."
                    ),
                ))
            elif w.waste_type == "soft_404s" and w.count > 0:
                recs.append(CrawlRecommendation(
                    category="Soft 404s",
                    priority=priority,
                    action="Soft 404 페이지에 적절한 HTTP 상태 코드 또는 noindex 적용",
                    impact=f"크롤 요청 {w.pct_of_total:.1f}% 절감 가능",
                    details=(
                        f"{w.count:,}건의 Soft 404가 감지되었습니다. "
                        f"적절한 404 응답 또는 noindex meta 태그를 설정하세요."
                    ),
                ))
            elif w.waste_type == "duplicate_urls" and w.count > 0:
                recs.append(CrawlRecommendation(
                    category="Duplicate URLs",
                    priority=priority,
                    action="URL 정규화 및 canonical 태그 설정",
                    impact=f"크롤 요청 {w.pct_of_total:.1f}% 절감 가능",
                    details=(
                        f"{w.count:,}건의 중복 URL 변형이 크롤링되었습니다. "
                        f"www/non-www, trailing slash 통합을 진행하세요."
                    ),
                ))
        # Orphan page recommendations
        not_crawled = orphans.get("in_sitemap_not_crawled", [])
        not_in_sitemap = orphans.get("crawled_not_in_sitemap", [])
        if len(not_crawled) > 0:
            # max(..., 1) guards division when no sitemap was loaded.
            pct = len(not_crawled) / max(len(self._sitemap_urls), 1) * 100
            priority = "critical" if pct > 30 else "high" if pct > 10 else "medium"
            recs.append(CrawlRecommendation(
                category="Orphan Pages (Uncrawled)",
                priority=priority,
                action="사이트맵에 있으나 크롤링되지 않은 페이지의 내부 링크 강화",
                impact=f"사이트맵 URL의 {pct:.1f}%가 미크롤 상태",
                details=(
                    f"{len(not_crawled):,}개 URL이 사이트맵에 있지만 "
                    f"봇이 크롤링하지 않았습니다. 내부 링크를 추가하세요."
                ),
            ))
        if len(not_in_sitemap) > 0:
            recs.append(CrawlRecommendation(
                category="Orphan Pages (Unlisted)",
                priority="medium",
                action="크롤링되었으나 사이트맵에 없는 페이지를 사이트맵에 추가 또는 차단",
                impact=f"{len(not_in_sitemap):,}개 URL이 사이트맵에 미등록",
                details=(
                    f"봇이 크롤링한 {len(not_in_sitemap):,}개 URL이 "
                    f"사이트맵에 포함되어 있지 않습니다. 유효한 페이지는 "
                    f"사이트맵에 추가하고, 불필요한 페이지는 robots.txt로 차단하세요."
                ),
            ))
        # Bot-specific recommendations: flag bots burning budget on errors.
        for name, profile in bot_profiles.items():
            error_count = sum(
                v for k, v in profile.status_breakdown.items()
                if k.startswith("4") or k.startswith("5")
            )
            error_pct = (error_count / profile.total_requests * 100) if profile.total_requests else 0
            if error_pct > 10:
                recs.append(CrawlRecommendation(
                    category=f"Bot Errors ({name})",
                    priority="high" if error_pct > 20 else "medium",
                    action=f"{name}의 4xx/5xx 오류율 {error_pct:.1f}% 개선 필요",
                    impact=f"{name} 크롤 예산의 {error_pct:.1f}%가 오류에 소비",
                    details=(
                        f"{name}이(가) {error_count:,}건의 오류 응답을 받았습니다. "
                        f"깨진 링크를 수정하고 서버 안정성을 개선하세요."
                    ),
                ))
        # Sort by priority (critical first); unknown priorities sink to the end.
        priority_order = {"critical": 0, "high": 1, "medium": 2, "low": 3}
        recs.sort(key=lambda r: priority_order.get(r.priority, 4))
        return recs

    # -- orchestrator ---------------------------------------------------------
    def analyze(self, scope: str = "all") -> CrawlBudgetResult:
        """Orchestrate the full crawl budget analysis.

        Args:
            scope: "all" (default), or one of "waste", "orphans", "bots"
                to restrict the analysis to a single stage.
        """
        # Load log data
        entries = self.load_log_data(self.log_file)
        if not entries:
            logger.warning("No bot entries found in log file.")
        # Load sitemap if provided
        if self.sitemap_url:
            self.load_sitemap_urls(self.sitemap_url)
        # Profile bots
        bot_profiles: dict[str, BotProfile] = {}
        if scope in ("all", "bots"):
            bot_profiles = self.profile_bots(entries)
        # Identify waste. NOTE: a URL can fall into several categories, so
        # the summed percentage can overcount; the efficiency score clamps.
        waste: list[CrawlWaste] = []
        if scope in ("all", "waste"):
            waste.append(self.identify_parameter_waste(entries))
            waste.append(self.identify_redirect_chains(entries))
            waste.append(self.identify_soft_404s(entries))
            waste.append(self.identify_duplicate_crawls(entries))
        total_waste_pct = sum(w.pct_of_total for w in waste)
        # Detect orphan pages
        orphans: dict[str, list[OrphanPage]] = {
            "in_sitemap_not_crawled": [],
            "crawled_not_in_sitemap": [],
        }
        if scope in ("all", "orphans") and self._sitemap_urls:
            crawled_urls: set[str] = set()
            for entry, _ in entries:
                # Build full URL from the log path for sitemap comparison
                if self.target_url:
                    parsed_target = urlparse(self.target_url)
                    full = f"{parsed_target.scheme}://{parsed_target.netloc}{entry.url}"
                    crawled_urls.add(self._normalize_url(full))
                else:
                    # NOTE(review): without --url, log entries (presumably
                    # bare paths) are compared as-is and will likely never
                    # match absolute sitemap URLs — pass target_url for
                    # reliable orphan detection.
                    crawled_urls.add(entry.url)
            orphans = self.detect_orphan_pages(crawled_urls, self._sitemap_urls)
        # Efficiency score
        efficiency_score = self.calculate_efficiency_score(total_waste_pct)
        # Recommendations
        recommendations = self.generate_recommendations(waste, orphans, bot_profiles)
        # Date range covered by the log entries (if timestamps are present)
        timestamps = [e.timestamp for e, _ in entries if e.timestamp]
        analysis_period = {}
        if timestamps:
            analysis_period = {
                "from": min(timestamps).strftime("%Y-%m-%d"),
                "to": max(timestamps).strftime("%Y-%m-%d"),
            }
        return CrawlBudgetResult(
            log_file=self.log_file,
            analysis_period=analysis_period,
            total_bot_requests=len(entries),
            bots=bot_profiles,
            waste=waste,
            total_waste_pct=total_waste_pct,
            orphan_pages=orphans,
            recommendations=recommendations,
            efficiency_score=efficiency_score,
            timestamp=datetime.now().isoformat(),
        )
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main() -> None:
    """CLI entry point: parse arguments, run the analysis, emit the report."""
    cli = argparse.ArgumentParser(
        description="Analyze crawl budget efficiency and generate optimization recommendations.",
    )
    cli.add_argument(
        "--log-file",
        required=True,
        help="Path to server access log file",
    )
    cli.add_argument(
        "--sitemap",
        default=None,
        help="URL of XML sitemap for orphan page detection",
    )
    cli.add_argument(
        "--url",
        default=None,
        help="Target website URL (used for URL normalization and Ahrefs)",
    )
    cli.add_argument(
        "--scope",
        choices=["all", "waste", "orphans", "bots"],
        default="all",
        help="Analysis scope (default: all)",
    )
    cli.add_argument(
        "--ahrefs",
        action="store_true",
        help="Include Ahrefs page history comparison (requires MCP tool)",
    )
    cli.add_argument(
        "--json",
        action="store_true",
        help="Output in JSON format",
    )
    cli.add_argument(
        "--output",
        default=None,
        help="Write output to file instead of stdout",
    )
    opts = cli.parse_args()

    # Bail out early if the log file does not exist.
    if not Path(opts.log_file).exists():
        logger.error(f"Log file not found: {opts.log_file}")
        sys.exit(1)

    analyzer = CrawlBudgetAnalyzer(
        log_file=opts.log_file,
        sitemap_url=opts.sitemap,
        target_url=opts.url,
    )
    result = analyzer.analyze(scope=opts.scope)

    # Render as JSON or as the plain-text report.
    if opts.json:
        rendered = json.dumps(result.to_dict(), indent=2, ensure_ascii=False)
    else:
        rendered = "\n".join(_format_text_report(result))

    # Write to the requested file, or print to stdout.
    if opts.output:
        Path(opts.output).write_text(rendered, encoding="utf-8")
        logger.info(f"Output written to {opts.output}")
    else:
        print(rendered)
def _format_text_report(result: CrawlBudgetResult) -> list[str]:
"""Format the analysis result as a human-readable text report."""
lines = [
"=" * 70,
"Crawl Budget Analysis Report",
"=" * 70,
f"Log File: {result.log_file}",
f"Total Bot Requests: {result.total_bot_requests:,}",
f"Efficiency Score: {result.efficiency_score}/100",
f"Total Waste: {result.total_waste_pct:.1f}%",
]
if result.analysis_period:
lines.append(
f"Period: {result.analysis_period.get('from', 'N/A')} ~ "
f"{result.analysis_period.get('to', 'N/A')}"
)
lines.append("")
# Bot profiles
if result.bots:
lines.append("-" * 60)
lines.append("Bot Profiles")
lines.append("-" * 60)
for name, profile in sorted(result.bots.items(), key=lambda x: -x[1].total_requests):
lines.append(f"\n [{name.upper()}]")
lines.append(f" Requests: {profile.total_requests:,}")
lines.append(f" Unique URLs: {profile.unique_urls:,}")
lines.append(f" Requests/Day: {profile.requests_per_day:,.1f}")
lines.append(f" Days Active: {profile.days_active}")
lines.append(f" Peak Hours: {profile.peak_hours}")
lines.append(f" Status: {profile.status_breakdown}")
lines.append("")
# Waste breakdown
if result.waste:
lines.append("-" * 60)
lines.append("Crawl Waste Breakdown")
lines.append("-" * 60)
for w in result.waste:
if w.count > 0:
lines.append(f"\n [{w.waste_type}]")
lines.append(f" Count: {w.count:,} ({w.pct_of_total:.1f}%)")
lines.append(f" Recommendation: {w.recommendation}")
if w.urls:
lines.append(f" Sample URLs:")
for u in w.urls[:5]:
lines.append(f" - {u}")
lines.append("")
# Orphan pages
not_crawled = result.orphan_pages.get("in_sitemap_not_crawled", [])
not_in_sitemap = result.orphan_pages.get("crawled_not_in_sitemap", [])
if not_crawled or not_in_sitemap:
lines.append("-" * 60)
lines.append("Orphan Pages")
lines.append("-" * 60)
if not_crawled:
lines.append(f"\n In Sitemap but Not Crawled: {len(not_crawled):,}")
for op in not_crawled[:10]:
lines.append(f" - {op.url}")
if not_in_sitemap:
lines.append(f"\n Crawled but Not in Sitemap: {len(not_in_sitemap):,}")
for op in not_in_sitemap[:10]:
lines.append(f" - {op.url}")
lines.append("")
# Recommendations
if result.recommendations:
lines.append("-" * 60)
lines.append("Recommendations")
lines.append("-" * 60)
for i, rec in enumerate(result.recommendations, 1):
lines.append(f"\n {i}. [{rec.priority.upper()}] {rec.category}")
lines.append(f" Action: {rec.action}")
lines.append(f" Impact: {rec.impact}")
lines.append(f" Details: {rec.details}")
lines.append("")
lines.append(f"Generated: {result.timestamp}")
return lines
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()