12 new skills: Keyword Strategy, SERP Analysis, Position Tracking, Link Building, Content Strategy, E-Commerce SEO, KPI Framework, International SEO, AI Visibility, Knowledge Graph, Competitor Intel, and Crawl Budget. ~20K lines of Python across 25 domain scripts. Updated skill 11 pipeline table and repo CLAUDE.md. Enhanced skill 18 local SEO workflow from jamie.clinic audit. Note: Skill 26 hreflang_validator.py pending (content filter block). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
806 lines
31 KiB
Python
806 lines
31 KiB
Python
"""
|
|
Crawl Budget Analyzer - Identify crawl waste and generate recommendations
|
|
=========================================================================
|
|
Purpose: Analyze server access logs for crawl budget efficiency, detect waste
|
|
(parameter URLs, redirect chains, soft 404s, duplicates), find orphan
|
|
pages, profile per-bot behavior, and produce prioritized recommendations.
|
|
Python: 3.10+
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import re
|
|
import sys
|
|
from collections import Counter, defaultdict
|
|
from dataclasses import asdict, dataclass, field
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Any
|
|
from urllib.parse import parse_qs, urlparse
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
from log_parser import BotIdentification, LogEntry, LogParser
|
|
|
|
# Module-wide logging: timestamped "<time> - <LEVEL> - <message>" lines at
# INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Query parameter names whose presence marks a URL as a crawl-wasting variant
# (faceting/sorting parameters that multiply URL permutations).
WASTE_PARAMS = {"sort", "filter", "order", "orderby", "dir", "direction"}
# Matches analytics tracking parameters such as utm_source / UTM_CAMPAIGN.
TRACKING_PARAMS_RE = re.compile(r"^utm_", re.IGNORECASE)
# Primary pagination parameter name ("p" is also checked as a fallback).
PAGINATION_PARAM = "page"
# Page numbers above this threshold are treated as deep-pagination waste.
HIGH_PAGE_THRESHOLD = 5
SOFT_404_MAX_SIZE = 1024  # bytes - pages smaller than this may be soft 404s
# HTTP status codes counted as redirects when detecting redirect chains.
REDIRECT_STATUSES = {301, 302, 303, 307, 308}
# Cap on top-URL lists kept in per-bot profiles and duplicate samples.
TOP_N_URLS = 50
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Data classes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@dataclass
class CrawlWaste:
    """A category of crawl budget waste."""

    waste_type: str
    urls: list[str]
    count: int
    pct_of_total: float
    recommendation: str

    def to_dict(self) -> dict:
        """Serialize for JSON output.

        Rounds the percentage to two decimals and caps the URL sample
        at 20 entries to keep reports compact.
        """
        return dict(
            waste_type=self.waste_type,
            count=self.count,
            pct_of_total=round(self.pct_of_total, 2),
            recommendation=self.recommendation,
            sample_urls=self.urls[:20],
        )
|
|
|
|
|
|
@dataclass
|
|
class OrphanPage:
|
|
"""A page that is either in the sitemap but uncrawled, or crawled but not in sitemap."""
|
|
url: str
|
|
in_sitemap: bool
|
|
crawled: bool
|
|
last_crawl_date: str | None = None
|
|
|
|
def to_dict(self) -> dict:
|
|
return asdict(self)
|
|
|
|
|
|
@dataclass
class BotProfile:
    """Per-bot crawl behavior profile."""

    name: str
    total_requests: int = 0
    requests_per_day: float = 0.0
    crawl_depth_distribution: dict[int, int] = field(default_factory=dict)
    peak_hours: list[int] = field(default_factory=list)
    status_breakdown: dict[str, int] = field(default_factory=dict)
    top_crawled_urls: list[tuple[str, int]] = field(default_factory=list)
    unique_urls: int = 0
    days_active: int = 0

    def to_dict(self) -> dict:
        """Serialize for JSON output.

        Rounds the request rate to one decimal and expands each
        (url, count) tuple into a {"url": ..., "count": ...} mapping.
        """
        url_entries = [{"url": u, "count": c} for u, c in self.top_crawled_urls]
        return dict(
            name=self.name,
            total_requests=self.total_requests,
            requests_per_day=round(self.requests_per_day, 1),
            crawl_depth_distribution=self.crawl_depth_distribution,
            peak_hours=self.peak_hours,
            status_breakdown=self.status_breakdown,
            top_crawled_urls=url_entries,
            unique_urls=self.unique_urls,
            days_active=self.days_active,
        )
|
|
|
|
|
|
@dataclass
class CrawlRecommendation:
    """A single optimization recommendation."""

    category: str
    priority: str  # one of: critical, high, medium, low
    action: str
    impact: str
    details: str

    def to_dict(self) -> dict:
        """Serialize every declared field into a plain dict."""
        return {f.name: getattr(self, f.name) for f in fields(self)}
|
|
|
|
|
|
@dataclass
class CrawlBudgetResult:
    """Complete crawl budget analysis result."""

    log_file: str
    analysis_period: dict[str, str]
    total_bot_requests: int
    bots: dict[str, BotProfile]
    waste: list[CrawlWaste]
    total_waste_pct: float
    orphan_pages: dict[str, list[OrphanPage]]
    recommendations: list[CrawlRecommendation]
    efficiency_score: int
    timestamp: str

    def to_dict(self) -> dict:
        """Serialize the whole result tree for JSON output.

        Nested dataclasses are serialized via their own to_dict methods;
        the waste list is re-keyed by waste_type.
        """
        bot_map = {bot_name: profile.to_dict() for bot_name, profile in self.bots.items()}
        waste_map = {item.waste_type: item.to_dict() for item in self.waste}
        orphan_map = {
            bucket: [page.to_dict() for page in pages]
            for bucket, pages in self.orphan_pages.items()
        }
        return {
            "log_file": self.log_file,
            "analysis_period": self.analysis_period,
            "total_bot_requests": self.total_bot_requests,
            "bots": bot_map,
            "waste": waste_map,
            "total_waste_pct": round(self.total_waste_pct, 2),
            "orphan_pages": orphan_map,
            "recommendations": [rec.to_dict() for rec in self.recommendations],
            "efficiency_score": self.efficiency_score,
            "timestamp": self.timestamp,
        }
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CrawlBudgetAnalyzer
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class CrawlBudgetAnalyzer:
    """Analyze crawl budget efficiency from server access logs.

    Pipeline: load bot-attributed log entries via ``LogParser``, optionally
    load an XML sitemap, classify wasted crawl requests (parameter URLs,
    redirect chains, soft 404s, duplicate URL variants), profile per-bot
    behavior, detect orphan pages, and produce prioritized recommendations
    plus a 0-100 efficiency score.
    """

    def __init__(
        self,
        log_file: str,
        sitemap_url: str | None = None,
        target_url: str | None = None,
    ):
        """Store configuration; no I/O happens until analyze() is called.

        Args:
            log_file: Path to the server access log.
            sitemap_url: Optional XML sitemap URL for orphan detection.
            target_url: Optional site base URL used to rebuild absolute URLs
                from log paths when comparing against the sitemap.
        """
        self.log_file = log_file
        self.sitemap_url = sitemap_url
        self.target_url = target_url
        # Caches populated by load_log_data() / load_sitemap_urls().
        self._bot_entries: list[tuple[LogEntry, BotIdentification]] = []
        self._sitemap_urls: set[str] = set()

    # -- data loading ---------------------------------------------------------

    def load_log_data(self, log_file: str) -> list[tuple[LogEntry, BotIdentification]]:
        """Use LogParser to load all bot requests from the log file."""
        parser = LogParser(log_file=log_file, fmt="auto")
        entries = parser.parse()
        logger.info(f"Loaded {len(entries):,} bot entries from {log_file}")
        self._bot_entries = entries
        return entries

    def load_sitemap_urls(self, sitemap_url: str) -> set[str]:
        """Fetch and parse an XML sitemap, returning the set of normalized URLs.

        Handles both plain sitemaps (<url> entries) and sitemap indexes
        (<sitemap> entries pointing at child sitemaps, fetched one level
        deep). Failures are logged and yield an empty set rather than
        raising, so sitemap problems never abort the log analysis.
        """
        urls: set[str] = set()
        try:
            resp = requests.get(sitemap_url, timeout=30, headers={
                "User-Agent": "CrawlBudgetAnalyzer/1.0",
            })
            resp.raise_for_status()
            soup = BeautifulSoup(resp.content, "lxml-xml")

            # A sitemap index nests <sitemap><loc> references; recurse into
            # each child. Otherwise read <url><loc> entries directly.
            sitemap_tags = soup.find_all("sitemap")
            if sitemap_tags:
                for st in sitemap_tags:
                    loc = st.find("loc")
                    if loc and loc.text:
                        child_urls = self._fetch_sitemap_child(loc.text.strip())
                        urls.update(child_urls)
            else:
                for url_tag in soup.find_all("url"):
                    loc = url_tag.find("loc")
                    if loc and loc.text:
                        urls.add(self._normalize_url(loc.text.strip()))

            logger.info(f"Loaded {len(urls):,} URLs from sitemap: {sitemap_url}")
        except Exception as e:
            # Best-effort: report and continue with whatever was collected.
            logger.error(f"Failed to load sitemap {sitemap_url}: {e}")

        self._sitemap_urls = urls
        return urls

    def _fetch_sitemap_child(self, url: str) -> set[str]:
        """Fetch a child sitemap from a sitemap index; empty set on failure."""
        urls: set[str] = set()
        try:
            resp = requests.get(url, timeout=30, headers={
                "User-Agent": "CrawlBudgetAnalyzer/1.0",
            })
            resp.raise_for_status()
            soup = BeautifulSoup(resp.content, "lxml-xml")
            for url_tag in soup.find_all("url"):
                loc = url_tag.find("loc")
                if loc and loc.text:
                    urls.add(self._normalize_url(loc.text.strip()))
        except Exception as e:
            logger.warning(f"Failed to fetch child sitemap {url}: {e}")
        return urls

    @staticmethod
    def _normalize_url(url: str) -> str:
        """Normalize a URL by removing trailing slash and lowercasing the scheme/host."""
        parsed = urlparse(url)
        path = parsed.path.rstrip("/") or "/"
        # FIX: scheme and host are case-insensitive (RFC 3986); the previous
        # version kept the netloc's original case despite the docstring,
        # which broke sitemap-vs-log URL matching. Path case is significant
        # and is preserved.
        return f"{parsed.scheme.lower()}://{parsed.netloc.lower()}{path}"

    # -- waste identification -------------------------------------------------

    def identify_parameter_waste(
        self,
        bot_requests: list[tuple[LogEntry, BotIdentification]],
    ) -> CrawlWaste:
        """Find URLs with unnecessary query parameters wasting crawl budget.

        A request is flagged when its query string contains a known waste
        parameter (WASTE_PARAMS), a utm_* tracking parameter, or a
        pagination parameter beyond HIGH_PAGE_THRESHOLD.
        """
        waste_urls: list[str] = []
        for entry, _ in bot_requests:
            parsed = urlparse(entry.url)
            if not parsed.query:
                continue
            params = parse_qs(parsed.query)
            param_keys = {k.lower() for k in params}
            # Faceting/sorting parameters that multiply URL variants.
            has_waste = bool(param_keys & WASTE_PARAMS)
            # Analytics tracking parameters (utm_*).
            has_tracking = any(TRACKING_PARAMS_RE.match(k) for k in param_keys)
            # Deep pagination: "page" with "p" as a fallback name.
            page_val = params.get(PAGINATION_PARAM, params.get("p", [None]))
            has_deep_page = False
            if page_val and page_val[0]:
                try:
                    if int(page_val[0]) > HIGH_PAGE_THRESHOLD:
                        has_deep_page = True
                except (ValueError, TypeError):
                    # Non-numeric page values are simply ignored.
                    pass
            if has_waste or has_tracking or has_deep_page:
                waste_urls.append(entry.url)

        total = len(bot_requests)
        count = len(waste_urls)
        pct = (count / total * 100) if total else 0.0
        return CrawlWaste(
            waste_type="parameter_urls",
            urls=list(set(waste_urls)),
            count=count,
            pct_of_total=pct,
            recommendation=(
                "robots.txt에 불필요한 parameter URL 패턴을 Disallow로 추가하거나, "
                "Google Search Console의 URL Parameters 설정을 활용하세요. "
                "UTM 파라미터가 포함된 URL은 canonical 태그로 처리하세요."
            ),
        )

    def identify_redirect_chains(
        self,
        bot_requests: list[tuple[LogEntry, BotIdentification]],
    ) -> CrawlWaste:
        """Find URLs that repeatedly return redirect status codes.

        ``count`` reflects ALL redirect responses seen; the sample ``urls``
        list only URLs redirected at least twice (chain candidates).
        """
        redirect_urls: list[str] = []
        redirect_counter: Counter = Counter()
        for entry, _ in bot_requests:
            if entry.status_code in REDIRECT_STATUSES:
                redirect_counter[entry.url] += 1
                redirect_urls.append(entry.url)

        # URLs redirected more than once are chain candidates
        chain_urls = [url for url, cnt in redirect_counter.items() if cnt >= 2]
        total = len(bot_requests)
        count = len(redirect_urls)
        pct = (count / total * 100) if total else 0.0
        return CrawlWaste(
            waste_type="redirect_chains",
            urls=chain_urls,
            count=count,
            pct_of_total=pct,
            recommendation=(
                "301/302 리다이렉트가 반복적으로 크롤링되고 있습니다. "
                "내부 링크를 최종 목적지 URL로 직접 업데이트하고, "
                "리다이렉트 체인을 단일 리다이렉트로 단축하세요."
            ),
        )

    def identify_soft_404s(
        self,
        bot_requests: list[tuple[LogEntry, BotIdentification]],
    ) -> CrawlWaste:
        """Find 200-status pages with suspiciously small response sizes.

        Zero-byte responses are excluded (size 0 typically means the size
        was not logged rather than an empty page).
        """
        soft_404_urls: list[str] = []
        for entry, _ in bot_requests:
            if entry.status_code == 200 and entry.response_size < SOFT_404_MAX_SIZE:
                if entry.response_size > 0:
                    soft_404_urls.append(entry.url)

        total = len(bot_requests)
        count = len(soft_404_urls)
        pct = (count / total * 100) if total else 0.0
        return CrawlWaste(
            waste_type="soft_404s",
            urls=list(set(soft_404_urls)),
            count=count,
            pct_of_total=pct,
            recommendation=(
                "200 상태 코드를 반환하지만 콘텐츠가 거의 없는 Soft 404 페이지입니다. "
                "실제 404 상태 코드를 반환하거나, 해당 페이지에 noindex 태그를 추가하세요."
            ),
        )

    def identify_duplicate_crawls(
        self,
        bot_requests: list[tuple[LogEntry, BotIdentification]],
    ) -> CrawlWaste:
        """Find duplicate URL variants: www/non-www, trailing slash, etc.

        URLs are grouped under a canonical key (lowercased host without a
        "www." prefix, path without trailing slash); any key with more than
        one distinct variant counts as duplicated.
        """
        url_variants: dict[str, set[str]] = defaultdict(set)
        for entry, _ in bot_requests:
            parsed = urlparse(entry.url)
            # FIX: removeprefix() strips a literal leading "www." only;
            # the previous lstrip("www.") stripped any run of 'w'/'.' chars,
            # mangling hosts such as "web.example.com" -> "eb.example.com".
            host = parsed.netloc.lower().removeprefix("www.")
            path = parsed.path.rstrip("/") or "/"
            canonical = f"{host}{path}"
            full_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
            url_variants[canonical].add(full_url)

        # Identify canonicals with multiple variants
        duplicate_urls: list[str] = []
        for canonical, variants in url_variants.items():
            if len(variants) > 1:
                duplicate_urls.extend(variants)

        total = len(bot_requests)
        # Count how many requests hit duplicate variant URLs.
        # FIX: parse each URL once instead of three times per entry.
        dup_set = set(duplicate_urls)
        dup_request_count = 0
        for e, _ in bot_requests:
            p = urlparse(e.url)
            if f"{p.scheme}://{p.netloc}{p.path}" in dup_set:
                dup_request_count += 1
        pct = (dup_request_count / total * 100) if total else 0.0
        return CrawlWaste(
            waste_type="duplicate_urls",
            urls=duplicate_urls[:TOP_N_URLS],
            count=dup_request_count,
            pct_of_total=pct,
            recommendation=(
                "www/non-www, trailing slash 유무 등 중복 URL 변형이 크롤링되고 있습니다. "
                "301 리다이렉트로 canonical URL로 통합하고, "
                "rel=canonical 태그를 정확히 설정하세요."
            ),
        )

    # -- bot profiling --------------------------------------------------------

    def profile_bots(
        self,
        bot_requests: list[tuple[LogEntry, BotIdentification]],
    ) -> dict[str, BotProfile]:
        """Generate per-bot behavior profiles.

        Aggregates per bot: request volume, URL frequency, HTTP status
        breakdown, path-depth distribution, active days, and the three
        busiest hours of the day.
        """
        bot_data: dict[str, dict] = defaultdict(lambda: {
            "urls": Counter(),
            "statuses": Counter(),
            "hours": Counter(),
            "days": set(),
            "depths": Counter(),
            "count": 0,
        })

        for entry, bot in bot_requests:
            bd = bot_data[bot.name]
            bd["count"] += 1
            bd["urls"][entry.url] += 1
            bd["statuses"][str(entry.status_code)] += 1
            # URL depth = number of non-empty path segments
            depth = len([s for s in urlparse(entry.url).path.split("/") if s])
            bd["depths"][depth] += 1
            if entry.timestamp:
                bd["hours"][entry.timestamp.hour] += 1
                bd["days"].add(entry.timestamp.strftime("%Y-%m-%d"))

        profiles: dict[str, BotProfile] = {}
        for name, bd in bot_data.items():
            # Guard against logs with no parseable timestamps (0 days seen).
            days_active = len(bd["days"]) or 1
            rpd = bd["count"] / days_active
            # Top 3 peak hours by request count
            top_hours = sorted(bd["hours"].items(), key=lambda x: -x[1])[:3]
            peak = [h for h, _ in top_hours]
            profiles[name] = BotProfile(
                name=name,
                total_requests=bd["count"],
                requests_per_day=rpd,
                crawl_depth_distribution=dict(sorted(bd["depths"].items())),
                peak_hours=peak,
                status_breakdown=dict(bd["statuses"]),
                top_crawled_urls=bd["urls"].most_common(TOP_N_URLS),
                unique_urls=len(bd["urls"]),
                days_active=days_active,
            )
        return profiles

    # -- orphan detection -----------------------------------------------------

    def detect_orphan_pages(
        self,
        crawled_urls: set[str],
        sitemap_urls: set[str],
    ) -> dict[str, list[OrphanPage]]:
        """Compare crawled URLs with sitemap URLs to find orphans.

        Both inputs are expected to be normalized the same way (see
        _normalize_url) for the set differences to be meaningful.
        """
        in_sitemap_not_crawled = sitemap_urls - crawled_urls
        crawled_not_in_sitemap = crawled_urls - sitemap_urls

        return {
            "in_sitemap_not_crawled": [
                OrphanPage(url=u, in_sitemap=True, crawled=False)
                for u in sorted(in_sitemap_not_crawled)
            ],
            "crawled_not_in_sitemap": [
                OrphanPage(url=u, in_sitemap=False, crawled=True)
                for u in sorted(crawled_not_in_sitemap)
            ],
        }

    # -- efficiency score -----------------------------------------------------

    @staticmethod
    def calculate_efficiency_score(total_waste_pct: float) -> int:
        """Calculate crawl efficiency score: 100 - waste%, capped at [0, 100]."""
        score = int(100 - total_waste_pct)
        return max(0, min(100, score))

    # -- recommendations ------------------------------------------------------

    def generate_recommendations(
        self,
        waste: list[CrawlWaste],
        orphans: dict[str, list[OrphanPage]],
        bot_profiles: dict[str, BotProfile],
    ) -> list[CrawlRecommendation]:
        """Generate prioritized crawl budget optimization recommendations.

        Emits one recommendation per non-empty waste category (priority
        scaled by its share of total requests), orphan-page advice, and a
        per-bot alert when a bot's 4xx/5xx error rate exceeds 10%.
        Results are sorted critical -> high -> medium -> low.
        """
        recs: list[CrawlRecommendation] = []

        # Waste-based recommendations: priority scales with % of requests.
        for w in waste:
            if w.pct_of_total > 5.0:
                priority = "critical"
            elif w.pct_of_total > 2.0:
                priority = "high"
            elif w.pct_of_total > 0.5:
                priority = "medium"
            else:
                priority = "low"

            if w.waste_type == "parameter_urls" and w.count > 0:
                recs.append(CrawlRecommendation(
                    category="URL Parameters",
                    priority=priority,
                    action="robots.txt에 parameter URL 패턴 Disallow 규칙 추가",
                    impact=f"크롤 요청 {w.pct_of_total:.1f}% 절감 가능",
                    details=(
                        f"총 {w.count:,}건의 parameter URL이 크롤링되었습니다. "
                        f"sort, filter, utm_* 등 불필요한 파라미터를 차단하세요."
                    ),
                ))
            elif w.waste_type == "redirect_chains" and w.count > 0:
                recs.append(CrawlRecommendation(
                    category="Redirect Chains",
                    priority=priority,
                    action="리다이렉트 체인을 단축하고 내부 링크 업데이트",
                    impact=f"크롤 요청 {w.pct_of_total:.1f}% 절감 가능",
                    details=(
                        f"총 {w.count:,}건의 리다이렉트 요청이 발생했습니다. "
                        f"내부 링크를 최종 URL로 직접 연결하세요."
                    ),
                ))
            elif w.waste_type == "soft_404s" and w.count > 0:
                recs.append(CrawlRecommendation(
                    category="Soft 404s",
                    priority=priority,
                    action="Soft 404 페이지에 적절한 HTTP 상태 코드 또는 noindex 적용",
                    impact=f"크롤 요청 {w.pct_of_total:.1f}% 절감 가능",
                    details=(
                        f"총 {w.count:,}건의 Soft 404가 감지되었습니다. "
                        f"적절한 404 응답 또는 noindex meta 태그를 설정하세요."
                    ),
                ))
            elif w.waste_type == "duplicate_urls" and w.count > 0:
                recs.append(CrawlRecommendation(
                    category="Duplicate URLs",
                    priority=priority,
                    action="URL 정규화 및 canonical 태그 설정",
                    impact=f"크롤 요청 {w.pct_of_total:.1f}% 절감 가능",
                    details=(
                        f"총 {w.count:,}건의 중복 URL 변형이 크롤링되었습니다. "
                        f"www/non-www, trailing slash 통합을 진행하세요."
                    ),
                ))

        # Orphan page recommendations
        not_crawled = orphans.get("in_sitemap_not_crawled", [])
        not_in_sitemap = orphans.get("crawled_not_in_sitemap", [])

        if len(not_crawled) > 0:
            # Priority scales with the uncrawled share of the sitemap.
            pct = len(not_crawled) / max(len(self._sitemap_urls), 1) * 100
            priority = "critical" if pct > 30 else "high" if pct > 10 else "medium"
            recs.append(CrawlRecommendation(
                category="Orphan Pages (Uncrawled)",
                priority=priority,
                action="사이트맵에 있으나 크롤링되지 않은 페이지의 내부 링크 강화",
                impact=f"사이트맵 URL의 {pct:.1f}%가 미크롤 상태",
                details=(
                    f"총 {len(not_crawled):,}개 URL이 사이트맵에 있지만 "
                    f"봇이 크롤링하지 않았습니다. 내부 링크를 추가하세요."
                ),
            ))

        if len(not_in_sitemap) > 0:
            recs.append(CrawlRecommendation(
                category="Orphan Pages (Unlisted)",
                priority="medium",
                action="크롤링되었으나 사이트맵에 없는 페이지를 사이트맵에 추가 또는 차단",
                impact=f"{len(not_in_sitemap):,}개 URL이 사이트맵에 미등록",
                details=(
                    f"봇이 크롤링한 {len(not_in_sitemap):,}개 URL이 "
                    f"사이트맵에 포함되어 있지 않습니다. 유효한 페이지는 "
                    f"사이트맵에 추가하고, 불필요한 페이지는 robots.txt로 차단하세요."
                ),
            ))

        # Bot-specific recommendations: flag high 4xx/5xx error rates.
        for name, profile in bot_profiles.items():
            error_count = sum(
                v for k, v in profile.status_breakdown.items()
                if k.startswith(("4", "5"))
            )
            error_pct = (error_count / profile.total_requests * 100) if profile.total_requests else 0
            if error_pct > 10:
                recs.append(CrawlRecommendation(
                    category=f"Bot Errors ({name})",
                    priority="high" if error_pct > 20 else "medium",
                    action=f"{name}의 4xx/5xx 오류율 {error_pct:.1f}% 개선 필요",
                    impact=f"{name} 크롤 예산의 {error_pct:.1f}%가 오류에 소비",
                    details=(
                        f"{name}이(가) {error_count:,}건의 오류 응답을 받았습니다. "
                        f"깨진 링크를 수정하고 서버 안정성을 개선하세요."
                    ),
                ))

        # Sort by priority (stable, so insertion order breaks ties).
        priority_order = {"critical": 0, "high": 1, "medium": 2, "low": 3}
        recs.sort(key=lambda r: priority_order.get(r.priority, 4))
        return recs

    # -- orchestrator ---------------------------------------------------------

    def analyze(self, scope: str = "all") -> CrawlBudgetResult:
        """Orchestrate the full crawl budget analysis.

        Args:
            scope: "all", "waste", "orphans", or "bots" — limits which
                analysis stages run; recommendations and the score are
                derived from whatever stages produced data.
        """
        # Load log data
        entries = self.load_log_data(self.log_file)
        if not entries:
            logger.warning("No bot entries found in log file.")

        # Load sitemap if provided
        if self.sitemap_url:
            self.load_sitemap_urls(self.sitemap_url)

        # Profile bots
        bot_profiles: dict[str, BotProfile] = {}
        if scope in ("all", "bots"):
            bot_profiles = self.profile_bots(entries)

        # Identify waste
        waste: list[CrawlWaste] = []
        if scope in ("all", "waste"):
            waste.append(self.identify_parameter_waste(entries))
            waste.append(self.identify_redirect_chains(entries))
            waste.append(self.identify_soft_404s(entries))
            waste.append(self.identify_duplicate_crawls(entries))

        # Categories can overlap, so this is an upper bound on waste.
        total_waste_pct = sum(w.pct_of_total for w in waste)

        # Detect orphan pages (requires a loaded sitemap)
        orphans: dict[str, list[OrphanPage]] = {
            "in_sitemap_not_crawled": [],
            "crawled_not_in_sitemap": [],
        }
        if scope in ("all", "orphans") and self._sitemap_urls:
            crawled_urls: set[str] = set()
            for entry, _ in entries:
                # Rebuild absolute URLs from logged paths for comparison.
                if self.target_url:
                    parsed_target = urlparse(self.target_url)
                    full = f"{parsed_target.scheme}://{parsed_target.netloc}{entry.url}"
                    crawled_urls.add(self._normalize_url(full))
                else:
                    # NOTE(review): without target_url the logged URL is
                    # compared as-is; if the log stores bare paths this will
                    # never match absolute sitemap URLs — confirm target_url
                    # is supplied in that case.
                    crawled_urls.add(entry.url)
            orphans = self.detect_orphan_pages(crawled_urls, self._sitemap_urls)

        # Efficiency score
        efficiency_score = self.calculate_efficiency_score(total_waste_pct)

        # Recommendations
        recommendations = self.generate_recommendations(waste, orphans, bot_profiles)

        # Date range from entries
        timestamps = [e.timestamp for e, _ in entries if e.timestamp]
        analysis_period = {}
        if timestamps:
            analysis_period = {
                "from": min(timestamps).strftime("%Y-%m-%d"),
                "to": max(timestamps).strftime("%Y-%m-%d"),
            }

        return CrawlBudgetResult(
            log_file=self.log_file,
            analysis_period=analysis_period,
            total_bot_requests=len(entries),
            bots=bot_profiles,
            waste=waste,
            total_waste_pct=total_waste_pct,
            orphan_pages=orphans,
            recommendations=recommendations,
            efficiency_score=efficiency_score,
            timestamp=datetime.now().isoformat(),
        )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CLI
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def main() -> None:
    """CLI entry point: parse arguments, run the analysis, emit the report.

    Exits with status 1 when the log file does not exist. Output goes to
    stdout unless --output names a file; --json switches the report from
    plain text to JSON.
    """
    cli = argparse.ArgumentParser(
        description="Analyze crawl budget efficiency and generate optimization recommendations.",
    )
    cli.add_argument("--log-file", required=True, help="Path to server access log file")
    cli.add_argument("--sitemap", default=None, help="URL of XML sitemap for orphan page detection")
    cli.add_argument("--url", default=None, help="Target website URL (used for URL normalization and Ahrefs)")
    cli.add_argument(
        "--scope",
        choices=["all", "waste", "orphans", "bots"],
        default="all",
        help="Analysis scope (default: all)",
    )
    cli.add_argument(
        "--ahrefs",
        action="store_true",
        help="Include Ahrefs page history comparison (requires MCP tool)",
    )
    cli.add_argument("--json", action="store_true", help="Output in JSON format")
    cli.add_argument("--output", default=None, help="Write output to file instead of stdout")
    args = cli.parse_args()

    # Fail fast on a missing log file.
    if not Path(args.log_file).exists():
        logger.error(f"Log file not found: {args.log_file}")
        sys.exit(1)

    analyzer = CrawlBudgetAnalyzer(
        log_file=args.log_file,
        sitemap_url=args.sitemap,
        target_url=args.url,
    )
    result = analyzer.analyze(scope=args.scope)

    # Render either JSON or the plain-text report.
    if args.json:
        rendered = json.dumps(result.to_dict(), indent=2, ensure_ascii=False)
    else:
        rendered = "\n".join(_format_text_report(result))

    if args.output:
        Path(args.output).write_text(rendered, encoding="utf-8")
        logger.info(f"Output written to {args.output}")
    else:
        print(rendered)
|
|
|
|
|
|
def _format_text_report(result: CrawlBudgetResult) -> list[str]:
|
|
"""Format the analysis result as a human-readable text report."""
|
|
lines = [
|
|
"=" * 70,
|
|
"Crawl Budget Analysis Report",
|
|
"=" * 70,
|
|
f"Log File: {result.log_file}",
|
|
f"Total Bot Requests: {result.total_bot_requests:,}",
|
|
f"Efficiency Score: {result.efficiency_score}/100",
|
|
f"Total Waste: {result.total_waste_pct:.1f}%",
|
|
]
|
|
if result.analysis_period:
|
|
lines.append(
|
|
f"Period: {result.analysis_period.get('from', 'N/A')} ~ "
|
|
f"{result.analysis_period.get('to', 'N/A')}"
|
|
)
|
|
lines.append("")
|
|
|
|
# Bot profiles
|
|
if result.bots:
|
|
lines.append("-" * 60)
|
|
lines.append("Bot Profiles")
|
|
lines.append("-" * 60)
|
|
for name, profile in sorted(result.bots.items(), key=lambda x: -x[1].total_requests):
|
|
lines.append(f"\n [{name.upper()}]")
|
|
lines.append(f" Requests: {profile.total_requests:,}")
|
|
lines.append(f" Unique URLs: {profile.unique_urls:,}")
|
|
lines.append(f" Requests/Day: {profile.requests_per_day:,.1f}")
|
|
lines.append(f" Days Active: {profile.days_active}")
|
|
lines.append(f" Peak Hours: {profile.peak_hours}")
|
|
lines.append(f" Status: {profile.status_breakdown}")
|
|
lines.append("")
|
|
|
|
# Waste breakdown
|
|
if result.waste:
|
|
lines.append("-" * 60)
|
|
lines.append("Crawl Waste Breakdown")
|
|
lines.append("-" * 60)
|
|
for w in result.waste:
|
|
if w.count > 0:
|
|
lines.append(f"\n [{w.waste_type}]")
|
|
lines.append(f" Count: {w.count:,} ({w.pct_of_total:.1f}%)")
|
|
lines.append(f" Recommendation: {w.recommendation}")
|
|
if w.urls:
|
|
lines.append(f" Sample URLs:")
|
|
for u in w.urls[:5]:
|
|
lines.append(f" - {u}")
|
|
lines.append("")
|
|
|
|
# Orphan pages
|
|
not_crawled = result.orphan_pages.get("in_sitemap_not_crawled", [])
|
|
not_in_sitemap = result.orphan_pages.get("crawled_not_in_sitemap", [])
|
|
if not_crawled or not_in_sitemap:
|
|
lines.append("-" * 60)
|
|
lines.append("Orphan Pages")
|
|
lines.append("-" * 60)
|
|
if not_crawled:
|
|
lines.append(f"\n In Sitemap but Not Crawled: {len(not_crawled):,}")
|
|
for op in not_crawled[:10]:
|
|
lines.append(f" - {op.url}")
|
|
if not_in_sitemap:
|
|
lines.append(f"\n Crawled but Not in Sitemap: {len(not_in_sitemap):,}")
|
|
for op in not_in_sitemap[:10]:
|
|
lines.append(f" - {op.url}")
|
|
lines.append("")
|
|
|
|
# Recommendations
|
|
if result.recommendations:
|
|
lines.append("-" * 60)
|
|
lines.append("Recommendations")
|
|
lines.append("-" * 60)
|
|
for i, rec in enumerate(result.recommendations, 1):
|
|
lines.append(f"\n {i}. [{rec.priority.upper()}] {rec.category}")
|
|
lines.append(f" Action: {rec.action}")
|
|
lines.append(f" Impact: {rec.impact}")
|
|
lines.append(f" Details: {rec.details}")
|
|
|
|
lines.append("")
|
|
lines.append(f"Generated: {result.timestamp}")
|
|
return lines
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|