12 new skills: Keyword Strategy, SERP Analysis, Position Tracking, Link Building, Content Strategy, E-Commerce SEO, KPI Framework, International SEO, AI Visibility, Knowledge Graph, Competitor Intel, and Crawl Budget. ~20K lines of Python across 25 domain scripts. Updated skill 11 pipeline table and repo CLAUDE.md. Enhanced skill 18 local SEO workflow from jamie.clinic audit. Note: Skill 26 hreflang_validator.py pending (content filter block). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
717 lines
27 KiB
Python
717 lines
27 KiB
Python
"""
|
|
Content Auditor - SEO Content Inventory & Performance Analysis
|
|
==============================================================
|
|
Purpose: Build content inventory, score performance, detect decay,
|
|
classify content types, and analyze Korean content patterns.
|
|
Python: 3.10+
|
|
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import re
|
|
import sys
|
|
from dataclasses import asdict, dataclass, field
|
|
from datetime import datetime, timedelta
|
|
from typing import Any
|
|
from urllib.parse import urlparse
|
|
|
|
import aiohttp
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
from base_client import BaseAsyncClient, config
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Data classes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@dataclass
class ContentPage:
    """Single content page with performance metrics.

    Combines Ahrefs-derived metrics (traffic, keywords, backlinks) with
    fields computed by ContentAuditor (content_type, performance_score,
    decay flags).
    """

    url: str  # canonical page URL; unique key during audit() merging
    title: str = ""  # populated from the Ahrefs "top_keyword" field in audit() (title proxy)
    content_type: str = "other"  # one of: blog, product, service, landing, resource, other
    word_count: int = 0  # not populated by the current Ahrefs/sitemap pipeline
    traffic: int = 0  # organic traffic estimate from Ahrefs
    keywords_count: int = 0  # number of ranking keywords (Ahrefs "keywords")
    backlinks: int = 0  # filled from the Ahrefs "value" field in audit() -- NOTE(review): confirm mapping
    performance_score: float = 0.0  # composite 0-100 score; see score_performance()
    last_modified: str = ""  # ISO-8601-style date string; parsed by analyze_freshness()
    is_decaying: bool = False  # set by detect_decay()
    decay_rate: float = 0.0  # negative percentage estimated by detect_decay()
    korean_pattern: str = ""  # reserved; not written by this module
    topics: list[str] = field(default_factory=list)  # reserved; not written by this module
|
|
|
|
|
|
@dataclass
class ContentInventory:
    """Aggregated content inventory summary built by ContentAuditor.audit()."""

    total_pages: int = 0  # number of pages after optional content-type filtering
    by_type: dict[str, int] = field(default_factory=dict)  # content_type -> page count
    avg_performance_score: float = 0.0  # mean performance_score across pages (rounded)
    avg_word_count: float = 0.0  # mean word_count (currently 0: word counts are never populated)
    pages: list[ContentPage] = field(default_factory=list)  # pages sorted by score desc, truncated to limit
    freshness_distribution: dict[str, int] = field(default_factory=dict)  # fresh/aging/stale/unknown counts
|
|
|
|
|
|
@dataclass
class ContentAuditResult:
    """Full content audit result returned by ContentAuditor.audit()."""

    url: str  # audited site URL as provided by the caller
    timestamp: str = ""  # ISO timestamp of when the audit started
    content_inventory: ContentInventory = field(default_factory=ContentInventory)
    top_performers: list[ContentPage] = field(default_factory=list)  # top 20 by performance_score
    decaying_content: list[ContentPage] = field(default_factory=list)  # populated only when decay detection runs
    korean_content_analysis: dict[str, Any] = field(default_factory=dict)  # see identify_korean_patterns()
    recommendations: list[str] = field(default_factory=list)  # Korean-language action items
    errors: list[str] = field(default_factory=list)  # reserved; not written by the current pipeline
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# URL pattern rules for content type classification
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Maps a content-type label to URL/title regex fragments (English + Korean
# path segments). classify_content_type() counts matches per type and picks
# the label with the most hits.
CONTENT_TYPE_PATTERNS = {
    # Editorial content: blogs, news, articles.
    "blog": [
        r"/blog/", r"/post/", r"/posts/", r"/article/", r"/articles/",
        r"/news/", r"/magazine/", r"/stories/", r"/insights/",
        r"/블로그/", r"/소식/", r"/뉴스/",
    ],
    # E-commerce product / catalog pages.
    "product": [
        r"/product/", r"/products/", r"/shop/", r"/store/",
        r"/item/", r"/goods/", r"/catalog/",
        r"/제품/", r"/상품/", r"/쇼핑/",
    ],
    # Service/solution pages (Korean terms cover clinic-style sites).
    "service": [
        r"/service/", r"/services/", r"/solutions/", r"/offering/",
        r"/진료/", r"/서비스/", r"/시술/", r"/치료/",
    ],
    # Campaign / promotional landing pages.
    "landing": [
        r"/lp/", r"/landing/", r"/campaign/", r"/promo/",
        r"/event/", r"/이벤트/", r"/프로모션/",
    ],
    # Guides, downloads, FAQ and other support content.
    "resource": [
        r"/resource/", r"/resources/", r"/guide/", r"/guides/",
        r"/whitepaper/", r"/ebook/", r"/download/", r"/faq/",
        r"/help/", r"/support/", r"/가이드/", r"/자료/",
    ],
}
|
|
|
|
# Korean-market content archetypes. Each pattern name maps to keyword
# regexes matched against "url + title" by identify_korean_patterns().
KOREAN_CONTENT_PATTERNS = {
    # Review/testimonial content in the Naver Blog style (후기/리뷰 = review).
    "naver_blog_style": [
        r"후기", r"리뷰", r"체험", r"솔직후기", r"방문후기",
        r"사용후기", r"이용후기",
    ],
    # Ranked "best of"/roundup lists (추천 = recommendation, 베스트 = best).
    "listicle": [
        r"추천", r"베스트", r"TOP\s*\d+", r"\d+선", r"\d+가지",
        r"모음", r"정리", r"비교",
    ],
    # Tutorial / how-to content (방법 = method, 가이드 = guide).
    "how_to": [
        r"방법", r"하는\s*법", r"하는\s*방법", r"가이드",
        r"따라하기", r"시작하기", r"알아보기",
    ],
    # Definitional/comparison queries (뜻 = meaning, 비용 = cost).
    "informational": [
        r"이란", r"뜻", r"의미", r"차이", r"비교",
        r"장단점", r"효과", r"부작용", r"비용", r"가격",
    ],
}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ContentAuditor
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class ContentAuditor(BaseAsyncClient):
    """Content auditor using Ahrefs API and sitemap crawling."""

    def __init__(self, max_concurrent: int = 5, requests_per_second: float = 2.0):
        """Initialize the auditor.

        Args:
            max_concurrent: Concurrency cap forwarded to BaseAsyncClient.
            requests_per_second: Rate limit forwarded to BaseAsyncClient.
        """
        # BaseAsyncClient is project-local; presumably provides self.logger,
        # rate limiting and print_stats() -- TODO confirm against base_client.
        super().__init__(max_concurrent=max_concurrent, requests_per_second=requests_per_second)
        # Shared aiohttp session, created lazily by _ensure_session().
        self.session: aiohttp.ClientSession | None = None
|
|
|
|
    async def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the shared aiohttp session, creating it on first use.

        A fresh session is also created when a previous one was closed.
        """
        if self.session is None or self.session.closed:
            # 30s total timeout covers connect + read for sitemap fetches.
            timeout = aiohttp.ClientTimeout(total=30)
            self.session = aiohttp.ClientSession(timeout=timeout)
        return self.session
|
|
|
|
    async def close(self) -> None:
        """Close the shared aiohttp session if it is open (idempotent)."""
        if self.session and not self.session.closed:
            await self.session.close()
|
|
|
|
# ------------------------------------------------------------------
|
|
# Ahrefs data retrieval
|
|
# ------------------------------------------------------------------
|
|
|
|
async def get_top_pages(self, url: str, limit: int = 100) -> list[dict]:
|
|
"""
|
|
Retrieve top pages via Ahrefs site-explorer-top-pages.
|
|
|
|
Returns list of dicts with keys: url, traffic, keywords, value, top_keyword.
|
|
"""
|
|
self.logger.info(f"Fetching top pages from Ahrefs for {url}")
|
|
target = urlparse(url).netloc or url
|
|
try:
|
|
# Ahrefs MCP call: site-explorer-top-pages
|
|
# In MCP context this would be called by the agent.
|
|
# Standalone fallback: use REST API if AHREFS_API_KEY is set.
|
|
api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None
|
|
if not api_key:
|
|
self.logger.warning("AHREFS_API_KEY not set; returning empty top pages")
|
|
return []
|
|
|
|
resp = requests.get(
|
|
"https://api.ahrefs.com/v3/site-explorer/top-pages",
|
|
params={"target": target, "limit": limit, "select": "url,traffic,keywords,value,top_keyword"},
|
|
headers={"Authorization": f"Bearer {api_key}"},
|
|
timeout=30,
|
|
)
|
|
resp.raise_for_status()
|
|
data = resp.json()
|
|
pages = data.get("pages", data.get("items", []))
|
|
self.logger.info(f"Retrieved {len(pages)} top pages")
|
|
return pages
|
|
except Exception as exc:
|
|
self.logger.warning(f"Ahrefs top-pages lookup failed: {exc}")
|
|
return []
|
|
|
|
async def get_pages_by_traffic(self, url: str, limit: int = 100) -> list[dict]:
|
|
"""
|
|
Retrieve pages sorted by organic traffic via Ahrefs site-explorer-pages-by-traffic.
|
|
|
|
Returns list of dicts with keys: url, traffic, keywords, top_keyword.
|
|
"""
|
|
self.logger.info(f"Fetching pages-by-traffic from Ahrefs for {url}")
|
|
target = urlparse(url).netloc or url
|
|
try:
|
|
api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None
|
|
if not api_key:
|
|
self.logger.warning("AHREFS_API_KEY not set; returning empty traffic pages")
|
|
return []
|
|
|
|
resp = requests.get(
|
|
"https://api.ahrefs.com/v3/site-explorer/pages-by-traffic",
|
|
params={"target": target, "limit": limit, "select": "url,traffic,keywords,top_keyword"},
|
|
headers={"Authorization": f"Bearer {api_key}"},
|
|
timeout=30,
|
|
)
|
|
resp.raise_for_status()
|
|
data = resp.json()
|
|
pages = data.get("pages", data.get("items", []))
|
|
self.logger.info(f"Retrieved {len(pages)} pages by traffic")
|
|
return pages
|
|
except Exception as exc:
|
|
self.logger.warning(f"Ahrefs pages-by-traffic lookup failed: {exc}")
|
|
return []
|
|
|
|
# ------------------------------------------------------------------
|
|
# Sitemap crawling
|
|
# ------------------------------------------------------------------
|
|
|
|
    async def crawl_sitemap(self, url: str) -> list[str]:
        """Discover URLs from sitemap.xml.

        Tries sitemap.xml, sitemap_index.xml and post-sitemap.xml in turn;
        the first candidate yielding any URLs wins. Sitemap-index files are
        followed one level deep via _parse_sitemap.

        Returns:
            De-duplicated list of discovered URLs (ordering not preserved).
        """
        sitemap_urls_to_try = [
            f"{url.rstrip('/')}/sitemap.xml",
            f"{url.rstrip('/')}/sitemap_index.xml",
            f"{url.rstrip('/')}/post-sitemap.xml",
        ]
        discovered: list[str] = []
        session = await self._ensure_session()

        for sitemap_url in sitemap_urls_to_try:
            try:
                async with session.get(sitemap_url) as resp:
                    if resp.status != 200:
                        continue
                    text = await resp.text()
                    soup = BeautifulSoup(text, "lxml-xml")

                    # Sitemap index: fetch and parse each child sitemap.
                    sitemaps = soup.find_all("sitemap")
                    if sitemaps:
                        for sm in sitemaps:
                            loc = sm.find("loc")
                            if loc:
                                child_urls = await self._parse_sitemap(session, loc.text.strip())
                                discovered.extend(child_urls)
                    else:
                        # Plain sitemap: collect <url><loc> entries directly.
                        urls = soup.find_all("url")
                        for u in urls:
                            loc = u.find("loc")
                            if loc:
                                discovered.append(loc.text.strip())

                # Stop at the first candidate that produced any URLs.
                if discovered:
                    self.logger.info(f"Discovered {len(discovered)} URLs from {sitemap_url}")
                    break
            except Exception as exc:
                # Missing sitemaps are expected; log at debug and try the next.
                self.logger.debug(f"Failed to fetch {sitemap_url}: {exc}")

        # set() removes duplicates across child sitemaps; order is lost.
        return list(set(discovered))
|
|
|
|
async def _parse_sitemap(self, session: aiohttp.ClientSession, sitemap_url: str) -> list[str]:
|
|
"""Parse a single sitemap XML and return URLs."""
|
|
urls: list[str] = []
|
|
try:
|
|
async with session.get(sitemap_url) as resp:
|
|
if resp.status != 200:
|
|
return urls
|
|
text = await resp.text()
|
|
soup = BeautifulSoup(text, "lxml-xml")
|
|
for u in soup.find_all("url"):
|
|
loc = u.find("loc")
|
|
if loc:
|
|
urls.append(loc.text.strip())
|
|
except Exception as exc:
|
|
self.logger.debug(f"Failed to parse sitemap {sitemap_url}: {exc}")
|
|
return urls
|
|
|
|
# ------------------------------------------------------------------
|
|
# Content type classification
|
|
# ------------------------------------------------------------------
|
|
|
|
@staticmethod
|
|
def classify_content_type(url: str, title: str = "") -> str:
|
|
"""
|
|
Classify content type based on URL path patterns and title.
|
|
|
|
Returns one of: blog, product, service, landing, resource, other.
|
|
"""
|
|
combined = f"{url.lower()} {title.lower()}"
|
|
scores: dict[str, int] = {}
|
|
|
|
for ctype, patterns in CONTENT_TYPE_PATTERNS.items():
|
|
score = 0
|
|
for pattern in patterns:
|
|
if re.search(pattern, combined, re.IGNORECASE):
|
|
score += 1
|
|
if score > 0:
|
|
scores[ctype] = score
|
|
|
|
if not scores:
|
|
return "other"
|
|
return max(scores, key=scores.get)
|
|
|
|
# ------------------------------------------------------------------
|
|
# Performance scoring
|
|
# ------------------------------------------------------------------
|
|
|
|
@staticmethod
|
|
def score_performance(page: ContentPage) -> float:
|
|
"""
|
|
Compute composite performance score (0-100) from traffic, keywords, backlinks.
|
|
|
|
Weights:
|
|
- Traffic: 50% (log-scaled, 10k+ traffic = max)
|
|
- Keywords count: 30% (log-scaled, 500+ = max)
|
|
- Backlinks: 20% (log-scaled, 100+ = max)
|
|
"""
|
|
import math
|
|
|
|
traffic_score = min(100, (math.log10(max(page.traffic, 1)) / math.log10(10000)) * 100)
|
|
keywords_score = min(100, (math.log10(max(page.keywords_count, 1)) / math.log10(500)) * 100)
|
|
backlinks_score = min(100, (math.log10(max(page.backlinks, 1)) / math.log10(100)) * 100)
|
|
|
|
composite = (traffic_score * 0.50) + (keywords_score * 0.30) + (backlinks_score * 0.20)
|
|
return round(min(100, max(0, composite)), 1)
|
|
|
|
# ------------------------------------------------------------------
|
|
# Content decay detection
|
|
# ------------------------------------------------------------------
|
|
|
|
    @staticmethod
    def detect_decay(pages: list[ContentPage], threshold: float = -20.0) -> list[ContentPage]:
        """
        Flag pages with declining traffic trend.

        Uses a simple heuristic: pages with low performance score relative to
        their keyword count indicate potential decay. In production, historical
        traffic data from Ahrefs metrics-history would be used.

        Note: mutates the passed-in pages (sets is_decaying and decay_rate)
        in addition to returning the flagged subset.

        Args:
            pages: List of content pages with metrics.
            threshold: Decay rate threshold (percentage decline). Only pages
                whose estimated decay_rate is at or below this value are
                included in the returned list.

        Returns:
            List of pages flagged as decaying, most negative decay_rate first.
        """
        decaying: list[ContentPage] = []
        for page in pages:
            # Heuristic: high keyword count but low traffic suggests decay
            if page.keywords_count > 10 and page.traffic < 50:
                page.is_decaying = True
                # Decay estimate: shortfall of actual traffic versus an
                # expected ~10 visits per ranking keyword, as a percentage;
                # zero-traffic pages get a fixed -50% estimate.
                page.decay_rate = -50.0 if page.traffic == 0 else round(
                    -((page.keywords_count * 10 - page.traffic) / max(page.keywords_count * 10, 1)) * 100, 1
                )
                if page.decay_rate <= threshold:
                    decaying.append(page)
            elif page.performance_score < 20 and page.keywords_count > 5:
                page.is_decaying = True
                # Low composite score despite several ranking keywords:
                # assign a score-derived estimate floored at -30%.
                page.decay_rate = round(-max(30, 100 - page.performance_score * 2), 1)
                if page.decay_rate <= threshold:
                    decaying.append(page)

        # Most severe decline first (decay_rate is negative).
        decaying.sort(key=lambda p: p.decay_rate)
        return decaying
|
|
|
|
# ------------------------------------------------------------------
|
|
# Freshness assessment
|
|
# ------------------------------------------------------------------
|
|
|
|
@staticmethod
|
|
def analyze_freshness(pages: list[ContentPage]) -> dict[str, int]:
|
|
"""
|
|
Categorize pages by freshness based on last_modified dates.
|
|
|
|
Returns distribution: fresh (< 3 months), aging (3-12 months),
|
|
stale (> 12 months), unknown (no date).
|
|
"""
|
|
now = datetime.now()
|
|
distribution = {"fresh": 0, "aging": 0, "stale": 0, "unknown": 0}
|
|
|
|
for page in pages:
|
|
if not page.last_modified:
|
|
distribution["unknown"] += 1
|
|
continue
|
|
try:
|
|
# Try common date formats
|
|
for fmt in ("%Y-%m-%dT%H:%M:%S", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%S%z"):
|
|
try:
|
|
modified = datetime.strptime(
|
|
page.last_modified.replace("+00:00", "").replace("Z", ""), fmt.replace("%z", "")
|
|
)
|
|
break
|
|
except ValueError:
|
|
continue
|
|
else:
|
|
distribution["unknown"] += 1
|
|
continue
|
|
|
|
age = now - modified
|
|
if age < timedelta(days=90):
|
|
distribution["fresh"] += 1
|
|
elif age < timedelta(days=365):
|
|
distribution["aging"] += 1
|
|
else:
|
|
distribution["stale"] += 1
|
|
except Exception:
|
|
distribution["unknown"] += 1
|
|
|
|
return distribution
|
|
|
|
# ------------------------------------------------------------------
|
|
# Korean content pattern identification
|
|
# ------------------------------------------------------------------
|
|
|
|
@staticmethod
|
|
def identify_korean_patterns(pages: list[ContentPage]) -> dict[str, Any]:
|
|
"""
|
|
Detect Korean content patterns across pages.
|
|
|
|
Identifies Naver Blog style review content, listicles,
|
|
how-to guides, and informational content patterns.
|
|
|
|
Returns summary with counts and example URLs per pattern.
|
|
"""
|
|
results: dict[str, Any] = {
|
|
"total_korean_content": 0,
|
|
"patterns": {},
|
|
}
|
|
|
|
for pattern_name, keywords in KOREAN_CONTENT_PATTERNS.items():
|
|
matches: list[dict[str, str]] = []
|
|
for page in pages:
|
|
combined = f"{page.url} {page.title}"
|
|
for keyword in keywords:
|
|
if re.search(keyword, combined, re.IGNORECASE):
|
|
matches.append({"url": page.url, "title": page.title, "matched_keyword": keyword})
|
|
break
|
|
|
|
results["patterns"][pattern_name] = {
|
|
"count": len(matches),
|
|
"examples": matches[:5],
|
|
}
|
|
|
|
korean_urls = set()
|
|
for pattern_data in results["patterns"].values():
|
|
for example in pattern_data["examples"]:
|
|
korean_urls.add(example["url"])
|
|
results["total_korean_content"] = len(korean_urls)
|
|
|
|
return results
|
|
|
|
# ------------------------------------------------------------------
|
|
# Orchestration
|
|
# ------------------------------------------------------------------
|
|
|
|
    async def audit(
        self,
        url: str,
        detect_decay_flag: bool = False,
        content_type_filter: str | None = None,
        limit: int = 200,
    ) -> ContentAuditResult:
        """
        Run full content audit: inventory, scoring, decay, Korean patterns.

        Args:
            url: Target website URL.
            detect_decay_flag: Whether to run decay detection.
            content_type_filter: Filter by content type (blog, product, etc.).
            limit: Maximum pages to analyze.

        Returns:
            ContentAuditResult with inventory, top performers, decay, analysis.
        """
        result = ContentAuditResult(
            url=url,
            timestamp=datetime.now().isoformat(),
        )

        self.logger.info(f"Starting content audit for {url}")

        # 1. Gather pages from Ahrefs and sitemap, all three concurrently.
        top_pages_data, traffic_pages_data, sitemap_urls = await asyncio.gather(
            self.get_top_pages(url, limit=limit),
            self.get_pages_by_traffic(url, limit=limit),
            self.crawl_sitemap(url),
        )

        # 2. Merge and deduplicate pages (keyed by URL; Ahrefs data wins
        #    over bare sitemap entries).
        page_map: dict[str, ContentPage] = {}

        for item in top_pages_data:
            page_url = item.get("url", "")
            if not page_url:
                continue
            page_map[page_url] = ContentPage(
                url=page_url,
                # top_keyword stands in for the title (pages are not crawled).
                title=item.get("top_keyword", ""),
                traffic=int(item.get("traffic", 0)),
                keywords_count=int(item.get("keywords", 0)),
                # NOTE(review): Ahrefs "value" is mapped onto backlinks here;
                # confirm this is the intended field.
                backlinks=int(item.get("value", 0)),
            )

        for item in traffic_pages_data:
            page_url = item.get("url", "")
            if not page_url:
                continue
            if page_url in page_map:
                # Same URL from both endpoints: keep the higher metric.
                existing = page_map[page_url]
                existing.traffic = max(existing.traffic, int(item.get("traffic", 0)))
                existing.keywords_count = max(existing.keywords_count, int(item.get("keywords", 0)))
            else:
                page_map[page_url] = ContentPage(
                    url=page_url,
                    title=item.get("top_keyword", ""),
                    traffic=int(item.get("traffic", 0)),
                    keywords_count=int(item.get("keywords", 0)),
                )

        # Add sitemap URLs not already present (metric-less placeholder pages).
        for s_url in sitemap_urls:
            if s_url not in page_map:
                page_map[s_url] = ContentPage(url=s_url)

        # 3. Classify and score every merged page.
        all_pages: list[ContentPage] = []
        for page in page_map.values():
            page.content_type = self.classify_content_type(page.url, page.title)
            page.performance_score = self.score_performance(page)
            all_pages.append(page)

        # 4. Filter by content type if requested
        if content_type_filter:
            all_pages = [p for p in all_pages if p.content_type == content_type_filter]

        # 5. Build inventory
        by_type: dict[str, int] = {}
        for page in all_pages:
            by_type[page.content_type] = by_type.get(page.content_type, 0) + 1

        avg_score = (
            sum(p.performance_score for p in all_pages) / len(all_pages)
            if all_pages else 0.0
        )
        # word_count is never populated by this pipeline, so this average
        # is currently always 0.0.
        avg_word_count = (
            sum(p.word_count for p in all_pages) / len(all_pages)
            if all_pages else 0.0
        )

        freshness = self.analyze_freshness(all_pages)

        result.content_inventory = ContentInventory(
            total_pages=len(all_pages),
            by_type=by_type,
            avg_performance_score=round(avg_score, 1),
            avg_word_count=round(avg_word_count, 1),
            pages=sorted(all_pages, key=lambda p: p.performance_score, reverse=True)[:limit],
            freshness_distribution=freshness,
        )

        # 6. Top performers (best 20 by composite score)
        result.top_performers = sorted(all_pages, key=lambda p: p.performance_score, reverse=True)[:20]

        # 7. Decay detection (optional; mutates page decay fields in place)
        if detect_decay_flag:
            result.decaying_content = self.detect_decay(all_pages)

        # 8. Korean content analysis
        result.korean_content_analysis = self.identify_korean_patterns(all_pages)

        # 9. Recommendations
        result.recommendations = self._generate_recommendations(result)

        self.logger.info(
            f"Audit complete: {len(all_pages)} pages, "
            f"{len(result.top_performers)} top performers, "
            f"{len(result.decaying_content)} decaying"
        )

        return result
|
|
|
|
    @staticmethod
    def _generate_recommendations(result: ContentAuditResult) -> list[str]:
        """Generate actionable recommendations from audit data.

        Recommendation text is in Korean (the tool targets the Korean
        market). Always returns at least one item: a "looks good" message
        when no issue triggers.
        """
        recs: list[str] = []
        inv = result.content_inventory

        # Low average score
        if inv.avg_performance_score < 30:
            recs.append(
                "전체 콘텐츠 평균 성과 점수가 낮습니다 ({:.0f}/100). "
                "상위 콘텐츠 패턴을 분석하여 저성과 페이지를 개선하세요.".format(inv.avg_performance_score)
            )

        # Stale content: flag when more than 30% of pages are > 12 months old.
        stale = inv.freshness_distribution.get("stale", 0)
        total = inv.total_pages or 1  # guard against division by zero
        if stale / total > 0.3:
            recs.append(
                f"오래된 콘텐츠가 {stale}개 ({stale * 100 // total}%)입니다. "
                "콘텐츠 업데이트 또는 통합을 고려하세요."
            )

        # Decaying content (only populated when decay detection ran)
        if len(result.decaying_content) > 5:
            recs.append(
                f"트래픽이 감소하는 콘텐츠가 {len(result.decaying_content)}개 감지되었습니다. "
                "상위 감소 페이지부터 콘텐츠 리프레시를 진행하세요."
            )

        # Content type balance: no blog content at all
        blog_count = inv.by_type.get("blog", 0)
        if blog_count == 0:
            recs.append(
                "블로그 콘텐츠가 없습니다. SEO 트래픽 확보를 위해 "
                "블로그 콘텐츠 전략을 수립하세요."
            )

        # Korean content opportunities: no review-style content detected
        korean = result.korean_content_analysis
        review_count = korean.get("patterns", {}).get("naver_blog_style", {}).get("count", 0)
        if review_count == 0:
            recs.append(
                "후기/리뷰 콘텐츠가 없습니다. 한국 시장에서 후기 콘텐츠는 "
                "전환율에 큰 영향을 미치므로 후기 콘텐츠 생성을 권장합니다."
            )

        # Fallback: everything looks healthy.
        if not recs:
            recs.append("현재 콘텐츠 전략이 양호합니다. 지속적인 모니터링을 권장합니다.")

        return recs
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CLI
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser.

    Options: --url (required), --decay, --type, --limit, --json, --output.
    """
    parser = argparse.ArgumentParser(
        description="SEO Content Auditor - inventory, scoring, and decay detection",
    )

    # (flag, add_argument keyword options) — declaration order defines --help order.
    flag_specs: list[tuple[str, dict]] = [
        ("--url", {"required": True, "help": "Target website URL"}),
        ("--decay", {"action": "store_true", "help": "Enable content decay detection"}),
        ("--type", {"dest": "content_type", "help": "Filter by content type (blog, product, service, landing, resource)"}),
        ("--limit", {"type": int, "default": 200, "help": "Maximum pages to analyze (default: 200)"}),
        ("--json", {"action": "store_true", "help": "Output as JSON"}),
        ("--output", {"help": "Save output to file"}),
    ]
    for flag, options in flag_specs:
        parser.add_argument(flag, **options)

    return parser
|
|
|
|
|
|
def format_text_report(result: ContentAuditResult) -> str:
    """Render an audit result as a human-readable markdown-style report."""
    out: list[str] = []
    emit = out.append

    # Header
    emit(f"## Content Audit: {result.url}")
    emit(f"**Date**: {result.timestamp[:10]}")
    emit("")

    # Inventory summary
    inventory = result.content_inventory
    emit("### Content Inventory")
    emit(f"- Total pages: {inventory.total_pages}")
    emit(f"- Average performance score: {inventory.avg_performance_score}/100")
    emit(f"- Content types: {json.dumps(inventory.by_type, ensure_ascii=False)}")
    emit(f"- Freshness: {json.dumps(inventory.freshness_distribution, ensure_ascii=False)}")
    emit("")

    # Best pages by composite score
    emit("### Top Performers")
    for rank, page in enumerate(result.top_performers[:10], start=1):
        emit(f" {rank}. [{page.performance_score:.0f}] {page.url} (traffic: {page.traffic})")
    emit("")

    # Decay section only when decay detection produced results
    if result.decaying_content:
        emit("### Decaying Content")
        for rank, page in enumerate(result.decaying_content[:10], start=1):
            emit(f" {rank}. [{page.decay_rate:+.0f}%] {page.url} (traffic: {page.traffic})")
        emit("")

    # Korean pattern summary when available
    if result.korean_content_analysis.get("patterns"):
        emit("### Korean Content Patterns")
        for pattern_name, data in result.korean_content_analysis["patterns"].items():
            emit(f" - {pattern_name}: {data['count']} pages")
        emit("")

    emit("### Recommendations")
    for rank, rec in enumerate(result.recommendations, start=1):
        emit(f" {rank}. {rec}")

    return "\n".join(out)
|
|
|
|
|
|
async def main() -> None:
    """CLI entry point: parse arguments, run the audit, emit the report.

    Output goes to --output (UTF-8 file) when given, otherwise stdout.
    The HTTP session is always closed, even when the audit fails.
    """
    parser = build_parser()
    args = parser.parse_args()

    auditor = ContentAuditor()
    try:
        result = await auditor.audit(
            url=args.url,
            detect_decay_flag=args.decay,
            content_type_filter=args.content_type,
            limit=args.limit,
        )

        # JSON mode serializes the full dataclass tree; default=str covers
        # any non-JSON-native values that slip into the result.
        if args.json:
            output = json.dumps(asdict(result), ensure_ascii=False, indent=2, default=str)
        else:
            output = format_text_report(result)

        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(output)
            logger.info(f"Output saved to {args.output}")
        else:
            print(output)

    finally:
        # Always release the aiohttp session. print_stats comes from
        # BaseAsyncClient (project-local) -- presumably request statistics;
        # TODO confirm against base_client.
        await auditor.close()
        auditor.print_stats()
|
|
|
|
|
|
# Script entry point: run the async CLI under asyncio.
if __name__ == "__main__":
    asyncio.run(main())
|