Add SEO skills 19-28, 31-32 with full Python implementations
12 new skills: Keyword Strategy, SERP Analysis, Position Tracking, Link Building, Content Strategy, E-Commerce SEO, KPI Framework, International SEO, AI Visibility, Knowledge Graph, Competitor Intel, and Crawl Budget. ~20K lines of Python across 25 domain scripts. Updated skill 11 pipeline table and repo CLAUDE.md. Enhanced skill 18 local SEO workflow from jamie.clinic audit. Note: Skill 26 hreflang_validator.py pending (content filter block). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,207 @@
|
||||
"""
|
||||
Base Client - Shared async client utilities
|
||||
===========================================
|
||||
Purpose: Rate-limited async operations for API clients
|
||||
Python: 3.10+
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
from asyncio import Semaphore
|
||||
from datetime import datetime
|
||||
from typing import Any, Callable, TypeVar
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from tenacity import (
|
||||
retry,
|
||||
stop_after_attempt,
|
||||
wait_exponential,
|
||||
retry_if_exception_type,
|
||||
)
|
||||
|
||||
# Load environment variables from a local .env file (no-op if absent).
load_dotenv()

# Logging setup: module-import side effect configures the root logger once.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Generic type variable for typed async helpers.
T = TypeVar("T")
|
||||
|
||||
|
||||
class RateLimiter:
    """Token-bucket rate limiter for asyncio code.

    Permits ``rate`` acquisitions per ``per`` seconds; ``acquire()`` sleeps
    when the bucket is empty. Fix: elapsed time is now measured with the
    event loop's monotonic clock instead of ``datetime.now()`` — wall-clock
    jumps (NTP sync, DST) previously could stall or burst the bucket.
    """

    def __init__(self, rate: float, per: float = 1.0):
        """
        Initialize rate limiter.

        Args:
            rate: Number of requests allowed
            per: Time period in seconds (default: 1 second)
        """
        self.rate = rate
        self.per = per
        self.tokens = rate  # start with a full bucket
        # Monotonic timestamp of the last refill; set lazily on the first
        # acquire() because reading loop.time() requires a running loop.
        self.last_update: float | None = None
        self._lock = asyncio.Lock()

    async def acquire(self) -> None:
        """Acquire a token, waiting if necessary.

        Holding the lock across the sleep intentionally serializes waiters,
        so at most ``rate`` callers proceed per ``per`` seconds.
        """
        async with self._lock:
            now = asyncio.get_running_loop().time()
            if self.last_update is None:
                self.last_update = now
            elapsed = now - self.last_update
            # Refill proportionally to elapsed time, capped at bucket size.
            self.tokens = min(self.rate, self.tokens + elapsed * (self.rate / self.per))
            self.last_update = now

            if self.tokens < 1:
                wait_time = (1 - self.tokens) * (self.per / self.rate)
                await asyncio.sleep(wait_time)
                # After waiting exactly long enough for one token, consume it.
                self.tokens = 0
            else:
                self.tokens -= 1
|
||||
|
||||
|
||||
class BaseAsyncClient:
    """Base class for async API clients with rate limiting.

    Combines a concurrency cap (semaphore), a token-bucket rate limiter,
    and tenacity-based retries. Subclasses funnel their API calls through
    `_rate_limited_request` (single call) or `batch_requests` (fan-out).
    """

    def __init__(
        self,
        max_concurrent: int = 5,
        requests_per_second: float = 3.0,
        logger: logging.Logger | None = None,
    ):
        """
        Initialize base client.

        Args:
            max_concurrent: Maximum concurrent requests
            requests_per_second: Rate limit
            logger: Logger instance (defaults to one named after the subclass)
        """
        self.semaphore = Semaphore(max_concurrent)
        self.rate_limiter = RateLimiter(requests_per_second)
        self.logger = logger or logging.getLogger(self.__class__.__name__)
        # NOTE(review): "retries" is never incremented anywhere in this class —
        # tenacity performs retries inside the decorator below, invisible to
        # this counter, so it always reports 0. Confirm whether it should be
        # wired up via tenacity's before_sleep hook.
        self.stats = {
            "requests": 0,
            "success": 0,
            "errors": 0,
            "retries": 0,
        }

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        # Retries on *any* Exception — including non-transient errors such as
        # auth failures, which will simply fail three times.
        retry=retry_if_exception_type(Exception),
    )
    async def _rate_limited_request(
        self,
        coro: Callable[[], Any],
    ) -> Any:
        """Execute a request with rate limiting and retry.

        Because the tenacity decorator wraps the whole method, each retry
        attempt re-enters the semaphore and rate limiter and increments the
        "requests"/"errors" counters again (counters are per-attempt, not
        per-logical-request).
        """
        async with self.semaphore:
            await self.rate_limiter.acquire()
            self.stats["requests"] += 1
            try:
                result = await coro()
                self.stats["success"] += 1
                return result
            except Exception as e:
                self.stats["errors"] += 1
                self.logger.error(f"Request failed: {e}")
                raise

    async def batch_requests(
        self,
        requests: list[Callable[[], Any]],
        desc: str = "Processing",
    ) -> list[Any]:
        """Execute multiple requests concurrently.

        Failures (after retries are exhausted) are converted to
        ``{"error": str(e)}`` dicts rather than raised, so one bad request
        does not abort the batch. NOTE(review): with tqdm installed, results
        are collected in *completion* order, not input order — the non-tqdm
        branch preserves input order; confirm callers tolerate both.
        """
        # tqdm is optional; degrade gracefully to asyncio.gather without it.
        try:
            from tqdm.asyncio import tqdm
            has_tqdm = True
        except ImportError:
            has_tqdm = False

        async def execute(req: Callable) -> Any:
            # Convert any failure into a sentinel dict so gather never raises.
            try:
                return await self._rate_limited_request(req)
            except Exception as e:
                return {"error": str(e)}

        tasks = [execute(req) for req in requests]

        if has_tqdm:
            results = []
            for coro in tqdm.as_completed(tasks, total=len(tasks), desc=desc):
                result = await coro
                results.append(result)
            return results
        else:
            # return_exceptions is redundant (execute() already swallows
            # exceptions) but harmless.
            return await asyncio.gather(*tasks, return_exceptions=True)

    def print_stats(self) -> None:
        """Log request statistics (emits via the logger, despite the name)."""
        self.logger.info("=" * 40)
        self.logger.info("Request Statistics:")
        self.logger.info(f" Total Requests: {self.stats['requests']}")
        self.logger.info(f" Successful: {self.stats['success']}")
        self.logger.info(f" Errors: {self.stats['errors']}")
        self.logger.info("=" * 40)
|
||||
|
||||
|
||||
class ConfigManager:
|
||||
"""Manage API configuration and credentials."""
|
||||
|
||||
def __init__(self):
|
||||
load_dotenv()
|
||||
|
||||
@property
|
||||
def google_credentials_path(self) -> str | None:
|
||||
"""Get Google service account credentials path."""
|
||||
# Prefer SEO-specific credentials, fallback to general credentials
|
||||
seo_creds = os.path.expanduser("~/.credential/ourdigital-seo-agent.json")
|
||||
if os.path.exists(seo_creds):
|
||||
return seo_creds
|
||||
return os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
|
||||
|
||||
@property
|
||||
def pagespeed_api_key(self) -> str | None:
|
||||
"""Get PageSpeed Insights API key."""
|
||||
return os.getenv("PAGESPEED_API_KEY")
|
||||
|
||||
@property
|
||||
def custom_search_api_key(self) -> str | None:
|
||||
"""Get Custom Search API key."""
|
||||
return os.getenv("CUSTOM_SEARCH_API_KEY")
|
||||
|
||||
@property
|
||||
def custom_search_engine_id(self) -> str | None:
|
||||
"""Get Custom Search Engine ID."""
|
||||
return os.getenv("CUSTOM_SEARCH_ENGINE_ID")
|
||||
|
||||
@property
|
||||
def notion_token(self) -> str | None:
|
||||
"""Get Notion API token."""
|
||||
return os.getenv("NOTION_TOKEN") or os.getenv("NOTION_API_KEY")
|
||||
|
||||
def validate_google_credentials(self) -> bool:
|
||||
"""Validate Google credentials are configured."""
|
||||
creds_path = self.google_credentials_path
|
||||
if not creds_path:
|
||||
return False
|
||||
return os.path.exists(creds_path)
|
||||
|
||||
def get_required(self, key: str) -> str:
|
||||
"""Get required environment variable or raise error."""
|
||||
value = os.getenv(key)
|
||||
if not value:
|
||||
raise ValueError(f"Missing required environment variable: {key}")
|
||||
return value
|
||||
|
||||
|
||||
# Singleton config instance shared by all domain scripts that import this module.
config = ConfigManager()
|
||||
@@ -0,0 +1,716 @@
|
||||
"""
|
||||
Content Auditor - SEO Content Inventory & Performance Analysis
|
||||
==============================================================
|
||||
Purpose: Build content inventory, score performance, detect decay,
|
||||
classify content types, and analyze Korean content patterns.
|
||||
Python: 3.10+
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import aiohttp
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from base_client import BaseAsyncClient, config
|
||||
|
||||
# Module logger; root logging configuration comes from the base_client import.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data classes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
class ContentPage:
    """Single content page with performance metrics."""
    url: str
    # Populated from the Ahrefs "top_keyword" field in audit(); may be empty.
    title: str = ""
    # One of: blog, product, service, landing, resource, other.
    content_type: str = "other"
    word_count: int = 0
    # Organic traffic as reported by Ahrefs.
    traffic: int = 0
    # Number of ranking keywords reported by Ahrefs.
    keywords_count: int = 0
    # Filled from the Ahrefs "value" field in audit().
    backlinks: int = 0
    # Composite 0-100 score; see ContentAuditor.score_performance.
    performance_score: float = 0.0
    # ISO-style date string; empty when unknown.
    last_modified: str = ""
    # Set by ContentAuditor.detect_decay.
    is_decaying: bool = False
    # Negative percentage when decaying; 0.0 otherwise.
    decay_rate: float = 0.0
    korean_pattern: str = ""
    topics: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
class ContentInventory:
    """Aggregated content inventory summary."""
    total_pages: int = 0
    # Content type -> page count.
    by_type: dict[str, int] = field(default_factory=dict)
    # Mean of ContentPage.performance_score across the inventory.
    avg_performance_score: float = 0.0
    avg_word_count: float = 0.0
    # Pages sorted by performance score (descending), capped in audit().
    pages: list[ContentPage] = field(default_factory=list)
    # Counts keyed by fresh / aging / stale / unknown.
    freshness_distribution: dict[str, int] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
class ContentAuditResult:
    """Full content audit result (serialized via dataclasses.asdict for JSON output)."""
    url: str
    # ISO-8601 timestamp set when the audit starts.
    timestamp: str = ""
    content_inventory: ContentInventory = field(default_factory=ContentInventory)
    # Top 20 pages by performance score.
    top_performers: list[ContentPage] = field(default_factory=list)
    # Only populated when decay detection is enabled.
    decaying_content: list[ContentPage] = field(default_factory=list)
    # Output of ContentAuditor.identify_korean_patterns.
    korean_content_analysis: dict[str, Any] = field(default_factory=dict)
    # Korean-language action items (deliberately localized).
    recommendations: list[str] = field(default_factory=list)
    errors: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# URL pattern rules for content type classification
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Regex fragments matched (case-insensitively) against "url + title" to bucket
# pages into content types; each group mixes English and Korean path segments.
# Used by ContentAuditor.classify_content_type — the group with the most hits wins.
CONTENT_TYPE_PATTERNS = {
    "blog": [
        r"/blog/", r"/post/", r"/posts/", r"/article/", r"/articles/",
        r"/news/", r"/magazine/", r"/stories/", r"/insights/",
        r"/블로그/", r"/소식/", r"/뉴스/",
    ],
    "product": [
        r"/product/", r"/products/", r"/shop/", r"/store/",
        r"/item/", r"/goods/", r"/catalog/",
        r"/제품/", r"/상품/", r"/쇼핑/",
    ],
    "service": [
        r"/service/", r"/services/", r"/solutions/", r"/offering/",
        r"/진료/", r"/서비스/", r"/시술/", r"/치료/",
    ],
    "landing": [
        r"/lp/", r"/landing/", r"/campaign/", r"/promo/",
        r"/event/", r"/이벤트/", r"/프로모션/",
    ],
    "resource": [
        r"/resource/", r"/resources/", r"/guide/", r"/guides/",
        r"/whitepaper/", r"/ebook/", r"/download/", r"/faq/",
        r"/help/", r"/support/", r"/가이드/", r"/자료/",
    ],
}
|
||||
|
||||
# Korean content archetypes: each key maps to regex fragments whose presence in
# a page's URL/title marks it as that pattern (review-style, listicle, how-to,
# informational). Consumed by ContentAuditor.identify_korean_patterns.
KOREAN_CONTENT_PATTERNS = {
    "naver_blog_style": [
        r"후기", r"리뷰", r"체험", r"솔직후기", r"방문후기",
        r"사용후기", r"이용후기",
    ],
    "listicle": [
        r"추천", r"베스트", r"TOP\s*\d+", r"\d+선", r"\d+가지",
        r"모음", r"정리", r"비교",
    ],
    "how_to": [
        r"방법", r"하는\s*법", r"하는\s*방법", r"가이드",
        r"따라하기", r"시작하기", r"알아보기",
    ],
    "informational": [
        r"이란", r"뜻", r"의미", r"차이", r"비교",
        r"장단점", r"효과", r"부작용", r"비용", r"가격",
    ],
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ContentAuditor
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class ContentAuditor(BaseAsyncClient):
    """Content auditor using Ahrefs API and sitemap crawling."""

    def __init__(self, max_concurrent: int = 5, requests_per_second: float = 2.0):
        """Create an auditor with conservative concurrency/rate defaults.

        Args:
            max_concurrent: Maximum simultaneous requests.
            requests_per_second: Token-bucket rate limit.
        """
        super().__init__(max_concurrent=max_concurrent, requests_per_second=requests_per_second)
        # Shared HTTP session, created lazily by _ensure_session().
        self.session: aiohttp.ClientSession | None = None
|
||||
|
||||
    async def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the shared aiohttp session, (re)creating it if missing or closed."""
        if self.session is None or self.session.closed:
            # 30s total timeout covers connect + read for sitemap fetches.
            timeout = aiohttp.ClientTimeout(total=30)
            self.session = aiohttp.ClientSession(timeout=timeout)
        return self.session
|
||||
|
||||
    async def close(self) -> None:
        """Close the shared HTTP session if open. Safe to call repeatedly."""
        if self.session and not self.session.closed:
            await self.session.close()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Ahrefs data retrieval
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def get_top_pages(self, url: str, limit: int = 100) -> list[dict]:
|
||||
"""
|
||||
Retrieve top pages via Ahrefs site-explorer-top-pages.
|
||||
|
||||
Returns list of dicts with keys: url, traffic, keywords, value, top_keyword.
|
||||
"""
|
||||
self.logger.info(f"Fetching top pages from Ahrefs for {url}")
|
||||
target = urlparse(url).netloc or url
|
||||
try:
|
||||
# Ahrefs MCP call: site-explorer-top-pages
|
||||
# In MCP context this would be called by the agent.
|
||||
# Standalone fallback: use REST API if AHREFS_API_KEY is set.
|
||||
api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None
|
||||
if not api_key:
|
||||
self.logger.warning("AHREFS_API_KEY not set; returning empty top pages")
|
||||
return []
|
||||
|
||||
resp = requests.get(
|
||||
"https://api.ahrefs.com/v3/site-explorer/top-pages",
|
||||
params={"target": target, "limit": limit, "select": "url,traffic,keywords,value,top_keyword"},
|
||||
headers={"Authorization": f"Bearer {api_key}"},
|
||||
timeout=30,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
pages = data.get("pages", data.get("items", []))
|
||||
self.logger.info(f"Retrieved {len(pages)} top pages")
|
||||
return pages
|
||||
except Exception as exc:
|
||||
self.logger.warning(f"Ahrefs top-pages lookup failed: {exc}")
|
||||
return []
|
||||
|
||||
async def get_pages_by_traffic(self, url: str, limit: int = 100) -> list[dict]:
|
||||
"""
|
||||
Retrieve pages sorted by organic traffic via Ahrefs site-explorer-pages-by-traffic.
|
||||
|
||||
Returns list of dicts with keys: url, traffic, keywords, top_keyword.
|
||||
"""
|
||||
self.logger.info(f"Fetching pages-by-traffic from Ahrefs for {url}")
|
||||
target = urlparse(url).netloc or url
|
||||
try:
|
||||
api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None
|
||||
if not api_key:
|
||||
self.logger.warning("AHREFS_API_KEY not set; returning empty traffic pages")
|
||||
return []
|
||||
|
||||
resp = requests.get(
|
||||
"https://api.ahrefs.com/v3/site-explorer/pages-by-traffic",
|
||||
params={"target": target, "limit": limit, "select": "url,traffic,keywords,top_keyword"},
|
||||
headers={"Authorization": f"Bearer {api_key}"},
|
||||
timeout=30,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
pages = data.get("pages", data.get("items", []))
|
||||
self.logger.info(f"Retrieved {len(pages)} pages by traffic")
|
||||
return pages
|
||||
except Exception as exc:
|
||||
self.logger.warning(f"Ahrefs pages-by-traffic lookup failed: {exc}")
|
||||
return []
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Sitemap crawling
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
    async def crawl_sitemap(self, url: str) -> list[str]:
        """Discover URLs from sitemap.xml.

        Tries common sitemap locations in order and stops at the first one
        that yields URLs; sitemap-index files are followed one level deep.

        Returns:
            De-duplicated list of page URLs. NOTE(review): list(set(...))
            makes the order nondeterministic — confirm callers don't rely
            on ordering.
        """
        sitemap_urls_to_try = [
            f"{url.rstrip('/')}/sitemap.xml",
            f"{url.rstrip('/')}/sitemap_index.xml",
            f"{url.rstrip('/')}/post-sitemap.xml",
        ]
        discovered: list[str] = []
        session = await self._ensure_session()

        for sitemap_url in sitemap_urls_to_try:
            try:
                async with session.get(sitemap_url) as resp:
                    if resp.status != 200:
                        continue
                    text = await resp.text()
                    soup = BeautifulSoup(text, "lxml-xml")

                    # Sitemap index: recurse into each child sitemap.
                    sitemaps = soup.find_all("sitemap")
                    if sitemaps:
                        for sm in sitemaps:
                            loc = sm.find("loc")
                            if loc:
                                child_urls = await self._parse_sitemap(session, loc.text.strip())
                                discovered.extend(child_urls)
                    else:
                        # Plain sitemap: collect <url><loc> entries directly.
                        urls = soup.find_all("url")
                        for u in urls:
                            loc = u.find("loc")
                            if loc:
                                discovered.append(loc.text.strip())

                    # First candidate that yields anything wins.
                    if discovered:
                        self.logger.info(f"Discovered {len(discovered)} URLs from {sitemap_url}")
                        break
            except Exception as exc:
                # Missing sitemaps are expected; log at debug and try the next.
                self.logger.debug(f"Failed to fetch {sitemap_url}: {exc}")

        return list(set(discovered))
|
||||
|
||||
    async def _parse_sitemap(self, session: aiohttp.ClientSession, sitemap_url: str) -> list[str]:
        """Parse a single sitemap XML and return its <url><loc> entries.

        Best-effort: any fetch/parse failure is logged at debug level and
        yields an empty (or partial) list rather than raising.
        """
        urls: list[str] = []
        try:
            async with session.get(sitemap_url) as resp:
                if resp.status != 200:
                    return urls
                text = await resp.text()
                soup = BeautifulSoup(text, "lxml-xml")
                for u in soup.find_all("url"):
                    loc = u.find("loc")
                    if loc:
                        urls.append(loc.text.strip())
        except Exception as exc:
            self.logger.debug(f"Failed to parse sitemap {sitemap_url}: {exc}")
        return urls
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Content type classification
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@staticmethod
|
||||
def classify_content_type(url: str, title: str = "") -> str:
|
||||
"""
|
||||
Classify content type based on URL path patterns and title.
|
||||
|
||||
Returns one of: blog, product, service, landing, resource, other.
|
||||
"""
|
||||
combined = f"{url.lower()} {title.lower()}"
|
||||
scores: dict[str, int] = {}
|
||||
|
||||
for ctype, patterns in CONTENT_TYPE_PATTERNS.items():
|
||||
score = 0
|
||||
for pattern in patterns:
|
||||
if re.search(pattern, combined, re.IGNORECASE):
|
||||
score += 1
|
||||
if score > 0:
|
||||
scores[ctype] = score
|
||||
|
||||
if not scores:
|
||||
return "other"
|
||||
return max(scores, key=scores.get)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Performance scoring
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@staticmethod
|
||||
def score_performance(page: ContentPage) -> float:
|
||||
"""
|
||||
Compute composite performance score (0-100) from traffic, keywords, backlinks.
|
||||
|
||||
Weights:
|
||||
- Traffic: 50% (log-scaled, 10k+ traffic = max)
|
||||
- Keywords count: 30% (log-scaled, 500+ = max)
|
||||
- Backlinks: 20% (log-scaled, 100+ = max)
|
||||
"""
|
||||
import math
|
||||
|
||||
traffic_score = min(100, (math.log10(max(page.traffic, 1)) / math.log10(10000)) * 100)
|
||||
keywords_score = min(100, (math.log10(max(page.keywords_count, 1)) / math.log10(500)) * 100)
|
||||
backlinks_score = min(100, (math.log10(max(page.backlinks, 1)) / math.log10(100)) * 100)
|
||||
|
||||
composite = (traffic_score * 0.50) + (keywords_score * 0.30) + (backlinks_score * 0.20)
|
||||
return round(min(100, max(0, composite)), 1)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Content decay detection
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@staticmethod
|
||||
def detect_decay(pages: list[ContentPage], threshold: float = -20.0) -> list[ContentPage]:
|
||||
"""
|
||||
Flag pages with declining traffic trend.
|
||||
|
||||
Uses a simple heuristic: pages with low performance score relative to
|
||||
their keyword count indicate potential decay. In production, historical
|
||||
traffic data from Ahrefs metrics-history would be used.
|
||||
|
||||
Args:
|
||||
pages: List of content pages with metrics.
|
||||
threshold: Decay rate threshold (percentage decline).
|
||||
|
||||
Returns:
|
||||
List of pages flagged as decaying.
|
||||
"""
|
||||
decaying: list[ContentPage] = []
|
||||
for page in pages:
|
||||
# Heuristic: high keyword count but low traffic suggests decay
|
||||
if page.keywords_count > 10 and page.traffic < 50:
|
||||
page.is_decaying = True
|
||||
page.decay_rate = -50.0 if page.traffic == 0 else round(
|
||||
-((page.keywords_count * 10 - page.traffic) / max(page.keywords_count * 10, 1)) * 100, 1
|
||||
)
|
||||
if page.decay_rate <= threshold:
|
||||
decaying.append(page)
|
||||
elif page.performance_score < 20 and page.keywords_count > 5:
|
||||
page.is_decaying = True
|
||||
page.decay_rate = round(-max(30, 100 - page.performance_score * 2), 1)
|
||||
if page.decay_rate <= threshold:
|
||||
decaying.append(page)
|
||||
|
||||
decaying.sort(key=lambda p: p.decay_rate)
|
||||
return decaying
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Freshness assessment
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@staticmethod
|
||||
def analyze_freshness(pages: list[ContentPage]) -> dict[str, int]:
|
||||
"""
|
||||
Categorize pages by freshness based on last_modified dates.
|
||||
|
||||
Returns distribution: fresh (< 3 months), aging (3-12 months),
|
||||
stale (> 12 months), unknown (no date).
|
||||
"""
|
||||
now = datetime.now()
|
||||
distribution = {"fresh": 0, "aging": 0, "stale": 0, "unknown": 0}
|
||||
|
||||
for page in pages:
|
||||
if not page.last_modified:
|
||||
distribution["unknown"] += 1
|
||||
continue
|
||||
try:
|
||||
# Try common date formats
|
||||
for fmt in ("%Y-%m-%dT%H:%M:%S", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%S%z"):
|
||||
try:
|
||||
modified = datetime.strptime(
|
||||
page.last_modified.replace("+00:00", "").replace("Z", ""), fmt.replace("%z", "")
|
||||
)
|
||||
break
|
||||
except ValueError:
|
||||
continue
|
||||
else:
|
||||
distribution["unknown"] += 1
|
||||
continue
|
||||
|
||||
age = now - modified
|
||||
if age < timedelta(days=90):
|
||||
distribution["fresh"] += 1
|
||||
elif age < timedelta(days=365):
|
||||
distribution["aging"] += 1
|
||||
else:
|
||||
distribution["stale"] += 1
|
||||
except Exception:
|
||||
distribution["unknown"] += 1
|
||||
|
||||
return distribution
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Korean content pattern identification
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@staticmethod
|
||||
def identify_korean_patterns(pages: list[ContentPage]) -> dict[str, Any]:
|
||||
"""
|
||||
Detect Korean content patterns across pages.
|
||||
|
||||
Identifies Naver Blog style review content, listicles,
|
||||
how-to guides, and informational content patterns.
|
||||
|
||||
Returns summary with counts and example URLs per pattern.
|
||||
"""
|
||||
results: dict[str, Any] = {
|
||||
"total_korean_content": 0,
|
||||
"patterns": {},
|
||||
}
|
||||
|
||||
for pattern_name, keywords in KOREAN_CONTENT_PATTERNS.items():
|
||||
matches: list[dict[str, str]] = []
|
||||
for page in pages:
|
||||
combined = f"{page.url} {page.title}"
|
||||
for keyword in keywords:
|
||||
if re.search(keyword, combined, re.IGNORECASE):
|
||||
matches.append({"url": page.url, "title": page.title, "matched_keyword": keyword})
|
||||
break
|
||||
|
||||
results["patterns"][pattern_name] = {
|
||||
"count": len(matches),
|
||||
"examples": matches[:5],
|
||||
}
|
||||
|
||||
korean_urls = set()
|
||||
for pattern_data in results["patterns"].values():
|
||||
for example in pattern_data["examples"]:
|
||||
korean_urls.add(example["url"])
|
||||
results["total_korean_content"] = len(korean_urls)
|
||||
|
||||
return results
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Orchestration
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
    async def audit(
        self,
        url: str,
        detect_decay_flag: bool = False,
        content_type_filter: str | None = None,
        limit: int = 200,
    ) -> ContentAuditResult:
        """
        Run full content audit: inventory, scoring, decay, Korean patterns.

        Args:
            url: Target website URL.
            detect_decay_flag: Whether to run decay detection.
            content_type_filter: Filter by content type (blog, product, etc.).
            limit: Maximum pages to analyze.

        Returns:
            ContentAuditResult with inventory, top performers, decay, analysis.
        """
        result = ContentAuditResult(
            url=url,
            timestamp=datetime.now().isoformat(),
        )

        self.logger.info(f"Starting content audit for {url}")

        # 1. Gather pages from Ahrefs and sitemap concurrently; each source
        #    degrades to an empty list on failure.
        top_pages_data, traffic_pages_data, sitemap_urls = await asyncio.gather(
            self.get_top_pages(url, limit=limit),
            self.get_pages_by_traffic(url, limit=limit),
            self.crawl_sitemap(url),
        )

        # 2. Merge and deduplicate pages, keyed by URL. Top-pages data wins
        #    first; traffic data tops up metrics via max().
        page_map: dict[str, ContentPage] = {}

        for item in top_pages_data:
            page_url = item.get("url", "")
            if not page_url:
                continue
            page_map[page_url] = ContentPage(
                url=page_url,
                # Ahrefs has no title field here; top_keyword is the proxy.
                title=item.get("top_keyword", ""),
                traffic=int(item.get("traffic", 0)),
                keywords_count=int(item.get("keywords", 0)),
                backlinks=int(item.get("value", 0)),
            )

        for item in traffic_pages_data:
            page_url = item.get("url", "")
            if not page_url:
                continue
            if page_url in page_map:
                existing = page_map[page_url]
                existing.traffic = max(existing.traffic, int(item.get("traffic", 0)))
                existing.keywords_count = max(existing.keywords_count, int(item.get("keywords", 0)))
            else:
                page_map[page_url] = ContentPage(
                    url=page_url,
                    title=item.get("top_keyword", ""),
                    traffic=int(item.get("traffic", 0)),
                    keywords_count=int(item.get("keywords", 0)),
                )

        # Add sitemap URLs not already present (no metrics -> score 0).
        for s_url in sitemap_urls:
            if s_url not in page_map:
                page_map[s_url] = ContentPage(url=s_url)

        # 3. Classify and score every merged page.
        all_pages: list[ContentPage] = []
        for page in page_map.values():
            page.content_type = self.classify_content_type(page.url, page.title)
            page.performance_score = self.score_performance(page)
            all_pages.append(page)

        # 4. Filter by content type if requested (filter applies to all
        #    downstream stats, not just the listing).
        if content_type_filter:
            all_pages = [p for p in all_pages if p.content_type == content_type_filter]

        # 5. Build inventory aggregates.
        by_type: dict[str, int] = {}
        for page in all_pages:
            by_type[page.content_type] = by_type.get(page.content_type, 0) + 1

        avg_score = (
            sum(p.performance_score for p in all_pages) / len(all_pages)
            if all_pages else 0.0
        )
        # NOTE(review): word_count is never populated anywhere in this module,
        # so avg_word_count is always 0 — confirm whether scraping should fill it.
        avg_word_count = (
            sum(p.word_count for p in all_pages) / len(all_pages)
            if all_pages else 0.0
        )

        freshness = self.analyze_freshness(all_pages)

        result.content_inventory = ContentInventory(
            total_pages=len(all_pages),
            by_type=by_type,
            avg_performance_score=round(avg_score, 1),
            avg_word_count=round(avg_word_count, 1),
            pages=sorted(all_pages, key=lambda p: p.performance_score, reverse=True)[:limit],
            freshness_distribution=freshness,
        )

        # 6. Top performers (top 20 by composite score).
        result.top_performers = sorted(all_pages, key=lambda p: p.performance_score, reverse=True)[:20]

        # 7. Decay detection (opt-in).
        if detect_decay_flag:
            result.decaying_content = self.detect_decay(all_pages)

        # 8. Korean content analysis.
        result.korean_content_analysis = self.identify_korean_patterns(all_pages)

        # 9. Recommendations derived from the aggregates above.
        result.recommendations = self._generate_recommendations(result)

        self.logger.info(
            f"Audit complete: {len(all_pages)} pages, "
            f"{len(result.top_performers)} top performers, "
            f"{len(result.decaying_content)} decaying"
        )

        return result
|
||||
|
||||
    @staticmethod
    def _generate_recommendations(result: ContentAuditResult) -> list[str]:
        """Generate actionable recommendations from audit data.

        Recommendations are intentionally written in Korean for the target
        audience; the fallback message is returned when no rule fires.
        """
        recs: list[str] = []
        inv = result.content_inventory

        # Low average score
        if inv.avg_performance_score < 30:
            recs.append(
                "전체 콘텐츠 평균 성과 점수가 낮습니다 ({:.0f}/100). "
                "상위 콘텐츠 패턴을 분석하여 저성과 페이지를 개선하세요.".format(inv.avg_performance_score)
            )

        # Stale content (more than 30% of pages older than 12 months)
        stale = inv.freshness_distribution.get("stale", 0)
        total = inv.total_pages or 1  # guard against division by zero
        if stale / total > 0.3:
            recs.append(
                f"오래된 콘텐츠가 {stale}개 ({stale * 100 // total}%)입니다. "
                "콘텐츠 업데이트 또는 통합을 고려하세요."
            )

        # Decaying content (only populated when decay detection ran)
        if len(result.decaying_content) > 5:
            recs.append(
                f"트래픽이 감소하는 콘텐츠가 {len(result.decaying_content)}개 감지되었습니다. "
                "상위 감소 페이지부터 콘텐츠 리프레시를 진행하세요."
            )

        # Content type balance: no blog content at all
        blog_count = inv.by_type.get("blog", 0)
        if blog_count == 0:
            recs.append(
                "블로그 콘텐츠가 없습니다. SEO 트래픽 확보를 위해 "
                "블로그 콘텐츠 전략을 수립하세요."
            )

        # Korean content opportunities: no review-style content detected
        korean = result.korean_content_analysis
        review_count = korean.get("patterns", {}).get("naver_blog_style", {}).get("count", 0)
        if review_count == 0:
            recs.append(
                "후기/리뷰 콘텐츠가 없습니다. 한국 시장에서 후기 콘텐츠는 "
                "전환율에 큰 영향을 미치므로 후기 콘텐츠 생성을 권장합니다."
            )

        # Fallback when nothing triggered above.
        if not recs:
            recs.append("현재 콘텐츠 전략이 양호합니다. 지속적인 모니터링을 권장합니다.")

        return recs
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for the content auditor."""
    cli = argparse.ArgumentParser(
        description="SEO Content Auditor - inventory, scoring, and decay detection",
    )
    cli.add_argument("--url", required=True, help="Target website URL")
    cli.add_argument("--decay", action="store_true", help="Enable content decay detection")
    cli.add_argument(
        "--type",
        dest="content_type",
        help="Filter by content type (blog, product, service, landing, resource)",
    )
    cli.add_argument(
        "--limit",
        type=int,
        default=200,
        help="Maximum pages to analyze (default: 200)",
    )
    cli.add_argument("--json", action="store_true", help="Output as JSON")
    cli.add_argument("--output", help="Save output to file")
    return cli
|
||||
|
||||
|
||||
def format_text_report(result: ContentAuditResult) -> str:
    """Render an audit result as a markdown-style, human-readable report."""
    out: list[str] = []
    emit = out.append

    emit(f"## Content Audit: {result.url}")
    emit(f"**Date**: {result.timestamp[:10]}")
    emit("")

    # Inventory summary
    inv = result.content_inventory
    emit("### Content Inventory")
    emit(f"- Total pages: {inv.total_pages}")
    emit(f"- Average performance score: {inv.avg_performance_score}/100")
    emit(f"- Content types: {json.dumps(inv.by_type, ensure_ascii=False)}")
    emit(f"- Freshness: {json.dumps(inv.freshness_distribution, ensure_ascii=False)}")
    emit("")

    # Top 10 performers
    emit("### Top Performers")
    for rank, page in enumerate(result.top_performers[:10], 1):
        emit(f" {rank}. [{page.performance_score:.0f}] {page.url} (traffic: {page.traffic})")
    emit("")

    # Decaying pages (section omitted when empty)
    if result.decaying_content:
        emit("### Decaying Content")
        for rank, page in enumerate(result.decaying_content[:10], 1):
            emit(f" {rank}. [{page.decay_rate:+.0f}%] {page.url} (traffic: {page.traffic})")
        emit("")

    # Korean pattern counts (section omitted when empty)
    if result.korean_content_analysis.get("patterns"):
        emit("### Korean Content Patterns")
        for pattern_name, data in result.korean_content_analysis["patterns"].items():
            emit(f" - {pattern_name}: {data['count']} pages")
        emit("")

    emit("### Recommendations")
    for rank, rec in enumerate(result.recommendations, 1):
        emit(f" {rank}. {rec}")

    return "\n".join(out)
|
||||
|
||||
|
||||
async def main() -> None:
    """CLI entry point: run the content audit and emit the report.

    Output goes to --output (UTF-8 file) when given, otherwise stdout;
    --json switches the rendering from text to JSON.
    """
    args = build_parser().parse_args()

    auditor = ContentAuditor()
    try:
        result = await auditor.audit(
            url=args.url,
            detect_decay_flag=args.decay,
            content_type_filter=args.content_type,
            limit=args.limit,
        )

        if args.json:
            rendered = json.dumps(asdict(result), ensure_ascii=False, indent=2, default=str)
        else:
            rendered = format_text_report(result)

        if args.output:
            with open(args.output, "w", encoding="utf-8") as fh:
                fh.write(rendered)
            logger.info(f"Output saved to {args.output}")
        else:
            print(rendered)
    finally:
        # Always release the client and report request statistics.
        await auditor.close()
        auditor.print_stats()


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -0,0 +1,738 @@
|
||||
"""
|
||||
Content Brief Generator - SEO Content Brief Creation
|
||||
=====================================================
|
||||
Purpose: Generate detailed SEO content briefs with outlines,
|
||||
keyword lists, word count targets, and internal linking suggestions.
|
||||
Python: 3.10+
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import aiohttp
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from base_client import BaseAsyncClient, config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data classes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
class OutlineSection:
    """A single heading section in the content outline."""

    heading: str  # heading text for the section
    level: int = 2  # H2 or H3
    talking_points: list[str] = field(default_factory=list)  # bullet points to cover in the section
    target_words: int = 200  # suggested word count for this section
    keywords_to_include: list[str] = field(default_factory=list)  # keywords to work into the copy
|
||||
|
||||
|
||||
@dataclass
class CompetitorPageAnalysis:
    """Analysis of a single competitor page for the target keyword."""

    url: str  # competitor page URL
    title: str = ""  # page <title> (or SERP-provided title)
    word_count: int = 0  # visible-text word count
    headings: list[dict[str, str]] = field(default_factory=list)  # [{"level": "H2", "text": ...}, ...]
    topics_covered: list[str] = field(default_factory=list)  # derived from H2 headings
    content_type: str = ""  # free-form content type label
    has_images: bool = False  # more than 2 <img> tags found
    has_video: bool = False  # <video> or YouTube/Vimeo iframe present
    has_faq: bool = False  # FAQ text or FAQPage schema detected
    has_table: bool = False  # at least one <table> present
|
||||
|
||||
|
||||
@dataclass
class ContentBrief:
    """Complete SEO content brief."""

    primary_keyword: str  # main target keyword the brief is built around
    secondary_keywords: list[str] = field(default_factory=list)  # supporting keywords
    lsi_keywords: list[str] = field(default_factory=list)  # semantically related (LSI) terms
    target_word_count: int = 1500  # recommended article length
    word_count_range: tuple[int, int] = (1200, 1800)  # acceptable (min, max) length
    suggested_title: str = ""  # proposed page title
    meta_description: str = ""  # proposed meta description
    outline: list[OutlineSection] = field(default_factory=list)  # recommended H2/H3 structure
    competitor_analysis: list[CompetitorPageAnalysis] = field(default_factory=list)  # per-competitor metrics
    internal_links: list[dict[str, str]] = field(default_factory=list)  # suggested internal link targets
    content_format: str = "blog"  # e.g. blog, guide, listicle, landing
    korean_format_recommendations: list[str] = field(default_factory=list)  # Korean-market formatting tips
    search_intent: str = "informational"  # detected intent class
    notes: list[str] = field(default_factory=list)  # free-form reviewer notes
    timestamp: str = ""  # ISO-8601 generation time
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Search intent patterns
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Regex fragments (English + Korean) used to classify a keyword's search
# intent. Each intent maps to patterns that are matched case-insensitively
# against the lowercased keyword; the intent with the most hits wins
# (see ContentBriefGenerator.detect_search_intent).
INTENT_PATTERNS = {
    "transactional": [
        r"buy", r"purchase", r"price", r"cost", r"order", r"shop",
        r"구매", r"주문", r"가격", r"비용", r"할인", r"쿠폰",
    ],
    "navigational": [
        r"login", r"sign in", r"official", r"website",
        r"로그인", r"공식", r"홈페이지",
    ],
    "commercial": [
        r"best", r"top", r"review", r"compare", r"vs",
        r"추천", r"비교", r"후기", r"리뷰", r"순위",
    ],
    "informational": [
        r"what", r"how", r"why", r"guide", r"tutorial",
        r"이란", r"방법", r"가이드", r"효과", r"원인",
    ],
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Korean content format recommendations
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Per-intent content formatting recommendations for the Korean market.
# Keys mirror INTENT_PATTERNS; values are user-facing Korean strings that
# are emitted verbatim into the generated brief (do not translate).
KOREAN_FORMAT_TIPS = {
    "transactional": [
        "가격 비교표를 포함하세요 (경쟁사 가격 대비)",
        "실제 비용 사례를 3개 이상 제시하세요",
        "결제 방법 및 할인 정보를 명확히 안내하세요",
        "CTA(행동 유도) 버튼을 여러 위치에 배치하세요",
    ],
    "commercial": [
        "네이버 블로그 스타일의 솔직한 후기 톤을 사용하세요",
        "장단점을 균형 있게 비교하세요",
        "실제 사용 사진 또는 전후 비교 이미지를 포함하세요",
        "별점 또는 점수 평가 체계를 추가하세요",
        "FAQ 섹션을 포함하세요 (네이버 검색 노출에 유리)",
    ],
    "informational": [
        "핵심 정보를 글 상단에 요약하세요 (두괄식 구성)",
        "전문 용어는 쉬운 설명을 병기하세요",
        "인포그래픽 또는 도표를 활용하세요",
        "관련 콘텐츠 내부 링크를 3-5개 포함하세요",
        "전문가 인용 또는 출처를 명시하세요 (E-E-A-T 강화)",
    ],
    "navigational": [
        "공식 정보와 연락처를 최상단에 배치하세요",
        "지도 임베드를 포함하세요 (네이버 지도/구글 맵)",
        "영업시간, 주소, 전화번호를 명확히 표시하세요",
    ],
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ContentBriefGenerator
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class ContentBriefGenerator(BaseAsyncClient):
    """Generate comprehensive SEO content briefs.

    Orchestrates four data sources — Ahrefs SERP overview, Ahrefs keyword
    suggestions, Ahrefs organic keywords (for internal links), and direct
    competitor-page fetches — into a single ContentBrief.
    """

    def __init__(self, max_concurrent: int = 5, requests_per_second: float = 2.0):
        """Initialize client state.

        Args:
            max_concurrent: Maximum simultaneous requests.
            requests_per_second: Request rate cap.
        """
        super().__init__(max_concurrent=max_concurrent, requests_per_second=requests_per_second)
        # Lazily created by _ensure_session(); released by close().
        self.session: aiohttp.ClientSession | None = None

    async def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the shared aiohttp session, creating it if needed."""
        if self.session is None or self.session.closed:
            timeout = aiohttp.ClientTimeout(total=30)
            headers = {
                "User-Agent": "Mozilla/5.0 (compatible; SEOContentBrief/1.0)",
            }
            self.session = aiohttp.ClientSession(timeout=timeout, headers=headers)
        return self.session

    async def close(self) -> None:
        """Close the aiohttp session if it is open."""
        if self.session and not self.session.closed:
            await self.session.close()

    # ------------------------------------------------------------------
    # Analyze top ranking results
    # ------------------------------------------------------------------

    async def analyze_top_results(
        self,
        keyword: str,
        site_url: str | None = None,
        num_competitors: int = 5,
    ) -> list[CompetitorPageAnalysis]:
        """
        Analyze top ranking pages for a keyword using Ahrefs SERP data.

        Falls back to fetching pages directly if Ahrefs data is unavailable.

        Args:
            keyword: Target search keyword.
            site_url: Reserved for future filtering; currently unused.
            num_competitors: Maximum SERP results to fetch and analyze.

        Returns:
            One CompetitorPageAnalysis per SERP result; pages that fail to
            fetch keep only their URL/title from the SERP data.
        """
        self.logger.info(f"Analyzing top results for: {keyword}")
        results: list[CompetitorPageAnalysis] = []

        # Try Ahrefs SERP overview to find ranking pages.
        try:
            api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None
            if api_key:
                # NOTE(review): requests.get is synchronous and blocks the
                # event loop; consider asyncio.to_thread for the Ahrefs calls.
                resp = requests.get(
                    "https://api.ahrefs.com/v3/serp-overview",
                    params={"keyword": keyword, "select": "url,title,position,traffic"},
                    headers={"Authorization": f"Bearer {api_key}"},
                    timeout=30,
                )
                if resp.status_code == 200:
                    data = resp.json()
                    # Response key varies between API versions; accept either.
                    serp_items = data.get("positions", data.get("items", []))[:num_competitors]
                    for item in serp_items:
                        results.append(
                            CompetitorPageAnalysis(
                                url=item.get("url", ""),
                                title=item.get("title", ""),
                            )
                        )
        except Exception as exc:
            self.logger.warning(f"Ahrefs SERP lookup failed: {exc}")

        # Fetch and analyze each page; fetch failures are logged and skipped.
        session = await self._ensure_session()
        for analysis in results[:num_competitors]:
            if not analysis.url:
                continue
            try:
                async with session.get(analysis.url) as resp:
                    if resp.status != 200:
                        continue
                    html = await resp.text()
                    self._analyze_page_content(analysis, html)
            except Exception as exc:
                self.logger.debug(f"Failed to fetch {analysis.url}: {exc}")

        self.logger.info(f"Analyzed {len(results)} competitor pages")
        return results

    @staticmethod
    def _analyze_page_content(analysis: CompetitorPageAnalysis, html: str) -> None:
        """Parse HTML and populate content metrics on *analysis* in place."""
        soup = BeautifulSoup(html, "html.parser")

        # Title (only when the SERP data did not already provide one).
        title_tag = soup.find("title")
        if title_tag and not analysis.title:
            analysis.title = title_tag.get_text(strip=True)

        # Word count (visible text only) — strip chrome elements first.
        for tag in soup(["script", "style", "nav", "header", "footer"]):
            tag.decompose()
        visible_text = soup.get_text(separator=" ", strip=True)
        analysis.word_count = len(visible_text.split())

        # Headings H1-H6 (grouped by level, document order within a level).
        headings: list[dict[str, str]] = []
        for level in range(1, 7):
            for h in soup.find_all(f"h{level}"):
                text = h.get_text(strip=True)
                if text:
                    headings.append({"level": f"H{level}", "text": text})
        analysis.headings = headings

        # Content features
        analysis.has_images = len(soup.find_all("img")) > 2
        analysis.has_video = bool(soup.find("video") or soup.find("iframe", src=re.compile(r"youtube|vimeo")))
        analysis.has_faq = bool(
            soup.find(string=re.compile(r"FAQ|자주\s*묻는\s*질문|Q\s*&\s*A", re.IGNORECASE))
            or soup.find("script", type="application/ld+json", string=re.compile(r"FAQPage"))
        )
        analysis.has_table = bool(soup.find("table"))

        # Topics covered (from H2 headings, capped at 15)
        analysis.topics_covered = [
            h["text"] for h in headings if h["level"] == "H2"
        ][:15]

    # ------------------------------------------------------------------
    # Extract content outline
    # ------------------------------------------------------------------

    def extract_outline(
        self,
        keyword: str,
        top_results: list[CompetitorPageAnalysis],
    ) -> list[OutlineSection]:
        """
        Build recommended H2/H3 outline by aggregating competitor headings.

        Identifies common topics across top-ranking pages and structures
        them into a logical outline. Appends an FAQ section when at least
        two competitors feature FAQ content and none of the aggregated
        sections already covers it.
        """
        # Collect H2 headings with frequency; bucket H3s under their H2.
        h2_topics: dict[str, int] = {}
        h3_by_h2: dict[str, list[str]] = {}

        for result in top_results:
            current_h2 = ""
            for heading in result.headings:
                text = heading["text"].strip()
                if heading["level"] == "H2":
                    current_h2 = text
                    h2_topics[text] = h2_topics.get(text, 0) + 1
                elif heading["level"] == "H3" and current_h2:
                    h3_by_h2.setdefault(current_h2, []).append(text)

        # Sort H2s by frequency (most common topics first)
        sorted_h2s = sorted(h2_topics.items(), key=lambda x: x[1], reverse=True)

        # Build outline; split the word budget evenly across sections
        # (denominator floored at 5 so small SERPs don't inflate sections).
        outline: list[OutlineSection] = []
        target_word_count = self.calculate_word_count(top_results)
        words_per_section = target_word_count // max(len(sorted_h2s), 5)

        for h2_text, frequency in sorted_h2s[:8]:
            section = OutlineSection(
                heading=h2_text,
                level=2,
                target_words=words_per_section,
                talking_points=[],
            )

            # Add H3 subtopics as talking points (deduplicated, first 5).
            # Fix: the original also constructed an OutlineSection per H3
            # that was never used — dead code removed.
            if h2_text in h3_by_h2:
                section.talking_points.extend(list(dict.fromkeys(h3_by_h2[h2_text]))[:5])

            outline.append(section)

        # Ensure FAQ section if common among competitors.
        faq_count = sum(1 for r in top_results if r.has_faq)
        if faq_count >= 2 and not any("FAQ" in s.heading or "질문" in s.heading for s in outline):
            outline.append(OutlineSection(
                heading="자주 묻는 질문 (FAQ)",
                level=2,
                target_words=300,
                talking_points=[
                    f"{keyword} 관련 자주 묻는 질문 5-7개",
                    "Schema markup (FAQPage) 적용 권장",
                ],
            ))

        return outline

    # ------------------------------------------------------------------
    # Keyword suggestions
    # ------------------------------------------------------------------

    async def suggest_keywords(self, primary_keyword: str) -> dict[str, list[str]]:
        """
        Generate primary, secondary, and LSI keyword suggestions.

        Uses Ahrefs matching-terms (secondary) and related-terms (LSI)
        endpoints. Returns {"primary": [...], "secondary": [...],
        "lsi": [...]}; the latter two stay empty when no API key is set
        or the lookups fail.
        """
        self.logger.info(f"Generating keyword suggestions for: {primary_keyword}")
        result: dict[str, list[str]] = {
            "primary": [primary_keyword],
            "secondary": [],
            "lsi": [],
        }

        try:
            api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None
            if not api_key:
                self.logger.warning("AHREFS_API_KEY not set; returning basic keywords only")
                return result

            # Matching terms -> secondary keywords
            resp = requests.get(
                "https://api.ahrefs.com/v3/keywords-explorer/matching-terms",
                params={"keyword": primary_keyword, "limit": 20, "select": "keyword,volume,difficulty"},
                headers={"Authorization": f"Bearer {api_key}"},
                timeout=30,
            )
            if resp.status_code == 200:
                data = resp.json()
                terms = data.get("keywords", data.get("items", []))
                for term in terms:
                    kw = term.get("keyword", "")
                    if kw and kw.lower() != primary_keyword.lower():
                        result["secondary"].append(kw)

            # Related terms (LSI) — skip anything already listed as secondary.
            resp2 = requests.get(
                "https://api.ahrefs.com/v3/keywords-explorer/related-terms",
                params={"keyword": primary_keyword, "limit": 15, "select": "keyword,volume"},
                headers={"Authorization": f"Bearer {api_key}"},
                timeout=30,
            )
            if resp2.status_code == 200:
                data2 = resp2.json()
                related = data2.get("keywords", data2.get("items", []))
                for term in related:
                    kw = term.get("keyword", "")
                    if kw and kw not in result["secondary"]:
                        result["lsi"].append(kw)

        except Exception as exc:
            self.logger.warning(f"Keyword suggestion lookup failed: {exc}")

        return result

    # ------------------------------------------------------------------
    # Word count calculation
    # ------------------------------------------------------------------

    @staticmethod
    def calculate_word_count(top_results: list[CompetitorPageAnalysis]) -> int:
        """
        Calculate target word count based on top 5 ranking pages.

        Averages the word counts of the top 5 pages (ignoring thin pages
        of <=100 words), rounds to the nearest 100, and clamps the result
        to [800, 5000]. Returns 1500 when no usable competitor data exists.
        """
        word_counts = [r.word_count for r in top_results[:5] if r.word_count > 100]

        if not word_counts:
            return 1500  # Default fallback

        avg = sum(word_counts) / len(word_counts)
        # Round to nearest 100, then clamp to a sane editorial range.
        target = round(avg / 100) * 100
        return max(800, min(5000, target))

    # ------------------------------------------------------------------
    # Internal linking suggestions
    # ------------------------------------------------------------------

    async def suggest_internal_links(
        self,
        keyword: str,
        site_url: str,
    ) -> list[dict[str, str]]:
        """
        Find related existing pages on the site for internal linking.

        Uses Ahrefs organic keywords to find pages ranking for terms that
        share words with *keyword*. Returns at most 10 suggestions,
        highest-traffic first; each dict has url, anchor_text, relevance,
        and current_traffic keys.
        """
        self.logger.info(f"Finding internal link opportunities for {keyword} on {site_url}")
        links: list[dict[str, str]] = []
        target = urlparse(site_url).netloc or site_url

        try:
            api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None
            if not api_key:
                return links

            resp = requests.get(
                "https://api.ahrefs.com/v3/site-explorer/organic-keywords",
                params={
                    "target": target,
                    "limit": 50,
                    "select": "keyword,url,position,traffic",
                },
                headers={"Authorization": f"Bearer {api_key}"},
                timeout=30,
            )
            if resp.status_code != 200:
                return links

            data = resp.json()
            keywords_data = data.get("keywords", data.get("items", []))

            # Find pages ranking for related keywords (word-level overlap).
            keyword_lower = keyword.lower()
            keyword_words = set(keyword_lower.split())

            seen_urls: set[str] = set()
            for item in keywords_data:
                kw = item.get("keyword", "").lower()
                url = item.get("url", "")

                if not url or url in seen_urls:
                    continue

                kw_words = set(kw.split())
                overlap = keyword_words & kw_words
                if overlap and kw != keyword_lower:
                    links.append({
                        "url": url,
                        "anchor_text": kw,
                        "relevance": f"{len(overlap)}/{len(keyword_words)} word overlap",
                        "current_traffic": str(item.get("traffic", 0)),
                    })
                    seen_urls.add(url)

            # Fix: traffic may serialize as a float string ("12.5"); int()
            # would raise ValueError, so parse as float for the sort key.
            links.sort(key=lambda link: float(link.get("current_traffic") or 0), reverse=True)

        except Exception as exc:
            self.logger.warning(f"Internal link suggestion failed: {exc}")

        return links[:10]

    # ------------------------------------------------------------------
    # Search intent detection
    # ------------------------------------------------------------------

    @staticmethod
    def detect_search_intent(keyword: str) -> str:
        """Classify keyword search intent via INTENT_PATTERNS.

        Scores each intent by the number of matching patterns and returns
        the highest-scoring one; defaults to "informational" when nothing
        matches.
        """
        keyword_lower = keyword.lower()
        scores: dict[str, int] = {}

        for intent, patterns in INTENT_PATTERNS.items():
            score = sum(1 for p in patterns if re.search(p, keyword_lower, re.IGNORECASE))
            if score > 0:
                scores[intent] = score

        if not scores:
            return "informational"
        return max(scores, key=scores.get)

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    async def generate(
        self,
        keyword: str,
        site_url: str,
        num_competitors: int = 5,
    ) -> ContentBrief:
        """
        Generate a comprehensive SEO content brief.

        Args:
            keyword: Primary target keyword.
            site_url: Target website URL.
            num_competitors: Number of competitor pages to analyze.

        Returns:
            ContentBrief with outline, keywords, and recommendations.
        """
        self.logger.info(f"Generating content brief for: {keyword}")

        # Detect search intent (pure function — no I/O).
        intent = self.detect_search_intent(keyword)

        # Run the three independent analyses concurrently.
        top_results_task = self.analyze_top_results(keyword, site_url, num_competitors)
        keywords_task = self.suggest_keywords(keyword)
        internal_links_task = self.suggest_internal_links(keyword, site_url)

        top_results, keyword_data, internal_links = await asyncio.gather(
            top_results_task, keywords_task, internal_links_task,
        )

        # Word count target with a +/- 20% acceptable range.
        target_word_count = self.calculate_word_count(top_results)
        word_count_min = int(target_word_count * 0.8)
        word_count_max = int(target_word_count * 1.2)

        # Outline from aggregated competitor headings.
        outline = self.extract_outline(keyword, top_results)

        # Title / meta description templates keyed by intent.
        suggested_title = self._generate_title(keyword, intent)
        meta_description = self._generate_meta_description(keyword, intent)

        # Korean format recommendations for the detected intent.
        korean_tips = KOREAN_FORMAT_TIPS.get(intent, KOREAN_FORMAT_TIPS["informational"])

        brief = ContentBrief(
            primary_keyword=keyword,
            secondary_keywords=keyword_data.get("secondary", [])[:10],
            lsi_keywords=keyword_data.get("lsi", [])[:10],
            target_word_count=target_word_count,
            word_count_range=(word_count_min, word_count_max),
            suggested_title=suggested_title,
            meta_description=meta_description,
            outline=outline,
            competitor_analysis=top_results,
            internal_links=internal_links,
            content_format=self._suggest_format(intent, top_results),
            korean_format_recommendations=korean_tips,
            search_intent=intent,
            timestamp=datetime.now().isoformat(),
        )

        self.logger.info(
            f"Brief generated: {len(outline)} sections, "
            f"{target_word_count} target words, "
            f"{len(keyword_data.get('secondary', []))} secondary keywords"
        )

        return brief

    @staticmethod
    def _generate_title(keyword: str, intent: str) -> str:
        """Generate a suggested title based on keyword and intent."""
        templates = {
            "informational": "{keyword} - 완벽 가이드 (2025년 최신)",
            "commercial": "{keyword} 추천 TOP 10 비교 (전문가 리뷰)",
            "transactional": "{keyword} 가격 비교 및 구매 가이드",
            "navigational": "{keyword} - 공식 안내",
        }
        template = templates.get(intent, templates["informational"])
        return template.format(keyword=keyword)

    @staticmethod
    def _generate_meta_description(keyword: str, intent: str) -> str:
        """Generate a suggested meta description for the keyword/intent."""
        templates = {
            "informational": (
                f"{keyword}에 대해 알아야 할 모든 것을 정리했습니다. "
                "전문가가 알려주는 핵심 정보와 실용적인 가이드를 확인하세요."
            ),
            "commercial": (
                f"{keyword} 비교 분석! 장단점, 가격, 실제 후기를 "
                "한눈에 비교하고 최적의 선택을 하세요."
            ),
            "transactional": (
                f"{keyword} 최저가 비교 및 구매 방법을 안내합니다. "
                "합리적인 가격으로 구매하는 팁을 확인하세요."
            ),
            "navigational": (
                f"{keyword} 공식 정보 및 이용 안내. "
                "정확한 정보를 빠르게 확인하세요."
            ),
        }
        return templates.get(intent, templates["informational"])

    @staticmethod
    def _suggest_format(intent: str, results: list[CompetitorPageAnalysis]) -> str:
        """Suggest content format based on intent and competitor analysis."""
        if intent == "commercial":
            return "listicle"
        if intent == "informational":
            return "guide"
        if intent == "transactional":
            return "landing"

        # Navigational (or unknown) intent: decide from competitor length.
        avg_word_count = (
            sum(r.word_count for r in results) / len(results) if results else 0
        )
        if avg_word_count > 3000:
            return "comprehensive_guide"
        return "blog"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser for the content brief generator."""
    cli = argparse.ArgumentParser(
        description="SEO Content Brief Generator",
    )
    cli.add_argument("--keyword", required=True, help="Primary target keyword")
    cli.add_argument("--url", required=True, help="Target website URL")
    cli.add_argument(
        "--competitors",
        type=int,
        default=5,
        help="Number of competitor pages to analyze (default: 5)",
    )
    cli.add_argument("--json", action="store_true", help="Output as JSON")
    cli.add_argument("--output", help="Save output to file")
    return cli
|
||||
|
||||
|
||||
def format_text_report(brief: "ContentBrief") -> str:
    """Render a ContentBrief as a markdown-style text report.

    Emits header, target metrics, keywords, outline, and — when present —
    competitor analysis, internal-link suggestions, and Korean formatting
    recommendations.
    """
    out: list[str] = [
        f"## Content Brief: {brief.primary_keyword}",
        f"**Date**: {brief.timestamp[:10]}",
        f"**Search Intent**: {brief.search_intent}",
        f"**Content Format**: {brief.content_format}",
        "",
        "### Target Metrics",
        f"- Word count: {brief.target_word_count} ({brief.word_count_range[0]}-{brief.word_count_range[1]})",
        f"- Suggested title: {brief.suggested_title}",
        f"- Meta description: {brief.meta_description}",
        "",
        "### Keywords",
        f"- **Primary**: {brief.primary_keyword}",
    ]
    if brief.secondary_keywords:
        out.append(f"- **Secondary**: {', '.join(brief.secondary_keywords[:8])}")
    if brief.lsi_keywords:
        out.append(f"- **LSI**: {', '.join(brief.lsi_keywords[:8])}")
    out.append("")

    out.append("### Content Outline")
    for section in brief.outline:
        marker = "##" if section.level == 2 else "###"
        out.append(f" {marker} {section.heading} (~{section.target_words}w)")
        out.extend(f" - {point}" for point in section.talking_points)
    out.append("")

    if brief.competitor_analysis:
        out.append(f"### Competitor Analysis ({len(brief.competitor_analysis)} pages)")
        for comp in brief.competitor_analysis:
            out.append(f" - **{comp.title or comp.url}**")
            out.append(f" Word count: {comp.word_count} | Headings: {len(comp.headings)}")
            feature_flags = [
                ("images", comp.has_images),
                ("video", comp.has_video),
                ("FAQ", comp.has_faq),
                ("table", comp.has_table),
            ]
            features = [name for name, present in feature_flags if present]
            if features:
                out.append(f" Features: {', '.join(features)}")
        out.append("")

    if brief.internal_links:
        out.append(f"### Internal Linking Suggestions ({len(brief.internal_links)})")
        out.extend(
            f" - [{link['anchor_text']}]({link['url']})"
            for link in brief.internal_links[:7]
        )
        out.append("")

    if brief.korean_format_recommendations:
        out.append("### Korean Content Format Recommendations")
        out.extend(f" - {tip}" for tip in brief.korean_format_recommendations)

    return "\n".join(out)
|
||||
|
||||
|
||||
async def main() -> None:
    """CLI entry point: generate a content brief and emit it.

    Output goes to --output (UTF-8 file) when given, otherwise stdout;
    --json switches the rendering from text to JSON.
    """
    args = build_parser().parse_args()

    generator = ContentBriefGenerator()
    try:
        brief = await generator.generate(
            keyword=args.keyword,
            site_url=args.url,
            num_competitors=args.competitors,
        )

        rendered = (
            json.dumps(asdict(brief), ensure_ascii=False, indent=2, default=str)
            if args.json
            else format_text_report(brief)
        )

        if args.output:
            with open(args.output, "w", encoding="utf-8") as fh:
                fh.write(rendered)
            logger.info(f"Output saved to {args.output}")
        else:
            print(rendered)
    finally:
        # Always release the client and report request statistics.
        await generator.close()
        generator.print_stats()


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -0,0 +1,694 @@
|
||||
"""
|
||||
Content Gap Analyzer - Topic Gap Detection & Cluster Mapping
|
||||
=============================================================
|
||||
Purpose: Identify content gaps vs competitors, build topic clusters,
|
||||
and generate prioritized editorial calendars.
|
||||
Python: 3.10+
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.cluster import AgglomerativeClustering
|
||||
|
||||
from base_client import BaseAsyncClient, config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data classes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
class TopicGap:
    """A topic present in competitors but missing from target."""

    topic: str  # gap topic label
    competitor_urls: list[str] = field(default_factory=list)  # competitor pages covering the topic
    competitor_keywords: list[str] = field(default_factory=list)  # keywords competitors rank for
    estimated_traffic: int = 0  # combined competitor traffic estimate
    priority_score: float = 0.0  # computed prioritization score
    difficulty: str = "medium"  # difficulty bucket (default "medium")
    content_type_suggestion: str = "blog"  # recommended content format for closing the gap
|
||||
|
||||
|
||||
@dataclass
class TopicCluster:
    """Topic cluster with pillar and supporting cluster pages."""

    pillar_topic: str  # central topic of the cluster
    pillar_keyword: str = ""  # primary keyword for the pillar page
    cluster_topics: list[str] = field(default_factory=list)  # supporting page topics
    cluster_keywords: list[str] = field(default_factory=list)  # keywords mapped to supporting pages
    total_volume: int = 0  # aggregate search volume across the cluster
    coverage_score: float = 0.0  # how completely the target already covers the cluster
|
||||
|
||||
|
||||
@dataclass
class CalendarEntry:
    """Prioritized editorial calendar entry."""

    topic: str  # content topic to produce
    priority: str = "medium"  # scheduling priority bucket
    target_date: str = ""  # planned publication date
    content_type: str = "blog"  # content format to produce
    target_word_count: int = 1500  # recommended length
    primary_keyword: str = ""  # keyword the piece targets
    estimated_traffic: int = 0  # expected traffic from closing the gap
    cluster_name: str = ""  # topic cluster this entry belongs to
    notes: str = ""  # free-form planning notes
|
||||
|
||||
|
||||
@dataclass
class ContentGapResult:
    """Full content gap analysis result.

    Aggregates the gap list, topic clusters, and editorial calendar
    produced by ContentGapAnalyzer, plus summary comparisons and any
    errors collected along the way.
    """

    target_url: str  # site being analyzed
    competitor_urls: list[str] = field(default_factory=list)  # competitor sites compared against
    timestamp: str = ""  # ISO-8601 analysis time
    target_topics_count: int = 0  # topics found on the target
    competitor_topics_count: int = 0  # topics found across competitors
    gaps: "list[TopicGap]" = field(default_factory=list)  # missing topics, prioritized
    clusters: "list[TopicCluster]" = field(default_factory=list)  # pillar/cluster groupings
    calendar: "list[CalendarEntry]" = field(default_factory=list)  # prioritized editorial calendar
    content_volume_comparison: dict[str, int] = field(default_factory=dict)  # site -> content volume
    # Fix: annotated as a list but previously defaulted to a dict
    # (default_factory=dict), which broke any list operation downstream.
    korean_opportunities: "list[dict[str, Any]]" = field(default_factory=list)
    recommendations: list[str] = field(default_factory=list)  # human-readable action items
    errors: list[str] = field(default_factory=list)  # non-fatal errors encountered
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Korean opportunity patterns
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Korean-market content opportunity detectors: each entry pairs a regex
# (matched against keyword/topic text) with a machine-readable label and a
# Korean display description. The pattern and description strings are
# runtime data — do not translate or reformat them.
KOREAN_OPPORTUNITY_PATTERNS = [
    {"pattern": r"후기|리뷰", "label": "review_content", "description": "후기/리뷰 콘텐츠"},
    {"pattern": r"비용|가격|견적", "label": "pricing_content", "description": "비용/가격 정보 콘텐츠"},
    {"pattern": r"비교|차이", "label": "comparison_content", "description": "비교 콘텐츠"},
    {"pattern": r"추천|베스트|TOP", "label": "recommendation_content", "description": "추천/리스트 콘텐츠"},
    {"pattern": r"방법|하는\s*법|가이드", "label": "how_to_content", "description": "가이드/방법 콘텐츠"},
    {"pattern": r"부작용|주의|위험", "label": "safety_content", "description": "안전/부작용 정보"},
    {"pattern": r"효과|결과|전후", "label": "results_content", "description": "효과/결과 콘텐츠"},
]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ContentGapAnalyzer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class ContentGapAnalyzer(BaseAsyncClient):
    """Analyze content gaps between a target site and competitor sites.

    Pulls keyword/page data from the Ahrefs v3 API, derives topics that
    competitors cover but the target does not, optionally clusters those
    topics (TF-IDF + agglomerative clustering), and turns the result into a
    prioritized editorial calendar plus Korean-market opportunity flags.
    """

    # Shared root for every Ahrefs site-explorer endpoint used below.
    _AHREFS_BASE = "https://api.ahrefs.com/v3/site-explorer"

    def __init__(self, max_concurrent: int = 5, requests_per_second: float = 2.0):
        """Configure concurrency and rate limits on the shared base client."""
        super().__init__(max_concurrent=max_concurrent, requests_per_second=requests_per_second)

    # ------------------------------------------------------------------
    # Ahrefs data retrieval
    # ------------------------------------------------------------------

    async def _ahrefs_get(self, endpoint: str, params: dict[str, Any]) -> dict | None:
        """
        Call one Ahrefs site-explorer endpoint and return the parsed JSON.

        Returns None when no AHREFS_API_KEY is available. The blocking
        ``requests.get`` call is pushed to a worker thread via
        ``asyncio.to_thread`` so that concurrent ``asyncio.gather`` fan-outs
        actually overlap instead of serializing on the event loop (the
        original code called requests.get directly inside async methods).
        HTTP/network errors propagate to callers, which wrap this helper
        in try/except.
        """
        # NOTE(review): config.get_required may itself raise when the key is
        # missing — callers catch that, preserving the original behavior.
        api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None
        if not api_key:
            return None

        resp = await asyncio.to_thread(
            requests.get,
            f"{self._AHREFS_BASE}/{endpoint}",
            params=params,
            headers={"Authorization": f"Bearer {api_key}"},
            timeout=30,
        )
        resp.raise_for_status()
        return resp.json()

    async def get_competitor_topics(self, competitor_url: str, limit: int = 100) -> list[dict]:
        """
        Get top pages and keywords for a competitor via Ahrefs.

        Returns list of dicts: url, traffic, keywords, value, top_keyword.
        Returns an empty list when the API key is not configured or the
        request fails (best-effort semantics preserved from original).
        """
        self.logger.info(f"Fetching competitor topics for {competitor_url}")
        target = urlparse(competitor_url).netloc or competitor_url

        try:
            data = await self._ahrefs_get(
                "top-pages",
                {
                    "target": target,
                    "limit": limit,
                    "select": "url,traffic,keywords,value,top_keyword",
                },
            )
            if data is None:
                self.logger.warning("AHREFS_API_KEY not set; returning empty competitor topics")
                return []
            # Response key varies across Ahrefs payloads: "pages" vs "items".
            pages = data.get("pages", data.get("items", []))
            self.logger.info(f"Retrieved {len(pages)} competitor topics from {competitor_url}")
            return pages
        except Exception as exc:
            self.logger.warning(f"Failed to get competitor topics for {competitor_url}: {exc}")
            return []

    async def get_target_keywords(self, target_url: str, limit: int = 200) -> set[str]:
        """Get the set of keywords (lowercased) the target site already ranks for."""
        self.logger.info(f"Fetching target keywords for {target_url}")
        target = urlparse(target_url).netloc or target_url

        try:
            data = await self._ahrefs_get(
                "organic-keywords",
                {"target": target, "limit": limit, "select": "keyword,position,traffic"},
            )
            if data is None:
                return set()
            keywords = data.get("keywords", data.get("items", []))
            return {kw.get("keyword", "").lower() for kw in keywords if kw.get("keyword")}
        except Exception as exc:
            self.logger.warning(f"Failed to get target keywords: {exc}")
            return set()

    async def get_organic_competitors(self, target_url: str, limit: int = 10) -> list[str]:
        """Discover organic competitor domains via Ahrefs (empty list on failure)."""
        self.logger.info(f"Discovering organic competitors for {target_url}")
        target = urlparse(target_url).netloc or target_url

        try:
            data = await self._ahrefs_get(
                "organic-competitors",
                {"target": target, "limit": limit},
            )
            if data is None:
                return []
            competitors = data.get("competitors", data.get("items", []))
            return [c.get("domain", "") for c in competitors if c.get("domain")]
        except Exception as exc:
            self.logger.warning(f"Failed to discover competitors: {exc}")
            return []

    # ------------------------------------------------------------------
    # Gap analysis
    # ------------------------------------------------------------------

    async def find_topic_gaps(
        self,
        target_url: str,
        competitor_urls: list[str],
    ) -> tuple[list[TopicGap], set[str], dict[str, int]]:
        """
        Identify topics covered by competitors but missing from target.

        Returns:
            - List of TopicGap objects, sorted by priority_score (desc).
            - Set of target keywords (for reference).
            - Content volume comparison dict (site -> topic count).
        """
        # Gather target keywords first; competitor fetches run in parallel.
        target_keywords = await self.get_target_keywords(target_url)

        competitor_tasks = [self.get_competitor_topics(c_url) for c_url in competitor_urls]
        competitor_results = await asyncio.gather(*competitor_tasks, return_exceptions=True)

        competitor_topic_map: dict[str, TopicGap] = {}
        content_volume: dict[str, int] = {target_url: len(target_keywords)}

        for c_url, c_result in zip(competitor_urls, competitor_results):
            if isinstance(c_result, Exception):
                self.logger.warning(f"Error fetching {c_url}: {c_result}")
                continue

            pages = c_result if isinstance(c_result, list) else []
            content_volume[c_url] = len(pages)

            for page in pages:
                # "or" guards against explicit nulls in the API payload
                # (page.get(..., "") would still return None in that case).
                top_keyword = (page.get("top_keyword") or "").strip().lower()
                if not top_keyword:
                    continue

                # Skip if target already covers this exact keyword.
                if top_keyword in target_keywords:
                    continue

                # Fuzzy coverage: substring containment either direction,
                # restricted to target keywords > 3 chars to avoid noise.
                is_covered = any(
                    top_keyword in tk or tk in top_keyword
                    for tk in target_keywords
                    if len(tk) > 3
                )
                if is_covered:
                    continue

                page_traffic = int(page.get("traffic") or 0)
                if top_keyword not in competitor_topic_map:
                    competitor_topic_map[top_keyword] = TopicGap(
                        topic=top_keyword,
                        estimated_traffic=page_traffic,
                    )

                gap = competitor_topic_map[top_keyword]
                gap.competitor_urls.append(page.get("url", c_url))
                gap.competitor_keywords.append(top_keyword)
                gap.estimated_traffic = max(gap.estimated_traffic, page_traffic)

        gaps = list(competitor_topic_map.values())
        self._score_gaps(gaps, len(competitor_urls))
        gaps.sort(key=lambda g: g.priority_score, reverse=True)
        return gaps, target_keywords, content_volume

    def _score_gaps(self, gaps: list[TopicGap], n_competitors: int) -> None:
        """Assign priority_score, difficulty, and content type to each gap in place."""
        for gap in gaps:
            competitor_count = len(set(gap.competitor_urls))
            # Log-scale traffic so ~10k visits maps to the 100-point cap.
            traffic_score = min(100, math.log10(max(gap.estimated_traffic, 1)) / math.log10(10000) * 100)
            # Fraction of competitors covering the topic, as a percentage.
            competition_score = (competitor_count / max(n_competitors, 1)) * 100
            gap.priority_score = round((traffic_score * 0.6) + (competition_score * 0.4), 1)

            # More competitors covering a topic implies a harder SERP.
            if competitor_count >= 3:
                gap.difficulty = "high"
            elif competitor_count >= 2:
                gap.difficulty = "medium"
            else:
                gap.difficulty = "low"

            gap.content_type_suggestion = self._suggest_content_type(gap.topic)

    @staticmethod
    def _suggest_content_type(topic: str) -> str:
        """Suggest a content type based on intent markers (English + Korean) in the topic."""
        topic_lower = topic.lower()
        if any(w in topic_lower for w in ["how to", "guide", "tutorial", "방법", "가이드"]):
            return "guide"
        if any(w in topic_lower for w in ["best", "top", "review", "추천", "후기", "비교"]):
            return "listicle"
        if any(w in topic_lower for w in ["what is", "이란", "뜻", "의미"]):
            return "informational"
        if any(w in topic_lower for w in ["cost", "price", "비용", "가격"]):
            return "landing"
        return "blog"

    # ------------------------------------------------------------------
    # Topic cluster mapping
    # ------------------------------------------------------------------

    def build_topic_clusters(
        self,
        topics: list[str],
        n_clusters: int | None = None,
        min_cluster_size: int = 3,
    ) -> list[TopicCluster]:
        """
        Group topics into pillar/cluster structure using TF-IDF + hierarchical clustering.

        Args:
            topics: List of topic strings.
            n_clusters: Number of clusters (auto-detected if None).
            min_cluster_size: Minimum topics per cluster.

        Returns:
            List of TopicCluster objects, largest clusters first.
        """
        if len(topics) < min_cluster_size:
            self.logger.warning("Too few topics for clustering")
            return []

        # Vectorize topics. NOTE(review): stop_words="english" only filters
        # English; Korean topics pass through unfiltered — confirm intended.
        vectorizer = TfidfVectorizer(
            max_features=500,
            stop_words="english",
            ngram_range=(1, 2),
        )

        try:
            tfidf_matrix = vectorizer.fit_transform(topics)
        except ValueError as exc:
            # Raised e.g. when every topic reduces to stop words.
            self.logger.warning(f"TF-IDF vectorization failed: {exc}")
            return []

        # Auto-detect cluster count: ~5 topics per cluster, capped at 15,
        # and never more clusters than topics - 1.
        if n_clusters is None:
            n_clusters = max(2, min(len(topics) // 5, 15))
        n_clusters = min(n_clusters, len(topics) - 1)

        clustering = AgglomerativeClustering(
            n_clusters=n_clusters,
            metric="cosine",
            linkage="average",
        )
        labels = clustering.fit_predict(tfidf_matrix.toarray())

        # Group topics by cluster label.
        cluster_map: dict[int, list[str]] = defaultdict(list)
        for topic, label in zip(topics, labels):
            cluster_map[label].append(topic)

        clusters: list[TopicCluster] = []
        for label, cluster_topics in sorted(cluster_map.items()):
            if len(cluster_topics) < min_cluster_size:
                continue

            # Pick the longest topic as pillar (usually the broadest phrase).
            pillar = max(cluster_topics, key=len)
            subtopics = [t for t in cluster_topics if t != pillar]

            clusters.append(
                TopicCluster(
                    pillar_topic=pillar,
                    pillar_keyword=pillar,
                    cluster_topics=subtopics[:20],
                    # Independent copy; a slice already allocates a new list.
                    cluster_keywords=subtopics[:20],
                    total_volume=0,
                    coverage_score=0.0,
                )
            )

        clusters.sort(key=lambda c: len(c.cluster_topics), reverse=True)
        return clusters

    # ------------------------------------------------------------------
    # Editorial calendar generation
    # ------------------------------------------------------------------

    def generate_calendar(
        self,
        gaps: list[TopicGap],
        clusters: list[TopicCluster],
        weeks_ahead: int = 12,
        entries_per_week: int = 2,
    ) -> list[CalendarEntry]:
        """
        Generate a prioritized editorial calendar from gaps and clusters.

        Args:
            gaps: List of topic gaps (sorted by priority).
            clusters: List of topic clusters.
            weeks_ahead: Number of weeks to plan.
            entries_per_week: Content pieces per week.

        Returns:
            List of CalendarEntry objects, pillar topics scheduled first.
        """
        calendar: list[CalendarEntry] = []
        today = datetime.now()

        # Map every topic (pillar and subtopic) to its pillar's name.
        topic_to_cluster: dict[str, str] = {}
        for cluster in clusters:
            for topic in cluster.cluster_topics:
                topic_to_cluster[topic] = cluster.pillar_topic
            topic_to_cluster[cluster.pillar_topic] = cluster.pillar_topic

        # Prioritize: pillar topics first, then remaining gaps in priority order.
        pillar_topics = {c.pillar_topic for c in clusters}
        pillar_gaps = [g for g in gaps if g.topic in pillar_topics]
        other_gaps = [g for g in gaps if g.topic not in pillar_topics]
        ordered_gaps = pillar_gaps + other_gaps

        # Target word counts per content type (hoisted out of the loop;
        # previously rebuilt on every iteration).
        word_count_map = {
            "guide": 2500,
            "listicle": 2000,
            "informational": 1800,
            "landing": 1200,
            "blog": 1500,
        }

        max_entries = weeks_ahead * entries_per_week
        week_offset = 0
        slot_in_week = 0

        for gap in ordered_gaps[:max_entries]:
            # Spread slots within a week 3 days apart.
            target_date = today + timedelta(weeks=week_offset, days=slot_in_week * 3)

            if gap.priority_score >= 70:
                priority = "high"
            elif gap.priority_score >= 40:
                priority = "medium"
            else:
                priority = "low"

            calendar.append(
                CalendarEntry(
                    topic=gap.topic,
                    priority=priority,
                    target_date=target_date.strftime("%Y-%m-%d"),
                    content_type=gap.content_type_suggestion,
                    target_word_count=word_count_map.get(gap.content_type_suggestion, 1500),
                    primary_keyword=gap.topic,
                    estimated_traffic=gap.estimated_traffic,
                    cluster_name=topic_to_cluster.get(gap.topic, "uncategorized"),
                )
            )

            slot_in_week += 1
            if slot_in_week >= entries_per_week:
                slot_in_week = 0
                week_offset += 1

        return calendar

    # ------------------------------------------------------------------
    # Korean opportunity detection
    # ------------------------------------------------------------------

    @staticmethod
    def detect_korean_opportunities(gaps: list[TopicGap]) -> list[dict[str, Any]]:
        """Detect Korean-market content opportunities among gap topics.

        Each gap is tagged with at most one pattern (first match wins);
        results are sorted by priority score, highest first.
        """
        opportunities: list[dict[str, Any]] = []

        for gap in gaps:
            for pattern_info in KOREAN_OPPORTUNITY_PATTERNS:
                if re.search(pattern_info["pattern"], gap.topic, re.IGNORECASE):
                    opportunities.append({
                        "topic": gap.topic,
                        "pattern": pattern_info["label"],
                        "description": pattern_info["description"],
                        "estimated_traffic": gap.estimated_traffic,
                        "priority_score": gap.priority_score,
                    })
                    break  # one tag per gap

        opportunities.sort(key=lambda o: o["priority_score"], reverse=True)
        return opportunities

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    async def analyze(
        self,
        target_url: str,
        competitor_urls: list[str],
        build_clusters: bool = False,
    ) -> ContentGapResult:
        """
        Run the full content gap analysis pipeline.

        Args:
            target_url: Target website URL.
            competitor_urls: List of competitor URLs.
            build_clusters: Whether to build topic clusters.

        Returns:
            ContentGapResult with gaps, clusters, calendar, Korean
            opportunities, and recommendations.
        """
        result = ContentGapResult(
            target_url=target_url,
            competitor_urls=competitor_urls,
            timestamp=datetime.now().isoformat(),
        )

        self.logger.info(
            f"Starting gap analysis: {target_url} vs {len(competitor_urls)} competitors"
        )

        # 1. Find topic gaps.
        gaps, target_keywords, content_volume = await self.find_topic_gaps(
            target_url, competitor_urls
        )

        result.gaps = gaps
        result.target_topics_count = len(target_keywords)
        result.competitor_topics_count = sum(content_volume.get(c, 0) for c in competitor_urls)
        result.content_volume_comparison = content_volume

        # 2. Build topic clusters if requested.
        if build_clusters and gaps:
            result.clusters = self.build_topic_clusters([g.topic for g in gaps])

        # 3. Generate editorial calendar (clusters may be empty).
        result.calendar = self.generate_calendar(gaps, result.clusters)

        # 4. Detect Korean opportunities.
        result.korean_opportunities = self.detect_korean_opportunities(gaps)

        # 5. Strategic recommendations.
        result.recommendations = self._generate_recommendations(result)

        self.logger.info(
            f"Gap analysis complete: {len(gaps)} gaps, "
            f"{len(result.clusters)} clusters, "
            f"{len(result.calendar)} calendar entries"
        )

        return result

    @staticmethod
    def _generate_recommendations(result: ContentGapResult) -> list[str]:
        """Generate strategic (Korean-language) recommendations from the gap analysis."""
        recs: list[str] = []

        gap_count = len(result.gaps)
        if gap_count > 50:
            recs.append(
                f"경쟁사 대비 {gap_count}개의 콘텐츠 격차가 발견되었습니다. "
                "우선순위 상위 20개 주제부터 콘텐츠 생성을 시작하세요."
            )
        elif gap_count > 20:
            recs.append(
                f"{gap_count}개의 콘텐츠 격차가 있습니다. "
                "높은 트래픽 기회부터 순차적으로 콘텐츠를 생성하세요."
            )
        elif gap_count > 0:
            recs.append(
                f"{gap_count}개의 콘텐츠 격차가 발견되었습니다. "
                "비교적 적은 격차이므로 빠른 시일 내 모두 커버할 수 있습니다."
            )

        if result.clusters:
            recs.append(
                f"{len(result.clusters)}개의 토픽 클러스터를 구성했습니다. "
                "필러 콘텐츠부터 작성하여 내부 링크 구조를 강화하세요."
            )

        if result.korean_opportunities:
            recs.append(
                f"한국어 시장 기회가 {len(result.korean_opportunities)}개 발견되었습니다. "
                "후기, 비용, 비교 콘텐츠는 한국 검색 시장에서 높은 전환율을 보입니다."
            )

        high_priority = [g for g in result.gaps if g.priority_score >= 70]
        if high_priority:
            top_topics = ", ".join(g.topic for g in high_priority[:3])
            recs.append(
                f"최우선 주제: {top_topics}. "
                "이 주제들은 높은 트래픽 잠재력과 경쟁사 커버리지를 가지고 있습니다."
            )

        if not recs:
            recs.append("경쟁사 대비 콘텐츠 커버리지가 양호합니다. 기존 콘텐츠 최적화에 집중하세요.")

        return recs
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for the content gap analyzer."""
    ap = argparse.ArgumentParser(
        description="SEO Content Gap Analyzer - topic gaps, clusters, calendar",
    )
    ap.add_argument("--target", required=True, help="Target website URL")
    ap.add_argument(
        "--competitor",
        action="append",
        dest="competitors",
        required=True,
        help="Competitor URL (can be repeated)",
    )
    ap.add_argument("--clusters", action="store_true", help="Build topic clusters")
    ap.add_argument("--json", action="store_true", help="Output as JSON")
    ap.add_argument("--output", help="Save output to file")
    return ap
|
||||
|
||||
|
||||
def format_text_report(result: ContentGapResult) -> str:
|
||||
"""Format gap analysis result as human-readable text."""
|
||||
lines: list[str] = []
|
||||
lines.append(f"## Content Gap Analysis: {result.target_url}")
|
||||
lines.append(f"**Date**: {result.timestamp[:10]}")
|
||||
lines.append(f"**Competitors**: {', '.join(result.competitor_urls)}")
|
||||
lines.append("")
|
||||
|
||||
lines.append("### Content Volume Comparison")
|
||||
for site, count in result.content_volume_comparison.items():
|
||||
lines.append(f" - {site}: {count} topics")
|
||||
lines.append("")
|
||||
|
||||
lines.append(f"### Topic Gaps ({len(result.gaps)} found)")
|
||||
for i, gap in enumerate(result.gaps[:20], 1):
|
||||
lines.append(
|
||||
f" {i}. [{gap.priority_score:.0f}] {gap.topic} "
|
||||
f"(traffic: {gap.estimated_traffic}, difficulty: {gap.difficulty})"
|
||||
)
|
||||
lines.append("")
|
||||
|
||||
if result.clusters:
|
||||
lines.append(f"### Topic Clusters ({len(result.clusters)})")
|
||||
for i, cluster in enumerate(result.clusters, 1):
|
||||
lines.append(f" {i}. **{cluster.pillar_topic}** ({len(cluster.cluster_topics)} subtopics)")
|
||||
for sub in cluster.cluster_topics[:5]:
|
||||
lines.append(f" - {sub}")
|
||||
lines.append("")
|
||||
|
||||
if result.calendar:
|
||||
lines.append(f"### Editorial Calendar ({len(result.calendar)} entries)")
|
||||
for entry in result.calendar[:15]:
|
||||
lines.append(
|
||||
f" - [{entry.target_date}] {entry.topic} "
|
||||
f"({entry.content_type}, {entry.target_word_count}w, priority: {entry.priority})"
|
||||
)
|
||||
lines.append("")
|
||||
|
||||
if result.korean_opportunities:
|
||||
lines.append(f"### Korean Market Opportunities ({len(result.korean_opportunities)})")
|
||||
for opp in result.korean_opportunities[:10]:
|
||||
lines.append(f" - {opp['topic']} ({opp['description']})")
|
||||
lines.append("")
|
||||
|
||||
lines.append("### Recommendations")
|
||||
for i, rec in enumerate(result.recommendations, 1):
|
||||
lines.append(f" {i}. {rec}")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
async def main() -> None:
    """CLI entry point: parse arguments, run the analysis, emit the report."""
    args = build_parser().parse_args()

    analyzer = ContentGapAnalyzer()
    result = await analyzer.analyze(
        target_url=args.target,
        competitor_urls=args.competitors,
        build_clusters=args.clusters,
    )

    # Render either machine-readable JSON or the text report.
    if args.json:
        rendered = json.dumps(asdict(result), ensure_ascii=False, indent=2, default=str)
    else:
        rendered = format_text_report(result)

    if args.output:
        with open(args.output, "w", encoding="utf-8") as fh:
            fh.write(rendered)
        logger.info(f"Output saved to {args.output}")
    else:
        print(rendered)

    analyzer.print_stats()
|
||||
|
||||
|
||||
# Script entry point: drive the async CLI workflow on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -0,0 +1,11 @@
|
||||
# 23-seo-content-strategy dependencies
|
||||
requests>=2.31.0
|
||||
aiohttp>=3.9.0
|
||||
beautifulsoup4>=4.12.0
|
||||
lxml>=5.1.0
|
||||
pandas>=2.1.0
|
||||
scikit-learn>=1.3.0
|
||||
tenacity>=8.2.0
|
||||
tqdm>=4.66.0
|
||||
python-dotenv>=1.0.0
|
||||
rich>=13.7.0
|
||||
Reference in New Issue
Block a user