12 new skills: Keyword Strategy, SERP Analysis, Position Tracking, Link Building, Content Strategy, E-Commerce SEO, KPI Framework, International SEO, AI Visibility, Knowledge Graph, Competitor Intel, and Crawl Budget. ~20K lines of Python across 25 domain scripts. Updated skill 11 pipeline table and repo CLAUDE.md. Enhanced skill 18 local SEO workflow from jamie.clinic audit. Note: Skill 26 hreflang_validator.py pending (content filter block). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
739 lines · 28 KiB · Python
"""
|
|
Content Brief Generator - SEO Content Brief Creation
|
|
=====================================================
|
|
Purpose: Generate detailed SEO content briefs with outlines,
|
|
keyword lists, word count targets, and internal linking suggestions.
|
|
Python: 3.10+
|
|
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import math
|
|
import re
|
|
import sys
|
|
from dataclasses import asdict, dataclass, field
|
|
from datetime import datetime
|
|
from typing import Any
|
|
from urllib.parse import urlparse
|
|
|
|
import aiohttp
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
from base_client import BaseAsyncClient, config
|
|
|
|
# Module-level logger for the CLI helpers; the generator class logs via
# self.logger (provided by BaseAsyncClient — confirm against base_client).
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Data classes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@dataclass
class OutlineSection:
    """A single heading section in the content outline."""

    # Heading text as it should appear in the article.
    heading: str
    # Heading depth within the outline.
    level: int = 2  # H2 or H3
    # Bullet points (e.g. H3 subtopics) the writer should cover here.
    talking_points: list[str] = field(default_factory=list)
    # Suggested word budget for this section.
    target_words: int = 200
    # Keywords to work into this section's copy (not populated by
    # extract_outline in this module — reserved for downstream use).
    keywords_to_include: list[str] = field(default_factory=list)
|
|
|
|
|
|
@dataclass
class CompetitorPageAnalysis:
    """Analysis of a single competitor page for the target keyword."""

    # Page URL from the SERP data.
    url: str
    # Page <title> (or SERP-provided title, if any).
    title: str = ""
    # Count of whitespace-separated tokens in the page's visible text.
    word_count: int = 0
    # Extracted headings as {"level": "H2", "text": ...} dicts.
    headings: list[dict[str, str]] = field(default_factory=list)
    # H2 heading texts, used as a proxy for topics covered (max 15).
    topics_covered: list[str] = field(default_factory=list)
    # Free-form content type label (not populated in this module).
    content_type: str = ""
    # True when the page has more than two <img> tags.
    has_images: bool = False
    # True when a <video> or a YouTube/Vimeo <iframe> is present.
    has_video: bool = False
    # True when FAQ text or FAQPage JSON-LD schema is detected.
    has_faq: bool = False
    # True when at least one <table> is present.
    has_table: bool = False
|
|
|
|
|
|
@dataclass
class ContentBrief:
    """Complete SEO content brief."""

    # Primary target keyword the brief is built around.
    primary_keyword: str
    # Close keyword variants from Ahrefs matching terms.
    secondary_keywords: list[str] = field(default_factory=list)
    # Semantically related (LSI) keywords from Ahrefs related terms.
    lsi_keywords: list[str] = field(default_factory=list)
    # Recommended article length (words).
    target_word_count: int = 1500
    # Acceptable (min, max) word count window (+/- 20% of target).
    word_count_range: tuple[int, int] = (1200, 1800)
    # Title suggestion generated from the intent template.
    suggested_title: str = ""
    # Meta description suggestion generated from the intent template.
    meta_description: str = ""
    # Recommended H2/H3 outline.
    outline: list[OutlineSection] = field(default_factory=list)
    # Per-page competitor analyses backing the recommendations.
    competitor_analysis: list[CompetitorPageAnalysis] = field(default_factory=list)
    # Internal link suggestions: url / anchor_text / relevance / current_traffic.
    internal_links: list[dict[str, str]] = field(default_factory=list)
    # Suggested content format (blog, guide, listicle, landing, ...).
    content_format: str = "blog"
    # Korean-market formatting tips keyed off the detected intent.
    korean_format_recommendations: list[str] = field(default_factory=list)
    # Detected search intent (informational/commercial/transactional/navigational).
    search_intent: str = "informational"
    # Free-form notes (not populated by generate()).
    notes: list[str] = field(default_factory=list)
    # ISO-8601 generation timestamp.
    timestamp: str = ""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Search intent patterns
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Regex fragments used to classify a keyword's search intent.
# Each intent maps to English and Korean trigger patterns; matching is a
# case-insensitive substring search (see detect_search_intent).
INTENT_PATTERNS = {
    "transactional": [
        r"buy", r"purchase", r"price", r"cost", r"order", r"shop",
        # Korean: purchase, order, price, cost, discount, coupon
        r"구매", r"주문", r"가격", r"비용", r"할인", r"쿠폰",
    ],
    "navigational": [
        r"login", r"sign in", r"official", r"website",
        # Korean: login, official, homepage
        r"로그인", r"공식", r"홈페이지",
    ],
    "commercial": [
        r"best", r"top", r"review", r"compare", r"vs",
        # Korean: recommendation, comparison, review, review, ranking
        r"추천", r"비교", r"후기", r"리뷰", r"순위",
    ],
    "informational": [
        r"what", r"how", r"why", r"guide", r"tutorial",
        # Korean: "what is", method/how-to, guide, effect, cause
        r"이란", r"방법", r"가이드", r"효과", r"원인",
    ],
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Korean content format recommendations
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Korean-market content formatting recommendations, keyed by search intent.
# The strings are writer-facing guidance (kept in Korean on purpose);
# English glosses are given in the comments.
KOREAN_FORMAT_TIPS = {
    "transactional": [
        # Include a price comparison table (vs. competitor prices)
        "가격 비교표를 포함하세요 (경쟁사 가격 대비)",
        # Present at least 3 real cost examples
        "실제 비용 사례를 3개 이상 제시하세요",
        # Clearly explain payment methods and discount information
        "결제 방법 및 할인 정보를 명확히 안내하세요",
        # Place CTA buttons in multiple locations
        "CTA(행동 유도) 버튼을 여러 위치에 배치하세요",
    ],
    "commercial": [
        # Use an honest, Naver-blog-style review tone
        "네이버 블로그 스타일의 솔직한 후기 톤을 사용하세요",
        # Compare pros and cons in a balanced way
        "장단점을 균형 있게 비교하세요",
        # Include real usage photos or before/after images
        "실제 사용 사진 또는 전후 비교 이미지를 포함하세요",
        # Add a star or score rating system
        "별점 또는 점수 평가 체계를 추가하세요",
        # Include an FAQ section (helps Naver search visibility)
        "FAQ 섹션을 포함하세요 (네이버 검색 노출에 유리)",
    ],
    "informational": [
        # Summarize key information at the top (conclusion-first structure)
        "핵심 정보를 글 상단에 요약하세요 (두괄식 구성)",
        # Pair technical terms with plain-language explanations
        "전문 용어는 쉬운 설명을 병기하세요",
        # Use infographics or charts
        "인포그래픽 또는 도표를 활용하세요",
        # Include 3-5 internal links to related content
        "관련 콘텐츠 내부 링크를 3-5개 포함하세요",
        # Cite experts or sources (strengthens E-E-A-T)
        "전문가 인용 또는 출처를 명시하세요 (E-E-A-T 강화)",
    ],
    "navigational": [
        # Put official information and contact details at the very top
        "공식 정보와 연락처를 최상단에 배치하세요",
        # Embed a map (Naver Maps / Google Maps)
        "지도 임베드를 포함하세요 (네이버 지도/구글 맵)",
        # Clearly display business hours, address, and phone number
        "영업시간, 주소, 전화번호를 명확히 표시하세요",
    ],
}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ContentBriefGenerator
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class ContentBriefGenerator(BaseAsyncClient):
    """Generate comprehensive SEO content briefs.

    Pulls SERP and keyword data from the Ahrefs v3 API (when
    ``AHREFS_API_KEY`` is configured), fetches competitor pages with
    aiohttp, and assembles everything into a :class:`ContentBrief`.
    Every external lookup degrades gracefully: failures are logged and
    the brief is built from whatever partial data is available.
    """

    def __init__(self, max_concurrent: int = 5, requests_per_second: float = 2.0):
        super().__init__(max_concurrent=max_concurrent, requests_per_second=requests_per_second)
        # Shared aiohttp session, created lazily by _ensure_session().
        self.session: aiohttp.ClientSession | None = None

    async def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the shared aiohttp session, (re)creating it if needed."""
        if self.session is None or self.session.closed:
            timeout = aiohttp.ClientTimeout(total=30)
            headers = {
                "User-Agent": "Mozilla/5.0 (compatible; SEOContentBrief/1.0)",
            }
            self.session = aiohttp.ClientSession(timeout=timeout, headers=headers)
        return self.session

    async def close(self) -> None:
        """Close the shared aiohttp session if it is open."""
        if self.session and not self.session.closed:
            await self.session.close()

    @staticmethod
    def _api_key() -> str | None:
        """Return the Ahrefs API key, or None when config has no get_required.

        NOTE: config.get_required may raise when the key is missing
        (depends on base_client — confirm); callers invoke this inside
        try/except blocks that log and continue.
        """
        return config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None

    # ------------------------------------------------------------------
    # Analyze top ranking results
    # ------------------------------------------------------------------

    async def analyze_top_results(
        self,
        keyword: str,
        site_url: str | None = None,
        num_competitors: int = 5,
    ) -> list[CompetitorPageAnalysis]:
        """
        Analyze top ranking pages for a keyword using Ahrefs SERP data.

        Args:
            keyword: Search keyword whose SERP should be analyzed.
            site_url: Currently unused; kept for signature compatibility
                with generate() — TODO confirm intended use.
            num_competitors: Maximum number of SERP results to analyze.

        Returns:
            One CompetitorPageAnalysis per discovered SERP result; pages
            that could not be fetched keep their default (empty) metrics.
        """
        self.logger.info(f"Analyzing top results for: {keyword}")
        results: list[CompetitorPageAnalysis] = []

        # Try Ahrefs SERP overview to discover the ranking URLs.
        try:
            api_key = self._api_key()
            if api_key:
                # requests is synchronous; run it in a worker thread so the
                # event loop is not blocked (the original called it inline,
                # which also serialized the asyncio.gather in generate()).
                resp = await asyncio.to_thread(
                    requests.get,
                    "https://api.ahrefs.com/v3/serp-overview",
                    params={"keyword": keyword, "select": "url,title,position,traffic"},
                    headers={"Authorization": f"Bearer {api_key}"},
                    timeout=30,
                )
                if resp.status_code == 200:
                    data = resp.json()
                    serp_items = data.get("positions", data.get("items", []))[:num_competitors]
                    for item in serp_items:
                        results.append(
                            CompetitorPageAnalysis(
                                url=item.get("url", ""),
                                title=item.get("title", ""),
                            )
                        )
        except Exception as exc:
            self.logger.warning(f"Ahrefs SERP lookup failed: {exc}")

        # Fetch each ranking page and extract on-page metrics.
        session = await self._ensure_session()
        for analysis in results[:num_competitors]:
            if not analysis.url:
                continue
            try:
                async with session.get(analysis.url) as resp:
                    if resp.status != 200:
                        continue
                    html = await resp.text()
                    self._analyze_page_content(analysis, html)
            except Exception as exc:
                self.logger.debug(f"Failed to fetch {analysis.url}: {exc}")

        self.logger.info(f"Analyzed {len(results)} competitor pages")
        return results

    @staticmethod
    def _analyze_page_content(analysis: CompetitorPageAnalysis, html: str) -> None:
        """Parse a competitor page's HTML and populate content metrics in place."""
        soup = BeautifulSoup(html, "html.parser")

        # Title (keep the SERP-provided title when we already have one).
        title_tag = soup.find("title")
        if title_tag and not analysis.title:
            analysis.title = title_tag.get_text(strip=True)

        # Word count over visible text only: strip boilerplate containers first.
        for tag in soup(["script", "style", "nav", "header", "footer"]):
            tag.decompose()
        visible_text = soup.get_text(separator=" ", strip=True)
        analysis.word_count = len(visible_text.split())

        # Headings in DOCUMENT order. (Fix: the original iterated level by
        # level, yielding all H2s before all H3s, which broke the
        # "H3 under its preceding H2" grouping in extract_outline.)
        headings: list[dict[str, str]] = []
        for h in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
            text = h.get_text(strip=True)
            if text:
                headings.append({"level": h.name.upper(), "text": text})
        analysis.headings = headings

        # Content features used for format recommendations.
        analysis.has_images = len(soup.find_all("img")) > 2
        analysis.has_video = bool(soup.find("video") or soup.find("iframe", src=re.compile(r"youtube|vimeo")))
        analysis.has_faq = bool(
            soup.find(string=re.compile(r"FAQ|자주\s*묻는\s*질문|Q\s*&\s*A", re.IGNORECASE))
            or soup.find("script", type="application/ld+json", string=re.compile(r"FAQPage"))
        )
        analysis.has_table = bool(soup.find("table"))

        # Topic coverage approximated by H2 headings (up to 15).
        analysis.topics_covered = [
            h["text"] for h in headings if h["level"] == "H2"
        ][:15]

    # ------------------------------------------------------------------
    # Extract content outline
    # ------------------------------------------------------------------

    def extract_outline(
        self,
        keyword: str,
        top_results: list[CompetitorPageAnalysis],
    ) -> list[OutlineSection]:
        """
        Build recommended H2/H3 outline by aggregating competitor headings.

        Counts how many competitors cover each H2 topic, orders topics by
        that frequency, attaches de-duplicated H3 subtopics as talking
        points, and appends a FAQ section when at least two competitors
        have one.

        Args:
            keyword: Primary keyword (used only in the FAQ talking points).
            top_results: Competitor analyses with populated headings.

        Returns:
            Ordered outline of up to 8 H2 sections plus an optional FAQ.
        """
        # Count H2 occurrences and group H3s under their preceding H2.
        h2_topics: dict[str, int] = {}
        h3_by_h2: dict[str, list[str]] = {}

        for result in top_results:
            current_h2 = ""
            for heading in result.headings:
                text = heading["text"].strip()
                if heading["level"] == "H2":
                    current_h2 = text
                    h2_topics[text] = h2_topics.get(text, 0) + 1
                elif heading["level"] == "H3" and current_h2:
                    h3_by_h2.setdefault(current_h2, []).append(text)

        # Sort H2s by frequency (most common topics first).
        sorted_h2s = sorted(h2_topics.items(), key=lambda x: x[1], reverse=True)

        outline: list[OutlineSection] = []
        target_word_count = self.calculate_word_count(top_results)
        # Divisor floor of 5 keeps short outlines from getting huge budgets.
        words_per_section = target_word_count // max(len(sorted_h2s), 5)

        for h2_text, _frequency in sorted_h2s[:8]:
            section = OutlineSection(
                heading=h2_text,
                level=2,
                target_words=words_per_section,
                talking_points=[],
            )
            # Attach up to 5 unique H3 subtopics as talking points.
            # (Fix: the original also built OutlineSection objects for
            # each H3 and discarded them — dead code removed.)
            if h2_text in h3_by_h2:
                section.talking_points.extend(list(dict.fromkeys(h3_by_h2[h2_text]))[:5])
            outline.append(section)

        # Ensure a FAQ section when it is a common competitor feature.
        faq_count = sum(1 for r in top_results if r.has_faq)
        if faq_count >= 2 and not any("FAQ" in s.heading or "질문" in s.heading for s in outline):
            outline.append(OutlineSection(
                heading="자주 묻는 질문 (FAQ)",
                level=2,
                target_words=300,
                talking_points=[
                    f"{keyword} 관련 자주 묻는 질문 5-7개",
                    "Schema markup (FAQPage) 적용 권장",
                ],
            ))

        return outline

    # ------------------------------------------------------------------
    # Keyword suggestions
    # ------------------------------------------------------------------

    async def suggest_keywords(self, primary_keyword: str) -> dict[str, list[str]]:
        """
        Generate primary, secondary, and LSI keyword suggestions.

        Uses the Ahrefs matching-terms endpoint for secondary keywords and
        the related-terms endpoint for LSI keywords. Returns only the
        primary keyword when no API key is configured or a lookup fails.

        Returns:
            Dict with "primary", "secondary", and "lsi" keyword lists.
        """
        self.logger.info(f"Generating keyword suggestions for: {primary_keyword}")
        result: dict[str, list[str]] = {
            "primary": [primary_keyword],
            "secondary": [],
            "lsi": [],
        }

        try:
            api_key = self._api_key()
            if not api_key:
                self.logger.warning("AHREFS_API_KEY not set; returning basic keywords only")
                return result

            # Matching terms -> secondary keywords (run blocking HTTP in a
            # worker thread to keep the event loop responsive).
            resp = await asyncio.to_thread(
                requests.get,
                "https://api.ahrefs.com/v3/keywords-explorer/matching-terms",
                params={"keyword": primary_keyword, "limit": 20, "select": "keyword,volume,difficulty"},
                headers={"Authorization": f"Bearer {api_key}"},
                timeout=30,
            )
            if resp.status_code == 200:
                data = resp.json()
                for term in data.get("keywords", data.get("items", [])):
                    kw = term.get("keyword", "")
                    if kw and kw.lower() != primary_keyword.lower():
                        result["secondary"].append(kw)

            # Related terms -> LSI keywords (skip duplicates of secondary).
            resp2 = await asyncio.to_thread(
                requests.get,
                "https://api.ahrefs.com/v3/keywords-explorer/related-terms",
                params={"keyword": primary_keyword, "limit": 15, "select": "keyword,volume"},
                headers={"Authorization": f"Bearer {api_key}"},
                timeout=30,
            )
            if resp2.status_code == 200:
                data2 = resp2.json()
                for term in data2.get("keywords", data2.get("items", [])):
                    kw = term.get("keyword", "")
                    if kw and kw not in result["secondary"]:
                        result["lsi"].append(kw)

        except Exception as exc:
            self.logger.warning(f"Keyword suggestion lookup failed: {exc}")

        return result

    # ------------------------------------------------------------------
    # Word count calculation
    # ------------------------------------------------------------------

    @staticmethod
    def calculate_word_count(top_results: list[CompetitorPageAnalysis]) -> int:
        """
        Calculate target word count based on top 5 ranking pages.

        Averages the word counts of up to five analyzed pages (ignoring
        pages with <= 100 words as likely fetch failures), rounds to the
        nearest 100, and clamps into [800, 5000]. Falls back to 1500
        when no usable page data is available.
        """
        word_counts = [r.word_count for r in top_results[:5] if r.word_count > 100]

        if not word_counts:
            return 1500  # Default fallback

        avg = sum(word_counts) / len(word_counts)
        # Round to nearest 100, then clamp to a sane editorial range.
        target = round(avg / 100) * 100
        return max(800, min(5000, target))

    # ------------------------------------------------------------------
    # Internal linking suggestions
    # ------------------------------------------------------------------

    async def suggest_internal_links(
        self,
        keyword: str,
        site_url: str,
    ) -> list[dict[str, str]]:
        """
        Find related existing pages on the site for internal linking.

        Uses Ahrefs organic keywords for the site, then keeps pages whose
        ranking keyword shares at least one word with the target keyword.

        Args:
            keyword: Target keyword of the new content.
            site_url: Site URL (netloc is used as the Ahrefs target).

        Returns:
            Up to 10 link suggestions sorted by current traffic, each with
            url / anchor_text / relevance / current_traffic keys.
        """
        self.logger.info(f"Finding internal link opportunities for {keyword} on {site_url}")
        links: list[dict[str, str]] = []
        target = urlparse(site_url).netloc or site_url

        try:
            api_key = self._api_key()
            if not api_key:
                return links

            # Blocking HTTP in a worker thread (see analyze_top_results).
            resp = await asyncio.to_thread(
                requests.get,
                "https://api.ahrefs.com/v3/site-explorer/organic-keywords",
                params={
                    "target": target,
                    "limit": 50,
                    "select": "keyword,url,position,traffic",
                },
                headers={"Authorization": f"Bearer {api_key}"},
                timeout=30,
            )
            if resp.status_code != 200:
                return links

            data = resp.json()
            keywords_data = data.get("keywords", data.get("items", []))

            # Relevance heuristic: word overlap between the target keyword
            # and each page's ranking keyword.
            keyword_lower = keyword.lower()
            keyword_words = set(keyword_lower.split())

            seen_urls: set[str] = set()
            for item in keywords_data:
                kw = item.get("keyword", "").lower()
                url = item.get("url", "")

                if not url or url in seen_urls:
                    continue

                kw_words = set(kw.split())
                overlap = keyword_words & kw_words
                # Skip pages ranking for the exact same keyword: they would
                # compete with the new content rather than support it.
                if overlap and kw != keyword_lower:
                    links.append({
                        "url": url,
                        "anchor_text": kw,
                        "relevance": f"{len(overlap)}/{len(keyword_words)} word overlap",
                        "current_traffic": str(item.get("traffic", 0)),
                    })
                    seen_urls.add(url)

            links.sort(key=lambda l: int(l.get("current_traffic", "0")), reverse=True)

        except Exception as exc:
            self.logger.warning(f"Internal link suggestion failed: {exc}")

        return links[:10]

    # ------------------------------------------------------------------
    # Search intent detection
    # ------------------------------------------------------------------

    @staticmethod
    def detect_search_intent(keyword: str) -> str:
        """Classify keyword search intent.

        Scores the keyword against INTENT_PATTERNS (case-insensitive
        substring match) and returns the highest-scoring intent,
        defaulting to "informational" when nothing matches.
        """
        keyword_lower = keyword.lower()
        scores: dict[str, int] = {}

        for intent, patterns in INTENT_PATTERNS.items():
            score = sum(1 for p in patterns if re.search(p, keyword_lower, re.IGNORECASE))
            if score > 0:
                scores[intent] = score

        if not scores:
            return "informational"
        return max(scores, key=scores.get)

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    async def generate(
        self,
        keyword: str,
        site_url: str,
        num_competitors: int = 5,
    ) -> ContentBrief:
        """
        Generate a comprehensive SEO content brief.

        Args:
            keyword: Primary target keyword.
            site_url: Target website URL.
            num_competitors: Number of competitor pages to analyze.

        Returns:
            ContentBrief with outline, keywords, and recommendations.
        """
        self.logger.info(f"Generating content brief for: {keyword}")

        # Detect search intent (drives title/meta templates and format tips).
        intent = self.detect_search_intent(keyword)

        # Run the three independent analyses concurrently.
        top_results, keyword_data, internal_links = await asyncio.gather(
            self.analyze_top_results(keyword, site_url, num_competitors),
            self.suggest_keywords(keyword),
            self.suggest_internal_links(keyword, site_url),
        )

        # Word count target with a +/- 20% acceptable window.
        target_word_count = self.calculate_word_count(top_results)
        word_count_min = int(target_word_count * 0.8)
        word_count_max = int(target_word_count * 1.2)

        outline = self.extract_outline(keyword, top_results)
        suggested_title = self._generate_title(keyword, intent)
        meta_description = self._generate_meta_description(keyword, intent)
        korean_tips = KOREAN_FORMAT_TIPS.get(intent, KOREAN_FORMAT_TIPS["informational"])

        brief = ContentBrief(
            primary_keyword=keyword,
            secondary_keywords=keyword_data.get("secondary", [])[:10],
            lsi_keywords=keyword_data.get("lsi", [])[:10],
            target_word_count=target_word_count,
            word_count_range=(word_count_min, word_count_max),
            suggested_title=suggested_title,
            meta_description=meta_description,
            outline=outline,
            competitor_analysis=top_results,
            internal_links=internal_links,
            content_format=self._suggest_format(intent, top_results),
            korean_format_recommendations=korean_tips,
            search_intent=intent,
            timestamp=datetime.now().isoformat(),
        )

        self.logger.info(
            f"Brief generated: {len(outline)} sections, "
            f"{target_word_count} target words, "
            f"{len(keyword_data.get('secondary', []))} secondary keywords"
        )

        return brief

    @staticmethod
    def _generate_title(keyword: str, intent: str) -> str:
        """Generate a suggested title from an intent-specific template."""
        templates = {
            "informational": "{keyword} - 완벽 가이드 (2025년 최신)",
            "commercial": "{keyword} 추천 TOP 10 비교 (전문가 리뷰)",
            "transactional": "{keyword} 가격 비교 및 구매 가이드",
            "navigational": "{keyword} - 공식 안내",
        }
        template = templates.get(intent, templates["informational"])
        return template.format(keyword=keyword)

    @staticmethod
    def _generate_meta_description(keyword: str, intent: str) -> str:
        """Generate a suggested meta description from an intent template."""
        templates = {
            "informational": (
                f"{keyword}에 대해 알아야 할 모든 것을 정리했습니다. "
                "전문가가 알려주는 핵심 정보와 실용적인 가이드를 확인하세요."
            ),
            "commercial": (
                f"{keyword} 비교 분석! 장단점, 가격, 실제 후기를 "
                "한눈에 비교하고 최적의 선택을 하세요."
            ),
            "transactional": (
                f"{keyword} 최저가 비교 및 구매 방법을 안내합니다. "
                "합리적인 가격으로 구매하는 팁을 확인하세요."
            ),
            "navigational": (
                f"{keyword} 공식 정보 및 이용 안내. "
                "정확한 정보를 빠르게 확인하세요."
            ),
        }
        return templates.get(intent, templates["informational"])

    @staticmethod
    def _suggest_format(intent: str, results: list[CompetitorPageAnalysis]) -> str:
        """Suggest content format based on intent and competitor analysis."""
        if intent == "commercial":
            return "listicle"
        if intent == "informational":
            return "guide"
        if intent == "transactional":
            return "landing"

        # Navigational (or unknown) intent: infer from competitor depth.
        avg_word_count = (
            sum(r.word_count for r in results) / len(results) if results else 0
        )
        if avg_word_count > 3000:
            return "comprehensive_guide"
        return "blog"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CLI
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def build_parser() -> argparse.ArgumentParser:
    """Construct the command-line parser for the brief generator."""
    p = argparse.ArgumentParser(description="SEO Content Brief Generator")
    p.add_argument("--keyword", required=True, help="Primary target keyword")
    p.add_argument("--url", required=True, help="Target website URL")
    p.add_argument(
        "--competitors",
        type=int,
        default=5,
        help="Number of competitor pages to analyze (default: 5)",
    )
    p.add_argument("--json", action="store_true", help="Output as JSON")
    p.add_argument("--output", help="Save output to file")
    return p
|
|
|
|
|
|
def format_text_report(brief: ContentBrief) -> str:
    """Render a ContentBrief as a markdown-flavoured text report."""
    out: list[str] = [
        f"## Content Brief: {brief.primary_keyword}",
        f"**Date**: {brief.timestamp[:10]}",
        f"**Search Intent**: {brief.search_intent}",
        f"**Content Format**: {brief.content_format}",
        "",
        "### Target Metrics",
        f"- Word count: {brief.target_word_count} ({brief.word_count_range[0]}-{brief.word_count_range[1]})",
        f"- Suggested title: {brief.suggested_title}",
        f"- Meta description: {brief.meta_description}",
        "",
        "### Keywords",
        f"- **Primary**: {brief.primary_keyword}",
    ]
    if brief.secondary_keywords:
        out.append(f"- **Secondary**: {', '.join(brief.secondary_keywords[:8])}")
    if brief.lsi_keywords:
        out.append(f"- **LSI**: {', '.join(brief.lsi_keywords[:8])}")
    out.append("")

    # Outline: H2s at two-space indent, H3s marked deeper.
    out.append("### Content Outline")
    for sec in brief.outline:
        marker = "##" if sec.level == 2 else "###"
        out.append(f"  {marker} {sec.heading} (~{sec.target_words}w)")
        out.extend(f"    - {point}" for point in sec.talking_points)
    out.append("")

    if brief.competitor_analysis:
        out.append(f"### Competitor Analysis ({len(brief.competitor_analysis)} pages)")
        for page in brief.competitor_analysis:
            out.append(f"  - **{page.title or page.url}**")
            out.append(f"    Word count: {page.word_count} | Headings: {len(page.headings)}")
            flags = [
                label
                for present, label in (
                    (page.has_images, "images"),
                    (page.has_video, "video"),
                    (page.has_faq, "FAQ"),
                    (page.has_table, "table"),
                )
                if present
            ]
            if flags:
                out.append(f"    Features: {', '.join(flags)}")
        out.append("")

    if brief.internal_links:
        out.append(f"### Internal Linking Suggestions ({len(brief.internal_links)})")
        for link in brief.internal_links[:7]:
            out.append(f"  - [{link['anchor_text']}]({link['url']})")
        out.append("")

    if brief.korean_format_recommendations:
        out.append("### Korean Content Format Recommendations")
        out.extend(f"  - {tip}" for tip in brief.korean_format_recommendations)

    return "\n".join(out)
|
|
|
|
|
|
async def main() -> None:
    """CLI entry point: parse arguments, generate a brief, emit the report."""
    args = build_parser().parse_args()

    generator = ContentBriefGenerator()
    try:
        brief = await generator.generate(
            keyword=args.keyword,
            site_url=args.url,
            num_competitors=args.competitors,
        )

        # Render either machine-readable JSON or the text report.
        if args.json:
            rendered = json.dumps(asdict(brief), ensure_ascii=False, indent=2, default=str)
        else:
            rendered = format_text_report(brief)

        # Write to the requested file, or print to stdout.
        if args.output:
            with open(args.output, "w", encoding="utf-8") as fh:
                fh.write(rendered)
            logger.info(f"Output saved to {args.output}")
        else:
            print(rendered)

    finally:
        await generator.close()
        generator.print_stats()
|
|
|
|
|
|
# Script entry point: run the async CLI under a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|