"""
Content Gap Analyzer - Topic Gap Detection & Cluster Mapping
=============================================================
Purpose: Identify content gaps vs competitors, build topic clusters,
         and generate prioritized editorial calendars.
Python: 3.10+
"""

import argparse
import asyncio
import json
import logging
import math
import re
import sys
from collections import defaultdict
from dataclasses import asdict, dataclass, field
from datetime import datetime, timedelta
from typing import Any
from urllib.parse import urlparse

import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering

from base_client import BaseAsyncClient, config

# Module-level logger; in this file it is used by main() to report where the
# output file was written.
logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------

@dataclass
class TopicGap:
    """A topic present in competitors but missing from target.

    Instances are created and scored by ``ContentGapAnalyzer.find_topic_gaps``.
    """
    # Lower-cased, stripped top keyword of the competitor page(s).
    topic: str
    # One entry per competitor page whose top keyword equals ``topic``
    # (the page URL, or the competitor URL when the page has no "url").
    competitor_urls: list[str] = field(default_factory=list)
    # The matched top keyword, appended once per contributing page.
    competitor_keywords: list[str] = field(default_factory=list)
    # Highest "traffic" value seen among the contributing pages.
    estimated_traffic: int = 0
    # Weighted score: 60% log-scaled traffic, 40% competitor coverage.
    priority_score: float = 0.0
    # "low", "medium" or "high", assigned in find_topic_gaps.
    difficulty: str = "medium"
    # One of "guide", "listicle", "informational", "landing", "blog".
    content_type_suggestion: str = "blog"


@dataclass
class TopicCluster:
    """Topic cluster with pillar and supporting cluster pages.

    Built by ``ContentGapAnalyzer.build_topic_clusters``.
    """
    # Topic chosen as the pillar (the longest topic string in the cluster).
    pillar_topic: str
    # Set to the same value as ``pillar_topic`` by build_topic_clusters.
    pillar_keyword: str = ""
    # Remaining topics of the cluster, capped at 20.
    cluster_topics: list[str] = field(default_factory=list)
    # Populated with the same strings as ``cluster_topics``.
    cluster_keywords: list[str] = field(default_factory=list)
    # Not computed by build_topic_clusters; it always passes 0.
    total_volume: int = 0
    # Not computed by build_topic_clusters; it always passes 0.0.
    coverage_score: float = 0.0


@dataclass
class CalendarEntry:
    """Prioritized editorial calendar entry.

    Built by ``ContentGapAnalyzer.generate_calendar`` from a ``TopicGap``.
    """
    # Topic of the gap this entry was generated from.
    topic: str
    # "high" (score >= 70), "medium" (score >= 40) or "low".
    priority: str = "medium"
    # Planned publication date formatted as YYYY-MM-DD.
    target_date: str = ""
    # Copied from the gap's content_type_suggestion.
    content_type: str = "blog"
    # Looked up from the content type; 1500 when the type is unknown.
    target_word_count: int = 1500
    # generate_calendar sets this to the gap topic.
    primary_keyword: str = ""
    # Copied from the gap's estimated_traffic.
    estimated_traffic: int = 0
    # Pillar topic of the cluster containing the topic, else "uncategorized".
    cluster_name: str = ""
    # Free-form notes; not populated by generate_calendar.
    notes: str = ""


@dataclass
class ContentGapResult:
    """Full content gap analysis result.

    Created and filled in step by step by ``ContentGapAnalyzer.analyze``.
    """
    # Target site as passed to analyze().
    target_url: str
    # Competitor sites as passed to analyze().
    competitor_urls: list[str] = field(default_factory=list)
    # ISO-8601 timestamp taken when the analysis started.
    timestamp: str = ""
    # Number of keywords the target already ranks for.
    target_topics_count: int = 0
    # Sum of the page counts retrieved for all competitors.
    competitor_topics_count: int = 0
    # Gaps sorted by descending priority score.
    gaps: list[TopicGap] = field(default_factory=list)
    # Topic clusters; stays empty unless clustering was requested.
    clusters: list[TopicCluster] = field(default_factory=list)
    # Editorial calendar generated from the gaps.
    calendar: list[CalendarEntry] = field(default_factory=list)
    # Site URL -> number of keywords (target) or pages (competitors).
    content_volume_comparison: dict[str, int] = field(default_factory=dict)
    # Must default to a list: the annotation is a list and
    # detect_korean_opportunities returns one. A dict default would make a
    # fresh result serialize as {} and break list operations such as append.
    korean_opportunities: list[dict[str, Any]] = field(default_factory=list)
    # Human-readable recommendations (Korean text).
    recommendations: list[str] = field(default_factory=list)
    # Error messages; not written by analyze() in this file.
    errors: list[str] = field(default_factory=list)


# ---------------------------------------------------------------------------
# Korean opportunity patterns
# ---------------------------------------------------------------------------

KOREAN_OPPORTUNITY_PATTERNS = [
    {"pattern": r"후기|리뷰", "label": "review_content", "description": "후기/리뷰 콘텐츠"},
    {"pattern": r"비용|가격|견적", "label": "pricing_content", "description": "비용/가격 정보 콘텐츠"},
    {"pattern": r"비교|차이", "label": "comparison_content", "description": "비교 콘텐츠"},
    {"pattern": r"추천|베스트|TOP", "label": "recommendation_content", "description": "추천/리스트 콘텐츠"},
    {"pattern": r"방법|하는\s*법|가이드", "label": "how_to_content", "description": "가이드/방법 콘텐츠"},
    {"pattern": r"부작용|주의|위험", "label": "safety_content", "description": "안전/부작용 정보"},
    {"pattern": r"효과|결과|전후", "label": "results_content", "description": "효과/결과 콘텐츠"},
]


# ---------------------------------------------------------------------------
# ContentGapAnalyzer
# ---------------------------------------------------------------------------

class ContentGapAnalyzer(BaseAsyncClient):
    """Analyze content gaps between target and competitor sites."""

    def __init__(self, max_concurrent: int = 5, requests_per_second: float = 2.0):
        """Pass both settings through unchanged to the base client's ``__init__``."""
        super().__init__(
            max_concurrent=max_concurrent,
            requests_per_second=requests_per_second,
        )

    # ------------------------------------------------------------------
    # Ahrefs data retrieval
    # ------------------------------------------------------------------

    async def get_competitor_topics(self, competitor_url: str, limit: int = 100) -> list[dict]:
        """
        Get top pages and keywords for a competitor via Ahrefs.

        Returns list of dicts: url, traffic, keywords, top_keyword, title.
        """
        self.logger.info(f"Fetching competitor topics for {competitor_url}")
        target = urlparse(competitor_url).netloc or competitor_url

        try:
            api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None
            if not api_key:
                self.logger.warning("AHREFS_API_KEY not set; returning empty competitor topics")
                return []

            resp = requests.get(
                "https://api.ahrefs.com/v3/site-explorer/top-pages",
                params={
                    "target": target,
                    "limit": limit,
                    "select": "url,traffic,keywords,value,top_keyword",
                },
                headers={"Authorization": f"Bearer {api_key}"},
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()
            pages = data.get("pages", data.get("items", []))
            self.logger.info(f"Retrieved {len(pages)} competitor topics from {competitor_url}")
            return pages
        except Exception as exc:
            self.logger.warning(f"Failed to get competitor topics for {competitor_url}: {exc}")
            return []

    async def get_target_keywords(self, target_url: str, limit: int = 200) -> set[str]:
        """Get the set of keywords the target site already ranks for."""
        self.logger.info(f"Fetching target keywords for {target_url}")
        target = urlparse(target_url).netloc or target_url

        try:
            api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None
            if not api_key:
                return set()

            resp = requests.get(
                "https://api.ahrefs.com/v3/site-explorer/organic-keywords",
                params={"target": target, "limit": limit, "select": "keyword,position,traffic"},
                headers={"Authorization": f"Bearer {api_key}"},
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()
            keywords = data.get("keywords", data.get("items", []))
            return {kw.get("keyword", "").lower() for kw in keywords if kw.get("keyword")}
        except Exception as exc:
            self.logger.warning(f"Failed to get target keywords: {exc}")
            return set()

    async def get_organic_competitors(self, target_url: str, limit: int = 10) -> list[str]:
        """Discover organic competitors via Ahrefs."""
        self.logger.info(f"Discovering organic competitors for {target_url}")
        target = urlparse(target_url).netloc or target_url

        try:
            api_key = config.get_required("AHREFS_API_KEY") if hasattr(config, "get_required") else None
            if not api_key:
                return []

            resp = requests.get(
                "https://api.ahrefs.com/v3/site-explorer/organic-competitors",
                params={"target": target, "limit": limit},
                headers={"Authorization": f"Bearer {api_key}"},
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()
            competitors = data.get("competitors", data.get("items", []))
            return [c.get("domain", "") for c in competitors if c.get("domain")]
        except Exception as exc:
            self.logger.warning(f"Failed to discover competitors: {exc}")
            return []

    # ------------------------------------------------------------------
    # Gap analysis
    # ------------------------------------------------------------------

    async def find_topic_gaps(
        self,
        target_url: str,
        competitor_urls: list[str],
    ) -> tuple[list[TopicGap], set[str], dict[str, int]]:
        """
        Identify topics covered by competitors but missing from target.

        Returns:
            - List of TopicGap objects.
            - Set of target keywords (for reference).
            - Content volume comparison dict.
        """
        # Gather target keywords
        target_keywords = await self.get_target_keywords(target_url)

        # Gather competitor data in parallel
        competitor_tasks = [self.get_competitor_topics(c_url) for c_url in competitor_urls]
        competitor_results = await asyncio.gather(*competitor_tasks, return_exceptions=True)

        # Build competitor topic map
        competitor_topic_map: dict[str, TopicGap] = {}
        content_volume: dict[str, int] = {target_url: len(target_keywords)}

        for c_url, c_result in zip(competitor_urls, competitor_results):
            if isinstance(c_result, Exception):
                self.logger.warning(f"Error fetching {c_url}: {c_result}")
                continue

            pages = c_result if isinstance(c_result, list) else []
            content_volume[c_url] = len(pages)

            for page in pages:
                top_keyword = page.get("top_keyword", "").strip().lower()
                if not top_keyword:
                    continue

                # Skip if target already covers this keyword
                if top_keyword in target_keywords:
                    continue

                # Check for fuzzy matches (keyword contained in target set)
                is_covered = any(
                    top_keyword in tk or tk in top_keyword
                    for tk in target_keywords
                    if len(tk) > 3
                )
                if is_covered:
                    continue

                if top_keyword not in competitor_topic_map:
                    competitor_topic_map[top_keyword] = TopicGap(
                        topic=top_keyword,
                        estimated_traffic=int(page.get("traffic", 0)),
                    )

                gap = competitor_topic_map[top_keyword]
                gap.competitor_urls.append(page.get("url", c_url))
                gap.competitor_keywords.append(top_keyword)
                gap.estimated_traffic = max(gap.estimated_traffic, int(page.get("traffic", 0)))

        # Score gaps
        gaps = list(competitor_topic_map.values())
        for gap in gaps:
            competitor_count = len(set(gap.competitor_urls))
            traffic_score = min(100, math.log10(max(gap.estimated_traffic, 1)) / math.log10(10000) * 100)
            competition_score = (competitor_count / max(len(competitor_urls), 1)) * 100
            gap.priority_score = round((traffic_score * 0.6) + (competition_score * 0.4), 1)

            # Difficulty estimation
            if competitor_count >= 3:
                gap.difficulty = "high"
            elif competitor_count >= 2:
                gap.difficulty = "medium"
            else:
                gap.difficulty = "low"

            # Content type suggestion
            gap.content_type_suggestion = self._suggest_content_type(gap.topic)

        gaps.sort(key=lambda g: g.priority_score, reverse=True)
        return gaps, target_keywords, content_volume

    @staticmethod
    def _suggest_content_type(topic: str) -> str:
        """Suggest content type based on topic keywords."""
        topic_lower = topic.lower()
        if any(w in topic_lower for w in ["how to", "guide", "tutorial", "방법", "가이드"]):
            return "guide"
        if any(w in topic_lower for w in ["best", "top", "review", "추천", "후기", "비교"]):
            return "listicle"
        if any(w in topic_lower for w in ["what is", "이란", "뜻", "의미"]):
            return "informational"
        if any(w in topic_lower for w in ["cost", "price", "비용", "가격"]):
            return "landing"
        return "blog"

    # ------------------------------------------------------------------
    # Topic cluster mapping
    # ------------------------------------------------------------------

    def build_topic_clusters(
        self,
        topics: list[str],
        n_clusters: int | None = None,
        min_cluster_size: int = 3,
    ) -> list[TopicCluster]:
        """
        Group topics into pillar/cluster structure using TF-IDF + hierarchical clustering.

        Args:
            topics: List of topic strings.
            n_clusters: Number of clusters (auto-detected if None).
            min_cluster_size: Minimum topics per cluster.

        Returns:
            List of TopicCluster objects.
        """
        if len(topics) < min_cluster_size:
            self.logger.warning("Too few topics for clustering")
            return []

        # Vectorize topics
        vectorizer = TfidfVectorizer(
            max_features=500,
            stop_words="english",
            ngram_range=(1, 2),
        )

        try:
            tfidf_matrix = vectorizer.fit_transform(topics)
        except ValueError as exc:
            self.logger.warning(f"TF-IDF vectorization failed: {exc}")
            return []

        # Auto-detect cluster count
        if n_clusters is None:
            n_clusters = max(2, min(len(topics) // 5, 15))

        n_clusters = min(n_clusters, len(topics) - 1)

        # Hierarchical clustering
        clustering = AgglomerativeClustering(
            n_clusters=n_clusters,
            metric="cosine",
            linkage="average",
        )
        labels = clustering.fit_predict(tfidf_matrix.toarray())

        # Build cluster objects
        cluster_map: dict[int, list[str]] = defaultdict(list)
        for topic, label in zip(topics, labels):
            cluster_map[label].append(topic)

        clusters: list[TopicCluster] = []
        for label, cluster_topics in sorted(cluster_map.items()):
            if len(cluster_topics) < min_cluster_size:
                continue

            # Pick the longest topic as pillar (usually broader)
            pillar = max(cluster_topics, key=len)
            subtopics = [t for t in cluster_topics if t != pillar]

            cluster = TopicCluster(
                pillar_topic=pillar,
                pillar_keyword=pillar,
                cluster_topics=subtopics[:20],
                cluster_keywords=[t for t in subtopics[:20]],
                total_volume=0,
                coverage_score=0.0,
            )
            clusters.append(cluster)

        clusters.sort(key=lambda c: len(c.cluster_topics), reverse=True)
        return clusters

    # ------------------------------------------------------------------
    # Editorial calendar generation
    # ------------------------------------------------------------------

    def generate_calendar(
        self,
        gaps: list[TopicGap],
        clusters: list[TopicCluster],
        weeks_ahead: int = 12,
        entries_per_week: int = 2,
    ) -> list[CalendarEntry]:
        """
        Generate prioritized editorial calendar from gaps and clusters.

        Args:
            gaps: List of topic gaps (sorted by priority).
            clusters: List of topic clusters.
            weeks_ahead: Number of weeks to plan.
            entries_per_week: Content pieces per week.

        Returns:
            List of CalendarEntry objects.
        """
        calendar: list[CalendarEntry] = []
        today = datetime.now()

        # Build cluster lookup
        topic_to_cluster: dict[str, str] = {}
        for cluster in clusters:
            for topic in cluster.cluster_topics:
                topic_to_cluster[topic] = cluster.pillar_topic
            topic_to_cluster[cluster.pillar_topic] = cluster.pillar_topic

        # Prioritize: pillar topics first, then by priority score
        pillar_topics = {c.pillar_topic for c in clusters}
        pillar_gaps = [g for g in gaps if g.topic in pillar_topics]
        other_gaps = [g for g in gaps if g.topic not in pillar_topics]
        ordered_gaps = pillar_gaps + other_gaps

        max_entries = weeks_ahead * entries_per_week
        week_offset = 0
        slot_in_week = 0

        for gap in ordered_gaps[:max_entries]:
            target_date = today + timedelta(weeks=week_offset, days=slot_in_week * 3)

            # Determine priority label
            if gap.priority_score >= 70:
                priority = "high"
            elif gap.priority_score >= 40:
                priority = "medium"
            else:
                priority = "low"

            # Word count based on content type
            word_count_map = {
                "guide": 2500,
                "listicle": 2000,
                "informational": 1800,
                "landing": 1200,
                "blog": 1500,
            }

            entry = CalendarEntry(
                topic=gap.topic,
                priority=priority,
                target_date=target_date.strftime("%Y-%m-%d"),
                content_type=gap.content_type_suggestion,
                target_word_count=word_count_map.get(gap.content_type_suggestion, 1500),
                primary_keyword=gap.topic,
                estimated_traffic=gap.estimated_traffic,
                cluster_name=topic_to_cluster.get(gap.topic, "uncategorized"),
            )
            calendar.append(entry)

            slot_in_week += 1
            if slot_in_week >= entries_per_week:
                slot_in_week = 0
                week_offset += 1

        return calendar

    # ------------------------------------------------------------------
    # Korean opportunity detection
    # ------------------------------------------------------------------

    @staticmethod
    def detect_korean_opportunities(gaps: list[TopicGap]) -> list[dict[str, Any]]:
        """Detect Korean-market content opportunities in gaps.

        Each gap topic is tested against KOREAN_OPPORTUNITY_PATTERNS in list
        order (case-insensitively); only the first matching pattern is
        recorded for a gap.

        Args:
            gaps: Topic gaps to inspect.

        Returns:
            One dict per matching gap (topic, pattern label, description,
            estimated traffic, priority score), highest priority score first.
        """
        def first_matching_pattern(topic: str) -> dict[str, Any] | None:
            return next(
                (
                    candidate
                    for candidate in KOREAN_OPPORTUNITY_PATTERNS
                    if re.search(candidate["pattern"], topic, re.IGNORECASE)
                ),
                None,
            )

        found: list[dict[str, Any]] = []
        for gap in gaps:
            matched = first_matching_pattern(gap.topic)
            if matched is None:
                continue
            found.append({
                "topic": gap.topic,
                "pattern": matched["label"],
                "description": matched["description"],
                "estimated_traffic": gap.estimated_traffic,
                "priority_score": gap.priority_score,
            })

        return sorted(found, key=lambda item: item["priority_score"], reverse=True)

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    async def analyze(
        self,
        target_url: str,
        competitor_urls: list[str],
        build_clusters: bool = False,
    ) -> ContentGapResult:
        """
        Run full content gap analysis.

        Args:
            target_url: Target website URL.
            competitor_urls: List of competitor URLs.
            build_clusters: Whether to build topic clusters.

        Returns:
            ContentGapResult with gaps, clusters, and calendar.
        """
        result = ContentGapResult(
            target_url=target_url,
            competitor_urls=competitor_urls,
            timestamp=datetime.now().isoformat(),
        )

        self.logger.info(
            f"Starting gap analysis: {target_url} vs {len(competitor_urls)} competitors"
        )

        # 1. Find topic gaps
        gaps, target_keywords, content_volume = await self.find_topic_gaps(
            target_url, competitor_urls
        )

        result.gaps = gaps
        result.target_topics_count = len(target_keywords)
        result.competitor_topics_count = sum(content_volume.get(c, 0) for c in competitor_urls)
        result.content_volume_comparison = content_volume

        # 2. Build topic clusters if requested
        if build_clusters and gaps:
            all_topics = [g.topic for g in gaps]
            result.clusters = self.build_topic_clusters(all_topics)

        # 3. Generate editorial calendar
        result.calendar = self.generate_calendar(gaps, result.clusters)

        # 4. Detect Korean opportunities
        result.korean_opportunities = self.detect_korean_opportunities(gaps)

        # 5. Recommendations
        result.recommendations = self._generate_recommendations(result)

        self.logger.info(
            f"Gap analysis complete: {len(gaps)} gaps, "
            f"{len(result.clusters)} clusters, "
            f"{len(result.calendar)} calendar entries"
        )

        return result

    @staticmethod
    def _generate_recommendations(result: ContentGapResult) -> list[str]:
        """Generate strategic recommendations from gap analysis."""
        recs: list[str] = []

        gap_count = len(result.gaps)
        if gap_count > 50:
            recs.append(
                f"경쟁사 대비 {gap_count}개의 콘텐츠 격차가 발견되었습니다. "
                "우선순위 상위 20개 주제부터 콘텐츠 생성을 시작하세요."
            )
        elif gap_count > 20:
            recs.append(
                f"{gap_count}개의 콘텐츠 격차가 있습니다. "
                "높은 트래픽 기회부터 순차적으로 콘텐츠를 생성하세요."
            )
        elif gap_count > 0:
            recs.append(
                f"{gap_count}개의 콘텐츠 격차가 발견되었습니다. "
                "비교적 적은 격차이므로 빠른 시일 내 모두 커버할 수 있습니다."
            )

        if result.clusters:
            recs.append(
                f"{len(result.clusters)}개의 토픽 클러스터를 구성했습니다. "
                "필러 콘텐츠부터 작성하여 내부 링크 구조를 강화하세요."
            )

        if result.korean_opportunities:
            recs.append(
                f"한국어 시장 기회가 {len(result.korean_opportunities)}개 발견되었습니다. "
                "후기, 비용, 비교 콘텐츠는 한국 검색 시장에서 높은 전환율을 보입니다."
            )

        high_priority = [g for g in result.gaps if g.priority_score >= 70]
        if high_priority:
            top_topics = ", ".join(g.topic for g in high_priority[:3])
            recs.append(
                f"최우선 주제: {top_topics}. "
                "이 주제들은 높은 트래픽 잠재력과 경쟁사 커버리지를 가지고 있습니다."
            )

        if not recs:
            recs.append("경쟁사 대비 콘텐츠 커버리지가 양호합니다. 기존 콘텐츠 최적화에 집중하세요.")

        return recs


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the analyzer.

    Options: ``--target`` (required), ``--competitor`` (required, repeatable,
    collected into ``competitors``), the boolean flags ``--clusters`` and
    ``--json``, and ``--output`` for an optional file path.
    """
    cli = argparse.ArgumentParser(
        description="SEO Content Gap Analyzer - topic gaps, clusters, calendar",
    )
    cli.add_argument("--target", help="Target website URL", required=True)
    cli.add_argument(
        "--competitor",
        dest="competitors",
        action="append",
        required=True,
        help="Competitor URL (can be repeated)",
    )
    # The two on/off switches share the same shape.
    for flag, help_text in (
        ("--clusters", "Build topic clusters"),
        ("--json", "Output as JSON"),
    ):
        cli.add_argument(flag, action="store_true", help=help_text)
    cli.add_argument("--output", help="Save output to file")
    return cli


def format_text_report(result: ContentGapResult) -> str:
    """Render the analysis result as a Markdown-style text report.

    Sections: header, content volume comparison, top 20 gaps, clusters (first
    5 subtopics each), first 15 calendar entries, first 10 Korean
    opportunities, and recommendations. The cluster, calendar and Korean
    sections are omitted when their lists are empty.
    """
    report: list[str] = [
        f"## Content Gap Analysis: {result.target_url}",
        f"**Date**: {result.timestamp[:10]}",
        f"**Competitors**: {', '.join(result.competitor_urls)}",
        "",
        "### Content Volume Comparison",
    ]
    report.extend(
        f"  - {site}: {count} topics"
        for site, count in result.content_volume_comparison.items()
    )
    report.append("")

    report.append(f"### Topic Gaps ({len(result.gaps)} found)")
    report.extend(
        f"  {rank}. [{gap.priority_score:.0f}] {gap.topic} "
        f"(traffic: {gap.estimated_traffic}, difficulty: {gap.difficulty})"
        for rank, gap in enumerate(result.gaps[:20], 1)
    )
    report.append("")

    if result.clusters:
        report.append(f"### Topic Clusters ({len(result.clusters)})")
        for rank, cluster in enumerate(result.clusters, 1):
            report.append(
                f"  {rank}. **{cluster.pillar_topic}** ({len(cluster.cluster_topics)} subtopics)"
            )
            report.extend(f"     - {sub}" for sub in cluster.cluster_topics[:5])
        report.append("")

    if result.calendar:
        report.append(f"### Editorial Calendar ({len(result.calendar)} entries)")
        report.extend(
            f"  - [{entry.target_date}] {entry.topic} "
            f"({entry.content_type}, {entry.target_word_count}w, priority: {entry.priority})"
            for entry in result.calendar[:15]
        )
        report.append("")

    if result.korean_opportunities:
        report.append(f"### Korean Market Opportunities ({len(result.korean_opportunities)})")
        report.extend(
            f"  - {opp['topic']} ({opp['description']})"
            for opp in result.korean_opportunities[:10]
        )
        report.append("")

    report.append("### Recommendations")
    report.extend(
        f"  {rank}. {text}" for rank, text in enumerate(result.recommendations, 1)
    )

    return "\n".join(report)


async def main() -> None:
    """CLI entry point: parse arguments, run the analysis, emit the report.

    The report is rendered as JSON when ``--json`` is given, otherwise as
    text; it is written to ``--output`` when provided, else printed. The
    analyzer's ``print_stats()`` is called last in either case.
    """
    cli_args = build_parser().parse_args()

    analyzer = ContentGapAnalyzer()
    report = await analyzer.analyze(
        target_url=cli_args.target,
        competitor_urls=cli_args.competitors,
        build_clusters=cli_args.clusters,
    )

    rendered = (
        json.dumps(asdict(report), ensure_ascii=False, indent=2, default=str)
        if cli_args.json
        else format_text_report(report)
    )

    if not cli_args.output:
        print(rendered)
    else:
        with open(cli_args.output, "w", encoding="utf-8") as handle:
            handle.write(rendered)
        logger.info(f"Output saved to {cli_args.output}")

    analyzer.print_stats()


# Script entry point: run the async CLI only when the file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())