""" Knowledge Graph Analyzer ========================= Purpose: Analyze entity presence in Google Knowledge Graph, Knowledge Panels, Wikipedia, Wikidata, and Korean equivalents (Naver encyclopedia, 지식iN). Python: 3.10+ """ import argparse import asyncio import json import logging import re import sys from dataclasses import asdict, dataclass, field from datetime import datetime from typing import Any from urllib.parse import quote, urljoin import aiohttp from bs4 import BeautifulSoup from rich.console import Console from rich.table import Table from base_client import BaseAsyncClient, ConfigManager, config logger = logging.getLogger(__name__) console = Console() # --------------------------------------------------------------------------- # Data classes # --------------------------------------------------------------------------- EXPECTED_ATTRIBUTES = [ "name", "type", "description", "logo", "website", "founded", "ceo", "headquarters", "parent_organization", "subsidiaries", "social_twitter", "social_facebook", "social_linkedin", "social_youtube", "social_instagram", "stock_ticker", "industry", "employees", "revenue", ] @dataclass class KnowledgePanelAttribute: """Single attribute extracted from a Knowledge Panel.""" name: str value: str | None = None present: bool = False @dataclass class KnowledgePanel: """Represents a detected Knowledge Panel.""" detected: bool = False entity_type: str | None = None attributes: list[KnowledgePanelAttribute] = field(default_factory=list) completeness_score: float = 0.0 raw_snippet: str | None = None @dataclass class WikiPresence: """Wikipedia or Wikidata presence record.""" platform: str = "" # "wikipedia" or "wikidata" present: bool = False url: str | None = None qid: str | None = None # Wikidata QID (e.g. Q20710) language: str = "en" @dataclass class NaverPresence: """Naver encyclopedia and 지식iN presence.""" encyclopedia_present: bool = False encyclopedia_url: str | None = None knowledge_in_present: bool = False knowledge_in_count: int = 0 knowledge_in_url: str | None = None @dataclass class KnowledgeGraphResult: """Full Knowledge Graph analysis result.""" entity: str = "" language: str = "en" knowledge_panel: KnowledgePanel = field(default_factory=KnowledgePanel) wikipedia: WikiPresence = field(default_factory=lambda: WikiPresence(platform="wikipedia")) wikidata: WikiPresence = field(default_factory=lambda: WikiPresence(platform="wikidata")) naver: NaverPresence = field(default_factory=NaverPresence) competitors: list[dict[str, Any]] = field(default_factory=list) overall_score: float = 0.0 recommendations: list[str] = field(default_factory=list) timestamp: str = field(default_factory=lambda: datetime.now().isoformat()) def to_dict(self) -> dict[str, Any]: return asdict(self) # --------------------------------------------------------------------------- # Knowledge Graph Analyzer # --------------------------------------------------------------------------- class KnowledgeGraphAnalyzer(BaseAsyncClient): """Analyze entity presence in Knowledge Graph and related platforms.""" GOOGLE_SEARCH_URL = "https://www.google.com/search" WIKIPEDIA_API_URL = "https://{lang}.wikipedia.org/api/rest_v1/page/summary/{title}" WIKIDATA_API_URL = "https://www.wikidata.org/w/api.php" NAVER_SEARCH_URL = "https://search.naver.com/search.naver" NAVER_ENCYCLOPEDIA_URL = "https://terms.naver.com/search.naver" NAVER_KIN_URL = "https://kin.naver.com/search/list.naver" HEADERS = { "User-Agent": ( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/120.0.0.0 Safari/537.36" ), "Accept-Language": "en-US,en;q=0.9", } def __init__(self, **kwargs): super().__init__(**kwargs) self.config = config # ------------------------------------------------------------------ # Google entity search # ------------------------------------------------------------------ async def search_entity( self, entity_name: str, language: str = "en", session: aiohttp.ClientSession | None = None, ) -> dict[str, Any]: """Search Google for entity to detect Knowledge Panel signals.""" params = {"q": entity_name, "hl": language, "gl": "us" if language == "en" else "kr"} headers = {**self.HEADERS} if language == "ko": headers["Accept-Language"] = "ko-KR,ko;q=0.9" params["gl"] = "kr" own_session = session is None if own_session: session = aiohttp.ClientSession() try: async with session.get( self.GOOGLE_SEARCH_URL, params=params, headers=headers, timeout=aiohttp.ClientTimeout(total=20) ) as resp: if resp.status != 200: logger.warning("Google search returned status %d", resp.status) return {"html": "", "status": resp.status} html = await resp.text() return {"html": html, "status": resp.status} except Exception as exc: logger.error("Google search failed: %s", exc) return {"html": "", "status": 0, "error": str(exc)} finally: if own_session: await session.close() # ------------------------------------------------------------------ # Knowledge Panel detection # ------------------------------------------------------------------ def detect_knowledge_panel(self, search_data: dict[str, Any]) -> KnowledgePanel: """Parse search results HTML for Knowledge Panel indicators.""" html = search_data.get("html", "") if not html: return KnowledgePanel(detected=False) soup = BeautifulSoup(html, "lxml") kp = KnowledgePanel() # Knowledge Panel is typically in a div with class 'kp-wholepage' or 'knowledge-panel' kp_selectors = [ "div.kp-wholepage", "div.knowledge-panel", "div[data-attrid='title']", "div.kp-header", "div[class*='kno-']", "div.osrp-blk", ] kp_element = None for selector in kp_selectors: kp_element = soup.select_one(selector) if kp_element: break if kp_element: kp.detected = True kp.raw_snippet = kp_element.get_text(separator=" ", strip=True)[:500] else: # Fallback: check for common KP text patterns text = soup.get_text(separator=" ", strip=True).lower() kp_indicators = [ "wikipedia", "description", "founded", "ceo", "headquarters", "subsidiaries", "parent organization", ] matches = sum(1 for ind in kp_indicators if ind in text) if matches >= 3: kp.detected = True kp.raw_snippet = text[:500] return kp # ------------------------------------------------------------------ # Attribute extraction # ------------------------------------------------------------------ def extract_attributes(self, kp: KnowledgePanel, html: str = "") -> list[KnowledgePanelAttribute]: """Extract entity attributes from Knowledge Panel data.""" attributes: list[KnowledgePanelAttribute] = [] text = (kp.raw_snippet or "").lower() # Parse HTML for structured attribute data soup = BeautifulSoup(html, "lxml") if html else None attribute_patterns = { "name": r"^(.+?)(?:\s+is\s+|\s*[-|]\s*)", "type": r"(?:is\s+(?:a|an)\s+)(\w[\w\s]+?)(?:\.|,|\s+based)", "description": r"(?:is\s+)(.{20,200}?)(?:\.\s)", "founded": r"(?:founded|established|incorporated)\s*(?:in|:)?\s*(\d{4})", "ceo": r"(?:ceo|chief executive|chairman)\s*(?::|is)?\s*([A-Z][\w\s.]+?)(?:,|\.|;|\s{2})", "headquarters": r"(?:headquarters?|hq|based in)\s*(?::|is|in)?\s*([A-Z][\w\s,]+?)(?:\.|;|\s{2})", "stock_ticker": r"(?:stock|ticker|symbol)\s*(?::|is)?\s*([A-Z]{1,5}(?:\s*:\s*[A-Z]{1,5})?)", "employees": r"(?:employees?|staff|workforce)\s*(?::|is)?\s*([\d,]+)", "revenue": r"(?:revenue|sales)\s*(?::|is)?\s*([\$\d,.]+\s*(?:billion|million|B|M)?)", "industry": r"(?:industry|sector)\s*(?::|is)?\s*([\w\s&]+?)(?:\.|,|;)", } social_patterns = { "social_twitter": r"(?:twitter\.com|x\.com)/(\w+)", "social_facebook": r"facebook\.com/([\w.]+)", "social_linkedin": r"linkedin\.com/(?:company|in)/([\w-]+)", "social_youtube": r"youtube\.com/(?:@|channel/|user/)([\w-]+)", "social_instagram": r"instagram\.com/([\w.]+)", } full_text = kp.raw_snippet or "" html_text = "" if soup: html_text = soup.get_text(separator=" ", strip=True) combined = f"{full_text} {html_text}" for attr_name, pattern in attribute_patterns.items(): match = re.search(pattern, combined, re.IGNORECASE) present = match is not None value = match.group(1).strip() if match else None attributes.append(KnowledgePanelAttribute(name=attr_name, value=value, present=present)) # Social profiles for attr_name, pattern in social_patterns.items(): match = re.search(pattern, combined, re.IGNORECASE) present = match is not None value = match.group(1).strip() if match else None attributes.append(KnowledgePanelAttribute(name=attr_name, value=value, present=present)) # Logo detection from HTML logo_present = False if soup: logo_img = soup.select_one("img[data-atf], g-img img, img.kno-fb-img, img[alt*='logo']") if logo_img: logo_present = True attributes.append(KnowledgePanelAttribute(name="logo", value=None, present=logo_present)) # Website detection website_present = False if soup: site_link = soup.select_one("a[data-attrid*='website'], a.ab_button[href*='http']") if site_link: website_present = True value = site_link.get("href", "") attributes.append(KnowledgePanelAttribute(name="website", value=value if website_present else None, present=website_present)) return attributes # ------------------------------------------------------------------ # Completeness scoring # ------------------------------------------------------------------ def score_completeness(self, attributes: list[KnowledgePanelAttribute]) -> float: """Score attribute completeness (0-100) based on filled vs expected.""" if not attributes: return 0.0 weights = { "name": 10, "type": 8, "description": 10, "logo": 8, "website": 10, "founded": 5, "ceo": 5, "headquarters": 5, "parent_organization": 3, "subsidiaries": 3, "social_twitter": 4, "social_facebook": 4, "social_linkedin": 4, "social_youtube": 3, "social_instagram": 3, "stock_ticker": 3, "industry": 5, "employees": 3, "revenue": 4, } total_weight = sum(weights.values()) earned = 0.0 attr_map = {a.name: a for a in attributes} for attr_name, weight in weights.items(): attr = attr_map.get(attr_name) if attr and attr.present: earned += weight return round((earned / total_weight) * 100, 1) if total_weight > 0 else 0.0 # ------------------------------------------------------------------ # Wikipedia check # ------------------------------------------------------------------ async def check_wikipedia( self, entity_name: str, language: str = "en", session: aiohttp.ClientSession | None = None, ) -> WikiPresence: """Check Wikipedia article existence for entity.""" wiki = WikiPresence(platform="wikipedia", language=language) title = entity_name.replace(" ", "_") url = self.WIKIPEDIA_API_URL.format(lang=language, title=quote(title)) own_session = session is None if own_session: session = aiohttp.ClientSession() try: async with session.get(url, headers=self.HEADERS, timeout=aiohttp.ClientTimeout(total=15)) as resp: if resp.status == 200: data = await resp.json() wiki.present = data.get("type") != "disambiguation" wiki.url = data.get("content_urls", {}).get("desktop", {}).get("page", "") if not wiki.url: wiki.url = f"https://{language}.wikipedia.org/wiki/{quote(title)}" logger.info("Wikipedia article found for '%s' (%s)", entity_name, language) elif resp.status == 404: wiki.present = False logger.info("No Wikipedia article for '%s' (%s)", entity_name, language) else: logger.warning("Wikipedia API returned status %d", resp.status) except Exception as exc: logger.error("Wikipedia check failed: %s", exc) finally: if own_session: await session.close() return wiki # ------------------------------------------------------------------ # Wikidata check # ------------------------------------------------------------------ async def check_wikidata( self, entity_name: str, session: aiohttp.ClientSession | None = None, ) -> WikiPresence: """Check Wikidata QID existence for entity.""" wiki = WikiPresence(platform="wikidata") params = { "action": "wbsearchentities", "search": entity_name, "language": "en", "format": "json", "limit": 5, } own_session = session is None if own_session: session = aiohttp.ClientSession() try: async with session.get( self.WIKIDATA_API_URL, params=params, headers=self.HEADERS, timeout=aiohttp.ClientTimeout(total=15), ) as resp: if resp.status == 200: data = await resp.json() results = data.get("search", []) if results: top = results[0] wiki.present = True wiki.qid = top.get("id", "") wiki.url = top.get("concepturi", f"https://www.wikidata.org/wiki/{wiki.qid}") logger.info("Wikidata entity found: %s (%s)", wiki.qid, entity_name) else: wiki.present = False logger.info("No Wikidata entity for '%s'", entity_name) else: logger.warning("Wikidata API returned status %d", resp.status) except Exception as exc: logger.error("Wikidata check failed: %s", exc) finally: if own_session: await session.close() return wiki # ------------------------------------------------------------------ # Naver encyclopedia # ------------------------------------------------------------------ async def check_naver_encyclopedia( self, entity_name: str, session: aiohttp.ClientSession | None = None, ) -> dict[str, Any]: """Check Naver encyclopedia (네이버 백과사전) presence.""" result = {"present": False, "url": None} params = {"query": entity_name, "searchType": 0} headers = { **self.HEADERS, "Accept-Language": "ko-KR,ko;q=0.9", } own_session = session is None if own_session: session = aiohttp.ClientSession() try: async with session.get( self.NAVER_ENCYCLOPEDIA_URL, params=params, headers=headers, timeout=aiohttp.ClientTimeout(total=15), ) as resp: if resp.status == 200: html = await resp.text() soup = BeautifulSoup(html, "lxml") # Look for search result entries entries = soup.select("ul.content_list li, div.search_result a, a.title") if entries: result["present"] = True first_link = entries[0].find("a") if first_link and first_link.get("href"): href = first_link["href"] if not href.startswith("http"): href = urljoin("https://terms.naver.com", href) result["url"] = href else: result["url"] = f"https://terms.naver.com/search.naver?query={quote(entity_name)}" logger.info("Naver encyclopedia entry found for '%s'", entity_name) else: # Fallback: check page text for result indicators text = soup.get_text() if entity_name in text and "검색결과가 없습니다" not in text: result["present"] = True result["url"] = f"https://terms.naver.com/search.naver?query={quote(entity_name)}" else: logger.warning("Naver encyclopedia returned status %d", resp.status) except Exception as exc: logger.error("Naver encyclopedia check failed: %s", exc) finally: if own_session: await session.close() return result # ------------------------------------------------------------------ # Naver knowledge iN # ------------------------------------------------------------------ async def check_naver_knowledge_in( self, entity_name: str, session: aiohttp.ClientSession | None = None, ) -> dict[str, Any]: """Check Naver knowledge iN (지식iN) entries.""" result = {"present": False, "count": 0, "url": None} params = {"query": entity_name} headers = { **self.HEADERS, "Accept-Language": "ko-KR,ko;q=0.9", } own_session = session is None if own_session: session = aiohttp.ClientSession() try: async with session.get( self.NAVER_KIN_URL, params=params, headers=headers, timeout=aiohttp.ClientTimeout(total=15), ) as resp: if resp.status == 200: html = await resp.text() soup = BeautifulSoup(html, "lxml") # Extract total result count count_el = soup.select_one("span.number, em.total_count, span.result_count") count = 0 if count_el: count_text = count_el.get_text(strip=True).replace(",", "") count_match = re.search(r"(\d+)", count_text) if count_match: count = int(count_match.group(1)) # Also check for list items entries = soup.select("ul.basic1 li, ul._list li, div.search_list li") if count > 0 or entries: result["present"] = True result["count"] = count if count > 0 else len(entries) result["url"] = f"https://kin.naver.com/search/list.naver?query={quote(entity_name)}" logger.info("Naver 지식iN: %d entries for '%s'", result["count"], entity_name) else: logger.info("No Naver 지식iN entries for '%s'", entity_name) else: logger.warning("Naver 지식iN returned status %d", resp.status) except Exception as exc: logger.error("Naver 지식iN check failed: %s", exc) finally: if own_session: await session.close() return result # ------------------------------------------------------------------ # Recommendations # ------------------------------------------------------------------ def generate_recommendations(self, result: KnowledgeGraphResult) -> list[str]: """Generate actionable recommendations based on analysis.""" recs: list[str] = [] kp = result.knowledge_panel if not kp.detected: recs.append( "Knowledge Panel이 감지되지 않았습니다. Google에 엔티티 등록을 위해 " "Wikipedia 페이지 생성, Wikidata 항목 추가, 구조화된 데이터(Organization schema) 구현을 권장합니다." ) elif kp.completeness_score < 50: recs.append( f"Knowledge Panel 완성도가 {kp.completeness_score}%로 낮습니다. " "누락된 속성(소셜 프로필, 설명, 로고 등)을 보강하세요." ) if not result.wikipedia.present: recs.append( "Wikipedia 문서가 없습니다. 주목할 만한 출처(reliable sources)를 확보한 후 " "Wikipedia 문서 생성을 고려하세요." ) if not result.wikidata.present: recs.append( "Wikidata 항목이 없습니다. Wikidata에 엔티티를 등록하여 " "Knowledge Graph 인식을 강화하세요." ) if not result.naver.encyclopedia_present: recs.append( "네이버 백과사전에 등록되어 있지 않습니다. 한국 시장 SEO를 위해 " "네이버 백과사전 등재를 검토하세요." ) if result.naver.knowledge_in_count < 5: recs.append( "네이버 지식iN에 관련 콘텐츠가 부족합니다. Q&A 콘텐츠를 통해 " "브랜드 엔티티 인지도를 높이세요." ) # Check social profile completeness attr_map = {a.name: a for a in kp.attributes} missing_social = [] for soc in ["social_twitter", "social_facebook", "social_linkedin", "social_youtube"]: attr = attr_map.get(soc) if not attr or not attr.present: missing_social.append(soc.replace("social_", "").title()) if missing_social: recs.append( f"소셜 프로필 연결 누락: {', '.join(missing_social)}. " "웹사이트 schema의 sameAs 속성에 소셜 프로필을 추가하세요." ) if not recs: recs.append("Knowledge Graph 엔티티 상태가 양호합니다. 현재 수준을 유지하세요.") return recs # ------------------------------------------------------------------ # Main orchestrator # ------------------------------------------------------------------ async def analyze( self, entity_name: str, language: str = "en", include_wiki: bool = True, include_naver: bool = True, ) -> KnowledgeGraphResult: """Orchestrate full Knowledge Graph analysis.""" result = KnowledgeGraphResult(entity=entity_name, language=language) logger.info("Starting Knowledge Graph analysis for '%s' (lang=%s)", entity_name, language) async with aiohttp.ClientSession() as session: # Step 1: Search entity on Google search_data = await self.search_entity(entity_name, language, session) # Step 2: Detect Knowledge Panel kp = self.detect_knowledge_panel(search_data) # Step 3: Extract attributes if kp.detected: kp.attributes = self.extract_attributes(kp, search_data.get("html", "")) kp.completeness_score = self.score_completeness(kp.attributes) # Detect entity type from attributes for attr in kp.attributes: if attr.name == "type" and attr.present: kp.entity_type = attr.value break result.knowledge_panel = kp # Step 4: Wikipedia and Wikidata checks (parallel) if include_wiki: wiki_task = self.check_wikipedia(entity_name, language, session) wikidata_task = self.check_wikidata(entity_name, session) result.wikipedia, result.wikidata = await asyncio.gather(wiki_task, wikidata_task) # Step 5: Naver checks (parallel) if include_naver: enc_task = self.check_naver_encyclopedia(entity_name, session) kin_task = self.check_naver_knowledge_in(entity_name, session) enc_result, kin_result = await asyncio.gather(enc_task, kin_task) result.naver = NaverPresence( encyclopedia_present=enc_result.get("present", False), encyclopedia_url=enc_result.get("url"), knowledge_in_present=kin_result.get("present", False), knowledge_in_count=kin_result.get("count", 0), knowledge_in_url=kin_result.get("url"), ) # Step 6: Compute overall score scores = [] if kp.detected: scores.append(kp.completeness_score * 0.35) else: scores.append(0) scores.append(20.0 if result.wikipedia.present else 0) scores.append(15.0 if result.wikidata.present else 0) scores.append(15.0 if result.naver.encyclopedia_present else 0) scores.append(15.0 if result.naver.knowledge_in_present else 0) result.overall_score = round(sum(scores), 1) # Step 7: Recommendations result.recommendations = self.generate_recommendations(result) logger.info("Analysis complete. Overall score: %.1f", result.overall_score) return result # --------------------------------------------------------------------------- # CLI display helpers # --------------------------------------------------------------------------- def display_result(result: KnowledgeGraphResult) -> None: """Display analysis result in a rich table.""" console.print() console.print(f"[bold cyan]Knowledge Graph Analysis: {result.entity}[/bold cyan]") console.print(f"Language: {result.language} | Score: {result.overall_score}/100") console.print() # Knowledge Panel table kp = result.knowledge_panel table = Table(title="Knowledge Panel", show_header=True) table.add_column("Property", style="bold") table.add_column("Value") table.add_column("Status") table.add_row("Detected", str(kp.detected), "[green]OK[/]" if kp.detected else "[red]Missing[/]") table.add_row("Entity Type", kp.entity_type or "-", "[green]OK[/]" if kp.entity_type else "[yellow]Unknown[/]") table.add_row("Completeness", f"{kp.completeness_score}%", "[green]OK[/]" if kp.completeness_score >= 50 else "[red]Low[/]") for attr in kp.attributes: status = "[green]Present[/]" if attr.present else "[red]Missing[/]" table.add_row(f" {attr.name}", attr.value or "-", status) console.print(table) console.print() # Platform presence table plat_table = Table(title="Platform Presence", show_header=True) plat_table.add_column("Platform", style="bold") plat_table.add_column("Present") plat_table.add_column("Details") plat_table.add_row( "Wikipedia", "[green]Yes[/]" if result.wikipedia.present else "[red]No[/]", result.wikipedia.url or "-", ) plat_table.add_row( "Wikidata", "[green]Yes[/]" if result.wikidata.present else "[red]No[/]", result.wikidata.qid or "-", ) plat_table.add_row( "Naver Encyclopedia", "[green]Yes[/]" if result.naver.encyclopedia_present else "[red]No[/]", result.naver.encyclopedia_url or "-", ) plat_table.add_row( "Naver 지식iN", "[green]Yes[/]" if result.naver.knowledge_in_present else "[red]No[/]", f"{result.naver.knowledge_in_count} entries" if result.naver.knowledge_in_present else "-", ) console.print(plat_table) console.print() # Recommendations console.print("[bold yellow]Recommendations:[/bold yellow]") for i, rec in enumerate(result.recommendations, 1): console.print(f" {i}. {rec}") console.print() # --------------------------------------------------------------------------- # Main # --------------------------------------------------------------------------- def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Knowledge Graph & Entity Analyzer", formatter_class=argparse.RawDescriptionHelpFormatter, ) parser.add_argument("--entity", required=True, help="Entity name to analyze") parser.add_argument("--language", default="en", choices=["en", "ko", "ja", "zh"], help="Language (default: en)") parser.add_argument("--wiki", action="store_true", default=True, help="Include Wikipedia/Wikidata check (default: True)") parser.add_argument("--no-wiki", action="store_true", help="Skip Wikipedia/Wikidata check") parser.add_argument("--no-naver", action="store_true", help="Skip Naver checks") parser.add_argument("--json", action="store_true", help="Output as JSON") parser.add_argument("--output", type=str, help="Output file path") return parser.parse_args() async def main() -> None: args = parse_args() analyzer = KnowledgeGraphAnalyzer() result = await analyzer.analyze( entity_name=args.entity, language=args.language, include_wiki=not args.no_wiki, include_naver=not args.no_naver, ) if args.json: output = json.dumps(result.to_dict(), ensure_ascii=False, indent=2) if args.output: with open(args.output, "w", encoding="utf-8") as f: f.write(output) console.print(f"[green]Output saved to {args.output}[/green]") else: print(output) else: display_result(result) if args.output: with open(args.output, "w", encoding="utf-8") as f: json.dump(result.to_dict(), f, ensure_ascii=False, indent=2) console.print(f"[green]Output saved to {args.output}[/green]") if __name__ == "__main__": asyncio.run(main())