"""
Entity Auditor
===============
Purpose: Audit entity SEO signals including PAA monitoring, FAQ schema
         tracking, entity markup validation, and brand SERP analysis.
Python: 3.10+
"""

import argparse
import asyncio
import json
import logging
import re
import sys
from dataclasses import asdict, dataclass, field
from datetime import datetime
from typing import Any
from urllib.parse import parse_qs, quote, urljoin, urlparse

import aiohttp
from bs4 import BeautifulSoup
from rich.console import Console
from rich.table import Table

from base_client import BaseAsyncClient, ConfigManager, config

logger = logging.getLogger(__name__)
console = Console()

# Total timeout (seconds) for page / search fetches.
_REQUEST_TIMEOUT_SECONDS = 20
# Total timeout (seconds) for each sameAs link probe.
_LINK_TIMEOUT_SECONDS = 10
# Pause inserted between consecutive search requests.
_SEARCH_DELAY_SECONDS = 1.5
# Pause inserted between consecutive sameAs link probes.
_LINK_DELAY_SECONDS = 0.5

# CSS selectors tried, in order, to locate People Also Ask entries.
_PAA_SELECTORS = (
    "div[data-sgrd] div[data-q]",
    "div.related-question-pair",
    "div[jsname] div[data-q]",
    "div.wQiwMc",
)

# Fallback pattern for question-like sentences in the page text.
_QUESTION_RE = re.compile(
    r"((?:What|How|Why|When|Where|Who|Is|Can|Does|Do|Which)\s+[^?\n]{10,80}\??)"
)

# SERP feature name -> markup substrings whose presence signals the feature.
_SERP_FEATURE_INDICATORS = {
    "knowledge_panel": ["kp-wholepage", "knowledge-panel", "kno-"],
    "sitelinks": ["sitelinks", "site-links"],
    "people_also_ask": ["related-question-pair", "data-q"],
    "faq_rich_result": ["faqpage", "frequently asked"],
    "featured_snippet": ["featured-snippet", "data-tts"],
    "image_pack": ["image-result", "img-brk"],
    "video_carousel": ["video-result", "vid-"],
    "twitter_carousel": ["twitter-timeline", "g-scrolling-carousel"],
    "reviews": ["star-rating", "aggregate-rating"],
    "local_pack": ["local-pack", "local_pack"],
}

# Registrable domain -> label reported in BrandSerpResult.social_profiles.
_SOCIAL_DOMAINS = {
    "twitter.com": "twitter",
    "x.com": "twitter",
    "facebook.com": "facebook",
    "linkedin.com": "linkedin",
    "youtube.com": "youtube",
    "instagram.com": "instagram",
    "github.com": "github",
    "pinterest.com": "pinterest",
}


# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------

@dataclass
class PaaQuestion:
    """A People Also Ask question found in SERP."""

    question: str = ""
    keyword: str = ""
    position: int = 0
    source_url: str | None = None


@dataclass
class FaqRichResult:
    """FAQ rich result tracking entry."""

    url: str = ""
    question_count: int = 0
    appearing_in_serp: bool = False
    questions: list[str] = field(default_factory=list)
    schema_valid: bool = False


@dataclass
class EntitySchema:
    """Entity structured data found on a website."""

    type: str = ""  # Organization, Person, LocalBusiness, etc.
    properties: dict[str, Any] = field(default_factory=dict)
    same_as_links: list[str] = field(default_factory=list)
    completeness: float = 0.0
    issues: list[str] = field(default_factory=list)


@dataclass
class BrandSerpResult:
    """What appears when searching for the brand name."""

    query: str = ""
    features: list[str] = field(default_factory=list)
    paa_count: int = 0
    faq_count: int = 0
    knowledge_panel: bool = False
    sitelinks: bool = False
    social_profiles: list[str] = field(default_factory=list)
    top_results: list[dict[str, str]] = field(default_factory=list)


@dataclass
class EntityAuditResult:
    """Full entity SEO audit result."""

    url: str = ""
    entity_name: str = ""
    paa_questions: list[PaaQuestion] = field(default_factory=list)
    faq_rich_results: list[FaqRichResult] = field(default_factory=list)
    entity_schemas: list[EntitySchema] = field(default_factory=list)
    brand_serp: BrandSerpResult = field(default_factory=BrandSerpResult)
    social_profile_status: dict[str, bool] = field(default_factory=dict)
    overall_score: float = 0.0
    recommendations: list[str] = field(default_factory=list)
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())

    def to_dict(self) -> dict[str, Any]:
        """Return the result (including nested dataclasses) as plain dicts."""
        return asdict(self)


# ---------------------------------------------------------------------------
# Module-private helpers
# ---------------------------------------------------------------------------

async def _fetch_html(
    session: aiohttp.ClientSession,
    url: str,
    *,
    headers: dict[str, str],
    params: dict[str, str] | None = None,
) -> tuple[int, str | None]:
    """GET *url* and return ``(status, body)``; body is None unless status is 200.

    Network and timeout errors are not handled here; they propagate so each
    caller can log them with its own context.
    """
    async with session.get(
        url,
        params=params,
        headers=headers,
        timeout=aiohttp.ClientTimeout(total=_REQUEST_TIMEOUT_SECONDS),
    ) as resp:
        if resp.status != 200:
            return resp.status, None
        return resp.status, await resp.text()


async def _link_is_accessible(
    session: aiohttp.ClientSession,
    link: str,
    headers: dict[str, str],
) -> bool:
    """Return True when *link* answers with a status below 400.

    A HEAD request is tried first. If the server answers 405 or 501 (method
    not supported) the probe is repeated with GET so such hosts are not
    reported as broken. Any exception counts as "not accessible".
    """
    timeout = aiohttp.ClientTimeout(total=_LINK_TIMEOUT_SECONDS)
    try:
        async with session.head(
            link, headers=headers, timeout=timeout, allow_redirects=True,
        ) as resp:
            if resp.status not in (405, 501):
                return resp.status < 400
        async with session.get(
            link, headers=headers, timeout=timeout, allow_redirects=True,
        ) as resp:
            return resp.status < 400
    except Exception:
        # Best-effort probe: DNS failures, timeouts, invalid URLs all mean
        # the link could not be confirmed.
        return False


def _jsonld_nodes(soup: BeautifulSoup) -> list[dict[str, Any]]:
    """Collect every JSON-LD object on the page, including ``@graph`` members.

    Script tags with invalid JSON are skipped, as are array entries that are
    not JSON objects. Each top-level object is followed immediately by the
    objects of its ``@graph`` (if any).
    """
    nodes: list[dict[str, Any]] = []
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "{}")
        except json.JSONDecodeError:
            continue
        for item in data if isinstance(data, list) else [data]:
            if not isinstance(item, dict):
                continue
            nodes.append(item)
            graph = item.get("@graph", [])
            if isinstance(graph, dict):
                graph = [graph]
            if isinstance(graph, list):
                nodes.extend(member for member in graph if isinstance(member, dict))
    return nodes


def _schema_types(node: dict[str, Any]) -> list[str]:
    """Return the node's ``@type`` as a list of strings (empty if absent)."""
    raw = node.get("@type", "")
    if isinstance(raw, str):
        return [raw]
    if isinstance(raw, list):
        return [t for t in raw if isinstance(t, str)]
    return []


def _faq_from_node(url: str, node: dict[str, Any]) -> FaqRichResult | None:
    """Build a FaqRichResult from a JSON-LD node, or None if it is not a FAQPage."""
    if "FAQPage" not in _schema_types(node):
        return None
    entities = node.get("mainEntity", [])
    if isinstance(entities, dict):
        # A single Question may be given as an object instead of an array.
        entities = [entities]
    elif not isinstance(entities, list):
        entities = []
    names = [q.get("name", "") for q in entities if isinstance(q, dict)]
    return FaqRichResult(
        url=url,
        question_count=len(names),
        questions=names,
        schema_valid=True,
    )


def _extract_paa_questions(html: str, keyword: str) -> list[PaaQuestion]:
    """Extract PAA questions for one keyword from a SERP HTML document.

    The CSS selectors are tried first; an element matched by more than one
    selector is counted once so positions stay contiguous. Only when the
    selectors yield nothing for this page does the regex fallback run, and
    it returns at most eight question-like sentences.
    """
    soup = BeautifulSoup(html, "lxml")
    found: list[PaaQuestion] = []
    seen: set[str] = set()
    for selector in _PAA_SELECTORS:
        for el in soup.select(selector):
            question_text = el.get("data-q", "") or el.get_text(strip=True)
            if not question_text or len(question_text) <= 5:
                continue
            key = question_text.lower().strip()
            if key in seen:
                continue
            seen.add(key)
            found.append(PaaQuestion(
                question=question_text,
                keyword=keyword,
                position=len(found) + 1,
            ))
    if found:
        return found

    # Fallback: regex for PAA-like questions
    page_text = soup.get_text(separator="\n")
    return [
        PaaQuestion(question=q.strip(), keyword=keyword, position=i)
        for i, q in enumerate(_QUESTION_RE.findall(page_text)[:8], 1)
    ]


def _social_network_for(href: str) -> str | None:
    """Map a link target to a social network label, or None.

    Matching is done on the URL's hostname (exact domain or a subdomain of
    it), so e.g. ``dropbox.com`` is not mistaken for ``x.com``. Hrefs of the
    form ``/url?q=<target>`` are unwrapped before the hostname is read.
    """
    try:
        parsed = urlparse(href)
        if parsed.path == "/url":
            query = parse_qs(parsed.query)
            target = (query.get("q") or query.get("url") or [""])[0]
            if target:
                parsed = urlparse(target)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # urlparse rejects some malformed hrefs (e.g. bad IPv6 brackets).
        return None
    for domain, name in _SOCIAL_DOMAINS.items():
        if host == domain or host.endswith("." + domain):
            return name
    return None


# ---------------------------------------------------------------------------
# Entity Auditor
# ---------------------------------------------------------------------------

class EntityAuditor(BaseAsyncClient):
    """Audit entity SEO signals and rich result presence."""

    GOOGLE_SEARCH_URL = "https://www.google.com/search"

    HEADERS = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
    }

    PAA_KEYWORD_TEMPLATES = [
        "{entity}",
        "{entity} reviews",
        "{entity} vs",
        "what is {entity}",
        "{entity} pricing",
        "{entity} alternatives",
        "is {entity} good",
        "{entity} benefits",
        "how to use {entity}",
        "{entity} complaints",
    ]

    EXPECTED_SCHEMA_PROPERTIES = {
        "Organization": [
            "name", "url", "logo", "description", "sameAs",
            "contactPoint", "address", "foundingDate", "founder",
            "numberOfEmployees", "email", "telephone",
        ],
        "Person": [
            "name", "url", "image", "description", "sameAs",
            "jobTitle", "worksFor", "alumniOf", "birthDate",
        ],
        "LocalBusiness": [
            "name", "url", "image", "description", "sameAs",
            "address", "telephone", "openingHours", "geo",
            "priceRange", "aggregateRating",
        ],
    }

    def __init__(self, **kwargs):
        """Forward all keyword arguments to the base client and attach config."""
        super().__init__(**kwargs)
        self.config = config

    # ------------------------------------------------------------------
    # PAA monitoring
    # ------------------------------------------------------------------

    async def monitor_paa(
        self,
        entity_name: str,
        keywords: list[str] | None = None,
        session: aiohttp.ClientSession | None = None,
    ) -> list[PaaQuestion]:
        """Search brand keywords and extract People Also Ask questions.

        Args:
            entity_name: Brand/entity name substituted into the keyword
                templates when *keywords* is not given.
            keywords: Explicit search queries; defaults to
                ``PAA_KEYWORD_TEMPLATES`` formatted with *entity_name*.
            session: Session to reuse; a temporary one is created and closed
                when omitted.

        Returns:
            Questions de-duplicated case-insensitively across all keywords,
            keeping the first occurrence of each.
        """
        if keywords is None:
            keywords = [t.format(entity=entity_name) for t in self.PAA_KEYWORD_TEMPLATES]

        paa_questions: list[PaaQuestion] = []

        own_session = session is None
        if own_session:
            session = aiohttp.ClientSession()

        try:
            for index, keyword in enumerate(keywords):
                # Rate limit between searches. Done up front so it also
                # applies after a failed or non-200 request.
                if index:
                    await asyncio.sleep(_SEARCH_DELAY_SECONDS)
                try:
                    status, html = await _fetch_html(
                        session,
                        self.GOOGLE_SEARCH_URL,
                        headers=self.HEADERS,
                        params={"q": keyword, "hl": "en", "gl": "us"},
                    )
                    if html is None:
                        logger.warning("Search for '%s' returned status %d", keyword, status)
                        continue
                    paa_questions.extend(_extract_paa_questions(html, keyword))
                except Exception as exc:
                    # Best effort: one failing keyword must not stop the rest.
                    logger.error("PAA search failed for '%s': %s", keyword, exc)
        finally:
            if own_session:
                await session.close()

        # Deduplicate questions
        seen: set[str] = set()
        unique: list[PaaQuestion] = []
        for q in paa_questions:
            key = q.question.lower().strip()
            if key not in seen:
                seen.add(key)
                unique.append(q)

        logger.info("Found %d unique PAA questions for '%s'", len(unique), entity_name)
        return unique

    # ------------------------------------------------------------------
    # FAQ rich result tracking
    # ------------------------------------------------------------------

    async def track_faq_rich_results(
        self,
        url: str,
        session: aiohttp.ClientSession | None = None,
    ) -> list[FaqRichResult]:
        """Check a page for FAQPage markup (JSON-LD first, then microdata).

        Args:
            url: Page to fetch and inspect.
            session: Session to reuse; a temporary one is created and closed
                when omitted.

        Returns:
            One entry per FAQPage JSON-LD node found (top level or inside
            ``@graph``). If none is found but microdata FAQ markup exists, a
            single entry built from the microdata. Empty on a non-200
            response or on any error, which is logged.
        """
        faq_results: list[FaqRichResult] = []

        own_session = session is None
        if own_session:
            session = aiohttp.ClientSession()

        try:
            status, html = await _fetch_html(session, url, headers=self.HEADERS)
            if html is None:
                logger.warning("Page %s returned status %d", url, status)
                return faq_results

            soup = BeautifulSoup(html, "lxml")

            # Find JSON-LD nodes typed FAQPage
            for node in _jsonld_nodes(soup):
                faq = _faq_from_node(url, node)
                if faq is not None:
                    faq_results.append(faq)

            # Also check for microdata FAQ markup
            faq_items = soup.select("[itemtype*='FAQPage'] [itemprop='mainEntity']")
            if faq_items and not faq_results:
                questions = []
                for item in faq_items:
                    q_el = item.select_one("[itemprop='name']")
                    if q_el:
                        questions.append(q_el.get_text(strip=True))
                faq_results.append(FaqRichResult(
                    url=url,
                    question_count=len(questions),
                    questions=questions,
                    schema_valid=True,
                ))
        except Exception as exc:
            logger.error("FAQ tracking failed for %s: %s", url, exc)
        finally:
            if own_session:
                await session.close()

        logger.info("Found %d FAQ schemas on %s", len(faq_results), url)
        return faq_results

    # ------------------------------------------------------------------
    # Entity schema audit
    # ------------------------------------------------------------------

    async def audit_entity_schema(
        self,
        url: str,
        session: aiohttp.ClientSession | None = None,
    ) -> list[EntitySchema]:
        """Check Organization/Person/LocalBusiness schema on website.

        Args:
            url: Page to fetch and inspect.
            session: Session to reuse; a temporary one is created and closed
                when omitted.

        Returns:
            One EntitySchema per JSON-LD node whose ``@type`` includes a
            target type. Completeness is the percentage of
            ``EXPECTED_SCHEMA_PROPERTIES`` with a truthy value. Empty on a
            non-200 response or on any error, which is logged.
        """
        schemas: list[EntitySchema] = []
        target_types = {"Organization", "Person", "LocalBusiness", "Corporation", "MedicalBusiness"}
        # Subtypes are scored against the property list of their parent type.
        base_type_of = {"Corporation": "Organization", "MedicalBusiness": "LocalBusiness"}

        own_session = session is None
        if own_session:
            session = aiohttp.ClientSession()

        try:
            status, html = await _fetch_html(session, url, headers=self.HEADERS)
            if html is None:
                logger.warning("Page %s returned status %d", url, status)
                return schemas

            soup = BeautifulSoup(html, "lxml")

            for item in _jsonld_nodes(soup):
                matching = [t for t in _schema_types(item) if t in target_types]
                if not matching:
                    continue
                item_type = matching[0]

                raw_same_as = item.get("sameAs", [])
                if isinstance(raw_same_as, str):
                    raw_same_as = [raw_same_as]
                elif not isinstance(raw_same_as, list):
                    raw_same_as = []
                # Keep only non-empty strings: later steps hash these links
                # and request them as URLs.
                same_as = [link for link in raw_same_as if isinstance(link, str) and link]

                # Calculate completeness
                base_type = base_type_of.get(item_type, item_type)
                expected = self.EXPECTED_SCHEMA_PROPERTIES.get(base_type, [])
                present = [k for k in expected if item.get(k)]
                completeness = (
                    round((len(present) / len(expected)) * 100, 1) if expected else 0.0
                )

                # Check for issues
                issues = []
                if "name" not in item:
                    issues.append("Missing 'name' property")
                if "url" not in item:
                    issues.append("Missing 'url' property")
                if not same_as:
                    issues.append("No 'sameAs' links (social profiles)")
                if "logo" not in item and base_type == "Organization":
                    issues.append("Missing 'logo' property")
                if "description" not in item:
                    issues.append("Missing 'description' property")

                schemas.append(EntitySchema(
                    type=item_type,
                    # Scalars are stringified and capped at 100 chars;
                    # lists and dicts are stored as-is.
                    properties={
                        k: (str(v)[:100] if not isinstance(v, (list, dict)) else v)
                        for k, v in item.items()
                        if k != "@context"
                    },
                    same_as_links=same_as,
                    completeness=completeness,
                    issues=issues,
                ))
        except Exception as exc:
            logger.error("Entity schema audit failed for %s: %s", url, exc)
        finally:
            if own_session:
                await session.close()

        logger.info("Found %d entity schemas on %s", len(schemas), url)
        return schemas

    # ------------------------------------------------------------------
    # Brand SERP analysis
    # ------------------------------------------------------------------

    async def analyze_brand_serp(
        self,
        entity_name: str,
        session: aiohttp.ClientSession | None = None,
    ) -> BrandSerpResult:
        """Analyze what appears in SERP for the brand name search.

        Args:
            entity_name: Query sent to the search endpoint.
            session: Session to reuse; a temporary one is created and closed
                when omitted.

        Returns:
            A BrandSerpResult with detected features, PAA count, social
            profile labels and up to ten organic results. Only ``query`` is
            populated on a non-200 response; on an error, which is logged,
            whatever was collected so far is returned.
        """
        result = BrandSerpResult(query=entity_name)

        own_session = session is None
        if own_session:
            session = aiohttp.ClientSession()

        try:
            status, html = await _fetch_html(
                session,
                self.GOOGLE_SEARCH_URL,
                headers=self.HEADERS,
                params={"q": entity_name, "hl": "en", "gl": "us"},
            )
            if html is None:
                logger.warning("Brand search for '%s' returned status %d", entity_name, status)
                return result

            soup = BeautifulSoup(html, "lxml")

            # Detect SERP features. Serialise the document once; every
            # indicator is matched against the same lowercase markup.
            markup = str(soup).lower()
            for feature, indicators in _SERP_FEATURE_INDICATORS.items():
                if any(ind in markup for ind in indicators):
                    result.features.append(feature)

            result.knowledge_panel = "knowledge_panel" in result.features
            result.sitelinks = "sitelinks" in result.features

            # Count PAA questions
            paa_elements = soup.select("div[data-q], div.related-question-pair")
            result.paa_count = len(paa_elements)
            if result.paa_count > 0 and "people_also_ask" not in result.features:
                result.features.append("people_also_ask")

            # Detect social profiles in results
            for link in soup.find_all("a", href=True):
                name = _social_network_for(link["href"])
                if name and name not in result.social_profiles:
                    result.social_profiles.append(name)

            # Extract top organic results
            result_divs = soup.select("div.g, div[data-sokoban-container]")[:10]
            for div in result_divs:
                title_el = div.select_one("h3")
                link_el = div.select_one("a[href]")
                if title_el and link_el:
                    result.top_results.append({
                        "title": title_el.get_text(strip=True),
                        "url": link_el.get("href", ""),
                    })
        except Exception as exc:
            logger.error("Brand SERP analysis failed for '%s': %s", entity_name, exc)
        finally:
            if own_session:
                await session.close()

        return result

    # ------------------------------------------------------------------
    # Social profile link validation
    # ------------------------------------------------------------------

    async def check_social_profile_links(
        self,
        same_as_links: list[str],
        session: aiohttp.ClientSession | None = None,
    ) -> dict[str, bool]:
        """Validate sameAs URLs are accessible.

        Args:
            same_as_links: URLs to probe; repeated URLs are probed once.
            session: Session to reuse; a temporary one is created and closed
                when omitted.

        Returns:
            Mapping of URL to True when it answered with a status below 400,
            False otherwise (including on any request error).
        """
        status: dict[str, bool] = {}
        if not same_as_links:
            return status

        own_session = session is None
        if own_session:
            session = aiohttp.ClientSession()

        try:
            for link in same_as_links:
                if link in status:
                    continue
                if status:
                    # Space out consecutive probes.
                    await asyncio.sleep(_LINK_DELAY_SECONDS)
                status[link] = await _link_is_accessible(session, link, self.HEADERS)
        finally:
            if own_session:
                await session.close()

        accessible = sum(1 for v in status.values() if v)
        logger.info("Social profile links: %d/%d accessible", accessible, len(status))
        return status

    # ------------------------------------------------------------------
    # Recommendations
    # ------------------------------------------------------------------

    def generate_recommendations(self, result: EntityAuditResult) -> list[str]:
        """Generate actionable entity SEO improvement recommendations.

        The messages are user-facing and written in Korean. When no issue is
        found a single "everything is fine" message is returned, so the list
        is never empty.
        """
        recs: list[str] = []

        # PAA recommendations
        if not result.paa_questions:
            recs.append(
                "브랜드 관련 People Also Ask(PAA) 질문이 감지되지 않았습니다. "
                "FAQ 콘텐츠를 작성하여 PAA 노출 기회를 확보하세요."
            )
        elif len(result.paa_questions) < 5:
            recs.append(
                f"PAA 질문이 {len(result.paa_questions)}개만 감지되었습니다. "
                "더 다양한 키워드에 대한 Q&A 콘텐츠를 강화하세요."
            )

        # FAQ schema recommendations
        if not result.faq_rich_results:
            recs.append(
                "FAQPage schema가 감지되지 않았습니다. "
                "FAQ 페이지에 FAQPage JSON-LD를 추가하여 Rich Result를 확보하세요."
            )
        else:
            invalid = [f for f in result.faq_rich_results if not f.schema_valid]
            if invalid:
                recs.append(
                    f"{len(invalid)}개의 FAQ schema에 유효성 문제가 있습니다. "
                    "Google Rich Results Test로 검증하세요."
                )

        # Entity schema recommendations
        if not result.entity_schemas:
            recs.append(
                "Organization/Person/LocalBusiness schema가 없습니다. "
                "홈페이지에 Organization schema JSON-LD를 추가하세요."
            )
        else:
            for schema in result.entity_schemas:
                if schema.completeness < 50:
                    recs.append(
                        f"{schema.type} schema 완성도가 {schema.completeness}%입니다. "
                        f"누락 항목: {', '.join(schema.issues[:3])}"
                    )
                if not schema.same_as_links:
                    recs.append(
                        f"{schema.type} schema에 sameAs 속성이 없습니다. "
                        "소셜 미디어 프로필 URL을 sameAs에 추가하세요."
                    )

        # Brand SERP recommendations
        serp = result.brand_serp
        if not serp.knowledge_panel:
            recs.append(
                "브랜드 검색 시 Knowledge Panel이 표시되지 않습니다. "
                "Wikipedia, Wikidata, 구조화된 데이터를 통해 엔티티 인식을 강화하세요."
            )
        if not serp.sitelinks:
            recs.append(
                "Sitelinks가 표시되지 않습니다. "
                "사이트 구조와 내부 링크를 개선하세요."
            )
        if len(serp.social_profiles) < 3:
            recs.append(
                f"SERP에 소셜 프로필이 {len(serp.social_profiles)}개만 표시됩니다. "
                "주요 소셜 미디어 프로필을 활성화하고 schema sameAs에 연결하세요."
            )

        # Social profile accessibility
        broken = [url for url, ok in result.social_profile_status.items() if not ok]
        if broken:
            recs.append(
                f"접근 불가한 소셜 프로필 링크 {len(broken)}개: "
                f"{', '.join(broken[:3])}. sameAs URL을 업데이트하세요."
            )

        if not recs:
            recs.append("Entity SEO 상태가 양호합니다. 현재 수준을 유지하세요.")

        return recs

    # ------------------------------------------------------------------
    # Scoring
    # ------------------------------------------------------------------

    def compute_score(self, result: EntityAuditResult) -> float:
        """Compute overall entity SEO score (0-100).

        Weights: PAA presence 15, FAQ schema 15, entity schema completeness
        25, brand SERP features 25, accessible social profiles 10, sameAs
        links 10. The total is capped at 100 and rounded to one decimal.
        """
        score = 0.0

        # PAA presence (15 points)
        paa_count = len(result.paa_questions)
        if paa_count >= 10:
            score += 15
        elif paa_count >= 5:
            score += 10
        elif paa_count > 0:
            score += 5

        # FAQ schema (15 points)
        if result.faq_rich_results:
            valid_count = sum(1 for f in result.faq_rich_results if f.schema_valid)
            score += min(15, valid_count * 5)

        # Entity schema (25 points)
        if result.entity_schemas:
            best_completeness = max(s.completeness for s in result.entity_schemas)
            score += best_completeness * 0.25

        # Brand SERP features (25 points)
        serp = result.brand_serp
        if serp.knowledge_panel:
            score += 10
        if serp.sitelinks:
            score += 5
        score += min(10, len(serp.features) * 2)

        # Social profiles (10 points)
        if result.social_profile_status:
            accessible = sum(1 for v in result.social_profile_status.values() if v)
            total = len(result.social_profile_status)
            score += (accessible / total) * 10 if total > 0 else 0

        # sameAs links (10 points)
        total_same_as = sum(len(s.same_as_links) for s in result.entity_schemas)
        score += min(10, total_same_as * 2)

        return round(min(100, score), 1)

    # ------------------------------------------------------------------
    # Main orchestrator
    # ------------------------------------------------------------------

    async def audit(
        self,
        url: str,
        entity_name: str,
        include_paa: bool = True,
        include_faq: bool = True,
    ) -> EntityAuditResult:
        """Orchestrate full entity SEO audit.

        Args:
            url: Website page to inspect for structured data.
            entity_name: Brand/entity name used for the search queries.
            include_paa: Run PAA monitoring when True.
            include_faq: Run FAQ schema tracking when True.

        Returns:
            The populated EntityAuditResult, including score and
            recommendations. A failed sub-audit is logged and leaves its
            field at the default value.
        """
        result = EntityAuditResult(url=url, entity_name=entity_name)
        logger.info("Starting entity audit for '%s' at %s", entity_name, url)

        async with aiohttp.ClientSession() as session:
            # Parallel tasks: entity schema, brand SERP, FAQ
            tasks = [
                self.audit_entity_schema(url, session),
                self.analyze_brand_serp(entity_name, session),
            ]
            if include_faq:
                tasks.append(self.track_faq_rich_results(url, session))

            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Unpack results
            if not isinstance(results[0], Exception):
                result.entity_schemas = results[0]
            else:
                logger.error("Entity schema audit failed: %s", results[0])

            if not isinstance(results[1], Exception):
                result.brand_serp = results[1]
            else:
                logger.error("Brand SERP analysis failed: %s", results[1])

            if include_faq and len(results) > 2:
                if not isinstance(results[2], Exception):
                    result.faq_rich_results = results[2]
                else:
                    logger.error("FAQ tracking failed: %s", results[2])

            # PAA monitoring (sequential due to rate limits)
            if include_paa:
                result.paa_questions = await self.monitor_paa(entity_name, session=session)

            # Validate social profile links from schema
            all_same_as: list[str] = []
            for schema in result.entity_schemas:
                all_same_as.extend(schema.same_as_links)
            if all_same_as:
                # dict.fromkeys de-duplicates while keeping first-seen order,
                # so the status mapping is ordered deterministically.
                result.social_profile_status = await self.check_social_profile_links(
                    list(dict.fromkeys(all_same_as)), session
                )

        # Compute score and recommendations
        result.overall_score = self.compute_score(result)
        result.recommendations = self.generate_recommendations(result)

        logger.info("Entity audit complete. Score: %.1f", result.overall_score)
        return result


# ---------------------------------------------------------------------------
# CLI display helpers
# ---------------------------------------------------------------------------

def display_result(result: EntityAuditResult) -> None:
    """Display audit result in rich tables."""
    console.print()
    console.print(f"[bold cyan]Entity SEO Audit: {result.entity_name}[/bold cyan]")
    console.print(f"URL: {result.url} | Score: {result.overall_score}/100")
    console.print()

    # Entity Schema table
    if result.entity_schemas:
        table = Table(title="Entity Schema Markup", show_header=True)
        table.add_column("Type", style="bold")
        table.add_column("Completeness")
        table.add_column("sameAs Links")
        table.add_column("Issues")
        for schema in result.entity_schemas:
            issues_text = "; ".join(schema.issues[:3]) if schema.issues else "None"
            table.add_row(
                schema.type,
                f"{schema.completeness}%",
                str(len(schema.same_as_links)),
                issues_text,
            )
        console.print(table)
    else:
        console.print("[red]No entity schema markup found on website![/red]")
    console.print()

    # Brand SERP table
    serp = result.brand_serp
    serp_table = Table(title="Brand SERP Analysis", show_header=True)
    serp_table.add_column("Feature", style="bold")
    serp_table.add_column("Status")
    serp_table.add_row("Knowledge Panel", "[green]Yes[/]" if serp.knowledge_panel else "[red]No[/]")
    serp_table.add_row("Sitelinks", "[green]Yes[/]" if serp.sitelinks else "[red]No[/]")
    serp_table.add_row("PAA Count", str(serp.paa_count))
    serp_table.add_row("SERP Features", ", ".join(serp.features) if serp.features else "None")
    serp_table.add_row("Social Profiles", ", ".join(serp.social_profiles) if serp.social_profiles else "None")
    console.print(serp_table)
    console.print()

    # PAA Questions (first 15 only)
    if result.paa_questions:
        paa_table = Table(title=f"People Also Ask ({len(result.paa_questions)} questions)", show_header=True)
        paa_table.add_column("#", style="dim")
        paa_table.add_column("Question")
        paa_table.add_column("Keyword")
        for i, q in enumerate(result.paa_questions[:15], 1):
            paa_table.add_row(str(i), q.question, q.keyword)
        console.print(paa_table)
        console.print()

    # FAQ Rich Results
    if result.faq_rich_results:
        faq_table = Table(title="FAQ Rich Results", show_header=True)
        faq_table.add_column("URL")
        faq_table.add_column("Questions")
        faq_table.add_column("Valid")
        for faq in result.faq_rich_results:
            faq_table.add_row(
                faq.url[:60],
                str(faq.question_count),
                "[green]Yes[/]" if faq.schema_valid else "[red]No[/]",
            )
        console.print(faq_table)
        console.print()

    # Social Profile Status
    if result.social_profile_status:
        sp_table = Table(title="Social Profile Link Status", show_header=True)
        sp_table.add_column("URL")
        sp_table.add_column("Accessible")
        for link, accessible in result.social_profile_status.items():
            sp_table.add_row(
                link[:70],
                "[green]Yes[/]" if accessible else "[red]No[/]",
            )
        console.print(sp_table)
        console.print()

    # Recommendations
    console.print("[bold yellow]Recommendations:[/bold yellow]")
    for i, rec in enumerate(result.recommendations, 1):
        console.print(f" {i}. {rec}")
    console.print()


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def parse_args() -> argparse.Namespace:
    """Parse the command-line arguments of the auditor CLI."""
    parser = argparse.ArgumentParser(
        description="Entity SEO Auditor",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--url", required=True, help="Website URL to audit")
    parser.add_argument("--entity", required=True, help="Entity/brand name")
    # --paa / --faq already default to True, so passing them changes nothing;
    # only the --no-* flags are read by main().
    parser.add_argument("--paa", action="store_true", default=True, help="Include PAA monitoring (default: True)")
    parser.add_argument("--no-paa", action="store_true", help="Skip PAA monitoring")
    parser.add_argument("--faq", action="store_true", default=True, help="Include FAQ tracking (default: True)")
    parser.add_argument("--no-faq", action="store_true", help="Skip FAQ tracking")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument("--output", type=str, help="Output file path")
    return parser.parse_args()


async def main() -> None:
    """Run the audit from the CLI and emit the result.

    Without ``--json`` the rich tables are printed. With ``--output`` the
    JSON report is written to that file; with ``--json`` and no
    ``--output`` the JSON is printed to stdout instead.
    """
    args = parse_args()

    auditor = EntityAuditor()
    result = await auditor.audit(
        url=args.url,
        entity_name=args.entity,
        include_paa=not args.no_paa,
        include_faq=not args.no_faq,
    )

    if not args.json:
        display_result(result)

    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(result.to_dict(), f, ensure_ascii=False, indent=2)
        console.print(f"[green]Output saved to {args.output}[/green]")
    elif args.json:
        print(json.dumps(result.to_dict(), ensure_ascii=False, indent=2))


if __name__ == "__main__":
    asyncio.run(main())