""" Product Schema Checker ====================== Purpose: Validate Product structured data (JSON-LD, Microdata, RDFa) for Google and Naver rich result eligibility. Python: 3.10+ """ import argparse import asyncio import json import logging import re import sys from dataclasses import asdict, dataclass, field from datetime import datetime from typing import Any from urllib.parse import urljoin, urlparse import aiohttp from bs4 import BeautifulSoup from rich.console import Console from rich.table import Table from base_client import BaseAsyncClient, config logger = logging.getLogger(__name__) console = Console() # --------------------------------------------------------------------------- # Data classes # --------------------------------------------------------------------------- @dataclass class SchemaProperty: """Single property within a schema object.""" name: str value: Any required: bool valid: bool error: str = "" @dataclass class ProductSchema: """Validation result for one product schema on a page.""" url: str schema_type: str # Product, Offer, AggregateRating, etc. 
properties: list[dict] # list of SchemaProperty as dicts is_valid: bool = False rich_result_eligible: bool = False errors: list[str] = field(default_factory=list) warnings: list[str] = field(default_factory=list) @dataclass class SchemaCheckResult: """Complete schema check result for one or more pages.""" urls_checked: int = 0 pages_with_schema: int = 0 pages_without_schema: int = 0 schemas: list[dict] = field(default_factory=list) common_errors: list[str] = field(default_factory=list) common_warnings: list[str] = field(default_factory=list) naver_shopping_issues: list[dict] = field(default_factory=list) score: int = 0 timestamp: str = "" def calculate_score(self) -> int: """Score 0-100 based on schema completeness.""" if self.urls_checked == 0: self.score = 0 return 0 coverage = self.pages_with_schema / self.urls_checked valid_schemas = sum(1 for s in self.schemas if s.get("is_valid")) validity_rate = valid_schemas / max(len(self.schemas), 1) eligible = sum(1 for s in self.schemas if s.get("rich_result_eligible")) eligibility_rate = eligible / max(len(self.schemas), 1) self.score = int(coverage * 40 + validity_rate * 35 + eligibility_rate * 25) return self.score # --------------------------------------------------------------------------- # Schema requirements # --------------------------------------------------------------------------- PRODUCT_REQUIRED = {"name", "image", "description"} PRODUCT_RECOMMENDED = { "brand", "sku", "gtin", "gtin8", "gtin13", "gtin14", "mpn", "offers", "review", "aggregateRating", "color", "material", } OFFER_REQUIRED = {"price", "priceCurrency", "availability"} OFFER_RECOMMENDED = { "url", "priceValidUntil", "itemCondition", "seller", "shippingDetails", "hasMerchantReturnPolicy", } AGGREGATE_RATING_REQUIRED = {"ratingValue", "reviewCount"} AGGREGATE_RATING_RECOMMENDED = {"bestRating", "worstRating", "ratingCount"} REVIEW_REQUIRED = {"author", "reviewRating"} REVIEW_RECOMMENDED = {"datePublished", "reviewBody", "name"} 
BREADCRUMB_REQUIRED = {"itemListElement"}

# Enumeration values are accepted in three spellings: the https URL, the
# legacy http URL and the bare enumeration name. Building the sets from one
# list of names keeps the three spellings in sync for every value.
_SCHEMA_ORG_PREFIXES = ("https://schema.org/", "http://schema.org/", "")

_AVAILABILITY_NAMES = (
    "InStock",
    "OutOfStock",
    "PreOrder",
    "BackOrder",
    "Discontinued",
    "InStoreOnly",
    "OnlineOnly",
    "LimitedAvailability",
    "SoldOut",
)
AVAILABILITY_VALUES = {
    prefix + name
    for prefix in _SCHEMA_ORG_PREFIXES
    for name in _AVAILABILITY_NAMES
}

_ITEM_CONDITION_NAMES = (
    "NewCondition",
    "UsedCondition",
    "RefurbishedCondition",
    "DamagedCondition",
)
ITEM_CONDITION_VALUES = {
    prefix + name
    for prefix in _SCHEMA_ORG_PREFIXES
    for name in _ITEM_CONDITION_NAMES
}


# ---------------------------------------------------------------------------
# Main checker
# ---------------------------------------------------------------------------

class ProductSchemaChecker(BaseAsyncClient):
    """Validate Product structured data on e-commerce pages.

    NOTE(review): ``self.logger`` and ``print_stats()`` are not defined in this
    file; they are presumably provided by ``BaseAsyncClient`` - confirm.
    """

    # Maximum number of nested sitemap-index levels that will be followed.
    _MAX_SITEMAP_DEPTH = 3

    def __init__(
        self,
        max_concurrent: int = 10,
        requests_per_second: float = 5.0,
        timeout: int = 30,
    ):
        """Configure the base client, the request timeout and default headers.

        Args:
            max_concurrent: Forwarded to ``BaseAsyncClient``.
            requests_per_second: Forwarded to ``BaseAsyncClient``.
            timeout: Total per-request timeout in seconds.
        """
        super().__init__(max_concurrent=max_concurrent, requests_per_second=requests_per_second)
        self.timeout = aiohttp.ClientTimeout(total=timeout)
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (compatible; ProductSchemaChecker/1.0; "
                "+https://ourdigital.org)"
            ),
            "Accept": "text/html,application/xhtml+xml",
            "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
        }

    # ------------------------------------------------------------------
    # Page fetching
    # ------------------------------------------------------------------

    async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> str:
        """Fetch page HTML.

        Returns:
            The response body decoded as text, or ``""`` when the request
            fails for any reason (the failure is logged as a warning).
        """
        try:
            # NOTE: ssl=False disables certificate verification so that pages
            # behind misconfigured TLS can still be audited.
            async with session.get(
                url,
                headers=self.headers,
                timeout=self.timeout,
                allow_redirects=True,
                ssl=False,
            ) as resp:
                return await resp.text(errors="replace")
        except Exception as exc:
            # Deliberate best-effort: one unreachable page must not abort the run.
            self.logger.warning(f"Failed to fetch {url}: {exc}")
            return ""

    # ------------------------------------------------------------------
    # Schema extraction
    # ------------------------------------------------------------------

    def extract_schemas(self, html: str, page_url: str) -> list[dict]:
        """Extract structured data objects from HTML.

        JSON-LD ``<script>`` blocks are decoded (top-level arrays and
        ``@graph`` containers are flattened) and Microdata ``itemscope``
        elements whose ``itemtype`` mentions Product or Offer are parsed.
        RDFa is not parsed by this method.

        Args:
            html: Raw page markup.
            page_url: URL of the page, used only in log messages.

        Returns:
            A list of schema dicts in document order (JSON-LD first).
        """
        schemas: list[dict] = []
        soup = BeautifulSoup(html, "lxml")

        # --- JSON-LD ---
        for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
            text = script.string or script.get_text()
            if not text:
                continue
            try:
                data = json.loads(text)
            except (json.JSONDecodeError, TypeError) as exc:
                self.logger.debug(f"JSON-LD parse error on {page_url}: {exc}")
                continue
            schemas.extend(self._flatten_jsonld(data))

        # --- Microdata ---
        for item_scope in soup.find_all(attrs={"itemscope": True}):
            item_type = item_scope.get("itemtype", "")
            if "Product" in item_type or "Offer" in item_type:
                microdata = self._parse_microdata(item_scope)
                if microdata:
                    schemas.append(microdata)

        return schemas

    @staticmethod
    def _flatten_jsonld(data: Any) -> list[dict]:
        """Return the schema objects contained in one decoded JSON-LD document."""
        if isinstance(data, list):
            return [item for item in data if isinstance(item, dict)]
        if not isinstance(data, dict):
            return []
        if "@graph" not in data:
            return [data]

        graph = data["@graph"]
        if isinstance(graph, dict):
            # A single node is sometimes written without the enclosing array.
            graph = [graph]
        if not isinstance(graph, list):
            return []
        return [node for node in graph if isinstance(node, dict)]

    @staticmethod
    def _belongs_to_scope(prop, scope) -> bool:
        """Return True when ``scope`` is the nearest item enclosing ``prop``."""
        for ancestor in prop.parents:
            if ancestor is scope:
                return True
            if ancestor.has_attr("itemscope"):
                # Another item sits between prop and scope; prop belongs to it.
                return False
        return False

    @staticmethod
    def _microdata_value(prop) -> str:
        """Read the value of a non-nested ``itemprop`` element."""
        if prop.name == "meta":
            return prop.get("content", "")
        if prop.name == "link":
            return prop.get("href", "")
        if prop.name == "img":
            return prop.get("src", "")
        if prop.name == "time":
            return prop.get("datetime", prop.get_text(strip=True))
        if prop.has_attr("content"):
            # Machine-readable value supplied next to display text, e.g.
            # <span itemprop="price" content="19.99">$19.99</span>.
            return prop.get("content", "")
        return prop.get_text(strip=True)

    def _parse_microdata(self, element) -> dict:
        """Parse microdata from an itemscope element.

        Only properties that belong directly to ``element`` are collected;
        properties of a nested ``itemscope`` are parsed recursively into the
        nested dict instead of leaking into (and overwriting keys of) the
        parent item. When a property name repeats, the last occurrence wins.

        Returns:
            A dict of property name to value, plus ``@type`` (last path
            segment of ``itemtype``) when the element declares one.
        """
        result: dict[str, Any] = {}

        item_type = element.get("itemtype", "")
        if item_type:
            result["@type"] = item_type.rstrip("/").split("/")[-1]

        for prop in element.find_all(attrs={"itemprop": True}):
            name = prop.get("itemprop", "")
            if not name or not self._belongs_to_scope(prop, element):
                continue
            if prop.has_attr("itemscope"):
                result[name] = self._parse_microdata(prop)
            else:
                result[name] = self._microdata_value(prop)

        return result

    # ------------------------------------------------------------------
    # Validation methods
    # ------------------------------------------------------------------

    @staticmethod
    def _is_missing(value: Any) -> bool:
        """Return True for None, blank strings and empty containers.

        Numeric zero and ``False`` count as present so that, for example, a
        price of 0 is not reported as a missing property.
        """
        if value is None:
            return True
        if isinstance(value, str):
            return not value.strip()
        if isinstance(value, (list, tuple, dict, set)):
            return not value
        return False

    def validate_product_schema(self, schema_data: dict, page_url: str) -> ProductSchema:
        """Validate a Product schema object.

        Checks required and recommended Product properties, then validates
        nested offers, aggregateRating and up to five reviews.

        Args:
            schema_data: Decoded Product schema.
            page_url: URL the schema was found on.

        Returns:
            A ``ProductSchema`` whose ``is_valid`` is True when no errors were
            collected and whose ``rich_result_eligible`` comes from
            ``check_rich_result_eligibility``.
        """
        ps = ProductSchema(
            url=page_url,
            schema_type="Product",
            properties=[],
        )

        # Required properties. Sorted so the report order is stable across
        # runs (set iteration order depends on per-process string hashing).
        for prop_name in sorted(PRODUCT_REQUIRED):
            value = schema_data.get(prop_name)
            valid = bool(value)
            error = "" if valid else f"Missing required property: {prop_name}"
            ps.properties.append(asdict(SchemaProperty(
                name=prop_name,
                value=value,
                required=True,
                valid=valid,
                error=error,
            )))
            if not valid:
                ps.errors.append(error)

        # Recommended properties.
        for prop_name in sorted(PRODUCT_RECOMMENDED):
            value = schema_data.get(prop_name)
            message = "" if value else f"Missing recommended property: {prop_name}"
            ps.properties.append(asdict(SchemaProperty(
                name=prop_name,
                value=value if value else None,
                required=False,
                valid=bool(value),
                error=message,
            )))
            if not value:
                ps.warnings.append(message)

        # Offers: a single object or a list of objects.
        offers = schema_data.get("offers")
        if offers:
            offer_list = offers if isinstance(offers, list) else [offers]
            for offer in offer_list:
                if not isinstance(offer, dict):
                    # Non-object entries cannot be inspected; skip them
                    # instead of crashing on ``.get``.
                    continue
                offer_result = self.validate_offer_schema(offer)
                ps.errors.extend(offer_result["errors"])
                ps.warnings.extend(offer_result["warnings"])
        else:
            ps.errors.append("Missing 'offers' property (required for rich results)")

        # aggregateRating
        agg_rating = schema_data.get("aggregateRating")
        if agg_rating and isinstance(agg_rating, dict):
            rating_result = self.validate_aggregate_rating(agg_rating)
            ps.errors.extend(rating_result["errors"])
            ps.warnings.extend(rating_result["warnings"])

        # Reviews (only the first five are inspected).
        review = schema_data.get("review")
        if review:
            reviews = review if isinstance(review, list) else [review]
            for entry in reviews[:5]:
                if isinstance(entry, dict):
                    review_result = self.validate_review_schema(entry)
                    ps.errors.extend(review_result["errors"])
                    ps.warnings.extend(review_result["warnings"])

        ps.is_valid = len(ps.errors) == 0
        ps.rich_result_eligible = self.check_rich_result_eligibility(schema_data)
        return ps

    def validate_offer_schema(self, offer_data: dict) -> dict[str, list[str]]:
        """Validate an Offer schema object.

        Returns:
            ``{"errors": [...], "warnings": [...]}``.
        """
        errors: list[str] = []
        warnings: list[str] = []

        for prop_name in sorted(OFFER_REQUIRED):
            # A price of 0 is present (it gets a warning below), not missing.
            if self._is_missing(offer_data.get(prop_name)):
                errors.append(f"Offer missing required property: {prop_name}")

        # Price format: digits with an optional fraction; thousands separators
        # are tolerated.
        price = offer_data.get("price")
        if price is not None:
            price_str = str(price).replace(",", "").strip()
            if not re.match(r"^\d+(\.\d+)?$", price_str):
                errors.append(f"Invalid price format: '{price}' (must be numeric)")
            elif float(price_str) <= 0:
                warnings.append(f"Price is zero or negative: {price}")

        # priceCurrency. str() guards against non-string JSON values.
        currency = offer_data.get("priceCurrency", "")
        valid_currencies = {"KRW", "USD", "EUR", "JPY", "CNY", "GBP"}
        if currency and str(currency).upper() not in valid_currencies:
            warnings.append(f"Unusual currency code: {currency}")

        # availability. The isinstance check keeps unhashable values (dicts or
        # lists from JSON-LD) from raising TypeError in the set lookup.
        availability = offer_data.get("availability", "")
        if availability and not (
            isinstance(availability, str) and availability in AVAILABILITY_VALUES
        ):
            errors.append(
                f"Invalid availability value: '{availability}'. "
                f"Use schema.org values like https://schema.org/InStock"
            )

        # itemCondition
        condition = offer_data.get("itemCondition", "")
        if condition and not (
            isinstance(condition, str) and condition in ITEM_CONDITION_VALUES
        ):
            warnings.append(f"Invalid itemCondition: '{condition}'")

        # Recommended properties.
        for prop_name in sorted(OFFER_RECOMMENDED):
            if not offer_data.get(prop_name):
                warnings.append(f"Offer missing recommended property: {prop_name}")

        return {"errors": errors, "warnings": warnings}

    def validate_aggregate_rating(self, rating_data: dict) -> dict[str, list[str]]:
        """Validate AggregateRating schema.

        ``ratingCount`` is accepted in place of ``reviewCount`` because rich
        results need only one of the two counters.

        Returns:
            ``{"errors": [...], "warnings": [...]}``.
        """
        errors: list[str] = []
        warnings: list[str] = []

        for prop_name in sorted(AGGREGATE_RATING_REQUIRED):
            if rating_data.get(prop_name) is not None:
                continue
            if prop_name == "reviewCount" and rating_data.get("ratingCount") is not None:
                continue
            errors.append(f"AggregateRating missing required: {prop_name}")

        # ratingValue must lie within [worstRating, bestRating] (defaults 1..5).
        rating_value = rating_data.get("ratingValue")
        best_rating = rating_data.get("bestRating", 5)
        worst_rating = rating_data.get("worstRating", 1)
        if rating_value is not None:
            try:
                rv = float(rating_value)
                br = float(best_rating)
                wr = float(worst_rating)
                if rv < wr or rv > br:
                    errors.append(
                        f"ratingValue ({rv}) outside range [{wr}, {br}]"
                    )
            except (ValueError, TypeError):
                errors.append(f"Invalid ratingValue format: {rating_value}")

        # Counters must be non-negative integers when present.
        for counter_name in ("reviewCount", "ratingCount"):
            raw_count = rating_data.get(counter_name)
            if raw_count is None:
                continue
            try:
                count = int(raw_count)
            except (ValueError, TypeError):
                errors.append(f"Invalid {counter_name} format: {raw_count}")
                continue
            if count < 0:
                errors.append(f"Negative {counter_name}: {count}")

        for prop_name in sorted(AGGREGATE_RATING_RECOMMENDED):
            if not rating_data.get(prop_name):
                warnings.append(f"AggregateRating missing recommended: {prop_name}")

        return {"errors": errors, "warnings": warnings}

    def validate_review_schema(self, review_data: dict) -> dict[str, list[str]]:
        """Validate Review schema.

        Returns:
            ``{"errors": [...], "warnings": [...]}``.
        """
        errors: list[str] = []
        warnings: list[str] = []

        # Author: either an object with a name or a non-empty string.
        author = review_data.get("author")
        if not author:
            errors.append("Review missing required: author")
        elif isinstance(author, dict):
            if not author.get("name", ""):
                errors.append("Review author missing 'name' property")
        elif isinstance(author, str):
            if not author.strip():
                errors.append("Review author is empty string")

        # reviewRating: an object carrying ratingValue.
        review_rating = review_data.get("reviewRating")
        if not review_rating:
            errors.append("Review missing required: reviewRating")
        elif isinstance(review_rating, dict):
            if review_rating.get("ratingValue") is None:
                errors.append("reviewRating missing ratingValue")

        for prop_name in sorted(REVIEW_RECOMMENDED):
            if not review_data.get(prop_name):
                warnings.append(f"Review missing recommended: {prop_name}")

        return {"errors": errors, "warnings": warnings}

    def validate_breadcrumb(self, schema_data: dict) -> dict[str, list[str]]:
        """Validate BreadcrumbList schema.

        Returns:
            ``{"errors": [...], "warnings": [...]}``.
        """
        errors: list[str] = []
        warnings: list[str] = []

        items = schema_data.get("itemListElement")
        if not items:
            errors.append("BreadcrumbList missing itemListElement")
            return {"errors": errors, "warnings": warnings}
        if not isinstance(items, list):
            errors.append("itemListElement should be an array")
            return {"errors": errors, "warnings": warnings}

        for i, item in enumerate(items):
            if not isinstance(item, dict):
                errors.append(f"Breadcrumb item {i} is not an object")
                continue
            if item.get("position") is None:
                errors.append(f"Breadcrumb item {i} missing 'position'")

            # The name may sit on the list item or on its nested "item" object.
            name = item.get("name")
            if not name:
                nested = item.get("item")
                name = nested.get("name") if isinstance(nested, dict) else None
            if not name:
                warnings.append(f"Breadcrumb item {i} missing 'name'")

        return {"errors": errors, "warnings": warnings}

    # ------------------------------------------------------------------
    # Rich result eligibility
    # ------------------------------------------------------------------

    def check_rich_result_eligibility(self, schema_data: dict) -> bool:
        """Assess Google rich result eligibility for Product schema.

        Returns:
            True when the schema has a name, an image and at least one offer
            object with truthy price, priceCurrency and availability.
        """
        if not schema_data.get("name") or not schema_data.get("image"):
            return False

        offers = schema_data.get("offers")
        if not offers:
            return False

        offer_list = offers if isinstance(offers, list) else [offers]
        return any(
            isinstance(offer, dict)
            and bool(offer.get("price"))
            and bool(offer.get("priceCurrency"))
            and bool(offer.get("availability"))
            for offer in offer_list
        )

    # ------------------------------------------------------------------
    # Naver Shopping requirements
    # ------------------------------------------------------------------

    def check_naver_shopping_requirements(self, schema_data: dict, page_url: str) -> list[dict]:
        """Check Naver Shopping specific schema requirements.

        Returns:
            A list of issue dicts with the keys ``url``, ``type``,
            ``severity``, ``message`` and ``recommendation``.
        """
        issues: list[dict] = []

        def add_issue(issue_type: str, severity: str, message: str, recommendation: str) -> None:
            issues.append({
                "url": page_url,
                "type": issue_type,
                "severity": severity,
                "message": message,
                "recommendation": recommendation,
            })

        # Product name should contain Hangul for the Korean market.
        name = schema_data.get("name", "")
        korean_chars = len(re.findall(r"[\uac00-\ud7af]", str(name)))
        if korean_chars == 0 and name:
            add_issue(
                "naver_product_name",
                "medium",
                "Product name has no Korean characters",
                "Include Korean product name for Naver Shopping visibility.",
            )

        # Category mapping.
        if not schema_data.get("category"):
            add_issue(
                "naver_category",
                "low",
                "Missing 'category' property for Naver Shopping categorization",
                "Add category property matching Naver Shopping category taxonomy.",
            )

        # Image presence; a string image must be an absolute URL.
        image = schema_data.get("image")
        if not image:
            add_issue(
                "naver_image",
                "high",
                "Missing product image (required for Naver Shopping)",
                "Add at least one high-quality product image URL.",
            )
        elif isinstance(image, str) and not image.startswith("http"):
            add_issue(
                "naver_image_url",
                "medium",
                "Product image URL is relative (should be absolute)",
                "Use absolute URLs for product images.",
            )

        # Price currency should be KRW.
        offers = schema_data.get("offers")
        if offers:
            offer_list = offers if isinstance(offers, list) else [offers]
            for offer in offer_list:
                if not isinstance(offer, dict):
                    continue
                currency = offer.get("priceCurrency", "")
                # str() guards against non-string JSON values.
                if currency and str(currency).upper() != "KRW":
                    add_issue(
                        "naver_currency",
                        "medium",
                        f"Price currency is {currency}, not KRW",
                        "For Naver Shopping, provide price in KRW.",
                    )

        # Brand / manufacturer.
        if not schema_data.get("brand") and not schema_data.get("manufacturer"):
            add_issue(
                "naver_brand",
                "low",
                "Missing brand/manufacturer (helpful for Naver Shopping filters)",
                "Add brand or manufacturer property.",
            )

        return issues

    # ------------------------------------------------------------------
    # Orchestrator
    # ------------------------------------------------------------------

    async def check(
        self,
        urls: list[str] | None = None,
        sitemap_url: str | None = None,
        sample_size: int = 50,
    ) -> SchemaCheckResult:
        """Run schema validation on URLs or sitemap.

        Args:
            urls: Explicit page URLs to check.
            sitemap_url: Sitemap (or sitemap index) to collect page URLs from.
            sample_size: Maximum number of URLs checked in total.

        Returns:
            A populated ``SchemaCheckResult`` with its score calculated. Pages
            that could not be fetched are counted as pages without schema.
        """
        result = SchemaCheckResult(timestamp=datetime.now().isoformat())
        target_urls: list[str] = []
        error_counter: dict[str, int] = {}
        warning_counter: dict[str, int] = {}

        async with aiohttp.ClientSession() as session:
            if sitemap_url:
                target_urls = await self._urls_from_sitemap(session, sitemap_url, sample_size)
            if urls:
                target_urls.extend(urls)

            # Order-preserving de-duplication, so the sample is the first
            # ``sample_size`` URLs rather than an arbitrary subset of a set.
            target_urls = list(dict.fromkeys(target_urls))[:sample_size]
            result.urls_checked = len(target_urls)
            self.logger.info(f"Checking {len(target_urls)} URLs for Product schema")

            for url in target_urls:
                html = await self._fetch_page(session, url)
                if not html:
                    result.pages_without_schema += 1
                    continue

                schemas = self.extract_schemas(html, url)
                product_schemas = [
                    s for s in schemas
                    if self._get_schema_type(s) in ("Product", "ProductGroup")
                ]
                breadcrumb_schemas = [
                    s for s in schemas
                    if self._get_schema_type(s) == "BreadcrumbList"
                ]

                if not product_schemas:
                    result.pages_without_schema += 1
                    continue

                result.pages_with_schema += 1

                for ps_data in product_schemas:
                    ps = self.validate_product_schema(ps_data, url)
                    result.schemas.append(asdict(ps))
                    for err in ps.errors:
                        error_counter[err] = error_counter.get(err, 0) + 1
                    for warn in ps.warnings:
                        warning_counter[warn] = warning_counter.get(warn, 0) + 1

                    # Naver Shopping checks
                    result.naver_shopping_issues.extend(
                        self.check_naver_shopping_requirements(ps_data, url)
                    )

                # Breadcrumb errors feed the common-error tally; breadcrumb
                # warnings are not reported.
                for bc_data in breadcrumb_schemas:
                    bc_result = self.validate_breadcrumb(bc_data)
                    for err in bc_result["errors"]:
                        error_counter[err] = error_counter.get(err, 0) + 1

        # Twenty most frequent messages, most frequent first.
        result.common_errors = sorted(
            error_counter, key=lambda k: error_counter[k], reverse=True,
        )[:20]
        result.common_warnings = sorted(
            warning_counter, key=lambda k: warning_counter[k], reverse=True,
        )[:20]

        result.calculate_score()
        return result

    async def _urls_from_sitemap(
        self,
        session: aiohttp.ClientSession,
        sitemap_url: str,
        limit: int,
        _depth: int = 0,
    ) -> list[str]:
        """Fetch product URLs from sitemap.

        For a sitemap index, the first three child sitemaps are followed
        recursively, to at most ``_MAX_SITEMAP_DEPTH`` nested levels so that a
        self-referencing index cannot recurse without bound.

        Args:
            session: Open aiohttp session.
            sitemap_url: URL of the sitemap or sitemap index.
            limit: Maximum number of URLs to return.
            _depth: Internal recursion depth; callers should not pass it.

        Returns:
            Up to ``limit`` URLs; an empty list on a non-200 response or on
            any failure (which is logged as a warning).
        """
        urls: list[str] = []
        if _depth > self._MAX_SITEMAP_DEPTH:
            self.logger.warning(f"Sitemap nesting too deep, skipping: {sitemap_url}")
            return urls

        try:
            async with session.get(
                sitemap_url,
                headers=self.headers,
                timeout=self.timeout,
                ssl=False,
            ) as resp:
                if resp.status != 200:
                    return urls
                text = await resp.text(errors="replace")

            soup = BeautifulSoup(text, "lxml-xml")

            child_sitemaps = soup.find_all("sitemap")
            if child_sitemaps:
                # Sitemap index: follow only the first three child sitemaps.
                for child in child_sitemaps[:3]:
                    loc = child.find("loc")
                    if loc:
                        urls.extend(
                            await self._urls_from_sitemap(
                                session, loc.text.strip(), limit, _depth + 1,
                            )
                        )
                    if len(urls) >= limit:
                        break
            else:
                for tag in soup.find_all("url"):
                    loc = tag.find("loc")
                    if loc:
                        urls.append(loc.text.strip())
                    if len(urls) >= limit:
                        break
        except Exception as exc:
            # Best-effort: a broken sitemap yields whatever was collected so far.
            self.logger.warning(f"Sitemap parse failed: {exc}")

        return urls[:limit]

    @staticmethod
    def _get_schema_type(schema: dict) -> str:
        """Get the @type from a schema dict, handling various formats.

        When ``@type`` is a list, its first entry is used; a missing or empty
        ``@type`` yields ``""``.
        """
        schema_type = schema.get("@type", "")
        if isinstance(schema_type, list):
            return str(schema_type[0]) if schema_type else ""
        return str(schema_type)


# ---------------------------------------------------------------------------
# CLI output helpers
# ---------------------------------------------------------------------------

def print_rich_report(result: SchemaCheckResult) -> None:
    """Print a rich-formatted report to the console.

    Text that originates from crawled pages (error and warning messages, issue
    fields) is escaped so that square brackets are printed literally instead
    of being consumed as Rich markup tags.
    """
    # Local import: only this function needs it.
    from rich.markup import escape

    console.print("\n[bold cyan]Product Schema Validation Report[/bold cyan]")
    console.print(f"Timestamp: {result.timestamp}")
    console.print(f"URLs checked: {result.urls_checked}")

    # Coverage
    coverage = (result.pages_with_schema / max(result.urls_checked, 1)) * 100
    cov_color = "green" if coverage >= 90 else "yellow" if coverage >= 50 else "red"
    console.print(
        f"Schema coverage: [{cov_color}]{coverage:.0f}%[/{cov_color}] "
        f"({result.pages_with_schema}/{result.urls_checked})"
    )

    # Score
    score_color = "green" if result.score >= 80 else "yellow" if result.score >= 50 else "red"
    console.print(f"[bold {score_color}]Score: {result.score}/100[/bold {score_color}]")

    # Validity summary
    valid = sum(1 for s in result.schemas if s.get("is_valid"))
    eligible = sum(1 for s in result.schemas if s.get("rich_result_eligible"))
    total = len(result.schemas)

    table = Table(title="Schema Summary")
    table.add_column("Metric", style="bold")
    table.add_column("Value", justify="right")
    table.add_row("Total schemas found", str(total))
    table.add_row("Valid schemas", str(valid))
    table.add_row("Rich result eligible", str(eligible))
    table.add_row("Pages without schema", str(result.pages_without_schema))
    console.print(table)

    # Common errors (top ten)
    if result.common_errors:
        console.print(f"\n[bold red]Common Errors ({len(result.common_errors)}):[/bold red]")
        for err in result.common_errors[:10]:
            console.print(f" [red]-[/red] {escape(str(err))}")

    # Common warnings (top ten)
    if result.common_warnings:
        console.print(f"\n[bold yellow]Common Warnings ({len(result.common_warnings)}):[/bold yellow]")
        for warn in result.common_warnings[:10]:
            console.print(f" [yellow]-[/yellow] {escape(str(warn))}")

    # Naver Shopping issues, one line per distinct (type, message) pair
    if result.naver_shopping_issues:
        console.print(
            f"\n[bold magenta]Naver Shopping Issues "
            f"({len(result.naver_shopping_issues)}):[/bold magenta]"
        )
        seen: set[str] = set()
        for issue in result.naver_shopping_issues:
            key = f"{issue['type']}:{issue['message']}"
            if key in seen:
                continue
            seen.add(key)
            # Escaped so "[medium]" is shown as a label rather than parsed
            # (and dropped) as a style tag.
            severity_label = escape(f"[{issue.get('severity', 'medium')}]")
            console.print(f" {severity_label} {escape(str(issue['message']))}")
            console.print(f" [dim]{escape(str(issue['recommendation']))}[/dim]")


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main() -> None:
    """Parse CLI arguments, run the check and emit the report.

    With ``--json`` the result is printed as JSON (or written to ``--output``
    when given); otherwise a console report is printed and ``--output``
    additionally saves the JSON.
    """
    from rich.markup import escape

    parser = argparse.ArgumentParser(
        description="Product Schema Checker - Validate e-commerce structured data",
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--url", nargs="+", help="Product page URL(s) to validate")
    group.add_argument("--sitemap", help="Sitemap URL to fetch product pages from")
    parser.add_argument(
        "--sample",
        type=int,
        default=50,
        help="Max URLs to check from sitemap (default: 50)",
    )
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument("--output", type=str, help="Save output to file")

    args = parser.parse_args()

    checker = ProductSchemaChecker()
    result = asyncio.run(
        checker.check(
            urls=args.url,
            sitemap_url=args.sitemap,
            sample_size=args.sample,
        )
    )

    def save_json(path: str, payload: str) -> None:
        with open(path, "w", encoding="utf-8") as f:
            f.write(payload)

    if args.json:
        output = json.dumps(asdict(result), indent=2, ensure_ascii=False, default=str)
        if args.output:
            save_json(args.output, output)
            console.print(f"[green]Results saved to {escape(args.output)}[/green]")
        else:
            print(output)
    else:
        print_rich_report(result)
        if args.output:
            output = json.dumps(asdict(result), indent=2, ensure_ascii=False, default=str)
            save_json(args.output, output)
            console.print(f"\n[green]JSON results also saved to {escape(args.output)}[/green]")

    checker.print_stats()


if __name__ == "__main__":
    main()