12 new skills: Keyword Strategy, SERP Analysis, Position Tracking, Link Building, Content Strategy, E-Commerce SEO, KPI Framework, International SEO, AI Visibility, Knowledge Graph, Competitor Intel, and Crawl Budget. ~20K lines of Python across 25 domain scripts. Updated skill 11 pipeline table and repo CLAUDE.md. Enhanced skill 18 local SEO workflow from jamie.clinic audit. Note: Skill 26 hreflang_validator.py pending (content filter block). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
806 lines
31 KiB
Python
806 lines
31 KiB
Python
"""
|
|
Product Schema Checker
|
|
======================
|
|
Purpose: Validate Product structured data (JSON-LD, Microdata, RDFa)
|
|
for Google and Naver rich result eligibility.
|
|
Python: 3.10+
|
|
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import re
|
|
import sys
|
|
from dataclasses import asdict, dataclass, field
|
|
from datetime import datetime
|
|
from typing import Any
|
|
from urllib.parse import urljoin, urlparse
|
|
|
|
import aiohttp
|
|
from bs4 import BeautifulSoup
|
|
from rich.console import Console
|
|
from rich.table import Table
|
|
|
|
from base_client import BaseAsyncClient, config
|
|
|
|
# Module-level logger; handler/level configuration is left to the application.
logger = logging.getLogger(__name__)

# Shared Rich console used by the CLI reporting helpers below.
console = Console()
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Data classes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@dataclass
class SchemaProperty:
    """Single property within a schema object."""

    # Property name as it appears in the schema (e.g. "name", "offers").
    name: str
    # Raw value extracted from the schema; None when absent.
    value: Any
    # True when this property is required (vs merely recommended).
    required: bool
    # Whether the value passed the (truthy-presence) validation check.
    valid: bool
    # Human-readable validation error; empty string when valid.
    error: str = ""
|
|
|
|
|
|
@dataclass
class ProductSchema:
    """Validation result for one product schema on a page."""

    # Page URL the schema was extracted from.
    url: str
    # Product, Offer, AggregateRating, etc.
    schema_type: str
    # list of SchemaProperty serialized via dataclasses.asdict
    properties: list[dict]
    # True when no errors were recorded during validation.
    is_valid: bool = False
    # True when the schema meets Google's minimum rich-result requirements.
    rich_result_eligible: bool = False
    # Blocking problems (missing required properties, invalid values).
    errors: list[str] = field(default_factory=list)
    # Non-blocking problems (missing recommended properties).
    warnings: list[str] = field(default_factory=list)
|
|
|
|
|
|
@dataclass
class SchemaCheckResult:
    """Complete schema check result for one or more pages."""

    # How many URLs were actually fetched and inspected.
    urls_checked: int = 0
    # Pages that contained at least one Product schema.
    pages_with_schema: int = 0
    # Pages with no Product schema (or that failed to fetch).
    pages_without_schema: int = 0
    # ProductSchema results serialized to dicts.
    schemas: list[dict] = field(default_factory=list)
    # Most frequent error messages across all schemas (top 20).
    common_errors: list[str] = field(default_factory=list)
    # Most frequent warning messages across all schemas (top 20).
    common_warnings: list[str] = field(default_factory=list)
    # Naver-Shopping-specific issue dicts (url/type/severity/message/...).
    naver_shopping_issues: list[dict] = field(default_factory=list)
    # Overall 0-100 score; populated by calculate_score().
    score: int = 0
    # ISO-8601 timestamp of the run.
    timestamp: str = ""

    def calculate_score(self) -> int:
        """Score 0-100 based on schema completeness.

        Weighted blend: 40% page coverage, 35% schema validity,
        25% rich-result eligibility. Stores the score on the instance
        and returns it.
        """
        if not self.urls_checked:
            self.score = 0
            return 0

        # max(..., 1) avoids division by zero when no schemas were found.
        schema_total = max(len(self.schemas), 1)
        coverage = self.pages_with_schema / self.urls_checked
        validity_rate = sum(
            1 for entry in self.schemas if entry.get("is_valid")
        ) / schema_total
        eligibility_rate = sum(
            1 for entry in self.schemas if entry.get("rich_result_eligible")
        ) / schema_total

        self.score = int(coverage * 40 + validity_rate * 35 + eligibility_rate * 25)
        return self.score
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Schema requirements
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Product-level properties: required for a valid Product schema vs merely
# recommended (absence of a recommended property yields only a warning).
PRODUCT_REQUIRED = {"name", "image", "description"}
PRODUCT_RECOMMENDED = {
    "brand", "sku", "gtin", "gtin8", "gtin13", "gtin14", "mpn",
    "offers", "review", "aggregateRating", "color", "material",
}

# Offer-level properties (price/availability are what drive rich snippets).
OFFER_REQUIRED = {"price", "priceCurrency", "availability"}
OFFER_RECOMMENDED = {
    "url", "priceValidUntil", "itemCondition", "seller",
    "shippingDetails", "hasMerchantReturnPolicy",
}

# AggregateRating properties.
AGGREGATE_RATING_REQUIRED = {"ratingValue", "reviewCount"}
AGGREGATE_RATING_RECOMMENDED = {"bestRating", "worstRating", "ratingCount"}

# Review properties. NOTE: validate_review_schema checks author/reviewRating
# explicitly rather than iterating REVIEW_REQUIRED.
REVIEW_REQUIRED = {"author", "reviewRating"}
REVIEW_RECOMMENDED = {"datePublished", "reviewBody", "name"}

BREADCRUMB_REQUIRED = {"itemListElement"}

# Accepted Offer.availability values. Both https:// and http:// schema.org
# URLs are tolerated, plus the bare enumeration names publishers often emit.
AVAILABILITY_VALUES = {
    "https://schema.org/InStock",
    "https://schema.org/OutOfStock",
    "https://schema.org/PreOrder",
    "https://schema.org/BackOrder",
    "https://schema.org/Discontinued",
    "https://schema.org/InStoreOnly",
    "https://schema.org/OnlineOnly",
    "https://schema.org/LimitedAvailability",
    "https://schema.org/SoldOut",
    "http://schema.org/InStock",
    "http://schema.org/OutOfStock",
    "http://schema.org/PreOrder",
    "http://schema.org/BackOrder",
    "http://schema.org/Discontinued",
    "InStock", "OutOfStock", "PreOrder", "BackOrder", "Discontinued",
}

# Accepted Offer.itemCondition values (same URL/bare-name tolerance).
ITEM_CONDITION_VALUES = {
    "https://schema.org/NewCondition",
    "https://schema.org/UsedCondition",
    "https://schema.org/RefurbishedCondition",
    "https://schema.org/DamagedCondition",
    "http://schema.org/NewCondition",
    "http://schema.org/UsedCondition",
    "http://schema.org/RefurbishedCondition",
    "NewCondition", "UsedCondition", "RefurbishedCondition",
}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main checker
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class ProductSchemaChecker(BaseAsyncClient):
|
|
"""Validate Product structured data on e-commerce pages."""
|
|
|
|
def __init__(
    self,
    max_concurrent: int = 10,
    requests_per_second: float = 5.0,
    timeout: int = 30,
):
    """Initialize the checker.

    Args:
        max_concurrent: Max simultaneous requests (forwarded to BaseAsyncClient).
        requests_per_second: Rate limit (forwarded to BaseAsyncClient).
        timeout: Total per-request timeout in seconds.
    """
    super().__init__(max_concurrent=max_concurrent, requests_per_second=requests_per_second)
    # aiohttp expects a ClientTimeout object rather than a bare number.
    self.timeout = aiohttp.ClientTimeout(total=timeout)
    # Identify the crawler and prefer Korean content — the Naver-specific
    # checks below assume Korean-market pages when available.
    self.headers = {
        "User-Agent": (
            "Mozilla/5.0 (compatible; ProductSchemaChecker/1.0; "
            "+https://ourdigital.org)"
        ),
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    }
|
|
|
|
# ------------------------------------------------------------------
|
|
# Page fetching
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> str:
    """Fetch page HTML; returns "" on any fetch error (best-effort crawl).

    NOTE(review): ssl=False disables certificate verification — presumably
    deliberate to tolerate misconfigured shop TLS, but confirm this is intended.
    """
    try:
        async with session.get(url, headers=self.headers, timeout=self.timeout,
                               allow_redirects=True, ssl=False) as resp:
            # errors="replace" guards against pages whose declared encoding
            # does not match the actual bytes.
            return await resp.text(errors="replace")
    except Exception as exc:
        # A single failed page should not abort the whole run.
        self.logger.warning(f"Failed to fetch {url}: {exc}")
        return ""
|
|
|
|
# ------------------------------------------------------------------
|
|
# Schema extraction
|
|
# ------------------------------------------------------------------
|
|
|
|
def extract_schemas(self, html: str, page_url: str) -> list[dict]:
    """Extract all structured data from HTML (JSON-LD, Microdata, RDFa)."""
    collected: list[dict] = []
    soup = BeautifulSoup(html, "lxml")

    # --- JSON-LD blocks ---
    for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
        try:
            raw = script.string or script.get_text()
            if not raw:
                continue
            payload = json.loads(raw)
            if isinstance(payload, list):
                # Top-level array of schema objects.
                collected.extend(node for node in payload if isinstance(node, dict))
            elif isinstance(payload, dict):
                if "@graph" in payload:
                    # Flatten @graph containers into individual objects.
                    collected.extend(
                        node for node in payload["@graph"] if isinstance(node, dict)
                    )
                else:
                    collected.append(payload)
        except (json.JSONDecodeError, TypeError) as exc:
            # Malformed JSON-LD is common in the wild; skip quietly.
            self.logger.debug(f"JSON-LD parse error on {page_url}: {exc}")

    # --- Microdata (only Product/Offer scopes are of interest) ---
    for scope in soup.find_all(attrs={"itemscope": True}):
        declared_type = scope.get("itemtype", "")
        if "Product" in declared_type or "Offer" in declared_type:
            parsed = self._parse_microdata(scope)
            if parsed:
                collected.append(parsed)

    return collected
|
|
|
|
def _parse_microdata(self, element) -> dict:
    """Parse microdata from an itemscope element into a JSON-LD-like dict.

    Fix: the previous implementation collected every descendant with an
    ``itemprop`` attribute, so properties belonging to a NESTED itemscope
    (e.g. an Offer's ``price``) were also copied onto the parent item in
    addition to appearing in the nested dict. Properties whose nearest
    itemscope ancestor is not ``element`` are now skipped; they are handled
    by the recursive call on their own scope.
    """
    result: dict[str, Any] = {}

    # Map the itemtype URL (e.g. https://schema.org/Product) to a bare name.
    item_type = element.get("itemtype", "")
    if item_type:
        result["@type"] = item_type.rstrip("/").split("/")[-1]

    for prop in element.find_all(attrs={"itemprop": True}, recursive=True):
        name = prop.get("itemprop", "")
        if not name:
            continue

        # Skip properties owned by a nested itemscope (find_parent excludes
        # `prop` itself, so a nested scope element still resolves to `element`
        # and is recursed into below).
        owner = prop.find_parent(attrs={"itemscope": True})
        if owner is not None and owner is not element:
            continue

        if prop.get("itemscope") is not None:
            # Nested item: recurse to build its own dict.
            result[name] = self._parse_microdata(prop)
        elif prop.name == "meta":
            result[name] = prop.get("content", "")
        elif prop.name == "link":
            result[name] = prop.get("href", "")
        elif prop.name == "img":
            result[name] = prop.get("src", "")
        elif prop.name == "time":
            result[name] = prop.get("datetime", prop.get_text(strip=True))
        else:
            result[name] = prop.get_text(strip=True)

    return result
|
|
|
|
# ------------------------------------------------------------------
|
|
# Validation methods
|
|
# ------------------------------------------------------------------
|
|
|
|
def validate_product_schema(self, schema_data: dict, page_url: str) -> ProductSchema:
    """Validate a Product schema object.

    Checks required/recommended Product properties, then delegates nested
    objects to the Offer / AggregateRating / Review validators.
    """
    report = ProductSchema(url=page_url, schema_type="Product", properties=[])

    # Required Product properties: absence is an error.
    for prop_name in PRODUCT_REQUIRED:
        value = schema_data.get(prop_name)
        present = bool(value)
        message = "" if present else f"Missing required property: {prop_name}"
        entry = SchemaProperty(
            name=prop_name, value=value, required=True, valid=present, error=message,
        )
        report.properties.append(asdict(entry))
        if not present:
            report.errors.append(message)

    # Recommended Product properties: absence is only a warning.
    for prop_name in PRODUCT_RECOMMENDED:
        value = schema_data.get(prop_name)
        present = bool(value)
        entry = SchemaProperty(
            name=prop_name,
            value=value if present else None,
            required=False,
            valid=present,
            error="" if present else f"Missing recommended property: {prop_name}",
        )
        report.properties.append(asdict(entry))
        if not present:
            report.warnings.append(f"Missing recommended property: {prop_name}")

    # Nested offers: either a single object or a list of objects.
    offers = schema_data.get("offers")
    if not offers:
        report.errors.append("Missing 'offers' property (required for rich results)")
    else:
        offer_items = (
            offers if isinstance(offers, list)
            else [offers] if isinstance(offers, dict)
            else []
        )
        for offer in offer_items:
            verdict = self.validate_offer_schema(offer)
            report.errors.extend(verdict["errors"])
            report.warnings.extend(verdict["warnings"])

    # Nested aggregateRating: only validated when it is an object.
    agg_rating = schema_data.get("aggregateRating")
    if agg_rating and isinstance(agg_rating, dict):
        verdict = self.validate_aggregate_rating(agg_rating)
        report.errors.extend(verdict["errors"])
        report.warnings.extend(verdict["warnings"])

    # Nested reviews: check up to 5 reviews.
    review = schema_data.get("review")
    if review:
        review_items = review if isinstance(review, list) else [review]
        for candidate in review_items[:5]:
            if isinstance(candidate, dict):
                verdict = self.validate_review_schema(candidate)
                report.errors.extend(verdict["errors"])
                report.warnings.extend(verdict["warnings"])

    report.is_valid = not report.errors
    report.rich_result_eligible = self.check_rich_result_eligibility(schema_data)

    return report
|
|
|
|
def validate_offer_schema(self, offer_data: dict) -> dict[str, list[str]]:
    """Validate an Offer schema object.

    Returns:
        Dict with "errors" (rich-result blockers) and "warnings" lists.
    """
    errors: list[str] = []
    warnings: list[str] = []

    # Required Offer properties.
    for prop_name in OFFER_REQUIRED:
        if not offer_data.get(prop_name):
            errors.append(f"Offer missing required property: {prop_name}")

    # Price must be plain non-negative numeric text; thousands separators
    # are tolerated by stripping commas first.
    price = offer_data.get("price")
    if price is not None:
        price_str = str(price).replace(",", "").strip()
        if not re.match(r"^\d+(\.\d+)?$", price_str):
            errors.append(f"Invalid price format: '{price}' (must be numeric)")
        elif float(price_str) <= 0:
            warnings.append(f"Price is zero or negative: {price}")

    # priceCurrency: coerce to str before .upper() — JSON-LD values are not
    # guaranteed to be strings, and a non-str here used to raise AttributeError.
    currency = offer_data.get("priceCurrency", "")
    valid_currencies = {"KRW", "USD", "EUR", "JPY", "CNY", "GBP"}
    if currency and str(currency).upper() not in valid_currencies:
        warnings.append(f"Unusual currency code: {currency}")

    # availability: set membership is only meaningful for strings; an
    # unhashable value (dict/list node) used to raise TypeError here.
    availability = offer_data.get("availability", "")
    if availability and (
        not isinstance(availability, str) or availability not in AVAILABILITY_VALUES
    ):
        errors.append(
            f"Invalid availability value: '{availability}'. "
            f"Use schema.org values like https://schema.org/InStock"
        )

    # itemCondition: same hashability guard as availability.
    condition = offer_data.get("itemCondition", "")
    if condition and (
        not isinstance(condition, str) or condition not in ITEM_CONDITION_VALUES
    ):
        warnings.append(f"Invalid itemCondition: '{condition}'")

    # Recommended Offer properties: absence is only a warning.
    for prop_name in OFFER_RECOMMENDED:
        if not offer_data.get(prop_name):
            warnings.append(f"Offer missing recommended property: {prop_name}")

    return {"errors": errors, "warnings": warnings}
|
|
|
|
def validate_aggregate_rating(self, rating_data: dict) -> dict[str, list[str]]:
    """Validate AggregateRating schema."""
    problems: list[str] = []
    notes: list[str] = []

    # Required properties (None means absent; 0 is a legal ratingValue).
    for required in AGGREGATE_RATING_REQUIRED:
        if rating_data.get(required) is None:
            problems.append(f"AggregateRating missing required: {required}")

    # ratingValue must fall within [worstRating, bestRating]; defaults 1..5.
    rating_value = rating_data.get("ratingValue")
    if rating_value is not None:
        try:
            rv = float(rating_value)
            br = float(rating_data.get("bestRating", 5))
            wr = float(rating_data.get("worstRating", 1))
        except (ValueError, TypeError):
            problems.append(f"Invalid ratingValue format: {rating_value}")
        else:
            if rv < wr or rv > br:
                problems.append(
                    f"ratingValue ({rv}) outside range [{wr}, {br}]"
                )

    # reviewCount must be a non-negative integer.
    review_count = rating_data.get("reviewCount")
    if review_count is not None:
        try:
            count = int(review_count)
        except (ValueError, TypeError):
            problems.append(f"Invalid reviewCount format: {review_count}")
        else:
            if count < 0:
                problems.append(f"Negative reviewCount: {count}")

    # Recommended properties: absence is only a warning.
    for recommended in AGGREGATE_RATING_RECOMMENDED:
        if not rating_data.get(recommended):
            notes.append(f"AggregateRating missing recommended: {recommended}")

    return {"errors": problems, "warnings": notes}
|
|
|
|
def validate_review_schema(self, review_data: dict) -> dict[str, list[str]]:
    """Validate Review schema."""
    problems: list[str] = []
    notes: list[str] = []

    # Author may be a Person/Organization object or a plain string.
    author = review_data.get("author")
    if not author:
        problems.append("Review missing required: author")
    elif isinstance(author, dict) and not author.get("name", ""):
        problems.append("Review author missing 'name' property")
    elif isinstance(author, str) and len(author.strip()) == 0:
        problems.append("Review author is empty string")

    # reviewRating must be present and, if an object, carry a ratingValue.
    rating = review_data.get("reviewRating")
    if not rating:
        problems.append("Review missing required: reviewRating")
    elif isinstance(rating, dict) and rating.get("ratingValue") is None:
        problems.append("reviewRating missing ratingValue")

    # Recommended properties: absence is only a warning.
    for recommended in REVIEW_RECOMMENDED:
        if not review_data.get(recommended):
            notes.append(f"Review missing recommended: {recommended}")

    return {"errors": problems, "warnings": notes}
|
|
|
|
def validate_breadcrumb(self, schema_data: dict) -> dict[str, list[str]]:
    """Validate BreadcrumbList schema."""
    problems: list[str] = []
    notes: list[str] = []

    elements = schema_data.get("itemListElement")
    if not elements:
        problems.append("BreadcrumbList missing itemListElement")
        return {"errors": problems, "warnings": notes}
    if not isinstance(elements, list):
        problems.append("itemListElement should be an array")
        return {"errors": problems, "warnings": notes}

    for idx, entry in enumerate(elements):
        if not isinstance(entry, dict):
            problems.append(f"Breadcrumb item {idx} is not an object")
            continue
        if entry.get("position") is None:
            problems.append(f"Breadcrumb item {idx} missing 'position'")
        # The display name may live on the item itself or its nested 'item'.
        nested = entry.get("item")
        label = entry.get("name") or (
            nested.get("name") if isinstance(nested, dict) else None
        )
        if not label:
            notes.append(f"Breadcrumb item {idx} missing 'name'")

    return {"errors": problems, "warnings": notes}
|
|
|
|
# ------------------------------------------------------------------
|
|
# Rich result eligibility
|
|
# ------------------------------------------------------------------
|
|
|
|
def check_rich_result_eligibility(self, schema_data: dict) -> bool:
    """Assess Google rich result eligibility for Product schema.

    Requires name, image, and at least one offer carrying price,
    priceCurrency, and availability.
    """
    if not (schema_data.get("name") and schema_data.get("image")):
        return False

    offers = schema_data.get("offers")
    if not offers:
        return False

    candidates = offers if isinstance(offers, list) else [offers]
    return any(
        isinstance(offer, dict)
        and offer.get("price")
        and offer.get("priceCurrency")
        and offer.get("availability")
        for offer in candidates
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Naver Shopping requirements
|
|
# ------------------------------------------------------------------
|
|
|
|
def check_naver_shopping_requirements(self, schema_data: dict, page_url: str) -> list[dict]:
    """Check Naver Shopping specific schema requirements."""
    issues: list[dict] = []

    def flag(issue_type: str, severity: str, message: str, recommendation: str) -> None:
        """Append one issue dict in the standard shape."""
        issues.append({
            "url": page_url,
            "type": issue_type,
            "severity": severity,
            "message": message,
            "recommendation": recommendation,
        })

    # Naver Shopping requires Product name in Korean for Korean market.
    name = schema_data.get("name", "")
    if name and not re.search(r"[\uac00-\ud7af]", str(name)):
        flag(
            "naver_product_name",
            "medium",
            "Product name has no Korean characters",
            "Include Korean product name for Naver Shopping visibility.",
        )

    # Naver prefers specific category mapping.
    if not schema_data.get("category"):
        flag(
            "naver_category",
            "low",
            "Missing 'category' property for Naver Shopping categorization",
            "Add category property matching Naver Shopping category taxonomy.",
        )

    # Naver requires an image; string image URLs should be absolute.
    image = schema_data.get("image")
    if not image:
        flag(
            "naver_image",
            "high",
            "Missing product image (required for Naver Shopping)",
            "Add at least one high-quality product image URL.",
        )
    elif isinstance(image, str) and not image.startswith("http"):
        flag(
            "naver_image_url",
            "medium",
            "Product image URL is relative (should be absolute)",
            "Use absolute URLs for product images.",
        )

    # Naver requires price in KRW.
    offers = schema_data.get("offers")
    if offers:
        for offer in (offers if isinstance(offers, list) else [offers]):
            if not isinstance(offer, dict):
                continue
            currency = offer.get("priceCurrency", "")
            if currency and currency.upper() != "KRW":
                flag(
                    "naver_currency",
                    "medium",
                    f"Price currency is {currency}, not KRW",
                    "For Naver Shopping, provide price in KRW.",
                )

    # Brand/manufacturer feeds Naver Shopping filters.
    if not schema_data.get("brand") and not schema_data.get("manufacturer"):
        flag(
            "naver_brand",
            "low",
            "Missing brand/manufacturer (helpful for Naver Shopping filters)",
            "Add brand or manufacturer property.",
        )

    return issues
|
|
|
|
# ------------------------------------------------------------------
|
|
# Orchestrator
|
|
# ------------------------------------------------------------------
|
|
|
|
async def check(
    self,
    urls: list[str] | None = None,
    sitemap_url: str | None = None,
    sample_size: int = 50,
) -> SchemaCheckResult:
    """Run schema validation on URLs or sitemap.

    Args:
        urls: Explicit page URLs to validate.
        sitemap_url: Sitemap to sample product URLs from (combined with urls).
        sample_size: Cap on the total number of URLs checked.

    Returns:
        SchemaCheckResult with per-schema reports, most-common error/warning
        lists, Naver Shopping issues, and an overall 0-100 score.
    """
    result = SchemaCheckResult(timestamp=datetime.now().isoformat())
    target_urls: list[str] = []

    async with aiohttp.ClientSession() as session:
        if sitemap_url:
            # Fetch URLs from sitemap
            target_urls = await self._urls_from_sitemap(session, sitemap_url, sample_size)
        if urls:
            target_urls.extend(urls)

        # De-duplicate, then cap. NOTE(review): set() loses input order, so
        # which URLs survive the cap is non-deterministic across runs.
        target_urls = list(set(target_urls))[:sample_size]
        result.urls_checked = len(target_urls)
        self.logger.info(f"Checking {len(target_urls)} URLs for Product schema")

        # Frequency counters used to surface the most common problems.
        error_counter: dict[str, int] = {}
        warning_counter: dict[str, int] = {}

        for url in target_urls:
            html = await self._fetch_page(session, url)
            if not html:
                # Unfetchable pages are counted as having no schema.
                result.pages_without_schema += 1
                continue

            schemas = self.extract_schemas(html, url)
            product_schemas = [
                s for s in schemas
                if self._get_schema_type(s) in ("Product", "ProductGroup")
            ]
            breadcrumb_schemas = [
                s for s in schemas
                if self._get_schema_type(s) == "BreadcrumbList"
            ]

            if not product_schemas:
                result.pages_without_schema += 1
                continue

            result.pages_with_schema += 1

            for ps_data in product_schemas:
                ps = self.validate_product_schema(ps_data, url)
                result.schemas.append(asdict(ps))

                # Tally message frequency across all schemas.
                for err in ps.errors:
                    error_counter[err] = error_counter.get(err, 0) + 1
                for warn in ps.warnings:
                    warning_counter[warn] = warning_counter.get(warn, 0) + 1

                # Naver Shopping checks
                naver_issues = self.check_naver_shopping_requirements(ps_data, url)
                result.naver_shopping_issues.extend(naver_issues)

            # Validate breadcrumbs (errors only feed the common-error tally).
            for bc_data in breadcrumb_schemas:
                bc_result = self.validate_breadcrumb(bc_data)
                for err in bc_result["errors"]:
                    error_counter[err] = error_counter.get(err, 0) + 1

    # Aggregate the 20 most frequent errors/warnings (most frequent first).
    result.common_errors = sorted(
        error_counter.keys(),
        key=lambda k: error_counter[k],
        reverse=True,
    )[:20]
    result.common_warnings = sorted(
        warning_counter.keys(),
        key=lambda k: warning_counter[k],
        reverse=True,
    )[:20]

    result.calculate_score()
    return result
|
|
|
|
async def _urls_from_sitemap(
    self,
    session: aiohttp.ClientSession,
    sitemap_url: str,
    limit: int,
) -> list[str]:
    """Fetch product URLs from sitemap.

    Handles both sitemap index files (recursing into at most the first 3
    child sitemaps) and plain <urlset> files. Returns at most ``limit``
    URLs; any fetch/parse failure is logged and whatever was collected so
    far is returned.
    """
    urls: list[str] = []
    try:
        async with session.get(sitemap_url, headers=self.headers,
                               timeout=self.timeout, ssl=False) as resp:
            if resp.status != 200:
                return urls
            text = await resp.text(errors="replace")
            # lxml-xml keeps tag case, which matters for sitemap elements.
            soup = BeautifulSoup(text, "lxml-xml")

            # Handle sitemap index
            sitemapindex = soup.find_all("sitemap")
            if sitemapindex:
                for sm in sitemapindex[:3]:
                    loc = sm.find("loc")
                    if loc:
                        # Recurse into the child sitemap with the same limit.
                        child_urls = await self._urls_from_sitemap(session, loc.text.strip(), limit)
                        urls.extend(child_urls)
                        if len(urls) >= limit:
                            break
            else:
                # Plain <urlset>: collect <loc> entries up to the limit.
                for tag in soup.find_all("url"):
                    loc = tag.find("loc")
                    if loc:
                        urls.append(loc.text.strip())
                        if len(urls) >= limit:
                            break
    except Exception as exc:
        # Best-effort: a broken sitemap should not abort the whole check.
        self.logger.warning(f"Sitemap parse failed: {exc}")

    return urls[:limit]
|
|
|
|
@staticmethod
|
|
def _get_schema_type(schema: dict) -> str:
|
|
"""Get the @type from a schema dict, handling various formats."""
|
|
schema_type = schema.get("@type", "")
|
|
if isinstance(schema_type, list):
|
|
return schema_type[0] if schema_type else ""
|
|
return str(schema_type)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CLI output helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def print_rich_report(result: SchemaCheckResult) -> None:
    """Print a rich-formatted report to the console."""

    def _tier(value: float, good: float, ok: float) -> str:
        """Map a metric to a traffic-light color."""
        if value >= good:
            return "green"
        if value >= ok:
            return "yellow"
        return "red"

    console.print(f"\n[bold cyan]Product Schema Validation Report[/bold cyan]")
    console.print(f"Timestamp: {result.timestamp}")
    console.print(f"URLs checked: {result.urls_checked}")

    # Coverage: share of checked pages that carried a Product schema.
    checked = result.urls_checked
    coverage_pct = (result.pages_with_schema / max(checked, 1)) * 100
    cov_color = _tier(coverage_pct, 90, 50)
    console.print(
        f"Schema coverage: [{cov_color}]{coverage_pct:.0f}%[/{cov_color}] "
        f"({result.pages_with_schema}/{checked})"
    )

    # Overall score.
    score_color = _tier(result.score, 80, 50)
    console.print(f"[bold {score_color}]Score: {result.score}/100[/bold {score_color}]")

    # Validity summary table.
    valid_count = sum(1 for s in result.schemas if s.get("is_valid"))
    eligible_count = sum(1 for s in result.schemas if s.get("rich_result_eligible"))

    summary = Table(title="Schema Summary")
    summary.add_column("Metric", style="bold")
    summary.add_column("Value", justify="right")
    summary.add_row("Total schemas found", str(len(result.schemas)))
    summary.add_row("Valid schemas", str(valid_count))
    summary.add_row("Rich result eligible", str(eligible_count))
    summary.add_row("Pages without schema", str(result.pages_without_schema))
    console.print(summary)

    # Top errors / warnings (already sorted by frequency upstream).
    if result.common_errors:
        console.print(f"\n[bold red]Common Errors ({len(result.common_errors)}):[/bold red]")
        for message in result.common_errors[:10]:
            console.print(f" [red]-[/red] {message}")

    if result.common_warnings:
        console.print(f"\n[bold yellow]Common Warnings ({len(result.common_warnings)}):[/bold yellow]")
        for message in result.common_warnings[:10]:
            console.print(f" [yellow]-[/yellow] {message}")

    # Naver Shopping issues, de-duplicated by (type, message).
    if result.naver_shopping_issues:
        console.print(f"\n[bold magenta]Naver Shopping Issues ({len(result.naver_shopping_issues)}):[/bold magenta]")
        reported: set[str] = set()
        for issue in result.naver_shopping_issues:
            dedupe_key = f"{issue['type']}:{issue['message']}"
            if dedupe_key in reported:
                continue
            reported.add(dedupe_key)
            console.print(f" [{issue.get('severity', 'medium')}] {issue['message']}")
            console.print(f" [dim]{issue['recommendation']}[/dim]")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def main() -> None:
    """CLI entry point: parse arguments, run the checker, emit the report."""
    parser = argparse.ArgumentParser(
        description="Product Schema Checker - Validate e-commerce structured data",
    )
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("--url", nargs="+", help="Product page URL(s) to validate")
    source.add_argument("--sitemap", help="Sitemap URL to fetch product pages from")
    parser.add_argument(
        "--sample",
        type=int,
        default=50,
        help="Max URLs to check from sitemap (default: 50)",
    )
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument("--output", type=str, help="Save output to file")
    args = parser.parse_args()

    checker = ProductSchemaChecker()
    result = asyncio.run(
        checker.check(
            urls=args.url,
            sitemap_url=args.sitemap,
            sample_size=args.sample,
        )
    )

    def _as_json() -> str:
        """Serialize the result; default=str covers non-JSON-native values."""
        return json.dumps(asdict(result), indent=2, ensure_ascii=False, default=str)

    if args.json:
        payload = _as_json()
        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(payload)
            console.print(f"[green]Results saved to {args.output}[/green]")
        else:
            print(payload)
    else:
        print_rich_report(result)
        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(_as_json())
            console.print(f"\n[green]JSON results also saved to {args.output}[/green]")

    checker.print_stats()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|