12 new skills: Keyword Strategy, SERP Analysis, Position Tracking, Link Building, Content Strategy, E-Commerce SEO, KPI Framework, International SEO, AI Visibility, Knowledge Graph, Competitor Intel, and Crawl Budget. ~20K lines of Python across 25 domain scripts. Updated skill 11 pipeline table and repo CLAUDE.md. Enhanced skill 18 local SEO workflow from jamie.clinic audit. Note: Skill 26 hreflang_validator.py pending (content filter block). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
806 lines
31 KiB
Python
806 lines
31 KiB
Python
"""
|
|
Product Schema Checker
|
|
======================
|
|
Purpose: Validate Product structured data (JSON-LD, Microdata, RDFa)
|
|
for Google and Naver rich result eligibility.
|
|
Python: 3.10+
|
|
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import re
|
|
import sys
|
|
from dataclasses import asdict, dataclass, field
|
|
from datetime import datetime
|
|
from typing import Any
|
|
from urllib.parse import urljoin, urlparse
|
|
|
|
import aiohttp
|
|
from bs4 import BeautifulSoup
|
|
from rich.console import Console
|
|
from rich.table import Table
|
|
|
|
from base_client import BaseAsyncClient, config
|
|
|
|
# Module-level logger; handler/level configuration is left to the application.
logger = logging.getLogger(__name__)

# Shared Rich console used by the CLI reporting helpers below.
console = Console()
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Data classes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@dataclass
class SchemaProperty:
    """Single property within a schema object."""

    # Property name as it appears in the schema (e.g. "name", "offers").
    name: str
    # Raw value extracted from the schema; None when absent.
    value: Any
    # True when this property is required (vs merely recommended).
    required: bool
    # Whether the value passed the (truthy-presence) validation check.
    valid: bool
    # Human-readable validation error; empty string when valid.
    error: str = ""
|
|
|
|
|
|
@dataclass
class ProductSchema:
    """Validation result for one product schema on a page."""

    # Page URL the schema was extracted from.
    url: str
    # Product, Offer, AggregateRating, etc.
    schema_type: str
    # list of SchemaProperty serialized via dataclasses.asdict
    properties: list[dict]
    # True when no errors were recorded during validation.
    is_valid: bool = False
    # True when the schema meets Google's minimum rich-result requirements.
    rich_result_eligible: bool = False
    # Blocking problems (missing required properties, invalid values).
    errors: list[str] = field(default_factory=list)
    # Non-blocking problems (missing recommended properties).
    warnings: list[str] = field(default_factory=list)
|
|
|
|
|
|
@dataclass
class SchemaCheckResult:
    """Complete schema check result for one or more pages."""

    # How many URLs were actually fetched and inspected.
    urls_checked: int = 0
    # Pages that contained at least one Product schema.
    pages_with_schema: int = 0
    # Pages with no Product schema (or that failed to fetch).
    pages_without_schema: int = 0
    # ProductSchema results serialized to dicts.
    schemas: list[dict] = field(default_factory=list)
    # Most frequent error messages across all schemas (top 20).
    common_errors: list[str] = field(default_factory=list)
    # Most frequent warning messages across all schemas (top 20).
    common_warnings: list[str] = field(default_factory=list)
    # Naver-Shopping-specific issue dicts (url/type/severity/message/...).
    naver_shopping_issues: list[dict] = field(default_factory=list)
    # Overall 0-100 score; populated by calculate_score().
    score: int = 0
    # ISO-8601 timestamp of the run.
    timestamp: str = ""

    def calculate_score(self) -> int:
        """Score 0-100 based on schema completeness.

        Weighted blend: 40% page coverage, 35% schema validity,
        25% rich-result eligibility. Stores the score on the instance
        and returns it.
        """
        if not self.urls_checked:
            self.score = 0
            return 0

        # max(..., 1) avoids division by zero when no schemas were found.
        schema_total = max(len(self.schemas), 1)
        coverage = self.pages_with_schema / self.urls_checked
        validity_rate = sum(
            1 for entry in self.schemas if entry.get("is_valid")
        ) / schema_total
        eligibility_rate = sum(
            1 for entry in self.schemas if entry.get("rich_result_eligible")
        ) / schema_total

        self.score = int(coverage * 40 + validity_rate * 35 + eligibility_rate * 25)
        return self.score
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Schema requirements
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Product-level properties: required for a valid Product schema vs merely
# recommended (absence of a recommended property yields only a warning).
PRODUCT_REQUIRED = {"name", "image", "description"}
PRODUCT_RECOMMENDED = {
    "brand", "sku", "gtin", "gtin8", "gtin13", "gtin14", "mpn",
    "offers", "review", "aggregateRating", "color", "material",
}

# Offer-level properties (price/availability are what drive rich snippets).
OFFER_REQUIRED = {"price", "priceCurrency", "availability"}
OFFER_RECOMMENDED = {
    "url", "priceValidUntil", "itemCondition", "seller",
    "shippingDetails", "hasMerchantReturnPolicy",
}

# AggregateRating properties.
AGGREGATE_RATING_REQUIRED = {"ratingValue", "reviewCount"}
AGGREGATE_RATING_RECOMMENDED = {"bestRating", "worstRating", "ratingCount"}

# Review properties. NOTE: validate_review_schema checks author/reviewRating
# explicitly rather than iterating REVIEW_REQUIRED.
REVIEW_REQUIRED = {"author", "reviewRating"}
REVIEW_RECOMMENDED = {"datePublished", "reviewBody", "name"}

BREADCRUMB_REQUIRED = {"itemListElement"}

# Accepted Offer.availability values. Both https:// and http:// schema.org
# URLs are tolerated, plus the bare enumeration names publishers often emit.
AVAILABILITY_VALUES = {
    "https://schema.org/InStock",
    "https://schema.org/OutOfStock",
    "https://schema.org/PreOrder",
    "https://schema.org/BackOrder",
    "https://schema.org/Discontinued",
    "https://schema.org/InStoreOnly",
    "https://schema.org/OnlineOnly",
    "https://schema.org/LimitedAvailability",
    "https://schema.org/SoldOut",
    "http://schema.org/InStock",
    "http://schema.org/OutOfStock",
    "http://schema.org/PreOrder",
    "http://schema.org/BackOrder",
    "http://schema.org/Discontinued",
    "InStock", "OutOfStock", "PreOrder", "BackOrder", "Discontinued",
}

# Accepted Offer.itemCondition values (same URL/bare-name tolerance).
ITEM_CONDITION_VALUES = {
    "https://schema.org/NewCondition",
    "https://schema.org/UsedCondition",
    "https://schema.org/RefurbishedCondition",
    "https://schema.org/DamagedCondition",
    "http://schema.org/NewCondition",
    "http://schema.org/UsedCondition",
    "http://schema.org/RefurbishedCondition",
    "NewCondition", "UsedCondition", "RefurbishedCondition",
}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main checker
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class ProductSchemaChecker(BaseAsyncClient):
|
|
"""Validate Product structured data on e-commerce pages."""
|
|
|
|
def __init__(
    self,
    max_concurrent: int = 10,
    requests_per_second: float = 5.0,
    timeout: int = 30,
):
    """Initialize the checker.

    Args:
        max_concurrent: Max simultaneous requests (forwarded to BaseAsyncClient).
        requests_per_second: Rate limit (forwarded to BaseAsyncClient).
        timeout: Total per-request timeout in seconds.
    """
    super().__init__(max_concurrent=max_concurrent, requests_per_second=requests_per_second)
    # aiohttp expects a ClientTimeout object rather than a bare number.
    self.timeout = aiohttp.ClientTimeout(total=timeout)
    # Identify the crawler and prefer Korean content — the Naver-specific
    # checks below assume Korean-market pages when available.
    self.headers = {
        "User-Agent": (
            "Mozilla/5.0 (compatible; ProductSchemaChecker/1.0; "
            "+https://ourdigital.org)"
        ),
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    }
|
|
|
|
# ------------------------------------------------------------------
|
|
# Page fetching
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> str:
    """Fetch page HTML; returns "" on any fetch error (best-effort crawl).

    NOTE(review): ssl=False disables certificate verification — presumably
    deliberate to tolerate misconfigured shop TLS, but confirm this is intended.
    """
    try:
        async with session.get(url, headers=self.headers, timeout=self.timeout,
                               allow_redirects=True, ssl=False) as resp:
            # errors="replace" guards against pages whose declared encoding
            # does not match the actual bytes.
            return await resp.text(errors="replace")
    except Exception as exc:
        # A single failed page should not abort the whole run.
        self.logger.warning(f"Failed to fetch {url}: {exc}")
        return ""
|
|
|
|
# ------------------------------------------------------------------
|
|
# Schema extraction
|
|
# ------------------------------------------------------------------
|
|
|
|
def extract_schemas(self, html: str, page_url: str) -> list[dict]:
    """Extract all structured data from HTML (JSON-LD, Microdata, RDFa)."""
    collected: list[dict] = []
    soup = BeautifulSoup(html, "lxml")

    # --- JSON-LD blocks ---
    for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
        try:
            raw = script.string or script.get_text()
            if not raw:
                continue
            payload = json.loads(raw)
            if isinstance(payload, list):
                # Top-level array of schema objects.
                collected.extend(node for node in payload if isinstance(node, dict))
            elif isinstance(payload, dict):
                if "@graph" in payload:
                    # Flatten @graph containers into individual objects.
                    collected.extend(
                        node for node in payload["@graph"] if isinstance(node, dict)
                    )
                else:
                    collected.append(payload)
        except (json.JSONDecodeError, TypeError) as exc:
            # Malformed JSON-LD is common in the wild; skip quietly.
            self.logger.debug(f"JSON-LD parse error on {page_url}: {exc}")

    # --- Microdata (only Product/Offer scopes are of interest) ---
    for scope in soup.find_all(attrs={"itemscope": True}):
        declared_type = scope.get("itemtype", "")
        if "Product" in declared_type or "Offer" in declared_type:
            parsed = self._parse_microdata(scope)
            if parsed:
                collected.append(parsed)

    return collected
|
|
|
|
def _parse_microdata(self, element) -> dict:
    """Parse microdata from an itemscope element into a JSON-LD-like dict.

    Fix: the previous implementation collected every descendant with an
    ``itemprop`` attribute, so properties belonging to a NESTED itemscope
    (e.g. an Offer's ``price``) were also copied onto the parent item in
    addition to appearing in the nested dict. Properties whose nearest
    itemscope ancestor is not ``element`` are now skipped; they are handled
    by the recursive call on their own scope.
    """
    result: dict[str, Any] = {}

    # Map the itemtype URL (e.g. https://schema.org/Product) to a bare name.
    item_type = element.get("itemtype", "")
    if item_type:
        result["@type"] = item_type.rstrip("/").split("/")[-1]

    for prop in element.find_all(attrs={"itemprop": True}, recursive=True):
        name = prop.get("itemprop", "")
        if not name:
            continue

        # Skip properties owned by a nested itemscope (find_parent excludes
        # `prop` itself, so a nested scope element still resolves to `element`
        # and is recursed into below).
        owner = prop.find_parent(attrs={"itemscope": True})
        if owner is not None and owner is not element:
            continue

        if prop.get("itemscope") is not None:
            # Nested item: recurse to build its own dict.
            result[name] = self._parse_microdata(prop)
        elif prop.name == "meta":
            result[name] = prop.get("content", "")
        elif prop.name == "link":
            result[name] = prop.get("href", "")
        elif prop.name == "img":
            result[name] = prop.get("src", "")
        elif prop.name == "time":
            result[name] = prop.get("datetime", prop.get_text(strip=True))
        else:
            result[name] = prop.get_text(strip=True)

    return result
|
|
|
|
# ------------------------------------------------------------------
|
|
# Validation methods
|
|
# ------------------------------------------------------------------
|
|
|
|
def validate_product_schema(self, schema_data: dict, page_url: str) -> ProductSchema:
    """Validate a Product schema object.

    Checks required/recommended Product properties, then delegates nested
    objects to the Offer / AggregateRating / Review validators.
    """
    report = ProductSchema(url=page_url, schema_type="Product", properties=[])

    # Required Product properties: absence is an error.
    for prop_name in PRODUCT_REQUIRED:
        value = schema_data.get(prop_name)
        present = bool(value)
        message = "" if present else f"Missing required property: {prop_name}"
        entry = SchemaProperty(
            name=prop_name, value=value, required=True, valid=present, error=message,
        )
        report.properties.append(asdict(entry))
        if not present:
            report.errors.append(message)

    # Recommended Product properties: absence is only a warning.
    for prop_name in PRODUCT_RECOMMENDED:
        value = schema_data.get(prop_name)
        present = bool(value)
        entry = SchemaProperty(
            name=prop_name,
            value=value if present else None,
            required=False,
            valid=present,
            error="" if present else f"Missing recommended property: {prop_name}",
        )
        report.properties.append(asdict(entry))
        if not present:
            report.warnings.append(f"Missing recommended property: {prop_name}")

    # Nested offers: either a single object or a list of objects.
    offers = schema_data.get("offers")
    if not offers:
        report.errors.append("Missing 'offers' property (required for rich results)")
    else:
        offer_items = (
            offers if isinstance(offers, list)
            else [offers] if isinstance(offers, dict)
            else []
        )
        for offer in offer_items:
            verdict = self.validate_offer_schema(offer)
            report.errors.extend(verdict["errors"])
            report.warnings.extend(verdict["warnings"])

    # Nested aggregateRating: only validated when it is an object.
    agg_rating = schema_data.get("aggregateRating")
    if agg_rating and isinstance(agg_rating, dict):
        verdict = self.validate_aggregate_rating(agg_rating)
        report.errors.extend(verdict["errors"])
        report.warnings.extend(verdict["warnings"])

    # Nested reviews: check up to 5 reviews.
    review = schema_data.get("review")
    if review:
        review_items = review if isinstance(review, list) else [review]
        for candidate in review_items[:5]:
            if isinstance(candidate, dict):
                verdict = self.validate_review_schema(candidate)
                report.errors.extend(verdict["errors"])
                report.warnings.extend(verdict["warnings"])

    report.is_valid = not report.errors
    report.rich_result_eligible = self.check_rich_result_eligibility(schema_data)

    return report
|
|
|
|
def validate_offer_schema(self, offer_data: dict) -> dict[str, list[str]]:
    """Validate an Offer schema object.

    Returns:
        Dict with "errors" (rich-result blockers) and "warnings" lists.
    """
    errors: list[str] = []
    warnings: list[str] = []

    # Required Offer properties.
    for prop_name in OFFER_REQUIRED:
        if not offer_data.get(prop_name):
            errors.append(f"Offer missing required property: {prop_name}")

    # Price must be plain non-negative numeric text; thousands separators
    # are tolerated by stripping commas first.
    price = offer_data.get("price")
    if price is not None:
        price_str = str(price).replace(",", "").strip()
        if not re.match(r"^\d+(\.\d+)?$", price_str):
            errors.append(f"Invalid price format: '{price}' (must be numeric)")
        elif float(price_str) <= 0:
            warnings.append(f"Price is zero or negative: {price}")

    # priceCurrency: coerce to str before .upper() — JSON-LD values are not
    # guaranteed to be strings, and a non-str here used to raise AttributeError.
    currency = offer_data.get("priceCurrency", "")
    valid_currencies = {"KRW", "USD", "EUR", "JPY", "CNY", "GBP"}
    if currency and str(currency).upper() not in valid_currencies:
        warnings.append(f"Unusual currency code: {currency}")

    # availability: set membership is only meaningful for strings; an
    # unhashable value (dict/list node) used to raise TypeError here.
    availability = offer_data.get("availability", "")
    if availability and (
        not isinstance(availability, str) or availability not in AVAILABILITY_VALUES
    ):
        errors.append(
            f"Invalid availability value: '{availability}'. "
            f"Use schema.org values like https://schema.org/InStock"
        )

    # itemCondition: same hashability guard as availability.
    condition = offer_data.get("itemCondition", "")
    if condition and (
        not isinstance(condition, str) or condition not in ITEM_CONDITION_VALUES
    ):
        warnings.append(f"Invalid itemCondition: '{condition}'")

    # Recommended Offer properties: absence is only a warning.
    for prop_name in OFFER_RECOMMENDED:
        if not offer_data.get(prop_name):
            warnings.append(f"Offer missing recommended property: {prop_name}")

    return {"errors": errors, "warnings": warnings}
|
|
|
|
def validate_aggregate_rating(self, rating_data: dict) -> dict[str, list[str]]:
    """Validate AggregateRating schema."""
    problems: list[str] = []
    notes: list[str] = []

    # Required properties (None means absent; 0 is a legal ratingValue).
    for required in AGGREGATE_RATING_REQUIRED:
        if rating_data.get(required) is None:
            problems.append(f"AggregateRating missing required: {required}")

    # ratingValue must fall within [worstRating, bestRating]; defaults 1..5.
    rating_value = rating_data.get("ratingValue")
    if rating_value is not None:
        try:
            rv = float(rating_value)
            br = float(rating_data.get("bestRating", 5))
            wr = float(rating_data.get("worstRating", 1))
        except (ValueError, TypeError):
            problems.append(f"Invalid ratingValue format: {rating_value}")
        else:
            if rv < wr or rv > br:
                problems.append(
                    f"ratingValue ({rv}) outside range [{wr}, {br}]"
                )

    # reviewCount must be a non-negative integer.
    review_count = rating_data.get("reviewCount")
    if review_count is not None:
        try:
            count = int(review_count)
        except (ValueError, TypeError):
            problems.append(f"Invalid reviewCount format: {review_count}")
        else:
            if count < 0:
                problems.append(f"Negative reviewCount: {count}")

    # Recommended properties: absence is only a warning.
    for recommended in AGGREGATE_RATING_RECOMMENDED:
        if not rating_data.get(recommended):
            notes.append(f"AggregateRating missing recommended: {recommended}")

    return {"errors": problems, "warnings": notes}
|
|
|
|
def validate_review_schema(self, review_data: dict) -> dict[str, list[str]]:
    """Validate Review schema."""
    problems: list[str] = []
    notes: list[str] = []

    # Author may be a Person/Organization object or a plain string.
    author = review_data.get("author")
    if not author:
        problems.append("Review missing required: author")
    elif isinstance(author, dict) and not author.get("name", ""):
        problems.append("Review author missing 'name' property")
    elif isinstance(author, str) and len(author.strip()) == 0:
        problems.append("Review author is empty string")

    # reviewRating must be present and, if an object, carry a ratingValue.
    rating = review_data.get("reviewRating")
    if not rating:
        problems.append("Review missing required: reviewRating")
    elif isinstance(rating, dict) and rating.get("ratingValue") is None:
        problems.append("reviewRating missing ratingValue")

    # Recommended properties: absence is only a warning.
    for recommended in REVIEW_RECOMMENDED:
        if not review_data.get(recommended):
            notes.append(f"Review missing recommended: {recommended}")

    return {"errors": problems, "warnings": notes}
|
|
|
|
def validate_breadcrumb(self, schema_data: dict) -> dict[str, list[str]]:
    """Validate BreadcrumbList schema."""
    problems: list[str] = []
    notes: list[str] = []

    elements = schema_data.get("itemListElement")
    if not elements:
        problems.append("BreadcrumbList missing itemListElement")
        return {"errors": problems, "warnings": notes}
    if not isinstance(elements, list):
        problems.append("itemListElement should be an array")
        return {"errors": problems, "warnings": notes}

    for idx, entry in enumerate(elements):
        if not isinstance(entry, dict):
            problems.append(f"Breadcrumb item {idx} is not an object")
            continue
        if entry.get("position") is None:
            problems.append(f"Breadcrumb item {idx} missing 'position'")
        # The display name may live on the item itself or its nested 'item'.
        nested = entry.get("item")
        label = entry.get("name") or (
            nested.get("name") if isinstance(nested, dict) else None
        )
        if not label:
            notes.append(f"Breadcrumb item {idx} missing 'name'")

    return {"errors": problems, "warnings": notes}
|
|
|
|
# ------------------------------------------------------------------
|
|
# Rich result eligibility
|
|
# ------------------------------------------------------------------
|
|
|
|
def check_rich_result_eligibility(self, schema_data: dict) -> bool:
    """Assess Google rich result eligibility for Product schema.

    Requires name, image, and at least one offer carrying price,
    priceCurrency, and availability.
    """
    if not (schema_data.get("name") and schema_data.get("image")):
        return False

    offers = schema_data.get("offers")
    if not offers:
        return False

    candidates = offers if isinstance(offers, list) else [offers]
    return any(
        isinstance(offer, dict)
        and offer.get("price")
        and offer.get("priceCurrency")
        and offer.get("availability")
        for offer in candidates
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Naver Shopping requirements
|
|
# ------------------------------------------------------------------
|
|
|
|
def check_naver_shopping_requirements(self, schema_data: dict, page_url: str) -> list[dict]:
    """Check Naver Shopping specific schema requirements."""
    issues: list[dict] = []

    def flag(issue_type: str, severity: str, message: str, recommendation: str) -> None:
        """Append one issue dict in the standard shape."""
        issues.append({
            "url": page_url,
            "type": issue_type,
            "severity": severity,
            "message": message,
            "recommendation": recommendation,
        })

    # Naver Shopping requires Product name in Korean for Korean market.
    name = schema_data.get("name", "")
    if name and not re.search(r"[\uac00-\ud7af]", str(name)):
        flag(
            "naver_product_name",
            "medium",
            "Product name has no Korean characters",
            "Include Korean product name for Naver Shopping visibility.",
        )

    # Naver prefers specific category mapping.
    if not schema_data.get("category"):
        flag(
            "naver_category",
            "low",
            "Missing 'category' property for Naver Shopping categorization",
            "Add category property matching Naver Shopping category taxonomy.",
        )

    # Naver requires an image; string image URLs should be absolute.
    image = schema_data.get("image")
    if not image:
        flag(
            "naver_image",
            "high",
            "Missing product image (required for Naver Shopping)",
            "Add at least one high-quality product image URL.",
        )
    elif isinstance(image, str) and not image.startswith("http"):
        flag(
            "naver_image_url",
            "medium",
            "Product image URL is relative (should be absolute)",
            "Use absolute URLs for product images.",
        )

    # Naver requires price in KRW.
    offers = schema_data.get("offers")
    if offers:
        for offer in (offers if isinstance(offers, list) else [offers]):
            if not isinstance(offer, dict):
                continue
            currency = offer.get("priceCurrency", "")
            if currency and currency.upper() != "KRW":
                flag(
                    "naver_currency",
                    "medium",
                    f"Price currency is {currency}, not KRW",
                    "For Naver Shopping, provide price in KRW.",
                )

    # Brand/manufacturer feeds Naver Shopping filters.
    if not schema_data.get("brand") and not schema_data.get("manufacturer"):
        flag(
            "naver_brand",
            "low",
            "Missing brand/manufacturer (helpful for Naver Shopping filters)",
            "Add brand or manufacturer property.",
        )

    return issues
|
|
|
|
# ------------------------------------------------------------------
|
|
# Orchestrator
|
|
# ------------------------------------------------------------------
|
|
|
|
async def check(
    self,
    urls: list[str] | None = None,
    sitemap_url: str | None = None,
    sample_size: int = 50,
) -> SchemaCheckResult:
    """Run schema validation on URLs or sitemap.

    Args:
        urls: Explicit page URLs to validate.
        sitemap_url: Sitemap to sample product URLs from (combined with urls).
        sample_size: Cap on the total number of URLs checked.

    Returns:
        SchemaCheckResult with per-schema reports, most-common error/warning
        lists, Naver Shopping issues, and an overall 0-100 score.
    """
    result = SchemaCheckResult(timestamp=datetime.now().isoformat())
    target_urls: list[str] = []

    async with aiohttp.ClientSession() as session:
        if sitemap_url:
            # Fetch URLs from sitemap
            target_urls = await self._urls_from_sitemap(session, sitemap_url, sample_size)
        if urls:
            target_urls.extend(urls)

        # De-duplicate, then cap. NOTE(review): set() loses input order, so
        # which URLs survive the cap is non-deterministic across runs.
        target_urls = list(set(target_urls))[:sample_size]
        result.urls_checked = len(target_urls)
        self.logger.info(f"Checking {len(target_urls)} URLs for Product schema")

        # Frequency counters used to surface the most common problems.
        error_counter: dict[str, int] = {}
        warning_counter: dict[str, int] = {}

        for url in target_urls:
            html = await self._fetch_page(session, url)
            if not html:
                # Unfetchable pages are counted as having no schema.
                result.pages_without_schema += 1
                continue

            schemas = self.extract_schemas(html, url)
            product_schemas = [
                s for s in schemas
                if self._get_schema_type(s) in ("Product", "ProductGroup")
            ]
            breadcrumb_schemas = [
                s for s in schemas
                if self._get_schema_type(s) == "BreadcrumbList"
            ]

            if not product_schemas:
                result.pages_without_schema += 1
                continue

            result.pages_with_schema += 1

            for ps_data in product_schemas:
                ps = self.validate_product_schema(ps_data, url)
                result.schemas.append(asdict(ps))

                # Tally message frequency across all schemas.
                for err in ps.errors:
                    error_counter[err] = error_counter.get(err, 0) + 1
                for warn in ps.warnings:
                    warning_counter[warn] = warning_counter.get(warn, 0) + 1

                # Naver Shopping checks
                naver_issues = self.check_naver_shopping_requirements(ps_data, url)
                result.naver_shopping_issues.extend(naver_issues)

            # Validate breadcrumbs (errors only feed the common-error tally).
            for bc_data in breadcrumb_schemas:
                bc_result = self.validate_breadcrumb(bc_data)
                for err in bc_result["errors"]:
                    error_counter[err] = error_counter.get(err, 0) + 1

    # Aggregate the 20 most frequent errors/warnings (most frequent first).
    result.common_errors = sorted(
        error_counter.keys(),
        key=lambda k: error_counter[k],
        reverse=True,
    )[:20]
    result.common_warnings = sorted(
        warning_counter.keys(),
        key=lambda k: warning_counter[k],
        reverse=True,
    )[:20]

    result.calculate_score()
    return result
|
|
|
|
async def _urls_from_sitemap(
    self,
    session: aiohttp.ClientSession,
    sitemap_url: str,
    limit: int,
) -> list[str]:
    """Fetch product URLs from sitemap.

    Handles both sitemap index files (recursing into at most the first 3
    child sitemaps) and plain <urlset> files. Returns at most ``limit``
    URLs; any fetch/parse failure is logged and whatever was collected so
    far is returned.
    """
    urls: list[str] = []
    try:
        async with session.get(sitemap_url, headers=self.headers,
                               timeout=self.timeout, ssl=False) as resp:
            if resp.status != 200:
                return urls
            text = await resp.text(errors="replace")
            # lxml-xml keeps tag case, which matters for sitemap elements.
            soup = BeautifulSoup(text, "lxml-xml")

            # Handle sitemap index
            sitemapindex = soup.find_all("sitemap")
            if sitemapindex:
                for sm in sitemapindex[:3]:
                    loc = sm.find("loc")
                    if loc:
                        # Recurse into the child sitemap with the same limit.
                        child_urls = await self._urls_from_sitemap(session, loc.text.strip(), limit)
                        urls.extend(child_urls)
                        if len(urls) >= limit:
                            break
            else:
                # Plain <urlset>: collect <loc> entries up to the limit.
                for tag in soup.find_all("url"):
                    loc = tag.find("loc")
                    if loc:
                        urls.append(loc.text.strip())
                        if len(urls) >= limit:
                            break
    except Exception as exc:
        # Best-effort: a broken sitemap should not abort the whole check.
        self.logger.warning(f"Sitemap parse failed: {exc}")

    return urls[:limit]
|
|
|
|
@staticmethod
|
|
def _get_schema_type(schema: dict) -> str:
|
|
"""Get the @type from a schema dict, handling various formats."""
|
|
schema_type = schema.get("@type", "")
|
|
if isinstance(schema_type, list):
|
|
return schema_type[0] if schema_type else ""
|
|
return str(schema_type)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CLI output helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def print_rich_report(result: SchemaCheckResult) -> None:
    """Print a rich-formatted report to the console."""

    def _tier(value: float, good: float, ok: float) -> str:
        """Map a metric to a traffic-light color."""
        if value >= good:
            return "green"
        if value >= ok:
            return "yellow"
        return "red"

    console.print(f"\n[bold cyan]Product Schema Validation Report[/bold cyan]")
    console.print(f"Timestamp: {result.timestamp}")
    console.print(f"URLs checked: {result.urls_checked}")

    # Coverage: share of checked pages that carried a Product schema.
    checked = result.urls_checked
    coverage_pct = (result.pages_with_schema / max(checked, 1)) * 100
    cov_color = _tier(coverage_pct, 90, 50)
    console.print(
        f"Schema coverage: [{cov_color}]{coverage_pct:.0f}%[/{cov_color}] "
        f"({result.pages_with_schema}/{checked})"
    )

    # Overall score.
    score_color = _tier(result.score, 80, 50)
    console.print(f"[bold {score_color}]Score: {result.score}/100[/bold {score_color}]")

    # Validity summary table.
    valid_count = sum(1 for s in result.schemas if s.get("is_valid"))
    eligible_count = sum(1 for s in result.schemas if s.get("rich_result_eligible"))

    summary = Table(title="Schema Summary")
    summary.add_column("Metric", style="bold")
    summary.add_column("Value", justify="right")
    summary.add_row("Total schemas found", str(len(result.schemas)))
    summary.add_row("Valid schemas", str(valid_count))
    summary.add_row("Rich result eligible", str(eligible_count))
    summary.add_row("Pages without schema", str(result.pages_without_schema))
    console.print(summary)

    # Top errors / warnings (already sorted by frequency upstream).
    if result.common_errors:
        console.print(f"\n[bold red]Common Errors ({len(result.common_errors)}):[/bold red]")
        for message in result.common_errors[:10]:
            console.print(f" [red]-[/red] {message}")

    if result.common_warnings:
        console.print(f"\n[bold yellow]Common Warnings ({len(result.common_warnings)}):[/bold yellow]")
        for message in result.common_warnings[:10]:
            console.print(f" [yellow]-[/yellow] {message}")

    # Naver Shopping issues, de-duplicated by (type, message).
    if result.naver_shopping_issues:
        console.print(f"\n[bold magenta]Naver Shopping Issues ({len(result.naver_shopping_issues)}):[/bold magenta]")
        reported: set[str] = set()
        for issue in result.naver_shopping_issues:
            dedupe_key = f"{issue['type']}:{issue['message']}"
            if dedupe_key in reported:
                continue
            reported.add(dedupe_key)
            console.print(f" [{issue.get('severity', 'medium')}] {issue['message']}")
            console.print(f" [dim]{issue['recommendation']}[/dim]")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def main() -> None:
    """CLI entry point: parse arguments, run the checker, emit the report."""
    parser = argparse.ArgumentParser(
        description="Product Schema Checker - Validate e-commerce structured data",
    )
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("--url", nargs="+", help="Product page URL(s) to validate")
    source.add_argument("--sitemap", help="Sitemap URL to fetch product pages from")
    parser.add_argument(
        "--sample",
        type=int,
        default=50,
        help="Max URLs to check from sitemap (default: 50)",
    )
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument("--output", type=str, help="Save output to file")
    args = parser.parse_args()

    checker = ProductSchemaChecker()
    result = asyncio.run(
        checker.check(
            urls=args.url,
            sitemap_url=args.sitemap,
            sample_size=args.sample,
        )
    )

    def _as_json() -> str:
        """Serialize the result; default=str covers non-JSON-native values."""
        return json.dumps(asdict(result), indent=2, ensure_ascii=False, default=str)

    if args.json:
        payload = _as_json()
        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(payload)
            console.print(f"[green]Results saved to {args.output}[/green]")
        else:
            print(payload)
    else:
        print_rich_report(result)
        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(_as_json())
            console.print(f"\n[green]JSON results also saved to {args.output}[/green]")

    checker.print_stats()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|