"""
E-Commerce SEO Auditor
======================
Purpose: Audit product pages, category taxonomy, duplicate content,
         pagination SEO, and Korean marketplace presence.
Python: 3.10+
"""

import argparse
import asyncio
import json
import logging
import re
import sys
from dataclasses import asdict, dataclass, field
from datetime import datetime
from typing import Any
from urllib.parse import parse_qs, urljoin, urlparse

import aiohttp
from bs4 import BeautifulSoup
from rich.console import Console
from rich.table import Table

from base_client import BaseAsyncClient, config

logger = logging.getLogger(__name__)
console = Console()


# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------

@dataclass
class ProductPageIssue:
    """Single issue found during the audit (product page or site level)."""

    url: str
    # Free-form category label, e.g. title, meta_desc, h1, image_alt,
    # internal_link, canonical, pagination, duplicate, breadcrumb.
    issue_type: str
    # One of: critical, high, medium, low. Used as the bucket key in
    # EcommerceAuditResult.issues.
    severity: str
    message: str
    recommendation: str


@dataclass
class CategoryNode:
    """A node in the category taxonomy tree."""

    url: str
    name: str
    depth: int            # number of non-empty path segments in ``url``
    children_count: int   # category-looking links found on the page
    has_breadcrumb: bool


@dataclass
class DuplicateGroup:
    """Group of duplicate or near-duplicate product URLs."""

    canonical_url: str
    duplicate_urls: list[str]
    # e.g. parameter_variant, product_variant, pagination_missing_canonical
    reason: str


@dataclass
class MarketplacePresence:
    """Presence record for a Korean marketplace."""

    # Human-readable platform name, e.g. "Naver Smart Store", "Coupang".
    platform: str
    found: bool
    url: str | None = None
    product_count: int = 0


@dataclass
class EcommerceAuditResult:
    """Complete e-commerce SEO audit result."""

    url: str
    product_pages_audited: int = 0
    # Issues serialised with ``asdict`` and bucketed by severity.
    issues: dict[str, list[dict]] = field(default_factory=lambda: {
        "critical": [],
        "high": [],
        "medium": [],
        "low": [],
    })
    category_structure: dict[str, Any] = field(default_factory=dict)
    duplicate_groups: list[dict] = field(default_factory=list)
    pagination_issues: list[dict] = field(default_factory=list)
    korean_marketplaces: dict[str, dict] = field(default_factory=dict)
    naver_smart_store: dict[str, Any] = field(default_factory=dict)
    score: int = 0
    timestamp: str = ""

    def add_issue(self, issue: ProductPageIssue) -> None:
        """Append ``issue`` (as a plain dict) to the bucket for its severity.

        Raises:
            KeyError: if ``issue.severity`` is not one of the four known
                severity buckets.
        """
        self.issues[issue.severity].append(asdict(issue))

    def calculate_score(self) -> int:
        """Compute, store and return a 0-100 score from issue counts.

        Each issue subtracts a severity-dependent penalty from 100; the
        result is clamped at 0.
        """
        penalties = {
            "critical": 15,
            "high": 8,
            "medium": 3,
            "low": 1,
        }
        total_penalty = sum(
            len(items) * penalties[sev]
            for sev, items in self.issues.items()
        )
        self.score = max(0, 100 - total_penalty)
        return self.score


# ---------------------------------------------------------------------------
# Product URL pattern helpers
# ---------------------------------------------------------------------------

PRODUCT_URL_PATTERNS = [
    r"/product[s]?/",
    r"/item[s]?/",
    r"/p/",
    r"/dp/",
    r"/goods/",
    r"/shop/",
    r"/detail/",
    # ``[?&]`` so the parameter is recognised anywhere in the query string,
    # not only when it happens to be the first parameter.
    r"[?&]product_id=",
    r"[?&]item_id=",
    r"[?&]goodsno=",
]

CATEGORY_URL_PATTERNS = [
    r"/category/",
    r"/categories/",
    r"/collections?/",
    r"/c/",
    r"/department/",
    r"/browse/",
]

FACETED_NAV_PARAMS = [
    "color", "size", "sort", "order", "filter", "brand",
    "price_min", "price_max", "page", "per_page", "view",
    "material", "rating", "availability",
]


def is_product_url(url: str) -> bool:
    """Return True if ``url`` matches any PRODUCT_URL_PATTERNS entry (case-insensitive)."""
    return any(re.search(pat, url, re.IGNORECASE) for pat in PRODUCT_URL_PATTERNS)


def is_category_url(url: str) -> bool:
    """Return True if ``url`` matches any CATEGORY_URL_PATTERNS entry (case-insensitive)."""
    return any(re.search(pat, url, re.IGNORECASE) for pat in CATEGORY_URL_PATTERNS)


def get_faceted_params(url: str) -> dict[str, list[str]]:
    """Extract faceted-navigation parameters from the query string of ``url``.

    Returns:
        Mapping of parameter name (original casing) to its list of values,
        restricted to names whose lowercase form is in FACETED_NAV_PARAMS.
        Parameters with blank values are dropped by ``parse_qs``.
    """
    query = urlparse(url).query
    return {
        name: values
        for name, values in parse_qs(query).items()
        if name.lower() in FACETED_NAV_PARAMS
    }


# ---------------------------------------------------------------------------
# Main auditor
# ---------------------------------------------------------------------------

class EcommerceAuditor(BaseAsyncClient):
    """E-commerce SEO auditor with product, category, and marketplace checks.

    NOTE(review): ``self.logger`` and ``print_stats`` are not defined in this
    class; presumably they are provided by ``BaseAsyncClient`` - confirm.
    """

    # Substrings (matched case-insensitively against src + CSS classes) that
    # mark an <img> as site chrome rather than a product image.
    _NON_PRODUCT_IMAGE_HINTS = (
        "logo", "icon", "badge", "banner", "sprite", "pixel", "tracking",
    )
    # Alt texts that carry no product information.
    _GENERIC_ALT_TEXTS = frozenset({
        "image", "photo", "product", "picture", "img", "product image",
        "상품 이미지", "이미지", "사진",
    })
    # Trailing colour/size suffix on a product slug, e.g. "/product/123-red".
    _VARIANT_SUFFIX_RE = re.compile(
        r"[-_](red|blue|green|black|white|small|medium|large|xs|s|m|l|xl|xxl)$",
        re.IGNORECASE,
    )
    _SMART_STORE_HOSTS = ("smartstore.naver.com", "brand.naver.com")
    # Only the first few child sitemaps of a sitemap index are followed.
    _MAX_CHILD_SITEMAPS = 5

    def __init__(
        self,
        max_concurrent: int = 10,
        requests_per_second: float = 5.0,
        timeout: int = 30,
    ):
        """Configure rate limits (forwarded to the base class), timeout and headers.

        Args:
            max_concurrent: forwarded to ``BaseAsyncClient``.
            requests_per_second: forwarded to ``BaseAsyncClient``.
            timeout: total per-request timeout in seconds.
        """
        super().__init__(max_concurrent=max_concurrent, requests_per_second=requests_per_second)
        self.timeout = aiohttp.ClientTimeout(total=timeout)
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (compatible; EcommerceSEOBot/1.0; "
                "+https://ourdigital.org)"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
        }
        # sitemap URL -> URLs it listed; avoids re-downloading the same
        # sitemap for each audit phase.
        self._sitemap_cache: dict[str, list[str]] = {}

    # ------------------------------------------------------------------
    # Page fetching
    # ------------------------------------------------------------------

    async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> tuple[str, str]:
        """Fetch a page and return ``(final_url, html)``.

        Best-effort: any failure (network error, timeout, HTTP status >= 400)
        is logged and reported as an empty ``html`` string instead of raising.
        """
        try:
            # NOTE(review): ssl=False disables certificate verification;
            # presumably intentional so that sites with broken certificates
            # can still be audited - confirm.
            async with session.get(
                url,
                headers=self.headers,
                timeout=self.timeout,
                allow_redirects=True,
                ssl=False,
            ) as resp:
                if resp.status >= 400:
                    # An error page is not the page we meant to audit.
                    self.logger.warning(f"Failed to fetch {url}: HTTP {resp.status}")
                    return str(resp.url), ""
                html = await resp.text(errors="replace")
                return str(resp.url), html
        except Exception as exc:
            self.logger.warning(f"Failed to fetch {url}: {exc}")
            return url, ""

    # ------------------------------------------------------------------
    # Product page discovery
    # ------------------------------------------------------------------

    async def get_product_pages(self, domain: str, sample: int = 50) -> list[dict]:
        """Discover product page candidates from the site's XML sitemap.

        Sitemap URLs matching ``is_product_url`` are preferred; when none
        match, any URL with at least two path segments is used instead.

        Args:
            domain: site origin or bare host name.
            sample: maximum number of pages to return; negative values are
                treated as 0.

        Returns:
            List of dicts with keys ``url``, ``traffic`` and ``keywords``.
            ``traffic`` and ``keywords`` are always 0 here (no traffic data
            source is queried by this method).
        """
        self.logger.info(f"Discovering product pages for {domain} (sample={sample})")

        sitemap_urls = await self._fetch_sitemap_urls(domain)
        product_urls = [u for u in sitemap_urls if is_product_url(u)]
        if not product_urls:
            # Broader heuristic: any URL with 2+ path segments.
            product_urls = [
                u for u in sitemap_urls
                if len(urlparse(u).path.strip("/").split("/")) >= 2
            ]

        limit = max(sample, 0)
        pages = [
            {"url": u, "traffic": 0, "keywords": 0}
            for u in product_urls[:limit]
        ]
        self.logger.info(f"Found {len(pages)} product page candidates")
        return pages

    @staticmethod
    def _extract_locs(soup: BeautifulSoup, tag_name: str) -> list[str]:
        """Return the non-empty ``<loc>`` text of every ``<tag_name>`` element."""
        locs: list[str] = []
        for tag in soup.find_all(tag_name):
            loc = tag.find("loc")
            if loc is None:
                continue
            text = loc.text.strip()
            if text:
                locs.append(text)
        return locs

    async def _fetch_sitemap_urls(self, domain: str) -> list[str]:
        """Fetch page URLs from ``/sitemap.xml`` of ``domain``.

        A sitemap index is followed one level deep for its first
        ``_MAX_CHILD_SITEMAPS`` children. Non-empty results are cached per
        sitemap URL for the lifetime of this auditor. Failures are logged
        and yield an empty (or partial) list.
        """
        base = f"https://{domain}" if not domain.startswith("http") else domain
        parsed = urlparse(base)
        sitemap_url = f"{parsed.scheme}://{parsed.netloc}/sitemap.xml"

        cache = self.__dict__.setdefault("_sitemap_cache", {})
        if sitemap_url in cache:
            # Copy so callers can never mutate the cached list.
            return list(cache[sitemap_url])

        urls: list[str] = []
        async with aiohttp.ClientSession() as session:
            try:
                text = ""
                async with session.get(
                    sitemap_url, headers=self.headers, timeout=self.timeout, ssl=False,
                ) as resp:
                    if resp.status == 200:
                        text = await resp.text(errors="replace")
                if text:
                    soup = BeautifulSoup(text, "lxml-xml")
                    child_sitemaps = self._extract_locs(soup, "sitemap")
                    if child_sitemaps:
                        # Sitemap index: pull URLs from the child sitemaps.
                        for child_url in child_sitemaps[: self._MAX_CHILD_SITEMAPS]:
                            urls.extend(await self._parse_sitemap(session, child_url))
                    else:
                        urls = self._extract_locs(soup, "url")
            except Exception as exc:
                self.logger.warning(f"Sitemap fetch failed: {exc}")

        if urls:
            cache[sitemap_url] = list(urls)
        return urls

    async def _parse_sitemap(self, session: aiohttp.ClientSession, url: str) -> list[str]:
        """Download a single sitemap XML file and return the URLs it lists."""
        urls: list[str] = []
        try:
            async with session.get(
                url, headers=self.headers, timeout=self.timeout, ssl=False,
            ) as resp:
                if resp.status == 200:
                    text = await resp.text(errors="replace")
                    urls = self._extract_locs(BeautifulSoup(text, "lxml-xml"), "url")
        except Exception as exc:
            self.logger.warning(f"Failed to parse sitemap {url}: {exc}")
        return urls

    # ------------------------------------------------------------------
    # Product page audit
    # ------------------------------------------------------------------

    async def audit_product_page(
        self,
        session: aiohttp.ClientSession,
        page_url: str,
    ) -> list[ProductPageIssue]:
        """Audit a single product page for SEO issues.

        Checks title, meta description, H1, image alt text, canonical tag,
        internal links and Open Graph tags. A page that cannot be fetched
        yields a single critical ``accessibility`` issue.
        """
        _, html = await self._fetch_page(session, page_url)
        if not html:
            return [ProductPageIssue(
                url=page_url,
                issue_type="accessibility",
                severity="critical",
                message="Page returned empty or could not be fetched",
                recommendation="Verify the URL is accessible and returns valid HTML.",
            )]

        soup = BeautifulSoup(html, "lxml")
        checks = (
            self._check_title,
            self._check_meta_description,
            self._check_h1,
            self._check_image_alts,
            self._check_canonical,
            self._check_internal_links,
            self._check_open_graph,
        )
        issues: list[ProductPageIssue] = []
        for check in checks:
            issues.extend(check(soup, page_url))
        return issues

    def _check_title(self, soup: BeautifulSoup, page_url: str) -> list[ProductPageIssue]:
        """Flag a missing, too long (>60) or too short (<15) <title>."""
        title_tag = soup.find("title")
        title_text = title_tag.get_text(strip=True) if title_tag else ""
        if not title_text:
            return [ProductPageIssue(
                url=page_url,
                issue_type="title",
                severity="critical",
                message="Missing <title> tag",
                recommendation="Add a unique title containing the product name (under 60 characters).",
            )]
        if len(title_text) > 60:
            return [ProductPageIssue(
                url=page_url,
                issue_type="title",
                severity="medium",
                message=f"Title too long ({len(title_text)} chars): {title_text[:80]}...",
                recommendation="Shorten title to under 60 characters for full SERP display.",
            )]
        if len(title_text) < 15:
            return [ProductPageIssue(
                url=page_url,
                issue_type="title",
                severity="medium",
                message=f"Title too short ({len(title_text)} chars): {title_text}",
                recommendation="Expand title with product name, key feature, and brand.",
            )]
        return []

    def _check_meta_description(self, soup: BeautifulSoup, page_url: str) -> list[ProductPageIssue]:
        """Flag a missing or over-long (>155) meta description."""
        # Anchored so that e.g. name="twitter:description" is not mistaken
        # for the page's meta description.
        tag = soup.find("meta", attrs={"name": re.compile(r"^description$", re.I)})
        meta_desc = (tag.get("content") or "").strip() if tag else ""
        if not meta_desc:
            return [ProductPageIssue(
                url=page_url,
                issue_type="meta_desc",
                severity="high",
                message="Missing meta description",
                recommendation="Add meta description with product features and price info (under 155 chars).",
            )]
        if len(meta_desc) > 155:
            return [ProductPageIssue(
                url=page_url,
                issue_type="meta_desc",
                severity="low",
                message=f"Meta description too long ({len(meta_desc)} chars)",
                recommendation="Trim to under 155 characters for full SERP display.",
            )]
        return []

    def _check_h1(self, soup: BeautifulSoup, page_url: str) -> list[ProductPageIssue]:
        """Flag a missing H1 or more than one H1."""
        h1_tags = soup.find_all("h1")
        if not h1_tags:
            return [ProductPageIssue(
                url=page_url,
                issue_type="h1",
                severity="high",
                message="Missing H1 tag on product page",
                recommendation="Add a single H1 with the product name.",
            )]
        if len(h1_tags) > 1:
            return [ProductPageIssue(
                url=page_url,
                issue_type="h1",
                severity="medium",
                message=f"Multiple H1 tags found ({len(h1_tags)})",
                recommendation="Use a single H1 for the product name; use H2/H3 for subsections.",
            )]
        return []

    def _check_image_alts(self, soup: BeautifulSoup, page_url: str) -> list[ProductPageIssue]:
        """Flag product images with missing or generic alt text."""
        alts: list[str] = []
        for img in soup.find_all("img"):
            src = img.get("src")
            if not src:
                continue
            # Look at the src and *all* CSS classes, case-insensitively, to
            # decide whether this is chrome (logo, icon, ...) to be skipped.
            classes = img.get("class") or []
            if isinstance(classes, str):
                classes = [classes]
            haystack = " ".join([src, *classes]).lower()
            if any(hint in haystack for hint in self._NON_PRODUCT_IMAGE_HINTS):
                continue
            alts.append((img.get("alt") or "").strip())

        issues: list[ProductPageIssue] = []
        missing_count = sum(1 for alt in alts if not alt)
        if missing_count:
            issues.append(ProductPageIssue(
                url=page_url,
                issue_type="image_alt",
                severity="high",
                message=f"{missing_count} product image(s) missing alt text",
                recommendation="Add descriptive alt text with product name to all product images.",
            ))
        generic_count = sum(1 for alt in alts if alt.lower() in self._GENERIC_ALT_TEXTS)
        if generic_count:
            issues.append(ProductPageIssue(
                url=page_url,
                issue_type="image_alt",
                severity="medium",
                message=f"{generic_count} image(s) with generic alt text",
                recommendation="Replace generic alt text with specific product descriptions.",
            ))
        return issues

    def _check_canonical(self, soup: BeautifulSoup, page_url: str) -> list[ProductPageIssue]:
        """Flag a missing/empty canonical or one pointing at another URL."""
        canonical = soup.find("link", attrs={"rel": "canonical"})
        if not canonical:
            return [ProductPageIssue(
                url=page_url,
                issue_type="canonical",
                severity="high",
                message="Missing canonical tag on product page",
                recommendation="Add <link rel='canonical'> pointing to the preferred product URL.",
            )]

        canonical_href = (canonical.get("href") or "").strip()
        if not canonical_href:
            # A canonical tag without a target is as good as no tag at all.
            return [ProductPageIssue(
                url=page_url,
                issue_type="canonical",
                severity="high",
                message="Canonical tag has an empty href",
                recommendation="Add <link rel='canonical'> pointing to the preferred product URL.",
            )]

        # Resolve relative hrefs first so "/product/1" is not reported as
        # different from "https://host/product/1"; ignore trailing slashes.
        resolved = urljoin(page_url, canonical_href)
        if resolved.rstrip("/") != page_url.rstrip("/"):
            return [ProductPageIssue(
                url=page_url,
                issue_type="canonical",
                severity="medium",
                message=f"Canonical points to different URL: {canonical_href}",
                recommendation="Verify canonical is correct; ensure product variants point to the main product.",
            )]
        return []

    def _check_internal_links(self, soup: BeautifulSoup, page_url: str) -> list[ProductPageIssue]:
        """Flag pages with fewer than three distinct same-host links."""
        page_host = urlparse(page_url).netloc
        internal_links: set[str] = set()
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"].strip()
            # Fragment-only links ("#reviews") stay on this page and are not
            # internal links in the SEO sense.
            if not href or href.startswith("#"):
                continue
            full_url = urljoin(page_url, href)
            if urlparse(full_url).netloc == page_host:
                internal_links.add(full_url)

        if len(internal_links) < 3:
            return [ProductPageIssue(
                url=page_url,
                issue_type="internal_link",
                severity="medium",
                message=f"Only {len(internal_links)} internal links found",
                recommendation="Add related product links, category breadcrumbs, and cross-sell sections.",
            )]
        return []

    def _check_open_graph(self, soup: BeautifulSoup, page_url: str) -> list[ProductPageIssue]:
        """Flag pages missing og:title or og:image."""
        og_title = soup.find("meta", attrs={"property": "og:title"})
        og_image = soup.find("meta", attrs={"property": "og:image"})
        if og_title and og_image:
            return []
        return [ProductPageIssue(
            url=page_url,
            issue_type="social_meta",
            severity="low",
            message="Missing Open Graph tags (og:title or og:image)",
            recommendation="Add OG tags for better social sharing of product pages.",
        )]

    # ------------------------------------------------------------------
    # Category taxonomy analysis
    # ------------------------------------------------------------------

    async def analyze_category_taxonomy(
        self,
        session: aiohttp.ClientSession,
        base_url: str,
        max_categories: int = 50,
    ) -> dict[str, Any]:
        """Analyze category page structure and taxonomy depth.

        Category URLs come from the sitemap, or from links on the homepage
        when the sitemap has none. Each category page is fetched and
        inspected for breadcrumbs, child category links and (for URLs that
        carry faceted parameters) indexability.

        Returns:
            Dict with keys ``categories_found``, ``max_depth``,
            ``avg_depth``, ``breadcrumbs_present``, ``breadcrumbs_missing``,
            ``faceted_nav_issues`` and ``nodes`` (serialised CategoryNode).
        """
        result: dict[str, Any] = {
            "categories_found": 0,
            "max_depth": 0,
            "avg_depth": 0.0,
            "breadcrumbs_present": 0,
            "breadcrumbs_missing": 0,
            "faceted_nav_issues": [],
            "nodes": [],
        }

        # Discover category URLs from sitemap
        sitemap_urls = await self._fetch_sitemap_urls(base_url)
        category_urls = [u for u in sitemap_urls if is_category_url(u)][:max_categories]

        if not category_urls:
            # Try crawling homepage for category links
            _, html = await self._fetch_page(session, base_url)
            if html:
                home_soup = BeautifulSoup(html, "lxml")
                for a_tag in home_soup.find_all("a", href=True):
                    if len(category_urls) >= max_categories:
                        break
                    full_url = urljoin(base_url, a_tag["href"])
                    if is_category_url(full_url) and full_url not in category_urls:
                        category_urls.append(full_url)

        depths: list[int] = []
        for cat_url in category_urls:
            _, html = await self._fetch_page(session, cat_url)
            if not html:
                continue

            soup = BeautifulSoup(html, "lxml")
            path_parts = [p for p in urlparse(cat_url).path.strip("/").split("/") if p]
            depth = len(path_parts)
            depths.append(depth)

            # Breadcrumb: ARIA nav, a breadcrumb-ish class, or schema.org
            # BreadcrumbList in JSON-LD / microdata.
            has_breadcrumb = bool(
                soup.find("nav", attrs={"aria-label": re.compile(r"breadcrumb", re.I)})
                or soup.find(attrs={"class": re.compile(r"breadcrumb", re.I)})
                or soup.find("script", string=re.compile(r"BreadcrumbList", re.I))
                or soup.find("ol", attrs={"itemtype": re.compile(r"BreadcrumbList", re.I)})
            )
            counter_key = "breadcrumbs_present" if has_breadcrumb else "breadcrumbs_missing"
            result[counter_key] += 1

            # Category name from H1, else last path segment.
            h1 = soup.find("h1")
            if h1:
                cat_name = h1.get_text(strip=True)
            elif path_parts:
                cat_name = path_parts[-1]
            else:
                cat_name = "unknown"

            # Count links to other category-looking URLs.
            children = 0
            for a_tag in soup.find_all("a", href=True):
                link = urljoin(cat_url, a_tag["href"])
                if is_category_url(link) and link != cat_url:
                    children += 1

            node = CategoryNode(
                url=cat_url,
                name=cat_name,
                depth=depth,
                children_count=children,
                has_breadcrumb=has_breadcrumb,
            )
            result["nodes"].append(asdict(node))

            # Faceted navigation check
            faceted = get_faceted_params(cat_url)
            if faceted:
                robots_meta = soup.find("meta", attrs={"name": re.compile(r"^robots$", re.I)})
                robots_content = (robots_meta.get("content") or "").lower() if robots_meta else ""
                canonical = soup.find("link", attrs={"rel": "canonical"})
                canonical_href = (canonical.get("href") or "").strip() if canonical else ""
                # The filtered URL competes with the base category when it
                # is indexable and either has no canonical at all or
                # canonicalises to itself.
                lacks_base_canonical = (
                    not canonical_href
                    or urljoin(cat_url, canonical_href) == cat_url
                )
                if "noindex" not in robots_content and lacks_base_canonical:
                    result["faceted_nav_issues"].append({
                        "url": cat_url,
                        "params": list(faceted.keys()),
                        "message": "Faceted URL is indexable without canonical to base category",
                        "recommendation": (
                            "Add noindex or canonical to the non-filtered category URL "
                            "to prevent duplicate content."
                        ),
                    })

        if depths:
            result["max_depth"] = max(depths)
            result["avg_depth"] = round(sum(depths) / len(depths), 1)
        result["categories_found"] = len(category_urls)
        return result

    # ------------------------------------------------------------------
    # Duplicate content detection
    # ------------------------------------------------------------------

    async def detect_duplicates(self, page_urls: list[str]) -> list[DuplicateGroup]:
        """Detect potential duplicate content among ``page_urls``.

        Two heuristics, both purely URL-based (no page is fetched):

        * ``parameter_variant``: two or more URLs sharing scheme/host/path
          that each carry faceted-navigation parameters.
        * ``product_variant``: two or more URLs whose path differs only by
          a trailing colour/size suffix (e.g. ``-red`` vs ``-blue``).

        Group order and the choice of ``canonical_url`` for product
        variants follow the order of ``page_urls``.
        """
        groups: list[DuplicateGroup] = []

        by_base: dict[str, list[str]] = {}
        for url in page_urls:
            if get_faceted_params(url):
                parsed = urlparse(url)
                base = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
                by_base.setdefault(base, []).append(url)

        groups.extend(
            DuplicateGroup(
                canonical_url=base_url,
                duplicate_urls=variants,
                reason="parameter_variant",
            )
            for base_url, variants in by_base.items()
            if len(variants) > 1
        )

        # Product variant duplicates (e.g. /product/123-red vs /product/123-blue)
        by_slug: dict[str, list[str]] = {}
        for url in page_urls:
            path = urlparse(url).path.rstrip("/")
            base_slug = self._VARIANT_SUFFIX_RE.sub("", path)
            if base_slug != path:
                by_slug.setdefault(base_slug, []).append(url)

        groups.extend(
            DuplicateGroup(
                canonical_url=variants[0],
                duplicate_urls=variants[1:],
                reason="product_variant",
            )
            for variants in by_slug.values()
            if len(variants) > 1
        )
        return groups

    # ------------------------------------------------------------------
    # Pagination SEO
    # ------------------------------------------------------------------

    @staticmethod
    def _page_number(url: str) -> str:
        """Return the page number encoded in ``url`` as a string ("1" if none).

        Looks at the ``page`` then ``p`` query parameter, then a
        ``/page/<n>`` path segment.
        """
        params = parse_qs(urlparse(url).query)
        for key in ("page", "p"):
            if key in params:
                return params[key][0]
        path_match = re.search(r"/page/(\d+)", url)
        return path_match.group(1) if path_match else "1"

    async def check_pagination_seo(
        self,
        session: aiohttp.ClientSession,
        page_url: str,
    ) -> list[ProductPageIssue]:
        """Check pagination implementation for SEO best practices.

        Returns no issues when the page cannot be fetched or contains no
        pagination links.
        """
        issues: list[ProductPageIssue] = []
        _, html = await self._fetch_page(session, page_url)
        if not html:
            return issues

        soup = BeautifulSoup(html, "lxml")

        # Look for pagination links
        has_pagination = False
        for a_tag in soup.find_all("a", href=True):
            full_url = urljoin(page_url, a_tag["href"])
            params = parse_qs(urlparse(full_url).query)
            if "page" in params or "p" in params or re.search(r"/page/\d+", full_url):
                has_pagination = True
                break
        if not has_pagination:
            return issues

        # Check rel=prev/next (deprecated by Google but still useful)
        rel_prev = soup.find("link", attrs={"rel": "prev"})
        rel_next = soup.find("link", attrs={"rel": "next"})
        if not rel_prev and not rel_next:
            issues.append(ProductPageIssue(
                url=page_url,
                issue_type="pagination",
                severity="low",
                message="No rel=prev/next links on paginated page",
                recommendation=(
                    "While Google no longer uses rel=prev/next, other engines may. "
                    "Consider adding them for broader compatibility."
                ),
            ))

        # Check canonical on paginated pages: page N (N != 1) must not
        # canonicalise to page 1.
        canonical = soup.find("link", attrs={"rel": "canonical"})
        canonical_href = (canonical.get("href") or "").strip() if canonical else ""
        if canonical_href:
            current_page_num = self._page_number(page_url)
            if current_page_num != "1" and self._page_number(canonical_href) == "1":
                issues.append(ProductPageIssue(
                    url=page_url,
                    issue_type="pagination",
                    severity="high",
                    message=f"Page {current_page_num} canonical points to page 1",
                    recommendation=(
                        "Each paginated page should self-reference its own canonical URL "
                        "to ensure all pages are indexable."
                    ),
                ))

        # Check robots noindex on paginated pages
        robots_meta = soup.find("meta", attrs={"name": "robots"})
        if robots_meta and "noindex" in (robots_meta.get("content") or "").lower():
            issues.append(ProductPageIssue(
                url=page_url,
                issue_type="pagination",
                severity="medium",
                message="Paginated page has noindex but contains product links",
                recommendation=(
                    "Ensure products on noindex pages are still discoverable "
                    "via other indexed pages or sitemap."
                ),
            ))

        return issues

    # ------------------------------------------------------------------
    # Korean marketplace presence
    # ------------------------------------------------------------------

    async def check_korean_marketplaces(
        self,
        session: aiohttp.ClientSession,
        brand_name: str,
    ) -> dict[str, MarketplacePresence]:
        """Search for brand presence on Korean marketplace platforms.

        Each marketplace's public search page is fetched for ``brand_name``
        and scanned for product/store URL patterns.

        Returns:
            Mapping of marketplace key (``naver_smart_store``, ``coupang``,
            ``gmarket``, ``11st``) to its MarketplacePresence record.
        """
        from urllib.parse import quote_plus

        # Brand names may contain spaces, '&', '#' or Hangul; encode them so
        # they cannot break or truncate the query string.
        query = quote_plus(brand_name)

        search_configs = {
            "naver_smart_store": {
                "search_url": f"https://search.shopping.naver.com/search/all?query={query}",
                "platform": "Naver Smart Store",
                "indicator_patterns": [r"smartstore\.naver\.com", r"brand\.naver\.com"],
            },
            "coupang": {
                "search_url": f"https://www.coupang.com/np/search?component=&q={query}",
                "platform": "Coupang",
                "indicator_patterns": [r"coupang\.com/vp/products/"],
            },
            "gmarket": {
                "search_url": f"https://browse.gmarket.co.kr/search?keyword={query}",
                "platform": "Gmarket",
                "indicator_patterns": [r"gmarket\.co\.kr/item/"],
            },
            "11st": {
                "search_url": f"https://search.11st.co.kr/Search.tmall?kwd={query}",
                "platform": "11번가",
                "indicator_patterns": [r"11st\.co\.kr/products/"],
            },
        }

        marketplaces: dict[str, MarketplacePresence] = {}
        for key, cfg in search_configs.items():
            presence = MarketplacePresence(platform=cfg["platform"], found=False)
            try:
                _, html = await self._fetch_page(session, cfg["search_url"])
                if html:
                    for pattern in cfg["indicator_patterns"]:
                        matches = re.findall(pattern, html)
                        if not matches:
                            continue
                        presence.found = True
                        presence.product_count = len(matches)
                        # Extract first matching URL
                        url_match = re.search(
                            rf'href=["\']?(https?://[^"\'>\s]*{pattern}[^"\'>\s]*)',
                            html,
                        )
                        if url_match:
                            presence.url = url_match.group(1)
                        break
            except Exception as exc:
                self.logger.warning(f"Marketplace check failed for {key}: {exc}")
            marketplaces[key] = presence

        return marketplaces

    # ------------------------------------------------------------------
    # Naver Smart Store optimization
    # ------------------------------------------------------------------

    async def check_naver_smart_store(
        self,
        session: aiohttp.ClientSession,
        url: str,
    ) -> dict[str, Any]:
        """Check Naver-specific SEO elements on ``url``.

        Returns:
            Dict with ``is_smart_store`` (host is a Naver Smart Store / brand
            store host), ``issues`` (dicts with type, severity, message,
            recommendation) and ``optimizations`` (dicts with type, message,
            recommendation). ``issues`` and ``optimizations`` stay empty when
            the page cannot be fetched.
        """
        result: dict[str, Any] = {
            "is_smart_store": False,
            "issues": [],
            "optimizations": [],
        }

        # Match on the host name itself (exact or sub-domain) rather than a
        # substring of netloc, which would also accept look-alike hosts.
        host = (urlparse(url).hostname or "").lower()
        is_smart_store = any(
            host == store_host or host.endswith(f".{store_host}")
            for store_host in self._SMART_STORE_HOSTS
        )
        result["is_smart_store"] = is_smart_store

        _, html = await self._fetch_page(session, url)
        if not html:
            return result

        soup = BeautifulSoup(html, "lxml")

        # Check Naver-specific meta tags
        if not soup.find("meta", attrs={"name": "naver-site-verification"}):
            result["issues"].append({
                "type": "naver_verification",
                "severity": "medium",
                "message": "Missing naver-site-verification meta tag",
                "recommendation": "Add Naver Search Advisor verification tag.",
            })

        # Check for Product structured data (JSON-LD)
        if not soup.find("script", string=re.compile(r'"@type"\s*:\s*"Product"')):
            result["issues"].append({
                "type": "naver_schema",
                "severity": "high",
                "message": "Missing Product schema for Naver Shopping",
                "recommendation": "Add Product JSON-LD with Korean product names and descriptions.",
            })

        # Check Korean content optimization: share of Hangul syllables in
        # the page's visible text.
        body_text = soup.get_text(separator=" ", strip=True)
        if body_text:
            korean_chars = len(re.findall(r"[\uac00-\ud7af]", body_text))
            korean_ratio = korean_chars / len(body_text)
            if korean_ratio < 0.3 and is_smart_store:
                result["issues"].append({
                    "type": "korean_content",
                    "severity": "medium",
                    "message": f"Low Korean content ratio ({korean_ratio:.0%}) for Korean marketplace",
                    "recommendation": "Increase Korean language content for Naver search visibility.",
                })

        # Smart Store specific: detail images without an accompanying HTML
        # description container suggest image-only product details.
        detail_images = soup.find_all("img", attrs={"class": re.compile(r"detail|product", re.I)})
        text_container = soup.find(
            "div", attrs={"class": re.compile(r"product.*(desc|detail|content)", re.I)},
        )
        if detail_images and not text_container:
            result["optimizations"].append({
                "type": "detail_text",
                "message": "Product details appear to be image-only",
                "recommendation": (
                    "Add HTML text product descriptions alongside images "
                    "for Naver search indexing."
                ),
            })

        return result

    # ------------------------------------------------------------------
    # Orchestrator
    # ------------------------------------------------------------------

    async def audit(
        self,
        url: str,
        scope: str = "all",
        sample: int = 50,
        check_marketplaces: bool = False,
    ) -> EcommerceAuditResult:
        """Run the e-commerce SEO audit and return the scored result.

        Args:
            url: target site (scheme optional; https is assumed).
            scope: ``"all"``, ``"products"`` or ``"categories"``.
            sample: maximum number of product pages to audit.
            check_marketplaces: also probe Korean marketplaces and run the
                Naver Smart Store checks.
        """
        # Timezone-aware local timestamp.
        result = EcommerceAuditResult(
            url=url,
            timestamp=datetime.now().astimezone().isoformat(),
        )
        parsed = urlparse(url if url.startswith("http") else f"https://{url}")
        domain = f"{parsed.scheme}://{parsed.netloc}"

        async with aiohttp.ClientSession() as session:
            # --- Product page audit ---
            if scope in ("all", "products"):
                self.logger.info("=== Product Page Audit ===")
                pages = await self.get_product_pages(domain, sample=sample)
                result.product_pages_audited = len(pages)

                for page_info in pages:
                    for issue in await self.audit_product_page(session, page_info["url"]):
                        result.add_issue(issue)

                # Duplicate detection. De-duplicate while keeping first-seen
                # order so the reported groups are reproducible between runs.
                sitemap_urls = await self._fetch_sitemap_urls(domain)
                all_urls = [p["url"] for p in pages] + sitemap_urls
                unique_urls = list(dict.fromkeys(all_urls))
                dup_groups = await self.detect_duplicates(unique_urls)
                result.duplicate_groups = [asdict(dg) for dg in dup_groups]
                for dg in dup_groups:
                    result.add_issue(ProductPageIssue(
                        url=dg.canonical_url,
                        issue_type="duplicate",
                        severity="high" if dg.reason == "parameter_variant" else "medium",
                        message=f"Duplicate group ({dg.reason}): {len(dg.duplicate_urls)} variants",
                        recommendation="Implement canonical tags or parameter handling in GSC/Naver.",
                    ))

                # Pagination check on category-like pages
                category_like = [u for u in sitemap_urls if is_category_url(u)][:10]
                for cat_url in category_like:
                    pag_issues = await self.check_pagination_seo(session, cat_url)
                    result.pagination_issues.extend(asdict(i) for i in pag_issues)
                    for issue in pag_issues:
                        result.add_issue(issue)

            # --- Category taxonomy ---
            if scope in ("all", "categories"):
                self.logger.info("=== Category Taxonomy Analysis ===")
                cat_result = await self.analyze_category_taxonomy(session, domain)
                result.category_structure = cat_result

                if cat_result.get("max_depth", 0) > 4:
                    result.add_issue(ProductPageIssue(
                        url=domain,
                        issue_type="category_depth",
                        severity="medium",
                        message=f"Category depth exceeds 4 levels (max: {cat_result['max_depth']})",
                        recommendation="Flatten category structure to 3-4 levels for better crawlability.",
                    ))

                missing = cat_result.get("breadcrumbs_missing", 0)
                if missing > 0:
                    total = cat_result.get("categories_found", 1)
                    result.add_issue(ProductPageIssue(
                        url=domain,
                        issue_type="breadcrumb",
                        severity="high" if missing > total * 0.5 else "medium",
                        message=f"{missing} category pages missing breadcrumb navigation",
                        recommendation="Add BreadcrumbList schema and visible breadcrumbs to all category pages.",
                    ))

                for fni in cat_result.get("faceted_nav_issues", []):
                    result.add_issue(ProductPageIssue(
                        url=fni["url"],
                        issue_type="faceted_nav",
                        severity="high",
                        message=fni["message"],
                        recommendation=fni["recommendation"],
                    ))

            # --- Korean marketplaces ---
            if check_marketplaces:
                self.logger.info("=== Korean Marketplace Presence ===")
                # Extract brand name from the homepage: og:site_name first,
                # then the leading part of <title>.
                _, home_html = await self._fetch_page(session, domain)
                brand_name = ""
                if home_html:
                    home_soup = BeautifulSoup(home_html, "lxml")
                    og_site = home_soup.find("meta", attrs={"property": "og:site_name"})
                    if og_site:
                        brand_name = (og_site.get("content") or "").strip()
                    if not brand_name:
                        title_tag = home_soup.find("title")
                        if title_tag:
                            title_text = title_tag.get_text(strip=True)
                            brand_name = title_text.split("|")[0].split("-")[0].strip()

                if brand_name:
                    mp_results = await self.check_korean_marketplaces(session, brand_name)
                    result.korean_marketplaces = {
                        k: asdict(v) for k, v in mp_results.items()
                    }

                # Naver Smart Store check
                naver_result = await self.check_naver_smart_store(session, domain)
                result.naver_smart_store = naver_result
                for naver_issue in naver_result.get("issues", []):
                    result.add_issue(ProductPageIssue(
                        url=domain,
                        issue_type=naver_issue["type"],
                        severity=naver_issue["severity"],
                        message=naver_issue["message"],
                        recommendation=naver_issue["recommendation"],
                    ))

        result.calculate_score()
        return result


# ---------------------------------------------------------------------------
# CLI output helpers
# ---------------------------------------------------------------------------

def print_rich_report(result: EcommerceAuditResult) -> None:
    """Print a rich-formatted report to the console.

    All audit-derived text is escaped before printing so that square
    brackets in it (e.g. the literal ``[title]`` label, or brackets inside a
    page title) are shown verbatim instead of being parsed as Rich markup.
    """
    from rich.markup import escape

    console.print("\n[bold cyan]E-Commerce SEO Audit Report[/bold cyan]")
    console.print(f"URL: {escape(result.url)}")
    console.print(f"Product Pages Audited: {result.product_pages_audited}")
    console.print(f"Timestamp: {result.timestamp}")

    # Score
    if result.score >= 80:
        score_color = "green"
    elif result.score >= 50:
        score_color = "yellow"
    else:
        score_color = "red"
    console.print(f"\n[bold {score_color}]Score: {result.score}/100[/bold {score_color}]")

    # Issues summary
    severity_colors = {"critical": "red", "high": "yellow", "medium": "cyan", "low": "dim"}
    table = Table(title="Issues Summary")
    table.add_column("Severity", style="bold")
    table.add_column("Count", justify="right")
    for sev, color in severity_colors.items():
        table.add_row(f"[{color}]{sev.upper()}[/{color}]", str(len(result.issues[sev])))
    console.print(table)

    # Top issues
    for sev in ("critical", "high"):
        sev_issues = result.issues[sev]
        if not sev_issues:
            continue
        console.print(f"\n[bold red]{sev.upper()} Issues:[/bold red]")
        for issue in sev_issues[:10]:
            label = escape(f"[{issue['issue_type']}]")
            console.print(f"  - {label} {escape(issue['message'])}")
            console.print(f"    [dim]{escape(issue['recommendation'])}[/dim]")

    # Category structure
    if result.category_structure:
        cs = result.category_structure
        console.print("\n[bold]Category Structure:[/bold]")
        console.print(f"  Categories found: {cs.get('categories_found', 0)}")
        console.print(f"  Max depth: {cs.get('max_depth', 0)}")
        console.print(f"  Breadcrumbs present: {cs.get('breadcrumbs_present', 0)}")
        console.print(f"  Breadcrumbs missing: {cs.get('breadcrumbs_missing', 0)}")

    # Duplicates
    if result.duplicate_groups:
        console.print(f"\n[bold]Duplicate Groups: {len(result.duplicate_groups)}[/bold]")
        for dg in result.duplicate_groups[:5]:
            label = escape(f"[{dg['reason']}]")
            console.print(
                f"  {label} {escape(dg['canonical_url'])} "
                f"({len(dg['duplicate_urls'])} variants)"
            )

    # Korean marketplaces
    if result.korean_marketplaces:
        console.print("\n[bold]Korean Marketplace Presence:[/bold]")
        for key, mp in result.korean_marketplaces.items():
            status = "[green]Found[/green]" if mp.get("found") else "[red]Not Found[/red]"
            console.print(f"  {escape(mp.get('platform', key))}: {status}")
            if mp.get("url"):
                console.print(f"    URL: {escape(mp['url'])}")


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main() -> None:
    """CLI entry point: parse arguments, run the audit, print/save results."""
    parser = argparse.ArgumentParser(
        description="E-Commerce SEO Auditor - Product page and marketplace audit",
    )
    parser.add_argument("--url", required=True, help="Target website URL")
    parser.add_argument(
        "--scope",
        choices=["all", "products", "categories"],
        default="all",
        help="Audit scope (default: all)",
    )
    parser.add_argument(
        "--korean-marketplaces",
        action="store_true",
        help="Check Korean marketplace presence (Coupang, Gmarket, 11번가, Naver)",
    )
    parser.add_argument(
        "--sample",
        type=int,
        default=50,
        help="Number of product pages to sample (default: 50)",
    )
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument("--output", type=str, help="Save output to file")
    args = parser.parse_args()

    if args.sample < 0:
        parser.error("--sample must be zero or a positive integer")

    auditor = EcommerceAuditor()
    result = asyncio.run(
        auditor.audit(
            url=args.url,
            scope=args.scope,
            sample=args.sample,
            check_marketplaces=args.korean_marketplaces,
        )
    )

    # JSON is needed for --json (stdout or file) and for --output alongside
    # the rich report.
    payload = None
    if args.json or args.output:
        payload = json.dumps(asdict(result), indent=2, ensure_ascii=False, default=str)

    if not args.json:
        print_rich_report(result)

    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(payload)
        if args.json:
            console.print(f"[green]Results saved to {args.output}[/green]")
        else:
            console.print(f"\n[green]JSON results also saved to {args.output}[/green]")
    elif args.json:
        print(payload)

    auditor.print_stats()


if __name__ == "__main__":
    main()