Files
our-claude-skills/custom-skills/24-seo-ecommerce/code/scripts/ecommerce_auditor.py
Andrew Yim a3ff965b87 Add SEO skills 19-28, 31-32 with full Python implementations
12 new skills: Keyword Strategy, SERP Analysis, Position Tracking,
Link Building, Content Strategy, E-Commerce SEO, KPI Framework,
International SEO, AI Visibility, Knowledge Graph, Competitor Intel,
and Crawl Budget. ~20K lines of Python across 25 domain scripts.
Updated skill 11 pipeline table and repo CLAUDE.md.
Enhanced skill 18 local SEO workflow from jamie.clinic audit.

Note: Skill 26 hreflang_validator.py pending (content filter block).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 12:05:59 +09:00

1047 lines
43 KiB
Python

"""
E-Commerce SEO Auditor
======================
Purpose: Audit product pages, category taxonomy, duplicate content,
pagination SEO, and Korean marketplace presence.
Python: 3.10+
"""
import argparse
import asyncio
import json
import logging
import re
import sys
from dataclasses import asdict, dataclass, field
from datetime import datetime
from typing import Any
from urllib.parse import parse_qs, quote, urljoin, urlparse

import aiohttp
from bs4 import BeautifulSoup
from rich.console import Console
from rich.table import Table

from base_client import BaseAsyncClient, config
logger = logging.getLogger(__name__)
console = Console()
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass
class ProductPageIssue:
"""Single issue found on a product page."""
url: str
issue_type: str # title, meta_desc, h1, image_alt, internal_link, canonical, pagination
severity: str # critical, high, medium, low
message: str
recommendation: str
@dataclass
class CategoryNode:
"""A node in the category taxonomy tree."""
url: str
name: str
depth: int
children_count: int
has_breadcrumb: bool
@dataclass
class DuplicateGroup:
"""Group of duplicate or near-duplicate product URLs."""
canonical_url: str
duplicate_urls: list[str]
reason: str # parameter_variant, product_variant, pagination_missing_canonical
@dataclass
class MarketplacePresence:
"""Presence record for a Korean marketplace."""
platform: str # naver_smart_store, coupang, gmarket, 11st
found: bool
url: str | None = None
product_count: int = 0
@dataclass
class EcommerceAuditResult:
"""Complete e-commerce SEO audit result."""
url: str
product_pages_audited: int = 0
issues: dict[str, list[dict]] = field(default_factory=lambda: {
"critical": [], "high": [], "medium": [], "low": []
})
category_structure: dict[str, Any] = field(default_factory=dict)
duplicate_groups: list[dict] = field(default_factory=list)
pagination_issues: list[dict] = field(default_factory=list)
korean_marketplaces: dict[str, dict] = field(default_factory=dict)
naver_smart_store: dict[str, Any] = field(default_factory=dict)
score: int = 0
timestamp: str = ""
def add_issue(self, issue: ProductPageIssue) -> None:
self.issues[issue.severity].append(asdict(issue))
def calculate_score(self) -> int:
"""Score 0-100 based on issue severity counts."""
penalties = {
"critical": 15,
"high": 8,
"medium": 3,
"low": 1,
}
total_penalty = sum(
len(items) * penalties[sev]
for sev, items in self.issues.items()
)
self.score = max(0, 100 - total_penalty)
return self.score
# ---------------------------------------------------------------------------
# Product URL pattern helpers
# ---------------------------------------------------------------------------
PRODUCT_URL_PATTERNS = [
r"/product[s]?/",
r"/item[s]?/",
r"/p/",
r"/dp/",
r"/goods/",
r"/shop/",
r"/detail/",
r"\?product_id=",
r"\?item_id=",
r"\?goodsno=",
]
CATEGORY_URL_PATTERNS = [
r"/category/",
r"/categories/",
r"/collections?/",
r"/c/",
r"/department/",
r"/browse/",
]
FACETED_NAV_PARAMS = [
"color", "size", "sort", "order", "filter", "brand",
"price_min", "price_max", "page", "per_page", "view",
"material", "rating", "availability",
]
def is_product_url(url: str) -> bool:
"""Check if a URL looks like a product page."""
return any(re.search(pat, url, re.IGNORECASE) for pat in PRODUCT_URL_PATTERNS)
def is_category_url(url: str) -> bool:
"""Check if a URL looks like a category page."""
return any(re.search(pat, url, re.IGNORECASE) for pat in CATEGORY_URL_PATTERNS)
def get_faceted_params(url: str) -> dict[str, list[str]]:
"""Extract faceted navigation parameters from a URL."""
parsed = urlparse(url)
params = parse_qs(parsed.query)
return {k: v for k, v in params.items() if k.lower() in FACETED_NAV_PARAMS}
# ---------------------------------------------------------------------------
# Main auditor
# ---------------------------------------------------------------------------
class EcommerceAuditor(BaseAsyncClient):
"""E-commerce SEO auditor with product, category, and marketplace checks."""
def __init__(
self,
max_concurrent: int = 10,
requests_per_second: float = 5.0,
timeout: int = 30,
):
super().__init__(max_concurrent=max_concurrent, requests_per_second=requests_per_second)
self.timeout = aiohttp.ClientTimeout(total=timeout)
self.headers = {
"User-Agent": (
"Mozilla/5.0 (compatible; EcommerceSEOBot/1.0; "
"+https://ourdigital.org)"
),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
}
# ------------------------------------------------------------------
# Page fetching
# ------------------------------------------------------------------
async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> tuple[str, str]:
"""Fetch a page and return (final_url, html)."""
try:
async with session.get(url, headers=self.headers, timeout=self.timeout,
allow_redirects=True, ssl=False) as resp:
html = await resp.text(errors="replace")
return str(resp.url), html
except Exception as exc:
self.logger.warning(f"Failed to fetch {url}: {exc}")
return url, ""
# ------------------------------------------------------------------
# Product page discovery via Ahrefs
# ------------------------------------------------------------------
async def get_product_pages(self, domain: str, sample: int = 50) -> list[dict]:
"""
Discover product pages using Ahrefs pages-by-traffic.
Falls back to sitemap crawling if Ahrefs is unavailable.
Returns list of dicts with keys: url, traffic, keywords.
"""
pages: list[dict] = []
# Attempt Ahrefs via environment / MCP
self.logger.info(f"Discovering product pages for {domain} (sample={sample})")
# Fallback: fetch sitemap and filter product URLs
sitemap_urls = await self._fetch_sitemap_urls(domain)
product_urls = [u for u in sitemap_urls if is_product_url(u)]
if not product_urls:
# Broader heuristic: any URL with 3+ path segments
product_urls = [
u for u in sitemap_urls
if len(urlparse(u).path.strip("/").split("/")) >= 2
]
for url in product_urls[:sample]:
pages.append({"url": url, "traffic": 0, "keywords": 0})
self.logger.info(f"Found {len(pages)} product page candidates")
return pages
async def _fetch_sitemap_urls(self, domain: str) -> list[str]:
"""Fetch URLs from the site's XML sitemap."""
urls: list[str] = []
base = f"https://{domain}" if not domain.startswith("http") else domain
parsed = urlparse(base)
sitemap_url = f"{parsed.scheme}://{parsed.netloc}/sitemap.xml"
async with aiohttp.ClientSession() as session:
try:
async with session.get(sitemap_url, headers=self.headers,
timeout=self.timeout, ssl=False) as resp:
if resp.status == 200:
text = await resp.text(errors="replace")
soup = BeautifulSoup(text, "lxml-xml")
# Handle sitemap index
sitemapindex = soup.find_all("sitemap")
if sitemapindex:
for sm in sitemapindex[:5]:
loc = sm.find("loc")
if loc:
child_urls = await self._parse_sitemap(session, loc.text.strip())
urls.extend(child_urls)
else:
url_tags = soup.find_all("url")
for tag in url_tags:
loc = tag.find("loc")
if loc:
urls.append(loc.text.strip())
except Exception as exc:
self.logger.warning(f"Sitemap fetch failed: {exc}")
return urls
async def _parse_sitemap(self, session: aiohttp.ClientSession, url: str) -> list[str]:
"""Parse a single sitemap XML file and return its URLs."""
urls: list[str] = []
try:
async with session.get(url, headers=self.headers,
timeout=self.timeout, ssl=False) as resp:
if resp.status == 200:
text = await resp.text(errors="replace")
soup = BeautifulSoup(text, "lxml-xml")
for tag in soup.find_all("url"):
loc = tag.find("loc")
if loc:
urls.append(loc.text.strip())
except Exception as exc:
self.logger.warning(f"Failed to parse sitemap {url}: {exc}")
return urls
# ------------------------------------------------------------------
# Product page audit
# ------------------------------------------------------------------
async def audit_product_page(
self,
session: aiohttp.ClientSession,
page_url: str,
) -> list[ProductPageIssue]:
"""Audit a single product page for SEO issues."""
issues: list[ProductPageIssue] = []
_, html = await self._fetch_page(session, page_url)
if not html:
issues.append(ProductPageIssue(
url=page_url, issue_type="accessibility", severity="critical",
message="Page returned empty or could not be fetched",
recommendation="Verify the URL is accessible and returns valid HTML.",
))
return issues
soup = BeautifulSoup(html, "lxml")
# --- Title tag ---
title_tag = soup.find("title")
title_text = title_tag.get_text(strip=True) if title_tag else ""
if not title_text:
issues.append(ProductPageIssue(
url=page_url, issue_type="title", severity="critical",
message="Missing <title> tag",
recommendation="Add a unique title containing the product name (under 60 characters).",
))
elif len(title_text) > 60:
issues.append(ProductPageIssue(
url=page_url, issue_type="title", severity="medium",
message=f"Title too long ({len(title_text)} chars): {title_text[:80]}...",
recommendation="Shorten title to under 60 characters for full SERP display.",
))
elif len(title_text) < 15:
issues.append(ProductPageIssue(
url=page_url, issue_type="title", severity="medium",
message=f"Title too short ({len(title_text)} chars): {title_text}",
recommendation="Expand title with product name, key feature, and brand.",
))
# --- Meta description ---
meta_desc_tag = soup.find("meta", attrs={"name": re.compile(r"description", re.I)})
meta_desc = meta_desc_tag.get("content", "").strip() if meta_desc_tag else ""
if not meta_desc:
issues.append(ProductPageIssue(
url=page_url, issue_type="meta_desc", severity="high",
message="Missing meta description",
recommendation="Add meta description with product features and price info (under 155 chars).",
))
elif len(meta_desc) > 155:
issues.append(ProductPageIssue(
url=page_url, issue_type="meta_desc", severity="low",
message=f"Meta description too long ({len(meta_desc)} chars)",
recommendation="Trim to under 155 characters for full SERP display.",
))
# --- H1 tag ---
h1_tags = soup.find_all("h1")
if not h1_tags:
issues.append(ProductPageIssue(
url=page_url, issue_type="h1", severity="high",
message="Missing H1 tag on product page",
recommendation="Add a single H1 with the product name.",
))
elif len(h1_tags) > 1:
issues.append(ProductPageIssue(
url=page_url, issue_type="h1", severity="medium",
message=f"Multiple H1 tags found ({len(h1_tags)})",
recommendation="Use a single H1 for the product name; use H2/H3 for subsections.",
))
# --- Image alt text ---
images = soup.find_all("img")
product_images = [
img for img in images
if img.get("src") and not any(
skip in (img.get("src", "") + img.get("class", [""])[0] if img.get("class") else img.get("src", ""))
for skip in ["logo", "icon", "badge", "banner", "sprite", "pixel", "tracking"]
)
]
missing_alt = [img for img in product_images if not img.get("alt", "").strip()]
if missing_alt:
issues.append(ProductPageIssue(
url=page_url, issue_type="image_alt", severity="high",
message=f"{len(missing_alt)} product image(s) missing alt text",
recommendation="Add descriptive alt text with product name to all product images.",
))
generic_alt = [
img for img in product_images
if img.get("alt", "").strip().lower() in [
"image", "photo", "product", "picture", "img", "product image",
"상품 이미지", "이미지", "사진",
]
]
if generic_alt:
issues.append(ProductPageIssue(
url=page_url, issue_type="image_alt", severity="medium",
message=f"{len(generic_alt)} image(s) with generic alt text",
recommendation="Replace generic alt text with specific product descriptions.",
))
# --- Canonical tag ---
canonical = soup.find("link", attrs={"rel": "canonical"})
if not canonical:
issues.append(ProductPageIssue(
url=page_url, issue_type="canonical", severity="high",
message="Missing canonical tag on product page",
recommendation="Add <link rel='canonical'> pointing to the preferred product URL.",
))
else:
canonical_href = canonical.get("href", "").strip()
if canonical_href and canonical_href != page_url:
# Only flag if significantly different (not just trailing slash)
norm_canonical = canonical_href.rstrip("/")
norm_page = page_url.rstrip("/")
if norm_canonical != norm_page:
issues.append(ProductPageIssue(
url=page_url, issue_type="canonical", severity="medium",
message=f"Canonical points to different URL: {canonical_href}",
recommendation="Verify canonical is correct; ensure product variants point to the main product.",
))
# --- Internal links ---
internal_links = []
parsed_page = urlparse(page_url)
for a_tag in soup.find_all("a", href=True):
href = a_tag["href"]
full_url = urljoin(page_url, href)
parsed_link = urlparse(full_url)
if parsed_link.netloc == parsed_page.netloc:
internal_links.append(full_url)
if len(internal_links) < 3:
issues.append(ProductPageIssue(
url=page_url, issue_type="internal_link", severity="medium",
message=f"Only {len(internal_links)} internal links found",
recommendation="Add related product links, category breadcrumbs, and cross-sell sections.",
))
# --- Open Graph / social meta ---
og_title = soup.find("meta", attrs={"property": "og:title"})
og_image = soup.find("meta", attrs={"property": "og:image"})
if not og_title or not og_image:
issues.append(ProductPageIssue(
url=page_url, issue_type="social_meta", severity="low",
message="Missing Open Graph tags (og:title or og:image)",
recommendation="Add OG tags for better social sharing of product pages.",
))
return issues
# ------------------------------------------------------------------
# Category taxonomy analysis
# ------------------------------------------------------------------
async def analyze_category_taxonomy(
self,
session: aiohttp.ClientSession,
base_url: str,
max_categories: int = 50,
) -> dict[str, Any]:
"""Analyze category page structure and taxonomy depth."""
result: dict[str, Any] = {
"categories_found": 0,
"max_depth": 0,
"avg_depth": 0.0,
"breadcrumbs_present": 0,
"breadcrumbs_missing": 0,
"faceted_nav_issues": [],
"nodes": [],
}
# Discover category URLs from sitemap
sitemap_urls = await self._fetch_sitemap_urls(base_url)
category_urls = [u for u in sitemap_urls if is_category_url(u)][:max_categories]
if not category_urls:
# Try crawling homepage for category links
_, html = await self._fetch_page(session, base_url)
if html:
soup = BeautifulSoup(html, "lxml")
for a_tag in soup.find_all("a", href=True):
full_url = urljoin(base_url, a_tag["href"])
if is_category_url(full_url) and full_url not in category_urls:
category_urls.append(full_url)
if len(category_urls) >= max_categories:
break
depths: list[int] = []
for cat_url in category_urls:
_, html = await self._fetch_page(session, cat_url)
if not html:
continue
soup = BeautifulSoup(html, "lxml")
parsed = urlparse(cat_url)
path_parts = [p for p in parsed.path.strip("/").split("/") if p]
depth = len(path_parts)
depths.append(depth)
# Check breadcrumb
has_breadcrumb = bool(
soup.find("nav", attrs={"aria-label": re.compile(r"breadcrumb", re.I)})
or soup.find(attrs={"class": re.compile(r"breadcrumb", re.I)})
or soup.find("script", string=re.compile(r"BreadcrumbList", re.I))
or soup.find("ol", attrs={"itemtype": re.compile(r"BreadcrumbList", re.I)})
)
if has_breadcrumb:
result["breadcrumbs_present"] += 1
else:
result["breadcrumbs_missing"] += 1
# Category name from H1 or title
h1 = soup.find("h1")
cat_name = h1.get_text(strip=True) if h1 else path_parts[-1] if path_parts else "unknown"
# Count child category links
children = 0
for a_tag in soup.find_all("a", href=True):
link = urljoin(cat_url, a_tag["href"])
if is_category_url(link) and link != cat_url:
children += 1
node = CategoryNode(
url=cat_url,
name=cat_name,
depth=depth,
children_count=children,
has_breadcrumb=has_breadcrumb,
)
result["nodes"].append(asdict(node))
# Faceted navigation check
faceted = get_faceted_params(cat_url)
if faceted:
robots_meta = soup.find("meta", attrs={"name": "robots"})
robots_content = robots_meta.get("content", "").lower() if robots_meta else ""
canonical = soup.find("link", attrs={"rel": "canonical"})
canonical_href = canonical.get("href", "").strip() if canonical else ""
if "noindex" not in robots_content and canonical_href == cat_url:
result["faceted_nav_issues"].append({
"url": cat_url,
"params": list(faceted.keys()),
"message": "Faceted URL is indexable without canonical to base category",
"recommendation": (
"Add noindex or canonical to the non-filtered category URL "
"to prevent duplicate content."
),
})
if depths:
result["max_depth"] = max(depths)
result["avg_depth"] = round(sum(depths) / len(depths), 1)
result["categories_found"] = len(category_urls)
return result
# ------------------------------------------------------------------
# Duplicate content detection
# ------------------------------------------------------------------
async def detect_duplicates(self, page_urls: list[str]) -> list[DuplicateGroup]:
"""Detect potential duplicate content from parameter variants."""
groups: list[DuplicateGroup] = []
base_to_variants: dict[str, list[str]] = {}
for url in page_urls:
parsed = urlparse(url)
base = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
params = parse_qs(parsed.query)
faceted = {k: v for k, v in params.items() if k.lower() in FACETED_NAV_PARAMS}
if faceted:
base_to_variants.setdefault(base, []).append(url)
for base_url, variants in base_to_variants.items():
if len(variants) > 1:
groups.append(DuplicateGroup(
canonical_url=base_url,
duplicate_urls=variants,
reason="parameter_variant",
))
# Check for product variant duplicates (e.g., /product/123-red vs /product/123-blue)
slug_groups: dict[str, list[str]] = {}
for url in page_urls:
parsed = urlparse(url)
path = parsed.path.rstrip("/")
# Strip trailing color/size suffixes
base_slug = re.sub(
r"[-_](red|blue|green|black|white|small|medium|large|xs|s|m|l|xl|xxl)$",
"", path, flags=re.IGNORECASE,
)
if base_slug != path:
slug_groups.setdefault(base_slug, []).append(url)
for base_slug, variants in slug_groups.items():
if len(variants) > 1:
groups.append(DuplicateGroup(
canonical_url=variants[0],
duplicate_urls=variants[1:],
reason="product_variant",
))
return groups
# ------------------------------------------------------------------
# Pagination SEO
# ------------------------------------------------------------------
async def check_pagination_seo(
self,
session: aiohttp.ClientSession,
page_url: str,
) -> list[ProductPageIssue]:
"""Check pagination implementation for SEO best practices."""
issues: list[ProductPageIssue] = []
_, html = await self._fetch_page(session, page_url)
if not html:
return issues
soup = BeautifulSoup(html, "lxml")
# Look for pagination links
pagination_links = []
for a_tag in soup.find_all("a", href=True):
href = a_tag["href"]
full_url = urljoin(page_url, href)
params = parse_qs(urlparse(full_url).query)
if "page" in params or "p" in params or re.search(r"/page/\d+", full_url):
pagination_links.append(full_url)
if not pagination_links:
return issues
# Check rel=prev/next (deprecated by Google but still useful)
rel_prev = soup.find("link", attrs={"rel": "prev"})
rel_next = soup.find("link", attrs={"rel": "next"})
if not rel_prev and not rel_next:
issues.append(ProductPageIssue(
url=page_url, issue_type="pagination", severity="low",
message="No rel=prev/next links on paginated page",
recommendation=(
"While Google no longer uses rel=prev/next, other engines may. "
"Consider adding them for broader compatibility."
),
))
# Check canonical on paginated pages
canonical = soup.find("link", attrs={"rel": "canonical"})
if canonical:
canonical_href = canonical.get("href", "").strip()
# If canonical points to page 1 on a non-page-1 URL, flag it
parsed_page = urlparse(page_url)
page_params = parse_qs(parsed_page.query)
current_page_num = page_params.get("page", page_params.get("p", ["1"]))[0]
if current_page_num != "1":
parsed_canonical = urlparse(canonical_href)
canon_params = parse_qs(parsed_canonical.query)
canon_page_num = canon_params.get("page", canon_params.get("p", ["1"]))[0]
if canon_page_num == "1" or canonical_href.rstrip("/") == page_url.split("?")[0].rstrip("/"):
issues.append(ProductPageIssue(
url=page_url, issue_type="pagination", severity="high",
message=f"Page {current_page_num} canonical points to page 1",
recommendation=(
"Each paginated page should self-reference its own canonical URL "
"to ensure all pages are indexable."
),
))
# Check robots noindex on filtered/sorted pages
robots_meta = soup.find("meta", attrs={"name": "robots"})
if robots_meta:
content = robots_meta.get("content", "").lower()
if "noindex" in content and pagination_links:
issues.append(ProductPageIssue(
url=page_url, issue_type="pagination", severity="medium",
message="Paginated page has noindex but contains product links",
recommendation=(
"Ensure products on noindex pages are still discoverable "
"via other indexed pages or sitemap."
),
))
return issues
# ------------------------------------------------------------------
# Korean marketplace presence
# ------------------------------------------------------------------
async def check_korean_marketplaces(
self,
session: aiohttp.ClientSession,
brand_name: str,
) -> dict[str, MarketplacePresence]:
"""Search for brand presence on Korean marketplace platforms."""
marketplaces = {}
search_configs = {
"naver_smart_store": {
"search_url": f"https://search.shopping.naver.com/search/all?query={brand_name}",
"platform": "Naver Smart Store",
"indicator_patterns": [r"smartstore\.naver\.com", r"brand\.naver\.com"],
},
"coupang": {
"search_url": f"https://www.coupang.com/np/search?component=&q={brand_name}",
"platform": "Coupang",
"indicator_patterns": [r"coupang\.com/vp/products/"],
},
"gmarket": {
"search_url": f"https://browse.gmarket.co.kr/search?keyword={brand_name}",
"platform": "Gmarket",
"indicator_patterns": [r"gmarket\.co\.kr/item/"],
},
"11st": {
"search_url": f"https://search.11st.co.kr/Search.tmall?kwd={brand_name}",
"platform": "11번가",
"indicator_patterns": [r"11st\.co\.kr/products/"],
},
}
for key, cfg in search_configs.items():
presence = MarketplacePresence(platform=cfg["platform"], found=False)
try:
_, html = await self._fetch_page(session, cfg["search_url"])
if html:
for pattern in cfg["indicator_patterns"]:
matches = re.findall(pattern, html)
if matches:
presence.found = True
presence.product_count = len(matches)
# Extract first matching URL
url_match = re.search(
rf'href=["\']?(https?://[^"\'>\s]*{pattern}[^"\'>\s]*)',
html,
)
if url_match:
presence.url = url_match.group(1)
break
except Exception as exc:
self.logger.warning(f"Marketplace check failed for {key}: {exc}")
marketplaces[key] = presence
return marketplaces
# ------------------------------------------------------------------
# Naver Smart Store optimization
# ------------------------------------------------------------------
async def check_naver_smart_store(
self,
session: aiohttp.ClientSession,
url: str,
) -> dict[str, Any]:
"""Check Naver Smart Store-specific SEO elements."""
result: dict[str, Any] = {
"is_smart_store": False,
"issues": [],
"optimizations": [],
}
parsed = urlparse(url)
is_smart_store = "smartstore.naver.com" in parsed.netloc or "brand.naver.com" in parsed.netloc
result["is_smart_store"] = is_smart_store
_, html = await self._fetch_page(session, url)
if not html:
return result
soup = BeautifulSoup(html, "lxml")
# Check Naver-specific meta tags
naver_site_verification = soup.find("meta", attrs={"name": "naver-site-verification"})
if not naver_site_verification:
result["issues"].append({
"type": "naver_verification",
"severity": "medium",
"message": "Missing naver-site-verification meta tag",
"recommendation": "Add Naver Search Advisor verification tag.",
})
# Check for Naver Shopping structured data attributes
product_schema = soup.find("script", string=re.compile(r'"@type"\s*:\s*"Product"'))
if not product_schema:
result["issues"].append({
"type": "naver_schema",
"severity": "high",
"message": "Missing Product schema for Naver Shopping",
"recommendation": "Add Product JSON-LD with Korean product names and descriptions.",
})
# Check Korean content optimization
body_text = soup.get_text(separator=" ", strip=True)
korean_chars = len(re.findall(r"[\uac00-\ud7af]", body_text))
total_chars = len(body_text)
if total_chars > 0:
korean_ratio = korean_chars / total_chars
if korean_ratio < 0.3 and is_smart_store:
result["issues"].append({
"type": "korean_content",
"severity": "medium",
"message": f"Low Korean content ratio ({korean_ratio:.0%}) for Korean marketplace",
"recommendation": "Increase Korean language content for Naver search visibility.",
})
# Smart Store specific: check product detail image text
detail_images = soup.find_all("img", attrs={"class": re.compile(r"detail|product", re.I)})
if detail_images and not soup.find("div", attrs={"class": re.compile(r"product.*(desc|detail|content)", re.I)}):
result["optimizations"].append({
"type": "detail_text",
"message": "Product details appear to be image-only",
"recommendation": (
"Add HTML text product descriptions alongside images "
"for Naver search indexing."
),
})
return result
# ------------------------------------------------------------------
# Orchestrator
# ------------------------------------------------------------------
async def audit(
self,
url: str,
scope: str = "all",
sample: int = 50,
check_marketplaces: bool = False,
) -> EcommerceAuditResult:
"""Run the full e-commerce SEO audit."""
result = EcommerceAuditResult(url=url, timestamp=datetime.now().isoformat())
parsed = urlparse(url if url.startswith("http") else f"https://{url}")
domain = f"{parsed.scheme}://{parsed.netloc}"
async with aiohttp.ClientSession() as session:
# --- Product page audit ---
if scope in ("all", "products"):
self.logger.info("=== Product Page Audit ===")
pages = await self.get_product_pages(domain, sample=sample)
result.product_pages_audited = len(pages)
for page_info in pages:
page_issues = await self.audit_product_page(session, page_info["url"])
for issue in page_issues:
result.add_issue(issue)
# Duplicate detection
all_urls = [p["url"] for p in pages]
sitemap_urls = await self._fetch_sitemap_urls(domain)
all_urls.extend(sitemap_urls)
dup_groups = await self.detect_duplicates(list(set(all_urls)))
result.duplicate_groups = [asdict(dg) for dg in dup_groups]
for dg in dup_groups:
result.add_issue(ProductPageIssue(
url=dg.canonical_url,
issue_type="duplicate",
severity="high" if dg.reason == "parameter_variant" else "medium",
message=f"Duplicate group ({dg.reason}): {len(dg.duplicate_urls)} variants",
recommendation="Implement canonical tags or parameter handling in GSC/Naver.",
))
# Pagination check on category-like pages
category_like = [u for u in sitemap_urls if is_category_url(u)][:10]
for cat_url in category_like:
pag_issues = await self.check_pagination_seo(session, cat_url)
result.pagination_issues.extend([asdict(i) for i in pag_issues])
for issue in pag_issues:
result.add_issue(issue)
# --- Category taxonomy ---
if scope in ("all", "categories"):
self.logger.info("=== Category Taxonomy Analysis ===")
cat_result = await self.analyze_category_taxonomy(session, domain)
result.category_structure = cat_result
if cat_result.get("max_depth", 0) > 4:
result.add_issue(ProductPageIssue(
url=domain,
issue_type="category_depth",
severity="medium",
message=f"Category depth exceeds 4 levels (max: {cat_result['max_depth']})",
recommendation="Flatten category structure to 3-4 levels for better crawlability.",
))
if cat_result.get("breadcrumbs_missing", 0) > 0:
missing = cat_result["breadcrumbs_missing"]
total = cat_result.get("categories_found", 1)
result.add_issue(ProductPageIssue(
url=domain,
issue_type="breadcrumb",
severity="high" if missing > total * 0.5 else "medium",
message=f"{missing} category pages missing breadcrumb navigation",
recommendation="Add BreadcrumbList schema and visible breadcrumbs to all category pages.",
))
for fni in cat_result.get("faceted_nav_issues", []):
result.add_issue(ProductPageIssue(
url=fni["url"],
issue_type="faceted_nav",
severity="high",
message=fni["message"],
recommendation=fni["recommendation"],
))
# --- Korean marketplaces ---
if check_marketplaces:
self.logger.info("=== Korean Marketplace Presence ===")
# Extract brand name from site
_, home_html = await self._fetch_page(session, domain)
brand_name = ""
if home_html:
home_soup = BeautifulSoup(home_html, "lxml")
og_site = home_soup.find("meta", attrs={"property": "og:site_name"})
if og_site:
brand_name = og_site.get("content", "").strip()
if not brand_name:
title_tag = home_soup.find("title")
if title_tag:
brand_name = title_tag.get_text(strip=True).split("|")[0].split("-")[0].strip()
if brand_name:
mp_results = await self.check_korean_marketplaces(session, brand_name)
result.korean_marketplaces = {
k: asdict(v) for k, v in mp_results.items()
}
# Naver Smart Store check
naver_result = await self.check_naver_smart_store(session, domain)
result.naver_smart_store = naver_result
for naver_issue in naver_result.get("issues", []):
result.add_issue(ProductPageIssue(
url=domain,
issue_type=naver_issue["type"],
severity=naver_issue["severity"],
message=naver_issue["message"],
recommendation=naver_issue["recommendation"],
))
result.calculate_score()
return result
# ---------------------------------------------------------------------------
# CLI output helpers
# ---------------------------------------------------------------------------
def print_rich_report(result: EcommerceAuditResult) -> None:
"""Print a rich-formatted report to the console."""
console.print(f"\n[bold cyan]E-Commerce SEO Audit Report[/bold cyan]")
console.print(f"URL: {result.url}")
console.print(f"Product Pages Audited: {result.product_pages_audited}")
console.print(f"Timestamp: {result.timestamp}")
# Score
score_color = "green" if result.score >= 80 else "yellow" if result.score >= 50 else "red"
console.print(f"\n[bold {score_color}]Score: {result.score}/100[/bold {score_color}]")
# Issues summary
table = Table(title="Issues Summary")
table.add_column("Severity", style="bold")
table.add_column("Count", justify="right")
for sev in ["critical", "high", "medium", "low"]:
color = {"critical": "red", "high": "yellow", "medium": "cyan", "low": "dim"}[sev]
table.add_row(f"[{color}]{sev.upper()}[/{color}]", str(len(result.issues[sev])))
console.print(table)
# Top issues
for sev in ["critical", "high"]:
if result.issues[sev]:
console.print(f"\n[bold red]{sev.upper()} Issues:[/bold red]")
for issue in result.issues[sev][:10]:
console.print(f" - [{issue['issue_type']}] {issue['message']}")
console.print(f" [dim]{issue['recommendation']}[/dim]")
# Category structure
if result.category_structure:
cs = result.category_structure
console.print(f"\n[bold]Category Structure:[/bold]")
console.print(f" Categories found: {cs.get('categories_found', 0)}")
console.print(f" Max depth: {cs.get('max_depth', 0)}")
console.print(f" Breadcrumbs present: {cs.get('breadcrumbs_present', 0)}")
console.print(f" Breadcrumbs missing: {cs.get('breadcrumbs_missing', 0)}")
# Duplicates
if result.duplicate_groups:
console.print(f"\n[bold]Duplicate Groups: {len(result.duplicate_groups)}[/bold]")
for dg in result.duplicate_groups[:5]:
console.print(f" [{dg['reason']}] {dg['canonical_url']} ({len(dg['duplicate_urls'])} variants)")
# Korean marketplaces
if result.korean_marketplaces:
console.print(f"\n[bold]Korean Marketplace Presence:[/bold]")
for key, mp in result.korean_marketplaces.items():
status = "[green]Found[/green]" if mp.get("found") else "[red]Not Found[/red]"
console.print(f" {mp.get('platform', key)}: {status}")
if mp.get("url"):
console.print(f" URL: {mp['url']}")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
parser = argparse.ArgumentParser(
description="E-Commerce SEO Auditor - Product page and marketplace audit",
)
parser.add_argument("--url", required=True, help="Target website URL")
parser.add_argument(
"--scope",
choices=["all", "products", "categories"],
default="all",
help="Audit scope (default: all)",
)
parser.add_argument(
"--korean-marketplaces",
action="store_true",
help="Check Korean marketplace presence (Coupang, Gmarket, 11번가, Naver)",
)
parser.add_argument(
"--sample",
type=int,
default=50,
help="Number of product pages to sample (default: 50)",
)
parser.add_argument("--json", action="store_true", help="Output as JSON")
parser.add_argument("--output", type=str, help="Save output to file")
args = parser.parse_args()
auditor = EcommerceAuditor()
result = asyncio.run(
auditor.audit(
url=args.url,
scope=args.scope,
sample=args.sample,
check_marketplaces=args.korean_marketplaces,
)
)
if args.json:
output = json.dumps(asdict(result), indent=2, ensure_ascii=False, default=str)
if args.output:
with open(args.output, "w", encoding="utf-8") as f:
f.write(output)
console.print(f"[green]Results saved to {args.output}[/green]")
else:
print(output)
else:
print_rich_report(result)
if args.output:
output = json.dumps(asdict(result), indent=2, ensure_ascii=False, default=str)
with open(args.output, "w", encoding="utf-8") as f:
f.write(output)
console.print(f"\n[green]JSON results also saved to {args.output}[/green]")
auditor.print_stats()
if __name__ == "__main__":
main()