12 new skills: Keyword Strategy, SERP Analysis, Position Tracking, Link Building, Content Strategy, E-Commerce SEO, KPI Framework, International SEO, AI Visibility, Knowledge Graph, Competitor Intel, and Crawl Budget. ~20K lines of Python across 25 domain scripts. Updated skill 11 pipeline table and repo CLAUDE.md. Enhanced skill 18 local SEO workflow from jamie.clinic audit. Note: Skill 26 hreflang_validator.py pending (content filter block). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1047 lines
43 KiB
Python
1047 lines
43 KiB
Python
"""
|
|
E-Commerce SEO Auditor
|
|
======================
|
|
Purpose: Audit product pages, category taxonomy, duplicate content,
|
|
pagination SEO, and Korean marketplace presence.
|
|
Python: 3.10+
|
|
"""
|
|
|
|
import argparse
import asyncio
import json
import logging
import re
import sys
from dataclasses import asdict, dataclass, field
from datetime import datetime
from typing import Any
from urllib.parse import parse_qs, quote, urljoin, urlparse

import aiohttp
from bs4 import BeautifulSoup
from rich.console import Console
from rich.table import Table

from base_client import BaseAsyncClient, config
|
|
|
|
# Module-level logger and shared Rich console used by the CLI report helpers.
logger = logging.getLogger(__name__)
console = Console()
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Data classes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@dataclass
class ProductPageIssue:
    """Single issue found on a product page.

    Serialized with dataclasses.asdict() into
    EcommerceAuditResult.issues buckets.
    """

    # Page the issue was observed on (or the domain for site-wide issues).
    url: str
    issue_type: str  # title, meta_desc, h1, image_alt, internal_link, canonical, pagination
    severity: str  # critical, high, medium, low
    # Human-readable description of what was found.
    message: str
    # Concrete remediation action for the site owner.
    recommendation: str
|
|
|
|
|
|
@dataclass
class CategoryNode:
    """A node in the category taxonomy tree."""

    url: str
    # H1 text, falling back to the last URL path segment.
    name: str
    # Number of non-empty path segments in the category URL.
    depth: int
    # Count of child-category links found on the page.
    children_count: int
    # Whether breadcrumb navigation/schema was detected on the page.
    has_breadcrumb: bool
|
|
|
|
|
|
@dataclass
class DuplicateGroup:
    """Group of duplicate or near-duplicate product URLs."""

    # URL that should carry the canonical signal for the group.
    canonical_url: str
    # Variant URLs duplicating the canonical one.
    duplicate_urls: list[str]
    reason: str  # parameter_variant, product_variant, pagination_missing_canonical
|
|
|
|
|
|
@dataclass
class MarketplacePresence:
    """Presence record for a Korean marketplace."""

    # Display name, e.g. "Naver Smart Store", "Coupang", "Gmarket", "11번가".
    platform: str
    found: bool
    # First product/store URL matched in the marketplace search results.
    url: str | None = None
    # Number of indicator-pattern matches seen in the search results HTML.
    product_count: int = 0
|
|
|
|
|
|
@dataclass
class EcommerceAuditResult:
    """Complete e-commerce SEO audit result.

    Aggregates per-page issues (bucketed by severity), category-taxonomy
    stats, duplicate-content groups, pagination findings, and Korean
    marketplace presence, plus a 0-100 score derived from the issues.
    """

    url: str
    product_pages_audited: int = 0
    # Issues bucketed by severity; each entry is an asdict()-ed ProductPageIssue.
    issues: dict[str, list[dict]] = field(default_factory=lambda: {
        "critical": [], "high": [], "medium": [], "low": []
    })
    category_structure: dict[str, Any] = field(default_factory=dict)
    duplicate_groups: list[dict] = field(default_factory=list)
    pagination_issues: list[dict] = field(default_factory=list)
    korean_marketplaces: dict[str, dict] = field(default_factory=dict)
    naver_smart_store: dict[str, Any] = field(default_factory=dict)
    # 0-100; computed by calculate_score().
    score: int = 0
    timestamp: str = ""

    def add_issue(self, issue: "ProductPageIssue") -> None:
        """Record *issue* under its severity bucket.

        setdefault makes an unexpected severity value create a new bucket
        instead of raising KeyError (the original indexed directly).
        """
        self.issues.setdefault(issue.severity, []).append(asdict(issue))

    def calculate_score(self) -> int:
        """Score 0-100 based on issue severity counts.

        Each issue subtracts a fixed penalty (critical 15, high 8,
        medium 3, low 1); the result is clamped at 0 and stored on
        self.score before being returned.
        """
        penalties = {
            "critical": 15,
            "high": 8,
            "medium": 3,
            "low": 1,
        }
        # Severities outside the table contribute no penalty.
        total_penalty = sum(
            len(items) * penalties.get(sev, 0)
            for sev, items in self.issues.items()
        )
        self.score = max(0, 100 - total_penalty)
        return self.score
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Product URL pattern helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# URL path/query fragments that typically identify a product detail page.
# Matched case-insensitively anywhere in the URL (see is_product_url).
PRODUCT_URL_PATTERNS = [
    r"/product[s]?/",
    r"/item[s]?/",
    r"/p/",
    r"/dp/",
    r"/goods/",
    r"/shop/",
    r"/detail/",
    r"\?product_id=",
    r"\?item_id=",
    r"\?goodsno=",
]

# URL path fragments that typically identify a category/listing page
# (see is_category_url).
CATEGORY_URL_PATTERNS = [
    r"/category/",
    r"/categories/",
    r"/collections?/",
    r"/c/",
    r"/department/",
    r"/browse/",
]

# Query-parameter names used by faceted navigation (filters/sorting/paging).
# Compared lowercase in get_faceted_params and detect_duplicates.
FACETED_NAV_PARAMS = [
    "color", "size", "sort", "order", "filter", "brand",
    "price_min", "price_max", "page", "per_page", "view",
    "material", "rating", "availability",
]
|
|
|
|
|
|
def is_product_url(url: str) -> bool:
    """Return True when *url* matches any known product-page pattern."""
    for pattern in PRODUCT_URL_PATTERNS:
        if re.search(pattern, url, re.IGNORECASE):
            return True
    return False
|
|
|
|
|
|
def is_category_url(url: str) -> bool:
    """Return True when *url* matches any known category-page pattern."""
    hits = (re.search(pattern, url, re.IGNORECASE) for pattern in CATEGORY_URL_PATTERNS)
    return any(match is not None for match in hits)
|
|
|
|
|
|
def get_faceted_params(url: str) -> dict[str, list[str]]:
    """Return only the query parameters that belong to faceted navigation."""
    query = urlparse(url).query
    faceted: dict[str, list[str]] = {}
    for name, values in parse_qs(query).items():
        if name.lower() in FACETED_NAV_PARAMS:
            faceted[name] = values
    return faceted
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main auditor
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class EcommerceAuditor(BaseAsyncClient):
|
|
"""E-commerce SEO auditor with product, category, and marketplace checks."""
|
|
|
|
def __init__(
    self,
    max_concurrent: int = 10,
    requests_per_second: float = 5.0,
    timeout: int = 30,
):
    """Set up concurrency limits, request timeout, and crawl headers."""
    super().__init__(max_concurrent=max_concurrent, requests_per_second=requests_per_second)
    self.timeout = aiohttp.ClientTimeout(total=timeout)
    # Identify the bot explicitly and prefer Korean content when offered.
    bot_user_agent = (
        "Mozilla/5.0 (compatible; EcommerceSEOBot/1.0; "
        "+https://ourdigital.org)"
    )
    self.headers = {
        "User-Agent": bot_user_agent,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    }
|
|
|
|
# ------------------------------------------------------------------
|
|
# Page fetching
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> tuple[str, str]:
    """Fetch a page and return (final_url, html).

    Returns the post-redirect URL and the decoded body. On any failure the
    original URL and an empty string are returned so callers can skip the page.
    """
    try:
        # ssl=False disables certificate verification — presumably deliberate
        # for auditing misconfigured shops. NOTE(review): confirm this is
        # acceptable before pointing the tool at untrusted networks.
        async with session.get(url, headers=self.headers, timeout=self.timeout,
                               allow_redirects=True, ssl=False) as resp:
            # errors="replace" keeps badly-encoded pages readable instead of raising.
            html = await resp.text(errors="replace")
            # resp.url reflects the final URL after redirects were followed.
            return str(resp.url), html
    except Exception as exc:
        # Best-effort crawl: log the failure and hand back an empty body.
        self.logger.warning(f"Failed to fetch {url}: {exc}")
        return url, ""
|
|
|
|
# ------------------------------------------------------------------
|
|
# Product page discovery via Ahrefs
|
|
# ------------------------------------------------------------------
|
|
|
|
async def get_product_pages(self, domain: str, sample: int = 50) -> list[dict]:
    """
    Discover product pages using Ahrefs pages-by-traffic.
    Falls back to sitemap crawling if Ahrefs is unavailable.
    Returns list of dicts with keys: url, traffic, keywords.
    """
    # Ahrefs integration is intended to come from the environment / MCP;
    # currently only the sitemap fallback below is implemented.
    self.logger.info(f"Discovering product pages for {domain} (sample={sample})")

    sitemap_urls = await self._fetch_sitemap_urls(domain)
    candidates = [u for u in sitemap_urls if is_product_url(u)]
    if not candidates:
        # Broader heuristic: keep any URL whose path has at least two segments.
        candidates = [
            u for u in sitemap_urls
            if len(urlparse(u).path.strip("/").split("/")) >= 2
        ]

    # Traffic/keyword counts are placeholders until Ahrefs data is wired in.
    pages = [{"url": u, "traffic": 0, "keywords": 0} for u in candidates[:sample]]

    self.logger.info(f"Found {len(pages)} product page candidates")
    return pages
|
|
|
|
async def _fetch_sitemap_urls(self, domain: str) -> list[str]:
    """Fetch URLs from the site's XML sitemap.

    Accepts a bare domain or a full URL; always requests
    <scheme>://<netloc>/sitemap.xml. Handles both a plain urlset and a
    sitemap index (following at most the first 5 child sitemaps).
    Returns [] on any failure.
    """
    urls: list[str] = []
    # Default bare domains to https before extracting the host.
    base = f"https://{domain}" if not domain.startswith("http") else domain
    parsed = urlparse(base)
    sitemap_url = f"{parsed.scheme}://{parsed.netloc}/sitemap.xml"

    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(sitemap_url, headers=self.headers,
                                   timeout=self.timeout, ssl=False) as resp:
                if resp.status == 200:
                    text = await resp.text(errors="replace")
                    soup = BeautifulSoup(text, "lxml-xml")
                    # Handle sitemap index: <sitemap> entries point at child sitemaps.
                    sitemapindex = soup.find_all("sitemap")
                    if sitemapindex:
                        # Cap at 5 children to bound crawl time on huge indexes.
                        for sm in sitemapindex[:5]:
                            loc = sm.find("loc")
                            if loc:
                                child_urls = await self._parse_sitemap(session, loc.text.strip())
                                urls.extend(child_urls)
                    else:
                        # Plain urlset: collect every <url><loc> directly.
                        url_tags = soup.find_all("url")
                        for tag in url_tags:
                            loc = tag.find("loc")
                            if loc:
                                urls.append(loc.text.strip())
        except Exception as exc:
            # Best-effort: a missing/broken sitemap yields an empty list.
            self.logger.warning(f"Sitemap fetch failed: {exc}")

    return urls
|
|
|
|
async def _parse_sitemap(self, session: aiohttp.ClientSession, url: str) -> list[str]:
    """Fetch one sitemap XML document and collect every <url><loc> it lists."""
    found: list[str] = []
    try:
        async with session.get(url, headers=self.headers,
                               timeout=self.timeout, ssl=False) as resp:
            if resp.status == 200:
                body = await resp.text(errors="replace")
                doc = BeautifulSoup(body, "lxml-xml")
                loc_tags = (entry.find("loc") for entry in doc.find_all("url"))
                found.extend(loc.text.strip() for loc in loc_tags if loc)
    except Exception as exc:
        # Best-effort: a broken child sitemap must not abort the crawl.
        self.logger.warning(f"Failed to parse sitemap {url}: {exc}")
    return found
|
|
|
|
# ------------------------------------------------------------------
|
|
# Product page audit
|
|
# ------------------------------------------------------------------
|
|
|
|
async def audit_product_page(
    self,
    session: aiohttp.ClientSession,
    page_url: str,
) -> list[ProductPageIssue]:
    """Audit a single product page for SEO issues.

    Checks fetchability, <title> length, meta description, H1 count,
    product-image alt text, canonical tag, internal link count, and
    Open Graph tags. Returns all issues found (empty list when clean).
    """
    issues: list[ProductPageIssue] = []
    _, html = await self._fetch_page(session, page_url)
    if not html:
        issues.append(ProductPageIssue(
            url=page_url, issue_type="accessibility", severity="critical",
            message="Page returned empty or could not be fetched",
            recommendation="Verify the URL is accessible and returns valid HTML.",
        ))
        return issues

    soup = BeautifulSoup(html, "lxml")

    # --- Title tag: present, and between 15 and 60 characters ---
    title_tag = soup.find("title")
    title_text = title_tag.get_text(strip=True) if title_tag else ""
    if not title_text:
        issues.append(ProductPageIssue(
            url=page_url, issue_type="title", severity="critical",
            message="Missing <title> tag",
            recommendation="Add a unique title containing the product name (under 60 characters).",
        ))
    elif len(title_text) > 60:
        issues.append(ProductPageIssue(
            url=page_url, issue_type="title", severity="medium",
            message=f"Title too long ({len(title_text)} chars): {title_text[:80]}...",
            recommendation="Shorten title to under 60 characters for full SERP display.",
        ))
    elif len(title_text) < 15:
        issues.append(ProductPageIssue(
            url=page_url, issue_type="title", severity="medium",
            message=f"Title too short ({len(title_text)} chars): {title_text}",
            recommendation="Expand title with product name, key feature, and brand.",
        ))

    # --- Meta description: present and under 155 characters ---
    meta_desc_tag = soup.find("meta", attrs={"name": re.compile(r"description", re.I)})
    meta_desc = meta_desc_tag.get("content", "").strip() if meta_desc_tag else ""
    if not meta_desc:
        issues.append(ProductPageIssue(
            url=page_url, issue_type="meta_desc", severity="high",
            message="Missing meta description",
            recommendation="Add meta description with product features and price info (under 155 chars).",
        ))
    elif len(meta_desc) > 155:
        issues.append(ProductPageIssue(
            url=page_url, issue_type="meta_desc", severity="low",
            message=f"Meta description too long ({len(meta_desc)} chars)",
            recommendation="Trim to under 155 characters for full SERP display.",
        ))

    # --- H1 tag: exactly one expected ---
    h1_tags = soup.find_all("h1")
    if not h1_tags:
        issues.append(ProductPageIssue(
            url=page_url, issue_type="h1", severity="high",
            message="Missing H1 tag on product page",
            recommendation="Add a single H1 with the product name.",
        ))
    elif len(h1_tags) > 1:
        issues.append(ProductPageIssue(
            url=page_url, issue_type="h1", severity="medium",
            message=f"Multiple H1 tags found ({len(h1_tags)})",
            recommendation="Use a single H1 for the product name; use H2/H3 for subsections.",
        ))

    # --- Image alt text ---
    # Filter decorative/ops images by scanning the src AND every CSS class
    # token. BUG FIX: the previous expression only inspected the FIRST class
    # token, so an image marked e.g. class="lazy icon" was not skipped.
    skip_tokens = ("logo", "icon", "badge", "banner", "sprite", "pixel", "tracking")
    product_images = []
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src:
            continue
        haystack = " ".join([src, *(img.get("class") or [])])
        if not any(token in haystack for token in skip_tokens):
            product_images.append(img)

    missing_alt = [img for img in product_images if not img.get("alt", "").strip()]
    if missing_alt:
        issues.append(ProductPageIssue(
            url=page_url, issue_type="image_alt", severity="high",
            message=f"{len(missing_alt)} product image(s) missing alt text",
            recommendation="Add descriptive alt text with product name to all product images.",
        ))

    # Generic placeholder alt text (English and Korean) is nearly as bad
    # as missing alt text.
    generic_alt = [
        img for img in product_images
        if img.get("alt", "").strip().lower() in [
            "image", "photo", "product", "picture", "img", "product image",
            "상품 이미지", "이미지", "사진",
        ]
    ]
    if generic_alt:
        issues.append(ProductPageIssue(
            url=page_url, issue_type="image_alt", severity="medium",
            message=f"{len(generic_alt)} image(s) with generic alt text",
            recommendation="Replace generic alt text with specific product descriptions.",
        ))

    # --- Canonical tag ---
    canonical = soup.find("link", attrs={"rel": "canonical"})
    if not canonical:
        issues.append(ProductPageIssue(
            url=page_url, issue_type="canonical", severity="high",
            message="Missing canonical tag on product page",
            recommendation="Add <link rel='canonical'> pointing to the preferred product URL.",
        ))
    else:
        canonical_href = canonical.get("href", "").strip()
        if canonical_href and canonical_href != page_url:
            # Only flag if significantly different (not just trailing slash).
            norm_canonical = canonical_href.rstrip("/")
            norm_page = page_url.rstrip("/")
            if norm_canonical != norm_page:
                issues.append(ProductPageIssue(
                    url=page_url, issue_type="canonical", severity="medium",
                    message=f"Canonical points to different URL: {canonical_href}",
                    recommendation="Verify canonical is correct; ensure product variants point to the main product.",
                ))

    # --- Internal links: same-host anchors resolved against the page URL ---
    internal_links = []
    parsed_page = urlparse(page_url)
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        full_url = urljoin(page_url, href)
        parsed_link = urlparse(full_url)
        if parsed_link.netloc == parsed_page.netloc:
            internal_links.append(full_url)

    if len(internal_links) < 3:
        issues.append(ProductPageIssue(
            url=page_url, issue_type="internal_link", severity="medium",
            message=f"Only {len(internal_links)} internal links found",
            recommendation="Add related product links, category breadcrumbs, and cross-sell sections.",
        ))

    # --- Open Graph / social meta ---
    og_title = soup.find("meta", attrs={"property": "og:title"})
    og_image = soup.find("meta", attrs={"property": "og:image"})
    if not og_title or not og_image:
        issues.append(ProductPageIssue(
            url=page_url, issue_type="social_meta", severity="low",
            message="Missing Open Graph tags (og:title or og:image)",
            recommendation="Add OG tags for better social sharing of product pages.",
        ))

    return issues
|
|
|
|
# ------------------------------------------------------------------
|
|
# Category taxonomy analysis
|
|
# ------------------------------------------------------------------
|
|
|
|
async def analyze_category_taxonomy(
    self,
    session: aiohttp.ClientSession,
    base_url: str,
    max_categories: int = 50,
) -> dict[str, Any]:
    """Analyze category page structure and taxonomy depth.

    Discovers category URLs (sitemap first, homepage links as fallback),
    then for each page records URL depth, breadcrumb presence, and
    child-category link count, and flags indexable faceted-navigation URLs.

    Returns a dict with keys: categories_found, max_depth, avg_depth,
    breadcrumbs_present, breadcrumbs_missing, faceted_nav_issues, nodes.
    """
    result: dict[str, Any] = {
        "categories_found": 0,
        "max_depth": 0,
        "avg_depth": 0.0,
        "breadcrumbs_present": 0,
        "breadcrumbs_missing": 0,
        "faceted_nav_issues": [],
        "nodes": [],
    }

    # Discover category URLs from sitemap
    sitemap_urls = await self._fetch_sitemap_urls(base_url)
    category_urls = [u for u in sitemap_urls if is_category_url(u)][:max_categories]

    if not category_urls:
        # Try crawling homepage for category links
        _, html = await self._fetch_page(session, base_url)
        if html:
            soup = BeautifulSoup(html, "lxml")
            for a_tag in soup.find_all("a", href=True):
                full_url = urljoin(base_url, a_tag["href"])
                if is_category_url(full_url) and full_url not in category_urls:
                    category_urls.append(full_url)
                if len(category_urls) >= max_categories:
                    break

    depths: list[int] = []
    for cat_url in category_urls:
        _, html = await self._fetch_page(session, cat_url)
        if not html:
            # Unreachable category: skip it (it still counts toward
            # categories_found below).
            continue

        soup = BeautifulSoup(html, "lxml")
        parsed = urlparse(cat_url)
        # Depth = number of non-empty path segments (e.g. /c/shoes/running -> 3).
        path_parts = [p for p in parsed.path.strip("/").split("/") if p]
        depth = len(path_parts)
        depths.append(depth)

        # Check breadcrumb via any of: aria-label, CSS class, JSON-LD
        # BreadcrumbList, or microdata itemtype.
        has_breadcrumb = bool(
            soup.find("nav", attrs={"aria-label": re.compile(r"breadcrumb", re.I)})
            or soup.find(attrs={"class": re.compile(r"breadcrumb", re.I)})
            or soup.find("script", string=re.compile(r"BreadcrumbList", re.I))
            or soup.find("ol", attrs={"itemtype": re.compile(r"BreadcrumbList", re.I)})
        )

        if has_breadcrumb:
            result["breadcrumbs_present"] += 1
        else:
            result["breadcrumbs_missing"] += 1

        # Category name from H1 or title
        h1 = soup.find("h1")
        cat_name = h1.get_text(strip=True) if h1 else path_parts[-1] if path_parts else "unknown"

        # Count child category links
        children = 0
        for a_tag in soup.find_all("a", href=True):
            link = urljoin(cat_url, a_tag["href"])
            if is_category_url(link) and link != cat_url:
                children += 1

        node = CategoryNode(
            url=cat_url,
            name=cat_name,
            depth=depth,
            children_count=children,
            has_breadcrumb=has_breadcrumb,
        )
        result["nodes"].append(asdict(node))

        # Faceted navigation check: a filtered URL that is indexable and
        # self-canonical creates duplicate-content risk.
        faceted = get_faceted_params(cat_url)
        if faceted:
            robots_meta = soup.find("meta", attrs={"name": "robots"})
            robots_content = robots_meta.get("content", "").lower() if robots_meta else ""
            canonical = soup.find("link", attrs={"rel": "canonical"})
            canonical_href = canonical.get("href", "").strip() if canonical else ""

            # NOTE(review): exact string compare — a trailing-slash or scheme
            # difference would hide a self-canonical; confirm this is intended.
            if "noindex" not in robots_content and canonical_href == cat_url:
                result["faceted_nav_issues"].append({
                    "url": cat_url,
                    "params": list(faceted.keys()),
                    "message": "Faceted URL is indexable without canonical to base category",
                    "recommendation": (
                        "Add noindex or canonical to the non-filtered category URL "
                        "to prevent duplicate content."
                    ),
                })

    if depths:
        result["max_depth"] = max(depths)
        result["avg_depth"] = round(sum(depths) / len(depths), 1)
    result["categories_found"] = len(category_urls)

    return result
|
|
|
|
# ------------------------------------------------------------------
|
|
# Duplicate content detection
|
|
# ------------------------------------------------------------------
|
|
|
|
async def detect_duplicates(self, page_urls: list[str]) -> list[DuplicateGroup]:
    """Detect potential duplicate content from parameter variants.

    Two passes: (1) URLs sharing a base path that differ only by
    faceted-navigation query parameters; (2) URLs whose slugs differ only
    by a trailing color/size suffix (e.g. /product/123-red vs -blue).
    """
    groups: list[DuplicateGroup] = []

    # Pass 1: faceted-parameter variants grouped by scheme://host/path.
    param_variants: dict[str, list[str]] = {}
    for candidate in page_urls:
        parts = urlparse(candidate)
        stripped = f"{parts.scheme}://{parts.netloc}{parts.path}"
        has_facets = any(
            name.lower() in FACETED_NAV_PARAMS
            for name in parse_qs(parts.query)
        )
        if has_facets:
            param_variants.setdefault(stripped, []).append(candidate)

    groups.extend(
        DuplicateGroup(
            canonical_url=base,
            duplicate_urls=variants,
            reason="parameter_variant",
        )
        for base, variants in param_variants.items()
        if len(variants) > 1
    )

    # Pass 2: product variants that differ only by a color/size slug suffix.
    suffix_re = re.compile(
        r"[-_](red|blue|green|black|white|small|medium|large|xs|s|m|l|xl|xxl)$",
        re.IGNORECASE,
    )
    by_slug: dict[str, list[str]] = {}
    for candidate in page_urls:
        path = urlparse(candidate).path.rstrip("/")
        trimmed = suffix_re.sub("", path)
        if trimmed != path:
            by_slug.setdefault(trimmed, []).append(candidate)

    groups.extend(
        DuplicateGroup(
            canonical_url=variants[0],
            duplicate_urls=variants[1:],
            reason="product_variant",
        )
        for variants in by_slug.values()
        if len(variants) > 1
    )

    return groups
|
|
|
|
# ------------------------------------------------------------------
|
|
# Pagination SEO
|
|
# ------------------------------------------------------------------
|
|
|
|
async def check_pagination_seo(
    self,
    session: aiohttp.ClientSession,
    page_url: str,
) -> list[ProductPageIssue]:
    """Check pagination implementation for SEO best practices.

    Flags: missing rel=prev/next, page-N canonicals that point back to
    page 1, and noindex paginated pages that still carry product links.
    Returns [] when the page is unreachable or has no pagination links.
    """
    issues: list[ProductPageIssue] = []
    _, html = await self._fetch_page(session, page_url)
    if not html:
        return issues

    soup = BeautifulSoup(html, "lxml")

    # Look for pagination links: ?page=/?p= query params or /page/N paths.
    pagination_links = []
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        full_url = urljoin(page_url, href)
        params = parse_qs(urlparse(full_url).query)
        if "page" in params or "p" in params or re.search(r"/page/\d+", full_url):
            pagination_links.append(full_url)

    if not pagination_links:
        # Not a paginated page; nothing to check.
        return issues

    # Check rel=prev/next (deprecated by Google but still useful).
    # NOTE(review): rel is a multi-valued attribute in bs4 — confirm this
    # find() matches <link rel="prev"> markup on real pages.
    rel_prev = soup.find("link", attrs={"rel": "prev"})
    rel_next = soup.find("link", attrs={"rel": "next"})
    if not rel_prev and not rel_next:
        issues.append(ProductPageIssue(
            url=page_url, issue_type="pagination", severity="low",
            message="No rel=prev/next links on paginated page",
            recommendation=(
                "While Google no longer uses rel=prev/next, other engines may. "
                "Consider adding them for broader compatibility."
            ),
        ))

    # Check canonical on paginated pages
    canonical = soup.find("link", attrs={"rel": "canonical"})
    if canonical:
        canonical_href = canonical.get("href", "").strip()
        # If canonical points to page 1 on a non-page-1 URL, flag it
        parsed_page = urlparse(page_url)
        page_params = parse_qs(parsed_page.query)
        # Current page number from ?page= or ?p=, defaulting to "1".
        current_page_num = page_params.get("page", page_params.get("p", ["1"]))[0]

        if current_page_num != "1":
            parsed_canonical = urlparse(canonical_href)
            canon_params = parse_qs(parsed_canonical.query)
            canon_page_num = canon_params.get("page", canon_params.get("p", ["1"]))[0]

            # Flag when the canonical names page 1 explicitly OR equals the
            # query-stripped base URL (an implicit page 1).
            if canon_page_num == "1" or canonical_href.rstrip("/") == page_url.split("?")[0].rstrip("/"):
                issues.append(ProductPageIssue(
                    url=page_url, issue_type="pagination", severity="high",
                    message=f"Page {current_page_num} canonical points to page 1",
                    recommendation=(
                        "Each paginated page should self-reference its own canonical URL "
                        "to ensure all pages are indexable."
                    ),
                ))

    # Check robots noindex on filtered/sorted pages
    robots_meta = soup.find("meta", attrs={"name": "robots"})
    if robots_meta:
        content = robots_meta.get("content", "").lower()
        if "noindex" in content and pagination_links:
            issues.append(ProductPageIssue(
                url=page_url, issue_type="pagination", severity="medium",
                message="Paginated page has noindex but contains product links",
                recommendation=(
                    "Ensure products on noindex pages are still discoverable "
                    "via other indexed pages or sitemap."
                ),
            ))

    return issues
|
|
|
|
# ------------------------------------------------------------------
|
|
# Korean marketplace presence
|
|
# ------------------------------------------------------------------
|
|
|
|
async def check_korean_marketplaces(
    self,
    session: aiohttp.ClientSession,
    brand_name: str,
) -> dict[str, MarketplacePresence]:
    """Search for brand presence on Korean marketplace platforms.

    Queries the public search pages of Naver Shopping, Coupang, Gmarket,
    and 11st, then scans the response HTML for platform-specific product
    URL patterns. Returns {platform_key: MarketplacePresence}; lookups
    are best-effort and never raise.
    """
    marketplaces = {}

    # BUG FIX: URL-encode the brand so spaces/Hangul do not produce a
    # malformed query string (previously the raw name was interpolated).
    encoded_brand = quote(brand_name)

    search_configs = {
        "naver_smart_store": {
            "search_url": f"https://search.shopping.naver.com/search/all?query={encoded_brand}",
            "platform": "Naver Smart Store",
            "indicator_patterns": [r"smartstore\.naver\.com", r"brand\.naver\.com"],
        },
        "coupang": {
            "search_url": f"https://www.coupang.com/np/search?component=&q={encoded_brand}",
            "platform": "Coupang",
            "indicator_patterns": [r"coupang\.com/vp/products/"],
        },
        "gmarket": {
            "search_url": f"https://browse.gmarket.co.kr/search?keyword={encoded_brand}",
            "platform": "Gmarket",
            "indicator_patterns": [r"gmarket\.co\.kr/item/"],
        },
        "11st": {
            "search_url": f"https://search.11st.co.kr/Search.tmall?kwd={encoded_brand}",
            "platform": "11번가",
            "indicator_patterns": [r"11st\.co\.kr/products/"],
        },
    }

    for key, cfg in search_configs.items():
        presence = MarketplacePresence(platform=cfg["platform"], found=False)
        try:
            _, html = await self._fetch_page(session, cfg["search_url"])
            if html:
                for pattern in cfg["indicator_patterns"]:
                    matches = re.findall(pattern, html)
                    if matches:
                        presence.found = True
                        presence.product_count = len(matches)
                        # Extract the first full URL containing the indicator
                        # pattern (pattern is itself a regex fragment).
                        url_match = re.search(
                            rf'href=["\']?(https?://[^"\'>\s]*{pattern}[^"\'>\s]*)',
                            html,
                        )
                        if url_match:
                            presence.url = url_match.group(1)
                        break
        except Exception as exc:
            # Best-effort probe: record found=False and move on.
            self.logger.warning(f"Marketplace check failed for {key}: {exc}")

        marketplaces[key] = presence

    return marketplaces
|
|
|
|
# ------------------------------------------------------------------
|
|
# Naver Smart Store optimization
|
|
# ------------------------------------------------------------------
|
|
|
|
async def check_naver_smart_store(
    self,
    session: aiohttp.ClientSession,
    url: str,
) -> dict[str, Any]:
    """Check Naver Smart Store-specific SEO elements.

    Returns a dict with keys:
      is_smart_store -- True when the host is smartstore/brand.naver.com
      issues         -- list of {type, severity, message, recommendation}
      optimizations  -- softer suggestions without a severity
    """
    result: dict[str, Any] = {
        "is_smart_store": False,
        "issues": [],
        "optimizations": [],
    }

    parsed = urlparse(url)
    is_smart_store = "smartstore.naver.com" in parsed.netloc or "brand.naver.com" in parsed.netloc
    result["is_smart_store"] = is_smart_store

    _, html = await self._fetch_page(session, url)
    if not html:
        # Unreachable page: report only the host classification.
        return result

    soup = BeautifulSoup(html, "lxml")

    # Check Naver-specific meta tags
    naver_site_verification = soup.find("meta", attrs={"name": "naver-site-verification"})
    if not naver_site_verification:
        result["issues"].append({
            "type": "naver_verification",
            "severity": "medium",
            "message": "Missing naver-site-verification meta tag",
            "recommendation": "Add Naver Search Advisor verification tag.",
        })

    # Check for Naver Shopping structured data attributes
    product_schema = soup.find("script", string=re.compile(r'"@type"\s*:\s*"Product"'))
    if not product_schema:
        result["issues"].append({
            "type": "naver_schema",
            "severity": "high",
            "message": "Missing Product schema for Naver Shopping",
            "recommendation": "Add Product JSON-LD with Korean product names and descriptions.",
        })

    # Check Korean content optimization.
    body_text = soup.get_text(separator=" ", strip=True)
    # \uac00-\ud7af is the Hangul Syllables Unicode block.
    korean_chars = len(re.findall(r"[\uac00-\ud7af]", body_text))
    total_chars = len(body_text)
    if total_chars > 0:
        korean_ratio = korean_chars / total_chars
        # Only flag low Korean ratio on actual Smart Store hosts.
        if korean_ratio < 0.3 and is_smart_store:
            result["issues"].append({
                "type": "korean_content",
                "severity": "medium",
                "message": f"Low Korean content ratio ({korean_ratio:.0%}) for Korean marketplace",
                "recommendation": "Increase Korean language content for Naver search visibility.",
            })

    # Smart Store specific: check product detail image text.
    # Heuristic: detail/product-class images present but no textual
    # description container found.
    detail_images = soup.find_all("img", attrs={"class": re.compile(r"detail|product", re.I)})
    if detail_images and not soup.find("div", attrs={"class": re.compile(r"product.*(desc|detail|content)", re.I)}):
        result["optimizations"].append({
            "type": "detail_text",
            "message": "Product details appear to be image-only",
            "recommendation": (
                "Add HTML text product descriptions alongside images "
                "for Naver search indexing."
            ),
        })

    return result
|
|
|
|
# ------------------------------------------------------------------
|
|
# Orchestrator
|
|
# ------------------------------------------------------------------
|
|
|
|
async def audit(
    self,
    url: str,
    scope: str = "all",
    sample: int = 50,
    check_marketplaces: bool = False,
) -> EcommerceAuditResult:
    """Run the full e-commerce SEO audit.

    Args:
        url: Site URL or bare domain to audit.
        scope: "all", "products", or "categories".
        sample: Maximum number of product pages to fetch and audit.
        check_marketplaces: Also probe Korean marketplaces and Naver
            Smart Store signals.

    Returns the populated EcommerceAuditResult with its score computed.
    """
    result = EcommerceAuditResult(url=url, timestamp=datetime.now().isoformat())
    # Normalize to scheme://netloc, defaulting bare domains to https.
    parsed = urlparse(url if url.startswith("http") else f"https://{url}")
    domain = f"{parsed.scheme}://{parsed.netloc}"

    async with aiohttp.ClientSession() as session:
        # --- Product page audit ---
        if scope in ("all", "products"):
            self.logger.info("=== Product Page Audit ===")
            pages = await self.get_product_pages(domain, sample=sample)
            result.product_pages_audited = len(pages)

            for page_info in pages:
                page_issues = await self.audit_product_page(session, page_info["url"])
                for issue in page_issues:
                    result.add_issue(issue)

            # Duplicate detection over the audited pages plus the full
            # sitemap (deduplicated via set).
            all_urls = [p["url"] for p in pages]
            sitemap_urls = await self._fetch_sitemap_urls(domain)
            all_urls.extend(sitemap_urls)
            dup_groups = await self.detect_duplicates(list(set(all_urls)))
            result.duplicate_groups = [asdict(dg) for dg in dup_groups]
            for dg in dup_groups:
                result.add_issue(ProductPageIssue(
                    url=dg.canonical_url,
                    issue_type="duplicate",
                    severity="high" if dg.reason == "parameter_variant" else "medium",
                    message=f"Duplicate group ({dg.reason}): {len(dg.duplicate_urls)} variants",
                    recommendation="Implement canonical tags or parameter handling in GSC/Naver.",
                ))

            # Pagination check on category-like pages (first 10 only).
            category_like = [u for u in sitemap_urls if is_category_url(u)][:10]
            for cat_url in category_like:
                pag_issues = await self.check_pagination_seo(session, cat_url)
                result.pagination_issues.extend([asdict(i) for i in pag_issues])
                for issue in pag_issues:
                    result.add_issue(issue)

        # --- Category taxonomy ---
        if scope in ("all", "categories"):
            self.logger.info("=== Category Taxonomy Analysis ===")
            cat_result = await self.analyze_category_taxonomy(session, domain)
            result.category_structure = cat_result

            if cat_result.get("max_depth", 0) > 4:
                result.add_issue(ProductPageIssue(
                    url=domain,
                    issue_type="category_depth",
                    severity="medium",
                    message=f"Category depth exceeds 4 levels (max: {cat_result['max_depth']})",
                    recommendation="Flatten category structure to 3-4 levels for better crawlability.",
                ))

            if cat_result.get("breadcrumbs_missing", 0) > 0:
                missing = cat_result["breadcrumbs_missing"]
                total = cat_result.get("categories_found", 1)
                result.add_issue(ProductPageIssue(
                    url=domain,
                    issue_type="breadcrumb",
                    # High when more than half the sampled categories lack breadcrumbs.
                    severity="high" if missing > total * 0.5 else "medium",
                    message=f"{missing} category pages missing breadcrumb navigation",
                    recommendation="Add BreadcrumbList schema and visible breadcrumbs to all category pages.",
                ))

            for fni in cat_result.get("faceted_nav_issues", []):
                result.add_issue(ProductPageIssue(
                    url=fni["url"],
                    issue_type="faceted_nav",
                    severity="high",
                    message=fni["message"],
                    recommendation=fni["recommendation"],
                ))

        # --- Korean marketplaces ---
        if check_marketplaces:
            self.logger.info("=== Korean Marketplace Presence ===")
            # Extract brand name from site: og:site_name first, then the
            # first "|"/"-" segment of the homepage <title>.
            _, home_html = await self._fetch_page(session, domain)
            brand_name = ""
            if home_html:
                home_soup = BeautifulSoup(home_html, "lxml")
                og_site = home_soup.find("meta", attrs={"property": "og:site_name"})
                if og_site:
                    brand_name = og_site.get("content", "").strip()
                if not brand_name:
                    title_tag = home_soup.find("title")
                    if title_tag:
                        brand_name = title_tag.get_text(strip=True).split("|")[0].split("-")[0].strip()

            if brand_name:
                mp_results = await self.check_korean_marketplaces(session, brand_name)
                result.korean_marketplaces = {
                    k: asdict(v) for k, v in mp_results.items()
                }

            # Naver Smart Store check: issues are merged into the main
            # severity buckets; optimizations stay in naver_smart_store.
            naver_result = await self.check_naver_smart_store(session, domain)
            result.naver_smart_store = naver_result
            for naver_issue in naver_result.get("issues", []):
                result.add_issue(ProductPageIssue(
                    url=domain,
                    issue_type=naver_issue["type"],
                    severity=naver_issue["severity"],
                    message=naver_issue["message"],
                    recommendation=naver_issue["recommendation"],
                ))

    result.calculate_score()
    return result
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CLI output helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def print_rich_report(result: EcommerceAuditResult) -> None:
    """Render the audit result as a human-readable report on the console."""
    console.print("\n[bold cyan]E-Commerce SEO Audit Report[/bold cyan]")
    for label, value in (
        ("URL", result.url),
        ("Product Pages Audited", result.product_pages_audited),
        ("Timestamp", result.timestamp),
    ):
        console.print(f"{label}: {value}")

    # Colour the score by band: >=80 green, >=50 yellow, otherwise red.
    if result.score >= 80:
        tint = "green"
    elif result.score >= 50:
        tint = "yellow"
    else:
        tint = "red"
    console.print(f"\n[bold {tint}]Score: {result.score}/100[/bold {tint}]")

    # Per-severity issue counts (dict order fixes the display order).
    severity_colors = {"critical": "red", "high": "yellow", "medium": "cyan", "low": "dim"}
    summary = Table(title="Issues Summary")
    summary.add_column("Severity", style="bold")
    summary.add_column("Count", justify="right")
    for severity, color in severity_colors.items():
        summary.add_row(
            f"[{color}]{severity.upper()}[/{color}]",
            str(len(result.issues[severity])),
        )
    console.print(summary)

    # Detail up to ten issues for each of the two worst severities.
    for severity in ("critical", "high"):
        found = result.issues[severity]
        if not found:
            continue
        console.print(f"\n[bold red]{severity.upper()} Issues:[/bold red]")
        for item in found[:10]:
            console.print(f"  - [{item['issue_type']}] {item['message']}")
            console.print(f"    [dim]{item['recommendation']}[/dim]")

    # Category taxonomy overview (counts default to 0 when a key is absent).
    if result.category_structure:
        structure = result.category_structure
        console.print("\n[bold]Category Structure:[/bold]")
        for label, key in (
            ("Categories found", "categories_found"),
            ("Max depth", "max_depth"),
            ("Breadcrumbs present", "breadcrumbs_present"),
            ("Breadcrumbs missing", "breadcrumbs_missing"),
        ):
            console.print(f"  {label}: {structure.get(key, 0)}")

    # First five duplicate-content groups.
    if result.duplicate_groups:
        console.print(f"\n[bold]Duplicate Groups: {len(result.duplicate_groups)}[/bold]")
        for group in result.duplicate_groups[:5]:
            variants = len(group["duplicate_urls"])
            console.print(f"  [{group['reason']}] {group['canonical_url']} ({variants} variants)")

    # Korean marketplace presence, with the store URL when one was found.
    if result.korean_marketplaces:
        console.print("\n[bold]Korean Marketplace Presence:[/bold]")
        for key, entry in result.korean_marketplaces.items():
            presence = "[green]Found[/green]" if entry.get("found") else "[red]Not Found[/red]"
            console.print(f"  {entry.get('platform', key)}: {presence}")
            if entry.get("url"):
                console.print(f"      URL: {entry['url']}")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _result_json(result: EcommerceAuditResult) -> str:
    """Serialize an audit result to pretty-printed JSON.

    ``default=str`` is deliberate: it stringifies non-JSON-native fields
    (e.g. the timestamp) rather than raising.
    """
    return json.dumps(asdict(result), indent=2, ensure_ascii=False, default=str)


def main() -> None:
    """CLI entry point: parse arguments, run the audit, and emit the report.

    Output modes:
      * ``--json``       print (or save with ``--output``) the JSON report;
      * otherwise        print the rich console report, and additionally
                         save JSON when ``--output`` is given.
    """
    parser = argparse.ArgumentParser(
        description="E-Commerce SEO Auditor - Product page and marketplace audit",
    )
    parser.add_argument("--url", required=True, help="Target website URL")
    parser.add_argument(
        "--scope",
        choices=["all", "products", "categories"],
        default="all",
        help="Audit scope (default: all)",
    )
    parser.add_argument(
        "--korean-marketplaces",
        action="store_true",
        help="Check Korean marketplace presence (Coupang, Gmarket, 11번가, Naver)",
    )
    parser.add_argument(
        "--sample",
        type=int,
        default=50,
        help="Number of product pages to sample (default: 50)",
    )
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument("--output", type=str, help="Save output to file")
    args = parser.parse_args()

    auditor = EcommerceAuditor()
    result = asyncio.run(
        auditor.audit(
            url=args.url,
            scope=args.scope,
            sample=args.sample,
            check_marketplaces=args.korean_marketplaces,
        )
    )

    if args.json:
        output = _result_json(result)
        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(output)
            console.print(f"[green]Results saved to {args.output}[/green]")
        else:
            print(output)
    else:
        print_rich_report(result)
        # Even in rich mode, persist a machine-readable copy when asked.
        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(_result_json(result))
            console.print(f"\n[green]JSON results also saved to {args.output}[/green]")

    auditor.print_stats()
|
|
|
|
|
|
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|