our-claude-skills/custom-skills/26-seo-international/code/scripts/international_auditor.py
Andrew Yim a3ff965b87 Add SEO skills 19-28, 31-32 with full Python implementations
12 new skills: Keyword Strategy, SERP Analysis, Position Tracking,
Link Building, Content Strategy, E-Commerce SEO, KPI Framework,
International SEO, AI Visibility, Knowledge Graph, Competitor Intel,
and Crawl Budget. ~20K lines of Python across 25 domain scripts.
Updated skill 11 pipeline table and repo CLAUDE.md.
Enhanced skill 18 local SEO workflow from jamie.clinic audit.

Note: Skill 26 hreflang_validator.py pending (content filter block).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 12:05:59 +09:00


"""
International SEO Auditor - Multi-language site audit
=====================================================
Purpose: Audit international SEO implementation including URL structure, content parity, language detection, and redirect logic
Python: 3.10+
Usage: python international_auditor.py --url https://example.com --json
"""
import argparse
import asyncio
import json
import logging
import re
import sys
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Optional
from urllib.parse import urlparse, parse_qs
import aiohttp
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger("international_auditor")
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
KEY_PAGES = ["home", "about", "contact", "products", "services", "blog", "faq"]
CCTLD_MAP: dict[str, str] = {
    ".kr": "ko", ".jp": "ja", ".cn": "zh", ".de": "de", ".fr": "fr",
    ".es": "es", ".it": "it", ".pt": "pt", ".nl": "nl", ".ru": "ru",
    ".br": "pt", ".mx": "es", ".co.uk": "en", ".ca": "en", ".au": "en",
    ".in": "en", ".co.kr": "ko", ".co.jp": "ja", ".com.cn": "zh",
}
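
# Note: dict order means a host like "example.co.kr" is matched by ".kr"
# before ".co.kr" is reached; both map to "ko", so the outcome is unaffected.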
LANG_SUBDIRECTORY_PATTERN = re.compile(
    r"^/(?P<lang>[a-z]{2}(?:-[a-z]{2})?)(?:/|$)", re.IGNORECASE,
)
LANG_SUBDOMAIN_PATTERN = re.compile(
    r"^(?P<lang>[a-z]{2}(?:-[a-z]{2})?)\.", re.IGNORECASE,
)
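
# For reference (hypothetical examples): the subdirectory pattern matches
# paths such as "/ko/" or "/en-us/pricing" (lang group "ko" / "en-us"), and
# the subdomain pattern matches hosts such as "ja.example.com" (lang group
# "ja"). Bare "www." hosts do not match, since the dot must directly follow
# the two-letter code.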
ACCEPT_LANGUAGE_HEADERS = [
    {"Accept-Language": "ko-KR,ko;q=0.9"},
    {"Accept-Language": "en-US,en;q=0.9"},
    {"Accept-Language": "ja-JP,ja;q=0.9"},
    {"Accept-Language": "zh-CN,zh;q=0.9"},
]
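
# These headers drive the redirect probes in _audit_redirect_logic(); the
# q-values are standard HTTP Accept-Language preference weights.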
URL_STRUCTURE_INFO: dict[str, dict] = {
    "cctld": {
        "pros": [
            "Strong geo-targeting signal to search engines",
            "Clear country association for users",
            "Independent domain authority per country",
        ],
        "cons": [
            "Expensive to acquire and maintain multiple domains",
            "Domain authority is not shared across properties",
            "Complex infrastructure management",
        ],
    },
    "subdomain": {
        "pros": [
            "Easy to set up on a single domain",
            "Can be hosted on different servers/locations",
            "Clear separation of language versions",
        ],
        "cons": [
            "Subdomains may be treated as separate sites by search engines",
            "Limited geo-targeting signal compared to ccTLD",
            "Domain authority dilution across subdomains",
        ],
    },
    "subdirectory": {
        "pros": [
            "All content inherits root domain authority",
            "Simplest infrastructure and lowest cost",
            "Best for consolidating SEO signals",
        ],
        "cons": [
            "Server must handle all language versions",
            "Geo-targeting relies on Search Console settings and hreflang",
            "Less clear country targeting than ccTLD",
        ],
    },
    "parameter": {
        "pros": [
            "Easy to implement technically",
        ],
        "cons": [
            "Not recommended by Google",
            "URL parameters can be ignored by crawlers",
            "Poor user experience and shareability",
            "No geo-targeting signal",
        ],
    },
}
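
# Illustrative URLs for each pattern above (hypothetical domains):
#   cctld         https://example.kr/
#   subdomain     https://ko.example.com/
#   subdirectory  https://example.com/ko/
#   parameter     https://example.com/?lang=ko   (discouraged)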
# ---------------------------------------------------------------------------
# Unicode range helpers
# ---------------------------------------------------------------------------
RE_HANGUL = re.compile(r"[\uAC00-\uD7AF]")
RE_HIRAGANA = re.compile(r"[\u3040-\u309F]")
RE_KATAKANA = re.compile(r"[\u30A0-\u30FF]")
RE_CJK_UNIFIED = re.compile(r"[\u4E00-\u9FFF]")
RE_LATIN_EXTENDED = re.compile(r"[A-Za-z\u00C0-\u024F]")
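
# Illustrative matches: RE_HANGUL matches "한" (U+D55C), RE_HIRAGANA "ひ"
# (U+3072), RE_KATAKANA "カ" (U+30AB), RE_CJK_UNIFIED "中" (U+4E2D), and
# RE_LATIN_EXTENDED also covers accented Latin letters such as "é" (U+00E9).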
# ---------------------------------------------------------------------------
# Dataclasses
# ---------------------------------------------------------------------------
@dataclass
class URLStructureAnalysis:
    """Analysis of the international URL structure pattern."""

    pattern: str = ""  # cctld / subdomain / subdirectory / parameter
    examples: list[str] = field(default_factory=list)
    languages_found: list[str] = field(default_factory=list)
    recommendation: str = ""
    pros: list[str] = field(default_factory=list)
    cons: list[str] = field(default_factory=list)


@dataclass
class ContentParityEntry:
    """Content parity data for a single language version."""

    language: str = ""
    page_count: int = 0
    key_pages_present: list[str] = field(default_factory=list)
    freshness_score: float = 0.0
    missing_pages: list[str] = field(default_factory=list)


@dataclass
class LanguageDetection:
    """Language detection audit for a single page."""

    page_url: str = ""
    declared_lang: str = ""
    html_lang_attr: str = ""
    content_language_header: str = ""
    detected_lang: str = ""
    is_consistent: bool = True


@dataclass
class RedirectLogic:
    """International redirect behaviour audit."""

    has_ip_redirect: bool = False
    has_language_redirect: bool = False
    is_forced: bool = False
    redirect_type: str = ""  # 301 / 302 / js / none
    recommendation: str = ""


@dataclass
class KoreanExpansion:
    """Korean-market expansion analysis."""

    primary_lang: str = "ko"
    target_markets: list[str] = field(default_factory=list)
    cjk_url_issues: list[str] = field(default_factory=list)
    regional_search_engines: dict[str, str] = field(default_factory=dict)
    priority_recommendations: list[str] = field(default_factory=list)


@dataclass
class InternationalAuditResult:
    """Top-level result container for the full international audit."""

    url: str = ""
    url_structure: Optional[URLStructureAnalysis] = None
    languages_detected: list[str] = field(default_factory=list)
    content_parity: dict[str, ContentParityEntry] = field(default_factory=dict)
    language_detection_issues: list[LanguageDetection] = field(default_factory=list)
    redirect_logic: Optional[RedirectLogic] = None
    korean_expansion: Optional[KoreanExpansion] = None
    issues: list[str] = field(default_factory=list)
    recommendations: list[str] = field(default_factory=list)
    score: int = 100
    timestamp: str = ""
# ---------------------------------------------------------------------------
# Auditor
# ---------------------------------------------------------------------------
class InternationalAuditor:
    """Perform a comprehensive international SEO audit on a target URL."""

    def __init__(
        self,
        url: str,
        korean_expansion: bool = False,
        scope: str | None = None,
    ):
        self.url = url.rstrip("/")
        self.korean_expansion_flag = korean_expansion
        self.scope = scope or "all"
        self.parsed = urlparse(self.url)
        self.base_url = f"{self.parsed.scheme}://{self.parsed.netloc}"
        self.timeout = aiohttp.ClientTimeout(total=30)
        self._session: aiohttp.ClientSession | None = None

    # ------------------------------------------------------------------
    # Public entry point
    # ------------------------------------------------------------------
    async def audit(self) -> InternationalAuditResult:
        """Run the full international SEO audit and return results."""
        result = InternationalAuditResult(
            url=self.url,
            timestamp=datetime.utcnow().isoformat(),
        )
        async with aiohttp.ClientSession(timeout=self.timeout) as session:
            self._session = session
            try:
                # 1. URL structure analysis
                if self.scope in ("all", "structure"):
                    logger.info("Analysing URL structure ...")
                    result.url_structure = await self._analyze_url_structure(self.url)
                # 2. Discover language versions
                logger.info("Discovering language versions ...")
                lang_versions = await self._discover_language_versions(self.url)
                result.languages_detected = list(lang_versions.keys())
                # 3. Content parity
                if self.scope in ("all", "parity"):
                    logger.info("Auditing content parity ...")
                    result.content_parity = await self._audit_content_parity(lang_versions)
                # 4. Language detection
                if self.scope in ("all", "detection"):
                    logger.info("Checking language detection consistency ...")
                    all_pages: list[tuple[str, str]] = []
                    for lang, urls in lang_versions.items():
                        for u in urls[:5]:
                            all_pages.append((lang, u))
                    detections = await self._check_language_detection(all_pages)
                    result.language_detection_issues = [
                        d for d in detections if not d.is_consistent
                    ]
                # 5. Redirect logic
                if self.scope in ("all", "redirects"):
                    logger.info("Auditing redirect logic ...")
                    result.redirect_logic = await self._audit_redirect_logic(self.url)
                # 6. Korean expansion
                if self.korean_expansion_flag:
                    logger.info("Analysing Korean expansion opportunities ...")
                    result.korean_expansion = await self._analyze_korean_expansion(self.url)
                # Collect issues and recommendations
                self._collect_issues(result)
                result.score = self._calculate_score(result)
            except Exception as exc:
                logger.error("Audit failed: %s", exc)
                result.issues.append(f"Audit error: {exc}")
                result.score = 0
            finally:
                self._session = None
        return result

    # ------------------------------------------------------------------
    # URL structure analysis
    # ------------------------------------------------------------------
    async def _analyze_url_structure(self, url: str) -> URLStructureAnalysis:
        """Determine the international URL pattern in use."""
        parsed = urlparse(url)
        hostname = parsed.netloc.lower()
        path = parsed.path
        analysis = URLStructureAnalysis()
        # Check ccTLD
        for tld, lang in CCTLD_MAP.items():
            if hostname.endswith(tld):
                analysis.pattern = "cctld"
                analysis.examples.append(f"{hostname} -> {lang}")
                analysis.languages_found.append(lang)
                break
        # Check subdomain pattern
        if not analysis.pattern:
            match = LANG_SUBDOMAIN_PATTERN.match(hostname)
            if match:
                lang_code = match.group("lang").lower()
                # Guard against pseudo-language codes such as "ww" from
                # "www"-style hosts.
                if len(lang_code) == 2 and lang_code != "ww":
analysis.pattern = "subdomain"
analysis.examples.append(f"{hostname}")
analysis.languages_found.append(lang_code)
# Check subdirectory pattern
if not analysis.pattern:
match = LANG_SUBDIRECTORY_PATTERN.match(path)
if match:
lang_code = match.group("lang").lower()
analysis.pattern = "subdirectory"
analysis.examples.append(f"{path}")
analysis.languages_found.append(lang_code)
# Check URL parameter
if not analysis.pattern:
qs = parse_qs(parsed.query)
for param in ("lang", "language", "hl", "locale", "lg"):
if param in qs:
analysis.pattern = "parameter"
analysis.examples.append(f"?{param}={qs[param][0]}")
analysis.languages_found.append(qs[param][0])
break
# Try to discover additional language versions from page HTML
try:
html = await self._fetch_text(url)
if html:
hreflang_langs = self._extract_hreflang_langs(html)
for lang in hreflang_langs:
if lang not in analysis.languages_found:
analysis.languages_found.append(lang)
# Refine pattern from hreflang hrefs if not yet determined
if not analysis.pattern and hreflang_langs:
hrefs = self._extract_hreflang_hrefs(html)
analysis.pattern = self._detect_pattern_from_hrefs(hrefs)
except Exception as exc:
logger.warning("Could not fetch page for structure analysis: %s", exc)
if not analysis.pattern:
analysis.pattern = "unknown"
# Attach pros/cons/recommendation
info = URL_STRUCTURE_INFO.get(analysis.pattern, {})
analysis.pros = info.get("pros", [])
analysis.cons = info.get("cons", [])
if analysis.pattern == "parameter":
analysis.recommendation = (
"URL parameters are not recommended for international targeting. "
"Consider migrating to subdirectories or subdomains."
)
elif analysis.pattern == "cctld":
analysis.recommendation = (
"ccTLDs provide the strongest geo-targeting signal. "
"Ensure each domain has sufficient content and backlinks."
)
elif analysis.pattern == "subdomain":
analysis.recommendation = (
"Subdomains can work well but ensure hreflang tags are properly "
"implemented to signal relationships between language versions."
)
elif analysis.pattern == "subdirectory":
analysis.recommendation = (
"Subdirectories are the most cost-effective approach and consolidate "
"domain authority. This is the recommended pattern for most sites."
)
else:
analysis.recommendation = (
"Could not determine the URL structure pattern. "
"Implement a clear international URL strategy using subdirectories, "
"subdomains, or ccTLDs with proper hreflang tags."
)
return analysis

    # ------------------------------------------------------------------
    # Discover language versions
    # ------------------------------------------------------------------
    async def _discover_language_versions(self, url: str) -> dict[str, list[str]]:
        """Discover language versions from hreflang tags and common patterns."""
        lang_versions: dict[str, list[str]] = {}
        try:
            html = await self._fetch_text(url)
        except Exception:
            html = None
        # 1. Extract from hreflang tags
        if html:
            hreflang_map = self._extract_hreflang_map(html)
            for lang, href in hreflang_map.items():
                if lang == "x-default":
                    continue
                lang_key = lang.split("-")[0].lower()
                lang_versions.setdefault(lang_key, []).append(href)
        # 2. Probe common subdirectory patterns
        common_langs = ["en", "ko", "ja", "zh", "de", "fr", "es", "pt"]
        probe_tasks = []
        for lang in common_langs:
            if lang not in lang_versions:
                candidate = f"{self.base_url}/{lang}/"
                probe_tasks.append(self._probe_url(lang, candidate))
        if probe_tasks:
            results = await asyncio.gather(*probe_tasks, return_exceptions=True)
            for item in results:
                if isinstance(item, tuple):
                    lang, probe_url = item
                    lang_versions.setdefault(lang, []).append(probe_url)
        # 3. Probe common subdomain patterns
        domain_parts = self.parsed.netloc.split(".")
        if len(domain_parts) >= 2:
            root_domain = ".".join(domain_parts[-2:])
            subdomain_tasks = []
            for lang in common_langs:
                if lang not in lang_versions:
                    candidate = f"{self.parsed.scheme}://{lang}.{root_domain}/"
                    subdomain_tasks.append(self._probe_url(lang, candidate))
            if subdomain_tasks:
                results = await asyncio.gather(*subdomain_tasks, return_exceptions=True)
                for item in results:
                    if isinstance(item, tuple):
                        lang, probe_url = item
                        lang_versions.setdefault(lang, []).append(probe_url)
        # Ensure the original URL is listed under some language
        if not lang_versions:
            detected = "unknown"
            if html:
                detected = self._detect_language_from_content(html) or "unknown"
            lang_versions[detected] = [url]
        return lang_versions
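
    # Sketch of the probing above: for https://example.com the auditor tries
    # https://example.com/en/, https://example.com/ko/, ... and then
    # https://en.example.com/, https://ko.example.com/, keeping candidates
    # that answer HTTP 200.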
    async def _probe_url(self, lang: str, url: str) -> tuple[str, str] | None:
        """Return (lang, url) if the URL answers 200, else None."""
        try:
            async with self._session.head(
                url, allow_redirects=True, timeout=self.timeout,
            ) as resp:
                if resp.status == 200:
                    return (lang, url)
        except Exception:
            pass
        # Returning None (instead of raising) matches the declared return
        # type; callers filter results with isinstance checks either way.
        return None

    # ------------------------------------------------------------------
    # Content parity
    # ------------------------------------------------------------------
    async def _audit_content_parity(
        self,
        lang_versions: dict[str, list[str]],
    ) -> dict[str, ContentParityEntry]:
        """Audit content parity across discovered language versions."""
        parity: dict[str, ContentParityEntry] = {}
        for lang, urls in lang_versions.items():
            entry = ContentParityEntry(language=lang, page_count=len(urls))
            # Determine base URL for this language version
            if urls:
                lang_base = urls[0].rstrip("/")
                present_keys = await self._check_key_pages(lang_base)
                entry.key_pages_present = present_keys
                entry.missing_pages = [
                    p for p in KEY_PAGES if p not in present_keys
                ]
                # Freshness score from Last-Modified header on first URL
                entry.freshness_score = await self._get_freshness_score(urls[0])
            parity[lang] = entry
        return parity

    async def _check_key_pages(self, lang_base_url: str) -> list[str]:
        """Check which key pages exist under a language base URL."""
        present: list[str] = []
        page_paths = {
            "home": "/",
            "about": "/about",
            "contact": "/contact",
            "products": "/products",
            "services": "/services",
            "blog": "/blog",
            "faq": "/faq",
        }
        tasks = []
        for name, path in page_paths.items():
            full_url = lang_base_url.rstrip("/") + path
            tasks.append(self._check_page_exists(name, full_url))
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for item in results:
            if isinstance(item, str):
                present.append(item)
        return present

    async def _check_page_exists(self, name: str, url: str) -> str | None:
        """Return the page name if the URL answers 200, else None."""
        try:
            async with self._session.head(
                url, allow_redirects=True, timeout=self.timeout,
            ) as resp:
                if resp.status == 200:
                    return name
        except Exception:
            pass
        return None

    async def _get_freshness_score(self, url: str) -> float:
        """Compute a freshness score (0-100) based on the Last-Modified header."""
        try:
            async with self._session.head(
                url, allow_redirects=True, timeout=self.timeout,
            ) as resp:
                last_modified = resp.headers.get("Last-Modified")
                if last_modified:
                    from email.utils import parsedate_to_datetime

                    mod_dt = parsedate_to_datetime(last_modified)
                    age_days = (datetime.utcnow() - mod_dt.replace(tzinfo=None)).days
                    if age_days <= 7:
                        return 100.0
                    elif age_days <= 30:
                        return 90.0
                    elif age_days <= 90:
                        return 70.0
                    elif age_days <= 180:
                        return 50.0
                    elif age_days <= 365:
                        return 30.0
                    else:
                        return 10.0
                # No Last-Modified header -- neutral score
                return 50.0
        except Exception:
            return 0.0

    # ------------------------------------------------------------------
    # Language detection
    # ------------------------------------------------------------------
    async def _check_language_detection(
        self,
        pages: list[tuple[str, str]],
    ) -> list[LanguageDetection]:
        """Check language declaration consistency for a list of pages."""
        tasks = [
            self._detect_single_page(declared_lang, page_url)
            for declared_lang, page_url in pages
        ]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        detections: list[LanguageDetection] = []
        for r in results:
            if isinstance(r, LanguageDetection):
                detections.append(r)
        return detections

    async def _detect_single_page(
        self,
        declared_lang: str,
        page_url: str,
    ) -> LanguageDetection:
        """Perform language detection checks on a single page."""
        detection = LanguageDetection(
            page_url=page_url,
            declared_lang=declared_lang,
        )
        try:
            async with self._session.get(
                page_url, allow_redirects=True, timeout=self.timeout,
            ) as resp:
                # Content-Language header
                cl_header = resp.headers.get("Content-Language", "")
                detection.content_language_header = cl_header.strip()
                html = await resp.text(errors="replace")
                # HTML lang attribute
                lang_match = re.search(
                    r'<html[^>]*\slang=["\']([^"\']+)["\']',
                    html,
                    re.IGNORECASE,
                )
                if lang_match:
                    detection.html_lang_attr = lang_match.group(1).strip()
                # Detect from content
                detection.detected_lang = self._detect_language_from_content(html)
                # Consistency check
                expected = declared_lang.split("-")[0].lower()
                html_lang_base = (
                    detection.html_lang_attr.split("-")[0].lower()
                    if detection.html_lang_attr else ""
                )
                cl_base = (
                    detection.content_language_header.split("-")[0].lower()
                    if detection.content_language_header else ""
                )
                detected_base = (
                    detection.detected_lang.split("-")[0].lower()
                    if detection.detected_lang else ""
                )
                inconsistencies = []
                if html_lang_base and html_lang_base != expected:
                    inconsistencies.append("html_lang_attr")
                if cl_base and cl_base != expected:
                    inconsistencies.append("content_language_header")
                if detected_base and detected_base != expected:
                    inconsistencies.append("detected_content")
                detection.is_consistent = len(inconsistencies) == 0
        except Exception as exc:
            logger.warning("Language detection failed for %s: %s", page_url, exc)
            detection.is_consistent = False
        return detection

    def _detect_language_from_content(self, html: str) -> str:
        """Simple heuristic language detection using Unicode ranges."""
        # Strip HTML tags for content analysis
        text = re.sub(r"<[^>]+>", " ", html)
        text = re.sub(r"\s+", " ", text)[:5000]  # limit analysis scope
        counts = {
            "ko": len(RE_HANGUL.findall(text)),
            "ja": len(RE_HIRAGANA.findall(text)) + len(RE_KATAKANA.findall(text)),
            "zh": len(RE_CJK_UNIFIED.findall(text)),
            "en": len(RE_LATIN_EXTENDED.findall(text)),
        }
        # Japanese text also contains CJK unified characters (kanji).
        # If both hiragana/katakana AND CJK are present, prefer Japanese.
        if counts["ja"] > 0 and counts["zh"] > 0:
            if counts["ja"] > counts["zh"] * 0.1:
                counts["zh"] = 0  # attribute CJK chars to Japanese context
        # Korean text can contain some CJK (hanja) but Hangul dominates.
        if counts["ko"] > 0 and counts["zh"] > 0:
            if counts["ko"] > counts["zh"]:
                counts["zh"] = 0
        if not any(counts.values()):
            return "unknown"
        dominant = max(counts, key=counts.get)
        # Require a meaningful amount of the dominant script
        if counts[dominant] < 20:
            return "unknown"
        return dominant
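
    # Illustrative behaviour (hypothetical inputs): a page dominated by
    # Hangul text is classified "ko"; a Japanese page mixing kana and kanji
    # is classified "ja" because the kana count suppresses the CJK-unified
    # ("zh") count; fewer than 20 characters of any script yields "unknown".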

    # ------------------------------------------------------------------
    # Redirect logic
    # ------------------------------------------------------------------
    async def _audit_redirect_logic(self, url: str) -> RedirectLogic:
        """Audit redirect behaviour for different Accept-Language headers."""
        logic = RedirectLogic()
        redirect_targets: dict[str, str] = {}
        for headers in ACCEPT_LANGUAGE_HEADERS:
            try:
                async with self._session.get(
                    url,
                    allow_redirects=False,
                    headers=headers,
                    timeout=self.timeout,
                ) as resp:
                    lang_key = list(headers.values())[0].split(",")[0]
                    if resp.status in (301, 302, 303, 307, 308):
                        location = resp.headers.get("Location", "")
                        redirect_targets[lang_key] = location
                        logic.has_language_redirect = True
                        if resp.status in (301, 302):
                            logic.redirect_type = str(resp.status)
                            logic.is_forced = True
                    else:
                        redirect_targets[lang_key] = str(resp.url)
            except Exception as exc:
                logger.warning("Redirect check failed for %s: %s", headers, exc)
        # If different Accept-Language headers land on different targets, the
        # server negotiates on language. (True IP-based geo redirects would
        # require probing from multiple locations, which a single-origin
        # check cannot do.)
        unique_targets = set(redirect_targets.values())
        if len(unique_targets) > 1:
            logic.has_language_redirect = True
        # Check for JS-based redirects in page source
        try:
            html = await self._fetch_text(url)
            if html:
                js_redirect_patterns = [
                    r"navigator\.language",
                    r"navigator\.languages",
                    r"window\.location\s*=",
                    r"location\.href\s*=",
                    r"geo[_-]?redirect",
                    r"ip[_-]?redirect",
                ]
                for pattern in js_redirect_patterns:
                    if re.search(pattern, html, re.IGNORECASE):
                        if "geo" in pattern or "ip" in pattern:
                            logic.has_ip_redirect = True
                        if not logic.redirect_type:
                            logic.redirect_type = "js"
                        break
        except Exception:
            pass
        # Recommendation
        if logic.is_forced:
            logic.recommendation = (
                "Forced language/region redirects detected (HTTP {}). "
                "Best practice: suggest language via a banner or interstitial "
                "rather than forcing a redirect. Forced redirects can prevent "
                "search engines from crawling all language versions and frustrate "
                "users who prefer a different language."
            ).format(logic.redirect_type)
        elif logic.has_language_redirect:
            logic.recommendation = (
                "Language-based content negotiation detected. Ensure all language "
                "versions remain directly accessible via their canonical URLs and "
                "that Googlebot can crawl every version without redirect loops."
            )
        else:
            logic.recommendation = (
                "No forced language redirects detected. Consider adding a "
                "language suggestion banner for users whose browser language "
                "differs from the page language."
            )
        return logic

    # ------------------------------------------------------------------
    # Korean expansion analysis
    # ------------------------------------------------------------------
    async def _analyze_korean_expansion(self, url: str) -> KoreanExpansion:
        """Analyse expansion opportunities for Korean-primary sites."""
        expansion = KoreanExpansion(
            primary_lang="ko",
            target_markets=["ja", "zh", "en"],
            regional_search_engines={
                "ko": "Naver (naver.com) -- dominant in Korea; also consider Daum",
                "ja": "Yahoo Japan (yahoo.co.jp) -- significant market share alongside Google",
                "zh": "Baidu (baidu.com) -- dominant in mainland China; also Sogou, 360 Search",
                "en": "Google (google.com) -- global default",
            },
        )
        # Check CJK URL encoding issues
        try:
            html = await self._fetch_text(url)
            if html:
                # Find links with non-ASCII characters in URLs
                href_pattern = re.compile(r'href=["\']([^"\']+)["\']', re.IGNORECASE)
                for match in href_pattern.finditer(html):
                    href = match.group(1)
                    # Check for unencoded CJK characters in URLs
                    if RE_HANGUL.search(href):
                        expansion.cjk_url_issues.append(
                            f"Unencoded Korean characters in URL: {href[:100]}"
                        )
                    if RE_CJK_UNIFIED.search(href):
                        expansion.cjk_url_issues.append(
                            f"Unencoded CJK characters in URL: {href[:100]}"
                        )
                    if RE_HIRAGANA.search(href) or RE_KATAKANA.search(href):
                        expansion.cjk_url_issues.append(
                            f"Unencoded Japanese characters in URL: {href[:100]}"
                        )
                # De-duplicate
                expansion.cjk_url_issues = list(dict.fromkeys(expansion.cjk_url_issues))[:20]
        except Exception as exc:
            logger.warning("CJK URL check failed: %s", exc)
        # Priority recommendations
        expansion.priority_recommendations = [
            "Implement hreflang tags linking ko, ja, zh, and en versions with x-default",
            "Use subdirectories (/ko/, /ja/, /zh/, /en/) for cost-effective international structure",
            "Create a Naver Search Advisor (searchadvisor.naver.com) account for Korean SEO",
            "Submit sitemap to Baidu Webmaster Tools for Chinese market visibility",
            "Register with Yahoo Japan Search Console (optional but recommended for Japan)",
            "Ensure all URLs use percent-encoded paths -- avoid raw CJK characters in hrefs",
            "Provide a language/region selector accessible from every page",
            "Localise content beyond translation: currency, date formats, cultural references",
            "Consider separate social media strategies per market (KakaoTalk for KR, LINE for JP, WeChat for CN)",
            "Monitor Core Web Vitals per region -- CDN edge presence matters for CJK markets",
        ]
        return expansion

    # ------------------------------------------------------------------
    # Issue collection
    # ------------------------------------------------------------------
    def _collect_issues(self, result: InternationalAuditResult) -> None:
        """Populate issues and recommendations from audit sub-results."""
        # URL structure issues
        if result.url_structure:
            if result.url_structure.pattern == "parameter":
                result.issues.append(
                    "URL parameters used for language targeting -- not recommended"
                )
            if result.url_structure.pattern == "unknown":
                result.issues.append(
                    "Could not detect a clear international URL structure"
                )
            if len(result.url_structure.languages_found) <= 1:
                result.issues.append(
                    "Only one language version detected -- consider expanding"
                )
        # Content parity issues
        if result.content_parity:
            primary_count = max(
                (e.page_count for e in result.content_parity.values()), default=0,
            )
            for lang, entry in result.content_parity.items():
                if entry.missing_pages:
                    result.issues.append(
                        f"Language '{lang}' is missing key pages: "
                        f"{', '.join(entry.missing_pages)}"
                    )
                if primary_count > 0 and entry.page_count < primary_count * 0.5:
                    result.issues.append(
                        f"Language '{lang}' has significantly fewer pages "
                        f"({entry.page_count}) than the primary version ({primary_count})"
                    )
        # Language detection issues
        for det in result.language_detection_issues:
            result.issues.append(
                f"Language inconsistency on {det.page_url}: "
                f"declared={det.declared_lang}, html_lang={det.html_lang_attr}, "
                f"detected={det.detected_lang}"
            )
        # Redirect logic issues
        if result.redirect_logic:
            if result.redirect_logic.is_forced:
                result.issues.append(
                    "Forced language/region redirects detected -- "
                    "this can block search engine crawling"
                )
            if result.redirect_logic.has_ip_redirect:
                result.issues.append(
                    "IP-based redirect logic detected -- "
                    "ensure Googlebot can access all versions"
                )
        # Check for x-default in hreflang. _extract_hreflang_langs() keeps
        # "x-default" in languages_found when present, so no refetch is needed.
        has_xdefault = False
        if result.url_structure and result.url_structure.languages_found:
            if "x-default" in result.url_structure.languages_found:
                has_xdefault = True
                result.url_structure.languages_found.remove("x-default")
        if not has_xdefault and len(result.languages_detected) > 1:
            result.issues.append(
                "No x-default hreflang tag detected -- "
                "add x-default to specify the fallback page"
            )
        # Recommendations
        if result.issues:
            result.recommendations.append(
                "Address the issues listed above to improve international SEO health"
            )
        if result.url_structure and result.url_structure.recommendation:
            result.recommendations.append(result.url_structure.recommendation)
        if result.redirect_logic and result.redirect_logic.recommendation:
            result.recommendations.append(result.redirect_logic.recommendation)
        if result.korean_expansion:
            result.recommendations.extend(
                result.korean_expansion.priority_recommendations[:5]
            )

    # ------------------------------------------------------------------
    # Score calculation
    # ------------------------------------------------------------------
    def _calculate_score(self, result: InternationalAuditResult) -> int:
        """Calculate an overall international SEO score (0-100)."""
        score = 100
        # URL structure penalties
        if result.url_structure:
            if result.url_structure.pattern == "parameter":
                score -= 15
            elif result.url_structure.pattern == "unknown":
                score -= 10
            if len(result.url_structure.languages_found) <= 1:
                score -= 5
        # Content parity penalties
        if result.content_parity:
            for entry in result.content_parity.values():
                score -= 5 * len(entry.missing_pages)
        # Language detection penalties
        score -= 10 * len(result.language_detection_issues)
        # Redirect logic penalties
        if result.redirect_logic:
            if result.redirect_logic.is_forced:
                score -= 15
        # No x-default penalty
        for issue in result.issues:
            if "x-default" in issue:
                score -= 10
                break
        # CJK encoding issues
        if result.korean_expansion:
            cjk_count = len(result.korean_expansion.cjk_url_issues)
            score -= 5 * min(cjk_count, 5)  # cap at 25 points
        return max(0, score)
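
    # Worked example (hypothetical site): URL parameters (-15), a forced 302
    # redirect (-15), two language-detection inconsistencies (-20) and a
    # missing x-default tag (-10) give 100 - 60 = 40.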

    # ------------------------------------------------------------------
    # HTML / hreflang helpers
    # ------------------------------------------------------------------
    async def _fetch_text(self, url: str) -> str | None:
        """Fetch URL content as text."""
        try:
            async with self._session.get(
                url, allow_redirects=True, timeout=self.timeout,
            ) as resp:
                if resp.status == 200:
                    return await resp.text(errors="replace")
        except Exception as exc:
            logger.warning("Failed to fetch %s: %s", url, exc)
        return None

    def _extract_hreflang_langs(self, html: str) -> list[str]:
        """Extract language codes from hreflang link tags."""
        pattern = re.compile(
            r'<link[^>]*\brel=["\']alternate["\'][^>]*\bhreflang=["\']([^"\']+)["\']',
            re.IGNORECASE,
        )
        return [m.group(1).lower() for m in pattern.finditer(html)]
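
    # The hreflang helpers target standard annotations such as (illustrative
    # markup, not from a real page):
    #   <link rel="alternate" hreflang="ko" href="https://example.com/ko/">
    #   <link rel="alternate" hreflang="x-default" href="https://example.com/">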

    def _extract_hreflang_hrefs(self, html: str) -> list[str]:
        """Extract href values from hreflang link tags."""
        pattern = re.compile(
            r'<link[^>]*\brel=["\']alternate["\'][^>]*\bhref=["\']([^"\']+)["\']',
            re.IGNORECASE,
        )
        return [m.group(1) for m in pattern.finditer(html)]

    def _extract_hreflang_map(self, html: str) -> dict[str, str]:
        """Extract a mapping of hreflang language code to href."""
        result: dict[str, str] = {}
        # Match both attribute orderings
        pattern1 = re.compile(
            r'<link[^>]*\brel=["\']alternate["\']'
            r'[^>]*\bhreflang=["\']([^"\']+)["\']'
            r'[^>]*\bhref=["\']([^"\']+)["\']',
            re.IGNORECASE,
        )
        pattern2 = re.compile(
            r'<link[^>]*\bhreflang=["\']([^"\']+)["\']'
            r'[^>]*\bhref=["\']([^"\']+)["\']'
            r'[^>]*\brel=["\']alternate["\']',
            re.IGNORECASE,
        )
        for m in pattern1.finditer(html):
            result[m.group(1).lower()] = m.group(2)
        for m in pattern2.finditer(html):
            result[m.group(1).lower()] = m.group(2)
        return result

    def _detect_pattern_from_hrefs(self, hrefs: list[str]) -> str:
        """Guess the URL structure pattern from a list of hreflang hrefs."""
        for href in hrefs:
            parsed = urlparse(href)
            # ccTLD check
            for tld in CCTLD_MAP:
                if parsed.netloc.endswith(tld):
                    return "cctld"
            # subdomain check
            if LANG_SUBDOMAIN_PATTERN.match(parsed.netloc):
                return "subdomain"
            # subdirectory check
            if LANG_SUBDIRECTORY_PATTERN.match(parsed.path):
                return "subdirectory"
            # parameter check
            qs = parse_qs(parsed.query)
            for param in ("lang", "language", "hl", "locale", "lg"):
                if param in qs:
                    return "parameter"
        return "unknown"


# ---------------------------------------------------------------------------
# Serialisation helper
# ---------------------------------------------------------------------------
def _result_to_dict(result: InternationalAuditResult) -> dict:
    """Convert the audit result dataclass tree to a plain dict."""
    # asdict() already recurses into nested dataclasses (including the
    # content_parity mapping), so no extra flattening pass is needed.
    return asdict(result)


def _print_text_report(result: InternationalAuditResult) -> None:
    """Print a human-readable text report to stdout."""
    print("=" * 70)
    print(" International SEO Audit Report")
    print("=" * 70)
    print(f" URL: {result.url}")
    print(f" Timestamp: {result.timestamp}")
    print(f" Score: {result.score}/100")
    print("=" * 70)
    # URL Structure
    if result.url_structure:
        print("\n--- URL Structure ---")
        print(f" Pattern: {result.url_structure.pattern}")
        print(f" Languages: {', '.join(result.url_structure.languages_found)}")
        if result.url_structure.examples:
            print(f" Examples: {', '.join(result.url_structure.examples)}")
        print(f" Recommendation: {result.url_structure.recommendation}")
    # Languages detected
    if result.languages_detected:
        print("\n--- Languages Detected ---")
        print(f" {', '.join(result.languages_detected)}")
    # Content parity
    if result.content_parity:
        print("\n--- Content Parity ---")
        for lang, entry in result.content_parity.items():
            missing = ", ".join(entry.missing_pages) if entry.missing_pages else "none"
            present = ", ".join(entry.key_pages_present) if entry.key_pages_present else "none"
            print(f" [{lang}] pages={entry.page_count} freshness={entry.freshness_score:.0f}")
            print(f" key pages present: {present}")
            print(f" missing: {missing}")
    # Language detection issues
    if result.language_detection_issues:
        print("\n--- Language Detection Issues ---")
        for det in result.language_detection_issues:
            print(f" {det.page_url}")
            print(f" declared={det.declared_lang} html_lang={det.html_lang_attr} "
                  f"header={det.content_language_header} detected={det.detected_lang}")
    # Redirect logic
    if result.redirect_logic:
        print("\n--- Redirect Logic ---")
        print(f" Language redirect: {result.redirect_logic.has_language_redirect}")
        print(f" IP redirect: {result.redirect_logic.has_ip_redirect}")
        print(f" Forced: {result.redirect_logic.is_forced}")
        print(f" Type: {result.redirect_logic.redirect_type or 'none'}")
        print(f" Recommendation: {result.redirect_logic.recommendation}")
    # Korean expansion
    if result.korean_expansion:
        print("\n--- Korean Expansion ---")
        print(f" Target markets: {', '.join(result.korean_expansion.target_markets)}")
        if result.korean_expansion.cjk_url_issues:
            print(f" CJK URL issues: {len(result.korean_expansion.cjk_url_issues)}")
            for issue in result.korean_expansion.cjk_url_issues[:5]:
                print(f" - {issue}")
        print(" Regional search engines:")
        for region, engine in result.korean_expansion.regional_search_engines.items():
            print(f" [{region}] {engine}")
        if result.korean_expansion.priority_recommendations:
            print(" Priority recommendations:")
            for rec in result.korean_expansion.priority_recommendations[:5]:
                print(f" - {rec}")
    # Issues
    if result.issues:
        print("\n--- Issues ---")
        for issue in result.issues:
            print(f" - {issue}")
    # Recommendations
    if result.recommendations:
        print("\n--- Recommendations ---")
        for rec in result.recommendations:
            print(f" - {rec}")
    print("\n" + "=" * 70)


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def build_parser() -> argparse.ArgumentParser:
    """Build the argument parser."""
    parser = argparse.ArgumentParser(
        description="International SEO Auditor -- analyse multi-language site implementation",
    )
    parser.add_argument(
        "--url",
        required=True,
        help="Target URL to audit",
    )
    parser.add_argument(
        "--scope",
        choices=["structure", "parity", "detection", "redirects", "all"],
        default="all",
        help="Audit scope (default: all)",
    )
    parser.add_argument(
        "--korean-expansion",
        action="store_true",
        default=False,
        help="Enable Korean expansion analysis",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        dest="json_output",
        default=False,
        help="Output results as JSON",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help="Write output to file instead of stdout",
    )
    return parser


async def async_main(args: argparse.Namespace) -> None:
    """Async entry point."""
    auditor = InternationalAuditor(
        url=args.url,
        korean_expansion=args.korean_expansion,
        scope=args.scope,
    )
    result = await auditor.audit()
    if args.json_output:
        output = json.dumps(
            _result_to_dict(result),
            indent=2,
            ensure_ascii=False,
            default=str,
        )
    else:
        # For file output in text mode, capture via print; for stdout, print directly.
        if args.output:
            import io

            buf = io.StringIO()
            _old_stdout = sys.stdout
            sys.stdout = buf
            try:
                _print_text_report(result)
            finally:
                # Always restore stdout, even if report printing fails.
                sys.stdout = _old_stdout
            output = buf.getvalue()
        else:
            _print_text_report(result)
            return
    if args.output:
        with open(args.output, "w", encoding="utf-8") as fh:
            fh.write(output)
        logger.info("Output written to %s", args.output)
    else:
        print(output)


def main() -> None:
    """CLI entry point."""
    parser = build_parser()
    args = parser.parse_args()
    asyncio.run(async_main(args))


if __name__ == "__main__":
    main()