12 new skills: Keyword Strategy, SERP Analysis, Position Tracking, Link Building, Content Strategy, E-Commerce SEO, KPI Framework, International SEO, AI Visibility, Knowledge Graph, Competitor Intel, and Crawl Budget. ~20K lines of Python across 25 domain scripts. Updated skill 11 pipeline table and repo CLAUDE.md. Enhanced skill 18 local SEO workflow from jamie.clinic audit. Note: Skill 26 hreflang_validator.py pending (content filter block). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1193 lines
46 KiB
Python
1193 lines
46 KiB
Python
"""
|
|
International SEO Auditor - Multi-language site audit
|
|
=====================================================
|
|
Purpose: Audit international SEO implementation including URL structure, content parity, language detection, and redirect logic
|
|
Python: 3.10+
|
|
Usage: python international_auditor.py --url https://example.com --json
|
|
"""
|
|
|
|
import argparse
import asyncio
import json
import logging
import re
import sys
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from typing import Optional
from urllib.parse import parse_qs, urljoin, urlparse

import aiohttp
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Logging
|
|
# ---------------------------------------------------------------------------
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format="%(asctime)s - %(levelname)s - %(message)s",
|
|
)
|
|
logger = logging.getLogger("international_auditor")
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Constants
|
|
# ---------------------------------------------------------------------------
|
|
# Pages every language version of a site is expected to provide.
KEY_PAGES = ["home", "about", "contact", "products", "services", "blog", "faq"]

# Country-code TLD suffix -> primary language code for that market.
CCTLD_MAP: dict[str, str] = {
    ".kr": "ko",
    ".jp": "ja",
    ".cn": "zh",
    ".de": "de",
    ".fr": "fr",
    ".es": "es",
    ".it": "it",
    ".pt": "pt",
    ".nl": "nl",
    ".ru": "ru",
    ".br": "pt",
    ".mx": "es",
    ".co.uk": "en",
    ".ca": "en",
    ".au": "en",
    ".in": "en",
    ".co.kr": "ko",
    ".co.jp": "ja",
    ".com.cn": "zh",
}

# Leading path segment that looks like a language code: /ko/ or /ko-kr/.
LANG_SUBDIRECTORY_PATTERN = re.compile(
    r"^/(?P<lang>[a-z]{2}(?:-[a-z]{2})?)(?:/|$)", re.IGNORECASE,
)

# Host prefix that looks like a language code: ko.example.com.
LANG_SUBDOMAIN_PATTERN = re.compile(
    r"^(?P<lang>[a-z]{2}(?:-[a-z]{2})?)\.", re.IGNORECASE,
)

# Accept-Language headers sent when probing server-side language negotiation.
ACCEPT_LANGUAGE_HEADERS = [
    {"Accept-Language": "ko-KR,ko;q=0.9"},
    {"Accept-Language": "en-US,en;q=0.9"},
    {"Accept-Language": "ja-JP,ja;q=0.9"},
    {"Accept-Language": "zh-CN,zh;q=0.9"},
]
|
|
|
|
# Trade-off notes per international URL pattern; attached verbatim to the
# URLStructureAnalysis result so reports can show pros/cons for the
# detected pattern.
URL_STRUCTURE_INFO: dict[str, dict] = {
    # Country-code top-level domains (example.kr, example.fr).
    "cctld": {
        "pros": [
            "Strong geo-targeting signal to search engines",
            "Clear country association for users",
            "Independent domain authority per country",
        ],
        "cons": [
            "Expensive to acquire and maintain multiple domains",
            "Domain authority is not shared across properties",
            "Complex infrastructure management",
        ],
    },
    # Language subdomains (ko.example.com).
    "subdomain": {
        "pros": [
            "Easy to set up on a single domain",
            "Can be hosted on different servers/locations",
            "Clear separation of language versions",
        ],
        "cons": [
            "Subdomains may be treated as separate sites by search engines",
            "Limited geo-targeting signal compared to ccTLD",
            "Domain authority dilution across subdomains",
        ],
    },
    # Language subdirectories (example.com/ko/).
    "subdirectory": {
        "pros": [
            "All content inherits root domain authority",
            "Simplest infrastructure and lowest cost",
            "Best for consolidating SEO signals",
        ],
        "cons": [
            "Server must handle all language versions",
            "Geo-targeting relies on Search Console settings and hreflang",
            "Less clear country targeting than ccTLD",
        ],
    },
    # Query parameters (example.com?lang=ko) -- discouraged.
    "parameter": {
        "pros": [
            "Easy to implement technically",
        ],
        "cons": [
            "Not recommended by Google",
            "URL parameters can be ignored by crawlers",
            "Poor user experience and shareability",
            "No geo-targeting signal",
        ],
    },
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Unicode range helpers
|
|
# ---------------------------------------------------------------------------
|
|
# Unicode script ranges used for heuristic language detection.
RE_HANGUL = re.compile(r"[\uAC00-\uD7AF]")          # Korean Hangul syllables
RE_HIRAGANA = re.compile(r"[\u3040-\u309F]")        # Japanese hiragana
RE_KATAKANA = re.compile(r"[\u30A0-\u30FF]")        # Japanese katakana
RE_CJK_UNIFIED = re.compile(r"[\u4E00-\u9FFF]")     # CJK unified ideographs (hanzi/kanji/hanja)
RE_LATIN_EXTENDED = re.compile(r"[A-Za-z\u00C0-\u024F]")  # ASCII letters + Latin-1/Extended-A/B
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Dataclasses
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@dataclass
class URLStructureAnalysis:
    """How the site encodes language/region in its URLs."""

    # Detected pattern: "cctld", "subdomain", "subdirectory", "parameter",
    # or "unknown" when nothing could be determined.
    pattern: str = ""
    # Sample hosts/paths illustrating the detected pattern.
    examples: list[str] = field(default_factory=list)
    # Language codes seen via the URL pattern and hreflang tags.
    languages_found: list[str] = field(default_factory=list)
    # Pattern-specific guidance for the site owner.
    recommendation: str = ""
    # Advantages/drawbacks of the pattern (copied from URL_STRUCTURE_INFO).
    pros: list[str] = field(default_factory=list)
    cons: list[str] = field(default_factory=list)
|
|
|
|
|
|
@dataclass
class ContentParityEntry:
    """Coverage and freshness of a single language version."""

    # Base language code for this version (e.g. "ko").
    language: str = ""
    # Number of URLs discovered for this language.
    page_count: int = 0
    # KEY_PAGES names that responded successfully under this version.
    key_pages_present: list[str] = field(default_factory=list)
    # 0-100 score derived from the Last-Modified header of the root URL.
    freshness_score: float = 0.0
    # KEY_PAGES names that were not reachable under this version.
    missing_pages: list[str] = field(default_factory=list)
|
|
|
|
|
|
@dataclass
class LanguageDetection:
    """Language-declaration audit for one page."""

    page_url: str = ""
    # Language the URL/site structure claims this page is in.
    declared_lang: str = ""
    # Value of the <html lang="..."> attribute, if any.
    html_lang_attr: str = ""
    # Value of the Content-Language response header, if any.
    content_language_header: str = ""
    # Language guessed from the page text via Unicode script counts.
    detected_lang: str = ""
    # False when any observed signal disagrees with declared_lang.
    is_consistent: bool = True
|
|
|
|
|
|
@dataclass
class RedirectLogic:
    """Findings about international redirect behaviour."""

    # True when geo/IP redirect hints were found in the page source.
    has_ip_redirect: bool = False
    # True when responses vary with the Accept-Language header.
    has_language_redirect: bool = False
    # True when the server force-redirects (HTTP 301/302) by language.
    is_forced: bool = False
    # "301" / "302" / "js" / "" when no redirect mechanism was seen.
    redirect_type: str = ""
    # Best-practice advice matching the observed behaviour.
    recommendation: str = ""
|
|
|
|
|
|
@dataclass
class KoreanExpansion:
    """Korean-market expansion analysis results."""

    # The site's primary language; this analysis assumes Korean.
    primary_lang: str = "ko"
    # Language codes of suggested expansion markets.
    target_markets: list[str] = field(default_factory=list)
    # Hrefs containing raw (non-percent-encoded) CJK characters.
    cjk_url_issues: list[str] = field(default_factory=list)
    # Market language -> note about the dominant regional search engine.
    regional_search_engines: dict[str, str] = field(default_factory=dict)
    # Ordered list of highest-impact actions for international growth.
    priority_recommendations: list[str] = field(default_factory=list)
|
|
|
|
|
|
@dataclass
class InternationalAuditResult:
    """Aggregated output of the full international SEO audit."""

    # The audited root URL (trailing slash stripped).
    url: str = ""
    # URL pattern analysis; None when that stage was skipped.
    url_structure: Optional[URLStructureAnalysis] = None
    # Base language codes of all discovered language versions.
    languages_detected: list[str] = field(default_factory=list)
    # Per-language parity data keyed by base language code.
    content_parity: dict[str, ContentParityEntry] = field(default_factory=dict)
    # Only the pages whose language signals disagreed.
    language_detection_issues: list[LanguageDetection] = field(default_factory=list)
    # Redirect behaviour findings; None when that stage was skipped.
    redirect_logic: Optional[RedirectLogic] = None
    # Korean-market analysis; None unless explicitly requested.
    korean_expansion: Optional[KoreanExpansion] = None
    # Human-readable problems found across all stages.
    issues: list[str] = field(default_factory=list)
    # Actionable advice derived from the findings.
    recommendations: list[str] = field(default_factory=list)
    # Overall 0-100 health score; 0 when the audit itself failed.
    score: int = 100
    # ISO-8601 timestamp of when the audit started.
    timestamp: str = ""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Auditor
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class InternationalAuditor:
|
|
"""Perform a comprehensive international SEO audit on a target URL."""
|
|
|
|
def __init__(
|
|
self,
|
|
url: str,
|
|
korean_expansion: bool = False,
|
|
scope: str | None = None,
|
|
):
|
|
self.url = url.rstrip("/")
|
|
self.korean_expansion_flag = korean_expansion
|
|
self.scope = scope or "all"
|
|
self.parsed = urlparse(self.url)
|
|
self.base_url = f"{self.parsed.scheme}://{self.parsed.netloc}"
|
|
self.timeout = aiohttp.ClientTimeout(total=30)
|
|
self._session: aiohttp.ClientSession | None = None
|
|
|
|
# ------------------------------------------------------------------
|
|
# Public entry point
|
|
# ------------------------------------------------------------------
|
|
|
|
async def audit(self) -> InternationalAuditResult:
|
|
"""Run the full international SEO audit and return results."""
|
|
result = InternationalAuditResult(
|
|
url=self.url,
|
|
timestamp=datetime.utcnow().isoformat(),
|
|
)
|
|
|
|
async with aiohttp.ClientSession(timeout=self.timeout) as session:
|
|
self._session = session
|
|
|
|
try:
|
|
# 1. URL structure analysis
|
|
if self.scope in ("all", "structure"):
|
|
logger.info("Analysing URL structure ...")
|
|
result.url_structure = await self._analyze_url_structure(self.url)
|
|
|
|
# 2. Discover language versions
|
|
logger.info("Discovering language versions ...")
|
|
lang_versions = await self._discover_language_versions(self.url)
|
|
result.languages_detected = list(lang_versions.keys())
|
|
|
|
# 3. Content parity
|
|
if self.scope in ("all", "parity"):
|
|
logger.info("Auditing content parity ...")
|
|
result.content_parity = await self._audit_content_parity(lang_versions)
|
|
|
|
# 4. Language detection
|
|
if self.scope in ("all", "detection"):
|
|
logger.info("Checking language detection consistency ...")
|
|
all_pages: list[tuple[str, str]] = []
|
|
for lang, urls in lang_versions.items():
|
|
for u in urls[:5]:
|
|
all_pages.append((lang, u))
|
|
detections = await self._check_language_detection(all_pages)
|
|
result.language_detection_issues = [
|
|
d for d in detections if not d.is_consistent
|
|
]
|
|
|
|
# 5. Redirect logic
|
|
if self.scope in ("all", "redirects"):
|
|
logger.info("Auditing redirect logic ...")
|
|
result.redirect_logic = await self._audit_redirect_logic(self.url)
|
|
|
|
# 6. Korean expansion
|
|
if self.korean_expansion_flag:
|
|
logger.info("Analysing Korean expansion opportunities ...")
|
|
result.korean_expansion = await self._analyze_korean_expansion(self.url)
|
|
|
|
# Collect issues and recommendations
|
|
self._collect_issues(result)
|
|
result.score = self._calculate_score(result)
|
|
|
|
except Exception as exc:
|
|
logger.error("Audit failed: %s", exc)
|
|
result.issues.append(f"Audit error: {exc}")
|
|
result.score = 0
|
|
|
|
finally:
|
|
self._session = None
|
|
|
|
return result
|
|
|
|
# ------------------------------------------------------------------
|
|
# URL structure analysis
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _analyze_url_structure(self, url: str) -> URLStructureAnalysis:
|
|
"""Determine the international URL pattern in use."""
|
|
parsed = urlparse(url)
|
|
hostname = parsed.netloc.lower()
|
|
path = parsed.path
|
|
|
|
analysis = URLStructureAnalysis()
|
|
|
|
# Check ccTLD
|
|
for tld, lang in CCTLD_MAP.items():
|
|
if hostname.endswith(tld):
|
|
analysis.pattern = "cctld"
|
|
analysis.examples.append(f"{hostname} -> {lang}")
|
|
analysis.languages_found.append(lang)
|
|
break
|
|
|
|
# Check subdomain pattern
|
|
if not analysis.pattern:
|
|
match = LANG_SUBDOMAIN_PATTERN.match(hostname)
|
|
if match:
|
|
lang_code = match.group("lang").lower()
|
|
if len(lang_code) == 2 and lang_code not in ("ww", "ww"):
|
|
analysis.pattern = "subdomain"
|
|
analysis.examples.append(f"{hostname}")
|
|
analysis.languages_found.append(lang_code)
|
|
|
|
# Check subdirectory pattern
|
|
if not analysis.pattern:
|
|
match = LANG_SUBDIRECTORY_PATTERN.match(path)
|
|
if match:
|
|
lang_code = match.group("lang").lower()
|
|
analysis.pattern = "subdirectory"
|
|
analysis.examples.append(f"{path}")
|
|
analysis.languages_found.append(lang_code)
|
|
|
|
# Check URL parameter
|
|
if not analysis.pattern:
|
|
qs = parse_qs(parsed.query)
|
|
for param in ("lang", "language", "hl", "locale", "lg"):
|
|
if param in qs:
|
|
analysis.pattern = "parameter"
|
|
analysis.examples.append(f"?{param}={qs[param][0]}")
|
|
analysis.languages_found.append(qs[param][0])
|
|
break
|
|
|
|
# Try to discover additional language versions from page HTML
|
|
try:
|
|
html = await self._fetch_text(url)
|
|
if html:
|
|
hreflang_langs = self._extract_hreflang_langs(html)
|
|
for lang in hreflang_langs:
|
|
if lang not in analysis.languages_found:
|
|
analysis.languages_found.append(lang)
|
|
|
|
# Refine pattern from hreflang hrefs if not yet determined
|
|
if not analysis.pattern and hreflang_langs:
|
|
hrefs = self._extract_hreflang_hrefs(html)
|
|
analysis.pattern = self._detect_pattern_from_hrefs(hrefs)
|
|
except Exception as exc:
|
|
logger.warning("Could not fetch page for structure analysis: %s", exc)
|
|
|
|
if not analysis.pattern:
|
|
analysis.pattern = "unknown"
|
|
|
|
# Attach pros/cons/recommendation
|
|
info = URL_STRUCTURE_INFO.get(analysis.pattern, {})
|
|
analysis.pros = info.get("pros", [])
|
|
analysis.cons = info.get("cons", [])
|
|
|
|
if analysis.pattern == "parameter":
|
|
analysis.recommendation = (
|
|
"URL parameters are not recommended for international targeting. "
|
|
"Consider migrating to subdirectories or subdomains."
|
|
)
|
|
elif analysis.pattern == "cctld":
|
|
analysis.recommendation = (
|
|
"ccTLDs provide the strongest geo-targeting signal. "
|
|
"Ensure each domain has sufficient content and backlinks."
|
|
)
|
|
elif analysis.pattern == "subdomain":
|
|
analysis.recommendation = (
|
|
"Subdomains can work well but ensure hreflang tags are properly "
|
|
"implemented to signal relationships between language versions."
|
|
)
|
|
elif analysis.pattern == "subdirectory":
|
|
analysis.recommendation = (
|
|
"Subdirectories are the most cost-effective approach and consolidate "
|
|
"domain authority. This is the recommended pattern for most sites."
|
|
)
|
|
else:
|
|
analysis.recommendation = (
|
|
"Could not determine the URL structure pattern. "
|
|
"Implement a clear international URL strategy using subdirectories, "
|
|
"subdomains, or ccTLDs with proper hreflang tags."
|
|
)
|
|
|
|
return analysis
|
|
|
|
# ------------------------------------------------------------------
|
|
# Discover language versions
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _discover_language_versions(self, url: str) -> dict[str, list[str]]:
|
|
"""Discover language versions from hreflang tags and common patterns."""
|
|
lang_versions: dict[str, list[str]] = {}
|
|
|
|
try:
|
|
html = await self._fetch_text(url)
|
|
except Exception:
|
|
html = None
|
|
|
|
# 1. Extract from hreflang tags
|
|
if html:
|
|
hreflang_map = self._extract_hreflang_map(html)
|
|
for lang, href in hreflang_map.items():
|
|
if lang == "x-default":
|
|
continue
|
|
lang_key = lang.split("-")[0].lower()
|
|
lang_versions.setdefault(lang_key, []).append(href)
|
|
|
|
# 2. Probe common subdirectory patterns
|
|
common_langs = ["en", "ko", "ja", "zh", "de", "fr", "es", "pt"]
|
|
probe_tasks = []
|
|
for lang in common_langs:
|
|
if lang not in lang_versions:
|
|
candidate = f"{self.base_url}/{lang}/"
|
|
probe_tasks.append(self._probe_url(lang, candidate))
|
|
|
|
if probe_tasks:
|
|
results = await asyncio.gather(*probe_tasks, return_exceptions=True)
|
|
for item in results:
|
|
if isinstance(item, tuple):
|
|
lang, probe_url = item
|
|
lang_versions.setdefault(lang, []).append(probe_url)
|
|
|
|
# 3. Probe common subdomain patterns
|
|
domain_parts = self.parsed.netloc.split(".")
|
|
if len(domain_parts) >= 2:
|
|
root_domain = ".".join(domain_parts[-2:])
|
|
subdomain_tasks = []
|
|
for lang in common_langs:
|
|
if lang not in lang_versions:
|
|
candidate = f"{self.parsed.scheme}://{lang}.{root_domain}/"
|
|
subdomain_tasks.append(self._probe_url(lang, candidate))
|
|
if subdomain_tasks:
|
|
results = await asyncio.gather(*subdomain_tasks, return_exceptions=True)
|
|
for item in results:
|
|
if isinstance(item, tuple):
|
|
lang, probe_url = item
|
|
lang_versions.setdefault(lang, []).append(probe_url)
|
|
|
|
# Ensure the original URL is listed under some language
|
|
if not lang_versions:
|
|
detected = "unknown"
|
|
if html:
|
|
detected = self._detect_language_from_content(html) or "unknown"
|
|
lang_versions[detected] = [url]
|
|
|
|
return lang_versions
|
|
|
|
async def _probe_url(self, lang: str, url: str) -> tuple[str, str] | None:
|
|
"""Check if a URL returns 200 and return (lang, url) or raise."""
|
|
try:
|
|
async with self._session.head(
|
|
url, allow_redirects=True, timeout=self.timeout,
|
|
) as resp:
|
|
if resp.status == 200:
|
|
return (lang, url)
|
|
except Exception:
|
|
pass
|
|
raise ValueError(f"Probe failed for {url}")
|
|
|
|
# ------------------------------------------------------------------
|
|
# Content parity
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _audit_content_parity(
|
|
self,
|
|
lang_versions: dict[str, list[str]],
|
|
) -> dict[str, ContentParityEntry]:
|
|
"""Audit content parity across discovered language versions."""
|
|
parity: dict[str, ContentParityEntry] = {}
|
|
|
|
for lang, urls in lang_versions.items():
|
|
entry = ContentParityEntry(language=lang, page_count=len(urls))
|
|
|
|
# Determine base URL for this language version
|
|
if urls:
|
|
lang_base = urls[0].rstrip("/")
|
|
present_keys = await self._check_key_pages(lang_base)
|
|
entry.key_pages_present = present_keys
|
|
entry.missing_pages = [
|
|
p for p in KEY_PAGES if p not in present_keys
|
|
]
|
|
|
|
# Freshness score from Last-Modified header on first URL
|
|
entry.freshness_score = await self._get_freshness_score(urls[0])
|
|
|
|
parity[lang] = entry
|
|
|
|
return parity
|
|
|
|
async def _check_key_pages(self, lang_base_url: str) -> list[str]:
|
|
"""Check which key pages exist under a language base URL."""
|
|
present: list[str] = []
|
|
page_paths = {
|
|
"home": "/",
|
|
"about": "/about",
|
|
"contact": "/contact",
|
|
"products": "/products",
|
|
"services": "/services",
|
|
"blog": "/blog",
|
|
"faq": "/faq",
|
|
}
|
|
tasks = []
|
|
for name, path in page_paths.items():
|
|
full_url = lang_base_url.rstrip("/") + path
|
|
tasks.append(self._check_page_exists(name, full_url))
|
|
|
|
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
for item in results:
|
|
if isinstance(item, str):
|
|
present.append(item)
|
|
return present
|
|
|
|
async def _check_page_exists(self, name: str, url: str) -> str | None:
|
|
"""Return the page name if URL returns 200, else raise."""
|
|
try:
|
|
async with self._session.head(
|
|
url, allow_redirects=True, timeout=self.timeout,
|
|
) as resp:
|
|
if resp.status == 200:
|
|
return name
|
|
except Exception:
|
|
pass
|
|
raise ValueError(f"Page not found: {url}")
|
|
|
|
async def _get_freshness_score(self, url: str) -> float:
|
|
"""Compute a freshness score (0-100) based on Last-Modified header."""
|
|
try:
|
|
async with self._session.head(
|
|
url, allow_redirects=True, timeout=self.timeout,
|
|
) as resp:
|
|
last_modified = resp.headers.get("Last-Modified")
|
|
if last_modified:
|
|
from email.utils import parsedate_to_datetime
|
|
mod_dt = parsedate_to_datetime(last_modified)
|
|
age_days = (datetime.utcnow() - mod_dt.replace(tzinfo=None)).days
|
|
if age_days <= 7:
|
|
return 100.0
|
|
elif age_days <= 30:
|
|
return 90.0
|
|
elif age_days <= 90:
|
|
return 70.0
|
|
elif age_days <= 180:
|
|
return 50.0
|
|
elif age_days <= 365:
|
|
return 30.0
|
|
else:
|
|
return 10.0
|
|
# No Last-Modified header -- neutral score
|
|
return 50.0
|
|
except Exception:
|
|
return 0.0
|
|
|
|
# ------------------------------------------------------------------
|
|
# Language detection
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _check_language_detection(
|
|
self,
|
|
pages: list[tuple[str, str]],
|
|
) -> list[LanguageDetection]:
|
|
"""Check language declaration consistency for a list of pages."""
|
|
tasks = [
|
|
self._detect_single_page(declared_lang, page_url)
|
|
for declared_lang, page_url in pages
|
|
]
|
|
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
detections: list[LanguageDetection] = []
|
|
for r in results:
|
|
if isinstance(r, LanguageDetection):
|
|
detections.append(r)
|
|
return detections
|
|
|
|
async def _detect_single_page(
|
|
self,
|
|
declared_lang: str,
|
|
page_url: str,
|
|
) -> LanguageDetection:
|
|
"""Perform language detection checks on a single page."""
|
|
detection = LanguageDetection(
|
|
page_url=page_url,
|
|
declared_lang=declared_lang,
|
|
)
|
|
|
|
try:
|
|
async with self._session.get(
|
|
page_url, allow_redirects=True, timeout=self.timeout,
|
|
) as resp:
|
|
# Content-Language header
|
|
cl_header = resp.headers.get("Content-Language", "")
|
|
detection.content_language_header = cl_header.strip()
|
|
|
|
html = await resp.text(errors="replace")
|
|
|
|
# HTML lang attribute
|
|
lang_match = re.search(
|
|
r'<html[^>]*\slang=["\']([^"\']+)["\']',
|
|
html,
|
|
re.IGNORECASE,
|
|
)
|
|
if lang_match:
|
|
detection.html_lang_attr = lang_match.group(1).strip()
|
|
|
|
# Detect from content
|
|
detection.detected_lang = self._detect_language_from_content(html)
|
|
|
|
# Consistency check
|
|
expected = declared_lang.split("-")[0].lower()
|
|
html_lang_base = detection.html_lang_attr.split("-")[0].lower() if detection.html_lang_attr else ""
|
|
cl_base = detection.content_language_header.split("-")[0].lower() if detection.content_language_header else ""
|
|
detected_base = detection.detected_lang.split("-")[0].lower() if detection.detected_lang else ""
|
|
|
|
inconsistencies = []
|
|
if html_lang_base and html_lang_base != expected:
|
|
inconsistencies.append("html_lang_attr")
|
|
if cl_base and cl_base != expected:
|
|
inconsistencies.append("content_language_header")
|
|
if detected_base and detected_base != expected:
|
|
inconsistencies.append("detected_content")
|
|
|
|
detection.is_consistent = len(inconsistencies) == 0
|
|
|
|
except Exception as exc:
|
|
logger.warning("Language detection failed for %s: %s", page_url, exc)
|
|
detection.is_consistent = False
|
|
|
|
return detection
|
|
|
|
def _detect_language_from_content(self, html: str) -> str:
|
|
"""Simple heuristic language detection using Unicode ranges."""
|
|
# Strip HTML tags for content analysis
|
|
text = re.sub(r"<[^>]+>", " ", html)
|
|
text = re.sub(r"\s+", " ", text)[:5000] # limit analysis scope
|
|
|
|
counts = {
|
|
"ko": len(RE_HANGUL.findall(text)),
|
|
"ja": len(RE_HIRAGANA.findall(text)) + len(RE_KATAKANA.findall(text)),
|
|
"zh": len(RE_CJK_UNIFIED.findall(text)),
|
|
"en": len(RE_LATIN_EXTENDED.findall(text)),
|
|
}
|
|
|
|
# Japanese text also contains CJK unified characters (kanji).
|
|
# If both hiragana/katakana AND CJK are present, prefer Japanese.
|
|
if counts["ja"] > 0 and counts["zh"] > 0:
|
|
if counts["ja"] > counts["zh"] * 0.1:
|
|
counts["zh"] = 0 # attribute CJK chars to Japanese context
|
|
|
|
# Korean text can contain some CJK (hanja) but Hangul dominates.
|
|
if counts["ko"] > 0 and counts["zh"] > 0:
|
|
if counts["ko"] > counts["zh"]:
|
|
counts["zh"] = 0
|
|
|
|
if not any(counts.values()):
|
|
return "unknown"
|
|
|
|
dominant = max(counts, key=counts.get)
|
|
|
|
# Require a meaningful amount of the dominant script
|
|
if counts[dominant] < 20:
|
|
return "unknown"
|
|
|
|
return dominant
|
|
|
|
# ------------------------------------------------------------------
|
|
# Redirect logic
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _audit_redirect_logic(self, url: str) -> RedirectLogic:
|
|
"""Audit redirect behaviour for different Accept-Language headers."""
|
|
logic = RedirectLogic()
|
|
redirect_targets: dict[str, str] = {}
|
|
|
|
for headers in ACCEPT_LANGUAGE_HEADERS:
|
|
try:
|
|
async with self._session.get(
|
|
url,
|
|
allow_redirects=False,
|
|
headers=headers,
|
|
timeout=self.timeout,
|
|
) as resp:
|
|
lang_key = list(headers.values())[0].split(",")[0]
|
|
if resp.status in (301, 302, 303, 307, 308):
|
|
location = resp.headers.get("Location", "")
|
|
redirect_targets[lang_key] = location
|
|
logic.has_language_redirect = True
|
|
if resp.status in (301, 302):
|
|
logic.redirect_type = str(resp.status)
|
|
logic.is_forced = True
|
|
else:
|
|
redirect_targets[lang_key] = str(resp.url)
|
|
except Exception as exc:
|
|
logger.warning("Redirect check failed for %s: %s", headers, exc)
|
|
|
|
# Detect IP-based redirects by comparing redirect targets
|
|
unique_targets = set(redirect_targets.values())
|
|
if len(unique_targets) > 1:
|
|
logic.has_language_redirect = True
|
|
|
|
# Check for JS-based redirects in page source
|
|
try:
|
|
html = await self._fetch_text(url)
|
|
if html:
|
|
js_redirect_patterns = [
|
|
r"navigator\.language",
|
|
r"navigator\.languages",
|
|
r"window\.location\s*=",
|
|
r"location\.href\s*=",
|
|
r"geo[_-]?redirect",
|
|
r"ip[_-]?redirect",
|
|
]
|
|
for pattern in js_redirect_patterns:
|
|
if re.search(pattern, html, re.IGNORECASE):
|
|
if "geo" in pattern or "ip" in pattern:
|
|
logic.has_ip_redirect = True
|
|
if not logic.redirect_type:
|
|
logic.redirect_type = "js"
|
|
break
|
|
except Exception:
|
|
pass
|
|
|
|
# Recommendation
|
|
if logic.is_forced:
|
|
logic.recommendation = (
|
|
"Forced language/region redirects detected (HTTP {}). "
|
|
"Best practice: suggest language via a banner or interstitial "
|
|
"rather than forcing a redirect. Forced redirects can prevent "
|
|
"search engines from crawling all language versions and frustrate "
|
|
"users who prefer a different language."
|
|
).format(logic.redirect_type)
|
|
elif logic.has_language_redirect:
|
|
logic.recommendation = (
|
|
"Language-based content negotiation detected. Ensure all language "
|
|
"versions remain directly accessible via their canonical URLs and "
|
|
"that Googlebot can crawl every version without redirect loops."
|
|
)
|
|
else:
|
|
logic.recommendation = (
|
|
"No forced language redirects detected. Consider adding a "
|
|
"language suggestion banner for users whose browser language "
|
|
"differs from the page language."
|
|
)
|
|
|
|
return logic
|
|
|
|
# ------------------------------------------------------------------
|
|
# Korean expansion analysis
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _analyze_korean_expansion(self, url: str) -> KoreanExpansion:
|
|
"""Analyse expansion opportunities for Korean-primary sites."""
|
|
expansion = KoreanExpansion(
|
|
primary_lang="ko",
|
|
target_markets=["ja", "zh", "en"],
|
|
regional_search_engines={
|
|
"ko": "Naver (naver.com) -- dominant in Korea; also consider Daum",
|
|
"ja": "Yahoo Japan (yahoo.co.jp) -- significant market share alongside Google",
|
|
"zh": "Baidu (baidu.com) -- dominant in mainland China; also Sogou, 360 Search",
|
|
"en": "Google (google.com) -- global default",
|
|
},
|
|
)
|
|
|
|
# Check CJK URL encoding issues
|
|
try:
|
|
html = await self._fetch_text(url)
|
|
if html:
|
|
# Find links with non-ASCII characters in URLs
|
|
href_pattern = re.compile(r'href=["\']([^"\']+)["\']', re.IGNORECASE)
|
|
for match in href_pattern.finditer(html):
|
|
href = match.group(1)
|
|
# Check for unencoded CJK characters in URLs
|
|
if RE_HANGUL.search(href):
|
|
expansion.cjk_url_issues.append(
|
|
f"Unencoded Korean characters in URL: {href[:100]}"
|
|
)
|
|
if RE_CJK_UNIFIED.search(href):
|
|
expansion.cjk_url_issues.append(
|
|
f"Unencoded CJK characters in URL: {href[:100]}"
|
|
)
|
|
if RE_HIRAGANA.search(href) or RE_KATAKANA.search(href):
|
|
expansion.cjk_url_issues.append(
|
|
f"Unencoded Japanese characters in URL: {href[:100]}"
|
|
)
|
|
|
|
# De-duplicate
|
|
expansion.cjk_url_issues = list(dict.fromkeys(expansion.cjk_url_issues))[:20]
|
|
except Exception as exc:
|
|
logger.warning("CJK URL check failed: %s", exc)
|
|
|
|
# Priority recommendations
|
|
expansion.priority_recommendations = [
|
|
"Implement hreflang tags linking ko, ja, zh, and en versions with x-default",
|
|
"Use subdirectories (/ko/, /ja/, /zh/, /en/) for cost-effective international structure",
|
|
"Create a Naver Search Advisor (searchadvisor.naver.com) account for Korean SEO",
|
|
"Submit sitemap to Baidu Webmaster Tools for Chinese market visibility",
|
|
"Register with Yahoo Japan Search Console (optional but recommended for Japan)",
|
|
"Ensure all URLs use percent-encoded paths -- avoid raw CJK characters in hrefs",
|
|
"Provide a language/region selector accessible from every page",
|
|
"Localise content beyond translation: currency, date formats, cultural references",
|
|
"Consider separate social media strategies per market (KakaoTalk for KR, LINE for JP, WeChat for CN)",
|
|
"Monitor Core Web Vitals per region -- CDN edge presence matters for CJK markets",
|
|
]
|
|
|
|
return expansion
|
|
|
|
# ------------------------------------------------------------------
|
|
# Issue collection
|
|
# ------------------------------------------------------------------
|
|
|
|
def _collect_issues(self, result: InternationalAuditResult) -> None:
|
|
"""Populate issues and recommendations from audit sub-results."""
|
|
# URL structure issues
|
|
if result.url_structure:
|
|
if result.url_structure.pattern == "parameter":
|
|
result.issues.append(
|
|
"URL parameters used for language targeting -- not recommended"
|
|
)
|
|
if result.url_structure.pattern == "unknown":
|
|
result.issues.append(
|
|
"Could not detect a clear international URL structure"
|
|
)
|
|
if len(result.url_structure.languages_found) <= 1:
|
|
result.issues.append(
|
|
"Only one language version detected -- consider expanding"
|
|
)
|
|
|
|
# Content parity issues
|
|
if result.content_parity:
|
|
primary_count = max(
|
|
(e.page_count for e in result.content_parity.values()), default=0,
|
|
)
|
|
for lang, entry in result.content_parity.items():
|
|
if entry.missing_pages:
|
|
result.issues.append(
|
|
f"Language '{lang}' is missing key pages: "
|
|
f"{', '.join(entry.missing_pages)}"
|
|
)
|
|
if primary_count > 0 and entry.page_count < primary_count * 0.5:
|
|
result.issues.append(
|
|
f"Language '{lang}' has significantly fewer pages "
|
|
f"({entry.page_count}) than the primary version ({primary_count})"
|
|
)
|
|
|
|
# Language detection issues
|
|
for det in result.language_detection_issues:
|
|
result.issues.append(
|
|
f"Language inconsistency on {det.page_url}: "
|
|
f"declared={det.declared_lang}, html_lang={det.html_lang_attr}, "
|
|
f"detected={det.detected_lang}"
|
|
)
|
|
|
|
# Redirect logic issues
|
|
if result.redirect_logic:
|
|
if result.redirect_logic.is_forced:
|
|
result.issues.append(
|
|
"Forced language/region redirects detected -- "
|
|
"this can block search engine crawling"
|
|
)
|
|
if result.redirect_logic.has_ip_redirect:
|
|
result.issues.append(
|
|
"IP-based redirect logic detected -- "
|
|
"ensure Googlebot can access all versions"
|
|
)
|
|
|
|
# Check for x-default in hreflang
|
|
has_xdefault = False
|
|
if result.url_structure and result.url_structure.languages_found:
|
|
try:
|
|
html = None # already fetched; recheck from structure data
|
|
# We check for x-default in languages_found (hreflang extraction adds it)
|
|
if "x-default" in result.url_structure.languages_found:
|
|
has_xdefault = True
|
|
result.url_structure.languages_found.remove("x-default")
|
|
except Exception:
|
|
pass
|
|
if not has_xdefault and len(result.languages_detected) > 1:
|
|
result.issues.append(
|
|
"No x-default hreflang tag detected -- "
|
|
"add x-default to specify the fallback page"
|
|
)
|
|
|
|
# Recommendations
|
|
if result.issues:
|
|
result.recommendations.append(
|
|
"Address the issues listed above to improve international SEO health"
|
|
)
|
|
if result.url_structure and result.url_structure.recommendation:
|
|
result.recommendations.append(result.url_structure.recommendation)
|
|
if result.redirect_logic and result.redirect_logic.recommendation:
|
|
result.recommendations.append(result.redirect_logic.recommendation)
|
|
if result.korean_expansion:
|
|
result.recommendations.extend(
|
|
result.korean_expansion.priority_recommendations[:5]
|
|
)
|
|
|
|
# ------------------------------------------------------------------
|
|
# Score calculation
|
|
# ------------------------------------------------------------------
|
|
|
|
def _calculate_score(self, result: InternationalAuditResult) -> int:
|
|
"""Calculate an overall international SEO score (0-100)."""
|
|
score = 100
|
|
|
|
# URL structure penalties
|
|
if result.url_structure:
|
|
if result.url_structure.pattern == "parameter":
|
|
score -= 15
|
|
elif result.url_structure.pattern == "unknown":
|
|
score -= 10
|
|
if len(result.url_structure.languages_found) <= 1:
|
|
score -= 5
|
|
|
|
# Content parity penalties
|
|
if result.content_parity:
|
|
for entry in result.content_parity.values():
|
|
score -= 5 * len(entry.missing_pages)
|
|
|
|
# Language detection penalties
|
|
score -= 10 * len(result.language_detection_issues)
|
|
|
|
# Redirect logic penalties
|
|
if result.redirect_logic:
|
|
if result.redirect_logic.is_forced:
|
|
score -= 15
|
|
|
|
# No x-default penalty
|
|
for issue in result.issues:
|
|
if "x-default" in issue:
|
|
score -= 10
|
|
break
|
|
|
|
# CJK encoding issues
|
|
if result.korean_expansion:
|
|
cjk_count = len(result.korean_expansion.cjk_url_issues)
|
|
score -= 5 * min(cjk_count, 5) # cap at 25 points
|
|
|
|
return max(0, score)
|
|
|
|
# ------------------------------------------------------------------
|
|
# HTML / hreflang helpers
|
|
# ------------------------------------------------------------------
|
|
|
|
async def _fetch_text(self, url: str) -> str | None:
|
|
"""Fetch URL content as text."""
|
|
try:
|
|
async with self._session.get(
|
|
url, allow_redirects=True, timeout=self.timeout,
|
|
) as resp:
|
|
if resp.status == 200:
|
|
return await resp.text(errors="replace")
|
|
except Exception as exc:
|
|
logger.warning("Failed to fetch %s: %s", url, exc)
|
|
return None
|
|
|
|
def _extract_hreflang_langs(self, html: str) -> list[str]:
|
|
"""Extract language codes from hreflang link tags."""
|
|
pattern = re.compile(
|
|
r'<link[^>]*\brel=["\']alternate["\'][^>]*\bhreflang=["\']([^"\']+)["\']',
|
|
re.IGNORECASE,
|
|
)
|
|
return [m.group(1).lower() for m in pattern.finditer(html)]
|
|
|
|
def _extract_hreflang_hrefs(self, html: str) -> list[str]:
|
|
"""Extract href values from hreflang link tags."""
|
|
pattern = re.compile(
|
|
r'<link[^>]*\brel=["\']alternate["\'][^>]*\bhref=["\']([^"\']+)["\']',
|
|
re.IGNORECASE,
|
|
)
|
|
return [m.group(1) for m in pattern.finditer(html)]
|
|
|
|
def _extract_hreflang_map(self, html: str) -> dict[str, str]:
|
|
"""Extract a mapping of hreflang language code to href."""
|
|
result: dict[str, str] = {}
|
|
# Match both attribute orderings
|
|
pattern1 = re.compile(
|
|
r'<link[^>]*\brel=["\']alternate["\']'
|
|
r'[^>]*\bhreflang=["\']([^"\']+)["\']'
|
|
r'[^>]*\bhref=["\']([^"\']+)["\']',
|
|
re.IGNORECASE,
|
|
)
|
|
pattern2 = re.compile(
|
|
r'<link[^>]*\bhreflang=["\']([^"\']+)["\']'
|
|
r'[^>]*\bhref=["\']([^"\']+)["\']'
|
|
r'[^>]*\brel=["\']alternate["\']',
|
|
re.IGNORECASE,
|
|
)
|
|
for m in pattern1.finditer(html):
|
|
result[m.group(1).lower()] = m.group(2)
|
|
for m in pattern2.finditer(html):
|
|
result[m.group(1).lower()] = m.group(2)
|
|
return result
|
|
|
|
def _detect_pattern_from_hrefs(self, hrefs: list[str]) -> str:
|
|
"""Guess the URL structure pattern from a list of hreflang hrefs."""
|
|
for href in hrefs:
|
|
parsed = urlparse(href)
|
|
# ccTLD check
|
|
for tld in CCTLD_MAP:
|
|
if parsed.netloc.endswith(tld):
|
|
return "cctld"
|
|
# subdomain check
|
|
if LANG_SUBDOMAIN_PATTERN.match(parsed.netloc):
|
|
return "subdomain"
|
|
# subdirectory check
|
|
if LANG_SUBDIRECTORY_PATTERN.match(parsed.path):
|
|
return "subdirectory"
|
|
# parameter check
|
|
qs = parse_qs(parsed.query)
|
|
for param in ("lang", "language", "hl", "locale", "lg"):
|
|
if param in qs:
|
|
return "parameter"
|
|
return "unknown"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Serialisation helper
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _result_to_dict(result: InternationalAuditResult) -> dict:
|
|
"""Convert the audit result dataclass tree to a plain dict."""
|
|
data = asdict(result)
|
|
# Flatten content_parity keys
|
|
if "content_parity" in data and isinstance(data["content_parity"], dict):
|
|
cleaned = {}
|
|
for key, value in data["content_parity"].items():
|
|
cleaned[key] = value
|
|
data["content_parity"] = cleaned
|
|
return data
|
|
|
|
|
|
def _print_text_report(result: InternationalAuditResult) -> None:
|
|
"""Print a human-readable text report to stdout."""
|
|
print("=" * 70)
|
|
print(" International SEO Audit Report")
|
|
print("=" * 70)
|
|
print(f" URL: {result.url}")
|
|
print(f" Timestamp: {result.timestamp}")
|
|
print(f" Score: {result.score}/100")
|
|
print("=" * 70)
|
|
|
|
# URL Structure
|
|
if result.url_structure:
|
|
print("\n--- URL Structure ---")
|
|
print(f" Pattern: {result.url_structure.pattern}")
|
|
print(f" Languages: {', '.join(result.url_structure.languages_found)}")
|
|
if result.url_structure.examples:
|
|
print(f" Examples: {', '.join(result.url_structure.examples)}")
|
|
print(f" Recommendation: {result.url_structure.recommendation}")
|
|
|
|
# Languages detected
|
|
if result.languages_detected:
|
|
print(f"\n--- Languages Detected ---")
|
|
print(f" {', '.join(result.languages_detected)}")
|
|
|
|
# Content parity
|
|
if result.content_parity:
|
|
print("\n--- Content Parity ---")
|
|
for lang, entry in result.content_parity.items():
|
|
missing = ", ".join(entry.missing_pages) if entry.missing_pages else "none"
|
|
present = ", ".join(entry.key_pages_present) if entry.key_pages_present else "none"
|
|
print(f" [{lang}] pages={entry.page_count} freshness={entry.freshness_score:.0f}")
|
|
print(f" key pages present: {present}")
|
|
print(f" missing: {missing}")
|
|
|
|
# Language detection issues
|
|
if result.language_detection_issues:
|
|
print("\n--- Language Detection Issues ---")
|
|
for det in result.language_detection_issues:
|
|
print(f" {det.page_url}")
|
|
print(f" declared={det.declared_lang} html_lang={det.html_lang_attr} "
|
|
f"header={det.content_language_header} detected={det.detected_lang}")
|
|
|
|
# Redirect logic
|
|
if result.redirect_logic:
|
|
print("\n--- Redirect Logic ---")
|
|
print(f" Language redirect: {result.redirect_logic.has_language_redirect}")
|
|
print(f" IP redirect: {result.redirect_logic.has_ip_redirect}")
|
|
print(f" Forced: {result.redirect_logic.is_forced}")
|
|
print(f" Type: {result.redirect_logic.redirect_type or 'none'}")
|
|
print(f" Recommendation: {result.redirect_logic.recommendation}")
|
|
|
|
# Korean expansion
|
|
if result.korean_expansion:
|
|
print("\n--- Korean Expansion ---")
|
|
print(f" Target markets: {', '.join(result.korean_expansion.target_markets)}")
|
|
if result.korean_expansion.cjk_url_issues:
|
|
print(f" CJK URL issues: {len(result.korean_expansion.cjk_url_issues)}")
|
|
for issue in result.korean_expansion.cjk_url_issues[:5]:
|
|
print(f" - {issue}")
|
|
print(" Regional search engines:")
|
|
for region, engine in result.korean_expansion.regional_search_engines.items():
|
|
print(f" [{region}] {engine}")
|
|
if result.korean_expansion.priority_recommendations:
|
|
print(" Priority recommendations:")
|
|
for rec in result.korean_expansion.priority_recommendations[:5]:
|
|
print(f" - {rec}")
|
|
|
|
# Issues
|
|
if result.issues:
|
|
print("\n--- Issues ---")
|
|
for issue in result.issues:
|
|
print(f" - {issue}")
|
|
|
|
# Recommendations
|
|
if result.recommendations:
|
|
print("\n--- Recommendations ---")
|
|
for rec in result.recommendations:
|
|
print(f" - {rec}")
|
|
|
|
print("\n" + "=" * 70)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CLI
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser for the auditor."""
    parser = argparse.ArgumentParser(
        description=(
            "International SEO Auditor -- "
            "analyse multi-language site implementation"
        ),
    )
    parser.add_argument("--url", required=True, help="Target URL to audit")
    parser.add_argument(
        "--scope",
        choices=["structure", "parity", "detection", "redirects", "all"],
        default="all",
        help="Audit scope (default: all)",
    )
    # Boolean flags share the same store_true/default=False shape.
    flag_specs = [
        ("--korean-expansion", {}, "Enable Korean expansion analysis"),
        ("--json", {"dest": "json_output"}, "Output results as JSON"),
    ]
    for flag, extra, help_text in flag_specs:
        parser.add_argument(
            flag, action="store_true", default=False, help=help_text, **extra
        )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help="Write output to file instead of stdout",
    )
    return parser
|
|
|
|
|
|
async def async_main(args: argparse.Namespace) -> None:
    """Async entry point.

    Runs the audit, then renders the result either as JSON or as a
    plain-text report, writing to stdout or to ``--output`` when given.
    """
    auditor = InternationalAuditor(
        url=args.url,
        korean_expansion=args.korean_expansion,
        scope=args.scope,
    )

    result = await auditor.audit()

    if args.json_output:
        output = json.dumps(
            _result_to_dict(result),
            indent=2,
            ensure_ascii=False,
            default=str,
        )
    else:
        if args.output:
            # Capture the text report so it can be written to the file.
            # redirect_stdout restores sys.stdout even if printing
            # raises; the previous manual swap had no try/finally and
            # would leave stdout redirected on error.
            import contextlib
            import io

            buf = io.StringIO()
            with contextlib.redirect_stdout(buf):
                _print_text_report(result)
            output = buf.getvalue()
        else:
            _print_text_report(result)
            return

    if args.output:
        with open(args.output, "w", encoding="utf-8") as fh:
            fh.write(output)
        logger.info("Output written to %s", args.output)
    else:
        print(output)
|
|
|
|
|
|
def main() -> None:
    """CLI entry point: parse arguments and run the async audit."""
    args = build_parser().parse_args()
    asyncio.run(async_main(args))
|
|
|
|
|
|
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|