""" International SEO Auditor - Multi-language site audit ===================================================== Purpose: Audit international SEO implementation including URL structure, content parity, language detection, and redirect logic Python: 3.10+ Usage: python international_auditor.py --url https://example.com --json """ import argparse import asyncio import json import logging import re import sys from dataclasses import dataclass, field, asdict from datetime import datetime from typing import Optional from urllib.parse import urlparse, urljoin, parse_qs import aiohttp # --------------------------------------------------------------------------- # Logging # --------------------------------------------------------------------------- logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", ) logger = logging.getLogger("international_auditor") # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- KEY_PAGES = ["home", "about", "contact", "products", "services", "blog", "faq"] CCTLD_MAP: dict[str, str] = { ".kr": "ko", ".jp": "ja", ".cn": "zh", ".de": "de", ".fr": "fr", ".es": "es", ".it": "it", ".pt": "pt", ".nl": "nl", ".ru": "ru", ".br": "pt", ".mx": "es", ".co.uk": "en", ".ca": "en", ".au": "en", ".in": "en", ".co.kr": "ko", ".co.jp": "ja", ".com.cn": "zh", } LANG_SUBDIRECTORY_PATTERN = re.compile( r"^/(?P[a-z]{2}(?:-[a-z]{2})?)(?:/|$)", re.IGNORECASE, ) LANG_SUBDOMAIN_PATTERN = re.compile( r"^(?P[a-z]{2}(?:-[a-z]{2})?)\.", re.IGNORECASE, ) ACCEPT_LANGUAGE_HEADERS = [ {"Accept-Language": "ko-KR,ko;q=0.9"}, {"Accept-Language": "en-US,en;q=0.9"}, {"Accept-Language": "ja-JP,ja;q=0.9"}, {"Accept-Language": "zh-CN,zh;q=0.9"}, ] URL_STRUCTURE_INFO: dict[str, dict] = { "cctld": { "pros": [ "Strong geo-targeting signal to search engines", "Clear country association for users", "Independent domain authority per country", ], "cons": [ "Expensive to acquire and maintain multiple domains", "Domain authority is not shared across properties", "Complex infrastructure management", ], }, "subdomain": { "pros": [ "Easy to set up on a single domain", "Can be hosted on different servers/locations", "Clear separation of language versions", ], "cons": [ "Subdomains may be treated as separate sites by search engines", "Limited geo-targeting signal compared to ccTLD", "Domain authority dilution across subdomains", ], }, "subdirectory": { "pros": [ "All content inherits root domain authority", "Simplest infrastructure and lowest cost", "Best for consolidating SEO signals", ], "cons": [ "Server must handle all language versions", "Geo-targeting relies on Search Console settings and hreflang", "Less clear country targeting than ccTLD", ], }, "parameter": { "pros": [ "Easy to implement technically", ], "cons": [ "Not recommended by Google", "URL parameters can be ignored by crawlers", "Poor user experience and shareability", "No geo-targeting signal", ], }, } # --------------------------------------------------------------------------- # Unicode range helpers # --------------------------------------------------------------------------- RE_HANGUL = re.compile(r"[\uAC00-\uD7AF]") RE_HIRAGANA = re.compile(r"[\u3040-\u309F]") RE_KATAKANA = re.compile(r"[\u30A0-\u30FF]") RE_CJK_UNIFIED = re.compile(r"[\u4E00-\u9FFF]") RE_LATIN_EXTENDED = re.compile(r"[A-Za-z\u00C0-\u024F]") # --------------------------------------------------------------------------- # Dataclasses # --------------------------------------------------------------------------- @dataclass class URLStructureAnalysis: """Analysis of the international URL structure pattern.""" pattern: str = "" # cctld / subdomain / subdirectory / parameter examples: list[str] = field(default_factory=list) languages_found: list[str] = field(default_factory=list) recommendation: str = "" pros: list[str] = field(default_factory=list) cons: list[str] = field(default_factory=list) @dataclass class ContentParityEntry: """Content parity data for a single language version.""" language: str = "" page_count: int = 0 key_pages_present: list[str] = field(default_factory=list) freshness_score: float = 0.0 missing_pages: list[str] = field(default_factory=list) @dataclass class LanguageDetection: """Language detection audit for a single page.""" page_url: str = "" declared_lang: str = "" html_lang_attr: str = "" content_language_header: str = "" detected_lang: str = "" is_consistent: bool = True @dataclass class RedirectLogic: """International redirect behaviour audit.""" has_ip_redirect: bool = False has_language_redirect: bool = False is_forced: bool = False redirect_type: str = "" # 301 / 302 / js / none recommendation: str = "" @dataclass class KoreanExpansion: """Korean-market expansion analysis.""" primary_lang: str = "ko" target_markets: list[str] = field(default_factory=list) cjk_url_issues: list[str] = field(default_factory=list) regional_search_engines: dict[str, str] = field(default_factory=dict) priority_recommendations: list[str] = field(default_factory=list) @dataclass class InternationalAuditResult: """Top-level result container for the full international audit.""" url: str = "" url_structure: Optional[URLStructureAnalysis] = None languages_detected: list[str] = field(default_factory=list) content_parity: dict[str, ContentParityEntry] = field(default_factory=dict) language_detection_issues: list[LanguageDetection] = field(default_factory=list) redirect_logic: Optional[RedirectLogic] = None korean_expansion: Optional[KoreanExpansion] = None issues: list[str] = field(default_factory=list) recommendations: list[str] = field(default_factory=list) score: int = 100 timestamp: str = "" # --------------------------------------------------------------------------- # Auditor # --------------------------------------------------------------------------- class InternationalAuditor: """Perform a comprehensive international SEO audit on a target URL.""" def __init__( self, url: str, korean_expansion: bool = False, scope: str | None = None, ): self.url = url.rstrip("/") self.korean_expansion_flag = korean_expansion self.scope = scope or "all" self.parsed = urlparse(self.url) self.base_url = f"{self.parsed.scheme}://{self.parsed.netloc}" self.timeout = aiohttp.ClientTimeout(total=30) self._session: aiohttp.ClientSession | None = None # ------------------------------------------------------------------ # Public entry point # ------------------------------------------------------------------ async def audit(self) -> InternationalAuditResult: """Run the full international SEO audit and return results.""" result = InternationalAuditResult( url=self.url, timestamp=datetime.utcnow().isoformat(), ) async with aiohttp.ClientSession(timeout=self.timeout) as session: self._session = session try: # 1. URL structure analysis if self.scope in ("all", "structure"): logger.info("Analysing URL structure ...") result.url_structure = await self._analyze_url_structure(self.url) # 2. Discover language versions logger.info("Discovering language versions ...") lang_versions = await self._discover_language_versions(self.url) result.languages_detected = list(lang_versions.keys()) # 3. Content parity if self.scope in ("all", "parity"): logger.info("Auditing content parity ...") result.content_parity = await self._audit_content_parity(lang_versions) # 4. Language detection if self.scope in ("all", "detection"): logger.info("Checking language detection consistency ...") all_pages: list[tuple[str, str]] = [] for lang, urls in lang_versions.items(): for u in urls[:5]: all_pages.append((lang, u)) detections = await self._check_language_detection(all_pages) result.language_detection_issues = [ d for d in detections if not d.is_consistent ] # 5. Redirect logic if self.scope in ("all", "redirects"): logger.info("Auditing redirect logic ...") result.redirect_logic = await self._audit_redirect_logic(self.url) # 6. Korean expansion if self.korean_expansion_flag: logger.info("Analysing Korean expansion opportunities ...") result.korean_expansion = await self._analyze_korean_expansion(self.url) # Collect issues and recommendations self._collect_issues(result) result.score = self._calculate_score(result) except Exception as exc: logger.error("Audit failed: %s", exc) result.issues.append(f"Audit error: {exc}") result.score = 0 finally: self._session = None return result # ------------------------------------------------------------------ # URL structure analysis # ------------------------------------------------------------------ async def _analyze_url_structure(self, url: str) -> URLStructureAnalysis: """Determine the international URL pattern in use.""" parsed = urlparse(url) hostname = parsed.netloc.lower() path = parsed.path analysis = URLStructureAnalysis() # Check ccTLD for tld, lang in CCTLD_MAP.items(): if hostname.endswith(tld): analysis.pattern = "cctld" analysis.examples.append(f"{hostname} -> {lang}") analysis.languages_found.append(lang) break # Check subdomain pattern if not analysis.pattern: match = LANG_SUBDOMAIN_PATTERN.match(hostname) if match: lang_code = match.group("lang").lower() if len(lang_code) == 2 and lang_code not in ("ww", "ww"): analysis.pattern = "subdomain" analysis.examples.append(f"{hostname}") analysis.languages_found.append(lang_code) # Check subdirectory pattern if not analysis.pattern: match = LANG_SUBDIRECTORY_PATTERN.match(path) if match: lang_code = match.group("lang").lower() analysis.pattern = "subdirectory" analysis.examples.append(f"{path}") analysis.languages_found.append(lang_code) # Check URL parameter if not analysis.pattern: qs = parse_qs(parsed.query) for param in ("lang", "language", "hl", "locale", "lg"): if param in qs: analysis.pattern = "parameter" analysis.examples.append(f"?{param}={qs[param][0]}") analysis.languages_found.append(qs[param][0]) break # Try to discover additional language versions from page HTML try: html = await self._fetch_text(url) if html: hreflang_langs = self._extract_hreflang_langs(html) for lang in hreflang_langs: if lang not in analysis.languages_found: analysis.languages_found.append(lang) # Refine pattern from hreflang hrefs if not yet determined if not analysis.pattern and hreflang_langs: hrefs = self._extract_hreflang_hrefs(html) analysis.pattern = self._detect_pattern_from_hrefs(hrefs) except Exception as exc: logger.warning("Could not fetch page for structure analysis: %s", exc) if not analysis.pattern: analysis.pattern = "unknown" # Attach pros/cons/recommendation info = URL_STRUCTURE_INFO.get(analysis.pattern, {}) analysis.pros = info.get("pros", []) analysis.cons = info.get("cons", []) if analysis.pattern == "parameter": analysis.recommendation = ( "URL parameters are not recommended for international targeting. " "Consider migrating to subdirectories or subdomains." ) elif analysis.pattern == "cctld": analysis.recommendation = ( "ccTLDs provide the strongest geo-targeting signal. " "Ensure each domain has sufficient content and backlinks." ) elif analysis.pattern == "subdomain": analysis.recommendation = ( "Subdomains can work well but ensure hreflang tags are properly " "implemented to signal relationships between language versions." ) elif analysis.pattern == "subdirectory": analysis.recommendation = ( "Subdirectories are the most cost-effective approach and consolidate " "domain authority. This is the recommended pattern for most sites." ) else: analysis.recommendation = ( "Could not determine the URL structure pattern. " "Implement a clear international URL strategy using subdirectories, " "subdomains, or ccTLDs with proper hreflang tags." ) return analysis # ------------------------------------------------------------------ # Discover language versions # ------------------------------------------------------------------ async def _discover_language_versions(self, url: str) -> dict[str, list[str]]: """Discover language versions from hreflang tags and common patterns.""" lang_versions: dict[str, list[str]] = {} try: html = await self._fetch_text(url) except Exception: html = None # 1. Extract from hreflang tags if html: hreflang_map = self._extract_hreflang_map(html) for lang, href in hreflang_map.items(): if lang == "x-default": continue lang_key = lang.split("-")[0].lower() lang_versions.setdefault(lang_key, []).append(href) # 2. Probe common subdirectory patterns common_langs = ["en", "ko", "ja", "zh", "de", "fr", "es", "pt"] probe_tasks = [] for lang in common_langs: if lang not in lang_versions: candidate = f"{self.base_url}/{lang}/" probe_tasks.append(self._probe_url(lang, candidate)) if probe_tasks: results = await asyncio.gather(*probe_tasks, return_exceptions=True) for item in results: if isinstance(item, tuple): lang, probe_url = item lang_versions.setdefault(lang, []).append(probe_url) # 3. Probe common subdomain patterns domain_parts = self.parsed.netloc.split(".") if len(domain_parts) >= 2: root_domain = ".".join(domain_parts[-2:]) subdomain_tasks = [] for lang in common_langs: if lang not in lang_versions: candidate = f"{self.parsed.scheme}://{lang}.{root_domain}/" subdomain_tasks.append(self._probe_url(lang, candidate)) if subdomain_tasks: results = await asyncio.gather(*subdomain_tasks, return_exceptions=True) for item in results: if isinstance(item, tuple): lang, probe_url = item lang_versions.setdefault(lang, []).append(probe_url) # Ensure the original URL is listed under some language if not lang_versions: detected = "unknown" if html: detected = self._detect_language_from_content(html) or "unknown" lang_versions[detected] = [url] return lang_versions async def _probe_url(self, lang: str, url: str) -> tuple[str, str] | None: """Check if a URL returns 200 and return (lang, url) or raise.""" try: async with self._session.head( url, allow_redirects=True, timeout=self.timeout, ) as resp: if resp.status == 200: return (lang, url) except Exception: pass raise ValueError(f"Probe failed for {url}") # ------------------------------------------------------------------ # Content parity # ------------------------------------------------------------------ async def _audit_content_parity( self, lang_versions: dict[str, list[str]], ) -> dict[str, ContentParityEntry]: """Audit content parity across discovered language versions.""" parity: dict[str, ContentParityEntry] = {} for lang, urls in lang_versions.items(): entry = ContentParityEntry(language=lang, page_count=len(urls)) # Determine base URL for this language version if urls: lang_base = urls[0].rstrip("/") present_keys = await self._check_key_pages(lang_base) entry.key_pages_present = present_keys entry.missing_pages = [ p for p in KEY_PAGES if p not in present_keys ] # Freshness score from Last-Modified header on first URL entry.freshness_score = await self._get_freshness_score(urls[0]) parity[lang] = entry return parity async def _check_key_pages(self, lang_base_url: str) -> list[str]: """Check which key pages exist under a language base URL.""" present: list[str] = [] page_paths = { "home": "/", "about": "/about", "contact": "/contact", "products": "/products", "services": "/services", "blog": "/blog", "faq": "/faq", } tasks = [] for name, path in page_paths.items(): full_url = lang_base_url.rstrip("/") + path tasks.append(self._check_page_exists(name, full_url)) results = await asyncio.gather(*tasks, return_exceptions=True) for item in results: if isinstance(item, str): present.append(item) return present async def _check_page_exists(self, name: str, url: str) -> str | None: """Return the page name if URL returns 200, else raise.""" try: async with self._session.head( url, allow_redirects=True, timeout=self.timeout, ) as resp: if resp.status == 200: return name except Exception: pass raise ValueError(f"Page not found: {url}") async def _get_freshness_score(self, url: str) -> float: """Compute a freshness score (0-100) based on Last-Modified header.""" try: async with self._session.head( url, allow_redirects=True, timeout=self.timeout, ) as resp: last_modified = resp.headers.get("Last-Modified") if last_modified: from email.utils import parsedate_to_datetime mod_dt = parsedate_to_datetime(last_modified) age_days = (datetime.utcnow() - mod_dt.replace(tzinfo=None)).days if age_days <= 7: return 100.0 elif age_days <= 30: return 90.0 elif age_days <= 90: return 70.0 elif age_days <= 180: return 50.0 elif age_days <= 365: return 30.0 else: return 10.0 # No Last-Modified header -- neutral score return 50.0 except Exception: return 0.0 # ------------------------------------------------------------------ # Language detection # ------------------------------------------------------------------ async def _check_language_detection( self, pages: list[tuple[str, str]], ) -> list[LanguageDetection]: """Check language declaration consistency for a list of pages.""" tasks = [ self._detect_single_page(declared_lang, page_url) for declared_lang, page_url in pages ] results = await asyncio.gather(*tasks, return_exceptions=True) detections: list[LanguageDetection] = [] for r in results: if isinstance(r, LanguageDetection): detections.append(r) return detections async def _detect_single_page( self, declared_lang: str, page_url: str, ) -> LanguageDetection: """Perform language detection checks on a single page.""" detection = LanguageDetection( page_url=page_url, declared_lang=declared_lang, ) try: async with self._session.get( page_url, allow_redirects=True, timeout=self.timeout, ) as resp: # Content-Language header cl_header = resp.headers.get("Content-Language", "") detection.content_language_header = cl_header.strip() html = await resp.text(errors="replace") # HTML lang attribute lang_match = re.search( r']*\slang=["\']([^"\']+)["\']', html, re.IGNORECASE, ) if lang_match: detection.html_lang_attr = lang_match.group(1).strip() # Detect from content detection.detected_lang = self._detect_language_from_content(html) # Consistency check expected = declared_lang.split("-")[0].lower() html_lang_base = detection.html_lang_attr.split("-")[0].lower() if detection.html_lang_attr else "" cl_base = detection.content_language_header.split("-")[0].lower() if detection.content_language_header else "" detected_base = detection.detected_lang.split("-")[0].lower() if detection.detected_lang else "" inconsistencies = [] if html_lang_base and html_lang_base != expected: inconsistencies.append("html_lang_attr") if cl_base and cl_base != expected: inconsistencies.append("content_language_header") if detected_base and detected_base != expected: inconsistencies.append("detected_content") detection.is_consistent = len(inconsistencies) == 0 except Exception as exc: logger.warning("Language detection failed for %s: %s", page_url, exc) detection.is_consistent = False return detection def _detect_language_from_content(self, html: str) -> str: """Simple heuristic language detection using Unicode ranges.""" # Strip HTML tags for content analysis text = re.sub(r"<[^>]+>", " ", html) text = re.sub(r"\s+", " ", text)[:5000] # limit analysis scope counts = { "ko": len(RE_HANGUL.findall(text)), "ja": len(RE_HIRAGANA.findall(text)) + len(RE_KATAKANA.findall(text)), "zh": len(RE_CJK_UNIFIED.findall(text)), "en": len(RE_LATIN_EXTENDED.findall(text)), } # Japanese text also contains CJK unified characters (kanji). # If both hiragana/katakana AND CJK are present, prefer Japanese. if counts["ja"] > 0 and counts["zh"] > 0: if counts["ja"] > counts["zh"] * 0.1: counts["zh"] = 0 # attribute CJK chars to Japanese context # Korean text can contain some CJK (hanja) but Hangul dominates. if counts["ko"] > 0 and counts["zh"] > 0: if counts["ko"] > counts["zh"]: counts["zh"] = 0 if not any(counts.values()): return "unknown" dominant = max(counts, key=counts.get) # Require a meaningful amount of the dominant script if counts[dominant] < 20: return "unknown" return dominant # ------------------------------------------------------------------ # Redirect logic # ------------------------------------------------------------------ async def _audit_redirect_logic(self, url: str) -> RedirectLogic: """Audit redirect behaviour for different Accept-Language headers.""" logic = RedirectLogic() redirect_targets: dict[str, str] = {} for headers in ACCEPT_LANGUAGE_HEADERS: try: async with self._session.get( url, allow_redirects=False, headers=headers, timeout=self.timeout, ) as resp: lang_key = list(headers.values())[0].split(",")[0] if resp.status in (301, 302, 303, 307, 308): location = resp.headers.get("Location", "") redirect_targets[lang_key] = location logic.has_language_redirect = True if resp.status in (301, 302): logic.redirect_type = str(resp.status) logic.is_forced = True else: redirect_targets[lang_key] = str(resp.url) except Exception as exc: logger.warning("Redirect check failed for %s: %s", headers, exc) # Detect IP-based redirects by comparing redirect targets unique_targets = set(redirect_targets.values()) if len(unique_targets) > 1: logic.has_language_redirect = True # Check for JS-based redirects in page source try: html = await self._fetch_text(url) if html: js_redirect_patterns = [ r"navigator\.language", r"navigator\.languages", r"window\.location\s*=", r"location\.href\s*=", r"geo[_-]?redirect", r"ip[_-]?redirect", ] for pattern in js_redirect_patterns: if re.search(pattern, html, re.IGNORECASE): if "geo" in pattern or "ip" in pattern: logic.has_ip_redirect = True if not logic.redirect_type: logic.redirect_type = "js" break except Exception: pass # Recommendation if logic.is_forced: logic.recommendation = ( "Forced language/region redirects detected (HTTP {}). " "Best practice: suggest language via a banner or interstitial " "rather than forcing a redirect. Forced redirects can prevent " "search engines from crawling all language versions and frustrate " "users who prefer a different language." ).format(logic.redirect_type) elif logic.has_language_redirect: logic.recommendation = ( "Language-based content negotiation detected. Ensure all language " "versions remain directly accessible via their canonical URLs and " "that Googlebot can crawl every version without redirect loops." ) else: logic.recommendation = ( "No forced language redirects detected. Consider adding a " "language suggestion banner for users whose browser language " "differs from the page language." ) return logic # ------------------------------------------------------------------ # Korean expansion analysis # ------------------------------------------------------------------ async def _analyze_korean_expansion(self, url: str) -> KoreanExpansion: """Analyse expansion opportunities for Korean-primary sites.""" expansion = KoreanExpansion( primary_lang="ko", target_markets=["ja", "zh", "en"], regional_search_engines={ "ko": "Naver (naver.com) -- dominant in Korea; also consider Daum", "ja": "Yahoo Japan (yahoo.co.jp) -- significant market share alongside Google", "zh": "Baidu (baidu.com) -- dominant in mainland China; also Sogou, 360 Search", "en": "Google (google.com) -- global default", }, ) # Check CJK URL encoding issues try: html = await self._fetch_text(url) if html: # Find links with non-ASCII characters in URLs href_pattern = re.compile(r'href=["\']([^"\']+)["\']', re.IGNORECASE) for match in href_pattern.finditer(html): href = match.group(1) # Check for unencoded CJK characters in URLs if RE_HANGUL.search(href): expansion.cjk_url_issues.append( f"Unencoded Korean characters in URL: {href[:100]}" ) if RE_CJK_UNIFIED.search(href): expansion.cjk_url_issues.append( f"Unencoded CJK characters in URL: {href[:100]}" ) if RE_HIRAGANA.search(href) or RE_KATAKANA.search(href): expansion.cjk_url_issues.append( f"Unencoded Japanese characters in URL: {href[:100]}" ) # De-duplicate expansion.cjk_url_issues = list(dict.fromkeys(expansion.cjk_url_issues))[:20] except Exception as exc: logger.warning("CJK URL check failed: %s", exc) # Priority recommendations expansion.priority_recommendations = [ "Implement hreflang tags linking ko, ja, zh, and en versions with x-default", "Use subdirectories (/ko/, /ja/, /zh/, /en/) for cost-effective international structure", "Create a Naver Search Advisor (searchadvisor.naver.com) account for Korean SEO", "Submit sitemap to Baidu Webmaster Tools for Chinese market visibility", "Register with Yahoo Japan Search Console (optional but recommended for Japan)", "Ensure all URLs use percent-encoded paths -- avoid raw CJK characters in hrefs", "Provide a language/region selector accessible from every page", "Localise content beyond translation: currency, date formats, cultural references", "Consider separate social media strategies per market (KakaoTalk for KR, LINE for JP, WeChat for CN)", "Monitor Core Web Vitals per region -- CDN edge presence matters for CJK markets", ] return expansion # ------------------------------------------------------------------ # Issue collection # ------------------------------------------------------------------ def _collect_issues(self, result: InternationalAuditResult) -> None: """Populate issues and recommendations from audit sub-results.""" # URL structure issues if result.url_structure: if result.url_structure.pattern == "parameter": result.issues.append( "URL parameters used for language targeting -- not recommended" ) if result.url_structure.pattern == "unknown": result.issues.append( "Could not detect a clear international URL structure" ) if len(result.url_structure.languages_found) <= 1: result.issues.append( "Only one language version detected -- consider expanding" ) # Content parity issues if result.content_parity: primary_count = max( (e.page_count for e in result.content_parity.values()), default=0, ) for lang, entry in result.content_parity.items(): if entry.missing_pages: result.issues.append( f"Language '{lang}' is missing key pages: " f"{', '.join(entry.missing_pages)}" ) if primary_count > 0 and entry.page_count < primary_count * 0.5: result.issues.append( f"Language '{lang}' has significantly fewer pages " f"({entry.page_count}) than the primary version ({primary_count})" ) # Language detection issues for det in result.language_detection_issues: result.issues.append( f"Language inconsistency on {det.page_url}: " f"declared={det.declared_lang}, html_lang={det.html_lang_attr}, " f"detected={det.detected_lang}" ) # Redirect logic issues if result.redirect_logic: if result.redirect_logic.is_forced: result.issues.append( "Forced language/region redirects detected -- " "this can block search engine crawling" ) if result.redirect_logic.has_ip_redirect: result.issues.append( "IP-based redirect logic detected -- " "ensure Googlebot can access all versions" ) # Check for x-default in hreflang has_xdefault = False if result.url_structure and result.url_structure.languages_found: try: html = None # already fetched; recheck from structure data # We check for x-default in languages_found (hreflang extraction adds it) if "x-default" in result.url_structure.languages_found: has_xdefault = True result.url_structure.languages_found.remove("x-default") except Exception: pass if not has_xdefault and len(result.languages_detected) > 1: result.issues.append( "No x-default hreflang tag detected -- " "add x-default to specify the fallback page" ) # Recommendations if result.issues: result.recommendations.append( "Address the issues listed above to improve international SEO health" ) if result.url_structure and result.url_structure.recommendation: result.recommendations.append(result.url_structure.recommendation) if result.redirect_logic and result.redirect_logic.recommendation: result.recommendations.append(result.redirect_logic.recommendation) if result.korean_expansion: result.recommendations.extend( result.korean_expansion.priority_recommendations[:5] ) # ------------------------------------------------------------------ # Score calculation # ------------------------------------------------------------------ def _calculate_score(self, result: InternationalAuditResult) -> int: """Calculate an overall international SEO score (0-100).""" score = 100 # URL structure penalties if result.url_structure: if result.url_structure.pattern == "parameter": score -= 15 elif result.url_structure.pattern == "unknown": score -= 10 if len(result.url_structure.languages_found) <= 1: score -= 5 # Content parity penalties if result.content_parity: for entry in result.content_parity.values(): score -= 5 * len(entry.missing_pages) # Language detection penalties score -= 10 * len(result.language_detection_issues) # Redirect logic penalties if result.redirect_logic: if result.redirect_logic.is_forced: score -= 15 # No x-default penalty for issue in result.issues: if "x-default" in issue: score -= 10 break # CJK encoding issues if result.korean_expansion: cjk_count = len(result.korean_expansion.cjk_url_issues) score -= 5 * min(cjk_count, 5) # cap at 25 points return max(0, score) # ------------------------------------------------------------------ # HTML / hreflang helpers # ------------------------------------------------------------------ async def _fetch_text(self, url: str) -> str | None: """Fetch URL content as text.""" try: async with self._session.get( url, allow_redirects=True, timeout=self.timeout, ) as resp: if resp.status == 200: return await resp.text(errors="replace") except Exception as exc: logger.warning("Failed to fetch %s: %s", url, exc) return None def _extract_hreflang_langs(self, html: str) -> list[str]: """Extract language codes from hreflang link tags.""" pattern = re.compile( r']*\brel=["\']alternate["\'][^>]*\bhreflang=["\']([^"\']+)["\']', re.IGNORECASE, ) return [m.group(1).lower() for m in pattern.finditer(html)] def _extract_hreflang_hrefs(self, html: str) -> list[str]: """Extract href values from hreflang link tags.""" pattern = re.compile( r']*\brel=["\']alternate["\'][^>]*\bhref=["\']([^"\']+)["\']', re.IGNORECASE, ) return [m.group(1) for m in pattern.finditer(html)] def _extract_hreflang_map(self, html: str) -> dict[str, str]: """Extract a mapping of hreflang language code to href.""" result: dict[str, str] = {} # Match both attribute orderings pattern1 = re.compile( r']*\brel=["\']alternate["\']' r'[^>]*\bhreflang=["\']([^"\']+)["\']' r'[^>]*\bhref=["\']([^"\']+)["\']', re.IGNORECASE, ) pattern2 = re.compile( r']*\bhreflang=["\']([^"\']+)["\']' r'[^>]*\bhref=["\']([^"\']+)["\']' r'[^>]*\brel=["\']alternate["\']', re.IGNORECASE, ) for m in pattern1.finditer(html): result[m.group(1).lower()] = m.group(2) for m in pattern2.finditer(html): result[m.group(1).lower()] = m.group(2) return result def _detect_pattern_from_hrefs(self, hrefs: list[str]) -> str: """Guess the URL structure pattern from a list of hreflang hrefs.""" for href in hrefs: parsed = urlparse(href) # ccTLD check for tld in CCTLD_MAP: if parsed.netloc.endswith(tld): return "cctld" # subdomain check if LANG_SUBDOMAIN_PATTERN.match(parsed.netloc): return "subdomain" # subdirectory check if LANG_SUBDIRECTORY_PATTERN.match(parsed.path): return "subdirectory" # parameter check qs = parse_qs(parsed.query) for param in ("lang", "language", "hl", "locale", "lg"): if param in qs: return "parameter" return "unknown" # --------------------------------------------------------------------------- # Serialisation helper # --------------------------------------------------------------------------- def _result_to_dict(result: InternationalAuditResult) -> dict: """Convert the audit result dataclass tree to a plain dict.""" data = asdict(result) # Flatten content_parity keys if "content_parity" in data and isinstance(data["content_parity"], dict): cleaned = {} for key, value in data["content_parity"].items(): cleaned[key] = value data["content_parity"] = cleaned return data def _print_text_report(result: InternationalAuditResult) -> None: """Print a human-readable text report to stdout.""" print("=" * 70) print(" International SEO Audit Report") print("=" * 70) print(f" URL: {result.url}") print(f" Timestamp: {result.timestamp}") print(f" Score: {result.score}/100") print("=" * 70) # URL Structure if result.url_structure: print("\n--- URL Structure ---") print(f" Pattern: {result.url_structure.pattern}") print(f" Languages: {', '.join(result.url_structure.languages_found)}") if result.url_structure.examples: print(f" Examples: {', '.join(result.url_structure.examples)}") print(f" Recommendation: {result.url_structure.recommendation}") # Languages detected if result.languages_detected: print(f"\n--- Languages Detected ---") print(f" {', '.join(result.languages_detected)}") # Content parity if result.content_parity: print("\n--- Content Parity ---") for lang, entry in result.content_parity.items(): missing = ", ".join(entry.missing_pages) if entry.missing_pages else "none" present = ", ".join(entry.key_pages_present) if entry.key_pages_present else "none" print(f" [{lang}] pages={entry.page_count} freshness={entry.freshness_score:.0f}") print(f" key pages present: {present}") print(f" missing: {missing}") # Language detection issues if result.language_detection_issues: print("\n--- Language Detection Issues ---") for det in result.language_detection_issues: print(f" {det.page_url}") print(f" declared={det.declared_lang} html_lang={det.html_lang_attr} " f"header={det.content_language_header} detected={det.detected_lang}") # Redirect logic if result.redirect_logic: print("\n--- Redirect Logic ---") print(f" Language redirect: {result.redirect_logic.has_language_redirect}") print(f" IP redirect: {result.redirect_logic.has_ip_redirect}") print(f" Forced: {result.redirect_logic.is_forced}") print(f" Type: {result.redirect_logic.redirect_type or 'none'}") print(f" Recommendation: {result.redirect_logic.recommendation}") # Korean expansion if result.korean_expansion: print("\n--- Korean Expansion ---") print(f" Target markets: {', '.join(result.korean_expansion.target_markets)}") if result.korean_expansion.cjk_url_issues: print(f" CJK URL issues: {len(result.korean_expansion.cjk_url_issues)}") for issue in result.korean_expansion.cjk_url_issues[:5]: print(f" - {issue}") print(" Regional search engines:") for region, engine in result.korean_expansion.regional_search_engines.items(): print(f" [{region}] {engine}") if result.korean_expansion.priority_recommendations: print(" Priority recommendations:") for rec in result.korean_expansion.priority_recommendations[:5]: print(f" - {rec}") # Issues if result.issues: print("\n--- Issues ---") for issue in result.issues: print(f" - {issue}") # Recommendations if result.recommendations: print("\n--- Recommendations ---") for rec in result.recommendations: print(f" - {rec}") print("\n" + "=" * 70) # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- def build_parser() -> argparse.ArgumentParser: """Build the argument parser.""" parser = argparse.ArgumentParser( description="International SEO Auditor -- analyse multi-language site implementation", ) parser.add_argument( "--url", required=True, help="Target URL to audit", ) parser.add_argument( "--scope", choices=["structure", "parity", "detection", "redirects", "all"], default="all", help="Audit scope (default: all)", ) parser.add_argument( "--korean-expansion", action="store_true", default=False, help="Enable Korean expansion analysis", ) parser.add_argument( "--json", action="store_true", dest="json_output", default=False, help="Output results as JSON", ) parser.add_argument( "--output", type=str, default=None, help="Write output to file instead of stdout", ) return parser async def async_main(args: argparse.Namespace) -> None: """Async entry point.""" auditor = InternationalAuditor( url=args.url, korean_expansion=args.korean_expansion, scope=args.scope, ) result = await auditor.audit() if args.json_output: output = json.dumps( _result_to_dict(result), indent=2, ensure_ascii=False, default=str, ) else: # For file output in text mode, capture via print; for stdout, print directly. if args.output: import io buf = io.StringIO() _old_stdout = sys.stdout sys.stdout = buf _print_text_report(result) sys.stdout = _old_stdout output = buf.getvalue() else: _print_text_report(result) return if args.output: with open(args.output, "w", encoding="utf-8") as fh: fh.write(output) logger.info("Output written to %s", args.output) else: print(output) def main() -> None: """CLI entry point.""" parser = build_parser() args = parser.parse_args() asyncio.run(async_main(args)) if __name__ == "__main__": main()