"""
Schema Validator - Validate JSON-LD structured data markup
==========================================================
Purpose: Extract and validate schema.org structured data from URLs or files
Python: 3.10+
Usage:
    python schema_validator.py --url https://example.com
    python schema_validator.py --file schema.json
"""

import argparse
import json
import logging
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

try:
    import extruct
    HAS_EXTRUCT = True
except ImportError:
    HAS_EXTRUCT = False

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


@dataclass
class ValidationIssue:
    """Represents a validation issue found in schema."""

    severity: str  # "error", "warning", "info"
    message: str
    schema_type: str | None = None  # type of the node the issue was found on
    property_name: str | None = None  # offending property, when applicable
    suggestion: str | None = None  # optional human-readable remediation hint


@dataclass
class ValidationResult:
    """Complete validation result for a schema."""

    url: str | None = None
    schemas_found: list[dict] = field(default_factory=list)
    issues: list[ValidationIssue] = field(default_factory=list)
    valid: bool = True
    # Maps schema type -> {"eligible": bool, "missing_for_rich_results": [...]}
    rich_results_eligible: dict = field(default_factory=dict)
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON output.

        Note that ``schemas_found`` is emitted as a count, with the types
        listed separately under ``schema_types``.
        """
        return {
            "url": self.url,
            "schemas_found": len(self.schemas_found),
            "schema_types": [s.get("@type", "Unknown") for s in self.schemas_found],
            "valid": self.valid,
            "issues": [
                {
                    "severity": i.severity,
                    "message": i.message,
                    "schema_type": i.schema_type,
                    "property": i.property_name,
                    "suggestion": i.suggestion,
                }
                for i in self.issues
            ],
            "rich_results_eligible": self.rich_results_eligible,
            "timestamp": self.timestamp,
        }


class SchemaValidator:
    """Validate schema.org structured data."""

    # Required properties for common schema types
    REQUIRED_PROPERTIES = {
        "Organization": ["name", "url"],
        "LocalBusiness": ["name", "address"],
        "Product": ["name"],
        "Offer": ["price", "priceCurrency"],
        "Article": ["headline", "author", "datePublished", "publisher"],
        "BlogPosting": ["headline", "author", "datePublished", "publisher"],
        "NewsArticle": ["headline", "author", "datePublished", "publisher"],
        "FAQPage": ["mainEntity"],
        "Question": ["name", "acceptedAnswer"],
        "Answer": ["text"],
        "BreadcrumbList": ["itemListElement"],
        "ListItem": ["position", "name"],
        "WebSite": ["name", "url"],
        "WebPage": ["name"],
        "Person": ["name"],
        "Event": ["name", "startDate", "location"],
        "Review": ["reviewRating", "author"],
        "AggregateRating": ["ratingValue"],
        "ImageObject": ["url"],
    }

    # Recommended (but not required) properties
    RECOMMENDED_PROPERTIES = {
        "Organization": ["logo", "description", "contactPoint", "sameAs"],
        "LocalBusiness": ["telephone", "openingHoursSpecification", "geo", "image"],
        "Product": ["description", "image", "brand", "offers", "aggregateRating"],
        "Article": ["image", "dateModified", "description"],
        "FAQPage": [],
        "WebSite": ["potentialAction"],
        "BreadcrumbList": [],
    }

    # Google Rich Results eligible types
    RICH_RESULTS_TYPES = {
        "Article", "BlogPosting", "NewsArticle",
        "Product", "Review",
        "FAQPage", "HowTo",
        "LocalBusiness", "Restaurant",
        "Event",
        "Recipe",
        "JobPosting",
        "Course",
        "BreadcrumbList",
        "Organization",
        "WebSite",
        "VideoObject",
    }

    # Short names of the schema.org ItemAvailability enumeration members.
    # Values are compared after stripping any IRI/prefix part.
    VALID_AVAILABILITIES = frozenset({
        "BackOrder", "Discontinued", "InStock", "InStoreOnly",
        "LimitedAvailability", "MadeToOrder", "OnlineOnly", "OutOfStock",
        "PreOrder", "PreSale", "Reserved", "SoldOut",
    })

    def __init__(self):
        """Create the HTTP session used by ``extract_from_url``."""
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (compatible; SEOAuditBot/1.0)"
        })

    def extract_from_url(self, url: str) -> list[dict]:
        """Extract all structured data from a URL.

        Fetch failures (connection errors, timeouts, non-2xx responses) are
        logged and reported as an empty list rather than raised.
        """
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return self.extract_from_html(response.text, url)
        except requests.RequestException as e:
            logger.error("Failed to fetch URL: %s", e)
            return []

    def extract_from_html(self, html: str, base_url: str | None = None) -> list[dict]:
        """Extract structured data from HTML content.

        Returns the de-duplicated entries exactly as found; ``@graph``
        containers are not expanded here (``validate`` does that).
        """
        schemas = []

        # Method 1: Use extruct if available (handles JSON-LD, Microdata, RDFa)
        if HAS_EXTRUCT:
            try:
                data = extruct.extract(html, base_url=base_url, uniform=True)
                schemas.extend(data.get("json-ld", []))
                schemas.extend(data.get("microdata", []))
                schemas.extend(data.get("rdfa", []))
            except Exception as e:
                # Best effort: the manual JSON-LD pass below still runs.
                logger.warning("extruct extraction failed: %s", e)

        # Method 2: Manual JSON-LD extraction (fallback/additional)
        soup = BeautifulSoup(html, "html.parser")
        for script in soup.find_all("script", type="application/ld+json"):
            try:
                content = script.string
                if content:
                    data = json.loads(content)
                    if isinstance(data, list):
                        schemas.extend(data)
                    else:
                        schemas.append(data)
            except json.JSONDecodeError as e:
                logger.warning("Invalid JSON-LD: %s", e)

        # Deduplicate schemas (both methods may yield the same JSON-LD block);
        # the canonical JSON dump serves as the identity key.
        seen = set()
        unique_schemas = []
        for schema in schemas:
            schema_str = json.dumps(schema, sort_keys=True)
            if schema_str not in seen:
                seen.add(schema_str)
                unique_schemas.append(schema)

        return unique_schemas

    def _normalize_schemas(self, raw: Any) -> list[dict]:
        """Flatten raw structured data into a list of node dicts.

        Accepts a single node, a list of nodes, or ``@graph`` containers
        (objects without ``@type`` that carry their nodes under ``@graph``).
        Entries that are not JSON objects are skipped with a warning instead
        of crashing the validation later on.
        """
        nodes: list[dict] = []
        for item in raw if isinstance(raw, list) else [raw]:
            if isinstance(item, list):
                nodes.extend(self._normalize_schemas(item))
            elif not isinstance(item, dict):
                logger.warning("Ignoring non-object structured data entry: %r", item)
            elif "@type" not in item and "@graph" in item:
                nodes.extend(self._normalize_schemas(item["@graph"]))
            else:
                nodes.append(item)
        return nodes

    def validate(self, url: str | None = None, html: str | None = None,
                 schema: dict | list | None = None) -> ValidationResult:
        """Validate schema from URL, HTML, or direct schema dict.

        Args:
            url: Page to fetch; also recorded on the result and used as the
                base URL when ``html`` is given.
            html: HTML document to extract structured data from.
            schema: Already-parsed JSON-LD: a node dict, a list of nodes, or
                an ``@graph`` container. Takes precedence over ``html``/``url``
                whenever it is not ``None`` (an empty dict is validated and
                reported as missing ``@type``).

        Returns:
            A ``ValidationResult``; ``valid`` is False when no structured data
            was found or any issue has severity "error".

        Raises:
            ValueError: If none of ``url``, ``html`` or ``schema`` is provided.
        """
        result = ValidationResult(url=url)

        # Extract schemas
        if schema is not None:
            raw = schema
        elif html:
            raw = self.extract_from_html(html, url)
        elif url:
            raw = self.extract_from_url(url)
        else:
            raise ValueError("Must provide url, html, or schema")

        schemas = self._normalize_schemas(raw)
        result.schemas_found = schemas

        if not schemas:
            result.issues.append(ValidationIssue(
                severity="warning",
                message="No structured data found",
                suggestion="Add JSON-LD schema markup to improve SEO",
            ))
            result.valid = False
            return result

        # Validate each schema
        for node in schemas:
            self._validate_schema(node, result)

        # Check for errors (warnings don't affect validity)
        result.valid = not any(i.severity == "error" for i in result.issues)

        return result

    def _validate_schema(self, schema: dict, result: ValidationResult,
                         parent_type: str | None = None) -> None:
        """Validate a single schema object.

        Appends issues to ``result`` for missing required/recommended
        properties, records rich-results eligibility, recurses into nested
        typed objects, then applies the type-specific rules. For a node with
        several types only the first one is validated.
        """
        raw_type = schema.get("@type")
        if not raw_type:
            result.issues.append(ValidationIssue(
                severity="error",
                message="Missing @type property",
                schema_type=parent_type,
            ))
            return

        # Handle array of types
        schema_type = raw_type[0] if isinstance(raw_type, list) else raw_type
        if not isinstance(schema_type, str):
            # A non-string type (e.g. an object) cannot be looked up in the
            # rule tables; report it instead of raising TypeError.
            result.issues.append(ValidationIssue(
                severity="error",
                message=f"Invalid @type value: {schema_type!r}",
                schema_type=parent_type,
            ))
            return

        # Check required properties
        for prop in self.REQUIRED_PROPERTIES.get(schema_type, []):
            if prop not in schema:
                result.issues.append(ValidationIssue(
                    severity="error",
                    message=f"Missing required property: {prop}",
                    schema_type=schema_type,
                    property_name=prop,
                    suggestion=f"Add '{prop}' property to {schema_type} schema",
                ))

        # Check recommended properties
        for prop in self.RECOMMENDED_PROPERTIES.get(schema_type, []):
            if prop not in schema:
                result.issues.append(ValidationIssue(
                    severity="info",
                    message=f"Missing recommended property: {prop}",
                    schema_type=schema_type,
                    property_name=prop,
                    suggestion=f"Consider adding '{prop}' for better rich results",
                ))

        # Check Rich Results eligibility
        if schema_type in self.RICH_RESULTS_TYPES:
            result.rich_results_eligible[schema_type] = self._check_rich_results(
                schema, schema_type
            )

        # Validate nested schemas (JSON-LD keywords such as @context are skipped)
        for key, value in schema.items():
            if key.startswith("@"):
                continue
            if isinstance(value, dict) and "@type" in value:
                self._validate_schema(value, result, schema_type)
            elif isinstance(value, list):
                for item in value:
                    if isinstance(item, dict) and "@type" in item:
                        self._validate_schema(item, result, schema_type)

        # Type-specific validations
        self._validate_type_specific(schema, schema_type, result)

    def _validate_offer(self, offer: dict, result: ValidationResult) -> None:
        """Check the price and availability values of one offer object."""
        price = offer.get("price")
        if price is not None:
            try:
                float(price)
            except (ValueError, TypeError):
                result.issues.append(ValidationIssue(
                    severity="error",
                    message=f"Invalid price value: {price}",
                    schema_type="Offer",
                    property_name="price",
                ))

        availability = offer.get("availability")
        if availability:
            # Accept "InStock", "https://schema.org/InStock", "schema:InStock":
            # only the last segment identifies the enumeration member.
            text = str(availability).strip().rstrip("/")
            short_name = re.split(r"[/#:]", text)[-1]
            if short_name not in self.VALID_AVAILABILITIES:
                result.issues.append(ValidationIssue(
                    severity="warning",
                    message=f"Unknown availability value: {availability}",
                    schema_type="Offer",
                    property_name="availability",
                ))

    def _validate_type_specific(self, schema: dict, schema_type: str,
                                result: ValidationResult) -> None:
        """Type-specific validation rules.

        Covers article types (image, headline length), Product (offers),
        LocalBusiness (geo) and FAQPage (question count). Other types are
        left untouched.
        """
        if schema_type in ("Article", "BlogPosting", "NewsArticle"):
            # Check image
            if "image" not in schema:
                result.issues.append(ValidationIssue(
                    severity="warning",
                    message="Article without image may not show in rich results",
                    schema_type=schema_type,
                    property_name="image",
                    suggestion="Add at least one image to the article",
                ))

            # Check headline length (only meaningful for string headlines;
            # a null or non-string value must not crash the validator)
            headline = schema.get("headline")
            if isinstance(headline, str) and len(headline) > 110:
                result.issues.append(ValidationIssue(
                    severity="warning",
                    message=f"Headline too long ({len(headline)} chars, max 110)",
                    schema_type=schema_type,
                    property_name="headline",
                ))

        elif schema_type == "Product":
            # "offers" may be a single object or a list of them.
            offers = schema.get("offers")
            if isinstance(offers, dict):
                offers = [offers]
            elif not isinstance(offers, list):
                offers = []
            for offer in offers:
                if isinstance(offer, dict):
                    self._validate_offer(offer, result)

        elif schema_type == "LocalBusiness":
            # Check for geo coordinates
            if "geo" not in schema:
                result.issues.append(ValidationIssue(
                    severity="info",
                    message="Missing geo coordinates",
                    schema_type=schema_type,
                    property_name="geo",
                    suggestion="Add latitude/longitude for better local search",
                ))

        elif schema_type == "FAQPage":
            # A single question may be given as a bare object rather than a
            # list; wrap it so the count is of questions, not of dict keys.
            main_entity = schema.get("mainEntity")
            if isinstance(main_entity, list):
                questions = main_entity
            elif main_entity:
                questions = [main_entity]
            else:
                questions = []

            if not questions:
                result.issues.append(ValidationIssue(
                    severity="error",
                    message="FAQPage must have at least one question",
                    schema_type=schema_type,
                    property_name="mainEntity",
                ))
            elif len(questions) < 2:
                result.issues.append(ValidationIssue(
                    severity="info",
                    message="FAQPage has only one question",
                    schema_type=schema_type,
                    suggestion="Add more questions for better rich results",
                ))

    def _check_rich_results(self, schema: dict, schema_type: str) -> dict:
        """Check if schema is eligible for Google Rich Results.

        Returns a dict with ``eligible`` (bool) and
        ``missing_for_rich_results`` (list of property names). Only article
        types, Product and FAQPage have explicit checks; any other type is
        reported as eligible.
        """
        result = {
            "eligible": True,
            "missing_for_rich_results": [],
        }

        if schema_type in ("Article", "BlogPosting", "NewsArticle"):
            required_for_rich = ["headline", "image", "datePublished", "author"]
            for prop in required_for_rich:
                if prop not in schema:
                    result["eligible"] = False
                    result["missing_for_rich_results"].append(prop)

        elif schema_type == "Product":
            if "name" not in schema:
                result["eligible"] = False
                result["missing_for_rich_results"].append("name")
            offer = schema.get("offers")
            if not offer:
                result["eligible"] = False
                result["missing_for_rich_results"].append("offers")

        elif schema_type == "FAQPage":
            if not schema.get("mainEntity"):
                result["eligible"] = False
                result["missing_for_rich_results"].append("mainEntity")

        return result

    def generate_report(self, result: ValidationResult) -> str:
        """Generate human-readable validation report.

        Sections: header, schema types, rich-results eligibility, then issues
        grouped by severity (errors, warnings, info).
        """
        lines = [
            "=" * 60,
            "Schema Validation Report",
            "=" * 60,
            f"URL: {result.url or 'N/A'}",
            f"Timestamp: {result.timestamp}",
            f"Valid: {'Yes' if result.valid else 'No'}",
            f"Schemas Found: {len(result.schemas_found)}",
            "",
        ]

        if result.schemas_found:
            lines.append("Schema Types:")
            for schema in result.schemas_found:
                schema_type = schema.get("@type", "Unknown")
                lines.append(f" - {schema_type}")
            lines.append("")

        if result.rich_results_eligible:
            lines.append("Rich Results Eligibility:")
            for schema_type, status in result.rich_results_eligible.items():
                eligible = "Yes" if status["eligible"] else "No"
                lines.append(f" - {schema_type}: {eligible}")
                if status["missing_for_rich_results"]:
                    missing = ", ".join(status["missing_for_rich_results"])
                    lines.append(f" Missing: {missing}")
            lines.append("")

        if result.issues:
            lines.append("Issues Found:")
            sections = (
                ("error", "ERRORS"),
                ("warning", "WARNINGS"),
                ("info", "INFO"),
            )
            for severity, label in sections:
                group = [i for i in result.issues if i.severity == severity]
                if not group:
                    continue
                lines.append(f"\n {label} ({len(group)}):")
                for issue in group:
                    lines.append(f" - [{issue.schema_type}] {issue.message}")
                    if issue.suggestion:
                        lines.append(f" Suggestion: {issue.suggestion}")
            lines.append("")

        lines.append("=" * 60)
        return "\n".join(lines)


def main():
    """Main entry point for CLI usage.

    Validates either ``--file`` (a JSON-LD document; takes precedence) or
    ``--url``, then prints a text report, or emits JSON to stdout (``--json``)
    or to a file (``--output``).
    """
    parser = argparse.ArgumentParser(
        description="Validate schema.org structured data",
    )
    parser.add_argument("--url", "-u", help="URL to validate")
    parser.add_argument("--file", "-f", help="JSON-LD file to validate")
    parser.add_argument("--output", "-o", help="Output file for JSON report")
    parser.add_argument("--json", action="store_true", help="Output as JSON")

    args = parser.parse_args()

    if not args.url and not args.file:
        parser.error("Must provide --url or --file")

    validator = SchemaValidator()

    if args.file:
        try:
            with open(args.file, "r", encoding="utf-8") as f:
                schema = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
            # parser.error prints a usage message and exits.
            parser.error(f"Cannot read schema file {args.file}: {e}")
        result = validator.validate(schema=schema)
    else:
        result = validator.validate(url=args.url)

    if args.json or args.output:
        output = json.dumps(result.to_dict(), ensure_ascii=False, indent=2)
        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(output)
            logger.info("Report written to %s", args.output)
        else:
            print(output)
    else:
        print(validator.generate_report(result))


if __name__ == "__main__":
    main()