directory changes and restructuring

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-22 02:01:41 +09:00
parent eea49f9f8c
commit 236be6c580
598 changed files with 0 additions and 0 deletions

View File

@@ -0,0 +1,113 @@
# CLAUDE.md
## Overview
Structured data validator: extract, parse, and validate JSON-LD, Microdata, and RDFa markup against schema.org vocabulary.
## Quick Start
```bash
pip install -r scripts/requirements.txt
python scripts/schema_validator.py --url https://example.com
```
## Scripts
| Script | Purpose |
|--------|---------|
| `schema_validator.py` | Extract and validate structured data |
| `base_client.py` | Shared utilities |
## Usage
```bash
# Validate page schema
python scripts/schema_validator.py --url https://example.com
# JSON output
python scripts/schema_validator.py --url https://example.com --json
# Validate local file
python scripts/schema_validator.py --file schema.json
# Check Rich Results eligibility
python scripts/schema_validator.py --url https://example.com --rich-results
```
## Supported Formats
| Format | Detection |
|--------|-----------|
| JSON-LD | `<script type="application/ld+json">` |
| Microdata | `itemscope`, `itemtype`, `itemprop` |
| RDFa | `vocab`, `typeof`, `property` |
## Validation Levels
### 1. Syntax Validation
- Valid JSON structure
- Proper nesting
- No syntax errors
### 2. Schema.org Vocabulary
- Valid @type values
- Known properties
- Correct property types
### 3. Google Rich Results
- Required properties present
- Recommended properties
- Feature-specific requirements
## Schema Types Validated
| Type | Required Properties | Rich Result |
|------|---------------------|-------------|
| Article | headline, author, datePublished | Yes |
| Product | name, offers | Yes |
| LocalBusiness | name, address | Yes |
| FAQPage | mainEntity | Yes |
| Organization | name, url | Yes |
| BreadcrumbList | itemListElement | Yes |
| WebSite | name, url | Sitelinks |
## Output
```json
{
"url": "https://example.com",
"schemas_found": 3,
"schemas": [
{
"@type": "Organization",
"valid": true,
"rich_results_eligible": true,
"issues": [],
"warnings": []
}
],
"summary": {
"valid": 3,
"invalid": 0,
"rich_results_eligible": 2
}
}
```
## Issue Severity
| Level | Description |
|-------|-------------|
| Error | Invalid schema, blocks rich results |
| Warning | Missing recommended property |
| Info | Optimization suggestion |
## Dependencies
```
extruct>=0.16.0
jsonschema>=4.21.0
rdflib>=7.0.0
lxml>=5.1.0
beautifulsoup4>=4.12.0
requests>=2.31.0
python-dotenv>=1.0.0
rich>=13.7.0
```

View File

@@ -0,0 +1,207 @@
"""
Base Client - Shared async client utilities
===========================================
Purpose: Rate-limited async operations for API clients
Python: 3.10+
"""
import asyncio
import logging
import os
from asyncio import Semaphore
from datetime import datetime
from typing import Any, Callable, TypeVar
from dotenv import load_dotenv
from tenacity import (
retry,
stop_after_attempt,
wait_exponential,
retry_if_exception_type,
)
# Load environment variables from a local .env file, if present
load_dotenv()
# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
# Generic type variable for subclass use.
# NOTE(review): T is not referenced anywhere in this module — confirm whether
# downstream clients rely on it before removing.
T = TypeVar("T")
class RateLimiter:
"""Rate limiter using token bucket algorithm."""
def __init__(self, rate: float, per: float = 1.0):
"""
Initialize rate limiter.
Args:
rate: Number of requests allowed
per: Time period in seconds (default: 1 second)
"""
self.rate = rate
self.per = per
self.tokens = rate
self.last_update = datetime.now()
self._lock = asyncio.Lock()
async def acquire(self) -> None:
"""Acquire a token, waiting if necessary."""
async with self._lock:
now = datetime.now()
elapsed = (now - self.last_update).total_seconds()
self.tokens = min(self.rate, self.tokens + elapsed * (self.rate / self.per))
self.last_update = now
if self.tokens < 1:
wait_time = (1 - self.tokens) * (self.per / self.rate)
await asyncio.sleep(wait_time)
self.tokens = 0
else:
self.tokens -= 1
class BaseAsyncClient:
    """Base class for async API clients with rate limiting.

    Combines a concurrency semaphore, a token-bucket ``RateLimiter``, and
    tenacity-based retries, while tracking simple request statistics in
    ``self.stats``.
    """

    def __init__(
        self,
        max_concurrent: int = 5,
        requests_per_second: float = 3.0,
        logger: logging.Logger | None = None,
    ):
        """
        Initialize base client.

        Args:
            max_concurrent: Maximum concurrent requests
            requests_per_second: Rate limit
            logger: Logger instance
        """
        self.semaphore = Semaphore(max_concurrent)
        self.rate_limiter = RateLimiter(requests_per_second)
        self.logger = logger or logging.getLogger(self.__class__.__name__)
        self.stats = {
            "requests": 0,
            "success": 0,
            "errors": 0,
            "retries": 0,
        }

    def _note_retry(self, retry_state: Any = None) -> None:
        """Record one retry attempt; wired into tenacity's before_sleep hook."""
        self.stats["retries"] += 1

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        retry=retry_if_exception_type(Exception),
        # BUGFIX: stats["retries"] was declared but never incremented. For a
        # bound-method call, retry_state.args[0] is `self`, so the hook can
        # reach the instance counter.
        before_sleep=lambda retry_state: retry_state.args[0]._note_retry(retry_state),
    )
    async def _rate_limited_request(
        self,
        coro: Callable[[], Any],
    ) -> Any:
        """Execute a request with rate limiting and retry.

        Args:
            coro: Zero-argument callable returning an awaitable.

        Raises:
            Exception: re-raised after retry attempts are exhausted.
        """
        async with self.semaphore:
            await self.rate_limiter.acquire()
            self.stats["requests"] += 1
            try:
                result = await coro()
                self.stats["success"] += 1
                return result
            except Exception as e:
                # Each failed attempt is counted; the retry decorator may
                # re-invoke this method up to the attempt limit.
                self.stats["errors"] += 1
                self.logger.error(f"Request failed: {e}")
                raise

    async def batch_requests(
        self,
        requests: list[Callable[[], Any]],
        desc: str = "Processing",
    ) -> list[Any]:
        """Execute multiple requests concurrently.

        Failures are returned as ``{"error": str(e)}`` entries so one bad
        request does not abort the batch. NOTE(review): with tqdm installed,
        results arrive in completion order, not input order — confirm callers
        do not rely on ordering.
        """
        try:
            from tqdm.asyncio import tqdm
            has_tqdm = True
        except ImportError:
            has_tqdm = False

        async def execute(req: Callable) -> Any:
            try:
                return await self._rate_limited_request(req)
            except Exception as e:
                return {"error": str(e)}

        tasks = [execute(req) for req in requests]
        if has_tqdm:
            results = []
            for coro in tqdm.as_completed(tasks, total=len(tasks), desc=desc):
                result = await coro
                results.append(result)
            return results
        else:
            return await asyncio.gather(*tasks, return_exceptions=True)

    def print_stats(self) -> None:
        """Log request statistics at INFO level."""
        self.logger.info("=" * 40)
        self.logger.info("Request Statistics:")
        self.logger.info(f"  Total Requests: {self.stats['requests']}")
        self.logger.info(f"  Successful: {self.stats['success']}")
        self.logger.info(f"  Errors: {self.stats['errors']}")
        # Now reported: previously the retries counter was tracked (always 0)
        # but never shown.
        self.logger.info(f"  Retries: {self.stats['retries']}")
        self.logger.info("=" * 40)
class ConfigManager:
"""Manage API configuration and credentials."""
def __init__(self):
load_dotenv()
@property
def google_credentials_path(self) -> str | None:
"""Get Google service account credentials path."""
# Prefer SEO-specific credentials, fallback to general credentials
seo_creds = os.path.expanduser("~/.credential/ourdigital-seo-agent.json")
if os.path.exists(seo_creds):
return seo_creds
return os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
@property
def pagespeed_api_key(self) -> str | None:
"""Get PageSpeed Insights API key."""
return os.getenv("PAGESPEED_API_KEY")
@property
def custom_search_api_key(self) -> str | None:
"""Get Custom Search API key."""
return os.getenv("CUSTOM_SEARCH_API_KEY")
@property
def custom_search_engine_id(self) -> str | None:
"""Get Custom Search Engine ID."""
return os.getenv("CUSTOM_SEARCH_ENGINE_ID")
@property
def notion_token(self) -> str | None:
"""Get Notion API token."""
return os.getenv("NOTION_TOKEN") or os.getenv("NOTION_API_KEY")
def validate_google_credentials(self) -> bool:
"""Validate Google credentials are configured."""
creds_path = self.google_credentials_path
if not creds_path:
return False
return os.path.exists(creds_path)
def get_required(self, key: str) -> str:
"""Get required environment variable or raise error."""
value = os.getenv(key)
if not value:
raise ValueError(f"Missing required environment variable: {key}")
return value
# Singleton config instance: importing modules share this ConfigManager, so
# the .env file is loaded once at import time via __init__'s load_dotenv().
config = ConfigManager()

View File

@@ -0,0 +1,9 @@
# 13-seo-schema-validator dependencies
extruct>=0.16.0
jsonschema>=4.21.0
rdflib>=7.0.0
lxml>=5.1.0
beautifulsoup4>=4.12.0
requests>=2.31.0
python-dotenv>=1.0.0
rich>=13.7.0

View File

@@ -0,0 +1,498 @@
"""
Schema Validator - Validate JSON-LD structured data markup
==========================================================
Purpose: Extract and validate schema.org structured data from URLs or files
Python: 3.10+
Usage:
python schema_validator.py --url https://example.com
python schema_validator.py --file schema.json
"""
import argparse
import json
import logging
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
# extruct is optional: when it is missing, extraction falls back to the
# manual JSON-LD parse in SchemaValidator.extract_from_html, which checks
# this flag.
try:
    import extruct
    HAS_EXTRUCT = True
except ImportError:
    HAS_EXTRUCT = False
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
@dataclass
class ValidationIssue:
    """Represents a single validation issue found in a schema."""
    severity: str  # one of "error", "warning", "info"
    message: str  # human-readable description of the problem
    schema_type: str | None = None  # @type of the schema the issue belongs to
    property_name: str | None = None  # missing/offending property, if any
    suggestion: str | None = None  # optional remediation hint for the report
@dataclass
class ValidationResult:
    """Complete validation result for a page or schema document."""
    url: str | None = None
    schemas_found: list[dict] = field(default_factory=list)
    issues: list[ValidationIssue] = field(default_factory=list)
    valid: bool = True
    rich_results_eligible: dict = field(default_factory=dict)
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())

    def to_dict(self) -> dict:
        """Convert to a JSON-serializable dictionary for report output."""
        issue_entries = []
        for issue in self.issues:
            issue_entries.append({
                "severity": issue.severity,
                "message": issue.message,
                "schema_type": issue.schema_type,
                "property": issue.property_name,
                "suggestion": issue.suggestion,
            })
        type_names = [s.get("@type", "Unknown") for s in self.schemas_found]
        return {
            "url": self.url,
            "schemas_found": len(self.schemas_found),
            "schema_types": type_names,
            "valid": self.valid,
            "issues": issue_entries,
            "rich_results_eligible": self.rich_results_eligible,
            "timestamp": self.timestamp,
        }
class SchemaValidator:
    """Validate schema.org structured data.

    Accepts a URL, raw HTML, or a schema dict/list, and reports missing
    required/recommended properties plus Google Rich Results eligibility.
    """

    # Required properties for common schema types
    REQUIRED_PROPERTIES = {
        "Organization": ["name", "url"],
        "LocalBusiness": ["name", "address"],
        "Product": ["name"],
        "Offer": ["price", "priceCurrency"],
        "Article": ["headline", "author", "datePublished", "publisher"],
        "BlogPosting": ["headline", "author", "datePublished", "publisher"],
        "NewsArticle": ["headline", "author", "datePublished", "publisher"],
        "FAQPage": ["mainEntity"],
        "Question": ["name", "acceptedAnswer"],
        "Answer": ["text"],
        "BreadcrumbList": ["itemListElement"],
        "ListItem": ["position", "name"],
        "WebSite": ["name", "url"],
        "WebPage": ["name"],
        "Person": ["name"],
        "Event": ["name", "startDate", "location"],
        "Review": ["reviewRating", "author"],
        "AggregateRating": ["ratingValue"],
        "ImageObject": ["url"],
    }

    # Recommended (but not required) properties
    RECOMMENDED_PROPERTIES = {
        "Organization": ["logo", "description", "contactPoint", "sameAs"],
        "LocalBusiness": ["telephone", "openingHoursSpecification", "geo", "image"],
        "Product": ["description", "image", "brand", "offers", "aggregateRating"],
        "Article": ["image", "dateModified", "description"],
        "FAQPage": [],
        "WebSite": ["potentialAction"],
        "BreadcrumbList": [],
    }

    # Google Rich Results eligible types
    RICH_RESULTS_TYPES = {
        "Article", "BlogPosting", "NewsArticle",
        "Product", "Review",
        "FAQPage", "HowTo",
        "LocalBusiness", "Restaurant",
        "Event",
        "Recipe",
        "JobPosting",
        "Course",
        "BreadcrumbList",
        "Organization",
        "WebSite",
        "VideoObject",
    }

    def __init__(self):
        """Create a validator with a reusable HTTP session."""
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (compatible; SEOAuditBot/1.0)"
        })

    def extract_from_url(self, url: str) -> list[dict]:
        """Extract all structured data from a URL.

        Returns an empty list (and logs the error) on any request failure.
        """
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return self.extract_from_html(response.text, url)
        except requests.RequestException as e:
            logger.error(f"Failed to fetch URL: {e}")
            return []

    def extract_from_html(self, html: str, base_url: str | None = None) -> list[dict]:
        """Extract structured data from HTML content.

        Uses extruct when installed (JSON-LD, Microdata, RDFa) and always runs
        a manual JSON-LD pass as fallback; duplicates are removed.
        """
        schemas = []
        # Method 1: Use extruct if available (handles JSON-LD, Microdata, RDFa)
        if HAS_EXTRUCT:
            try:
                data = extruct.extract(html, base_url=base_url, uniform=True)
                schemas.extend(data.get("json-ld", []))
                schemas.extend(data.get("microdata", []))
                schemas.extend(data.get("rdfa", []))
            except Exception as e:
                logger.warning(f"extruct extraction failed: {e}")
        # Method 2: Manual JSON-LD extraction (fallback/additional)
        soup = BeautifulSoup(html, "html.parser")
        for script in soup.find_all("script", type="application/ld+json"):
            try:
                content = script.string
                if content:
                    data = json.loads(content)
                    if isinstance(data, list):
                        schemas.extend(data)
                    else:
                        schemas.append(data)
            except json.JSONDecodeError as e:
                logger.warning(f"Invalid JSON-LD: {e}")
        # Deduplicate schemas by canonical JSON form. BUGFIX: default=str keeps
        # serialization from raising TypeError on non-JSON values (e.g. parsed
        # dates) that extruct output may contain.
        seen = set()
        unique_schemas = []
        for extracted in schemas:
            schema_str = json.dumps(extracted, sort_keys=True, default=str)
            if schema_str not in seen:
                seen.add(schema_str)
                unique_schemas.append(extracted)
        return unique_schemas

    def validate(self, url: str | None = None, html: str | None = None,
                 schema: dict | list | None = None) -> ValidationResult:
        """Validate schema from URL, HTML, or a direct schema dict/list.

        Exactly one input is used, in priority order: schema > html > url.

        Raises:
            ValueError: if none of url, html, or schema is provided.
        """
        result = ValidationResult(url=url)
        # Extract schemas
        if schema:
            # A JSON-LD file may contain either a single object or a list of
            # objects; accept both (the original treated a list as one schema).
            schemas = schema if isinstance(schema, list) else [schema]
        elif html:
            schemas = self.extract_from_html(html, url)
        elif url:
            schemas = self.extract_from_url(url)
        else:
            raise ValueError("Must provide url, html, or schema")
        result.schemas_found = schemas
        if not schemas:
            result.issues.append(ValidationIssue(
                severity="warning",
                message="No structured data found",
                suggestion="Add JSON-LD schema markup to improve SEO",
            ))
            result.valid = False
            return result
        # Validate each schema (renamed loop var: the original shadowed the
        # `schema` parameter)
        for schema_obj in schemas:
            self._validate_schema(schema_obj, result)
        # Check for errors (warnings don't affect validity)
        result.valid = not any(i.severity == "error" for i in result.issues)
        return result

    def _validate_schema(self, schema: dict, result: ValidationResult,
                         parent_type: str | None = None) -> None:
        """Validate a single schema object, recursing into nested schemas.

        Appends issues to `result` in place; never raises for schema content.
        """
        schema_type = schema.get("@type")
        if not schema_type:
            result.issues.append(ValidationIssue(
                severity="error",
                message="Missing @type property",
                schema_type=parent_type,
            ))
            return
        # Handle array of types: validate against the first listed type
        if isinstance(schema_type, list):
            schema_type = schema_type[0]
        # Check required properties
        required = self.REQUIRED_PROPERTIES.get(schema_type, [])
        for prop in required:
            if prop not in schema:
                result.issues.append(ValidationIssue(
                    severity="error",
                    message=f"Missing required property: {prop}",
                    schema_type=schema_type,
                    property_name=prop,
                    suggestion=f"Add '{prop}' property to {schema_type} schema",
                ))
        # Check recommended properties
        recommended = self.RECOMMENDED_PROPERTIES.get(schema_type, [])
        for prop in recommended:
            if prop not in schema:
                result.issues.append(ValidationIssue(
                    severity="info",
                    message=f"Missing recommended property: {prop}",
                    schema_type=schema_type,
                    property_name=prop,
                    suggestion=f"Consider adding '{prop}' for better rich results",
                ))
        # Check Rich Results eligibility
        if schema_type in self.RICH_RESULTS_TYPES:
            result.rich_results_eligible[schema_type] = self._check_rich_results(
                schema, schema_type
            )
        # Validate nested schemas (dicts and lists of dicts carrying @type)
        for key, value in schema.items():
            if key.startswith("@"):
                continue
            if isinstance(value, dict) and "@type" in value:
                self._validate_schema(value, result, schema_type)
            elif isinstance(value, list):
                for item in value:
                    if isinstance(item, dict) and "@type" in item:
                        self._validate_schema(item, result, schema_type)
        # Type-specific validations
        self._validate_type_specific(schema, schema_type, result)

    def _validate_type_specific(self, schema: dict, schema_type: str,
                                result: ValidationResult) -> None:
        """Type-specific validation rules beyond required/recommended checks."""
        if schema_type in ("Article", "BlogPosting", "NewsArticle"):
            # Check image
            if "image" not in schema:
                result.issues.append(ValidationIssue(
                    severity="warning",
                    message="Article without image may not show in rich results",
                    schema_type=schema_type,
                    property_name="image",
                    suggestion="Add at least one image to the article",
                ))
            # Check headline length. BUGFIX: guard against non-string values
            # (a list/dict headline would make len() meaningless or raise).
            headline = schema.get("headline", "")
            if isinstance(headline, str) and len(headline) > 110:
                result.issues.append(ValidationIssue(
                    severity="warning",
                    message=f"Headline too long ({len(headline)} chars, max 110)",
                    schema_type=schema_type,
                    property_name="headline",
                ))
        elif schema_type == "Product":
            offer = schema.get("offers", {})
            if isinstance(offer, dict):
                # Check price is numeric (schema.org allows string or number)
                price = offer.get("price")
                if price is not None:
                    try:
                        float(price)
                    except (ValueError, TypeError):
                        result.issues.append(ValidationIssue(
                            severity="error",
                            message=f"Invalid price value: {price}",
                            schema_type="Offer",
                            property_name="price",
                        ))
                # Check availability via substring match, so both short names
                # and full schema.org URLs are accepted
                availability = offer.get("availability", "")
                valid_availabilities = [
                    "InStock", "OutOfStock", "PreOrder", "Discontinued",
                    "https://schema.org/InStock", "https://schema.org/OutOfStock",
                ]
                if availability and not any(
                    a in availability for a in valid_availabilities
                ):
                    result.issues.append(ValidationIssue(
                        severity="warning",
                        message=f"Unknown availability value: {availability}",
                        schema_type="Offer",
                        property_name="availability",
                    ))
        elif schema_type == "LocalBusiness":
            # Check for geo coordinates
            if "geo" not in schema:
                result.issues.append(ValidationIssue(
                    severity="info",
                    message="Missing geo coordinates",
                    schema_type=schema_type,
                    property_name="geo",
                    suggestion="Add latitude/longitude for better local search",
                ))
        elif schema_type == "FAQPage":
            main_entity = schema.get("mainEntity", [])
            # BUGFIX: a single question may appear as a bare dict rather than a
            # list; normalize so len() counts questions, not dict keys.
            if isinstance(main_entity, dict):
                main_entity = [main_entity]
            if not main_entity:
                result.issues.append(ValidationIssue(
                    severity="error",
                    message="FAQPage must have at least one question",
                    schema_type=schema_type,
                    property_name="mainEntity",
                ))
            elif len(main_entity) < 2:
                result.issues.append(ValidationIssue(
                    severity="info",
                    message="FAQPage has only one question",
                    schema_type=schema_type,
                    suggestion="Add more questions for better rich results",
                ))

    def _check_rich_results(self, schema: dict, schema_type: str) -> dict:
        """Check if a schema is eligible for Google Rich Results.

        Returns a dict with "eligible" (bool) and "missing_for_rich_results"
        (list of property names).
        """
        result = {
            "eligible": True,
            "missing_for_rich_results": [],
        }
        if schema_type in ("Article", "BlogPosting", "NewsArticle"):
            required_for_rich = ["headline", "image", "datePublished", "author"]
            for prop in required_for_rich:
                if prop not in schema:
                    result["eligible"] = False
                    result["missing_for_rich_results"].append(prop)
        elif schema_type == "Product":
            if "name" not in schema:
                result["eligible"] = False
                result["missing_for_rich_results"].append("name")
            offer = schema.get("offers")
            if not offer:
                result["eligible"] = False
                result["missing_for_rich_results"].append("offers")
        elif schema_type == "FAQPage":
            if not schema.get("mainEntity"):
                result["eligible"] = False
                result["missing_for_rich_results"].append("mainEntity")
        return result

    def generate_report(self, result: ValidationResult) -> str:
        """Generate a human-readable validation report as a single string."""
        lines = [
            "=" * 60,
            "Schema Validation Report",
            "=" * 60,
            f"URL: {result.url or 'N/A'}",
            f"Timestamp: {result.timestamp}",
            f"Valid: {'Yes' if result.valid else 'No'}",
            f"Schemas Found: {len(result.schemas_found)}",
            "",
        ]
        if result.schemas_found:
            lines.append("Schema Types:")
            for schema in result.schemas_found:
                schema_type = schema.get("@type", "Unknown")
                lines.append(f"  - {schema_type}")
            lines.append("")
        if result.rich_results_eligible:
            lines.append("Rich Results Eligibility:")
            for schema_type, status in result.rich_results_eligible.items():
                eligible = "Yes" if status["eligible"] else "No"
                lines.append(f"  - {schema_type}: {eligible}")
                if status["missing_for_rich_results"]:
                    missing = ", ".join(status["missing_for_rich_results"])
                    lines.append(f"    Missing: {missing}")
            lines.append("")
        if result.issues:
            lines.append("Issues Found:")
            # Grouped by severity so errors surface first in the report
            errors = [i for i in result.issues if i.severity == "error"]
            warnings = [i for i in result.issues if i.severity == "warning"]
            infos = [i for i in result.issues if i.severity == "info"]
            if errors:
                lines.append(f"\n  ERRORS ({len(errors)}):")
                for issue in errors:
                    lines.append(f"  - [{issue.schema_type}] {issue.message}")
                    if issue.suggestion:
                        lines.append(f"    Suggestion: {issue.suggestion}")
            if warnings:
                lines.append(f"\n  WARNINGS ({len(warnings)}):")
                for issue in warnings:
                    lines.append(f"  - [{issue.schema_type}] {issue.message}")
                    if issue.suggestion:
                        lines.append(f"    Suggestion: {issue.suggestion}")
            if infos:
                lines.append(f"\n  INFO ({len(infos)}):")
                for issue in infos:
                    lines.append(f"  - [{issue.schema_type}] {issue.message}")
                    if issue.suggestion:
                        lines.append(f"    Suggestion: {issue.suggestion}")
            lines.append("")
        lines.append("=" * 60)
        return "\n".join(lines)
def main():
    """Main entry point for CLI usage.

    Validates either a live URL or a local JSON-LD file and prints either a
    human-readable report or JSON (optionally written to a file).
    """
    parser = argparse.ArgumentParser(
        description="Validate schema.org structured data",
    )
    parser.add_argument("--url", "-u", help="URL to validate")
    parser.add_argument("--file", "-f", help="JSON-LD file to validate")
    parser.add_argument("--output", "-o", help="Output file for JSON report")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()

    if not args.url and not args.file:
        parser.error("Must provide --url or --file")

    validator = SchemaValidator()

    # A local file takes priority over a URL when both are given.
    if args.file:
        with open(args.file, "r", encoding="utf-8") as handle:
            result = validator.validate(schema=json.load(handle))
    else:
        result = validator.validate(url=args.url)

    # Human-readable report unless JSON output was requested.
    if not (args.json or args.output):
        print(validator.generate_report(result))
        return

    report = json.dumps(result.to_dict(), ensure_ascii=False, indent=2)
    if args.output:
        with open(args.output, "w", encoding="utf-8") as handle:
            handle.write(report)
        logger.info(f"Report written to {args.output}")
    else:
        print(report)


if __name__ == "__main__":
    main()