directory changes and restructuring

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-22 02:01:41 +09:00
parent eea49f9f8c
commit 236be6c580
598 changed files with 0 additions and 0 deletions

View File

@@ -0,0 +1,122 @@
# CLAUDE.md
## Overview
Google Search Console data retriever: search analytics (rankings, CTR, impressions), sitemap status, and index coverage.
## Quick Start
```bash
pip install -r scripts/requirements.txt
# Requires service account credentials
# ~/.credential/ourdigital-seo-agent.json
python scripts/gsc_client.py --site sc-domain:example.com --action summary
```
## Scripts
| Script | Purpose |
|--------|---------|
| `gsc_client.py` | Search Console API client |
| `base_client.py` | Shared utilities |
## Configuration
Service account setup:
```bash
# Credentials file location
~/.credential/ourdigital-seo-agent.json
# Add service account email to GSC property as user
ourdigital-seo-agent@ourdigital-insights.iam.gserviceaccount.com
```
## Usage
```bash
# Performance summary (last 28 days)
python scripts/gsc_client.py --site sc-domain:example.com --action summary
# Query-level data
python scripts/gsc_client.py --site sc-domain:example.com --action queries --limit 100
# Page-level data
python scripts/gsc_client.py --site sc-domain:example.com --action pages
# Custom date range
python scripts/gsc_client.py --site sc-domain:example.com --action queries \
--start 2024-01-01 --end 2024-01-31
# Sitemap status
python scripts/gsc_client.py --site sc-domain:example.com --action sitemaps
# JSON output
python scripts/gsc_client.py --site sc-domain:example.com --action summary --json
```
## Actions
| Action | Description |
|--------|-------------|
| `summary` | Overview metrics (clicks, impressions, CTR, position) |
| `queries` | Top search queries |
| `pages` | Top pages by clicks |
| `sitemaps` | Sitemap submission status |
| `coverage` | Index coverage issues |
## Output: Summary
```json
{
"site": "sc-domain:example.com",
"date_range": "2024-01-01 to 2024-01-28",
"totals": {
"clicks": 15000,
"impressions": 500000,
"ctr": 3.0,
"position": 12.5
}
}
```
## Output: Queries
```json
{
"queries": [
{
"query": "keyword",
"clicks": 500,
"impressions": 10000,
"ctr": 5.0,
"position": 3.2
}
]
}
```
## Rate Limits
| Limit | Value |
|-------|-------|
| Queries per minute | 1,200 |
| Rows per request | 25,000 |
## Site Property Formats
| Format | Example |
|--------|---------|
| Domain property | `sc-domain:example.com` |
| URL prefix | `https://www.example.com/` |
## Dependencies
```
google-api-python-client>=2.100.0
google-auth>=2.23.0
python-dotenv>=1.0.0
rich>=13.7.0
pandas>=2.1.0
```

View File

@@ -0,0 +1,207 @@
"""
Base Client - Shared async client utilities
===========================================
Purpose: Rate-limited async operations for API clients
Python: 3.10+
"""
import asyncio
import logging
import os
from asyncio import Semaphore
from datetime import datetime
from typing import Any, Callable, TypeVar
from dotenv import load_dotenv
from tenacity import (
retry,
stop_after_attempt,
wait_exponential,
retry_if_exception_type,
)
# Load environment variables
load_dotenv()
# Logging setup
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)
T = TypeVar("T")
class RateLimiter:
"""Rate limiter using token bucket algorithm."""
def __init__(self, rate: float, per: float = 1.0):
"""
Initialize rate limiter.
Args:
rate: Number of requests allowed
per: Time period in seconds (default: 1 second)
"""
self.rate = rate
self.per = per
self.tokens = rate
self.last_update = datetime.now()
self._lock = asyncio.Lock()
async def acquire(self) -> None:
"""Acquire a token, waiting if necessary."""
async with self._lock:
now = datetime.now()
elapsed = (now - self.last_update).total_seconds()
self.tokens = min(self.rate, self.tokens + elapsed * (self.rate / self.per))
self.last_update = now
if self.tokens < 1:
wait_time = (1 - self.tokens) * (self.per / self.rate)
await asyncio.sleep(wait_time)
self.tokens = 0
else:
self.tokens -= 1
class BaseAsyncClient:
"""Base class for async API clients with rate limiting."""
def __init__(
self,
max_concurrent: int = 5,
requests_per_second: float = 3.0,
logger: logging.Logger | None = None,
):
"""
Initialize base client.
Args:
max_concurrent: Maximum concurrent requests
requests_per_second: Rate limit
logger: Logger instance
"""
self.semaphore = Semaphore(max_concurrent)
self.rate_limiter = RateLimiter(requests_per_second)
self.logger = logger or logging.getLogger(self.__class__.__name__)
self.stats = {
"requests": 0,
"success": 0,
"errors": 0,
"retries": 0,
}
@retry(
stop=stop_after_attempt(3),
wait=wait_exponential(multiplier=1, min=2, max=10),
retry=retry_if_exception_type(Exception),
)
async def _rate_limited_request(
self,
coro: Callable[[], Any],
) -> Any:
"""Execute a request with rate limiting and retry."""
async with self.semaphore:
await self.rate_limiter.acquire()
self.stats["requests"] += 1
try:
result = await coro()
self.stats["success"] += 1
return result
except Exception as e:
self.stats["errors"] += 1
self.logger.error(f"Request failed: {e}")
raise
async def batch_requests(
self,
requests: list[Callable[[], Any]],
desc: str = "Processing",
) -> list[Any]:
"""Execute multiple requests concurrently."""
try:
from tqdm.asyncio import tqdm
has_tqdm = True
except ImportError:
has_tqdm = False
async def execute(req: Callable) -> Any:
try:
return await self._rate_limited_request(req)
except Exception as e:
return {"error": str(e)}
tasks = [execute(req) for req in requests]
if has_tqdm:
results = []
for coro in tqdm.as_completed(tasks, total=len(tasks), desc=desc):
result = await coro
results.append(result)
return results
else:
return await asyncio.gather(*tasks, return_exceptions=True)
def print_stats(self) -> None:
"""Print request statistics."""
self.logger.info("=" * 40)
self.logger.info("Request Statistics:")
self.logger.info(f" Total Requests: {self.stats['requests']}")
self.logger.info(f" Successful: {self.stats['success']}")
self.logger.info(f" Errors: {self.stats['errors']}")
self.logger.info("=" * 40)
class ConfigManager:
"""Manage API configuration and credentials."""
def __init__(self):
load_dotenv()
@property
def google_credentials_path(self) -> str | None:
"""Get Google service account credentials path."""
# Prefer SEO-specific credentials, fallback to general credentials
seo_creds = os.path.expanduser("~/.credential/ourdigital-seo-agent.json")
if os.path.exists(seo_creds):
return seo_creds
return os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
@property
def pagespeed_api_key(self) -> str | None:
"""Get PageSpeed Insights API key."""
return os.getenv("PAGESPEED_API_KEY")
@property
def custom_search_api_key(self) -> str | None:
"""Get Custom Search API key."""
return os.getenv("CUSTOM_SEARCH_API_KEY")
@property
def custom_search_engine_id(self) -> str | None:
"""Get Custom Search Engine ID."""
return os.getenv("CUSTOM_SEARCH_ENGINE_ID")
@property
def notion_token(self) -> str | None:
"""Get Notion API token."""
return os.getenv("NOTION_TOKEN") or os.getenv("NOTION_API_KEY")
def validate_google_credentials(self) -> bool:
"""Validate Google credentials are configured."""
creds_path = self.google_credentials_path
if not creds_path:
return False
return os.path.exists(creds_path)
def get_required(self, key: str) -> str:
"""Get required environment variable or raise error."""
value = os.getenv(key)
if not value:
raise ValueError(f"Missing required environment variable: {key}")
return value
# Singleton config instance
config = ConfigManager()

View File

@@ -0,0 +1,409 @@
"""
Google Search Console Client
============================
Purpose: Interact with Google Search Console API for SEO data
Python: 3.10+
Usage:
from gsc_client import SearchConsoleClient
client = SearchConsoleClient()
data = client.get_search_analytics("sc-domain:example.com")
"""
import logging
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import Any
from google.oauth2 import service_account
from googleapiclient.discovery import build
from base_client import config
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
@dataclass
class SearchAnalyticsResult:
"""Search analytics query result."""
rows: list[dict] = field(default_factory=list)
total_clicks: int = 0
total_impressions: int = 0
average_ctr: float = 0.0
average_position: float = 0.0
@dataclass
class SitemapInfo:
"""Sitemap information from Search Console."""
path: str
last_submitted: str | None = None
last_downloaded: str | None = None
is_pending: bool = False
is_sitemaps_index: bool = False
warnings: int = 0
errors: int = 0
class SearchConsoleClient:
"""Client for Google Search Console API."""
SCOPES = ["https://www.googleapis.com/auth/webmasters.readonly"]
def __init__(self, credentials_path: str | None = None):
"""
Initialize Search Console client.
Args:
credentials_path: Path to service account JSON key
"""
self.credentials_path = credentials_path or config.google_credentials_path
self._service = None
@property
def service(self):
"""Get or create Search Console service."""
if self._service is None:
if not self.credentials_path:
raise ValueError(
"Google credentials not configured. "
"Set GOOGLE_APPLICATION_CREDENTIALS environment variable."
)
credentials = service_account.Credentials.from_service_account_file(
self.credentials_path,
scopes=self.SCOPES,
)
self._service = build("searchconsole", "v1", credentials=credentials)
return self._service
def list_sites(self) -> list[dict]:
"""List all sites accessible to the service account."""
response = self.service.sites().list().execute()
return response.get("siteEntry", [])
def get_search_analytics(
self,
site_url: str,
start_date: str | None = None,
end_date: str | None = None,
dimensions: list[str] | None = None,
row_limit: int = 25000,
filters: list[dict] | None = None,
) -> SearchAnalyticsResult:
"""
Get search analytics data.
Args:
site_url: Site URL (e.g., "sc-domain:example.com" or "https://example.com/")
start_date: Start date (YYYY-MM-DD), defaults to 30 days ago
end_date: End date (YYYY-MM-DD), defaults to yesterday
dimensions: List of dimensions (query, page, country, device, date)
row_limit: Maximum rows to return
filters: Dimension filters
Returns:
SearchAnalyticsResult with rows and summary stats
"""
# Default date range: last 30 days
if not end_date:
end_date = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
if not start_date:
start_date = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d")
# Default dimensions
if dimensions is None:
dimensions = ["query", "page"]
request_body = {
"startDate": start_date,
"endDate": end_date,
"dimensions": dimensions,
"rowLimit": row_limit,
}
if filters:
request_body["dimensionFilterGroups"] = [{"filters": filters}]
try:
response = self.service.searchanalytics().query(
siteUrl=site_url,
body=request_body,
).execute()
except Exception as e:
logger.error(f"Failed to query search analytics: {e}")
raise
rows = response.get("rows", [])
# Calculate totals
total_clicks = sum(row.get("clicks", 0) for row in rows)
total_impressions = sum(row.get("impressions", 0) for row in rows)
total_ctr = sum(row.get("ctr", 0) for row in rows)
total_position = sum(row.get("position", 0) for row in rows)
avg_ctr = total_ctr / len(rows) if rows else 0
avg_position = total_position / len(rows) if rows else 0
return SearchAnalyticsResult(
rows=rows,
total_clicks=total_clicks,
total_impressions=total_impressions,
average_ctr=avg_ctr,
average_position=avg_position,
)
def get_top_queries(
self,
site_url: str,
limit: int = 100,
start_date: str | None = None,
end_date: str | None = None,
) -> list[dict]:
"""Get top search queries by clicks."""
result = self.get_search_analytics(
site_url=site_url,
dimensions=["query"],
row_limit=limit,
start_date=start_date,
end_date=end_date,
)
# Sort by clicks
sorted_rows = sorted(
result.rows,
key=lambda x: x.get("clicks", 0),
reverse=True,
)
return [
{
"query": row["keys"][0],
"clicks": row.get("clicks", 0),
"impressions": row.get("impressions", 0),
"ctr": row.get("ctr", 0),
"position": row.get("position", 0),
}
for row in sorted_rows[:limit]
]
def get_top_pages(
self,
site_url: str,
limit: int = 100,
start_date: str | None = None,
end_date: str | None = None,
) -> list[dict]:
"""Get top pages by clicks."""
result = self.get_search_analytics(
site_url=site_url,
dimensions=["page"],
row_limit=limit,
start_date=start_date,
end_date=end_date,
)
sorted_rows = sorted(
result.rows,
key=lambda x: x.get("clicks", 0),
reverse=True,
)
return [
{
"page": row["keys"][0],
"clicks": row.get("clicks", 0),
"impressions": row.get("impressions", 0),
"ctr": row.get("ctr", 0),
"position": row.get("position", 0),
}
for row in sorted_rows[:limit]
]
def get_sitemaps(self, site_url: str) -> list[SitemapInfo]:
"""Get list of sitemaps for a site."""
try:
response = self.service.sitemaps().list(siteUrl=site_url).execute()
except Exception as e:
logger.error(f"Failed to get sitemaps: {e}")
raise
sitemaps = []
for sm in response.get("sitemap", []):
sitemaps.append(SitemapInfo(
path=sm.get("path", ""),
last_submitted=sm.get("lastSubmitted"),
last_downloaded=sm.get("lastDownloaded"),
is_pending=sm.get("isPending", False),
is_sitemaps_index=sm.get("isSitemapsIndex", False),
warnings=sm.get("warnings", 0),
errors=sm.get("errors", 0),
))
return sitemaps
def submit_sitemap(self, site_url: str, sitemap_url: str) -> bool:
"""Submit a sitemap for indexing."""
try:
self.service.sitemaps().submit(
siteUrl=site_url,
feedpath=sitemap_url,
).execute()
logger.info(f"Submitted sitemap: {sitemap_url}")
return True
except Exception as e:
logger.error(f"Failed to submit sitemap: {e}")
return False
def inspect_url(self, site_url: str, inspection_url: str) -> dict:
"""
Inspect a URL's indexing status.
Note: This uses the URL Inspection API which may have different quotas.
"""
try:
response = self.service.urlInspection().index().inspect(
body={
"inspectionUrl": inspection_url,
"siteUrl": site_url,
}
).execute()
result = response.get("inspectionResult", {})
return {
"url": inspection_url,
"indexing_state": result.get("indexStatusResult", {}).get(
"coverageState", "Unknown"
),
"last_crawl_time": result.get("indexStatusResult", {}).get(
"lastCrawlTime"
),
"crawled_as": result.get("indexStatusResult", {}).get("crawledAs"),
"robots_txt_state": result.get("indexStatusResult", {}).get(
"robotsTxtState"
),
"mobile_usability": result.get("mobileUsabilityResult", {}).get(
"verdict", "Unknown"
),
"rich_results": result.get("richResultsResult", {}).get(
"verdict", "Unknown"
),
}
except Exception as e:
logger.error(f"Failed to inspect URL: {e}")
raise
def get_performance_summary(
self,
site_url: str,
days: int = 30,
) -> dict:
"""Get a summary of search performance."""
end_date = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
start_date = (datetime.now() - timedelta(days=days)).strftime("%Y-%m-%d")
# Get overall stats
overall = self.get_search_analytics(
site_url=site_url,
dimensions=[],
start_date=start_date,
end_date=end_date,
)
# Get top queries
top_queries = self.get_top_queries(
site_url=site_url,
limit=10,
start_date=start_date,
end_date=end_date,
)
# Get top pages
top_pages = self.get_top_pages(
site_url=site_url,
limit=10,
start_date=start_date,
end_date=end_date,
)
# Get by device
by_device = self.get_search_analytics(
site_url=site_url,
dimensions=["device"],
start_date=start_date,
end_date=end_date,
)
device_breakdown = {}
for row in by_device.rows:
device = row["keys"][0]
device_breakdown[device] = {
"clicks": row.get("clicks", 0),
"impressions": row.get("impressions", 0),
"ctr": row.get("ctr", 0),
"position": row.get("position", 0),
}
return {
"period": f"{start_date} to {end_date}",
"total_clicks": overall.total_clicks,
"total_impressions": overall.total_impressions,
"average_ctr": overall.average_ctr,
"average_position": overall.average_position,
"top_queries": top_queries,
"top_pages": top_pages,
"by_device": device_breakdown,
}
def main():
"""Test the Search Console client."""
import argparse
parser = argparse.ArgumentParser(description="Google Search Console Client")
parser.add_argument("--site", "-s", required=True, help="Site URL")
parser.add_argument("--action", "-a", default="summary",
choices=["summary", "queries", "pages", "sitemaps", "inspect"],
help="Action to perform")
parser.add_argument("--url", help="URL to inspect")
parser.add_argument("--days", type=int, default=30, help="Days of data")
args = parser.parse_args()
client = SearchConsoleClient()
if args.action == "summary":
summary = client.get_performance_summary(args.site, args.days)
import json
print(json.dumps(summary, indent=2, default=str))
elif args.action == "queries":
queries = client.get_top_queries(args.site)
for q in queries[:20]:
print(f"{q['query']}: {q['clicks']} clicks, pos {q['position']:.1f}")
elif args.action == "pages":
pages = client.get_top_pages(args.site)
for p in pages[:20]:
print(f"{p['page']}: {p['clicks']} clicks, pos {p['position']:.1f}")
elif args.action == "sitemaps":
sitemaps = client.get_sitemaps(args.site)
for sm in sitemaps:
print(f"{sm.path}: errors={sm.errors}, warnings={sm.warnings}")
elif args.action == "inspect" and args.url:
result = client.inspect_url(args.site, args.url)
import json
print(json.dumps(result, indent=2))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,7 @@
# 16-seo-search-console dependencies
google-api-python-client>=2.100.0
google-auth>=2.23.0
pandas>=2.1.0
python-dotenv>=1.0.0
rich>=13.7.0
typer>=0.9.0