6 modular skills for curating, processing, and exporting reference docs:

- reference-discovery: Search and validate authoritative sources
- web-crawler-orchestrator: Multi-backend crawling (Firecrawl/Node/aiohttp/Scrapy)
- content-repository: MySQL storage with version tracking
- content-distiller: Summarization and key concept extraction
- quality-reviewer: QA loop with approve/refactor/research routing
- markdown-exporter: Structured output for Claude Projects or fine-tuning

Cross-machine installation support:

- Environment-based config (~/.reference-curator.env)
- Commands tracked in repo, symlinked during install
- install.sh with --minimal, --check, --uninstall modes
- Firecrawl MCP as default (always available)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
140 lines
3.3 KiB
YAML
140 lines
3.3 KiB
YAML
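# Reference Curator - Crawl Configuration
# Location: ~/.config/reference-curator/crawl_config.yaml
#
# Environment variables (set in ~/.reference-curator.env):
#   CRAWLER_PROJECT_PATH    - Path to crawler project (optional)
#   REFERENCE_LIBRARY_PATH  - Path to reference library storage (optional)
#   DEFAULT_CRAWLER         - Default crawler backend (falls back to firecrawl)
#   NODEJS_CRAWLER_ENABLED  - Enable the nodejs crawler (falls back to false)
#   AIOHTTP_CRAWLER_ENABLED - Enable the aiohttp crawler (falls back to false)
#   SCRAPY_CRAWLER_ENABLED  - Enable the scrapy crawler (falls back to false)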
|
|
|
|
# Default crawler backend.
# Accepted values: nodejs, aiohttp, scrapy, firecrawl.
# Taken from DEFAULT_CRAWLER; the fallback after ":-" is "firecrawl", which
# is the value to keep when the local crawlers are not available.
default_crawler: ${DEFAULT_CRAWLER:-firecrawl}
|
|
|
|
# Intelligent routing rules
# Claude selects the appropriate crawler based on the criteria listed for
# each backend below.
# NOTE(review): this file does not state whether the conditions of one
# backend are combined with AND or OR - confirm with the consumer.
routing:
  nodejs:
    description: "Fast, lightweight - best for small documentation sites"
    conditions:
      - "max_pages <= 50"
      - "single_domain == true"
      - "no_javascript_rendering == true"

  aiohttp:
    description: "Async with SEO extraction - best for technical docs"
    conditions:
      - "max_pages <= 200"
      - "needs_async == true"
      - "seo_extraction == true"

  scrapy:
    description: "Enterprise-grade - best for large sites with complex structure"
    conditions:
      - "max_pages > 200"
      - "multi_domain == true"
      - "needs_pipeline == true"

  firecrawl:
    description: "JS rendering - best for SPAs and dynamic content (MCP-based, always available)"
    conditions:
      - "needs_javascript_rendering == true"
      - "spa_site == true"
      - "dynamic_content == true"
|
|
|
|
# Crawler locations (configurable via environment)
# When CRAWLER_PROJECT_PATH is not set, only the Firecrawl MCP backend is
# available.
# The ${...} placeholders are intentionally kept as plain (unquoted) scalars.
# NOTE(review): how the loader expands them is not visible in this file. With
# CRAWLER_PROJECT_PATH unset the paths below would presumably start at "/" -
# confirm that disabled crawlers are never resolved.
crawlers:
  nodejs:
    enabled: ${NODEJS_CRAWLER_ENABLED:-false}
    path: ${CRAWLER_PROJECT_PATH}/util/js-crawler/
    install: "npm install"
    command: "node src/crawler.js"

  aiohttp:
    enabled: ${AIOHTTP_CRAWLER_ENABLED:-false}
    path: ${CRAWLER_PROJECT_PATH}/
    install: "uv sync"
    command: "python -m seo_agent.crawler"

  scrapy:
    enabled: ${SCRAPY_CRAWLER_ENABLED:-false}
    path: ${CRAWLER_PROJECT_PATH}/
    install: "uv sync"
    command: "scrapy crawl seo_spider"

  firecrawl:
    type: mcp
    server: firecrawl
    enabled: true  # Always available via MCP
    tools: [firecrawl_scrape, firecrawl_crawl, firecrawl_map]
    # A setup hint rather than a shell command, unlike the other backends.
    install: "Configure firecrawl MCP server in Claude Code settings"
|
|
|
|
# Rate limiting
# NOTE(review): whether these limits apply per domain or across the whole
# crawl is not stated here - confirm with the consumer.
rate_limit:
  concurrent_requests: 3
  requests_per_minute: 20
|
|
|
|
# Retry settings
# NOTE(review): presumably the wait starts at initial_delay_seconds and is
# multiplied by backoff_multiplier after each failed attempt - confirm.
retry:
  initial_delay_seconds: 10
  backoff_multiplier: 2
  max_retries: 3
|
|
|
|
# Default crawl options
default_options:
  user_agent: "ReferenceBot/1.0"
  respect_robots_txt: true
  max_pages: 100
  max_depth: 3
  timeout: 30000  # milliseconds
|
|
|
|
# Content processing
processing:
  # Size limit, expressed in megabytes as the key name indicates.
  max_content_size_mb: 50
  supported_formats: [html, pdf, markdown]
|
|
|
|
# Storage paths (configurable)
# Both directories sit under REFERENCE_LIBRARY_PATH; the fallback after ":-"
# is ~/reference-library.
# NOTE(review): the fallback contains "~" - confirm that the consumer expands
# it to the user's home directory.
raw_content_dir: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/raw/
processed_dir: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/processed/
|
|
|
|
# URL filtering
url_filter:
  # File extensions to skip: images, stylesheets, scripts, fonts and icons.
  skip_extensions:
    - ".jpg"
    - ".jpeg"
    - ".png"
    - ".gif"
    - ".svg"
    - ".webp"
    - ".css"
    - ".js"
    - ".woff"
    - ".woff2"
    - ".ico"
  # URL patterns to skip.
  # NOTE(review): presumably matched as substrings of the URL path - confirm.
  skip_patterns:
    - "/wp-admin/"
    - "/wp-includes/"
    - "/login"
    - "/logout"
|
|
|
|
# Site type detection hints
# NOTE(review): presumably these names are looked for in a fetched page to
# classify the site as a SPA or a static site - confirm with the consumer.
site_detection:
  spa_indicators: ["react", "angular", "vue", "next.js", "nuxt"]
  static_indicators: ["hugo", "jekyll", "mkdocs", "docusaurus", "gitbook"]
|