Files
our-claude-skills/custom-skills/90-reference-curator/shared/config/crawl_config.yaml
Andrew Yim 6d7a6d7a88 feat(reference-curator): Add portable skill suite for reference documentation curation
6 modular skills for curating, processing, and exporting reference docs:
- reference-discovery: Search and validate authoritative sources
- web-crawler-orchestrator: Multi-backend crawling (Firecrawl/Node/aiohttp/Scrapy)
- content-repository: MySQL storage with version tracking
- content-distiller: Summarization and key concept extraction
- quality-reviewer: QA loop with approve/refactor/research routing
- markdown-exporter: Structured output for Claude Projects or fine-tuning

Cross-machine installation support:
- Environment-based config (~/.reference-curator.env)
- Commands tracked in repo, symlinked during install
- install.sh with --minimal, --check, --uninstall modes
- Firecrawl MCP as default (always available)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-29 00:20:27 +07:00

140 lines
3.3 KiB
YAML

# Reference Curator - Crawl Configuration
# Location: ~/.config/reference-curator/crawl_config.yaml
#
# Environment variables (set in ~/.reference-curator.env):
#   CRAWLER_PROJECT_PATH   - Path to crawler project (optional)
#   REFERENCE_LIBRARY_PATH - Path to reference library storage (optional)
#
# Note: ${VAR:-default} placeholders are shell-style substitutions; they are
# expected to be expanded by the consumer before YAML parsing — TODO confirm.
---

# Default crawler backend.
# Options: nodejs, aiohttp, scrapy, firecrawl
# Set to "firecrawl" if local crawlers are not available.
default_crawler: ${DEFAULT_CRAWLER:-firecrawl}
# Intelligent routing rules.
# Claude selects the appropriate crawler based on these criteria; each entry
# lists the conditions under which a backend is preferred, plus a description.
routing:
  nodejs:
    conditions:
      - max_pages <= 50
      - single_domain == true
      - no_javascript_rendering == true
    description: "Fast, lightweight - best for small documentation sites"
  aiohttp:
    conditions:
      - max_pages <= 200
      - needs_async == true
      - seo_extraction == true
    description: "Async with SEO extraction - best for technical docs"
  scrapy:
    conditions:
      - max_pages > 200
      - multi_domain == true
      - needs_pipeline == true
    description: "Enterprise-grade - best for large sites with complex structure"
  firecrawl:
    conditions:
      - needs_javascript_rendering == true
      - spa_site == true
      - dynamic_content == true
    description: "JS rendering - best for SPAs and dynamic content (MCP-based, always available)"
# Crawler locations (configurable via environment).
# If CRAWLER_PROJECT_PATH is not set, only Firecrawl MCP will be available.
crawlers:
  nodejs:
    enabled: ${NODEJS_CRAWLER_ENABLED:-false}
    path: ${CRAWLER_PROJECT_PATH}/util/js-crawler/
    command: node src/crawler.js
    install: npm install
  aiohttp:
    enabled: ${AIOHTTP_CRAWLER_ENABLED:-false}
    path: ${CRAWLER_PROJECT_PATH}/
    command: python -m seo_agent.crawler
    install: uv sync
  scrapy:
    enabled: ${SCRAPY_CRAWLER_ENABLED:-false}
    path: ${CRAWLER_PROJECT_PATH}/
    command: scrapy crawl seo_spider
    install: uv sync
  firecrawl:
    enabled: true  # Always available via MCP
    type: mcp
    server: firecrawl
    tools:
      - firecrawl_scrape
      - firecrawl_crawl
      - firecrawl_map
    install: "Configure firecrawl MCP server in Claude Code settings"
# Rate limiting applied across all crawler backends.
rate_limit:
  requests_per_minute: 20
  concurrent_requests: 3
# Retry settings: exponential backoff starting at initial_delay_seconds,
# multiplied by backoff_multiplier after each failed attempt.
retry:
  max_retries: 3
  backoff_multiplier: 2
  initial_delay_seconds: 10
# Default crawl options (overridable per crawl).
default_options:
  timeout: 30000  # milliseconds
  max_depth: 3
  max_pages: 100
  respect_robots_txt: true
  user_agent: "ReferenceBot/1.0"
# Content processing and storage locations.
processing:
  max_content_size_mb: 50
  supported_formats:
    - html
    - pdf
    - markdown
  # Storage paths (configurable via REFERENCE_LIBRARY_PATH).
  raw_content_dir: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/raw/
  processed_dir: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/processed/
# URL filtering: skip asset extensions and admin/auth paths during crawls.
url_filter:
  skip_extensions:
    - .jpg
    - .jpeg
    - .png
    - .gif
    - .svg
    - .webp
    - .css
    - .js
    - .woff
    - .woff2
    - .ico
  skip_patterns:
    - /wp-admin/
    - /wp-includes/
    - /login
    - /logout
# Site type detection hints.
# spa_indicators suggest JS rendering is needed (route to firecrawl);
# static_indicators suggest a plain-HTML generator (local crawlers suffice).
site_detection:
  spa_indicators:
    - "react"
    - "angular"
    - "vue"
    - "next.js"
    - "nuxt"
  static_indicators:
    - "hugo"
    - "jekyll"
    - "mkdocs"
    - "docusaurus"
    - "gitbook"