---
# Reference Curator - Crawl Configuration
# Location: ~/.config/reference-curator/crawl_config.yaml
#
# Environment variables (set in ~/.reference-curator.env):
#   CRAWLER_PROJECT_PATH   - Path to crawler project (optional)
#   REFERENCE_LIBRARY_PATH - Path to reference library storage (optional)
#
# Values written as ${VAR:-default} are shell-style defaults and are expected
# to be expanded by the consuming tool before YAML parsing sees final values.

# Default crawler backend.
# Options: nodejs, aiohttp, scrapy, firecrawl
# Set to "firecrawl" if local crawlers are not available.
default_crawler: ${DEFAULT_CRAWLER:-firecrawl}

# Intelligent routing rules.
# Claude will select the appropriate crawler based on these criteria.
# Each entry lists the conditions under which that backend is preferred,
# plus a human-readable description of its sweet spot.
routing:
  nodejs:
    conditions:
      - max_pages <= 50
      - single_domain == true
      - no_javascript_rendering == true
    description: "Fast, lightweight - best for small documentation sites"
  aiohttp:
    conditions:
      - max_pages <= 200
      - needs_async == true
      - seo_extraction == true
    description: "Async with SEO extraction - best for technical docs"
  scrapy:
    conditions:
      - max_pages > 200
      - multi_domain == true
      - needs_pipeline == true
    description: "Enterprise-grade - best for large sites with complex structure"
  firecrawl:
    conditions:
      - needs_javascript_rendering == true
      - spa_site == true
      - dynamic_content == true
    description: "JS rendering - best for SPAs and dynamic content (MCP-based, always available)"

# Crawler locations (configurable via environment).
# If CRAWLER_PROJECT_PATH is not set, only Firecrawl MCP will be available.
crawlers:
  nodejs:
    enabled: ${NODEJS_CRAWLER_ENABLED:-false}
    path: ${CRAWLER_PROJECT_PATH}/util/js-crawler/
    command: node src/crawler.js
    install: npm install
  aiohttp:
    enabled: ${AIOHTTP_CRAWLER_ENABLED:-false}
    path: ${CRAWLER_PROJECT_PATH}/
    command: python -m seo_agent.crawler
    install: uv sync
  scrapy:
    enabled: ${SCRAPY_CRAWLER_ENABLED:-false}
    path: ${CRAWLER_PROJECT_PATH}/
    command: scrapy crawl seo_spider
    install: uv sync
  firecrawl:
    enabled: true  # Always available via MCP
    type: mcp
    server: firecrawl
    tools:
      - firecrawl_scrape
      - firecrawl_crawl
      - firecrawl_map
    install: "Configure firecrawl MCP server in Claude Code settings"

# Rate limiting
rate_limit:
  requests_per_minute: 20
  concurrent_requests: 3

# Retry settings
retry:
  max_retries: 3
  backoff_multiplier: 2
  initial_delay_seconds: 10

# Default crawl options
default_options:
  timeout: 30000  # milliseconds
  max_depth: 3
  max_pages: 100
  respect_robots_txt: true
  user_agent: "ReferenceBot/1.0"

# Content processing
processing:
  max_content_size_mb: 50
  supported_formats:
    - html
    - pdf
    - markdown

# Storage paths (configurable).
# Quoted because the default paths contain spaces.
raw_content_dir: "${REFERENCE_LIBRARY_PATH:-~/Documents/05_AI Agent/10_Reference Library}/raw/"
processed_dir: "${REFERENCE_LIBRARY_PATH:-~/Documents/05_AI Agent/10_Reference Library}/processed/"

# URL filtering: asset extensions and admin/auth paths to skip while crawling.
url_filter:
  skip_extensions:
    - .jpg
    - .jpeg
    - .png
    - .gif
    - .svg
    - .webp
    - .css
    - .js
    - .woff
    - .woff2
    - .ico
  skip_patterns:
    - /wp-admin/
    - /wp-includes/
    - /login
    - /logout

# Site type detection hints.
# Substrings that suggest a single-page app (needs JS rendering)
# versus a static-site generator (plain HTTP crawl is enough).
site_detection:
  spa_indicators:
    - "react"
    - "angular"
    - "vue"
    - "next.js"
    - "nuxt"
  static_indicators:
    - "hugo"
    - "jekyll"
    - "mkdocs"
    - "docusaurus"
    - "gitbook"