Files
our-claude-skills/custom-skills/90-reference-curator/shared/config/crawl_config.yaml
Andrew Yim dc8b6b82a3 feat: Add multi-agent-guide skill and update reference-curator configs
- Add 91-multi-agent-guide skill for setting up multi-agent collaboration
  with templates for AGENTS.md, Claude, Gemini, Codex configs, and CI/CD
- Add USER-GUIDE.md for reference-curator documentation
- Update default paths in reference-curator configs to use
  ~/Documents/05_AI Agent/10_Reference Library/
- Update settings-audit-report.md

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-29 20:16:24 +07:00

140 lines
3.3 KiB
YAML

# Reference Curator - Crawl Configuration
# Location: ~/.config/reference-curator/crawl_config.yaml
#
# Environment variables (set in ~/.reference-curator.env):
#   CRAWLER_PROJECT_PATH - Path to crawler project (optional)
#   REFERENCE_LIBRARY_PATH - Path to reference library storage (optional)

# Default crawler backend
# Options: nodejs, aiohttp, scrapy, firecrawl
# Set to "firecrawl" if local crawlers are not available
# Quoted: templated ${VAR:-default} values should stay strings for the consumer.
default_crawler: "${DEFAULT_CRAWLER:-firecrawl}"
# Intelligent routing rules
# Claude will select the appropriate crawler based on these criteria.
# Each entry lists the conditions under which that backend is preferred.
routing:
  nodejs:
    conditions:
      - max_pages <= 50
      - single_domain == true
      - no_javascript_rendering == true
    description: "Fast, lightweight - best for small documentation sites"
  aiohttp:
    conditions:
      - max_pages <= 200
      - needs_async == true
      - seo_extraction == true
    description: "Async with SEO extraction - best for technical docs"
  scrapy:
    conditions:
      - max_pages > 200
      - multi_domain == true
      - needs_pipeline == true
    description: "Enterprise-grade - best for large sites with complex structure"
  firecrawl:
    conditions:
      - needs_javascript_rendering == true
      - spa_site == true
      - dynamic_content == true
    description: "JS rendering - best for SPAs and dynamic content (MCP-based, always available)"
# Crawler locations (configurable via environment)
# If CRAWLER_PROJECT_PATH is not set, only Firecrawl MCP will be available.
# Templated ${VAR:-default} values are quoted so they remain plain strings.
crawlers:
  nodejs:
    enabled: "${NODEJS_CRAWLER_ENABLED:-false}"
    path: "${CRAWLER_PROJECT_PATH}/util/js-crawler/"
    command: node src/crawler.js
    install: npm install
  aiohttp:
    enabled: "${AIOHTTP_CRAWLER_ENABLED:-false}"
    path: "${CRAWLER_PROJECT_PATH}/"
    command: python -m seo_agent.crawler
    install: uv sync
  scrapy:
    enabled: "${SCRAPY_CRAWLER_ENABLED:-false}"
    path: "${CRAWLER_PROJECT_PATH}/"
    command: scrapy crawl seo_spider
    install: uv sync
  firecrawl:
    enabled: true  # Always available via MCP
    type: mcp
    server: firecrawl
    tools:
      - firecrawl_scrape
      - firecrawl_crawl
      - firecrawl_map
    install: "Configure firecrawl MCP server in Claude Code settings"
# Rate limiting
rate_limit:
  requests_per_minute: 20
  concurrent_requests: 3
# Retry settings (exponential backoff: delay = initial_delay_seconds * backoff_multiplier^attempt
# — presumably; confirm against the consumer's retry implementation)
retry:
  max_retries: 3
  backoff_multiplier: 2
  initial_delay_seconds: 10
# Default crawl options
default_options:
  timeout: 30000  # milliseconds
  max_depth: 3
  max_pages: 100
  respect_robots_txt: true
  user_agent: "ReferenceBot/1.0"
# Content processing
processing:
  max_content_size_mb: 50
  supported_formats:
    - html
    - pdf
    - markdown

# Storage paths (configurable)
# Quoted: values contain spaces and ${VAR:-default} template syntax.
raw_content_dir: "${REFERENCE_LIBRARY_PATH:-~/Documents/05_AI Agent/10_Reference Library}/raw/"
processed_dir: "${REFERENCE_LIBRARY_PATH:-~/Documents/05_AI Agent/10_Reference Library}/processed/"
# URL filtering
url_filter:
  # File extensions skipped during crawling (assets, not content)
  skip_extensions:
    - .jpg
    - .jpeg
    - .png
    - .gif
    - .svg
    - .webp
    - .css
    - .js
    - .woff
    - .woff2
    - .ico
  # URL path substrings skipped during crawling
  skip_patterns:
    - /wp-admin/
    - /wp-includes/
    - /login
    - /logout
# Site type detection hints
# Markers used to classify a target as a JS-rendered SPA vs. a static site
# (routes SPAs to firecrawl per the routing rules above).
site_detection:
  spa_indicators:
    - "react"
    - "angular"
    - "vue"
    - "next.js"
    - "nuxt"
  static_indicators:
    - "hugo"
    - "jekyll"
    - "mkdocs"
    - "docusaurus"
    - "gitbook"