feat(reference-curator): Add portable skill suite for reference documentation curation

6 modular skills for curating, processing, and exporting reference docs: - reference-discovery: Search and validate authoritative sources - web-crawler-orchestrator: Multi-backend crawling (Firecrawl/Node/aiohttp/Scrapy) - content-repository: MySQL storage with version tracking - content-distiller: Summarization and key concept extraction - quality-reviewer: QA loop with approve/refactor/research routing - markdown-exporter: Structured output for Claude Projects or fine-tuning Cross-machine installation support: - Environment-based config (~/.reference-curator.env) - Commands tracked in repo, symlinked during install - install.sh with --minimal, --check, --uninstall modes - Firecrawl MCP as default (always available) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-29 00:20:27 +07:00
parent e80056ae8a
commit 6d7a6d7a88
26 changed files with 4486 additions and 1 deletions
--- a/custom-skills/90-reference-curator/shared/config/crawl_config.yaml
+++ b/custom-skills/90-reference-curator/shared/config/crawl_config.yaml
@@ -0,0 +1,139 @@
+# Reference Curator - Crawl Configuration
+# Location: ~/.config/reference-curator/crawl_config.yaml
+#
+# Environment variables (set in ~/.reference-curator.env; all optional):
+#   CRAWLER_PROJECT_PATH - Path to crawler project; REFERENCE_LIBRARY_PATH - Path to reference library storage
+#   DEFAULT_CRAWLER - Default backend; {NODEJS,AIOHTTP,SCRAPY}_CRAWLER_ENABLED - Enable a local crawler
+
+# Default crawler backend, read from DEFAULT_CRAWLER via a shell-style ${VAR:-default} placeholder
+# Options: nodejs, aiohttp, scrapy, firecrawl
+# Falls back to "firecrawl"; keep that value if local crawlers are not available
+default_crawler: ${DEFAULT_CRAWLER:-firecrawl}
+
+# Intelligent routing rules: per-backend conditions Claude uses to select a crawler
+# NOTE(review): whether a backend needs all or any of its conditions is not stated here - confirm
+routing:
+  nodejs:
+    conditions:
+      - max_pages <= 50
+      - single_domain == true
+      - no_javascript_rendering == true  # NOTE(review): firecrawl uses needs_javascript_rendering - confirm naming
+    description: "Fast, lightweight - best for small documentation sites"
+
+  aiohttp:
+    conditions:
+      - max_pages <= 200  # overlaps the nodejs range (<= 50)
+      - needs_async == true
+      - seo_extraction == true
+    description: "Async with SEO extraction - best for technical docs"
+
+  scrapy:
+    conditions:
+      - max_pages > 200
+      - multi_domain == true
+      - needs_pipeline == true
+    description: "Enterprise-grade - best for large sites with complex structure"
+
+  firecrawl:
+    conditions:
+      - needs_javascript_rendering == true
+      - spa_site == true
+      - dynamic_content == true
+    description: "JS rendering - best for SPAs and dynamic content (MCP-based, always available)"
+
+# Crawler locations (configurable via environment)
+# Local crawlers stay disabled unless their *_CRAWLER_ENABLED variable is set; otherwise only Firecrawl MCP is available
+crawlers:
+  nodejs:
+    enabled: ${NODEJS_CRAWLER_ENABLED:-false}  # defaults to false
+    path: ${CRAWLER_PROJECT_PATH}/util/js-crawler/  # NOTE(review): no fallback; presumably /util/js-crawler/ when unset
+    command: node src/crawler.js
+    install: npm install
+
+  aiohttp:
+    enabled: ${AIOHTTP_CRAWLER_ENABLED:-false}  # defaults to false
+    path: ${CRAWLER_PROJECT_PATH}/  # project root; same path as scrapy
+    command: python -m seo_agent.crawler
+    install: uv sync
+
+  scrapy:
+    enabled: ${SCRAPY_CRAWLER_ENABLED:-false}  # defaults to false
+    path: ${CRAWLER_PROJECT_PATH}/  # project root; same path as aiohttp
+    command: scrapy crawl seo_spider
+    install: uv sync
+
+  firecrawl:
+    enabled: true  # Always available via MCP
+    type: mcp  # the only entry with a type/server/tools instead of path/command
+    server: firecrawl
+    tools:
+      - firecrawl_scrape
+      - firecrawl_crawl
+      - firecrawl_map
+    install: "Configure firecrawl MCP server in Claude Code settings"
+
+# Request throttling
+rate_limit:
+  concurrent_requests: 3   # requests allowed at the same time
+  requests_per_minute: 20  # requests allowed per minute
+
+# Retry and backoff settings
+retry:
+  initial_delay_seconds: 10  # seconds, per the key name
+  backoff_multiplier: 2      # presumably scales the delay between attempts - confirm
+  max_retries: 3
+
+# Baseline options for a crawl
+default_options:
+  user_agent: "ReferenceBot/1.0"
+  respect_robots_txt: true
+  max_pages: 100
+  max_depth: 3
+  timeout: 30000  # milliseconds
+
+# Content processing: size cap, accepted formats, and storage directories
+processing:
+  max_content_size_mb: 50  # megabytes, per the key name
+  supported_formats:
+    - html
+    - pdf
+    - markdown
+
+  # Storage paths under REFERENCE_LIBRARY_PATH (fallback: ~/reference-library)
+  raw_content_dir: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/raw/  # NOTE(review): confirm "~" gets expanded
+  processed_dir: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/processed/
+
+# URL filtering (skip lists)
+url_filter:
+  skip_extensions:  # image, stylesheet, script, and font file suffixes
+    - '.jpg'
+    - '.jpeg'
+    - '.png'
+    - '.gif'
+    - '.svg'
+    - '.webp'
+    - '.css'
+    - '.js'
+    - '.woff'
+    - '.woff2'
+    - '.ico'
+  skip_patterns:  # URL path fragments
+    - '/wp-admin/'
+    - '/wp-includes/'
+    - '/login'
+    - '/logout'
+
+# Hints for telling SPA sites from statically generated ones
+site_detection:
+  spa_indicators:  # front-end framework names
+    - 'react'
+    - 'angular'
+    - 'vue'
+    - 'next.js'
+    - 'nuxt'
+  static_indicators:  # static-site / docs generator names
+    - 'hugo'
+    - 'jekyll'
+    - 'mkdocs'
+    - 'docusaurus'
+    - 'gitbook'
--- a/custom-skills/90-reference-curator/shared/config/db_config.yaml
+++ b/custom-skills/90-reference-curator/shared/config/db_config.yaml
@@ -0,0 +1,31 @@
+# Reference Curator - Database Configuration
+# Location: ~/.config/reference-curator/db_config.yaml
+#
+# Environment variables (set in ~/.reference-curator.env):
+#   MYSQL_USER     - MySQL username (required)
+#   MYSQL_PASSWORD - MySQL password (required)
+#   MYSQL_HOST, MYSQL_PORT - MySQL host and port (optional, defaults: localhost, 3306)
+#   REFERENCE_LIBRARY_PATH - Root for backups (optional, default: ~/reference-library)
+
+mysql:  # MySQL connection settings; ${VAR} placeholders are filled from the environment
+  host: ${MYSQL_HOST:-localhost}
+  port: ${MYSQL_PORT:-3306}
+  database: reference_library
+  user: ${MYSQL_USER}  # no fallback - MYSQL_USER is required
+  password: ${MYSQL_PASSWORD}  # no fallback - MYSQL_PASSWORD is required; the secret is not stored in this file
+  charset: utf8mb4
+
+  # Connection pool settings
+  pool_size: 5
+  pool_recycle: 3600  # presumably seconds - TODO confirm
+
+  # SSL (if needed for remote MySQL)
+  ssl:
+    enabled: false
+    ca_cert: null  # no CA certificate configured
+
+# Backup settings: on/off switch, destination directory, and retention
+backup:
+  enabled: true
+  path: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/backups/  # under REFERENCE_LIBRARY_PATH, else ~/reference-library
+  retention_days: 30  # days, per the key name
--- a/custom-skills/90-reference-curator/shared/config/export_config.yaml
+++ b/custom-skills/90-reference-curator/shared/config/export_config.yaml
@@ -0,0 +1,46 @@
+# Reference Curator - Export Configuration
+# Location: ~/.config/reference-curator/export_config.yaml
+#
+# Copy this file to ~/.config/reference-curator/export_config.yaml
+
+output:  # Export destination and per-format settings
+  base_path: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/exports/  # same root as the crawl_config.yaml / db_config.yaml paths
+
+  # Project files format (for Claude Projects)
+  project_files:
+    structure: nested_by_topic  # flat | nested_by_topic | nested_by_source
+    index_file: INDEX.md
+    include_metadata: true
+    max_file_size_kb: 500  # kilobytes, per the key name
+
+  # Fine-tuning dataset format
+  fine_tuning:
+    format: jsonl
+    include_system_prompt: true
+    system_prompt: "You are an expert on AI and prompt engineering."
+    max_tokens_per_sample: 4096
+
+  # Knowledge base format
+  knowledge_base:
+    structure: flat
+    include_toc: true
+
+# Quality thresholds and approval switches
+quality:
+  require_human_review: false
+  min_score_for_export: 0.80
+  auto_approve_min_score: 0.80  # currently equal to min_score_for_export
+  auto_approve_tier1_sources: true
+
+# Settings for cross-references between documents
+cross_references:
+  enabled: true
+  max_related_docs: 5
+  min_concept_overlap: 2
+
+# Verification checks
+verification:
+  validate_markdown: true
+  check_broken_links: true
+  check_duplicates: true
+  max_allowed_duplicates: 0  # zero duplicates allowed