feat(reference-curator): Add portable skill suite for reference documentation curation
6 modular skills for curating, processing, and exporting reference docs:
- reference-discovery: Search and validate authoritative sources
- web-crawler-orchestrator: Multi-backend crawling (Firecrawl/Node/aiohttp/Scrapy)
- content-repository: MySQL storage with version tracking
- content-distiller: Summarization and key concept extraction
- quality-reviewer: QA loop with approve/refactor/research routing
- markdown-exporter: Structured output for Claude Projects or fine-tuning

Cross-machine installation support:
- Environment-based config (~/.reference-curator.env)
- Commands tracked in repo, symlinked during install
- install.sh with --minimal, --check, --uninstall modes
- Firecrawl MCP as default (always available)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,139 @@
|
||||
# Reference Curator - Crawl Configuration
|
||||
# Location: ~/.config/reference-curator/crawl_config.yaml
|
||||
#
|
||||
# Environment variables (set in ~/.reference-curator.env):
|
||||
# CRAWLER_PROJECT_PATH - Path to crawler project (optional)
|
||||
# REFERENCE_LIBRARY_PATH - Path to reference library storage (optional)
|
||||
|
||||
# Default crawler backend.
# Accepted values: nodejs, aiohttp, scrapy, firecrawl.
# Taken from DEFAULT_CRAWLER; the fallback is "firecrawl", which is also the
# value to use when no local crawlers are available.
default_crawler: ${DEFAULT_CRAWLER:-firecrawl}
|
||||
|
||||
# Intelligent routing rules.
# Claude chooses a crawler by comparing the crawl job with the criteria
# listed for each backend below.
# NOTE(review): nothing here says whether a backend's conditions must all
# hold or whether one is enough — confirm with the consuming skill.
routing:
  nodejs:
    description: 'Fast, lightweight - best for small documentation sites'
    conditions:
      - 'max_pages <= 50'
      - 'single_domain == true'
      - 'no_javascript_rendering == true'

  aiohttp:
    description: 'Async with SEO extraction - best for technical docs'
    conditions:
      - 'max_pages <= 200'
      - 'needs_async == true'
      - 'seo_extraction == true'

  scrapy:
    description: 'Enterprise-grade - best for large sites with complex structure'
    conditions:
      - 'max_pages > 200'
      - 'multi_domain == true'
      - 'needs_pipeline == true'

  firecrawl:
    description: 'JS rendering - best for SPAs and dynamic content (MCP-based, always available)'
    conditions:
      - 'needs_javascript_rendering == true'
      - 'spa_site == true'
      - 'dynamic_content == true'
|
||||
|
||||
# Crawler backends and where to find them (configurable via environment).
# Each local crawler stays disabled unless its *_CRAWLER_ENABLED variable
# enables it, and its path is built from CRAWLER_PROJECT_PATH. When
# CRAWLER_PROJECT_PATH is not set, only the Firecrawl MCP backend is available.
crawlers:
  nodejs:
    enabled: ${NODEJS_CRAWLER_ENABLED:-false}
    path: ${CRAWLER_PROJECT_PATH}/util/js-crawler/
    install: npm install
    command: node src/crawler.js

  aiohttp:
    enabled: ${AIOHTTP_CRAWLER_ENABLED:-false}
    path: ${CRAWLER_PROJECT_PATH}/
    install: uv sync
    command: python -m seo_agent.crawler

  scrapy:
    enabled: ${SCRAPY_CRAWLER_ENABLED:-false}
    path: ${CRAWLER_PROJECT_PATH}/
    install: uv sync
    command: scrapy crawl seo_spider

  # Firecrawl is declared as an MCP server with a tool list, not as a local
  # path + command like the entries above.
  firecrawl:
    type: mcp
    server: firecrawl
    enabled: true  # Always available via MCP
    tools:
      - firecrawl_scrape
      - firecrawl_crawl
      - firecrawl_map
    install: 'Configure firecrawl MCP server in Claude Code settings'
|
||||
|
||||
# Rate limiting: request budget per minute and number of requests in flight.
rate_limit:
  concurrent_requests: 3
  requests_per_minute: 20
|
||||
|
||||
# Retry settings.
# NOTE(review): presumably the delay starts at initial_delay_seconds and is
# multiplied by backoff_multiplier on each further attempt — confirm.
retry:
  initial_delay_seconds: 10
  backoff_multiplier: 2
  max_retries: 3
|
||||
|
||||
# Default crawl options.
default_options:
  user_agent: 'ReferenceBot/1.0'
  respect_robots_txt: true
  max_pages: 100
  max_depth: 3
  timeout: 30000  # milliseconds
|
||||
|
||||
# Content processing: size cap (in MB) and the formats that are handled.
processing:
  max_content_size_mb: 50
  supported_formats: [html, pdf, markdown]
|
||||
|
||||
# Storage paths (configurable).
# Both directories sit under REFERENCE_LIBRARY_PATH, with ~/reference-library
# as the fallback root.
processed_dir: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/processed/
raw_content_dir: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/raw/
|
||||
|
||||
# URL filtering.
url_filter:
  # URL path fragments to skip (WordPress internals, login/logout pages).
  skip_patterns:
    - '/wp-admin/'
    - '/wp-includes/'
    - '/login'
    - '/logout'

  # File extensions to skip: images, stylesheets, scripts, fonts and icons.
  skip_extensions:
    - '.jpg'
    - '.jpeg'
    - '.png'
    - '.gif'
    - '.svg'
    - '.webp'
    - '.css'
    - '.js'
    - '.woff'
    - '.woff2'
    - '.ico'
|
||||
|
||||
# Site type detection hints: names whose presence suggests a single-page app
# versus a statically generated site.
site_detection:
  static_indicators: [hugo, jekyll, mkdocs, docusaurus, gitbook]
  spa_indicators: [react, angular, vue, next.js, nuxt]
|
||||
@@ -0,0 +1,31 @@
|
||||
# Reference Curator - Database Configuration
|
||||
# Location: ~/.config/reference-curator/db_config.yaml
|
||||
#
|
||||
# Environment variables (set in ~/.reference-curator.env):
|
||||
# MYSQL_USER - MySQL username (required)
|
||||
# MYSQL_PASSWORD - MySQL password (required)
|
||||
# MYSQL_HOST - MySQL host (optional, default: localhost)
|
||||
# MYSQL_PORT - MySQL port (optional, default: 3306)
|
||||
|
||||
# MySQL connection.
# host and port fall back to localhost / 3306. user and password have no
# fallback, so MYSQL_USER and MYSQL_PASSWORD must be set in the environment.
mysql:
  database: reference_library
  charset: utf8mb4
  host: ${MYSQL_HOST:-localhost}
  port: ${MYSQL_PORT:-3306}
  user: ${MYSQL_USER}
  password: ${MYSQL_PASSWORD}
|
||||
|
||||
# Connection pool settings.
pool_recycle: 3600  # presumably seconds — TODO confirm
pool_size: 5
|
||||
|
||||
# SSL, for when a remote MySQL server requires it. Off by default, with no
# CA certificate configured.
ssl:
  ca_cert: null
  enabled: false
|
||||
|
||||
# Backup settings.
# Backups go to the backups/ directory under REFERENCE_LIBRARY_PATH
# (fallback root: ~/reference-library).
backup:
  path: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/backups/
  retention_days: 30
  enabled: true
|
||||
@@ -0,0 +1,46 @@
|
||||
# Reference Curator - Export Configuration
|
||||
# Location: ~/.config/reference-curator/export_config.yaml
|
||||
#
|
||||
# Copy this file to ~/.config/reference-curator/export_config.yaml
|
||||
|
||||
# Export destination.
# Rooted at REFERENCE_LIBRARY_PATH, like the raw/processed directories in
# crawl_config.yaml and the backup path in db_config.yaml, so that relocating
# the library also relocates its exports. Falls back to ~/reference-library,
# which keeps the previous default of ~/reference-library/exports/.
# NOTE(review): assumes this file receives the same environment expansion as
# the other two config files — confirm.
output:
  base_path: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/exports/
|
||||
|
||||
# Project files format (for Claude Projects).
project_files:
  # Layout of the exported files; one of:
  #   flat | nested_by_topic | nested_by_source
  structure: nested_by_topic
  index_file: INDEX.md
  max_file_size_kb: 500
  include_metadata: true
|
||||
|
||||
# Fine-tuning dataset format.
fine_tuning:
  format: jsonl
  max_tokens_per_sample: 4096
  include_system_prompt: true
  system_prompt: 'You are an expert on AI and prompt engineering.'
|
||||
|
||||
# Knowledge base format.
knowledge_base:
  include_toc: true
  structure: flat
|
||||
|
||||
# Quality thresholds.
quality:
  min_score_for_export: 0.80
  require_human_review: false
  # Auto-approval settings.
  auto_approve_min_score: 0.80
  auto_approve_tier1_sources: true
|
||||
|
||||
# Cross-reference settings.
cross_references:
  enabled: true
  max_related_docs: 5
  min_concept_overlap: 2
|
||||
|
||||
# Verification checks and the duplicate limit.
verification:
  validate_markdown: true
  check_broken_links: true
  check_duplicates: true
  max_allowed_duplicates: 0
|
||||
Reference in New Issue
Block a user