feat(reference-curator): Add portable skill suite for reference documentation curation

6 modular skills for curating, processing, and exporting reference docs:
- reference-discovery: Search and validate authoritative sources
- web-crawler-orchestrator: Multi-backend crawling (Firecrawl/Node/aiohttp/Scrapy)
- content-repository: MySQL storage with version tracking
- content-distiller: Summarization and key concept extraction
- quality-reviewer: QA loop with approve/refactor/research routing
- markdown-exporter: Structured output for Claude Projects or fine-tuning

Cross-machine installation support:
- Environment-based config (~/.reference-curator.env)
- Commands tracked in repo, symlinked during install
- install.sh with --minimal, --check, --uninstall modes
- Firecrawl MCP as default (always available)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-29 00:20:27 +07:00
parent e80056ae8a
commit 6d7a6d7a88
26 changed files with 4486 additions and 1 deletions

View File

@@ -0,0 +1,139 @@
# Reference Curator - Crawl Configuration
# Location: ~/.config/reference-curator/crawl_config.yaml
#
# Environment variables (set in ~/.reference-curator.env):
#   CRAWLER_PROJECT_PATH   - Path to crawler project (optional)
#   REFERENCE_LIBRARY_PATH - Path to reference library storage (optional)
#
# NOTE(review): indentation was lost in the source this was recovered from;
# the nesting below is reconstructed. Verify it against the config loader.

# Default crawler backend
# Options: nodejs, aiohttp, scrapy, firecrawl
# Set to "firecrawl" if local crawlers are not available
default_crawler: ${DEFAULT_CRAWLER:-firecrawl}

# Intelligent routing rules
# Claude will select the appropriate crawler based on these criteria
routing:
  nodejs:
    conditions:
      - max_pages <= 50
      - single_domain == true
      - no_javascript_rendering == true
    description: "Fast, lightweight - best for small documentation sites"
  aiohttp:
    conditions:
      - max_pages <= 200
      - needs_async == true
      - seo_extraction == true
    description: "Async with SEO extraction - best for technical docs"
  scrapy:
    conditions:
      - max_pages > 200
      - multi_domain == true
      - needs_pipeline == true
    description: "Enterprise-grade - best for large sites with complex structure"
  firecrawl:
    conditions:
      - needs_javascript_rendering == true
      - spa_site == true
      - dynamic_content == true
    description: "JS rendering - best for SPAs and dynamic content (MCP-based, always available)"

# Crawler locations (configurable via environment)
# If CRAWLER_PROJECT_PATH is not set, only Firecrawl MCP will be available
crawlers:
  nodejs:
    enabled: ${NODEJS_CRAWLER_ENABLED:-false}
    path: ${CRAWLER_PROJECT_PATH}/util/js-crawler/
    command: node src/crawler.js
    install: npm install
  aiohttp:
    enabled: ${AIOHTTP_CRAWLER_ENABLED:-false}
    path: ${CRAWLER_PROJECT_PATH}/
    command: python -m seo_agent.crawler
    install: uv sync
  scrapy:
    enabled: ${SCRAPY_CRAWLER_ENABLED:-false}
    path: ${CRAWLER_PROJECT_PATH}/
    command: scrapy crawl seo_spider
    install: uv sync
  firecrawl:
    enabled: true  # Always available via MCP
    type: mcp
    server: firecrawl
    tools:
      - firecrawl_scrape
      - firecrawl_crawl
      - firecrawl_map
    install: "Configure firecrawl MCP server in Claude Code settings"

# Rate limiting
rate_limit:
  requests_per_minute: 20
  concurrent_requests: 3

# Retry settings
retry:
  max_retries: 3
  backoff_multiplier: 2
  initial_delay_seconds: 10

# Default crawl options
default_options:
  timeout: 30000  # milliseconds
  max_depth: 3
  max_pages: 100
  respect_robots_txt: true
  user_agent: "ReferenceBot/1.0"

# Content processing
processing:
  max_content_size_mb: 50
  supported_formats:
    - html
    - pdf
    - markdown

# Storage paths (configurable)
raw_content_dir: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/raw/
processed_dir: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/processed/

# URL filtering
url_filter:
  skip_extensions:
    - .jpg
    - .jpeg
    - .png
    - .gif
    - .svg
    - .webp
    - .css
    - .js
    - .woff
    - .woff2
    - .ico
  skip_patterns:
    - /wp-admin/
    - /wp-includes/
    - /login
    - /logout

# Site type detection hints
site_detection:
  spa_indicators:
    - "react"
    - "angular"
    - "vue"
    - "next.js"
    - "nuxt"
  static_indicators:
    - "hugo"
    - "jekyll"
    - "mkdocs"
    - "docusaurus"
    - "gitbook"

View File

@@ -0,0 +1,31 @@
# Reference Curator - Database Configuration
# Location: ~/.config/reference-curator/db_config.yaml
#
# Environment variables (set in ~/.reference-curator.env):
#   MYSQL_USER     - MySQL username (required)
#   MYSQL_PASSWORD - MySQL password (required)
#   MYSQL_HOST     - MySQL host (optional, default: localhost)
#   MYSQL_PORT     - MySQL port (optional, default: 3306)
#
# NOTE(review): indentation was lost in the source this was recovered from;
# the nesting below is reconstructed. Verify it against the config loader.

mysql:
  host: ${MYSQL_HOST:-localhost}
  port: ${MYSQL_PORT:-3306}
  database: reference_library
  user: ${MYSQL_USER}
  password: ${MYSQL_PASSWORD}
  charset: utf8mb4
  # Connection pool settings
  pool_size: 5
  pool_recycle: 3600  # seconds before a pooled connection is recycled

# SSL (if needed for remote MySQL)
ssl:
  enabled: false
  ca_cert: null

# Backup settings
backup:
  enabled: true
  path: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/backups/
  retention_days: 30

View File

@@ -0,0 +1,46 @@
# Reference Curator - Export Configuration
# Location: ~/.config/reference-curator/export_config.yaml
#
# Copy this file to ~/.config/reference-curator/export_config.yaml
#
# NOTE(review): indentation was lost in the source this was recovered from;
# the nesting below is reconstructed. Verify it against the config loader.

output:
  base_path: ~/reference-library/exports/

# Project files format (for Claude Projects)
project_files:
  structure: nested_by_topic  # flat | nested_by_topic | nested_by_source
  index_file: INDEX.md
  include_metadata: true
  max_file_size_kb: 500

# Fine-tuning dataset format
fine_tuning:
  format: jsonl
  include_system_prompt: true
  system_prompt: "You are an expert on AI and prompt engineering."
  max_tokens_per_sample: 4096

# Knowledge base format
knowledge_base:
  structure: flat
  include_toc: true

# Quality thresholds
quality:
  min_score_for_export: 0.80
  require_human_review: false
  auto_approve_tier1_sources: true
  auto_approve_min_score: 0.80

# Cross-reference settings
cross_references:
  enabled: true
  min_concept_overlap: 2
  max_related_docs: 5

# Verification
verification:
  check_broken_links: true
  validate_markdown: true
  check_duplicates: true
  max_allowed_duplicates: 0

View File

@@ -0,0 +1,285 @@
-- ===========================================
-- Reference Library Database Schema
-- Version: 1.0
-- Purpose: Store and manage curated reference materials
-- ===========================================
-- utf8mb4 + utf8mb4_unicode_ci so multilingual titles and content (documents
-- allow language = 'en' | 'ko' | 'mixed') store and compare correctly.
-- IF NOT EXISTS makes re-running this script safe at the database level.
CREATE DATABASE IF NOT EXISTS reference_library
CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;
USE reference_library;
-- -----------------------------
-- 1. Core Tables
-- -----------------------------

-- Registry of crawl targets: one row per site, repo, blog, or document feed.
CREATE TABLE sources (
    source_id        INT AUTO_INCREMENT PRIMARY KEY,
    source_name      VARCHAR(255) NOT NULL,
    source_type      ENUM('official_docs', 'engineering_blog', 'research_paper',
                          'github_repo', 'community_guide', 'pdf_document', 'api_reference') NOT NULL,
    base_url         VARCHAR(500),
    -- Credibility ranking; ENUM definition order (tier1 first) is what views
    -- rely on when sorting by this column.
    credibility_tier ENUM('tier1_official', 'tier2_verified', 'tier3_community') DEFAULT 'tier3_community',
    vendor           VARCHAR(100),  -- anthropic, openai, google, etc.
    is_active        BOOLEAN DEFAULT TRUE,
    created_at       TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at       TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    INDEX idx_vendor (vendor),
    INDEX idx_source_type (source_type)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- One row per fetched document. url_hash dedupes by URL; version columns
-- presumably chain re-crawls to their predecessor row (verify against crawler).
CREATE TABLE documents (
    doc_id INT AUTO_INCREMENT PRIMARY KEY,
    source_id INT NOT NULL,
    -- Document identification
    title VARCHAR(500) NOT NULL,
    url VARCHAR(1000),
    -- Generated SHA-256 of the URL, used for deduplication. A NULL url yields
    -- a NULL hash, and MySQL permits multiple NULLs under a UNIQUE index.
    url_hash CHAR(64) AS (SHA2(url, 256)) STORED,
    -- Content metadata
    doc_type ENUM('webpage', 'pdf', 'markdown', 'api_spec', 'code_sample') NOT NULL,
    language ENUM('en', 'ko', 'mixed') DEFAULT 'en',
    original_publish_date DATE,
    last_modified_date DATE,
    -- Crawl metadata. crawl_date is declared NULL DEFAULT NULL explicitly:
    -- as the table's first TIMESTAMP column it would otherwise be
    -- auto-initialized (DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP)
    -- when explicit_defaults_for_timestamp is OFF, silently stamping it on
    -- every insert/update even for documents that were never crawled.
    crawl_date TIMESTAMP NULL DEFAULT NULL,
    crawl_method ENUM('firecrawl', 'scrapy', 'aiohttp', 'nodejs', 'manual', 'api') DEFAULT 'aiohttp',
    crawl_status ENUM('pending', 'completed', 'failed', 'stale') DEFAULT 'pending',
    -- Storage
    raw_content_path VARCHAR(500),  -- Path to raw crawled file
    raw_content_size INT,           -- Bytes
    -- Version tracking
    version INT DEFAULT 1,
    previous_version_id INT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    FOREIGN KEY (source_id) REFERENCES sources(source_id) ON DELETE CASCADE,
    -- Deleting an old version detaches successors instead of cascading.
    FOREIGN KEY (previous_version_id) REFERENCES documents(doc_id) ON DELETE SET NULL,
    UNIQUE INDEX idx_url_hash (url_hash),
    INDEX idx_crawl_status (crawl_status),
    INDEX idx_crawl_date (crawl_date)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- -----------------------------
-- 2. Content Processing Tables
-- -----------------------------

-- Summaries and extractions derived from a document, plus QA review state.
CREATE TABLE distilled_content (
    distill_id INT AUTO_INCREMENT PRIMARY KEY,
    doc_id INT NOT NULL,
    -- Distilled output
    summary TEXT,                   -- Executive summary
    key_concepts JSON,              -- Extracted key terms and definitions
    code_snippets JSON,             -- Extracted code examples
    structured_content MEDIUMTEXT,  -- Full distilled markdown
    -- Quality metrics
    token_count_original INT,
    token_count_distilled INT,
    -- Distilled size as a percentage of the original. NULLIF guards the
    -- division when token_count_original is 0 (yields NULL instead of a
    -- division-by-zero error/warning), and DECIMAL(6,2) leaves headroom for
    -- ratios at or above 1000.00%, which DECIMAL(5,2) could not represent.
    compression_ratio DECIMAL(6,2) AS (token_count_distilled / NULLIF(token_count_original, 0) * 100),
    -- Processing metadata
    distill_model VARCHAR(50),      -- claude-opus-4-5, etc.
    distill_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- Review status
    review_status ENUM('pending', 'in_review', 'approved', 'needs_refactor', 'rejected') DEFAULT 'pending',
    FOREIGN KEY (doc_id) REFERENCES documents(doc_id) ON DELETE CASCADE,
    INDEX idx_review_status (review_status)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- Audit trail of QA passes over a distillation; one row per review round.
CREATE TABLE review_logs (
    review_id INT AUTO_INCREMENT PRIMARY KEY,
    distill_id INT NOT NULL,
    -- Review details
    review_round INT DEFAULT 1,
    reviewer_type ENUM('auto_qa', 'human', 'claude_review') NOT NULL,
    -- Quality assessment
    quality_score DECIMAL(3,2),  -- 0.00 - 1.00
    assessment JSON,             -- Detailed scoring breakdown
    /*
    Example assessment JSON:
    {
      "accuracy": 0.9,
      "completeness": 0.85,
      "clarity": 0.95,
      "prompt_engineering_quality": 0.88,
      "usability": 0.82
    }
    */
    -- Review outcome
    decision ENUM('approve', 'refactor', 'deep_research', 'reject') NOT NULL,
    feedback TEXT,
    refactor_instructions TEXT,
    research_queries JSON,       -- Additional search queries if deep_research needed
    reviewed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (distill_id) REFERENCES distilled_content(distill_id) ON DELETE CASCADE,
    -- Enforce the documented 0.00-1.00 range. Enforced on MySQL >= 8.0.16;
    -- parsed and ignored on older servers, so this is backward compatible.
    CONSTRAINT review_logs_quality_score_check
        CHECK (quality_score IS NULL OR quality_score BETWEEN 0 AND 1),
    INDEX idx_decision (decision)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- -----------------------------
-- 3. Organization & Export Tables
-- -----------------------------

-- Hierarchical topic taxonomy; a NULL parent_topic_id marks a root topic.
CREATE TABLE topics (
    topic_id        INT AUTO_INCREMENT PRIMARY KEY,
    topic_name      VARCHAR(255) NOT NULL,
    topic_slug      VARCHAR(100) NOT NULL,  -- URL/folder-friendly identifier
    parent_topic_id INT,
    description     TEXT,
    -- Removing a parent detaches its children rather than cascading deletes.
    FOREIGN KEY (parent_topic_id) REFERENCES topics(topic_id) ON DELETE SET NULL,
    UNIQUE INDEX idx_topic_slug (topic_slug)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- Many-to-many link between documents and topics, weighted by relevance.
CREATE TABLE document_topics (
    doc_id          INT NOT NULL,
    topic_id        INT NOT NULL,
    relevance_score DECIMAL(3,2) DEFAULT 1.00,  -- defaults to full relevance
    PRIMARY KEY (doc_id, topic_id),
    FOREIGN KEY (doc_id)   REFERENCES documents(doc_id) ON DELETE CASCADE,
    FOREIGN KEY (topic_id) REFERENCES topics(topic_id)  ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- One row per export run (Claude Project files, fine-tuning sets, etc.).
CREATE TABLE export_jobs (
    export_id INT AUTO_INCREMENT PRIMARY KEY,
    -- Export configuration
    export_name VARCHAR(255) NOT NULL,
    export_type ENUM('project_files', 'fine_tuning', 'training_dataset', 'knowledge_base') NOT NULL,
    output_format ENUM('markdown', 'jsonl', 'parquet', 'sqlite') DEFAULT 'markdown',
    -- Scope
    topic_filter JSON,  -- Topic IDs to include
    date_range_start DATE,
    date_range_end DATE,
    min_quality_score DECIMAL(3,2) DEFAULT 0.80,
    -- Output
    output_path VARCHAR(500),
    total_documents INT,
    total_tokens INT,
    -- Status. started_at/completed_at are declared NULL DEFAULT NULL
    -- explicitly: otherwise, with explicit_defaults_for_timestamp=OFF, MySQL
    -- auto-initializes the table's first TIMESTAMP column and would silently
    -- stamp started_at on insert even for jobs that never started.
    status ENUM('pending', 'processing', 'completed', 'failed') DEFAULT 'pending',
    started_at TIMESTAMP NULL DEFAULT NULL,
    completed_at TIMESTAMP NULL DEFAULT NULL,
    error_message TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- Job pollers filter on status; index keeps that scan cheap.
    INDEX idx_status (status)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- -----------------------------
-- 4. Tracking & Monitoring Tables
-- -----------------------------

-- Per-source re-crawl cadence.
CREATE TABLE crawl_schedule (
    schedule_id INT AUTO_INCREMENT PRIMARY KEY,
    source_id INT NOT NULL,
    frequency ENUM('daily', 'weekly', 'biweekly', 'monthly', 'on_demand') DEFAULT 'weekly',
    -- NULL DEFAULT NULL is explicit: with explicit_defaults_for_timestamp=OFF,
    -- MySQL would otherwise auto-initialize the first TIMESTAMP column
    -- (last_crawl) with DEFAULT/ON UPDATE CURRENT_TIMESTAMP, falsely marking
    -- never-crawled sources as just crawled.
    last_crawl TIMESTAMP NULL DEFAULT NULL,
    next_crawl TIMESTAMP NULL DEFAULT NULL,
    is_enabled BOOLEAN DEFAULT TRUE,
    FOREIGN KEY (source_id) REFERENCES sources(source_id) ON DELETE CASCADE,
    -- "What is due to crawl?" scans filter/sort on next_crawl.
    INDEX idx_next_crawl (next_crawl)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- Log of detected upstream changes to a crawled document and the follow-up
-- action taken for each.
CREATE TABLE change_detection (
    change_id     INT AUTO_INCREMENT PRIMARY KEY,
    doc_id        INT NOT NULL,
    detected_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    change_type   ENUM('content_updated', 'url_moved', 'deleted', 'new_version') NOT NULL,
    previous_hash CHAR(64),  -- presumably a content hash before the change; verify against crawler
    current_hash  CHAR(64),  -- hash after the change
    diff_summary  TEXT,
    action_taken  ENUM('pending', 'recrawled', 'archived', 'ignored') DEFAULT 'pending',
    FOREIGN KEY (doc_id) REFERENCES documents(doc_id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- -----------------------------
-- 5. Default Data
-- -----------------------------

-- Seed the topic taxonomy. Idempotent: topic_slug carries a UNIQUE index, so
-- re-running this script refreshes name/description in place instead of
-- failing with a duplicate-key error. (VALUES() in ON DUPLICATE KEY UPDATE is
-- deprecated since MySQL 8.0.20 but still supported; the row-alias form
-- requires >= 8.0.19, so VALUES() is the more portable choice here.)
INSERT INTO topics (topic_name, topic_slug, description) VALUES
('Prompt Engineering', 'prompt-engineering', 'Techniques for effective LLM prompting'),
('Claude Models', 'claude-models', 'Claude model architecture, capabilities, and versions'),
('Agent Building', 'agent-building', 'AI agent design patterns and implementation'),
('Claude Code', 'claude-code', 'Claude Code CLI tool usage and best practices'),
('MCP Integrations', 'mcp-integrations', 'Model Context Protocol servers and tools'),
('API Reference', 'api-reference', 'Anthropic API documentation and usage'),
('Fine-tuning', 'fine-tuning', 'Model fine-tuning techniques and datasets')
ON DUPLICATE KEY UPDATE
    topic_name = VALUES(topic_name),
    description = VALUES(description);
-- Seed the default tier-1 sources.
-- NOTE(review): sources has no UNIQUE key on source_name or base_url, so
-- re-running this script inserts duplicate rows. Confirm the installer guards
-- against re-execution, or add a unique constraint and an upsert here.
INSERT INTO sources (source_name, source_type, base_url, credibility_tier, vendor) VALUES
('Anthropic Official Docs', 'official_docs', 'https://docs.anthropic.com', 'tier1_official', 'anthropic'),
('Claude.ai Docs', 'official_docs', 'https://docs.claude.com', 'tier1_official', 'anthropic'),
('Anthropic Engineering Blog', 'engineering_blog', 'https://anthropic.com/engineering', 'tier1_official', 'anthropic'),
('Anthropic News', 'engineering_blog', 'https://anthropic.com/news', 'tier1_official', 'anthropic'),
('Anthropic Cookbook', 'github_repo', 'https://github.com/anthropics/anthropic-cookbook', 'tier1_official', 'anthropic'),
('OpenAI Docs', 'official_docs', 'https://platform.openai.com/docs', 'tier1_official', 'openai'),
('Google AI Docs', 'official_docs', 'https://ai.google.dev/docs', 'tier1_official', 'google');
-- -----------------------------
-- 6. Useful Views
-- -----------------------------

-- Review queue: distillations still awaiting review, most credible sources
-- first (ENUM order puts tier1_official ahead), oldest distillations first.
CREATE OR REPLACE VIEW v_pending_reviews AS
SELECT
    dist.distill_id,
    doc.doc_id,
    doc.title,
    doc.url,
    dist.token_count_distilled,
    dist.distill_date,
    src.credibility_tier
FROM distilled_content AS dist
INNER JOIN documents AS doc
    ON dist.doc_id = doc.doc_id
INNER JOIN sources AS src
    ON doc.source_id = src.source_id
WHERE dist.review_status = 'pending'
ORDER BY src.credibility_tier ASC, dist.distill_date ASC;
-- Documents cleared for export: approved on their LATEST review round with
-- quality_score >= 0.80.
-- NOTE(review): this view emits one row per (document, topic) pairing -- a
-- doc tagged with N topics appears N times; presumably exporters group by
-- topic_slug, which matches the ORDER BY. Verify against markdown-exporter.
CREATE OR REPLACE VIEW v_export_ready AS
SELECT
d.doc_id,
d.title,
d.url,
dc.structured_content,
t.topic_slug,
t.topic_name,
rl.quality_score,
s.credibility_tier,
s.vendor
FROM documents d
JOIN distilled_content dc ON d.doc_id = dc.doc_id
JOIN document_topics dt ON d.doc_id = dt.doc_id
JOIN topics t ON dt.topic_id = t.topic_id
JOIN review_logs rl ON dc.distill_id = rl.distill_id
JOIN sources s ON d.source_id = s.source_id
WHERE rl.decision = 'approve'
AND rl.quality_score >= 0.80
-- Restrict to the most recent review of each distillation, so an earlier
-- 'approve' cannot resurrect content that a later round rejected.
AND rl.review_id = (
SELECT MAX(review_id)
FROM review_logs
WHERE distill_id = dc.distill_id
)
ORDER BY t.topic_slug, rl.quality_score DESC;