feat(reference-curator): Add portable skill suite for reference documentation curation

6 modular skills for curating, processing, and exporting reference docs:
- reference-discovery: Search and validate authoritative sources
- web-crawler-orchestrator: Multi-backend crawling (Firecrawl/Node/aiohttp/Scrapy)
- content-repository: MySQL storage with version tracking
- content-distiller: Summarization and key concept extraction
- quality-reviewer: QA loop with approve/refactor/research routing
- markdown-exporter: Structured output for Claude Projects or fine-tuning

Cross-machine installation support:
- Environment-based config (~/.reference-curator.env)
- Commands tracked in repo, symlinked during install
- install.sh with --minimal, --check, --uninstall modes
- Firecrawl MCP as default (always available)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-29 00:20:27 +07:00
parent e80056ae8a
commit 6d7a6d7a88
26 changed files with 4486 additions and 1 deletions

View File

@@ -0,0 +1,139 @@
# Reference Curator - Crawl Configuration
# Location: ~/.config/reference-curator/crawl_config.yaml
#
# Environment variables (set in ~/.reference-curator.env):
#   CRAWLER_PROJECT_PATH   - Path to crawler project (optional)
#   REFERENCE_LIBRARY_PATH - Path to reference library storage (optional)
#
# NOTE(review): indentation was lost in the source this was recovered from;
# the nesting below is reconstructed. Verify it against the config loader.

# Default crawler backend
# Options: nodejs, aiohttp, scrapy, firecrawl
# Set to "firecrawl" if local crawlers are not available
default_crawler: ${DEFAULT_CRAWLER:-firecrawl}

# Intelligent routing rules
# Claude will select the appropriate crawler based on these criteria
routing:
  nodejs:
    conditions:
      - max_pages <= 50
      - single_domain == true
      - no_javascript_rendering == true
    description: "Fast, lightweight - best for small documentation sites"
  aiohttp:
    conditions:
      - max_pages <= 200
      - needs_async == true
      - seo_extraction == true
    description: "Async with SEO extraction - best for technical docs"
  scrapy:
    conditions:
      - max_pages > 200
      - multi_domain == true
      - needs_pipeline == true
    description: "Enterprise-grade - best for large sites with complex structure"
  firecrawl:
    conditions:
      - needs_javascript_rendering == true
      - spa_site == true
      - dynamic_content == true
    description: "JS rendering - best for SPAs and dynamic content (MCP-based, always available)"

# Crawler locations (configurable via environment)
# If CRAWLER_PROJECT_PATH is not set, only Firecrawl MCP will be available
crawlers:
  nodejs:
    enabled: ${NODEJS_CRAWLER_ENABLED:-false}
    path: ${CRAWLER_PROJECT_PATH}/util/js-crawler/
    command: node src/crawler.js
    install: npm install
  aiohttp:
    enabled: ${AIOHTTP_CRAWLER_ENABLED:-false}
    path: ${CRAWLER_PROJECT_PATH}/
    command: python -m seo_agent.crawler
    install: uv sync
  scrapy:
    enabled: ${SCRAPY_CRAWLER_ENABLED:-false}
    path: ${CRAWLER_PROJECT_PATH}/
    command: scrapy crawl seo_spider
    install: uv sync
  firecrawl:
    enabled: true  # Always available via MCP
    type: mcp
    server: firecrawl
    tools:
      - firecrawl_scrape
      - firecrawl_crawl
      - firecrawl_map
    install: "Configure firecrawl MCP server in Claude Code settings"

# Rate limiting
rate_limit:
  requests_per_minute: 20
  concurrent_requests: 3

# Retry settings
retry:
  max_retries: 3
  backoff_multiplier: 2
  initial_delay_seconds: 10

# Default crawl options
default_options:
  timeout: 30000  # milliseconds
  max_depth: 3
  max_pages: 100
  respect_robots_txt: true
  user_agent: "ReferenceBot/1.0"

# Content processing
processing:
  max_content_size_mb: 50
  supported_formats:
    - html
    - pdf
    - markdown

# Storage paths (configurable)
raw_content_dir: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/raw/
processed_dir: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/processed/

# URL filtering
url_filter:
  skip_extensions:
    - .jpg
    - .jpeg
    - .png
    - .gif
    - .svg
    - .webp
    - .css
    - .js
    - .woff
    - .woff2
    - .ico
  skip_patterns:
    - /wp-admin/
    - /wp-includes/
    - /login
    - /logout

# Site type detection hints
site_detection:
  spa_indicators:
    - "react"
    - "angular"
    - "vue"
    - "next.js"
    - "nuxt"
  static_indicators:
    - "hugo"
    - "jekyll"
    - "mkdocs"
    - "docusaurus"
    - "gitbook"

View File

@@ -0,0 +1,31 @@
# Reference Curator - Database Configuration
# Location: ~/.config/reference-curator/db_config.yaml
#
# Environment variables (set in ~/.reference-curator.env):
#   MYSQL_USER     - MySQL username (required)
#   MYSQL_PASSWORD - MySQL password (required)
#   MYSQL_HOST     - MySQL host (optional, default: localhost)
#   MYSQL_PORT     - MySQL port (optional, default: 3306)
#
# NOTE(review): indentation was lost in the source this was recovered from;
# the nesting below is reconstructed. Verify it against the config loader.

mysql:
  host: ${MYSQL_HOST:-localhost}
  port: ${MYSQL_PORT:-3306}
  database: reference_library
  user: ${MYSQL_USER}
  password: ${MYSQL_PASSWORD}
  charset: utf8mb4
  # Connection pool settings
  pool_size: 5
  pool_recycle: 3600  # seconds before a pooled connection is recycled

# SSL (if needed for remote MySQL)
ssl:
  enabled: false
  ca_cert: null

# Backup settings
backup:
  enabled: true
  path: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/backups/
  retention_days: 30

View File

@@ -0,0 +1,46 @@
# Reference Curator - Export Configuration
# Location: ~/.config/reference-curator/export_config.yaml
#
# Copy this file to ~/.config/reference-curator/export_config.yaml
#
# NOTE(review): indentation was lost in the source this was recovered from;
# the nesting below is reconstructed. Verify it against the config loader.

output:
  base_path: ~/reference-library/exports/

# Project files format (for Claude Projects)
project_files:
  structure: nested_by_topic  # flat | nested_by_topic | nested_by_source
  index_file: INDEX.md
  include_metadata: true
  max_file_size_kb: 500

# Fine-tuning dataset format
fine_tuning:
  format: jsonl
  include_system_prompt: true
  system_prompt: "You are an expert on AI and prompt engineering."
  max_tokens_per_sample: 4096

# Knowledge base format
knowledge_base:
  structure: flat
  include_toc: true

# Quality thresholds
quality:
  min_score_for_export: 0.80
  require_human_review: false
  auto_approve_tier1_sources: true
  auto_approve_min_score: 0.80

# Cross-reference settings
cross_references:
  enabled: true
  min_concept_overlap: 2
  max_related_docs: 5

# Verification
verification:
  check_broken_links: true
  validate_markdown: true
  check_duplicates: true
  max_allowed_duplicates: 0

View File

@@ -0,0 +1,285 @@
-- ===========================================
-- Reference Library Database Schema
-- Version: 1.0
-- Purpose: Store and manage curated reference materials
-- ===========================================
-- utf8mb4 + utf8mb4_unicode_ci so multilingual titles and content (documents
-- allow language = 'en' | 'ko' | 'mixed') store and compare correctly.
-- IF NOT EXISTS makes re-running this script safe at the database level.
CREATE DATABASE IF NOT EXISTS reference_library
CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;
USE reference_library;
-- -----------------------------
-- 1. Core Tables
-- -----------------------------

-- Registry of crawl targets: one row per site, repo, blog, or document feed.
CREATE TABLE sources (
    source_id        INT AUTO_INCREMENT PRIMARY KEY,
    source_name      VARCHAR(255) NOT NULL,
    source_type      ENUM('official_docs', 'engineering_blog', 'research_paper',
                          'github_repo', 'community_guide', 'pdf_document', 'api_reference') NOT NULL,
    base_url         VARCHAR(500),
    -- Credibility ranking; ENUM definition order (tier1 first) is what views
    -- rely on when sorting by this column.
    credibility_tier ENUM('tier1_official', 'tier2_verified', 'tier3_community') DEFAULT 'tier3_community',
    vendor           VARCHAR(100),  -- anthropic, openai, google, etc.
    is_active        BOOLEAN DEFAULT TRUE,
    created_at       TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at       TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    INDEX idx_vendor (vendor),
    INDEX idx_source_type (source_type)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- One row per fetched document. url_hash dedupes by URL; version columns
-- presumably chain re-crawls to their predecessor row (verify against crawler).
CREATE TABLE documents (
    doc_id INT AUTO_INCREMENT PRIMARY KEY,
    source_id INT NOT NULL,
    -- Document identification
    title VARCHAR(500) NOT NULL,
    url VARCHAR(1000),
    -- Generated SHA-256 of the URL, used for deduplication. A NULL url yields
    -- a NULL hash, and MySQL permits multiple NULLs under a UNIQUE index.
    url_hash CHAR(64) AS (SHA2(url, 256)) STORED,
    -- Content metadata
    doc_type ENUM('webpage', 'pdf', 'markdown', 'api_spec', 'code_sample') NOT NULL,
    language ENUM('en', 'ko', 'mixed') DEFAULT 'en',
    original_publish_date DATE,
    last_modified_date DATE,
    -- Crawl metadata. crawl_date is declared NULL DEFAULT NULL explicitly:
    -- as the table's first TIMESTAMP column it would otherwise be
    -- auto-initialized (DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP)
    -- when explicit_defaults_for_timestamp is OFF, silently stamping it on
    -- every insert/update even for documents that were never crawled.
    crawl_date TIMESTAMP NULL DEFAULT NULL,
    crawl_method ENUM('firecrawl', 'scrapy', 'aiohttp', 'nodejs', 'manual', 'api') DEFAULT 'aiohttp',
    crawl_status ENUM('pending', 'completed', 'failed', 'stale') DEFAULT 'pending',
    -- Storage
    raw_content_path VARCHAR(500),  -- Path to raw crawled file
    raw_content_size INT,           -- Bytes
    -- Version tracking
    version INT DEFAULT 1,
    previous_version_id INT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    FOREIGN KEY (source_id) REFERENCES sources(source_id) ON DELETE CASCADE,
    -- Deleting an old version detaches successors instead of cascading.
    FOREIGN KEY (previous_version_id) REFERENCES documents(doc_id) ON DELETE SET NULL,
    UNIQUE INDEX idx_url_hash (url_hash),
    INDEX idx_crawl_status (crawl_status),
    INDEX idx_crawl_date (crawl_date)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- -----------------------------
-- 2. Content Processing Tables
-- -----------------------------

-- Summaries and extractions derived from a document, plus QA review state.
CREATE TABLE distilled_content (
    distill_id INT AUTO_INCREMENT PRIMARY KEY,
    doc_id INT NOT NULL,
    -- Distilled output
    summary TEXT,                   -- Executive summary
    key_concepts JSON,              -- Extracted key terms and definitions
    code_snippets JSON,             -- Extracted code examples
    structured_content MEDIUMTEXT,  -- Full distilled markdown
    -- Quality metrics
    token_count_original INT,
    token_count_distilled INT,
    -- Distilled size as a percentage of the original. NULLIF guards the
    -- division when token_count_original is 0 (yields NULL instead of a
    -- division-by-zero error/warning), and DECIMAL(6,2) leaves headroom for
    -- ratios at or above 1000.00%, which DECIMAL(5,2) could not represent.
    compression_ratio DECIMAL(6,2) AS (token_count_distilled / NULLIF(token_count_original, 0) * 100),
    -- Processing metadata
    distill_model VARCHAR(50),      -- claude-opus-4-5, etc.
    distill_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- Review status
    review_status ENUM('pending', 'in_review', 'approved', 'needs_refactor', 'rejected') DEFAULT 'pending',
    FOREIGN KEY (doc_id) REFERENCES documents(doc_id) ON DELETE CASCADE,
    INDEX idx_review_status (review_status)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- Audit trail of QA passes over a distillation; one row per review round.
CREATE TABLE review_logs (
    review_id INT AUTO_INCREMENT PRIMARY KEY,
    distill_id INT NOT NULL,
    -- Review details
    review_round INT DEFAULT 1,
    reviewer_type ENUM('auto_qa', 'human', 'claude_review') NOT NULL,
    -- Quality assessment
    quality_score DECIMAL(3,2),  -- 0.00 - 1.00
    assessment JSON,             -- Detailed scoring breakdown
    /*
    Example assessment JSON:
    {
      "accuracy": 0.9,
      "completeness": 0.85,
      "clarity": 0.95,
      "prompt_engineering_quality": 0.88,
      "usability": 0.82
    }
    */
    -- Review outcome
    decision ENUM('approve', 'refactor', 'deep_research', 'reject') NOT NULL,
    feedback TEXT,
    refactor_instructions TEXT,
    research_queries JSON,       -- Additional search queries if deep_research needed
    reviewed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (distill_id) REFERENCES distilled_content(distill_id) ON DELETE CASCADE,
    -- Enforce the documented 0.00-1.00 range. Enforced on MySQL >= 8.0.16;
    -- parsed and ignored on older servers, so this is backward compatible.
    CONSTRAINT review_logs_quality_score_check
        CHECK (quality_score IS NULL OR quality_score BETWEEN 0 AND 1),
    INDEX idx_decision (decision)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- -----------------------------
-- 3. Organization & Export Tables
-- -----------------------------

-- Hierarchical topic taxonomy; a NULL parent_topic_id marks a root topic.
CREATE TABLE topics (
    topic_id        INT AUTO_INCREMENT PRIMARY KEY,
    topic_name      VARCHAR(255) NOT NULL,
    topic_slug      VARCHAR(100) NOT NULL,  -- URL/folder-friendly identifier
    parent_topic_id INT,
    description     TEXT,
    -- Removing a parent detaches its children rather than cascading deletes.
    FOREIGN KEY (parent_topic_id) REFERENCES topics(topic_id) ON DELETE SET NULL,
    UNIQUE INDEX idx_topic_slug (topic_slug)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- Many-to-many link between documents and topics, weighted by relevance.
CREATE TABLE document_topics (
    doc_id          INT NOT NULL,
    topic_id        INT NOT NULL,
    relevance_score DECIMAL(3,2) DEFAULT 1.00,  -- defaults to full relevance
    PRIMARY KEY (doc_id, topic_id),
    FOREIGN KEY (doc_id)   REFERENCES documents(doc_id) ON DELETE CASCADE,
    FOREIGN KEY (topic_id) REFERENCES topics(topic_id)  ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- One row per export run (Claude Project files, fine-tuning sets, etc.).
CREATE TABLE export_jobs (
    export_id INT AUTO_INCREMENT PRIMARY KEY,
    -- Export configuration
    export_name VARCHAR(255) NOT NULL,
    export_type ENUM('project_files', 'fine_tuning', 'training_dataset', 'knowledge_base') NOT NULL,
    output_format ENUM('markdown', 'jsonl', 'parquet', 'sqlite') DEFAULT 'markdown',
    -- Scope
    topic_filter JSON,  -- Topic IDs to include
    date_range_start DATE,
    date_range_end DATE,
    min_quality_score DECIMAL(3,2) DEFAULT 0.80,
    -- Output
    output_path VARCHAR(500),
    total_documents INT,
    total_tokens INT,
    -- Status. started_at/completed_at are declared NULL DEFAULT NULL
    -- explicitly: otherwise, with explicit_defaults_for_timestamp=OFF, MySQL
    -- auto-initializes the table's first TIMESTAMP column and would silently
    -- stamp started_at on insert even for jobs that never started.
    status ENUM('pending', 'processing', 'completed', 'failed') DEFAULT 'pending',
    started_at TIMESTAMP NULL DEFAULT NULL,
    completed_at TIMESTAMP NULL DEFAULT NULL,
    error_message TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- Job pollers filter on status; index keeps that scan cheap.
    INDEX idx_status (status)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- -----------------------------
-- 4. Tracking & Monitoring Tables
-- -----------------------------

-- Per-source re-crawl cadence.
CREATE TABLE crawl_schedule (
    schedule_id INT AUTO_INCREMENT PRIMARY KEY,
    source_id INT NOT NULL,
    frequency ENUM('daily', 'weekly', 'biweekly', 'monthly', 'on_demand') DEFAULT 'weekly',
    -- NULL DEFAULT NULL is explicit: with explicit_defaults_for_timestamp=OFF,
    -- MySQL would otherwise auto-initialize the first TIMESTAMP column
    -- (last_crawl) with DEFAULT/ON UPDATE CURRENT_TIMESTAMP, falsely marking
    -- never-crawled sources as just crawled.
    last_crawl TIMESTAMP NULL DEFAULT NULL,
    next_crawl TIMESTAMP NULL DEFAULT NULL,
    is_enabled BOOLEAN DEFAULT TRUE,
    FOREIGN KEY (source_id) REFERENCES sources(source_id) ON DELETE CASCADE,
    -- "What is due to crawl?" scans filter/sort on next_crawl.
    INDEX idx_next_crawl (next_crawl)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- Log of detected upstream changes to a crawled document and the follow-up
-- action taken for each.
CREATE TABLE change_detection (
    change_id     INT AUTO_INCREMENT PRIMARY KEY,
    doc_id        INT NOT NULL,
    detected_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    change_type   ENUM('content_updated', 'url_moved', 'deleted', 'new_version') NOT NULL,
    previous_hash CHAR(64),  -- presumably a content hash before the change; verify against crawler
    current_hash  CHAR(64),  -- hash after the change
    diff_summary  TEXT,
    action_taken  ENUM('pending', 'recrawled', 'archived', 'ignored') DEFAULT 'pending',
    FOREIGN KEY (doc_id) REFERENCES documents(doc_id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- -----------------------------
-- 5. Default Data
-- -----------------------------

-- Seed the topic taxonomy. Idempotent: topic_slug carries a UNIQUE index, so
-- re-running this script refreshes name/description in place instead of
-- failing with a duplicate-key error. (VALUES() in ON DUPLICATE KEY UPDATE is
-- deprecated since MySQL 8.0.20 but still supported; the row-alias form
-- requires >= 8.0.19, so VALUES() is the more portable choice here.)
INSERT INTO topics (topic_name, topic_slug, description) VALUES
('Prompt Engineering', 'prompt-engineering', 'Techniques for effective LLM prompting'),
('Claude Models', 'claude-models', 'Claude model architecture, capabilities, and versions'),
('Agent Building', 'agent-building', 'AI agent design patterns and implementation'),
('Claude Code', 'claude-code', 'Claude Code CLI tool usage and best practices'),
('MCP Integrations', 'mcp-integrations', 'Model Context Protocol servers and tools'),
('API Reference', 'api-reference', 'Anthropic API documentation and usage'),
('Fine-tuning', 'fine-tuning', 'Model fine-tuning techniques and datasets')
ON DUPLICATE KEY UPDATE
    topic_name = VALUES(topic_name),
    description = VALUES(description);
-- Seed the default tier-1 sources.
-- NOTE(review): sources has no UNIQUE key on source_name or base_url, so
-- re-running this script inserts duplicate rows. Confirm the installer guards
-- against re-execution, or add a unique constraint and an upsert here.
INSERT INTO sources (source_name, source_type, base_url, credibility_tier, vendor) VALUES
('Anthropic Official Docs', 'official_docs', 'https://docs.anthropic.com', 'tier1_official', 'anthropic'),
('Claude.ai Docs', 'official_docs', 'https://docs.claude.com', 'tier1_official', 'anthropic'),
('Anthropic Engineering Blog', 'engineering_blog', 'https://anthropic.com/engineering', 'tier1_official', 'anthropic'),
('Anthropic News', 'engineering_blog', 'https://anthropic.com/news', 'tier1_official', 'anthropic'),
('Anthropic Cookbook', 'github_repo', 'https://github.com/anthropics/anthropic-cookbook', 'tier1_official', 'anthropic'),
('OpenAI Docs', 'official_docs', 'https://platform.openai.com/docs', 'tier1_official', 'openai'),
('Google AI Docs', 'official_docs', 'https://ai.google.dev/docs', 'tier1_official', 'google');
-- -----------------------------
-- 6. Useful Views
-- -----------------------------

-- Review queue: distillations still awaiting review, most credible sources
-- first (ENUM order puts tier1_official ahead), oldest distillations first.
CREATE OR REPLACE VIEW v_pending_reviews AS
SELECT
    dist.distill_id,
    doc.doc_id,
    doc.title,
    doc.url,
    dist.token_count_distilled,
    dist.distill_date,
    src.credibility_tier
FROM distilled_content AS dist
INNER JOIN documents AS doc
    ON dist.doc_id = doc.doc_id
INNER JOIN sources AS src
    ON doc.source_id = src.source_id
WHERE dist.review_status = 'pending'
ORDER BY src.credibility_tier ASC, dist.distill_date ASC;
-- Documents cleared for export: approved on their LATEST review round with
-- quality_score >= 0.80.
-- NOTE(review): this view emits one row per (document, topic) pairing -- a
-- doc tagged with N topics appears N times; presumably exporters group by
-- topic_slug, which matches the ORDER BY. Verify against markdown-exporter.
CREATE OR REPLACE VIEW v_export_ready AS
SELECT
d.doc_id,
d.title,
d.url,
dc.structured_content,
t.topic_slug,
t.topic_name,
rl.quality_score,
s.credibility_tier,
s.vendor
FROM documents d
JOIN distilled_content dc ON d.doc_id = dc.doc_id
JOIN document_topics dt ON d.doc_id = dt.doc_id
JOIN topics t ON dt.topic_id = t.topic_id
JOIN review_logs rl ON dc.distill_id = rl.distill_id
JOIN sources s ON d.source_id = s.source_id
WHERE rl.decision = 'approve'
AND rl.quality_score >= 0.80
-- Restrict to the most recent review of each distillation, so an earlier
-- 'approve' cannot resurrect content that a later round rejected.
AND rl.review_id = (
SELECT MAX(review_id)
FROM review_logs
WHERE distill_id = dc.distill_id
)
ORDER BY t.topic_slug, rl.quality_score DESC;