6 modular skills for curating, processing, and exporting reference docs:
- reference-discovery: Search and validate authoritative sources
- web-crawler-orchestrator: Multi-backend crawling (Firecrawl/Node/aiohttp/Scrapy)
- content-repository: MySQL storage with version tracking
- content-distiller: Summarization and key concept extraction
- quality-reviewer: QA loop with approve/refactor/research routing
- markdown-exporter: Structured output for Claude Projects or fine-tuning

Cross-machine installation support:
- Environment-based config (~/.reference-curator.env)
- Commands tracked in repo, symlinked during install
- install.sh with --minimal, --check, --uninstall modes
- Firecrawl MCP as default (always available)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
286 lines
10 KiB
SQL
286 lines
10 KiB
SQL
-- ===========================================
|
|
-- Reference Library Database Schema
|
|
-- Version: 1.0
|
|
-- Purpose: Store and manage curated reference materials
|
|
-- ===========================================
|
|
|
|
-- Create the schema on first run only; utf8mb4 is the default character set
-- and utf8mb4_unicode_ci the default collation for objects created in it.
CREATE DATABASE IF NOT EXISTS reference_library
    DEFAULT CHARACTER SET utf8mb4
    DEFAULT COLLATE utf8mb4_unicode_ci;

-- Every statement below runs against this schema.
USE reference_library;
|
|
|
|
-- -----------------------------
|
|
-- 1. Core Tables
|
|
-- -----------------------------
|
|
|
|
-- sources: one row per publisher or site that documents are attributed to.
CREATE TABLE sources (
    source_id        INT          NOT NULL AUTO_INCREMENT,
    source_name      VARCHAR(255) NOT NULL,
    source_type      ENUM(
                         'official_docs',
                         'engineering_blog',
                         'research_paper',
                         'github_repo',
                         'community_guide',
                         'pdf_document',
                         'api_reference'
                     ) NOT NULL,
    base_url         VARCHAR(500),
    -- Rows that do not state a tier fall into the community tier.
    credibility_tier ENUM('tier1_official', 'tier2_verified', 'tier3_community')
                         DEFAULT 'tier3_community',
    vendor           VARCHAR(100),   -- e.g. anthropic, openai, google
    is_active        BOOLEAN   DEFAULT TRUE,
    created_at       TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- Refreshed by the server whenever the row is updated.
    updated_at       TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,

    PRIMARY KEY (source_id),
    KEY idx_vendor (vendor),
    KEY idx_source_type (source_type)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
|
|
|
-- documents: one row per crawled document, attached to the source it came from.
-- Deleting a source removes its documents (ON DELETE CASCADE).
CREATE TABLE documents (
    doc_id INT AUTO_INCREMENT PRIMARY KEY,
    source_id INT NOT NULL,

    -- Document identification
    title VARCHAR(500) NOT NULL,
    url VARCHAR(1000),
    -- SHA-256 hex digest of url, stored so the unique index below can enforce
    -- deduplication. It is NULL when url is NULL, and a unique index accepts
    -- any number of NULLs, so documents without a URL never collide.
    url_hash CHAR(64) AS (SHA2(url, 256)) STORED,

    -- Content metadata
    doc_type ENUM('webpage', 'pdf', 'markdown', 'api_spec', 'code_sample') NOT NULL,
    language ENUM('en', 'ko', 'mixed') DEFAULT 'en',
    original_publish_date DATE,
    last_modified_date DATE,

    -- Crawl metadata
    -- Explicitly nullable with no default. On servers where
    -- explicit_defaults_for_timestamp is disabled, a bare TIMESTAMP column is
    -- implicitly NOT NULL, and the first one in a table is auto-initialised and
    -- auto-updated, which would overwrite the crawl date on every row edit.
    crawl_date TIMESTAMP NULL DEFAULT NULL,
    -- NOTE(review): the column default is 'aiohttp', while the project notes at
    -- the top of this file describe Firecrawl as the default backend. Confirm
    -- which one is intended.
    crawl_method ENUM('firecrawl', 'scrapy', 'aiohttp', 'nodejs', 'manual', 'api') DEFAULT 'aiohttp',
    crawl_status ENUM('pending', 'completed', 'failed', 'stale') DEFAULT 'pending',

    -- Storage
    raw_content_path VARCHAR(500),  -- Path to raw crawled file
    raw_content_size INT,           -- Bytes

    -- Version tracking
    version INT DEFAULT 1,
    -- Self-reference to the row this one supersedes; cleared (SET NULL) if
    -- that earlier row is deleted.
    previous_version_id INT,

    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,

    FOREIGN KEY (source_id) REFERENCES sources(source_id) ON DELETE CASCADE,
    FOREIGN KEY (previous_version_id) REFERENCES documents(doc_id) ON DELETE SET NULL,

    UNIQUE INDEX idx_url_hash (url_hash),
    INDEX idx_crawl_status (crawl_status),
    INDEX idx_crawl_date (crawl_date)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
|
|
|
-- -----------------------------
|
|
-- 2. Content Processing Tables
|
|
-- -----------------------------
|
|
|
|
-- distilled_content: the summarised / restructured form of a document.
-- Rows are removed together with their parent document (ON DELETE CASCADE).
CREATE TABLE distilled_content (
    distill_id INT AUTO_INCREMENT PRIMARY KEY,
    doc_id INT NOT NULL,

    -- Distilled output
    summary TEXT,                   -- Executive summary
    key_concepts JSON,              -- Extracted key terms and definitions
    code_snippets JSON,             -- Extracted code examples
    structured_content MEDIUMTEXT,  -- Full distilled markdown

    -- Quality metrics
    token_count_original INT,
    token_count_distilled INT,
    -- Distilled size as a percentage of the original size. NULLIF turns a zero
    -- original count into NULL so the expression yields NULL instead of a
    -- division by zero, which strict SQL modes with ERROR_FOR_DIVISION_BY_ZERO
    -- report as an error on write. The result is also NULL when either count
    -- is NULL.
    compression_ratio DECIMAL(5,2) AS (token_count_distilled / NULLIF(token_count_original, 0) * 100),

    -- Processing metadata
    distill_model VARCHAR(50),      -- claude-opus-4-5, etc.
    distill_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

    -- Review status
    review_status ENUM('pending', 'in_review', 'approved', 'needs_refactor', 'rejected') DEFAULT 'pending',

    FOREIGN KEY (doc_id) REFERENCES documents(doc_id) ON DELETE CASCADE,
    INDEX idx_review_status (review_status)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
|
|
|
-- review_logs: one row per review pass over a distilled document.
-- Rows disappear with their distilled_content parent (ON DELETE CASCADE).
CREATE TABLE review_logs (
    review_id             INT NOT NULL AUTO_INCREMENT,
    distill_id            INT NOT NULL,

    -- Which pass this was and what kind of reviewer produced it
    review_round          INT DEFAULT 1,
    reviewer_type         ENUM('auto_qa', 'human', 'claude_review') NOT NULL,

    -- Scoring
    quality_score         DECIMAL(3,2),  -- intended range 0.00 - 1.00 (not enforced by the column type)
    assessment            JSON,          -- per-criterion scoring breakdown
    /*
      Example assessment JSON:
      {
        "accuracy": 0.9,
        "completeness": 0.85,
        "clarity": 0.95,
        "prompt_engineering_quality": 0.88,
        "usability": 0.82
      }
    */

    -- Outcome of the pass
    decision              ENUM('approve', 'refactor', 'deep_research', 'reject') NOT NULL,
    feedback              TEXT,
    refactor_instructions TEXT,
    research_queries      JSON,          -- extra search queries when decision is deep_research

    reviewed_at           TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

    PRIMARY KEY (review_id),
    FOREIGN KEY (distill_id) REFERENCES distilled_content(distill_id) ON DELETE CASCADE,
    KEY idx_decision (decision)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
|
|
|
-- -----------------------------
|
|
-- 3. Organization & Export Tables
|
|
-- -----------------------------
|
|
|
|
-- topics: hierarchical subject tags. A topic may point at a parent topic;
-- deleting the parent detaches its children (SET NULL) rather than deleting them.
CREATE TABLE topics (
    topic_id        INT          NOT NULL AUTO_INCREMENT,
    topic_name      VARCHAR(255) NOT NULL,
    topic_slug      VARCHAR(100) NOT NULL,   -- URL/folder-friendly identifier, unique
    parent_topic_id INT,
    description     TEXT,

    PRIMARY KEY (topic_id),
    FOREIGN KEY (parent_topic_id) REFERENCES topics(topic_id) ON DELETE SET NULL,
    UNIQUE KEY idx_topic_slug (topic_slug)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
|
|
|
-- document_topics: many-to-many link between documents and topics.
-- The composite primary key allows each (document, topic) pair only once, and
-- a link is removed when either side is deleted (ON DELETE CASCADE).
CREATE TABLE document_topics (
    doc_id          INT NOT NULL,
    topic_id        INT NOT NULL,
    relevance_score DECIMAL(3,2) DEFAULT 1.00,

    PRIMARY KEY (doc_id, topic_id),
    FOREIGN KEY (doc_id)   REFERENCES documents(doc_id) ON DELETE CASCADE,
    FOREIGN KEY (topic_id) REFERENCES topics(topic_id)  ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
|
|
|
-- export_jobs: one row per export run, holding its configuration, scope,
-- output location and lifecycle status.
CREATE TABLE export_jobs (
    export_id INT AUTO_INCREMENT PRIMARY KEY,

    -- Export configuration
    export_name VARCHAR(255) NOT NULL,
    export_type ENUM('project_files', 'fine_tuning', 'training_dataset', 'knowledge_base') NOT NULL,
    output_format ENUM('markdown', 'jsonl', 'parquet', 'sqlite') DEFAULT 'markdown',

    -- Scope
    topic_filter JSON,              -- Topic IDs to include
    date_range_start DATE,
    date_range_end DATE,
    min_quality_score DECIMAL(3,2) DEFAULT 0.80,

    -- Output
    output_path VARCHAR(500),
    total_documents INT,
    total_tokens INT,

    -- Status
    status ENUM('pending', 'processing', 'completed', 'failed') DEFAULT 'pending',
    -- Both timestamps are explicitly nullable with no default, so a new job has
    -- neither until the application sets them. On servers where
    -- explicit_defaults_for_timestamp is disabled, bare TIMESTAMP columns are
    -- implicitly NOT NULL: the first is auto-initialised and auto-updated, and
    -- later ones receive a zero-date default that NO_ZERO_DATE with strict
    -- mode rejects when the table is created.
    started_at TIMESTAMP NULL DEFAULT NULL,
    completed_at TIMESTAMP NULL DEFAULT NULL,
    error_message TEXT,

    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
|
|
|
-- -----------------------------
|
|
-- 4. Tracking & Monitoring Tables
|
|
-- -----------------------------
|
|
|
|
-- crawl_schedule: re-crawl cadence for a source.
-- Schedules are removed together with their source (ON DELETE CASCADE).
CREATE TABLE crawl_schedule (
    schedule_id INT AUTO_INCREMENT PRIMARY KEY,
    source_id INT NOT NULL,

    frequency ENUM('daily', 'weekly', 'biweekly', 'monthly', 'on_demand') DEFAULT 'weekly',
    -- Explicitly nullable with no default, so a new schedule has no crawl
    -- times until they are set. On servers where
    -- explicit_defaults_for_timestamp is disabled, bare TIMESTAMP columns are
    -- implicitly NOT NULL: last_crawl would be auto-updated on every row edit
    -- and next_crawl would receive a zero-date default.
    last_crawl TIMESTAMP NULL DEFAULT NULL,
    next_crawl TIMESTAMP NULL DEFAULT NULL,
    is_enabled BOOLEAN DEFAULT TRUE,

    FOREIGN KEY (source_id) REFERENCES sources(source_id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
|
|
|
-- change_detection: one row per change observed on a document, with the
-- hashes recorded before and after and the follow-up action taken.
-- Rows are removed together with their document (ON DELETE CASCADE).
CREATE TABLE change_detection (
    change_id     INT NOT NULL AUTO_INCREMENT,
    doc_id        INT NOT NULL,

    detected_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    change_type   ENUM('content_updated', 'url_moved', 'deleted', 'new_version') NOT NULL,
    previous_hash CHAR(64),
    current_hash  CHAR(64),
    diff_summary  TEXT,

    -- Starts as 'pending' until one of the other actions is recorded.
    action_taken  ENUM('pending', 'recrawled', 'archived', 'ignored') DEFAULT 'pending',

    PRIMARY KEY (change_id),
    FOREIGN KEY (doc_id) REFERENCES documents(doc_id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
|
|
|
-- -----------------------------
|
|
-- 5. Default Data
|
|
-- -----------------------------
|
|
|
|
-- Seed the top-level topics (no parent). Inserted as a single statement so
-- either all seven rows are created or none are.
INSERT INTO topics
    (topic_name,           topic_slug,           description)
VALUES
    ('Prompt Engineering', 'prompt-engineering', 'Techniques for effective LLM prompting'),
    ('Claude Models',      'claude-models',      'Claude model architecture, capabilities, and versions'),
    ('Agent Building',     'agent-building',     'AI agent design patterns and implementation'),
    ('Claude Code',        'claude-code',        'Claude Code CLI tool usage and best practices'),
    ('MCP Integrations',   'mcp-integrations',   'Model Context Protocol servers and tools'),
    ('API Reference',      'api-reference',      'Anthropic API documentation and usage'),
    ('Fine-tuning',        'fine-tuning',        'Model fine-tuning techniques and datasets');
|
|
|
|
-- Seed the initial sources, all marked tier1_official. Columns not listed
-- here (is_active, created_at, updated_at) take their defaults.
INSERT INTO sources
    (source_name, source_type, base_url, credibility_tier, vendor)
VALUES
    ('Anthropic Official Docs',    'official_docs',    'https://docs.anthropic.com',                       'tier1_official', 'anthropic'),
    ('Claude.ai Docs',             'official_docs',    'https://docs.claude.com',                          'tier1_official', 'anthropic'),
    ('Anthropic Engineering Blog', 'engineering_blog', 'https://anthropic.com/engineering',                'tier1_official', 'anthropic'),
    ('Anthropic News',             'engineering_blog', 'https://anthropic.com/news',                       'tier1_official', 'anthropic'),
    ('Anthropic Cookbook',         'github_repo',      'https://github.com/anthropics/anthropic-cookbook', 'tier1_official', 'anthropic'),
    ('OpenAI Docs',                'official_docs',    'https://platform.openai.com/docs',                 'tier1_official', 'openai'),
    ('Google AI Docs',             'official_docs',    'https://ai.google.dev/docs',                       'tier1_official', 'google');
|
|
|
|
-- -----------------------------
|
|
-- 6. Useful Views
|
|
-- -----------------------------
|
|
|
|
-- Review queue: every distilled document whose review_status is still
-- 'pending', together with its document title/URL and its source's tier.
CREATE OR REPLACE VIEW v_pending_reviews AS
SELECT
    dist.distill_id,
    doc.doc_id,
    doc.title,
    doc.url,
    dist.token_count_distilled,
    dist.distill_date,
    src.credibility_tier
FROM distilled_content AS dist
INNER JOIN documents AS doc
    ON doc.doc_id = dist.doc_id
INNER JOIN sources AS src
    ON src.source_id = doc.source_id
WHERE dist.review_status = 'pending'
ORDER BY
    src.credibility_tier ASC,   -- ENUM columns sort by declaration position, so tier1 comes first
    dist.distill_date ASC;      -- oldest distillation first within a tier
|
|
|
|
-- Export candidates: one row per (document, topic) for distilled content whose
-- most recent review approved it with a quality score of at least 0.80.
CREATE OR REPLACE VIEW v_export_ready AS
SELECT
    doc.doc_id,
    doc.title,
    doc.url,
    dist.structured_content,
    tp.topic_slug,
    tp.topic_name,
    rev.quality_score,
    src.credibility_tier,
    src.vendor
FROM documents AS doc
INNER JOIN distilled_content AS dist
    ON dist.doc_id = doc.doc_id
INNER JOIN document_topics AS link
    ON link.doc_id = doc.doc_id
INNER JOIN topics AS tp
    ON tp.topic_id = link.topic_id
INNER JOIN review_logs AS rev
    ON rev.distill_id = dist.distill_id
INNER JOIN sources AS src
    ON src.source_id = doc.source_id
WHERE rev.decision = 'approve'
  AND rev.quality_score >= 0.80
  -- Keep only the latest review of each distillation (highest review_id), so
  -- an earlier approval does not count once a later review exists.
  AND rev.review_id = (
        SELECT MAX(latest.review_id)
        FROM review_logs AS latest
        WHERE latest.distill_id = dist.distill_id
      )
ORDER BY
    tp.topic_slug,
    rev.quality_score DESC;
|