-- ===========================================
-- Reference Library Database Schema
-- Version: 1.0
-- Purpose: Store and manage curated reference materials
-- ===========================================

CREATE DATABASE IF NOT EXISTS reference_library
    CHARACTER SET utf8mb4
    COLLATE utf8mb4_unicode_ci;

USE reference_library;

-- -----------------------------
-- 1. Core Tables
-- -----------------------------

-- Registry of crawlable sources (doc sites, blogs, repos, papers).
CREATE TABLE sources (
    source_id INT AUTO_INCREMENT PRIMARY KEY,
    source_name VARCHAR(255) NOT NULL,
    source_type ENUM('official_docs', 'engineering_blog', 'research_paper',
                     'github_repo', 'community_guide', 'pdf_document',
                     'api_reference') NOT NULL,
    base_url VARCHAR(500),
    credibility_tier ENUM('tier1_official', 'tier2_verified', 'tier3_community')
        DEFAULT 'tier3_community',
    vendor VARCHAR(100),  -- anthropic, openai, google, etc.
    is_active BOOLEAN DEFAULT TRUE,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    INDEX idx_vendor (vendor),
    INDEX idx_source_type (source_type)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- One row per crawled document; prior versions chain via previous_version_id.
CREATE TABLE documents (
    doc_id INT AUTO_INCREMENT PRIMARY KEY,
    source_id INT NOT NULL,

    -- Document identification
    title VARCHAR(500) NOT NULL,
    url VARCHAR(1000),
    url_hash CHAR(64) AS (SHA2(url, 256)) STORED,  -- For deduplication

    -- Content metadata
    doc_type ENUM('webpage', 'pdf', 'markdown', 'api_spec', 'code_sample') NOT NULL,
    language ENUM('en', 'ko', 'mixed') DEFAULT 'en',
    original_publish_date DATE,
    last_modified_date DATE,

    -- Crawl metadata
    -- Explicit NULL: a bare TIMESTAMP column is implicitly NOT NULL with a
    -- zero-date default when explicit_defaults_for_timestamp=OFF, which
    -- strict SQL mode rejects on insert.
    crawl_date TIMESTAMP NULL,
    crawl_method ENUM('firecrawl', 'scrapy', 'aiohttp', 'nodejs', 'manual', 'api')
        DEFAULT 'aiohttp',
    crawl_status ENUM('pending', 'completed', 'failed', 'stale') DEFAULT 'pending',

    -- Storage
    raw_content_path VARCHAR(500),  -- Path to raw crawled file
    raw_content_size INT,           -- Bytes

    -- Version tracking
    version INT DEFAULT 1,
    previous_version_id INT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    FOREIGN KEY (source_id) REFERENCES sources(source_id) ON DELETE CASCADE,
    FOREIGN KEY (previous_version_id) REFERENCES documents(doc_id) ON DELETE SET NULL,
    UNIQUE INDEX idx_url_hash (url_hash),
    INDEX idx_crawl_status (crawl_status),
    INDEX idx_crawl_date (crawl_date)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- -----------------------------
-- 2. Content Processing Tables
-- -----------------------------

-- LLM-distilled version of a document, ready for review.
CREATE TABLE distilled_content (
    distill_id INT AUTO_INCREMENT PRIMARY KEY,
    doc_id INT NOT NULL,

    -- Distilled output
    summary TEXT,                   -- Executive summary
    key_concepts JSON,              -- Extracted key terms and definitions
    code_snippets JSON,             -- Extracted code examples
    structured_content MEDIUMTEXT,  -- Full distilled markdown

    -- Quality metrics
    token_count_original INT,
    token_count_distilled INT,
    -- NULLIF guards the generated column against division by zero:
    -- when token_count_original = 0 the ratio becomes NULL instead of
    -- failing the INSERT under strict SQL mode. Percentage (0-100+).
    compression_ratio DECIMAL(5,2)
        AS (token_count_distilled / NULLIF(token_count_original, 0) * 100),

    -- Processing metadata
    distill_model VARCHAR(50),  -- claude-opus-4-5, etc.
    distill_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

    -- Review status
    review_status ENUM('pending', 'in_review', 'approved', 'needs_refactor', 'rejected')
        DEFAULT 'pending',
    FOREIGN KEY (doc_id) REFERENCES documents(doc_id) ON DELETE CASCADE,
    INDEX idx_review_status (review_status)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- Audit trail of every review pass over a distilled document.
CREATE TABLE review_logs (
    review_id INT AUTO_INCREMENT PRIMARY KEY,
    distill_id INT NOT NULL,

    -- Review details
    review_round INT DEFAULT 1,
    reviewer_type ENUM('auto_qa', 'human', 'claude_review') NOT NULL,

    -- Quality assessment
    quality_score DECIMAL(3,2),  -- 0.00 - 1.00
    assessment JSON,             -- Detailed scoring breakdown
    /* Example assessment JSON:
       {
         "accuracy": 0.9,
         "completeness": 0.85,
         "clarity": 0.95,
         "prompt_engineering_quality": 0.88,
         "usability": 0.82
       }
    */

    -- Review outcome
    decision ENUM('approve', 'refactor', 'deep_research', 'reject') NOT NULL,
    feedback TEXT,
    refactor_instructions TEXT,
    research_queries JSON,  -- Additional search queries if deep_research needed
    reviewed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- Enforce the documented 0.00 - 1.00 range (enforced on MySQL 8.0.16+).
    CONSTRAINT review_logs_quality_score_check
        CHECK (quality_score IS NULL OR quality_score BETWEEN 0.00 AND 1.00),
    FOREIGN KEY (distill_id) REFERENCES distilled_content(distill_id) ON DELETE CASCADE,
    INDEX idx_decision (decision)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- -----------------------------
-- 3. Organization & Export Tables
-- -----------------------------

-- Hierarchical topic taxonomy (self-referencing parent link).
CREATE TABLE topics (
    topic_id INT AUTO_INCREMENT PRIMARY KEY,
    topic_name VARCHAR(255) NOT NULL,
    topic_slug VARCHAR(100) NOT NULL,  -- URL/folder-friendly
    parent_topic_id INT,
    description TEXT,
    FOREIGN KEY (parent_topic_id) REFERENCES topics(topic_id) ON DELETE SET NULL,
    UNIQUE INDEX idx_topic_slug (topic_slug)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- Many-to-many junction between documents and topics.
CREATE TABLE document_topics (
    doc_id INT NOT NULL,
    topic_id INT NOT NULL,
    relevance_score DECIMAL(3,2) DEFAULT 1.00,
    PRIMARY KEY (doc_id, topic_id),
    FOREIGN KEY (doc_id) REFERENCES documents(doc_id) ON DELETE CASCADE,
    FOREIGN KEY (topic_id) REFERENCES topics(topic_id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- One row per export job (batch materialization of approved content).
CREATE TABLE export_jobs (
    export_id INT AUTO_INCREMENT PRIMARY KEY,

    -- Export configuration
    export_name VARCHAR(255) NOT NULL,
    export_type ENUM('project_files', 'fine_tuning', 'training_dataset',
                     'knowledge_base') NOT NULL,
    output_format ENUM('markdown', 'jsonl', 'parquet', 'sqlite') DEFAULT 'markdown',

    -- Scope
    topic_filter JSON,  -- Topic IDs to include
    date_range_start DATE,
    date_range_end DATE,
    min_quality_score DECIMAL(3,2) DEFAULT 0.80,

    -- Output
    output_path VARCHAR(500),
    total_documents INT,
    total_tokens INT,

    -- Status
    status ENUM('pending', 'processing', 'completed', 'failed') DEFAULT 'pending',
    -- Explicit NULL (matching pipeline_runs.completed_at): bare TIMESTAMP
    -- columns are implicitly NOT NULL with zero-date defaults when
    -- explicit_defaults_for_timestamp=OFF, which strict mode rejects.
    started_at TIMESTAMP NULL,
    completed_at TIMESTAMP NULL,
    error_message TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- -----------------------------
-- 4. Pipeline Orchestration Tables
-- -----------------------------

-- One row per end-to-end pipeline execution, from discovery to export.
CREATE TABLE pipeline_runs (
    run_id INT AUTO_INCREMENT PRIMARY KEY,

    -- Input configuration
    run_type ENUM('topic', 'urls', 'manifest') NOT NULL,
    input_value TEXT NOT NULL,

    -- Status tracking
    status ENUM('running', 'completed', 'failed', 'paused') DEFAULT 'running',
    current_stage ENUM('discovery', 'crawling', 'storing', 'distilling',
                       'reviewing', 'exporting') DEFAULT 'discovery',

    -- Configuration options
    options JSON,
    /* Example options JSON:
       {
         "max_sources": 10,
         "max_pages": 50,
         "auto_approve": false,
         "threshold": 0.85,
         "max_iterations": 3,
         "export_format": "project_files"
       }
    */

    -- Pipeline statistics
    stats JSON,
    /* Example stats JSON:
       {
         "sources_discovered": 5,
         "pages_crawled": 45,
         "documents_stored": 45,
         "documents_distilled": 45,
         "approved": 40,
         "refactored": 8,
         "deep_researched": 2,
         "rejected": 3,
         "needs_manual_review": 2
       }
    */

    -- Export information
    export_path VARCHAR(500),
    export_document_count INT,

    -- Timing
    started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    completed_at TIMESTAMP NULL,

    -- Error handling
    error_message TEXT,
    error_stage VARCHAR(50),
    INDEX idx_status (status),
    INDEX idx_run_type (run_type),
    INDEX idx_started_at (started_at)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- Per-document iteration counters within a run (one row per run/doc pair).
CREATE TABLE pipeline_iteration_tracker (
    tracker_id INT AUTO_INCREMENT PRIMARY KEY,
    run_id INT NOT NULL,
    doc_id INT NOT NULL,

    -- Iteration counts
    refactor_count INT DEFAULT 0,
    deep_research_count INT DEFAULT 0,

    -- Final status
    final_decision ENUM('approved', 'rejected', 'needs_manual_review'),
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    FOREIGN KEY (run_id) REFERENCES pipeline_runs(run_id) ON DELETE CASCADE,
    FOREIGN KEY (doc_id) REFERENCES documents(doc_id) ON DELETE CASCADE,
    UNIQUE INDEX idx_run_doc (run_id, doc_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- -----------------------------
-- 5. Tracking & Monitoring Tables
-- -----------------------------

-- Recurring crawl cadence per source.
CREATE TABLE crawl_schedule (
    schedule_id INT AUTO_INCREMENT PRIMARY KEY,
    source_id INT NOT NULL,
    frequency ENUM('daily', 'weekly', 'biweekly', 'monthly', 'on_demand')
        DEFAULT 'weekly',
    -- Explicit NULL: bare TIMESTAMP columns are implicitly NOT NULL with
    -- zero-date defaults when explicit_defaults_for_timestamp=OFF, which
    -- strict mode rejects; these must start out unset.
    last_crawl TIMESTAMP NULL,
    next_crawl TIMESTAMP NULL,
    is_enabled BOOLEAN DEFAULT TRUE,
    FOREIGN KEY (source_id) REFERENCES sources(source_id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- Detected content/URL changes on already-stored documents.
CREATE TABLE change_detection (
    change_id INT AUTO_INCREMENT PRIMARY KEY,
    doc_id INT NOT NULL,
    detected_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    change_type ENUM('content_updated', 'url_moved', 'deleted', 'new_version') NOT NULL,
    previous_hash CHAR(64),
    current_hash CHAR(64),
    diff_summary TEXT,
    action_taken ENUM('pending', 'recrawled', 'archived', 'ignored') DEFAULT 'pending',
    FOREIGN KEY (doc_id) REFERENCES documents(doc_id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- -----------------------------
-- 6. Default Data
-- -----------------------------

-- Seed topic taxonomy. NOTE(review): plain INSERTs — re-running this file
-- against an existing database will fail on the topic_slug unique key.
INSERT INTO topics (topic_name, topic_slug, description) VALUES
    ('Prompt Engineering', 'prompt-engineering',
     'Techniques for effective LLM prompting'),
    ('Claude Models', 'claude-models',
     'Claude model architecture, capabilities, and versions'),
    ('Agent Building', 'agent-building',
     'AI agent design patterns and implementation'),
    ('Claude Code', 'claude-code',
     'Claude Code CLI tool usage and best practices'),
    ('MCP Integrations', 'mcp-integrations',
     'Model Context Protocol servers and tools'),
    ('API Reference', 'api-reference',
     'Anthropic API documentation and usage'),
    ('Fine-tuning', 'fine-tuning',
     'Model fine-tuning techniques and datasets');

-- Seed source registry with tier-1 official vendors.
INSERT INTO sources (source_name, source_type, base_url, credibility_tier, vendor) VALUES
    ('Anthropic Official Docs', 'official_docs',
     'https://docs.anthropic.com', 'tier1_official', 'anthropic'),
    ('Claude.ai Docs', 'official_docs',
     'https://docs.claude.com', 'tier1_official', 'anthropic'),
    ('Anthropic Engineering Blog', 'engineering_blog',
     'https://anthropic.com/engineering', 'tier1_official', 'anthropic'),
    ('Anthropic News', 'engineering_blog',
     'https://anthropic.com/news', 'tier1_official', 'anthropic'),
    ('Anthropic Cookbook', 'github_repo',
     'https://github.com/anthropics/anthropic-cookbook', 'tier1_official', 'anthropic'),
    ('OpenAI Docs', 'official_docs',
     'https://platform.openai.com/docs', 'tier1_official', 'openai'),
    ('Google AI Docs', 'official_docs',
     'https://ai.google.dev/docs', 'tier1_official', 'google');
-- -----------------------------
-- 7. Useful Views
-- -----------------------------

-- Distilled documents awaiting review, most credible sources first.
CREATE OR REPLACE VIEW v_pending_reviews AS
SELECT
    dc.distill_id,
    d.doc_id,
    d.title,
    d.url,
    dc.token_count_distilled,
    dc.distill_date,
    s.credibility_tier
FROM distilled_content dc
INNER JOIN documents d ON dc.doc_id = d.doc_id
INNER JOIN sources s ON d.source_id = s.source_id
WHERE dc.review_status = 'pending'
ORDER BY s.credibility_tier ASC, dc.distill_date ASC;

-- Approved, high-quality documents eligible for export, keyed to their
-- LATEST review only (the MAX(review_id) correlated subquery).
CREATE OR REPLACE VIEW v_export_ready AS
SELECT
    d.doc_id,
    d.title,
    d.url,
    dc.structured_content,
    t.topic_slug,
    t.topic_name,
    rl.quality_score,
    s.credibility_tier,
    s.vendor
FROM documents d
INNER JOIN distilled_content dc ON d.doc_id = dc.doc_id
INNER JOIN document_topics dt ON d.doc_id = dt.doc_id
INNER JOIN topics t ON dt.topic_id = t.topic_id
INNER JOIN review_logs rl ON dc.distill_id = rl.distill_id
INNER JOIN sources s ON d.source_id = s.source_id
WHERE rl.decision = 'approve'
  AND rl.quality_score >= 0.80
  AND rl.review_id = (
      SELECT MAX(review_id)
      FROM review_logs
      WHERE distill_id = dc.distill_id
  )
ORDER BY t.topic_slug, rl.quality_score DESC;

-- Dashboard summary of pipeline runs with key stats unpacked from JSON.
CREATE OR REPLACE VIEW v_pipeline_status AS
SELECT
    pr.run_id,
    pr.run_type,
    pr.input_value,
    pr.status,
    pr.current_stage,
    pr.started_at,
    pr.completed_at,
    TIMESTAMPDIFF(MINUTE, pr.started_at, COALESCE(pr.completed_at, NOW()))
        AS duration_minutes,
    JSON_EXTRACT(pr.stats, '$.sources_discovered') AS sources_discovered,
    JSON_EXTRACT(pr.stats, '$.pages_crawled') AS pages_crawled,
    JSON_EXTRACT(pr.stats, '$.documents_stored') AS documents_stored,
    JSON_EXTRACT(pr.stats, '$.approved') AS approved,
    JSON_EXTRACT(pr.stats, '$.rejected') AS rejected,
    JSON_EXTRACT(pr.stats, '$.needs_manual_review') AS needs_manual_review,
    pr.export_path,
    pr.error_message
FROM pipeline_runs pr
ORDER BY pr.started_at DESC;

-- Per-document iteration counts for each run.
CREATE OR REPLACE VIEW v_pipeline_iterations AS
SELECT
    pit.run_id,
    pr.input_value,
    d.title,
    d.url,
    pit.refactor_count,
    pit.deep_research_count,
    (pit.refactor_count + pit.deep_research_count) AS total_iterations,
    pit.final_decision
FROM pipeline_iteration_tracker pit
INNER JOIN pipeline_runs pr ON pit.run_id = pr.run_id
INNER JOIN documents d ON pit.doc_id = d.doc_id
-- FIX: total_iterations is a select-list alias, not a column of pit;
-- "ORDER BY pit.total_iterations" raises "Unknown column" in MySQL.
ORDER BY pit.run_id DESC, total_iterations DESC;