-- ===========================================
-- Reference Library Database Schema
-- Version: 1.0
-- Purpose: Store and manage curated reference materials
-- ===========================================

CREATE DATABASE IF NOT EXISTS reference_library
    CHARACTER SET utf8mb4
    COLLATE utf8mb4_unicode_ci;

USE reference_library;

-- -----------------------------
-- 1. Core Tables
-- -----------------------------

-- Registry of crawlable sources (doc sites, blogs, repos, papers).
CREATE TABLE sources (
    source_id INT AUTO_INCREMENT PRIMARY KEY,
    source_name VARCHAR(255) NOT NULL,
    source_type ENUM('official_docs', 'engineering_blog', 'research_paper',
                     'github_repo', 'community_guide', 'pdf_document',
                     'api_reference') NOT NULL,
    base_url VARCHAR(500),
    credibility_tier ENUM('tier1_official', 'tier2_verified', 'tier3_community')
        DEFAULT 'tier3_community',
    vendor VARCHAR(100),  -- anthropic, openai, google, etc.
    is_active BOOLEAN DEFAULT TRUE,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    INDEX idx_vendor (vendor),
    INDEX idx_source_type (source_type)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- One row per crawled document; prior versions chain via previous_version_id.
CREATE TABLE documents (
    doc_id INT AUTO_INCREMENT PRIMARY KEY,
    source_id INT NOT NULL,

    -- Document identification
    title VARCHAR(500) NOT NULL,
    url VARCHAR(1000),
    url_hash CHAR(64) AS (SHA2(url, 256)) STORED,  -- For deduplication

    -- Content metadata
    doc_type ENUM('webpage', 'pdf', 'markdown', 'api_spec', 'code_sample') NOT NULL,
    language ENUM('en', 'ko', 'mixed') DEFAULT 'en',
    original_publish_date DATE,
    last_modified_date DATE,

    -- Crawl metadata
    -- Explicit NULL: a bare TIMESTAMP column is implicitly NOT NULL with a
    -- zero-date default when explicit_defaults_for_timestamp=OFF, which
    -- strict SQL mode rejects on insert.
    crawl_date TIMESTAMP NULL,
    crawl_method ENUM('firecrawl', 'scrapy', 'aiohttp', 'nodejs', 'manual', 'api')
        DEFAULT 'aiohttp',
    crawl_status ENUM('pending', 'completed', 'failed', 'stale') DEFAULT 'pending',

    -- Storage
    raw_content_path VARCHAR(500),  -- Path to raw crawled file
    raw_content_size INT,           -- Bytes

    -- Version tracking
    version INT DEFAULT 1,
    previous_version_id INT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    FOREIGN KEY (source_id) REFERENCES sources(source_id) ON DELETE CASCADE,
    FOREIGN KEY (previous_version_id) REFERENCES documents(doc_id) ON DELETE SET NULL,
    UNIQUE INDEX idx_url_hash (url_hash),
    INDEX idx_crawl_status (crawl_status),
    INDEX idx_crawl_date (crawl_date)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- -----------------------------
-- 2. Content Processing Tables
-- -----------------------------

-- LLM-distilled version of a document, ready for review.
CREATE TABLE distilled_content (
    distill_id INT AUTO_INCREMENT PRIMARY KEY,
    doc_id INT NOT NULL,

    -- Distilled output
    summary TEXT,                   -- Executive summary
    key_concepts JSON,              -- Extracted key terms and definitions
    code_snippets JSON,             -- Extracted code examples
    structured_content MEDIUMTEXT,  -- Full distilled markdown

    -- Quality metrics
    token_count_original INT,
    token_count_distilled INT,
    -- NULLIF guards the generated column against division by zero:
    -- when token_count_original = 0 the ratio becomes NULL instead of
    -- failing the INSERT under strict SQL mode. Percentage (0-100+).
    compression_ratio DECIMAL(5,2)
        AS (token_count_distilled / NULLIF(token_count_original, 0) * 100),

    -- Processing metadata
    distill_model VARCHAR(50),  -- claude-opus-4-5, etc.
    distill_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

    -- Review status
    review_status ENUM('pending', 'in_review', 'approved', 'needs_refactor', 'rejected')
        DEFAULT 'pending',
    FOREIGN KEY (doc_id) REFERENCES documents(doc_id) ON DELETE CASCADE,
    INDEX idx_review_status (review_status)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- Audit trail of every review pass over a distilled document.
CREATE TABLE review_logs (
    review_id INT AUTO_INCREMENT PRIMARY KEY,
    distill_id INT NOT NULL,

    -- Review details
    review_round INT DEFAULT 1,
    reviewer_type ENUM('auto_qa', 'human', 'claude_review') NOT NULL,

    -- Quality assessment
    quality_score DECIMAL(3,2),  -- 0.00 - 1.00
    assessment JSON,             -- Detailed scoring breakdown
    /* Example assessment JSON:
       {
         "accuracy": 0.9,
         "completeness": 0.85,
         "clarity": 0.95,
         "prompt_engineering_quality": 0.88,
         "usability": 0.82
       }
    */

    -- Review outcome
    decision ENUM('approve', 'refactor', 'deep_research', 'reject') NOT NULL,
    feedback TEXT,
    refactor_instructions TEXT,
    research_queries JSON,  -- Additional search queries if deep_research needed
    reviewed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- Enforce the documented 0.00 - 1.00 range (enforced on MySQL 8.0.16+).
    CONSTRAINT review_logs_quality_score_check
        CHECK (quality_score IS NULL OR quality_score BETWEEN 0.00 AND 1.00),
    FOREIGN KEY (distill_id) REFERENCES distilled_content(distill_id) ON DELETE CASCADE,
    INDEX idx_decision (decision)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- -----------------------------
-- 3. Organization & Export Tables
-- -----------------------------

-- Hierarchical topic taxonomy (self-referencing parent link).
CREATE TABLE topics (
    topic_id INT AUTO_INCREMENT PRIMARY KEY,
    topic_name VARCHAR(255) NOT NULL,
    topic_slug VARCHAR(100) NOT NULL,  -- URL/folder-friendly
    parent_topic_id INT,
    description TEXT,
    FOREIGN KEY (parent_topic_id) REFERENCES topics(topic_id) ON DELETE SET NULL,
    UNIQUE INDEX idx_topic_slug (topic_slug)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- Many-to-many junction between documents and topics.
CREATE TABLE document_topics (
    doc_id INT NOT NULL,
    topic_id INT NOT NULL,
    relevance_score DECIMAL(3,2) DEFAULT 1.00,
    PRIMARY KEY (doc_id, topic_id),
    FOREIGN KEY (doc_id) REFERENCES documents(doc_id) ON DELETE CASCADE,
    FOREIGN KEY (topic_id) REFERENCES topics(topic_id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- One row per export job (batch materialization of approved content).
CREATE TABLE export_jobs (
    export_id INT AUTO_INCREMENT PRIMARY KEY,

    -- Export configuration
    export_name VARCHAR(255) NOT NULL,
    export_type ENUM('project_files', 'fine_tuning', 'training_dataset',
                     'knowledge_base') NOT NULL,
    output_format ENUM('markdown', 'jsonl', 'parquet', 'sqlite') DEFAULT 'markdown',

    -- Scope
    topic_filter JSON,  -- Topic IDs to include
    date_range_start DATE,
    date_range_end DATE,
    min_quality_score DECIMAL(3,2) DEFAULT 0.80,

    -- Output
    output_path VARCHAR(500),
    total_documents INT,
    total_tokens INT,

    -- Status
    status ENUM('pending', 'processing', 'completed', 'failed') DEFAULT 'pending',
    -- Explicit NULL (matching pipeline_runs.completed_at): bare TIMESTAMP
    -- columns are implicitly NOT NULL with zero-date defaults when
    -- explicit_defaults_for_timestamp=OFF, which strict mode rejects.
    started_at TIMESTAMP NULL,
    completed_at TIMESTAMP NULL,
    error_message TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- -----------------------------
-- 4. Pipeline Orchestration Tables
-- -----------------------------

-- One row per end-to-end pipeline execution, from discovery to export.
CREATE TABLE pipeline_runs (
    run_id INT AUTO_INCREMENT PRIMARY KEY,

    -- Input configuration
    run_type ENUM('topic', 'urls', 'manifest') NOT NULL,
    input_value TEXT NOT NULL,

    -- Status tracking
    status ENUM('running', 'completed', 'failed', 'paused') DEFAULT 'running',
    current_stage ENUM('discovery', 'crawling', 'storing', 'distilling',
                       'reviewing', 'exporting') DEFAULT 'discovery',

    -- Configuration options
    options JSON,
    /* Example options JSON:
       {
         "max_sources": 10,
         "max_pages": 50,
         "auto_approve": false,
         "threshold": 0.85,
         "max_iterations": 3,
         "export_format": "project_files"
       }
    */

    -- Pipeline statistics
    stats JSON,
    /* Example stats JSON:
       {
         "sources_discovered": 5,
         "pages_crawled": 45,
         "documents_stored": 45,
         "documents_distilled": 45,
         "approved": 40,
         "refactored": 8,
         "deep_researched": 2,
         "rejected": 3,
         "needs_manual_review": 2
       }
    */

    -- Export information
    export_path VARCHAR(500),
    export_document_count INT,

    -- Timing
    started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    completed_at TIMESTAMP NULL,

    -- Error handling
    error_message TEXT,
    error_stage VARCHAR(50),
    INDEX idx_status (status),
    INDEX idx_run_type (run_type),
    INDEX idx_started_at (started_at)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- Per-document iteration counters within a run (one row per run/doc pair).
CREATE TABLE pipeline_iteration_tracker (
    tracker_id INT AUTO_INCREMENT PRIMARY KEY,
    run_id INT NOT NULL,
    doc_id INT NOT NULL,

    -- Iteration counts
    refactor_count INT DEFAULT 0,
    deep_research_count INT DEFAULT 0,

    -- Final status
    final_decision ENUM('approved', 'rejected', 'needs_manual_review'),
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    FOREIGN KEY (run_id) REFERENCES pipeline_runs(run_id) ON DELETE CASCADE,
    FOREIGN KEY (doc_id) REFERENCES documents(doc_id) ON DELETE CASCADE,
    UNIQUE INDEX idx_run_doc (run_id, doc_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- -----------------------------
-- 5. Tracking & Monitoring Tables
-- -----------------------------

-- Recurring crawl cadence per source.
CREATE TABLE crawl_schedule (
    schedule_id INT AUTO_INCREMENT PRIMARY KEY,
    source_id INT NOT NULL,
    frequency ENUM('daily', 'weekly', 'biweekly', 'monthly', 'on_demand')
        DEFAULT 'weekly',
    -- Explicit NULL: bare TIMESTAMP columns are implicitly NOT NULL with
    -- zero-date defaults when explicit_defaults_for_timestamp=OFF, which
    -- strict mode rejects; these must start out unset.
    last_crawl TIMESTAMP NULL,
    next_crawl TIMESTAMP NULL,
    is_enabled BOOLEAN DEFAULT TRUE,
    FOREIGN KEY (source_id) REFERENCES sources(source_id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- Detected content/URL changes on already-stored documents.
CREATE TABLE change_detection (
    change_id INT AUTO_INCREMENT PRIMARY KEY,
    doc_id INT NOT NULL,
    detected_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    change_type ENUM('content_updated', 'url_moved', 'deleted', 'new_version') NOT NULL,
    previous_hash CHAR(64),
    current_hash CHAR(64),
    diff_summary TEXT,
    action_taken ENUM('pending', 'recrawled', 'archived', 'ignored') DEFAULT 'pending',
    FOREIGN KEY (doc_id) REFERENCES documents(doc_id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- -----------------------------
-- 6. Default Data
-- -----------------------------

-- Seed topic taxonomy. NOTE(review): plain INSERTs — re-running this file
-- against an existing database will fail on the topic_slug unique key.
INSERT INTO topics (topic_name, topic_slug, description) VALUES
    ('Prompt Engineering', 'prompt-engineering',
     'Techniques for effective LLM prompting'),
    ('Claude Models', 'claude-models',
     'Claude model architecture, capabilities, and versions'),
    ('Agent Building', 'agent-building',
     'AI agent design patterns and implementation'),
    ('Claude Code', 'claude-code',
     'Claude Code CLI tool usage and best practices'),
    ('MCP Integrations', 'mcp-integrations',
     'Model Context Protocol servers and tools'),
    ('API Reference', 'api-reference',
     'Anthropic API documentation and usage'),
    ('Fine-tuning', 'fine-tuning',
     'Model fine-tuning techniques and datasets');

-- Seed source registry with tier-1 official vendors.
INSERT INTO sources (source_name, source_type, base_url, credibility_tier, vendor) VALUES
    ('Anthropic Official Docs', 'official_docs',
     'https://docs.anthropic.com', 'tier1_official', 'anthropic'),
    ('Claude.ai Docs', 'official_docs',
     'https://docs.claude.com', 'tier1_official', 'anthropic'),
    ('Anthropic Engineering Blog', 'engineering_blog',
     'https://anthropic.com/engineering', 'tier1_official', 'anthropic'),
    ('Anthropic News', 'engineering_blog',
     'https://anthropic.com/news', 'tier1_official', 'anthropic'),
    ('Anthropic Cookbook', 'github_repo',
     'https://github.com/anthropics/anthropic-cookbook', 'tier1_official', 'anthropic'),
    ('OpenAI Docs', 'official_docs',
     'https://platform.openai.com/docs', 'tier1_official', 'openai'),
    ('Google AI Docs', 'official_docs',
     'https://ai.google.dev/docs', 'tier1_official', 'google');
-- -----------------------------
-- 7. Useful Views
-- -----------------------------

-- Distilled documents awaiting review, most credible sources first.
CREATE OR REPLACE VIEW v_pending_reviews AS
SELECT
    dc.distill_id,
    d.doc_id,
    d.title,
    d.url,
    dc.token_count_distilled,
    dc.distill_date,
    s.credibility_tier
FROM distilled_content dc
INNER JOIN documents d ON dc.doc_id = d.doc_id
INNER JOIN sources s ON d.source_id = s.source_id
WHERE dc.review_status = 'pending'
ORDER BY s.credibility_tier ASC, dc.distill_date ASC;

-- Approved, high-quality documents eligible for export, keyed to their
-- LATEST review only (the MAX(review_id) correlated subquery).
CREATE OR REPLACE VIEW v_export_ready AS
SELECT
    d.doc_id,
    d.title,
    d.url,
    dc.structured_content,
    t.topic_slug,
    t.topic_name,
    rl.quality_score,
    s.credibility_tier,
    s.vendor
FROM documents d
INNER JOIN distilled_content dc ON d.doc_id = dc.doc_id
INNER JOIN document_topics dt ON d.doc_id = dt.doc_id
INNER JOIN topics t ON dt.topic_id = t.topic_id
INNER JOIN review_logs rl ON dc.distill_id = rl.distill_id
INNER JOIN sources s ON d.source_id = s.source_id
WHERE rl.decision = 'approve'
  AND rl.quality_score >= 0.80
  AND rl.review_id = (
      SELECT MAX(review_id)
      FROM review_logs
      WHERE distill_id = dc.distill_id
  )
ORDER BY t.topic_slug, rl.quality_score DESC;

-- Dashboard summary of pipeline runs with key stats unpacked from JSON.
CREATE OR REPLACE VIEW v_pipeline_status AS
SELECT
    pr.run_id,
    pr.run_type,
    pr.input_value,
    pr.status,
    pr.current_stage,
    pr.started_at,
    pr.completed_at,
    TIMESTAMPDIFF(MINUTE, pr.started_at, COALESCE(pr.completed_at, NOW()))
        AS duration_minutes,
    JSON_EXTRACT(pr.stats, '$.sources_discovered') AS sources_discovered,
    JSON_EXTRACT(pr.stats, '$.pages_crawled') AS pages_crawled,
    JSON_EXTRACT(pr.stats, '$.documents_stored') AS documents_stored,
    JSON_EXTRACT(pr.stats, '$.approved') AS approved,
    JSON_EXTRACT(pr.stats, '$.rejected') AS rejected,
    JSON_EXTRACT(pr.stats, '$.needs_manual_review') AS needs_manual_review,
    pr.export_path,
    pr.error_message
FROM pipeline_runs pr
ORDER BY pr.started_at DESC;

-- Per-document iteration counts for each run.
CREATE OR REPLACE VIEW v_pipeline_iterations AS
SELECT
    pit.run_id,
    pr.input_value,
    d.title,
    d.url,
    pit.refactor_count,
    pit.deep_research_count,
    (pit.refactor_count + pit.deep_research_count) AS total_iterations,
    pit.final_decision
FROM pipeline_iteration_tracker pit
INNER JOIN pipeline_runs pr ON pit.run_id = pr.run_id
INNER JOIN documents d ON pit.doc_id = d.doc_id
-- FIX: total_iterations is a select-list alias, not a column of pit;
-- "ORDER BY pit.total_iterations" raises "Unknown column" in MySQL.
ORDER BY pit.run_id DESC, total_iterations DESC;