feat(reference-curator): Add pipeline orchestrator and refactor skill format
Pipeline Orchestrator: - Add 07-pipeline-orchestrator skill with code/CLAUDE.md and desktop/SKILL.md - Add /reference-curator-pipeline slash command for full workflow automation - Add pipeline_runs and pipeline_iteration_tracker tables to schema.sql - Add v_pipeline_status and v_pipeline_iterations views - Add pipeline_config.yaml configuration template - Update AGENTS.md with Reference Curator Skills section - Update claude-project files with pipeline documentation Skill Format Refactoring: - Extract YAML frontmatter from SKILL.md files to separate skill.yaml - Add tools/ directories with MCP tool documentation - Update SKILL-FORMAT-REQUIREMENTS.md with new structure - Add migrate-skill-structure.py script for format conversion Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,40 @@
|
||||
# Pipeline Orchestrator Configuration
# Copy to ~/.config/reference-curator/pipeline_config.yaml

pipeline:
  # Discovery stage
  max_sources: 10

  # Crawler stage
  max_pages: 50

  # Auto-approve settings
  auto_approve: false
  approval_threshold: 0.85

qa_loop:
  # Maximum iterations before escalating to manual review
  max_refactor_iterations: 3
  max_deep_research_iterations: 2
  max_total_iterations: 5

export:
  # Default export format: project_files, fine_tuning, jsonl
  default_format: project_files

  # Include rejected documents in a separate folder
  include_rejected: false

state:
  # State management backend: mysql or file
  # NOTE(review): ${VAR:-default} is shell-style substitution, not YAML —
  # assumes the config loader expands it; a plain YAML parser will keep
  # the literal string. Confirm against the loader.
  backend: ${STATE_BACKEND:-file}

  # File-based state directory (used when backend=file)
  state_directory: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/pipeline_state/

logging:
  # Log level: DEBUG, INFO, WARNING, ERROR
  level: INFO

  # Save detailed logs for each run
  save_run_logs: true
|
||||
@@ -187,7 +187,91 @@ CREATE TABLE export_jobs (
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||
|
||||
-- -----------------------------
-- 4. Pipeline Orchestration Tables
-- -----------------------------
||||
|
||||
-- One row per end-to-end pipeline execution (discovery through export).
CREATE TABLE pipeline_runs (
    run_id INT AUTO_INCREMENT PRIMARY KEY,

    -- What the run was started from: a topic query, an explicit URL list,
    -- or a manifest file; input_value holds the corresponding raw input.
    run_type ENUM('topic', 'urls', 'manifest') NOT NULL,
    input_value TEXT NOT NULL,

    -- Lifecycle state, and the stage currently executing while running.
    status ENUM('running', 'completed', 'failed', 'paused') DEFAULT 'running',
    current_stage ENUM('discovery', 'crawling', 'storing', 'distilling', 'reviewing', 'exporting') DEFAULT 'discovery',

    -- Per-run configuration overrides, e.g.:
    --   {"max_sources": 10, "max_pages": 50, "auto_approve": false,
    --    "threshold": 0.85, "max_iterations": 3,
    --    "export_format": "project_files"}
    options JSON,

    -- Counters accumulated as the run progresses, e.g.:
    --   {"sources_discovered": 5, "pages_crawled": 45,
    --    "documents_stored": 45, "documents_distilled": 45,
    --    "approved": 40, "refactored": 8, "deep_researched": 2,
    --    "rejected": 3, "needs_manual_review": 2}
    stats JSON,

    -- Where the export landed and how many documents it contained.
    export_path VARCHAR(500),
    export_document_count INT,

    -- Timing: completed_at stays NULL while the run is in flight.
    started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    completed_at TIMESTAMP NULL,

    -- Populated when status = 'failed'; error_stage records where it broke.
    error_message TEXT,
    error_stage VARCHAR(50),

    INDEX idx_status (status),
    INDEX idx_run_type (run_type),
    INDEX idx_started_at (started_at)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||
|
||||
-- Tracks how many QA-loop passes each document needed within a run.
-- One row per (run, document) pair, enforced by idx_run_doc.
CREATE TABLE pipeline_iteration_tracker (
    tracker_id INT AUTO_INCREMENT PRIMARY KEY,
    run_id INT NOT NULL,
    doc_id INT NOT NULL,

    -- Iteration counts. NOT NULL so that sums such as
    -- refactor_count + deep_research_count can never silently
    -- become NULL (the original columns were nullable).
    refactor_count INT NOT NULL DEFAULT 0,
    deep_research_count INT NOT NULL DEFAULT 0,

    -- NULL while the QA loop is still running for this document.
    final_decision ENUM('approved', 'rejected', 'needs_manual_review'),

    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,

    -- Tracker rows have no meaning without their run/document.
    FOREIGN KEY (run_id) REFERENCES pipeline_runs(run_id) ON DELETE CASCADE,
    FOREIGN KEY (doc_id) REFERENCES documents(doc_id) ON DELETE CASCADE,

    UNIQUE INDEX idx_run_doc (run_id, doc_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||
|
||||
-- -----------------------------
|
||||
-- 5. Tracking & Monitoring Tables
|
||||
-- -----------------------------
|
||||
|
||||
CREATE TABLE crawl_schedule (
|
||||
@@ -218,7 +302,7 @@ CREATE TABLE change_detection (
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||
|
||||
-- -----------------------------
-- 6. Default Data
-- -----------------------------
||||
|
||||
INSERT INTO topics (topic_name, topic_slug, description) VALUES
|
||||
@@ -240,7 +324,7 @@ INSERT INTO sources (source_name, source_type, base_url, credibility_tier, vendo
|
||||
('Google AI Docs', 'official_docs', 'https://ai.google.dev/docs', 'tier1_official', 'google');
|
||||
|
||||
-- -----------------------------
-- 7. Useful Views
-- -----------------------------
||||
|
||||
CREATE OR REPLACE VIEW v_pending_reviews AS
|
||||
@@ -283,3 +367,39 @@ AND rl.review_id = (
|
||||
WHERE distill_id = dc.distill_id
|
||||
)
|
||||
ORDER BY t.topic_slug, rl.quality_score DESC;
|
||||
|
||||
-- Dashboard view: one row per pipeline run with headline statistics
-- unpacked from the stats JSON column, newest runs first.
CREATE OR REPLACE VIEW v_pipeline_status AS
SELECT
    pr.run_id,
    pr.run_type,
    pr.input_value,
    pr.status,
    pr.current_stage,
    pr.started_at,
    pr.completed_at,
    -- Wall-clock duration; still-running runs are measured up to NOW().
    TIMESTAMPDIFF(MINUTE, pr.started_at, COALESCE(pr.completed_at, NOW())) AS duration_minutes,
    -- col->path is MySQL shorthand for JSON_EXTRACT(col, path).
    pr.stats->'$.sources_discovered' AS sources_discovered,
    pr.stats->'$.pages_crawled' AS pages_crawled,
    pr.stats->'$.documents_stored' AS documents_stored,
    pr.stats->'$.approved' AS approved,
    pr.stats->'$.rejected' AS rejected,
    pr.stats->'$.needs_manual_review' AS needs_manual_review,
    pr.export_path,
    pr.error_message
FROM pipeline_runs pr
ORDER BY pr.started_at DESC;
|
||||
|
||||
-- Per-document QA-loop history: how many refactor / deep-research passes
-- each document took, and the decision it ended with.
CREATE OR REPLACE VIEW v_pipeline_iterations AS
SELECT
    pit.run_id,
    pr.input_value,
    d.title,
    d.url,
    pit.refactor_count,
    pit.deep_research_count,
    (pit.refactor_count + pit.deep_research_count) AS total_iterations,
    pit.final_decision
FROM pipeline_iteration_tracker pit
INNER JOIN pipeline_runs pr ON pr.run_id = pit.run_id
INNER JOIN documents d ON d.doc_id = pit.doc_id
-- BUG FIX: the original ordered by `pit.total_iterations`, but
-- total_iterations is a SELECT-list alias, not a column of the tracker
-- table; the table-qualified reference makes CREATE VIEW fail with
-- "unknown column". Aliases must be referenced unqualified in ORDER BY.
ORDER BY pit.run_id DESC, total_iterations DESC;
|
||||
|
||||
Reference in New Issue
Block a user