feat(reference-curator): Add pipeline orchestrator and refactor skill format

Pipeline Orchestrator:
- Add 07-pipeline-orchestrator skill with code/CLAUDE.md and desktop/SKILL.md
- Add /reference-curator-pipeline slash command for full workflow automation
- Add pipeline_runs and pipeline_iteration_tracker tables to schema.sql
- Add v_pipeline_status and v_pipeline_iterations views
- Add pipeline_config.yaml configuration template
- Update AGENTS.md with Reference Curator Skills section
- Update claude-project files with pipeline documentation

Skill Format Refactoring:
- Extract YAML frontmatter from SKILL.md files to separate skill.yaml
- Add tools/ directories with MCP tool documentation
- Update SKILL-FORMAT-REQUIREMENTS.md with new structure
- Add migrate-skill-structure.py script for format conversion

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-29 01:01:02 +07:00
parent 243b9d851c
commit d1cd1298a8
91 changed files with 2475 additions and 281 deletions

View File

@@ -1,8 +1,3 @@
---
name: reference-discovery
description: Search and identify authoritative sources for reference materials. Validates source credibility, prioritizes by relevance, and outputs curated URL lists with metadata. Triggers on "find references", "search documentation", "discover sources", "find authoritative materials", "research topic sources".
---
# Reference Discovery
Searches for authoritative sources, validates credibility, and produces curated URL lists for crawling.

View File

@@ -0,0 +1,9 @@
# Skill metadata (extracted from SKILL.md frontmatter)
name: reference-discovery
description: |
Search and identify authoritative sources for reference materials. Validates source credibility, prioritizes by relevance, and outputs curated URL lists with metadata. Triggers on "find references", "search documentation", "discover sources", "find authoritative materials", "research topic sources".
# Optional fields
# triggers: [] # TODO: Extract from description

View File

@@ -1,8 +1,3 @@
---
name: web-crawler-orchestrator
description: Orchestrates web crawling using Firecrawl MCP. Handles rate limiting, selects crawl strategies, manages formats (HTML/PDF/markdown), and produces raw content with manifests. Triggers on "crawl URLs", "fetch documents", "scrape pages", "download references", "Firecrawl crawl".
---
# Web Crawler Orchestrator
Manages crawling operations using Firecrawl MCP with rate limiting and format handling.

View File

@@ -0,0 +1,9 @@
# Skill metadata (extracted from SKILL.md frontmatter)
name: web-crawler-orchestrator
description: |
Orchestrates web crawling using Firecrawl MCP. Handles rate limiting, selects crawl strategies, manages formats (HTML/PDF/markdown), and produces raw content with manifests. Triggers on "crawl URLs", "fetch documents", "scrape pages", "download references", "Firecrawl crawl".
# Optional fields
# triggers: [] # TODO: Extract from description

View File

@@ -1,8 +1,3 @@
---
name: content-repository
description: MySQL storage management for reference library. Use when storing crawled content, managing document versions, deduplicating URLs, querying stored references, or tracking document metadata. Triggers on keywords like "store content", "save to database", "check duplicates", "version tracking", "document retrieval", "reference library DB".
---
# Content Repository
Manages MySQL storage for the reference library system. Handles document storage, version control, deduplication, and retrieval.

View File

@@ -0,0 +1,9 @@
# Skill metadata (extracted from SKILL.md frontmatter)
name: content-repository
description: |
MySQL storage management for reference library. Use when storing crawled content, managing document versions, deduplicating URLs, querying stored references, or tracking document metadata. Triggers on keywords like "store content", "save to database", "check duplicates", "version tracking", "document retrieval", "reference library DB".
# Optional fields
# triggers: [] # TODO: Extract from description

View File

@@ -1,8 +1,3 @@
---
name: content-distiller
description: Analyzes and distills raw crawled content into concise reference materials. Extracts key concepts, code snippets, and creates structured summaries optimized for prompt engineering use cases. Triggers on "distill content", "summarize document", "extract key concepts", "process raw content", "create reference summary".
---
# Content Distiller
Transforms raw crawled content into structured, high-quality reference materials.

View File

@@ -0,0 +1,9 @@
# Skill metadata (extracted from SKILL.md frontmatter)
name: content-distiller
description: |
Analyzes and distills raw crawled content into concise reference materials. Extracts key concepts, code snippets, and creates structured summaries optimized for prompt engineering use cases. Triggers on "distill content", "summarize document", "extract key concepts", "process raw content", "create reference summary".
# Optional fields
# triggers: [] # TODO: Extract from description

View File

@@ -1,8 +1,3 @@
---
name: quality-reviewer
description: QA loop for reference library content. Scores distilled materials against prompt engineering quality criteria, routes decisions (approve/refactor/deep_research/reject), and provides actionable feedback. Triggers on "review content", "quality check", "QA review", "assess distilled content", "check reference quality", "refactoring needed".
---
# Quality Reviewer
Evaluates distilled content for quality, routes decisions, and triggers refactoring or additional research when needed.

View File

@@ -0,0 +1,9 @@
# Skill metadata (extracted from SKILL.md frontmatter)
name: quality-reviewer
description: |
QA loop for reference library content. Scores distilled materials against prompt engineering quality criteria, routes decisions (approve/refactor/deep_research/reject), and provides actionable feedback. Triggers on "review content", "quality check", "QA review", "assess distilled content", "check reference quality", "refactoring needed".
# Optional fields
# triggers: [] # TODO: Extract from description

View File

@@ -1,8 +1,3 @@
---
name: markdown-exporter
description: Exports approved reference content as structured markdown files for project knowledge or fine-tuning datasets. Generates INDEX files, organizes by topic, and maintains cross-references. Triggers on "export references", "generate project files", "create markdown output", "export for fine-tuning", "build knowledge base".
---
# Markdown Exporter
Exports approved content as structured markdown files for Claude Projects or fine-tuning.

View File

@@ -0,0 +1,9 @@
# Skill metadata (extracted from SKILL.md frontmatter)
name: markdown-exporter
description: |
Exports approved reference content as structured markdown files for project knowledge or fine-tuning datasets. Generates INDEX files, organizes by topic, and maintains cross-references. Triggers on "export references", "generate project files", "create markdown output", "export for fine-tuning", "build knowledge base".
# Optional fields
# triggers: [] # TODO: Extract from description

View File

@@ -0,0 +1,296 @@
# Pipeline Orchestrator
Coordinates the full 6-skill reference curation workflow with QA loop handling.
## Trigger Keywords
"curate references", "full pipeline", "run curation", "reference-curator-pipeline"
## Architecture
```
[Input] → discovery → crawler → repository → distiller ◄──┐
                                                 │        │
                                                 ▼        │
                                             reviewer     │
                                                 │        │
                      ├── APPROVE → export                │
                      ├── REFACTOR ───────────────────────┤
                      ├── DEEP_RESEARCH → crawler ────────┘
                      └── REJECT → archive
```
## Input Detection
Parse input to determine mode:
```python
def detect_input_mode(input_value):
if input_value.endswith('.json') and os.path.exists(input_value):
return 'manifest'
elif input_value.startswith('http://') or input_value.startswith('https://'):
return 'urls'
else:
return 'topic'
```
## Pipeline Execution
### Stage 1: Reference Discovery (Topic Mode Only)
```bash
# Skip if input mode is 'urls' or 'manifest'
if mode == 'topic':
/reference-discovery "$TOPIC" --max-sources $MAX_SOURCES
# Output: manifest.json
```
### Stage 2: Web Crawler
```bash
# From manifest or URLs
/web-crawler $INPUT --max-pages $MAX_PAGES
# Output: crawled files in ~/reference-library/raw/
```
### Stage 3: Content Repository
```bash
/content-repository store
# Output: documents stored in MySQL or file-based storage
```
### Stage 4: Content Distiller
```bash
/content-distiller all-pending
# Output: distilled content records
```
### Stage 5: Quality Reviewer
```bash
if auto_approve:
/quality-reviewer all-pending --auto-approve --threshold $THRESHOLD
else:
/quality-reviewer all-pending
```
Handle QA decisions:
- **APPROVE**: Add to export queue
- **REFACTOR**: Re-run distiller with feedback (track iteration count)
- **DEEP_RESEARCH**: Run crawler for additional sources, then distill
- **REJECT**: Archive with reason
### Stage 6: Markdown Exporter
```bash
/markdown-exporter $EXPORT_FORMAT
# Output: files in ~/reference-library/exports/
```
## State Management
### Initialize Pipeline State
```python
def init_pipeline_state(run_id, input_value, options):
state = {
"run_id": run_id,
"run_type": detect_input_mode(input_value),
"input_value": input_value,
"status": "running",
"current_stage": "discovery",
"options": options,
"stats": {
"sources_discovered": 0,
"pages_crawled": 0,
"documents_stored": 0,
"documents_distilled": 0,
"approved": 0,
"refactored": 0,
"deep_researched": 0,
"rejected": 0,
"needs_manual_review": 0
},
"started_at": datetime.now().isoformat()
}
save_state(run_id, state)
return state
```
### MySQL State (Preferred)
```sql
INSERT INTO pipeline_runs (run_type, input_value, options)
VALUES ('topic', 'Claude system prompts', '{"max_sources": 10}');
```
### File-Based Fallback
```
~/reference-library/pipeline_state/run_XXX/
├── state.json # Current stage and stats
├── manifest.json # Discovered sources
├── crawl_results.json # Crawled document paths
├── review_log.json # QA decisions per document
└── errors.log # Any errors encountered
```
## QA Loop Logic
```python
MAX_REFACTOR_ITERATIONS = 3
MAX_DEEP_RESEARCH_ITERATIONS = 2
MAX_TOTAL_ITERATIONS = 5
def handle_qa_decision(doc_id, decision, iteration_counts):
refactor_count = iteration_counts.get('refactor', 0)
research_count = iteration_counts.get('deep_research', 0)
total = refactor_count + research_count
if total >= MAX_TOTAL_ITERATIONS:
return 'needs_manual_review'
if decision == 'refactor':
if refactor_count >= MAX_REFACTOR_ITERATIONS:
return 'needs_manual_review'
iteration_counts['refactor'] = refactor_count + 1
return 're_distill'
if decision == 'deep_research':
if research_count >= MAX_DEEP_RESEARCH_ITERATIONS:
return 'needs_manual_review'
iteration_counts['deep_research'] = research_count + 1
return 're_crawl_and_distill'
return decision # approve or reject
```
## Checkpoint Strategy
Save checkpoint after each stage completes:
| Stage | Checkpoint | Resume Point |
|-------|------------|--------------|
| discovery | `manifest.json` created | → crawler |
| crawl | `crawl_results.json` | → repository |
| store | DB records or file list | → distiller |
| distill | distilled_content records | → reviewer |
| review | review_logs records | → exporter or loop |
| export | final export complete | Done |
## Progress Reporting
Report progress to user at key checkpoints:
```
[Pipeline] Stage 1/6: Discovery - Found 8 sources
[Pipeline] Stage 2/6: Crawling - 45/50 pages complete
[Pipeline] Stage 3/6: Storing - 45 documents saved
[Pipeline] Stage 4/6: Distilling - 45 documents processed
[Pipeline] Stage 5/6: Reviewing - 40 approved, 3 refactored, 2 rejected
[Pipeline] Stage 6/6: Exporting - 40 documents exported
[Pipeline] Complete! See ~/reference-library/exports/
```
## Error Handling
```python
def handle_stage_error(stage, error, state):
state['status'] = 'paused'
state['error_message'] = str(error)
state['error_stage'] = stage
save_state(state['run_id'], state)
# Log to errors.log
log_error(state['run_id'], stage, error)
# Report to user
return f"Pipeline paused at {stage}: {error}. Resume with run_id {state['run_id']}"
```
## Resume Pipeline
```python
def resume_pipeline(run_id):
state = load_state(run_id)
if state['status'] != 'paused':
return f"Pipeline {run_id} is {state['status']}, cannot resume"
stage = state['current_stage']
state['status'] = 'running'
state['error_message'] = None
save_state(run_id, state)
# Resume from failed stage
return execute_from_stage(stage, state)
```
## Output Summary
On completion, generate summary:
```json
{
"run_id": 123,
"status": "completed",
"duration_minutes": 15,
"stats": {
"sources_discovered": 5,
"pages_crawled": 45,
"documents_stored": 45,
"documents_distilled": 45,
"approved": 40,
"refactored": 8,
"deep_researched": 2,
"rejected": 3,
"needs_manual_review": 2
},
"exports": {
"format": "project_files",
"path": "~/reference-library/exports/",
"document_count": 40
},
"errors": []
}
```
## Integration Points
| Skill | Called By | Provides |
|-------|-----------|----------|
| reference-discovery | Orchestrator | manifest.json |
| web-crawler | Orchestrator | Raw crawled files |
| content-repository | Orchestrator | Stored documents |
| content-distiller | Orchestrator, QA loop | Distilled content |
| quality-reviewer | Orchestrator | QA decisions |
| markdown-exporter | Orchestrator | Final exports |
## Configuration
Read from `~/.config/reference-curator/pipeline_config.yaml`:
```yaml
pipeline:
max_sources: 10
max_pages: 50
auto_approve: false
approval_threshold: 0.85
qa_loop:
max_refactor_iterations: 3
max_deep_research_iterations: 2
max_total_iterations: 5
export:
default_format: project_files
include_rejected: false
state:
backend: mysql # or 'file'
state_directory: ~/reference-library/pipeline_state/
```

View File

@@ -0,0 +1,279 @@
# Pipeline Orchestrator
Coordinates the full reference curation workflow, handling QA loops and state management.
## Pipeline Architecture
```
[Input: Topic | URLs | Manifest]
1. reference-discovery ──────────────────┐
(skip if URLs/manifest) │
│ │
▼ │
2. web-crawler-orchestrator │
│ │
▼ │
3. content-repository │
│ │
▼ │
4. content-distiller ◄───────────────────┤
│ │
▼ │
5. quality-reviewer │
│ │
┌─────┼─────┬────────────────┐ │
▼ ▼ ▼ ▼ │
APPROVE REJECT REFACTOR DEEP_RESEARCH│
│ │ │ │ │
│ │ └─────────────┤ │
│ │ └───────┘
▼ ▼
6. markdown-exporter archive
[Complete]
```
## Input Modes
| Mode | Example Input | Pipeline Start |
|------|--------------|----------------|
| **Topic** | `"Claude system prompts"` | Stage 1 (discovery) |
| **URLs** | `["https://docs.anthropic.com/..."]` | Stage 2 (crawler) |
| **Manifest** | Path to `manifest.json` | Stage 2 (crawler) |
## Configuration Options
```yaml
pipeline:
max_sources: 10 # Discovery limit
max_pages: 50 # Pages per source
auto_approve: false # Auto-approve above threshold
approval_threshold: 0.85
qa_loop:
max_refactor_iterations: 3
max_deep_research_iterations: 2
max_total_iterations: 5
export:
default_format: project_files # or fine_tuning, jsonl
```
## Pipeline Execution
### Stage 1: Reference Discovery
For topic-based input, search and validate authoritative sources:
```python
def run_discovery(topic, max_sources=10):
# Uses WebSearch to find sources
# Validates credibility
# Outputs manifest.json with source URLs
sources = search_authoritative_sources(topic, max_sources)
validate_and_rank_sources(sources)
write_manifest(sources)
return manifest_path
```
### Stage 2: Web Crawler
Crawl URLs from manifest or direct input:
```python
def run_crawler(input_source, max_pages=50):
# Selects optimal crawler backend
# Respects rate limits
# Stores raw content
urls = load_urls(input_source)
for url in urls:
crawl_with_best_backend(url, max_pages)
return crawl_results
```
### Stage 3: Content Repository
Store crawled content with deduplication:
```python
def run_repository(crawl_results):
# Deduplicates by URL hash
# Tracks versions
# Returns stored doc IDs
for result in crawl_results:
store_document(result)
return stored_doc_ids
```
### Stage 4: Content Distiller
Process raw content into structured summaries:
```python
def run_distiller(doc_ids, refactor_instructions=None):
# Extracts key concepts
# Generates summaries
# Creates structured markdown
for doc_id in doc_ids:
distill_document(doc_id, instructions=refactor_instructions)
return distilled_ids
```
### Stage 5: Quality Reviewer
Score and route content based on quality:
```python
def run_reviewer(distilled_ids, auto_approve=False, threshold=0.85):
decisions = {}
for distill_id in distilled_ids:
score, assessment = score_content(distill_id)
if auto_approve and score >= threshold:
decisions[distill_id] = ('approve', None)
elif score >= 0.85:
decisions[distill_id] = ('approve', None)
elif score >= 0.60:
instructions = generate_feedback(assessment)
decisions[distill_id] = ('refactor', instructions)
elif score >= 0.40:
queries = generate_research_queries(assessment)
decisions[distill_id] = ('deep_research', queries)
else:
decisions[distill_id] = ('reject', assessment)
return decisions
```
### Stage 6: Markdown Exporter
Export approved content:
```python
def run_exporter(approved_ids, format='project_files'):
# Organizes by topic
# Generates INDEX.md
# Creates cross-references
export_documents(approved_ids, format=format)
return export_path
```
## QA Loop Handling
```python
def handle_qa_loop(distill_id, decision, iteration_tracker):
counts = iteration_tracker.get(distill_id, {'refactor': 0, 'deep_research': 0})
if decision == 'refactor':
if counts['refactor'] >= MAX_REFACTOR:
return 'needs_manual_review'
counts['refactor'] += 1
iteration_tracker[distill_id] = counts
return 're_distill'
if decision == 'deep_research':
if counts['deep_research'] >= MAX_DEEP_RESEARCH:
return 'needs_manual_review'
counts['deep_research'] += 1
iteration_tracker[distill_id] = counts
return 're_crawl'
return decision
```
## State Management
### MySQL Backend (Preferred)
```sql
SELECT run_id, status, current_stage, stats
FROM pipeline_runs
WHERE run_id = ?;
```
### File-Based Fallback
```
~/reference-library/pipeline_state/
├── run_001/
│ ├── state.json # Pipeline state
│ ├── manifest.json # Discovered sources
│ ├── crawl_results.json
│ └── review_log.json # QA decisions
```
State JSON format:
```json
{
"run_id": "run_001",
"run_type": "topic",
"input_value": "Claude system prompts",
"status": "running",
"current_stage": "distilling",
"stats": {
"sources_discovered": 5,
"pages_crawled": 45,
"approved": 0,
"refactored": 0
},
"started_at": "2026-01-29T10:00:00Z"
}
```
## Checkpointing
Checkpoint after each stage to enable resume:
| Checkpoint | Trigger | Resume From |
|------------|---------|-------------|
| `discovery_complete` | Manifest saved | → crawler |
| `crawl_complete` | All pages crawled | → repository |
| `store_complete` | Docs in database | → distiller |
| `distill_complete` | Content processed | → reviewer |
| `review_complete` | Decisions logged | → exporter |
| `export_complete` | Files generated | Done |
## Output Summary
```json
{
"run_id": 123,
"status": "completed",
"duration_minutes": 15,
"stats": {
"sources_discovered": 5,
"pages_crawled": 45,
"documents_stored": 45,
"documents_distilled": 45,
"approved": 40,
"refactored": 8,
"deep_researched": 2,
"rejected": 3,
"needs_manual_review": 2
},
"exports": {
"format": "project_files",
"path": "~/reference-library/exports/",
"document_count": 40
}
}
```
## Error Handling
On stage failure:
1. Save checkpoint with error state
2. Log error details
3. Report to user with resume instructions
```python
try:
run_stage(stage_name)
save_checkpoint(stage_name, 'complete')
except Exception as e:
save_checkpoint(stage_name, 'failed', error=str(e))
report_error(f"Pipeline paused at {stage_name}: {e}")
```

View File

@@ -0,0 +1,9 @@
# Skill metadata (extracted from SKILL.md frontmatter)
name: pipeline-orchestrator
description: |
Orchestrates the full 6-skill reference curation pipeline as a background task. Coordinates discovery → crawl → store → distill → review → export with QA loop handling. Triggers on "curate references", "run full pipeline", "reference pipeline", "automate curation".
# Optional fields
# triggers: [] # TODO: Extract from description

View File

@@ -130,37 +130,44 @@ This displays available files in `claude-project/` and optionally copies them to
## Architecture
```
[Topic Input]
─────────────────────
│ reference-discovery │ → Search & validate sources
─────────────────────
┌──────────────────────────────┐
│ reference-curator-pipeline │ (Orchestrator)
│ /reference-curator-pipeline │
└──────────────────────────────
┌───────────────────────┼───────────────────────┐
▼ ▼ ▼
[Topic Input] [URL Input] [Manifest Input]
│ │ │
▼ │ │
┌─────────────────────┐ │ │
│ reference-discovery │ ◄─────────┴───────────────────────┘
└─────────────────────┘ (skip if URLs/manifest)
┌──────────────────────────┐
│ web-crawler-orchestrator │ → Crawl (Firecrawl/Node.js/aiohttp/Scrapy)
└──────────────────────────┘
┌────────────────────┐
│ content-repository │ → Store in MySQL
└────────────────────┘
┌───────────────────┐
│ content-distiller │ → Summarize & extract
└───────────────────┘
┌──────────────────┐
│ quality-reviewer │ → QA loop
└──────────────────┘
├── REFACTOR → content-distiller
├── DEEP_RESEARCH → web-crawler-orchestrator
▼ APPROVE
│ content-distiller │ → Summarize & extract ◄─────┐
└───────────────────┘
▼ │
┌──────────────────┐
│ quality-reviewer │ → QA loop
└──────────────────┘
├── REFACTOR (max 3) ────────────────────┤
├── DEEP_RESEARCH (max 2) → crawler ─────┘
▼ APPROVE
┌───────────────────┐
│ markdown-exporter │ → Project files / Fine-tuning
└───────────────────┘
@@ -170,7 +177,35 @@ This displays available files in `claude-project/` and optionally copies them to
## User Guide
### Basic Workflow
### Full Pipeline (Recommended)
Run the complete curation workflow with a single command:
```
# From topic - runs all 6 stages automatically
/reference-curator-pipeline "Claude Code best practices" --max-sources 5
# From URLs - skip discovery, start at crawler
/reference-curator-pipeline https://docs.anthropic.com/en/docs/prompt-caching
# Resume from manifest file
/reference-curator-pipeline ./manifest.json --auto-approve
# Fine-tuning dataset output
/reference-curator-pipeline "MCP servers" --export-format fine_tuning
```
**Pipeline Options:**
- `--max-sources 10` - Max sources to discover (topic mode)
- `--max-pages 50` - Max pages per source to crawl
- `--auto-approve` - Auto-approve scores above threshold
- `--threshold 0.85` - Approval threshold
- `--max-iterations 3` - Max QA loop iterations per document
- `--export-format project_files` - Output format (project_files, fine_tuning, jsonl)
---
### Manual Workflow (Step-by-Step)
**Step 1: Discover References**
```
@@ -295,6 +330,7 @@ mysql -h $MYSQL_HOST -u $MYSQL_USER -p"$MYSQL_PASSWORD" reference_library -e "
| 04 | content-distiller | `/content-distiller` | Summarize & extract |
| 05 | quality-reviewer | `/quality-reviewer` | QA scoring & routing |
| 06 | markdown-exporter | `/markdown-exporter` | Export to markdown/JSONL |
| 07 | pipeline-orchestrator | `/reference-curator-pipeline` | Full pipeline orchestration |
---
@@ -435,7 +471,8 @@ mysql -h $MYSQL_HOST -u $MYSQL_USER -p"$MYSQL_PASSWORD" reference_library < shar
│ ├── content-repository.md
│ ├── content-distiller.md
│ ├── quality-reviewer.md
│ └── markdown-exporter.md
│ ├── markdown-exporter.md
│ └── reference-curator-pipeline.md
├── 01-reference-discovery/
│ ├── code/CLAUDE.md # Claude Code directive
@@ -455,6 +492,9 @@ mysql -h $MYSQL_HOST -u $MYSQL_USER -p"$MYSQL_PASSWORD" reference_library < shar
├── 06-markdown-exporter/
│ ├── code/CLAUDE.md
│ └── desktop/SKILL.md
├── 07-pipeline-orchestrator/
│ ├── code/CLAUDE.md
│ └── desktop/SKILL.md
└── shared/
├── schema.sql # MySQL schema

View File

@@ -0,0 +1,175 @@
# Pipeline Orchestrator
Coordinates the full 6-skill reference curation workflow with automated QA loop handling.
## Trigger Phrases
- "curate references on [topic]"
- "run full curation pipeline"
- "automate reference curation"
- "curate these URLs: [url1, url2]"
## Input Modes
| Mode | Example | Pipeline Start |
|------|---------|----------------|
| **Topic** | "curate references on Claude system prompts" | Stage 1 (discovery) |
| **URLs** | "curate these URLs: https://docs.anthropic.com/..." | Stage 2 (crawler) |
| **Manifest** | "resume curation from manifest.json" | Stage 2 (crawler) |
## Pipeline Stages
```
1. reference-discovery (topic mode only)
2. web-crawler-orchestrator
3. content-repository
4. content-distiller ◄─────────────┐
│ │
▼ │
5. quality-reviewer │
│ │
├── APPROVE → Stage 6 │
├── REFACTOR ───────────────┤
├── DEEP_RESEARCH → Stage 2 ┘
└── REJECT → Archive
6. markdown-exporter
```
## Configuration Options
| Option | Default | Description |
|--------|---------|-------------|
| max_sources | 10 | Maximum sources to discover (topic mode) |
| max_pages | 50 | Maximum pages per source to crawl |
| auto_approve | false | Auto-approve scores above threshold |
| threshold | 0.85 | Quality score threshold for approval |
| max_iterations | 3 | Maximum QA loop iterations per document |
| export_format | project_files | Output format (project_files, fine_tuning, jsonl) |
## QA Loop Handling
The orchestrator automatically handles QA decisions:
| Decision | Action | Iteration Limit |
|----------|--------|-----------------|
| **APPROVE** | Proceed to export | - |
| **REFACTOR** | Re-distill with feedback | 3 iterations |
| **DEEP_RESEARCH** | Crawl more sources, re-distill | 2 iterations |
| **REJECT** | Archive with reason | - |
After reaching iteration limits, documents are marked `needs_manual_review`.
## State Management
### With Database
Pipeline state is tracked in `pipeline_runs` table:
- Run ID, input type, current stage
- Statistics (crawled, distilled, approved, etc.)
- Error handling and resume capability
### File-Based Fallback
State saved to `~/reference-library/pipeline_state/run_XXX/`:
- `state.json` - Current stage and statistics
- `manifest.json` - Discovered sources
- `review_log.json` - QA decisions
## Progress Tracking
The orchestrator reports progress at each stage:
```
[Pipeline] Stage 1/6: Discovery - Found 8 sources
[Pipeline] Stage 2/6: Crawling - 45/50 pages complete
[Pipeline] Stage 3/6: Storing - 45 documents saved
[Pipeline] Stage 4/6: Distilling - 45 documents processed
[Pipeline] Stage 5/6: Reviewing - 40 approved, 3 refactored, 2 rejected
[Pipeline] Stage 6/6: Exporting - 40 documents exported
[Pipeline] Complete! See ~/reference-library/exports/
```
## Output Summary
On completion, returns detailed summary:
```
Pipeline Complete:
- Sources discovered: 5
- Pages crawled: 45
- Documents stored: 45
- Approved: 40
- Refactored: 8
- Deep researched: 2
- Rejected: 3
- Needs manual review: 2
Exports saved to: ~/reference-library/exports/
Format: project_files
```
## Error Handling
If a stage fails:
1. State is checkpointed
2. Error is logged with details
3. Pipeline can be resumed from last successful stage
## Resume Pipeline
To resume a paused or failed pipeline:
- Provide the manifest.json or run state file
- Pipeline continues from last checkpoint
## Example Usage
### Full Pipeline from Topic
```
User: Curate references on Claude Code best practices with 5 sources
Claude: I'll run the full curation pipeline for "Claude Code best practices":
[Pipeline] Stage 1/6: Discovering sources...
Found 5 authoritative sources (3 official, 2 community)
[Pipeline] Stage 2/6: Crawling 5 sources...
Crawled 45 pages total
[Pipeline] Stage 3/6: Storing documents...
45 documents saved to repository
[Pipeline] Stage 4/6: Distilling content...
45 documents processed
[Pipeline] Stage 5/6: Quality review...
- 38 approved
- 5 refactored (re-distilled: 2 approved on retry, 3 flagged for manual review)
- 2 rejected (low quality)
[Pipeline] Stage 6/6: Exporting...
40 documents exported to ~/reference-library/exports/
Pipeline complete! 40 reference documents ready for use.
```
### From Specific URLs
```
User: Curate these URLs with auto-approve:
- https://docs.anthropic.com/en/docs/prompt-caching
- https://docs.anthropic.com/en/docs/build-with-claude/tool-use
Claude: Running pipeline from URLs (skipping discovery)...
[Pipeline] Stage 2/6: Crawling 2 sources...
[...continues with remaining stages...]
```

View File

@@ -1,11 +1,27 @@
# Reference Curator - Claude.ai Project Knowledge
This project knowledge enables Claude to curate, process, and export reference documentation through 6 modular skills.
This project knowledge enables Claude to curate, process, and export reference documentation through 7 modular skills.
## Quick Start - Pipeline Orchestrator
Run the full curation workflow with a single command:
```
# Full pipeline from topic
curate references on "Claude Code best practices"
# From URLs (skip discovery)
curate these URLs: https://docs.anthropic.com/en/docs/prompt-caching
# With auto-approve
curate references on "MCP servers" with auto-approve
```
## Skills Overview
| Skill | Purpose | Trigger Phrases |
|-------|---------|-----------------|
| **pipeline-orchestrator** | Full 6-skill workflow with QA loops | "curate references", "run full pipeline", "automate curation" |
| **reference-discovery** | Search & validate authoritative sources | "find references", "search documentation", "discover sources" |
| **web-crawler** | Multi-backend crawling orchestration | "crawl URL", "fetch documents", "scrape pages" |
| **content-repository** | MySQL storage management | "store content", "save to database", "check duplicates" |
@@ -16,37 +32,43 @@ This project knowledge enables Claude to curate, process, and export reference d
## Workflow
```
[Topic Input]
┌─────────────────────┐
│ reference-discovery │ → Search & validate sources
└─────────────────────┘
┌───────────────────────────┐
│ pipeline-orchestrator │ (Coordinates all stages)
└───────────────────────────┘
┌───────────────────┼───────────────────┐
▼ ▼ ▼
[Topic Input] [URL Input] [Manifest Input]
│ │ │
▼ │ │
┌─────────────────────┐ │ │
│ reference-discovery │ ◄───┴───────────────────┘
└─────────────────────┘ (skip if URLs/manifest)
┌─────────────────────┐
│ web-crawler │ → Crawl (Firecrawl/Node.js/aiohttp/Scrapy)
└─────────────────────┘
┌─────────────────────┐
│ content-repository │ → Store in MySQL
└─────────────────────┘
┌─────────────────────┐
│ content-distiller │ → Summarize & extract
└─────────────────────┘
┌─────────────────────┐
│ quality-reviewer │ → QA loop
└─────────────────────┘
├── REFACTOR → content-distiller
├── DEEP_RESEARCH → web-crawler
▼ APPROVE
│ content-distiller │ → Summarize & extract ◄────┐
└─────────────────────┘
▼ │
┌─────────────────────┐
│ quality-reviewer │ → QA loop
└─────────────────────┘
├── REFACTOR (max 3) ───────────────────────┤
├── DEEP_RESEARCH (max 2) → crawler ────────┘
▼ APPROVE
┌─────────────────────┐
│ markdown-exporter │ → Project files / Fine-tuning
└─────────────────────┘
@@ -74,16 +96,28 @@ This project knowledge enables Claude to curate, process, and export reference d
## Files in This Project
- `INDEX.md` - This overview file
- `reference-curator-complete.md` - All 6 skills in one file
- `reference-curator-complete.md` - All 7 skills in one file (recommended)
- `01-reference-discovery.md` - Source discovery skill
- `02-web-crawler.md` - Crawling orchestration skill
- `03-content-repository.md` - Database storage skill
- `04-content-distiller.md` - Content summarization skill
- `05-quality-reviewer.md` - QA review skill
- `06-markdown-exporter.md` - Export skill
- `07-pipeline-orchestrator.md` - Full pipeline orchestration
## Usage
Upload all files to a Claude.ai Project, or upload only the skills you need.
For the complete experience, upload `reference-curator-complete.md` which contains all skills in one file.
## Pipeline Orchestrator Options
| Option | Default | Description |
|--------|---------|-------------|
| max_sources | 10 | Max sources to discover |
| max_pages | 50 | Max pages per source |
| auto_approve | false | Auto-approve above threshold |
| threshold | 0.85 | Approval threshold |
| max_iterations | 3 | Max QA loop iterations |
| export_format | project_files | Output format |

View File

@@ -1,6 +1,87 @@
# Reference Curator - Complete Skill Set
This document contains all 6 skills for curating, processing, and exporting reference documentation.
This document contains all 7 skills for curating, processing, and exporting reference documentation.
---
# Pipeline Orchestrator (Recommended Entry Point)
Coordinates the full 6-skill workflow with automated QA loop handling.
## Quick Start
```
# Full pipeline from topic
curate references on "Claude Code best practices"
# From URLs (skip discovery)
curate these URLs: https://docs.anthropic.com/en/docs/prompt-caching
# With auto-approve
curate references on "MCP servers" with auto-approve and fine-tuning output
```
## Configuration Options
| Option | Default | Description |
|--------|---------|-------------|
| max_sources | 10 | Maximum sources to discover |
| max_pages | 50 | Maximum pages per source |
| auto_approve | false | Auto-approve above threshold |
| threshold | 0.85 | Approval threshold |
| max_iterations | 3 | Max QA loop iterations |
| export_format | project_files | Output format |
## Pipeline Flow
```
[Input: Topic | URLs | Manifest]
1. reference-discovery (skip if URLs/manifest)
2. web-crawler
3. content-repository
4. content-distiller ◄─────────────┐
│ │
▼ │
5. quality-reviewer │
│ │
├── APPROVE → export │
├── REFACTOR (max 3) ─────┤
├── DEEP_RESEARCH (max 2) → crawler
└── REJECT → archive
6. markdown-exporter
```
## QA Loop Handling
| Decision | Action | Max Iterations |
|----------|--------|----------------|
| APPROVE | Proceed to export | - |
| REFACTOR | Re-distill with feedback | 3 |
| DEEP_RESEARCH | Crawl more sources | 2 |
| REJECT | Archive with reason | - |
Documents exceeding iteration limits are marked `needs_manual_review`.
## Output Summary
```
Pipeline Complete:
- Sources discovered: 5
- Pages crawled: 45
- Approved: 40
- Needs manual review: 2
- Exports: ~/reference-library/exports/
```
---
@@ -464,6 +545,7 @@ def add_cross_references(doc, all_docs):
| From | Output | To |
|------|--------|-----|
| **pipeline-orchestrator** | Coordinates all stages | All skills below |
| **reference-discovery** | URL manifest | web-crawler |
| **web-crawler** | Raw content + manifest | content-repository |
| **content-repository** | Document records | content-distiller |
@@ -471,3 +553,25 @@ def add_cross_references(doc, all_docs):
| **quality-reviewer** (approve) | Approved IDs | markdown-exporter |
| **quality-reviewer** (refactor) | Instructions | content-distiller |
| **quality-reviewer** (deep_research) | Queries | web-crawler |
## State Management
The pipeline orchestrator tracks state for resume capability:
**With Database:**
- `pipeline_runs` table tracks run status, current stage, statistics
- `pipeline_iteration_tracker` tracks QA loop iterations per document
**File-Based Fallback:**
```
~/reference-library/pipeline_state/run_XXX/
├── state.json # Current stage and stats
├── manifest.json # Discovered sources
└── review_log.json # QA decisions
```
## Resume Pipeline
To resume a paused or failed pipeline:
1. Provide the run_id or state file path
2. Pipeline continues from last successful checkpoint

View File

@@ -0,0 +1,115 @@
---
description: Orchestrates full reference curation pipeline as background task. Runs discovery → crawl → store → distill → review → export with QA loop handling.
argument-hint: <topic|urls|manifest> [--max-sources 10] [--max-pages 50] [--auto-approve] [--threshold 0.85] [--max-iterations 3] [--export-format project_files]
allowed-tools: WebSearch, WebFetch, Read, Write, Bash, Grep, Glob, Task
---
# Reference Curator Pipeline
Full-stack orchestration of the 6-skill reference curation workflow.
## Input Modes
| Mode | Input Example | Pipeline Start |
|------|---------------|----------------|
| **Topic** | `"Claude system prompts"` | reference-discovery |
| **URLs** | `https://docs.anthropic.com/...` | web-crawler (skip discovery) |
| **Manifest** | `./manifest.json` | web-crawler (resume from discovery) |
## Arguments
- `<input>`: Required. Topic string, URL(s), or manifest file path
- `--max-sources`: Maximum sources to discover (topic mode, default: 10)
- `--max-pages`: Maximum pages per source to crawl (default: 50)
- `--auto-approve`: Auto-approve scores above threshold
- `--threshold`: Approval threshold (default: 0.85)
- `--max-iterations`: Max QA loop iterations per document (default: 3)
- `--export-format`: Output format: `project_files`, `fine_tuning`, `jsonl` (default: project_files)
## Pipeline Stages
```
1. reference-discovery (topic mode only)
2. web-crawler-orchestrator
3. content-repository
4. content-distiller ◄────────┐
5. quality-reviewer │
├── APPROVE → export │
├── REFACTOR ─────────────────┤
├── DEEP_RESEARCH → crawler ──┘
└── REJECT → archive
6. markdown-exporter
```
## QA Loop Handling
| Decision | Action | Max Iterations |
|----------|--------|----------------|
| REFACTOR | Re-distill with feedback | 3 |
| DEEP_RESEARCH | Crawl more sources, re-distill | 2 |
| Combined | Total loops per document | 5 |
After the maximum number of iterations is reached, the document is marked `needs_manual_review`.
## Example Usage
```
# Full pipeline from topic
/reference-curator-pipeline "Claude Code best practices" --max-sources 5
# Pipeline from specific URLs (skip discovery)
/reference-curator-pipeline https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching
# Resume from existing manifest
/reference-curator-pipeline ./manifest.json --auto-approve
# Fine-tuning dataset output
/reference-curator-pipeline "MCP servers" --export-format fine_tuning --auto-approve
```
## State Management
Pipeline state is saved after each stage to allow resume:
**With MySQL:**
```sql
SELECT * FROM pipeline_runs WHERE run_id = 123;
```
**File-based fallback:**
```
~/reference-library/pipeline_state/run_XXX/state.json
```
## Output
Pipeline returns summary on completion:
```json
{
"run_id": 123,
"status": "completed",
"stats": {
"sources_discovered": 5,
"pages_crawled": 45,
"documents_stored": 45,
"approved": 40,
"refactored": 8,
"deep_researched": 2,
"rejected": 3,
"needs_manual_review": 2
},
"exports": {
"format": "project_files",
"path": "~/reference-library/exports/"
}
}
```
## See Also
- `/reference-discovery` - Run discovery stage only
- `/web-crawler` - Run crawler stage only
- `/content-repository` - Manage stored content
- `/quality-reviewer` - Run QA review only
- `/markdown-exporter` - Run export only

View File

@@ -0,0 +1,40 @@
# Pipeline Orchestrator Configuration
# Copy to ~/.config/reference-curator/pipeline_config.yaml
#
# NOTE(review): the ${VAR:-default} placeholders below are shell-style
# substitutions; a plain YAML parser will NOT expand them. The loader must
# perform envsubst-style expansion before (or after) parsing — confirm the
# consuming code does this.

pipeline:
  # Discovery stage: maximum number of sources to discover per topic
  max_sources: 10

  # Crawler stage: maximum pages crawled per source
  max_pages: 50

  # Auto-approve settings: when true, documents scoring at or above
  # approval_threshold skip manual review
  auto_approve: false
  approval_threshold: 0.85

qa_loop:
  # Maximum iterations before escalating to manual review
  max_refactor_iterations: 3
  max_deep_research_iterations: 2
  max_total_iterations: 5

export:
  # Default export format: project_files, fine_tuning, jsonl
  default_format: project_files
  # Include rejected documents in a separate folder
  include_rejected: false

state:
  # State management backend: mysql or file
  backend: ${STATE_BACKEND:-file}
  # File-based state directory (used when backend=file)
  state_directory: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/pipeline_state/

logging:
  # Log level: DEBUG, INFO, WARNING, ERROR
  level: INFO
  # Save detailed logs for each run
  save_run_logs: true

View File

@@ -187,7 +187,91 @@ CREATE TABLE export_jobs (
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- -----------------------------
-- 4. Tracking & Monitoring Tables
-- 4. Pipeline Orchestration Tables
-- -----------------------------
-- One row per orchestrated pipeline run: what started it, how far it has
-- progressed through the six stages, aggregate statistics, export results,
-- and failure details when a stage errors out.
CREATE TABLE pipeline_runs (
    run_id INT AUTO_INCREMENT PRIMARY KEY,

    -- Input configuration: how the run was started and the raw input value
    -- (topic string, URL list, or manifest file path)
    run_type ENUM('topic', 'urls', 'manifest') NOT NULL,
    input_value TEXT NOT NULL,

    -- Status tracking. NOT NULL so status queries never have to reason about
    -- a NULL state; a new row always starts as a running discovery-stage run.
    status ENUM('running', 'completed', 'failed', 'paused') NOT NULL DEFAULT 'running',
    current_stage ENUM('discovery', 'crawling', 'storing', 'distilling', 'reviewing', 'exporting') NOT NULL DEFAULT 'discovery',

    -- Configuration options as supplied to the orchestrator
    options JSON,
    /*
    Example options JSON:
    {
      "max_sources": 10,
      "max_pages": 50,
      "auto_approve": false,
      "threshold": 0.85,
      "max_iterations": 3,
      "export_format": "project_files"
    }
    */

    -- Pipeline statistics, updated as each stage completes
    stats JSON,
    /*
    Example stats JSON:
    {
      "sources_discovered": 5,
      "pages_crawled": 45,
      "documents_stored": 45,
      "documents_distilled": 45,
      "approved": 40,
      "refactored": 8,
      "deep_researched": 2,
      "rejected": 3,
      "needs_manual_review": 2
    }
    */

    -- Export information (populated once the exporting stage finishes)
    export_path VARCHAR(500),
    export_document_count INT,

    -- Timing: completed_at stays NULL while the run is open
    started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    completed_at TIMESTAMP NULL,

    -- Error handling (set when status = 'failed')
    error_message TEXT,
    error_stage VARCHAR(50),

    INDEX idx_status (status),
    INDEX idx_run_type (run_type),
    INDEX idx_started_at (started_at)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- Per-document QA-loop bookkeeping for a pipeline run: how many REFACTOR and
-- DEEP_RESEARCH iterations a document consumed and how the loop ended.
CREATE TABLE pipeline_iteration_tracker (
    tracker_id INT AUTO_INCREMENT PRIMARY KEY,
    run_id INT NOT NULL,
    doc_id INT NOT NULL,

    -- Iteration counts. NOT NULL so iteration-limit checks and sums never
    -- have to handle a NULL counter.
    refactor_count INT NOT NULL DEFAULT 0,
    deep_research_count INT NOT NULL DEFAULT 0,

    -- Final status; NULL while the document is still inside the QA loop
    final_decision ENUM('approved', 'rejected', 'needs_manual_review'),

    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,

    -- Named constraints so FK violations are greppable in error messages and
    -- migrations; CASCADE keeps tracker rows from outliving run or document
    CONSTRAINT pipeline_iteration_tracker_run_id_fk
        FOREIGN KEY (run_id) REFERENCES pipeline_runs(run_id) ON DELETE CASCADE,
    CONSTRAINT pipeline_iteration_tracker_doc_id_fk
        FOREIGN KEY (doc_id) REFERENCES documents(doc_id) ON DELETE CASCADE,

    -- Exactly one tracker row per (run, document) pair
    UNIQUE INDEX idx_run_doc (run_id, doc_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- -----------------------------
-- 5. Tracking & Monitoring Tables
-- -----------------------------
CREATE TABLE crawl_schedule (
@@ -218,7 +302,7 @@ CREATE TABLE change_detection (
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- -----------------------------
-- 5. Default Data
-- 6. Default Data
-- -----------------------------
INSERT INTO topics (topic_name, topic_slug, description) VALUES
@@ -240,7 +324,7 @@ INSERT INTO sources (source_name, source_type, base_url, credibility_tier, vendo
('Google AI Docs', 'official_docs', 'https://ai.google.dev/docs', 'tier1_official', 'google');
-- -----------------------------
-- 6. Useful Views
-- 7. Useful Views
-- -----------------------------
CREATE OR REPLACE VIEW v_pending_reviews AS
@@ -283,3 +367,39 @@ AND rl.review_id = (
WHERE distill_id = dc.distill_id
)
ORDER BY t.topic_slug, rl.quality_score DESC;
-- Dashboard view: one row per pipeline run with headline statistics pulled
-- out of the stats JSON, most recent runs first.
CREATE OR REPLACE VIEW v_pipeline_status AS
SELECT
    pr.run_id,
    pr.run_type,
    pr.input_value,
    pr.status,
    pr.current_stage,
    pr.started_at,
    pr.completed_at,
    -- Elapsed minutes; still-open runs are measured against NOW()
    TIMESTAMPDIFF(MINUTE, pr.started_at, COALESCE(pr.completed_at, NOW())) AS duration_minutes,
    -- stats->'$.x' is MySQL shorthand for JSON_EXTRACT(stats, '$.x')
    pr.stats->'$.sources_discovered' AS sources_discovered,
    pr.stats->'$.pages_crawled' AS pages_crawled,
    pr.stats->'$.documents_stored' AS documents_stored,
    pr.stats->'$.approved' AS approved,
    pr.stats->'$.rejected' AS rejected,
    pr.stats->'$.needs_manual_review' AS needs_manual_review,
    pr.export_path,
    pr.error_message
FROM pipeline_runs pr
ORDER BY pr.started_at DESC;
-- QA-loop drill-down: per (run, document) iteration counts, heaviest loops
-- first within each run.
CREATE OR REPLACE VIEW v_pipeline_iterations AS
SELECT
    pit.run_id,
    pr.input_value,
    d.title,
    d.url,
    pit.refactor_count,
    pit.deep_research_count,
    -- COALESCE: the counter columns are nullable in the base table, and
    -- NULL + n would silently make the total NULL instead of treating a
    -- missing counter as zero
    (COALESCE(pit.refactor_count, 0) + COALESCE(pit.deep_research_count, 0)) AS total_iterations,
    pit.final_decision
FROM pipeline_iteration_tracker pit
INNER JOIN pipeline_runs pr ON pit.run_id = pr.run_id
INNER JOIN documents d ON pit.doc_id = d.doc_id
-- BUG FIX: total_iterations is a computed alias, not a column of pit;
-- `pit.total_iterations` is an unknown column and the CREATE VIEW fails.
-- MySQL resolves bare select-list aliases in ORDER BY.
ORDER BY pit.run_id DESC, total_iterations DESC;