Pipeline Orchestrator: - Add 07-pipeline-orchestrator skill with code/CLAUDE.md and desktop/SKILL.md - Add /reference-curator-pipeline slash command for full workflow automation - Add pipeline_runs and pipeline_iteration_tracker tables to schema.sql - Add v_pipeline_status and v_pipeline_iterations views - Add pipeline_config.yaml configuration template - Update AGENTS.md with Reference Curator Skills section - Update claude-project files with pipeline documentation Skill Format Refactoring: - Extract YAML frontmatter from SKILL.md files to separate skill.yaml - Add tools/ directories with MCP tool documentation - Update SKILL-FORMAT-REQUIREMENTS.md with new structure - Add migrate-skill-structure.py script for format conversion Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
290 lines
8.4 KiB
Markdown
290 lines
8.4 KiB
Markdown
# Markdown Exporter
|
|
|
|
Exports approved content as structured markdown files for Claude Projects or fine-tuning.
|
|
|
|
## Export Configuration
|
|
|
|
```yaml
# ~/.config/reference-curator/export_config.yaml
output:
  base_path: ~/reference-library/exports/

  project_files:
    structure: nested_by_topic  # flat | nested_by_topic | nested_by_source
    index_file: INDEX.md
    include_metadata: true

  fine_tuning:
    format: jsonl
    max_tokens_per_sample: 4096
    include_system_prompt: true

  quality:
    min_score_for_export: 0.80
```
|
|
|
|
## Export Workflow
|
|
|
|
### Step 1: Query Approved Content
|
|
|
|
```python
|
|
def get_exportable_content(cursor, min_score=0.80, topic_filter=None):
    """Fetch approved, quality-gated content joined with topic/source info.

    Args:
        cursor: DB-API cursor using the ``%s`` paramstyle (e.g. psycopg2).
        min_score: Minimum quality score the latest review must have.
        topic_filter: Optional list of topic slugs to restrict the export.

    Returns:
        All matching rows from ``cursor.fetchall()``, ordered by topic slug
        then descending quality score.
    """
    sql = """
        SELECT d.doc_id, d.title, d.url,
               dc.summary, dc.key_concepts, dc.code_snippets, dc.structured_content,
               t.topic_slug, t.topic_name,
               rl.quality_score, s.credibility_tier, s.vendor
        FROM documents d
        JOIN distilled_content dc ON d.doc_id = dc.doc_id
        JOIN document_topics dt ON d.doc_id = dt.doc_id
        JOIN topics t ON dt.topic_id = t.topic_id
        JOIN review_logs rl ON dc.distill_id = rl.distill_id
        JOIN sources s ON d.source_id = s.source_id
        WHERE rl.decision = 'approve'
          AND rl.quality_score >= %s
          -- Only the most recent review of each distillation counts.
          AND rl.review_id = (
              SELECT MAX(review_id) FROM review_logs
              WHERE distill_id = dc.distill_id
          )
    """
    params = [min_score]

    # Guard the truthiness of topic_filter: an empty list would otherwise
    # produce "IN ()", which is invalid SQL. Placeholders are joined and
    # interpolated with an f-string rather than "%"-formatting the SQL.
    if topic_filter:
        placeholders = ', '.join(['%s'] * len(topic_filter))
        sql += f" AND t.topic_slug IN ({placeholders})"
        params.extend(topic_filter)

    sql += " ORDER BY t.topic_slug, rl.quality_score DESC"
    cursor.execute(sql, params)
    return cursor.fetchall()
|
|
```
|
|
|
|
### Step 2: Organize by Structure
|
|
|
|
**Nested by Topic (recommended):**
|
|
```
|
|
exports/
|
|
├── INDEX.md
|
|
├── prompt-engineering/
|
|
│ ├── _index.md
|
|
│ ├── 01-chain-of-thought.md
|
|
│ ├── 02-few-shot-prompting.md
|
|
│ └── 03-system-prompts.md
|
|
├── claude-models/
|
|
│ ├── _index.md
|
|
│ ├── 01-model-comparison.md
|
|
│ └── 02-context-windows.md
|
|
└── agent-building/
|
|
├── _index.md
|
|
└── 01-tool-use.md
|
|
```
|
|
|
|
**Flat Structure:**
|
|
```
|
|
exports/
|
|
├── INDEX.md
|
|
├── prompt-engineering-chain-of-thought.md
|
|
├── prompt-engineering-few-shot.md
|
|
└── claude-models-comparison.md
|
|
```
|
|
|
|
### Step 3: Generate Files
|
|
|
|
**Document File Template:**
|
|
```python
|
|
def generate_document_file(doc, include_metadata=True):
    """Render one exported document as markdown, optionally with frontmatter.

    Args:
        doc: Row mapping holding 'structured_content' plus, when metadata is
            requested, 'title', 'url', 'vendor', 'credibility_tier' and
            'quality_score'.
        include_metadata: Emit a YAML frontmatter header when True.

    Returns:
        The complete file contents as a single newline-joined string.
    """
    lines = []

    if include_metadata:
        # Frontmatter block: key/value header fenced by "---" lines,
        # followed by one blank line before the body.
        lines.extend([
            "---",
            f"title: {doc['title']}",
            f"source: {doc['url']}",
            f"vendor: {doc['vendor']}",
            f"tier: {doc['credibility_tier']}",
            f"quality_score: {doc['quality_score']:.2f}",
            f"exported: {datetime.now().isoformat()}",
            "---",
            "",
        ])

    lines.append(doc['structured_content'])
    return "\n".join(lines)
|
|
```
|
|
|
|
**Topic Index Template:**
|
|
```python
|
|
def generate_topic_index(topic_slug, topic_name, documents):
    """Build the per-topic _index.md listing every document in the topic.

    Args:
        topic_slug: URL-safe topic identifier (kept for interface parity).
        topic_name: Human-readable topic heading.
        documents: Ordered document rows, each with at least a 'title'.

    Returns:
        Markdown index content as a single string.
    """
    content = [
        f"# {topic_name}",
        "",
        f"This section contains {len(documents)} reference documents.",
        "",
        "## Contents",
        ""
    ]

    for i, doc in enumerate(documents, 1):
        # Bug fix: the link target was the literal text "(unknown)" and the
        # computed filename was never used; point the link at the filename.
        # NOTE(review): confirm generate_filename() matches the names
        # actually written by export_project_files ("{i:02d}-{slug}.md").
        filename = generate_filename(doc['title'])
        content.append(f"{i}. [{doc['title']}]({filename})")

    return "\n".join(content)
|
|
```
|
|
|
|
**Root INDEX Template:**
|
|
```python
|
|
def generate_root_index(topics_with_counts, export_date):
    """Render the top-level INDEX.md for an export run.

    Args:
        topics_with_counts: Mappings with 'name', 'slug' and 'count' keys.
        export_date: ISO timestamp string recorded in the header.

    Returns:
        The INDEX.md contents as a single newline-joined string.
    """
    header = [
        "# Reference Library",
        "",
        f"Exported: {export_date}",
        "",
        "## Topics",
        ""
    ]

    # One bullet per topic, linking to the topic directory.
    topic_lines = [
        f"- [{topic['name']}]({topic['slug']}/) ({topic['count']} documents)"
        for topic in topics_with_counts
    ]

    footer = [
        "",
        "## Quality Standards",
        "",
        "All documents in this library have:",
        "- Passed quality review (score ≥ 0.80)",
        "- Been distilled for conciseness",
        "- Verified source attribution"
    ]

    return "\n".join(header + topic_lines + footer)
|
|
```
|
|
|
|
### Step 4: Write Files
|
|
|
|
```python
|
|
def export_project_files(content_list, config):
    """Write the markdown export tree (documents plus indexes) to disk.

    Bug fix: with any structure other than 'nested_by_topic' the original
    referenced ``topic_dir`` before assignment (NameError). The flat layout
    documented in this skill is now implemented: files are written directly
    under the base path with topic-prefixed names and no per-topic index.

    Args:
        content_list: Rows from get_exportable_content().
        config: Parsed export_config.yaml mapping
            (reads output.base_path and output.project_files.structure).
    """
    base_path = Path(config['output']['base_path'])
    structure = config['output']['project_files']['structure']
    nested = structure == 'nested_by_topic'

    # Group documents by topic slug.
    by_topic = defaultdict(list)
    for doc in content_list:
        by_topic[doc['topic_slug']].append(doc)

    for topic_slug, docs in by_topic.items():
        if nested:
            topic_dir = base_path / topic_slug
            topic_dir.mkdir(parents=True, exist_ok=True)
            # Per-topic index only makes sense in the nested layout.
            topic_index = generate_topic_index(topic_slug, docs[0]['topic_name'], docs)
            (topic_dir / '_index.md').write_text(topic_index)
        else:
            topic_dir = base_path
            topic_dir.mkdir(parents=True, exist_ok=True)

        for i, doc in enumerate(docs, 1):
            slug = slugify(doc['title'])
            # Nested: "01-slug.md"; flat: "topic-slug.md" (see skill docs).
            filename = f"{i:02d}-{slug}.md" if nested else f"{topic_slug}-{slug}.md"
            (topic_dir / filename).write_text(generate_document_file(doc))

    # Root INDEX.md summarising every topic.
    topics_summary = [
        {"slug": slug, "name": docs[0]['topic_name'], "count": len(docs)}
        for slug, docs in by_topic.items()
    ]
    root_index = generate_root_index(topics_summary, datetime.now().isoformat())
    (base_path / 'INDEX.md').write_text(root_index)
|
|
```
|
|
|
|
### Step 5: Fine-tuning Export (Optional)
|
|
|
|
```python
|
|
def export_fine_tuning_dataset(content_list, config):
    """Export approved content as a JSONL chat dataset for fine-tuning.

    Each document becomes one system/user/assistant sample, with the
    assistant turn truncated to the configured token budget.

    Args:
        content_list: Rows from get_exportable_content().
        config: Parsed export_config.yaml mapping (reads output.base_path
            and output.fine_tuning.max_tokens_per_sample).
    """
    output_path = Path(config['output']['base_path']) / 'fine_tuning.jsonl'
    max_tokens = config['output']['fine_tuning']['max_tokens_per_sample']

    with open(output_path, 'w') as f:
        for doc in content_list:
            messages = [
                {
                    "role": "system",
                    "content": "You are an expert on AI and prompt engineering."
                },
                {
                    "role": "user",
                    "content": f"Explain {doc['title']}"
                },
                {
                    "role": "assistant",
                    "content": truncate_to_tokens(doc['structured_content'], max_tokens)
                },
            ]
            metadata = {
                "source": doc['url'],
                "topic": doc['topic_slug'],
                "quality_score": doc['quality_score']
            }
            # One JSON object per line (JSONL).
            f.write(json.dumps({"messages": messages, "metadata": metadata}) + '\n')
|
|
```
|
|
|
|
### Step 6: Log Export Job
|
|
|
|
```python
|
|
def log_export_job(cursor, export_name, export_type, output_path,
                   topic_filter, total_docs, total_tokens):
    """Record a completed export run in the export_jobs table.

    Args:
        cursor: DB-API cursor using the ``%s`` paramstyle.
        export_name: Human-readable name for the run.
        export_type: Kind of export performed.
        output_path: Where the export was written (coerced to str).
        topic_filter: Optional list of topic slugs; stored as JSON or NULL.
        total_docs: Number of documents exported.
        total_tokens: Total token count of the exported content.
    """
    # Persist the filter as a JSON array so it remains queryable later;
    # a falsy filter is stored as NULL.
    filter_json = json.dumps(topic_filter) if topic_filter else None

    sql = """
        INSERT INTO export_jobs
          (export_name, export_type, output_format, topic_filter, output_path,
           total_documents, total_tokens, status, started_at, completed_at)
        VALUES (%s, %s, 'markdown', %s, %s, %s, %s, 'completed', NOW(), NOW())
    """
    cursor.execute(sql, (export_name, export_type, filter_json,
                         str(output_path), total_docs, total_tokens))
|
|
```
|
|
|
|
## Cross-Reference Generation
|
|
|
|
Link related documents:
|
|
|
|
```python
|
|
def add_cross_references(doc, all_docs):
    """Return up to five documents most related to *doc* by concept overlap.

    Relatedness is the number of shared lower-cased key-concept terms;
    only candidates sharing at least two concepts qualify.

    Args:
        doc: The document being annotated (needs 'doc_id', 'key_concepts').
        all_docs: Candidate pool, typically the full export set.

    Returns:
        At most five dicts with 'title', 'path' and 'overlap' keys, sorted
        by descending overlap.
    """
    own_terms = {c['term'].lower() for c in doc['key_concepts']}
    related = []

    for candidate in all_docs:
        # Never cross-reference a document to itself.
        if candidate['doc_id'] == doc['doc_id']:
            continue
        shared = own_terms & {c['term'].lower() for c in candidate['key_concepts']}
        if len(shared) >= 2:
            related.append({
                "title": candidate['title'],
                "path": generate_relative_path(doc, candidate),
                "overlap": len(shared),
            })

    related.sort(key=lambda r: r['overlap'], reverse=True)
    return related[:5]
|
|
```
|
|
|
|
## Output Verification
|
|
|
|
After export, verify:
|
|
- [ ] All files readable and valid markdown
|
|
- [ ] INDEX.md links resolve correctly
|
|
- [ ] No broken cross-references
|
|
- [ ] Total token count matches expectation
|
|
- [ ] No duplicate content
|
|
|
|
## Integration
|
|
|
|
| From | Input | To |
|
|
|------|-------|-----|
|
|
| quality-reviewer | Approved content IDs | markdown-exporter |
|
|
| markdown-exporter | Structured files | Project knowledge / Fine-tuning |
|