---
name: markdown-exporter
description: Exports approved reference content as structured markdown files for project knowledge or fine-tuning datasets. Generates INDEX files, organizes by topic, and maintains cross-references. Triggers on "export references", "generate project files", "create markdown output", "export for fine-tuning", "build knowledge base".
---

# Markdown Exporter

Exports approved content as structured markdown files for Claude Projects or fine-tuning.

## Export Configuration

```yaml
# ~/.config/reference-curator/export_config.yaml
output:
  base_path: ~/reference-library/exports/
  project_files:
    structure: nested_by_topic  # flat | nested_by_topic | nested_by_source
    index_file: INDEX.md
    include_metadata: true
  fine_tuning:
    format: jsonl
    max_tokens_per_sample: 4096
    include_system_prompt: true
  quality:
    min_score_for_export: 0.80
```

## Export Workflow

### Step 1: Query Approved Content

```python
def get_exportable_content(cursor, min_score=0.80, topic_filter=None):
    """Get all approved content meeting the quality threshold.

    Only the *latest* review per distilled document counts (the
    correlated MAX(review_id) subquery), and that review must be an
    approval at or above min_score.
    """
    sql = """
        SELECT d.doc_id, d.title, d.url,
               dc.summary, dc.key_concepts, dc.code_snippets,
               dc.structured_content,
               t.topic_slug, t.topic_name,
               rl.quality_score,
               s.credibility_tier, s.vendor
        FROM documents d
        JOIN distilled_content dc ON d.doc_id = dc.doc_id
        JOIN document_topics dt ON d.doc_id = dt.doc_id
        JOIN topics t ON dt.topic_id = t.topic_id
        JOIN review_logs rl ON dc.distill_id = rl.distill_id
        JOIN sources s ON d.source_id = s.source_id
        WHERE rl.decision = 'approve'
          AND rl.quality_score >= %s
          AND rl.review_id = (
              SELECT MAX(review_id) FROM review_logs
              WHERE distill_id = dc.distill_id
          )
    """
    params = [min_score]

    if topic_filter:
        # Expand to "IN (%s, %s, ...)" — values still go through
        # parameter binding, so this stays injection-safe.
        sql += " AND t.topic_slug IN (%s)" % ','.join(['%s'] * len(topic_filter))
        params.extend(topic_filter)

    sql += " ORDER BY t.topic_slug, rl.quality_score DESC"
    cursor.execute(sql, params)
    return cursor.fetchall()
```

### Step 2: Organize by Structure

**Nested by Topic (recommended):**

```
exports/
├── INDEX.md
├── prompt-engineering/
│   ├── _index.md
│   ├── 01-chain-of-thought.md
│   ├── 02-few-shot-prompting.md
│   └── 03-system-prompts.md
├── claude-models/
│   ├── _index.md
│   ├── 01-model-comparison.md
│   └── 02-context-windows.md
└── agent-building/
    ├── _index.md
    └── 01-tool-use.md
```

**Flat Structure:**

```
exports/
├── INDEX.md
├── prompt-engineering-chain-of-thought.md
├── prompt-engineering-few-shot.md
└── claude-models-comparison.md
```

### Step 3: Generate Files

**Document File Template:**

```python
def generate_document_file(doc, include_metadata=True):
    """Render one approved document as markdown, with optional YAML front matter."""
    content = []
    if include_metadata:
        content.append("---")
        content.append(f"title: {doc['title']}")
        content.append(f"source: {doc['url']}")
        content.append(f"vendor: {doc['vendor']}")
        content.append(f"tier: {doc['credibility_tier']}")
        content.append(f"quality_score: {doc['quality_score']:.2f}")
        content.append(f"exported: {datetime.now().isoformat()}")
        content.append("---")
        content.append("")
    content.append(doc['structured_content'])
    return "\n".join(content)
```

**Topic Index Template:**

```python
def generate_topic_index(topic_slug, topic_name, documents):
    """Render a topic's _index.md with links to its document files.

    NOTE: the link targets must match the filenames that
    export_project_files actually writes ("NN-slug.md"), so the same
    naming scheme is used here — otherwise every index link is broken.
    """
    content = [
        f"# {topic_name}",
        "",
        f"This section contains {len(documents)} reference documents.",
        "",
        "## Contents",
        ""
    ]
    for i, doc in enumerate(documents, 1):
        # Same scheme as export_project_files: zero-padded rank + slug.
        filename = f"{i:02d}-{slugify(doc['title'])}.md"
        content.append(f"{i}. [{doc['title']}]({filename})")
    return "\n".join(content)
```

**Root INDEX Template:**

```python
def generate_root_index(topics_with_counts, export_date):
    """Render the top-level INDEX.md linking to every topic directory."""
    content = [
        "# Reference Library",
        "",
        f"Exported: {export_date}",
        "",
        "## Topics",
        ""
    ]
    for topic in topics_with_counts:
        content.append(f"- [{topic['name']}]({topic['slug']}/) ({topic['count']} documents)")
    content.extend([
        "",
        "## Quality Standards",
        "",
        "All documents in this library have:",
        "- Passed quality review (score ≥ 0.80)",
        "- Been distilled for conciseness",
        "- Verified source attribution"
    ])
    return "\n".join(content)
```

### Step 4: Write Files

```python
def export_project_files(content_list, config):
    """Write topic directories, per-document files, and the root INDEX.

    All files are written as UTF-8 explicitly — the default encoding of
    Path.write_text is locale-dependent, and the content contains
    non-ASCII characters (e.g. "≥").
    """
    base_path = Path(config['output']['base_path'])
    structure = config['output']['project_files']['structure']

    # Group by topic
    by_topic = defaultdict(list)
    for doc in content_list:
        by_topic[doc['topic_slug']].append(doc)

    # Create directories and files
    for topic_slug, docs in by_topic.items():
        if structure == 'nested_by_topic':
            topic_dir = base_path / topic_slug
            topic_dir.mkdir(parents=True, exist_ok=True)

            # Write topic index
            topic_index = generate_topic_index(topic_slug, docs[0]['topic_name'], docs)
            (topic_dir / '_index.md').write_text(topic_index, encoding='utf-8')

            # Write document files — names must match the topic index links.
            for i, doc in enumerate(docs, 1):
                filename = f"{i:02d}-{slugify(doc['title'])}.md"
                file_content = generate_document_file(doc)
                (topic_dir / filename).write_text(file_content, encoding='utf-8')

    # Write root INDEX
    topics_summary = [
        {"slug": slug, "name": docs[0]['topic_name'], "count": len(docs)}
        for slug, docs in by_topic.items()
    ]
    root_index = generate_root_index(topics_summary, datetime.now().isoformat())
    (base_path / 'INDEX.md').write_text(root_index, encoding='utf-8')
```

### Step 5: Fine-tuning Export (Optional)

```python
def export_fine_tuning_dataset(content_list, config):
    """Export as JSONL for fine-tuning.

    Honors fine_tuning.include_system_prompt from the config (it was
    previously documented but ignored). Output is UTF-8 with
    ensure_ascii=False so non-ASCII content survives verbatim.
    """
    ft_config = config['output']['fine_tuning']
    output_path = Path(config['output']['base_path']) / 'fine_tuning.jsonl'
    max_tokens = ft_config['max_tokens_per_sample']

    with open(output_path, 'w', encoding='utf-8') as f:
        for doc in content_list:
            messages = []
            if ft_config.get('include_system_prompt', True):
                messages.append({
                    "role": "system",
                    "content": "You are an expert on AI and prompt engineering."
                })
            messages.append({
                "role": "user",
                "content": f"Explain {doc['title']}"
            })
            messages.append({
                "role": "assistant",
                "content": truncate_to_tokens(doc['structured_content'], max_tokens)
            })
            sample = {
                "messages": messages,
                "metadata": {
                    "source": doc['url'],
                    "topic": doc['topic_slug'],
                    "quality_score": doc['quality_score']
                }
            }
            f.write(json.dumps(sample, ensure_ascii=False) + '\n')
```

### Step 6: Log Export Job

```python
def log_export_job(cursor, export_name, export_type, output_path,
                   topic_filter, total_docs, total_tokens):
    """Record a completed export run in export_jobs (always status 'completed')."""
    sql = """
        INSERT INTO export_jobs
            (export_name, export_type, output_format, topic_filter,
             output_path, total_documents, total_tokens, status,
             started_at, completed_at)
        VALUES (%s, %s, 'markdown', %s, %s, %s, %s, 'completed', NOW(), NOW())
    """
    cursor.execute(sql, (
        export_name, export_type,
        json.dumps(topic_filter) if topic_filter else None,
        str(output_path), total_docs, total_tokens
    ))
```

## Cross-Reference Generation

Link related documents:

```python
def add_cross_references(doc, all_docs):
    """Find and link related documents.

    Relatedness = number of shared key-concept terms (case-insensitive);
    at least 2 shared terms required. Returns the top 5 by overlap.
    """
    related = []
    doc_concepts = set(c['term'].lower() for c in doc['key_concepts'])

    for other in all_docs:
        if other['doc_id'] == doc['doc_id']:
            continue
        other_concepts = set(c['term'].lower() for c in other['key_concepts'])
        overlap = len(doc_concepts & other_concepts)
        if overlap >= 2:
            related.append({
                "title": other['title'],
                "path": generate_relative_path(doc, other),
                "overlap": overlap
            })

    return sorted(related, key=lambda x: x['overlap'], reverse=True)[:5]
```

## Output Verification

After export, verify:

- [ ] All files readable and valid markdown
- [ ] INDEX.md links resolve correctly
- [ ] No broken cross-references
- [ ] Total token count matches expectation
- [ ] No duplicate content

## Integration

| From | Input | To |
|------|-------|-----|
| quality-reviewer | Approved content IDs | markdown-exporter |
| markdown-exporter | Structured files | Project knowledge / Fine-tuning |