feat(reference-curator): Add Claude.ai Projects export format
Add claude-project/ folder with skill files formatted for upload to Claude.ai Projects (web interface): - reference-curator-complete.md: All 6 skills consolidated - INDEX.md: Overview and workflow documentation - Individual skill files (01-06) without YAML frontmatter Add --claude-ai option to install.sh: - Lists available files for upload - Optionally copies to custom destination directory - Provides upload instructions for Claude.ai Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,473 @@
|
||||
# Reference Curator - Complete Skill Set
|
||||
|
||||
This document contains all 6 skills for curating, processing, and exporting reference documentation.
|
||||
|
||||
---
|
||||
|
||||
# 1. Reference Discovery
|
||||
|
||||
Searches for authoritative sources, validates credibility, and produces curated URL lists for crawling.
|
||||
|
||||
## Source Priority Hierarchy
|
||||
|
||||
| Tier | Source Type | Examples |
|
||||
|------|-------------|----------|
|
||||
| **Tier 1** | Official documentation | docs.anthropic.com, docs.claude.com, platform.openai.com/docs |
|
||||
| **Tier 1** | Engineering blogs (official) | anthropic.com/news, openai.com/blog |
|
||||
| **Tier 1** | Official GitHub repos | github.com/anthropics/*, github.com/openai/* |
|
||||
| **Tier 2** | Research papers | arxiv.org, papers with citations |
|
||||
| **Tier 2** | Verified community guides | Cookbook examples, official tutorials |
|
||||
| **Tier 3** | Community content | Blog posts, tutorials, Stack Overflow |
|
||||
|
||||
## Discovery Workflow
|
||||
|
||||
### Step 1: Define Search Scope
|
||||
|
||||
```python
|
||||
search_config = {
|
||||
"topic": "prompt engineering",
|
||||
"vendors": ["anthropic", "openai", "google"],
|
||||
"source_types": ["official_docs", "engineering_blog", "github_repo"],
|
||||
"freshness": "past_year",
|
||||
"max_results_per_query": 20
|
||||
}
|
||||
```
|
||||
|
||||
### Step 2: Generate Search Queries
|
||||
|
||||
```python
|
||||
def generate_queries(topic, vendors):
    """Build web search queries for *topic* across vendor-owned properties.

    Args:
        topic: Free-text search topic (e.g. "prompt engineering").
        vendors: Iterable of vendor slugs (e.g. ["anthropic", "openai"]).

    Returns:
        List of ``site:``-scoped query strings: four per vendor (docs
        subdomain, /docs path, /blog path, GitHub org) plus a single
        arxiv.org query.
    """
    queries = []
    for vendor in vendors:
        queries.append(f"site:docs.{vendor}.com {topic}")
        queries.append(f"site:{vendor}.com/docs {topic}")
        queries.append(f"site:{vendor}.com/blog {topic}")
        queries.append(f"site:github.com/{vendor} {topic}")
    # The arxiv query does not depend on the vendor, so emit it exactly
    # once instead of duplicating it for every vendor inside the loop.
    queries.append(f"site:arxiv.org {topic}")
    return queries
|
||||
```
|
||||
|
||||
### Step 3: Validate and Score Sources
|
||||
|
||||
```python
|
||||
def score_source(url, title):
    """Score a candidate source URL for credibility in [0.0, 1.0].

    Defect fixed: the original additive scores topped out at 0.40, so no
    source could ever reach assign_credibility_tier's tier1 threshold of
    0.60 — even though official docs are documented elsewhere in this
    file as tier1 with a score of 0.85. Scores are now calibrated
    against those thresholds (>=0.60 tier1, >=0.40 tier2).

    Args:
        url: Candidate source URL.
        title: Page title (reserved for future heuristics; currently
            unused, kept for interface compatibility).

    Returns:
        Float credibility score, clamped to 1.0.
    """
    if any(d in url for d in ['docs.anthropic.com', 'docs.claude.com', 'docs.openai.com']):
        score = 0.85  # Tier 1 official docs (matches documented example score)
    elif any(d in url for d in ['anthropic.com', 'openai.com', 'google.dev']):
        score = 0.70  # Tier 1 official blog/news
    elif 'github.com' in url and any(v in url for v in ['anthropics', 'openai', 'google']):
        score = 0.70  # Tier 1 official repos
    elif 'arxiv.org' in url:
        score = 0.50  # Tier 2 research
    else:
        score = 0.20  # Tier 3 community
    return min(score, 1.0)
|
||||
|
||||
def assign_credibility_tier(score):
    """Map a numeric credibility score onto one of three named tiers."""
    # Check thresholds from highest to lowest; first match wins.
    for threshold, tier in ((0.60, 'tier1_official'), (0.40, 'tier2_verified')):
        if score >= threshold:
            return tier
    return 'tier3_community'
|
||||
```
|
||||
|
||||
## Output Format
|
||||
|
||||
```json
|
||||
{
|
||||
"discovery_date": "2025-01-28T10:30:00",
|
||||
"topic": "prompt engineering",
|
||||
"total_urls": 15,
|
||||
"urls": [
|
||||
{
|
||||
"url": "https://docs.anthropic.com/en/docs/prompt-engineering",
|
||||
"title": "Prompt Engineering Guide",
|
||||
"credibility_tier": "tier1_official",
|
||||
"credibility_score": 0.85,
|
||||
"source_type": "official_docs",
|
||||
"vendor": "anthropic"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
# 2. Web Crawler Orchestrator
|
||||
|
||||
Manages crawling operations using Firecrawl MCP with rate limiting and format handling.
|
||||
|
||||
## Crawl Configuration
|
||||
|
||||
```yaml
|
||||
firecrawl:
|
||||
rate_limit:
|
||||
requests_per_minute: 20
|
||||
concurrent_requests: 3
|
||||
default_options:
|
||||
timeout: 30000
|
||||
only_main_content: true
|
||||
```
|
||||
|
||||
## Crawl Workflow
|
||||
|
||||
### Determine Crawl Strategy
|
||||
|
||||
```python
|
||||
def select_strategy(url):
    """Choose a crawl strategy for *url* based on its shape.

    Returns:
        'pdf_extract' for PDF files, 'raw_content' for GitHub blob
        pages, and 'scrape' for everything else (including doc sites).
    """
    if url.endswith('.pdf'):
        return 'pdf_extract'
    if 'github.com' in url and '/blob/' in url:
        return 'raw_content'
    # The original had a separate branch for documentation sites, but it
    # returned 'scrape' just like the default — branches merged.
    return 'scrape'
|
||||
```
|
||||
|
||||
### Execute Firecrawl
|
||||
|
||||
```python
|
||||
# Single page scrape
|
||||
firecrawl_scrape(
|
||||
url="https://docs.anthropic.com/en/docs/prompt-engineering",
|
||||
formats=["markdown"],
|
||||
only_main_content=True,
|
||||
timeout=30000
|
||||
)
|
||||
|
||||
# Multi-page crawl
|
||||
firecrawl_crawl(
|
||||
url="https://docs.anthropic.com/en/docs/",
|
||||
max_depth=2,
|
||||
limit=50,
|
||||
formats=["markdown"]
|
||||
)
|
||||
```
|
||||
|
||||
### Rate Limiting
|
||||
|
||||
```python
|
||||
class RateLimiter:
    """Sliding-window limiter capping requests per rolling 60 seconds."""

    def __init__(self, requests_per_minute=20):
        self.rpm = requests_per_minute
        self.request_times = deque()  # timestamps of recent requests

    def wait_if_needed(self):
        """Block until issuing one more request stays within the limit."""
        now = time.time()
        window = self.request_times
        # Evict timestamps that have aged out of the 60-second window.
        while window and now - window[0] > 60:
            window.popleft()
        if len(window) >= self.rpm:
            # Sleep until the oldest request falls outside the window.
            delay = 60 - (now - window[0])
            if delay > 0:
                time.sleep(delay)
        window.append(time.time())
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
| Error | Action |
|
||||
|-------|--------|
|
||||
| Timeout | Retry once with 2x timeout |
|
||||
| Rate limit (429) | Exponential backoff, max 3 retries |
|
||||
| Not found (404) | Log and skip |
|
||||
| Access denied (403) | Log, mark as `failed` |
|
||||
|
||||
---
|
||||
|
||||
# 3. Content Repository
|
||||
|
||||
Manages MySQL storage for the reference library. Handles document storage, version control, deduplication, and retrieval.
|
||||
|
||||
## Core Operations
|
||||
|
||||
**Store New Document:**
|
||||
```python
|
||||
def store_document(cursor, source_id, title, url, doc_type, raw_content_path):
    """Insert or upsert a crawled document row; return the insert id.

    On a duplicate key the existing row is re-versioned in place:
    version increments, crawl_date refreshes, and the raw content
    path is replaced with the newly crawled one.
    """
    upsert = """
    INSERT INTO documents (source_id, title, url, doc_type, crawl_date, crawl_status, raw_content_path)
    VALUES (%s, %s, %s, %s, NOW(), 'completed', %s)
    ON DUPLICATE KEY UPDATE
        version = version + 1,
        crawl_date = NOW(),
        raw_content_path = VALUES(raw_content_path)
    """
    params = (source_id, title, url, doc_type, raw_content_path)
    cursor.execute(upsert, params)
    return cursor.lastrowid
|
||||
```
|
||||
|
||||
**Check Duplicate:**
|
||||
```python
|
||||
def is_duplicate(cursor, url):
    """Return True if a document with this URL's SHA-256 hash already exists."""
    # Hashing is done in SQL so the comparison matches the stored url_hash column.
    cursor.execute(
        "SELECT doc_id FROM documents WHERE url_hash = SHA2(%s, 256)",
        (url,),
    )
    row = cursor.fetchone()
    return row is not None
|
||||
```
|
||||
|
||||
## Table Quick Reference
|
||||
|
||||
| Table | Purpose | Key Fields |
|
||||
|-------|---------|------------|
|
||||
| `sources` | Authorized content sources | source_type, credibility_tier, vendor |
|
||||
| `documents` | Crawled document metadata | url_hash (dedup), version, crawl_status |
|
||||
| `distilled_content` | Processed summaries | review_status, compression_ratio |
|
||||
| `review_logs` | QA decisions | quality_score, decision |
|
||||
| `topics` | Taxonomy | topic_slug, parent_topic_id |
|
||||
|
||||
## Status Values
|
||||
|
||||
- **crawl_status:** `pending` → `completed` | `failed` | `stale`
|
||||
- **review_status:** `pending` → `in_review` → `approved` | `needs_refactor` | `rejected`
|
||||
- **decision:** `approve` | `refactor` | `deep_research` | `reject`
|
||||
|
||||
---
|
||||
|
||||
# 4. Content Distiller
|
||||
|
||||
Transforms raw crawled content into structured, high-quality reference materials.
|
||||
|
||||
## Distillation Goals
|
||||
|
||||
1. **Compress** - Reduce token count while preserving essential information
|
||||
2. **Structure** - Organize content for easy retrieval and reference
|
||||
3. **Extract** - Pull out code snippets, key concepts, and actionable patterns
|
||||
4. **Annotate** - Add metadata for searchability and categorization
|
||||
|
||||
## Extract Key Components
|
||||
|
||||
**Extract Code Snippets:**
|
||||
```python
|
||||
def extract_code_snippets(content):
    """Find fenced code blocks in markdown *content*.

    Returns a list of dicts carrying the fence language (defaulting to
    "text" when the fence has no language tag), the stripped code body,
    and 200 characters of surrounding context from get_surrounding_text.
    """
    fence = r'```(\w*)\n([\s\S]*?)```'
    return [
        {
            "language": m.group(1) or "text",
            "code": m.group(2).strip(),
            "context": get_surrounding_text(content, m.start(), 200),
        }
        for m in re.finditer(fence, content)
    ]
|
||||
```
|
||||
|
||||
**Extract Key Concepts:**
|
||||
```python
|
||||
def extract_key_concepts(content, title):
    """Ask Claude to pull concepts, techniques, and best practices from a doc.

    Only the first 8000 characters of *content* are included to keep the
    prompt bounded. Returns whatever claude_extract produces from the
    model's JSON reply.
    """
    # Doubled braces in the f-string render as literal braces in the prompt.
    return claude_extract(f"""
    Analyze this document and extract key concepts:

    Title: {title}
    Content: {content[:8000]}

    Return JSON with:
    - concepts: [{{"term": "...", "definition": "...", "importance": "high|medium|low"}}]
    - techniques: [{{"name": "...", "description": "...", "use_case": "..."}}]
    - best_practices: ["..."]
    """)
|
||||
```
|
||||
|
||||
## Summary Template
|
||||
|
||||
```markdown
|
||||
# {title}
|
||||
|
||||
**Source:** {url}
|
||||
**Type:** {source_type} | **Tier:** {credibility_tier}
|
||||
|
||||
## Executive Summary
|
||||
{2-3 sentence overview}
|
||||
|
||||
## Key Concepts
|
||||
{bulleted list of core concepts}
|
||||
|
||||
## Techniques & Patterns
|
||||
{extracted techniques with use cases}
|
||||
|
||||
## Code Examples
|
||||
{relevant code snippets}
|
||||
|
||||
## Best Practices
|
||||
{actionable recommendations}
|
||||
```
|
||||
|
||||
## Quality Metrics
|
||||
|
||||
| Metric | Target |
|
||||
|--------|--------|
|
||||
| Compression Ratio | 25-35% of original |
|
||||
| Key Concept Coverage | ≥90% of important terms |
|
||||
| Code Snippet Retention | 100% of relevant examples |
|
||||
|
||||
---
|
||||
|
||||
# 5. Quality Reviewer
|
||||
|
||||
Evaluates distilled content, routes decisions, and triggers refactoring or additional research.
|
||||
|
||||
## Review Workflow
|
||||
|
||||
```
|
||||
[Distilled Content]
|
||||
│
|
||||
▼
|
||||
┌─────────────────┐
|
||||
│ Score Criteria │ → accuracy, completeness, clarity, PE quality, usability
|
||||
└─────────────────┘
|
||||
│
|
||||
├── ≥ 0.85 → APPROVE → markdown-exporter
|
||||
├── 0.60-0.84 → REFACTOR → content-distiller (with instructions)
|
||||
├── 0.40-0.59 → DEEP_RESEARCH → web-crawler (with queries)
|
||||
└── < 0.40 → REJECT → archive with reason
|
||||
```
|
||||
|
||||
## Scoring Criteria
|
||||
|
||||
| Criterion | Weight | Checks |
|
||||
|-----------|--------|--------|
|
||||
| **Accuracy** | 0.25 | Factual correctness, up-to-date info, proper attribution |
|
||||
| **Completeness** | 0.20 | Covers key concepts, includes examples, addresses edge cases |
|
||||
| **Clarity** | 0.20 | Clear structure, concise language, logical flow |
|
||||
| **PE Quality** | 0.25 | Demonstrates techniques, before/after examples, explains why |
|
||||
| **Usability** | 0.10 | Easy to reference, searchable keywords, appropriate length |
|
||||
|
||||
## Calculate Final Score
|
||||
|
||||
```python
|
||||
# Relative weight of each review criterion; weights sum to 1.0.
WEIGHTS = {
    "accuracy": 0.25,
    "completeness": 0.20,
    "clarity": 0.20,
    "prompt_engineering_quality": 0.25,
    "usability": 0.10,
}


def calculate_quality_score(assessment):
    """Return the weighted sum of per-criterion scores in *assessment*.

    *assessment* maps each criterion key in WEIGHTS to a dict with a
    numeric "score" entry.
    """
    total = 0.0
    for criterion, weight in WEIGHTS.items():
        total += assessment[criterion]["score"] * weight
    return total
|
||||
```
|
||||
|
||||
## Route Decision
|
||||
|
||||
```python
|
||||
def determine_decision(score, assessment):
    """Route a reviewed document based on its quality *score*.

    Returns a (decision, instructions, queries) triple. Unused slots are
    None; for "reject" the second slot carries the rejection reason
    string instead of refactor instructions.
    """
    if score >= 0.85:
        return "approve", None, None
    if score >= 0.60:
        # Good bones, fixable issues: send back to the distiller.
        return "refactor", generate_refactor_instructions(assessment), None
    if score >= 0.40:
        # Content gaps: gather more sources before re-distilling.
        return "deep_research", None, generate_research_queries(assessment)
    return "reject", f"Quality score {score:.2f} below minimum", None
|
||||
```
|
||||
|
||||
## Prompt Engineering Quality Checklist
|
||||
|
||||
- [ ] Demonstrates specific techniques (CoT, few-shot, etc.)
|
||||
- [ ] Shows before/after examples
|
||||
- [ ] Explains *why* techniques work, not just *what*
|
||||
- [ ] Provides actionable patterns
|
||||
- [ ] Includes edge cases and failure modes
|
||||
- [ ] References authoritative sources
|
||||
|
||||
---
|
||||
|
||||
# 6. Markdown Exporter
|
||||
|
||||
Exports approved content as structured markdown files for Claude Projects or fine-tuning.
|
||||
|
||||
## Export Structure
|
||||
|
||||
**Nested by Topic (recommended):**
|
||||
```
|
||||
exports/
|
||||
├── INDEX.md
|
||||
├── prompt-engineering/
|
||||
│ ├── _index.md
|
||||
│ ├── 01-chain-of-thought.md
|
||||
│ └── 02-few-shot-prompting.md
|
||||
├── claude-models/
|
||||
│ ├── _index.md
|
||||
│ └── 01-model-comparison.md
|
||||
└── agent-building/
|
||||
└── 01-tool-use.md
|
||||
```
|
||||
|
||||
## Document File Template
|
||||
|
||||
```python
|
||||
def generate_document_file(doc, include_metadata=True):
    """Render one exported markdown file for *doc*.

    When include_metadata is true, a YAML frontmatter block (title,
    source URL, vendor, credibility tier, quality score to two decimal
    places) precedes the structured content, separated by a blank line.
    """
    lines = []
    if include_metadata:
        lines.extend([
            "---",
            f"title: {doc['title']}",
            f"source: {doc['url']}",
            f"vendor: {doc['vendor']}",
            f"tier: {doc['credibility_tier']}",
            f"quality_score: {doc['quality_score']:.2f}",
            "---",
            "",
        ])
    lines.append(doc['structured_content'])
    return "\n".join(lines)
|
||||
```
|
||||
|
||||
## Fine-tuning Export (JSONL)
|
||||
|
||||
```python
|
||||
def export_fine_tuning_dataset(content_list, config, output_path='fine_tuning.jsonl'):
    """Write approved documents as a chat-format JSONL fine-tuning set.

    Each document becomes one JSON line holding a system/user/assistant
    message triple plus source/topic/quality metadata.

    Args:
        content_list: Dicts with 'title', 'structured_content', 'url',
            'topic_slug', and 'quality_score' keys.
        config: Export configuration (currently unused; kept for
            interface compatibility).
        output_path: Destination file. Defaults to the original
            hard-coded 'fine_tuning.jsonl' for backward compatibility.
    """
    # `with` guarantees the file is closed even if serialization fails.
    with open(output_path, 'w') as f:
        for doc in content_list:
            sample = {
                "messages": [
                    {"role": "system", "content": "You are an expert on AI and prompt engineering."},
                    {"role": "user", "content": f"Explain {doc['title']}"},
                    {"role": "assistant", "content": doc['structured_content']}
                ],
                "metadata": {
                    "source": doc['url'],
                    "topic": doc['topic_slug'],
                    "quality_score": doc['quality_score']
                }
            }
            f.write(json.dumps(sample) + '\n')
|
||||
```
|
||||
|
||||
## Cross-Reference Generation
|
||||
|
||||
```python
|
||||
def add_cross_references(doc, all_docs):
    """Find up to five documents sharing >=2 key concepts with *doc*.

    Concept terms are compared case-insensitively. Results are sorted by
    descending concept overlap and carry a relative path for linking.
    """
    own_terms = {c['term'].lower() for c in doc['key_concepts']}

    related = []
    for candidate in all_docs:
        if candidate['doc_id'] == doc['doc_id']:
            continue  # never cross-reference a document to itself
        shared = own_terms & {c['term'].lower() for c in candidate['key_concepts']}
        if len(shared) >= 2:
            related.append({
                "title": candidate['title'],
                "path": generate_relative_path(doc, candidate),
                "overlap": len(shared),
            })

    related.sort(key=lambda entry: entry['overlap'], reverse=True)
    return related[:5]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
# Integration Flow
|
||||
|
||||
| From | Output | To |
|
||||
|------|--------|-----|
|
||||
| **reference-discovery** | URL manifest | web-crawler |
|
||||
| **web-crawler** | Raw content + manifest | content-repository |
|
||||
| **content-repository** | Document records | content-distiller |
|
||||
| **content-distiller** | Distilled content | quality-reviewer |
|
||||
| **quality-reviewer** (approve) | Approved IDs | markdown-exporter |
|
||||
| **quality-reviewer** (refactor) | Instructions | content-distiller |
|
||||
| **quality-reviewer** (deep_research) | Queries | web-crawler |
|
||||
Reference in New Issue
Block a user