From 6d7a6d7a88ad835947d1f7ee15beacaf959caa9e Mon Sep 17 00:00:00 2001 From: Andrew Yim Date: Thu, 29 Jan 2026 00:20:27 +0700 Subject: [PATCH] feat(reference-curator): Add portable skill suite for reference documentation curation 6 modular skills for curating, processing, and exporting reference docs: - reference-discovery: Search and validate authoritative sources - web-crawler-orchestrator: Multi-backend crawling (Firecrawl/Node/aiohttp/Scrapy) - content-repository: MySQL storage with version tracking - content-distiller: Summarization and key concept extraction - quality-reviewer: QA loop with approve/refactor/research routing - markdown-exporter: Structured output for Claude Projects or fine-tuning Cross-machine installation support: - Environment-based config (~/.reference-curator.env) - Commands tracked in repo, symlinked during install - install.sh with --minimal, --check, --uninstall modes - Firecrawl MCP as default (always available) Co-Authored-By: Claude Opus 4.5 --- .claude/settings.local.json | 7 +- .../01-reference-discovery/code/CLAUDE.md | 75 ++ .../01-reference-discovery/desktop/SKILL.md | 188 +++++ .../code/CLAUDE.md | 159 ++++ .../desktop/SKILL.md | 234 ++++++ .../03-content-repository/code/CLAUDE.md | 97 +++ .../03-content-repository/desktop/SKILL.md | 162 ++++ .../04-content-distiller/code/CLAUDE.md | 106 +++ .../04-content-distiller/desktop/SKILL.md | 238 ++++++ .../05-quality-reviewer/code/CLAUDE.md | 103 +++ .../05-quality-reviewer/desktop/SKILL.md | 227 ++++++ .../06-markdown-exporter/code/CLAUDE.md | 136 ++++ .../06-markdown-exporter/desktop/SKILL.md | 294 +++++++ .../90-reference-curator/CHANGELOG.md | 168 ++++ custom-skills/90-reference-curator/README.md | 448 +++++++++++ .../commands/content-distiller.md | 92 +++ .../commands/content-repository.md | 94 +++ .../commands/markdown-exporter.md | 138 ++++ .../commands/quality-reviewer.md | 122 +++ .../commands/reference-discovery.md | 72 ++ .../commands/web-crawler.md | 79 ++ 
custom-skills/90-reference-curator/install.sh | 747 ++++++++++++++++++ .../shared/config/crawl_config.yaml | 139 ++++ .../shared/config/db_config.yaml | 31 + .../shared/config/export_config.yaml | 46 ++ .../90-reference-curator/shared/schema.sql | 285 +++++++ 26 files changed, 4486 insertions(+), 1 deletion(-) create mode 100644 custom-skills/90-reference-curator/01-reference-discovery/code/CLAUDE.md create mode 100644 custom-skills/90-reference-curator/01-reference-discovery/desktop/SKILL.md create mode 100644 custom-skills/90-reference-curator/02-web-crawler-orchestrator/code/CLAUDE.md create mode 100644 custom-skills/90-reference-curator/02-web-crawler-orchestrator/desktop/SKILL.md create mode 100644 custom-skills/90-reference-curator/03-content-repository/code/CLAUDE.md create mode 100644 custom-skills/90-reference-curator/03-content-repository/desktop/SKILL.md create mode 100644 custom-skills/90-reference-curator/04-content-distiller/code/CLAUDE.md create mode 100644 custom-skills/90-reference-curator/04-content-distiller/desktop/SKILL.md create mode 100644 custom-skills/90-reference-curator/05-quality-reviewer/code/CLAUDE.md create mode 100644 custom-skills/90-reference-curator/05-quality-reviewer/desktop/SKILL.md create mode 100644 custom-skills/90-reference-curator/06-markdown-exporter/code/CLAUDE.md create mode 100644 custom-skills/90-reference-curator/06-markdown-exporter/desktop/SKILL.md create mode 100644 custom-skills/90-reference-curator/CHANGELOG.md create mode 100644 custom-skills/90-reference-curator/README.md create mode 100644 custom-skills/90-reference-curator/commands/content-distiller.md create mode 100644 custom-skills/90-reference-curator/commands/content-repository.md create mode 100644 custom-skills/90-reference-curator/commands/markdown-exporter.md create mode 100644 custom-skills/90-reference-curator/commands/quality-reviewer.md create mode 100644 custom-skills/90-reference-curator/commands/reference-discovery.md create mode 100644 
custom-skills/90-reference-curator/commands/web-crawler.md create mode 100755 custom-skills/90-reference-curator/install.sh create mode 100644 custom-skills/90-reference-curator/shared/config/crawl_config.yaml create mode 100644 custom-skills/90-reference-curator/shared/config/db_config.yaml create mode 100644 custom-skills/90-reference-curator/shared/config/export_config.yaml create mode 100644 custom-skills/90-reference-curator/shared/schema.sql diff --git a/.claude/settings.local.json b/.claude/settings.local.json index baf737d..df9ff60 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -21,7 +21,12 @@ "Bash(git push:*)", "Bash(chmod:*)", "Bash(python3:*)", - "Bash(git fetch:*)" + "Bash(git fetch:*)", + "Bash(mysql:*)", + "Bash(brew services:*)", + "Bash(source ~/.envrc)", + "mcp__plugin_Notion_notion__notion-fetch", + "mcp__plugin_Notion_notion__notion-update-page" ] } } diff --git a/custom-skills/90-reference-curator/01-reference-discovery/code/CLAUDE.md b/custom-skills/90-reference-curator/01-reference-discovery/code/CLAUDE.md new file mode 100644 index 0000000..8538377 --- /dev/null +++ b/custom-skills/90-reference-curator/01-reference-discovery/code/CLAUDE.md @@ -0,0 +1,75 @@ +# Reference Discovery + +Search and identify authoritative sources for reference materials. Validates source credibility, prioritizes by relevance, and outputs curated URL lists with metadata. 
+ +## Trigger Keywords +"find references", "search documentation", "discover sources", "find authoritative materials", "research topic sources" + +## Source Priority Hierarchy + +| Tier | Source Type | Examples | +|------|-------------|----------| +| **Tier 1** | Official documentation | docs.anthropic.com, docs.claude.com, platform.openai.com/docs | +| **Tier 1** | Engineering blogs (official) | anthropic.com/news, openai.com/blog | +| **Tier 1** | Official GitHub repos | github.com/anthropics/*, github.com/openai/* | +| **Tier 2** | Research papers | arxiv.org, papers with citations | +| **Tier 2** | Verified community guides | Cookbook examples, official tutorials | +| **Tier 3** | Community content | Blog posts, tutorials, Stack Overflow | + +## Workflow + +### Step 1: Define Search Scope +Gather topic, target vendors, and freshness requirements from user input. + +### Step 2: Execute Web Search +Use WebSearch tool with targeted queries: +``` +site:docs.anthropic.com {topic} +site:github.com/anthropics {topic} +site:arxiv.org {topic} +``` + +### Step 3: Score and Validate Sources +Apply credibility scoring: +- Domain credibility (0.10 - 0.40) +- Freshness signals (0.10 - 0.20) +- Relevance signals (0.15) + +### Step 4: Output URL Manifest +Generate JSON manifest for the crawler skill: + +```json +{ + "discovery_date": "2025-01-28T10:30:00", + "topic": "prompt engineering", + "total_urls": 15, + "urls": [ + { + "url": "https://docs.anthropic.com/en/docs/prompt-engineering", + "title": "Prompt Engineering Guide", + "credibility_tier": "tier1_official", + "credibility_score": 0.85, + "source_type": "official_docs", + "vendor": "anthropic" + } + ] +} +``` + +## Scripts + +### `discover_sources.py` +Main discovery script. 
Usage: +```bash +python scripts/discover_sources.py --topic "prompt engineering" --vendors anthropic,openai --output manifest.json +``` + +## Output +- `manifest.json` → Handoff to `02-web-crawler-orchestrator` +- Register new sources in `sources` table via `03-content-repository` + +## Deduplication +Before outputting: +- Normalize URLs (remove trailing slashes, query params) +- Check against existing `documents` table +- Merge duplicates, keeping highest credibility score diff --git a/custom-skills/90-reference-curator/01-reference-discovery/desktop/SKILL.md b/custom-skills/90-reference-curator/01-reference-discovery/desktop/SKILL.md new file mode 100644 index 0000000..13d90a1 --- /dev/null +++ b/custom-skills/90-reference-curator/01-reference-discovery/desktop/SKILL.md @@ -0,0 +1,188 @@ +--- +name: reference-discovery +description: Search and identify authoritative sources for reference materials. Validates source credibility, prioritizes by relevance, and outputs curated URL lists with metadata. Triggers on "find references", "search documentation", "discover sources", "find authoritative materials", "research topic sources". +--- + +# Reference Discovery + +Searches for authoritative sources, validates credibility, and produces curated URL lists for crawling. 
+ +## Source Priority Hierarchy + +| Tier | Source Type | Examples | +|------|-------------|----------| +| **Tier 1** | Official documentation | docs.anthropic.com, docs.claude.com, platform.openai.com/docs | +| **Tier 1** | Engineering blogs (official) | anthropic.com/news, openai.com/blog | +| **Tier 1** | Official GitHub repos | github.com/anthropics/*, github.com/openai/* | +| **Tier 2** | Research papers | arxiv.org, papers with citations | +| **Tier 2** | Verified community guides | Cookbook examples, official tutorials | +| **Tier 3** | Community content | Blog posts, tutorials, Stack Overflow | + +## Discovery Workflow + +### Step 1: Define Search Scope + +```python +search_config = { + "topic": "prompt engineering", + "vendors": ["anthropic", "openai", "google"], + "source_types": ["official_docs", "engineering_blog", "github_repo"], + "freshness": "past_year", # past_week, past_month, past_year, any + "max_results_per_query": 20 +} +``` + +### Step 2: Generate Search Queries + +For a given topic, generate targeted queries: + +```python +def generate_queries(topic, vendors): + queries = [] + + # Official documentation queries + for vendor in vendors: + queries.append(f"site:docs.{vendor}.com {topic}") + queries.append(f"site:{vendor}.com/docs {topic}") + + # Engineering blog queries + for vendor in vendors: + queries.append(f"site:{vendor}.com/blog {topic}") + queries.append(f"site:{vendor}.com/news {topic}") + + # GitHub queries + for vendor in vendors: + queries.append(f"site:github.com/{vendor} {topic}") + + # Research queries + queries.append(f"site:arxiv.org {topic}") + + return queries +``` + +### Step 3: Execute Search + +Use web search tool for each query: + +```python +def execute_discovery(queries): + results = [] + for query in queries: + search_results = web_search(query) + for result in search_results: + results.append({ + "url": result.url, + "title": result.title, + "snippet": result.snippet, + "query_used": query + }) + return 
deduplicate_by_url(results) +``` + +### Step 4: Validate and Score Sources + +```python +def score_source(url, title): + score = 0.0 + + # Domain credibility + if any(d in url for d in ['docs.anthropic.com', 'docs.claude.com', 'docs.openai.com']): + score += 0.40 # Tier 1 official docs + elif any(d in url for d in ['anthropic.com', 'openai.com', 'google.dev']): + score += 0.30 # Tier 1 official blog/news + elif 'github.com' in url and any(v in url for v in ['anthropics', 'openai', 'google']): + score += 0.30 # Tier 1 official repos + elif 'arxiv.org' in url: + score += 0.20 # Tier 2 research + else: + score += 0.10 # Tier 3 community + + # Freshness signals (from title/snippet) + if any(year in title for year in ['2025', '2024']): + score += 0.20 + elif any(year in title for year in ['2023']): + score += 0.10 + + # Relevance signals + if any(kw in title.lower() for kw in ['guide', 'documentation', 'tutorial', 'best practices']): + score += 0.15 + + return min(score, 1.0) + +def assign_credibility_tier(score): + if score >= 0.60: + return 'tier1_official' + elif score >= 0.40: + return 'tier2_verified' + else: + return 'tier3_community' +``` + +### Step 5: Output URL Manifest + +```python +def create_manifest(scored_results, topic): + manifest = { + "discovery_date": datetime.now().isoformat(), + "topic": topic, + "total_urls": len(scored_results), + "urls": [] + } + + for result in sorted(scored_results, key=lambda x: x['score'], reverse=True): + manifest["urls"].append({ + "url": result["url"], + "title": result["title"], + "credibility_tier": result["tier"], + "credibility_score": result["score"], + "source_type": infer_source_type(result["url"]), + "vendor": infer_vendor(result["url"]) + }) + + return manifest +``` + +## Output Format + +Discovery produces a JSON manifest for the crawler: + +```json +{ + "discovery_date": "2025-01-28T10:30:00", + "topic": "prompt engineering", + "total_urls": 15, + "urls": [ + { + "url": 
"https://docs.anthropic.com/en/docs/prompt-engineering", + "title": "Prompt Engineering Guide", + "credibility_tier": "tier1_official", + "credibility_score": 0.85, + "source_type": "official_docs", + "vendor": "anthropic" + } + ] +} +``` + +## Known Authoritative Sources + +Pre-validated sources for common topics: + +| Vendor | Documentation | Blog/News | GitHub | +|--------|--------------|-----------|--------| +| Anthropic | docs.anthropic.com, docs.claude.com | anthropic.com/news | github.com/anthropics | +| OpenAI | platform.openai.com/docs | openai.com/blog | github.com/openai | +| Google | ai.google.dev/docs | blog.google/technology/ai | github.com/google | + +## Integration + +**Output:** URL manifest JSON → `web-crawler-orchestrator` + +**Database:** Register new sources in `sources` table via `content-repository` + +## Deduplication + +Before outputting, deduplicate URLs: +- Normalize URLs (remove trailing slashes, query params) +- Check against existing `documents` table via `content-repository` +- Merge duplicate entries, keeping highest credibility score diff --git a/custom-skills/90-reference-curator/02-web-crawler-orchestrator/code/CLAUDE.md b/custom-skills/90-reference-curator/02-web-crawler-orchestrator/code/CLAUDE.md new file mode 100644 index 0000000..c4e05d2 --- /dev/null +++ b/custom-skills/90-reference-curator/02-web-crawler-orchestrator/code/CLAUDE.md @@ -0,0 +1,159 @@ +# Web Crawler Orchestrator + +Orchestrates web crawling with intelligent backend selection. Automatically chooses the best crawler based on site characteristics. 
+ +## Trigger Keywords +"crawl URLs", "fetch documents", "scrape pages", "download references" + +## Intelligent Crawler Selection + +Claude automatically selects the optimal crawler based on the request: + +| Crawler | Best For | Auto-Selected When | +|---------|----------|-------------------| +| **Node.js** (default) | Small docs sites | ≤50 pages, static content | +| **Python aiohttp** | Technical docs | ≤200 pages, needs SEO data | +| **Scrapy** | Enterprise crawls | >200 pages, multi-domain | +| **Firecrawl MCP** | Dynamic sites | SPAs, JS-rendered content | + +### Decision Flow + +``` +[Crawl Request] + │ + ├─ Is it SPA/React/Vue/Angular? → Firecrawl MCP + │ + ├─ >200 pages or multi-domain? → Scrapy + │ + ├─ Needs SEO extraction? → Python aiohttp + │ + └─ Default (small site) → Node.js +``` + +## Crawler Backends + +### Node.js (Default) +Fast, lightweight crawler for small documentation sites. +```bash +cd ~/Project/our-seo-agent/util/js-crawler +node src/crawler.js <url> --max-pages 50 +``` + +### Python aiohttp +Async crawler with full SEO extraction. +```bash +cd ~/Project/our-seo-agent +python -m seo_agent.crawler --url <url> --max-pages 100 +``` + +### Scrapy +Enterprise-grade crawler with pipelines. +```bash +cd ~/Project/our-seo-agent +scrapy crawl seo_spider -a start_url=<url> -a max_pages=500 +``` + +### Firecrawl MCP +Use MCP tools for JavaScript-heavy sites: +``` +firecrawl_scrape(url, formats=["markdown"], only_main_content=true) +firecrawl_crawl(url, max_depth=2, limit=50) +firecrawl_map(url, limit=100) # Discover URLs first +``` + +## Workflow + +### Step 1: Analyze Target Site +Determine site characteristics: +- Is it a SPA? (React, Vue, Angular, Next.js) +- How many pages expected? +- Does it need JavaScript rendering? +- Is SEO data extraction needed? + +### Step 2: Select Crawler +Based on analysis, select the appropriate backend. 
+ +### Step 3: Load URL Manifest +```bash +# From reference-discovery output +cat manifest.json | jq '.urls[].url' +``` + +### Step 4: Execute Crawl + +**For Node.js:** +```bash +cd ~/Project/our-seo-agent/util/js-crawler +for url in $(cat urls.txt); do + node src/crawler.js "$url" --max-pages 50 + sleep 2 +done +``` + +**For Firecrawl MCP (Claude Desktop/Code):** +Use the firecrawl MCP tools directly in conversation. + +### Step 5: Save Raw Content +``` +~/reference-library/raw/ +└── 2025/01/ + ├── a1b2c3d4.md + └── b2c3d4e5.md +``` + +### Step 6: Generate Crawl Manifest +```json +{ + "crawl_date": "2025-01-28T12:00:00", + "crawler_used": "nodejs", + "total_crawled": 45, + "total_failed": 5, + "documents": [...] +} +``` + +## Rate Limiting + +All crawlers respect these limits: +- 20 requests/minute +- 3 concurrent requests +- Exponential backoff on 429/5xx + +## Error Handling + +| Error | Action | +|-------|--------| +| Timeout | Retry once with 2x timeout | +| Rate limit (429) | Exponential backoff, max 3 retries | +| Not found (404) | Log and skip | +| Access denied (403) | Log, mark as `failed` | +| JS rendering needed | Switch to Firecrawl | + +## Site Type Detection + +Indicators for automatic routing: + +**SPA (→ Firecrawl):** +- URL contains `#/` or uses hash routing +- Page source shows React/Vue/Angular markers +- Content loads dynamically after initial load + +**Static docs (→ Node.js/aiohttp):** +- Built with Hugo, Jekyll, MkDocs, Docusaurus, GitBook +- Clean HTML structure +- Server-side rendered + +## Scripts + +- `scripts/select_crawler.py` - Intelligent crawler selection +- `scripts/crawl_with_nodejs.sh` - Node.js wrapper +- `scripts/crawl_with_aiohttp.sh` - Python wrapper +- `scripts/crawl_with_firecrawl.py` - Firecrawl MCP wrapper + +## Integration + +| From | To | +|------|-----| +| reference-discovery | URL manifest input | +| → | content-repository (crawl manifest + raw files) | +| quality-reviewer (deep_research) | Additional crawl requests | 
diff --git a/custom-skills/90-reference-curator/02-web-crawler-orchestrator/desktop/SKILL.md b/custom-skills/90-reference-curator/02-web-crawler-orchestrator/desktop/SKILL.md new file mode 100644 index 0000000..6762c87 --- /dev/null +++ b/custom-skills/90-reference-curator/02-web-crawler-orchestrator/desktop/SKILL.md @@ -0,0 +1,234 @@ +--- +name: web-crawler-orchestrator +description: Orchestrates web crawling using Firecrawl MCP. Handles rate limiting, selects crawl strategies, manages formats (HTML/PDF/markdown), and produces raw content with manifests. Triggers on "crawl URLs", "fetch documents", "scrape pages", "download references", "Firecrawl crawl". +--- + +# Web Crawler Orchestrator + +Manages crawling operations using Firecrawl MCP with rate limiting and format handling. + +## Prerequisites + +- Firecrawl MCP server connected +- Config file at `~/.config/reference-curator/crawl_config.yaml` +- Storage directory exists: `~/reference-library/raw/` + +## Crawl Configuration + +```yaml +# ~/.config/reference-curator/crawl_config.yaml +firecrawl: + rate_limit: + requests_per_minute: 20 + concurrent_requests: 3 + default_options: + timeout: 30000 + only_main_content: true + include_html: false + +processing: + max_content_size_mb: 50 + raw_content_dir: ~/reference-library/raw/ +``` + +## Crawl Workflow + +### Step 1: Load URL Manifest + +Receive manifest from `reference-discovery`: + +```python +def load_manifest(manifest_path): + with open(manifest_path) as f: + manifest = json.load(f) + return manifest["urls"] +``` + +### Step 2: Determine Crawl Strategy + +```python +def select_strategy(url): + """Select optimal crawl strategy based on URL characteristics.""" + + if url.endswith('.pdf'): + return 'pdf_extract' + elif 'github.com' in url and '/blob/' in url: + return 'raw_content' # Get raw file content + elif 'github.com' in url: + return 'scrape' # Repository pages + elif any(d in url for d in ['docs.', 'documentation']): + return 'scrape' # Documentation 
sites + else: + return 'scrape' # Default +``` + +### Step 3: Execute Firecrawl + +Use Firecrawl MCP for crawling: + +```python +# Single page scrape +firecrawl_scrape( + url="https://docs.anthropic.com/en/docs/prompt-engineering", + formats=["markdown"], # markdown | html | screenshot + only_main_content=True, + timeout=30000 +) + +# Multi-page crawl (documentation sites) +firecrawl_crawl( + url="https://docs.anthropic.com/en/docs/", + max_depth=2, + limit=50, + formats=["markdown"], + only_main_content=True +) +``` + +### Step 4: Rate Limiting + +```python +import time +from collections import deque + +class RateLimiter: + def __init__(self, requests_per_minute=20): + self.rpm = requests_per_minute + self.request_times = deque() + + def wait_if_needed(self): + now = time.time() + # Remove requests older than 1 minute + while self.request_times and now - self.request_times[0] > 60: + self.request_times.popleft() + + if len(self.request_times) >= self.rpm: + wait_time = 60 - (now - self.request_times[0]) + if wait_time > 0: + time.sleep(wait_time) + + self.request_times.append(time.time()) +``` + +### Step 5: Save Raw Content + +```python +import hashlib +from pathlib import Path + +def save_content(url, content, content_type='markdown'): + """Save crawled content to raw storage.""" + + # Generate filename from URL hash + url_hash = hashlib.sha256(url.encode()).hexdigest()[:16] + + # Determine extension + ext_map = {'markdown': '.md', 'html': '.html', 'pdf': '.pdf'} + ext = ext_map.get(content_type, '.txt') + + # Create dated subdirectory + date_dir = datetime.now().strftime('%Y/%m') + output_dir = Path.home() / 'reference-library/raw' / date_dir + output_dir.mkdir(parents=True, exist_ok=True) + + # Save file + filepath = output_dir / f"{url_hash}{ext}" + if content_type == 'pdf': + filepath.write_bytes(content) + else: + filepath.write_text(content, encoding='utf-8') + + return str(filepath) +``` + +### Step 6: Generate Crawl Manifest + +```python +def 
create_crawl_manifest(results): + manifest = { + "crawl_date": datetime.now().isoformat(), + "total_crawled": len([r for r in results if r["status"] == "success"]), + "total_failed": len([r for r in results if r["status"] == "failed"]), + "documents": [] + } + + for result in results: + manifest["documents"].append({ + "url": result["url"], + "status": result["status"], + "raw_content_path": result.get("filepath"), + "content_size": result.get("size"), + "crawl_method": "firecrawl", + "error": result.get("error") + }) + + return manifest +``` + +## Error Handling + +| Error | Action | +|-------|--------| +| Timeout | Retry once with 2x timeout | +| Rate limit (429) | Exponential backoff, max 3 retries | +| Not found (404) | Log and skip | +| Access denied (403) | Log, mark as `failed` | +| Connection error | Retry with backoff | + +```python +def crawl_with_retry(url, max_retries=3): + for attempt in range(max_retries): + try: + result = firecrawl_scrape(url) + return {"status": "success", "content": result} + except RateLimitError: + wait = 2 ** attempt * 10 # 10, 20, 40 seconds + time.sleep(wait) + except TimeoutError: + if attempt == 0: + # Retry with doubled timeout + result = firecrawl_scrape(url, timeout=60000) + return {"status": "success", "content": result} + except NotFoundError: + return {"status": "failed", "error": "404 Not Found"} + except Exception as e: + if attempt == max_retries - 1: + return {"status": "failed", "error": str(e)} + + return {"status": "failed", "error": "Max retries exceeded"} +``` + +## Firecrawl MCP Reference + +**scrape** - Single page: +``` +firecrawl_scrape(url, formats, only_main_content, timeout) +``` + +**crawl** - Multi-page: +``` +firecrawl_crawl(url, max_depth, limit, formats, only_main_content) +``` + +**map** - Discover URLs: +``` +firecrawl_map(url, limit) # Returns list of URLs on site +``` + +## Integration + +| From | Input | To | +|------|-------|-----| +| reference-discovery | URL manifest | 
web-crawler-orchestrator | +| web-crawler-orchestrator | Crawl manifest + raw files | content-repository | +| quality-reviewer (deep_research) | Additional queries | reference-discovery → here | + +## Output Structure + +``` +~/reference-library/raw/ +└── 2025/01/ + ├── a1b2c3d4e5f6g7h8.md # Markdown content + ├── b2c3d4e5f6g7h8i9.md + └── c3d4e5f6g7h8i9j0.pdf # PDF documents +``` diff --git a/custom-skills/90-reference-curator/03-content-repository/code/CLAUDE.md b/custom-skills/90-reference-curator/03-content-repository/code/CLAUDE.md new file mode 100644 index 0000000..7c0ad07 --- /dev/null +++ b/custom-skills/90-reference-curator/03-content-repository/code/CLAUDE.md @@ -0,0 +1,97 @@ +# Content Repository + +MySQL storage management for the reference library. Handles document storage, version control, deduplication, and retrieval. + +## Trigger Keywords +"store content", "save to database", "check duplicates", "version tracking", "document retrieval", "reference library DB" + +## Prerequisites + +- MySQL 8.0+ with utf8mb4 charset +- Config file at `~/.config/reference-curator/db_config.yaml` +- Database `reference_library` initialized + +## Database Setup + +```bash +# Initialize database +mysql -u root -p < references/schema.sql + +# Verify tables +mysql -u root -p reference_library -e "SHOW TABLES;" +``` + +## Core Scripts + +### Store Document +```bash +python scripts/store_document.py \ + --source-id 1 \ + --title "Prompt Engineering Guide" \ + --url "https://docs.anthropic.com/..." \ + --doc-type webpage \ + --raw-path ~/reference-library/raw/2025/01/abc123.md +``` + +### Check Duplicate +```bash +python scripts/check_duplicate.py --url "https://docs.anthropic.com/..." 
+``` + +### Query by Topic +```bash +python scripts/query_topic.py --topic-slug prompt-engineering --min-quality 0.80 +``` + +## Table Quick Reference + +| Table | Purpose | Key Fields | +|-------|---------|------------| +| `sources` | Authorized sources | source_type, credibility_tier, vendor | +| `documents` | Document metadata | url_hash (dedup), version, crawl_status | +| `distilled_content` | Processed summaries | review_status, compression_ratio | +| `review_logs` | QA decisions | quality_score, decision | +| `topics` | Taxonomy | topic_slug, parent_topic_id | +| `document_topics` | Many-to-many links | relevance_score | +| `export_jobs` | Export tracking | export_type, status | + +## Status Values + +**crawl_status:** `pending` → `completed` | `failed` | `stale` + +**review_status:** `pending` → `in_review` → `approved` | `needs_refactor` | `rejected` + +## Common Queries + +### Find Stale Documents +```bash +python scripts/find_stale.py --output stale_docs.json +``` + +### Get Pending Reviews +```bash +python scripts/pending_reviews.py --output pending.json +``` + +### Export-Ready Content +```bash +python scripts/export_ready.py --min-score 0.85 --output ready.json +``` + +## Scripts + +- `scripts/store_document.py` - Store new document +- `scripts/check_duplicate.py` - URL deduplication +- `scripts/query_topic.py` - Query by topic +- `scripts/find_stale.py` - Find stale documents +- `scripts/pending_reviews.py` - Get pending reviews +- `scripts/db_utils.py` - Database connection utilities + +## Integration + +| From | Action | To | +|------|--------|-----| +| crawler-orchestrator | Store crawled content | → | +| → | Query pending docs | content-distiller | +| quality-reviewer | Update review_status | → | +| → | Query approved content | markdown-exporter | diff --git a/custom-skills/90-reference-curator/03-content-repository/desktop/SKILL.md b/custom-skills/90-reference-curator/03-content-repository/desktop/SKILL.md new file mode 100644 index 
0000000..efa5c7c --- /dev/null +++ b/custom-skills/90-reference-curator/03-content-repository/desktop/SKILL.md @@ -0,0 +1,162 @@ +--- +name: content-repository +description: MySQL storage management for reference library. Use when storing crawled content, managing document versions, deduplicating URLs, querying stored references, or tracking document metadata. Triggers on keywords like "store content", "save to database", "check duplicates", "version tracking", "document retrieval", "reference library DB". +--- + +# Content Repository + +Manages MySQL storage for the reference library system. Handles document storage, version control, deduplication, and retrieval. + +## Prerequisites + +- MySQL 8.0+ with utf8mb4 charset +- Config file at `~/.config/reference-curator/db_config.yaml` +- Database `reference_library` initialized with schema + +## Quick Reference + +### Connection Setup + +```python +import yaml +import os +from pathlib import Path + +def get_db_config(): + config_path = Path.home() / ".config/reference-curator/db_config.yaml" + with open(config_path) as f: + config = yaml.safe_load(f) + + # Resolve environment variables + mysql = config['mysql'] + return { + 'host': mysql['host'], + 'port': mysql['port'], + 'database': mysql['database'], + 'user': os.environ.get('MYSQL_USER', mysql.get('user', '')), + 'password': os.environ.get('MYSQL_PASSWORD', mysql.get('password', '')), + 'charset': mysql['charset'] + } +``` + +### Core Operations + +**Store New Document:** +```python +def store_document(cursor, source_id, title, url, doc_type, raw_content_path): + sql = """ + INSERT INTO documents (source_id, title, url, doc_type, crawl_date, crawl_status, raw_content_path) + VALUES (%s, %s, %s, %s, NOW(), 'completed', %s) + ON DUPLICATE KEY UPDATE + version = version + 1, + previous_version_id = doc_id, + crawl_date = NOW(), + raw_content_path = VALUES(raw_content_path) + """ + cursor.execute(sql, (source_id, title, url, doc_type, raw_content_path)) + return 
cursor.lastrowid +``` + +**Check Duplicate:** +```python +def is_duplicate(cursor, url): + cursor.execute("SELECT doc_id FROM documents WHERE url_hash = SHA2(%s, 256)", (url,)) + return cursor.fetchone() is not None +``` + +**Get Document by Topic:** +```python +def get_docs_by_topic(cursor, topic_slug, min_quality=0.80): + sql = """ + SELECT d.doc_id, d.title, d.url, dc.structured_content, dc.quality_score + FROM documents d + JOIN document_topics dt ON d.doc_id = dt.doc_id + JOIN topics t ON dt.topic_id = t.topic_id + LEFT JOIN distilled_content dc ON d.doc_id = dc.doc_id + WHERE t.topic_slug = %s + AND (dc.review_status = 'approved' OR dc.review_status IS NULL) + ORDER BY dt.relevance_score DESC + """ + cursor.execute(sql, (topic_slug,)) + return cursor.fetchall() +``` + +## Table Quick Reference + +| Table | Purpose | Key Fields | +|-------|---------|------------| +| `sources` | Authorized content sources | source_type, credibility_tier, vendor | +| `documents` | Crawled document metadata | url_hash (dedup), version, crawl_status | +| `distilled_content` | Processed summaries | review_status, compression_ratio | +| `review_logs` | QA decisions | quality_score, decision, refactor_instructions | +| `topics` | Taxonomy | topic_slug, parent_topic_id | +| `document_topics` | Many-to-many linking | relevance_score | +| `export_jobs` | Export tracking | export_type, output_format, status | + +## Status Values + +**crawl_status:** `pending` → `completed` | `failed` | `stale` + +**review_status:** `pending` → `in_review` → `approved` | `needs_refactor` | `rejected` + +**decision (review):** `approve` | `refactor` | `deep_research` | `reject` + +## Common Queries + +### Find Stale Documents (needs re-crawl) +```sql +SELECT d.doc_id, d.title, d.url, d.crawl_date +FROM documents d +JOIN crawl_schedule cs ON d.source_id = cs.source_id +WHERE d.crawl_date < DATE_SUB(NOW(), INTERVAL + CASE cs.frequency + WHEN 'daily' THEN 1 + WHEN 'weekly' THEN 7 + WHEN 'biweekly' THEN 14 + 
WHEN 'monthly' THEN 30 + END DAY) +AND cs.is_enabled = TRUE; +``` + +### Get Pending Reviews +```sql +SELECT dc.distill_id, d.title, d.url, dc.token_count_distilled +FROM distilled_content dc +JOIN documents d ON dc.doc_id = d.doc_id +WHERE dc.review_status = 'pending' +ORDER BY dc.distill_date ASC; +``` + +### Export-Ready Content +```sql +SELECT d.title, d.url, dc.structured_content, t.topic_slug +FROM documents d +JOIN distilled_content dc ON d.doc_id = dc.doc_id +JOIN document_topics dt ON d.doc_id = dt.doc_id +JOIN topics t ON dt.topic_id = t.topic_id +JOIN review_logs rl ON dc.distill_id = rl.distill_id +WHERE rl.decision = 'approve' +AND rl.quality_score >= 0.85 +ORDER BY t.topic_slug, dt.relevance_score DESC; +``` + +## Workflow Integration + +1. **From crawler-orchestrator:** Receive URL + raw content path → `store_document()` +2. **To content-distiller:** Query pending documents → send for processing +3. **From quality-reviewer:** Update `review_status` based on decision +4. **To markdown-exporter:** Query approved content by topic + +## Error Handling + +- **Duplicate URL:** Silent update (version increment) via `ON DUPLICATE KEY UPDATE` +- **Missing source_id:** Validate against `sources` table before insert +- **Connection failure:** Implement retry with exponential backoff + +## Full Schema Reference + +See `references/schema.sql` for complete table definitions including indexes and constraints. + +## Config File Template + +See `references/db_config_template.yaml` for connection configuration template. diff --git a/custom-skills/90-reference-curator/04-content-distiller/code/CLAUDE.md b/custom-skills/90-reference-curator/04-content-distiller/code/CLAUDE.md new file mode 100644 index 0000000..4f33b85 --- /dev/null +++ b/custom-skills/90-reference-curator/04-content-distiller/code/CLAUDE.md @@ -0,0 +1,106 @@ +# Content Distiller + +Analyzes and distills raw crawled content into concise reference materials. 
Extracts key concepts, code snippets, and creates structured summaries. + +## Trigger Keywords +"distill content", "summarize document", "extract key concepts", "process raw content", "create reference summary" + +## Goals + +1. **Compress** - Reduce token count while preserving essential information +2. **Structure** - Organize content for easy retrieval +3. **Extract** - Pull out code snippets, key concepts, patterns +4. **Annotate** - Add metadata for searchability + +## Workflow + +### Step 1: Load Raw Content +```bash +python scripts/load_pending.py --output pending_docs.json +``` + +### Step 2: Analyze Content Structure +Identify document characteristics: +- Has code blocks? +- Has headers? +- Has tables? +- Estimated tokens? + +### Step 3: Extract Key Components +```bash +python scripts/extract_components.py --doc-id 123 --output components.json +``` + +Extracts: +- Code snippets with language tags +- Key concepts and definitions +- Best practices +- Techniques and patterns + +### Step 4: Create Structured Summary +Output template: +```markdown +# {title} + +**Source:** {url} +**Type:** {source_type} | **Tier:** {credibility_tier} +**Distilled:** {date} + +## Executive Summary +{2-3 sentence overview} + +## Key Concepts +{bulleted list with definitions} + +## Techniques & Patterns +{extracted techniques with use cases} + +## Code Examples +{relevant code snippets} + +## Best Practices +{actionable recommendations} +``` + +### Step 5: Optimize for Tokens +Target: 25-35% of original token count +```bash +python scripts/optimize_content.py --doc-id 123 --target-ratio 0.30 +``` + +### Step 6: Store Distilled Content +```bash +python scripts/store_distilled.py --doc-id 123 --content distilled.md +``` + +## Quality Metrics + +| Metric | Target | +|--------|--------| +| Compression Ratio | 25-35% of original | +| Key Concept Coverage | ≥90% of important terms | +| Code Snippet Retention | 100% of relevant examples | +| Readability | Clear, scannable structure | + 
+## Handling Refactor Requests + +When `quality-reviewer` returns `refactor`: +```bash +python scripts/refactor_content.py --distill-id 456 --instructions "Add more examples" +``` + +## Scripts + +- `scripts/load_pending.py` - Load documents pending distillation +- `scripts/extract_components.py` - Extract code, concepts, patterns +- `scripts/optimize_content.py` - Token optimization +- `scripts/store_distilled.py` - Save to database +- `scripts/refactor_content.py` - Handle refactor requests + +## Integration + +| From | Input | To | +|------|-------|-----| +| content-repository | Raw document records | content-distiller | +| content-distiller | Distilled content | quality-reviewer | +| quality-reviewer | Refactor instructions | content-distiller (loop) | diff --git a/custom-skills/90-reference-curator/04-content-distiller/desktop/SKILL.md b/custom-skills/90-reference-curator/04-content-distiller/desktop/SKILL.md new file mode 100644 index 0000000..40aecfc --- /dev/null +++ b/custom-skills/90-reference-curator/04-content-distiller/desktop/SKILL.md @@ -0,0 +1,238 @@ +--- +name: content-distiller +description: Analyzes and distills raw crawled content into concise reference materials. Extracts key concepts, code snippets, and creates structured summaries optimized for prompt engineering use cases. Triggers on "distill content", "summarize document", "extract key concepts", "process raw content", "create reference summary". +--- + +# Content Distiller + +Transforms raw crawled content into structured, high-quality reference materials. + +## Distillation Goals + +1. **Compress** - Reduce token count while preserving essential information +2. **Structure** - Organize content for easy retrieval and reference +3. **Extract** - Pull out code snippets, key concepts, and actionable patterns +4. 
**Annotate** - Add metadata for searchability and categorization + +## Distillation Workflow + +### Step 1: Load Raw Content + +```python +def load_for_distillation(cursor): + """Get documents ready for distillation.""" + sql = """ + SELECT d.doc_id, d.title, d.url, d.raw_content_path, + d.doc_type, s.source_type, s.credibility_tier + FROM documents d + JOIN sources s ON d.source_id = s.source_id + LEFT JOIN distilled_content dc ON d.doc_id = dc.doc_id + WHERE d.crawl_status = 'completed' + AND dc.distill_id IS NULL + ORDER BY s.credibility_tier ASC + """ + cursor.execute(sql) + return cursor.fetchall() +``` + +### Step 2: Analyze Content Structure + +Identify content type and select appropriate distillation strategy: + +```python +def analyze_structure(content, doc_type): + """Analyze document structure for distillation.""" + analysis = { + "has_code_blocks": bool(re.findall(r'```[\s\S]*?```', content)), + "has_headers": bool(re.findall(r'^#+\s', content, re.MULTILINE)), + "has_lists": bool(re.findall(r'^\s*[-*]\s', content, re.MULTILINE)), + "has_tables": bool(re.findall(r'\|.*\|', content)), + "estimated_tokens": len(content.split()) * 1.3, # Rough estimate + "section_count": len(re.findall(r'^#+\s', content, re.MULTILINE)) + } + return analysis +``` + +### Step 3: Extract Key Components + +**Extract Code Snippets:** +```python +def extract_code_snippets(content): + """Extract all code blocks with language tags.""" + pattern = r'```(\w*)\n([\s\S]*?)```' + snippets = [] + for match in re.finditer(pattern, content): + snippets.append({ + "language": match.group(1) or "text", + "code": match.group(2).strip(), + "context": get_surrounding_text(content, match.start(), 200) + }) + return snippets +``` + +**Extract Key Concepts:** +```python +def extract_key_concepts(content, title): + """Use Claude to extract key concepts and definitions.""" + prompt = f""" + Analyze this document and extract key concepts: + + Title: {title} + Content: {content[:8000]} # Limit for 
context + + Return JSON with: + - concepts: [{{"term": "...", "definition": "...", "importance": "high|medium|low"}}] + - techniques: [{{"name": "...", "description": "...", "use_case": "..."}}] + - best_practices: ["..."] + """ + # Use Claude API to process + return claude_extract(prompt) +``` + +### Step 4: Create Structured Summary + +**Summary Template:** +```markdown +# {title} + +**Source:** {url} +**Type:** {source_type} | **Tier:** {credibility_tier} +**Distilled:** {date} + +## Executive Summary +{2-3 sentence overview} + +## Key Concepts +{bulleted list of core concepts with brief definitions} + +## Techniques & Patterns +{extracted techniques with use cases} + +## Code Examples +{relevant code snippets with context} + +## Best Practices +{actionable recommendations} + +## Related Topics +{links to related content in library} +``` + +### Step 5: Optimize for Tokens + +```python +def optimize_content(structured_content, target_ratio=0.30): + """ + Compress content to target ratio while preserving quality. + Target: 30% of original token count. 
+ """ + original_tokens = count_tokens(structured_content) + target_tokens = int(original_tokens * target_ratio) + + # Prioritized compression strategies + strategies = [ + remove_redundant_explanations, + condense_examples, + merge_similar_sections, + trim_verbose_descriptions + ] + + optimized = structured_content + for strategy in strategies: + if count_tokens(optimized) > target_tokens: + optimized = strategy(optimized) + + return optimized +``` + +### Step 6: Store Distilled Content + +```python +def store_distilled(cursor, doc_id, summary, key_concepts, + code_snippets, structured_content, + original_tokens, distilled_tokens): + sql = """ + INSERT INTO distilled_content + (doc_id, summary, key_concepts, code_snippets, structured_content, + token_count_original, token_count_distilled, distill_model, review_status) + VALUES (%s, %s, %s, %s, %s, %s, %s, 'claude-opus-4-5', 'pending') + """ + cursor.execute(sql, ( + doc_id, summary, + json.dumps(key_concepts), + json.dumps(code_snippets), + structured_content, + original_tokens, + distilled_tokens + )) + return cursor.lastrowid +``` + +## Distillation Prompts + +**For Prompt Engineering Content:** +``` +Focus on: +1. Specific techniques with before/after examples +2. Why techniques work (not just what) +3. Common pitfalls and how to avoid them +4. Actionable patterns that can be directly applied +``` + +**For API Documentation:** +``` +Focus on: +1. Endpoint specifications and parameters +2. Request/response examples +3. Error codes and handling +4. Rate limits and best practices +``` + +**For Research Papers:** +``` +Focus on: +1. Key findings and conclusions +2. Novel techniques introduced +3. Practical applications +4. 
Limitations and caveats +``` + +## Quality Metrics + +Track compression efficiency: + +| Metric | Target | +|--------|--------| +| Compression Ratio | 25-35% of original | +| Key Concept Coverage | ≥90% of important terms | +| Code Snippet Retention | 100% of relevant examples | +| Readability | Clear, scannable structure | + +## Handling Refactor Requests + +When `quality-reviewer` returns `refactor` decision: + +```python +def handle_refactor(distill_id, instructions): + """Re-distill based on reviewer feedback.""" + # Load original content and existing distillation + original = load_raw_content(distill_id) + existing = load_distilled_content(distill_id) + + # Apply specific improvements based on instructions + improved = apply_improvements(existing, instructions) + + # Update distilled_content + update_distilled(distill_id, improved) + + # Reset review status + set_review_status(distill_id, 'pending') +``` + +## Integration + +| From | Input | To | +|------|-------|-----| +| content-repository | Raw document records | content-distiller | +| content-distiller | Distilled content | quality-reviewer | +| quality-reviewer | Refactor instructions | content-distiller (loop) | diff --git a/custom-skills/90-reference-curator/05-quality-reviewer/code/CLAUDE.md b/custom-skills/90-reference-curator/05-quality-reviewer/code/CLAUDE.md new file mode 100644 index 0000000..ab09278 --- /dev/null +++ b/custom-skills/90-reference-curator/05-quality-reviewer/code/CLAUDE.md @@ -0,0 +1,103 @@ +# Quality Reviewer + +QA loop for reference library content. Scores distilled materials, routes decisions, and provides actionable feedback. 
+ +## Trigger Keywords +"review content", "quality check", "QA review", "assess distilled content", "check reference quality" + +## Decision Flow + +``` +[Distilled Content] + │ + ▼ +┌─────────────────┐ +│ Score Criteria │ → accuracy, completeness, clarity, PE quality, usability +└─────────────────┘ + │ + ├── ≥ 0.85 → APPROVE → markdown-exporter + ├── 0.60-0.84 → REFACTOR → content-distiller + ├── 0.40-0.59 → DEEP_RESEARCH → web-crawler + └── < 0.40 → REJECT → archive +``` + +## Scoring Criteria + +| Criterion | Weight | Checks | +|-----------|--------|--------| +| **Accuracy** | 0.25 | Factual correctness, up-to-date, attribution | +| **Completeness** | 0.20 | Key concepts, examples, edge cases | +| **Clarity** | 0.20 | Structure, concise language, logical flow | +| **PE Quality** | 0.25 | Techniques, before/after, explains why | +| **Usability** | 0.10 | Easy reference, searchable, appropriate length | + +## Workflow + +### Step 1: Load Pending Reviews +```bash +python scripts/load_pending_reviews.py --output pending.json +``` + +### Step 2: Score Content +```bash +python scripts/score_content.py --distill-id 123 --output assessment.json +``` + +### Step 3: Calculate Final Score +```bash +python scripts/calculate_score.py --assessment assessment.json +``` + +### Step 4: Route Decision +```bash +python scripts/route_decision.py --distill-id 123 --score 0.78 +``` + +Outputs: +- `approve` → Ready for export +- `refactor` → Return to distiller with instructions +- `deep_research` → Need more sources (queries generated) +- `reject` → Archive with reason + +### Step 5: Log Review +```bash +python scripts/log_review.py --distill-id 123 --decision refactor --instructions "Add more examples" +``` + +## PE Quality Checklist + +When scoring `prompt_engineering_quality`: +- [ ] Demonstrates specific techniques (CoT, few-shot, etc.) 
+- [ ] Shows before/after examples +- [ ] Explains *why* techniques work +- [ ] Provides actionable patterns +- [ ] Includes edge cases and failure modes +- [ ] References authoritative sources + +## Auto-Approve Rules + +Tier 1 sources with score ≥ 0.80 may auto-approve: +```yaml +# In config +quality: + auto_approve_tier1_sources: true + auto_approve_min_score: 0.80 +``` + +## Scripts + +- `scripts/load_pending_reviews.py` - Get pending reviews +- `scripts/score_content.py` - Multi-criteria scoring +- `scripts/calculate_score.py` - Weighted average calculation +- `scripts/route_decision.py` - Decision routing logic +- `scripts/log_review.py` - Log review to database +- `scripts/generate_feedback.py` - Generate refactor instructions + +## Integration + +| From | Action | To | +|------|--------|-----| +| content-distiller | Distilled content | → | +| → | APPROVE | markdown-exporter | +| → | REFACTOR + instructions | content-distiller | +| → | DEEP_RESEARCH + queries | web-crawler-orchestrator | diff --git a/custom-skills/90-reference-curator/05-quality-reviewer/desktop/SKILL.md b/custom-skills/90-reference-curator/05-quality-reviewer/desktop/SKILL.md new file mode 100644 index 0000000..9df2367 --- /dev/null +++ b/custom-skills/90-reference-curator/05-quality-reviewer/desktop/SKILL.md @@ -0,0 +1,227 @@ +--- +name: quality-reviewer +description: QA loop for reference library content. Scores distilled materials against prompt engineering quality criteria, routes decisions (approve/refactor/deep_research/reject), and provides actionable feedback. Triggers on "review content", "quality check", "QA review", "assess distilled content", "check reference quality", "refactoring needed". +--- + +# Quality Reviewer + +Evaluates distilled content for quality, routes decisions, and triggers refactoring or additional research when needed. 
+ +## Review Workflow + +``` +[Distilled Content] + │ + ▼ +┌─────────────────┐ +│ Score Criteria │ → accuracy, completeness, clarity, PE quality, usability +└─────────────────┘ + │ + ▼ +┌─────────────────┐ +│ Calculate Total │ → weighted average +└─────────────────┘ + │ + ├── ≥ 0.85 → APPROVE → markdown-exporter + ├── 0.60-0.84 → REFACTOR → content-distiller (with instructions) + ├── 0.40-0.59 → DEEP_RESEARCH → web-crawler-orchestrator (with queries) + └── < 0.40 → REJECT → archive with reason +``` + +## Scoring Criteria + +| Criterion | Weight | Checks | +|-----------|--------|--------| +| **Accuracy** | 0.25 | Factual correctness, up-to-date info, proper attribution | +| **Completeness** | 0.20 | Covers key concepts, includes examples, addresses edge cases | +| **Clarity** | 0.20 | Clear structure, concise language, logical flow | +| **PE Quality** | 0.25 | Demonstrates techniques, before/after examples, explains why | +| **Usability** | 0.10 | Easy to reference, searchable keywords, appropriate length | + +## Decision Thresholds + +| Score Range | Decision | Action | +|-------------|----------|--------| +| ≥ 0.85 | `approve` | Proceed to export | +| 0.60 - 0.84 | `refactor` | Return to distiller with feedback | +| 0.40 - 0.59 | `deep_research` | Gather more sources, then re-distill | +| < 0.40 | `reject` | Archive, log reason | + +## Review Process + +### Step 1: Load Content for Review + +```python +def get_pending_reviews(cursor): + sql = """ + SELECT dc.distill_id, dc.doc_id, d.title, d.url, + dc.summary, dc.key_concepts, dc.structured_content, + dc.token_count_original, dc.token_count_distilled, + s.credibility_tier + FROM distilled_content dc + JOIN documents d ON dc.doc_id = d.doc_id + JOIN sources s ON d.source_id = s.source_id + WHERE dc.review_status = 'pending' + ORDER BY s.credibility_tier ASC, dc.distill_date ASC + """ + cursor.execute(sql) + return cursor.fetchall() +``` + +### Step 2: Score Each Criterion + +Evaluate content against each criterion 
using this assessment template: + +```python +assessment_template = { + "accuracy": { + "score": 0.0, # 0.00 - 1.00 + "notes": "", + "issues": [] # Specific factual errors if any + }, + "completeness": { + "score": 0.0, + "notes": "", + "missing_topics": [] # Concepts that should be covered + }, + "clarity": { + "score": 0.0, + "notes": "", + "confusing_sections": [] # Sections needing rewrite + }, + "prompt_engineering_quality": { + "score": 0.0, + "notes": "", + "improvements": [] # Specific PE technique gaps + }, + "usability": { + "score": 0.0, + "notes": "", + "suggestions": [] + } +} +``` + +### Step 3: Calculate Final Score + +```python +WEIGHTS = { + "accuracy": 0.25, + "completeness": 0.20, + "clarity": 0.20, + "prompt_engineering_quality": 0.25, + "usability": 0.10 +} + +def calculate_quality_score(assessment): + return sum( + assessment[criterion]["score"] * weight + for criterion, weight in WEIGHTS.items() + ) +``` + +### Step 4: Route Decision + +```python +def determine_decision(score, assessment): + if score >= 0.85: + return "approve", None, None + elif score >= 0.60: + instructions = generate_refactor_instructions(assessment) + return "refactor", instructions, None + elif score >= 0.40: + queries = generate_research_queries(assessment) + return "deep_research", None, queries + else: + return "reject", f"Quality score {score:.2f} below minimum threshold", None + +def generate_refactor_instructions(assessment): + """Extract actionable feedback from low-scoring criteria.""" + instructions = [] + for criterion, data in assessment.items(): + if data["score"] < 0.80: + if data.get("issues"): + instructions.extend(data["issues"]) + if data.get("missing_topics"): + instructions.append(f"Add coverage for: {', '.join(data['missing_topics'])}") + if data.get("improvements"): + instructions.extend(data["improvements"]) + return "\n".join(instructions) + +def generate_research_queries(assessment): + """Generate search queries for content gaps.""" + queries = [] 
+ if assessment["completeness"]["missing_topics"]: + for topic in assessment["completeness"]["missing_topics"]: + queries.append(f"{topic} documentation guide") + if assessment["accuracy"]["issues"]: + queries.append("latest official documentation verification") + return queries +``` + +### Step 5: Log Review Decision + +```python +def log_review(cursor, distill_id, assessment, score, decision, instructions=None, queries=None): + # Get current round number + cursor.execute( + "SELECT COALESCE(MAX(review_round), 0) + 1 FROM review_logs WHERE distill_id = %s", + (distill_id,) + ) + review_round = cursor.fetchone()[0] + + sql = """ + INSERT INTO review_logs + (distill_id, review_round, reviewer_type, quality_score, assessment, + decision, refactor_instructions, research_queries) + VALUES (%s, %s, 'claude_review', %s, %s, %s, %s, %s) + """ + cursor.execute(sql, ( + distill_id, review_round, score, + json.dumps(assessment), decision, instructions, + json.dumps(queries) if queries else None + )) + + # Update distilled_content status + status_map = { + "approve": "approved", + "refactor": "needs_refactor", + "deep_research": "needs_refactor", + "reject": "rejected" + } + cursor.execute( + "UPDATE distilled_content SET review_status = %s WHERE distill_id = %s", + (status_map[decision], distill_id) + ) +``` + +## Prompt Engineering Quality Checklist + +When scoring `prompt_engineering_quality`, verify: + +- [ ] Demonstrates specific techniques (CoT, few-shot, etc.) 
+- [ ] Shows before/after examples +- [ ] Explains *why* techniques work, not just *what* +- [ ] Provides actionable patterns +- [ ] Includes edge cases and failure modes +- [ ] References authoritative sources + +## Auto-Approve Rules + +Tier 1 (official) sources with score ≥ 0.80 may auto-approve without human review if configured: + +```yaml +# In export_config.yaml +quality: + auto_approve_tier1_sources: true + auto_approve_min_score: 0.80 +``` + +## Integration Points + +| From | Action | To | +|------|--------|-----| +| content-distiller | Sends distilled content | quality-reviewer | +| quality-reviewer | APPROVE | markdown-exporter | +| quality-reviewer | REFACTOR + instructions | content-distiller | +| quality-reviewer | DEEP_RESEARCH + queries | web-crawler-orchestrator | diff --git a/custom-skills/90-reference-curator/06-markdown-exporter/code/CLAUDE.md b/custom-skills/90-reference-curator/06-markdown-exporter/code/CLAUDE.md new file mode 100644 index 0000000..5526c03 --- /dev/null +++ b/custom-skills/90-reference-curator/06-markdown-exporter/code/CLAUDE.md @@ -0,0 +1,136 @@ +# Markdown Exporter + +Exports approved reference content as structured markdown files for project knowledge or fine-tuning datasets. 
+ +## Trigger Keywords +"export references", "generate project files", "create markdown output", "export for fine-tuning", "build knowledge base" + +## Export Types + +| Type | Format | Use Case | +|------|--------|----------| +| `project_files` | Nested markdown | Claude Projects knowledge | +| `fine_tuning` | JSONL | Model fine-tuning dataset | +| `knowledge_base` | Flat markdown | Documentation | + +## Workflow + +### Step 1: Query Approved Content +```bash +python scripts/query_approved.py --min-score 0.80 --output approved.json +``` + +### Step 2: Organize by Structure + +**Nested by Topic (default):** +``` +exports/ +├── INDEX.md +├── prompt-engineering/ +│ ├── _index.md +│ ├── 01-chain-of-thought.md +│ └── 02-few-shot-prompting.md +└── claude-models/ + ├── _index.md + └── 01-model-comparison.md +``` + +**Flat Structure:** +``` +exports/ +├── INDEX.md +├── prompt-engineering-chain-of-thought.md +└── claude-models-comparison.md +``` + +### Step 3: Generate Files +```bash +python scripts/export_project.py \ + --structure nested_by_topic \ + --output ~/reference-library/exports/ \ + --include-metadata +``` + +### Step 4: Generate INDEX +```bash +python scripts/generate_index.py --output ~/reference-library/exports/INDEX.md +``` + +### Step 5: Fine-tuning Export (Optional) +```bash +python scripts/export_finetuning.py \ + --output ~/reference-library/exports/fine_tuning.jsonl \ + --max-tokens 4096 +``` + +JSONL format: +```json +{ + "messages": [ + {"role": "system", "content": "You are an expert on AI and prompt engineering."}, + {"role": "user", "content": "Explain {title}"}, + {"role": "assistant", "content": "{structured_content}"} + ], + "metadata": {"source": "{url}", "topic": "{topic_slug}", "quality_score": 0.92} +} +``` + +### Step 6: Log Export Job +```bash +python scripts/log_export.py --name "January 2025 Export" --type project_files --docs 45 +``` + +## Cross-Reference Generation +```bash +python scripts/add_crossrefs.py --input 
~/reference-library/exports/ +``` + +Links related documents based on overlapping key concepts. + +## Output Verification + +After export, verify: +- [ ] All files readable and valid markdown +- [ ] INDEX.md links resolve correctly +- [ ] No broken cross-references +- [ ] Total token count matches expectation +- [ ] No duplicate content + +```bash +python scripts/verify_export.py --path ~/reference-library/exports/ +``` + +## Scripts + +- `scripts/query_approved.py` - Get approved content from DB +- `scripts/export_project.py` - Main export for project files +- `scripts/export_finetuning.py` - JSONL export for fine-tuning +- `scripts/generate_index.py` - Generate INDEX.md +- `scripts/add_crossrefs.py` - Add cross-references +- `scripts/log_export.py` - Log export job to DB +- `scripts/verify_export.py` - Verify export integrity + +## Configuration + +```yaml +# ~/.config/reference-curator/export_config.yaml +output: + base_path: ~/reference-library/exports/ + project_files: + structure: nested_by_topic + index_file: INDEX.md + include_metadata: true + fine_tuning: + format: jsonl + max_tokens_per_sample: 4096 + +quality: + min_score_for_export: 0.80 +``` + +## Integration + +| From | To | +|------|-----| +| quality-reviewer (approved) | → | +| → | Project knowledge / Fine-tuning dataset | diff --git a/custom-skills/90-reference-curator/06-markdown-exporter/desktop/SKILL.md b/custom-skills/90-reference-curator/06-markdown-exporter/desktop/SKILL.md new file mode 100644 index 0000000..5274dc3 --- /dev/null +++ b/custom-skills/90-reference-curator/06-markdown-exporter/desktop/SKILL.md @@ -0,0 +1,294 @@ +--- +name: markdown-exporter +description: Exports approved reference content as structured markdown files for project knowledge or fine-tuning datasets. Generates INDEX files, organizes by topic, and maintains cross-references. Triggers on "export references", "generate project files", "create markdown output", "export for fine-tuning", "build knowledge base". 
+--- + +# Markdown Exporter + +Exports approved content as structured markdown files for Claude Projects or fine-tuning. + +## Export Configuration + +```yaml +# ~/.config/reference-curator/export_config.yaml +output: + base_path: ~/reference-library/exports/ + + project_files: + structure: nested_by_topic # flat | nested_by_topic | nested_by_source + index_file: INDEX.md + include_metadata: true + + fine_tuning: + format: jsonl + max_tokens_per_sample: 4096 + include_system_prompt: true + +quality: + min_score_for_export: 0.80 +``` + +## Export Workflow + +### Step 1: Query Approved Content + +```python +def get_exportable_content(cursor, min_score=0.80, topic_filter=None): + """Get all approved content meeting quality threshold.""" + sql = """ + SELECT d.doc_id, d.title, d.url, + dc.summary, dc.key_concepts, dc.code_snippets, dc.structured_content, + t.topic_slug, t.topic_name, + rl.quality_score, s.credibility_tier, s.vendor + FROM documents d + JOIN distilled_content dc ON d.doc_id = dc.doc_id + JOIN document_topics dt ON d.doc_id = dt.doc_id + JOIN topics t ON dt.topic_id = t.topic_id + JOIN review_logs rl ON dc.distill_id = rl.distill_id + JOIN sources s ON d.source_id = s.source_id + WHERE rl.decision = 'approve' + AND rl.quality_score >= %s + AND rl.review_id = ( + SELECT MAX(review_id) FROM review_logs + WHERE distill_id = dc.distill_id + ) + """ + params = [min_score] + + if topic_filter: + sql += " AND t.topic_slug IN (%s)" % ','.join(['%s'] * len(topic_filter)) + params.extend(topic_filter) + + sql += " ORDER BY t.topic_slug, rl.quality_score DESC" + cursor.execute(sql, params) + return cursor.fetchall() +``` + +### Step 2: Organize by Structure + +**Nested by Topic (recommended):** +``` +exports/ +├── INDEX.md +├── prompt-engineering/ +│ ├── _index.md +│ ├── 01-chain-of-thought.md +│ ├── 02-few-shot-prompting.md +│ └── 03-system-prompts.md +├── claude-models/ +│ ├── _index.md +│ ├── 01-model-comparison.md +│ └── 02-context-windows.md +└── 
agent-building/ + ├── _index.md + └── 01-tool-use.md +``` + +**Flat Structure:** +``` +exports/ +├── INDEX.md +├── prompt-engineering-chain-of-thought.md +├── prompt-engineering-few-shot.md +└── claude-models-comparison.md +``` + +### Step 3: Generate Files + +**Document File Template:** +```python +def generate_document_file(doc, include_metadata=True): + content = [] + + if include_metadata: + content.append("---") + content.append(f"title: {doc['title']}") + content.append(f"source: {doc['url']}") + content.append(f"vendor: {doc['vendor']}") + content.append(f"tier: {doc['credibility_tier']}") + content.append(f"quality_score: {doc['quality_score']:.2f}") + content.append(f"exported: {datetime.now().isoformat()}") + content.append("---") + content.append("") + + content.append(doc['structured_content']) + + return "\n".join(content) +``` + +**Topic Index Template:** +```python +def generate_topic_index(topic_slug, topic_name, documents): + content = [ + f"# {topic_name}", + "", + f"This section contains {len(documents)} reference documents.", + "", + "## Contents", + "" + ] + + for i, doc in enumerate(documents, 1): + filename = f"{i:02d}-{slugify(doc['title'])}.md" + content.append(f"{i}. 
[{doc['title']}]({filename})") + + return "\n".join(content) +``` + +**Root INDEX Template:** +```python +def generate_root_index(topics_with_counts, export_date): + content = [ + "# Reference Library", + "", + f"Exported: {export_date}", + "", + "## Topics", + "" + ] + + for topic in topics_with_counts: + content.append(f"- [{topic['name']}]({topic['slug']}/) ({topic['count']} documents)") + + content.extend([ + "", + "## Quality Standards", + "", + "All documents in this library have:", + "- Passed quality review (score ≥ 0.80)", + "- Been distilled for conciseness", + "- Verified source attribution" + ]) + + return "\n".join(content) +``` + +### Step 4: Write Files + +```python +def export_project_files(content_list, config): + base_path = Path(config['output']['base_path']).expanduser() + structure = config['output']['project_files']['structure'] + + # Group by topic + by_topic = defaultdict(list) + for doc in content_list: + by_topic[doc['topic_slug']].append(doc) + + # Create directories and files + for topic_slug, docs in by_topic.items(): + if structure == 'nested_by_topic': + topic_dir = base_path / topic_slug + topic_dir.mkdir(parents=True, exist_ok=True) + + # Write topic index + topic_index = generate_topic_index(topic_slug, docs[0]['topic_name'], docs) + (topic_dir / '_index.md').write_text(topic_index) + + # Write document files + for i, doc in enumerate(docs, 1): + filename = f"{i:02d}-{slugify(doc['title'])}.md" + file_content = generate_document_file(doc, config['output']['project_files']['include_metadata']) + (topic_dir / filename).write_text(file_content) + + # Write root INDEX + topics_summary = [ + {"slug": slug, "name": docs[0]['topic_name'], "count": len(docs)} + for slug, docs in by_topic.items() + ] + root_index = generate_root_index(topics_summary, datetime.now().isoformat()) + (base_path / 'INDEX.md').write_text(root_index) +``` + +### Step 5: Fine-tuning Export (Optional) + +```python +def export_fine_tuning_dataset(content_list, 
config): + """Export as JSONL for fine-tuning.""" + output_path = 
Path(config['output']['base_path']).expanduser() / 'fine_tuning.jsonl' + max_tokens = config['output']['fine_tuning']['max_tokens_per_sample'] + + with open(output_path, 'w') as f: + for doc in content_list: + sample = { + "messages": [ + { + "role": "system", + "content": "You are an expert on AI and prompt engineering." + }, + { + "role": "user", + "content": f"Explain {doc['title']}" + }, + { + "role": "assistant", + "content": truncate_to_tokens(doc['structured_content'], max_tokens) + } + ], + "metadata": { + "source": doc['url'], + "topic": doc['topic_slug'], + "quality_score": doc['quality_score'] + } + } + f.write(json.dumps(sample) + '\n') +``` + +### Step 6: Log Export Job + +```python +def log_export_job(cursor, export_name, export_type, output_path, + topic_filter, total_docs, total_tokens): + sql = """ + INSERT INTO export_jobs + (export_name, export_type, output_format, topic_filter, output_path, + total_documents, total_tokens, status, started_at, completed_at) + VALUES (%s, %s, 'markdown', %s, %s, %s, %s, 'completed', NOW(), NOW()) + """ + cursor.execute(sql, ( + export_name, export_type, + json.dumps(topic_filter) if topic_filter else None, + str(output_path), total_docs, total_tokens + )) +``` + +## Cross-Reference Generation + +Link related documents: + +```python +def add_cross_references(doc, all_docs): + """Find and link related documents.""" + related = [] + doc_concepts = set(c['term'].lower() for c in doc['key_concepts']) + + for other in all_docs: + if other['doc_id'] == doc['doc_id']: + continue + other_concepts = set(c['term'].lower() for c in other['key_concepts']) + overlap = len(doc_concepts & other_concepts) + if overlap >= 2: + related.append({ + "title": other['title'], + "path": generate_relative_path(doc, other), + "overlap": overlap + }) + + return sorted(related, key=lambda x: x['overlap'], reverse=True)[:5] +``` + +## Output Verification + +After export, verify: +- [ ] All files readable and valid markdown +- [ ] INDEX.md links 
resolve 
correctly +- [ ] No broken cross-references +- [ ] Total token count matches expectation +- [ ] No duplicate content + +## Integration + +| From | Input | To | +|------|-------|-----| +| quality-reviewer | Approved content IDs | markdown-exporter | +| markdown-exporter | Structured files | Project knowledge / Fine-tuning | diff --git a/custom-skills/90-reference-curator/CHANGELOG.md b/custom-skills/90-reference-curator/CHANGELOG.md new file mode 100644 index 0000000..0aa5377 --- /dev/null +++ b/custom-skills/90-reference-curator/CHANGELOG.md @@ -0,0 +1,168 @@ +# Reference Curator Skills - Refactoring Log + +**Date**: 2026-01-28 +**Version**: 2.0 +**Author**: Claude Code (Opus 4.5) + +--- + +## Summary + +Complete restructuring of the Reference Curator skill suite from a flat structure to dual-platform format, with full installation automation. + +--- + +## Changes Made + +### 1. Directory Restructuring + +**Before:** +``` +90-reference-curator/ +├── SKILL.md (Skill 01) +└── mnt/user-data/outputs/reference-curator-skills/ + ├── 02-web-crawler/SKILL.md + ├── 03-content-repository/SKILL.md + ├── 04-content-distiller/SKILL.md + ├── 05-quality-reviewer/SKILL.md + └── 06-markdown-exporter/SKILL.md +``` + +**After:** +``` +90-reference-curator/ +├── README.md +├── install.sh # NEW: Installation script +├── 01-reference-discovery/ +│ ├── code/CLAUDE.md # NEW: Claude Code directive +│ └── desktop/SKILL.md +├── 02-web-crawler-orchestrator/ +│ ├── code/CLAUDE.md # NEW +│ └── desktop/SKILL.md +├── 03-content-repository/ +│ ├── code/CLAUDE.md # NEW +│ └── desktop/SKILL.md +├── 04-content-distiller/ +│ ├── code/CLAUDE.md # NEW +│ └── desktop/SKILL.md +├── 05-quality-reviewer/ +│ ├── code/CLAUDE.md # NEW +│ └── desktop/SKILL.md +├── 06-markdown-exporter/ +│ ├── code/CLAUDE.md # NEW +│ └── desktop/SKILL.md +└── shared/ + ├── schema.sql # NEW: MySQL schema + └── config/ + ├── db_config.yaml # NEW + ├── crawl_config.yaml # NEW + └── export_config.yaml # NEW +``` + +### 2. 
New Files Created + +| File | Purpose | +|------|---------| +| `install.sh` | Interactive installation script | +| `shared/schema.sql` | MySQL schema (9 tables, 2 views) | +| `shared/config/db_config.yaml` | Database connection config | +| `shared/config/crawl_config.yaml` | Crawler routing config | +| `shared/config/export_config.yaml` | Export format config | +| `*/code/CLAUDE.md` | 6 Claude Code directives | + +### 3. Crawler Configuration + +Implemented intelligent crawler routing: + +| Crawler | Condition | Use Case | +|---------|-----------|----------| +| **Node.js** (default) | ≤50 pages, static | Small documentation sites | +| **Python aiohttp** | ≤200 pages, SEO needed | Technical docs | +| **Scrapy** | >200 pages, multi-domain | Enterprise crawls | +| **Firecrawl MCP** | SPA, JS-rendered | Dynamic sites | + +### 4. Installation Script Features + +```bash +./install.sh # Interactive installation +./install.sh --check # Verify status +./install.sh --reset # Reset (preserves data) +``` + +**Handles:** +- Config file deployment to `~/.config/reference-curator/` +- Storage directory creation at `~/reference-library/` +- MySQL credentials setup in `~/.envrc` +- Database creation and schema application +- Skill symlink registration in `~/.claude/skills/` + +### 5. MySQL Schema + +**Tables (9):** +- `sources` - Authoritative source registry +- `documents` - Crawled document storage +- `distilled_content` - Processed summaries +- `review_logs` - QA decision history +- `topics` - Topic taxonomy +- `document_topics` - Document-topic mapping +- `export_jobs` - Export task tracking +- `crawl_schedule` - Scheduled crawl jobs +- `change_detection` - Content change tracking + +**Views (2):** +- `v_pending_reviews` - Documents awaiting review +- `v_export_ready` - Approved documents ready for export + +### 6. 
Environment Setup + +**Files installed:** +``` +~/.config/reference-curator/ +├── db_config.yaml +├── crawl_config.yaml +└── export_config.yaml + +~/reference-library/ +├── raw/ +├── processed/ +└── exports/ + +~/.claude/skills/ +├── reference-discovery -> .../01-reference-discovery/desktop +├── web-crawler-orchestrator -> .../02-web-crawler-orchestrator/desktop +├── content-repository -> .../03-content-repository/desktop +├── content-distiller -> .../04-content-distiller/desktop +├── quality-reviewer -> .../05-quality-reviewer/desktop +└── markdown-exporter -> .../06-markdown-exporter/desktop +``` + +--- + +## Deleted + +- `mnt/user-data/outputs/` directory (moved to proper structure) +- Root-level `SKILL.md` (moved to `01-reference-discovery/desktop/`) + +--- + +## Verification + +```bash +$ ./install.sh --check + +✓ Configuration files (3/3) +✓ Storage directories (3/3) +✓ MySQL database (11 tables) +✓ Skill registrations (6/6) + +All components installed correctly. +``` + +--- + +## Next Steps + +1. Add Python scripts to `*/code/scripts/` folders for automation +2. Implement `select_crawler.py` for intelligent routing logic +3. Add unit tests for database operations +4. Create example workflows in `*/desktop/examples/` diff --git a/custom-skills/90-reference-curator/README.md b/custom-skills/90-reference-curator/README.md new file mode 100644 index 0000000..d58d780 --- /dev/null +++ b/custom-skills/90-reference-curator/README.md @@ -0,0 +1,448 @@ +# Reference Curator Skills + +Modular Claude Skills for curating, processing, and exporting reference documentation. 
+ +## Quick Start + +```bash +# Clone and install +git clone https://github.com/ourdigital/our-claude-skills.git +cd our-claude-skills/custom-skills/90-reference-curator +./install.sh + +# Or minimal install (Firecrawl only, no MySQL) +./install.sh --minimal + +# Check installation status +./install.sh --check + +# Uninstall +./install.sh --uninstall +``` + +--- + +## Installing from GitHub + +### For New Machines + +1. **Clone the repository:** + ```bash + git clone https://github.com/ourdigital/our-claude-skills.git + cd our-claude-skills/custom-skills/90-reference-curator + ``` + +2. **Run the installer:** + ```bash + ./install.sh + ``` + +3. **Follow the interactive prompts:** + - Set storage directory path + - Configure MySQL credentials (optional) + - Choose crawler backend (Firecrawl MCP recommended) + +4. **Add to shell profile:** + ```bash + echo 'source ~/.reference-curator.env' >> ~/.zshrc + source ~/.reference-curator.env + ``` + +5. **Verify installation:** + ```bash + ./install.sh --check + ``` + +### Installation Modes + +| Mode | Command | Description | +|------|---------|-------------| +| **Full** | `./install.sh` | Interactive setup with MySQL and crawlers | +| **Minimal** | `./install.sh --minimal` | Firecrawl MCP only, no database | +| **Check** | `./install.sh --check` | Verify installation status | +| **Uninstall** | `./install.sh --uninstall` | Remove installation (preserves data) | + +### What Gets Installed + +| Component | Location | Purpose | +|-----------|----------|---------| +| Environment config | `~/.reference-curator.env` | Credentials and paths | +| Config files | `~/.config/reference-curator/` | YAML configuration | +| Storage directories | `~/reference-library/` | Raw/processed/exports | +| Claude Code commands | `~/.claude/commands/` | Slash commands | +| Claude Desktop skills | `~/.claude/skills/` | Skill symlinks | +| MySQL database | `reference_library` | Document storage | + +### Environment Variables + +The installer 
creates `~/.reference-curator.env` with: + +```bash +# Storage paths +export REFERENCE_LIBRARY_PATH="~/reference-library" + +# MySQL configuration (if enabled) +export MYSQL_HOST="localhost" +export MYSQL_PORT="3306" +export MYSQL_USER="youruser" +export MYSQL_PASSWORD="yourpassword" + +# Crawler configuration +export DEFAULT_CRAWLER="firecrawl" # or "nodejs" +export CRAWLER_PROJECT_PATH="" # Path to local crawlers (optional) +``` + +--- + +## Architecture + +``` +[Topic Input] + │ + ▼ +┌─────────────────────┐ +│ reference-discovery │ → Search & validate sources +└─────────────────────┘ + │ + ▼ +┌──────────────────────────┐ +│ web-crawler-orchestrator │ → Crawl (Firecrawl/Node.js/aiohttp/Scrapy) +└──────────────────────────┘ + │ + ▼ +┌────────────────────┐ +│ content-repository │ → Store in MySQL +└────────────────────┘ + │ + ▼ +┌───────────────────┐ +│ content-distiller │ → Summarize & extract +└───────────────────┘ + │ + ▼ +┌──────────────────┐ +│ quality-reviewer │ → QA loop +└──────────────────┘ + │ + ├── REFACTOR → content-distiller + ├── DEEP_RESEARCH → web-crawler-orchestrator + │ + ▼ APPROVE +┌───────────────────┐ +│ markdown-exporter │ → Project files / Fine-tuning +└───────────────────┘ +``` + +--- + +## User Guide + +### Basic Workflow + +**Step 1: Discover References** +``` +/reference-discovery Claude's system prompt best practices +``` +Claude searches the web, validates sources, and creates a manifest of URLs to crawl. + +**Step 2: Crawl Content** +``` +/web-crawler https://docs.anthropic.com --max-pages 50 +``` +The crawler automatically selects the best backend based on site characteristics. + +**Step 3: Store in Repository** +``` +/content-repository store +``` +Documents are saved to MySQL with deduplication and version tracking. + +**Step 4: Distill Content** +``` +/content-distiller all-pending +``` +Claude summarizes, extracts key concepts, and creates structured content. 
+ +**Step 5: Quality Review** +``` +/quality-reviewer all-pending --auto-approve +``` +Automated QA scoring determines: approve, refactor, deep research, or reject. + +**Step 6: Export** +``` +/markdown-exporter project_files --topic prompt-engineering +``` +Generates markdown files organized by topic with cross-references. + +### Example Prompts + +| Task | Command | +|------|---------| +| **Discover sources** | `/reference-discovery MCP server development` | +| **Crawl URL** | `/web-crawler https://docs.anthropic.com` | +| **Check repository** | `/content-repository stats` | +| **Distill document** | `/content-distiller 42` | +| **Review quality** | `/quality-reviewer all-pending` | +| **Export files** | `/markdown-exporter project_files` | + +### Crawler Selection + +The system automatically selects the optimal crawler: + +| Site Type | Crawler | When Auto-Selected | +|-----------|---------|-------------------| +| SPAs/Dynamic | **Firecrawl MCP** | React, Vue, Angular sites (default) | +| Small docs sites | **Node.js** | ≤50 pages, static HTML | +| Technical docs | **Python aiohttp** | ≤200 pages, needs SEO data | +| Enterprise sites | **Scrapy** | >200 pages, multi-domain | + +Override with explicit request: +``` +/web-crawler https://example.com --crawler firecrawl +``` + +### Quality Review Decisions + +| Score | Decision | What Happens | +|-------|----------|--------------| +| ≥ 0.85 | **Approve** | Ready for export | +| 0.60-0.84 | **Refactor** | Re-distill with feedback | +| 0.40-0.59 | **Deep Research** | Gather more sources | +| < 0.40 | **Reject** | Archive (low quality) | + +### Database Queries + +Check your reference library status: + +```bash +# Source credentials +source ~/.reference-curator.env + +# Count documents by status +mysql -h $MYSQL_HOST -u $MYSQL_USER -p"$MYSQL_PASSWORD" reference_library -e " + SELECT crawl_status, COUNT(*) as count + FROM documents GROUP BY crawl_status;" + +# View pending reviews +mysql -h $MYSQL_HOST -u $MYSQL_USER 
-p"$MYSQL_PASSWORD" reference_library -e " + SELECT * FROM v_pending_reviews;" + +# View export-ready documents +mysql -h $MYSQL_HOST -u $MYSQL_USER -p"$MYSQL_PASSWORD" reference_library -e " + SELECT * FROM v_export_ready;" +``` + +### Output Formats + +**For Claude Projects:** +``` +~/reference-library/exports/ +├── INDEX.md # Master index with all topics +└── prompt-engineering/ # Topic folder + ├── _index.md # Topic overview + ├── system-prompts.md # Individual document + └── chain-of-thought.md +``` + +**For Fine-tuning (JSONL):** +```json +{"messages": [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]} +``` + +--- + +## Skills & Commands Reference + +| # | Skill | Command | Purpose | +|---|-------|---------|---------| +| 01 | reference-discovery | `/reference-discovery` | Search authoritative sources | +| 02 | web-crawler-orchestrator | `/web-crawler` | Multi-backend crawling | +| 03 | content-repository | `/content-repository` | MySQL storage management | +| 04 | content-distiller | `/content-distiller` | Summarize & extract | +| 05 | quality-reviewer | `/quality-reviewer` | QA scoring & routing | +| 06 | markdown-exporter | `/markdown-exporter` | Export to markdown/JSONL | + +--- + +## Configuration + +### Environment (`~/.reference-curator.env`) + +```bash +# Required for MySQL features +export MYSQL_HOST="localhost" +export MYSQL_USER="youruser" +export MYSQL_PASSWORD="yourpassword" + +# Storage location +export REFERENCE_LIBRARY_PATH="~/reference-library" + +# Crawler selection +export DEFAULT_CRAWLER="firecrawl" # firecrawl, nodejs, aiohttp, scrapy +``` + +### Database (`~/.config/reference-curator/db_config.yaml`) + +```yaml +mysql: + host: ${MYSQL_HOST:-localhost} + port: ${MYSQL_PORT:-3306} + database: reference_library + user: ${MYSQL_USER} + password: ${MYSQL_PASSWORD} +``` + +### Crawlers (`~/.config/reference-curator/crawl_config.yaml`) + +```yaml +default_crawler: 
${DEFAULT_CRAWLER:-firecrawl} + +rate_limit: + requests_per_minute: 20 + concurrent_requests: 3 + +default_options: + timeout: 30000 + max_depth: 3 + max_pages: 100 +``` + +### Export (`~/.config/reference-curator/export_config.yaml`) + +```yaml +output: + base_path: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/exports/ + +quality: + min_score_for_export: 0.80 + auto_approve_tier1_sources: true +``` + +--- + +## Troubleshooting + +### MySQL Connection Failed + +```bash +# Check MySQL is running +brew services list | grep mysql # macOS +systemctl status mysql # Linux + +# Start MySQL +brew services start mysql # macOS +sudo systemctl start mysql # Linux + +# Verify credentials +source ~/.reference-curator.env +mysql -h $MYSQL_HOST -u $MYSQL_USER -p"$MYSQL_PASSWORD" -e "SELECT 1" +``` + +### Commands Not Found + +```bash +# Check commands are registered +ls -la ~/.claude/commands/ + +# Re-run installer to fix +./install.sh +``` + +### Crawler Timeout + +For slow sites, increase timeout in `~/.config/reference-curator/crawl_config.yaml`: +```yaml +default_options: + timeout: 60000 # 60 seconds +``` + +### Skills Not Loading + +```bash +# Check symlinks exist +ls -la ~/.claude/skills/ + +# Re-run installer +./install.sh --uninstall +./install.sh +``` + +### Database Schema Outdated + +```bash +# Re-apply schema (preserves data with IF NOT EXISTS) +source ~/.reference-curator.env +mysql -h $MYSQL_HOST -u $MYSQL_USER -p"$MYSQL_PASSWORD" reference_library < shared/schema.sql +``` + +--- + +## Directory Structure + +``` +90-reference-curator/ +├── README.md # This file +├── CHANGELOG.md # Version history +├── install.sh # Portable installation script +│ +├── commands/ # Claude Code commands (tracked in git) +│ ├── reference-discovery.md +│ ├── web-crawler.md +│ ├── content-repository.md +│ ├── content-distiller.md +│ ├── quality-reviewer.md +│ └── markdown-exporter.md +│ +├── 01-reference-discovery/ +│ ├── code/CLAUDE.md # Claude Code directive +│ └── desktop/SKILL.md # 
Claude Desktop directive +├── 02-web-crawler-orchestrator/ +│ ├── code/CLAUDE.md +│ └── desktop/SKILL.md +├── 03-content-repository/ +│ ├── code/CLAUDE.md +│ └── desktop/SKILL.md +├── 04-content-distiller/ +│ ├── code/CLAUDE.md +│ └── desktop/SKILL.md +├── 05-quality-reviewer/ +│ ├── code/CLAUDE.md +│ └── desktop/SKILL.md +├── 06-markdown-exporter/ +│ ├── code/CLAUDE.md +│ └── desktop/SKILL.md +│ +└── shared/ + ├── schema.sql # MySQL schema + └── config/ # Config templates + ├── db_config.yaml + ├── crawl_config.yaml + └── export_config.yaml +``` + +--- + +## Platform Differences + +| Aspect | `code/` (Claude Code) | `desktop/` (Claude Desktop) | +|--------|----------------------|----------------------------| +| Directive | CLAUDE.md | SKILL.md (YAML frontmatter) | +| Commands | `~/.claude/commands/` | Not used | +| Skills | Reference only | `~/.claude/skills/` symlinks | +| Execution | Direct Bash/Python | MCP tools only | +| Best for | Automation, CI/CD | Interactive use | + +--- + +## Prerequisites + +### Required +- macOS or Linux +- Claude Code or Claude Desktop + +### Optional (for full features) +- MySQL 8.0+ (for database storage) +- Firecrawl MCP server configured +- Node.js 18+ (for Node.js crawler) +- Python 3.12+ (for aiohttp/Scrapy crawlers) diff --git a/custom-skills/90-reference-curator/commands/content-distiller.md b/custom-skills/90-reference-curator/commands/content-distiller.md new file mode 100644 index 0000000..4be4a80 --- /dev/null +++ b/custom-skills/90-reference-curator/commands/content-distiller.md @@ -0,0 +1,92 @@ +--- +description: Analyze and summarize stored documents. Extracts key concepts, code snippets, and creates structured content. +argument-hint: [--focus keywords] [--max-tokens 2000] +allowed-tools: Read, Write, Bash, Glob, Grep +--- + +# Content Distiller + +Analyze, summarize, and extract key information from stored documents. 
+ +## Arguments +- `<doc-id|all-pending>`: Specific document ID or process all pending +- `--focus`: Keywords to emphasize in distillation +- `--max-tokens`: Target token count for distilled output (default: 2000) + +## Distillation Process + +### 1. Load Raw Content +```bash +source ~/.reference-curator.env +# Get document path +mysql -u $MYSQL_USER -p"$MYSQL_PASSWORD" reference_library -N -e \ + "SELECT raw_content_path FROM documents WHERE doc_id = $DOC_ID" +``` + +### 2. Analyze Content + +Extract: +- **Summary**: 2-3 sentence executive summary +- **Key Concepts**: Important terms with definitions +- **Code Snippets**: Relevant code examples +- **Structured Content**: Full distilled markdown + +### 3. Output Format + +```json +{ + "summary": "Executive summary of the document...", + "key_concepts": [ + {"term": "System Prompt", "definition": "..."}, + {"term": "Context Window", "definition": "..."} + ], + "code_snippets": [ + {"language": "python", "description": "...", "code": "..."} + ], + "structured_content": "# Title\n\n## Overview\n..." +} +``` + +### 4. Store Distilled Content + +```sql +INSERT INTO distilled_content + (doc_id, summary, key_concepts, code_snippets, structured_content, + token_count_original, token_count_distilled, distill_model, review_status) +VALUES + (?, ?, ?, ?, ?, ?, ?, 'claude-opus-4', 'pending'); +``` + +### 5.
Calculate Metrics + +- `token_count_original`: Tokens in raw content +- `token_count_distilled`: Tokens in output +- `compression_ratio`: Auto-calculated (distilled/original * 100) + +## Distillation Guidelines + +**For Prompt Engineering Content:** +- Emphasize techniques and patterns +- Include before/after examples +- Extract actionable best practices +- Note model-specific behaviors + +**For API Documentation:** +- Focus on endpoint signatures +- Include request/response examples +- Note rate limits and constraints +- Extract error handling patterns + +**For Code Repositories:** +- Summarize architecture +- Extract key functions/classes +- Note dependencies +- Include usage examples + +## Example Usage + +``` +/content-distiller 42 +/content-distiller all-pending --focus "system prompts" +/content-distiller 15 --max-tokens 3000 +``` diff --git a/custom-skills/90-reference-curator/commands/content-repository.md b/custom-skills/90-reference-curator/commands/content-repository.md new file mode 100644 index 0000000..93a8cd0 --- /dev/null +++ b/custom-skills/90-reference-curator/commands/content-repository.md @@ -0,0 +1,94 @@ +--- +description: Store and manage crawled content in MySQL. Handles versioning, deduplication, and document metadata. +argument-hint: [--doc-id N] [--source-id N] +allowed-tools: Bash, Read, Write, Glob, Grep +--- + +# Content Repository + +Manage crawled content in MySQL database. 
+ +## Arguments +- `<action>`: store | list | get | update | delete | stats +- `--doc-id`: Specific document ID +- `--source-id`: Filter by source ID + +## Database Connection + +```bash +source ~/.reference-curator.env +mysql -u $MYSQL_USER -p"$MYSQL_PASSWORD" reference_library +``` + +## Actions + +### store +Store new documents from crawl output: +```bash +# Read crawl manifest +cat ~/reference-library/raw/YYYY/MM/crawl_manifest.json + +# Insert into documents table +INSERT INTO documents (source_id, title, url, doc_type, raw_content_path, crawl_date, crawl_status) +VALUES (...); +``` + +### list +List documents with filters: +```sql +SELECT doc_id, title, crawl_status, created_at +FROM documents +WHERE source_id = ? AND crawl_status = 'completed' +ORDER BY created_at DESC; +``` + +### get +Retrieve specific document: +```sql +SELECT d.*, s.source_name, s.credibility_tier +FROM documents d +JOIN sources s ON d.source_id = s.source_id +WHERE d.doc_id = ?; +``` + +### stats +Show repository statistics: +```sql +SELECT + COUNT(*) as total_docs, + SUM(CASE WHEN crawl_status = 'completed' THEN 1 ELSE 0 END) as completed, + SUM(CASE WHEN crawl_status = 'pending' THEN 1 ELSE 0 END) as pending +FROM documents; +``` + +## Deduplication + +Documents are deduplicated by URL hash: +```sql +-- url_hash is auto-generated: SHA2(url, 256) +SELECT * FROM documents WHERE url_hash = SHA2('https://...', 256); +``` + +## Version Tracking + +When content changes: +```sql +-- Create new version +INSERT INTO documents (..., version, previous_version_id) +SELECT ..., version + 1, doc_id FROM documents WHERE doc_id = ?; + +-- Mark old as superseded +UPDATE documents SET crawl_status = 'stale' WHERE doc_id = ?; +``` + +## Schema Reference + +Key tables: +- `sources` - Authoritative source registry +- `documents` - Crawled document storage +- `distilled_content` - Processed summaries +- `review_logs` - QA decisions + +Views: +- `v_pending_reviews` - Documents awaiting review +- `v_export_ready` - Approved for
export diff --git a/custom-skills/90-reference-curator/commands/markdown-exporter.md b/custom-skills/90-reference-curator/commands/markdown-exporter.md new file mode 100644 index 0000000..a1b4a75 --- /dev/null +++ b/custom-skills/90-reference-curator/commands/markdown-exporter.md @@ -0,0 +1,138 @@ +--- +description: Export approved content to markdown files or JSONL for fine-tuning. Generates structured output with cross-references. +argument-hint: [--topic slug] [--min-score 0.80] +allowed-tools: Read, Write, Bash, Glob, Grep +--- + +# Markdown Exporter + +Export approved content to structured formats. + +## Arguments +- ``: project_files | fine_tuning | knowledge_base +- `--topic`: Filter by topic slug (e.g., "prompt-engineering") +- `--min-score`: Minimum quality score (default: 0.80) + +## Export Formats + +### project_files (Claude Projects) + +Output structure: +``` +~/reference-library/exports/ +├── INDEX.md # Master index +└── {topic-slug}/ + ├── _index.md # Topic overview + ├── {document-1}.md + └── {document-2}.md +``` + +**INDEX.md format:** +```markdown +# Reference Library Index + +Generated: {timestamp} +Total Documents: {count} + +## Topics + +### [Prompt Engineering](./prompt-engineering/) +{count} documents | Last updated: {date} + +### [Claude Models](./claude-models/) +{count} documents | Last updated: {date} +``` + +**Document format:** +```markdown +--- +source: {source_name} +url: {original_url} +credibility: {tier} +quality_score: {score} +exported: {timestamp} +--- + +# {title} + +{structured_content} + +## Related Documents +- [Related Doc 1](./related-1.md) +- [Related Doc 2](./related-2.md) +``` + +### fine_tuning (JSONL) + +Output: `~/reference-library/exports/fine_tuning_{timestamp}.jsonl` + +```json +{"messages": [ + {"role": "system", "content": "You are an expert on AI and prompt engineering."}, + {"role": "user", "content": "Explain {topic}"}, + {"role": "assistant", "content": "{structured_content}"} +]} +``` + +### knowledge_base 
(Flat) + +Single consolidated file with table of contents. + +## Export Process + +### 1. Query Approved Content +```sql +SELECT dc.*, d.title, d.url, s.source_name, s.credibility_tier, t.topic_slug +FROM distilled_content dc +JOIN documents d ON dc.doc_id = d.doc_id +JOIN sources s ON d.source_id = s.source_id +LEFT JOIN document_topics dt ON d.doc_id = dt.doc_id +LEFT JOIN topics t ON dt.topic_id = t.topic_id +WHERE dc.review_status = 'approved' + AND (SELECT MAX(quality_score) FROM review_logs WHERE distill_id = dc.distill_id) >= ?; +``` + +### 2. Generate Cross-References + +Find related documents by: +- Shared topics +- Overlapping key concepts +- Same source + +### 3. Write Files + +```bash +mkdir -p ~/reference-library/exports/{topic-slug} +``` + +### 4. Log Export Job + +```sql +INSERT INTO export_jobs + (export_name, export_type, output_format, topic_filter, + min_quality_score, output_path, total_documents, status) +VALUES + (?, ?, ?, ?, ?, ?, ?, 'completed'); +``` + +## Configuration + +From `~/.config/reference-curator/export_config.yaml`: +```yaml +output: + base_path: ~/reference-library/exports/ + project_files: + structure: nested_by_topic + include_metadata: true +quality: + min_score_for_export: 0.80 + auto_approve_tier1_sources: true +``` + +## Example Usage + +``` +/markdown-exporter project_files +/markdown-exporter fine_tuning --topic prompt-engineering +/markdown-exporter project_files --min-score 0.90 +``` diff --git a/custom-skills/90-reference-curator/commands/quality-reviewer.md b/custom-skills/90-reference-curator/commands/quality-reviewer.md new file mode 100644 index 0000000..d0c141d --- /dev/null +++ b/custom-skills/90-reference-curator/commands/quality-reviewer.md @@ -0,0 +1,122 @@ +--- +description: Review distilled content quality. Multi-criteria scoring with decision routing (approve/refactor/deep_research/reject). 
+argument-hint: <distill-id|all-pending> [--auto-approve] [--threshold 0.85] +allowed-tools: Read, Write, Bash, Glob, Grep +--- + +# Quality Reviewer + +Review distilled content for quality and route decisions. + +## Arguments +- `<distill-id|all-pending>`: Specific distill ID or review all pending +- `--auto-approve`: Auto-approve scores above threshold +- `--threshold`: Approval threshold (default: 0.85) + +## Review Criteria + +### Scoring Dimensions + +| Criterion | Weight | Checks | +|-----------|--------|--------| +| **Accuracy** | 25% | Factual correctness, up-to-date info, proper attribution | +| **Completeness** | 20% | Covers key concepts, includes examples, addresses edge cases | +| **Clarity** | 20% | Clear structure, concise language, logical flow | +| **Prompt Engineering Quality** | 25% | Demonstrates techniques, shows before/after, actionable | +| **Usability** | 10% | Easy to reference, searchable keywords, appropriate length | + +### Score Calculation + +```python +score = ( + accuracy * 0.25 + + completeness * 0.20 + + clarity * 0.20 + + prompt_eng_quality * 0.25 + + usability * 0.10 +) +``` + +## Decision Thresholds + +| Score | Decision | Action | +|-------|----------|--------| +| ≥ 0.85 | **APPROVE** | Ready for export | +| 0.60-0.84 | **REFACTOR** | Re-distill with feedback | +| 0.40-0.59 | **DEEP_RESEARCH** | Gather more sources | +| < 0.40 | **REJECT** | Archive (low quality) | + +## Review Process + +### 1. Load Distilled Content +```bash +source ~/.reference-curator.env +mysql -u $MYSQL_USER -p"$MYSQL_PASSWORD" reference_library -e \ + "SELECT * FROM distilled_content WHERE distill_id = $ID" +``` + +### 2. Evaluate Each Criterion + +Score 0.0 to 1.0 for each dimension. + +### 3. Generate Assessment + +```json +{ + "accuracy": 0.90, + "completeness": 0.85, + "clarity": 0.95, + "prompt_engineering_quality": 0.88, + "usability": 0.82, + "overall_score": 0.88, + "decision": "approve", + "feedback": "Well-structured with clear examples...", + "refactor_instructions": null +} +``` + +### 4.
Log Review + +```sql +INSERT INTO review_logs + (distill_id, review_round, reviewer_type, quality_score, + assessment, decision, feedback, refactor_instructions) +VALUES + (?, 1, 'claude_review', ?, ?, ?, ?, ?); +``` + +### 5. Update Status + +```sql +UPDATE distilled_content +SET review_status = 'approved' +WHERE distill_id = ?; +``` + +## Decision Routing + +**APPROVE → markdown-exporter** +Content is ready for export. + +**REFACTOR → content-distiller** +Re-distill with specific feedback: +```json +{"refactor_instructions": "Add more code examples for the API authentication section"} +``` + +**DEEP_RESEARCH → web-crawler** +Need more sources: +```json +{"research_queries": ["Claude API authentication examples", "Anthropic SDK best practices"]} +``` + +**REJECT → Archive** +Mark as rejected, optionally note reason. + +## Example Usage + +``` +/quality-reviewer 15 +/quality-reviewer all-pending --auto-approve +/quality-reviewer 42 --threshold 0.80 +``` diff --git a/custom-skills/90-reference-curator/commands/reference-discovery.md b/custom-skills/90-reference-curator/commands/reference-discovery.md new file mode 100644 index 0000000..10698f8 --- /dev/null +++ b/custom-skills/90-reference-curator/commands/reference-discovery.md @@ -0,0 +1,72 @@ +--- +description: Search and discover authoritative reference sources for a topic. Validates credibility, generates URL manifests for crawling. +argument-hint: [--vendor anthropic|openai|google] [--max-sources 10] +allowed-tools: WebSearch, WebFetch, Read, Write, Bash, Grep, Glob +--- + +# Reference Discovery + +Search for authoritative reference sources on a given topic. + +## Arguments +- ``: Required. The subject to find references for (e.g., "Claude system prompts") +- `--vendor`: Filter to specific vendor (anthropic, openai, google) +- `--max-sources`: Maximum sources to discover (default: 10) + +## Workflow + +### 1. 
Search Strategy +Use multiple search approaches: +- Official documentation sites +- Engineering blogs +- GitHub repositories +- Research papers +- Community guides + +### 2. Source Validation + +Evaluate each source for credibility: + +| Tier | Description | Examples | +|------|-------------|----------| +| tier1_official | Vendor documentation | docs.anthropic.com | +| tier2_verified | Verified engineering blogs | anthropic.com/news | +| tier3_community | Community resources | GitHub repos, tutorials | + +### 3. Output Manifest + +Generate `manifest.json` in working directory: + +```json +{ + "topic": "user provided topic", + "discovered_at": "ISO timestamp", + "sources": [ + { + "url": "https://...", + "title": "Page title", + "source_type": "official_docs", + "credibility_tier": "tier1_official", + "vendor": "anthropic" + } + ] +} +``` + +### 4. Store Sources + +Insert discovered sources into MySQL: +```bash +source ~/.envrc +mysql -u $MYSQL_USER -p"$MYSQL_PASSWORD" reference_library +``` + +Use the `sources` table schema from `~/.config/reference-curator/`. + +## Example Usage + +``` +/reference-discovery Claude's system prompt best practices +/reference-discovery MCP server development --vendor anthropic +/reference-discovery prompt engineering --max-sources 20 +``` diff --git a/custom-skills/90-reference-curator/commands/web-crawler.md b/custom-skills/90-reference-curator/commands/web-crawler.md new file mode 100644 index 0000000..a25120f --- /dev/null +++ b/custom-skills/90-reference-curator/commands/web-crawler.md @@ -0,0 +1,79 @@ +--- +description: Crawl URLs with intelligent backend selection. Auto-selects Node.js, Python aiohttp, Scrapy, or Firecrawl based on site characteristics. +argument-hint: [--crawler nodejs|aiohttp|scrapy|firecrawl] [--max-pages 50] +allowed-tools: Bash, Read, Write, WebFetch, Glob, Grep +--- + +# Web Crawler Orchestrator + +Crawl web content with intelligent backend selection. 
+ +## Arguments +- `<url|manifest>`: Single URL or path to manifest.json from reference-discovery +- `--crawler`: Force specific crawler (nodejs, aiohttp, scrapy, firecrawl) +- `--max-pages`: Maximum pages to crawl (default: 50) + +## Intelligent Crawler Selection + +Auto-select based on site characteristics: + +| Crawler | Best For | Auto-Selected When | +|---------|----------|-------------------| +| **Node.js** (default) | Small docs sites | ≤50 pages, static content | +| **Python aiohttp** | Technical docs | ≤200 pages, needs SEO data | +| **Scrapy** | Enterprise crawls | >200 pages, multi-domain | +| **Firecrawl MCP** | Dynamic sites | SPAs, JS-rendered content | + +### Detection Flow +``` +[URL] → Is SPA/React/Vue? → Firecrawl + → >200 pages? → Scrapy + → Needs SEO? → aiohttp + → Default → Node.js +``` + +## Crawler Commands + +**Node.js:** +```bash +cd ~/Project/our-seo-agent/util/js-crawler +node src/crawler.js <URL> --max-pages 50 +``` + +**Python aiohttp:** +```bash +cd ~/Project/our-seo-agent +python -m seo_agent.crawler --url <URL> --max-pages 100 +``` + +**Scrapy:** +```bash +cd ~/Project/our-seo-agent +scrapy crawl seo_spider -a start_url=<URL> -a max_pages=500 +``` + +**Firecrawl MCP:** +Use MCP tools: `firecrawl_scrape`, `firecrawl_crawl`, `firecrawl_map` + +## Output + +Save crawled content to `~/reference-library/raw/YYYY/MM/`: +- One markdown file per page +- Filename: `{url_hash}.md` + +Generate crawl manifest: +```json +{ + "crawl_date": "ISO timestamp", + "crawler_used": "nodejs", + "total_crawled": 45, + "documents": [...]
+}
+```
+
+## Rate Limiting
+
+All crawlers respect:
+- 20 requests/minute
+- 3 concurrent requests
+- Exponential backoff on 429/5xx
diff --git a/custom-skills/90-reference-curator/install.sh b/custom-skills/90-reference-curator/install.sh
new file mode 100755
index 0000000..a19be10
--- /dev/null
+++ b/custom-skills/90-reference-curator/install.sh
@@ -0,0 +1,747 @@
+#!/bin/bash
+#
+# Reference Curator - Portable Installation Script
+#
+# Installs the Reference Curator skill suite on any machine.
+# Handles environment configuration, database setup, and skill registration.
+#
+# Usage:
+#   ./install.sh              # Interactive installation
+#   ./install.sh --check      # Verify installation status
+#   ./install.sh --minimal    # Firecrawl-only mode (no local crawlers)
+#   ./install.sh --uninstall  # Remove installation
+#
+
+set -e
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+CYAN='\033[0;36m'
+BOLD='\033[1m'
+NC='\033[0m' # No Color
+
+# Paths
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ENV_FILE="$HOME/.reference-curator.env"
+CONFIG_DIR="$HOME/.config/reference-curator"
+CLAUDE_COMMANDS_DIR="$HOME/.claude/commands"
+CLAUDE_SKILLS_DIR="$HOME/.claude/skills"
+
+# Default values
+DEFAULT_LIBRARY_PATH="$HOME/reference-library"
+DEFAULT_CRAWLER="firecrawl"
+
+# Skills and commands to register.
+# Each SKILLS entry is "<skill name>:<folder under SCRIPT_DIR>".
+SKILLS=(
+    "reference-discovery:01-reference-discovery"
+    "web-crawler-orchestrator:02-web-crawler-orchestrator"
+    "content-repository:03-content-repository"
+    "content-distiller:04-content-distiller"
+    "quality-reviewer:05-quality-reviewer"
+    "markdown-exporter:06-markdown-exporter"
+)
+
+# Each COMMANDS entry is the basename of commands/<name>.md.
+COMMANDS=(
+    "reference-discovery"
+    "web-crawler"
+    "content-repository"
+    "content-distiller"
+    "quality-reviewer"
+    "markdown-exporter"
+)
+
+# ============================================================================
+# Helper Functions
+# ============================================================================
+
+# Print the installer banner.
+print_header() {
+    echo ""
+    echo -e "${BLUE}╔══════════════════════════════════════════════════════╗${NC}"
+    echo -e "${BLUE}║ ${BOLD}Reference Curator - Portable Installation${NC}${BLUE} ║${NC}"
+    echo -e "${BLUE}╚══════════════════════════════════════════════════════╝${NC}"
+    echo ""
+}
+
+# Print a major step title ($1) between two rules.
+print_step() {
+    echo -e "\n${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
+    echo -e "${BOLD}$1${NC}"
+    echo -e "${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
+}
+
+# Print a sub-step title ($1).
+print_substep() {
+    echo -e "\n${BLUE}▶${NC} $1"
+}
+
+# Status line helpers: success / warning / error / info, message in $1.
+print_success() {
+    echo -e " ${GREEN}✓${NC} $1"
+}
+
+print_warning() {
+    echo -e " ${YELLOW}⚠${NC} $1"
+}
+
+print_error() {
+    echo -e " ${RED}✗${NC} $1"
+}
+
+print_info() {
+    echo -e " ${BLUE}ℹ${NC} $1"
+}
+
+# Prompt for a value and store it in the global variable named by $3.
+#   $1 - prompt text
+#   $2 - default used when the user just presses Enter (may be empty)
+#   $3 - name of the variable to assign
+# The value is assigned with `printf -v`, never `eval`, so quotes, `$`,
+# backticks and backslashes typed by the user are stored literally instead
+# of being executed or mangled.
+prompt_with_default() {
+    local prompt="$1"
+    local default="$2"
+    local var_name="$3"
+    local input
+
+    if [[ -n "$default" ]]; then
+        read -r -p " $prompt [$default]: " input
+        printf -v "$var_name" '%s' "${input:-$default}"
+    else
+        read -r -p " $prompt: " input
+        printf -v "$var_name" '%s' "$input"
+    fi
+}
+
+# Prompt for a secret without echoing it and store it in the variable named
+# by $2. Uses `read -r` + `printf -v` so the password is kept byte-for-byte.
+prompt_password() {
+    local prompt="$1"
+    local var_name="$2"
+    local input
+
+    read -r -s -p " $prompt: " input
+    echo ""
+    printf -v "$var_name" '%s' "$input"
+}
+
+# Ask a yes/no question ($1). $2 is the default ("y" or anything else for no).
+# Returns 0 for yes, non-zero for no, so it can be used directly in `if`.
+prompt_yes_no() {
+    local prompt="$1"
+    local default="$2"
+    local input
+
+    if [[ "$default" == "y" ]]; then
+        read -r -p " $prompt [Y/n]: " input
+        [[ ! "$input" =~ ^[Nn] ]]
+    else
+        read -r -p " $prompt [y/N]: " input
+        [[ "$input" =~ ^[Yy] ]]
+    fi
+}
+
+# ============================================================================
+# Check Prerequisites
+# ============================================================================
+
+# Verify OS, mysql client and ~/.claude, and deal with a previous install.
+# Returns 1 (which aborts the installer under `set -e`) on an unsupported OS.
+check_prerequisites() {
+    print_step "Step 1: Checking Prerequisites"
+
+    local all_ok=true
+
+    # Check OS
+    print_substep "Operating System"
+    case "$(uname -s)" in
+        Darwin)
+            print_success "macOS detected"
+            ;;
+        Linux)
+            print_success "Linux detected"
+            ;;
+        *)
+            print_error "Unsupported OS: $(uname -s)"
+            all_ok=false
+            ;;
+    esac
+
+    # Check MySQL
+    print_substep "MySQL Client"
+    if command -v mysql &> /dev/null; then
+        local mysql_version
+        mysql_version=$(mysql --version 2>/dev/null | head -1)
+        print_success "MySQL client installed: $mysql_version"
+    else
+        print_warning "MySQL client not found - database features will be limited"
+        print_info "Install with: brew install mysql (macOS) or apt install mysql-client (Linux)"
+    fi
+
+    # Check Claude Code directories
+    print_substep "Claude Code Directories"
+    if [[ -d "$HOME/.claude" ]]; then
+        print_success "~/.claude directory exists"
+    else
+        print_warning "~/.claude directory not found - will be created"
+    fi
+
+    # Check for existing installation
+    print_substep "Existing Installation"
+    if [[ -f "$ENV_FILE" ]]; then
+        print_warning "Existing installation found at $ENV_FILE"
+        if prompt_yes_no "Overwrite existing configuration?" "n"; then
+            rm -f "$ENV_FILE"
+            print_info "Existing configuration removed"
+        else
+            # Step 2 always rewrites $ENV_FILE from the answers given, so no
+            # merge takes place. Keep the old file (mode preserved by -p)
+            # instead of promising a merge that never happens.
+            cp -p "$ENV_FILE" "$ENV_FILE.bak"
+            print_info "Existing configuration backed up to $ENV_FILE.bak (Step 2 writes a new file)"
+        fi
+    else
+        print_success "No existing installation found"
+    fi
+
+    echo ""
+    if $all_ok; then
+        return 0
+    else
+        return 1
+    fi
+}
+
+# ============================================================================
+# Configure Environment
+# ============================================================================
+
+# Interactively collect library path, MySQL and crawler settings, write them
+# to $ENV_FILE and source the result.
+# Sets globals: LIBRARY_PATH, SETUP_MYSQL, MYSQL_HOST, MYSQL_PORT, MYSQL_USER,
+# MYSQL_PASSWORD, CRAWLER_PATH, ENABLE_LOCAL_CRAWLERS, DEFAULT_CRAWLER.
+configure_environment() {
+    print_step "Step 2: Environment Configuration"
+
+    echo -e "\n ${BOLD}Environment variables will be saved to:${NC} $ENV_FILE"
+    echo ""
+
+    # Library path
+    print_substep "Reference Library Path"
+    prompt_with_default "Storage directory for reference library" "$DEFAULT_LIBRARY_PATH" "LIBRARY_PATH"
+    # `read` does no tilde expansion; without this a typed "~/refs" would
+    # later create a directory literally named "~".
+    LIBRARY_PATH="${LIBRARY_PATH/#\~/$HOME}"
+
+    # MySQL configuration
+    print_substep "MySQL Configuration"
+
+    if prompt_yes_no "Configure MySQL database?" "y"; then
+        SETUP_MYSQL=true
+        prompt_with_default "MySQL host" "localhost" "MYSQL_HOST"
+        prompt_with_default "MySQL port" "3306" "MYSQL_PORT"
+        prompt_with_default "MySQL username" "" "MYSQL_USER"
+        prompt_password "MySQL password" "MYSQL_PASSWORD"
+
+        if [[ -z "$MYSQL_PASSWORD" ]]; then
+            print_warning "No password provided - MySQL connection may fail"
+        fi
+    else
+        SETUP_MYSQL=false
+        print_info "Skipping MySQL setup - database features will be unavailable"
+    fi
+
+    # Crawler configuration
+    print_substep "Crawler Configuration"
+    echo ""
+    echo " Available crawler backends:"
+    echo " 1) Firecrawl MCP only (recommended for most users)"
+    echo " 2) Local crawlers + Firecrawl (requires our-seo-agent project)"
+    echo ""
+
+    local crawler_option
+    read -r -p " Select option [1]: " crawler_option
+    crawler_option=${crawler_option:-1}
+
+    if [[ "$crawler_option" == "2" ]]; then
+        prompt_with_default "Path to crawler project (our-seo-agent)" "$HOME/Project/our-seo-agent" "CRAWLER_PATH"
+        CRAWLER_PATH="${CRAWLER_PATH/#\~/$HOME}"
+
+        if [[ -d "$CRAWLER_PATH" ]]; then
+            print_success "Crawler project found at $CRAWLER_PATH"
+            ENABLE_LOCAL_CRAWLERS=true
+            DEFAULT_CRAWLER="nodejs"
+        else
+            print_warning "Crawler project not found - using Firecrawl only"
+            ENABLE_LOCAL_CRAWLERS=false
+            DEFAULT_CRAWLER="firecrawl"
+        fi
+    else
+        ENABLE_LOCAL_CRAWLERS=false
+        DEFAULT_CRAWLER="firecrawl"
+        print_info "Using Firecrawl MCP as default crawler"
+    fi
+
+    # Save environment file
+    print_substep "Saving Configuration"
+
+    # Create the file empty and owner-only *before* the password is written,
+    # so the secret is never readable by other users, even briefly.
+    : > "$ENV_FILE"
+    chmod 600 "$ENV_FILE"
+
+    # printf %q shell-escapes each value, so passwords or paths containing
+    # quotes, spaces, `$` or backticks round-trip safely through `source`.
+    # The timestamp uses an explicit format because `date -Iseconds` is not
+    # available in every BSD/macOS `date`.
+    {
+        echo "# Reference Curator Environment Configuration"
+        echo "# Generated: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
+        echo "#"
+        echo "# Source this file in your shell profile:"
+        echo "# echo 'source ~/.reference-curator.env' >> ~/.zshrc"
+        echo "#"
+        echo ""
+        echo "# Storage paths"
+        printf 'export REFERENCE_LIBRARY_PATH=%q\n' "$LIBRARY_PATH"
+        echo ""
+        echo "# MySQL configuration"
+        printf 'export MYSQL_HOST=%q\n' "${MYSQL_HOST:-localhost}"
+        printf 'export MYSQL_PORT=%q\n' "${MYSQL_PORT:-3306}"
+        printf 'export MYSQL_USER=%q\n' "${MYSQL_USER:-}"
+        printf 'export MYSQL_PASSWORD=%q\n' "${MYSQL_PASSWORD:-}"
+        echo ""
+        echo "# Crawler configuration"
+        printf 'export DEFAULT_CRAWLER=%q\n' "$DEFAULT_CRAWLER"
+        printf 'export CRAWLER_PROJECT_PATH=%q\n' "${CRAWLER_PATH:-}"
+        printf 'export NODEJS_CRAWLER_ENABLED=%q\n' "${ENABLE_LOCAL_CRAWLERS:-false}"
+        printf 'export AIOHTTP_CRAWLER_ENABLED=%q\n' "${ENABLE_LOCAL_CRAWLERS:-false}"
+        printf 'export SCRAPY_CRAWLER_ENABLED=%q\n' "${ENABLE_LOCAL_CRAWLERS:-false}"
+    } >> "$ENV_FILE"
+
+    print_success "Environment saved to $ENV_FILE"
+
+    # Source the environment
+    source "$ENV_FILE"
+}
+
+# ============================================================================
+# Install Configuration Files
+# ============================================================================
+
+# Copy the shared YAML configs from the repository into $CONFIG_DIR.
+# A missing source file is reported but does not stop the installation.
+install_configs() {
+    print_step "Step 3: Installing Configuration Files"
+
+    print_substep "Creating directories"
+    mkdir -p "$CONFIG_DIR"
+    print_success "$CONFIG_DIR"
+
+    print_substep "Copying configuration files"
+    local config
+    for config in db_config.yaml crawl_config.yaml export_config.yaml; do
+        if [[ -f "$SCRIPT_DIR/shared/config/$config" ]]; then
+            cp "$SCRIPT_DIR/shared/config/$config" "$CONFIG_DIR/"
+            print_success "$config"
+        else
+            print_error "$config - source not found"
+        fi
+    done
+}
+
+# 
============================================================================ +# Create Storage Directories +# ============================================================================ + +create_directories() { + print_step "Step 4: Creating Storage Directories" + + local lib_path="${REFERENCE_LIBRARY_PATH:-$DEFAULT_LIBRARY_PATH}" + + for dir in raw processed exports backups; do + mkdir -p "$lib_path/$dir" + print_success "$lib_path/$dir" + done +} + +# ============================================================================ +# Setup MySQL Database +# ============================================================================ + +setup_database() { + print_step "Step 5: Setting Up MySQL Database" + + if [[ "$SETUP_MYSQL" != "true" ]]; then + print_info "MySQL setup skipped" + return 0 + fi + + # Test connection + print_substep "Testing MySQL connection" + + if mysql -h "$MYSQL_HOST" -P "$MYSQL_PORT" -u "$MYSQL_USER" -p"$MYSQL_PASSWORD" -e "SELECT 1" &>/dev/null; then + print_success "MySQL connection successful" + else + print_error "MySQL connection failed" + print_info "Check your credentials and try again" + return 1 + fi + + # Check if database exists + print_substep "Checking database" + + if mysql -h "$MYSQL_HOST" -P "$MYSQL_PORT" -u "$MYSQL_USER" -p"$MYSQL_PASSWORD" -e "USE reference_library" &>/dev/null; then + print_info "Database 'reference_library' already exists" + + local table_count=$(mysql -h "$MYSQL_HOST" -P "$MYSQL_PORT" -u "$MYSQL_USER" -p"$MYSQL_PASSWORD" -N -e \ + "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='reference_library'" 2>/dev/null) + + if [[ "$table_count" -ge 9 ]]; then + print_success "Schema already applied ($table_count tables)" + return 0 + fi + else + # Try to create database + print_substep "Creating database" + + if mysql -h "$MYSQL_HOST" -P "$MYSQL_PORT" -u "$MYSQL_USER" -p"$MYSQL_PASSWORD" -e \ + "CREATE DATABASE IF NOT EXISTS reference_library CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci" 
&>/dev/null; then + print_success "Database created" + else + print_warning "Could not create database - may need admin privileges" + echo "" + + if prompt_yes_no "Do you have MySQL root access to create the database?" "n"; then + prompt_password "MySQL root password" "ROOT_PASSWORD" + + mysql -h "$MYSQL_HOST" -P "$MYSQL_PORT" -u root -p"$ROOT_PASSWORD" << EOF +CREATE DATABASE IF NOT EXISTS reference_library CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; +GRANT ALL PRIVILEGES ON reference_library.* TO '$MYSQL_USER'@'%'; +FLUSH PRIVILEGES; +EOF + print_success "Database created and privileges granted" + else + print_error "Cannot proceed without database" + return 1 + fi + fi + fi + + # Apply schema + print_substep "Applying schema" + + if [[ -f "$SCRIPT_DIR/shared/schema.sql" ]]; then + if mysql -h "$MYSQL_HOST" -P "$MYSQL_PORT" -u "$MYSQL_USER" -p"$MYSQL_PASSWORD" reference_library < "$SCRIPT_DIR/shared/schema.sql" 2>/dev/null; then + print_success "Schema applied successfully" + else + print_error "Failed to apply schema" + return 1 + fi + else + print_error "Schema file not found: $SCRIPT_DIR/shared/schema.sql" + return 1 + fi +} + +# ============================================================================ +# Register Claude Code Commands +# ============================================================================ + +register_commands() { + print_step "Step 6: Registering Claude Code Commands" + + mkdir -p "$CLAUDE_COMMANDS_DIR" + + for cmd in "${COMMANDS[@]}"; do + local source="$SCRIPT_DIR/commands/${cmd}.md" + local target="$CLAUDE_COMMANDS_DIR/${cmd}.md" + + if [[ -f "$source" ]]; then + ln -sf "$source" "$target" + print_success "/$cmd" + else + print_warning "/$cmd - source not found: $source" + fi + done +} + +# ============================================================================ +# Register Claude Desktop Skills +# ============================================================================ + +register_skills() { + print_step "Step 7: 
Registering Claude Desktop Skills" + + mkdir -p "$CLAUDE_SKILLS_DIR" + + for skill_entry in "${SKILLS[@]}"; do + IFS=':' read -r skill_name skill_folder <<< "$skill_entry" + local target="$SCRIPT_DIR/$skill_folder/desktop" + + if [[ -d "$target" ]]; then + ln -sf "$target" "$CLAUDE_SKILLS_DIR/$skill_name" + print_success "$skill_name" + else + print_warning "$skill_name - target not found: $target" + fi + done +} + +# ============================================================================ +# Post-Installation +# ============================================================================ + +post_install() { + print_step "Installation Complete!" + + echo "" + echo -e "${GREEN}╔══════════════════════════════════════════════════════╗${NC}" + echo -e "${GREEN}║ Installation Successful! ║${NC}" + echo -e "${GREEN}╚══════════════════════════════════════════════════════╝${NC}" + echo "" + + echo -e "${BOLD}Next Steps:${NC}" + echo "" + echo " 1. Add environment to your shell profile:" + echo -e " ${CYAN}echo 'source ~/.reference-curator.env' >> ~/.zshrc${NC}" + echo "" + echo " 2. Reload your shell or run:" + echo -e " ${CYAN}source ~/.reference-curator.env${NC}" + echo "" + echo " 3. Verify installation:" + echo -e " ${CYAN}./install.sh --check${NC}" + echo "" + echo " 4. Start using the skills:" + echo -e " ${CYAN}/reference-discovery Claude system prompts${NC}" + echo "" + + if [[ "$DEFAULT_CRAWLER" == "firecrawl" ]]; then + echo -e "${YELLOW}Note:${NC} Using Firecrawl MCP as default crawler." + echo " Make sure firecrawl MCP server is configured in Claude Code settings." 
+ echo "" + fi +} + +# ============================================================================ +# Check Installation Status +# ============================================================================ + +check_status() { + print_header + echo -e "${BOLD}Installation Status Check${NC}" + echo "" + + local all_ok=true + + # Check environment file + print_substep "Environment Configuration" + if [[ -f "$ENV_FILE" ]]; then + print_success "$ENV_FILE" + source "$ENV_FILE" + else + print_error "$ENV_FILE - not found" + all_ok=false + fi + + # Check config files + print_substep "Configuration Files ($CONFIG_DIR)" + for config in db_config.yaml crawl_config.yaml export_config.yaml; do + if [[ -f "$CONFIG_DIR/$config" ]]; then + print_success "$config" + else + print_error "$config - missing" + all_ok=false + fi + done + + # Check storage directories + print_substep "Storage Directories" + local lib_path="${REFERENCE_LIBRARY_PATH:-$DEFAULT_LIBRARY_PATH}" + for dir in raw processed exports; do + if [[ -d "$lib_path/$dir" ]]; then + print_success "$lib_path/$dir" + else + print_error "$lib_path/$dir - missing" + all_ok=false + fi + done + + # Check MySQL + print_substep "MySQL Database" + if [[ -n "$MYSQL_USER" ]] && command -v mysql &>/dev/null; then + if mysql -h "${MYSQL_HOST:-localhost}" -P "${MYSQL_PORT:-3306}" -u "$MYSQL_USER" -p"$MYSQL_PASSWORD" -e "USE reference_library" &>/dev/null; then + local table_count=$(mysql -h "${MYSQL_HOST:-localhost}" -P "${MYSQL_PORT:-3306}" -u "$MYSQL_USER" -p"$MYSQL_PASSWORD" -N -e \ + "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='reference_library'" 2>/dev/null) + print_success "Database accessible ($table_count tables)" + else + print_warning "Database 'reference_library' not accessible" + all_ok=false + fi + else + print_info "MySQL not configured or client not installed" + fi + + # Check Claude Code commands + print_substep "Claude Code Commands ($CLAUDE_COMMANDS_DIR)" + for cmd in "${COMMANDS[@]}"; do + 
if [[ -L "$CLAUDE_COMMANDS_DIR/${cmd}.md" ]] || [[ -f "$CLAUDE_COMMANDS_DIR/${cmd}.md" ]]; then + print_success "/$cmd" + else + print_error "/$cmd - not registered" + all_ok=false + fi + done + + # Check Claude Desktop skills + print_substep "Claude Desktop Skills ($CLAUDE_SKILLS_DIR)" + for skill_entry in "${SKILLS[@]}"; do + IFS=':' read -r skill_name skill_folder <<< "$skill_entry" + if [[ -L "$CLAUDE_SKILLS_DIR/$skill_name" ]]; then + print_success "$skill_name" + else + print_error "$skill_name - not registered" + all_ok=false + fi + done + + echo "" + if $all_ok; then + echo -e "${GREEN}All components installed correctly.${NC}" + return 0 + else + echo -e "${YELLOW}Some components need attention. Run ./install.sh to fix.${NC}" + return 1 + fi +} + +# ============================================================================ +# Uninstall +# ============================================================================ + +uninstall() { + print_header + echo -e "${YELLOW}${BOLD}Uninstall Reference Curator${NC}" + echo "" + + echo "This will remove:" + echo " • Environment file: $ENV_FILE" + echo " • Config directory: $CONFIG_DIR" + echo " • Claude Code commands" + echo " • Claude Desktop skill symlinks" + echo "" + echo -e "${GREEN}This will NOT remove:${NC}" + echo " • Storage directories (your data is safe)" + echo " • MySQL database (your data is safe)" + echo "" + + if ! prompt_yes_no "Proceed with uninstall?" "n"; then + echo "Uninstall cancelled." 
+ exit 0 + fi + + echo "" + + # Remove environment file + if [[ -f "$ENV_FILE" ]]; then + rm -f "$ENV_FILE" + print_success "Removed $ENV_FILE" + fi + + # Remove config directory + if [[ -d "$CONFIG_DIR" ]]; then + rm -rf "$CONFIG_DIR" + print_success "Removed $CONFIG_DIR" + fi + + # Remove Claude Code commands + for cmd in "${COMMANDS[@]}"; do + local target="$CLAUDE_COMMANDS_DIR/${cmd}.md" + if [[ -L "$target" ]] || [[ -f "$target" ]]; then + rm -f "$target" + print_success "Removed /$cmd command" + fi + done + + # Remove Claude Desktop skills + for skill_entry in "${SKILLS[@]}"; do + IFS=':' read -r skill_name skill_folder <<< "$skill_entry" + local target="$CLAUDE_SKILLS_DIR/$skill_name" + if [[ -L "$target" ]]; then + rm -f "$target" + print_success "Removed $skill_name skill" + fi + done + + echo "" + echo -e "${GREEN}Uninstall complete.${NC}" + echo "" + echo "To reinstall, run: ./install.sh" +} + +# ============================================================================ +# Main Installation +# ============================================================================ + +install() { + print_header + + echo "This installer will set up the Reference Curator skill suite." + echo "" + echo "Components to install:" + echo " • Environment configuration" + echo " • Configuration files" + echo " • Storage directories" + echo " • MySQL database (optional)" + echo " • Claude Code commands (6)" + echo " • Claude Desktop skills (6)" + echo "" + + if ! prompt_yes_no "Continue with installation?" "y"; then + echo "Installation cancelled." 
+ exit 0 + fi + + check_prerequisites + configure_environment + install_configs + create_directories + + if [[ "$SETUP_MYSQL" == "true" ]]; then + setup_database + fi + + register_commands + register_skills + post_install +} + +# ============================================================================ +# Minimal Installation (Firecrawl only) +# ============================================================================ + +install_minimal() { + print_header + + echo -e "${BOLD}Minimal Installation Mode${NC}" + echo "" + echo "This will install Reference Curator with Firecrawl MCP only." + echo "No local crawlers or MySQL database will be configured." + echo "" + + if ! prompt_yes_no "Continue?" "y"; then + echo "Installation cancelled." + exit 0 + fi + + # Minimal environment + SETUP_MYSQL=false + ENABLE_LOCAL_CRAWLERS=false + DEFAULT_CRAWLER="firecrawl" + LIBRARY_PATH="$DEFAULT_LIBRARY_PATH" + + cat > "$ENV_FILE" << EOF +# Reference Curator Environment Configuration (Minimal) +# Generated: $(date -Iseconds) + +export REFERENCE_LIBRARY_PATH="$LIBRARY_PATH" +export DEFAULT_CRAWLER="firecrawl" +EOF + + chmod 600 "$ENV_FILE" + source "$ENV_FILE" + + install_configs + create_directories + register_commands + register_skills + post_install +} + +# ============================================================================ +# Entry Point +# ============================================================================ + +case "${1:-}" in + --check) + check_status + ;; + --uninstall) + uninstall + ;; + --minimal) + install_minimal + ;; + --help|-h) + echo "Reference Curator - Portable Installation Script" + echo "" + echo "Usage:" + echo " ./install.sh Interactive installation" + echo " ./install.sh --check Check installation status" + echo " ./install.sh --minimal Firecrawl-only mode (no MySQL)" + echo " ./install.sh --uninstall Remove installation" + echo " ./install.sh --help Show this help" + ;; + *) + install + ;; +esac diff --git 
a/custom-skills/90-reference-curator/shared/config/crawl_config.yaml b/custom-skills/90-reference-curator/shared/config/crawl_config.yaml new file mode 100644 index 0000000..becfbcf --- /dev/null +++ b/custom-skills/90-reference-curator/shared/config/crawl_config.yaml @@ -0,0 +1,139 @@ +# Reference Curator - Crawl Configuration +# Location: ~/.config/reference-curator/crawl_config.yaml +# +# Environment variables (set in ~/.reference-curator.env): +# CRAWLER_PROJECT_PATH - Path to crawler project (optional) +# REFERENCE_LIBRARY_PATH - Path to reference library storage (optional) + +# Default crawler backend +# Options: nodejs, aiohttp, scrapy, firecrawl +# Set to "firecrawl" if local crawlers are not available +default_crawler: ${DEFAULT_CRAWLER:-firecrawl} + +# Intelligent routing rules +# Claude will select the appropriate crawler based on these criteria +routing: + nodejs: + conditions: + - max_pages <= 50 + - single_domain == true + - no_javascript_rendering == true + description: "Fast, lightweight - best for small documentation sites" + + aiohttp: + conditions: + - max_pages <= 200 + - needs_async == true + - seo_extraction == true + description: "Async with SEO extraction - best for technical docs" + + scrapy: + conditions: + - max_pages > 200 + - multi_domain == true + - needs_pipeline == true + description: "Enterprise-grade - best for large sites with complex structure" + + firecrawl: + conditions: + - needs_javascript_rendering == true + - spa_site == true + - dynamic_content == true + description: "JS rendering - best for SPAs and dynamic content (MCP-based, always available)" + +# Crawler locations (configurable via environment) +# If CRAWLER_PROJECT_PATH is not set, only Firecrawl MCP will be available +crawlers: + nodejs: + enabled: ${NODEJS_CRAWLER_ENABLED:-false} + path: ${CRAWLER_PROJECT_PATH}/util/js-crawler/ + command: node src/crawler.js + install: npm install + + aiohttp: + enabled: ${AIOHTTP_CRAWLER_ENABLED:-false} + path: 
${CRAWLER_PROJECT_PATH}/ + command: python -m seo_agent.crawler + install: uv sync + + scrapy: + enabled: ${SCRAPY_CRAWLER_ENABLED:-false} + path: ${CRAWLER_PROJECT_PATH}/ + command: scrapy crawl seo_spider + install: uv sync + + firecrawl: + enabled: true # Always available via MCP + type: mcp + server: firecrawl + tools: + - firecrawl_scrape + - firecrawl_crawl + - firecrawl_map + install: "Configure firecrawl MCP server in Claude Code settings" + +# Rate limiting +rate_limit: + requests_per_minute: 20 + concurrent_requests: 3 + +# Retry settings +retry: + max_retries: 3 + backoff_multiplier: 2 + initial_delay_seconds: 10 + +# Default crawl options +default_options: + timeout: 30000 # milliseconds + max_depth: 3 + max_pages: 100 + respect_robots_txt: true + user_agent: "ReferenceBot/1.0" + +# Content processing +processing: + max_content_size_mb: 50 + supported_formats: + - html + - pdf + - markdown + + # Storage paths (configurable) + raw_content_dir: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/raw/ + processed_dir: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/processed/ + +# URL filtering +url_filter: + skip_extensions: + - .jpg + - .jpeg + - .png + - .gif + - .svg + - .webp + - .css + - .js + - .woff + - .woff2 + - .ico + skip_patterns: + - /wp-admin/ + - /wp-includes/ + - /login + - /logout + +# Site type detection hints +site_detection: + spa_indicators: + - "react" + - "angular" + - "vue" + - "next.js" + - "nuxt" + static_indicators: + - "hugo" + - "jekyll" + - "mkdocs" + - "docusaurus" + - "gitbook" diff --git a/custom-skills/90-reference-curator/shared/config/db_config.yaml b/custom-skills/90-reference-curator/shared/config/db_config.yaml new file mode 100644 index 0000000..4c307ef --- /dev/null +++ b/custom-skills/90-reference-curator/shared/config/db_config.yaml @@ -0,0 +1,31 @@ +# Reference Curator - Database Configuration +# Location: ~/.config/reference-curator/db_config.yaml +# +# Environment variables (set in ~/.reference-curator.env): +# 
MYSQL_USER - MySQL username (required) +# MYSQL_PASSWORD - MySQL password (required) +# MYSQL_HOST - MySQL host (optional, default: localhost) +# MYSQL_PORT - MySQL port (optional, default: 3306) + +mysql: + host: ${MYSQL_HOST:-localhost} + port: ${MYSQL_PORT:-3306} + database: reference_library + user: ${MYSQL_USER} + password: ${MYSQL_PASSWORD} + charset: utf8mb4 + + # Connection pool settings + pool_size: 5 + pool_recycle: 3600 + + # SSL (if needed for remote MySQL) + ssl: + enabled: false + ca_cert: null + +# Backup settings +backup: + enabled: true + path: ${REFERENCE_LIBRARY_PATH:-~/reference-library}/backups/ + retention_days: 30 diff --git a/custom-skills/90-reference-curator/shared/config/export_config.yaml b/custom-skills/90-reference-curator/shared/config/export_config.yaml new file mode 100644 index 0000000..3cab480 --- /dev/null +++ b/custom-skills/90-reference-curator/shared/config/export_config.yaml @@ -0,0 +1,46 @@ +# Reference Curator - Export Configuration +# Location: ~/.config/reference-curator/export_config.yaml +# +# Copy this file to ~/.config/reference-curator/export_config.yaml + +output: + base_path: ~/reference-library/exports/ + + # Project files format (for Claude Projects) + project_files: + structure: nested_by_topic # flat | nested_by_topic | nested_by_source + index_file: INDEX.md + include_metadata: true + max_file_size_kb: 500 + + # Fine-tuning dataset format + fine_tuning: + format: jsonl + include_system_prompt: true + system_prompt: "You are an expert on AI and prompt engineering." 
+    max_tokens_per_sample: 4096
+
+  # Knowledge base format
+  knowledge_base:
+    structure: flat
+    include_toc: true
+
+# Quality thresholds
+quality:
+  min_score_for_export: 0.80
+  require_human_review: false
+  auto_approve_tier1_sources: true
+  auto_approve_min_score: 0.80
+
+# Cross-reference settings
+cross_references:
+  enabled: true
+  min_concept_overlap: 2
+  max_related_docs: 5
+
+# Verification
+verification:
+  check_broken_links: true
+  validate_markdown: true
+  check_duplicates: true
+  max_allowed_duplicates: 0
diff --git a/custom-skills/90-reference-curator/shared/schema.sql b/custom-skills/90-reference-curator/shared/schema.sql
new file mode 100644
index 0000000..bb5a50c
--- /dev/null
+++ b/custom-skills/90-reference-curator/shared/schema.sql
@@ -0,0 +1,285 @@
+-- ===========================================
+-- Reference Library Database Schema
+-- Version: 1.0
+-- Purpose: Store and manage curated reference materials
+-- ===========================================
+
+CREATE DATABASE IF NOT EXISTS reference_library
+  CHARACTER SET utf8mb4
+  COLLATE utf8mb4_unicode_ci;
+
+USE reference_library;
+
+-- -----------------------------
+-- 1. Core Tables
+-- -----------------------------
+
+-- Origin of crawled material (one row per site / repository / publisher).
+-- IF NOT EXISTS: the installer re-applies this file when a previous run was
+-- interrupted, so every CREATE has to be safe to repeat.
+CREATE TABLE IF NOT EXISTS sources (
+  source_id INT AUTO_INCREMENT PRIMARY KEY,
+  source_name VARCHAR(255) NOT NULL,
+  source_type ENUM('official_docs', 'engineering_blog', 'research_paper',
+                   'github_repo', 'community_guide', 'pdf_document', 'api_reference') NOT NULL,
+  base_url VARCHAR(500),
+  credibility_tier ENUM('tier1_official', 'tier2_verified', 'tier3_community') DEFAULT 'tier3_community',
+  vendor VARCHAR(100), -- anthropic, openai, google, etc.
+  is_active BOOLEAN DEFAULT TRUE,
+  created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+  updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+
+  INDEX idx_vendor (vendor),
+  INDEX idx_source_type (source_type)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+-- One row per crawled document (or document version).
+CREATE TABLE IF NOT EXISTS documents (
+  doc_id INT AUTO_INCREMENT PRIMARY KEY,
+  source_id INT NOT NULL,
+
+  -- Document identification
+  title VARCHAR(500) NOT NULL,
+  url VARCHAR(1000),
+  url_hash CHAR(64) AS (SHA2(url, 256)) STORED, -- For deduplication (NULL when url is NULL)
+
+  -- Content metadata
+  doc_type ENUM('webpage', 'pdf', 'markdown', 'api_spec', 'code_sample') NOT NULL,
+  language ENUM('en', 'ko', 'mixed') DEFAULT 'en',
+  original_publish_date DATE,
+  last_modified_date DATE,
+
+  -- Crawl metadata
+  -- Explicit NULL DEFAULT NULL: with explicit_defaults_for_timestamp=OFF the
+  -- first bare TIMESTAMP column silently becomes
+  -- DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, which would reset
+  -- the crawl date on every unrelated update of the row.
+  crawl_date TIMESTAMP NULL DEFAULT NULL,
+  crawl_method ENUM('firecrawl', 'scrapy', 'aiohttp', 'nodejs', 'manual', 'api') DEFAULT 'aiohttp',
+  crawl_status ENUM('pending', 'completed', 'failed', 'stale') DEFAULT 'pending',
+
+  -- Storage
+  raw_content_path VARCHAR(500), -- Path to raw crawled file
+  raw_content_size INT, -- Bytes
+
+  -- Version tracking
+  -- NOTE(review): idx_url_hash is UNIQUE, so two versions of the same URL
+  -- cannot coexist as separate rows - confirm how previous_version_id is
+  -- meant to be used together with that constraint.
+  version INT DEFAULT 1,
+  previous_version_id INT,
+
+  created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+  updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+
+  FOREIGN KEY (source_id) REFERENCES sources(source_id) ON DELETE CASCADE,
+  FOREIGN KEY (previous_version_id) REFERENCES documents(doc_id) ON DELETE SET NULL,
+
+  UNIQUE INDEX idx_url_hash (url_hash),
+  INDEX idx_crawl_status (crawl_status),
+  INDEX idx_crawl_date (crawl_date)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+-- -----------------------------
+-- 2.
Content Processing Tables
+-- -----------------------------
+
+-- Distilled (summarised) form of a crawled document.
+-- IF NOT EXISTS keeps the script re-runnable after a partial application.
+CREATE TABLE IF NOT EXISTS distilled_content (
+  distill_id INT AUTO_INCREMENT PRIMARY KEY,
+  doc_id INT NOT NULL,
+
+  -- Distilled output
+  summary TEXT, -- Executive summary
+  key_concepts JSON, -- Extracted key terms and definitions
+  code_snippets JSON, -- Extracted code examples
+  structured_content MEDIUMTEXT, -- Full distilled markdown
+
+  -- Quality metrics
+  token_count_original INT,
+  token_count_distilled INT,
+  -- Distilled size as a percentage of the original.
+  -- NULLIF: an original count of 0 yields NULL instead of a division-by-zero
+  -- error under strict SQL mode.
+  -- DECIMAL(8,2): a distilled text more than 10x a very short original
+  -- (>= 1000 %) no longer overflows the column.
+  compression_ratio DECIMAL(8,2) AS (token_count_distilled / NULLIF(token_count_original, 0) * 100),
+
+  -- Processing metadata
+  distill_model VARCHAR(50), -- claude-opus-4-5, etc.
+  distill_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+
+  -- Review status
+  review_status ENUM('pending', 'in_review', 'approved', 'needs_refactor', 'rejected') DEFAULT 'pending',
+
+  FOREIGN KEY (doc_id) REFERENCES documents(doc_id) ON DELETE CASCADE,
+  INDEX idx_review_status (review_status)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+-- One row per review round of a distilled document.
+CREATE TABLE IF NOT EXISTS review_logs (
+  review_id INT AUTO_INCREMENT PRIMARY KEY,
+  distill_id INT NOT NULL,
+
+  -- Review details
+  review_round INT DEFAULT 1,
+  reviewer_type ENUM('auto_qa', 'human', 'claude_review') NOT NULL,
+
+  -- Quality assessment
+  quality_score DECIMAL(3,2), -- 0.00 - 1.00
+  assessment JSON, -- Detailed scoring breakdown
+  /*
+  Example assessment JSON:
+  {
+    "accuracy": 0.9,
+    "completeness": 0.85,
+    "clarity": 0.95,
+    "prompt_engineering_quality": 0.88,
+    "usability": 0.82
+  }
+  */
+
+  -- Review outcome
+  decision ENUM('approve', 'refactor', 'deep_research', 'reject') NOT NULL,
+  feedback TEXT,
+  refactor_instructions TEXT,
+  research_queries JSON, -- Additional search queries if deep_research needed
+
+  reviewed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+
+  FOREIGN KEY (distill_id) REFERENCES distilled_content(distill_id) ON DELETE CASCADE,
+  -- DECIMAL(3,2) alone would accept values up to 9.99; enforce the documented
+  -- range. (Enforced by MySQL 8.0.16+ and MariaDB 10.2+, parsed and ignored
+  -- by older MySQL.) NULL scores are still allowed.
+  CHECK (quality_score BETWEEN 0.00 AND 1.00),
+  INDEX idx_decision (decision)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+-- 
-----------------------------
+-- 3. Organization & Export Tables
+-- -----------------------------
+
+-- Topic hierarchy used to organise documents and exports.
+-- IF NOT EXISTS keeps the script re-runnable after a partial application.
+CREATE TABLE IF NOT EXISTS topics (
+  topic_id INT AUTO_INCREMENT PRIMARY KEY,
+  topic_name VARCHAR(255) NOT NULL,
+  topic_slug VARCHAR(100) NOT NULL, -- URL/folder-friendly
+  parent_topic_id INT,
+  description TEXT,
+
+  FOREIGN KEY (parent_topic_id) REFERENCES topics(topic_id) ON DELETE SET NULL,
+  UNIQUE INDEX idx_topic_slug (topic_slug)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+-- Many-to-many link between documents and topics.
+CREATE TABLE IF NOT EXISTS document_topics (
+  doc_id INT NOT NULL,
+  topic_id INT NOT NULL,
+  relevance_score DECIMAL(3,2) DEFAULT 1.00,
+
+  PRIMARY KEY (doc_id, topic_id),
+  FOREIGN KEY (doc_id) REFERENCES documents(doc_id) ON DELETE CASCADE,
+  FOREIGN KEY (topic_id) REFERENCES topics(topic_id) ON DELETE CASCADE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+-- One row per export run.
+CREATE TABLE IF NOT EXISTS export_jobs (
+  export_id INT AUTO_INCREMENT PRIMARY KEY,
+
+  -- Export configuration
+  export_name VARCHAR(255) NOT NULL,
+  export_type ENUM('project_files', 'fine_tuning', 'training_dataset', 'knowledge_base') NOT NULL,
+  output_format ENUM('markdown', 'jsonl', 'parquet', 'sqlite') DEFAULT 'markdown',
+
+  -- Scope
+  topic_filter JSON, -- Topic IDs to include
+  date_range_start DATE,
+  date_range_end DATE,
+  min_quality_score DECIMAL(3,2) DEFAULT 0.80,
+
+  -- Output
+  output_path VARCHAR(500),
+  total_documents INT,
+  total_tokens INT,
+
+  -- Status
+  status ENUM('pending', 'processing', 'completed', 'failed') DEFAULT 'pending',
+  -- Explicit NULL DEFAULT NULL: with explicit_defaults_for_timestamp=OFF a
+  -- bare TIMESTAMP is NOT NULL; the first one gets
+  -- DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP and later ones
+  -- a zero-date default, which makes CREATE TABLE fail under NO_ZERO_DATE.
+  -- A job that has not started or finished must be able to hold NULL here.
+  started_at TIMESTAMP NULL DEFAULT NULL,
+  completed_at TIMESTAMP NULL DEFAULT NULL,
+  error_message TEXT,
+
+  created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+-- -----------------------------
+-- 4.
Tracking & Monitoring Tables
+-- -----------------------------
+
+-- Re-crawl schedule per source.
+-- IF NOT EXISTS keeps the script re-runnable after a partial application.
+CREATE TABLE IF NOT EXISTS crawl_schedule (
+  schedule_id INT AUTO_INCREMENT PRIMARY KEY,
+  source_id INT NOT NULL,
+
+  frequency ENUM('daily', 'weekly', 'biweekly', 'monthly', 'on_demand') DEFAULT 'weekly',
+  -- Explicit NULL DEFAULT NULL: with explicit_defaults_for_timestamp=OFF a
+  -- bare TIMESTAMP is NOT NULL; last_crawl would be overwritten with "now"
+  -- on every update of the row (e.g. toggling is_enabled) and next_crawl
+  -- would get a zero-date default, rejected under NO_ZERO_DATE.
+  -- A source that was never crawled must be able to hold NULL here.
+  last_crawl TIMESTAMP NULL DEFAULT NULL,
+  next_crawl TIMESTAMP NULL DEFAULT NULL,
+  is_enabled BOOLEAN DEFAULT TRUE,
+
+  FOREIGN KEY (source_id) REFERENCES sources(source_id) ON DELETE CASCADE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+-- Log of detected changes to already-crawled documents.
+CREATE TABLE IF NOT EXISTS change_detection (
+  change_id INT AUTO_INCREMENT PRIMARY KEY,
+  doc_id INT NOT NULL,
+
+  detected_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+  change_type ENUM('content_updated', 'url_moved', 'deleted', 'new_version') NOT NULL,
+  previous_hash CHAR(64),
+  current_hash CHAR(64),
+  diff_summary TEXT,
+
+  action_taken ENUM('pending', 'recrawled', 'archived', 'ignored') DEFAULT 'pending',
+
+  FOREIGN KEY (doc_id) REFERENCES documents(doc_id) ON DELETE CASCADE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+-- -----------------------------
+-- 5.
Default Data
+-- -----------------------------
+
+-- Seed topics. parent_topic_id is not supplied, so every seeded topic starts
+-- without a parent; topic_id values come from AUTO_INCREMENT in row order.
+INSERT INTO topics
+    (topic_name, topic_slug, description)
+VALUES
+    ('Prompt Engineering', 'prompt-engineering', 'Techniques for effective LLM prompting'),
+    ('Claude Models',      'claude-models',      'Claude model architecture, capabilities, and versions'),
+    ('Agent Building',     'agent-building',     'AI agent design patterns and implementation'),
+    ('Claude Code',        'claude-code',        'Claude Code CLI tool usage and best practices'),
+    ('MCP Integrations',   'mcp-integrations',   'Model Context Protocol servers and tools'),
+    ('API Reference',      'api-reference',      'Anthropic API documentation and usage'),
+    ('Fine-tuning',        'fine-tuning',        'Model fine-tuning techniques and datasets');
+
+-- Seed sources. Every row below is registered with credibility tier
+-- 'tier1_official'; the vendor column groups them by publisher.
+INSERT INTO sources
+    (source_name, source_type, base_url, credibility_tier, vendor)
+VALUES
+    ('Anthropic Official Docs',    'official_docs',    'https://docs.anthropic.com',                       'tier1_official', 'anthropic'),
+    ('Claude.ai Docs',             'official_docs',    'https://docs.claude.com',                          'tier1_official', 'anthropic'),
+    ('Anthropic Engineering Blog', 'engineering_blog', 'https://anthropic.com/engineering',                'tier1_official', 'anthropic'),
+    ('Anthropic News',             'engineering_blog', 'https://anthropic.com/news',                       'tier1_official', 'anthropic'),
+    ('Anthropic Cookbook',         'github_repo',      'https://github.com/anthropics/anthropic-cookbook', 'tier1_official', 'anthropic'),
+    ('OpenAI Docs',                'official_docs',    'https://platform.openai.com/docs',                 'tier1_official', 'openai'),
+    ('Google AI Docs',             'official_docs',    'https://ai.google.dev/docs',                       'tier1_official', 'google');
+
+-- -----------------------------
+-- 6.
Useful Views
+-- -----------------------------
+
+-- Review queue: distilled content whose review_status is 'pending', joined
+-- to its document and source. Sorted by credibility tier, then oldest
+-- distillation first.
+-- NOTE(review): tier ordering presumably follows the column's declaration
+-- order in sources; confirm against that table's definition.
+CREATE OR REPLACE VIEW v_pending_reviews AS
+SELECT
+    dist.distill_id,
+    doc.doc_id,
+    doc.title,
+    doc.url,
+    dist.token_count_distilled,
+    dist.distill_date,
+    src.credibility_tier
+FROM distilled_content AS dist
+INNER JOIN documents AS doc
+    ON doc.doc_id = dist.doc_id
+INNER JOIN sources AS src
+    ON src.source_id = doc.source_id
+WHERE dist.review_status = 'pending'
+ORDER BY
+    src.credibility_tier ASC,
+    dist.distill_date ASC;
+
+-- Export candidates: one row per (distilled content, topic) pair, kept only
+-- when the most recent review of that distilled content (highest review_id)
+-- is an 'approve' decision with quality_score >= 0.80.
+CREATE OR REPLACE VIEW v_export_ready AS
+SELECT
+    doc.doc_id,
+    doc.title,
+    doc.url,
+    dist.structured_content,
+    tp.topic_slug,
+    tp.topic_name,
+    rev.quality_score,
+    src.credibility_tier,
+    src.vendor
+FROM documents AS doc
+INNER JOIN sources AS src
+    ON src.source_id = doc.source_id
+INNER JOIN distilled_content AS dist
+    ON dist.doc_id = doc.doc_id
+INNER JOIN review_logs AS rev
+    ON rev.distill_id = dist.distill_id
+INNER JOIN document_topics AS dtp
+    ON dtp.doc_id = doc.doc_id
+INNER JOIN topics AS tp
+    ON tp.topic_id = dtp.topic_id
+WHERE rev.decision = 'approve'
+    AND rev.quality_score >= 0.80
+    -- Only the latest review counts, so an older approval does not survive
+    -- a newer non-approve decision.
+    AND rev.review_id = (
+        SELECT MAX(latest.review_id)
+        FROM review_logs AS latest
+        WHERE latest.distill_id = dist.distill_id
+    )
+ORDER BY
+    tp.topic_slug,
+    rev.quality_score DESC;