Files
our-claude-skills/custom-skills/05-ourdigital-document/code/scripts/extract_notion.py
Andrew Yim 0bc24d00b9 feat: Add OurDigital custom skills package (10 skills)
Complete implementation of OurDigital skills with dual-platform support
(Claude Desktop + Claude Code) following standardized structure.

Skills created:
- 01-ourdigital-brand-guide: Brand reference & style guidelines
- 02-ourdigital-blog: Korean blog drafts (blog.ourdigital.org)
- 03-ourdigital-journal: English essays (journal.ourdigital.org)
- 04-ourdigital-research: Research prompts & workflows
- 05-ourdigital-document: Notion-to-presentation pipeline
- 06-ourdigital-designer: Visual/image prompt generation
- 07-ourdigital-ad-manager: Ad copywriting & keyword research
- 08-ourdigital-trainer: Training materials & workshop planning
- 09-ourdigital-backoffice: Quotes, proposals, cost analysis
- 10-ourdigital-skill-creator: Meta skill for creating new skills

Features:
- YAML frontmatter with "ourdigital" or "our" prefix triggers
- Standardized directory structure (code/, desktop/, shared/, docs/)
- Shared environment setup (_ourdigital-shared/)
- Comprehensive reference documentation
- Cross-skill integration support

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-31 16:50:17 +07:00

204 lines
6.4 KiB
Python

#!/usr/bin/env python3
"""
Extract research content from Notion pages and databases
Outputs structured JSON for downstream processing
"""
import argparse
import json
from datetime import datetime, timezone
from typing import Any, Dict, List
def extract_notion_content(notion_url: str) -> Dict[str, Any]:
    """Extract and structure research content from a Notion page or database.

    This function would integrate with Notion MCP tools:
    - notion-search for finding related pages
    - notion-fetch for getting full content

    Args:
        notion_url: URL of a Notion page or database.

    Returns:
        Structured research data with "source", "metadata", "content",
        "linked_pages", and "attachments" keys.

    NOTE(review): everything below "source" is placeholder data; in
    production it would be replaced by the output of the actual Notion
    MCP calls.
    """
    # Parse Notion URL to get the page/database ID.
    page_id = parse_notion_url(notion_url)

    # This would use actual Notion MCP tools in production.
    # Simulating the structure for now.
    extracted_data = {
        "source": {
            "url": notion_url,
            "id": page_id,
            "type": "page",  # or "database"
            # Timezone-aware UTC timestamp: naive datetime.now() is
            # ambiguous across machines in different timezones.
            "extracted_at": datetime.now(timezone.utc).isoformat()
        },
        "metadata": {
            "title": "Q4 Research Summary",
            "last_edited": "2024-12-15T10:30:00Z",
            "created_by": "user@company.com",
            "tags": ["research", "Q4", "strategy"]
        },
        "content": {
            # "level" mirrors the Notion heading level (1 = top-level section).
            "sections": [
                {
                    "title": "Executive Summary",
                    "content": "Key findings from Q4 research indicate...",
                    "level": 1,
                    "data_points": [
                        {"metric": "Growth Rate", "value": "25%"},
                        {"metric": "User Satisfaction", "value": "4.5/5"}
                    ]
                },
                {
                    "title": "Market Analysis",
                    "content": "The market landscape shows...",
                    "level": 1,
                    "subsections": [
                        {
                            "title": "Competitive Landscape",
                            "content": "Our position relative to competitors...",
                            "level": 2
                        },
                        {
                            "title": "Growth Opportunities",
                            "content": "Identified opportunities include...",
                            "level": 2
                        }
                    ]
                },
                {
                    "title": "Customer Insights",
                    "content": "Customer feedback reveals...",
                    "level": 1,
                    "data_points": [
                        {"metric": "NPS Score", "value": "72"},
                        {"metric": "Retention Rate", "value": "89%"}
                    ]
                },
                {
                    "title": "Recommendations",
                    "content": "Based on the research, we recommend...",
                    "level": 1,
                    "action_items": [
                        "Expand into new market segments",
                        "Enhance product features based on feedback",
                        "Increase investment in customer success"
                    ]
                }
            ]
        },
        # Cross-referenced pages that fetch_linked_content() may expand.
        "linked_pages": [
            {
                "title": "Detailed Customer Survey Results",
                "url": "notion://page/survey-results-id",
                "relevance": "high"
            },
            {
                "title": "Competitor Analysis Deep Dive",
                "url": "notion://page/competitor-analysis-id",
                "relevance": "medium"
            }
        ],
        "attachments": [
            {
                "type": "spreadsheet",
                "title": "Q4 Metrics Dashboard",
                "url": "notion://attachment/metrics-id"
            }
        ]
    }
    return extracted_data
def parse_notion_url(url: str) -> str:
    """Extract the page/database ID from a Notion URL.

    Handles both web URLs (notion.so/...) and internal notion:// links.
    The fragment, query string, and any trailing slash are stripped
    *before* taking the last path segment, so:
    - "https://notion.so/abc/"    -> "abc"   (old code returned "")
    - "https://notion.so/abc?p=x/y" -> "abc" (old code returned "y")

    Args:
        url: Notion URL, or an already-bare ID.

    Returns:
        The trailing path segment (the ID), or the input unchanged when
        it does not look like a Notion URL.
    """
    # Simplified URL parsing — good enough for notion.so / notion:// links.
    if "notion.so/" in url or "notion://" in url:
        path = url.split("#", 1)[0].split("?", 1)[0].rstrip("/")
        return path.rsplit("/", 1)[-1]
    return url
def fetch_linked_content(linked_pages: List[Dict], depth: int = 1) -> List[Dict]:
    """Recursively fetch linked page content.

    Only pages whose "relevance" is "high" or "medium" are expanded;
    low-relevance (or unmarked) links are skipped.

    Args:
        linked_pages: List of linked page references.
        depth: How deep to follow links; a non-positive depth fetches
            nothing.

    Returns:
        Expanded content from the linked pages, one dict per page with
        "source", "title", and "content" keys.
    """
    if depth <= 0:
        return []
    # Would fetch actual content here; for now, synthesize a stub entry
    # per relevant link.
    relevant = (p for p in linked_pages if p.get("relevance") in ("high", "medium"))
    return [
        {
            "source": p["url"],
            "title": p["title"],
            "content": f"Content from {p['title']}...",
        }
        for p in relevant
    ]
def main() -> None:
    """CLI entry point: extract Notion content and save it as JSON."""
    parser = argparse.ArgumentParser(
        description="Extract research content from Notion"
    )
    parser.add_argument("notion_url", help="URL of Notion page or database")
    parser.add_argument(
        "--output",
        default="research.json",
        help="Output JSON file (default: research.json)",
    )
    parser.add_argument(
        "--include-linked",
        action="store_true",
        help="Include content from linked pages",
    )
    parser.add_argument(
        "--depth",
        type=int,
        default=1,
        help="Link following depth (default: 1)",
    )
    opts = parser.parse_args()

    print(f"📚 Extracting content from: {opts.notion_url}")

    # Extract main content.
    research_data = extract_notion_content(opts.notion_url)

    # Optionally expand the linked pages into full content entries.
    links = research_data.get("linked_pages")
    if opts.include_linked and links:
        print("📎 Fetching linked pages...")
        research_data["linked_content"] = fetch_linked_content(links, opts.depth)

    # Persist as UTF-8 JSON (ensure_ascii=False keeps non-ASCII readable).
    with open(opts.output, 'w', encoding='utf-8') as fh:
        json.dump(research_data, fh, indent=2, ensure_ascii=False)

    print(f"✅ Research data saved to: {opts.output}")
    section_count = len(research_data['content']['sections'])
    print(f"📊 Extracted {section_count} sections")
    if links:
        print(f"🔗 Found {len(links)} linked pages")


if __name__ == "__main__":
    main()