Files
our-claude-skills/custom-skills/32-ourdigital-presentation/code/scripts/extract_notion.py
Andrew Yim 236be6c580 directory changes and restructuring
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 02:01:41 +09:00

204 lines
6.4 KiB
Python

#!/usr/bin/env python3
"""
Extract research content from Notion pages and databases
Outputs structured JSON for downstream processing
"""
import json
import argparse
from typing import Dict, List, Any
from datetime import datetime
def extract_notion_content(notion_url: str) -> Dict[str, Any]:
    """
    Extract and structure content from Notion.

    NOTE: this is currently a stub. Only the ``source`` entry depends on the
    input (the URL, the ID parsed from it, and the extraction time); every
    other value is hard-coded placeholder data. The intended production
    implementation would integrate with Notion MCP tools:
    - notion-search for finding related pages
    - notion-fetch for getting full content

    Args:
        notion_url: URL of Notion page or database

    Returns:
        Structured research data with the top-level keys ``source``,
        ``metadata``, ``content`` (holding a ``sections`` list),
        ``linked_pages`` and ``attachments``.
    """
    # Parse Notion URL to get page/database ID
    page_id = parse_notion_url(notion_url)

    source = {
        "url": notion_url,
        "id": page_id,
        "type": "page",  # or "database"
        # astimezone() attaches the local UTC offset; a naive timestamp
        # would be ambiguous for consumers in another time zone.
        "extracted_at": datetime.now().astimezone().isoformat(),
    }

    # This would use actual Notion MCP tools in production.
    # Simulating the structure for now.
    metadata = {
        "title": "Q4 Research Summary",
        "last_edited": "2024-12-15T10:30:00Z",
        "created_by": "user@company.com",
        "tags": ["research", "Q4", "strategy"],
    }

    sections = [
        {
            "title": "Executive Summary",
            "content": "Key findings from Q4 research indicate...",
            "level": 1,
            "data_points": [
                {"metric": "Growth Rate", "value": "25%"},
                {"metric": "User Satisfaction", "value": "4.5/5"},
            ],
        },
        {
            "title": "Market Analysis",
            "content": "The market landscape shows...",
            "level": 1,
            "subsections": [
                {
                    "title": "Competitive Landscape",
                    "content": "Our position relative to competitors...",
                    "level": 2,
                },
                {
                    "title": "Growth Opportunities",
                    "content": "Identified opportunities include...",
                    "level": 2,
                },
            ],
        },
        {
            "title": "Customer Insights",
            "content": "Customer feedback reveals...",
            "level": 1,
            "data_points": [
                {"metric": "NPS Score", "value": "72"},
                {"metric": "Retention Rate", "value": "89%"},
            ],
        },
        {
            "title": "Recommendations",
            "content": "Based on the research, we recommend...",
            "level": 1,
            "action_items": [
                "Expand into new market segments",
                "Enhance product features based on feedback",
                "Increase investment in customer success",
            ],
        },
    ]

    linked_pages = [
        {
            "title": "Detailed Customer Survey Results",
            "url": "notion://page/survey-results-id",
            "relevance": "high",
        },
        {
            "title": "Competitor Analysis Deep Dive",
            "url": "notion://page/competitor-analysis-id",
            "relevance": "medium",
        },
    ]

    attachments = [
        {
            "type": "spreadsheet",
            "title": "Q4 Metrics Dashboard",
            "url": "notion://attachment/metrics-id",
        },
    ]

    # Key order is kept as-is because it determines the order of the JSON
    # output written by the CLI.
    return {
        "source": source,
        "metadata": metadata,
        "content": {"sections": sections},
        "linked_pages": linked_pages,
        "attachments": attachments,
    }
def parse_notion_url(url: str) -> str:
    """Extract page/database ID from Notion URL.

    Strings that do not look like a Notion URL (containing neither
    ``notion.so/`` nor ``notion://``) are returned unchanged, so a bare ID
    can be passed straight through.

    Args:
        url: Notion page/database URL, or an already-extracted ID.

    Returns:
        The last path segment with any query string, fragment and trailing
        slash removed. When that segment ends in a 32-hex-digit Notion ID
        (``Some-Title-<id>`` or just ``<id>``), only the ID is returned.
    """
    if "notion.so/" not in url and "notion://" not in url:
        return url

    # Drop the query string and fragment *before* splitting on "/", so a
    # slash inside them (e.g. "?next=/x/y") cannot be mistaken for a path
    # separator; then ignore a trailing slash so the ID is never empty.
    path = url.split("?", 1)[0].split("#", 1)[0].rstrip("/")
    slug = path.rsplit("/", 1)[-1]

    # Notion page URLs append the 32-hex-digit ID to the title slug after a
    # final hyphen; prefer that ID over the whole slug when it is present.
    hex_digits = "0123456789abcdefABCDEF"
    candidate = slug.rsplit("-", 1)[-1]
    if len(candidate) == 32 and all(ch in hex_digits for ch in candidate):
        return candidate
    return slug
def fetch_linked_content(linked_pages: List[Dict], depth: int = 1) -> List[Dict]:
    """
    Build content entries for the relevant linked pages.

    NOTE: this is currently a stub. Nothing is fetched and links are not
    followed recursively; each entry's ``content`` is a placeholder string
    derived from the page title. ``depth`` only acts as an on/off switch.

    Args:
        linked_pages: List of linked page references. Entries whose
            ``relevance`` is "high" or "medium" are kept and must provide
            ``url`` and ``title``. ``None`` or an empty list yields ``[]``.
        depth: How deep to follow links; a value <= 0 disables expansion.

    Returns:
        One dict per relevant page with the keys ``source``, ``title`` and
        ``content``, in the same order as ``linked_pages``.
    """
    # Guard against a missing/empty list as well as an exhausted depth.
    if depth <= 0 or not linked_pages:
        return []

    relevant_levels = ("high", "medium")
    return [
        {
            "source": page["url"],
            "title": page["title"],
            # Would fetch actual content here
            "content": f"Content from {page['title']}...",
        }
        for page in linked_pages
        if page.get("relevance") in relevant_levels
    ]
def main(argv=None):
    """
    Command-line entry point: extract Notion content and save it as JSON.

    Parses the arguments, builds the research data for the given URL,
    optionally adds content for linked pages under ``linked_content``, then
    writes everything to the output file and prints a short summary.

    Args:
        argv: Argument list to parse. When None (the default), argparse
            reads the arguments from ``sys.argv[1:]``, so calling ``main()``
            behaves as a normal script invocation.
    """
    parser = argparse.ArgumentParser(
        description="Extract research content from Notion"
    )
    parser.add_argument(
        "notion_url",
        help="URL of Notion page or database",
    )
    parser.add_argument(
        "--output",
        default="research.json",
        help="Output JSON file (default: research.json)",
    )
    parser.add_argument(
        "--include-linked",
        action="store_true",
        help="Include content from linked pages",
    )
    parser.add_argument(
        "--depth",
        type=int,
        default=1,
        help="Link following depth (default: 1)",
    )
    args = parser.parse_args(argv)

    print(f"📚 Extracting content from: {args.notion_url}")

    # Extract main content
    research_data = extract_notion_content(args.notion_url)

    # Optionally fetch linked content
    if args.include_linked and research_data.get("linked_pages"):
        print("📎 Fetching linked pages...")
        research_data["linked_content"] = fetch_linked_content(
            research_data["linked_pages"],
            args.depth,
        )

    # Save to JSON; ensure_ascii=False keeps non-ASCII text readable in the
    # file, which is why the file is opened explicitly as UTF-8.
    with open(args.output, 'w', encoding='utf-8') as f:
        json.dump(research_data, f, indent=2, ensure_ascii=False)

    print(f"✅ Research data saved to: {args.output}")
    print(f"📊 Extracted {len(research_data['content']['sections'])} sections")
    if research_data.get("linked_pages"):
        print(f"🔗 Found {len(research_data['linked_pages'])} linked pages")
# Run the CLI only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()