Files
our-claude-skills/custom-skills/03-ourdigital-presentation/code/scripts/extract_notion.py
Andrew Yim b69e4b6f3a refactor: Reorganize skill numbering and update documentation
Skill Numbering Changes:
- 01-03: OurDigital core (was 30-32)
- 31-32: Notion tools (was 01-02)
- 99_archive: Renamed from _archive for sorting

New Files:
- AGENTS.md: Claude Code agent routing guide
- requirements.txt for 00-claude-code-setting, 32-notion-writer, 43-jamie-youtube-manager

Documentation Updates:
- CLAUDE.md: Updated skill inventory (23 skills)
- AUDIT_REPORT.md: Current completion status (91%)
- Archived REFACTORING_PLAN.md (most tasks complete)

Removed:
- ga-agent-skills/ (moved to separate repo ~/Project/dintel-ga4-agent)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 18:42:39 +07:00

204 lines
6.4 KiB
Python

#!/usr/bin/env python3
"""
Extract research content from Notion pages and databases
Outputs structured JSON for downstream processing
"""
import json
import argparse
from typing import Dict, List, Any
from datetime import datetime
def extract_notion_content(notion_url: str) -> Dict[str, Any]:
    """
    Extract and structure research content from a Notion page or database.

    In production this would call the Notion MCP tools
    (notion-search to find related pages, notion-fetch to pull full
    content); the payload assembled below is a simulated fixture with
    the same shape.

    Args:
        notion_url: URL of the Notion page or database to extract.

    Returns:
        A dict with "source", "metadata", "content", "linked_pages",
        and "attachments" keys describing the extracted research.
    """
    page_id = parse_notion_url(notion_url)

    # Provenance of this extraction run.
    source_info = {
        "url": notion_url,
        "id": page_id,
        "type": "page",  # or "database"
        "extracted_at": datetime.now().isoformat()
    }

    # Page-level metadata (simulated values).
    page_metadata = {
        "title": "Q4 Research Summary",
        "last_edited": "2024-12-15T10:30:00Z",
        "created_by": "user@company.com",
        "tags": ["research", "Q4", "strategy"]
    }

    # Document body: top-level sections, optionally with nested
    # subsections, metric data points, or action items.
    sections = [
        {
            "title": "Executive Summary",
            "content": "Key findings from Q4 research indicate...",
            "level": 1,
            "data_points": [
                {"metric": "Growth Rate", "value": "25%"},
                {"metric": "User Satisfaction", "value": "4.5/5"}
            ]
        },
        {
            "title": "Market Analysis",
            "content": "The market landscape shows...",
            "level": 1,
            "subsections": [
                {
                    "title": "Competitive Landscape",
                    "content": "Our position relative to competitors...",
                    "level": 2
                },
                {
                    "title": "Growth Opportunities",
                    "content": "Identified opportunities include...",
                    "level": 2
                }
            ]
        },
        {
            "title": "Customer Insights",
            "content": "Customer feedback reveals...",
            "level": 1,
            "data_points": [
                {"metric": "NPS Score", "value": "72"},
                {"metric": "Retention Rate", "value": "89%"}
            ]
        },
        {
            "title": "Recommendations",
            "content": "Based on the research, we recommend...",
            "level": 1,
            "action_items": [
                "Expand into new market segments",
                "Enhance product features based on feedback",
                "Increase investment in customer success"
            ]
        }
    ]

    # References to related pages, tagged by relevance so callers can
    # decide which ones to expand via fetch_linked_content().
    related_pages = [
        {
            "title": "Detailed Customer Survey Results",
            "url": "notion://page/survey-results-id",
            "relevance": "high"
        },
        {
            "title": "Competitor Analysis Deep Dive",
            "url": "notion://page/competitor-analysis-id",
            "relevance": "medium"
        }
    ]

    # Non-page assets attached to the source page.
    file_attachments = [
        {
            "type": "spreadsheet",
            "title": "Q4 Metrics Dashboard",
            "url": "notion://attachment/metrics-id"
        }
    ]

    return {
        "source": source_info,
        "metadata": page_metadata,
        "content": {"sections": sections},
        "linked_pages": related_pages,
        "attachments": file_attachments
    }
def parse_notion_url(url: str) -> str:
    """
    Extract the page/database ID from a Notion URL.

    Handles both https://notion.so/... and notion:// URLs. The query
    string ("?...") and fragment ("#...") are stripped before taking
    the last path segment, and a trailing slash is tolerated (the old
    implementation returned an empty ID for ".../abc123/"). Input that
    does not look like a Notion URL is returned unchanged, on the
    assumption it is already an ID.

    Args:
        url: Notion URL (or bare ID) to parse.

    Returns:
        The trailing path segment, presumed to be the Notion ID.
    """
    if "notion.so/" in url or "notion://" in url:
        # Drop query/fragment first so a "/" inside them cannot be
        # mistaken for a path separator, then take the last segment.
        path = url.split("?", 1)[0].split("#", 1)[0]
        return path.rstrip("/").split("/")[-1]
    return url
def fetch_linked_content(linked_pages: List[Dict], depth: int = 1) -> List[Dict]:
    """
    Recursively fetch linked page content.

    Only pages tagged with "high" or "medium" relevance are expanded.
    The previous implementation ignored ``depth`` beyond checking it was
    positive; links are now actually followed ``depth`` levels deep: if
    a linked page itself carries a "linked_pages" list and depth allows,
    its links are expanded into a nested "linked_content" entry. Pages
    without nested links produce exactly the same output as before.

    Args:
        linked_pages: List of linked page references; each needs "url"
            and "title" keys and an optional "relevance" tag.
        depth: How many levels of links to follow (<= 0 fetches none).

    Returns:
        Expanded content dicts, one per relevant page.
    """
    if depth <= 0:
        return []
    expanded_content = []
    for page in linked_pages:
        if page.get("relevance") in ["high", "medium"]:
            # Would fetch actual content here
            entry = {
                "source": page["url"],
                "title": page["title"],
                "content": f"Content from {page['title']}..."
            }
            # Follow this page's own links while depth budget remains.
            nested = page.get("linked_pages", [])
            if nested and depth > 1:
                entry["linked_content"] = fetch_linked_content(nested, depth - 1)
            expanded_content.append(entry)
    return expanded_content
def main():
    """CLI entry point: extract Notion content and write it to a JSON file."""
    parser = argparse.ArgumentParser(
        description="Extract research content from Notion"
    )
    parser.add_argument("notion_url", help="URL of Notion page or database")
    parser.add_argument(
        "--output",
        default="research.json",
        help="Output JSON file (default: research.json)",
    )
    parser.add_argument(
        "--include-linked",
        action="store_true",
        help="Include content from linked pages",
    )
    parser.add_argument(
        "--depth",
        type=int,
        default=1,
        help="Link following depth (default: 1)",
    )
    opts = parser.parse_args()

    print(f"📚 Extracting content from: {opts.notion_url}")
    research = extract_notion_content(opts.notion_url)

    # Optionally expand linked pages into the result.
    links = research.get("linked_pages")
    if opts.include_linked and links:
        print("📎 Fetching linked pages...")
        research["linked_content"] = fetch_linked_content(links, opts.depth)

    # Persist as UTF-8 JSON (non-ASCII kept readable).
    with open(opts.output, 'w', encoding='utf-8') as out_file:
        json.dump(research, out_file, indent=2, ensure_ascii=False)

    print(f"✅ Research data saved to: {opts.output}")
    print(f"📊 Extracted {len(research['content']['sections'])} sections")
    if links:
        print(f"🔗 Found {len(research['linked_pages'])} linked pages")
# Script entry point: run the CLI only when executed directly.
if __name__ == "__main__":
    main()