#!/usr/bin/env python3
"""
Extract research content from Notion pages and databases.

Outputs structured JSON for downstream processing.
"""
import json
import argparse
from typing import Dict, List, Any
from datetime import datetime, timezone


def extract_notion_content(notion_url: str) -> Dict[str, Any]:
    """
    Extract and structure content from Notion.

    This function would integrate with Notion MCP tools:
    - notion-search for finding related pages
    - notion-fetch for getting full content

    Args:
        notion_url: URL of Notion page or database

    Returns:
        Structured research data: a dict with "source", "metadata",
        "content" (list of sections), "linked_pages", and "attachments".
    """
    # Parse Notion URL to get page/database ID
    page_id = parse_notion_url(notion_url)

    # This would use actual Notion MCP tools in production.
    # The payload below simulates the structure for now; only "source"
    # reflects the actual input — everything else is placeholder data.
    extracted_data = {
        "source": {
            "url": notion_url,
            "id": page_id,
            "type": "page",  # or "database"
            # Timezone-aware UTC timestamp so downstream consumers
            # never have to guess the offset of a naive datetime.
            "extracted_at": datetime.now(timezone.utc).isoformat(),
        },
        "metadata": {
            "title": "Q4 Research Summary",
            "last_edited": "2024-12-15T10:30:00Z",
            "created_by": "user@company.com",
            "tags": ["research", "Q4", "strategy"],
        },
        "content": {
            "sections": [
                {
                    "title": "Executive Summary",
                    "content": "Key findings from Q4 research indicate...",
                    "level": 1,
                    "data_points": [
                        {"metric": "Growth Rate", "value": "25%"},
                        {"metric": "User Satisfaction", "value": "4.5/5"},
                    ],
                },
                {
                    "title": "Market Analysis",
                    "content": "The market landscape shows...",
                    "level": 1,
                    "subsections": [
                        {
                            "title": "Competitive Landscape",
                            "content": "Our position relative to competitors...",
                            "level": 2,
                        },
                        {
                            "title": "Growth Opportunities",
                            "content": "Identified opportunities include...",
                            "level": 2,
                        },
                    ],
                },
                {
                    "title": "Customer Insights",
                    "content": "Customer feedback reveals...",
                    "level": 1,
                    "data_points": [
                        {"metric": "NPS Score", "value": "72"},
                        {"metric": "Retention Rate", "value": "89%"},
                    ],
                },
                {
                    "title": "Recommendations",
                    "content": "Based on the research, we recommend...",
                    "level": 1,
                    "action_items": [
                        "Expand into new market segments",
                        "Enhance product features based on feedback",
                        "Increase investment in customer success",
                    ],
                },
            ]
        },
        "linked_pages": [
            {
                "title": "Detailed Customer Survey Results",
                "url": "notion://page/survey-results-id",
                "relevance": "high",
            },
            {
                "title": "Competitor Analysis Deep Dive",
                "url": "notion://page/competitor-analysis-id",
                "relevance": "medium",
            },
        ],
        "attachments": [
            {
                "type": "spreadsheet",
                "title": "Q4 Metrics Dashboard",
                "url": "notion://attachment/metrics-id",
            }
        ],
    }

    return extracted_data


def parse_notion_url(url: str) -> str:
    """Extract the page/database ID from a Notion URL.

    Takes the last path segment and strips any query string.  URLs that
    do not look like Notion URLs are returned unchanged (assumed to
    already be a bare ID).
    """
    # Simplified URL parsing — covers https://notion.so/... and notion:// forms
    if "notion.so/" in url or "notion://" in url:
        parts = url.split("/")
        return parts[-1].split("?")[0]
    return url


def fetch_linked_content(linked_pages: List[Dict], depth: int = 1) -> List[Dict]:
    """
    Fetch content for linked page references.

    NOTE: despite the name of the ``depth`` parameter, this placeholder
    implementation only performs a single level of expansion — ``depth``
    acts purely as an on/off guard (``depth <= 0`` returns nothing).
    Real recursion would be added alongside the Notion MCP integration.

    Args:
        linked_pages: List of linked page references; each dict is
            expected to carry "title", "url", and "relevance" keys.
        depth: How deep to follow links (0 or less disables fetching).

    Returns:
        Expanded content entries for pages whose relevance is
        "high" or "medium"; low-relevance pages are skipped.
    """
    if depth <= 0:
        return []

    expanded_content = []
    for page in linked_pages:
        if page.get("relevance") in ["high", "medium"]:
            # Would fetch actual content here
            expanded_content.append({
                "source": page["url"],
                "title": page["title"],
                "content": f"Content from {page['title']}...",
            })

    return expanded_content


def main():
    """CLI entry point: extract, optionally expand links, write JSON."""
    parser = argparse.ArgumentParser(
        description="Extract research content from Notion"
    )
    parser.add_argument(
        "notion_url",
        help="URL of Notion page or database"
    )
    parser.add_argument(
        "--output",
        default="research.json",
        help="Output JSON file (default: research.json)"
    )
    parser.add_argument(
        "--include-linked",
        action="store_true",
        help="Include content from linked pages"
    )
    parser.add_argument(
        "--depth",
        type=int,
        default=1,
        help="Link following depth (default: 1)"
    )

    args = parser.parse_args()

    print(f"📚 Extracting content from: {args.notion_url}")

    # Extract main content
    research_data = extract_notion_content(args.notion_url)

    # Optionally fetch linked content
    if args.include_linked and research_data.get("linked_pages"):
        print("📎 Fetching linked pages...")
        linked_content = fetch_linked_content(
            research_data["linked_pages"],
            args.depth
        )
        research_data["linked_content"] = linked_content

    # Save to JSON; ensure_ascii=False keeps non-ASCII text readable
    with open(args.output, 'w', encoding='utf-8') as f:
        json.dump(research_data, f, indent=2, ensure_ascii=False)

    print(f"✅ Research data saved to: {args.output}")
    print(f"📊 Extracted {len(research_data['content']['sections'])} sections")
    if research_data.get("linked_pages"):
        print(f"🔗 Found {len(research_data['linked_pages'])} linked pages")


if __name__ == "__main__":
    main()