Files
our-claude-skills/custom-skills/05-ourdigital-document/code/scripts/extract_notion.py
Andrew Yim 0bc24d00b9 feat: Add OurDigital custom skills package (10 skills)
Complete implementation of OurDigital skills with dual-platform support
(Claude Desktop + Claude Code) following standardized structure.

Skills created:
- 01-ourdigital-brand-guide: Brand reference & style guidelines
- 02-ourdigital-blog: Korean blog drafts (blog.ourdigital.org)
- 03-ourdigital-journal: English essays (journal.ourdigital.org)
- 04-ourdigital-research: Research prompts & workflows
- 05-ourdigital-document: Notion-to-presentation pipeline
- 06-ourdigital-designer: Visual/image prompt generation
- 07-ourdigital-ad-manager: Ad copywriting & keyword research
- 08-ourdigital-trainer: Training materials & workshop planning
- 09-ourdigital-backoffice: Quotes, proposals, cost analysis
- 10-ourdigital-skill-creator: Meta skill for creating new skills

Features:
- YAML frontmatter with "ourdigital" or "our" prefix triggers
- Standardized directory structure (code/, desktop/, shared/, docs/)
- Shared environment setup (_ourdigital-shared/)
- Comprehensive reference documentation
- Cross-skill integration support

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-31 16:50:17 +07:00

204 lines
6.4 KiB
Python

#!/usr/bin/env python3
"""
Extract research content from Notion pages and databases
Outputs structured JSON for downstream processing
"""
import argparse
import json
from datetime import datetime, timezone
from typing import Any, Dict, List
def extract_notion_content(notion_url: str) -> Dict[str, Any]:
    """Extract and structure research content from a Notion page or database.

    This function would integrate with Notion MCP tools:
    - notion-search for finding related pages
    - notion-fetch for getting full content

    Args:
        notion_url: URL of a Notion page or database.

    Returns:
        Structured research data with "source", "metadata", "content",
        "linked_pages", and "attachments" keys.

    NOTE(review): everything below "source" is placeholder data; in
    production it would be replaced by the output of the actual Notion
    MCP calls.
    """
    # Parse Notion URL to get the page/database ID.
    page_id = parse_notion_url(notion_url)

    # This would use actual Notion MCP tools in production.
    # Simulating the structure for now.
    extracted_data = {
        "source": {
            "url": notion_url,
            "id": page_id,
            "type": "page",  # or "database"
            # Timezone-aware UTC timestamp: naive datetime.now() is
            # ambiguous across machines in different timezones.
            "extracted_at": datetime.now(timezone.utc).isoformat()
        },
        "metadata": {
            "title": "Q4 Research Summary",
            "last_edited": "2024-12-15T10:30:00Z",
            "created_by": "user@company.com",
            "tags": ["research", "Q4", "strategy"]
        },
        "content": {
            # "level" mirrors the Notion heading level (1 = top-level section).
            "sections": [
                {
                    "title": "Executive Summary",
                    "content": "Key findings from Q4 research indicate...",
                    "level": 1,
                    "data_points": [
                        {"metric": "Growth Rate", "value": "25%"},
                        {"metric": "User Satisfaction", "value": "4.5/5"}
                    ]
                },
                {
                    "title": "Market Analysis",
                    "content": "The market landscape shows...",
                    "level": 1,
                    "subsections": [
                        {
                            "title": "Competitive Landscape",
                            "content": "Our position relative to competitors...",
                            "level": 2
                        },
                        {
                            "title": "Growth Opportunities",
                            "content": "Identified opportunities include...",
                            "level": 2
                        }
                    ]
                },
                {
                    "title": "Customer Insights",
                    "content": "Customer feedback reveals...",
                    "level": 1,
                    "data_points": [
                        {"metric": "NPS Score", "value": "72"},
                        {"metric": "Retention Rate", "value": "89%"}
                    ]
                },
                {
                    "title": "Recommendations",
                    "content": "Based on the research, we recommend...",
                    "level": 1,
                    "action_items": [
                        "Expand into new market segments",
                        "Enhance product features based on feedback",
                        "Increase investment in customer success"
                    ]
                }
            ]
        },
        # Cross-referenced pages that fetch_linked_content() may expand.
        "linked_pages": [
            {
                "title": "Detailed Customer Survey Results",
                "url": "notion://page/survey-results-id",
                "relevance": "high"
            },
            {
                "title": "Competitor Analysis Deep Dive",
                "url": "notion://page/competitor-analysis-id",
                "relevance": "medium"
            }
        ],
        "attachments": [
            {
                "type": "spreadsheet",
                "title": "Q4 Metrics Dashboard",
                "url": "notion://attachment/metrics-id"
            }
        ]
    }
    return extracted_data
def parse_notion_url(url: str) -> str:
    """Extract the page/database ID from a Notion URL.

    Handles both web URLs (notion.so/...) and internal notion:// links.
    The fragment, query string, and any trailing slash are stripped
    *before* taking the last path segment, so:
    - "https://notion.so/abc/"    -> "abc"   (old code returned "")
    - "https://notion.so/abc?p=x/y" -> "abc" (old code returned "y")

    Args:
        url: Notion URL, or an already-bare ID.

    Returns:
        The trailing path segment (the ID), or the input unchanged when
        it does not look like a Notion URL.
    """
    # Simplified URL parsing — good enough for notion.so / notion:// links.
    if "notion.so/" in url or "notion://" in url:
        path = url.split("#", 1)[0].split("?", 1)[0].rstrip("/")
        return path.rsplit("/", 1)[-1]
    return url
def fetch_linked_content(linked_pages: List[Dict], depth: int = 1) -> List[Dict]:
    """Recursively fetch linked page content.

    Only pages whose "relevance" is "high" or "medium" are expanded;
    low-relevance (or unmarked) links are skipped.

    Args:
        linked_pages: List of linked page references.
        depth: How deep to follow links; a non-positive depth fetches
            nothing.

    Returns:
        Expanded content from the linked pages, one dict per page with
        "source", "title", and "content" keys.
    """
    if depth <= 0:
        return []
    # Would fetch actual content here; for now, synthesize a stub entry
    # per relevant link.
    relevant = (p for p in linked_pages if p.get("relevance") in ("high", "medium"))
    return [
        {
            "source": p["url"],
            "title": p["title"],
            "content": f"Content from {p['title']}...",
        }
        for p in relevant
    ]
def main() -> None:
    """CLI entry point: extract Notion content and save it as JSON."""
    parser = argparse.ArgumentParser(
        description="Extract research content from Notion"
    )
    parser.add_argument("notion_url", help="URL of Notion page or database")
    parser.add_argument(
        "--output",
        default="research.json",
        help="Output JSON file (default: research.json)",
    )
    parser.add_argument(
        "--include-linked",
        action="store_true",
        help="Include content from linked pages",
    )
    parser.add_argument(
        "--depth",
        type=int,
        default=1,
        help="Link following depth (default: 1)",
    )
    opts = parser.parse_args()

    print(f"📚 Extracting content from: {opts.notion_url}")

    # Extract main content.
    research_data = extract_notion_content(opts.notion_url)

    # Optionally expand the linked pages into full content entries.
    links = research_data.get("linked_pages")
    if opts.include_linked and links:
        print("📎 Fetching linked pages...")
        research_data["linked_content"] = fetch_linked_content(links, opts.depth)

    # Persist as UTF-8 JSON (ensure_ascii=False keeps non-ASCII readable).
    with open(opts.output, 'w', encoding='utf-8') as fh:
        json.dump(research_data, fh, indent=2, ensure_ascii=False)

    print(f"✅ Research data saved to: {opts.output}")
    section_count = len(research_data['content']['sections'])
    print(f"📊 Extracted {section_count} sections")
    if links:
        print(f"🔗 Found {len(links)} linked pages")


if __name__ == "__main__":
    main()