-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathenrich_with_video.py
More file actions
227 lines (173 loc) · 8.39 KB
/
enrich_with_video.py
File metadata and controls
227 lines (173 loc) · 8.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
"""
Enrich insights, threads, and debates with YouTube video URLs and timestamps.
This script adds video_url and timestamp_url to each insight, making them
directly linkable to the exact moment in the source video.
Usage:
python insights_first/enrich_with_video.py --db lennys_full.db
"""
import argparse
import json
import sqlite3
from pathlib import Path
from datetime import datetime
def timestamp_to_seconds(timestamp: str) -> int:
    """Convert an HH:MM:SS or MM:SS timestamp string to total seconds.

    Args:
        timestamp: Timestamp such as "01:02:03" or "02:03".

    Returns:
        Total seconds, or 0 when the input is empty, has an unexpected
        number of components, or contains non-numeric components.
    """
    if not timestamp:
        return 0
    parts = timestamp.split(':')
    try:
        if len(parts) == 3:
            hours, minutes, seconds = map(int, parts)
            return hours * 3600 + minutes * 60 + seconds
        if len(parts) == 2:
            minutes, seconds = map(int, parts)
            return minutes * 60 + seconds
    except ValueError:
        # Non-numeric component (e.g. "1:2x") -- treat like any other
        # unparseable input instead of propagating ValueError.
        return 0
    return 0
def build_youtube_timestamp_url(video_url: str, timestamp: str) -> str:
    """Return *video_url* with a ``t=<seconds>`` query parameter appended.

    Falls back to the bare URL when the timestamp is missing or resolves
    to zero seconds, and to "" when there is no URL at all. Works for both
    youtube.com and youtu.be style links.
    """
    if not video_url:
        return ""
    if not timestamp:
        return video_url
    seconds = timestamp_to_seconds(timestamp)
    if not seconds:
        return video_url
    # Append with '&' if the URL already carries a query string, else '?'.
    separator = '&' if '?' in video_url else '?'
    return f"{video_url}{separator}t={seconds}"
def load_video_data(db_path: str) -> tuple[dict, dict, dict]:
    """Load video URLs and chunk timestamps from the SQLite database.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        Tuple of three dicts:
            document_id -> video_url,
            chunk_id -> timestamp_start,
            chunk_id -> document_id.
    """
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.cursor()
        # Documents that have an associated video.
        cursor.execute("SELECT id, video_url FROM documents WHERE video_url IS NOT NULL")
        doc_videos = {row['id']: row['video_url'] for row in cursor.fetchall()}
        # Chunks with a known start timestamp, plus their owning document.
        cursor.execute("SELECT id, document_id, timestamp_start FROM chunks WHERE timestamp_start IS NOT NULL")
        chunk_timestamps = {}
        chunk_doc_map = {}
        for row in cursor.fetchall():
            chunk_timestamps[row['id']] = row['timestamp_start']
            chunk_doc_map[row['id']] = row['document_id']
    finally:
        # Close the connection even if a query fails (original leaked it).
        conn.close()
    return doc_videos, chunk_timestamps, chunk_doc_map
def enrich_insight(insight: dict, doc_videos: dict, chunk_timestamps: dict) -> dict:
    """Attach video_url, timestamp_start, and timestamp_url to *insight* in place.

    The insight's document_id and chunk_id are looked up in the supplied
    maps; missing entries fall back to empty strings. Returns the same dict
    for convenience.
    """
    video_url = doc_videos.get(insight.get('document_id', ''), '')
    timestamp = chunk_timestamps.get(insight.get('chunk_id', ''), '')
    insight['video_url'] = video_url
    insight['timestamp_start'] = timestamp
    insight['timestamp_url'] = build_youtube_timestamp_url(video_url, timestamp)
    return insight
def enrich_extraction_file(file_path: Path, doc_videos: dict, chunk_timestamps: dict) -> int:
    """Enrich every insight in a modal extraction JSON file, rewriting it in place.

    Args:
        file_path: Path to the extraction JSON file.
        doc_videos: document_id -> video_url.
        chunk_timestamps: chunk_id -> timestamp_start.

    Returns:
        Number of insights enriched.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    enriched_count = 0
    for result in data.get('results', []):
        if result.get('has_insight') and result.get('insight'):
            enrich_insight(result['insight'], doc_videos, chunk_timestamps)
            enriched_count += 1
    # Mark as enriched; setdefault tolerates files without a metadata
    # section (the original raised KeyError on them).
    metadata = data.setdefault('metadata', {})
    metadata['enriched_with_video'] = True
    metadata['enriched_at'] = datetime.now().isoformat()
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    return enriched_count
def enrich_threads_file(file_path: Path, doc_videos: dict, chunk_timestamps: dict) -> int:
    """Enrich every insight in a named threads JSON file, rewriting it in place.

    Args:
        file_path: Path to the threads JSON file.
        doc_videos: document_id -> video_url.
        chunk_timestamps: chunk_id -> timestamp_start.

    Returns:
        Number of insights enriched across all threads.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    enriched_count = 0
    for thread_name, thread_data in data.get('threads', {}).items():
        for insight in thread_data.get('insights', []):
            enrich_insight(insight, doc_videos, chunk_timestamps)
            enriched_count += 1
    # Mark as enriched; setdefault tolerates files without a metadata
    # section (the original raised KeyError on them).
    metadata = data.setdefault('metadata', {})
    metadata['enriched_with_video'] = True
    metadata['enriched_at'] = datetime.now().isoformat()
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    return enriched_count
def _load_debate_insights(metadata: dict):
    """Load the ordered insight list that debate indices (idx_a/idx_b) refer to.

    Tries the extraction file recorded in the debates metadata first, then a
    known fallback path. Returns None when neither file can be read.
    """
    candidates = [
        Path(metadata.get('input_file', '')),
        Path('insights_first/data/modal_extraction_20260120_024600.json'),
    ]
    for candidate in candidates:
        # is_file() (not exists()) so a missing 'input_file' -- which makes
        # Path('') resolve to the current directory -- falls through to the
        # fallback instead of attempting to open a directory.
        if candidate.is_file():
            with open(candidate, 'r', encoding='utf-8') as f:
                extraction_data = json.load(f)
            return [r['insight'] for r in extraction_data['results']
                    if r['has_insight'] and r['insight']]
    return None


def _attach_debate_side(debate: dict, suffix: str, insight: dict,
                        doc_videos: dict, chunk_timestamps: dict) -> None:
    """Attach video fields for one side ('a' or 'b') of a debate, in place."""
    video_url = doc_videos.get(insight.get('document_id', ''), '')
    timestamp = chunk_timestamps.get(insight.get('chunk_id', ''), '')
    debate[f'video_url_{suffix}'] = video_url
    debate[f'timestamp_start_{suffix}'] = timestamp
    debate[f'timestamp_url_{suffix}'] = build_youtube_timestamp_url(video_url, timestamp)
    debate[f'episode_title_{suffix}'] = insight.get('episode_title', '')


def enrich_debates_file(file_path: Path, doc_videos: dict, chunk_timestamps: dict, chunk_doc_map: dict) -> int:
    """Enrich a debates JSON file with video links for both sides of each debate.

    Debates reference insights only by index (idx_a, idx_b), so the original
    extraction file is reloaded to map indices back to insight records.

    Args:
        file_path: Path to the debates JSON file (rewritten on success).
        doc_videos: document_id -> video_url.
        chunk_timestamps: chunk_id -> timestamp_start.
        chunk_doc_map: chunk_id -> document_id. Currently unused; kept for
            interface compatibility with existing callers.

    Returns:
        Number of debates processed, or 0 if no extraction file was found.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    insights_list = _load_debate_insights(data.get('metadata', {}))
    if insights_list is None:
        print("Warning: Could not find extraction file to enrich debates")
        return 0
    enriched_count = 0
    for debate in data.get('debates', []):
        for suffix in ('a', 'b'):
            idx = debate.get(f'idx_{suffix}')
            if idx is not None and idx < len(insights_list):
                _attach_debate_side(debate, suffix, insights_list[idx],
                                    doc_videos, chunk_timestamps)
        # Counted per debate, whether or not either side resolved (matches
        # the original behavior).
        enriched_count += 1
    # Mark as enriched; setdefault tolerates files without a metadata section.
    metadata = data.setdefault('metadata', {})
    metadata['enriched_with_video'] = True
    metadata['enriched_at'] = datetime.now().isoformat()
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    return enriched_count
def main():
    """CLI entry point: enrich all extraction, thread, and debate files."""
    parser = argparse.ArgumentParser(description='Enrich insights with video URLs')
    parser.add_argument('--db', default='lennys_full.db', help='Database path')
    parser.add_argument('--data-dir', default='insights_first/data', help='Data directory')
    args = parser.parse_args()

    data_dir = Path(args.data_dir)

    print(f"Loading video data from {args.db}...")
    doc_videos, chunk_timestamps, chunk_doc_map = load_video_data(args.db)
    print(f" Found {len(doc_videos)} documents with video URLs")
    print(f" Found {len(chunk_timestamps)} chunks with timestamps")

    # (glob pattern, enrichment callable, label for the summary line)
    jobs = [
        ('modal_extraction_*.json',
         lambda p: enrich_extraction_file(p, doc_videos, chunk_timestamps),
         'insights'),
        ('named_threads_*.json',
         lambda p: enrich_threads_file(p, doc_videos, chunk_timestamps),
         'insights in threads'),
        ('debates_*.json',
         lambda p: enrich_debates_file(p, doc_videos, chunk_timestamps, chunk_doc_map),
         'debates'),
    ]
    for pattern, enrich, label in jobs:
        for file_path in data_dir.glob(pattern):
            print(f"\nEnriching {file_path.name}...")
            count = enrich(file_path)
            print(f" Enriched {count} {label}")

    print("\n✓ Enrichment complete!")


if __name__ == "__main__":
    main()