-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathenrich_with_video.py
More file actions
227 lines (173 loc) · 8.39 KB
/
enrich_with_video.py
File metadata and controls
227 lines (173 loc) · 8.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
"""
Enrich insights, threads, and debates with YouTube video URLs and timestamps.
This script adds video_url and timestamp_url to each insight, making them
directly linkable to the exact moment in the source video.
Usage:
python insights_first/enrich_with_video.py --db lennys_full.db
"""
import argparse
import json
import sqlite3
from pathlib import Path
from datetime import datetime
def timestamp_to_seconds(timestamp: str) -> int:
    """Convert an HH:MM:SS or MM:SS timestamp string to total seconds.

    Args:
        timestamp: Timestamp such as "01:02:03" or "02:03".

    Returns:
        Total seconds, or 0 when the input is empty, has an unexpected
        number of components, or contains non-numeric components.
    """
    if not timestamp:
        return 0
    parts = timestamp.split(':')
    try:
        if len(parts) == 3:
            hours, minutes, seconds = map(int, parts)
            return hours * 3600 + minutes * 60 + seconds
        if len(parts) == 2:
            minutes, seconds = map(int, parts)
            return minutes * 60 + seconds
    except ValueError:
        # Non-numeric component (e.g. "1:2x") -- treat like any other
        # unparseable input instead of propagating ValueError.
        return 0
    return 0
def build_youtube_timestamp_url(video_url: str, timestamp: str) -> str:
    """Return *video_url* with a ``t=<seconds>`` query parameter appended.

    Falls back to the bare URL when the timestamp is missing or resolves
    to zero seconds, and to "" when there is no URL at all. Works for both
    youtube.com and youtu.be style links.
    """
    if not video_url:
        return ""
    if not timestamp:
        return video_url
    seconds = timestamp_to_seconds(timestamp)
    if not seconds:
        return video_url
    # Append with '&' if the URL already carries a query string, else '?'.
    separator = '&' if '?' in video_url else '?'
    return f"{video_url}{separator}t={seconds}"
def load_video_data(db_path: str) -> tuple[dict, dict, dict]:
    """Load video URLs and chunk timestamps from the SQLite database.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        Tuple of three dicts:
            document_id -> video_url,
            chunk_id -> timestamp_start,
            chunk_id -> document_id.
    """
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.cursor()
        # Documents that have an associated video.
        cursor.execute("SELECT id, video_url FROM documents WHERE video_url IS NOT NULL")
        doc_videos = {row['id']: row['video_url'] for row in cursor.fetchall()}
        # Chunks with a known start timestamp, plus their owning document.
        cursor.execute("SELECT id, document_id, timestamp_start FROM chunks WHERE timestamp_start IS NOT NULL")
        chunk_timestamps = {}
        chunk_doc_map = {}
        for row in cursor.fetchall():
            chunk_timestamps[row['id']] = row['timestamp_start']
            chunk_doc_map[row['id']] = row['document_id']
    finally:
        # Close the connection even if a query fails (original leaked it).
        conn.close()
    return doc_videos, chunk_timestamps, chunk_doc_map
def enrich_insight(insight: dict, doc_videos: dict, chunk_timestamps: dict) -> dict:
    """Attach video_url, timestamp_start, and timestamp_url to *insight* in place.

    The insight's document_id and chunk_id are looked up in the supplied
    maps; missing entries fall back to empty strings. Returns the same dict
    for convenience.
    """
    video_url = doc_videos.get(insight.get('document_id', ''), '')
    timestamp = chunk_timestamps.get(insight.get('chunk_id', ''), '')
    insight['video_url'] = video_url
    insight['timestamp_start'] = timestamp
    insight['timestamp_url'] = build_youtube_timestamp_url(video_url, timestamp)
    return insight
def enrich_extraction_file(file_path: Path, doc_videos: dict, chunk_timestamps: dict) -> int:
    """Enrich every insight in a modal extraction JSON file, rewriting it in place.

    Args:
        file_path: Path to the extraction JSON file.
        doc_videos: document_id -> video_url.
        chunk_timestamps: chunk_id -> timestamp_start.

    Returns:
        Number of insights enriched.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    enriched_count = 0
    for result in data.get('results', []):
        if result.get('has_insight') and result.get('insight'):
            enrich_insight(result['insight'], doc_videos, chunk_timestamps)
            enriched_count += 1
    # Mark as enriched; setdefault tolerates files without a metadata
    # section (the original raised KeyError on them).
    metadata = data.setdefault('metadata', {})
    metadata['enriched_with_video'] = True
    metadata['enriched_at'] = datetime.now().isoformat()
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    return enriched_count
def enrich_threads_file(file_path: Path, doc_videos: dict, chunk_timestamps: dict) -> int:
    """Enrich every insight in a named threads JSON file, rewriting it in place.

    Args:
        file_path: Path to the threads JSON file.
        doc_videos: document_id -> video_url.
        chunk_timestamps: chunk_id -> timestamp_start.

    Returns:
        Number of insights enriched across all threads.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    enriched_count = 0
    for thread_name, thread_data in data.get('threads', {}).items():
        for insight in thread_data.get('insights', []):
            enrich_insight(insight, doc_videos, chunk_timestamps)
            enriched_count += 1
    # Mark as enriched; setdefault tolerates files without a metadata
    # section (the original raised KeyError on them).
    metadata = data.setdefault('metadata', {})
    metadata['enriched_with_video'] = True
    metadata['enriched_at'] = datetime.now().isoformat()
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    return enriched_count
def _load_debate_insights(metadata: dict):
    """Load the ordered insight list that debate indices (idx_a/idx_b) refer to.

    Tries the extraction file recorded in the debates metadata first, then a
    known fallback path. Returns None when neither file can be read.
    """
    candidates = [
        Path(metadata.get('input_file', '')),
        Path('insights_first/data/modal_extraction_20260120_024600.json'),
    ]
    for candidate in candidates:
        # is_file() (not exists()) so a missing 'input_file' -- which makes
        # Path('') resolve to the current directory -- falls through to the
        # fallback instead of attempting to open a directory.
        if candidate.is_file():
            with open(candidate, 'r', encoding='utf-8') as f:
                extraction_data = json.load(f)
            return [r['insight'] for r in extraction_data['results']
                    if r['has_insight'] and r['insight']]
    return None


def _attach_debate_side(debate: dict, suffix: str, insight: dict,
                        doc_videos: dict, chunk_timestamps: dict) -> None:
    """Attach video fields for one side ('a' or 'b') of a debate, in place."""
    video_url = doc_videos.get(insight.get('document_id', ''), '')
    timestamp = chunk_timestamps.get(insight.get('chunk_id', ''), '')
    debate[f'video_url_{suffix}'] = video_url
    debate[f'timestamp_start_{suffix}'] = timestamp
    debate[f'timestamp_url_{suffix}'] = build_youtube_timestamp_url(video_url, timestamp)
    debate[f'episode_title_{suffix}'] = insight.get('episode_title', '')


def enrich_debates_file(file_path: Path, doc_videos: dict, chunk_timestamps: dict, chunk_doc_map: dict) -> int:
    """Enrich a debates JSON file with video links for both sides of each debate.

    Debates reference insights only by index (idx_a, idx_b), so the original
    extraction file is reloaded to map indices back to insight records.

    Args:
        file_path: Path to the debates JSON file (rewritten on success).
        doc_videos: document_id -> video_url.
        chunk_timestamps: chunk_id -> timestamp_start.
        chunk_doc_map: chunk_id -> document_id. Currently unused; kept for
            interface compatibility with existing callers.

    Returns:
        Number of debates processed, or 0 if no extraction file was found.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    insights_list = _load_debate_insights(data.get('metadata', {}))
    if insights_list is None:
        print("Warning: Could not find extraction file to enrich debates")
        return 0
    enriched_count = 0
    for debate in data.get('debates', []):
        for suffix in ('a', 'b'):
            idx = debate.get(f'idx_{suffix}')
            if idx is not None and idx < len(insights_list):
                _attach_debate_side(debate, suffix, insights_list[idx],
                                    doc_videos, chunk_timestamps)
        # Counted per debate, whether or not either side resolved (matches
        # the original behavior).
        enriched_count += 1
    # Mark as enriched; setdefault tolerates files without a metadata section.
    metadata = data.setdefault('metadata', {})
    metadata['enriched_with_video'] = True
    metadata['enriched_at'] = datetime.now().isoformat()
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    return enriched_count
def main():
    """CLI entry point: enrich all extraction, thread, and debate files."""
    parser = argparse.ArgumentParser(description='Enrich insights with video URLs')
    parser.add_argument('--db', default='lennys_full.db', help='Database path')
    parser.add_argument('--data-dir', default='insights_first/data', help='Data directory')
    args = parser.parse_args()

    data_dir = Path(args.data_dir)

    print(f"Loading video data from {args.db}...")
    doc_videos, chunk_timestamps, chunk_doc_map = load_video_data(args.db)
    print(f" Found {len(doc_videos)} documents with video URLs")
    print(f" Found {len(chunk_timestamps)} chunks with timestamps")

    # (glob pattern, enrichment callable, label for the summary line)
    jobs = [
        ('modal_extraction_*.json',
         lambda p: enrich_extraction_file(p, doc_videos, chunk_timestamps),
         'insights'),
        ('named_threads_*.json',
         lambda p: enrich_threads_file(p, doc_videos, chunk_timestamps),
         'insights in threads'),
        ('debates_*.json',
         lambda p: enrich_debates_file(p, doc_videos, chunk_timestamps, chunk_doc_map),
         'debates'),
    ]
    for pattern, enrich, label in jobs:
        for file_path in data_dir.glob(pattern):
            print(f"\nEnriching {file_path.name}...")
            count = enrich(file_path)
            print(f" Enriched {count} {label}")

    print("\n✓ Enrichment complete!")


if __name__ == "__main__":
    main()