-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmerge_pairs.py
More file actions
91 lines (75 loc) · 3.05 KB
/
merge_pairs.py
File metadata and controls
91 lines (75 loc) · 3.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
"""Merge valid 2-insight threads into final output."""
import json
from datetime import datetime
# Load the current final output and the min_size=2 clustering run, then
# select every thread that holds exactly two insights for possible merging.
with open('insights_first/data/threads_final.json', 'r', encoding='utf-8') as fh:
    final_data = json.load(fh)
with open('insights_first/data/threads_v2_min2.json', 'r', encoding='utf-8') as fh:
    min2_data = json.load(fh)
two_insight = [t for t in min2_data['threads'].values() if t['size'] == 2]
# Filter out same-guest pairs
def get_guest_name(episode_id):
    """Derive a lowercase guest name from an episode identifier.

    Splits the ID on '-' and '_', drops purely numeric tokens among the
    first three, and joins at most two surviving tokens in lower case.
    """
    tokens = episode_id.replace('-', ' ').replace('_', ' ').split()
    # Only the leading tokens can be the guest name; episode numbers are dropped.
    name_tokens = [tok for tok in tokens[:3] if not tok.isdigit()]
    return ' '.join(name_tokens[:2]).lower()
# Partition the 2-insight threads: keep pairs whose two episodes come from
# different guests, and record rejected same-guest pairs for the report.
valid_pairs = []
filtered_out = []
for thread in two_insight:
    guests = [
        get_guest_name(insight['document_id'])
        for insight in thread['insights']
    ]
    if guests[0] == guests[1]:
        filtered_out.append((thread, guests[0]))
    else:
        valid_pairs.append(thread)
print(f"Valid 2-insight threads: {len(valid_pairs)}")
print(f"Filtered out (same guest): {len(filtered_out)}")
# Append every valid pair to the final thread list with sequential IDs.
existing_threads = final_data['threads']
# NOTE(review): assumes existing thread_ids are 0..n-1 so len() is the next
# free ID — confirm against the producer of threads_final.json.
next_id = len(existing_threads)
for pair in valid_pairs:
    main_topic = pair.get('representative_topics', ['Unnamed'])[0]
    existing_threads.append({
        'thread_id': next_id,
        'thread_name': main_topic,  # Match existing schema
        'core_claim': main_topic,   # Simplified for 2-insight threads
        'why_connected': 'Topic-based clustering with min_size=2',
        'size': pair['size'],
        'num_episodes': pair['num_episodes'],
        'episodes': pair['episodes'],
        'category': pair.get('category', 'general'),
        'coherence': pair.get('coherence', 0.0),
        'insights': pair['insights'],
    })
    next_id += 1
# Refresh the summary metadata and persist the merged result to disk.
total_insights = sum(thread['size'] for thread in existing_threads)
meta = final_data['metadata']
meta['threads_found'] = len(existing_threads)
meta['total_insights'] = total_insights
# NOTE(review): 465 appears to be the corpus-wide insight count — confirm.
meta['coverage_pct'] = round(total_insights / 465 * 100, 1)
meta['updated'] = datetime.now().isoformat()
meta['note'] = 'Added 2-insight threads (filtered same-guest duplicates)'
with open('insights_first/data/threads_final.json', 'w', encoding='utf-8') as out:
    json.dump(final_data, out, indent=2, ensure_ascii=False)
# Console summary: totals, rejected same-guest pairs, newly added pairs.
print("\n=== UPDATED FINAL OUTPUT ===")
print(f"Threads: {len(existing_threads)}")
print(f"Insights: {total_insights}")
print(f"Coverage: {total_insights/465*100:.1f}%")
print("\nFiltered out same-guest pairs:")
for thread, guest in filtered_out:
    label = thread.get('representative_topics', ['?'])[0]
    print(f" - {label} (both from {guest})")
print("\nAdded 2-insight threads:")
for thread in valid_pairs:
    label = thread.get('representative_topics', ['?'])[0]
    ep_pair = thread['episodes']
    print(f" + {label}")
    print(f"    ({ep_pair[0]}, {ep_pair[1]})")