Skip to content

Commit b3fddbe

Browse files
initial commit
0 parents  commit b3fddbe

15 files changed

Lines changed: 1152 additions & 0 deletions

.gitignore

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# Ignore log files
2+
*.log
3+
4+
# Ignore media files
5+
*.mp3
6+
*.mp4
7+
*.mkv
8+
*.avi
9+
*.wav
10+
*.flac
11+
*.mov
12+
*.wmv
13+
*.aac
14+
*.ogg
15+
*.webm
16+
*.jpg
17+
*.jpeg
18+
*.png
19+
*.gif
20+
*.bmp
21+
*.tiff
22+
*.svg
23+
24+
# Ignore Python cache and environment files
25+
__pycache__/
26+
*.pyc
27+
*.pyo
28+
*.pyd
29+
.env
30+
.venv
31+
venv/
32+
env/
33+
ENV/
34+
*.egg-info/
35+
*.egg
36+
constants.py
37+
38+
# Ignore VS Code settings
39+
.vscode/
40+
41+
# Ignore OS files
42+
.DS_Store
43+
Thumbs.db
44+
45+
# Ignore output and temporary files
46+
*.out
47+
*.tmp
48+
*.bak
49+
50+
# Ignore model and data files
51+
*.h5
52+
*.hdf5
53+
*.ckpt
54+
*.pth
55+
*.pt
56+
*.pb
57+
*.npz
58+
*.npy
59+
*.csv
60+
*.tsv
61+
*.xlsx
62+
*.xls
63+
*.pickle
64+
*.joblib
65+
*.tar.gz
66+
*.zip
67+
*.rar
68+
*.7z

1_download.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
"""
2+
This script uses yt-dlp to download audio, by year, from the
3+
Shell Game podcast playlist.
4+
"""
5+
6+
import yt_dlp
7+
8+
def download_playlist(playlist_url):
    """Download every episode from the playlist/RSS feed as best-quality MP3.

    Uses a yt-dlp download archive ("downloaded.log") so episodes already
    fetched are skipped on re-runs, and files episodes into per-year folders
    derived from each entry's upload date.

    Args:
        playlist_url: URL of the playlist or podcast RSS feed to download.
    """
    ydl_opts = {
        "format": "bestaudio",
        "noplaylist": False,
        "ignoreerrors": True,  # a single broken episode must not abort the run
        "download_archive": "downloaded.log",
        "outtmpl": "%(upload_date>%Y)s/%(title)s.%(ext)s",
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
                "preferredquality": "0",  # "0" = best VBR quality for mp3
            }
        ],
        # FIX: the original used the CLI spellings 'concurrent-fragments' and
        # 'no-mtime', which the Python API silently ignores. The embedded-API
        # option names are 'concurrent_fragment_downloads' (an integer count)
        # and 'updatetime' (False = don't set file mtime from upload date).
        "concurrent_fragment_downloads": 4,
        "updatetime": False,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([playlist_url])
30+
31+
32+
if __name__ == "__main__":
    # RSS feed for the Shell Game podcast playlist.
    feed_url = "https://www.omnycontent.com/d/playlist/e73c998e-6e60-432f-8610-ae210140c5b1/d3d3abca-191a-4010-8160-b3530112d393/c639b22c-ee8c-43dd-86c1-b3530112d3a3/podcast.rss"
    download_playlist(feed_url)
    print("Downloaded all audio from the Shell Game podcast website.")

2_transcriber.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
"""
2+
This script transcribes audio files from the Shell Game
3+
podcast episodes using OpenAI's Whisper model.
4+
"""
5+
6+
import os
7+
import sys
8+
import re
9+
import whisper
10+
11+
# Log file name used to record which episodes have been transcribed.
LOG_FILENAME = "transcribed.log"

# Check for PyTorch / CUDA availability and report it up front.
import torch
print("Is CUDA enabled? " + str(torch.cuda.is_available()))
if torch.cuda.is_available():
    # FIX: the original called get_device_name(0) unconditionally, which
    # raises on CPU-only machines right after printing that CUDA is absent.
    print("Current CUDA GPU: " + str(torch.cuda.get_device_name(0)))
18+
19+
def transcribe_audio(folder_path):
    """Transcribes only .mp3 files in the specified directory using Whisper."""
    log_path = os.path.join(folder_path, LOG_FILENAME)

    # Names already recorded in the log are skipped on this run.
    done = set()
    if os.path.exists(log_path):
        with open(log_path, "r", encoding="utf-8") as log_file:
            done = {line.strip() for line in log_file}

    # Append each successfully transcribed name so interrupted runs resume.
    with open(log_path, "a", encoding="utf-8") as log_file:
        for name in os.listdir(folder_path):
            if not name.endswith(".mp3"):
                continue
            if name in done:
                print(f"⏭️ Skipping (already transcribed): {name}")
                continue
            if transcribe(os.path.join(folder_path, name)):
                log_file.write(name + "\n")
                log_file.flush()
43+
44+
def transcribe(file_path):
    """Transcribe a single audio file with Whisper, saving timestamped output.

    Writes identical "[start --> end] text" lines to both a .txt and a .md
    file next to the audio (any trailing "[...]" tag is stripped from the
    base name first).

    Args:
        file_path: Path to the audio file to transcribe.

    Returns:
        True on success, False if transcription or writing failed.
    """
    try:
        # FIX: load the (large) Whisper model once and reuse it across calls,
        # instead of reloading it from disk for every single file.
        if not hasattr(transcribe, "_model"):
            print("Loading model...")
            transcribe._model = whisper.load_model("turbo")
        model = transcribe._model

        print(f"Transcribing: {file_path}")
        result = model.transcribe(file_path, language="en", verbose=True)

        # Create output file name; drop a trailing "[id]"-style bracket tag.
        base_name = re.sub(r"\s*\[.*?\]", "", os.path.splitext(file_path)[0])
        output_path = f"{base_name}_transcript"

        # Render segments once, then write the same content as .txt and .md
        # (the two formats were duplicated loops in the original).
        lines = [
            f"[{segment['start']:.2f} --> {segment['end']:.2f}] {segment['text']}\n"
            for segment in result["segments"]
        ]
        for ext in (".txt", ".md"):
            with open(output_path + ext, "w", encoding="utf-8") as f:
                f.writelines(lines)

        # User feedback
        print(f"Transcription saved to: {output_path}.txt and {output_path}.md")
        return True
    # Best-effort: report the failure and let the caller continue with the
    # next file rather than aborting the whole batch.
    except Exception as e:
        print(f"Failed to transcribe {file_path}: {e}")
        return False
81+
82+
# When run as a script, transcribe every audio file in the target directory
# (first CLI argument, defaulting to the current working directory).
if __name__ == "__main__":
    target_dir = sys.argv[1] if len(sys.argv) > 1 else os.getcwd()
    transcribe_audio(target_dir)

3_summarizer.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
"""
2+
This script summarizes a transcript file by splitting it into manageable chunks,
3+
summarizing each chunk using the Ollama API, and then combining the summaries into a final summary.
4+
"""
5+
6+
import os
7+
import sys
8+
import ollama
9+
from transformers import AutoTokenizer
10+
11+
# Log file name used to track which transcripts have been summarized.
LOG_FILENAME = "summarized.log"

# Load tokenizer — used only to count tokens when splitting transcripts
# into chunks; bert-base-uncased serves as a cheap, stable token counter.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
16+
17+
def split_text_by_tokens(text, max_tokens=2000):
    """Split *text* into whitespace-delimited chunks of at most max_tokens tokens.

    Token counts come from the module-level BERT tokenizer. Each word is
    tokenized exactly once (BERT's WordPiece pre-tokenizes on whitespace, so
    per-word counts sum to the joined-chunk count), replacing the original's
    O(n^2) re-tokenization of the entire growing chunk after every word.

    Args:
        text: The raw transcript text.
        max_tokens: Maximum tokens per chunk, counted as the original did —
            including the special tokens ([CLS]/[SEP]) the tokenizer adds.

    Returns:
        A list of chunk strings in order; words are never split. A single
        word longer than the budget becomes its own chunk (the original
        emitted an empty chunk and carried the word forward instead).
    """
    # The original measured len(tokenizer(chunk)["input_ids"]), which
    # includes the special tokens; reserve room for them in the budget.
    budget = max_tokens - tokenizer.num_special_tokens_to_add()

    chunks = []
    current_chunk = []
    current_count = 0
    for word in text.split():
        word_tokens = len(tokenizer(word, add_special_tokens=False)["input_ids"])
        if current_chunk and current_count + word_tokens > budget:
            # Adding this word would overflow: flush and start a new chunk.
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_count = word_tokens
        else:
            current_chunk.append(word)
            current_count += word_tokens

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
36+
37+
def summarize_chunk(text, model="llama3.1:8b"):
    """Summarizes a chunk of text using the specified Ollama model."""
    # Instruction prompt: topic bullet list only, no pleasantries or headings.
    prompt = (
        "You are an expert summarizer. Summarize the following transcript into a short list "
        "of the main topics discussed or mentioned. Use only bullet points. "
        "Do not include any pleasantries to the user, or any sort of heading.\n\n"
        f"{text}"
    )

    # System + user messages sent to the local Ollama server.
    messages = [
        {"role": "system", "content": "You summarize transcripts into concise topic overviews."},
        {"role": "user", "content": prompt},
    ]
    reply = ollama.chat(model=model, messages=messages)
    return reply['message']['content']
56+
57+
def summarize_transcript(full_path, model):
    """Summarize one transcript file: chunk it, summarize each chunk with the
    given model, then join the partial summaries and save .txt and .md copies."""
    with open(full_path, "r", encoding="utf-8") as f:
        transcript = f.read()

    # Split transcript into token-bounded chunks.
    print("Splitting transcript into chunks...")
    chunks = split_text_by_tokens(transcript)
    print(f"{len(chunks)} chunks created. Summarizing each...")

    # Summarize each chunk in order.
    partial_summaries = []
    for i, chunk in enumerate(chunks):
        print(f"Summarizing chunk {i+1}/{len(chunks)}...")
        partial_summaries.append(summarize_chunk(chunk, model))

    print("Generating final summary from chunk summaries...")
    combined_summary = "\n".join(partial_summaries)

    # Derive "<episode>_summary.*" from "<episode>_transcript.txt" and write
    # the same content in both formats.
    base_name = os.path.splitext(full_path)[0].replace("_transcript", "")
    summary_path_txt = f"{base_name}_summary.txt"
    summary_path_md = f"{base_name}_summary.md"
    for out_path in (summary_path_txt, summary_path_md):
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(combined_summary)

    print(f"Summary saved to: {summary_path_txt} and {summary_path_md}")
92+
93+
def summarize_transcripts(file_path, model="llama3.1:8b"):
    """Summarize every unprocessed transcript .txt file in a directory.

    Skips pipeline-generated files (summaries and corrected transcripts) and
    anything already recorded in the summarized.log tracking file; each
    successfully summarized file name is appended to that log immediately so
    an interrupted run can resume where it left off.

    Args:
        file_path: Directory containing transcript .txt files.
        model: Ollama model name passed through to summarize_transcript().
    """
    # Log file that tracks processed files across runs.
    log_path = os.path.join(file_path, LOG_FILENAME)

    # Load already processed files from the log, if present.
    processed_files = set()
    if os.path.exists(log_path):
        with open(log_path, "r", encoding="utf-8") as log_file:
            processed_files = {line.strip() for line in log_file if line.strip()}

    # Output files produced by this pipeline — never re-summarize these.
    # (The .md suffixes in the original were dead checks, since only .txt
    # files get this far; kept out for clarity.)
    generated_suffixes = ("_summary.txt", "corrected.txt")

    for file in os.listdir(file_path):
        # Guard clauses replace the original's five-way condition chain.
        if not file.endswith(".txt") or file.endswith(generated_suffixes):
            continue
        if file in processed_files:
            print(f"⏭️ Skipping (already summarized): {file}")
            continue
        print(f"Processing {file}...")
        summarize_transcript(os.path.join(file_path, file), model)
        # Record success right away; closing the file flushes it.
        with open(log_path, "a", encoding="utf-8") as log_file:
            log_file.write(file + "\n")
119+
# When run as a script, summarize every transcript in the target directory
# (first CLI argument, defaulting to the current working directory).
if __name__ == "__main__":
    target_dir = sys.argv[1] if len(sys.argv) > 1 else os.getcwd()
    summarize_transcripts(target_dir)

0 commit comments

Comments
 (0)