"""
This script summarizes each transcript (.txt) file in a directory by splitting
it into manageable chunks, summarizing each chunk using the Ollama API, and
then concatenating the chunk summaries into a final summary.
"""

import os
import sys

import ollama
from transformers import AutoTokenizer

# Log file used to track already-summarized transcripts (one filename per line)
LOG_FILENAME = "summarized.log"

# Load the tokenizer used to measure chunk sizes
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
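# NOTE: bert-base-uncased is used here only as a fast way to count tokens.
# Its vocabulary differs from llama3.1's, so the chunk limit below is an
# approximation of what the summarizing model will actually see, not an
# exact count.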

def split_text_by_tokens(text, max_tokens=2000):
    """Splits the input text into chunks that do not exceed the specified token limit."""
    words = text.split()
    chunks = []
    current_chunk = []

    # Grow the current chunk one word at a time, re-tokenizing the joined
    # text so the count stays exact (simple, but quadratic in chunk length)
    for word in words:
        current_chunk.append(word)
        tokens = tokenizer(" ".join(current_chunk))["input_ids"]
        if len(tokens) > max_tokens:
            # Over the limit: drop the last word, close this chunk, and
            # start the next chunk with the word that overflowed
            current_chunk.pop()
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
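
# Illustrative check (a sketch, not executed at import time): every chunk
# produced should come back at or under the limit when re-tokenized.
#
#   text = "word " * 10_000  # any long string
#   chunks = split_text_by_tokens(text)
#   assert all(len(tokenizer(c)["input_ids"]) <= 2000 for c in chunks)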

def summarize_chunk(text, model="llama3.1:8b"):
    """Summarizes a chunk of text using the specified Ollama model."""
    # Prompt sent as the user message
    prompt = (
        "You are an expert summarizer. Summarize the following transcript into a short list "
        "of the main topics discussed or mentioned. Use only bullet points. "
        "Do not include any pleasantries to the user, or any sort of heading.\n\n"
        f"{text}"
    )

    # Send the request to the local Ollama server
    response = ollama.chat(
        model=model,
        messages=[
            {"role": "system", "content": "You summarize transcripts into concise topic overviews."},
            {"role": "user", "content": prompt}
        ]
    )
    return response['message']['content']
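
# Transient server errors could be retried rather than aborting a long run.
# A minimal sketch, not wired into the pipeline below; the ollama client
# raises ollama.ResponseError on request failures:
def summarize_chunk_with_retry(text, model="llama3.1:8b", retries=3):
    """Calls summarize_chunk, retrying with backoff on Ollama errors."""
    import time  # local import keeps this optional helper self-contained
    for attempt in range(retries):
        try:
            return summarize_chunk(text, model)
        except ollama.ResponseError:
            time.sleep(2 ** attempt)  # back off: 1s, 2s, 4s, ...
    raise RuntimeError(f"Ollama request failed after {retries} attempts")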

def summarize_transcript(full_path, model):
    """Summarizes a single transcript file by splitting
    it into chunks and summarizing each chunk."""
    with open(full_path, "r", encoding="utf-8") as f:
        transcript = f.read()

    # Split transcript into chunks
    print("Splitting transcript into chunks...")
    chunks = split_text_by_tokens(transcript)

    print(f"{len(chunks)} chunks created. Summarizing each...")

    # Summarize each chunk
    partial_summaries = []
    for i, chunk in enumerate(chunks):
        print(f"Summarizing chunk {i+1}/{len(chunks)}...")
        summary = summarize_chunk(chunk, model)
        partial_summaries.append(summary)

    print("Combining chunk summaries into the final summary...")

    # Concatenate the partial summaries into the final summary
    combined_summary = "\n".join(partial_summaries)

    # Save the result next to the transcript, as both .txt and .md
    base_name = os.path.splitext(full_path)[0]
    base_name = base_name.replace("_transcript", "")
    summary_path_txt = f"{base_name}_summary.txt"
    with open(summary_path_txt, "w", encoding="utf-8") as f:
        f.write(combined_summary)
    summary_path_md = f"{base_name}_summary.md"
    with open(summary_path_md, "w", encoding="utf-8") as f:
        f.write(combined_summary)

    print(f"Summary saved to: {summary_path_txt} and {summary_path_md}")
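
# The final summary above is a plain concatenation of the per-chunk bullet
# lists. If one deduplicated list is preferred, the partials could be passed
# through the model a second time; a minimal sketch, not called anywhere:
def condense_summaries(partial_summaries, model="llama3.1:8b"):
    """Reduces per-chunk bullet lists into a single deduplicated list."""
    prompt = (
        "Merge the following bullet-point summaries into a single "
        "deduplicated list of the main topics. Use only bullet points.\n\n"
        + "\n".join(partial_summaries)
    )
    response = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    return response['message']['content']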

def summarize_transcripts(file_path, model="llama3.1:8b"):
    """Loops through all .txt files in the specified directory,
    skipping already processed files and summary files."""

    # The log file tracks which transcripts have been processed
    log_path = os.path.join(file_path, LOG_FILENAME)
    processed_files = set()

    # Load already processed files from the log
    if os.path.exists(log_path):
        with open(log_path, "r", encoding="utf-8") as log_file:
            processed_files = set(line.strip() for line in log_file if line.strip())

    # Loop through all files in the directory
    for file in os.listdir(file_path):
        full_path = os.path.join(file_path, file)
        # Only plain .txt transcripts are inputs; summary and corrected
        # files are outputs of this pipeline and are skipped
        if not file.endswith(".txt") or file.endswith(("_summary.txt", "corrected.txt")):
            continue
        if file in processed_files:
            print(f"⏭️ Skipping (already summarized): {file}")
            continue
        print(f"Processing {file}...")
        summarize_transcript(full_path, model)
        # Append to the log immediately so an interrupted run can resume
        with open(log_path, "a", encoding="utf-8") as log_file:
            log_file.write(file + "\n")
            log_file.flush()

# When run as a script, summarize all transcripts in the given directory
# (or the current directory if no argument is supplied)
if __name__ == "__main__":
    summarize_transcripts(sys.argv[1] if len(sys.argv) > 1 else os.getcwd())
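
# Example invocation (assuming this file is saved as summarize_transcripts.py
# and an Ollama server with the llama3.1:8b model is running locally):
#
#   python summarize_transcripts.py /path/to/transcripts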