diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..3702a9c --- /dev/null +++ b/.env.example @@ -0,0 +1,5 @@ +OPENAI_API_KEY= +LOCAL_LLM_API_URL= +GROQ_API_KEY= +LOCAL_LLM_REQUEST_TIMEOUT= +GUNICORN_TIMEOUT= \ No newline at end of file diff --git a/.gitignore b/.gitignore index 1f9b105..19dc5a2 100644 --- a/.gitignore +++ b/.gitignore @@ -178,4 +178,5 @@ logs/* *.wav *.mp3 *.index -*.pkl \ No newline at end of file +*.pkl +*.videos \ No newline at end of file diff --git a/README.md b/README.md index 54119c2..9538c66 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,9 @@ ## Features * 🎙️ **Audio Transcription**: Convert spoken content from meetings into text using advanced speech-to-text capabilities. +* 🎬 **Video Upload Support**: Upload meeting videos directly and extract audio automatically before transcription. * 🧠 **Summarization with LLMs**: Generate concise summaries of transcribed text utilizing powerful language models. +* 🧩 **Flexible LLM Providers**: Use cloud models (OpenAI, Groq) or connect to local models through LM Studio or Ollama. * Mind Map Generation: Create visual representations of meeting summaries to enhance understanding and retention. * 🖥️ **User-Friendly Interface**: Interact with a clean and responsive web UI built with Flask. * 🐳 **Dockerized Deployment**: Easily deploy the application using Docker and Docker Compose for a consistent environment setup. @@ -43,11 +45,14 @@ ```bash export GROQ_API_KEY=your_groq_api_key export OPENAI_API_KEY=your_openai_api_key + export LOCAL_LLM_API_URL=http://your-local-llm-host:1234 + export LOCAL_LLM_REQUEST_TIMEOUT=900 + export GUNICORN_TIMEOUT=960 ``` -*Replace `your_groq_api_key` and `your_openai_api_key` with your actual API keys.* +*Replace the placeholder values with your actual keys and local LLM URL. For Ollama, use port 11434. Increase `LOCAL_LLM_REQUEST_TIMEOUT` for slower local models, and keep `GUNICORN_TIMEOUT` higher than that value.* 4. 
**Build and Run the Docker Container** diff --git a/audio_extraction.py b/audio_extraction.py new file mode 100644 index 0000000..477a8cf --- /dev/null +++ b/audio_extraction.py @@ -0,0 +1,27 @@ +import subprocess +from pathlib import Path + + +def extract_audio(video_file, output_file=None): + video_path = Path(video_file) + + if output_file is None: + output_file = video_path.with_suffix(".wav") + + command = [ + "ffmpeg", + "-i", str(video_path), + "-vn", # no video + "-acodec", "pcm_s16le", + "-ar", "16000", # 16 kHz sample rate + "-ac", "1", # mono audio + str(output_file), + "-y" + ] + + subprocess.run(command, check=True) + print(f"Audio saved to: {output_file}") + + +if __name__ == "__main__": + extract_audio("") \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 0d624ec..35ec139 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,21 +1,19 @@ services: flask: - container_name: Speak2Summary-flask + container_name: s2s-flask build: context: . dockerfile: Dockerfile restart: always - # ports: - # - "5000:5000" - environment: - - GROQ_API_KEY=${GROQ_API_KEY} - - OPENAI_API_KEY=${OPENAI_API_KEY} + ports: + - "5000:5000" + env_file: + - .env depends_on: - redis - command: gunicorn -w 4 -b 0.0.0.0:5000 src.app:app --timeout 120 + command: gunicorn -w 4 -b 0.0.0.0:5000 src.app:app --timeout ${GUNICORN_TIMEOUT:-960} --graceful-timeout 120 networks: - Speak2Summary-net - - homelab labels: - "tsdproxy.enable=true" - "tsdproxy.name=speak2summary" @@ -33,15 +31,14 @@ services: retries: 5 celery: - container_name: Speak2Summary-celery + container_name: s2s-celery build: context: . 
dockerfile: Dockerfile restart: always command: celery -A src.celery_worker.celery worker --loglevel=info - environment: - - GROQ_API_KEY=${GROQ_API_KEY} - - OPENAI_API_KEY=${OPENAI_API_KEY} + env_file: + - .env depends_on: - redis networks: @@ -53,7 +50,7 @@ services: com.Speak2Summary.service: "celery-worker" redis: - container_name: Speak2Summary-redis + container_name: s2s-redis image: redis:7-alpine restart: always ports: @@ -71,8 +68,7 @@ services: networks: Speak2Summary-net: driver: bridge - homelab: - external: true + # networks: # homelab: diff --git a/src/audio_extraction.py b/src/audio_extraction.py new file mode 100644 index 0000000..615b223 --- /dev/null +++ b/src/audio_extraction.py @@ -0,0 +1,36 @@ +# cython: language_level=3 +import subprocess +from pathlib import Path +from typing import Optional + + +def extract_audio(video_file: str, output_file: Optional[str] = None) -> str: + """Extract mono 16kHz WAV audio from a video file using ffmpeg.""" + video_path = Path(video_file) + + if output_file is None: + output_path = video_path.with_suffix(".wav") + else: + output_path = Path(output_file) + + command = [ + "ffmpeg", + "-i", + str(video_path), + "-vn", + "-acodec", + "pcm_s16le", + "-ar", + "16000", + "-ac", + "1", + str(output_path), + "-y", + ] + + subprocess.run(command, check=True) + return str(output_path) + + +if __name__ == "__main__": + extract_audio("videos/sample.mp4") diff --git a/src/llm_clients.py b/src/llm_clients.py new file mode 100644 index 0000000..5e4fc16 --- /dev/null +++ b/src/llm_clients.py @@ -0,0 +1,253 @@ +# cython: language_level=3 +import json +import os +import re +import socket +from typing import Any, Dict, Optional +from urllib import request as urllib_request +from urllib.error import HTTPError, URLError + +from transmeet import generate_meeting_minutes_from_transcript, generate_mind_map_from_transcript +from transmeet.utils.general_utils import get_logger + +logger = get_logger(__name__) + + +def 
_default_local_llm_url() -> Optional[str]:
+    """Return LOCAL_LLM_API_URL from the environment, or None when it is unset or blank."""
+    configured_url = os.getenv("LOCAL_LLM_API_URL", "").strip()
+    return configured_url or None
+
+
+def _local_llm_timeout_seconds() -> float:
+    """Return the local-LLM request timeout in seconds.
+
+    Reads LOCAL_LLM_REQUEST_TIMEOUT; unset or blank means 900. Values that are
+    not finite positive numbers are logged and replaced by 900.
+    """
+    # A blank value (as in the .env.example template) is treated like an unset
+    # variable instead of being reported as invalid on every request.
+    configured_timeout = os.getenv("LOCAL_LLM_REQUEST_TIMEOUT", "").strip() or "900"
+    try:
+        timeout = float(configured_timeout)
+        # The chained comparison is False for NaN and infinity as well as for
+        # zero and negative values, none of which is a usable socket timeout.
+        if not 0 < timeout < float("inf"):
+            raise ValueError
+        return timeout
+    except ValueError:
+        logger.warning(
+            "Invalid LOCAL_LLM_REQUEST_TIMEOUT '%s'; falling back to 900 seconds",
+            configured_timeout,
+        )
+        return 900.0
+
+
+def _normalize_lmstudio_endpoint(base_url: Optional[str]) -> str:
+    """Return the LM Studio chat-completions URL for ``base_url``.
+
+    Falls back to LOCAL_LLM_API_URL and then to http://localhost:1234.
+    """
+    base_url = base_url or _default_local_llm_url()
+    if not base_url:
+        return "http://localhost:1234/v1/chat/completions"
+
+    url = base_url.rstrip("/")
+    if url.endswith("/v1/chat/completions"):
+        return url
+    if url.endswith("/v1"):
+        return f"{url}/chat/completions"
+    return f"{url}/v1/chat/completions"
+
+
+def _normalize_ollama_endpoint(base_url: Optional[str]) -> str:
+    """Return the Ollama chat URL for ``base_url``.
+
+    Falls back to LOCAL_LLM_API_URL and then to http://localhost:11434.
+    """
+    base_url = base_url or _default_local_llm_url()
+    if not base_url:
+        return "http://localhost:11434/api/chat"
+
+    url = base_url.rstrip("/")
+    if url.endswith("/api/chat"):
+        return url
+    if url.endswith("/api"):
+        return f"{url}/chat"
+    return f"{url}/api/chat"
+
+
+def _post_json(endpoint: str, payload: Dict[str, Any], api_key: Optional[str] = None) -> Dict[str, Any]:
+    """POST ``payload`` as JSON to ``endpoint`` and return the decoded JSON reply.
+
+    Sends ``api_key`` as a Bearer token when given. Raises RuntimeError for
+    non-HTTP(S) endpoints, HTTP errors, timeouts and unreachable hosts.
+    """
+    # urlopen() also understands file:// and ftp:// URLs, and the endpoint can be
+    # built from a caller-supplied base URL, so only HTTP(S) targets are accepted.
+    if not endpoint.lower().startswith(("http://", "https://")):
+        raise RuntimeError(f"Unsupported URL scheme for LLM endpoint: {endpoint}")
+
+    data = json.dumps(payload).encode("utf-8")
+    headers = {"Content-Type": "application/json"}
+    timeout_seconds = _local_llm_timeout_seconds()
+
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
+
+    request = urllib_request.Request(endpoint, data=data, headers=headers, method="POST")
+    try:
+        with urllib_request.urlopen(request, timeout=timeout_seconds) as response:
+            return json.loads(response.read().decode("utf-8"))
+    except HTTPError as exc:
+        body = ""
+        try:
+            body = exc.read().decode("utf-8")
+        except Exception:
+            body = ""
+        raise RuntimeError(f"HTTP {exc.code} calling {endpoint}: {body}") from exc
+    except socket.timeout as exc:
+
raise RuntimeError(
+            f"Timed out after {int(timeout_seconds)}s calling {endpoint}. "
+            "Increase LOCAL_LLM_REQUEST_TIMEOUT or use a faster model."
+        ) from exc
+    except URLError as exc:
+        if isinstance(exc.reason, socket.timeout):
+            raise RuntimeError(
+                f"Timed out after {int(timeout_seconds)}s calling {endpoint}. "
+                "Increase LOCAL_LLM_REQUEST_TIMEOUT or use a faster model."
+            ) from exc
+        raise RuntimeError(f"Unable to reach {endpoint}: {exc.reason}") from exc
+
+
+def _get_lmstudio_content(response: Dict[str, Any]) -> str:
+    """Return the stripped text of the first choice in an LM Studio reply.
+
+    Raises RuntimeError when the reply has no choices or no message content.
+    """
+    choices = response.get("choices") or []
+    if not choices:
+        raise RuntimeError("LM Studio response did not contain choices")
+
+    message = choices[0].get("message") or {}
+    content = message.get("content")
+    if not content:
+        raise RuntimeError("LM Studio response did not contain message content")
+    return content.strip()
+
+
+def _get_ollama_content(response: Dict[str, Any]) -> str:
+    """Return the stripped message text of an Ollama reply.
+
+    Raises RuntimeError when the reply has no message content.
+    """
+    message = response.get("message") or {}
+    content = message.get("content")
+    if not content:
+        raise RuntimeError("Ollama response did not contain message content")
+    return content.strip()
+
+
+def _extract_json_object(raw_text: str) -> Dict[str, Any]:
+    """Parse the JSON object contained in a model reply.
+
+    Accepts bare JSON, JSON wrapped in a Markdown code fence, or a JSON object
+    embedded in surrounding text. Raises RuntimeError when no JSON object can
+    be decoded.
+    """
+    text = raw_text.strip()
+
+    if text.startswith("```"):
+        # Single backslash inside the raw strings: r"\\s" would match a literal
+        # backslash followed by "s", so the fence would never be stripped.
+        text = re.sub(r"^```(?:json)?\s*", "", text)
+        text = re.sub(r"\s*```$", "", text)
+
+    try:
+        parsed = json.loads(text)
+        if isinstance(parsed, dict):
+            return parsed
+    except json.JSONDecodeError:
+        pass  # fall through to the brace-delimited search below
+
+    start = text.find("{")
+    end = text.rfind("}")
+    if start != -1 and end > start:
+        try:
+            parsed = json.loads(text[start : end + 1])
+        except json.JSONDecodeError as exc:
+            # Report undecodable text with the same error type as every other failure here.
+            raise RuntimeError("Could not parse JSON object from local model response") from exc
+        if isinstance(parsed, dict):
+            return parsed
+
+    raise RuntimeError("Could not parse JSON object from local model response")
+
+
+def _local_chat_completion(
+    llm_client: str,
+    llm_model: str,
+    system_prompt: str,
+    user_prompt: str,
+    llm_base_url: Optional[str] = None,
+    llm_api_key: Optional[str] = None,
+) -> str:
+    messages = [
+        {"role": "system", "content": system_prompt},
+
{"role": "user", "content": user_prompt}, + ] + + if llm_client == "lmstudio": + endpoint = _normalize_lmstudio_endpoint(llm_base_url) + payload = { + "model": llm_model, + "messages": messages, + "temperature": 0.2, + "stream": False, + } + response = _post_json(endpoint, payload, llm_api_key) + return _get_lmstudio_content(response) + + if llm_client == "ollama": + endpoint = _normalize_ollama_endpoint(llm_base_url) + payload = { + "model": llm_model, + "messages": messages, + "stream": False, + "options": {"temperature": 0.2}, + } + response = _post_json(endpoint, payload, llm_api_key) + return _get_ollama_content(response) + + raise RuntimeError(f"Unsupported local client '{llm_client}'") + + +def generate_meeting_minutes_with_provider( + transcript: str, + llm_client: str, + llm_model: str, + llm_base_url: Optional[str] = None, + llm_api_key: Optional[str] = None, +) -> str: + if llm_client in {"lmstudio", "ollama"}: + system_prompt = ( + "You generate clear, concise meeting minutes in markdown. " + "Use sections: Executive Summary, Key Discussion Points, Decisions Made, " + "Action Items, Risks and Blockers, and Next Steps." + ) + user_prompt = ( + "Create polished meeting minutes from the transcript below. " + "Use bullet points where useful and keep factual accuracy.\n\n" + f"Transcript:\n{transcript}" + ) + return _local_chat_completion( + llm_client, + llm_model, + system_prompt, + user_prompt, + llm_base_url, + llm_api_key, + ) + + return generate_meeting_minutes_from_transcript( + transcript, + llm_client=llm_client, + llm_model=llm_model, + ) + + +def generate_mind_map_with_provider( + transcript: str, + llm_client: str, + llm_model: str, + llm_base_url: Optional[str] = None, + llm_api_key: Optional[str] = None, +) -> Dict[str, Any]: + if llm_client in {"lmstudio", "ollama"}: + system_prompt = ( + "You return strict JSON only, no markdown. " + "Generate a hierarchical meeting mind map." 
+ ) + user_prompt = ( + "From the transcript below, return only a JSON object shaped like:\n" + "{\n" + " \"Root Topic\": \"Meeting Title\",\n" + " \"Section\": {\n" + " \"Subtopic\": [\"point 1\", \"point 2\"]\n" + " }\n" + "}\n" + "Use concise keys and meaningful grouping.\n\n" + f"Transcript:\n{transcript}" + ) + + raw_response = _local_chat_completion( + llm_client, + llm_model, + system_prompt, + user_prompt, + llm_base_url, + llm_api_key, + ) + parsed_response = _extract_json_object(raw_response) + if "Root Topic" not in parsed_response: + parsed_response["Root Topic"] = "Meeting Mind Map" + return parsed_response + + return generate_mind_map_from_transcript( + transcript, + llm_client=llm_client, + llm_model=llm_model, + ) diff --git a/src/models.json b/src/models.json index a1904a8..cf3ec3a 100644 --- a/src/models.json +++ b/src/models.json @@ -1,19 +1,21 @@ { "transcription_model_to_client": { - "whisper-large-v3-turbo": "groq", - "whisper-large-v3": "groq", - "whisper-1": "openai", "gpt-4o-transcribe": "openai", - "gpt-4o-mini-transcribe": "openai" + "gpt-4o-mini-transcribe": "openai", + "whisper-1": "openai", + "whisper-large-v3-turbo": "groq", + "whisper-large-v3": "groq" }, "llm_model_to_client": { "llama-3.3-70b-versatile": "groq", - "llama3-70b-8192": "groq", - "gemma2-9b-it": "groq", - "mistral-saba-24b": "groq", - "qwen-qwq-32b": "groq", - "gpt-3.5-turbo": "openai", - "chatgpt-4o-latest": "openai", - "gpt-4o-mini": "openai" + "llama-3.1-8b-instant": "groq", + "qwen/qwen3-32b": "groq", + "deepseek-r1-distill-llama-70b": "groq", + "gpt-4.1": "openai", + "gpt-4.1-mini": "openai", + "gpt-4o": "openai", + "gpt-4o-mini": "openai", + "o4-mini": "openai", + "qwen/qwen3-14b": "lmstudio" } } \ No newline at end of file diff --git a/src/routes/api_routes.py b/src/routes/api_routes.py index a11d37d..601629a 100644 --- a/src/routes/api_routes.py +++ b/src/routes/api_routes.py @@ -1,119 +1,144 @@ # cython: language_level=3 -from flask import jsonify from 
datetime import datetime -from flask import request, jsonify + +from flask import jsonify, request +from transmeet.utils.general_utils import get_logger + +from src.llm_clients import ( + generate_meeting_minutes_with_provider, + generate_mind_map_with_provider, +) from src.models import TranscriptEntry, db -from transmeet import generate_mind_map_from_transcript, generate_meeting_minutes_from_transcript from src.utils import render_minutes_with_tailwind -from transmeet.utils.general_utils import get_logger -from src.models import TranscriptEntry from . import audio_bp logger = get_logger(__name__) + @audio_bp.route('/api/files', methods=['GET']) def list_files(): files = TranscriptEntry.query.order_by(TranscriptEntry.upload_time.desc()).all() - return jsonify([{ - 'id': f.id, - 'filename': f.filename, - 'status': f.status, - 'upload_time': f.upload_time.isoformat(), - 'transcript_available': f.transcript is not None, - 'minutes_available': f.minutes is not None, - 'error_message': f.error_message, - 'transcription_client': f.transcription_client, - 'transcription_model': f.transcription_model, - 'llm_client': f.llm_client, - 'llm_model': f.llm_model, - 'mind_map': f.mind_map is not None, - 'topic': f.mind_map.get('Root Topic') if f.mind_map else None, - - } for f in files]) - - -@audio_bp.route("/api/generate_mindmap", methods=["POST"]) + return jsonify([ + { + 'id': f.id, + 'filename': f.filename, + 'status': f.status, + 'upload_time': f.upload_time.isoformat(), + 'transcript_available': f.transcript is not None, + 'minutes_available': f.minutes is not None, + 'error_message': f.error_message, + 'transcription_client': f.transcription_client, + 'transcription_model': f.transcription_model, + 'llm_client': f.llm_client, + 'llm_model': f.llm_model, + 'mind_map': f.mind_map is not None, + 'topic': f.mind_map.get('Root Topic') if f.mind_map else None, + } + for f in files + ]) + + +@audio_bp.route('/api/generate_mindmap', methods=['POST']) def generate_mindmap_api(): 
- """API to generate mind map from transcript""" - data = request.get_json() - file_id = data.get("id") - - llm_client = data.get("llm-client") - llm_model = data.get("llm-model") - + """API to generate mind map from transcript.""" + data = request.get_json(silent=True) or {} + file_id = data.get('id') + force_regenerate = bool(data.get('force')) if not file_id: - return jsonify({"error": "Missing 'id' in request body"}), 400 + return jsonify({'error': "Missing 'id' in request body"}), 400 file_record = TranscriptEntry.query.filter_by(id=file_id).first() if not file_record: - return jsonify({"error": f"No record found for id: {file_id}"}), 404 + return jsonify({'error': f'No record found for id: {file_id}'}), 404 - if file_record.mind_map: - return jsonify({"message": "Mind map already exists", "mind_map": file_record.mind_map}), 200 + if file_record.mind_map and not force_regenerate: + return jsonify({'message': 'Mind map already exists', 'mind_map': file_record.mind_map}), 200 if not file_record.transcript: - return jsonify({"error": "Transcript not found for this file"}), 400 + return jsonify({'error': 'Transcript not found for this file'}), 400 - if not llm_client or not llm_model: - return jsonify({"error": "Missing 'llm_client' or 'llm_model' in request body"}), 400 + llm_client = (data.get('llm-client') or '').strip() + llm_model = (data.get('llm-model') or '').strip() + llm_base_url = data.get('llm-base-url') + llm_api_key = data.get('llm-api-key') - # Generate the mind map - mindmap_data = generate_mind_map_from_transcript( - file_record.transcript, - llm_client=llm_client, - llm_model=llm_model - ) + if not llm_client or not llm_model: + return jsonify({'error': "Missing 'llm-client' or 'llm-model' in request body"}), 400 + + try: + mindmap_data = generate_mind_map_with_provider( + file_record.transcript, + llm_client=llm_client, + llm_model=llm_model, + llm_base_url=llm_base_url, + llm_api_key=llm_api_key, + ) + except Exception as exc: + 
logger.error(f'Mind map generation failed for {file_id}: {exc}') + return jsonify({'error': str(exc)}), 500 file_record.mind_map = mindmap_data - file_record.status = "completed" + file_record.llm_client = llm_client + file_record.llm_model = llm_model + file_record.status = 'completed' file_record.completion_time = datetime.utcnow() db.session.commit() - return jsonify({"message": "Mind map generated successfully", "mind_map": mindmap_data}), 200 + return jsonify({'message': 'Mind map generated successfully', 'mind_map': mindmap_data}), 200 -@audio_bp.route("/api/generate_meeting_minutes", methods=["POST"]) +@audio_bp.route('/api/generate_meeting_minutes', methods=['POST']) def generate_meeting_minutes_api(): - """API to generate meeting minutes from transcript""" - data = request.get_json() + """API to generate meeting minutes from transcript.""" + data = request.get_json(silent=True) or {} - file_id = data.get("id") - llm_client = data.get("llm-client") - llm_model = data.get("llm-model") + file_id = data.get('id') + force_regenerate = bool(data.get('force')) if not file_id: - return jsonify({"error": "Missing 'id' in request body"}), 400 + return jsonify({'error': "Missing 'id' in request body"}), 400 file_record = TranscriptEntry.query.filter_by(id=file_id).first() if not file_record: - return jsonify({"error": f"No record found for id: {file_id}"}), 404 + return jsonify({'error': f'No record found for id: {file_id}'}), 404 + + if file_record.minutes and not force_regenerate: + return jsonify({'message': 'Meeting minutes already exist', 'minutes': file_record.minutes}), 200 - if file_record.minutes: - return jsonify({"message": "Meeting minutes already exist", "minutes": file_record.minutes}), 200 - if not file_record.transcript: - return jsonify({"error": "Transcript not found for this file"}), 400 + return jsonify({'error': 'Transcript not found for this file'}), 400 + + llm_client = (data.get('llm-client') or '').strip() + llm_model = (data.get('llm-model') 
or '').strip() + llm_base_url = data.get('llm-base-url') + llm_api_key = data.get('llm-api-key') if not llm_client or not llm_model: - return jsonify({"error": "Missing 'llm_client' or 'llm_model' in request body"}), 400 - - # Generate the meeting minutes - meeting_minutes_markdown = generate_meeting_minutes_from_transcript( - file_record.transcript, - llm_client=llm_client, - llm_model=llm_model - ) - - # minutes_raw + return jsonify({'error': "Missing 'llm-client' or 'llm-model' in request body"}), 400 + + try: + meeting_minutes_markdown = generate_meeting_minutes_with_provider( + file_record.transcript, + llm_client=llm_client, + llm_model=llm_model, + llm_base_url=llm_base_url, + llm_api_key=llm_api_key, + ) + except Exception as exc: + logger.error(f'Meeting minutes generation failed for {file_id}: {exc}') + return jsonify({'error': str(exc)}), 500 + meeting_minutes_html = render_minutes_with_tailwind(meeting_minutes_markdown) file_record.minutes_raw = meeting_minutes_markdown file_record.minutes = meeting_minutes_html - file_record.status = "completed" + file_record.llm_client = llm_client + file_record.llm_model = llm_model + file_record.status = 'completed' file_record.completion_time = datetime.utcnow() db.session.commit() - return jsonify({"message": "Meeting minutes generated successfully", "minutes": meeting_minutes_html}), 200 + return jsonify({'message': 'Meeting minutes generated successfully', 'minutes': meeting_minutes_html}), 200 diff --git a/src/routes/home_routes.py b/src/routes/home_routes.py index 1286b3c..61773b9 100644 --- a/src/routes/home_routes.py +++ b/src/routes/home_routes.py @@ -1,12 +1,13 @@ # cython: language_level=3 +import os +import json +from pathlib import Path + from flask import render_template from src.models import TranscriptEntry from . 
import audio_bp -import json -from pathlib import Path - ROOT_DIR = Path(__file__).resolve().parent.parent.parent models_path = ROOT_DIR / 'src/models.json' @@ -26,4 +27,5 @@ def index(): return render_template('index.html', files=files, transcription_model_to_client=transcription_model_to_client, llm_model_to_client=llm_model_to_client, transcription_model_list=transcription_model_list, - llm_model_list=llm_model_list) + llm_model_list=llm_model_list, + local_llm_api_url=os.getenv('LOCAL_LLM_API_URL', '')) diff --git a/src/routes/mindmap_route.py b/src/routes/mindmap_route.py index 4a0c0a0..3a52abe 100644 --- a/src/routes/mindmap_route.py +++ b/src/routes/mindmap_route.py @@ -5,7 +5,7 @@ from datetime import datetime from flask import render_template, request from src.models import TranscriptEntry, db -from transmeet import generate_mind_map_from_transcript +from src.llm_clients import generate_mind_map_with_provider from transmeet.utils.general_utils import get_logger logger = get_logger(__name__) @@ -155,7 +155,10 @@ def mindmap(): else: transcript = file_record.transcript - mindmap_data = generate_mind_map_from_transcript( + if not file_record.llm_client or not file_record.llm_model: + return "No LLM settings found for this file. 
Generate the mind map from the home page first.", 400 + + mindmap_data = generate_mind_map_with_provider( transcript, llm_client=file_record.llm_client, llm_model=file_record.llm_model diff --git a/src/routes/upload_routes.py b/src/routes/upload_routes.py index 7eecd39..b2de321 100644 --- a/src/routes/upload_routes.py +++ b/src/routes/upload_routes.py @@ -2,16 +2,20 @@ import os import uuid -from venv import logger from flask import request, jsonify from werkzeug.utils import secure_filename +from src.audio_extraction import extract_audio from src.models import db, TranscriptEntry from src.celery_worker import process_audio_file, process_transcript_file from src.config import app from transmeet.utils.general_utils import get_logger -logger = get_logger(__name__) +logger = get_logger(__name__) + +AUDIO_EXTENSIONS = {'.wav', '.mp3', '.m4a', '.ogg', '.flac', '.aac', '.webm'} +VIDEO_EXTENSIONS = {'.mp4', '.mov', '.mkv', '.avi', '.m4v', '.webm'} +ALLOWED_MEDIA_EXTENSIONS = AUDIO_EXTENSIONS | VIDEO_EXTENSIONS from . 
import audio_bp
@@ -22,6 +26,43 @@ def save_file(file, tracking_id):
     file.save(path)
     return path, filename
+
+
+def prepare_media_file(file, tracking_id):
+    """Save an upload and return ``(path_to_transcribe, original_name)``.
+
+    Video uploads are converted to a WAV file next to the saved upload and the
+    video itself is deleted. Raises ValueError for unsupported extensions;
+    errors raised by ``extract_audio`` propagate after cleanup.
+    """
+    filepath, original_name = save_file(file, tracking_id)
+    ext = os.path.splitext(original_name)[1].lower()
+    mime_type = (file.mimetype or '').lower()
+
+    if ext not in ALLOWED_MEDIA_EXTENSIONS:
+        if os.path.exists(filepath):
+            os.remove(filepath)
+        raise ValueError(f"Unsupported file type: {ext or 'unknown'}")
+
+    is_video_upload = mime_type.startswith('video/') or (ext in VIDEO_EXTENSIONS and ext not in AUDIO_EXTENSIONS)
+    extracted_audio_path = os.path.splitext(filepath)[0] + '.wav'
+
+    # A .wav upload labelled video/* would make ffmpeg read and overwrite the same
+    # file, and the cleanup below would then delete the result; use it as-is.
+    if not is_video_upload or extracted_audio_path == filepath:
+        return filepath, original_name
+
+    try:
+        extract_audio(filepath, extracted_audio_path)
+    except Exception:
+        # Do not leave a partial WAV behind when extraction fails.
+        if os.path.exists(extracted_audio_path):
+            os.remove(extracted_audio_path)
+        raise
+    finally:
+        # The uploaded video is not needed again whether or not extraction succeeded.
+        os.remove(filepath)
+    return extracted_audio_path, original_name
+
 def extract_text_from_file(file, ext):
     if ext == '.txt':
         return file.read().decode('utf-8')
@@ -38,7 +79,7 @@ def extract_text_from_file(file, ext):
     raise ValueError("Unsupported file type")
-def create_audio_file_entry(tracking_id, original_filename, filepath, t_client, t_model):
+def create_audio_file_entry(tracking_id, original_filename, filepath, t_client, t_model, llm_client, llm_model):
     new_file = TranscriptEntry(
         id=tracking_id, #type: ignore
         filename=original_filename, #type: ignore
@@ -46,6 +87,8 @@ def create_audio_file_entry(tracking_id, original_filename, filepath, t_client,
         status="queued", #type: ignore
         transcription_client=t_client, #type: ignore
         transcription_model=t_model, #type: ignore
+        llm_client=llm_client, #type: ignore
+        llm_model=llm_model, #type: ignore
     )
     db.session.add(new_file)
     db.session.commit()
@@ -66,27 +109,59 @@ def create_text_file_entry(tracking_id, original_filename, filepath, llm_client,
 @audio_bp.route('/upload', methods=['POST'])
 def upload():
-    if 'audio' not in request.files:
+    if 'audio' not in request.files and 'media' not in request.files:
         return jsonify({'error': 'No file part'}), 400
-    uploaded_files =
request.files.getlist('audio') + uploaded_files = request.files.getlist('media') or request.files.getlist('audio') t_client = request.form.get('transcription-client') t_model = request.form.get('transcription-model') + llm_client = request.form.get('llm-client') + llm_model = request.form.get('llm-model') if not uploaded_files or uploaded_files[0].filename == '': return jsonify({'error': 'No selected file'}), 400 results = [] + queued_count = 0 for file in uploaded_files: + if not file or not file.filename: + continue + tracking_id = str(uuid.uuid4()) - filepath, original_name = save_file(file, tracking_id) - create_audio_file_entry(tracking_id, original_name, filepath, t_client, t_model) + filename = secure_filename(file.filename) + + try: + filepath, original_name = prepare_media_file(file, tracking_id) + except Exception as exc: + logger.error(f"Error preparing media file {filename}: {exc}") + results.append({ + 'filename': filename, + 'status': 'failed', + 'error': str(exc) + }) + continue + + create_audio_file_entry( + tracking_id, + original_name, + filepath, + t_client, + t_model, + llm_client, + llm_model, + ) + process_audio_file.delay(tracking_id, filepath, t_client, t_model) #type: ignore + results.append({ 'id': tracking_id, 'filename': original_name, 'status': 'queued' }) + queued_count += 1 + + if queued_count == 0: + return jsonify({'error': 'No valid media files processed', 'results': results}), 400 return jsonify(results) diff --git a/src/templates/index.html b/src/templates/index.html index 406fe8a..9ec8b47 100644 --- a/src/templates/index.html +++ b/src/templates/index.html @@ -33,6 +33,9 @@ Speak2Summary - Meeting Minutes Generator + + + + - +
- +
@@ -101,7 +113,7 @@ - Upload Audio Files + Upload Audio/Video Files
@@ -117,7 +129,7 @@ x-bind:class="{'bg-accent-500/10 border-accent-400': dragover}" x-on:click="$refs.fileInput.click()"> - -

Drag & drop audio files here
or click to select

+

Drag & drop audio or video files here
or click to select

@@ -185,7 +197,7 @@

Selected Files ( - +

@@ -283,7 +295,7 @@

Selected Files ( -
+

@@ -339,6 +351,38 @@

+ +
+ Selected LLM Provider: + +
+ +
+

Local Provider Settings

+ +
+ + +

If empty, server uses LOCAL_LLM_API_URL.

+
+ +
+ + +
+ +
+ + +
+
@@ -680,7 +724,7 @@

All Set!, Ready to Start?

-

Upload an audio/text/pdf file to get started with +

Upload an audio/video/text/pdf file to get started with transcription, meeting minutes, and mind map generation.

@@ -718,10 +762,101 @@

All Set!, Ready to St const transcriptionModelToClient = {{ transcription_model_to_client | tojson }}; const llmModelToClient = {{ llm_model_to_client | tojson }}; + const defaultLocalLlmApiUrl = {{ local_llm_api_url | tojson }}; const transcriptionModelList = {{ transcription_model_list | tojson }}; const llmModelList = {{ llm_model_list | tojson }}; + const mediaExtensions = new Set([ + 'wav', 'mp3', 'm4a', 'ogg', 'flac', 'aac', 'webm', 'mp4', 'mov', 'mkv', 'avi', 'm4v' + ]); + + function isLocalProvider(clientName) { + return clientName === 'lmstudio' || clientName === 'ollama'; + } + + function readCurrentSettings() { + const transcriptionModel = document.getElementById('transcription-model')?.value || ''; + const selectedLLMModel = document.getElementById('llm-model')?.value || ''; + const customLLMModel = document.getElementById('custom-llm-model')?.value?.trim() || ''; + const selectedLLMClient = llmModelToClient[selectedLLMModel] || ''; + const useCustomLocalModel = isLocalProvider(selectedLLMClient) && !!customLLMModel; + + return { + transcriptionModel, + transcriptionClient: transcriptionModelToClient[transcriptionModel] || '', + llmModel: useCustomLocalModel ? 
customLLMModel : selectedLLMModel, + llmClient: selectedLLMClient, + llmBaseUrl: document.getElementById('llm-base-url')?.value?.trim() || defaultLocalLlmApiUrl || '', + llmApiKey: document.getElementById('llm-api-key')?.value?.trim() || '', + }; + } + + function transcriptionSettings() { + return { + open: false, + transcriptionModelList, + llmModelList, + transcriptionModel: transcriptionModelList[0] || '', + llmModel: llmModelList[0] || '', + transcriptionClient: '', + llmClient: '', + llmBaseUrl: '', + llmApiKey: '', + customLLMModel: '', + + updateTranscriptionClient() { + this.transcriptionClient = transcriptionModelToClient[this.transcriptionModel] || ''; + }, + + updateLLMClient() { + this.llmClient = llmModelToClient[this.llmModel] || ''; + if (!isLocalProvider(this.llmClient)) { + this.customLLMModel = ''; + } + }, + + persist() { + localStorage.setItem('transcriptionSettings', JSON.stringify({ + transcriptionModel: this.transcriptionModel, + llmModel: this.llmModel, + llmBaseUrl: this.llmBaseUrl, + customLLMModel: this.customLLMModel, + })); + }, + + init() { + const savedRaw = localStorage.getItem('transcriptionSettings'); + if (savedRaw) { + try { + const saved = JSON.parse(savedRaw); + if (saved.transcriptionModel && this.transcriptionModelList.includes(saved.transcriptionModel)) { + this.transcriptionModel = saved.transcriptionModel; + } + if (saved.llmModel && this.llmModelList.includes(saved.llmModel)) { + this.llmModel = saved.llmModel; + } + if (saved.llmBaseUrl) { + this.llmBaseUrl = saved.llmBaseUrl; + } + if (saved.customLLMModel) { + this.customLLMModel = saved.customLLMModel; + } + } catch (error) { + console.error('Unable to parse saved settings', error); + } + } + + if (!this.llmBaseUrl && defaultLocalLlmApiUrl) { + this.llmBaseUrl = defaultLocalLlmApiUrl; + } + + this.updateTranscriptionClient(); + this.updateLLMClient(); + }, + }; + } + function appData() { @@ -729,6 +864,7 @@

All Set!, Ready to St selectedFiles: [], allFiles: [], showTranscript: false, + darkMode: false, autoRefresh: true, refreshInterval: null, dragover: false, @@ -780,10 +916,10 @@

All Set!, Ready to St const files = event.target.files; if (files) { for (let i = 0; i < files.length; i++) { - if (this.isAudioFile(files[i])) { + if (this.isMediaFile(files[i])) { this.selectedFiles.push(files[i]); } else { - this.showToastMessage('Only audio files are allowed', 'error'); + this.showToastMessage('Only audio/video files are allowed', 'error'); } } } @@ -793,17 +929,22 @@

All Set!, Ready to St const files = event.dataTransfer.files; if (files) { for (let i = 0; i < files.length; i++) { - if (this.isAudioFile(files[i])) { + if (this.isMediaFile(files[i])) { this.selectedFiles.push(files[i]); } else { - this.showToastMessage('Only audio files are allowed', 'error'); + this.showToastMessage('Only audio/video files are allowed', 'error'); } } } }, - isAudioFile(file) { - return file.type.startsWith('audio/'); + isMediaFile(file) { + if (file.type.startsWith('audio/') || file.type.startsWith('video/')) { + return true; + } + + const extension = file.name.includes('.') ? file.name.split('.').pop().toLowerCase() : ''; + return mediaExtensions.has(extension); }, removeFile(index) { @@ -813,30 +954,47 @@

All Set!, Ready to St uploadFiles() { if (this.selectedFiles.length === 0) return; + const settings = readCurrentSettings(); + if (!settings.transcriptionModel || !settings.transcriptionClient) { + this.showToastMessage('Please choose a valid transcription model', 'error'); + return; + } + + if (!settings.llmModel || !settings.llmClient) { + this.showToastMessage('Please choose a valid LLM model', 'error'); + return; + } + this.uploading = true; const formData = new FormData(); this.selectedFiles.forEach(file => { - formData.append('audio', file); + formData.append('media', file); }); - // Get selected models from the settings form fields instead of localStorage - const transcriptionModel = document.getElementById('transcription-model')?.value || ''; - const llmModel = document.getElementById('llm-model')?.value || ''; - - const transcriptionClient = transcriptionModelToClient[transcriptionModel] || ''; - const llmClient = llmModelToClient[llmModel] || ''; + formData.append('transcription-model', settings.transcriptionModel); + formData.append('transcription-client', settings.transcriptionClient); + formData.append('llm-model', settings.llmModel); + formData.append('llm-client', settings.llmClient); - formData.append('transcription-model', transcriptionModel); - formData.append('transcription-client', transcriptionClient); - formData.append('llm-model', llmModel); - formData.append('llm-client', llmClient); + if (settings.llmBaseUrl) { + formData.append('llm-base-url', settings.llmBaseUrl); + } + if (settings.llmApiKey) { + formData.append('llm-api-key', settings.llmApiKey); + } fetch('/upload', { method: 'POST', body: formData }) - .then(response => response.json()) + .then(async response => { + const payload = await response.json(); + if (!response.ok) { + throw new Error(payload.error || 'Upload failed'); + } + return payload; + }) .then(data => { this.uploading = false; this.selectedFiles = []; @@ -852,7 +1010,7 @@

All Set!, Ready to St .catch(error => { this.uploading = false; console.error('Error uploading files:', error); - this.showToastMessage('Error uploading files', 'error'); + this.showToastMessage(error.message || 'Error uploading files', 'error'); }); }, refreshFileList() { @@ -945,26 +1103,31 @@

All Set!, Ready to St buttonHref: `/mindmap?id=${file.id}`, async handleClick(event) { + const settings = readCurrentSettings(); + const modelChanged = + (file.llm_model || '') !== settings.llmModel || + (file.llm_client || '') !== settings.llmClient; + // If already has mindmap, allow normal redirect - if (this.hasMindMap || this.generatedNow) return; + if ((this.hasMindMap || this.generatedNow) && !modelChanged) return; // Prevent navigation if generating event.preventDefault(); this.isGenerating = true; - this.buttonText = 'Generating...'; + this.buttonText = modelChanged ? 'Regenerating...' : 'Generating...'; try { - const llmModel = document.getElementById('llm-model')?.value || ''; - const llmClient = llmModelToClient?.[llmModel] || ''; - const response = await fetch('/api/generate_mindmap', { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ id: file.id, - 'llm-client': llmClient, - 'llm-model': llmModel + 'llm-client': settings.llmClient, + 'llm-model': settings.llmModel, + 'llm-base-url': settings.llmBaseUrl, + 'llm-api-key': settings.llmApiKey, + force: modelChanged, }) }); @@ -978,6 +1141,8 @@

All Set!, Ready to St this.hasMindMap = true; this.generatedNow = true; + file.llm_model = settings.llmModel; + file.llm_client = settings.llmClient; this.buttonText = 'View Mindmap'; this.isGenerating = false; @@ -991,40 +1156,6 @@

All Set!, Ready to St }; } - - document.addEventListener('alpine:init', () => { - Alpine.data('transcriptionSettings', () => ({ - transcriptionModel: 'whisper-large-v3-turbo', - transcriptionClient: '', - - llmModel: 'llama-3.3-70b-versatile', - llmClient: '', - - updateTranscriptionClient() { - this.transcriptionClient = this.transcriptionModelToClient[this.transcriptionModel] || ''; - }, - - updateLLMClient() { - this.llmClient = this.llmModelToClient[this.llmModel] || ''; - }, - - persist() { - localStorage.setItem('transcriptionSettings', JSON.stringify({ - transcriptionModel: this.transcriptionModel, - llmModel: this.llmModel - })); - }, - - init() { - const llm_model = document.getElementById('llm-model'); - if (saved) { - this.llmModel = saved.llmModel; - } - this.updateTranscriptionClient(); - this.updateLLMClient(); - } - })); - }); function transcriptUploadForm() { return { transcriptText: '', @@ -1077,12 +1208,15 @@

All Set!, Ready to St return; } - // Get selected model values - const llmModel = document.getElementById('llm-model')?.value || ''; - const llmClient = llmModelToClient[llmModel] || ''; - - formData.append('llm-model', llmModel); - formData.append('llm-client', llmClient); + const settings = readCurrentSettings(); + formData.append('llm-model', settings.llmModel); + formData.append('llm-client', settings.llmClient); + if (settings.llmBaseUrl) { + formData.append('llm-base-url', settings.llmBaseUrl); + } + if (settings.llmApiKey) { + formData.append('llm-api-key', settings.llmApiKey); + } try { const response = await fetch('/upload_transcript', { @@ -1120,23 +1254,32 @@

All Set!, Ready to St buttonText: file.minutes_available ? 'View Minutes' : 'Generate Minutes', async handleClick() { - if (this.hasMinutes) { + const settings = readCurrentSettings(); + const modelChanged = + (file.llm_model || '') !== settings.llmModel || + (file.llm_client || '') !== settings.llmClient; + + if (this.hasMinutes && !modelChanged) { // Now only redirect if already generated window.location.href = `/view/${file.id}`; return; } this.isGenerating = true; - this.buttonText = 'Generating...'; + this.buttonText = this.hasMinutes ? 'Regenerating...' : 'Generating...'; try { - const llmModel = document.getElementById('llm-model')?.value || ''; - const llmClient = llmModelToClient[llmModel] || ''; - const response = await fetch('/api/generate_meeting_minutes', { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ id: file.id, 'llm-client': llmClient, 'llm-model': llmModel }) + body: JSON.stringify({ + id: file.id, + 'llm-client': settings.llmClient, + 'llm-model': settings.llmModel, + 'llm-base-url': settings.llmBaseUrl, + 'llm-api-key': settings.llmApiKey, + force: modelChanged, + }) }); const data = await response.json(); @@ -1151,6 +1294,8 @@

All Set!, Ready to St // Update state to show "View Minutes" instead of redirecting this.hasMinutes = true; file.minutes_available = true; + file.llm_model = settings.llmModel; + file.llm_client = settings.llmClient; this.buttonText = 'View Minutes'; } catch (e) { diff --git a/src/templates/view.html b/src/templates/view.html index 13d35dc..ef076b6 100644 --- a/src/templates/view.html +++ b/src/templates/view.html @@ -32,12 +32,9 @@ - - - - - - + + + @@ -69,16 +66,14 @@ } - +

Theme - - - - @@ -405,7 +384,8 @@

Results (< @@ -421,13 +401,14 @@

No meeting minutes available

-