diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..3702a9c
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,5 @@
+OPENAI_API_KEY=
+LOCAL_LLM_API_URL=
+GROQ_API_KEY=
+LOCAL_LLM_REQUEST_TIMEOUT=
+GUNICORN_TIMEOUT=
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 1f9b105..19dc5a2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -178,4 +178,5 @@ logs/*
 *.wav
 *.mp3
 *.index
-*.pkl
\ No newline at end of file
+*.pkl
+*.videos
\ No newline at end of file
diff --git a/README.md b/README.md
index 54119c2..9538c66 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,9 @@
 ## Features
 
 * 🎙️ **Audio Transcription**: Convert spoken content from meetings into text using advanced speech-to-text capabilities.
+* 🎬 **Video Upload Support**: Upload meeting videos directly and extract audio automatically before transcription.
 * 🧠 **Summarization with LLMs**: Generate concise summaries of transcribed text utilizing powerful language models.
+* 🧩 **Flexible LLM Providers**: Use cloud models (OpenAI, Groq) or connect to local models through LM Studio or Ollama.
 * Mind Map Generation: Create visual representations of meeting summaries to enhance understanding and retention.
 * 🖥️ **User-Friendly Interface**: Interact with a clean and responsive web UI built with Flask.
 * 🐳 **Dockerized Deployment**: Easily deploy the application using Docker and Docker Compose for a consistent environment setup.
@@ -43,11 +45,14 @@
    ```bash
    export GROQ_API_KEY=your_groq_api_key
    export OPENAI_API_KEY=your_openai_api_key
+   export LOCAL_LLM_API_URL=http://your-local-llm-host:1234
+   export LOCAL_LLM_REQUEST_TIMEOUT=900
+   export GUNICORN_TIMEOUT=960
    ```
 
-*Replace `your_groq_api_key` and `your_openai_api_key` with your actual API keys.*
+*Replace the placeholder values with your actual keys and local LLM URL. For Ollama, use port 11434. Increase `LOCAL_LLM_REQUEST_TIMEOUT` for slower local models, and keep `GUNICORN_TIMEOUT` higher than that value.*
 
 4. **Build and Run the Docker Container**
 
diff --git a/audio_extraction.py b/audio_extraction.py
new file mode 100644
index 0000000..477a8cf
--- /dev/null
+++ b/audio_extraction.py
@@ -0,0 +1,27 @@
+import subprocess
+from pathlib import Path
+
+
+def extract_audio(video_file, output_file=None):
+    video_path = Path(video_file)
+
+    if output_file is None:
+        output_file = video_path.with_suffix(".wav")
+
+    command = [
+        "ffmpeg",
+        "-i", str(video_path),
+        "-vn",  # no video
+        "-acodec", "pcm_s16le",
+        "-ar", "16000",  # 16 kHz sample rate
+        "-ac", "1",  # mono audio
+        str(output_file),
+        "-y"
+    ]
+
+    subprocess.run(command, check=True)
+    print(f"Audio saved to: {output_file}")
+
+
+if __name__ == "__main__":
+    extract_audio("videos/sample.mp4")
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index 0d624ec..35ec139 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,21 +1,19 @@
 services:
   flask:
-    container_name: Speak2Summary-flask
+    container_name: s2s-flask
     build:
       context: .
       dockerfile: Dockerfile
     restart: always
-    # ports:
-    #   - "5000:5000"
-    environment:
-      - GROQ_API_KEY=${GROQ_API_KEY}
-      - OPENAI_API_KEY=${OPENAI_API_KEY}
+    ports:
+      - "5000:5000"
+    env_file:
+      - .env
     depends_on:
       - redis
-    command: gunicorn -w 4 -b 0.0.0.0:5000 src.app:app --timeout 120
+    command: gunicorn -w 4 -b 0.0.0.0:5000 src.app:app --timeout ${GUNICORN_TIMEOUT:-960} --graceful-timeout 120
     networks:
       - Speak2Summary-net
-      - homelab
     labels:
       - "tsdproxy.enable=true"
       - "tsdproxy.name=speak2summary"
@@ -33,15 +31,14 @@ services:
       retries: 5
 
   celery:
-    container_name: Speak2Summary-celery
+    container_name: s2s-celery
     build:
       context: .
       dockerfile: Dockerfile
     restart: always
     command: celery -A src.celery_worker.celery worker --loglevel=info
-    environment:
-      - GROQ_API_KEY=${GROQ_API_KEY}
-      - OPENAI_API_KEY=${OPENAI_API_KEY}
+    env_file:
+      - .env
     depends_on:
       - redis
     networks:
@@ -53,7 +50,7 @@ services:
       com.Speak2Summary.service: "celery-worker"
 
   redis:
-    container_name: Speak2Summary-redis
+    container_name: s2s-redis
     image: redis:7-alpine
     restart: always
     ports:
@@ -71,8 +68,7 @@ services:
 networks:
   Speak2Summary-net:
     driver: bridge
-  homelab:
-    external: true
+
 
 # networks:
 #   homelab:
diff --git a/src/audio_extraction.py b/src/audio_extraction.py
new file mode 100644
index 0000000..615b223
--- /dev/null
+++ b/src/audio_extraction.py
@@ -0,0 +1,36 @@
+# cython: language_level=3
+import subprocess
+from pathlib import Path
+from typing import Optional
+
+
+def extract_audio(video_file: str, output_file: Optional[str] = None) -> str:
+    """Extract mono 16kHz WAV audio from a video file using ffmpeg."""
+    video_path = Path(video_file)
+
+    if output_file is None:
+        output_path = video_path.with_suffix(".wav")
+    else:
+        output_path = Path(output_file)
+
+    command = [
+        "ffmpeg",
+        "-i",
+        str(video_path),
+        "-vn",
+        "-acodec",
+        "pcm_s16le",
+        "-ar",
+        "16000",
+        "-ac",
+        "1",
+        str(output_path),
+        "-y",
+    ]
+
+    subprocess.run(command, check=True)
+    return str(output_path)
+
+
+if __name__ == "__main__":
+    extract_audio("videos/sample.mp4")
diff --git a/src/llm_clients.py b/src/llm_clients.py
new file mode 100644
index 0000000..5e4fc16
--- /dev/null
+++ b/src/llm_clients.py
@@ -0,0 +1,254 @@
+# cython: language_level=3
+import json
+import os
+import re
+import socket
+from typing import Any, Dict, Optional
+from urllib import request as urllib_request
+from urllib.error import HTTPError, URLError
+
+from transmeet import generate_meeting_minutes_from_transcript, generate_mind_map_from_transcript
+from transmeet.utils.general_utils import get_logger
+
+logger = get_logger(__name__)
+
+
+def _default_local_llm_url() -> Optional[str]:
+    configured_url = os.getenv("LOCAL_LLM_API_URL", "").strip()
+    return configured_url or None
+
+
+def _local_llm_timeout_seconds() -> float:
+    configured_timeout = os.getenv("LOCAL_LLM_REQUEST_TIMEOUT", "900").strip()
+    try:
+        timeout = float(configured_timeout)
+        if timeout <= 0:
+            raise ValueError
+        return timeout
+    except ValueError:
+        logger.warning(
+            "Invalid LOCAL_LLM_REQUEST_TIMEOUT '%s'; falling back to 900 seconds",
+            configured_timeout,
+        )
+        return 900.0
+
+
+def _normalize_lmstudio_endpoint(base_url: Optional[str]) -> str:
+    base_url = base_url or _default_local_llm_url()
+    if not base_url:
+        return "http://localhost:1234/v1/chat/completions"
+
+    url = base_url.rstrip("/")
+    if url.endswith("/v1/chat/completions"):
+        return url
+    if url.endswith("/v1"):
+        return f"{url}/chat/completions"
+    return f"{url}/v1/chat/completions"
+
+
+def _normalize_ollama_endpoint(base_url: Optional[str]) -> str:
+    base_url = base_url or _default_local_llm_url()
+    if not base_url:
+        return "http://localhost:11434/api/chat"
+
+    url = base_url.rstrip("/")
+    if url.endswith("/api/chat"):
+        return url
+    if url.endswith("/api"):
+        return f"{url}/chat"
+    return f"{url}/api/chat"
+
+
+def _post_json(endpoint: str, payload: Dict[str, Any], api_key: Optional[str] = None) -> Dict[str, Any]:
+    data = json.dumps(payload).encode("utf-8")
+    headers = {"Content-Type": "application/json"}
+    timeout_seconds = _local_llm_timeout_seconds()
+
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
+
+    request = urllib_request.Request(endpoint, data=data, headers=headers, method="POST")
+    try:
+        with urllib_request.urlopen(request, timeout=timeout_seconds) as response:
+            return json.loads(response.read().decode("utf-8"))
+    except HTTPError as exc:
+        body = ""
+        try:
+            body = exc.read().decode("utf-8")
+        except Exception:
+            body = ""
+        raise RuntimeError(f"HTTP {exc.code} calling {endpoint}: {body}") from exc
+    except socket.timeout as exc:
+        raise RuntimeError(
+            f"Timed out after {int(timeout_seconds)}s calling {endpoint}. "
+            "Increase LOCAL_LLM_REQUEST_TIMEOUT or use a faster model."
+        ) from exc
+    except URLError as exc:
+        if isinstance(exc.reason, socket.timeout):
+            raise RuntimeError(
+                f"Timed out after {int(timeout_seconds)}s calling {endpoint}. "
+                "Increase LOCAL_LLM_REQUEST_TIMEOUT or use a faster model."
+            ) from exc
+        raise RuntimeError(f"Unable to reach {endpoint}: {exc.reason}") from exc
+
+
+def _get_lmstudio_content(response: Dict[str, Any]) -> str:
+    choices = response.get("choices") or []
+    if not choices:
+        raise RuntimeError("LM Studio response did not contain choices")
+
+    message = choices[0].get("message") or {}
+    content = message.get("content")
+    if not content:
+        raise RuntimeError("LM Studio response did not contain message content")
+    return content.strip()
+
+
+def _get_ollama_content(response: Dict[str, Any]) -> str:
+    message = response.get("message") or {}
+    content = message.get("content")
+    if not content:
+        raise RuntimeError("Ollama response did not contain message content")
+    return content.strip()
+
+
+def _extract_json_object(raw_text: str) -> Dict[str, Any]:
+    text = raw_text.strip()
+
+    if text.startswith("```"):
+        # Raw strings need a single backslash: \s is the whitespace class, \\s is a literal backslash.
+        text = re.sub(r"^```(?:json)?\s*", "", text)
+        text = re.sub(r"\s*```$", "", text)
+
+    try:
+        parsed = json.loads(text)
+        if isinstance(parsed, dict):
+            return parsed
+    except json.JSONDecodeError:
+        pass
+
+    start = text.find("{")
+    end = text.rfind("}")
+    if start != -1 and end != -1 and end > start:
+        snippet = text[start : end + 1]
+        parsed = json.loads(snippet)
+        if isinstance(parsed, dict):
+            return parsed
+
+    raise RuntimeError("Could not parse JSON object from local model response")
+
+
+def _local_chat_completion(
+    llm_client: str,
+    llm_model: str,
+    system_prompt: str,
+    user_prompt: str,
+    llm_base_url: Optional[str] = None,
+    llm_api_key: Optional[str] = None,
+) -> str:
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt},
+    ]
+
+    if llm_client == "lmstudio":
+        endpoint = _normalize_lmstudio_endpoint(llm_base_url)
+        payload = {
+            "model": llm_model,
+            "messages": messages,
+            "temperature": 0.2,
+            "stream": False,
+        }
+        response = _post_json(endpoint, payload, llm_api_key)
+        return _get_lmstudio_content(response)
+
+    if llm_client == "ollama":
+        endpoint = _normalize_ollama_endpoint(llm_base_url)
+        payload = {
+            "model": llm_model,
+            "messages": messages,
+            "stream": False,
+            "options": {"temperature": 0.2},
+        }
+        response = _post_json(endpoint, payload, llm_api_key)
+        return _get_ollama_content(response)
+
+    raise RuntimeError(f"Unsupported local client '{llm_client}'")
+
+
+def generate_meeting_minutes_with_provider(
+    transcript: str,
+    llm_client: str,
+    llm_model: str,
+    llm_base_url: Optional[str] = None,
+    llm_api_key: Optional[str] = None,
+) -> str:
+    if llm_client in {"lmstudio", "ollama"}:
+        system_prompt = (
+            "You generate clear, concise meeting minutes in markdown. "
+            "Use sections: Executive Summary, Key Discussion Points, Decisions Made, "
+            "Action Items, Risks and Blockers, and Next Steps."
+        )
+        user_prompt = (
+            "Create polished meeting minutes from the transcript below. "
+            "Use bullet points where useful and keep factual accuracy.\n\n"
+            f"Transcript:\n{transcript}"
+        )
+        return _local_chat_completion(
+            llm_client,
+            llm_model,
+            system_prompt,
+            user_prompt,
+            llm_base_url,
+            llm_api_key,
+        )
+
+    return generate_meeting_minutes_from_transcript(
+        transcript,
+        llm_client=llm_client,
+        llm_model=llm_model,
+    )
+
+
+def generate_mind_map_with_provider(
+    transcript: str,
+    llm_client: str,
+    llm_model: str,
+    llm_base_url: Optional[str] = None,
+    llm_api_key: Optional[str] = None,
+) -> Dict[str, Any]:
+    if llm_client in {"lmstudio", "ollama"}:
+        system_prompt = (
+            "You return strict JSON only, no markdown. "
+            "Generate a hierarchical meeting mind map."
+        )
+        user_prompt = (
+            "From the transcript below, return only a JSON object shaped like:\n"
+            "{\n"
+            " \"Root Topic\": \"Meeting Title\",\n"
+            " \"Section\": {\n"
+            " \"Subtopic\": [\"point 1\", \"point 2\"]\n"
+            " }\n"
+            "}\n"
+            "Use concise keys and meaningful grouping.\n\n"
+            f"Transcript:\n{transcript}"
+        )
+
+        raw_response = _local_chat_completion(
+            llm_client,
+            llm_model,
+            system_prompt,
+            user_prompt,
+            llm_base_url,
+            llm_api_key,
+        )
+        parsed_response = _extract_json_object(raw_response)
+        if "Root Topic" not in parsed_response:
+            parsed_response["Root Topic"] = "Meeting Mind Map"
+        return parsed_response
+
+    return generate_mind_map_from_transcript(
+        transcript,
+        llm_client=llm_client,
+        llm_model=llm_model,
+    )
diff --git a/src/models.json b/src/models.json
index a1904a8..cf3ec3a 100644
--- a/src/models.json
+++ b/src/models.json
@@ -1,19 +1,21 @@
 {
     "transcription_model_to_client": {
-        "whisper-large-v3-turbo": "groq",
-        "whisper-large-v3": "groq",
-        "whisper-1": "openai",
         "gpt-4o-transcribe": "openai",
-        "gpt-4o-mini-transcribe": "openai"
+        "gpt-4o-mini-transcribe": "openai",
+        "whisper-1": "openai",
+        "whisper-large-v3-turbo": "groq",
+        "whisper-large-v3": "groq"
     },
     "llm_model_to_client": {
         "llama-3.3-70b-versatile": "groq",
-        "llama3-70b-8192": "groq",
-        "gemma2-9b-it": "groq",
-        "mistral-saba-24b": "groq",
-        "qwen-qwq-32b": "groq",
-        "gpt-3.5-turbo": "openai",
-        "chatgpt-4o-latest": "openai",
-        "gpt-4o-mini": "openai"
+        "llama-3.1-8b-instant": "groq",
+        "qwen/qwen3-32b": "groq",
+        "deepseek-r1-distill-llama-70b": "groq",
+        "gpt-4.1": "openai",
+        "gpt-4.1-mini": "openai",
+        "gpt-4o": "openai",
+        "gpt-4o-mini": "openai",
+        "o4-mini": "openai",
+        "qwen/qwen3-14b": "lmstudio"
     }
 }
\ No newline at end of file
diff --git a/src/routes/api_routes.py b/src/routes/api_routes.py
index a11d37d..601629a 100644
--- a/src/routes/api_routes.py
+++ b/src/routes/api_routes.py
@@ -1,119 +1,144 @@
 # cython: language_level=3
-from flask import jsonify
 from datetime import datetime
-from flask import request, jsonify
+
+from flask import jsonify, request
+from transmeet.utils.general_utils import get_logger
+
+from src.llm_clients import (
+    generate_meeting_minutes_with_provider,
+    generate_mind_map_with_provider,
+)
 from src.models import TranscriptEntry, db
-from transmeet import generate_mind_map_from_transcript, generate_meeting_minutes_from_transcript
 from src.utils import render_minutes_with_tailwind
-from transmeet.utils.general_utils import get_logger
-from src.models import TranscriptEntry
 
 from . import audio_bp
 
 logger = get_logger(__name__)
 
+
 @audio_bp.route('/api/files', methods=['GET'])
 def list_files():
     files = TranscriptEntry.query.order_by(TranscriptEntry.upload_time.desc()).all()
-    return jsonify([{
-        'id': f.id,
-        'filename': f.filename,
-        'status': f.status,
-        'upload_time': f.upload_time.isoformat(),
-        'transcript_available': f.transcript is not None,
-        'minutes_available': f.minutes is not None,
-        'error_message': f.error_message,
-        'transcription_client': f.transcription_client,
-        'transcription_model': f.transcription_model,
-        'llm_client': f.llm_client,
-        'llm_model': f.llm_model,
-        'mind_map': f.mind_map is not None,
-        'topic': f.mind_map.get('Root Topic') if f.mind_map else None,
-
-    } for f in files])
-
-
-@audio_bp.route("/api/generate_mindmap", methods=["POST"])
+    return jsonify([
+        {
+            'id': f.id,
+            'filename': f.filename,
+            'status': f.status,
+            'upload_time': f.upload_time.isoformat(),
+            'transcript_available': f.transcript is not None,
+            'minutes_available': f.minutes is not None,
+            'error_message': f.error_message,
+            'transcription_client': f.transcription_client,
+            'transcription_model': f.transcription_model,
+            'llm_client': f.llm_client,
+            'llm_model': f.llm_model,
+            'mind_map': f.mind_map is not None,
+            'topic': f.mind_map.get('Root Topic') if f.mind_map else None,
+        }
+        for f in files
+    ])
+
+
+@audio_bp.route('/api/generate_mindmap', methods=['POST'])
 def generate_mindmap_api():
-    """API to generate mind map from transcript"""
-    data = request.get_json()
-    file_id = data.get("id")
-
-    llm_client = data.get("llm-client")
-    llm_model = data.get("llm-model")
-
+    """API to generate mind map from transcript."""
+    data = request.get_json(silent=True) or {}
+    file_id = data.get('id')
+    force_regenerate = bool(data.get('force'))
     if not file_id:
-        return jsonify({"error": "Missing 'id' in request body"}), 400
+        return jsonify({'error': "Missing 'id' in request body"}), 400
 
     file_record = TranscriptEntry.query.filter_by(id=file_id).first()
     if not file_record:
-        return jsonify({"error": f"No record found for id: {file_id}"}), 404
+        return jsonify({'error': f'No record found for id: {file_id}'}), 404
 
-    if file_record.mind_map:
-        return jsonify({"message": "Mind map already exists", "mind_map": file_record.mind_map}), 200
+    if file_record.mind_map and not force_regenerate:
+        return jsonify({'message': 'Mind map already exists', 'mind_map': file_record.mind_map}), 200
 
     if not file_record.transcript:
-        return jsonify({"error": "Transcript not found for this file"}), 400
+        return jsonify({'error': 'Transcript not found for this file'}), 400
 
-    if not llm_client or not llm_model:
-        return jsonify({"error": "Missing 'llm_client' or 'llm_model' in request body"}), 400
+    llm_client = (data.get('llm-client') or '').strip()
+    llm_model = (data.get('llm-model') or '').strip()
+    llm_base_url = data.get('llm-base-url')
+    llm_api_key = data.get('llm-api-key')
 
-    # Generate the mind map
-    mindmap_data = generate_mind_map_from_transcript(
-        file_record.transcript,
-        llm_client=llm_client,
-        llm_model=llm_model
-    )
+    if not llm_client or not llm_model:
+        return jsonify({'error': "Missing 'llm-client' or 'llm-model' in request body"}), 400
+
+    try:
+        mindmap_data = generate_mind_map_with_provider(
+            file_record.transcript,
+            llm_client=llm_client,
+            llm_model=llm_model,
+            llm_base_url=llm_base_url,
+            llm_api_key=llm_api_key,
+        )
+    except Exception as exc:
+        logger.error(f'Mind map generation failed for {file_id}: {exc}')
+        return jsonify({'error': str(exc)}), 500
 
     file_record.mind_map = mindmap_data
-    file_record.status = "completed"
+    file_record.llm_client = llm_client
+    file_record.llm_model = llm_model
+    file_record.status = 'completed'
     file_record.completion_time = datetime.utcnow()
     db.session.commit()
 
-    return jsonify({"message": "Mind map generated successfully", "mind_map": mindmap_data}), 200
+    return jsonify({'message': 'Mind map generated successfully', 'mind_map': mindmap_data}), 200
 
 
-@audio_bp.route("/api/generate_meeting_minutes", methods=["POST"])
+@audio_bp.route('/api/generate_meeting_minutes', methods=['POST'])
 def generate_meeting_minutes_api():
-    """API to generate meeting minutes from transcript"""
-    data = request.get_json()
+    """API to generate meeting minutes from transcript."""
+    data = request.get_json(silent=True) or {}
 
-    file_id = data.get("id")
-    llm_client = data.get("llm-client")
-    llm_model = data.get("llm-model")
+    file_id = data.get('id')
+    force_regenerate = bool(data.get('force'))
 
     if not file_id:
-        return jsonify({"error": "Missing 'id' in request body"}), 400
+        return jsonify({'error': "Missing 'id' in request body"}), 400
 
     file_record = TranscriptEntry.query.filter_by(id=file_id).first()
     if not file_record:
-        return jsonify({"error": f"No record found for id: {file_id}"}), 404
+        return jsonify({'error': f'No record found for id: {file_id}'}), 404
+
+    if file_record.minutes and not force_regenerate:
+        return jsonify({'message': 'Meeting minutes already exist', 'minutes': file_record.minutes}), 200
 
-    if file_record.minutes:
-        return jsonify({"message": "Meeting minutes already exist", "minutes": file_record.minutes}), 200
-    
     if not file_record.transcript:
-        return jsonify({"error": "Transcript not found for this file"}), 400
+        return jsonify({'error': 'Transcript not found for this file'}), 400
+
+    llm_client = (data.get('llm-client') or '').strip()
+    llm_model = (data.get('llm-model') or '').strip()
+    llm_base_url = data.get('llm-base-url')
+    llm_api_key = data.get('llm-api-key')
 
     if not llm_client or not llm_model:
-        return jsonify({"error": "Missing 'llm_client' or 'llm_model' in request body"}), 400
-
-    # Generate the meeting minutes
-    meeting_minutes_markdown = generate_meeting_minutes_from_transcript(
-        file_record.transcript,
-        llm_client=llm_client,
-        llm_model=llm_model
-    )
-
-    # minutes_raw
+        return jsonify({'error': "Missing 'llm-client' or 'llm-model' in request body"}), 400
+
+    try:
+        meeting_minutes_markdown = generate_meeting_minutes_with_provider(
+            file_record.transcript,
+            llm_client=llm_client,
+            llm_model=llm_model,
+            llm_base_url=llm_base_url,
+            llm_api_key=llm_api_key,
+        )
+    except Exception as exc:
+        logger.error(f'Meeting minutes generation failed for {file_id}: {exc}')
+        return jsonify({'error': str(exc)}), 500
+
     meeting_minutes_html = render_minutes_with_tailwind(meeting_minutes_markdown)
 
     file_record.minutes_raw = meeting_minutes_markdown
     file_record.minutes = meeting_minutes_html
-    file_record.status = "completed"
+    file_record.llm_client = llm_client
+    file_record.llm_model = llm_model
+    file_record.status = 'completed'
     file_record.completion_time = datetime.utcnow()
     db.session.commit()
 
-    return jsonify({"message": "Meeting minutes generated successfully", "minutes": meeting_minutes_html}), 200
+    return jsonify({'message': 'Meeting minutes generated successfully', 'minutes': meeting_minutes_html}), 200
diff --git a/src/routes/home_routes.py b/src/routes/home_routes.py
index 1286b3c..61773b9 100644
--- a/src/routes/home_routes.py
+++ b/src/routes/home_routes.py
@@ -1,12 +1,13 @@
 # cython: language_level=3
 
+import os
+import json
+from pathlib import Path
+
 from flask import render_template
 from src.models import TranscriptEntry
 from . import audio_bp
 
-import json
-from pathlib import Path
-
 ROOT_DIR = Path(__file__).resolve().parent.parent.parent
 models_path = ROOT_DIR / 'src/models.json'
 
@@ -26,4 +27,5 @@ def index():
     return render_template('index.html', files=files,
                            transcription_model_to_client=transcription_model_to_client, llm_model_to_client=llm_model_to_client,
                            transcription_model_list=transcription_model_list,
-                           llm_model_list=llm_model_list)
+                           llm_model_list=llm_model_list,
+                           local_llm_api_url=os.getenv('LOCAL_LLM_API_URL', ''))
diff --git a/src/routes/mindmap_route.py b/src/routes/mindmap_route.py
index 4a0c0a0..3a52abe 100644
--- a/src/routes/mindmap_route.py
+++ b/src/routes/mindmap_route.py
@@ -5,7 +5,7 @@ from datetime import datetime
 from flask import render_template, request
 
 from src.models import TranscriptEntry, db
-from transmeet import generate_mind_map_from_transcript
+from src.llm_clients import generate_mind_map_with_provider
 from transmeet.utils.general_utils import get_logger
 
 logger = get_logger(__name__)
@@ -155,7 +155,10 @@ def mindmap():
     else:
         transcript = file_record.transcript
 
-    mindmap_data = generate_mind_map_from_transcript(
+    if not file_record.llm_client or not file_record.llm_model:
+        return "No LLM settings found for this file. Generate the mind map from the home page first.", 400
+
+    mindmap_data = generate_mind_map_with_provider(
         transcript,
         llm_client=file_record.llm_client,
         llm_model=file_record.llm_model
diff --git a/src/routes/upload_routes.py b/src/routes/upload_routes.py
index 7eecd39..b2de321 100644
--- a/src/routes/upload_routes.py
+++ b/src/routes/upload_routes.py
@@ -2,16 +2,20 @@
 
 import os
 import uuid
-from venv import logger
 
 from flask import request, jsonify
 from werkzeug.utils import secure_filename
+from src.audio_extraction import extract_audio
 from src.models import db, TranscriptEntry
 from src.celery_worker import process_audio_file, process_transcript_file
 from src.config import app
 from transmeet.utils.general_utils import get_logger
 
-logger = get_logger(__name__) 
+logger = get_logger(__name__)
+
+AUDIO_EXTENSIONS = {'.wav', '.mp3', '.m4a', '.ogg', '.flac', '.aac', '.webm'}
+VIDEO_EXTENSIONS = {'.mp4', '.mov', '.mkv', '.avi', '.m4v', '.webm'}
+ALLOWED_MEDIA_EXTENSIONS = AUDIO_EXTENSIONS | VIDEO_EXTENSIONS
 
 from . import audio_bp
 
@@ -22,6 +26,36 @@ def save_file(file, tracking_id):
     file.save(path)
     return path, filename
 
+
+def prepare_media_file(file, tracking_id):
+    filepath, original_name = save_file(file, tracking_id)
+    ext = os.path.splitext(original_name)[1].lower()
+    mime_type = (file.mimetype or '').lower()
+
+    if ext not in ALLOWED_MEDIA_EXTENSIONS:
+        if os.path.exists(filepath):
+            os.remove(filepath)
+        raise ValueError(f"Unsupported file type: {ext or 'unknown'}")
+
+    is_video_upload = mime_type.startswith('video/') or (ext in VIDEO_EXTENSIONS and ext not in AUDIO_EXTENSIONS)
+
+    if is_video_upload:
+        extracted_audio_path = os.path.splitext(filepath)[0] + '.wav'
+        try:
+            extract_audio(filepath, extracted_audio_path)
+        except Exception:
+            # Extraction failed: drop any partial WAV so nothing is orphaned on disk.
+            if os.path.exists(extracted_audio_path):
+                os.remove(extracted_audio_path)
+            raise
+        finally:
+            # The uploaded video is only an intermediate; remove it on success and on failure.
+            if os.path.exists(filepath):
+                os.remove(filepath)
+        return extracted_audio_path, original_name
+
+    return filepath, original_name
+
 def extract_text_from_file(file, ext):
     if ext == '.txt':
         return file.read().decode('utf-8')
@@ -38,7 +72,7 @@ def extract_text_from_file(file, ext):
     raise ValueError("Unsupported file type")
 
 
-def create_audio_file_entry(tracking_id, original_filename, filepath, t_client, t_model):
+def create_audio_file_entry(tracking_id, original_filename, filepath, t_client, t_model, llm_client, llm_model):
     new_file = TranscriptEntry(
         id=tracking_id, #type: ignore
         filename=original_filename, #type: ignore
@@ -46,6 +80,8 @@ def create_audio_file_entry(tracking_id, original_filename, filepath, t_client,
         status="queued", #type: ignore
         transcription_client=t_client, #type: ignore
         transcription_model=t_model, #type: ignore
+        llm_client=llm_client, #type: ignore
+        llm_model=llm_model, #type: ignore
     )
     db.session.add(new_file)
     db.session.commit()
@@ -66,27 +102,59 @@ def create_text_file_entry(tracking_id, original_filename, filepath, llm_client,
 
 @audio_bp.route('/upload', methods=['POST'])
 def upload():
-    if 'audio' not in request.files:
+    if 'audio' not in request.files and 'media' not in request.files:
         return jsonify({'error': 'No file part'}), 400
 
-    uploaded_files = request.files.getlist('audio')
+    uploaded_files = request.files.getlist('media') or request.files.getlist('audio')
     t_client = request.form.get('transcription-client')
     t_model = request.form.get('transcription-model')
+    llm_client = request.form.get('llm-client')
+    llm_model = request.form.get('llm-model')
 
     if not uploaded_files or uploaded_files[0].filename == '':
         return jsonify({'error': 'No selected file'}), 400
 
     results = []
+    queued_count = 0
 
     for file in uploaded_files:
+        if not file or not file.filename:
+            continue
+
         tracking_id = str(uuid.uuid4())
-        filepath, original_name = save_file(file, tracking_id)
-        create_audio_file_entry(tracking_id, original_name, filepath, t_client, t_model)
+        filename = secure_filename(file.filename)
+
+        try:
+            filepath, original_name = prepare_media_file(file, tracking_id)
+        except Exception as exc:
+            logger.error(f"Error preparing media file {filename}: {exc}")
+            results.append({
+                'filename': filename,
+                'status': 'failed',
+                'error': str(exc)
+            })
+            continue
+
+        create_audio_file_entry(
+            tracking_id,
+            original_name,
+            filepath,
+            t_client,
+            t_model,
+            llm_client,
+            llm_model,
+        )
+
         process_audio_file.delay(tracking_id, filepath, t_client, t_model) #type: ignore
+
         results.append({
             'id': tracking_id,
             'filename': original_name,
             'status': 'queued'
         })
+        queued_count += 1
+
+    if queued_count == 0:
+        return jsonify({'error': 'No valid media files processed', 'results': results}), 400
 
     return jsonify(results)
diff --git a/src/templates/index.html b/src/templates/index.html
index 406fe8a..9ec8b47 100644
--- a/src/templates/index.html
+++ b/src/templates/index.html
@@ -33,6 +33,9 @@
Drag & drop audio files here
or click to select
Drag & drop audio or video files here
or click to select
Local Provider Settings
+ +If empty, server uses LOCAL_LLM_API_URL.
+Upload an audio/text/pdf file to get started with +
Upload an audio/video/text/pdf file to get started with transcription, meeting minutes, and mind map generation.
@@ -718,10 +762,101 @@