From 0e920943fbff7e7a77d0eca2ec3e053c64d8d5b2 Mon Sep 17 00:00:00 2001 From: Atsushi Morimoto <74th.tech@gmail.com> Date: Mon, 9 Mar 2026 23:23:45 +0900 Subject: [PATCH] feat: integrate WhisperCppSpeechToText and VoiceVoxSpeechSynthesizer for enhanced speech recognition and synthesis --- example_apps/echo.py | 15 +- .../speech_recognition/__init__.py | 3 +- .../speech_recognition/whisper_cpp.py | 234 ++++++++++++++++++ 3 files changed, 250 insertions(+), 2 deletions(-) create mode 100644 stackchan_server/speech_recognition/whisper_cpp.py diff --git a/example_apps/echo.py b/example_apps/echo.py index 8fb47f3..2a60bfa 100644 --- a/example_apps/echo.py +++ b/example_apps/echo.py @@ -5,6 +5,8 @@ from logging import getLogger from stackchan_server.app import StackChanApp +from stackchan_server.speech_recognition import WhisperCppSpeechToText +from stackchan_server.speech_synthesis import VoiceVoxSpeechSynthesizer from stackchan_server.ws_proxy import EmptyTranscriptError, WsProxy logger = getLogger(__name__) @@ -14,8 +16,19 @@ datefmt="%H:%M:%S", ) +def _create_app() -> StackChanApp: + whisper_model = os.getenv("STACKCHAN_WHISPER_MODEL") + if whisper_model: + return StackChanApp( + speech_recognizer=WhisperCppSpeechToText( + model_path=whisper_model, + ), + speech_synthesizer=VoiceVoxSpeechSynthesizer(), + ) + return StackChanApp() -app = StackChanApp() + +app = _create_app() @app.setup diff --git a/stackchan_server/speech_recognition/__init__.py b/stackchan_server/speech_recognition/__init__.py index 660cf6d..f9a066a 100644 --- a/stackchan_server/speech_recognition/__init__.py +++ b/stackchan_server/speech_recognition/__init__.py @@ -2,10 +2,11 @@ from ..types import SpeechRecognizer from .google_cloud import GoogleCloudSpeechToText +from .whisper_cpp import WhisperCppSpeechToText def create_speech_recognizer() -> SpeechRecognizer: return GoogleCloudSpeechToText() -__all__ = ["GoogleCloudSpeechToText", "create_speech_recognizer"] +__all__ = ["GoogleCloudSpeechToText", "WhisperCppSpeechToText", "create_speech_recognizer"] diff --git a/stackchan_server/speech_recognition/whisper_cpp.py b/stackchan_server/speech_recognition/whisper_cpp.py new file mode 100644 index 0000000..97e10da --- /dev/null +++ b/stackchan_server/speech_recognition/whisper_cpp.py @@ -0,0 +1,234 @@ +from __future__ import annotations + +import asyncio +import io +import json +import math +import os +import shlex +import shutil +import tempfile +import wave +from logging import getLogger +from pathlib import Path + +from ..types import SpeechRecognizer + +logger = getLogger(__name__) +_DEFAULT_SILENCE_RMS_THRESHOLD = 75.0 +_DEFAULT_VAD_THRESHOLD = 0.6 +_DEFAULT_VAD_MIN_SPEECH_DURATION_MS = 250 +_DEFAULT_VAD_MIN_SILENCE_DURATION_MS = 400 +_DEFAULT_VAD_SPEECH_PAD_MS = 30 + + +class WhisperCppSpeechToText(SpeechRecognizer): + def __init__( + self, + *, + model_path: str | Path, + cli_path: str = "whisper-cli", + threads: int | None = None, + translate: bool = False, + no_speech_threshold: float = 0.8, + suppress_non_speech_tokens: bool = True, + vad_model_path: str | Path | None = None, + use_vad: bool = True, + vad_threshold: float = _DEFAULT_VAD_THRESHOLD, + vad_min_speech_duration_ms: int = _DEFAULT_VAD_MIN_SPEECH_DURATION_MS, + vad_min_silence_duration_ms: int = _DEFAULT_VAD_MIN_SILENCE_DURATION_MS, + vad_speech_pad_ms: int = _DEFAULT_VAD_SPEECH_PAD_MS, + silence_rms_threshold: float = _DEFAULT_SILENCE_RMS_THRESHOLD, + ) -> None: + self._model_path = Path(model_path) + self._cli_path = cli_path + self._threads = threads + self._translate = translate + self._no_speech_threshold = no_speech_threshold + self._suppress_non_speech_tokens = suppress_non_speech_tokens + self._vad_model_path = _resolve_vad_model_path(vad_model_path) + self._use_vad = use_vad + self._vad_threshold = vad_threshold + self._vad_min_speech_duration_ms = vad_min_speech_duration_ms + self._vad_min_silence_duration_ms = vad_min_silence_duration_ms + self._vad_speech_pad_ms = vad_speech_pad_ms + self._silence_rms_threshold = silence_rms_threshold + + async def transcribe( + self, + pcm_bytes: bytes, + *, + sample_rate_hz: int, + channels: int, + sample_width: int, + language_code: str = "ja-JP", + ) -> str: + if channels != 1: + raise ValueError(f"whisper.cpp only supports mono input here: channels={channels}") + if sample_width != 2: + raise ValueError(f"whisper.cpp expects 16-bit PCM here: sample_width={sample_width}") + if not self._model_path.is_file(): + raise FileNotFoundError(f"whisper.cpp model not found: {self._model_path}") + if _pcm_rms_level(pcm_bytes) < self._silence_rms_threshold: + logger.info( + "Skipping whisper.cpp transcription because pcm rms %.2f is below silence threshold %.2f", + _pcm_rms_level(pcm_bytes), + self._silence_rms_threshold, + ) + return "" + + cli_path = shutil.which(self._cli_path) + if cli_path is None: + raise FileNotFoundError(f"whisper.cpp CLI not found in PATH: {self._cli_path}") + + language = _normalize_language(language_code) + with tempfile.TemporaryDirectory(prefix="stackchan_whisper_") as temp_dir_name: + temp_dir = Path(temp_dir_name) + wav_path = temp_dir / "input.wav" + out_base = temp_dir / "result" + json_path = out_base.with_suffix(".json") + _write_wav( + wav_path, + pcm_bytes, + sample_rate_hz=sample_rate_hz, + channels=channels, + sample_width=sample_width, + ) + + command = [ + cli_path, + "-m", + str(self._model_path), + "-f", + str(wav_path), + "-l", + language, + "-nth", + str(self._no_speech_threshold), + "-nt", + "-ojf", + "-of", + str(out_base), + ] + if self._threads is not None: + command.extend(["-t", str(self._threads)]) + if self._translate: + command.append("-tr") + if self._suppress_non_speech_tokens: + command.append("-sns") + if self._use_vad and self._vad_model_path is not None: + command.extend( + [ + "--vad", + "-vm", + str(self._vad_model_path), + "-vt", + str(self._vad_threshold), + "-vspd", + str(self._vad_min_speech_duration_ms), + "-vsd", + str(self._vad_min_silence_duration_ms), + "-vp", + str(self._vad_speech_pad_ms), + ] + ) + command.append("-np") + + logger.info( + "Running whisper.cpp command: %s", + shlex.join(command), + ) + process = await asyncio.create_subprocess_exec( + *command, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await process.communicate() + if process.returncode != 0: + stderr_text = stderr.decode("utf-8", errors="replace").strip() + stdout_text = stdout.decode("utf-8", errors="replace").strip() + raise RuntimeError( + "whisper.cpp failed: " + f"exit_code={process.returncode} stderr={stderr_text or ''} " + f"stdout={stdout_text or ''}" + ) + + if not json_path.is_file(): + raise RuntimeError("whisper.cpp did not produce a JSON transcript file") + + transcript = _load_transcript_from_json(json_path) + if transcript: + logger.info("whisper.cpp transcript: %s", transcript) + return transcript + + +def _normalize_language(language_code: str) -> str: + if not language_code: + return "auto" + return language_code.split("-", 1)[0].lower() + + +def _normalize_transcript(text: str) -> str: + return text.strip() + + +def _load_transcript_from_json(path: Path) -> str: + data = json.loads(path.read_text(encoding="utf-8")) + transcription = data.get("transcription") + if not isinstance(transcription, list): + return "" + parts: list[str] = [] + for item in transcription: + if not isinstance(item, dict): + continue + text = item.get("text") + if isinstance(text, str): + normalized = _normalize_transcript(text) + if normalized: + parts.append(normalized) + return " ".join(parts).strip() + + +def _pcm_rms_level(pcm_bytes: bytes) -> float: + if len(pcm_bytes) < 2: + return 0.0 + sample_count = len(pcm_bytes) // 2 + total = 0.0 + for index in range(0, sample_count * 2, 2): + sample = int.from_bytes(pcm_bytes[index : index + 2], byteorder="little", signed=True) + total += float(sample * sample) + return math.sqrt(total / sample_count) + + +def _resolve_vad_model_path(vad_model_path: str | Path | None) -> Path | None: + if vad_model_path is not None: + path = Path(vad_model_path) + return path if path.is_file() else None + + env_path = os.getenv("STACKCHAN_WHISPER_VAD_MODEL") + if env_path: + path = Path(env_path) + if path.is_file(): + return path + + return None + + +def _write_wav( + path: Path, + pcm_bytes: bytes, + *, + sample_rate_hz: int, + channels: int, + sample_width: int, +) -> None: + with io.BytesIO() as buffer: + with wave.open(buffer, "wb") as wav_fp: + wav_fp.setnchannels(channels) + wav_fp.setsampwidth(sample_width) + wav_fp.setframerate(sample_rate_hz) + wav_fp.writeframes(pcm_bytes) + path.write_bytes(buffer.getvalue()) + + +__all__ = ["WhisperCppSpeechToText"]