From ac83a0a164ecdfe633cfa74a6ccd905a362d9443 Mon Sep 17 00:00:00 2001 From: Atsushi Morimoto <74th.tech@gmail.com> Date: Sat, 9 May 2026 14:27:18 +0900 Subject: [PATCH 01/15] feat: ServerSideWWD --- docs/rest_api_ja.md | 58 ++++ docs/websocket_protocols_ja.md | 40 ++- firmware/include/listening.hpp | 16 + firmware/include/metadata.hpp | 1 + .../generated_protobuf/websocket-message.pb.c | 2 + .../generated_protobuf/websocket-message.pb.h | 22 +- firmware/src/display.cpp | 6 + firmware/src/listening.cpp | 48 ++- firmware/src/main.cpp | 34 +- firmware/src/metadata.cpp | 5 + protobuf/websocket-message.proto | 7 + stackchan_server/app.py | 26 ++ .../websocket_message_pb2.py | 64 ++-- stackchan_server/protobuf_ws.py | 17 +- .../wakeup_word_detection/__init__.py | 13 + .../wakeup_word_detection/create.py | 28 ++ .../wakeup_word_detection/server_side.py | 205 ++++++++++++ stackchan_server/ws_proxy.py | 312 ++++++++++++++++-- 18 files changed, 837 insertions(+), 67 deletions(-) create mode 100644 stackchan_server/wakeup_word_detection/__init__.py create mode 100644 stackchan_server/wakeup_word_detection/create.py create mode 100644 stackchan_server/wakeup_word_detection/server_side.py diff --git a/docs/rest_api_ja.md b/docs/rest_api_ja.md index 86c2d2f..a3b41e1 100644 --- a/docs/rest_api_ja.md +++ b/docs/rest_api_ja.md @@ -17,6 +17,7 @@ | `GET` | `/v1/stackchan` | 接続中 StackChan 一覧 | | `GET` | `/v1/stackchan/{stackchan_ip}` | 指定 StackChan の状態取得 | | `POST` | `/v1/stackchan/{stackchan_ip}/wakeword` | 擬似 wakeword 発火 | +| `POST` | `/v1/stackchan/{stackchan_ip}/wakeword/server-detect` | サーバーサイド wakeword 検出を要求 | | `POST` | `/v1/stackchan/{stackchan_ip}/speak` | 指定 StackChan に発話させる | ## `GET /health` @@ -134,6 +135,63 @@ - 実機側のウェイクワード検出 (`WakeWordEvt`) と同じように扱われます。 - すでに `talk_session` 実行中でも、イベント自体は内部フラグとして立ちます。 +## `POST /v1/stackchan/{stackchan_ip}/wakeword/server-detect` + +サーバーサイドの wakeword 検出を開始します。 + +> [!NOTE] +> 環境変数 `USE_SERVER_SIDE_WWD_WHISPER_SERVER=1` の場合、サーバーは 
`@app.setup()` 完了後と `Idle` 復帰後に自動でサーバーサイド wakeword 検出を開始します。この API は明示的に現在の検出サイクルを待ちたい場合に利用できます。 + +- サーバーは対象 StackChan に `StateCmd(Listening, WAKE_WORD)` を送ってマイク音声を受信し、 + 直近 3 秒窓を 0.5 秒ごとに認識します。 +- 認識結果のテキストはすべてログ出力されます。 +- キーワードが検出されたら内部 wakeword イベントを発火し、`talk_session` 開始待ちを解除します。 +- 検出の終了時には `StateCmd(Idle)` を送って待機に戻します。 +- 実機の表示状態は `Listening` へは変わらず、`Idle(Server-WWD)` のまま待ち受けます。 +- このモードでは無音 3 秒によるクライアント側自動終了は行いません。 + +### パスパラメータ + +| 名前 | 型 | 説明 | +| --- | --- | --- | +| `stackchan_ip` | `string` | 対象 StackChan の接続元 IP | + +### クエリパラメータ + +| 名前 | 型 | 必須 | 説明 | +| --- | --- | --- | --- | +| `timeout_seconds` | `number` | 任意 | 検出待ちタイムアウト秒。未指定時はサーバー設定値 | + +### 成功レスポンス + +- Status: `200 OK` + +```json +{ + "detected": true +} +``` + +`detected` が `false` の場合は、検出セッションは終了したがキーワード未検出です。 + +### エラーレスポンス + +- Status: `404 Not Found` + +```json +{ + "detail": "stackchan not connected" +} +``` + +- Status: `409 Conflict` + +```json +{ + "detail": "Server-side wake-word detection is not available for this connection" +} +``` + ## `POST /v1/stackchan/{stackchan_ip}/speak` 指定した StackChan にテキストを発話させます。 diff --git a/docs/websocket_protocols_ja.md b/docs/websocket_protocols_ja.md index b816c13..d5a21aa 100644 --- a/docs/websocket_protocols_ja.md +++ b/docs/websocket_protocols_ja.md @@ -35,6 +35,8 @@ | `SpeakDoneEvt` | CoreS3 → Server | 音声再生完了通知 | | `ServoCmd` | Server → CoreS3 | サーボ動作シーケンス指示 | | `ServoDoneEvt` | CoreS3 → Server | サーボ動作完了通知 | +| `FirmwareMetadata` | CoreS3 → Server | クライアント能力通知 | +| `ServerMetadata` | Server → CoreS3 | サーバー能力通知 | ### `MessageType` 一覧 @@ -89,7 +91,7 @@ - 方向: Server → CoreS3 - `messageType`: `DATA` のみ -- body: `StateCommand { state }` +- body: `StateCommand { state, listening_purpose }` 利用する状態名: @@ -98,12 +100,21 @@ - `Thinking` - `Speaking` +`listening_purpose` の値: + +- `SPEECH`: 通常の会話入力 +- `WAKE_WORD`: サーバーサイド wakeword 検出用の uplink + ### 現行実装メモ -- `proxy.listen()` 開始時に Server が 
`StateCmd(Listening, SPEECH)` を指示します。 +- サーバーサイド wakeword 検出開始時は `StateCmd(Listening, WAKE_WORD)` を指示します。 - 音声 uplink の `END` を受けると、Server は `Thinking` を指示します。 - `proxy.speak()` 完了後、Server は `Idle` を指示します。 +> [!NOTE] +> `WAKE_WORD` の場合、CoreS3 は内部的にマイク uplink を開始しますが、状態表示は `Listening` に遷移せず `Idle(Server-WWD)` のままです。また無音 3 秒による自動終了も行いません。 + ## ウェイクワード検出 `WakeWordEvt` - 方向: CoreS3 → Server @@ -112,6 +123,31 @@ - `Idle` 中のウェイクワード検出をサーバー側に通知します。 - REST API の `POST /v1/stackchan/{ip}/wakeword` は、このイベントをサーバー内部で擬似発火させます。 +## メタデータ交換 `FirmwareMetadata` / `ServerMetadata` + +WebSocket 接続後、能力情報を相互交換します。 + +- CoreS3 → Server: `FirmwareMetadata` + - `has_device_wake_word`: クライアント側 wakeword 対応有無 + - そのほか `device_type`, `display_width`, `display_height`, `has_led`, `servo_type`, `supports_audio_duplex`, `firmware_version` +- Server → CoreS3: `ServerMetadata` + - `has_server_wake_word`: サーバー側 wakeword 対応有無 + - `server_version` + +CoreS3 側は `has_server_wake_word=true` を受けると、デバイス側 wakeword を使わずにサーバー側検出モードで待機します(表示は `Idle(Server-WWD)`)。 + +## サーバーサイド wakeword 検出フロー + +- 環境変数 `USE_SERVER_SIDE_WWD_WHISPER_SERVER=1` の場合、サーバーは `@app.setup()` 完了後と `Idle` 復帰後に自動でサーバーサイド wakeword 検出を開始します。 +- REST API `POST /v1/stackchan/{ip}/wakeword/server-detect` を呼ぶと、 + サーバーは `StateCmd(Listening, WAKE_WORD)` を送信してマイク uplink を受信します。 +- 受信した音声の直近 3 秒窓を 0.5 秒ごとに音声認識へ渡し、 + 定義キーワード(例: `スタクチャン`)を含むか判定します。 +- 各判定タイミングの認識結果はすべてログ出力されます。 +- キーワード検出時は内部 wakeword イベントを発火し、通常の `talk_session` フローに進みます。 +- 検出完了時(検出/未検出を問わず)は `StateCmd(Idle)` で待機状態に戻します。 +- この間、CoreS3 の画面表示は `Listening` ではなく `Idle(Server-WWD)` を維持します。 + ## 状態通知 `StateEvt` - 方向: CoreS3 → Server diff --git a/firmware/include/listening.hpp b/firmware/include/listening.hpp index 0e18ba8..0a89b25 100644 --- a/firmware/include/listening.hpp +++ b/firmware/include/listening.hpp @@ -10,6 +10,12 @@ class Listening { public: + enum class SessionMode + { + Speech, + WakeWord, + }; + Listening(WebSocketsClient &ws, StateMachine &sm, int sampleRate); // allocate 
buffers / reset counters; call once from setup @@ -19,6 +25,10 @@ class Listening void begin(); void end(); + // Idle(Server-WWD) のままマイク uplink を開始/終了する + bool beginWakeWordStreaming(); + void endWakeWordStreaming(); + // begin a new streaming session (sends START); returns false if WS not connected bool startStreaming(); @@ -34,7 +44,11 @@ class Listening // 無音が所定時間続いているか判定 bool shouldStopForSilence() const; + bool isWakeWordStreaming() const { return streaming_ && session_mode_ == SessionMode::WakeWord; } + private: + bool beginStreamingSession(SessionMode mode, bool auto_stop_for_silence); + void stopMicrophoneOnly(); void updateLevelStats(const int16_t *samples, size_t sampleCount); bool sendPacket(stackchan_websocket_v1_MessageType type, const int16_t *samples, size_t sampleCount); void ringPush(const int16_t *src, size_t samples); @@ -56,6 +70,8 @@ class Listening uint32_t seq_counter_ = 0; bool streaming_ = false; bool events_registered_ = false; + SessionMode session_mode_ = SessionMode::Speech; + bool auto_stop_for_silence_ = true; // 無音判定関連 int32_t last_level_ = 0; diff --git a/firmware/include/metadata.hpp b/firmware/include/metadata.hpp index f490abd..c97b4eb 100644 --- a/firmware/include/metadata.hpp +++ b/firmware/include/metadata.hpp @@ -29,6 +29,7 @@ extern ServerMetadataState g_server_metadata; void initializeFirmwareMetadata(); void resetServerMetadata(); bool shouldUseDeviceWakeWord(); +bool shouldUseServerWakeWord(); void setFirmwareMetadataMessage( stackchan_websocket_v1_WebSocketMessage &message, uint32_t seq); diff --git a/firmware/lib/generated_protobuf/websocket-message.pb.c b/firmware/lib/generated_protobuf/websocket-message.pb.c index f70a79e..7620ee8 100644 --- a/firmware/lib/generated_protobuf/websocket-message.pb.c +++ b/firmware/lib/generated_protobuf/websocket-message.pb.c @@ -64,3 +64,5 @@ PB_BIND(stackchan_websocket_v1_ServerMetadata, stackchan_websocket_v1_ServerMeta + + diff --git 
a/firmware/lib/generated_protobuf/websocket-message.pb.h b/firmware/lib/generated_protobuf/websocket-message.pb.h index 8e0c222..ffd1d31 100644 --- a/firmware/lib/generated_protobuf/websocket-message.pb.h +++ b/firmware/lib/generated_protobuf/websocket-message.pb.h @@ -38,6 +38,12 @@ typedef enum _stackchan_websocket_v1_StackchanState { stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING = 3 } stackchan_websocket_v1_StackchanState; +typedef enum _stackchan_websocket_v1_ListeningPurpose { + stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_UNSPECIFIED = 0, + stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_SPEECH = 1, + stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_WAKE_WORD = 2 +} stackchan_websocket_v1_ListeningPurpose; + typedef enum _stackchan_websocket_v1_ServoOperation { stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_SLEEP = 0, stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_X = 1, @@ -83,6 +89,7 @@ typedef struct _stackchan_websocket_v1_AudioChunk { typedef struct _stackchan_websocket_v1_StateCommand { stackchan_websocket_v1_StackchanState state; + stackchan_websocket_v1_ListeningPurpose listening_purpose; } stackchan_websocket_v1_StateCommand; typedef struct _stackchan_websocket_v1_WakeWordEvent { @@ -176,6 +183,10 @@ extern "C" { #define _stackchan_websocket_v1_StackchanState_MAX stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING #define _stackchan_websocket_v1_StackchanState_ARRAYSIZE ((stackchan_websocket_v1_StackchanState)(stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING+1)) +#define _stackchan_websocket_v1_ListeningPurpose_MIN stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_UNSPECIFIED +#define _stackchan_websocket_v1_ListeningPurpose_MAX stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_WAKE_WORD +#define _stackchan_websocket_v1_ListeningPurpose_ARRAYSIZE 
((stackchan_websocket_v1_ListeningPurpose)(stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_WAKE_WORD+1)) + #define _stackchan_websocket_v1_ServoOperation_MIN stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_SLEEP #define _stackchan_websocket_v1_ServoOperation_MAX stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_Y #define _stackchan_websocket_v1_ServoOperation_ARRAYSIZE ((stackchan_websocket_v1_ServoOperation)(stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_Y+1)) @@ -197,6 +208,7 @@ extern "C" { #define stackchan_websocket_v1_StateCommand_state_ENUMTYPE stackchan_websocket_v1_StackchanState +#define stackchan_websocket_v1_StateCommand_listening_purpose_ENUMTYPE stackchan_websocket_v1_ListeningPurpose #define stackchan_websocket_v1_StateEvent_state_ENUMTYPE stackchan_websocket_v1_StackchanState @@ -218,7 +230,7 @@ extern "C" { #define stackchan_websocket_v1_AudioWavStart_init_default {0, 0} #define stackchan_websocket_v1_AudioWavEnd_init_default {0} #define stackchan_websocket_v1_AudioChunk_init_default {{0, {0}}} -#define stackchan_websocket_v1_StateCommand_init_default {_stackchan_websocket_v1_StackchanState_MIN} +#define stackchan_websocket_v1_StateCommand_init_default {_stackchan_websocket_v1_StackchanState_MIN, _stackchan_websocket_v1_ListeningPurpose_MIN} #define stackchan_websocket_v1_WakeWordEvent_init_default {0} #define stackchan_websocket_v1_StateEvent_init_default {_stackchan_websocket_v1_StackchanState_MIN} #define stackchan_websocket_v1_SpeakDoneEvent_init_default {0} @@ -233,7 +245,7 @@ extern "C" { #define stackchan_websocket_v1_AudioWavStart_init_zero {0, 0} #define stackchan_websocket_v1_AudioWavEnd_init_zero {0} #define stackchan_websocket_v1_AudioChunk_init_zero {{0, {0}}} -#define stackchan_websocket_v1_StateCommand_init_zero {_stackchan_websocket_v1_StackchanState_MIN} +#define stackchan_websocket_v1_StateCommand_init_zero {_stackchan_websocket_v1_StackchanState_MIN, 
_stackchan_websocket_v1_ListeningPurpose_MIN} #define stackchan_websocket_v1_WakeWordEvent_init_zero {0} #define stackchan_websocket_v1_StateEvent_init_zero {_stackchan_websocket_v1_StackchanState_MIN} #define stackchan_websocket_v1_SpeakDoneEvent_init_zero {0} @@ -248,6 +260,7 @@ extern "C" { #define stackchan_websocket_v1_AudioWavStart_channels_tag 2 #define stackchan_websocket_v1_AudioChunk_pcm_bytes_tag 1 #define stackchan_websocket_v1_StateCommand_state_tag 1 +#define stackchan_websocket_v1_StateCommand_listening_purpose_tag 2 #define stackchan_websocket_v1_WakeWordEvent_detected_tag 1 #define stackchan_websocket_v1_StateEvent_state_tag 1 #define stackchan_websocket_v1_SpeakDoneEvent_done_tag 1 @@ -347,7 +360,8 @@ X(a, STATIC, SINGULAR, BYTES, pcm_bytes, 1) #define stackchan_websocket_v1_AudioChunk_DEFAULT NULL #define stackchan_websocket_v1_StateCommand_FIELDLIST(X, a) \ -X(a, STATIC, SINGULAR, UENUM, state, 1) +X(a, STATIC, SINGULAR, UENUM, state, 1) \ +X(a, STATIC, SINGULAR, UENUM, listening_purpose, 2) #define stackchan_websocket_v1_StateCommand_CALLBACK NULL #define stackchan_websocket_v1_StateCommand_DEFAULT NULL @@ -448,7 +462,7 @@ extern const pb_msgdesc_t stackchan_websocket_v1_ServerMetadata_msg; #define stackchan_websocket_v1_ServoCommand_size 14 #define stackchan_websocket_v1_ServoDoneEvent_size 2 #define stackchan_websocket_v1_SpeakDoneEvent_size 2 -#define stackchan_websocket_v1_StateCommand_size 2 +#define stackchan_websocket_v1_StateCommand_size 4 #define stackchan_websocket_v1_StateEvent_size 2 #define stackchan_websocket_v1_WakeWordEvent_size 2 #define stackchan_websocket_v1_WebSocketMessage_size 4113 diff --git a/firmware/src/display.cpp b/firmware/src/display.cpp index a33cabf..21aaf62 100644 --- a/firmware/src/display.cpp +++ b/firmware/src/display.cpp @@ -4,6 +4,7 @@ #include "config.h" #include "display.hpp" +#include "metadata.hpp" #if USE_STACKCHAN_BSP #define GFXModule M5StackChan.Display() @@ -137,6 +138,11 @@ void 
Display::drawForState(StateMachine::State state) GFXModule.setTextSize(1); GFXModule.setTextColor(font_color, bg_color); GFXModule.setCursor(isAtomS3R() ? 4 : 10, bar_y + (isAtomS3R() ? 6 : 2)); + if (state == StateMachine::Idle && shouldUseServerWakeWord()) + { + GFXModule.printf("Idle(Server-WWD)"); + return; + } GFXModule.printf("%s", stateToString(state)); } diff --git a/firmware/src/listening.cpp b/firmware/src/listening.cpp index edb2e35..1138d67 100644 --- a/firmware/src/listening.cpp +++ b/firmware/src/listening.cpp @@ -42,21 +42,45 @@ void Listening::init() void Listening::begin() { M5.Mic.begin(); - startStreaming(); + beginStreamingSession(SessionMode::Speech, true); } void Listening::end() { stopStreaming(); - M5.Mic.end(); + stopMicrophoneOnly(); +} + +bool Listening::beginWakeWordStreaming() +{ + if (streaming_) + { + return session_mode_ == SessionMode::WakeWord; + } + + M5.Mic.begin(); + return beginStreamingSession(SessionMode::WakeWord, false); +} + +void Listening::endWakeWordStreaming() +{ + stopStreaming(); + stopMicrophoneOnly(); } bool Listening::startStreaming() +{ + return beginStreamingSession(SessionMode::Speech, true); +} + +bool Listening::beginStreamingSession(SessionMode mode, bool auto_stop_for_silence) { ring_write_ = ring_read_ = ring_available_ = 0; seq_counter_ = 0; last_level_ = 0; silence_since_ms_ = 0; + session_mode_ = mode; + auto_stop_for_silence_ = auto_stop_for_silence; streaming_ = true; return sendPacket(stackchan_websocket_v1_MessageType_MESSAGE_TYPE_START, nullptr, 0); } @@ -89,10 +113,19 @@ bool Listening::stopStreaming() } streaming_ = false; + session_mode_ = SessionMode::Speech; + auto_stop_for_silence_ = true; ok = sendPacket(stackchan_websocket_v1_MessageType_MESSAGE_TYPE_END, nullptr, 0) && ok; return ok; } +void Listening::stopMicrophoneOnly() +{ + session_mode_ = SessionMode::Speech; + auto_stop_for_silence_ = true; + M5.Mic.end(); +} + void Listening::loop() { if (!streaming_) @@ -123,13 +156,20 @@ void 
Listening::loop() { streaming_ = false; log_i("WS send failed (data)"); - state_.setState(StateMachine::Idle); + if (session_mode_ == SessionMode::Speech) + { + state_.setState(StateMachine::Idle); + } + else + { + stopMicrophoneOnly(); + } return; } } // 無音が3秒続いたら終了 - if (shouldStopForSilence()) + if (auto_stop_for_silence_ && shouldStopForSilence()) { log_i("Auto stop: silence detected (avg=%ld)", static_cast(last_level_)); if (!stopStreaming()) diff --git a/firmware/src/main.cpp b/firmware/src/main.cpp index 6fb95c4..07c1da2 100644 --- a/firmware/src/main.cpp +++ b/firmware/src/main.cpp @@ -238,15 +238,43 @@ bool applyRemoteStateCommand(const stackchan_websocket_v1_StateCommand &command) switch (command.state) { case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_IDLE: + if (listening.isWakeWordStreaming()) + { + listening.endWakeWordStreaming(); + } stateMachine.setState(StateMachine::Idle); return true; case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_LISTENING: + if (command.listening_purpose == stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_WAKE_WORD && + shouldUseServerWakeWord() && + stateMachine.getState() == StateMachine::Idle) + { + if (!listening.beginWakeWordStreaming()) + { + log_w("Failed to start server-side wakeword streaming"); + return false; + } + return true; + } + + if (listening.isWakeWordStreaming()) + { + listening.endWakeWordStreaming(); + } stateMachine.setState(StateMachine::Listening); return true; case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_THINKING: + if (listening.isWakeWordStreaming()) + { + listening.endWakeWordStreaming(); + } stateMachine.setState(StateMachine::Thinking); return true; case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING: + if (listening.isWakeWordStreaming()) + { + listening.endWakeWordStreaming(); + } stateMachine.setState(StateMachine::Speaking); return true; default: @@ -542,7 +570,11 @@ void loop() { case StateMachine::Idle: 
handleTouchWakeWordInput(); - if (shouldUseDeviceWakeWord()) + if (listening.isWakeWordStreaming()) + { + listening.loop(); + } + else if (shouldUseDeviceWakeWord()) { wakeUpWord.loop(); } diff --git a/firmware/src/metadata.cpp b/firmware/src/metadata.cpp index 5e6cd9b..579e9a8 100644 --- a/firmware/src/metadata.cpp +++ b/firmware/src/metadata.cpp @@ -73,6 +73,11 @@ bool shouldUseDeviceWakeWord() return g_server_metadata.available && !g_server_metadata.has_server_wake_word; } +bool shouldUseServerWakeWord() +{ + return g_server_metadata.available && g_server_metadata.has_server_wake_word; +} + void setFirmwareMetadataMessage( stackchan_websocket_v1_WebSocketMessage &message, uint32_t seq) diff --git a/protobuf/websocket-message.proto b/protobuf/websocket-message.proto index c643673..fec8bff 100644 --- a/protobuf/websocket-message.proto +++ b/protobuf/websocket-message.proto @@ -62,6 +62,12 @@ enum StackchanState { STACKCHAN_STATE_SPEAKING = 3; } +enum ListeningPurpose { + LISTENING_PURPOSE_UNSPECIFIED = 0; + LISTENING_PURPOSE_SPEECH = 1; + LISTENING_PURPOSE_WAKE_WORD = 2; +} + enum ServoOperation { SERVO_OPERATION_SLEEP = 0; SERVO_OPERATION_MOVE_X = 1; @@ -99,6 +105,7 @@ message AudioChunk { message StateCommand { StackchanState state = 1; + ListeningPurpose listening_purpose = 2; } message WakeWordEvent { diff --git a/stackchan_server/app.py b/stackchan_server/app.py index 14496d2..94cf8de 100644 --- a/stackchan_server/app.py +++ b/stackchan_server/app.py @@ -10,6 +10,7 @@ from .speech_recognition import create_speech_recognizer from .speech_synthesis import create_speech_synthesizer from .types import SpeechRecognizer, SpeechSynthesizer +from .wakeup_word_detection import WakeWordDetectionError from .ws_proxy import WsProxy logger = getLogger(__name__) @@ -24,6 +25,10 @@ class SpeakRequest(BaseModel): text: str +class ServerWakeWordDetectResponse(BaseModel): + detected: bool + + class StackChanApp: def __init__( self, @@ -64,6 +69,25 @@ async def 
_trigger_wakeword(stackchan_ip: str): raise HTTPException(status_code=404, detail="stackchan not connected") proxy.trigger_wakeword() + @self.fastapi.post( + "/v1/stackchan/{stackchan_ip}/wakeword/server-detect", + response_model=ServerWakeWordDetectResponse, + ) + async def _detect_server_wakeword( + stackchan_ip: str, + timeout_seconds: float | None = None, + ): + proxy = await self._get_proxy(stackchan_ip) + if proxy is None: + raise HTTPException(status_code=404, detail="stackchan not connected") + try: + detected = await proxy.request_server_wakeword_detection( + timeout_seconds=timeout_seconds + ) + except WakeWordDetectionError as exc: + raise HTTPException(status_code=409, detail=str(exc)) from exc + return ServerWakeWordDetectResponse(detected=detected) + @self.fastapi.post("/v1/stackchan/{stackchan_ip}/speak", status_code=204) async def _speak(stackchan_ip: str, body: SpeakRequest): proxy = await self._get_proxy(stackchan_ip) @@ -99,6 +123,8 @@ async def _handle_ws(self, websocket: WebSocket) -> None: if self._setup_fn: await self._setup_fn(proxy) + await proxy.enable_auto_server_wakeword_detection() + while not proxy.closed: if not self._talk_session_fn: await asyncio.sleep(0.05) diff --git a/stackchan_server/generated_protobuf/websocket_message_pb2.py b/stackchan_server/generated_protobuf/websocket_message_pb2.py index a7d7a4e..433f9a8 100644 --- a/stackchan_server/generated_protobuf/websocket_message_pb2.py +++ b/stackchan_server/generated_protobuf/websocket_message_pb2.py @@ -24,25 +24,27 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x17websocket-message.proto\x12\x16stackchan.websocket.v1\"\x96\x08\n\x10WebSocketMessage\x12\x31\n\x04kind\x18\x01 \x01(\x0e\x32#.stackchan.websocket.v1.MessageKind\x12\x39\n\x0cmessage_type\x18\x02 \x01(\x0e\x32#.stackchan.websocket.v1.MessageType\x12\x0b\n\x03seq\x18\x03 \x01(\r\x12@\n\x0f\x61udio_pcm_start\x18\n 
\x01(\x0b\x32%.stackchan.websocket.v1.AudioPcmStartH\x00\x12<\n\x0e\x61udio_pcm_data\x18\x0b \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_pcm_end\x18\x0c \x01(\x0b\x32#.stackchan.websocket.v1.AudioPcmEndH\x00\x12@\n\x0f\x61udio_wav_start\x18\x14 \x01(\x0b\x32%.stackchan.websocket.v1.AudioWavStartH\x00\x12<\n\x0e\x61udio_wav_data\x18\x15 \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_wav_end\x18\x16 \x01(\x0b\x32#.stackchan.websocket.v1.AudioWavEndH\x00\x12\x39\n\tstate_cmd\x18\x1e \x01(\x0b\x32$.stackchan.websocket.v1.StateCommandH\x00\x12>\n\rwake_word_evt\x18\x1f \x01(\x0b\x32%.stackchan.websocket.v1.WakeWordEventH\x00\x12\x37\n\tstate_evt\x18 \x01(\x0b\x32\".stackchan.websocket.v1.StateEventH\x00\x12@\n\x0espeak_done_evt\x18! \x01(\x0b\x32&.stackchan.websocket.v1.SpeakDoneEventH\x00\x12\x41\n\tservo_cmd\x18\" \x01(\x0b\x32,.stackchan.websocket.v1.ServoCommandSequenceH\x00\x12@\n\x0eservo_done_evt\x18# \x01(\x0b\x32&.stackchan.websocket.v1.ServoDoneEventH\x00\x12\x45\n\x11\x66irmware_metadata\x18$ \x01(\x0b\x32(.stackchan.websocket.v1.FirmwareMetadataH\x00\x12\x41\n\x0fserver_metadata\x18% \x01(\x0b\x32&.stackchan.websocket.v1.ServerMetadataH\x00\x42\x06\n\x04\x62ody\"\x0f\n\rAudioPcmStart\"\r\n\x0b\x41udioPcmEnd\"6\n\rAudioWavStart\x12\x13\n\x0bsample_rate\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\"\r\n\x0b\x41udioWavEnd\"\x1f\n\nAudioChunk\x12\x11\n\tpcm_bytes\x18\x01 \x01(\x0c\"E\n\x0cStateCommand\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\"!\n\rWakeWordEvent\x12\x10\n\x08\x64\x65tected\x18\x01 \x01(\x08\"C\n\nStateEvent\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\"\x1e\n\x0eSpeakDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"N\n\x14ServoCommandSequence\x12\x36\n\x08\x63ommands\x18\x01 \x03(\x0b\x32$.stackchan.websocket.v1.ServoCommand\"f\n\x0cServoCommand\x12\x32\n\x02op\x18\x01 
\x01(\x0e\x32&.stackchan.websocket.v1.ServoOperation\x12\r\n\x05\x61ngle\x18\x02 \x01(\x11\x12\x13\n\x0b\x64uration_ms\x18\x03 \x01(\x11\"\x1e\n\x0eServoDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"\x99\x02\n\x10\x46irmwareMetadata\x12\x37\n\x0b\x64\x65vice_type\x18\x01 \x01(\x0e\x32\".stackchan.websocket.v1.DeviceType\x12\x15\n\rdisplay_width\x18\x02 \x01(\r\x12\x16\n\x0e\x64isplay_height\x18\x03 \x01(\r\x12\x1c\n\x14has_device_wake_word\x18\x04 \x01(\x08\x12\x0f\n\x07has_led\x18\x05 \x01(\x08\x12\x35\n\nservo_type\x18\x06 \x01(\x0e\x32!.stackchan.websocket.v1.ServoType\x12\x1d\n\x15supports_audio_duplex\x18\x07 \x01(\x08\x12\x18\n\x10\x66irmware_version\x18\x08 \x01(\t\"F\n\x0eServerMetadata\x12\x1c\n\x14has_server_wake_word\x18\x01 \x01(\x08\x12\x16\n\x0eserver_version\x18\x02 \x01(\t*\xdf\x02\n\x0bMessageKind\x12\x1c\n\x18MESSAGE_KIND_UNSPECIFIED\x10\x00\x12\x1a\n\x16MESSAGE_KIND_AUDIO_PCM\x10\x01\x12\x1a\n\x16MESSAGE_KIND_AUDIO_WAV\x10\x02\x12\x1a\n\x16MESSAGE_KIND_STATE_CMD\x10\x03\x12\x1e\n\x1aMESSAGE_KIND_WAKE_WORD_EVT\x10\x04\x12\x1a\n\x16MESSAGE_KIND_STATE_EVT\x10\x05\x12\x1f\n\x1bMESSAGE_KIND_SPEAK_DONE_EVT\x10\x06\x12\x1a\n\x16MESSAGE_KIND_SERVO_CMD\x10\x07\x12\x1f\n\x1bMESSAGE_KIND_SERVO_DONE_EVT\x10\x08\x12\"\n\x1eMESSAGE_KIND_FIRMWARE_METADATA\x10\t\x12 
\n\x1cMESSAGE_KIND_SERVER_METADATA\x10\n*p\n\x0bMessageType\x12\x1c\n\x18MESSAGE_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12MESSAGE_TYPE_START\x10\x01\x12\x15\n\x11MESSAGE_TYPE_DATA\x10\x02\x12\x14\n\x10MESSAGE_TYPE_END\x10\x03*\x85\x01\n\x0eStackchanState\x12\x18\n\x14STACKCHAN_STATE_IDLE\x10\x00\x12\x1d\n\x19STACKCHAN_STATE_LISTENING\x10\x01\x12\x1c\n\x18STACKCHAN_STATE_THINKING\x10\x02\x12\x1c\n\x18STACKCHAN_STATE_SPEAKING\x10\x03*c\n\x0eServoOperation\x12\x19\n\x15SERVO_OPERATION_SLEEP\x10\x00\x12\x1a\n\x16SERVO_OPERATION_MOVE_X\x10\x01\x12\x1a\n\x16SERVO_OPERATION_MOVE_Y\x10\x02*\x85\x01\n\nDeviceType\x12\x1b\n\x17\x44\x45VICE_TYPE_UNSPECIFIED\x10\x00\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5STACK_CORES3\x10\x01\x12\x1a\n\x16\x44\x45VICE_TYPE_M5ATOM_S3R\x10\x02\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5ATOM_ECHOS3R\x10\x03*i\n\tServoType\x12\x1a\n\x16SERVO_TYPE_UNSPECIFIED\x10\x00\x12\x13\n\x0fSERVO_TYPE_NONE\x10\x01\x12\x13\n\x0fSERVO_TYPE_SG90\x10\x02\x12\x16\n\x12SERVO_TYPE_SCS0009\x10\x03\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x17websocket-message.proto\x12\x16stackchan.websocket.v1\"\x96\x08\n\x10WebSocketMessage\x12\x31\n\x04kind\x18\x01 \x01(\x0e\x32#.stackchan.websocket.v1.MessageKind\x12\x39\n\x0cmessage_type\x18\x02 \x01(\x0e\x32#.stackchan.websocket.v1.MessageType\x12\x0b\n\x03seq\x18\x03 \x01(\r\x12@\n\x0f\x61udio_pcm_start\x18\n \x01(\x0b\x32%.stackchan.websocket.v1.AudioPcmStartH\x00\x12<\n\x0e\x61udio_pcm_data\x18\x0b \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_pcm_end\x18\x0c \x01(\x0b\x32#.stackchan.websocket.v1.AudioPcmEndH\x00\x12@\n\x0f\x61udio_wav_start\x18\x14 \x01(\x0b\x32%.stackchan.websocket.v1.AudioWavStartH\x00\x12<\n\x0e\x61udio_wav_data\x18\x15 \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_wav_end\x18\x16 \x01(\x0b\x32#.stackchan.websocket.v1.AudioWavEndH\x00\x12\x39\n\tstate_cmd\x18\x1e 
\x01(\x0b\x32$.stackchan.websocket.v1.StateCommandH\x00\x12>\n\rwake_word_evt\x18\x1f \x01(\x0b\x32%.stackchan.websocket.v1.WakeWordEventH\x00\x12\x37\n\tstate_evt\x18 \x01(\x0b\x32\".stackchan.websocket.v1.StateEventH\x00\x12@\n\x0espeak_done_evt\x18! \x01(\x0b\x32&.stackchan.websocket.v1.SpeakDoneEventH\x00\x12\x41\n\tservo_cmd\x18\" \x01(\x0b\x32,.stackchan.websocket.v1.ServoCommandSequenceH\x00\x12@\n\x0eservo_done_evt\x18# \x01(\x0b\x32&.stackchan.websocket.v1.ServoDoneEventH\x00\x12\x45\n\x11\x66irmware_metadata\x18$ \x01(\x0b\x32(.stackchan.websocket.v1.FirmwareMetadataH\x00\x12\x41\n\x0fserver_metadata\x18% \x01(\x0b\x32&.stackchan.websocket.v1.ServerMetadataH\x00\x42\x06\n\x04\x62ody\"\x0f\n\rAudioPcmStart\"\r\n\x0b\x41udioPcmEnd\"6\n\rAudioWavStart\x12\x13\n\x0bsample_rate\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\"\r\n\x0b\x41udioWavEnd\"\x1f\n\nAudioChunk\x12\x11\n\tpcm_bytes\x18\x01 \x01(\x0c\"\x8a\x01\n\x0cStateCommand\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\x12\x43\n\x11listening_purpose\x18\x02 \x01(\x0e\x32(.stackchan.websocket.v1.ListeningPurpose\"!\n\rWakeWordEvent\x12\x10\n\x08\x64\x65tected\x18\x01 \x01(\x08\"C\n\nStateEvent\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\"\x1e\n\x0eSpeakDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"N\n\x14ServoCommandSequence\x12\x36\n\x08\x63ommands\x18\x01 \x03(\x0b\x32$.stackchan.websocket.v1.ServoCommand\"f\n\x0cServoCommand\x12\x32\n\x02op\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.ServoOperation\x12\r\n\x05\x61ngle\x18\x02 \x01(\x11\x12\x13\n\x0b\x64uration_ms\x18\x03 \x01(\x11\"\x1e\n\x0eServoDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"\x99\x02\n\x10\x46irmwareMetadata\x12\x37\n\x0b\x64\x65vice_type\x18\x01 \x01(\x0e\x32\".stackchan.websocket.v1.DeviceType\x12\x15\n\rdisplay_width\x18\x02 \x01(\r\x12\x16\n\x0e\x64isplay_height\x18\x03 \x01(\r\x12\x1c\n\x14has_device_wake_word\x18\x04 
\x01(\x08\x12\x0f\n\x07has_led\x18\x05 \x01(\x08\x12\x35\n\nservo_type\x18\x06 \x01(\x0e\x32!.stackchan.websocket.v1.ServoType\x12\x1d\n\x15supports_audio_duplex\x18\x07 \x01(\x08\x12\x18\n\x10\x66irmware_version\x18\x08 \x01(\t\"F\n\x0eServerMetadata\x12\x1c\n\x14has_server_wake_word\x18\x01 \x01(\x08\x12\x16\n\x0eserver_version\x18\x02 \x01(\t*\xdf\x02\n\x0bMessageKind\x12\x1c\n\x18MESSAGE_KIND_UNSPECIFIED\x10\x00\x12\x1a\n\x16MESSAGE_KIND_AUDIO_PCM\x10\x01\x12\x1a\n\x16MESSAGE_KIND_AUDIO_WAV\x10\x02\x12\x1a\n\x16MESSAGE_KIND_STATE_CMD\x10\x03\x12\x1e\n\x1aMESSAGE_KIND_WAKE_WORD_EVT\x10\x04\x12\x1a\n\x16MESSAGE_KIND_STATE_EVT\x10\x05\x12\x1f\n\x1bMESSAGE_KIND_SPEAK_DONE_EVT\x10\x06\x12\x1a\n\x16MESSAGE_KIND_SERVO_CMD\x10\x07\x12\x1f\n\x1bMESSAGE_KIND_SERVO_DONE_EVT\x10\x08\x12\"\n\x1eMESSAGE_KIND_FIRMWARE_METADATA\x10\t\x12 \n\x1cMESSAGE_KIND_SERVER_METADATA\x10\n*p\n\x0bMessageType\x12\x1c\n\x18MESSAGE_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12MESSAGE_TYPE_START\x10\x01\x12\x15\n\x11MESSAGE_TYPE_DATA\x10\x02\x12\x14\n\x10MESSAGE_TYPE_END\x10\x03*\x85\x01\n\x0eStackchanState\x12\x18\n\x14STACKCHAN_STATE_IDLE\x10\x00\x12\x1d\n\x19STACKCHAN_STATE_LISTENING\x10\x01\x12\x1c\n\x18STACKCHAN_STATE_THINKING\x10\x02\x12\x1c\n\x18STACKCHAN_STATE_SPEAKING\x10\x03*t\n\x10ListeningPurpose\x12!\n\x1dLISTENING_PURPOSE_UNSPECIFIED\x10\x00\x12\x1c\n\x18LISTENING_PURPOSE_SPEECH\x10\x01\x12\x1f\n\x1bLISTENING_PURPOSE_WAKE_WORD\x10\x02*c\n\x0eServoOperation\x12\x19\n\x15SERVO_OPERATION_SLEEP\x10\x00\x12\x1a\n\x16SERVO_OPERATION_MOVE_X\x10\x01\x12\x1a\n\x16SERVO_OPERATION_MOVE_Y\x10\x02*\x85\x01\n\nDeviceType\x12\x1b\n\x17\x44\x45VICE_TYPE_UNSPECIFIED\x10\x00\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5STACK_CORES3\x10\x01\x12\x1a\n\x16\x44\x45VICE_TYPE_M5ATOM_S3R\x10\x02\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5ATOM_ECHOS3R\x10\x03*i\n\tServoType\x12\x1a\n\x16SERVO_TYPE_UNSPECIFIED\x10\x00\x12\x13\n\x0fSERVO_TYPE_NONE\x10\x01\x12\x13\n\x0fSERVO_TYPE_SG90\x10\x02\x12\x16\n\x12SERVO_TYPE_SCS0009\x10\x03\x62\
x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'websocket_message_pb2', _globals) if not _descriptor._USE_C_DESCRIPTORS: DESCRIPTOR._loaded_options = None - _globals['_MESSAGEKIND']._serialized_start=2016 - _globals['_MESSAGEKIND']._serialized_end=2367 - _globals['_MESSAGETYPE']._serialized_start=2369 - _globals['_MESSAGETYPE']._serialized_end=2481 - _globals['_STACKCHANSTATE']._serialized_start=2484 - _globals['_STACKCHANSTATE']._serialized_end=2617 - _globals['_SERVOOPERATION']._serialized_start=2619 - _globals['_SERVOOPERATION']._serialized_end=2718 - _globals['_DEVICETYPE']._serialized_start=2721 - _globals['_DEVICETYPE']._serialized_end=2854 - _globals['_SERVOTYPE']._serialized_start=2856 - _globals['_SERVOTYPE']._serialized_end=2961 + _globals['_MESSAGEKIND']._serialized_start=2086 + _globals['_MESSAGEKIND']._serialized_end=2437 + _globals['_MESSAGETYPE']._serialized_start=2439 + _globals['_MESSAGETYPE']._serialized_end=2551 + _globals['_STACKCHANSTATE']._serialized_start=2554 + _globals['_STACKCHANSTATE']._serialized_end=2687 + _globals['_LISTENINGPURPOSE']._serialized_start=2689 + _globals['_LISTENINGPURPOSE']._serialized_end=2805 + _globals['_SERVOOPERATION']._serialized_start=2807 + _globals['_SERVOOPERATION']._serialized_end=2906 + _globals['_DEVICETYPE']._serialized_start=2909 + _globals['_DEVICETYPE']._serialized_end=3042 + _globals['_SERVOTYPE']._serialized_start=3044 + _globals['_SERVOTYPE']._serialized_end=3149 _globals['_WEBSOCKETMESSAGE']._serialized_start=52 _globals['_WEBSOCKETMESSAGE']._serialized_end=1098 _globals['_AUDIOPCMSTART']._serialized_start=1100 @@ -55,22 +57,22 @@ _globals['_AUDIOWAVEND']._serialized_end=1201 _globals['_AUDIOCHUNK']._serialized_start=1203 _globals['_AUDIOCHUNK']._serialized_end=1234 - _globals['_STATECOMMAND']._serialized_start=1236 - _globals['_STATECOMMAND']._serialized_end=1305 - 
_globals['_WAKEWORDEVENT']._serialized_start=1307 - _globals['_WAKEWORDEVENT']._serialized_end=1340 - _globals['_STATEEVENT']._serialized_start=1342 - _globals['_STATEEVENT']._serialized_end=1409 - _globals['_SPEAKDONEEVENT']._serialized_start=1411 - _globals['_SPEAKDONEEVENT']._serialized_end=1441 - _globals['_SERVOCOMMANDSEQUENCE']._serialized_start=1443 - _globals['_SERVOCOMMANDSEQUENCE']._serialized_end=1521 - _globals['_SERVOCOMMAND']._serialized_start=1523 - _globals['_SERVOCOMMAND']._serialized_end=1625 - _globals['_SERVODONEEVENT']._serialized_start=1627 - _globals['_SERVODONEEVENT']._serialized_end=1657 - _globals['_FIRMWAREMETADATA']._serialized_start=1660 - _globals['_FIRMWAREMETADATA']._serialized_end=1941 - _globals['_SERVERMETADATA']._serialized_start=1943 - _globals['_SERVERMETADATA']._serialized_end=2013 + _globals['_STATECOMMAND']._serialized_start=1237 + _globals['_STATECOMMAND']._serialized_end=1375 + _globals['_WAKEWORDEVENT']._serialized_start=1377 + _globals['_WAKEWORDEVENT']._serialized_end=1410 + _globals['_STATEEVENT']._serialized_start=1412 + _globals['_STATEEVENT']._serialized_end=1479 + _globals['_SPEAKDONEEVENT']._serialized_start=1481 + _globals['_SPEAKDONEEVENT']._serialized_end=1511 + _globals['_SERVOCOMMANDSEQUENCE']._serialized_start=1513 + _globals['_SERVOCOMMANDSEQUENCE']._serialized_end=1591 + _globals['_SERVOCOMMAND']._serialized_start=1593 + _globals['_SERVOCOMMAND']._serialized_end=1695 + _globals['_SERVODONEEVENT']._serialized_start=1697 + _globals['_SERVODONEEVENT']._serialized_end=1727 + _globals['_FIRMWAREMETADATA']._serialized_start=1730 + _globals['_FIRMWAREMETADATA']._serialized_end=2011 + _globals['_SERVERMETADATA']._serialized_start=2013 + _globals['_SERVERMETADATA']._serialized_end=2083 # @@protoc_insertion_point(module_scope) diff --git a/stackchan_server/protobuf_ws.py b/stackchan_server/protobuf_ws.py index 8569004..443900b 100644 --- a/stackchan_server/protobuf_ws.py +++ b/stackchan_server/protobuf_ws.py @@ -1,7 
+1,7 @@ from __future__ import annotations from collections.abc import Sequence -from enum import StrEnum +from enum import IntEnum, StrEnum from typing import Any, Literal, cast from .generated_protobuf import websocket_message_pb2 as _ws_pb2 @@ -15,6 +15,12 @@ ServoCommand = ServoMoveCommand | ServoSleepCommand +class ListeningPurpose(IntEnum): + UNSPECIFIED = 0 + SPEECH = 1 + WAKE_WORD = 2 + + def _ensure_range(value: int, *, minimum: int, maximum: int, label: str) -> int: if not minimum <= value <= maximum: raise ValueError(f"{label} must be between {minimum} and {maximum}: {value}") @@ -92,13 +98,19 @@ def encode_audio_wav_end_message(seq: int) -> bytes: return message.SerializeToString() -def encode_state_command_message(seq: int, state_id: int) -> bytes: +def encode_state_command_message( + seq: int, + state_id: int, + *, + listening_purpose: int = ListeningPurpose.SPEECH, +) -> bytes: message = _new_message( ws_pb2.MESSAGE_KIND_STATE_CMD, ws_pb2.MESSAGE_TYPE_DATA, seq, ) message.state_cmd.state = int(state_id) + message.state_cmd.listening_purpose = int(listening_purpose) return message.SerializeToString() @@ -173,6 +185,7 @@ def encode_servo_command_message(seq: int, commands: Sequence[ServoCommand]) -> __all__ = [ "ServoCommand", + "ListeningPurpose", "encode_audio_pcm_data_message", "encode_audio_pcm_end_message", "encode_audio_pcm_start_message", diff --git a/stackchan_server/wakeup_word_detection/__init__.py b/stackchan_server/wakeup_word_detection/__init__.py new file mode 100644 index 0000000..d4c0f62 --- /dev/null +++ b/stackchan_server/wakeup_word_detection/__init__.py @@ -0,0 +1,13 @@ +from .create import create_server_side_wake_word_detector +from .server_side import ( + ServerSideWakeWordConfig, + ServerSideWakeWordDetector, + WakeWordDetectionError, +) + +__all__ = [ + "create_server_side_wake_word_detector", + "ServerSideWakeWordConfig", + "ServerSideWakeWordDetector", + "WakeWordDetectionError", +] diff --git 
a/stackchan_server/wakeup_word_detection/create.py b/stackchan_server/wakeup_word_detection/create.py new file mode 100644 index 0000000..2ec5011 --- /dev/null +++ b/stackchan_server/wakeup_word_detection/create.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from pydantic import Field +from pydantic_settings import BaseSettings + +from ..speech_recognition.whisper_server import WhisperServerSpeechToText +from .server_side import ServerSideWakeWordDetector + + +class _CreateServerSideWakeWordDetectorEnv(BaseSettings): + use_server_side_wwd_whisper_server: bool = Field( + default=False, + validation_alias="USE_SERVER_SIDE_WWD_WHISPER_SERVER", + ) + + class Config: + env_prefix = "" + + +def create_server_side_wake_word_detector() -> ServerSideWakeWordDetector | None: + env = _CreateServerSideWakeWordDetectorEnv() + if not env.use_server_side_wwd_whisper_server: + return None + + return ServerSideWakeWordDetector(recognizer=WhisperServerSpeechToText()) + + +__all__ = ["create_server_side_wake_word_detector"] diff --git a/stackchan_server/wakeup_word_detection/server_side.py b/stackchan_server/wakeup_word_detection/server_side.py new file mode 100644 index 0000000..f77be72 --- /dev/null +++ b/stackchan_server/wakeup_word_detection/server_side.py @@ -0,0 +1,205 @@ +from __future__ import annotations + +import asyncio +import unicodedata +from logging import getLogger + +from pydantic import Field +from pydantic_settings import BaseSettings + +from ..speech_recognition.whisper_server import WhisperServerSpeechToText +from ..static import LISTEN_AUDIO_FORMAT + +logger = getLogger(__name__) + + +class WakeWordDetectionError(Exception): + pass + + +class ServerSideWakeWordConfig(BaseSettings): + keywords: list[str] = Field(default_factory=lambda: ["スタックチャン"]) + window_seconds: float = 3.0 + interval_seconds: float = 0.5 + timeout_seconds: float = 30.0 + + class Config: + env_prefix = "STACKCHAN_SERVER_WAKE_WORD_" + + +class ServerSideWakeWordDetector: + def 
__init__( + self, + *, + recognizer: WhisperServerSpeechToText | None = None, + config: ServerSideWakeWordConfig | None = None, + ) -> None: + self.config = config or ServerSideWakeWordConfig() + self.recognizer = recognizer or WhisperServerSpeechToText() + self._pcm_buffer = bytearray() + self._running = False + self._detected = False + self._streaming_started = False + self._error: Exception | None = None + self._last_inference_at = 0.0 + self._inference_task: asyncio.Task[None] | None = None + self._event = asyncio.Event() + self._lock = asyncio.Lock() + self._streaming_ended = False + + @property + def running(self) -> bool: + return self._running + + async def start(self) -> None: + await self.stop() + self._pcm_buffer = bytearray() + self._running = True + self._detected = False + self._streaming_started = False + self._streaming_ended = False + self._error = None + self._last_inference_at = 0.0 + self._event.clear() + logger.info("Server-side wake-word detection started") + + async def stop(self) -> None: + self._running = False + if self._inference_task is not None: + self._inference_task.cancel() + try: + await self._inference_task + except asyncio.CancelledError: + pass + self._inference_task = None + self._event.set() + + async def handle_start(self) -> None: + if not self._running: + return + self._streaming_started = True + self._streaming_ended = False + self._pcm_buffer = bytearray() + self._last_inference_at = 0.0 + logger.info("Server-side wake-word stream START") + + async def handle_data(self, payload: bytes) -> None: + if not self._running: + return + if not self._streaming_started: + logger.warning( + "Ignoring stale server-side wake-word DATA before START payload_bytes=%d", + len(payload), + ) + return + if self._streaming_ended: + logger.warning( + "Ignoring stale server-side wake-word DATA after END payload_bytes=%d", + len(payload), + ) + return + + self._pcm_buffer.extend(payload) + self._truncate_buffer_to_window() + + loop = 
asyncio.get_running_loop() + now = loop.time() + if (now - self._last_inference_at) < self.config.interval_seconds: + return + if self._inference_task is not None and not self._inference_task.done(): + return + + self._last_inference_at = now + window_bytes = bytes(self._pcm_buffer) + self._inference_task = asyncio.create_task(self._run_inference(window_bytes)) + + async def handle_end(self) -> None: + if not self._running: + return + if not self._streaming_started: + logger.warning("Ignoring stale server-side wake-word END before START") + return + if self._streaming_ended: + logger.warning("Ignoring duplicate server-side wake-word END") + return + self._streaming_ended = True + logger.info("Server-side wake-word stream END") + if self._inference_task is not None and not self._inference_task.done(): + try: + await self._inference_task + except Exception as exc: # pragma: no cover + self._error = exc + if not self._detected: + self._event.set() + + async def wait_result(self, timeout_seconds: float | None = None) -> bool: + if not self._running: + raise WakeWordDetectionError("Server-side wake-word detection is not running") + + timeout = ( + timeout_seconds + if timeout_seconds is not None + else self.config.timeout_seconds + ) + try: + await asyncio.wait_for(self._event.wait(), timeout=timeout) + except asyncio.TimeoutError as exc: + raise WakeWordDetectionError("Server-side wake-word detection timed out") from exc + + if self._error is not None: + raise WakeWordDetectionError(str(self._error)) from self._error + + return self._detected + + async def _run_inference(self, pcm_bytes: bytes) -> None: + if not pcm_bytes: + return + + try: + async with self._lock: + transcript = await self.recognizer.transcribe(pcm_bytes) + except Exception as exc: # pragma: no cover + logger.exception("Server-side wake-word transcription failed") + self._error = exc + self._event.set() + return + + logger.info("Server-side wake-word transcript: %s", transcript) + + if 
self._contains_wake_word(transcript): + logger.info("Server-side wake-word detected") + self._detected = True + self._event.set() + + def _contains_wake_word(self, transcript: str) -> bool: + normalized_transcript = _normalize_text(transcript) + if not normalized_transcript: + return False + + for keyword in self.config.keywords: + normalized_keyword = _normalize_text(keyword) + if normalized_keyword and normalized_keyword in normalized_transcript: + return True + return False + + def _truncate_buffer_to_window(self) -> None: + sample_rate = LISTEN_AUDIO_FORMAT.sample_rate_hz + channels = LISTEN_AUDIO_FORMAT.channels + sample_width = LISTEN_AUDIO_FORMAT.sample_width + bytes_per_second = sample_rate * channels * sample_width + max_bytes = max(1, int(bytes_per_second * self.config.window_seconds)) + if len(self._pcm_buffer) <= max_bytes: + return + del self._pcm_buffer[: len(self._pcm_buffer) - max_bytes] + + +def _normalize_text(text: str) -> str: + normalized = unicodedata.normalize("NFKC", text or "") + return "".join(normalized.lower().split()) + + +__all__ = [ + "ServerSideWakeWordConfig", + "ServerSideWakeWordDetector", + "WakeWordDetectionError", +] diff --git a/stackchan_server/ws_proxy.py b/stackchan_server/ws_proxy.py index 1c45236..b10ec5a 100644 --- a/stackchan_server/ws_proxy.py +++ b/stackchan_server/ws_proxy.py @@ -12,12 +12,12 @@ from fastapi import WebSocket, WebSocketDisconnect from google.protobuf.message import DecodeError -from pydantic_settings import BaseSettings from . 
import __version__ from .generated_protobuf import websocket_message_pb2 as _ws_pb2 from .listen import EmptyTranscriptError, ListenHandler, TimeoutError from .protobuf_ws import ( + ListeningPurpose, encode_server_metadata_message, encode_servo_command_message, encode_state_command_message, @@ -26,6 +26,10 @@ from .speak import SpeakHandler from .static import LISTEN_AUDIO_FORMAT from .types import SpeechRecognizer, SpeechSynthesizer +from .wakeup_word_detection import ( + WakeWordDetectionError, + create_server_side_wake_word_detector, +) logger = getLogger(__name__) @@ -43,17 +47,8 @@ ) # half interval for the second segment start _LISTEN_AUDIO_TIMEOUT_SECONDS = 10.0 _DEBUG_RECORDING_ENABLED = os.getenv("DEBUG_RECODING") == "1" - - -class _WakeWordServerConfig(BaseSettings): - no_use_client_wakeup_word: bool = False - use_open_wake_word: bool = False - - class Config: - env_prefix = "STACKCHAN_" - - -_WAKEWORD_SERVER_CONFIG = _WakeWordServerConfig() +_SERVER_WAKEWORD_RESTART_DELAY_SECONDS = 0.25 +_TRAILING_PCM_DRAIN_SECONDS = 1.0 class FirmwareState(IntEnum): @@ -129,6 +124,12 @@ def __init__( recordings_dir=self.recordings_dir, debug_recording=self._debug_recording, ) + self._server_wakeword_detector = create_server_side_wake_word_detector() + self._server_wakeword_task: Optional[asyncio.Task[bool]] = None + self._server_wakeword_restart_task: Optional[asyncio.Task[None]] = None + self._auto_start_server_wakeword = False + self._drain_trailing_pcm_until_end = False + self._drain_trailing_pcm_deadline: float | None = None self._receiving_task: Optional[asyncio.Task] = None self._closed = False @@ -157,6 +158,10 @@ def current_state(self) -> FirmwareState: def receive_task(self) -> Optional[asyncio.Task]: return self._receiving_task + @property + def has_server_wakeword_detector(self) -> bool: + return self._server_wakeword_detector is not None + def trigger_wakeword(self) -> None: """Web API から擬似的に WAKEWORD_EVT を発火させる。""" logger.info("Triggered wakeword via 
API") @@ -165,6 +170,7 @@ def trigger_wakeword(self) -> None: async def wait_for_talk_session(self) -> None: while True: if self._wakeword_event.is_set(): + await self.stop_server_wakeword_detection() self._wakeword_event.clear() return if self._closed: @@ -172,6 +178,7 @@ async def wait_for_talk_session(self) -> None: await asyncio.sleep(0.05) async def listen(self) -> str: + await self.stop_server_wakeword_detection() return await self._listener.listen( send_state_command=self.send_state_command, is_closed=lambda: self._closed, @@ -188,11 +195,21 @@ async def speak(self, text: str) -> None: is_closed=lambda: self._closed, ) - async def send_state_command(self, state_id: int | FirmwareState) -> None: - await self._send_state_command(state_id) + async def send_state_command( + self, + state_id: int | FirmwareState, + *, + listening_purpose: ListeningPurpose = ListeningPurpose.SPEECH, + ) -> None: + await self._send_state_command( + state_id, + listening_purpose=listening_purpose, + ) async def reset_state(self) -> None: await self.send_state_command(FirmwareState.IDLE) + self._current_firmware_state = FirmwareState.IDLE + self._schedule_server_wakeword_restart() async def move_servo(self, commands: Sequence[ServoCommand]) -> None: previous_counter = self._servo_sent_counter @@ -232,6 +249,8 @@ async def start(self) -> None: async def close(self) -> None: self._closed = True + self._cancel_server_wakeword_restart_task() + await self.stop_server_wakeword_detection() if self._receiving_task: self._receiving_task.cancel() with suppress(asyncio.CancelledError): @@ -241,6 +260,84 @@ async def close(self) -> None: async def start_talking(self, text: str) -> None: await self.speak(text) + async def enable_auto_server_wakeword_detection(self) -> None: + self._auto_start_server_wakeword = True + await self.start_server_wakeword_detection_if_available() + + async def start_server_wakeword_detection_if_available(self) -> bool: + if ( + self._closed + or 
self._server_wakeword_detector is None + or not self.server_metadata.has_server_wake_word + or self.current_state != FirmwareState.IDLE + ): + return False + + if self._server_wakeword_task is not None and not self._server_wakeword_task.done(): + return True + + self._cancel_server_wakeword_restart_task() + self._server_wakeword_task = asyncio.create_task( + self._run_server_wakeword_detection(), + name="server-side-wakeword-detection", + ) + return True + + async def stop_server_wakeword_detection(self) -> None: + self._cancel_server_wakeword_restart_task() + task = self._server_wakeword_task + if task is None: + return + + if task.done(): + self._server_wakeword_task = None + try: + await task + except asyncio.CancelledError: + pass + except Exception: + logger.exception("Server-side wake-word detection task failed") + return + + task.cancel() + self._server_wakeword_task = None + try: + await task + except asyncio.CancelledError: + pass + except Exception: + logger.exception("Server-side wake-word detection task failed") + + async def request_server_wakeword_detection( + self, + *, + timeout_seconds: float | None = None, + ) -> bool: + if self._server_wakeword_detector is None or not self.server_metadata.has_server_wake_word: + raise WakeWordDetectionError( + "Server-side wake-word detection is not available for this connection" + ) + if self._closed: + raise WebSocketDisconnect() + + started = await self.start_server_wakeword_detection_if_available() + if not started: + raise WakeWordDetectionError( + "Server-side wake-word detection could not be started in the current state" + ) + + task = self._server_wakeword_task + if task is None: + raise WakeWordDetectionError("Server-side wake-word detection task is unavailable") + + try: + if timeout_seconds is None: + return await asyncio.shield(task) + return await asyncio.wait_for(asyncio.shield(task), timeout=timeout_seconds) + except asyncio.TimeoutError as exc: + await self.stop_server_wakeword_detection() + raise 
WakeWordDetectionError("Server-side wake-word detection timed out") from exc + async def _receive_loop(self) -> None: try: while True: @@ -254,6 +351,63 @@ async def _receive_loop(self) -> None: if message.kind == ws_pb2.MESSAGE_KIND_AUDIO_PCM: body_name = message.WhichOneof("body") + if self._should_drain_trailing_pcm(): + if ( + message.message_type == ws_pb2.MESSAGE_TYPE_START + and body_name == "audio_pcm_start" + ): + logger.info( + "Received a new PCM START while draining trailing wake-word audio; resuming normal routing" + ) + self._clear_trailing_pcm_drain() + elif ( + message.message_type == ws_pb2.MESSAGE_TYPE_DATA + and body_name == "audio_pcm_data" + ): + logger.info( + "Discarding trailing PCM DATA after wake-word detection stop payload_bytes=%d", + len(message.audio_pcm_data.pcm_bytes), + ) + continue + elif ( + message.message_type == ws_pb2.MESSAGE_TYPE_END + and body_name == "audio_pcm_end" + ): + logger.info( + "Finished draining trailing PCM after wake-word detection stop" + ) + self._clear_trailing_pcm_drain() + continue + + if ( + self._server_wakeword_detector is not None + and self._server_wakeword_detector.running + ): + if ( + message.message_type == ws_pb2.MESSAGE_TYPE_START + and body_name == "audio_pcm_start" + ): + await self._server_wakeword_detector.handle_start() + continue + + if ( + message.message_type == ws_pb2.MESSAGE_TYPE_DATA + and body_name == "audio_pcm_data" + ): + payload = bytes(message.audio_pcm_data.pcm_bytes) + await self._server_wakeword_detector.handle_data(payload) + continue + + if ( + message.message_type == ws_pb2.MESSAGE_TYPE_END + and body_name == "audio_pcm_end" + ): + await self._server_wakeword_detector.handle_end() + continue + + await self.ws.close(code=1003, reason="unknown wakeword PCM protobuf body") + break + if ( message.message_type == ws_pb2.MESSAGE_TYPE_START and body_name == "audio_pcm_start" @@ -362,17 +516,13 @@ async def _handle_firmware_metadata(self, message: Any) -> None: 
server_version=self.server_metadata.server_version, ) ) + if self._auto_start_server_wakeword: + await self.start_server_wakeword_detection_if_available() def _build_server_metadata( self, firmware_metadata: FirmwareMetadata ) -> ServerMetadata: - should_use_server_wake_word = ( - _WAKEWORD_SERVER_CONFIG.use_open_wake_word - and ( - _WAKEWORD_SERVER_CONFIG.no_use_client_wakeup_word - or not firmware_metadata.has_device_wake_word - ) - ) + should_use_server_wake_word = self._server_wakeword_detector is not None return ServerMetadata( has_server_wake_word=should_use_server_wake_word, server_version=__version__, @@ -410,10 +560,126 @@ def _handle_servo_done_event(self, message: Any) -> None: self._servo_done_counter += 1 logger.info("Received servo done event") - async def _send_state_command(self, state_id: int | FirmwareState) -> None: + async def _send_state_command( + self, + state_id: int | FirmwareState, + *, + listening_purpose: ListeningPurpose = ListeningPurpose.SPEECH, + ) -> None: await self.ws.send_bytes( - encode_state_command_message(self._next_down_seq(), int(state_id)) + encode_state_command_message( + self._next_down_seq(), + int(state_id), + listening_purpose=int(listening_purpose), + ) + ) + + async def _run_server_wakeword_detection(self) -> bool: + detector = self._server_wakeword_detector + if detector is None: + return False + + detected = False + should_restart = False + try: + await detector.start() + await self.send_state_command( + FirmwareState.LISTENING, + listening_purpose=ListeningPurpose.WAKE_WORD, + ) + detected = await detector.wait_result() + if detected: + self._wakeword_event.set() + return detected + except asyncio.CancelledError: + raise + except WakeWordDetectionError as exc: + logger.warning("Server-side wake-word detection stopped: %s", exc) + return False + except Exception: + logger.exception("Server-side wake-word detection failed") + return False + finally: + await detector.stop() + self._arm_trailing_pcm_drain() + if not 
self._closed: + self._current_firmware_state = FirmwareState.IDLE + try: + await self.send_state_command(FirmwareState.IDLE) + except Exception: + logger.exception("Failed to return firmware to idle after wake-word detection") + should_restart = ( + self._auto_start_server_wakeword + and not detected + and not self._wakeword_event.is_set() + and not self._closed + ) + if self._server_wakeword_task is asyncio.current_task(): + self._server_wakeword_task = None + if should_restart: + self._schedule_server_wakeword_restart() + + def _schedule_server_wakeword_restart( + self, + delay_seconds: float = _SERVER_WAKEWORD_RESTART_DELAY_SECONDS, + ) -> None: + if not self._auto_start_server_wakeword or self._closed: + return + + self._cancel_server_wakeword_restart_task() + self._server_wakeword_restart_task = asyncio.create_task( + self._restart_server_wakeword_detection_after_delay(delay_seconds), + name="server-side-wakeword-restart", + ) + + def _cancel_server_wakeword_restart_task(self) -> None: + task = self._server_wakeword_restart_task + if task is None: + return + self._server_wakeword_restart_task = None + task.cancel() + + async def _restart_server_wakeword_detection_after_delay( + self, + delay_seconds: float, + ) -> None: + try: + await asyncio.sleep(delay_seconds) + if self._closed: + return + await self.start_server_wakeword_detection_if_available() + except asyncio.CancelledError: + raise + finally: + if self._server_wakeword_restart_task is asyncio.current_task(): + self._server_wakeword_restart_task = None + + def _arm_trailing_pcm_drain( + self, + timeout_seconds: float = _TRAILING_PCM_DRAIN_SECONDS, + ) -> None: + loop = asyncio.get_running_loop() + self._drain_trailing_pcm_until_end = True + self._drain_trailing_pcm_deadline = loop.time() + timeout_seconds + + def _clear_trailing_pcm_drain(self) -> None: + self._drain_trailing_pcm_until_end = False + self._drain_trailing_pcm_deadline = None + + def _should_drain_trailing_pcm(self) -> bool: + if not 
self._drain_trailing_pcm_until_end: + return False + deadline = self._drain_trailing_pcm_deadline + if deadline is None: + return True + if asyncio.get_running_loop().time() <= deadline: + return True + + logger.info( + "Trailing PCM drain window expired before END arrived; resuming normal routing" ) + self._clear_trailing_pcm_drain() + return False async def _wait_for_counter( self, From 930fdffdc7e53065427109003a49bebbc4a3a7bc Mon Sep 17 00:00:00 2001 From: Atsushi Morimoto <74th.tech@gmail.com> Date: Sat, 9 May 2026 14:38:07 +0900 Subject: [PATCH 02/15] feat: Stop wakeword streaming on WebSocket disconnection --- firmware/src/main.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/firmware/src/main.cpp b/firmware/src/main.cpp index 07c1da2..67256ad 100644 --- a/firmware/src/main.cpp +++ b/firmware/src/main.cpp @@ -351,6 +351,11 @@ void handleWsEvent(WStype_t type, uint8_t *payload, size_t length) case WStype_DISCONNECTED: // M5.Display.println("WS: disconnected"); log_i("WS disconnected"); + if (listening.isWakeWordStreaming()) + { + log_i("Stopping server-side wakeword uplink because WS disconnected"); + listening.endWakeWordStreaming(); + } resetServerMetadata(); stateMachine.setState(StateMachine::Disconnected); break; From 03d0bab82716698f303b5f3f50fdf77adb9b6f0c Mon Sep 17 00:00:00 2001 From: Atsushi Morimoto <74th.tech@gmail.com> Date: Sat, 9 May 2026 15:25:54 +0900 Subject: [PATCH 03/15] feat: Add server-side wakeword detection support - Introduced `ServerWwdPcm` message kind for server-side wakeword PCM stream. - Updated WebSocket message protocol to include `MESSAGE_KIND_SERVER_WWD_PCM`. - Implemented `WhisperServerWakeWordDetector` for handling server-side wakeword detection. - Refactored `WsProxy` to manage server-side wakeword PCM messages. - Removed deprecated server-side wakeword detection API endpoint. - Enhanced documentation for new wakeword detection flow and message types. 
--- .env.template | 6 + docs/rest_api_ja.md | 58 ----- docs/server_ja.md | 16 ++ docs/websocket_protocols_ja.md | 20 +- .../generated_protobuf/websocket-message.pb.h | 7 +- firmware/src/listening.cpp | 7 +- protobuf/websocket-message.proto | 1 + stackchan_server/app.py | 24 -- .../websocket_message_pb2.py | 28 +-- .../wakeup_word_detection/__init__.py | 12 +- .../wakeup_word_detection/create.py | 21 +- .../{server_side.py => whisper_server.py} | 31 ++- stackchan_server/ws_proxy.py | 232 ++++++++++-------- 13 files changed, 234 insertions(+), 229 deletions(-) rename stackchan_server/wakeup_word_detection/{server_side.py => whisper_server.py} (89%) diff --git a/.env.template b/.env.template index 43d3290..89d30f6 100644 --- a/.env.template +++ b/.env.template @@ -34,6 +34,12 @@ STACKCHAN_GOOGLE_CLOUD_TTS_VOICE_NAME="Despina" STACKCHAN_VOICEVOX_URL="http://localhost:50021" STACKCHAN_VOICEVOX_SPEAKER=1 +# -- Server-side Wakeup Word Detection -- +# Whisper Server +# STACKCHAN_USE_WWD_WHISPER_SERVER=1 +# STACKCHAN_WWD_WHISPER_SERVER_URL="http://127.0.0.1:8080/inference" +# STACKCHAN_WWD_WHISPER_SERVER_MODEL= + # -- Claude Agent SDK -- # using Google Cloud Vertex AI CLAUDE_CODE_USE_VERTEX=1 diff --git a/docs/rest_api_ja.md b/docs/rest_api_ja.md index a3b41e1..86c2d2f 100644 --- a/docs/rest_api_ja.md +++ b/docs/rest_api_ja.md @@ -17,7 +17,6 @@ | `GET` | `/v1/stackchan` | 接続中 StackChan 一覧 | | `GET` | `/v1/stackchan/{stackchan_ip}` | 指定 StackChan の状態取得 | | `POST` | `/v1/stackchan/{stackchan_ip}/wakeword` | 擬似 wakeword 発火 | -| `POST` | `/v1/stackchan/{stackchan_ip}/wakeword/server-detect` | サーバーサイド wakeword 検出を要求 | | `POST` | `/v1/stackchan/{stackchan_ip}/speak` | 指定 StackChan に発話させる | ## `GET /health` @@ -135,63 +134,6 @@ - 実機側のウェイクワード検出 (`WakeWordEvt`) と同じように扱われます。 - すでに `talk_session` 実行中でも、イベント自体は内部フラグとして立ちます。 -## `POST /v1/stackchan/{stackchan_ip}/wakeword/server-detect` - -サーバーサイドの wakeword 検出を開始します。 - -> [!NOTE] -> 環境変数 `USE_SERVER_SIDE_WWD_WHISPER_SERVER=1` の場合、サーバーは 
`@app.setup()` 完了後と `Idle` 復帰後に自動でサーバーサイド wakeword 検出を開始します。この API は明示的に現在の検出サイクルを待ちたい場合に利用できます。 - -- サーバーは対象 StackChan に `StateCmd(Listening, WAKE_WORD)` を送ってマイク音声を受信し、 - 直近 3 秒窓を 0.5 秒ごとに認識します。 -- 認識テキストには全結果がログ出力されます。 -- キーワードが検出されたら内部 wakeword イベントを発火し、`talk_session` 開始待ちを解除します。 -- 検出の終了時には `StateCmd(Idle)` を送って待機に戻します。 -- 実機の表示状態は `Listening` へは変わらず、`Idle(Server-WWD)` のまま待ち受けます。 -- このモードでは無音 3 秒によるクライアント側自動終了は行いません。 - -### パスパラメータ - -| 名前 | 型 | 説明 | -| --- | --- | --- | -| `stackchan_ip` | `string` | 対象 StackChan の接続元 IP | - -### クエリパラメータ - -| 名前 | 型 | 必須 | 説明 | -| --- | --- | --- | --- | -| `timeout_seconds` | `number` | 任意 | 検出待ちタイムアウト秒。未指定時はサーバー設定値 | - -### 成功レスポンス - -- Status: `200 OK` - -```json -{ - "detected": true -} -``` - -`detected` が `false` の場合は、検出セッションは終了したがキーワード未検出です。 - -### エラーレスポンス - -- Status: `404 Not Found` - -```json -{ - "detail": "stackchan not connected" -} -``` - -- Status: `409 Conflict` - -```json -{ - "detail": "Server-side wake-word detection is not available for this connection" -} -``` - ## `POST /v1/stackchan/{stackchan_ip}/speak` 指定した StackChan にテキストを発話させます。 diff --git a/docs/server_ja.md b/docs/server_ja.md index 33767ce..506727c 100644 --- a/docs/server_ja.md +++ b/docs/server_ja.md @@ -86,6 +86,22 @@ STACKCHAN_WHISPER_SERVER_URL=http://localhost:13305/api/v1/audio/transcriptions STACKCHAN_WHISPER_SERVER_MODEL=Whisper-Large-v3-Turbo ``` +### (オプション) サーバーサイド wakeword 用 Whisper Server の設定 + +サーバーサイド wakeword 検出を有効にするには、以下を設定します。 + +- `STACKCHAN_USE_WWD_WHISPER_SERVER`: `1` +- `STACKCHAN_WWD_WHISPER_SERVER_URL`: wakeword 検出専用 Whisper Server の推論エンドポイント URL +- `STACKCHAN_WWD_WHISPER_SERVER_MODEL`: wakeword 検出専用に利用するモデル名 + +通常の音声認識で使う `STACKCHAN_WHISPER_SERVER_URL` / `STACKCHAN_WHISPER_SERVER_MODEL` とは別設定です。 + +``` +STACKCHAN_USE_WWD_WHISPER_SERVER=1 +STACKCHAN_WWD_WHISPER_SERVER_URL="http://127.0.0.1:8080/inference" +STACKCHAN_WWD_WHISPER_SERVER_MODEL= +``` + ## 音声合成の設定 音声合成エンジンとして、以下に対応しています。 diff --git 
a/docs/websocket_protocols_ja.md b/docs/websocket_protocols_ja.md index d5a21aa..e8694a0 100644 --- a/docs/websocket_protocols_ja.md +++ b/docs/websocket_protocols_ja.md @@ -28,6 +28,7 @@ | 名前 | 方向 | 用途 | | --- | --- | --- | | `AudioPcm` | CoreS3 → Server | マイク音声 PCM ストリーム | +| `ServerWwdPcm` | CoreS3 → Server | サーバーサイド wakeword 検出専用 PCM ストリーム | | `AudioWav` | Server → CoreS3 | TTS 音声 PCM ストリーム | | `StateCmd` | Server → CoreS3 | 状態遷移指示 | | `WakeWordEvt` | CoreS3 → Server | ウェイクワード検出通知 | @@ -64,6 +65,20 @@ - 無音判定は平均絶対振幅 `<= 200` が 3 秒継続したときに発火します。 - 停止時は未送信サンプルを `DATA` で flush してから `END` を送ります。 +## サーバーサイド wakeword 入力 `ServerWwdPcm` + +- 方向: CoreS3 → Server +- フォーマット: PCM16LE / 16kHz / 1ch +- シーケンス: `AudioPcmStart` → `AudioChunk` 複数回 → `AudioPcmEnd` +- `kind`: `MESSAGE_KIND_SERVER_WWD_PCM` +- body は `AudioPcm` と同じ `AudioPcmStart` / `AudioChunk` / `AudioPcmEnd` を使います。 + +### 現行実装メモ + +- `StateCmd(Listening, WAKE_WORD)` を受けた CoreS3 は、見た目の状態を `Idle(Server-WWD)` のままにしてこの kind で uplink します。 +- 無音 3 秒によるクライアント側自動終了は行いません。 +- サーバーはこの kind だけを server-side wakeword detector にルーティングします。 + ## スピーカ再生 `AudioWav` - 方向: Server → CoreS3 @@ -138,9 +153,8 @@ CoreS3 側は `has_server_wake_word=true` を受けると、デバイス側 wake ## サーバーサイド wakeword 検出フロー -- 環境変数 `USE_SERVER_SIDE_WWD_WHISPER_SERVER=1` の場合、サーバーは `@app.setup()` 完了後と `Idle` 復帰後に自動でサーバーサイド wakeword 検出を開始します。 -- REST API `POST /v1/stackchan/{ip}/wakeword/server-detect` を呼ぶと、 - サーバーは `StateCmd(Listening, WAKE_WORD)` を送信してマイク uplink を受信します。 +- 環境変数 `STACKCHAN_USE_WWD_WHISPER_SERVER=1` の場合、サーバーは `@app.setup()` 完了後と `Idle` 復帰後に自動でサーバーサイド wakeword 検出を開始します。 +- サーバーは `StateCmd(Listening, WAKE_WORD)` を送信して `MESSAGE_KIND_SERVER_WWD_PCM` のマイク uplink を受信します。 - 受信した音声の直近 3 秒窓を 0.5 秒ごとに音声認識へ渡し、 定義キーワード(例: `スタクチャン`)を含むか判定します。 - 各判定タイミングの認識結果はすべてログ出力されます。 diff --git a/firmware/lib/generated_protobuf/websocket-message.pb.h b/firmware/lib/generated_protobuf/websocket-message.pb.h index ffd1d31..28e1f54 100644 --- 
a/firmware/lib/generated_protobuf/websocket-message.pb.h +++ b/firmware/lib/generated_protobuf/websocket-message.pb.h @@ -21,7 +21,8 @@ typedef enum _stackchan_websocket_v1_MessageKind { stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVO_CMD = 7, stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVO_DONE_EVT = 8, stackchan_websocket_v1_MessageKind_MESSAGE_KIND_FIRMWARE_METADATA = 9, - stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVER_METADATA = 10 + stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVER_METADATA = 10, + stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVER_WWD_PCM = 11 } stackchan_websocket_v1_MessageKind; typedef enum _stackchan_websocket_v1_MessageType { @@ -172,8 +173,8 @@ extern "C" { /* Helper constants for enums */ #define _stackchan_websocket_v1_MessageKind_MIN stackchan_websocket_v1_MessageKind_MESSAGE_KIND_UNSPECIFIED -#define _stackchan_websocket_v1_MessageKind_MAX stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVER_METADATA -#define _stackchan_websocket_v1_MessageKind_ARRAYSIZE ((stackchan_websocket_v1_MessageKind)(stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVER_METADATA+1)) +#define _stackchan_websocket_v1_MessageKind_MAX stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVER_WWD_PCM +#define _stackchan_websocket_v1_MessageKind_ARRAYSIZE ((stackchan_websocket_v1_MessageKind)(stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVER_WWD_PCM+1)) #define _stackchan_websocket_v1_MessageType_MIN stackchan_websocket_v1_MessageType_MESSAGE_TYPE_UNSPECIFIED #define _stackchan_websocket_v1_MessageType_MAX stackchan_websocket_v1_MessageType_MESSAGE_TYPE_END diff --git a/firmware/src/listening.cpp b/firmware/src/listening.cpp index 1138d67..256d26a 100644 --- a/firmware/src/listening.cpp +++ b/firmware/src/listening.cpp @@ -113,9 +113,9 @@ bool Listening::stopStreaming() } streaming_ = false; + ok = sendPacket(stackchan_websocket_v1_MessageType_MESSAGE_TYPE_END, nullptr, 0) && ok; session_mode_ = SessionMode::Speech; 
auto_stop_for_silence_ = true; - ok = sendPacket(stackchan_websocket_v1_MessageType_MESSAGE_TYPE_END, nullptr, 0) && ok; return ok; } @@ -236,7 +236,10 @@ bool Listening::sendPacket(stackchan_websocket_v1_MessageType type, const int16_ auto &message = g_listening_tx_message; message = stackchan_websocket_v1_WebSocketMessage_init_zero; - message.kind = stackchan_websocket_v1_MessageKind_MESSAGE_KIND_AUDIO_PCM; + message.kind = + (session_mode_ == SessionMode::WakeWord) + ? stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVER_WWD_PCM + : stackchan_websocket_v1_MessageKind_MESSAGE_KIND_AUDIO_PCM; message.message_type = type; message.seq = seq_counter_++; diff --git a/protobuf/websocket-message.proto b/protobuf/websocket-message.proto index fec8bff..10932ac 100644 --- a/protobuf/websocket-message.proto +++ b/protobuf/websocket-message.proto @@ -46,6 +46,7 @@ enum MessageKind { MESSAGE_KIND_SERVO_DONE_EVT = 8; MESSAGE_KIND_FIRMWARE_METADATA = 9; MESSAGE_KIND_SERVER_METADATA = 10; + MESSAGE_KIND_SERVER_WWD_PCM = 11; } enum MessageType { diff --git a/stackchan_server/app.py b/stackchan_server/app.py index 94cf8de..5921d9a 100644 --- a/stackchan_server/app.py +++ b/stackchan_server/app.py @@ -10,7 +10,6 @@ from .speech_recognition import create_speech_recognizer from .speech_synthesis import create_speech_synthesizer from .types import SpeechRecognizer, SpeechSynthesizer -from .wakeup_word_detection import WakeWordDetectionError from .ws_proxy import WsProxy logger = getLogger(__name__) @@ -25,10 +24,6 @@ class SpeakRequest(BaseModel): text: str -class ServerWakeWordDetectResponse(BaseModel): - detected: bool - - class StackChanApp: def __init__( self, @@ -69,25 +64,6 @@ async def _trigger_wakeword(stackchan_ip: str): raise HTTPException(status_code=404, detail="stackchan not connected") proxy.trigger_wakeword() - @self.fastapi.post( - "/v1/stackchan/{stackchan_ip}/wakeword/server-detect", - response_model=ServerWakeWordDetectResponse, - ) - async def 
_detect_server_wakeword( - stackchan_ip: str, - timeout_seconds: float | None = None, - ): - proxy = await self._get_proxy(stackchan_ip) - if proxy is None: - raise HTTPException(status_code=404, detail="stackchan not connected") - try: - detected = await proxy.request_server_wakeword_detection( - timeout_seconds=timeout_seconds - ) - except WakeWordDetectionError as exc: - raise HTTPException(status_code=409, detail=str(exc)) from exc - return ServerWakeWordDetectResponse(detected=detected) - @self.fastapi.post("/v1/stackchan/{stackchan_ip}/speak", status_code=204) async def _speak(stackchan_ip: str, body: SpeakRequest): proxy = await self._get_proxy(stackchan_ip) diff --git a/stackchan_server/generated_protobuf/websocket_message_pb2.py b/stackchan_server/generated_protobuf/websocket_message_pb2.py index 433f9a8..15b3d61 100644 --- a/stackchan_server/generated_protobuf/websocket_message_pb2.py +++ b/stackchan_server/generated_protobuf/websocket_message_pb2.py @@ -24,7 +24,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x17websocket-message.proto\x12\x16stackchan.websocket.v1\"\x96\x08\n\x10WebSocketMessage\x12\x31\n\x04kind\x18\x01 \x01(\x0e\x32#.stackchan.websocket.v1.MessageKind\x12\x39\n\x0cmessage_type\x18\x02 \x01(\x0e\x32#.stackchan.websocket.v1.MessageType\x12\x0b\n\x03seq\x18\x03 \x01(\r\x12@\n\x0f\x61udio_pcm_start\x18\n \x01(\x0b\x32%.stackchan.websocket.v1.AudioPcmStartH\x00\x12<\n\x0e\x61udio_pcm_data\x18\x0b \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_pcm_end\x18\x0c \x01(\x0b\x32#.stackchan.websocket.v1.AudioPcmEndH\x00\x12@\n\x0f\x61udio_wav_start\x18\x14 \x01(\x0b\x32%.stackchan.websocket.v1.AudioWavStartH\x00\x12<\n\x0e\x61udio_wav_data\x18\x15 \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_wav_end\x18\x16 \x01(\x0b\x32#.stackchan.websocket.v1.AudioWavEndH\x00\x12\x39\n\tstate_cmd\x18\x1e \x01(\x0b\x32$.stackchan.websocket.v1.StateCommandH\x00\x12>\n\rwake_word_evt\x18\x1f 
\x01(\x0b\x32%.stackchan.websocket.v1.WakeWordEventH\x00\x12\x37\n\tstate_evt\x18 \x01(\x0b\x32\".stackchan.websocket.v1.StateEventH\x00\x12@\n\x0espeak_done_evt\x18! \x01(\x0b\x32&.stackchan.websocket.v1.SpeakDoneEventH\x00\x12\x41\n\tservo_cmd\x18\" \x01(\x0b\x32,.stackchan.websocket.v1.ServoCommandSequenceH\x00\x12@\n\x0eservo_done_evt\x18# \x01(\x0b\x32&.stackchan.websocket.v1.ServoDoneEventH\x00\x12\x45\n\x11\x66irmware_metadata\x18$ \x01(\x0b\x32(.stackchan.websocket.v1.FirmwareMetadataH\x00\x12\x41\n\x0fserver_metadata\x18% \x01(\x0b\x32&.stackchan.websocket.v1.ServerMetadataH\x00\x42\x06\n\x04\x62ody\"\x0f\n\rAudioPcmStart\"\r\n\x0b\x41udioPcmEnd\"6\n\rAudioWavStart\x12\x13\n\x0bsample_rate\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\"\r\n\x0b\x41udioWavEnd\"\x1f\n\nAudioChunk\x12\x11\n\tpcm_bytes\x18\x01 \x01(\x0c\"\x8a\x01\n\x0cStateCommand\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\x12\x43\n\x11listening_purpose\x18\x02 \x01(\x0e\x32(.stackchan.websocket.v1.ListeningPurpose\"!\n\rWakeWordEvent\x12\x10\n\x08\x64\x65tected\x18\x01 \x01(\x08\"C\n\nStateEvent\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\"\x1e\n\x0eSpeakDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"N\n\x14ServoCommandSequence\x12\x36\n\x08\x63ommands\x18\x01 \x03(\x0b\x32$.stackchan.websocket.v1.ServoCommand\"f\n\x0cServoCommand\x12\x32\n\x02op\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.ServoOperation\x12\r\n\x05\x61ngle\x18\x02 \x01(\x11\x12\x13\n\x0b\x64uration_ms\x18\x03 \x01(\x11\"\x1e\n\x0eServoDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"\x99\x02\n\x10\x46irmwareMetadata\x12\x37\n\x0b\x64\x65vice_type\x18\x01 \x01(\x0e\x32\".stackchan.websocket.v1.DeviceType\x12\x15\n\rdisplay_width\x18\x02 \x01(\r\x12\x16\n\x0e\x64isplay_height\x18\x03 \x01(\r\x12\x1c\n\x14has_device_wake_word\x18\x04 \x01(\x08\x12\x0f\n\x07has_led\x18\x05 \x01(\x08\x12\x35\n\nservo_type\x18\x06 
\x01(\x0e\x32!.stackchan.websocket.v1.ServoType\x12\x1d\n\x15supports_audio_duplex\x18\x07 \x01(\x08\x12\x18\n\x10\x66irmware_version\x18\x08 \x01(\t\"F\n\x0eServerMetadata\x12\x1c\n\x14has_server_wake_word\x18\x01 \x01(\x08\x12\x16\n\x0eserver_version\x18\x02 \x01(\t*\xdf\x02\n\x0bMessageKind\x12\x1c\n\x18MESSAGE_KIND_UNSPECIFIED\x10\x00\x12\x1a\n\x16MESSAGE_KIND_AUDIO_PCM\x10\x01\x12\x1a\n\x16MESSAGE_KIND_AUDIO_WAV\x10\x02\x12\x1a\n\x16MESSAGE_KIND_STATE_CMD\x10\x03\x12\x1e\n\x1aMESSAGE_KIND_WAKE_WORD_EVT\x10\x04\x12\x1a\n\x16MESSAGE_KIND_STATE_EVT\x10\x05\x12\x1f\n\x1bMESSAGE_KIND_SPEAK_DONE_EVT\x10\x06\x12\x1a\n\x16MESSAGE_KIND_SERVO_CMD\x10\x07\x12\x1f\n\x1bMESSAGE_KIND_SERVO_DONE_EVT\x10\x08\x12\"\n\x1eMESSAGE_KIND_FIRMWARE_METADATA\x10\t\x12 \n\x1cMESSAGE_KIND_SERVER_METADATA\x10\n*p\n\x0bMessageType\x12\x1c\n\x18MESSAGE_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12MESSAGE_TYPE_START\x10\x01\x12\x15\n\x11MESSAGE_TYPE_DATA\x10\x02\x12\x14\n\x10MESSAGE_TYPE_END\x10\x03*\x85\x01\n\x0eStackchanState\x12\x18\n\x14STACKCHAN_STATE_IDLE\x10\x00\x12\x1d\n\x19STACKCHAN_STATE_LISTENING\x10\x01\x12\x1c\n\x18STACKCHAN_STATE_THINKING\x10\x02\x12\x1c\n\x18STACKCHAN_STATE_SPEAKING\x10\x03*t\n\x10ListeningPurpose\x12!\n\x1dLISTENING_PURPOSE_UNSPECIFIED\x10\x00\x12\x1c\n\x18LISTENING_PURPOSE_SPEECH\x10\x01\x12\x1f\n\x1bLISTENING_PURPOSE_WAKE_WORD\x10\x02*c\n\x0eServoOperation\x12\x19\n\x15SERVO_OPERATION_SLEEP\x10\x00\x12\x1a\n\x16SERVO_OPERATION_MOVE_X\x10\x01\x12\x1a\n\x16SERVO_OPERATION_MOVE_Y\x10\x02*\x85\x01\n\nDeviceType\x12\x1b\n\x17\x44\x45VICE_TYPE_UNSPECIFIED\x10\x00\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5STACK_CORES3\x10\x01\x12\x1a\n\x16\x44\x45VICE_TYPE_M5ATOM_S3R\x10\x02\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5ATOM_ECHOS3R\x10\x03*i\n\tServoType\x12\x1a\n\x16SERVO_TYPE_UNSPECIFIED\x10\x00\x12\x13\n\x0fSERVO_TYPE_NONE\x10\x01\x12\x13\n\x0fSERVO_TYPE_SG90\x10\x02\x12\x16\n\x12SERVO_TYPE_SCS0009\x10\x03\x62\x06proto3') +DESCRIPTOR = 
_descriptor_pool.Default().AddSerializedFile(b'\n\x17websocket-message.proto\x12\x16stackchan.websocket.v1\"\x96\x08\n\x10WebSocketMessage\x12\x31\n\x04kind\x18\x01 \x01(\x0e\x32#.stackchan.websocket.v1.MessageKind\x12\x39\n\x0cmessage_type\x18\x02 \x01(\x0e\x32#.stackchan.websocket.v1.MessageType\x12\x0b\n\x03seq\x18\x03 \x01(\r\x12@\n\x0f\x61udio_pcm_start\x18\n \x01(\x0b\x32%.stackchan.websocket.v1.AudioPcmStartH\x00\x12<\n\x0e\x61udio_pcm_data\x18\x0b \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_pcm_end\x18\x0c \x01(\x0b\x32#.stackchan.websocket.v1.AudioPcmEndH\x00\x12@\n\x0f\x61udio_wav_start\x18\x14 \x01(\x0b\x32%.stackchan.websocket.v1.AudioWavStartH\x00\x12<\n\x0e\x61udio_wav_data\x18\x15 \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_wav_end\x18\x16 \x01(\x0b\x32#.stackchan.websocket.v1.AudioWavEndH\x00\x12\x39\n\tstate_cmd\x18\x1e \x01(\x0b\x32$.stackchan.websocket.v1.StateCommandH\x00\x12>\n\rwake_word_evt\x18\x1f \x01(\x0b\x32%.stackchan.websocket.v1.WakeWordEventH\x00\x12\x37\n\tstate_evt\x18 \x01(\x0b\x32\".stackchan.websocket.v1.StateEventH\x00\x12@\n\x0espeak_done_evt\x18! 
\x01(\x0b\x32&.stackchan.websocket.v1.SpeakDoneEventH\x00\x12\x41\n\tservo_cmd\x18\" \x01(\x0b\x32,.stackchan.websocket.v1.ServoCommandSequenceH\x00\x12@\n\x0eservo_done_evt\x18# \x01(\x0b\x32&.stackchan.websocket.v1.ServoDoneEventH\x00\x12\x45\n\x11\x66irmware_metadata\x18$ \x01(\x0b\x32(.stackchan.websocket.v1.FirmwareMetadataH\x00\x12\x41\n\x0fserver_metadata\x18% \x01(\x0b\x32&.stackchan.websocket.v1.ServerMetadataH\x00\x42\x06\n\x04\x62ody\"\x0f\n\rAudioPcmStart\"\r\n\x0b\x41udioPcmEnd\"6\n\rAudioWavStart\x12\x13\n\x0bsample_rate\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\"\r\n\x0b\x41udioWavEnd\"\x1f\n\nAudioChunk\x12\x11\n\tpcm_bytes\x18\x01 \x01(\x0c\"\x8a\x01\n\x0cStateCommand\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\x12\x43\n\x11listening_purpose\x18\x02 \x01(\x0e\x32(.stackchan.websocket.v1.ListeningPurpose\"!\n\rWakeWordEvent\x12\x10\n\x08\x64\x65tected\x18\x01 \x01(\x08\"C\n\nStateEvent\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\"\x1e\n\x0eSpeakDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"N\n\x14ServoCommandSequence\x12\x36\n\x08\x63ommands\x18\x01 \x03(\x0b\x32$.stackchan.websocket.v1.ServoCommand\"f\n\x0cServoCommand\x12\x32\n\x02op\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.ServoOperation\x12\r\n\x05\x61ngle\x18\x02 \x01(\x11\x12\x13\n\x0b\x64uration_ms\x18\x03 \x01(\x11\"\x1e\n\x0eServoDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"\x99\x02\n\x10\x46irmwareMetadata\x12\x37\n\x0b\x64\x65vice_type\x18\x01 \x01(\x0e\x32\".stackchan.websocket.v1.DeviceType\x12\x15\n\rdisplay_width\x18\x02 \x01(\r\x12\x16\n\x0e\x64isplay_height\x18\x03 \x01(\r\x12\x1c\n\x14has_device_wake_word\x18\x04 \x01(\x08\x12\x0f\n\x07has_led\x18\x05 \x01(\x08\x12\x35\n\nservo_type\x18\x06 \x01(\x0e\x32!.stackchan.websocket.v1.ServoType\x12\x1d\n\x15supports_audio_duplex\x18\x07 \x01(\x08\x12\x18\n\x10\x66irmware_version\x18\x08 
\x01(\t\"F\n\x0eServerMetadata\x12\x1c\n\x14has_server_wake_word\x18\x01 \x01(\x08\x12\x16\n\x0eserver_version\x18\x02 \x01(\t*\x80\x03\n\x0bMessageKind\x12\x1c\n\x18MESSAGE_KIND_UNSPECIFIED\x10\x00\x12\x1a\n\x16MESSAGE_KIND_AUDIO_PCM\x10\x01\x12\x1a\n\x16MESSAGE_KIND_AUDIO_WAV\x10\x02\x12\x1a\n\x16MESSAGE_KIND_STATE_CMD\x10\x03\x12\x1e\n\x1aMESSAGE_KIND_WAKE_WORD_EVT\x10\x04\x12\x1a\n\x16MESSAGE_KIND_STATE_EVT\x10\x05\x12\x1f\n\x1bMESSAGE_KIND_SPEAK_DONE_EVT\x10\x06\x12\x1a\n\x16MESSAGE_KIND_SERVO_CMD\x10\x07\x12\x1f\n\x1bMESSAGE_KIND_SERVO_DONE_EVT\x10\x08\x12\"\n\x1eMESSAGE_KIND_FIRMWARE_METADATA\x10\t\x12 \n\x1cMESSAGE_KIND_SERVER_METADATA\x10\n\x12\x1f\n\x1bMESSAGE_KIND_SERVER_WWD_PCM\x10\x0b*p\n\x0bMessageType\x12\x1c\n\x18MESSAGE_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12MESSAGE_TYPE_START\x10\x01\x12\x15\n\x11MESSAGE_TYPE_DATA\x10\x02\x12\x14\n\x10MESSAGE_TYPE_END\x10\x03*\x85\x01\n\x0eStackchanState\x12\x18\n\x14STACKCHAN_STATE_IDLE\x10\x00\x12\x1d\n\x19STACKCHAN_STATE_LISTENING\x10\x01\x12\x1c\n\x18STACKCHAN_STATE_THINKING\x10\x02\x12\x1c\n\x18STACKCHAN_STATE_SPEAKING\x10\x03*t\n\x10ListeningPurpose\x12!\n\x1dLISTENING_PURPOSE_UNSPECIFIED\x10\x00\x12\x1c\n\x18LISTENING_PURPOSE_SPEECH\x10\x01\x12\x1f\n\x1bLISTENING_PURPOSE_WAKE_WORD\x10\x02*c\n\x0eServoOperation\x12\x19\n\x15SERVO_OPERATION_SLEEP\x10\x00\x12\x1a\n\x16SERVO_OPERATION_MOVE_X\x10\x01\x12\x1a\n\x16SERVO_OPERATION_MOVE_Y\x10\x02*\x85\x01\n\nDeviceType\x12\x1b\n\x17\x44\x45VICE_TYPE_UNSPECIFIED\x10\x00\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5STACK_CORES3\x10\x01\x12\x1a\n\x16\x44\x45VICE_TYPE_M5ATOM_S3R\x10\x02\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5ATOM_ECHOS3R\x10\x03*i\n\tServoType\x12\x1a\n\x16SERVO_TYPE_UNSPECIFIED\x10\x00\x12\x13\n\x0fSERVO_TYPE_NONE\x10\x01\x12\x13\n\x0fSERVO_TYPE_SG90\x10\x02\x12\x16\n\x12SERVO_TYPE_SCS0009\x10\x03\x62\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -32,19 +32,19 @@ if not _descriptor._USE_C_DESCRIPTORS: 
DESCRIPTOR._loaded_options = None _globals['_MESSAGEKIND']._serialized_start=2086 - _globals['_MESSAGEKIND']._serialized_end=2437 - _globals['_MESSAGETYPE']._serialized_start=2439 - _globals['_MESSAGETYPE']._serialized_end=2551 - _globals['_STACKCHANSTATE']._serialized_start=2554 - _globals['_STACKCHANSTATE']._serialized_end=2687 - _globals['_LISTENINGPURPOSE']._serialized_start=2689 - _globals['_LISTENINGPURPOSE']._serialized_end=2805 - _globals['_SERVOOPERATION']._serialized_start=2807 - _globals['_SERVOOPERATION']._serialized_end=2906 - _globals['_DEVICETYPE']._serialized_start=2909 - _globals['_DEVICETYPE']._serialized_end=3042 - _globals['_SERVOTYPE']._serialized_start=3044 - _globals['_SERVOTYPE']._serialized_end=3149 + _globals['_MESSAGEKIND']._serialized_end=2470 + _globals['_MESSAGETYPE']._serialized_start=2472 + _globals['_MESSAGETYPE']._serialized_end=2584 + _globals['_STACKCHANSTATE']._serialized_start=2587 + _globals['_STACKCHANSTATE']._serialized_end=2720 + _globals['_LISTENINGPURPOSE']._serialized_start=2722 + _globals['_LISTENINGPURPOSE']._serialized_end=2838 + _globals['_SERVOOPERATION']._serialized_start=2840 + _globals['_SERVOOPERATION']._serialized_end=2939 + _globals['_DEVICETYPE']._serialized_start=2942 + _globals['_DEVICETYPE']._serialized_end=3075 + _globals['_SERVOTYPE']._serialized_start=3077 + _globals['_SERVOTYPE']._serialized_end=3182 _globals['_WEBSOCKETMESSAGE']._serialized_start=52 _globals['_WEBSOCKETMESSAGE']._serialized_end=1098 _globals['_AUDIOPCMSTART']._serialized_start=1100 diff --git a/stackchan_server/wakeup_word_detection/__init__.py b/stackchan_server/wakeup_word_detection/__init__.py index d4c0f62..dbbd0c2 100644 --- a/stackchan_server/wakeup_word_detection/__init__.py +++ b/stackchan_server/wakeup_word_detection/__init__.py @@ -1,13 +1,15 @@ from .create import create_server_side_wake_word_detector -from .server_side import ( - ServerSideWakeWordConfig, - ServerSideWakeWordDetector, +from .whisper_server import ( 
WakeWordDetectionError, + WhisperServerWakeWordDetector, + WhisperServerWakeWordDetectorConfig, + WhisperServerWakeWordSpeechToTextConfig, ) __all__ = [ "create_server_side_wake_word_detector", - "ServerSideWakeWordConfig", - "ServerSideWakeWordDetector", + "WhisperServerWakeWordDetector", + "WhisperServerWakeWordDetectorConfig", + "WhisperServerWakeWordSpeechToTextConfig", "WakeWordDetectionError", ] diff --git a/stackchan_server/wakeup_word_detection/create.py b/stackchan_server/wakeup_word_detection/create.py index 2ec5011..6d7520d 100644 --- a/stackchan_server/wakeup_word_detection/create.py +++ b/stackchan_server/wakeup_word_detection/create.py @@ -1,28 +1,23 @@ from __future__ import annotations -from pydantic import Field from pydantic_settings import BaseSettings -from ..speech_recognition.whisper_server import WhisperServerSpeechToText -from .server_side import ServerSideWakeWordDetector +from .whisper_server import WhisperServerWakeWordDetector -class _CreateServerSideWakeWordDetectorEnv(BaseSettings): - use_server_side_wwd_whisper_server: bool = Field( - default=False, - validation_alias="USE_SERVER_SIDE_WWD_WHISPER_SERVER", - ) +class _CreateWhisperServerWakeWordDetectorEnv(BaseSettings): + use_wwd_whisper_server: bool = False class Config: - env_prefix = "" + env_prefix = "STACKCHAN_" -def create_server_side_wake_word_detector() -> ServerSideWakeWordDetector | None: - env = _CreateServerSideWakeWordDetectorEnv() - if not env.use_server_side_wwd_whisper_server: +def create_server_side_wake_word_detector() -> WhisperServerWakeWordDetector | None: + env = _CreateWhisperServerWakeWordDetectorEnv() + if not env.use_wwd_whisper_server: return None - return ServerSideWakeWordDetector(recognizer=WhisperServerSpeechToText()) + return WhisperServerWakeWordDetector() __all__ = ["create_server_side_wake_word_detector"] diff --git a/stackchan_server/wakeup_word_detection/server_side.py b/stackchan_server/wakeup_word_detection/whisper_server.py similarity index 89% 
rename from stackchan_server/wakeup_word_detection/server_side.py rename to stackchan_server/wakeup_word_detection/whisper_server.py index f77be72..e389bbb 100644 --- a/stackchan_server/wakeup_word_detection/server_side.py +++ b/stackchan_server/wakeup_word_detection/whisper_server.py @@ -7,7 +7,10 @@ from pydantic import Field from pydantic_settings import BaseSettings -from ..speech_recognition.whisper_server import WhisperServerSpeechToText +from ..speech_recognition.whisper_server import ( + WhisperServerSpeechToText, + WhisperServerSpeechToTextConfig, +) from ..static import LISTEN_AUDIO_FORMAT logger = getLogger(__name__) @@ -17,25 +20,32 @@ class WakeWordDetectionError(Exception): pass -class ServerSideWakeWordConfig(BaseSettings): +class WhisperServerWakeWordDetectorConfig(BaseSettings): keywords: list[str] = Field(default_factory=lambda: ["スタックチャン"]) window_seconds: float = 3.0 interval_seconds: float = 0.5 timeout_seconds: float = 30.0 class Config: - env_prefix = "STACKCHAN_SERVER_WAKE_WORD_" + env_prefix = "STACKCHAN_WWD_" -class ServerSideWakeWordDetector: +class WhisperServerWakeWordSpeechToTextConfig(WhisperServerSpeechToTextConfig): + class Config(WhisperServerSpeechToTextConfig.Config): + env_prefix = "STACKCHAN_WWD_WHISPER_SERVER_" + + +class WhisperServerWakeWordDetector: def __init__( self, *, recognizer: WhisperServerSpeechToText | None = None, - config: ServerSideWakeWordConfig | None = None, + config: WhisperServerWakeWordDetectorConfig | None = None, ) -> None: - self.config = config or ServerSideWakeWordConfig() - self.recognizer = recognizer or WhisperServerSpeechToText() + self.config = config or WhisperServerWakeWordDetectorConfig() + self.recognizer = recognizer or WhisperServerSpeechToText( + config=WhisperServerWakeWordSpeechToTextConfig() + ) self._pcm_buffer = bytearray() self._running = False self._detected = False @@ -199,7 +209,8 @@ def _normalize_text(text: str) -> str: __all__ = [ - "ServerSideWakeWordConfig", - 
"ServerSideWakeWordDetector", + "WhisperServerWakeWordDetector", + "WhisperServerWakeWordDetectorConfig", + "WhisperServerWakeWordSpeechToTextConfig", "WakeWordDetectionError", -] +] \ No newline at end of file diff --git a/stackchan_server/ws_proxy.py b/stackchan_server/ws_proxy.py index b10ec5a..55576a8 100644 --- a/stackchan_server/ws_proxy.py +++ b/stackchan_server/ws_proxy.py @@ -217,7 +217,7 @@ async def move_servo(self, commands: Sequence[ServoCommand]) -> None: self._servo_sent_counter = target_counter self._pending_servo_wait_targets.append(target_counter) try: - await self.ws.send_bytes( + await self._send_ws_bytes( encode_servo_command_message(self._next_down_seq(), commands) ) except Exception: @@ -254,7 +254,11 @@ async def close(self) -> None: if self._receiving_task: self._receiving_task.cancel() with suppress(asyncio.CancelledError): - await self._receiving_task + try: + await self._receiving_task + except RuntimeError as exc: + if not self._is_closed_websocket_runtime_error(exc): + raise await self._listener.close() async def start_talking(self, text: str) -> None: @@ -341,107 +345,27 @@ async def request_server_wakeword_detection( async def _receive_loop(self) -> None: try: while True: - raw_message = await self.ws.receive_bytes() + try: + raw_message = await self.ws.receive_bytes() + except RuntimeError as exc: + if self._is_closed_websocket_runtime_error(exc): + break + raise try: message = parse_websocket_message(raw_message) except DecodeError: await self.ws.close(code=1003, reason="invalid protobuf message") break - if message.kind == ws_pb2.MESSAGE_KIND_AUDIO_PCM: - body_name = message.WhichOneof("body") - - if self._should_drain_trailing_pcm(): - if ( - message.message_type == ws_pb2.MESSAGE_TYPE_START - and body_name == "audio_pcm_start" - ): - logger.info( - "Received a new PCM START while draining trailing wake-word audio; resuming normal routing" - ) - self._clear_trailing_pcm_drain() - elif ( - message.message_type == 
ws_pb2.MESSAGE_TYPE_DATA - and body_name == "audio_pcm_data" - ): - logger.info( - "Discarding trailing PCM DATA after wake-word detection stop payload_bytes=%d", - len(message.audio_pcm_data.pcm_bytes), - ) - continue - elif ( - message.message_type == ws_pb2.MESSAGE_TYPE_END - and body_name == "audio_pcm_end" - ): - logger.info( - "Finished draining trailing PCM after wake-word detection stop" - ) - self._clear_trailing_pcm_drain() - continue - - if ( - self._server_wakeword_detector is not None - and self._server_wakeword_detector.running - ): - if ( - message.message_type == ws_pb2.MESSAGE_TYPE_START - and body_name == "audio_pcm_start" - ): - await self._server_wakeword_detector.handle_start() - continue - - if ( - message.message_type == ws_pb2.MESSAGE_TYPE_DATA - and body_name == "audio_pcm_data" - ): - payload = bytes(message.audio_pcm_data.pcm_bytes) - await self._server_wakeword_detector.handle_data(payload) - continue - - if ( - message.message_type == ws_pb2.MESSAGE_TYPE_END - and body_name == "audio_pcm_end" - ): - await self._server_wakeword_detector.handle_end() - continue - - await self.ws.close(code=1003, reason="unknown wakeword PCM protobuf body") + if message.kind == ws_pb2.MESSAGE_KIND_SERVER_WWD_PCM: + if not await self._handle_server_wakeword_pcm_message(message): break + continue - if ( - message.message_type == ws_pb2.MESSAGE_TYPE_START - and body_name == "audio_pcm_start" - ): - if not await self._listener.handle_start(self.ws): - break - continue - - if ( - message.message_type == ws_pb2.MESSAGE_TYPE_DATA - and body_name == "audio_pcm_data" - ): - payload = bytes(message.audio_pcm_data.pcm_bytes) - if not await self._listener.handle_data( - self.ws, len(payload), payload - ): - break - continue - - if ( - message.message_type == ws_pb2.MESSAGE_TYPE_END - and body_name == "audio_pcm_end" - ): - await self._listener.handle_end( - self.ws, - payload_bytes=0, - payload=b"", - send_state_command=self.send_state_command, - 
thinking_state=FirmwareState.THINKING, - ) - continue - - await self.ws.close(code=1003, reason="unknown PCM protobuf body") - break + if message.kind == ws_pb2.MESSAGE_KIND_AUDIO_PCM: + if not await self._handle_audio_pcm_message(message): + break + continue if message.kind == ws_pb2.MESSAGE_KIND_WAKE_WORD_EVT: self._handle_wakeword_event(message) @@ -470,6 +394,101 @@ async def _receive_loop(self) -> None: finally: self._closed = True + async def _handle_server_wakeword_pcm_message(self, message: Any) -> bool: + body_name = message.WhichOneof("body") + + if self._should_drain_trailing_pcm(): + if ( + message.message_type == ws_pb2.MESSAGE_TYPE_START + and body_name == "audio_pcm_start" + ): + logger.info( + "Received a new server-side wake-word PCM START while draining trailing audio; resuming normal routing" + ) + self._clear_trailing_pcm_drain() + elif ( + message.message_type == ws_pb2.MESSAGE_TYPE_DATA + and body_name == "audio_pcm_data" + ): + logger.info( + "Discarding trailing server-side wake-word PCM DATA payload_bytes=%d", + len(message.audio_pcm_data.pcm_bytes), + ) + return True + elif ( + message.message_type == ws_pb2.MESSAGE_TYPE_END + and body_name == "audio_pcm_end" + ): + logger.info("Finished draining trailing server-side wake-word PCM") + self._clear_trailing_pcm_drain() + return True + + detector = self._server_wakeword_detector + if detector is None or not detector.running: + logger.info( + "Ignoring server-side wake-word PCM while detector is inactive type=%s body=%s", + message.message_type, + body_name, + ) + return True + + if ( + message.message_type == ws_pb2.MESSAGE_TYPE_START + and body_name == "audio_pcm_start" + ): + await detector.handle_start() + return True + + if ( + message.message_type == ws_pb2.MESSAGE_TYPE_DATA + and body_name == "audio_pcm_data" + ): + payload = bytes(message.audio_pcm_data.pcm_bytes) + await detector.handle_data(payload) + return True + + if ( + message.message_type == ws_pb2.MESSAGE_TYPE_END + and 
body_name == "audio_pcm_end" + ): + await detector.handle_end() + return True + + await self.ws.close(code=1003, reason="unknown server wake-word PCM protobuf body") + return False + + async def _handle_audio_pcm_message(self, message: Any) -> bool: + body_name = message.WhichOneof("body") + + if ( + message.message_type == ws_pb2.MESSAGE_TYPE_START + and body_name == "audio_pcm_start" + ): + return await self._listener.handle_start(self.ws) + + if ( + message.message_type == ws_pb2.MESSAGE_TYPE_DATA + and body_name == "audio_pcm_data" + ): + payload = bytes(message.audio_pcm_data.pcm_bytes) + return await self._listener.handle_data(self.ws, len(payload), payload) + + if ( + message.message_type == ws_pb2.MESSAGE_TYPE_END + and body_name == "audio_pcm_end" + ): + await self._listener.handle_end( + self.ws, + payload_bytes=0, + payload=b"", + send_state_command=self.send_state_command, + thinking_state=FirmwareState.THINKING, + ) + return True + + await self.ws.close(code=1003, reason="unknown PCM protobuf body") + return False + def _handle_wakeword_event(self, message: Any) -> None: if message.message_type != ws_pb2.MESSAGE_TYPE_DATA: return @@ -509,7 +528,7 @@ async def _handle_firmware_metadata(self, message: Any) -> None: self.firmware_metadata.firmware_version, ) self.server_metadata = self._build_server_metadata(self.firmware_metadata) - await self.ws.send_bytes( + await self._send_ws_bytes( encode_server_metadata_message( self._next_down_seq(), has_server_wake_word=self.server_metadata.has_server_wake_word, @@ -566,7 +585,7 @@ async def _send_state_command( *, listening_purpose: ListeningPurpose = ListeningPurpose.SPEECH, ) -> None: - await self.ws.send_bytes( + await self._send_ws_bytes( encode_state_command_message( self._next_down_seq(), int(state_id), @@ -574,6 +593,25 @@ async def _send_state_command( ) ) + async def _send_ws_bytes(self, data: bytes) -> None: + try: + await self.ws.send_bytes(data) + except RuntimeError as exc: + 
self._raise_websocket_disconnect_from_runtime_error(exc) + + def _is_closed_websocket_runtime_error(self, exc: RuntimeError) -> bool: + message = str(exc) + return ( + 'Cannot call "send" once a close message has been sent.' in message + or 'WebSocket is not connected. Need to call "accept" first.' in message + ) + + def _raise_websocket_disconnect_from_runtime_error(self, exc: RuntimeError) -> None: + if not self._is_closed_websocket_runtime_error(exc): + raise exc + self._closed = True + raise WebSocketDisconnect() from exc + async def _run_server_wakeword_detection(self) -> bool: detector = self._server_wakeword_detector if detector is None: From 82c09a9b39ecc62dc0cc5fe0883cc12cfc688861 Mon Sep 17 00:00:00 2001 From: Atsushi Morimoto <74th.tech@gmail.com> Date: Sat, 9 May 2026 16:09:45 +0900 Subject: [PATCH 04/15] feat: Add language and prompt configuration for Whisper Server --- .env.template | 4 ++++ docs/server_ja.md | 10 ++++++++++ stackchan_server/speech_recognition/whisper_server.py | 11 +++++++++-- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/.env.template b/.env.template index 89d30f6..1d82075 100644 --- a/.env.template +++ b/.env.template @@ -21,6 +21,8 @@ STACKCHAN_GOOGLE_CLOUD_STT_LANGUAGE_CODE="ja-JP" # STACKCHAN_USE_WHISPER_SERVER=1 # STACKCHAN_WHISPER_SERVER_URL="http://127.0.0.1:8080/inference" # STACKCHAN_WHISPER_SERVER_MODEL= +# STACKCHAN_WHISPER_SERVER_PROMPT= +# STACKCHAN_WHISPER_SERVER_LANGUAGE="ja" # -- Speech Syntheis -- # Google Cloud TTS @@ -39,6 +41,8 @@ STACKCHAN_VOICEVOX_SPEAKER=1 # STACKCHAN_USE_WWD_WHISPER_SERVER=1 # STACKCHAN_WWD_WHISPER_SERVER_URL="http://127.0.0.1:8080/inference" # STACKCHAN_WWD_WHISPER_SERVER_MODEL= +# STACKCHAN_WWD_WHISPER_SERVER_LANGUAGE="ja" +# STACKCHAN_WWD_WHISPER_SERVER_PROMPT="日本語で、スタックチャンという名前で、話しかけられるので、話しかけられたことを検出してください" # -- Claude Agent SDK -- # using Google Cloud Vertex AI diff --git a/docs/server_ja.md b/docs/server_ja.md index 506727c..2bc40c9 100644 --- a/docs/server_ja.md +++
b/docs/server_ja.md @@ -65,6 +65,8 @@ STACKCHAN_WHISPER_CLI_VAD_MODEL_PATH="/path/to/whisper.cpp/ggml-silero-v5.1.2.bi `STACKCHAN_WHISPER_SERVER_URL` に Whisper Server の推論エンドポイント URL をそのまま指定します。 未設定時は `http://127.0.0.1:8080/inference` を利用します。 +`STACKCHAN_WHISPER_SERVER_LANGUAGE` を設定すると、その値を `language` パラメータとして各リクエストに含めます。未設定または空文字の場合は `language` を送信しません。 +また、`STACKCHAN_WHISPER_SERVER_PROMPT` を設定すると、whisper-server の各リクエストに `prompt` フィールドとして送信します。 #### 例: Whisper.cppのwhisper-serverの設定 @@ -74,6 +76,8 @@ whisper.cpp/examples/server: https://github.com/ggml-org/whisper.cpp/tree/master STACKCHAN_USE_WHISPER_SERVER=1 STACKCHAN_WHISPER_SERVER_URL="http://127.0.0.1:8080/inference" STACKCHAN_WHISPER_SERVER_MODEL= +STACKCHAN_WHISPER_SERVER_LANGUAGE="ja" +STACKCHAN_WHISPER_SERVER_PROMPT="" ``` #### 例: [Lemonade](https://lemonade-server.ai/) を使う場合 @@ -84,6 +88,8 @@ Lemonade: https://lemonade-server.ai/ STACKCHAN_USE_WHISPER_SERVER=1 STACKCHAN_WHISPER_SERVER_URL=http://localhost:13305/api/v1/audio/transcriptions STACKCHAN_WHISPER_SERVER_MODEL=Whisper-Large-v3-Turbo +STACKCHAN_WHISPER_SERVER_LANGUAGE="ja" +STACKCHAN_WHISPER_SERVER_PROMPT="" ``` ### (オプション) サーバーサイド wakeword 用 Whisper Server の設定 @@ -93,6 +99,8 @@ STACKCHAN_WHISPER_SERVER_MODEL=Whisper-Large-v3-Turbo - `STACKCHAN_USE_WWD_WHISPER_SERVER`: `1` - `STACKCHAN_WWD_WHISPER_SERVER_URL`: wakeword 検出専用 Whisper Server の推論エンドポイント URL - `STACKCHAN_WWD_WHISPER_SERVER_MODEL`: wakeword 検出専用に利用するモデル名 +- `STACKCHAN_WWD_WHISPER_SERVER_LANGUAGE`: wakeword 検出専用 Whisper Server リクエストへ渡す language +- `STACKCHAN_WWD_WHISPER_SERVER_PROMPT`: wakeword 検出専用 Whisper Server リクエストへ渡す prompt 通常の音声認識で使う `STACKCHAN_WHISPER_SERVER_URL` / `STACKCHAN_WHISPER_SERVER_MODEL` とは別設定です。 @@ -100,6 +108,8 @@ STACKCHAN_WHISPER_SERVER_MODEL=Whisper-Large-v3-Turbo STACKCHAN_USE_WWD_WHISPER_SERVER=1 STACKCHAN_WWD_WHISPER_SERVER_URL="http://127.0.0.1:8080/inference" STACKCHAN_WWD_WHISPER_SERVER_MODEL= +STACKCHAN_WWD_WHISPER_SERVER_LANGUAGE="ja" 
+STACKCHAN_WWD_WHISPER_SERVER_PROMPT="日本語で、スタックチャンという名前で、話しかけられるので、話しかけられたことを検出してください" ``` ## 音声合成の設定 diff --git a/stackchan_server/speech_recognition/whisper_server.py b/stackchan_server/speech_recognition/whisper_server.py index 99dd811..d508cb5 100644 --- a/stackchan_server/speech_recognition/whisper_server.py +++ b/stackchan_server/speech_recognition/whisper_server.py @@ -25,9 +25,10 @@ class WhisperServerSpeechToTextConfig(BaseSettings): url: str = _DEFAULT_SERVER_URL - language: str = "auto" + language: str = "" detect_language: bool = False response_format: str = "verbose_json" + prompt: str = "" silence_rms_threshold: float = _DEFAULT_SILENCE_RMS_THRESHOLD request_timeout_seconds: float = 60.0 model: str = "" @@ -73,9 +74,15 @@ async def transcribe(self, pcm_bytes: bytes) -> str: def _request_transcript(self, wav_bytes: bytes, language: str) -> str: fields = { "response_format": self._conf.response_format, - "language": language, } + normalized_language = language.strip() + if normalized_language: + fields["language"] = normalized_language + + if self._conf.prompt: + fields["prompt"] = self._conf.prompt + if self._conf.model: fields["model"] = self._conf.model From f318d976d195e388f862f4c4270aafe11f5eaa28 Mon Sep 17 00:00:00 2001 From: Atsushi Morimoto <74th.tech@gmail.com> Date: Sat, 9 May 2026 16:24:14 +0900 Subject: [PATCH 05/15] feat: Add WakeWordDetectionTimeout for improved error handling in wake-word detection --- stackchan_server/wakeup_word_detection/__init__.py | 2 ++ .../wakeup_word_detection/whisper_server.py | 13 ++++++++++--- stackchan_server/ws_proxy.py | 8 +++++++- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/stackchan_server/wakeup_word_detection/__init__.py b/stackchan_server/wakeup_word_detection/__init__.py index dbbd0c2..198a4fb 100644 --- a/stackchan_server/wakeup_word_detection/__init__.py +++ b/stackchan_server/wakeup_word_detection/__init__.py @@ -1,6 +1,7 @@ from .create import create_server_side_wake_word_detector
from .whisper_server import ( WakeWordDetectionError, + WakeWordDetectionTimeout, WhisperServerWakeWordDetector, WhisperServerWakeWordDetectorConfig, WhisperServerWakeWordSpeechToTextConfig, @@ -12,4 +13,5 @@ "WhisperServerWakeWordDetectorConfig", "WhisperServerWakeWordSpeechToTextConfig", "WakeWordDetectionError", + "WakeWordDetectionTimeout", ] diff --git a/stackchan_server/wakeup_word_detection/whisper_server.py b/stackchan_server/wakeup_word_detection/whisper_server.py index e389bbb..6b25fe6 100644 --- a/stackchan_server/wakeup_word_detection/whisper_server.py +++ b/stackchan_server/wakeup_word_detection/whisper_server.py @@ -20,11 +20,15 @@ class WakeWordDetectionError(Exception): pass +class WakeWordDetectionTimeout(WakeWordDetectionError): + pass + + class WhisperServerWakeWordDetectorConfig(BaseSettings): keywords: list[str] = Field(default_factory=lambda: ["スタックチャン"]) window_seconds: float = 3.0 interval_seconds: float = 0.5 - timeout_seconds: float = 30.0 + timeout_seconds: float = 300.0 class Config: env_prefix = "STACKCHAN_WWD_" @@ -154,7 +158,9 @@ async def wait_result(self, timeout_seconds: float | None = None) -> bool: try: await asyncio.wait_for(self._event.wait(), timeout=timeout) except asyncio.TimeoutError as exc: - raise WakeWordDetectionError("Server-side wake-word detection timed out") from exc + raise WakeWordDetectionTimeout( + "Server-side wake-word detection timed out" + ) from exc if self._error is not None: raise WakeWordDetectionError(str(self._error)) from self._error @@ -213,4 +219,5 @@ def _normalize_text(text: str) -> str: "WhisperServerWakeWordDetectorConfig", "WhisperServerWakeWordSpeechToTextConfig", "WakeWordDetectionError", -] \ No newline at end of file + "WakeWordDetectionTimeout", +] diff --git a/stackchan_server/ws_proxy.py b/stackchan_server/ws_proxy.py index 55576a8..f4024ce 100644 --- a/stackchan_server/ws_proxy.py +++ b/stackchan_server/ws_proxy.py @@ -28,6 +28,7 @@ from .types import SpeechRecognizer, SpeechSynthesizer 
from .wakeup_word_detection import ( WakeWordDetectionError, + WakeWordDetectionTimeout, create_server_side_wake_word_detector, ) @@ -340,7 +341,9 @@ async def request_server_wakeword_detection( return await asyncio.wait_for(asyncio.shield(task), timeout=timeout_seconds) except asyncio.TimeoutError as exc: await self.stop_server_wakeword_detection() - raise WakeWordDetectionError("Server-side wake-word detection timed out") from exc + raise WakeWordDetectionTimeout( + "Server-side wake-word detection timed out" + ) from exc async def _receive_loop(self) -> None: try: @@ -631,6 +634,9 @@ async def _run_server_wakeword_detection(self) -> bool: return detected except asyncio.CancelledError: raise + except WakeWordDetectionTimeout as exc: + logger.info("Server-side wake-word detection stopped: %s", exc) + return False except WakeWordDetectionError as exc: logger.warning("Server-side wake-word detection stopped: %s", exc) return False From 40543d013cd2265f21945e47faa26ba2801e8f3b Mon Sep 17 00:00:00 2001 From: Atsushi Morimoto <74th.tech@gmail.com> Date: Sat, 9 May 2026 16:24:21 +0900 Subject: [PATCH 06/15] feat: Enhance display logic to track server wake word idle state --- firmware/include/display.hpp | 2 ++ firmware/src/display.cpp | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/firmware/include/display.hpp b/firmware/include/display.hpp index 5cfc72d..2012127 100644 --- a/firmware/include/display.hpp +++ b/firmware/include/display.hpp @@ -26,8 +26,10 @@ class Display void drawFace(); bool isAtomS3R() const; int32_t statusBarHeight() const; + bool shouldShowServerWakeWordIdle() const; StateMachine &state_; bool has_prev_state_ = false; StateMachine::State prev_state_ = StateMachine::Idle; + bool prev_server_wake_word_idle_ = false; }; diff --git a/firmware/src/display.cpp b/firmware/src/display.cpp index 21aaf62..aeb9676 100644 --- a/firmware/src/display.cpp +++ b/firmware/src/display.cpp @@ -71,12 +71,14 @@ void Display::init() 
drawFace(); has_prev_state_ = true; prev_state_ = state_.getState(); + prev_server_wake_word_idle_ = shouldShowServerWakeWordIdle(); } void Display::loop() { StateMachine::State current = state_.getState(); - if (!has_prev_state_ || current != prev_state_) + bool current_server_wake_word_idle = shouldShowServerWakeWordIdle(); + if (!has_prev_state_ || current != prev_state_ || current_server_wake_word_idle != prev_server_wake_word_idle_) { GFXModule.fillScreen(TFT_BLACK); drawForState(current); @@ -84,6 +86,7 @@ void Display::loop() } prev_state_ = current; + prev_server_wake_word_idle_ = current_server_wake_word_idle; has_prev_state_ = true; } @@ -174,6 +177,11 @@ bool Display::isAtomS3R() const #endif } +bool Display::shouldShowServerWakeWordIdle() const +{ + return state_.getState() == StateMachine::Idle && shouldUseServerWakeWord(); +} + int32_t Display::statusBarHeight() const { return isAtomS3R() ? 28 : 20; From 7235abfb1f10164cd9ae5cf6d05c991f3b3062d6 Mon Sep 17 00:00:00 2001 From: Atsushi Morimoto <74th.tech@gmail.com> Date: Sat, 9 May 2026 16:47:38 +0900 Subject: [PATCH 07/15] feat: Update StateCmd handling for server-side wakeword detection --- AGENTS.md | 7 +- docs/websocket_protocols_ja.md | 19 +++--- firmware/include/display.hpp | 2 - firmware/include/state_machine.hpp | 8 ++- .../generated_protobuf/websocket-message.pb.c | 2 - .../generated_protobuf/websocket-message.pb.h | 29 +++------ firmware/src/display.cpp | 17 ++--- firmware/src/main.cpp | 58 +++++++++-------- firmware/src/state_machine.cpp | 7 ++ protobuf/websocket-message.proto | 8 +-- .../websocket_message_pb2.py | 64 +++++++++---------- stackchan_server/protobuf_ws.py | 12 +--- stackchan_server/ws_proxy.py | 17 +---- 13 files changed, 106 insertions(+), 144 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 26b7ae4..54be171 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -12,8 +12,8 @@ ## 状態遷移の要点 -- ファームウェア状態: `Idle`, `Listening`, `Thinking`, `Speaking`, `Disconnected` -- サーバーから指示できるのは 
`StateCmd` の `Idle` / `Listening` / `Thinking` / `Speaking` +- ファームウェア状態: `Idle`, `Listening`, `Thinking`, `Speaking`, `ServerWwd`, `Disconnected` +- サーバーから指示できるのは `StateCmd` の `Idle` / `Listening` / `Thinking` / `Speaking` / `ServerWwd` - `Disconnected` はファームウェア内部状態で、WebSocket 切断時に入る - `WakeWordEvt` を受けるか、REST API の wakeword 擬似発火で talk session が始まる @@ -75,6 +75,7 @@ - `websocket.client.host` を StackChan の識別子として使う - 同一 IP の再接続時は既存接続を置き換える - `listen()` は `Listening` 指示後、音声 uplink 完了を待つ +- サーバーサイド wakeword 検出中は `ServerWwd` を指示する - `speak()` は TTS downlink 送信後、`SpeakDoneEvt` を待つ - `move_servo()` / `wait_servo_complete()` を公開 @@ -106,7 +107,7 @@ - `MoveX`, `MoveY`, `Sleep` を順次処理 - 完了時に `ServoDoneEvt` - `src/display.cpp` - - `Idle=濃いグレー`, `Listening=青`, `Thinking=オレンジ`, `Speaking=緑`, `Disconnected=赤` + - `Idle=濃いグレー`, `Listening=青`, `Thinking=オレンジ`, `Speaking=緑`, `ServerWwd=Idle(Server-WWD)`, `Disconnected=赤` ## サンプルアプリ diff --git a/docs/websocket_protocols_ja.md b/docs/websocket_protocols_ja.md index e8694a0..fc2945c 100644 --- a/docs/websocket_protocols_ja.md +++ b/docs/websocket_protocols_ja.md @@ -75,7 +75,7 @@ ### 現行実装メモ -- `StateCmd(Listening, WAKE_WORD)` を受けた CoreS3 は、見た目の状態を `Idle(Server-WWD)` のままにしてこの kind で uplink します。 +- `StateCmd(ServerWwd)` を受けた CoreS3 は、この kind で uplink を開始します。 - 無音 3 秒によるクライアント側自動終了は行いません。 - サーバーはこの kind だけを server-side wakeword detector にルーティングします。 @@ -106,7 +106,7 @@ - 方向: Server → CoreS3 - `messageType`: `DATA` のみ -- body: `StateCommand { state, listening_purpose }` +- body: `StateCommand { state }` 利用する状態名: @@ -114,21 +114,17 @@ - `Listening` - `Thinking` - `Speaking` - -`listening_purpose` の値: - -- `SPEECH`: 通常の会話入力 -- `WAKE_WORD`: サーバーサイド wakeword 検出用の uplink +- `ServerWwd` ### 現行実装メモ -- `proxy.listen()` 開始時に Server が `StateCmd(Listening, SPEECH)` を指示します。 -- サーバーサイド wakeword 検出開始時は `StateCmd(Listening, WAKE_WORD)` を指示します。 +- `proxy.listen()` 開始時に Server が `StateCmd(Listening)` を指示します。 +- サーバーサイド wakeword 検出開始時は `StateCmd(ServerWwd)` 
を指示します。 - 音声 uplink の `END` を受けると、Server は `Thinking` を指示します。 - `proxy.speak()` 完了後、Server は `Idle` を指示します。 > [!NOTE] -> `WAKE_WORD` の場合、CoreS3 は内部的にマイク uplink を開始しますが、状態表示は `Listening` に遷移せず `Idle(Server-WWD)` のままです。また無音 3 秒による自動終了も行いません。 +> `ServerWwd` の場合、CoreS3 は内部的にマイク uplink を開始しますが、表示は `Idle(Server-WWD)` にし、無音 3 秒による自動終了も行いません。 ## ウェイクワード検出 `WakeWordEvt` @@ -154,7 +150,7 @@ CoreS3 側は `has_server_wake_word=true` を受けると、デバイス側 wake ## サーバーサイド wakeword 検出フロー - 環境変数 `STACKCHAN_USE_WWD_WHISPER_SERVER=1` の場合、サーバーは `@app.setup()` 完了後と `Idle` 復帰後に自動でサーバーサイド wakeword 検出を開始します。 -- サーバーは `StateCmd(Listening, WAKE_WORD)` を送信して `MESSAGE_KIND_SERVER_WWD_PCM` のマイク uplink を受信します。 +- サーバーは `StateCmd(ServerWwd)` を送信して `MESSAGE_KIND_SERVER_WWD_PCM` のマイク uplink を受信します。 - 受信した音声の直近 3 秒窓を 0.5 秒ごとに音声認識へ渡し、 定義キーワード(例: `スタクチャン`)を含むか判定します。 - 各判定タイミングの認識結果はすべてログ出力されます。 @@ -174,6 +170,7 @@ CoreS3 側は `has_server_wake_word=true` を受けると、デバイス側 wake - `Listening` - `Thinking` - `Speaking` +- `ServerWwd` - CoreS3 は状態遷移の entry hook で送信します。 - WebSocket 切断中は `Disconnected` 状態になりますが、切断時は uplink 送信できないため `StateEvt` では通知されません。 diff --git a/firmware/include/display.hpp b/firmware/include/display.hpp index 2012127..5cfc72d 100644 --- a/firmware/include/display.hpp +++ b/firmware/include/display.hpp @@ -26,10 +26,8 @@ class Display void drawFace(); bool isAtomS3R() const; int32_t statusBarHeight() const; - bool shouldShowServerWakeWordIdle() const; StateMachine &state_; bool has_prev_state_ = false; StateMachine::State prev_state_ = StateMachine::Idle; - bool prev_server_wake_word_idle_ = false; }; diff --git a/firmware/include/state_machine.hpp b/firmware/include/state_machine.hpp index d5bcd62..a3f4b97 100644 --- a/firmware/include/state_machine.hpp +++ b/firmware/include/state_machine.hpp @@ -14,7 +14,8 @@ class StateMachine Listening = 1, Thinking = 2, Speaking = 3, - Disconnected = 4, + ServerWwd = 4, + Disconnected = 5, }; StateMachine() = default; @@ -25,6 +26,7 @@ class StateMachine bool 
isListening() const; bool isThinking() const; bool isSpeaking() const; + bool isServerWwd() const; bool isDisconnected() const; using Callback = std::function; @@ -33,8 +35,8 @@ class StateMachine private: State state_ = Disconnected; - std::array, 5> entry_events_{}; - std::array, 5> exit_events_{}; + std::array, 6> entry_events_{}; + std::array, 6> exit_events_{}; }; const char *stateToString(StateMachine::State state); diff --git a/firmware/lib/generated_protobuf/websocket-message.pb.c b/firmware/lib/generated_protobuf/websocket-message.pb.c index 7620ee8..f70a79e 100644 --- a/firmware/lib/generated_protobuf/websocket-message.pb.c +++ b/firmware/lib/generated_protobuf/websocket-message.pb.c @@ -64,5 +64,3 @@ PB_BIND(stackchan_websocket_v1_ServerMetadata, stackchan_websocket_v1_ServerMeta - - diff --git a/firmware/lib/generated_protobuf/websocket-message.pb.h b/firmware/lib/generated_protobuf/websocket-message.pb.h index 28e1f54..cc98ef3 100644 --- a/firmware/lib/generated_protobuf/websocket-message.pb.h +++ b/firmware/lib/generated_protobuf/websocket-message.pb.h @@ -36,15 +36,10 @@ typedef enum _stackchan_websocket_v1_StackchanState { stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_IDLE = 0, stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_LISTENING = 1, stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_THINKING = 2, - stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING = 3 + stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING = 3, + stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SERVER_WWD = 4 } stackchan_websocket_v1_StackchanState; -typedef enum _stackchan_websocket_v1_ListeningPurpose { - stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_UNSPECIFIED = 0, - stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_SPEECH = 1, - stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_WAKE_WORD = 2 -} stackchan_websocket_v1_ListeningPurpose; - typedef enum _stackchan_websocket_v1_ServoOperation { 
stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_SLEEP = 0, stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_X = 1, @@ -90,7 +85,6 @@ typedef struct _stackchan_websocket_v1_AudioChunk { typedef struct _stackchan_websocket_v1_StateCommand { stackchan_websocket_v1_StackchanState state; - stackchan_websocket_v1_ListeningPurpose listening_purpose; } stackchan_websocket_v1_StateCommand; typedef struct _stackchan_websocket_v1_WakeWordEvent { @@ -181,12 +175,8 @@ extern "C" { #define _stackchan_websocket_v1_MessageType_ARRAYSIZE ((stackchan_websocket_v1_MessageType)(stackchan_websocket_v1_MessageType_MESSAGE_TYPE_END+1)) #define _stackchan_websocket_v1_StackchanState_MIN stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_IDLE -#define _stackchan_websocket_v1_StackchanState_MAX stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING -#define _stackchan_websocket_v1_StackchanState_ARRAYSIZE ((stackchan_websocket_v1_StackchanState)(stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING+1)) - -#define _stackchan_websocket_v1_ListeningPurpose_MIN stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_UNSPECIFIED -#define _stackchan_websocket_v1_ListeningPurpose_MAX stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_WAKE_WORD -#define _stackchan_websocket_v1_ListeningPurpose_ARRAYSIZE ((stackchan_websocket_v1_ListeningPurpose)(stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_WAKE_WORD+1)) +#define _stackchan_websocket_v1_StackchanState_MAX stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SERVER_WWD +#define _stackchan_websocket_v1_StackchanState_ARRAYSIZE ((stackchan_websocket_v1_StackchanState)(stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SERVER_WWD+1)) #define _stackchan_websocket_v1_ServoOperation_MIN stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_SLEEP #define _stackchan_websocket_v1_ServoOperation_MAX stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_Y @@ -209,7 +199,6 @@ extern "C" { #define 
stackchan_websocket_v1_StateCommand_state_ENUMTYPE stackchan_websocket_v1_StackchanState -#define stackchan_websocket_v1_StateCommand_listening_purpose_ENUMTYPE stackchan_websocket_v1_ListeningPurpose #define stackchan_websocket_v1_StateEvent_state_ENUMTYPE stackchan_websocket_v1_StackchanState @@ -231,7 +220,7 @@ extern "C" { #define stackchan_websocket_v1_AudioWavStart_init_default {0, 0} #define stackchan_websocket_v1_AudioWavEnd_init_default {0} #define stackchan_websocket_v1_AudioChunk_init_default {{0, {0}}} -#define stackchan_websocket_v1_StateCommand_init_default {_stackchan_websocket_v1_StackchanState_MIN, _stackchan_websocket_v1_ListeningPurpose_MIN} +#define stackchan_websocket_v1_StateCommand_init_default {_stackchan_websocket_v1_StackchanState_MIN} #define stackchan_websocket_v1_WakeWordEvent_init_default {0} #define stackchan_websocket_v1_StateEvent_init_default {_stackchan_websocket_v1_StackchanState_MIN} #define stackchan_websocket_v1_SpeakDoneEvent_init_default {0} @@ -246,7 +235,7 @@ extern "C" { #define stackchan_websocket_v1_AudioWavStart_init_zero {0, 0} #define stackchan_websocket_v1_AudioWavEnd_init_zero {0} #define stackchan_websocket_v1_AudioChunk_init_zero {{0, {0}}} -#define stackchan_websocket_v1_StateCommand_init_zero {_stackchan_websocket_v1_StackchanState_MIN, _stackchan_websocket_v1_ListeningPurpose_MIN} +#define stackchan_websocket_v1_StateCommand_init_zero {_stackchan_websocket_v1_StackchanState_MIN} #define stackchan_websocket_v1_WakeWordEvent_init_zero {0} #define stackchan_websocket_v1_StateEvent_init_zero {_stackchan_websocket_v1_StackchanState_MIN} #define stackchan_websocket_v1_SpeakDoneEvent_init_zero {0} @@ -261,7 +250,6 @@ extern "C" { #define stackchan_websocket_v1_AudioWavStart_channels_tag 2 #define stackchan_websocket_v1_AudioChunk_pcm_bytes_tag 1 #define stackchan_websocket_v1_StateCommand_state_tag 1 -#define stackchan_websocket_v1_StateCommand_listening_purpose_tag 2 #define 
stackchan_websocket_v1_WakeWordEvent_detected_tag 1 #define stackchan_websocket_v1_StateEvent_state_tag 1 #define stackchan_websocket_v1_SpeakDoneEvent_done_tag 1 @@ -361,8 +349,7 @@ X(a, STATIC, SINGULAR, BYTES, pcm_bytes, 1) #define stackchan_websocket_v1_AudioChunk_DEFAULT NULL #define stackchan_websocket_v1_StateCommand_FIELDLIST(X, a) \ -X(a, STATIC, SINGULAR, UENUM, state, 1) \ -X(a, STATIC, SINGULAR, UENUM, listening_purpose, 2) +X(a, STATIC, SINGULAR, UENUM, state, 1) #define stackchan_websocket_v1_StateCommand_CALLBACK NULL #define stackchan_websocket_v1_StateCommand_DEFAULT NULL @@ -463,7 +450,7 @@ extern const pb_msgdesc_t stackchan_websocket_v1_ServerMetadata_msg; #define stackchan_websocket_v1_ServoCommand_size 14 #define stackchan_websocket_v1_ServoDoneEvent_size 2 #define stackchan_websocket_v1_SpeakDoneEvent_size 2 -#define stackchan_websocket_v1_StateCommand_size 4 +#define stackchan_websocket_v1_StateCommand_size 2 #define stackchan_websocket_v1_StateEvent_size 2 #define stackchan_websocket_v1_WakeWordEvent_size 2 #define stackchan_websocket_v1_WebSocketMessage_size 4113 diff --git a/firmware/src/display.cpp b/firmware/src/display.cpp index aeb9676..00ba761 100644 --- a/firmware/src/display.cpp +++ b/firmware/src/display.cpp @@ -71,14 +71,12 @@ void Display::init() drawFace(); has_prev_state_ = true; prev_state_ = state_.getState(); - prev_server_wake_word_idle_ = shouldShowServerWakeWordIdle(); } void Display::loop() { StateMachine::State current = state_.getState(); - bool current_server_wake_word_idle = shouldShowServerWakeWordIdle(); - if (!has_prev_state_ || current != prev_state_ || current_server_wake_word_idle != prev_server_wake_word_idle_) + if (!has_prev_state_ || current != prev_state_) { GFXModule.fillScreen(TFT_BLACK); drawForState(current); @@ -86,7 +84,6 @@ void Display::loop() } prev_state_ = current; - prev_server_wake_word_idle_ = current_server_wake_word_idle; has_prev_state_ = true; } @@ -123,6 +120,11 @@ void 
Display::drawForState(StateMachine::State state) font_color = TFT_BLACK; led_color = Adafruit_NeoPixel::ColorHSV(kLedHueGreen, 255, ledValueFromBrightness()); break; + case StateMachine::ServerWwd: + bg_color = TFT_DARKGRAY; + font_color = TFT_WHITE; + led_color = Adafruit_NeoPixel::ColorHSV(0, 0, 0); + break; case StateMachine::Disconnected: bg_color = TFT_RED; font_color = TFT_WHITE; @@ -141,7 +143,7 @@ void Display::drawForState(StateMachine::State state) GFXModule.setTextSize(1); GFXModule.setTextColor(font_color, bg_color); GFXModule.setCursor(isAtomS3R() ? 4 : 10, bar_y + (isAtomS3R() ? 6 : 2)); - if (state == StateMachine::Idle && shouldUseServerWakeWord()) + if (state == StateMachine::ServerWwd) { GFXModule.printf("Idle(Server-WWD)"); return; @@ -177,11 +179,6 @@ bool Display::isAtomS3R() const #endif } -bool Display::shouldShowServerWakeWordIdle() const -{ - return state_.getState() == StateMachine::Idle && shouldUseServerWakeWord(); -} - int32_t Display::statusBarHeight() const { return isAtomS3R() ? 
28 : 20; diff --git a/firmware/src/main.cpp b/firmware/src/main.cpp index 67256ad..5263e24 100644 --- a/firmware/src/main.cpp +++ b/firmware/src/main.cpp @@ -238,44 +238,38 @@ bool applyRemoteStateCommand(const stackchan_websocket_v1_StateCommand &command) switch (command.state) { case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_IDLE: - if (listening.isWakeWordStreaming()) - { - listening.endWakeWordStreaming(); - } stateMachine.setState(StateMachine::Idle); return true; case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_LISTENING: - if (command.listening_purpose == stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_WAKE_WORD && - shouldUseServerWakeWord() && - stateMachine.getState() == StateMachine::Idle) - { - if (!listening.beginWakeWordStreaming()) - { - log_w("Failed to start server-side wakeword streaming"); - return false; - } - return true; - } - - if (listening.isWakeWordStreaming()) - { - listening.endWakeWordStreaming(); - } stateMachine.setState(StateMachine::Listening); return true; case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_THINKING: - if (listening.isWakeWordStreaming()) - { - listening.endWakeWordStreaming(); - } stateMachine.setState(StateMachine::Thinking); return true; case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING: - if (listening.isWakeWordStreaming()) + stateMachine.setState(StateMachine::Speaking); + return true; + case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SERVER_WWD: + if (!shouldUseServerWakeWord()) { - listening.endWakeWordStreaming(); + log_w("Server-side wakeword is not available"); + return false; } - stateMachine.setState(StateMachine::Speaking); + if (stateMachine.getState() == StateMachine::ServerWwd) + { + return true; + } + if (stateMachine.getState() != StateMachine::Idle) + { + log_w("Cannot enter server-side wakeword from state=%u", static_cast(stateMachine.getState())); + return false; + } + if (!listening.beginWakeWordStreaming()) + { + 
log_w("Failed to start server-side wakeword streaming"); + return false; + } + stateMachine.setState(StateMachine::ServerWwd); return true; default: log_w("Unknown remote state"); @@ -546,6 +540,13 @@ void setup() listening.end(); }); + stateMachine.addStateEntryEvent(StateMachine::ServerWwd, [](StateMachine::State, StateMachine::State) { + notifyCurrentState(StateMachine::ServerWwd); + }); + stateMachine.addStateExitEvent(StateMachine::ServerWwd, [](StateMachine::State, StateMachine::State) { + listening.endWakeWordStreaming(); + }); + stateMachine.addStateEntryEvent(StateMachine::Speaking, [](StateMachine::State, StateMachine::State) { notifyCurrentState(StateMachine::Speaking); speaking.begin(); @@ -587,6 +588,9 @@ void loop() case StateMachine::Listening: listening.loop(); break; + case StateMachine::ServerWwd: + listening.loop(); + break; case StateMachine::Thinking: // Wait for server side command / audio stream. break; diff --git a/firmware/src/state_machine.cpp b/firmware/src/state_machine.cpp index 2432cd2..196aaad 100644 --- a/firmware/src/state_machine.cpp +++ b/firmware/src/state_machine.cpp @@ -13,6 +13,8 @@ const char *stateToString(StateMachine::State s) return "Thinking"; case StateMachine::Speaking: return "Speaking"; + case StateMachine::ServerWwd: + return "ServerWwd"; case StateMachine::Disconnected: return "Disconnected"; default: @@ -66,6 +68,11 @@ bool StateMachine::isThinking() const return state_ == Thinking; } +bool StateMachine::isServerWwd() const +{ + return state_ == ServerWwd; +} + bool StateMachine::isDisconnected() const { return state_ == Disconnected; diff --git a/protobuf/websocket-message.proto b/protobuf/websocket-message.proto index 10932ac..4d288ef 100644 --- a/protobuf/websocket-message.proto +++ b/protobuf/websocket-message.proto @@ -61,12 +61,7 @@ enum StackchanState { STACKCHAN_STATE_LISTENING = 1; STACKCHAN_STATE_THINKING = 2; STACKCHAN_STATE_SPEAKING = 3; -} - -enum ListeningPurpose { - LISTENING_PURPOSE_UNSPECIFIED = 
0; - LISTENING_PURPOSE_SPEECH = 1; - LISTENING_PURPOSE_WAKE_WORD = 2; + STACKCHAN_STATE_SERVER_WWD = 4; } enum ServoOperation { @@ -106,7 +101,6 @@ message AudioChunk { message StateCommand { StackchanState state = 1; - ListeningPurpose listening_purpose = 2; } message WakeWordEvent { diff --git a/stackchan_server/generated_protobuf/websocket_message_pb2.py b/stackchan_server/generated_protobuf/websocket_message_pb2.py index 15b3d61..1237224 100644 --- a/stackchan_server/generated_protobuf/websocket_message_pb2.py +++ b/stackchan_server/generated_protobuf/websocket_message_pb2.py @@ -24,27 +24,25 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x17websocket-message.proto\x12\x16stackchan.websocket.v1\"\x96\x08\n\x10WebSocketMessage\x12\x31\n\x04kind\x18\x01 \x01(\x0e\x32#.stackchan.websocket.v1.MessageKind\x12\x39\n\x0cmessage_type\x18\x02 \x01(\x0e\x32#.stackchan.websocket.v1.MessageType\x12\x0b\n\x03seq\x18\x03 \x01(\r\x12@\n\x0f\x61udio_pcm_start\x18\n \x01(\x0b\x32%.stackchan.websocket.v1.AudioPcmStartH\x00\x12<\n\x0e\x61udio_pcm_data\x18\x0b \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_pcm_end\x18\x0c \x01(\x0b\x32#.stackchan.websocket.v1.AudioPcmEndH\x00\x12@\n\x0f\x61udio_wav_start\x18\x14 \x01(\x0b\x32%.stackchan.websocket.v1.AudioWavStartH\x00\x12<\n\x0e\x61udio_wav_data\x18\x15 \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_wav_end\x18\x16 \x01(\x0b\x32#.stackchan.websocket.v1.AudioWavEndH\x00\x12\x39\n\tstate_cmd\x18\x1e \x01(\x0b\x32$.stackchan.websocket.v1.StateCommandH\x00\x12>\n\rwake_word_evt\x18\x1f \x01(\x0b\x32%.stackchan.websocket.v1.WakeWordEventH\x00\x12\x37\n\tstate_evt\x18 \x01(\x0b\x32\".stackchan.websocket.v1.StateEventH\x00\x12@\n\x0espeak_done_evt\x18! 
\x01(\x0b\x32&.stackchan.websocket.v1.SpeakDoneEventH\x00\x12\x41\n\tservo_cmd\x18\" \x01(\x0b\x32,.stackchan.websocket.v1.ServoCommandSequenceH\x00\x12@\n\x0eservo_done_evt\x18# \x01(\x0b\x32&.stackchan.websocket.v1.ServoDoneEventH\x00\x12\x45\n\x11\x66irmware_metadata\x18$ \x01(\x0b\x32(.stackchan.websocket.v1.FirmwareMetadataH\x00\x12\x41\n\x0fserver_metadata\x18% \x01(\x0b\x32&.stackchan.websocket.v1.ServerMetadataH\x00\x42\x06\n\x04\x62ody\"\x0f\n\rAudioPcmStart\"\r\n\x0b\x41udioPcmEnd\"6\n\rAudioWavStart\x12\x13\n\x0bsample_rate\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\"\r\n\x0b\x41udioWavEnd\"\x1f\n\nAudioChunk\x12\x11\n\tpcm_bytes\x18\x01 \x01(\x0c\"\x8a\x01\n\x0cStateCommand\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\x12\x43\n\x11listening_purpose\x18\x02 \x01(\x0e\x32(.stackchan.websocket.v1.ListeningPurpose\"!\n\rWakeWordEvent\x12\x10\n\x08\x64\x65tected\x18\x01 \x01(\x08\"C\n\nStateEvent\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\"\x1e\n\x0eSpeakDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"N\n\x14ServoCommandSequence\x12\x36\n\x08\x63ommands\x18\x01 \x03(\x0b\x32$.stackchan.websocket.v1.ServoCommand\"f\n\x0cServoCommand\x12\x32\n\x02op\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.ServoOperation\x12\r\n\x05\x61ngle\x18\x02 \x01(\x11\x12\x13\n\x0b\x64uration_ms\x18\x03 \x01(\x11\"\x1e\n\x0eServoDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"\x99\x02\n\x10\x46irmwareMetadata\x12\x37\n\x0b\x64\x65vice_type\x18\x01 \x01(\x0e\x32\".stackchan.websocket.v1.DeviceType\x12\x15\n\rdisplay_width\x18\x02 \x01(\r\x12\x16\n\x0e\x64isplay_height\x18\x03 \x01(\r\x12\x1c\n\x14has_device_wake_word\x18\x04 \x01(\x08\x12\x0f\n\x07has_led\x18\x05 \x01(\x08\x12\x35\n\nservo_type\x18\x06 \x01(\x0e\x32!.stackchan.websocket.v1.ServoType\x12\x1d\n\x15supports_audio_duplex\x18\x07 \x01(\x08\x12\x18\n\x10\x66irmware_version\x18\x08 
\x01(\t\"F\n\x0eServerMetadata\x12\x1c\n\x14has_server_wake_word\x18\x01 \x01(\x08\x12\x16\n\x0eserver_version\x18\x02 \x01(\t*\x80\x03\n\x0bMessageKind\x12\x1c\n\x18MESSAGE_KIND_UNSPECIFIED\x10\x00\x12\x1a\n\x16MESSAGE_KIND_AUDIO_PCM\x10\x01\x12\x1a\n\x16MESSAGE_KIND_AUDIO_WAV\x10\x02\x12\x1a\n\x16MESSAGE_KIND_STATE_CMD\x10\x03\x12\x1e\n\x1aMESSAGE_KIND_WAKE_WORD_EVT\x10\x04\x12\x1a\n\x16MESSAGE_KIND_STATE_EVT\x10\x05\x12\x1f\n\x1bMESSAGE_KIND_SPEAK_DONE_EVT\x10\x06\x12\x1a\n\x16MESSAGE_KIND_SERVO_CMD\x10\x07\x12\x1f\n\x1bMESSAGE_KIND_SERVO_DONE_EVT\x10\x08\x12\"\n\x1eMESSAGE_KIND_FIRMWARE_METADATA\x10\t\x12 \n\x1cMESSAGE_KIND_SERVER_METADATA\x10\n\x12\x1f\n\x1bMESSAGE_KIND_SERVER_WWD_PCM\x10\x0b*p\n\x0bMessageType\x12\x1c\n\x18MESSAGE_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12MESSAGE_TYPE_START\x10\x01\x12\x15\n\x11MESSAGE_TYPE_DATA\x10\x02\x12\x14\n\x10MESSAGE_TYPE_END\x10\x03*\x85\x01\n\x0eStackchanState\x12\x18\n\x14STACKCHAN_STATE_IDLE\x10\x00\x12\x1d\n\x19STACKCHAN_STATE_LISTENING\x10\x01\x12\x1c\n\x18STACKCHAN_STATE_THINKING\x10\x02\x12\x1c\n\x18STACKCHAN_STATE_SPEAKING\x10\x03*t\n\x10ListeningPurpose\x12!\n\x1dLISTENING_PURPOSE_UNSPECIFIED\x10\x00\x12\x1c\n\x18LISTENING_PURPOSE_SPEECH\x10\x01\x12\x1f\n\x1bLISTENING_PURPOSE_WAKE_WORD\x10\x02*c\n\x0eServoOperation\x12\x19\n\x15SERVO_OPERATION_SLEEP\x10\x00\x12\x1a\n\x16SERVO_OPERATION_MOVE_X\x10\x01\x12\x1a\n\x16SERVO_OPERATION_MOVE_Y\x10\x02*\x85\x01\n\nDeviceType\x12\x1b\n\x17\x44\x45VICE_TYPE_UNSPECIFIED\x10\x00\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5STACK_CORES3\x10\x01\x12\x1a\n\x16\x44\x45VICE_TYPE_M5ATOM_S3R\x10\x02\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5ATOM_ECHOS3R\x10\x03*i\n\tServoType\x12\x1a\n\x16SERVO_TYPE_UNSPECIFIED\x10\x00\x12\x13\n\x0fSERVO_TYPE_NONE\x10\x01\x12\x13\n\x0fSERVO_TYPE_SG90\x10\x02\x12\x16\n\x12SERVO_TYPE_SCS0009\x10\x03\x62\x06proto3') +DESCRIPTOR = 
_descriptor_pool.Default().AddSerializedFile(b'\n\x17websocket-message.proto\x12\x16stackchan.websocket.v1\"\x96\x08\n\x10WebSocketMessage\x12\x31\n\x04kind\x18\x01 \x01(\x0e\x32#.stackchan.websocket.v1.MessageKind\x12\x39\n\x0cmessage_type\x18\x02 \x01(\x0e\x32#.stackchan.websocket.v1.MessageType\x12\x0b\n\x03seq\x18\x03 \x01(\r\x12@\n\x0f\x61udio_pcm_start\x18\n \x01(\x0b\x32%.stackchan.websocket.v1.AudioPcmStartH\x00\x12<\n\x0e\x61udio_pcm_data\x18\x0b \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_pcm_end\x18\x0c \x01(\x0b\x32#.stackchan.websocket.v1.AudioPcmEndH\x00\x12@\n\x0f\x61udio_wav_start\x18\x14 \x01(\x0b\x32%.stackchan.websocket.v1.AudioWavStartH\x00\x12<\n\x0e\x61udio_wav_data\x18\x15 \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_wav_end\x18\x16 \x01(\x0b\x32#.stackchan.websocket.v1.AudioWavEndH\x00\x12\x39\n\tstate_cmd\x18\x1e \x01(\x0b\x32$.stackchan.websocket.v1.StateCommandH\x00\x12>\n\rwake_word_evt\x18\x1f \x01(\x0b\x32%.stackchan.websocket.v1.WakeWordEventH\x00\x12\x37\n\tstate_evt\x18 \x01(\x0b\x32\".stackchan.websocket.v1.StateEventH\x00\x12@\n\x0espeak_done_evt\x18! 
\x01(\x0b\x32&.stackchan.websocket.v1.SpeakDoneEventH\x00\x12\x41\n\tservo_cmd\x18\" \x01(\x0b\x32,.stackchan.websocket.v1.ServoCommandSequenceH\x00\x12@\n\x0eservo_done_evt\x18# \x01(\x0b\x32&.stackchan.websocket.v1.ServoDoneEventH\x00\x12\x45\n\x11\x66irmware_metadata\x18$ \x01(\x0b\x32(.stackchan.websocket.v1.FirmwareMetadataH\x00\x12\x41\n\x0fserver_metadata\x18% \x01(\x0b\x32&.stackchan.websocket.v1.ServerMetadataH\x00\x42\x06\n\x04\x62ody\"\x0f\n\rAudioPcmStart\"\r\n\x0b\x41udioPcmEnd\"6\n\rAudioWavStart\x12\x13\n\x0bsample_rate\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\"\r\n\x0b\x41udioWavEnd\"\x1f\n\nAudioChunk\x12\x11\n\tpcm_bytes\x18\x01 \x01(\x0c\"E\n\x0cStateCommand\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\"!\n\rWakeWordEvent\x12\x10\n\x08\x64\x65tected\x18\x01 \x01(\x08\"C\n\nStateEvent\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\"\x1e\n\x0eSpeakDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"N\n\x14ServoCommandSequence\x12\x36\n\x08\x63ommands\x18\x01 \x03(\x0b\x32$.stackchan.websocket.v1.ServoCommand\"f\n\x0cServoCommand\x12\x32\n\x02op\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.ServoOperation\x12\r\n\x05\x61ngle\x18\x02 \x01(\x11\x12\x13\n\x0b\x64uration_ms\x18\x03 \x01(\x11\"\x1e\n\x0eServoDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"\x99\x02\n\x10\x46irmwareMetadata\x12\x37\n\x0b\x64\x65vice_type\x18\x01 \x01(\x0e\x32\".stackchan.websocket.v1.DeviceType\x12\x15\n\rdisplay_width\x18\x02 \x01(\r\x12\x16\n\x0e\x64isplay_height\x18\x03 \x01(\r\x12\x1c\n\x14has_device_wake_word\x18\x04 \x01(\x08\x12\x0f\n\x07has_led\x18\x05 \x01(\x08\x12\x35\n\nservo_type\x18\x06 \x01(\x0e\x32!.stackchan.websocket.v1.ServoType\x12\x1d\n\x15supports_audio_duplex\x18\x07 \x01(\x08\x12\x18\n\x10\x66irmware_version\x18\x08 \x01(\t\"F\n\x0eServerMetadata\x12\x1c\n\x14has_server_wake_word\x18\x01 \x01(\x08\x12\x16\n\x0eserver_version\x18\x02 
\x01(\t*\x80\x03\n\x0bMessageKind\x12\x1c\n\x18MESSAGE_KIND_UNSPECIFIED\x10\x00\x12\x1a\n\x16MESSAGE_KIND_AUDIO_PCM\x10\x01\x12\x1a\n\x16MESSAGE_KIND_AUDIO_WAV\x10\x02\x12\x1a\n\x16MESSAGE_KIND_STATE_CMD\x10\x03\x12\x1e\n\x1aMESSAGE_KIND_WAKE_WORD_EVT\x10\x04\x12\x1a\n\x16MESSAGE_KIND_STATE_EVT\x10\x05\x12\x1f\n\x1bMESSAGE_KIND_SPEAK_DONE_EVT\x10\x06\x12\x1a\n\x16MESSAGE_KIND_SERVO_CMD\x10\x07\x12\x1f\n\x1bMESSAGE_KIND_SERVO_DONE_EVT\x10\x08\x12\"\n\x1eMESSAGE_KIND_FIRMWARE_METADATA\x10\t\x12 \n\x1cMESSAGE_KIND_SERVER_METADATA\x10\n\x12\x1f\n\x1bMESSAGE_KIND_SERVER_WWD_PCM\x10\x0b*p\n\x0bMessageType\x12\x1c\n\x18MESSAGE_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12MESSAGE_TYPE_START\x10\x01\x12\x15\n\x11MESSAGE_TYPE_DATA\x10\x02\x12\x14\n\x10MESSAGE_TYPE_END\x10\x03*\xa5\x01\n\x0eStackchanState\x12\x18\n\x14STACKCHAN_STATE_IDLE\x10\x00\x12\x1d\n\x19STACKCHAN_STATE_LISTENING\x10\x01\x12\x1c\n\x18STACKCHAN_STATE_THINKING\x10\x02\x12\x1c\n\x18STACKCHAN_STATE_SPEAKING\x10\x03\x12\x1e\n\x1aSTACKCHAN_STATE_SERVER_WWD\x10\x04*c\n\x0eServoOperation\x12\x19\n\x15SERVO_OPERATION_SLEEP\x10\x00\x12\x1a\n\x16SERVO_OPERATION_MOVE_X\x10\x01\x12\x1a\n\x16SERVO_OPERATION_MOVE_Y\x10\x02*\x85\x01\n\nDeviceType\x12\x1b\n\x17\x44\x45VICE_TYPE_UNSPECIFIED\x10\x00\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5STACK_CORES3\x10\x01\x12\x1a\n\x16\x44\x45VICE_TYPE_M5ATOM_S3R\x10\x02\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5ATOM_ECHOS3R\x10\x03*i\n\tServoType\x12\x1a\n\x16SERVO_TYPE_UNSPECIFIED\x10\x00\x12\x13\n\x0fSERVO_TYPE_NONE\x10\x01\x12\x13\n\x0fSERVO_TYPE_SG90\x10\x02\x12\x16\n\x12SERVO_TYPE_SCS0009\x10\x03\x62\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'websocket_message_pb2', _globals) if not _descriptor._USE_C_DESCRIPTORS: DESCRIPTOR._loaded_options = None - _globals['_MESSAGEKIND']._serialized_start=2086 - _globals['_MESSAGEKIND']._serialized_end=2470 - _globals['_MESSAGETYPE']._serialized_start=2472 - 
_globals['_MESSAGETYPE']._serialized_end=2584 - _globals['_STACKCHANSTATE']._serialized_start=2587 - _globals['_STACKCHANSTATE']._serialized_end=2720 - _globals['_LISTENINGPURPOSE']._serialized_start=2722 - _globals['_LISTENINGPURPOSE']._serialized_end=2838 - _globals['_SERVOOPERATION']._serialized_start=2840 - _globals['_SERVOOPERATION']._serialized_end=2939 - _globals['_DEVICETYPE']._serialized_start=2942 - _globals['_DEVICETYPE']._serialized_end=3075 - _globals['_SERVOTYPE']._serialized_start=3077 - _globals['_SERVOTYPE']._serialized_end=3182 + _globals['_MESSAGEKIND']._serialized_start=2016 + _globals['_MESSAGEKIND']._serialized_end=2400 + _globals['_MESSAGETYPE']._serialized_start=2402 + _globals['_MESSAGETYPE']._serialized_end=2514 + _globals['_STACKCHANSTATE']._serialized_start=2517 + _globals['_STACKCHANSTATE']._serialized_end=2682 + _globals['_SERVOOPERATION']._serialized_start=2684 + _globals['_SERVOOPERATION']._serialized_end=2783 + _globals['_DEVICETYPE']._serialized_start=2786 + _globals['_DEVICETYPE']._serialized_end=2919 + _globals['_SERVOTYPE']._serialized_start=2921 + _globals['_SERVOTYPE']._serialized_end=3026 _globals['_WEBSOCKETMESSAGE']._serialized_start=52 _globals['_WEBSOCKETMESSAGE']._serialized_end=1098 _globals['_AUDIOPCMSTART']._serialized_start=1100 @@ -57,22 +55,22 @@ _globals['_AUDIOWAVEND']._serialized_end=1201 _globals['_AUDIOCHUNK']._serialized_start=1203 _globals['_AUDIOCHUNK']._serialized_end=1234 - _globals['_STATECOMMAND']._serialized_start=1237 - _globals['_STATECOMMAND']._serialized_end=1375 - _globals['_WAKEWORDEVENT']._serialized_start=1377 - _globals['_WAKEWORDEVENT']._serialized_end=1410 - _globals['_STATEEVENT']._serialized_start=1412 - _globals['_STATEEVENT']._serialized_end=1479 - _globals['_SPEAKDONEEVENT']._serialized_start=1481 - _globals['_SPEAKDONEEVENT']._serialized_end=1511 - _globals['_SERVOCOMMANDSEQUENCE']._serialized_start=1513 - _globals['_SERVOCOMMANDSEQUENCE']._serialized_end=1591 - 
_globals['_SERVOCOMMAND']._serialized_start=1593 - _globals['_SERVOCOMMAND']._serialized_end=1695 - _globals['_SERVODONEEVENT']._serialized_start=1697 - _globals['_SERVODONEEVENT']._serialized_end=1727 - _globals['_FIRMWAREMETADATA']._serialized_start=1730 - _globals['_FIRMWAREMETADATA']._serialized_end=2011 - _globals['_SERVERMETADATA']._serialized_start=2013 - _globals['_SERVERMETADATA']._serialized_end=2083 + _globals['_STATECOMMAND']._serialized_start=1236 + _globals['_STATECOMMAND']._serialized_end=1305 + _globals['_WAKEWORDEVENT']._serialized_start=1307 + _globals['_WAKEWORDEVENT']._serialized_end=1340 + _globals['_STATEEVENT']._serialized_start=1342 + _globals['_STATEEVENT']._serialized_end=1409 + _globals['_SPEAKDONEEVENT']._serialized_start=1411 + _globals['_SPEAKDONEEVENT']._serialized_end=1441 + _globals['_SERVOCOMMANDSEQUENCE']._serialized_start=1443 + _globals['_SERVOCOMMANDSEQUENCE']._serialized_end=1521 + _globals['_SERVOCOMMAND']._serialized_start=1523 + _globals['_SERVOCOMMAND']._serialized_end=1625 + _globals['_SERVODONEEVENT']._serialized_start=1627 + _globals['_SERVODONEEVENT']._serialized_end=1657 + _globals['_FIRMWAREMETADATA']._serialized_start=1660 + _globals['_FIRMWAREMETADATA']._serialized_end=1941 + _globals['_SERVERMETADATA']._serialized_start=1943 + _globals['_SERVERMETADATA']._serialized_end=2013 # @@protoc_insertion_point(module_scope) diff --git a/stackchan_server/protobuf_ws.py b/stackchan_server/protobuf_ws.py index 443900b..b652a2c 100644 --- a/stackchan_server/protobuf_ws.py +++ b/stackchan_server/protobuf_ws.py @@ -1,7 +1,7 @@ from __future__ import annotations from collections.abc import Sequence -from enum import IntEnum, StrEnum +from enum import StrEnum from typing import Any, Literal, cast from .generated_protobuf import websocket_message_pb2 as _ws_pb2 @@ -15,12 +15,6 @@ ServoCommand = ServoMoveCommand | ServoSleepCommand -class ListeningPurpose(IntEnum): - UNSPECIFIED = 0 - SPEECH = 1 - WAKE_WORD = 2 - - def 
_ensure_range(value: int, *, minimum: int, maximum: int, label: str) -> int: if not minimum <= value <= maximum: raise ValueError(f"{label} must be between {minimum} and {maximum}: {value}") @@ -101,8 +95,6 @@ def encode_audio_wav_end_message(seq: int) -> bytes: def encode_state_command_message( seq: int, state_id: int, - *, - listening_purpose: int = ListeningPurpose.SPEECH, ) -> bytes: message = _new_message( ws_pb2.MESSAGE_KIND_STATE_CMD, @@ -110,7 +102,6 @@ def encode_state_command_message( seq, ) message.state_cmd.state = int(state_id) - message.state_cmd.listening_purpose = int(listening_purpose) return message.SerializeToString() @@ -185,7 +176,6 @@ def encode_servo_command_message(seq: int, commands: Sequence[ServoCommand]) -> __all__ = [ "ServoCommand", - "ListeningPurpose", "encode_audio_pcm_data_message", "encode_audio_pcm_end_message", "encode_audio_pcm_start_message", diff --git a/stackchan_server/ws_proxy.py b/stackchan_server/ws_proxy.py index f4024ce..7a5e06b 100644 --- a/stackchan_server/ws_proxy.py +++ b/stackchan_server/ws_proxy.py @@ -17,7 +17,6 @@ from .generated_protobuf import websocket_message_pb2 as _ws_pb2 from .listen import EmptyTranscriptError, ListenHandler, TimeoutError from .protobuf_ws import ( - ListeningPurpose, encode_server_metadata_message, encode_servo_command_message, encode_state_command_message, @@ -57,6 +56,7 @@ class FirmwareState(IntEnum): LISTENING = 1 THINKING = 2 SPEAKING = 3 + SERVER_WWD = 4 class ServoMoveType(StrEnum): @@ -199,13 +199,8 @@ async def speak(self, text: str) -> None: async def send_state_command( self, state_id: int | FirmwareState, - *, - listening_purpose: ListeningPurpose = ListeningPurpose.SPEECH, ) -> None: - await self._send_state_command( - state_id, - listening_purpose=listening_purpose, - ) + await self._send_state_command(state_id) async def reset_state(self) -> None: await self.send_state_command(FirmwareState.IDLE) @@ -585,14 +580,11 @@ def _handle_servo_done_event(self, message: Any) -> 
None: async def _send_state_command( self, state_id: int | FirmwareState, - *, - listening_purpose: ListeningPurpose = ListeningPurpose.SPEECH, ) -> None: await self._send_ws_bytes( encode_state_command_message( self._next_down_seq(), int(state_id), - listening_purpose=int(listening_purpose), ) ) @@ -624,10 +616,7 @@ async def _run_server_wakeword_detection(self) -> bool: should_restart = False try: await detector.start() - await self.send_state_command( - FirmwareState.LISTENING, - listening_purpose=ListeningPurpose.WAKE_WORD, - ) + await self.send_state_command(FirmwareState.SERVER_WWD) detected = await detector.wait_result() if detected: self._wakeword_event.set() From 97ba831d81205561084dd2b40a398f2a7e55664e Mon Sep 17 00:00:00 2001 From: Atsushi Morimoto <74th.tech@gmail.com> Date: Sat, 9 May 2026 16:51:46 +0900 Subject: [PATCH 08/15] feat: Update ServerWwd state representation to "Idle(Server-WWD)" in display and state machine --- firmware/src/display.cpp | 5 ----- firmware/src/state_machine.cpp | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/firmware/src/display.cpp b/firmware/src/display.cpp index 00ba761..07f0815 100644 --- a/firmware/src/display.cpp +++ b/firmware/src/display.cpp @@ -143,11 +143,6 @@ void Display::drawForState(StateMachine::State state) GFXModule.setTextSize(1); GFXModule.setTextColor(font_color, bg_color); GFXModule.setCursor(isAtomS3R() ? 4 : 10, bar_y + (isAtomS3R() ? 
6 : 2)); - if (state == StateMachine::ServerWwd) - { - GFXModule.printf("Idle(Server-WWD)"); - return; - } GFXModule.printf("%s", stateToString(state)); } diff --git a/firmware/src/state_machine.cpp b/firmware/src/state_machine.cpp index 196aaad..ea38cfd 100644 --- a/firmware/src/state_machine.cpp +++ b/firmware/src/state_machine.cpp @@ -14,7 +14,7 @@ const char *stateToString(StateMachine::State s) case StateMachine::Speaking: return "Speaking"; case StateMachine::ServerWwd: - return "ServerWwd"; + return "Idle(Server-WWD)"; case StateMachine::Disconnected: return "Disconnected"; default: From 5dd2c34b425f9f186c7eb785116e4c53e4110234 Mon Sep 17 00:00:00 2001 From: Atsushi Morimoto <74th.tech@gmail.com> Date: Sat, 9 May 2026 17:01:00 +0900 Subject: [PATCH 09/15] feat: Refactor server wakeword detection methods for consistency and clarity --- stackchan_server/ws_proxy.py | 48 ++++++------------------------------ 1 file changed, 8 insertions(+), 40 deletions(-) diff --git a/stackchan_server/ws_proxy.py b/stackchan_server/ws_proxy.py index 7a5e06b..4857f00 100644 --- a/stackchan_server/ws_proxy.py +++ b/stackchan_server/ws_proxy.py @@ -171,7 +171,7 @@ def trigger_wakeword(self) -> None: async def wait_for_talk_session(self) -> None: while True: if self._wakeword_event.is_set(): - await self.stop_server_wakeword_detection() + await self._stop_server_wakeword_detection() self._wakeword_event.clear() return if self._closed: @@ -179,7 +179,7 @@ async def wait_for_talk_session(self) -> None: await asyncio.sleep(0.05) async def listen(self) -> str: - await self.stop_server_wakeword_detection() + await self._stop_server_wakeword_detection() return await self._listener.listen( send_state_command=self.send_state_command, is_closed=lambda: self._closed, @@ -246,7 +246,7 @@ async def start(self) -> None: async def close(self) -> None: self._closed = True self._cancel_server_wakeword_restart_task() - await self.stop_server_wakeword_detection() + await 
self._stop_server_wakeword_detection() if self._receiving_task: self._receiving_task.cancel() with suppress(asyncio.CancelledError): @@ -262,9 +262,9 @@ async def start_talking(self, text: str) -> None: async def enable_auto_server_wakeword_detection(self) -> None: self._auto_start_server_wakeword = True - await self.start_server_wakeword_detection_if_available() + await self._start_server_wakeword_detection_if_available() - async def start_server_wakeword_detection_if_available(self) -> bool: + async def _start_server_wakeword_detection_if_available(self) -> bool: if ( self._closed or self._server_wakeword_detector is None @@ -283,7 +283,7 @@ async def start_server_wakeword_detection_if_available(self) -> bool: ) return True - async def stop_server_wakeword_detection(self) -> None: + async def _stop_server_wakeword_detection(self) -> None: self._cancel_server_wakeword_restart_task() task = self._server_wakeword_task if task is None: @@ -308,38 +308,6 @@ async def stop_server_wakeword_detection(self) -> None: except Exception: logger.exception("Server-side wake-word detection task failed") - async def request_server_wakeword_detection( - self, - *, - timeout_seconds: float | None = None, - ) -> bool: - if self._server_wakeword_detector is None or not self.server_metadata.has_server_wake_word: - raise WakeWordDetectionError( - "Server-side wake-word detection is not available for this connection" - ) - if self._closed: - raise WebSocketDisconnect() - - started = await self.start_server_wakeword_detection_if_available() - if not started: - raise WakeWordDetectionError( - "Server-side wake-word detection could not be started in the current state" - ) - - task = self._server_wakeword_task - if task is None: - raise WakeWordDetectionError("Server-side wake-word detection task is unavailable") - - try: - if timeout_seconds is None: - return await asyncio.shield(task) - return await asyncio.wait_for(asyncio.shield(task), timeout=timeout_seconds) - except 
asyncio.TimeoutError as exc: - await self.stop_server_wakeword_detection() - raise WakeWordDetectionTimeout( - "Server-side wake-word detection timed out" - ) from exc - async def _receive_loop(self) -> None: try: while True: @@ -534,7 +502,7 @@ async def _handle_firmware_metadata(self, message: Any) -> None: ) ) if self._auto_start_server_wakeword: - await self.start_server_wakeword_detection_if_available() + await self._start_server_wakeword_detection_if_available() def _build_server_metadata( self, firmware_metadata: FirmwareMetadata @@ -680,7 +648,7 @@ async def _restart_server_wakeword_detection_after_delay( await asyncio.sleep(delay_seconds) if self._closed: return - await self.start_server_wakeword_detection_if_available() + await self._start_server_wakeword_detection_if_available() except asyncio.CancelledError: raise finally: From b0e7b702c0b724669d4a284dde87f66aad791951 Mon Sep 17 00:00:00 2001 From: Atsushi Morimoto <74th.tech@gmail.com> Date: Sat, 9 May 2026 17:12:49 +0900 Subject: [PATCH 10/15] feat: Implement ServerWwdController for server-side wakeword detection management --- stackchan_server/server_wwd.py | 278 +++++++++++++++++++++++++++++++++ stackchan_server/ws_proxy.py | 265 +++---------------------------- 2 files changed, 303 insertions(+), 240 deletions(-) create mode 100644 stackchan_server/server_wwd.py diff --git a/stackchan_server/server_wwd.py b/stackchan_server/server_wwd.py new file mode 100644 index 0000000..193c99f --- /dev/null +++ b/stackchan_server/server_wwd.py @@ -0,0 +1,278 @@ +from __future__ import annotations + +import asyncio +from logging import getLogger +from typing import Any, Awaitable, Callable, Optional + +from .wakeup_word_detection import ( + WakeWordDetectionError, + WakeWordDetectionTimeout, + create_server_side_wake_word_detector, +) + +logger = getLogger(__name__) + +_SERVER_WAKEWORD_RESTART_DELAY_SECONDS = 0.25 +_TRAILING_PCM_DRAIN_SECONDS = 1.0 + + +class ServerWwdController: + def __init__( + self, + *, + 
send_state_command: Callable[[int], Awaitable[None]], + set_current_state: Callable[[int], None], + close_websocket: Callable[[int, str], Awaitable[None]], + current_state: Callable[[], int], + has_server_wake_word: Callable[[], bool], + is_closed: Callable[[], bool], + on_detected: Callable[[], None], + has_pending_wakeword: Callable[[], bool], + server_wwd_state: int, + idle_state: int, + ) -> None: + self._send_state_command = send_state_command + self._set_current_state = set_current_state + self._close_websocket = close_websocket + self._current_state = current_state + self._has_server_wake_word = has_server_wake_word + self._is_closed = is_closed + self._on_detected = on_detected + self._has_pending_wakeword = has_pending_wakeword + self._server_wwd_state = server_wwd_state + self._idle_state = idle_state + + self._detector = create_server_side_wake_word_detector() + self._task: Optional[asyncio.Task[bool]] = None + self._restart_task: Optional[asyncio.Task[None]] = None + self._auto_start = False + self._drain_trailing_pcm_until_end = False + self._drain_trailing_pcm_deadline: float | None = None + + @property + def available(self) -> bool: + return self._detector is not None + + @property + def auto_start_enabled(self) -> bool: + return self._auto_start + + async def enable_auto_detection(self) -> None: + self._auto_start = True + await self.start_if_available() + + async def start_if_available(self) -> bool: + if ( + self._is_closed() + or self._detector is None + or not self._has_server_wake_word() + or self._current_state() != self._idle_state + ): + return False + + if self._task is not None and not self._task.done(): + return True + + self._cancel_restart_task() + self._task = asyncio.create_task( + self._run_detection(), + name="server-side-wakeword-detection", + ) + return True + + async def stop(self) -> None: + self._cancel_restart_task() + task = self._task + if task is None: + return + + if task.done(): + self._task = None + try: + await task + 
except asyncio.CancelledError: + pass + except Exception: + logger.exception("Server-side wake-word detection task failed") + return + + task.cancel() + self._task = None + try: + await task + except asyncio.CancelledError: + pass + except Exception: + logger.exception("Server-side wake-word detection task failed") + + async def handle_pcm_message(self, message: Any, *, ws_pb2: Any) -> bool: + body_name = message.WhichOneof("body") + + if self._should_drain_trailing_pcm(): + if ( + message.message_type == ws_pb2.MESSAGE_TYPE_START + and body_name == "audio_pcm_start" + ): + logger.info( + "Received a new server-side wake-word PCM START while draining trailing audio; resuming normal routing" + ) + self._clear_trailing_pcm_drain() + elif ( + message.message_type == ws_pb2.MESSAGE_TYPE_DATA + and body_name == "audio_pcm_data" + ): + logger.info( + "Discarding trailing server-side wake-word PCM DATA payload_bytes=%d", + len(message.audio_pcm_data.pcm_bytes), + ) + return True + elif ( + message.message_type == ws_pb2.MESSAGE_TYPE_END + and body_name == "audio_pcm_end" + ): + logger.info("Finished draining trailing server-side wake-word PCM") + self._clear_trailing_pcm_drain() + return True + + detector = self._detector + if detector is None or not detector.running: + logger.info( + "Ignoring server-side wake-word PCM while detector is inactive type=%s body=%s", + message.message_type, + body_name, + ) + return True + + if ( + message.message_type == ws_pb2.MESSAGE_TYPE_START + and body_name == "audio_pcm_start" + ): + await detector.handle_start() + return True + + if ( + message.message_type == ws_pb2.MESSAGE_TYPE_DATA + and body_name == "audio_pcm_data" + ): + payload = bytes(message.audio_pcm_data.pcm_bytes) + await detector.handle_data(payload) + return True + + if ( + message.message_type == ws_pb2.MESSAGE_TYPE_END + and body_name == "audio_pcm_end" + ): + await detector.handle_end() + return True + + await self._close_websocket(1003, "unknown server wake-word PCM 
protobuf body") + return False + + def schedule_restart( + self, + delay_seconds: float = _SERVER_WAKEWORD_RESTART_DELAY_SECONDS, + ) -> None: + if not self._auto_start or self._is_closed(): + return + + self._cancel_restart_task() + self._restart_task = asyncio.create_task( + self._restart_after_delay(delay_seconds), + name="server-side-wakeword-restart", + ) + + async def _run_detection(self) -> bool: + detector = self._detector + if detector is None: + return False + + detected = False + should_restart = False + try: + await detector.start() + await self._send_state_command(self._server_wwd_state) + detected = await detector.wait_result() + if detected: + self._on_detected() + return detected + except asyncio.CancelledError: + raise + except WakeWordDetectionTimeout as exc: + logger.info("Server-side wake-word detection stopped: %s", exc) + return False + except WakeWordDetectionError as exc: + logger.warning("Server-side wake-word detection stopped: %s", exc) + return False + except Exception: + logger.exception("Server-side wake-word detection failed") + return False + finally: + await detector.stop() + self._arm_trailing_pcm_drain() + if not self._is_closed(): + self._set_current_state(self._idle_state) + try: + await self._send_state_command(self._idle_state) + except Exception: + logger.exception( + "Failed to return firmware to idle after wake-word detection" + ) + should_restart = ( + self._auto_start + and not detected + and not self._has_pending_wakeword() + and not self._is_closed() + ) + if self._task is asyncio.current_task(): + self._task = None + if should_restart: + self.schedule_restart() + + def _cancel_restart_task(self) -> None: + task = self._restart_task + if task is None: + return + self._restart_task = None + task.cancel() + + async def _restart_after_delay(self, delay_seconds: float) -> None: + try: + await asyncio.sleep(delay_seconds) + if self._is_closed(): + return + await self.start_if_available() + except asyncio.CancelledError: + 
raise + finally: + if self._restart_task is asyncio.current_task(): + self._restart_task = None + + def _arm_trailing_pcm_drain( + self, + timeout_seconds: float = _TRAILING_PCM_DRAIN_SECONDS, + ) -> None: + loop = asyncio.get_running_loop() + self._drain_trailing_pcm_until_end = True + self._drain_trailing_pcm_deadline = loop.time() + timeout_seconds + + def _clear_trailing_pcm_drain(self) -> None: + self._drain_trailing_pcm_until_end = False + self._drain_trailing_pcm_deadline = None + + def _should_drain_trailing_pcm(self) -> bool: + if not self._drain_trailing_pcm_until_end: + return False + deadline = self._drain_trailing_pcm_deadline + if deadline is None: + return True + if asyncio.get_running_loop().time() <= deadline: + return True + + logger.info( + "Trailing PCM drain window expired before END arrived; resuming normal routing" + ) + self._clear_trailing_pcm_drain() + return False + + +__all__ = ["ServerWwdController"] diff --git a/stackchan_server/ws_proxy.py b/stackchan_server/ws_proxy.py index 4857f00..bdcd823 100644 --- a/stackchan_server/ws_proxy.py +++ b/stackchan_server/ws_proxy.py @@ -22,14 +22,10 @@ encode_state_command_message, parse_websocket_message, ) +from .server_wwd import ServerWwdController from .speak import SpeakHandler from .static import LISTEN_AUDIO_FORMAT from .types import SpeechRecognizer, SpeechSynthesizer -from .wakeup_word_detection import ( - WakeWordDetectionError, - WakeWordDetectionTimeout, - create_server_side_wake_word_detector, -) logger = getLogger(__name__) @@ -47,8 +43,6 @@ ) # half interval for the second segment start _LISTEN_AUDIO_TIMEOUT_SECONDS = 10.0 _DEBUG_RECORDING_ENABLED = os.getenv("DEBUG_RECODING") == "1" -_SERVER_WAKEWORD_RESTART_DELAY_SECONDS = 0.25 -_TRAILING_PCM_DRAIN_SECONDS = 1.0 class FirmwareState(IntEnum): @@ -125,13 +119,6 @@ def __init__( recordings_dir=self.recordings_dir, debug_recording=self._debug_recording, ) - self._server_wakeword_detector = create_server_side_wake_word_detector() - 
self._server_wakeword_task: Optional[asyncio.Task[bool]] = None - self._server_wakeword_restart_task: Optional[asyncio.Task[None]] = None - self._auto_start_server_wakeword = False - self._drain_trailing_pcm_until_end = False - self._drain_trailing_pcm_deadline: float | None = None - self._receiving_task: Optional[asyncio.Task] = None self._closed = False @@ -146,6 +133,20 @@ def __init__( self._servo_done_counter = 0 self._servo_sent_counter = 0 self._pending_servo_wait_targets: deque[int] = deque() + self._server_wwd = ServerWwdController( + send_state_command=self.send_state_command, + set_current_state=lambda state: setattr( + self, "_current_firmware_state", FirmwareState(state) + ), + close_websocket=self.ws.close, + current_state=lambda: int(self._current_firmware_state), + has_server_wake_word=lambda: self.server_metadata.has_server_wake_word, + is_closed=lambda: self._closed, + on_detected=self._wakeword_event.set, + has_pending_wakeword=self._wakeword_event.is_set, + server_wwd_state=int(FirmwareState.SERVER_WWD), + idle_state=int(FirmwareState.IDLE), + ) @property def closed(self) -> bool: @@ -161,7 +162,7 @@ def receive_task(self) -> Optional[asyncio.Task]: @property def has_server_wakeword_detector(self) -> bool: - return self._server_wakeword_detector is not None + return self._server_wwd.available def trigger_wakeword(self) -> None: """Web API から擬似的に WAKEWORD_EVT を発火させる。""" @@ -171,7 +172,7 @@ def trigger_wakeword(self) -> None: async def wait_for_talk_session(self) -> None: while True: if self._wakeword_event.is_set(): - await self._stop_server_wakeword_detection() + await self._server_wwd.stop() self._wakeword_event.clear() return if self._closed: @@ -179,7 +180,7 @@ async def wait_for_talk_session(self) -> None: await asyncio.sleep(0.05) async def listen(self) -> str: - await self._stop_server_wakeword_detection() + await self._server_wwd.stop() return await self._listener.listen( send_state_command=self.send_state_command, is_closed=lambda: 
self._closed, @@ -205,7 +206,7 @@ async def send_state_command( async def reset_state(self) -> None: await self.send_state_command(FirmwareState.IDLE) self._current_firmware_state = FirmwareState.IDLE - self._schedule_server_wakeword_restart() + self._server_wwd.schedule_restart() async def move_servo(self, commands: Sequence[ServoCommand]) -> None: previous_counter = self._servo_sent_counter @@ -245,8 +246,7 @@ async def start(self) -> None: async def close(self) -> None: self._closed = True - self._cancel_server_wakeword_restart_task() - await self._stop_server_wakeword_detection() + await self._server_wwd.stop() if self._receiving_task: self._receiving_task.cancel() with suppress(asyncio.CancelledError): @@ -261,52 +261,7 @@ async def start_talking(self, text: str) -> None: await self.speak(text) async def enable_auto_server_wakeword_detection(self) -> None: - self._auto_start_server_wakeword = True - await self._start_server_wakeword_detection_if_available() - - async def _start_server_wakeword_detection_if_available(self) -> bool: - if ( - self._closed - or self._server_wakeword_detector is None - or not self.server_metadata.has_server_wake_word - or self.current_state != FirmwareState.IDLE - ): - return False - - if self._server_wakeword_task is not None and not self._server_wakeword_task.done(): - return True - - self._cancel_server_wakeword_restart_task() - self._server_wakeword_task = asyncio.create_task( - self._run_server_wakeword_detection(), - name="server-side-wakeword-detection", - ) - return True - - async def _stop_server_wakeword_detection(self) -> None: - self._cancel_server_wakeword_restart_task() - task = self._server_wakeword_task - if task is None: - return - - if task.done(): - self._server_wakeword_task = None - try: - await task - except asyncio.CancelledError: - pass - except Exception: - logger.exception("Server-side wake-word detection task failed") - return - - task.cancel() - self._server_wakeword_task = None - try: - await task - 
except asyncio.CancelledError: - pass - except Exception: - logger.exception("Server-side wake-word detection task failed") + await self._server_wwd.enable_auto_detection() async def _receive_loop(self) -> None: try: @@ -324,7 +279,7 @@ async def _receive_loop(self) -> None: break if message.kind == ws_pb2.MESSAGE_KIND_SERVER_WWD_PCM: - if not await self._handle_server_wakeword_pcm_message(message): + if not await self._server_wwd.handle_pcm_message(message, ws_pb2=ws_pb2): break continue @@ -360,69 +315,6 @@ async def _receive_loop(self) -> None: finally: self._closed = True - async def _handle_server_wakeword_pcm_message(self, message: Any) -> bool: - body_name = message.WhichOneof("body") - - if self._should_drain_trailing_pcm(): - if ( - message.message_type == ws_pb2.MESSAGE_TYPE_START - and body_name == "audio_pcm_start" - ): - logger.info( - "Received a new server-side wake-word PCM START while draining trailing audio; resuming normal routing" - ) - self._clear_trailing_pcm_drain() - elif ( - message.message_type == ws_pb2.MESSAGE_TYPE_DATA - and body_name == "audio_pcm_data" - ): - logger.info( - "Discarding trailing server-side wake-word PCM DATA payload_bytes=%d", - len(message.audio_pcm_data.pcm_bytes), - ) - return True - elif ( - message.message_type == ws_pb2.MESSAGE_TYPE_END - and body_name == "audio_pcm_end" - ): - logger.info("Finished draining trailing server-side wake-word PCM") - self._clear_trailing_pcm_drain() - return True - - detector = self._server_wakeword_detector - if detector is None or not detector.running: - logger.info( - "Ignoring server-side wake-word PCM while detector is inactive type=%s body=%s", - message.message_type, - body_name, - ) - return True - - if ( - message.message_type == ws_pb2.MESSAGE_TYPE_START - and body_name == "audio_pcm_start" - ): - await detector.handle_start() - return True - - if ( - message.message_type == ws_pb2.MESSAGE_TYPE_DATA - and body_name == "audio_pcm_data" - ): - payload = 
bytes(message.audio_pcm_data.pcm_bytes) - await detector.handle_data(payload) - return True - - if ( - message.message_type == ws_pb2.MESSAGE_TYPE_END - and body_name == "audio_pcm_end" - ): - await detector.handle_end() - return True - - await self.ws.close(code=1003, reason="unknown server wake-word PCM protobuf body") - return False - async def _handle_audio_pcm_message(self, message: Any) -> bool: body_name = message.WhichOneof("body") @@ -501,13 +393,13 @@ async def _handle_firmware_metadata(self, message: Any) -> None: server_version=self.server_metadata.server_version, ) ) - if self._auto_start_server_wakeword: - await self._start_server_wakeword_detection_if_available() + if self._server_wwd.auto_start_enabled: + await self._server_wwd.start_if_available() def _build_server_metadata( self, firmware_metadata: FirmwareMetadata ) -> ServerMetadata: - should_use_server_wake_word = self._server_wakeword_detector is not None + should_use_server_wake_word = self._server_wwd.available return ServerMetadata( has_server_wake_word=should_use_server_wake_word, server_version=__version__, @@ -575,113 +467,6 @@ def _raise_websocket_disconnect_from_runtime_error(self, exc: RuntimeError) -> N self._closed = True raise WebSocketDisconnect() from exc - async def _run_server_wakeword_detection(self) -> bool: - detector = self._server_wakeword_detector - if detector is None: - return False - - detected = False - should_restart = False - try: - await detector.start() - await self.send_state_command(FirmwareState.SERVER_WWD) - detected = await detector.wait_result() - if detected: - self._wakeword_event.set() - return detected - except asyncio.CancelledError: - raise - except WakeWordDetectionTimeout as exc: - logger.info("Server-side wake-word detection stopped: %s", exc) - return False - except WakeWordDetectionError as exc: - logger.warning("Server-side wake-word detection stopped: %s", exc) - return False - except Exception: - logger.exception("Server-side wake-word 
detection failed") - return False - finally: - await detector.stop() - self._arm_trailing_pcm_drain() - if not self._closed: - self._current_firmware_state = FirmwareState.IDLE - try: - await self.send_state_command(FirmwareState.IDLE) - except Exception: - logger.exception("Failed to return firmware to idle after wake-word detection") - should_restart = ( - self._auto_start_server_wakeword - and not detected - and not self._wakeword_event.is_set() - and not self._closed - ) - if self._server_wakeword_task is asyncio.current_task(): - self._server_wakeword_task = None - if should_restart: - self._schedule_server_wakeword_restart() - - def _schedule_server_wakeword_restart( - self, - delay_seconds: float = _SERVER_WAKEWORD_RESTART_DELAY_SECONDS, - ) -> None: - if not self._auto_start_server_wakeword or self._closed: - return - - self._cancel_server_wakeword_restart_task() - self._server_wakeword_restart_task = asyncio.create_task( - self._restart_server_wakeword_detection_after_delay(delay_seconds), - name="server-side-wakeword-restart", - ) - - def _cancel_server_wakeword_restart_task(self) -> None: - task = self._server_wakeword_restart_task - if task is None: - return - self._server_wakeword_restart_task = None - task.cancel() - - async def _restart_server_wakeword_detection_after_delay( - self, - delay_seconds: float, - ) -> None: - try: - await asyncio.sleep(delay_seconds) - if self._closed: - return - await self._start_server_wakeword_detection_if_available() - except asyncio.CancelledError: - raise - finally: - if self._server_wakeword_restart_task is asyncio.current_task(): - self._server_wakeword_restart_task = None - - def _arm_trailing_pcm_drain( - self, - timeout_seconds: float = _TRAILING_PCM_DRAIN_SECONDS, - ) -> None: - loop = asyncio.get_running_loop() - self._drain_trailing_pcm_until_end = True - self._drain_trailing_pcm_deadline = loop.time() + timeout_seconds - - def _clear_trailing_pcm_drain(self) -> None: - self._drain_trailing_pcm_until_end = 
False - self._drain_trailing_pcm_deadline = None - - def _should_drain_trailing_pcm(self) -> bool: - if not self._drain_trailing_pcm_until_end: - return False - deadline = self._drain_trailing_pcm_deadline - if deadline is None: - return True - if asyncio.get_running_loop().time() <= deadline: - return True - - logger.info( - "Trailing PCM drain window expired before END arrived; resuming normal routing" - ) - self._clear_trailing_pcm_drain() - return False - async def _wait_for_counter( self, *, From 2fc4c1190d84d3845d9acae037e2f69bdf9b8269 Mon Sep 17 00:00:00 2001 From: Atsushi Morimoto <74th.tech@gmail.com> Date: Sat, 9 May 2026 17:19:28 +0900 Subject: [PATCH 11/15] feat: Remove has_server_wake_word dependency and streamline auto detection logic --- stackchan_server/server_wwd.py | 4 ---- stackchan_server/ws_proxy.py | 3 ++- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/stackchan_server/server_wwd.py b/stackchan_server/server_wwd.py index 193c99f..5474745 100644 --- a/stackchan_server/server_wwd.py +++ b/stackchan_server/server_wwd.py @@ -24,7 +24,6 @@ def __init__( set_current_state: Callable[[int], None], close_websocket: Callable[[int, str], Awaitable[None]], current_state: Callable[[], int], - has_server_wake_word: Callable[[], bool], is_closed: Callable[[], bool], on_detected: Callable[[], None], has_pending_wakeword: Callable[[], bool], @@ -35,7 +34,6 @@ def __init__( self._set_current_state = set_current_state self._close_websocket = close_websocket self._current_state = current_state - self._has_server_wake_word = has_server_wake_word self._is_closed = is_closed self._on_detected = on_detected self._has_pending_wakeword = has_pending_wakeword @@ -59,13 +57,11 @@ def auto_start_enabled(self) -> bool: async def enable_auto_detection(self) -> None: self._auto_start = True - await self.start_if_available() async def start_if_available(self) -> bool: if ( self._is_closed() or self._detector is None - or not self._has_server_wake_word() or 
self._current_state() != self._idle_state ): return False diff --git a/stackchan_server/ws_proxy.py b/stackchan_server/ws_proxy.py index bdcd823..c098f81 100644 --- a/stackchan_server/ws_proxy.py +++ b/stackchan_server/ws_proxy.py @@ -140,7 +140,6 @@ def __init__( ), close_websocket=self.ws.close, current_state=lambda: int(self._current_firmware_state), - has_server_wake_word=lambda: self.server_metadata.has_server_wake_word, is_closed=lambda: self._closed, on_detected=self._wakeword_event.set, has_pending_wakeword=self._wakeword_event.is_set, @@ -262,6 +261,8 @@ async def start_talking(self, text: str) -> None: async def enable_auto_server_wakeword_detection(self) -> None: await self._server_wwd.enable_auto_detection() + if self.firmware_metadata is not None: + await self._server_wwd.start_if_available() async def _receive_loop(self) -> None: try: From a29aad296c74de3cb490f85863340bf432af8d76 Mon Sep 17 00:00:00 2001 From: Atsushi Morimoto <74th.tech@gmail.com> Date: Sat, 9 May 2026 17:20:35 +0900 Subject: [PATCH 12/15] feat: Remove has_pending_wakeword parameter and add suppress_restart logic in ServerWwdController --- stackchan_server/server_wwd.py | 12 ++++++++---- stackchan_server/ws_proxy.py | 1 - 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/stackchan_server/server_wwd.py b/stackchan_server/server_wwd.py index 5474745..43bb73e 100644 --- a/stackchan_server/server_wwd.py +++ b/stackchan_server/server_wwd.py @@ -26,7 +26,6 @@ def __init__( current_state: Callable[[], int], is_closed: Callable[[], bool], on_detected: Callable[[], None], - has_pending_wakeword: Callable[[], bool], server_wwd_state: int, idle_state: int, ) -> None: @@ -36,7 +35,6 @@ def __init__( self._current_state = current_state self._is_closed = is_closed self._on_detected = on_detected - self._has_pending_wakeword = has_pending_wakeword self._server_wwd_state = server_wwd_state self._idle_state = idle_state @@ -44,6 +42,7 @@ def __init__( self._task: 
Optional[asyncio.Task[bool]] = None self._restart_task: Optional[asyncio.Task[None]] = None self._auto_start = False + self._suppress_restart_once = False self._drain_trailing_pcm_until_end = False self._drain_trailing_pcm_deadline: float | None = None @@ -76,12 +75,15 @@ async def start_if_available(self) -> bool: ) return True - async def stop(self) -> None: + async def stop(self, *, suppress_restart: bool = True) -> None: self._cancel_restart_task() task = self._task if task is None: return + if suppress_restart and not task.done(): + self._suppress_restart_once = True + if task.done(): self._task = None try: @@ -213,10 +215,12 @@ async def _run_detection(self) -> bool: logger.exception( "Failed to return firmware to idle after wake-word detection" ) + suppress_restart = self._suppress_restart_once + self._suppress_restart_once = False should_restart = ( self._auto_start and not detected - and not self._has_pending_wakeword() + and not suppress_restart and not self._is_closed() ) if self._task is asyncio.current_task(): diff --git a/stackchan_server/ws_proxy.py b/stackchan_server/ws_proxy.py index c098f81..1c414cc 100644 --- a/stackchan_server/ws_proxy.py +++ b/stackchan_server/ws_proxy.py @@ -142,7 +142,6 @@ def __init__( current_state=lambda: int(self._current_firmware_state), is_closed=lambda: self._closed, on_detected=self._wakeword_event.set, - has_pending_wakeword=self._wakeword_event.is_set, server_wwd_state=int(FirmwareState.SERVER_WWD), idle_state=int(FirmwareState.IDLE), ) From 8f7e40e87bd18714037ab323dcb82bd72d5b8d4b Mon Sep 17 00:00:00 2001 From: Atsushi Morimoto <74th.tech@gmail.com> Date: Sat, 9 May 2026 17:24:06 +0900 Subject: [PATCH 13/15] feat: Refactor ServerWwdController initialization and state management methods --- stackchan_server/server_wwd.py | 27 +++++++++------------------ stackchan_server/ws_proxy.py | 17 ++++++++++------- 2 files changed, 19 insertions(+), 25 deletions(-) diff --git a/stackchan_server/server_wwd.py 
b/stackchan_server/server_wwd.py index 43bb73e..b249857 100644 --- a/stackchan_server/server_wwd.py +++ b/stackchan_server/server_wwd.py @@ -20,23 +20,19 @@ class ServerWwdController: def __init__( self, *, - send_state_command: Callable[[int], Awaitable[None]], - set_current_state: Callable[[int], None], + enter_server_wwd: Callable[[], Awaitable[None]], + return_to_idle: Callable[[], Awaitable[None]], close_websocket: Callable[[int, str], Awaitable[None]], - current_state: Callable[[], int], + is_idle_state: Callable[[], bool], is_closed: Callable[[], bool], on_detected: Callable[[], None], - server_wwd_state: int, - idle_state: int, ) -> None: - self._send_state_command = send_state_command - self._set_current_state = set_current_state + self._enter_server_wwd = enter_server_wwd + self._return_to_idle = return_to_idle self._close_websocket = close_websocket - self._current_state = current_state + self._is_idle_state = is_idle_state self._is_closed = is_closed self._on_detected = on_detected - self._server_wwd_state = server_wwd_state - self._idle_state = idle_state self._detector = create_server_side_wake_word_detector() self._task: Optional[asyncio.Task[bool]] = None @@ -58,11 +54,7 @@ async def enable_auto_detection(self) -> None: self._auto_start = True async def start_if_available(self) -> bool: - if ( - self._is_closed() - or self._detector is None - or self._current_state() != self._idle_state - ): + if self._is_closed() or self._detector is None or not self._is_idle_state(): return False if self._task is not None and not self._task.done(): @@ -188,7 +180,7 @@ async def _run_detection(self) -> bool: should_restart = False try: await detector.start() - await self._send_state_command(self._server_wwd_state) + await self._enter_server_wwd() detected = await detector.wait_result() if detected: self._on_detected() @@ -208,9 +200,8 @@ async def _run_detection(self) -> bool: await detector.stop() self._arm_trailing_pcm_drain() if not self._is_closed(): - 
self._set_current_state(self._idle_state) try: - await self._send_state_command(self._idle_state) + await self._return_to_idle() except Exception: logger.exception( "Failed to return firmware to idle after wake-word detection" diff --git a/stackchan_server/ws_proxy.py b/stackchan_server/ws_proxy.py index 1c414cc..2d8154e 100644 --- a/stackchan_server/ws_proxy.py +++ b/stackchan_server/ws_proxy.py @@ -134,16 +134,12 @@ def __init__( self._servo_sent_counter = 0 self._pending_servo_wait_targets: deque[int] = deque() self._server_wwd = ServerWwdController( - send_state_command=self.send_state_command, - set_current_state=lambda state: setattr( - self, "_current_firmware_state", FirmwareState(state) - ), + enter_server_wwd=self._enter_server_wwd_state, + return_to_idle=self._return_to_idle_state, close_websocket=self.ws.close, - current_state=lambda: int(self._current_firmware_state), + is_idle_state=lambda: self._current_firmware_state == FirmwareState.IDLE, is_closed=lambda: self._closed, on_detected=self._wakeword_event.set, - server_wwd_state=int(FirmwareState.SERVER_WWD), - idle_state=int(FirmwareState.IDLE), ) @property @@ -448,6 +444,13 @@ async def _send_state_command( ) ) + async def _enter_server_wwd_state(self) -> None: + await self.send_state_command(FirmwareState.SERVER_WWD) + + async def _return_to_idle_state(self) -> None: + self._current_firmware_state = FirmwareState.IDLE + await self.send_state_command(FirmwareState.IDLE) + async def _send_ws_bytes(self, data: bytes) -> None: try: await self.ws.send_bytes(data) From d0ad278c22d663c5cd6801dcc07d4dc66009ec00 Mon Sep 17 00:00:00 2001 From: Atsushi Morimoto <74th.tech@gmail.com> Date: Sun, 10 May 2026 16:30:08 +0900 Subject: [PATCH 14/15] feat: Add workflows for version bump check and release management --- .../check-version-bump-for-main-pr.yml | 75 +++++++++ .github/workflows/release-on-main.yml | 145 ++++++++++++++++++ 2 files changed, 220 insertions(+) create mode 100644 
.github/workflows/check-version-bump-for-main-pr.yml create mode 100644 .github/workflows/release-on-main.yml diff --git a/.github/workflows/check-version-bump-for-main-pr.yml b/.github/workflows/check-version-bump-for-main-pr.yml new file mode 100644 index 0000000..71ef4a8 --- /dev/null +++ b/.github/workflows/check-version-bump-for-main-pr.yml @@ -0,0 +1,75 @@ +name: Check Version Bump For Main Or Test PR + +on: + pull_request: + branches: + - main + - test + types: + - opened + - synchronize + - reopened + - edited + - ready_for_review + +permissions: + contents: read + +jobs: + check-version-bump: + if: github.head_ref == 'develop' + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.13" + + - name: Compare pyproject.toml versions + env: + BASE_REF: ${{ github.base_ref }} + HEAD_REF: ${{ github.head_ref }} + run: | + set -euo pipefail + git fetch origin "${BASE_REF}" --depth=1 + python - <<'PY' + import os + import subprocess + import tomllib + from pathlib import Path + + def read_version(pyproject_text: str) -> str: + data = tomllib.loads(pyproject_text) + version = data.get("project", {}).get("version") + if not version: + raise SystemExit("project.version not found in pyproject.toml") + return version + + base_ref = os.environ["BASE_REF"] + head_ref = os.environ["HEAD_REF"] + base_pyproject = subprocess.check_output( + ["git", "show", f"origin/{base_ref}:pyproject.toml"], + text=True, + ) + head_pyproject = Path("pyproject.toml").read_text() + + base_version = read_version(base_pyproject) + head_version = read_version(head_pyproject) + + print(f"Base branch version ({base_ref}): {base_version}") + print(f"PR branch version ({head_ref}): {head_version}") + + if base_version == head_version: + print( + f"::error file=pyproject.toml,title=Version bump required::" + f"develop -> {base_ref} PR must update [project].version 
in pyproject.toml. " + f"{base_ref} is {base_version} and this PR is still {head_version}." + ) + raise SystemExit(1) + PY diff --git a/.github/workflows/release-on-main.yml b/.github/workflows/release-on-main.yml new file mode 100644 index 0000000..ed6cf0a --- /dev/null +++ b/.github/workflows/release-on-main.yml @@ -0,0 +1,145 @@ +name: Release On Main Or Test + +on: + push: + branches: + - main + - test + +permissions: + contents: write + pull-requests: read + +concurrency: + group: release-${{ github.ref }} + cancel-in-progress: false + +jobs: + release: + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.13" + + - name: Read release version from pyproject.toml + id: version + run: | + python - <<'PY' + import os + import tomllib + from pathlib import Path + + data = tomllib.loads(Path("pyproject.toml").read_text()) + version = data["project"]["version"] + + with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as fh: + print(f"tag={version}", file=fh) + + print(f"Release version: {version}") + PY + + - name: Find previous version tag + id: previous + shell: bash + run: | + set -euo pipefail + current_tag="${{ steps.version.outputs.tag }}" + previous_tag="$(git tag --list '[0-9]*' --sort=-version:refname | grep -Fxv "${current_tag}" | head -n 1 || true)" + echo "previous_tag=${previous_tag}" >> "$GITHUB_OUTPUT" + if [ -n "${previous_tag}" ]; then + echo "Previous tag: ${previous_tag}" + else + echo "Previous tag: none" + fi + + - name: Create tag if needed + uses: actions/github-script@v7 + env: + TAG: ${{ steps.version.outputs.tag }} + TARGET_SHA: ${{ github.sha }} + with: + script: | + const { owner, repo } = context.repo; + const tag = process.env.TAG; + const targetSha = process.env.TARGET_SHA; + + try { + await github.rest.git.getRef({ + owner, + repo, + ref: `tags/${tag}`, + }); + core.notice(`Tag ${tag} 
already exists.`); + } catch (error) { + if (error.status !== 404) { + throw error; + } + + await github.rest.git.createRef({ + owner, + repo, + ref: `refs/tags/${tag}`, + sha: targetSha, + }); + core.notice(`Created tag ${tag} at ${targetSha}.`); + } + + - name: Create GitHub Release if needed + uses: actions/github-script@v7 + env: + TAG: ${{ steps.version.outputs.tag }} + PREVIOUS_TAG: ${{ steps.previous.outputs.previous_tag }} + TARGET_SHA: ${{ github.sha }} + with: + script: | + const { owner, repo } = context.repo; + const tag = process.env.TAG; + const previousTag = process.env.PREVIOUS_TAG; + const targetSha = process.env.TARGET_SHA; + + try { + const existing = await github.rest.repos.getReleaseByTag({ + owner, + repo, + tag, + }); + core.notice(`Release for ${tag} already exists: ${existing.data.html_url}`); + return; + } catch (error) { + if (error.status !== 404) { + throw error; + } + } + + const notes = await github.request( + "POST /repos/{owner}/{repo}/releases/generate-notes", + { + owner, + repo, + tag_name: tag, + target_commitish: targetSha, + ...(previousTag ? 
{ previous_tag_name: previousTag } : {}), + }, + ); + + const release = await github.rest.repos.createRelease({ + owner, + repo, + tag_name: tag, + target_commitish: targetSha, + name: tag, + body: notes.data.body, + draft: false, + prerelease: false, + generate_release_notes: false, + }); + + core.notice(`Created release ${release.data.html_url}`); From da6e0a7534d5a0f8a652c5652479ec7a7db98fb5 Mon Sep 17 00:00:00 2001 From: Atsushi Morimoto <74th.tech@gmail.com> Date: Sun, 10 May 2026 17:04:35 +0900 Subject: [PATCH 15/15] bump version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 90638f2..b6d4e0f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "websocket-control-stackchan-server" -version = "0.1.0" +version = "0.2.0" description = "A WebSocket control interface for StackChan AI agent" readme = "README.md" requires-python = ">=3.13"