From ac83a0a164ecdfe633cfa74a6ccd905a362d9443 Mon Sep 17 00:00:00 2001
From: Atsushi Morimoto <74th.tech@gmail.com>
Date: Sat, 9 May 2026 14:27:18 +0900
Subject: [PATCH 01/15] feat: ServerSideWWD

---
 docs/rest_api_ja.md                           |  58 ++++
 docs/websocket_protocols_ja.md                |  40 ++-
 firmware/include/listening.hpp                |  16 +
 firmware/include/metadata.hpp                 |   1 +
 .../generated_protobuf/websocket-message.pb.c |   2 +
 .../generated_protobuf/websocket-message.pb.h |  22 +-
 firmware/src/display.cpp                      |   6 +
 firmware/src/listening.cpp                    |  48 ++-
 firmware/src/main.cpp                         |  34 +-
 firmware/src/metadata.cpp                     |   5 +
 protobuf/websocket-message.proto              |   7 +
 stackchan_server/app.py                       |  26 ++
 .../websocket_message_pb2.py                  |  64 ++--
 stackchan_server/protobuf_ws.py               |  17 +-
 .../wakeup_word_detection/__init__.py         |  13 +
 .../wakeup_word_detection/create.py           |  28 ++
 .../wakeup_word_detection/server_side.py      | 205 ++++++++++++
 stackchan_server/ws_proxy.py                  | 312 ++++++++++++++++--
 18 files changed, 837 insertions(+), 67 deletions(-)
 create mode 100644 stackchan_server/wakeup_word_detection/__init__.py
 create mode 100644 stackchan_server/wakeup_word_detection/create.py
 create mode 100644 stackchan_server/wakeup_word_detection/server_side.py

diff --git a/docs/rest_api_ja.md b/docs/rest_api_ja.md
index 86c2d2f..a3b41e1 100644
--- a/docs/rest_api_ja.md
+++ b/docs/rest_api_ja.md
@@ -17,6 +17,7 @@
 | `GET` | `/v1/stackchan` | 接続中 StackChan 一覧 |
 | `GET` | `/v1/stackchan/{stackchan_ip}` | 指定 StackChan の状態取得 |
 | `POST` | `/v1/stackchan/{stackchan_ip}/wakeword` | 擬似 wakeword 発火 |
+| `POST` | `/v1/stackchan/{stackchan_ip}/wakeword/server-detect` | サーバーサイド wakeword 検出を要求 |
 | `POST` | `/v1/stackchan/{stackchan_ip}/speak` | 指定 StackChan に発話させる |
 
 ## `GET /health`
@@ -134,6 +135,63 @@
 - 実機側のウェイクワード検出 (`WakeWordEvt`) と同じように扱われます。
 - すでに `talk_session` 実行中でも、イベント自体は内部フラグとして立ちます。
 
+## `POST /v1/stackchan/{stackchan_ip}/wakeword/server-detect`
+
+サーバーサイドの wakeword 検出を開始します。
+
+> [!NOTE]
+> 環境変数 `USE_SERVER_SIDE_WWD_WHISPER_SERVER=1` の場合、サーバーは `@app.setup()` 完了後と `Idle` 復帰後に自動でサーバーサイド wakeword 検出を開始します。この API は明示的に現在の検出サイクルを待ちたい場合に利用できます。
+
+- サーバーは対象 StackChan に `StateCmd(Listening, WAKE_WORD)` を送ってマイク音声を受信し、
+  直近 3 秒窓を 0.5 秒ごとに認識します。
+- 各判定タイミングの認識結果テキストは、すべてログ出力されます。
+- キーワードが検出されたら内部 wakeword イベントを発火し、`talk_session` 開始待ちを解除します。
+- 検出の終了時には `StateCmd(Idle)` を送って待機に戻します。
+- 実機の表示状態は `Listening` へは変わらず、`Idle(Server-WWD)` のまま待ち受けます。
+- このモードでは無音 3 秒によるクライアント側自動終了は行いません。
+
+### パスパラメータ
+
+| 名前 | 型 | 説明 |
+| --- | --- | --- |
+| `stackchan_ip` | `string` | 対象 StackChan の接続元 IP |
+
+### クエリパラメータ
+
+| 名前 | 型 | 必須 | 説明 |
+| --- | --- | --- | --- |
+| `timeout_seconds` | `number` | 任意 | 検出待ちタイムアウト秒。未指定時はサーバー設定値 |
+
+### 成功レスポンス
+
+- Status: `200 OK`
+
+```json
+{
+  "detected": true
+}
+```
+
+`detected` が `false` の場合は、キーワードが検出されないまま検出セッションが終了したことを示します。
+
+### エラーレスポンス
+
+- Status: `404 Not Found`
+
+```json
+{
+  "detail": "stackchan not connected"
+}
+```
+
+- Status: `409 Conflict`
+
+```json
+{
+  "detail": "Server-side wake-word detection is not available for this connection"
+}
+```
+
 ## `POST /v1/stackchan/{stackchan_ip}/speak`
 
 指定した StackChan にテキストを発話させます。
diff --git a/docs/websocket_protocols_ja.md b/docs/websocket_protocols_ja.md
index b816c13..d5a21aa 100644
--- a/docs/websocket_protocols_ja.md
+++ b/docs/websocket_protocols_ja.md
@@ -35,6 +35,8 @@
 | `SpeakDoneEvt` | CoreS3 → Server | 音声再生完了通知 |
 | `ServoCmd` | Server → CoreS3 | サーボ動作シーケンス指示 |
 | `ServoDoneEvt` | CoreS3 → Server | サーボ動作完了通知 |
+| `FirmwareMetadata` | CoreS3 → Server | クライアント能力通知 |
+| `ServerMetadata` | Server → CoreS3 | サーバー能力通知 |
 
 ### `MessageType` 一覧
 
@@ -89,7 +91,7 @@
 
 - 方向: Server → CoreS3
 - `messageType`: `DATA` のみ
-- body: `StateCommand { state }`
+- body: `StateCommand { state, listening_purpose }`
 
 利用する状態名:
 
@@ -98,12 +100,21 @@
 - `Thinking`
 - `Speaking`
 
+`listening_purpose` の値:
+
+- `SPEECH`: 通常の会話入力
+- `WAKE_WORD`: サーバーサイド wakeword 検出用の uplink
+
 ### 現行実装メモ
 
-- `proxy.listen()` 開始時に Server が `Listening` を指示します。
+- `proxy.listen()` 開始時に Server が `StateCmd(Listening, SPEECH)` を指示します。
+- サーバーサイド wakeword 検出開始時は `StateCmd(Listening, WAKE_WORD)` を指示します。
 - 音声 uplink の `END` を受けると、Server は `Thinking` を指示します。
 - `proxy.speak()` 完了後、Server は `Idle` を指示します。
 
+> [!NOTE]
+> `WAKE_WORD` の場合、CoreS3 は内部的にマイク uplink を開始しますが、状態表示は `Listening` に遷移せず `Idle(Server-WWD)` のままです。また無音 3 秒による自動終了も行いません。
+
 ## ウェイクワード検出 `WakeWordEvt`
 
 - 方向: CoreS3 → Server
@@ -112,6 +123,31 @@
 - `Idle` 中のウェイクワード検出をサーバー側に通知します。
 - REST API の `POST /v1/stackchan/{ip}/wakeword` は、このイベントをサーバー内部で擬似発火させます。
 
+## メタデータ交換 `FirmwareMetadata` / `ServerMetadata`
+
+WebSocket 接続後、能力情報を相互交換します。
+
+- CoreS3 → Server: `FirmwareMetadata`
+  - `has_device_wake_word`: クライアント側 wakeword 対応有無
+  - そのほか `device_type`, `display_width`, `display_height`, `has_led`, `servo_type`, `supports_audio_duplex`, `firmware_version`
+- Server → CoreS3: `ServerMetadata`
+  - `has_server_wake_word`: サーバー側 wakeword 対応有無
+  - `server_version`
+
+CoreS3 側は `has_server_wake_word=true` を受けると、デバイス側 wakeword を使わずにサーバー側検出モードで待機します（表示は `Idle(Server-WWD)`）。
+
+## サーバーサイド wakeword 検出フロー
+
+- 環境変数 `USE_SERVER_SIDE_WWD_WHISPER_SERVER=1` の場合、サーバーは `@app.setup()` 完了後と `Idle` 復帰後に自動でサーバーサイド wakeword 検出を開始します。
+- REST API `POST /v1/stackchan/{ip}/wakeword/server-detect` を呼ぶと、
+  サーバーは `StateCmd(Listening, WAKE_WORD)` を送信してマイク uplink を受信します。
+- 受信した音声の直近 3 秒窓を 0.5 秒ごとに音声認識へ渡し、
+  定義キーワード（例: `スタクチャン`）を含むか判定します。
+- 各判定タイミングの認識結果はすべてログ出力されます。
+- キーワード検出時は内部 wakeword イベントを発火し、通常の `talk_session` フローに進みます。
+- 検出完了時（検出/未検出を問わず）は `StateCmd(Idle)` で待機状態に戻します。
+- この間、CoreS3 の画面表示は `Listening` ではなく `Idle(Server-WWD)` を維持します。
+
 ## 状態通知 `StateEvt`
 
 - 方向: CoreS3 → Server
diff --git a/firmware/include/listening.hpp b/firmware/include/listening.hpp
index 0e18ba8..0a89b25 100644
--- a/firmware/include/listening.hpp
+++ b/firmware/include/listening.hpp
@@ -10,6 +10,12 @@
 class Listening
 {
 public:
+  enum class SessionMode
+  {
+    Speech,
+    WakeWord,
+  };
+
   Listening(WebSocketsClient &ws, StateMachine &sm, int sampleRate);
 
   // allocate buffers / reset counters; call once from setup
@@ -19,6 +25,10 @@ class Listening
   void begin();
   void end();
 
+  // Idle(Server-WWD) のままマイク uplink を開始/終了する
+  bool beginWakeWordStreaming();
+  void endWakeWordStreaming();
+
   // begin a new streaming session (sends START); returns false if WS not connected
   bool startStreaming();
 
@@ -34,7 +44,11 @@ class Listening
   // 無音が所定時間続いているか判定
   bool shouldStopForSilence() const;
 
+  bool isWakeWordStreaming() const { return streaming_ && session_mode_ == SessionMode::WakeWord; }
+
 private:
+  bool beginStreamingSession(SessionMode mode, bool auto_stop_for_silence);
+  void stopMicrophoneOnly();
   void updateLevelStats(const int16_t *samples, size_t sampleCount);
   bool sendPacket(stackchan_websocket_v1_MessageType type, const int16_t *samples, size_t sampleCount);
   void ringPush(const int16_t *src, size_t samples);
@@ -56,6 +70,8 @@ class Listening
   uint32_t seq_counter_ = 0;
   bool streaming_ = false;
   bool events_registered_ = false;
+  SessionMode session_mode_ = SessionMode::Speech;
+  bool auto_stop_for_silence_ = true;
 
   // 無音判定関連
   int32_t last_level_ = 0;
diff --git a/firmware/include/metadata.hpp b/firmware/include/metadata.hpp
index f490abd..c97b4eb 100644
--- a/firmware/include/metadata.hpp
+++ b/firmware/include/metadata.hpp
@@ -29,6 +29,7 @@ extern ServerMetadataState g_server_metadata;
 void initializeFirmwareMetadata();
 void resetServerMetadata();
 bool shouldUseDeviceWakeWord();
+bool shouldUseServerWakeWord();
 void setFirmwareMetadataMessage(
     stackchan_websocket_v1_WebSocketMessage &message,
     uint32_t seq);
diff --git a/firmware/lib/generated_protobuf/websocket-message.pb.c b/firmware/lib/generated_protobuf/websocket-message.pb.c
index f70a79e..7620ee8 100644
--- a/firmware/lib/generated_protobuf/websocket-message.pb.c
+++ b/firmware/lib/generated_protobuf/websocket-message.pb.c
@@ -64,3 +64,5 @@ PB_BIND(stackchan_websocket_v1_ServerMetadata, stackchan_websocket_v1_ServerMeta
 
 
 
+
+
diff --git a/firmware/lib/generated_protobuf/websocket-message.pb.h b/firmware/lib/generated_protobuf/websocket-message.pb.h
index 8e0c222..ffd1d31 100644
--- a/firmware/lib/generated_protobuf/websocket-message.pb.h
+++ b/firmware/lib/generated_protobuf/websocket-message.pb.h
@@ -38,6 +38,12 @@ typedef enum _stackchan_websocket_v1_StackchanState {
     stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING = 3
 } stackchan_websocket_v1_StackchanState;
 
+typedef enum _stackchan_websocket_v1_ListeningPurpose {
+    stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_UNSPECIFIED = 0,
+    stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_SPEECH = 1,
+    stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_WAKE_WORD = 2
+} stackchan_websocket_v1_ListeningPurpose;
+
 typedef enum _stackchan_websocket_v1_ServoOperation {
     stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_SLEEP = 0,
     stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_X = 1,
@@ -83,6 +89,7 @@ typedef struct _stackchan_websocket_v1_AudioChunk {
 
 typedef struct _stackchan_websocket_v1_StateCommand {
     stackchan_websocket_v1_StackchanState state;
+    stackchan_websocket_v1_ListeningPurpose listening_purpose;
 } stackchan_websocket_v1_StateCommand;
 
 typedef struct _stackchan_websocket_v1_WakeWordEvent {
@@ -176,6 +183,10 @@ extern "C" {
 #define _stackchan_websocket_v1_StackchanState_MAX stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING
 #define _stackchan_websocket_v1_StackchanState_ARRAYSIZE ((stackchan_websocket_v1_StackchanState)(stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING+1))
 
+#define _stackchan_websocket_v1_ListeningPurpose_MIN stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_UNSPECIFIED
+#define _stackchan_websocket_v1_ListeningPurpose_MAX stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_WAKE_WORD
+#define _stackchan_websocket_v1_ListeningPurpose_ARRAYSIZE ((stackchan_websocket_v1_ListeningPurpose)(stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_WAKE_WORD+1))
+
 #define _stackchan_websocket_v1_ServoOperation_MIN stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_SLEEP
 #define _stackchan_websocket_v1_ServoOperation_MAX stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_Y
 #define _stackchan_websocket_v1_ServoOperation_ARRAYSIZE ((stackchan_websocket_v1_ServoOperation)(stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_Y+1))
@@ -197,6 +208,7 @@ extern "C" {
 
 
 #define stackchan_websocket_v1_StateCommand_state_ENUMTYPE stackchan_websocket_v1_StackchanState
+#define stackchan_websocket_v1_StateCommand_listening_purpose_ENUMTYPE stackchan_websocket_v1_ListeningPurpose
 
 
 #define stackchan_websocket_v1_StateEvent_state_ENUMTYPE stackchan_websocket_v1_StackchanState
@@ -218,7 +230,7 @@ extern "C" {
 #define stackchan_websocket_v1_AudioWavStart_init_default {0, 0}
 #define stackchan_websocket_v1_AudioWavEnd_init_default {0}
 #define stackchan_websocket_v1_AudioChunk_init_default {{0, {0}}}
-#define stackchan_websocket_v1_StateCommand_init_default {_stackchan_websocket_v1_StackchanState_MIN}
+#define stackchan_websocket_v1_StateCommand_init_default {_stackchan_websocket_v1_StackchanState_MIN, _stackchan_websocket_v1_ListeningPurpose_MIN}
 #define stackchan_websocket_v1_WakeWordEvent_init_default {0}
 #define stackchan_websocket_v1_StateEvent_init_default {_stackchan_websocket_v1_StackchanState_MIN}
 #define stackchan_websocket_v1_SpeakDoneEvent_init_default {0}
@@ -233,7 +245,7 @@ extern "C" {
 #define stackchan_websocket_v1_AudioWavStart_init_zero {0, 0}
 #define stackchan_websocket_v1_AudioWavEnd_init_zero {0}
 #define stackchan_websocket_v1_AudioChunk_init_zero {{0, {0}}}
-#define stackchan_websocket_v1_StateCommand_init_zero {_stackchan_websocket_v1_StackchanState_MIN}
+#define stackchan_websocket_v1_StateCommand_init_zero {_stackchan_websocket_v1_StackchanState_MIN, _stackchan_websocket_v1_ListeningPurpose_MIN}
 #define stackchan_websocket_v1_WakeWordEvent_init_zero {0}
 #define stackchan_websocket_v1_StateEvent_init_zero {_stackchan_websocket_v1_StackchanState_MIN}
 #define stackchan_websocket_v1_SpeakDoneEvent_init_zero {0}
@@ -248,6 +260,7 @@ extern "C" {
 #define stackchan_websocket_v1_AudioWavStart_channels_tag 2
 #define stackchan_websocket_v1_AudioChunk_pcm_bytes_tag 1
 #define stackchan_websocket_v1_StateCommand_state_tag 1
+#define stackchan_websocket_v1_StateCommand_listening_purpose_tag 2
 #define stackchan_websocket_v1_WakeWordEvent_detected_tag 1
 #define stackchan_websocket_v1_StateEvent_state_tag 1
 #define stackchan_websocket_v1_SpeakDoneEvent_done_tag 1
@@ -347,7 +360,8 @@ X(a, STATIC,   SINGULAR, BYTES,    pcm_bytes,         1)
 #define stackchan_websocket_v1_AudioChunk_DEFAULT NULL
 
 #define stackchan_websocket_v1_StateCommand_FIELDLIST(X, a) \
-X(a, STATIC,   SINGULAR, UENUM,    state,             1)
+X(a, STATIC,   SINGULAR, UENUM,    state,             1) \
+X(a, STATIC,   SINGULAR, UENUM,    listening_purpose,   2)
 #define stackchan_websocket_v1_StateCommand_CALLBACK NULL
 #define stackchan_websocket_v1_StateCommand_DEFAULT NULL
 
@@ -448,7 +462,7 @@ extern const pb_msgdesc_t stackchan_websocket_v1_ServerMetadata_msg;
 #define stackchan_websocket_v1_ServoCommand_size 14
 #define stackchan_websocket_v1_ServoDoneEvent_size 2
 #define stackchan_websocket_v1_SpeakDoneEvent_size 2
-#define stackchan_websocket_v1_StateCommand_size 2
+#define stackchan_websocket_v1_StateCommand_size 4
 #define stackchan_websocket_v1_StateEvent_size   2
 #define stackchan_websocket_v1_WakeWordEvent_size 2
 #define stackchan_websocket_v1_WebSocketMessage_size 4113
diff --git a/firmware/src/display.cpp b/firmware/src/display.cpp
index a33cabf..21aaf62 100644
--- a/firmware/src/display.cpp
+++ b/firmware/src/display.cpp
@@ -4,6 +4,7 @@
 
 #include "config.h"
 #include "display.hpp"
+#include "metadata.hpp"
 
 #if USE_STACKCHAN_BSP
 #define GFXModule M5StackChan.Display()
@@ -137,6 +138,11 @@ void Display::drawForState(StateMachine::State state)
   GFXModule.setTextSize(1);
   GFXModule.setTextColor(font_color, bg_color);
   GFXModule.setCursor(isAtomS3R() ? 4 : 10, bar_y + (isAtomS3R() ? 6 : 2));
+  if (state == StateMachine::Idle && shouldUseServerWakeWord())
+  {
+    GFXModule.printf("Idle(Server-WWD)");
+    return;
+  }
   GFXModule.printf("%s", stateToString(state));
 }
 
diff --git a/firmware/src/listening.cpp b/firmware/src/listening.cpp
index edb2e35..1138d67 100644
--- a/firmware/src/listening.cpp
+++ b/firmware/src/listening.cpp
@@ -42,21 +42,45 @@ void Listening::init()
 void Listening::begin()
 {
   M5.Mic.begin();
-  startStreaming();
+  beginStreamingSession(SessionMode::Speech, true);
 }
 
 void Listening::end()
 {
   stopStreaming();
-  M5.Mic.end();
+  stopMicrophoneOnly();
+}
+
+bool Listening::beginWakeWordStreaming()
+{
+  if (streaming_)
+  {
+    return session_mode_ == SessionMode::WakeWord;
+  }
+
+  M5.Mic.begin();
+  return beginStreamingSession(SessionMode::WakeWord, false);
+}
+
+void Listening::endWakeWordStreaming()
+{
+  stopStreaming();
+  stopMicrophoneOnly();
 }
 
 bool Listening::startStreaming()
+{
+  return beginStreamingSession(SessionMode::Speech, true);
+}
+
+bool Listening::beginStreamingSession(SessionMode mode, bool auto_stop_for_silence)
 {
   ring_write_ = ring_read_ = ring_available_ = 0;
   seq_counter_ = 0;
   last_level_ = 0;
   silence_since_ms_ = 0;
+  session_mode_ = mode;
+  auto_stop_for_silence_ = auto_stop_for_silence;
   streaming_ = true;
   return sendPacket(stackchan_websocket_v1_MessageType_MESSAGE_TYPE_START, nullptr, 0);
 }
@@ -89,10 +113,19 @@ bool Listening::stopStreaming()
   }
 
   streaming_ = false;
+  session_mode_ = SessionMode::Speech;
+  auto_stop_for_silence_ = true;
   ok = sendPacket(stackchan_websocket_v1_MessageType_MESSAGE_TYPE_END, nullptr, 0) && ok;
   return ok;
 }
 
+void Listening::stopMicrophoneOnly()
+{
+  session_mode_ = SessionMode::Speech;
+  auto_stop_for_silence_ = true;
+  M5.Mic.end();
+}
+
 void Listening::loop()
 {
   if (!streaming_)
@@ -123,13 +156,20 @@ void Listening::loop()
     {
       streaming_ = false;
       log_i("WS send failed (data)");
-      state_.setState(StateMachine::Idle);
+      if (session_mode_ == SessionMode::Speech)
+      {
+        state_.setState(StateMachine::Idle);
+      }
+      else
+      {
+        stopMicrophoneOnly();
+      }
       return;
     }
   }
 
   // 無音が3秒続いたら終了
-  if (shouldStopForSilence())
+  if (auto_stop_for_silence_ && shouldStopForSilence())
   {
     log_i("Auto stop: silence detected (avg=%ld)", static_cast<long>(last_level_));
     if (!stopStreaming())
diff --git a/firmware/src/main.cpp b/firmware/src/main.cpp
index 6fb95c4..07c1da2 100644
--- a/firmware/src/main.cpp
+++ b/firmware/src/main.cpp
@@ -238,15 +238,43 @@ bool applyRemoteStateCommand(const stackchan_websocket_v1_StateCommand &command)
   switch (command.state)
   {
   case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_IDLE:
+    if (listening.isWakeWordStreaming())
+    {
+      listening.endWakeWordStreaming();
+    }
     stateMachine.setState(StateMachine::Idle);
     return true;
   case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_LISTENING:
+    if (command.listening_purpose == stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_WAKE_WORD &&
+        shouldUseServerWakeWord() &&
+        stateMachine.getState() == StateMachine::Idle)
+    {
+      if (!listening.beginWakeWordStreaming())
+      {
+        log_w("Failed to start server-side wakeword streaming");
+        return false;
+      }
+      return true;
+    }
+
+    if (listening.isWakeWordStreaming())
+    {
+      listening.endWakeWordStreaming();
+    }
     stateMachine.setState(StateMachine::Listening);
     return true;
   case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_THINKING:
+    if (listening.isWakeWordStreaming())
+    {
+      listening.endWakeWordStreaming();
+    }
     stateMachine.setState(StateMachine::Thinking);
     return true;
   case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING:
+    if (listening.isWakeWordStreaming())
+    {
+      listening.endWakeWordStreaming();
+    }
     stateMachine.setState(StateMachine::Speaking);
     return true;
   default:
@@ -542,7 +570,11 @@ void loop()
   {
   case StateMachine::Idle:
     handleTouchWakeWordInput();
-    if (shouldUseDeviceWakeWord())
+    if (listening.isWakeWordStreaming())
+    {
+      listening.loop();
+    }
+    else if (shouldUseDeviceWakeWord())
     {
       wakeUpWord.loop();
     }
diff --git a/firmware/src/metadata.cpp b/firmware/src/metadata.cpp
index 5e6cd9b..579e9a8 100644
--- a/firmware/src/metadata.cpp
+++ b/firmware/src/metadata.cpp
@@ -73,6 +73,11 @@ bool shouldUseDeviceWakeWord()
   return g_server_metadata.available && !g_server_metadata.has_server_wake_word;
 }
 
+bool shouldUseServerWakeWord()
+{
+  return g_server_metadata.available && g_server_metadata.has_server_wake_word;
+}
+
 void setFirmwareMetadataMessage(
     stackchan_websocket_v1_WebSocketMessage &message,
     uint32_t seq)
diff --git a/protobuf/websocket-message.proto b/protobuf/websocket-message.proto
index c643673..fec8bff 100644
--- a/protobuf/websocket-message.proto
+++ b/protobuf/websocket-message.proto
@@ -62,6 +62,12 @@ enum StackchanState {
   STACKCHAN_STATE_SPEAKING = 3;
 }
 
+enum ListeningPurpose {
+  LISTENING_PURPOSE_UNSPECIFIED = 0;
+  LISTENING_PURPOSE_SPEECH = 1;
+  LISTENING_PURPOSE_WAKE_WORD = 2;
+}
+
 enum ServoOperation {
   SERVO_OPERATION_SLEEP = 0;
   SERVO_OPERATION_MOVE_X = 1;
@@ -99,6 +105,7 @@ message AudioChunk {
 
 message StateCommand {
   StackchanState state = 1;
+  ListeningPurpose listening_purpose = 2;
 }
 
 message WakeWordEvent {
diff --git a/stackchan_server/app.py b/stackchan_server/app.py
index 14496d2..94cf8de 100644
--- a/stackchan_server/app.py
+++ b/stackchan_server/app.py
@@ -10,6 +10,7 @@
 from .speech_recognition import create_speech_recognizer
 from .speech_synthesis import create_speech_synthesizer
 from .types import SpeechRecognizer, SpeechSynthesizer
+from .wakeup_word_detection import WakeWordDetectionError
 from .ws_proxy import WsProxy
 
 logger = getLogger(__name__)
@@ -24,6 +25,10 @@ class SpeakRequest(BaseModel):
     text: str
 
 
+class ServerWakeWordDetectResponse(BaseModel):
+    detected: bool
+
+
 class StackChanApp:
     def __init__(
         self,
@@ -64,6 +69,25 @@ async def _trigger_wakeword(stackchan_ip: str):
                 raise HTTPException(status_code=404, detail="stackchan not connected")
             proxy.trigger_wakeword()
 
+        @self.fastapi.post(
+            "/v1/stackchan/{stackchan_ip}/wakeword/server-detect",
+            response_model=ServerWakeWordDetectResponse,
+        )
+        async def _detect_server_wakeword(
+            stackchan_ip: str,
+            timeout_seconds: float | None = None,
+        ):
+            proxy = await self._get_proxy(stackchan_ip)
+            if proxy is None:
+                raise HTTPException(status_code=404, detail="stackchan not connected")
+            try:
+                detected = await proxy.request_server_wakeword_detection(
+                    timeout_seconds=timeout_seconds
+                )
+            except WakeWordDetectionError as exc:
+                raise HTTPException(status_code=409, detail=str(exc)) from exc
+            return ServerWakeWordDetectResponse(detected=detected)
+
         @self.fastapi.post("/v1/stackchan/{stackchan_ip}/speak", status_code=204)
         async def _speak(stackchan_ip: str, body: SpeakRequest):
             proxy = await self._get_proxy(stackchan_ip)
@@ -99,6 +123,8 @@ async def _handle_ws(self, websocket: WebSocket) -> None:
             if self._setup_fn:
                 await self._setup_fn(proxy)
 
+            await proxy.enable_auto_server_wakeword_detection()
+
             while not proxy.closed:
                 if not self._talk_session_fn:
                     await asyncio.sleep(0.05)
diff --git a/stackchan_server/generated_protobuf/websocket_message_pb2.py b/stackchan_server/generated_protobuf/websocket_message_pb2.py
index a7d7a4e..433f9a8 100644
--- a/stackchan_server/generated_protobuf/websocket_message_pb2.py
+++ b/stackchan_server/generated_protobuf/websocket_message_pb2.py
@@ -24,25 +24,27 @@
 
 
 
-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x17websocket-message.proto\x12\x16stackchan.websocket.v1\"\x96\x08\n\x10WebSocketMessage\x12\x31\n\x04kind\x18\x01 \x01(\x0e\x32#.stackchan.websocket.v1.MessageKind\x12\x39\n\x0cmessage_type\x18\x02 \x01(\x0e\x32#.stackchan.websocket.v1.MessageType\x12\x0b\n\x03seq\x18\x03 \x01(\r\x12@\n\x0f\x61udio_pcm_start\x18\n \x01(\x0b\x32%.stackchan.websocket.v1.AudioPcmStartH\x00\x12<\n\x0e\x61udio_pcm_data\x18\x0b \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_pcm_end\x18\x0c \x01(\x0b\x32#.stackchan.websocket.v1.AudioPcmEndH\x00\x12@\n\x0f\x61udio_wav_start\x18\x14 \x01(\x0b\x32%.stackchan.websocket.v1.AudioWavStartH\x00\x12<\n\x0e\x61udio_wav_data\x18\x15 \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_wav_end\x18\x16 \x01(\x0b\x32#.stackchan.websocket.v1.AudioWavEndH\x00\x12\x39\n\tstate_cmd\x18\x1e \x01(\x0b\x32$.stackchan.websocket.v1.StateCommandH\x00\x12>\n\rwake_word_evt\x18\x1f \x01(\x0b\x32%.stackchan.websocket.v1.WakeWordEventH\x00\x12\x37\n\tstate_evt\x18  \x01(\x0b\x32\".stackchan.websocket.v1.StateEventH\x00\x12@\n\x0espeak_done_evt\x18! 
\x01(\x0b\x32&.stackchan.websocket.v1.SpeakDoneEventH\x00\x12\x41\n\tservo_cmd\x18\" \x01(\x0b\x32,.stackchan.websocket.v1.ServoCommandSequenceH\x00\x12@\n\x0eservo_done_evt\x18# \x01(\x0b\x32&.stackchan.websocket.v1.ServoDoneEventH\x00\x12\x45\n\x11\x66irmware_metadata\x18$ \x01(\x0b\x32(.stackchan.websocket.v1.FirmwareMetadataH\x00\x12\x41\n\x0fserver_metadata\x18% \x01(\x0b\x32&.stackchan.websocket.v1.ServerMetadataH\x00\x42\x06\n\x04\x62ody\"\x0f\n\rAudioPcmStart\"\r\n\x0b\x41udioPcmEnd\"6\n\rAudioWavStart\x12\x13\n\x0bsample_rate\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\"\r\n\x0b\x41udioWavEnd\"\x1f\n\nAudioChunk\x12\x11\n\tpcm_bytes\x18\x01 \x01(\x0c\"E\n\x0cStateCommand\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\"!\n\rWakeWordEvent\x12\x10\n\x08\x64\x65tected\x18\x01 \x01(\x08\"C\n\nStateEvent\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\"\x1e\n\x0eSpeakDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"N\n\x14ServoCommandSequence\x12\x36\n\x08\x63ommands\x18\x01 \x03(\x0b\x32$.stackchan.websocket.v1.ServoCommand\"f\n\x0cServoCommand\x12\x32\n\x02op\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.ServoOperation\x12\r\n\x05\x61ngle\x18\x02 \x01(\x11\x12\x13\n\x0b\x64uration_ms\x18\x03 \x01(\x11\"\x1e\n\x0eServoDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"\x99\x02\n\x10\x46irmwareMetadata\x12\x37\n\x0b\x64\x65vice_type\x18\x01 \x01(\x0e\x32\".stackchan.websocket.v1.DeviceType\x12\x15\n\rdisplay_width\x18\x02 \x01(\r\x12\x16\n\x0e\x64isplay_height\x18\x03 \x01(\r\x12\x1c\n\x14has_device_wake_word\x18\x04 \x01(\x08\x12\x0f\n\x07has_led\x18\x05 \x01(\x08\x12\x35\n\nservo_type\x18\x06 \x01(\x0e\x32!.stackchan.websocket.v1.ServoType\x12\x1d\n\x15supports_audio_duplex\x18\x07 \x01(\x08\x12\x18\n\x10\x66irmware_version\x18\x08 \x01(\t\"F\n\x0eServerMetadata\x12\x1c\n\x14has_server_wake_word\x18\x01 \x01(\x08\x12\x16\n\x0eserver_version\x18\x02 
\x01(\t*\xdf\x02\n\x0bMessageKind\x12\x1c\n\x18MESSAGE_KIND_UNSPECIFIED\x10\x00\x12\x1a\n\x16MESSAGE_KIND_AUDIO_PCM\x10\x01\x12\x1a\n\x16MESSAGE_KIND_AUDIO_WAV\x10\x02\x12\x1a\n\x16MESSAGE_KIND_STATE_CMD\x10\x03\x12\x1e\n\x1aMESSAGE_KIND_WAKE_WORD_EVT\x10\x04\x12\x1a\n\x16MESSAGE_KIND_STATE_EVT\x10\x05\x12\x1f\n\x1bMESSAGE_KIND_SPEAK_DONE_EVT\x10\x06\x12\x1a\n\x16MESSAGE_KIND_SERVO_CMD\x10\x07\x12\x1f\n\x1bMESSAGE_KIND_SERVO_DONE_EVT\x10\x08\x12\"\n\x1eMESSAGE_KIND_FIRMWARE_METADATA\x10\t\x12 \n\x1cMESSAGE_KIND_SERVER_METADATA\x10\n*p\n\x0bMessageType\x12\x1c\n\x18MESSAGE_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12MESSAGE_TYPE_START\x10\x01\x12\x15\n\x11MESSAGE_TYPE_DATA\x10\x02\x12\x14\n\x10MESSAGE_TYPE_END\x10\x03*\x85\x01\n\x0eStackchanState\x12\x18\n\x14STACKCHAN_STATE_IDLE\x10\x00\x12\x1d\n\x19STACKCHAN_STATE_LISTENING\x10\x01\x12\x1c\n\x18STACKCHAN_STATE_THINKING\x10\x02\x12\x1c\n\x18STACKCHAN_STATE_SPEAKING\x10\x03*c\n\x0eServoOperation\x12\x19\n\x15SERVO_OPERATION_SLEEP\x10\x00\x12\x1a\n\x16SERVO_OPERATION_MOVE_X\x10\x01\x12\x1a\n\x16SERVO_OPERATION_MOVE_Y\x10\x02*\x85\x01\n\nDeviceType\x12\x1b\n\x17\x44\x45VICE_TYPE_UNSPECIFIED\x10\x00\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5STACK_CORES3\x10\x01\x12\x1a\n\x16\x44\x45VICE_TYPE_M5ATOM_S3R\x10\x02\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5ATOM_ECHOS3R\x10\x03*i\n\tServoType\x12\x1a\n\x16SERVO_TYPE_UNSPECIFIED\x10\x00\x12\x13\n\x0fSERVO_TYPE_NONE\x10\x01\x12\x13\n\x0fSERVO_TYPE_SG90\x10\x02\x12\x16\n\x12SERVO_TYPE_SCS0009\x10\x03\x62\x06proto3')
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x17websocket-message.proto\x12\x16stackchan.websocket.v1\"\x96\x08\n\x10WebSocketMessage\x12\x31\n\x04kind\x18\x01 \x01(\x0e\x32#.stackchan.websocket.v1.MessageKind\x12\x39\n\x0cmessage_type\x18\x02 \x01(\x0e\x32#.stackchan.websocket.v1.MessageType\x12\x0b\n\x03seq\x18\x03 \x01(\r\x12@\n\x0f\x61udio_pcm_start\x18\n \x01(\x0b\x32%.stackchan.websocket.v1.AudioPcmStartH\x00\x12<\n\x0e\x61udio_pcm_data\x18\x0b \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_pcm_end\x18\x0c \x01(\x0b\x32#.stackchan.websocket.v1.AudioPcmEndH\x00\x12@\n\x0f\x61udio_wav_start\x18\x14 \x01(\x0b\x32%.stackchan.websocket.v1.AudioWavStartH\x00\x12<\n\x0e\x61udio_wav_data\x18\x15 \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_wav_end\x18\x16 \x01(\x0b\x32#.stackchan.websocket.v1.AudioWavEndH\x00\x12\x39\n\tstate_cmd\x18\x1e \x01(\x0b\x32$.stackchan.websocket.v1.StateCommandH\x00\x12>\n\rwake_word_evt\x18\x1f \x01(\x0b\x32%.stackchan.websocket.v1.WakeWordEventH\x00\x12\x37\n\tstate_evt\x18  \x01(\x0b\x32\".stackchan.websocket.v1.StateEventH\x00\x12@\n\x0espeak_done_evt\x18! 
\x01(\x0b\x32&.stackchan.websocket.v1.SpeakDoneEventH\x00\x12\x41\n\tservo_cmd\x18\" \x01(\x0b\x32,.stackchan.websocket.v1.ServoCommandSequenceH\x00\x12@\n\x0eservo_done_evt\x18# \x01(\x0b\x32&.stackchan.websocket.v1.ServoDoneEventH\x00\x12\x45\n\x11\x66irmware_metadata\x18$ \x01(\x0b\x32(.stackchan.websocket.v1.FirmwareMetadataH\x00\x12\x41\n\x0fserver_metadata\x18% \x01(\x0b\x32&.stackchan.websocket.v1.ServerMetadataH\x00\x42\x06\n\x04\x62ody\"\x0f\n\rAudioPcmStart\"\r\n\x0b\x41udioPcmEnd\"6\n\rAudioWavStart\x12\x13\n\x0bsample_rate\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\"\r\n\x0b\x41udioWavEnd\"\x1f\n\nAudioChunk\x12\x11\n\tpcm_bytes\x18\x01 \x01(\x0c\"\x8a\x01\n\x0cStateCommand\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\x12\x43\n\x11listening_purpose\x18\x02 \x01(\x0e\x32(.stackchan.websocket.v1.ListeningPurpose\"!\n\rWakeWordEvent\x12\x10\n\x08\x64\x65tected\x18\x01 \x01(\x08\"C\n\nStateEvent\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\"\x1e\n\x0eSpeakDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"N\n\x14ServoCommandSequence\x12\x36\n\x08\x63ommands\x18\x01 \x03(\x0b\x32$.stackchan.websocket.v1.ServoCommand\"f\n\x0cServoCommand\x12\x32\n\x02op\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.ServoOperation\x12\r\n\x05\x61ngle\x18\x02 \x01(\x11\x12\x13\n\x0b\x64uration_ms\x18\x03 \x01(\x11\"\x1e\n\x0eServoDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"\x99\x02\n\x10\x46irmwareMetadata\x12\x37\n\x0b\x64\x65vice_type\x18\x01 \x01(\x0e\x32\".stackchan.websocket.v1.DeviceType\x12\x15\n\rdisplay_width\x18\x02 \x01(\r\x12\x16\n\x0e\x64isplay_height\x18\x03 \x01(\r\x12\x1c\n\x14has_device_wake_word\x18\x04 \x01(\x08\x12\x0f\n\x07has_led\x18\x05 \x01(\x08\x12\x35\n\nservo_type\x18\x06 \x01(\x0e\x32!.stackchan.websocket.v1.ServoType\x12\x1d\n\x15supports_audio_duplex\x18\x07 \x01(\x08\x12\x18\n\x10\x66irmware_version\x18\x08 
\x01(\t\"F\n\x0eServerMetadata\x12\x1c\n\x14has_server_wake_word\x18\x01 \x01(\x08\x12\x16\n\x0eserver_version\x18\x02 \x01(\t*\xdf\x02\n\x0bMessageKind\x12\x1c\n\x18MESSAGE_KIND_UNSPECIFIED\x10\x00\x12\x1a\n\x16MESSAGE_KIND_AUDIO_PCM\x10\x01\x12\x1a\n\x16MESSAGE_KIND_AUDIO_WAV\x10\x02\x12\x1a\n\x16MESSAGE_KIND_STATE_CMD\x10\x03\x12\x1e\n\x1aMESSAGE_KIND_WAKE_WORD_EVT\x10\x04\x12\x1a\n\x16MESSAGE_KIND_STATE_EVT\x10\x05\x12\x1f\n\x1bMESSAGE_KIND_SPEAK_DONE_EVT\x10\x06\x12\x1a\n\x16MESSAGE_KIND_SERVO_CMD\x10\x07\x12\x1f\n\x1bMESSAGE_KIND_SERVO_DONE_EVT\x10\x08\x12\"\n\x1eMESSAGE_KIND_FIRMWARE_METADATA\x10\t\x12 \n\x1cMESSAGE_KIND_SERVER_METADATA\x10\n*p\n\x0bMessageType\x12\x1c\n\x18MESSAGE_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12MESSAGE_TYPE_START\x10\x01\x12\x15\n\x11MESSAGE_TYPE_DATA\x10\x02\x12\x14\n\x10MESSAGE_TYPE_END\x10\x03*\x85\x01\n\x0eStackchanState\x12\x18\n\x14STACKCHAN_STATE_IDLE\x10\x00\x12\x1d\n\x19STACKCHAN_STATE_LISTENING\x10\x01\x12\x1c\n\x18STACKCHAN_STATE_THINKING\x10\x02\x12\x1c\n\x18STACKCHAN_STATE_SPEAKING\x10\x03*t\n\x10ListeningPurpose\x12!\n\x1dLISTENING_PURPOSE_UNSPECIFIED\x10\x00\x12\x1c\n\x18LISTENING_PURPOSE_SPEECH\x10\x01\x12\x1f\n\x1bLISTENING_PURPOSE_WAKE_WORD\x10\x02*c\n\x0eServoOperation\x12\x19\n\x15SERVO_OPERATION_SLEEP\x10\x00\x12\x1a\n\x16SERVO_OPERATION_MOVE_X\x10\x01\x12\x1a\n\x16SERVO_OPERATION_MOVE_Y\x10\x02*\x85\x01\n\nDeviceType\x12\x1b\n\x17\x44\x45VICE_TYPE_UNSPECIFIED\x10\x00\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5STACK_CORES3\x10\x01\x12\x1a\n\x16\x44\x45VICE_TYPE_M5ATOM_S3R\x10\x02\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5ATOM_ECHOS3R\x10\x03*i\n\tServoType\x12\x1a\n\x16SERVO_TYPE_UNSPECIFIED\x10\x00\x12\x13\n\x0fSERVO_TYPE_NONE\x10\x01\x12\x13\n\x0fSERVO_TYPE_SG90\x10\x02\x12\x16\n\x12SERVO_TYPE_SCS0009\x10\x03\x62\x06proto3')
 
 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
 _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'websocket_message_pb2', _globals)
 if not _descriptor._USE_C_DESCRIPTORS:
   DESCRIPTOR._loaded_options = None
-  _globals['_MESSAGEKIND']._serialized_start=2016
-  _globals['_MESSAGEKIND']._serialized_end=2367
-  _globals['_MESSAGETYPE']._serialized_start=2369
-  _globals['_MESSAGETYPE']._serialized_end=2481
-  _globals['_STACKCHANSTATE']._serialized_start=2484
-  _globals['_STACKCHANSTATE']._serialized_end=2617
-  _globals['_SERVOOPERATION']._serialized_start=2619
-  _globals['_SERVOOPERATION']._serialized_end=2718
-  _globals['_DEVICETYPE']._serialized_start=2721
-  _globals['_DEVICETYPE']._serialized_end=2854
-  _globals['_SERVOTYPE']._serialized_start=2856
-  _globals['_SERVOTYPE']._serialized_end=2961
+  _globals['_MESSAGEKIND']._serialized_start=2086
+  _globals['_MESSAGEKIND']._serialized_end=2437
+  _globals['_MESSAGETYPE']._serialized_start=2439
+  _globals['_MESSAGETYPE']._serialized_end=2551
+  _globals['_STACKCHANSTATE']._serialized_start=2554
+  _globals['_STACKCHANSTATE']._serialized_end=2687
+  _globals['_LISTENINGPURPOSE']._serialized_start=2689
+  _globals['_LISTENINGPURPOSE']._serialized_end=2805
+  _globals['_SERVOOPERATION']._serialized_start=2807
+  _globals['_SERVOOPERATION']._serialized_end=2906
+  _globals['_DEVICETYPE']._serialized_start=2909
+  _globals['_DEVICETYPE']._serialized_end=3042
+  _globals['_SERVOTYPE']._serialized_start=3044
+  _globals['_SERVOTYPE']._serialized_end=3149
   _globals['_WEBSOCKETMESSAGE']._serialized_start=52
   _globals['_WEBSOCKETMESSAGE']._serialized_end=1098
   _globals['_AUDIOPCMSTART']._serialized_start=1100
@@ -55,22 +57,22 @@
   _globals['_AUDIOWAVEND']._serialized_end=1201
   _globals['_AUDIOCHUNK']._serialized_start=1203
   _globals['_AUDIOCHUNK']._serialized_end=1234
-  _globals['_STATECOMMAND']._serialized_start=1236
-  _globals['_STATECOMMAND']._serialized_end=1305
-  _globals['_WAKEWORDEVENT']._serialized_start=1307
-  _globals['_WAKEWORDEVENT']._serialized_end=1340
-  _globals['_STATEEVENT']._serialized_start=1342
-  _globals['_STATEEVENT']._serialized_end=1409
-  _globals['_SPEAKDONEEVENT']._serialized_start=1411
-  _globals['_SPEAKDONEEVENT']._serialized_end=1441
-  _globals['_SERVOCOMMANDSEQUENCE']._serialized_start=1443
-  _globals['_SERVOCOMMANDSEQUENCE']._serialized_end=1521
-  _globals['_SERVOCOMMAND']._serialized_start=1523
-  _globals['_SERVOCOMMAND']._serialized_end=1625
-  _globals['_SERVODONEEVENT']._serialized_start=1627
-  _globals['_SERVODONEEVENT']._serialized_end=1657
-  _globals['_FIRMWAREMETADATA']._serialized_start=1660
-  _globals['_FIRMWAREMETADATA']._serialized_end=1941
-  _globals['_SERVERMETADATA']._serialized_start=1943
-  _globals['_SERVERMETADATA']._serialized_end=2013
+  _globals['_STATECOMMAND']._serialized_start=1237
+  _globals['_STATECOMMAND']._serialized_end=1375
+  _globals['_WAKEWORDEVENT']._serialized_start=1377
+  _globals['_WAKEWORDEVENT']._serialized_end=1410
+  _globals['_STATEEVENT']._serialized_start=1412
+  _globals['_STATEEVENT']._serialized_end=1479
+  _globals['_SPEAKDONEEVENT']._serialized_start=1481
+  _globals['_SPEAKDONEEVENT']._serialized_end=1511
+  _globals['_SERVOCOMMANDSEQUENCE']._serialized_start=1513
+  _globals['_SERVOCOMMANDSEQUENCE']._serialized_end=1591
+  _globals['_SERVOCOMMAND']._serialized_start=1593
+  _globals['_SERVOCOMMAND']._serialized_end=1695
+  _globals['_SERVODONEEVENT']._serialized_start=1697
+  _globals['_SERVODONEEVENT']._serialized_end=1727
+  _globals['_FIRMWAREMETADATA']._serialized_start=1730
+  _globals['_FIRMWAREMETADATA']._serialized_end=2011
+  _globals['_SERVERMETADATA']._serialized_start=2013
+  _globals['_SERVERMETADATA']._serialized_end=2083
 # @@protoc_insertion_point(module_scope)
diff --git a/stackchan_server/protobuf_ws.py b/stackchan_server/protobuf_ws.py
index 8569004..443900b 100644
--- a/stackchan_server/protobuf_ws.py
+++ b/stackchan_server/protobuf_ws.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from collections.abc import Sequence
-from enum import StrEnum
+from enum import IntEnum, StrEnum
 from typing import Any, Literal, cast
 
 from .generated_protobuf import websocket_message_pb2 as _ws_pb2
@@ -15,6 +15,12 @@
 ServoCommand = ServoMoveCommand | ServoSleepCommand
 
 
+class ListeningPurpose(IntEnum):
+    UNSPECIFIED = 0  # numbers match LISTENING_PURPOSE_* in the generated protobuf descriptor
+    SPEECH = 1
+    WAKE_WORD = 2
+
+
 def _ensure_range(value: int, *, minimum: int, maximum: int, label: str) -> int:
     if not minimum <= value <= maximum:
         raise ValueError(f"{label} must be between {minimum} and {maximum}: {value}")
@@ -92,13 +98,19 @@ def encode_audio_wav_end_message(seq: int) -> bytes:
     return message.SerializeToString()
 
 
-def encode_state_command_message(seq: int, state_id: int) -> bytes:
+def encode_state_command_message(
+    seq: int,
+    state_id: int,
+    *,
+    listening_purpose: int = ListeningPurpose.SPEECH,  # keyword-only; SPEECH when omitted
+) -> bytes:
     message = _new_message(
         ws_pb2.MESSAGE_KIND_STATE_CMD,
         ws_pb2.MESSAGE_TYPE_DATA,
         seq,
     )
     message.state_cmd.state = int(state_id)
+    message.state_cmd.listening_purpose = int(listening_purpose)  # written for every state_id
     return message.SerializeToString()
 
 
@@ -173,6 +185,7 @@ def encode_servo_command_message(seq: int, commands: Sequence[ServoCommand]) ->
 
 __all__ = [
     "ServoCommand",
+    "ListeningPurpose",
     "encode_audio_pcm_data_message",
     "encode_audio_pcm_end_message",
     "encode_audio_pcm_start_message",
diff --git a/stackchan_server/wakeup_word_detection/__init__.py b/stackchan_server/wakeup_word_detection/__init__.py
new file mode 100644
index 0000000..d4c0f62
--- /dev/null
+++ b/stackchan_server/wakeup_word_detection/__init__.py
@@ -0,0 +1,13 @@
+from .create import create_server_side_wake_word_detector
+from .server_side import (
+    ServerSideWakeWordConfig,
+    ServerSideWakeWordDetector,
+    WakeWordDetectionError,
+)
+
+__all__ = [
+    "create_server_side_wake_word_detector",
+    "ServerSideWakeWordConfig",
+    "ServerSideWakeWordDetector",
+    "WakeWordDetectionError",
+]
diff --git a/stackchan_server/wakeup_word_detection/create.py b/stackchan_server/wakeup_word_detection/create.py
new file mode 100644
index 0000000..2ec5011
--- /dev/null
+++ b/stackchan_server/wakeup_word_detection/create.py
@@ -0,0 +1,28 @@
+from __future__ import annotations
+
+from pydantic import Field
+from pydantic_settings import BaseSettings
+
+from ..speech_recognition.whisper_server import WhisperServerSpeechToText
+from .server_side import ServerSideWakeWordDetector
+
+
+class _CreateServerSideWakeWordDetectorEnv(BaseSettings):
+    # Environment switch read by create_server_side_wake_word_detector().
+    model_config = {"env_prefix": ""}  # pydantic v2 form of the deprecated `class Config`
+
+    use_server_side_wwd_whisper_server: bool = Field(
+        default=False,
+        validation_alias="USE_SERVER_SIDE_WWD_WHISPER_SERVER",
+    )
+
+
+def create_server_side_wake_word_detector() -> ServerSideWakeWordDetector | None:
+    """Return a Whisper-server-backed detector, or None when the env switch is off."""
+    settings = _CreateServerSideWakeWordDetectorEnv()
+    if settings.use_server_side_wwd_whisper_server:
+        return ServerSideWakeWordDetector(recognizer=WhisperServerSpeechToText())
+    return None
+
+
+__all__ = ["create_server_side_wake_word_detector"]
diff --git a/stackchan_server/wakeup_word_detection/server_side.py b/stackchan_server/wakeup_word_detection/server_side.py
new file mode 100644
index 0000000..f77be72
--- /dev/null
+++ b/stackchan_server/wakeup_word_detection/server_side.py
@@ -0,0 +1,205 @@
+from __future__ import annotations
+
+import asyncio
+import unicodedata
+from logging import getLogger
+
+from pydantic import Field
+from pydantic_settings import BaseSettings
+
+from ..speech_recognition.whisper_server import WhisperServerSpeechToText
+from ..static import LISTEN_AUDIO_FORMAT
+
+logger = getLogger(__name__)
+
+
+class WakeWordDetectionError(Exception):
+    """Raised when server-side wake-word detection is unavailable, times out, or fails."""
+
+
+class ServerSideWakeWordConfig(BaseSettings):
+    # Settings read from the environment with the STACKCHAN_SERVER_WAKE_WORD_ prefix.
+    model_config = {"env_prefix": "STACKCHAN_SERVER_WAKE_WORD_"}  # replaces deprecated `class Config`
+
+    keywords: list[str] = Field(default_factory=lambda: ["スタックチャン"])
+    window_seconds: float = 3.0  # length of the rolling PCM window kept for transcription
+    interval_seconds: float = 0.5  # minimum gap between inference launches
+    timeout_seconds: float = 30.0  # default timeout used by wait_result()
+
+
+class ServerSideWakeWordDetector:
+    def __init__(
+        self,
+        *,
+        recognizer: WhisperServerSpeechToText | None = None,
+        config: ServerSideWakeWordConfig | None = None,
+    ) -> None:
+        """Create an idle detector; a missing recognizer or config gets a default instance."""
+        self.config = config or ServerSideWakeWordConfig()
+        self.recognizer = recognizer or WhisperServerSpeechToText()
+        self._pcm_buffer = bytearray()
+        self._running = self._detected = False
+        self._streaming_started = self._streaming_ended = False
+        self._error: Exception | None = None
+        self._last_inference_at = 0.0
+        self._inference_task: asyncio.Task[None] | None = None
+        # Set on detection, transcription error, stream END without a hit, or stop().
+        self._event = asyncio.Event()
+        self._lock = asyncio.Lock()
+
+    @property
+    def running(self) -> bool:
+        return self._running  # set True by start(), False by stop()
+
+    async def start(self) -> None:
+        """Stop any previous run, reset all per-run state, and mark the detector running."""
+        await self.stop()
+        self._pcm_buffer = bytearray()
+        self._detected = self._streaming_started = self._streaming_ended = False
+        self._error = None
+        self._last_inference_at = 0.0
+        # stop() leaves the event set; clear it so wait_result() blocks for this run.
+        self._event.clear()
+        self._running = True
+        logger.info("Server-side wake-word detection started")
+
+    async def stop(self) -> None:
+        self._running = False  # the handle_* methods return early from now on
+        if self._inference_task is not None:
+            self._inference_task.cancel()
+            try:
+                await self._inference_task
+            except asyncio.CancelledError:
+                pass  # expected outcome of the cancel() above
+            self._inference_task = None
+        self._event.set()  # releases any wait_result() caller
+
+    async def handle_start(self) -> None:
+        """Begin a new PCM stream by resetting the buffer and timer; no-op when stopped."""
+        if self._running:
+            self._streaming_started, self._streaming_ended = True, False
+            # Drop audio left over from any earlier stream.
+            self._pcm_buffer = bytearray()
+            self._last_inference_at = 0.0
+            logger.info("Server-side wake-word stream START")
+
+    async def handle_data(self, payload: bytes) -> None:
+        """Append PCM to the rolling window and launch a rate-limited inference task."""
+        if not self._running:
+            return
+        if not self._streaming_started:
+            logger.warning(
+                "Ignoring stale server-side wake-word DATA before START payload_bytes=%d",
+                len(payload),
+            )
+            return
+        if self._streaming_ended:
+            logger.warning(
+                "Ignoring stale server-side wake-word DATA after END payload_bytes=%d",
+                len(payload),
+            )
+            return
+
+        self._pcm_buffer.extend(payload)
+        self._truncate_buffer_to_window()
+
+        now = asyncio.get_running_loop().time()
+        too_soon = (now - self._last_inference_at) < self.config.interval_seconds
+        in_flight = self._inference_task is not None and not self._inference_task.done()
+        if too_soon or in_flight:
+            return
+
+        # Snapshot the window so later DATA frames cannot mutate what is being transcribed.
+        self._last_inference_at = now
+        self._inference_task = asyncio.create_task(self._run_inference(bytes(self._pcm_buffer)))
+
+    async def handle_end(self) -> None:
+        """Mark the stream ended, wait out any in-flight inference, then wake waiters."""
+        if not self._running:
+            return
+        if not self._streaming_started:
+            logger.warning("Ignoring stale server-side wake-word END before START")
+            return
+        if self._streaming_ended:
+            logger.warning("Ignoring duplicate server-side wake-word END")
+            return
+        self._streaming_ended = True
+        logger.info("Server-side wake-word stream END")
+        if (task := self._inference_task) is not None and not task.done():
+            await asyncio.wait({task})  # unlike a bare await, never re-raises its cancellation
+            if not task.cancelled() and task.exception() is not None:
+                self._error = task.exception()
+        if not self._detected:
+            self._event.set()
+
+    async def wait_result(self, timeout_seconds: float | None = None) -> bool:
+        """Block until detection finishes and report whether the wake word was heard.
+
+        A ``timeout_seconds`` of None falls back to ``config.timeout_seconds``.
+        Raises WakeWordDetectionError if not running, on timeout, or on a recorded error.
+        """
+        if not self._running:
+            raise WakeWordDetectionError("Server-side wake-word detection is not running")
+
+        limit = self.config.timeout_seconds if timeout_seconds is None else timeout_seconds
+        try:
+            await asyncio.wait_for(self._event.wait(), timeout=limit)
+        except asyncio.TimeoutError as exc:
+            raise WakeWordDetectionError("Server-side wake-word detection timed out") from exc
+
+        if self._error is None:
+            return self._detected
+        raise WakeWordDetectionError(str(self._error)) from self._error
+
+    async def _run_inference(self, pcm_bytes: bytes) -> None:
+        """Transcribe one PCM window and set the result event on a keyword hit or failure."""
+        if not pcm_bytes:
+            return
+        try:
+            async with self._lock:
+                transcript = await self.recognizer.transcribe(pcm_bytes)
+        except Exception as exc:  # pragma: no cover
+            logger.exception("Server-side wake-word transcription failed")
+            self._error = exc
+            self._event.set()
+            return
+
+        logger.info("Server-side wake-word transcript: %s", transcript)
+        if not self._contains_wake_word(transcript):
+            return  # no hit: the event stays untouched so waiting continues
+        logger.info("Server-side wake-word detected")
+        self._detected = True
+        self._event.set()
+
+    def _contains_wake_word(self, transcript: str) -> bool:
+        """Return True if any configured keyword occurs in the normalized transcript."""
+        haystack = _normalize_text(transcript)
+        if not haystack:
+            return False
+        # Keywords that normalize to "" are skipped; an empty needle would match everything.
+        return any(
+            needle and needle in haystack
+            for needle in map(_normalize_text, self.config.keywords)
+        )
+
+    def _truncate_buffer_to_window(self) -> None:
+        """Drop the oldest PCM so at most ``window_seconds`` of audio remains buffered."""
+        frame_bytes = LISTEN_AUDIO_FORMAT.channels * LISTEN_AUDIO_FORMAT.sample_width
+        bytes_per_second = LISTEN_AUDIO_FORMAT.sample_rate_hz * frame_bytes
+        max_bytes = max(1, int(bytes_per_second * self.config.window_seconds))
+        excess = len(self._pcm_buffer) - max_bytes
+        if excess > 0:
+            # Cut on a frame boundary (rounding up) so the remaining samples stay aligned.
+            del self._pcm_buffer[: -(-excess // frame_bytes) * frame_bytes]
+
+
+def _normalize_text(text: str) -> str:
+    """NFKC-normalize, lowercase, and remove all whitespace; None or empty yields ""."""
+    return "".join(unicodedata.normalize("NFKC", text or "").lower().split())
+
+
+__all__ = [
+    "ServerSideWakeWordConfig",
+    "ServerSideWakeWordDetector",
+    "WakeWordDetectionError",
+]
diff --git a/stackchan_server/ws_proxy.py b/stackchan_server/ws_proxy.py
index 1c45236..b10ec5a 100644
--- a/stackchan_server/ws_proxy.py
+++ b/stackchan_server/ws_proxy.py
@@ -12,12 +12,12 @@
 
 from fastapi import WebSocket, WebSocketDisconnect
 from google.protobuf.message import DecodeError
-from pydantic_settings import BaseSettings
 
 from . import __version__
 from .generated_protobuf import websocket_message_pb2 as _ws_pb2
 from .listen import EmptyTranscriptError, ListenHandler, TimeoutError
 from .protobuf_ws import (
+    ListeningPurpose,
     encode_server_metadata_message,
     encode_servo_command_message,
     encode_state_command_message,
@@ -26,6 +26,10 @@
 from .speak import SpeakHandler
 from .static import LISTEN_AUDIO_FORMAT
 from .types import SpeechRecognizer, SpeechSynthesizer
+from .wakeup_word_detection import (
+    WakeWordDetectionError,
+    create_server_side_wake_word_detector,
+)
 
 logger = getLogger(__name__)
 
@@ -43,17 +47,8 @@
 )  # half interval for the second segment start
 _LISTEN_AUDIO_TIMEOUT_SECONDS = 10.0
 _DEBUG_RECORDING_ENABLED = os.getenv("DEBUG_RECODING") == "1"
-
-
-class _WakeWordServerConfig(BaseSettings):
-    no_use_client_wakeup_word: bool = False
-    use_open_wake_word: bool = False
-
-    class Config:
-        env_prefix = "STACKCHAN_"
-
-
-_WAKEWORD_SERVER_CONFIG = _WakeWordServerConfig()
+_SERVER_WAKEWORD_RESTART_DELAY_SECONDS = 0.25
+_TRAILING_PCM_DRAIN_SECONDS = 1.0
 
 
 class FirmwareState(IntEnum):
@@ -129,6 +124,12 @@ def __init__(
             recordings_dir=self.recordings_dir,
             debug_recording=self._debug_recording,
         )
+        self._server_wakeword_detector = create_server_side_wake_word_detector()
+        self._server_wakeword_task: Optional[asyncio.Task[bool]] = None
+        self._server_wakeword_restart_task: Optional[asyncio.Task[None]] = None
+        self._auto_start_server_wakeword = False
+        self._drain_trailing_pcm_until_end = False
+        self._drain_trailing_pcm_deadline: float | None = None
 
         self._receiving_task: Optional[asyncio.Task] = None
         self._closed = False
@@ -157,6 +158,10 @@ def current_state(self) -> FirmwareState:
     def receive_task(self) -> Optional[asyncio.Task]:
         return self._receiving_task
 
+    @property
+    def has_server_wakeword_detector(self) -> bool:
+        return self._server_wakeword_detector is not None  # None when the factory returned no detector
+
     def trigger_wakeword(self) -> None:
         """Web API から擬似的に WAKEWORD_EVT を発火させる。"""
         logger.info("Triggered wakeword via API")
@@ -165,6 +170,7 @@ def trigger_wakeword(self) -> None:
     async def wait_for_talk_session(self) -> None:
         while True:
             if self._wakeword_event.is_set():
+                await self.stop_server_wakeword_detection()
                 self._wakeword_event.clear()
                 return
             if self._closed:
@@ -172,6 +178,7 @@ async def wait_for_talk_session(self) -> None:
             await asyncio.sleep(0.05)
 
     async def listen(self) -> str:
+        await self.stop_server_wakeword_detection()
         return await self._listener.listen(
             send_state_command=self.send_state_command,
             is_closed=lambda: self._closed,
@@ -188,11 +195,21 @@ async def speak(self, text: str) -> None:
             is_closed=lambda: self._closed,
         )
 
-    async def send_state_command(self, state_id: int | FirmwareState) -> None:
-        await self._send_state_command(state_id)
+    async def send_state_command(
+        self,
+        state_id: int | FirmwareState,
+        *,
+        listening_purpose: ListeningPurpose = ListeningPurpose.SPEECH,
+    ) -> None:
+        """Forward a state command and its listening purpose to ``_send_state_command``."""
+        await self._send_state_command(
+            state_id, listening_purpose=listening_purpose
+        )
 
     async def reset_state(self) -> None:
         await self.send_state_command(FirmwareState.IDLE)
+        self._current_firmware_state = FirmwareState.IDLE  # recorded locally right after sending IDLE
+        self._schedule_server_wakeword_restart()  # helper not shown here; presumably re-arms detection — TODO confirm
 
     async def move_servo(self, commands: Sequence[ServoCommand]) -> None:
         previous_counter = self._servo_sent_counter
@@ -232,6 +249,8 @@ async def start(self) -> None:
 
     async def close(self) -> None:
         self._closed = True
+        self._cancel_server_wakeword_restart_task()
+        await self.stop_server_wakeword_detection()
         if self._receiving_task:
             self._receiving_task.cancel()
             with suppress(asyncio.CancelledError):
@@ -241,6 +260,84 @@ async def close(self) -> None:
     async def start_talking(self, text: str) -> None:
         await self.speak(text)
 
+    async def enable_auto_server_wakeword_detection(self) -> None:
+        self._auto_start_server_wakeword = True  # presumably read by the restart scheduler — TODO confirm
+        await self.start_server_wakeword_detection_if_available()  # bool result is ignored here
+
+    async def start_server_wakeword_detection_if_available(self) -> bool:
+        """Ensure the background detection task is running; return False if it cannot run now."""
+        if self._closed or self._server_wakeword_detector is None:
+            return False
+        if not self.server_metadata.has_server_wake_word:
+            return False
+        if self.current_state != FirmwareState.IDLE:
+            return False
+
+        # A live task is reused; a missing or finished one is replaced.
+        running = self._server_wakeword_task
+        if running is None or running.done():
+            self._cancel_server_wakeword_restart_task()
+            self._server_wakeword_task = asyncio.create_task(
+                self._run_server_wakeword_detection(),
+                name="server-side-wakeword-detection",
+            )
+        return True
+
+    async def stop_server_wakeword_detection(self) -> None:
+        """Stop background wake-word detection and wait for its task to finish.
+
+        Any pending restart is cancelled first. A live task is cancelled; a task
+        that already finished is only awaited so its outcome is consumed. Failures
+        other than cancellation are logged rather than raised.
+        """
+        self._cancel_server_wakeword_restart_task()
+        task = self._server_wakeword_task
+        if task is None:
+            return
+
+        # Only a task that is still running needs an explicit cancel().
+        if not task.done():
+            task.cancel()
+        self._server_wakeword_task = None
+
+        # Awaiting retrieves the task's result or exception in both cases.
+        try:
+            await task
+        except asyncio.CancelledError:
+            pass
+        except Exception:
+            logger.exception("Server-side wake-word detection task failed")
+
+    async def request_server_wakeword_detection(
+        self,
+        *,
+        timeout_seconds: float | None = None,
+    ) -> bool:
+        """Run or join server-side detection; detection failures raise WakeWordDetectionError."""
+        if self._server_wakeword_detector is None or not self.server_metadata.has_server_wake_word:
+            raise WakeWordDetectionError(
+                "Server-side wake-word detection is not available for this connection"
+            )
+        if self._closed:
+            raise WebSocketDisconnect()  # not a WakeWordDetectionError; callers handle it separately
+        if not await self.start_server_wakeword_detection_if_available():
+            raise WakeWordDetectionError(
+                "Server-side wake-word detection could not be started in the current state"
+            )
+        task = self._server_wakeword_task
+        if task is None:
+            raise WakeWordDetectionError("Server-side wake-word detection task is unavailable")
+        try:
+            # shield(): cancelling this wait leaves the task alone; wait_for(timeout=None) just awaits.
+            return await asyncio.wait_for(asyncio.shield(task), timeout=timeout_seconds)
+        except asyncio.TimeoutError as exc:
+            await self.stop_server_wakeword_detection()
+            raise WakeWordDetectionError("Server-side wake-word detection timed out") from exc
+        except asyncio.CancelledError:
+            if not task.cancelled():
+                raise  # the caller itself was cancelled
+            raise WakeWordDetectionError("Server-side wake-word detection was stopped") from None
+
     async def _receive_loop(self) -> None:
         try:
             while True:
@@ -254,6 +351,63 @@ async def _receive_loop(self) -> None:
                 if message.kind == ws_pb2.MESSAGE_KIND_AUDIO_PCM:
                     body_name = message.WhichOneof("body")
 
+                    if self._should_drain_trailing_pcm():
+                        if (
+                            message.message_type == ws_pb2.MESSAGE_TYPE_START
+                            and body_name == "audio_pcm_start"
+                        ):
+                            logger.info(
+                                "Received a new PCM START while draining trailing wake-word audio; resuming normal routing"
+                            )
+                            self._clear_trailing_pcm_drain()
+                        elif (
+                            message.message_type == ws_pb2.MESSAGE_TYPE_DATA
+                            and body_name == "audio_pcm_data"
+                        ):
+                            logger.info(
+                                "Discarding trailing PCM DATA after wake-word detection stop payload_bytes=%d",
+                                len(message.audio_pcm_data.pcm_bytes),
+                            )
+                            continue
+                        elif (
+                            message.message_type == ws_pb2.MESSAGE_TYPE_END
+                            and body_name == "audio_pcm_end"
+                        ):
+                            logger.info(
+                                "Finished draining trailing PCM after wake-word detection stop"
+                            )
+                            self._clear_trailing_pcm_drain()
+                            continue
+
+                    if (
+                        self._server_wakeword_detector is not None
+                        and self._server_wakeword_detector.running
+                    ):
+                        if (
+                            message.message_type == ws_pb2.MESSAGE_TYPE_START
+                            and body_name == "audio_pcm_start"
+                        ):
+                            await self._server_wakeword_detector.handle_start()
+                            continue
+
+                        if (
+                            message.message_type == ws_pb2.MESSAGE_TYPE_DATA
+                            and body_name == "audio_pcm_data"
+                        ):
+                            payload = bytes(message.audio_pcm_data.pcm_bytes)
+                            await self._server_wakeword_detector.handle_data(payload)
+                            continue
+
+                        if (
+                            message.message_type == ws_pb2.MESSAGE_TYPE_END
+                            and body_name == "audio_pcm_end"
+                        ):
+                            await self._server_wakeword_detector.handle_end()
+                            continue
+
+                        await self.ws.close(code=1003, reason="unknown wakeword PCM protobuf body")
+                        break
+
                     if (
                         message.message_type == ws_pb2.MESSAGE_TYPE_START
                         and body_name == "audio_pcm_start"
@@ -362,17 +516,13 @@ async def _handle_firmware_metadata(self, message: Any) -> None:
                 server_version=self.server_metadata.server_version,
             )
         )
+        if self._auto_start_server_wakeword:
+            await self.start_server_wakeword_detection_if_available()
 
     def _build_server_metadata(
         self, firmware_metadata: FirmwareMetadata
     ) -> ServerMetadata:
-        should_use_server_wake_word = (
-            _WAKEWORD_SERVER_CONFIG.use_open_wake_word
-            and (
-                _WAKEWORD_SERVER_CONFIG.no_use_client_wakeup_word
-                or not firmware_metadata.has_device_wake_word
-            )
-        )
+        should_use_server_wake_word = self._server_wakeword_detector is not None
         return ServerMetadata(
             has_server_wake_word=should_use_server_wake_word,
             server_version=__version__,
@@ -410,10 +560,126 @@ def _handle_servo_done_event(self, message: Any) -> None:
         self._servo_done_counter += 1
         logger.info("Received servo done event")
 
-    async def _send_state_command(self, state_id: int | FirmwareState) -> None:
+    async def _send_state_command(
+        self,
+        state_id: int | FirmwareState,
+        *,
+        listening_purpose: ListeningPurpose = ListeningPurpose.SPEECH,
+    ) -> None:
         await self.ws.send_bytes(
-            encode_state_command_message(self._next_down_seq(), int(state_id))
+            encode_state_command_message(
+                self._next_down_seq(),
+                int(state_id),
+                listening_purpose=int(listening_purpose),
+            )
+        )
+
+    async def _run_server_wakeword_detection(self) -> bool:
+        detector = self._server_wakeword_detector
+        if detector is None:
+            return False
+
+        detected = False
+        should_restart = False
+        try:
+            await detector.start()
+            await self.send_state_command(
+                FirmwareState.LISTENING,
+                listening_purpose=ListeningPurpose.WAKE_WORD,
+            )
+            detected = await detector.wait_result()
+            if detected:
+                self._wakeword_event.set()
+            return detected
+        except asyncio.CancelledError:
+            raise
+        except WakeWordDetectionError as exc:
+            logger.warning("Server-side wake-word detection stopped: %s", exc)
+            return False
+        except Exception:
+            logger.exception("Server-side wake-word detection failed")
+            return False
+        finally:
+            await detector.stop()
+            self._arm_trailing_pcm_drain()
+            if not self._closed:
+                self._current_firmware_state = FirmwareState.IDLE
+                try:
+                    await self.send_state_command(FirmwareState.IDLE)
+                except Exception:
+                    logger.exception("Failed to return firmware to idle after wake-word detection")
+            should_restart = (
+                self._auto_start_server_wakeword
+                and not detected
+                and not self._wakeword_event.is_set()
+                and not self._closed
+            )
+            if self._server_wakeword_task is asyncio.current_task():
+                self._server_wakeword_task = None
+            if should_restart:
+                self._schedule_server_wakeword_restart()
+
+    def _schedule_server_wakeword_restart(
+        self,
+        delay_seconds: float = _SERVER_WAKEWORD_RESTART_DELAY_SECONDS,
+    ) -> None:
+        if not self._auto_start_server_wakeword or self._closed:
+            return
+
+        self._cancel_server_wakeword_restart_task()
+        self._server_wakeword_restart_task = asyncio.create_task(
+            self._restart_server_wakeword_detection_after_delay(delay_seconds),
+            name="server-side-wakeword-restart",
+        )
+
+    def _cancel_server_wakeword_restart_task(self) -> None:
+        task = self._server_wakeword_restart_task
+        if task is None:
+            return
+        self._server_wakeword_restart_task = None
+        task.cancel()
+
+    async def _restart_server_wakeword_detection_after_delay(
+        self,
+        delay_seconds: float,
+    ) -> None:
+        try:
+            await asyncio.sleep(delay_seconds)
+            if self._closed:
+                return
+            await self.start_server_wakeword_detection_if_available()
+        except asyncio.CancelledError:
+            raise
+        finally:
+            if self._server_wakeword_restart_task is asyncio.current_task():
+                self._server_wakeword_restart_task = None
+
+    def _arm_trailing_pcm_drain(
+        self,
+        timeout_seconds: float = _TRAILING_PCM_DRAIN_SECONDS,
+    ) -> None:
+        loop = asyncio.get_running_loop()
+        self._drain_trailing_pcm_until_end = True
+        self._drain_trailing_pcm_deadline = loop.time() + timeout_seconds
+
+    def _clear_trailing_pcm_drain(self) -> None:
+        self._drain_trailing_pcm_until_end = False
+        self._drain_trailing_pcm_deadline = None
+
+    def _should_drain_trailing_pcm(self) -> bool:
+        if not self._drain_trailing_pcm_until_end:
+            return False
+        deadline = self._drain_trailing_pcm_deadline
+        if deadline is None:
+            return True
+        if asyncio.get_running_loop().time() <= deadline:
+            return True
+
+        logger.info(
+            "Trailing PCM drain window expired before END arrived; resuming normal routing"
         )
+        self._clear_trailing_pcm_drain()
+        return False
 
     async def _wait_for_counter(
         self,

From 930fdffdc7e53065427109003a49bebbc4a3a7bc Mon Sep 17 00:00:00 2001
From: Atsushi Morimoto <74th.tech@gmail.com>
Date: Sat, 9 May 2026 14:38:07 +0900
Subject: [PATCH 02/15] feat: Stop wakeword streaming on WebSocket
 disconnection

---
 firmware/src/main.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/firmware/src/main.cpp b/firmware/src/main.cpp
index 07c1da2..67256ad 100644
--- a/firmware/src/main.cpp
+++ b/firmware/src/main.cpp
@@ -351,6 +351,11 @@ void handleWsEvent(WStype_t type, uint8_t *payload, size_t length)
   case WStype_DISCONNECTED:
     // M5.Display.println("WS: disconnected");
     log_i("WS disconnected");
+    if (listening.isWakeWordStreaming())
+    {
+      log_i("Stopping server-side wakeword uplink because WS disconnected");
+      listening.endWakeWordStreaming();
+    }
     resetServerMetadata();
     stateMachine.setState(StateMachine::Disconnected);
     break;

From 03d0bab82716698f303b5f3f50fdf77adb9b6f0c Mon Sep 17 00:00:00 2001
From: Atsushi Morimoto <74th.tech@gmail.com>
Date: Sat, 9 May 2026 15:25:54 +0900
Subject: [PATCH 03/15] feat: Add server-side wakeword detection support

- Introduced `ServerWwdPcm` message kind for server-side wakeword PCM stream.
- Updated WebSocket message protocol to include `MESSAGE_KIND_SERVER_WWD_PCM`.
- Implemented `WhisperServerWakeWordDetector` for handling server-side wakeword detection.
- Refactored `WsProxy` to manage server-side wakeword PCM messages.
- Removed deprecated server-side wakeword detection API endpoint.
- Enhanced documentation for new wakeword detection flow and message types.
---
 .env.template                                 |   6 +
 docs/rest_api_ja.md                           |  58 -----
 docs/server_ja.md                             |  16 ++
 docs/websocket_protocols_ja.md                |  20 +-
 .../generated_protobuf/websocket-message.pb.h |   7 +-
 firmware/src/listening.cpp                    |   7 +-
 protobuf/websocket-message.proto              |   1 +
 stackchan_server/app.py                       |  24 --
 .../websocket_message_pb2.py                  |  28 +--
 .../wakeup_word_detection/__init__.py         |  12 +-
 .../wakeup_word_detection/create.py           |  21 +-
 .../{server_side.py => whisper_server.py}     |  31 ++-
 stackchan_server/ws_proxy.py                  | 232 ++++++++++--------
 13 files changed, 234 insertions(+), 229 deletions(-)
 rename stackchan_server/wakeup_word_detection/{server_side.py => whisper_server.py} (89%)

diff --git a/.env.template b/.env.template
index 43d3290..89d30f6 100644
--- a/.env.template
+++ b/.env.template
@@ -34,6 +34,12 @@ STACKCHAN_GOOGLE_CLOUD_TTS_VOICE_NAME="Despina"
 STACKCHAN_VOICEVOX_URL="http://localhost:50021"
 STACKCHAN_VOICEVOX_SPEAKER=1
 
+# -- Server-side Wakeup Word Detection --
+# Whisper Server
+# STACKCHAN_USE_WWD_WHISPER_SERVER=1
+# STACKCHAN_WWD_WHISPER_SERVER_URL="http://127.0.0.1:8080/inference"
+# STACKCHAN_WWD_WHISPER_SERVER_MODEL=
+
 # -- Claude Agent SDK --
 # using Google Cloud Vertex AI
 CLAUDE_CODE_USE_VERTEX=1
diff --git a/docs/rest_api_ja.md b/docs/rest_api_ja.md
index a3b41e1..86c2d2f 100644
--- a/docs/rest_api_ja.md
+++ b/docs/rest_api_ja.md
@@ -17,7 +17,6 @@
 | `GET` | `/v1/stackchan` | 接続中 StackChan 一覧 |
 | `GET` | `/v1/stackchan/{stackchan_ip}` | 指定 StackChan の状態取得 |
 | `POST` | `/v1/stackchan/{stackchan_ip}/wakeword` | 擬似 wakeword 発火 |
-| `POST` | `/v1/stackchan/{stackchan_ip}/wakeword/server-detect` | サーバーサイド wakeword 検出を要求 |
 | `POST` | `/v1/stackchan/{stackchan_ip}/speak` | 指定 StackChan に発話させる |
 
 ## `GET /health`
@@ -135,63 +134,6 @@
 - 実機側のウェイクワード検出 (`WakeWordEvt`) と同じように扱われます。
 - すでに `talk_session` 実行中でも、イベント自体は内部フラグとして立ちます。
 
-## `POST /v1/stackchan/{stackchan_ip}/wakeword/server-detect`
-
-サーバーサイドの wakeword 検出を開始します。
-
-> [!NOTE]
-> 環境変数 `USE_SERVER_SIDE_WWD_WHISPER_SERVER=1` の場合、サーバーは `@app.setup()` 完了後と `Idle` 復帰後に自動でサーバーサイド wakeword 検出を開始します。この API は明示的に現在の検出サイクルを待ちたい場合に利用できます。
-
-- サーバーは対象 StackChan に `StateCmd(Listening, WAKE_WORD)` を送ってマイク音声を受信し、
-  直近 3 秒窓を 0.5 秒ごとに認識します。
-- 認識テキストには全結果がログ出力されます。
-- キーワードが検出されたら内部 wakeword イベントを発火し、`talk_session` 開始待ちを解除します。
-- 検出の終了時には `StateCmd(Idle)` を送って待機に戻します。
-- 実機の表示状態は `Listening` へは変わらず、`Idle(Server-WWD)` のまま待ち受けます。
-- このモードでは無音 3 秒によるクライアント側自動終了は行いません。
-
-### パスパラメータ
-
-| 名前 | 型 | 説明 |
-| --- | --- | --- |
-| `stackchan_ip` | `string` | 対象 StackChan の接続元 IP |
-
-### クエリパラメータ
-
-| 名前 | 型 | 必須 | 説明 |
-| --- | --- | --- | --- |
-| `timeout_seconds` | `number` | 任意 | 検出待ちタイムアウト秒。未指定時はサーバー設定値 |
-
-### 成功レスポンス
-
-- Status: `200 OK`
-
-```json
-{
-  "detected": true
-}
-```
-
-`detected` が `false` の場合は、検出セッションは終了したがキーワード未検出です。
-
-### エラーレスポンス
-
-- Status: `404 Not Found`
-
-```json
-{
-  "detail": "stackchan not connected"
-}
-```
-
-- Status: `409 Conflict`
-
-```json
-{
-  "detail": "Server-side wake-word detection is not available for this connection"
-}
-```
-
 ## `POST /v1/stackchan/{stackchan_ip}/speak`
 
 指定した StackChan にテキストを発話させます。
diff --git a/docs/server_ja.md b/docs/server_ja.md
index 33767ce..506727c 100644
--- a/docs/server_ja.md
+++ b/docs/server_ja.md
@@ -86,6 +86,22 @@ STACKCHAN_WHISPER_SERVER_URL=http://localhost:13305/api/v1/audio/transcriptions
 STACKCHAN_WHISPER_SERVER_MODEL=Whisper-Large-v3-Turbo
 ```
 
+### (オプション) サーバーサイド wakeword 用 Whisper Server の設定
+
+サーバーサイド wakeword 検出を有効にするには、以下を設定します。
+
+- `STACKCHAN_USE_WWD_WHISPER_SERVER`: `1`
+- `STACKCHAN_WWD_WHISPER_SERVER_URL`: wakeword 検出専用 Whisper Server の推論エンドポイント URL
+- `STACKCHAN_WWD_WHISPER_SERVER_MODEL`: wakeword 検出専用に利用するモデル名
+
+通常の音声認識で使う `STACKCHAN_WHISPER_SERVER_URL` / `STACKCHAN_WHISPER_SERVER_MODEL` とは別設定です。
+
+```
+STACKCHAN_USE_WWD_WHISPER_SERVER=1
+STACKCHAN_WWD_WHISPER_SERVER_URL="http://127.0.0.1:8080/inference"
+STACKCHAN_WWD_WHISPER_SERVER_MODEL=
+```
+
 ## 音声合成の設定
 
 音声合成エンジンとして、以下に対応しています。
diff --git a/docs/websocket_protocols_ja.md b/docs/websocket_protocols_ja.md
index d5a21aa..e8694a0 100644
--- a/docs/websocket_protocols_ja.md
+++ b/docs/websocket_protocols_ja.md
@@ -28,6 +28,7 @@
 | 名前 | 方向 | 用途 |
 | --- | --- | --- |
 | `AudioPcm` | CoreS3 → Server | マイク音声 PCM ストリーム |
+| `ServerWwdPcm` | CoreS3 → Server | サーバーサイド wakeword 検出専用 PCM ストリーム |
 | `AudioWav` | Server → CoreS3 | TTS 音声 PCM ストリーム |
 | `StateCmd` | Server → CoreS3 | 状態遷移指示 |
 | `WakeWordEvt` | CoreS3 → Server | ウェイクワード検出通知 |
@@ -64,6 +65,20 @@
 - 無音判定は平均絶対振幅 `<= 200` が 3 秒継続したときに発火します。
 - 停止時は未送信サンプルを `DATA` で flush してから `END` を送ります。
 
+## サーバーサイド wakeword 入力 `ServerWwdPcm`
+
+- 方向: CoreS3 → Server
+- フォーマット: PCM16LE / 16kHz / 1ch
+- シーケンス: `AudioPcmStart` → `AudioChunk` 複数回 → `AudioPcmEnd`
+- `kind`: `MESSAGE_KIND_SERVER_WWD_PCM`
+- body は `AudioPcm` と同じ `AudioPcmStart` / `AudioChunk` / `AudioPcmEnd` を使います。
+
+### 現行実装メモ
+
+- `StateCmd(Listening, WAKE_WORD)` を受けた CoreS3 は、見た目の状態を `Idle(Server-WWD)` のままにしてこの kind で uplink します。
+- 無音 3 秒によるクライアント側自動終了は行いません。
+- サーバーはこの kind だけを server-side wakeword detector にルーティングします。
+
 ## スピーカ再生 `AudioWav`
 
 - 方向: Server → CoreS3
@@ -138,9 +153,8 @@ CoreS3 側は `has_server_wake_word=true` を受けると、デバイス側 wake
 
 ## サーバーサイド wakeword 検出フロー
 
-- 環境変数 `USE_SERVER_SIDE_WWD_WHISPER_SERVER=1` の場合、サーバーは `@app.setup()` 完了後と `Idle` 復帰後に自動でサーバーサイド wakeword 検出を開始します。
-- REST API `POST /v1/stackchan/{ip}/wakeword/server-detect` を呼ぶと、
-  サーバーは `StateCmd(Listening, WAKE_WORD)` を送信してマイク uplink を受信します。
+- 環境変数 `STACKCHAN_USE_WWD_WHISPER_SERVER=1` の場合、サーバーは `@app.setup()` 完了後と `Idle` 復帰後に自動でサーバーサイド wakeword 検出を開始します。
+- サーバーは `StateCmd(Listening, WAKE_WORD)` を送信して `MESSAGE_KIND_SERVER_WWD_PCM` のマイク uplink を受信します。
 - 受信した音声の直近 3 秒窓を 0.5 秒ごとに音声認識へ渡し、
   定義キーワード（例: `スタクチャン`）を含むか判定します。
 - 各判定タイミングの認識結果はすべてログ出力されます。
diff --git a/firmware/lib/generated_protobuf/websocket-message.pb.h b/firmware/lib/generated_protobuf/websocket-message.pb.h
index ffd1d31..28e1f54 100644
--- a/firmware/lib/generated_protobuf/websocket-message.pb.h
+++ b/firmware/lib/generated_protobuf/websocket-message.pb.h
@@ -21,7 +21,8 @@ typedef enum _stackchan_websocket_v1_MessageKind {
     stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVO_CMD = 7,
     stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVO_DONE_EVT = 8,
     stackchan_websocket_v1_MessageKind_MESSAGE_KIND_FIRMWARE_METADATA = 9,
-    stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVER_METADATA = 10
+    stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVER_METADATA = 10,
+    stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVER_WWD_PCM = 11
 } stackchan_websocket_v1_MessageKind;
 
 typedef enum _stackchan_websocket_v1_MessageType {
@@ -172,8 +173,8 @@ extern "C" {
 
 /* Helper constants for enums */
 #define _stackchan_websocket_v1_MessageKind_MIN stackchan_websocket_v1_MessageKind_MESSAGE_KIND_UNSPECIFIED
-#define _stackchan_websocket_v1_MessageKind_MAX stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVER_METADATA
-#define _stackchan_websocket_v1_MessageKind_ARRAYSIZE ((stackchan_websocket_v1_MessageKind)(stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVER_METADATA+1))
+#define _stackchan_websocket_v1_MessageKind_MAX stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVER_WWD_PCM
+#define _stackchan_websocket_v1_MessageKind_ARRAYSIZE ((stackchan_websocket_v1_MessageKind)(stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVER_WWD_PCM+1))
 
 #define _stackchan_websocket_v1_MessageType_MIN stackchan_websocket_v1_MessageType_MESSAGE_TYPE_UNSPECIFIED
 #define _stackchan_websocket_v1_MessageType_MAX stackchan_websocket_v1_MessageType_MESSAGE_TYPE_END
diff --git a/firmware/src/listening.cpp b/firmware/src/listening.cpp
index 1138d67..256d26a 100644
--- a/firmware/src/listening.cpp
+++ b/firmware/src/listening.cpp
@@ -113,9 +113,9 @@ bool Listening::stopStreaming()
   }
 
   streaming_ = false;
+  ok = sendPacket(stackchan_websocket_v1_MessageType_MESSAGE_TYPE_END, nullptr, 0) && ok;
   session_mode_ = SessionMode::Speech;
   auto_stop_for_silence_ = true;
-  ok = sendPacket(stackchan_websocket_v1_MessageType_MESSAGE_TYPE_END, nullptr, 0) && ok;
   return ok;
 }
 
@@ -236,7 +236,10 @@ bool Listening::sendPacket(stackchan_websocket_v1_MessageType type, const int16_
 
   auto &message = g_listening_tx_message;
   message = stackchan_websocket_v1_WebSocketMessage_init_zero;
-  message.kind = stackchan_websocket_v1_MessageKind_MESSAGE_KIND_AUDIO_PCM;
+  message.kind =
+      (session_mode_ == SessionMode::WakeWord)
+          ? stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVER_WWD_PCM
+          : stackchan_websocket_v1_MessageKind_MESSAGE_KIND_AUDIO_PCM;
   message.message_type = type;
   message.seq = seq_counter_++;
 
diff --git a/protobuf/websocket-message.proto b/protobuf/websocket-message.proto
index fec8bff..10932ac 100644
--- a/protobuf/websocket-message.proto
+++ b/protobuf/websocket-message.proto
@@ -46,6 +46,7 @@ enum MessageKind {
   MESSAGE_KIND_SERVO_DONE_EVT = 8;
   MESSAGE_KIND_FIRMWARE_METADATA = 9;
   MESSAGE_KIND_SERVER_METADATA = 10;
+  MESSAGE_KIND_SERVER_WWD_PCM = 11;
 }
 
 enum MessageType {
diff --git a/stackchan_server/app.py b/stackchan_server/app.py
index 94cf8de..5921d9a 100644
--- a/stackchan_server/app.py
+++ b/stackchan_server/app.py
@@ -10,7 +10,6 @@
 from .speech_recognition import create_speech_recognizer
 from .speech_synthesis import create_speech_synthesizer
 from .types import SpeechRecognizer, SpeechSynthesizer
-from .wakeup_word_detection import WakeWordDetectionError
 from .ws_proxy import WsProxy
 
 logger = getLogger(__name__)
@@ -25,10 +24,6 @@ class SpeakRequest(BaseModel):
     text: str
 
 
-class ServerWakeWordDetectResponse(BaseModel):
-    detected: bool
-
-
 class StackChanApp:
     def __init__(
         self,
@@ -69,25 +64,6 @@ async def _trigger_wakeword(stackchan_ip: str):
                 raise HTTPException(status_code=404, detail="stackchan not connected")
             proxy.trigger_wakeword()
 
-        @self.fastapi.post(
-            "/v1/stackchan/{stackchan_ip}/wakeword/server-detect",
-            response_model=ServerWakeWordDetectResponse,
-        )
-        async def _detect_server_wakeword(
-            stackchan_ip: str,
-            timeout_seconds: float | None = None,
-        ):
-            proxy = await self._get_proxy(stackchan_ip)
-            if proxy is None:
-                raise HTTPException(status_code=404, detail="stackchan not connected")
-            try:
-                detected = await proxy.request_server_wakeword_detection(
-                    timeout_seconds=timeout_seconds
-                )
-            except WakeWordDetectionError as exc:
-                raise HTTPException(status_code=409, detail=str(exc)) from exc
-            return ServerWakeWordDetectResponse(detected=detected)
-
         @self.fastapi.post("/v1/stackchan/{stackchan_ip}/speak", status_code=204)
         async def _speak(stackchan_ip: str, body: SpeakRequest):
             proxy = await self._get_proxy(stackchan_ip)
diff --git a/stackchan_server/generated_protobuf/websocket_message_pb2.py b/stackchan_server/generated_protobuf/websocket_message_pb2.py
index 433f9a8..15b3d61 100644
--- a/stackchan_server/generated_protobuf/websocket_message_pb2.py
+++ b/stackchan_server/generated_protobuf/websocket_message_pb2.py
@@ -24,7 +24,7 @@
 
 
 
-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x17websocket-message.proto\x12\x16stackchan.websocket.v1\"\x96\x08\n\x10WebSocketMessage\x12\x31\n\x04kind\x18\x01 \x01(\x0e\x32#.stackchan.websocket.v1.MessageKind\x12\x39\n\x0cmessage_type\x18\x02 \x01(\x0e\x32#.stackchan.websocket.v1.MessageType\x12\x0b\n\x03seq\x18\x03 \x01(\r\x12@\n\x0f\x61udio_pcm_start\x18\n \x01(\x0b\x32%.stackchan.websocket.v1.AudioPcmStartH\x00\x12<\n\x0e\x61udio_pcm_data\x18\x0b \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_pcm_end\x18\x0c \x01(\x0b\x32#.stackchan.websocket.v1.AudioPcmEndH\x00\x12@\n\x0f\x61udio_wav_start\x18\x14 \x01(\x0b\x32%.stackchan.websocket.v1.AudioWavStartH\x00\x12<\n\x0e\x61udio_wav_data\x18\x15 \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_wav_end\x18\x16 \x01(\x0b\x32#.stackchan.websocket.v1.AudioWavEndH\x00\x12\x39\n\tstate_cmd\x18\x1e \x01(\x0b\x32$.stackchan.websocket.v1.StateCommandH\x00\x12>\n\rwake_word_evt\x18\x1f \x01(\x0b\x32%.stackchan.websocket.v1.WakeWordEventH\x00\x12\x37\n\tstate_evt\x18  \x01(\x0b\x32\".stackchan.websocket.v1.StateEventH\x00\x12@\n\x0espeak_done_evt\x18! 
\x01(\x0b\x32&.stackchan.websocket.v1.SpeakDoneEventH\x00\x12\x41\n\tservo_cmd\x18\" \x01(\x0b\x32,.stackchan.websocket.v1.ServoCommandSequenceH\x00\x12@\n\x0eservo_done_evt\x18# \x01(\x0b\x32&.stackchan.websocket.v1.ServoDoneEventH\x00\x12\x45\n\x11\x66irmware_metadata\x18$ \x01(\x0b\x32(.stackchan.websocket.v1.FirmwareMetadataH\x00\x12\x41\n\x0fserver_metadata\x18% \x01(\x0b\x32&.stackchan.websocket.v1.ServerMetadataH\x00\x42\x06\n\x04\x62ody\"\x0f\n\rAudioPcmStart\"\r\n\x0b\x41udioPcmEnd\"6\n\rAudioWavStart\x12\x13\n\x0bsample_rate\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\"\r\n\x0b\x41udioWavEnd\"\x1f\n\nAudioChunk\x12\x11\n\tpcm_bytes\x18\x01 \x01(\x0c\"\x8a\x01\n\x0cStateCommand\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\x12\x43\n\x11listening_purpose\x18\x02 \x01(\x0e\x32(.stackchan.websocket.v1.ListeningPurpose\"!\n\rWakeWordEvent\x12\x10\n\x08\x64\x65tected\x18\x01 \x01(\x08\"C\n\nStateEvent\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\"\x1e\n\x0eSpeakDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"N\n\x14ServoCommandSequence\x12\x36\n\x08\x63ommands\x18\x01 \x03(\x0b\x32$.stackchan.websocket.v1.ServoCommand\"f\n\x0cServoCommand\x12\x32\n\x02op\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.ServoOperation\x12\r\n\x05\x61ngle\x18\x02 \x01(\x11\x12\x13\n\x0b\x64uration_ms\x18\x03 \x01(\x11\"\x1e\n\x0eServoDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"\x99\x02\n\x10\x46irmwareMetadata\x12\x37\n\x0b\x64\x65vice_type\x18\x01 \x01(\x0e\x32\".stackchan.websocket.v1.DeviceType\x12\x15\n\rdisplay_width\x18\x02 \x01(\r\x12\x16\n\x0e\x64isplay_height\x18\x03 \x01(\r\x12\x1c\n\x14has_device_wake_word\x18\x04 \x01(\x08\x12\x0f\n\x07has_led\x18\x05 \x01(\x08\x12\x35\n\nservo_type\x18\x06 \x01(\x0e\x32!.stackchan.websocket.v1.ServoType\x12\x1d\n\x15supports_audio_duplex\x18\x07 \x01(\x08\x12\x18\n\x10\x66irmware_version\x18\x08 
\x01(\t\"F\n\x0eServerMetadata\x12\x1c\n\x14has_server_wake_word\x18\x01 \x01(\x08\x12\x16\n\x0eserver_version\x18\x02 \x01(\t*\xdf\x02\n\x0bMessageKind\x12\x1c\n\x18MESSAGE_KIND_UNSPECIFIED\x10\x00\x12\x1a\n\x16MESSAGE_KIND_AUDIO_PCM\x10\x01\x12\x1a\n\x16MESSAGE_KIND_AUDIO_WAV\x10\x02\x12\x1a\n\x16MESSAGE_KIND_STATE_CMD\x10\x03\x12\x1e\n\x1aMESSAGE_KIND_WAKE_WORD_EVT\x10\x04\x12\x1a\n\x16MESSAGE_KIND_STATE_EVT\x10\x05\x12\x1f\n\x1bMESSAGE_KIND_SPEAK_DONE_EVT\x10\x06\x12\x1a\n\x16MESSAGE_KIND_SERVO_CMD\x10\x07\x12\x1f\n\x1bMESSAGE_KIND_SERVO_DONE_EVT\x10\x08\x12\"\n\x1eMESSAGE_KIND_FIRMWARE_METADATA\x10\t\x12 \n\x1cMESSAGE_KIND_SERVER_METADATA\x10\n*p\n\x0bMessageType\x12\x1c\n\x18MESSAGE_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12MESSAGE_TYPE_START\x10\x01\x12\x15\n\x11MESSAGE_TYPE_DATA\x10\x02\x12\x14\n\x10MESSAGE_TYPE_END\x10\x03*\x85\x01\n\x0eStackchanState\x12\x18\n\x14STACKCHAN_STATE_IDLE\x10\x00\x12\x1d\n\x19STACKCHAN_STATE_LISTENING\x10\x01\x12\x1c\n\x18STACKCHAN_STATE_THINKING\x10\x02\x12\x1c\n\x18STACKCHAN_STATE_SPEAKING\x10\x03*t\n\x10ListeningPurpose\x12!\n\x1dLISTENING_PURPOSE_UNSPECIFIED\x10\x00\x12\x1c\n\x18LISTENING_PURPOSE_SPEECH\x10\x01\x12\x1f\n\x1bLISTENING_PURPOSE_WAKE_WORD\x10\x02*c\n\x0eServoOperation\x12\x19\n\x15SERVO_OPERATION_SLEEP\x10\x00\x12\x1a\n\x16SERVO_OPERATION_MOVE_X\x10\x01\x12\x1a\n\x16SERVO_OPERATION_MOVE_Y\x10\x02*\x85\x01\n\nDeviceType\x12\x1b\n\x17\x44\x45VICE_TYPE_UNSPECIFIED\x10\x00\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5STACK_CORES3\x10\x01\x12\x1a\n\x16\x44\x45VICE_TYPE_M5ATOM_S3R\x10\x02\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5ATOM_ECHOS3R\x10\x03*i\n\tServoType\x12\x1a\n\x16SERVO_TYPE_UNSPECIFIED\x10\x00\x12\x13\n\x0fSERVO_TYPE_NONE\x10\x01\x12\x13\n\x0fSERVO_TYPE_SG90\x10\x02\x12\x16\n\x12SERVO_TYPE_SCS0009\x10\x03\x62\x06proto3')
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x17websocket-message.proto\x12\x16stackchan.websocket.v1\"\x96\x08\n\x10WebSocketMessage\x12\x31\n\x04kind\x18\x01 \x01(\x0e\x32#.stackchan.websocket.v1.MessageKind\x12\x39\n\x0cmessage_type\x18\x02 \x01(\x0e\x32#.stackchan.websocket.v1.MessageType\x12\x0b\n\x03seq\x18\x03 \x01(\r\x12@\n\x0f\x61udio_pcm_start\x18\n \x01(\x0b\x32%.stackchan.websocket.v1.AudioPcmStartH\x00\x12<\n\x0e\x61udio_pcm_data\x18\x0b \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_pcm_end\x18\x0c \x01(\x0b\x32#.stackchan.websocket.v1.AudioPcmEndH\x00\x12@\n\x0f\x61udio_wav_start\x18\x14 \x01(\x0b\x32%.stackchan.websocket.v1.AudioWavStartH\x00\x12<\n\x0e\x61udio_wav_data\x18\x15 \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_wav_end\x18\x16 \x01(\x0b\x32#.stackchan.websocket.v1.AudioWavEndH\x00\x12\x39\n\tstate_cmd\x18\x1e \x01(\x0b\x32$.stackchan.websocket.v1.StateCommandH\x00\x12>\n\rwake_word_evt\x18\x1f \x01(\x0b\x32%.stackchan.websocket.v1.WakeWordEventH\x00\x12\x37\n\tstate_evt\x18  \x01(\x0b\x32\".stackchan.websocket.v1.StateEventH\x00\x12@\n\x0espeak_done_evt\x18! 
\x01(\x0b\x32&.stackchan.websocket.v1.SpeakDoneEventH\x00\x12\x41\n\tservo_cmd\x18\" \x01(\x0b\x32,.stackchan.websocket.v1.ServoCommandSequenceH\x00\x12@\n\x0eservo_done_evt\x18# \x01(\x0b\x32&.stackchan.websocket.v1.ServoDoneEventH\x00\x12\x45\n\x11\x66irmware_metadata\x18$ \x01(\x0b\x32(.stackchan.websocket.v1.FirmwareMetadataH\x00\x12\x41\n\x0fserver_metadata\x18% \x01(\x0b\x32&.stackchan.websocket.v1.ServerMetadataH\x00\x42\x06\n\x04\x62ody\"\x0f\n\rAudioPcmStart\"\r\n\x0b\x41udioPcmEnd\"6\n\rAudioWavStart\x12\x13\n\x0bsample_rate\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\"\r\n\x0b\x41udioWavEnd\"\x1f\n\nAudioChunk\x12\x11\n\tpcm_bytes\x18\x01 \x01(\x0c\"\x8a\x01\n\x0cStateCommand\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\x12\x43\n\x11listening_purpose\x18\x02 \x01(\x0e\x32(.stackchan.websocket.v1.ListeningPurpose\"!\n\rWakeWordEvent\x12\x10\n\x08\x64\x65tected\x18\x01 \x01(\x08\"C\n\nStateEvent\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\"\x1e\n\x0eSpeakDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"N\n\x14ServoCommandSequence\x12\x36\n\x08\x63ommands\x18\x01 \x03(\x0b\x32$.stackchan.websocket.v1.ServoCommand\"f\n\x0cServoCommand\x12\x32\n\x02op\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.ServoOperation\x12\r\n\x05\x61ngle\x18\x02 \x01(\x11\x12\x13\n\x0b\x64uration_ms\x18\x03 \x01(\x11\"\x1e\n\x0eServoDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"\x99\x02\n\x10\x46irmwareMetadata\x12\x37\n\x0b\x64\x65vice_type\x18\x01 \x01(\x0e\x32\".stackchan.websocket.v1.DeviceType\x12\x15\n\rdisplay_width\x18\x02 \x01(\r\x12\x16\n\x0e\x64isplay_height\x18\x03 \x01(\r\x12\x1c\n\x14has_device_wake_word\x18\x04 \x01(\x08\x12\x0f\n\x07has_led\x18\x05 \x01(\x08\x12\x35\n\nservo_type\x18\x06 \x01(\x0e\x32!.stackchan.websocket.v1.ServoType\x12\x1d\n\x15supports_audio_duplex\x18\x07 \x01(\x08\x12\x18\n\x10\x66irmware_version\x18\x08 
\x01(\t\"F\n\x0eServerMetadata\x12\x1c\n\x14has_server_wake_word\x18\x01 \x01(\x08\x12\x16\n\x0eserver_version\x18\x02 \x01(\t*\x80\x03\n\x0bMessageKind\x12\x1c\n\x18MESSAGE_KIND_UNSPECIFIED\x10\x00\x12\x1a\n\x16MESSAGE_KIND_AUDIO_PCM\x10\x01\x12\x1a\n\x16MESSAGE_KIND_AUDIO_WAV\x10\x02\x12\x1a\n\x16MESSAGE_KIND_STATE_CMD\x10\x03\x12\x1e\n\x1aMESSAGE_KIND_WAKE_WORD_EVT\x10\x04\x12\x1a\n\x16MESSAGE_KIND_STATE_EVT\x10\x05\x12\x1f\n\x1bMESSAGE_KIND_SPEAK_DONE_EVT\x10\x06\x12\x1a\n\x16MESSAGE_KIND_SERVO_CMD\x10\x07\x12\x1f\n\x1bMESSAGE_KIND_SERVO_DONE_EVT\x10\x08\x12\"\n\x1eMESSAGE_KIND_FIRMWARE_METADATA\x10\t\x12 \n\x1cMESSAGE_KIND_SERVER_METADATA\x10\n\x12\x1f\n\x1bMESSAGE_KIND_SERVER_WWD_PCM\x10\x0b*p\n\x0bMessageType\x12\x1c\n\x18MESSAGE_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12MESSAGE_TYPE_START\x10\x01\x12\x15\n\x11MESSAGE_TYPE_DATA\x10\x02\x12\x14\n\x10MESSAGE_TYPE_END\x10\x03*\x85\x01\n\x0eStackchanState\x12\x18\n\x14STACKCHAN_STATE_IDLE\x10\x00\x12\x1d\n\x19STACKCHAN_STATE_LISTENING\x10\x01\x12\x1c\n\x18STACKCHAN_STATE_THINKING\x10\x02\x12\x1c\n\x18STACKCHAN_STATE_SPEAKING\x10\x03*t\n\x10ListeningPurpose\x12!\n\x1dLISTENING_PURPOSE_UNSPECIFIED\x10\x00\x12\x1c\n\x18LISTENING_PURPOSE_SPEECH\x10\x01\x12\x1f\n\x1bLISTENING_PURPOSE_WAKE_WORD\x10\x02*c\n\x0eServoOperation\x12\x19\n\x15SERVO_OPERATION_SLEEP\x10\x00\x12\x1a\n\x16SERVO_OPERATION_MOVE_X\x10\x01\x12\x1a\n\x16SERVO_OPERATION_MOVE_Y\x10\x02*\x85\x01\n\nDeviceType\x12\x1b\n\x17\x44\x45VICE_TYPE_UNSPECIFIED\x10\x00\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5STACK_CORES3\x10\x01\x12\x1a\n\x16\x44\x45VICE_TYPE_M5ATOM_S3R\x10\x02\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5ATOM_ECHOS3R\x10\x03*i\n\tServoType\x12\x1a\n\x16SERVO_TYPE_UNSPECIFIED\x10\x00\x12\x13\n\x0fSERVO_TYPE_NONE\x10\x01\x12\x13\n\x0fSERVO_TYPE_SG90\x10\x02\x12\x16\n\x12SERVO_TYPE_SCS0009\x10\x03\x62\x06proto3')
 
 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
@@ -32,19 +32,19 @@
 if not _descriptor._USE_C_DESCRIPTORS:
   DESCRIPTOR._loaded_options = None
   _globals['_MESSAGEKIND']._serialized_start=2086
-  _globals['_MESSAGEKIND']._serialized_end=2437
-  _globals['_MESSAGETYPE']._serialized_start=2439
-  _globals['_MESSAGETYPE']._serialized_end=2551
-  _globals['_STACKCHANSTATE']._serialized_start=2554
-  _globals['_STACKCHANSTATE']._serialized_end=2687
-  _globals['_LISTENINGPURPOSE']._serialized_start=2689
-  _globals['_LISTENINGPURPOSE']._serialized_end=2805
-  _globals['_SERVOOPERATION']._serialized_start=2807
-  _globals['_SERVOOPERATION']._serialized_end=2906
-  _globals['_DEVICETYPE']._serialized_start=2909
-  _globals['_DEVICETYPE']._serialized_end=3042
-  _globals['_SERVOTYPE']._serialized_start=3044
-  _globals['_SERVOTYPE']._serialized_end=3149
+  _globals['_MESSAGEKIND']._serialized_end=2470
+  _globals['_MESSAGETYPE']._serialized_start=2472
+  _globals['_MESSAGETYPE']._serialized_end=2584
+  _globals['_STACKCHANSTATE']._serialized_start=2587
+  _globals['_STACKCHANSTATE']._serialized_end=2720
+  _globals['_LISTENINGPURPOSE']._serialized_start=2722
+  _globals['_LISTENINGPURPOSE']._serialized_end=2838
+  _globals['_SERVOOPERATION']._serialized_start=2840
+  _globals['_SERVOOPERATION']._serialized_end=2939
+  _globals['_DEVICETYPE']._serialized_start=2942
+  _globals['_DEVICETYPE']._serialized_end=3075
+  _globals['_SERVOTYPE']._serialized_start=3077
+  _globals['_SERVOTYPE']._serialized_end=3182
   _globals['_WEBSOCKETMESSAGE']._serialized_start=52
   _globals['_WEBSOCKETMESSAGE']._serialized_end=1098
   _globals['_AUDIOPCMSTART']._serialized_start=1100
diff --git a/stackchan_server/wakeup_word_detection/__init__.py b/stackchan_server/wakeup_word_detection/__init__.py
index d4c0f62..dbbd0c2 100644
--- a/stackchan_server/wakeup_word_detection/__init__.py
+++ b/stackchan_server/wakeup_word_detection/__init__.py
@@ -1,13 +1,15 @@
 from .create import create_server_side_wake_word_detector
-from .server_side import (
-    ServerSideWakeWordConfig,
-    ServerSideWakeWordDetector,
+from .whisper_server import (
     WakeWordDetectionError,
+    WhisperServerWakeWordDetector,
+    WhisperServerWakeWordDetectorConfig,
+    WhisperServerWakeWordSpeechToTextConfig,
 )
 
 __all__ = [
     "create_server_side_wake_word_detector",
-    "ServerSideWakeWordConfig",
-    "ServerSideWakeWordDetector",
+    "WhisperServerWakeWordDetector",
+    "WhisperServerWakeWordDetectorConfig",
+    "WhisperServerWakeWordSpeechToTextConfig",
     "WakeWordDetectionError",
 ]
diff --git a/stackchan_server/wakeup_word_detection/create.py b/stackchan_server/wakeup_word_detection/create.py
index 2ec5011..6d7520d 100644
--- a/stackchan_server/wakeup_word_detection/create.py
+++ b/stackchan_server/wakeup_word_detection/create.py
@@ -1,28 +1,23 @@
 from __future__ import annotations
 
-from pydantic import Field
 from pydantic_settings import BaseSettings
 
-from ..speech_recognition.whisper_server import WhisperServerSpeechToText
-from .server_side import ServerSideWakeWordDetector
+from .whisper_server import WhisperServerWakeWordDetector
 
 
-class _CreateServerSideWakeWordDetectorEnv(BaseSettings):
-    use_server_side_wwd_whisper_server: bool = Field(
-        default=False,
-        validation_alias="USE_SERVER_SIDE_WWD_WHISPER_SERVER",
-    )
+class _CreateWhisperServerWakeWordDetectorEnv(BaseSettings):
+    use_wwd_whisper_server: bool = False
 
     class Config:
-        env_prefix = ""
+        env_prefix = "STACKCHAN_"
 
 
-def create_server_side_wake_word_detector() -> ServerSideWakeWordDetector | None:
-    env = _CreateServerSideWakeWordDetectorEnv()
-    if not env.use_server_side_wwd_whisper_server:
+def create_server_side_wake_word_detector() -> WhisperServerWakeWordDetector | None:
+    env = _CreateWhisperServerWakeWordDetectorEnv()
+    if not env.use_wwd_whisper_server:
         return None
 
-    return ServerSideWakeWordDetector(recognizer=WhisperServerSpeechToText())
+    return WhisperServerWakeWordDetector()
 
 
 __all__ = ["create_server_side_wake_word_detector"]
diff --git a/stackchan_server/wakeup_word_detection/server_side.py b/stackchan_server/wakeup_word_detection/whisper_server.py
similarity index 89%
rename from stackchan_server/wakeup_word_detection/server_side.py
rename to stackchan_server/wakeup_word_detection/whisper_server.py
index f77be72..e389bbb 100644
--- a/stackchan_server/wakeup_word_detection/server_side.py
+++ b/stackchan_server/wakeup_word_detection/whisper_server.py
@@ -7,7 +7,10 @@
 from pydantic import Field
 from pydantic_settings import BaseSettings
 
-from ..speech_recognition.whisper_server import WhisperServerSpeechToText
+from ..speech_recognition.whisper_server import (
+    WhisperServerSpeechToText,
+    WhisperServerSpeechToTextConfig,
+)
 from ..static import LISTEN_AUDIO_FORMAT
 
 logger = getLogger(__name__)
@@ -17,25 +20,32 @@ class WakeWordDetectionError(Exception):
     pass
 
 
-class ServerSideWakeWordConfig(BaseSettings):
+class WhisperServerWakeWordDetectorConfig(BaseSettings):
     keywords: list[str] = Field(default_factory=lambda: ["スタックチャン"])
     window_seconds: float = 3.0
     interval_seconds: float = 0.5
     timeout_seconds: float = 30.0
 
     class Config:
-        env_prefix = "STACKCHAN_SERVER_WAKE_WORD_"
+        env_prefix = "STACKCHAN_WWD_"
 
 
-class ServerSideWakeWordDetector:
+class WhisperServerWakeWordSpeechToTextConfig(WhisperServerSpeechToTextConfig):
+    class Config(WhisperServerSpeechToTextConfig.Config):
+        env_prefix = "STACKCHAN_WWD_WHISPER_SERVER_"
+
+
+class WhisperServerWakeWordDetector:
     def __init__(
         self,
         *,
         recognizer: WhisperServerSpeechToText | None = None,
-        config: ServerSideWakeWordConfig | None = None,
+        config: WhisperServerWakeWordDetectorConfig | None = None,
     ) -> None:
-        self.config = config or ServerSideWakeWordConfig()
-        self.recognizer = recognizer or WhisperServerSpeechToText()
+        self.config = config or WhisperServerWakeWordDetectorConfig()
+        self.recognizer = recognizer or WhisperServerSpeechToText(
+            config=WhisperServerWakeWordSpeechToTextConfig()
+        )
         self._pcm_buffer = bytearray()
         self._running = False
         self._detected = False
@@ -199,7 +209,8 @@ def _normalize_text(text: str) -> str:
 
 
 __all__ = [
-    "ServerSideWakeWordConfig",
-    "ServerSideWakeWordDetector",
+    "WhisperServerWakeWordDetector",
+    "WhisperServerWakeWordDetectorConfig",
+    "WhisperServerWakeWordSpeechToTextConfig",
     "WakeWordDetectionError",
-]
+]
\ No newline at end of file
diff --git a/stackchan_server/ws_proxy.py b/stackchan_server/ws_proxy.py
index b10ec5a..55576a8 100644
--- a/stackchan_server/ws_proxy.py
+++ b/stackchan_server/ws_proxy.py
@@ -217,7 +217,7 @@ async def move_servo(self, commands: Sequence[ServoCommand]) -> None:
         self._servo_sent_counter = target_counter
         self._pending_servo_wait_targets.append(target_counter)
         try:
-            await self.ws.send_bytes(
+            await self._send_ws_bytes(
                 encode_servo_command_message(self._next_down_seq(), commands)
             )
         except Exception:
@@ -254,7 +254,11 @@ async def close(self) -> None:
         if self._receiving_task:
             self._receiving_task.cancel()
             with suppress(asyncio.CancelledError):
-                await self._receiving_task
+                try:
+                    await self._receiving_task
+                except RuntimeError as exc:
+                    if not self._is_closed_websocket_runtime_error(exc):
+                        raise
         await self._listener.close()
 
     async def start_talking(self, text: str) -> None:
@@ -341,107 +345,27 @@ async def request_server_wakeword_detection(
     async def _receive_loop(self) -> None:
         try:
             while True:
-                raw_message = await self.ws.receive_bytes()
+                try:
+                    raw_message = await self.ws.receive_bytes()
+                except RuntimeError as exc:
+                    if self._is_closed_websocket_runtime_error(exc):
+                        break
+                    raise
                 try:
                     message = parse_websocket_message(raw_message)
                 except DecodeError:
                     await self.ws.close(code=1003, reason="invalid protobuf message")
                     break
 
-                if message.kind == ws_pb2.MESSAGE_KIND_AUDIO_PCM:
-                    body_name = message.WhichOneof("body")
-
-                    if self._should_drain_trailing_pcm():
-                        if (
-                            message.message_type == ws_pb2.MESSAGE_TYPE_START
-                            and body_name == "audio_pcm_start"
-                        ):
-                            logger.info(
-                                "Received a new PCM START while draining trailing wake-word audio; resuming normal routing"
-                            )
-                            self._clear_trailing_pcm_drain()
-                        elif (
-                            message.message_type == ws_pb2.MESSAGE_TYPE_DATA
-                            and body_name == "audio_pcm_data"
-                        ):
-                            logger.info(
-                                "Discarding trailing PCM DATA after wake-word detection stop payload_bytes=%d",
-                                len(message.audio_pcm_data.pcm_bytes),
-                            )
-                            continue
-                        elif (
-                            message.message_type == ws_pb2.MESSAGE_TYPE_END
-                            and body_name == "audio_pcm_end"
-                        ):
-                            logger.info(
-                                "Finished draining trailing PCM after wake-word detection stop"
-                            )
-                            self._clear_trailing_pcm_drain()
-                            continue
-
-                    if (
-                        self._server_wakeword_detector is not None
-                        and self._server_wakeword_detector.running
-                    ):
-                        if (
-                            message.message_type == ws_pb2.MESSAGE_TYPE_START
-                            and body_name == "audio_pcm_start"
-                        ):
-                            await self._server_wakeword_detector.handle_start()
-                            continue
-
-                        if (
-                            message.message_type == ws_pb2.MESSAGE_TYPE_DATA
-                            and body_name == "audio_pcm_data"
-                        ):
-                            payload = bytes(message.audio_pcm_data.pcm_bytes)
-                            await self._server_wakeword_detector.handle_data(payload)
-                            continue
-
-                        if (
-                            message.message_type == ws_pb2.MESSAGE_TYPE_END
-                            and body_name == "audio_pcm_end"
-                        ):
-                            await self._server_wakeword_detector.handle_end()
-                            continue
-
-                        await self.ws.close(code=1003, reason="unknown wakeword PCM protobuf body")
+                if message.kind == ws_pb2.MESSAGE_KIND_SERVER_WWD_PCM:
+                    if not await self._handle_server_wakeword_pcm_message(message):
                         break
+                    continue
 
-                    if (
-                        message.message_type == ws_pb2.MESSAGE_TYPE_START
-                        and body_name == "audio_pcm_start"
-                    ):
-                        if not await self._listener.handle_start(self.ws):
-                            break
-                        continue
-
-                    if (
-                        message.message_type == ws_pb2.MESSAGE_TYPE_DATA
-                        and body_name == "audio_pcm_data"
-                    ):
-                        payload = bytes(message.audio_pcm_data.pcm_bytes)
-                        if not await self._listener.handle_data(
-                            self.ws, len(payload), payload
-                        ):
-                            break
-                        continue
-
-                    if (
-                        message.message_type == ws_pb2.MESSAGE_TYPE_END
-                        and body_name == "audio_pcm_end"
-                    ):
-                        await self._listener.handle_end(
-                            self.ws,
-                            payload_bytes=0,
-                            payload=b"",
-                            send_state_command=self.send_state_command,
-                            thinking_state=FirmwareState.THINKING,
-                        )
-                        continue
-
-                    await self.ws.close(code=1003, reason="unknown PCM protobuf body")
-                    break
+                if message.kind == ws_pb2.MESSAGE_KIND_AUDIO_PCM:
+                    if not await self._handle_audio_pcm_message(message):
+                        break
+                    continue
 
                 if message.kind == ws_pb2.MESSAGE_KIND_WAKE_WORD_EVT:
                     self._handle_wakeword_event(message)
@@ -470,6 +394,101 @@ async def _receive_loop(self) -> None:
         finally:
             self._closed = True
 
+    async def _handle_server_wakeword_pcm_message(self, message: Any) -> bool:
+        body_name = message.WhichOneof("body")
+
+        if self._should_drain_trailing_pcm():
+            if (
+                message.message_type == ws_pb2.MESSAGE_TYPE_START
+                and body_name == "audio_pcm_start"
+            ):
+                logger.info(
+                    "Received a new server-side wake-word PCM START while draining trailing audio; resuming normal routing"
+                )
+                self._clear_trailing_pcm_drain()
+            elif (
+                message.message_type == ws_pb2.MESSAGE_TYPE_DATA
+                and body_name == "audio_pcm_data"
+            ):
+                logger.info(
+                    "Discarding trailing server-side wake-word PCM DATA payload_bytes=%d",
+                    len(message.audio_pcm_data.pcm_bytes),
+                )
+                return True
+            elif (
+                message.message_type == ws_pb2.MESSAGE_TYPE_END
+                and body_name == "audio_pcm_end"
+            ):
+                logger.info("Finished draining trailing server-side wake-word PCM")
+                self._clear_trailing_pcm_drain()
+                return True
+
+        detector = self._server_wakeword_detector
+        if detector is None or not detector.running:
+            logger.info(
+                "Ignoring server-side wake-word PCM while detector is inactive type=%s body=%s",
+                message.message_type,
+                body_name,
+            )
+            return True
+
+        if (
+            message.message_type == ws_pb2.MESSAGE_TYPE_START
+            and body_name == "audio_pcm_start"
+        ):
+            await detector.handle_start()
+            return True
+
+        if (
+            message.message_type == ws_pb2.MESSAGE_TYPE_DATA
+            and body_name == "audio_pcm_data"
+        ):
+            payload = bytes(message.audio_pcm_data.pcm_bytes)
+            await detector.handle_data(payload)
+            return True
+
+        if (
+            message.message_type == ws_pb2.MESSAGE_TYPE_END
+            and body_name == "audio_pcm_end"
+        ):
+            await detector.handle_end()
+            return True
+
+        await self.ws.close(code=1003, reason="unknown server wake-word PCM protobuf body")
+        return False
+
+    async def _handle_audio_pcm_message(self, message: Any) -> bool:
+        body_name = message.WhichOneof("body")
+
+        if (
+            message.message_type == ws_pb2.MESSAGE_TYPE_START
+            and body_name == "audio_pcm_start"
+        ):
+            return await self._listener.handle_start(self.ws)
+
+        if (
+            message.message_type == ws_pb2.MESSAGE_TYPE_DATA
+            and body_name == "audio_pcm_data"
+        ):
+            payload = bytes(message.audio_pcm_data.pcm_bytes)
+            return await self._listener.handle_data(self.ws, len(payload), payload)
+
+        if (
+            message.message_type == ws_pb2.MESSAGE_TYPE_END
+            and body_name == "audio_pcm_end"
+        ):
+            await self._listener.handle_end(
+                self.ws,
+                payload_bytes=0,
+                payload=b"",
+                send_state_command=self.send_state_command,
+                thinking_state=FirmwareState.THINKING,
+            )
+            return True
+
+        await self.ws.close(code=1003, reason="unknown PCM protobuf body")
+        return False
+
     def _handle_wakeword_event(self, message: Any) -> None:
         if message.message_type != ws_pb2.MESSAGE_TYPE_DATA:
             return
@@ -509,7 +528,7 @@ async def _handle_firmware_metadata(self, message: Any) -> None:
             self.firmware_metadata.firmware_version,
         )
         self.server_metadata = self._build_server_metadata(self.firmware_metadata)
-        await self.ws.send_bytes(
+        await self._send_ws_bytes(
             encode_server_metadata_message(
                 self._next_down_seq(),
                 has_server_wake_word=self.server_metadata.has_server_wake_word,
@@ -566,7 +585,7 @@ async def _send_state_command(
         *,
         listening_purpose: ListeningPurpose = ListeningPurpose.SPEECH,
     ) -> None:
-        await self.ws.send_bytes(
+        await self._send_ws_bytes(
             encode_state_command_message(
                 self._next_down_seq(),
                 int(state_id),
@@ -574,6 +593,25 @@ async def _send_state_command(
             )
         )
 
+    async def _send_ws_bytes(self, data: bytes) -> None:
+        try:
+            await self.ws.send_bytes(data)
+        except RuntimeError as exc:
+            self._raise_websocket_disconnect_from_runtime_error(exc)
+
+    def _is_closed_websocket_runtime_error(self, exc: RuntimeError) -> bool:
+        message = str(exc)
+        return (
+            'Cannot call "send" once a close message has been sent.' in message
+            or 'WebSocket is not connected. Need to call "accept" first.' in message
+        )
+
+    def _raise_websocket_disconnect_from_runtime_error(self, exc: RuntimeError) -> None:
+        if not self._is_closed_websocket_runtime_error(exc):
+            raise exc
+        self._closed = True
+        raise WebSocketDisconnect() from exc
+
     async def _run_server_wakeword_detection(self) -> bool:
         detector = self._server_wakeword_detector
         if detector is None:

From 82c09a9b39ecc62dc0cc5fe0883cc12cfc688861 Mon Sep 17 00:00:00 2001
From: Atsushi Morimoto <74th.tech@gmail.com>
Date: Sat, 9 May 2026 16:09:45 +0900
Subject: [PATCH 04/15] feat: Add language and prompt configuration for Whisper
 Server

---
 .env.template                                         |  4 ++++
 docs/server_ja.md                                     | 10 ++++++++++
 stackchan_server/speech_recognition/whisper_server.py | 11 +++++++++--
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/.env.template b/.env.template
index 89d30f6..1d82075 100644
--- a/.env.template
+++ b/.env.template
@@ -21,6 +21,8 @@ STACKCHAN_GOOGLE_CLOUD_STT_LANGUAGE_CODE="ja-JP"
 # STACKCHAN_USE_WHISPER_SERVER=1
 # STACKCHAN_WHISPER_SERVER_URL="http://127.0.0.1:8080/inference"
 # STACKCHAN_WHISPER_SERVER_MODEL=
+# STACKCHAN_WHISPER_SERVER_PROMPT=
+# STACKCHAN_WHISPER_SERVER_LANGUAGE="ja"
 
 # -- Speech Syntheis --
 # Google Cloud TTS
@@ -39,6 +41,8 @@ STACKCHAN_VOICEVOX_SPEAKER=1
 # STACKCHAN_USE_WWD_WHISPER_SERVER=1
 # STACKCHAN_WWD_WHISPER_SERVER_URL="http://127.0.0.1:8080/inference"
 # STACKCHAN_WWD_WHISPER_SERVER_MODEL=
+# STACKCHAN_WWD_WHISPER_SERVER_LANGUAGE="ja"
+# STACKCHAN_WWD_WHISPER_SERVER_PROMPT="日本語で、スタックチャンという名前で、話しかけられるので、話しかけられたことを検出してください"
 
 # -- Claude Agent SDK --
 # using Google Cloud Vertex AI
diff --git a/docs/server_ja.md b/docs/server_ja.md
index 506727c..2bc40c9 100644
--- a/docs/server_ja.md
+++ b/docs/server_ja.md
@@ -65,6 +65,8 @@ STACKCHAN_WHISPER_CLI_VAD_MODEL_PATH="/path/to/whisper.cpp/ggml-silero-v5.1.2.bi
 
 `STACKCHAN_WHISPER_SERVER_URL` に Whisper Server の推論エンドポイント URL をそのまま指定します。
 未設定時は `http://127.0.0.1:8080/inference` を利用します。
+`STACKCHAN_WHISPER_SERVER_LANGUAGE` を設定すると、その値を `language` パラメータとして各リクエストに含めます。未設定または空文字の場合は `language` を送信しません。
+また、`STACKCHAN_WHISPER_SERVER_PROMPT` を設定すると、whisper-server の各リクエストに `prompt` フィールドとして送信します。
 
 #### 例: Whisper.cppのwhisper-serverの設定
 
@@ -74,6 +76,8 @@ whisper.cpp/examples/server: https://github.com/ggml-org/whisper.cpp/tree/master
 STACKCHAN_USE_WHISPER_SERVER=1
 STACKCHAN_WHISPER_SERVER_URL="http://127.0.0.1:8080/inference"
 STACKCHAN_WHISPER_SERVER_MODEL=
+STACKCHAN_WHISPER_SERVER_LANGUAGE="ja"
+STACKCHAN_WHISPER_SERVER_PROMPT=""
 ```
 
 #### 例: [Lemonade](https://lemonade-server.ai/) を使う場合
@@ -84,6 +88,8 @@ Lemonade: https://lemonade-server.ai/
 STACKCHAN_USE_WHISPER_SERVER=1
 STACKCHAN_WHISPER_SERVER_URL=http://localhost:13305/api/v1/audio/transcriptions
 STACKCHAN_WHISPER_SERVER_MODEL=Whisper-Large-v3-Turbo
+STACKCHAN_WHISPER_SERVER_LANGUAGE="ja"
+STACKCHAN_WHISPER_SERVER_PROMPT=""
 ```
 
 ### (オプション) サーバーサイド wakeword 用 Whisper Server の設定
@@ -93,6 +99,8 @@ STACKCHAN_WHISPER_SERVER_MODEL=Whisper-Large-v3-Turbo
 - `STACKCHAN_USE_WWD_WHISPER_SERVER`: `1`
 - `STACKCHAN_WWD_WHISPER_SERVER_URL`: wakeword 検出専用 Whisper Server の推論エンドポイント URL
 - `STACKCHAN_WWD_WHISPER_SERVER_MODEL`: wakeword 検出専用に利用するモデル名
+- `STACKCHAN_WWD_WHISPER_SERVER_LANGUAGE`: wakeword 検出専用 Whisper Server リクエストへ渡す language
+- `STACKCHAN_WWD_WHISPER_SERVER_PROMPT`: wakeword 検出専用 Whisper Server リクエストへ渡す prompt
 
 通常の音声認識で使う `STACKCHAN_WHISPER_SERVER_URL` / `STACKCHAN_WHISPER_SERVER_MODEL` とは別設定です。
 
@@ -100,6 +108,8 @@ STACKCHAN_WHISPER_SERVER_MODEL=Whisper-Large-v3-Turbo
 STACKCHAN_USE_WWD_WHISPER_SERVER=1
 STACKCHAN_WWD_WHISPER_SERVER_URL="http://127.0.0.1:8080/inference"
 STACKCHAN_WWD_WHISPER_SERVER_MODEL=
+STACKCHAN_WWD_WHISPER_SERVER_LANGUAGE="ja"
+STACKCHAN_WWD_WHISPER_SERVER_PROMPT="日本語で、スタックチャンという名前で、話しかけられるので、話しかけられたことを検出してください"
 ```
 
 ## 音声合成の設定
diff --git a/stackchan_server/speech_recognition/whisper_server.py b/stackchan_server/speech_recognition/whisper_server.py
index 99dd811..d508cb5 100644
--- a/stackchan_server/speech_recognition/whisper_server.py
+++ b/stackchan_server/speech_recognition/whisper_server.py
@@ -25,9 +25,10 @@
 
 class WhisperServerSpeechToTextConfig(BaseSettings):
     url: str = _DEFAULT_SERVER_URL
-    language: str = "auto"
+    language: str = ""
     detect_language: bool = False
     response_format: str = "verbose_json"
+    prompt: str = ""
     silence_rms_threshold: float = _DEFAULT_SILENCE_RMS_THRESHOLD
     request_timeout_seconds: float = 60.0
     model: str = ""
@@ -73,9 +74,15 @@ async def transcribe(self, pcm_bytes: bytes) -> str:
     def _request_transcript(self, wav_bytes: bytes, language: str) -> str:
         fields = {
             "response_format": self._conf.response_format,
-            "language": language,
         }
 
+        normalized_language = language.strip()
+        if normalized_language:
+            fields["language"] = normalized_language
+
+        if self._conf.prompt:
+            fields["prompt"] = self._conf.prompt
+
         if self._conf.model:
             fields["model"] = self._conf.model
 

From f318d976d195e388f862f4c4270aafe11f5eaa28 Mon Sep 17 00:00:00 2001
From: Atsushi Morimoto <74th.tech@gmail.com>
Date: Sat, 9 May 2026 16:24:14 +0900
Subject: [PATCH 05/15] feat: Add WakeWordDetectionTimeout for improved error
 handling in wake-word detection

---
 stackchan_server/wakeup_word_detection/__init__.py  |  2 ++
 .../wakeup_word_detection/whisper_server.py         | 13 ++++++++++---
 stackchan_server/ws_proxy.py                        |  8 +++++++-
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/stackchan_server/wakeup_word_detection/__init__.py b/stackchan_server/wakeup_word_detection/__init__.py
index dbbd0c2..198a4fb 100644
--- a/stackchan_server/wakeup_word_detection/__init__.py
+++ b/stackchan_server/wakeup_word_detection/__init__.py
@@ -1,6 +1,7 @@
 from .create import create_server_side_wake_word_detector
 from .whisper_server import (
     WakeWordDetectionError,
+    WakeWordDetectionTimeout,
     WhisperServerWakeWordDetector,
     WhisperServerWakeWordDetectorConfig,
     WhisperServerWakeWordSpeechToTextConfig,
@@ -12,4 +13,5 @@
     "WhisperServerWakeWordDetectorConfig",
     "WhisperServerWakeWordSpeechToTextConfig",
     "WakeWordDetectionError",
+    "WakeWordDetectionTimeout",
 ]
diff --git a/stackchan_server/wakeup_word_detection/whisper_server.py b/stackchan_server/wakeup_word_detection/whisper_server.py
index e389bbb..6b25fe6 100644
--- a/stackchan_server/wakeup_word_detection/whisper_server.py
+++ b/stackchan_server/wakeup_word_detection/whisper_server.py
@@ -20,11 +20,15 @@ class WakeWordDetectionError(Exception):
     pass
 
 
+class WakeWordDetectionTimeout(WakeWordDetectionError):
+    pass
+
+
 class WhisperServerWakeWordDetectorConfig(BaseSettings):
     keywords: list[str] = Field(default_factory=lambda: ["スタックチャン"])
     window_seconds: float = 3.0
     interval_seconds: float = 0.5
-    timeout_seconds: float = 30.0
+    timeout_seconds: float = 300.0
 
     class Config:
         env_prefix = "STACKCHAN_WWD_"
@@ -154,7 +158,9 @@ async def wait_result(self, timeout_seconds: float | None = None) -> bool:
         try:
             await asyncio.wait_for(self._event.wait(), timeout=timeout)
         except asyncio.TimeoutError as exc:
-            raise WakeWordDetectionError("Server-side wake-word detection timed out") from exc
+            raise WakeWordDetectionTimeout(
+                "Server-side wake-word detection timed out"
+            ) from exc
 
         if self._error is not None:
             raise WakeWordDetectionError(str(self._error)) from self._error
@@ -213,4 +219,5 @@ def _normalize_text(text: str) -> str:
     "WhisperServerWakeWordDetectorConfig",
     "WhisperServerWakeWordSpeechToTextConfig",
     "WakeWordDetectionError",
-]
\ No newline at end of file
+    "WakeWordDetectionTimeout",
+]
diff --git a/stackchan_server/ws_proxy.py b/stackchan_server/ws_proxy.py
index 55576a8..f4024ce 100644
--- a/stackchan_server/ws_proxy.py
+++ b/stackchan_server/ws_proxy.py
@@ -28,6 +28,7 @@
 from .types import SpeechRecognizer, SpeechSynthesizer
 from .wakeup_word_detection import (
     WakeWordDetectionError,
+    WakeWordDetectionTimeout,
     create_server_side_wake_word_detector,
 )
 
@@ -340,7 +341,9 @@ async def request_server_wakeword_detection(
             return await asyncio.wait_for(asyncio.shield(task), timeout=timeout_seconds)
         except asyncio.TimeoutError as exc:
             await self.stop_server_wakeword_detection()
-            raise WakeWordDetectionError("Server-side wake-word detection timed out") from exc
+            raise WakeWordDetectionTimeout(
+                "Server-side wake-word detection timed out"
+            ) from exc
 
     async def _receive_loop(self) -> None:
         try:
@@ -631,6 +634,9 @@ async def _run_server_wakeword_detection(self) -> bool:
             return detected
         except asyncio.CancelledError:
             raise
+        except WakeWordDetectionTimeout as exc:
+            logger.info("Server-side wake-word detection stopped: %s", exc)
+            return False
         except WakeWordDetectionError as exc:
             logger.warning("Server-side wake-word detection stopped: %s", exc)
             return False

From 40543d013cd2265f21945e47faa26ba2801e8f3b Mon Sep 17 00:00:00 2001
From: Atsushi Morimoto <74th.tech@gmail.com>
Date: Sat, 9 May 2026 16:24:21 +0900
Subject: [PATCH 06/15] feat: Enhance display logic to track server wake word
 idle state

---
 firmware/include/display.hpp |  2 ++
 firmware/src/display.cpp     | 10 +++++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/firmware/include/display.hpp b/firmware/include/display.hpp
index 5cfc72d..2012127 100644
--- a/firmware/include/display.hpp
+++ b/firmware/include/display.hpp
@@ -26,8 +26,10 @@ class Display
   void drawFace();
   bool isAtomS3R() const;
   int32_t statusBarHeight() const;
+  bool shouldShowServerWakeWordIdle() const;
 
   StateMachine &state_;
   bool has_prev_state_ = false;
   StateMachine::State prev_state_ = StateMachine::Idle;
+  bool prev_server_wake_word_idle_ = false;
 };
diff --git a/firmware/src/display.cpp b/firmware/src/display.cpp
index 21aaf62..aeb9676 100644
--- a/firmware/src/display.cpp
+++ b/firmware/src/display.cpp
@@ -71,12 +71,14 @@ void Display::init()
   drawFace();
   has_prev_state_ = true;
   prev_state_ = state_.getState();
+  prev_server_wake_word_idle_ = shouldShowServerWakeWordIdle();
 }
 
 void Display::loop()
 {
   StateMachine::State current = state_.getState();
-  if (!has_prev_state_ || current != prev_state_)
+  bool current_server_wake_word_idle = shouldShowServerWakeWordIdle();
+  if (!has_prev_state_ || current != prev_state_ || current_server_wake_word_idle != prev_server_wake_word_idle_)
   {
     GFXModule.fillScreen(TFT_BLACK);
     drawForState(current);
@@ -84,6 +86,7 @@ void Display::loop()
   }
 
   prev_state_ = current;
+  prev_server_wake_word_idle_ = current_server_wake_word_idle;
   has_prev_state_ = true;
 }
 
@@ -174,6 +177,11 @@ bool Display::isAtomS3R() const
 #endif
 }
 
+bool Display::shouldShowServerWakeWordIdle() const
+{
+  return state_.getState() == StateMachine::Idle && shouldUseServerWakeWord();
+}
+
 int32_t Display::statusBarHeight() const
 {
   return isAtomS3R() ? 28 : 20;

From 7235abfb1f10164cd9ae5cf6d05c991f3b3062d6 Mon Sep 17 00:00:00 2001
From: Atsushi Morimoto <74th.tech@gmail.com>
Date: Sat, 9 May 2026 16:47:38 +0900
Subject: [PATCH 07/15] feat: Update StateCmd handling for server-side wakeword
 detection

---
 AGENTS.md                                     |  7 +-
 docs/websocket_protocols_ja.md                | 19 +++---
 firmware/include/display.hpp                  |  2 -
 firmware/include/state_machine.hpp            |  8 ++-
 .../generated_protobuf/websocket-message.pb.c |  2 -
 .../generated_protobuf/websocket-message.pb.h | 29 +++------
 firmware/src/display.cpp                      | 17 ++---
 firmware/src/main.cpp                         | 58 +++++++++--------
 firmware/src/state_machine.cpp                |  7 ++
 protobuf/websocket-message.proto              |  8 +--
 .../websocket_message_pb2.py                  | 64 +++++++++----------
 stackchan_server/protobuf_ws.py               | 12 +---
 stackchan_server/ws_proxy.py                  | 17 +----
 13 files changed, 106 insertions(+), 144 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 26b7ae4..54be171 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -12,8 +12,8 @@
 
 ## 状態遷移の要点
 
-- ファームウェア状態: `Idle`, `Listening`, `Thinking`, `Speaking`, `Disconnected`
-- サーバーから指示できるのは `StateCmd` の `Idle` / `Listening` / `Thinking` / `Speaking`
+- ファームウェア状態: `Idle`, `Listening`, `Thinking`, `Speaking`, `ServerWwd`, `Disconnected`
+- サーバーから指示できるのは `StateCmd` の `Idle` / `Listening` / `Thinking` / `Speaking` / `ServerWwd`
 - `Disconnected` はファームウェア内部状態で、WebSocket 切断時に入る
 - `WakeWordEvt` を受けるか、REST API の wakeword 擬似発火で talk session が始まる
 
@@ -75,6 +75,7 @@
 - `websocket.client.host` を StackChan の識別子として使う
 - 同一 IP の再接続時は既存接続を置き換える
 - `listen()` は `Listening` 指示後、音声 uplink 完了を待つ
+- サーバーサイド wakeword 検出中は `ServerWwd` を指示する
 - `speak()` は TTS downlink 送信後、`SpeakDoneEvt` を待つ
 - `move_servo()` / `wait_servo_complete()` を公開
 
@@ -106,7 +107,7 @@
   - `MoveX`, `MoveY`, `Sleep` を順次処理
   - 完了時に `ServoDoneEvt`
 - `src/display.cpp`
-  - `Idle=濃いグレー`, `Listening=青`, `Thinking=オレンジ`, `Speaking=緑`, `Disconnected=赤`
+  - `Idle=濃いグレー`, `Listening=青`, `Thinking=オレンジ`, `Speaking=緑`, `ServerWwd=Idle(Server-WWD)`, `Disconnected=赤`
 
 ## サンプルアプリ
 
diff --git a/docs/websocket_protocols_ja.md b/docs/websocket_protocols_ja.md
index e8694a0..fc2945c 100644
--- a/docs/websocket_protocols_ja.md
+++ b/docs/websocket_protocols_ja.md
@@ -75,7 +75,7 @@
 
 ### 現行実装メモ
 
-- `StateCmd(Listening, WAKE_WORD)` を受けた CoreS3 は、見た目の状態を `Idle(Server-WWD)` のままにしてこの kind で uplink します。
+- `StateCmd(ServerWwd)` を受けた CoreS3 は、この kind で uplink を開始します。
 - 無音 3 秒によるクライアント側自動終了は行いません。
 - サーバーはこの kind だけを server-side wakeword detector にルーティングします。
 
@@ -106,7 +106,7 @@
 
 - 方向: Server → CoreS3
 - `messageType`: `DATA` のみ
-- body: `StateCommand { state, listening_purpose }`
+- body: `StateCommand { state }`
 
 利用する状態名:
 
@@ -114,21 +114,17 @@
 - `Listening`
 - `Thinking`
 - `Speaking`
-
-`listening_purpose` の値:
-
-- `SPEECH`: 通常の会話入力
-- `WAKE_WORD`: サーバーサイド wakeword 検出用の uplink
+- `ServerWwd`
 
 ### 現行実装メモ
 
-- `proxy.listen()` 開始時に Server が `StateCmd(Listening, SPEECH)` を指示します。
-- サーバーサイド wakeword 検出開始時は `StateCmd(Listening, WAKE_WORD)` を指示します。
+- `proxy.listen()` 開始時に Server が `StateCmd(Listening)` を指示します。
+- サーバーサイド wakeword 検出開始時は `StateCmd(ServerWwd)` を指示します。
 - 音声 uplink の `END` を受けると、Server は `Thinking` を指示します。
 - `proxy.speak()` 完了後、Server は `Idle` を指示します。
 
 > [!NOTE]
-> `WAKE_WORD` の場合、CoreS3 は内部的にマイク uplink を開始しますが、状態表示は `Listening` に遷移せず `Idle(Server-WWD)` のままです。また無音 3 秒による自動終了も行いません。
+> `ServerWwd` の場合、CoreS3 は内部的にマイク uplink を開始しますが、表示は `Idle(Server-WWD)` にし、無音 3 秒による自動終了も行いません。
 
 ## ウェイクワード検出 `WakeWordEvt`
 
@@ -154,7 +150,7 @@ CoreS3 側は `has_server_wake_word=true` を受けると、デバイス側 wake
 ## サーバーサイド wakeword 検出フロー
 
 - 環境変数 `STACKCHAN_USE_WWD_WHISPER_SERVER=1` の場合、サーバーは `@app.setup()` 完了後と `Idle` 復帰後に自動でサーバーサイド wakeword 検出を開始します。
-- サーバーは `StateCmd(Listening, WAKE_WORD)` を送信して `MESSAGE_KIND_SERVER_WWD_PCM` のマイク uplink を受信します。
+- サーバーは `StateCmd(ServerWwd)` を送信して `MESSAGE_KIND_SERVER_WWD_PCM` のマイク uplink を受信します。
 - 受信した音声の直近 3 秒窓を 0.5 秒ごとに音声認識へ渡し、
   定義キーワード（例: `スタクチャン`）を含むか判定します。
 - 各判定タイミングの認識結果はすべてログ出力されます。
@@ -174,6 +170,7 @@ CoreS3 側は `has_server_wake_word=true` を受けると、デバイス側 wake
 - `Listening`
 - `Thinking`
 - `Speaking`
+- `ServerWwd`
 
 - CoreS3 は状態遷移の entry hook で送信します。
 - WebSocket 切断中は `Disconnected` 状態になりますが、切断時は uplink 送信できないため `StateEvt` では通知されません。
diff --git a/firmware/include/display.hpp b/firmware/include/display.hpp
index 2012127..5cfc72d 100644
--- a/firmware/include/display.hpp
+++ b/firmware/include/display.hpp
@@ -26,10 +26,8 @@ class Display
   void drawFace();
   bool isAtomS3R() const;
   int32_t statusBarHeight() const;
-  bool shouldShowServerWakeWordIdle() const;
 
   StateMachine &state_;
   bool has_prev_state_ = false;
   StateMachine::State prev_state_ = StateMachine::Idle;
-  bool prev_server_wake_word_idle_ = false;
 };
diff --git a/firmware/include/state_machine.hpp b/firmware/include/state_machine.hpp
index d5bcd62..a3f4b97 100644
--- a/firmware/include/state_machine.hpp
+++ b/firmware/include/state_machine.hpp
@@ -14,7 +14,8 @@ class StateMachine
     Listening = 1,
     Thinking = 2,
     Speaking = 3,
-    Disconnected = 4,
+    ServerWwd = 4,
+    Disconnected = 5,
   };
 
   StateMachine() = default;
@@ -25,6 +26,7 @@ class StateMachine
   bool isListening() const;
   bool isThinking() const;
   bool isSpeaking() const;
+  bool isServerWwd() const;
   bool isDisconnected() const;
 
   using Callback = std::function<void(State prev, State next)>;
@@ -33,8 +35,8 @@ class StateMachine
 
 private:
   State state_ = Disconnected;
-  std::array<std::vector<Callback>, 5> entry_events_{};
-  std::array<std::vector<Callback>, 5> exit_events_{};
+  std::array<std::vector<Callback>, 6> entry_events_{};
+  std::array<std::vector<Callback>, 6> exit_events_{};
 };
 
 const char *stateToString(StateMachine::State state);
diff --git a/firmware/lib/generated_protobuf/websocket-message.pb.c b/firmware/lib/generated_protobuf/websocket-message.pb.c
index 7620ee8..f70a79e 100644
--- a/firmware/lib/generated_protobuf/websocket-message.pb.c
+++ b/firmware/lib/generated_protobuf/websocket-message.pb.c
@@ -64,5 +64,3 @@ PB_BIND(stackchan_websocket_v1_ServerMetadata, stackchan_websocket_v1_ServerMeta
 
 
 
-
-
diff --git a/firmware/lib/generated_protobuf/websocket-message.pb.h b/firmware/lib/generated_protobuf/websocket-message.pb.h
index 28e1f54..cc98ef3 100644
--- a/firmware/lib/generated_protobuf/websocket-message.pb.h
+++ b/firmware/lib/generated_protobuf/websocket-message.pb.h
@@ -36,15 +36,10 @@ typedef enum _stackchan_websocket_v1_StackchanState {
     stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_IDLE = 0,
     stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_LISTENING = 1,
     stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_THINKING = 2,
-    stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING = 3
+    stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING = 3,
+    stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SERVER_WWD = 4
 } stackchan_websocket_v1_StackchanState;
 
-typedef enum _stackchan_websocket_v1_ListeningPurpose {
-    stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_UNSPECIFIED = 0,
-    stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_SPEECH = 1,
-    stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_WAKE_WORD = 2
-} stackchan_websocket_v1_ListeningPurpose;
-
 typedef enum _stackchan_websocket_v1_ServoOperation {
     stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_SLEEP = 0,
     stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_X = 1,
@@ -90,7 +85,6 @@ typedef struct _stackchan_websocket_v1_AudioChunk {
 
 typedef struct _stackchan_websocket_v1_StateCommand {
     stackchan_websocket_v1_StackchanState state;
-    stackchan_websocket_v1_ListeningPurpose listening_purpose;
 } stackchan_websocket_v1_StateCommand;
 
 typedef struct _stackchan_websocket_v1_WakeWordEvent {
@@ -181,12 +175,8 @@ extern "C" {
 #define _stackchan_websocket_v1_MessageType_ARRAYSIZE ((stackchan_websocket_v1_MessageType)(stackchan_websocket_v1_MessageType_MESSAGE_TYPE_END+1))
 
 #define _stackchan_websocket_v1_StackchanState_MIN stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_IDLE
-#define _stackchan_websocket_v1_StackchanState_MAX stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING
-#define _stackchan_websocket_v1_StackchanState_ARRAYSIZE ((stackchan_websocket_v1_StackchanState)(stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING+1))
-
-#define _stackchan_websocket_v1_ListeningPurpose_MIN stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_UNSPECIFIED
-#define _stackchan_websocket_v1_ListeningPurpose_MAX stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_WAKE_WORD
-#define _stackchan_websocket_v1_ListeningPurpose_ARRAYSIZE ((stackchan_websocket_v1_ListeningPurpose)(stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_WAKE_WORD+1))
+#define _stackchan_websocket_v1_StackchanState_MAX stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SERVER_WWD
+#define _stackchan_websocket_v1_StackchanState_ARRAYSIZE ((stackchan_websocket_v1_StackchanState)(stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SERVER_WWD+1))
 
 #define _stackchan_websocket_v1_ServoOperation_MIN stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_SLEEP
 #define _stackchan_websocket_v1_ServoOperation_MAX stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_Y
@@ -209,7 +199,6 @@ extern "C" {
 
 
 #define stackchan_websocket_v1_StateCommand_state_ENUMTYPE stackchan_websocket_v1_StackchanState
-#define stackchan_websocket_v1_StateCommand_listening_purpose_ENUMTYPE stackchan_websocket_v1_ListeningPurpose
 
 
 #define stackchan_websocket_v1_StateEvent_state_ENUMTYPE stackchan_websocket_v1_StackchanState
@@ -231,7 +220,7 @@ extern "C" {
 #define stackchan_websocket_v1_AudioWavStart_init_default {0, 0}
 #define stackchan_websocket_v1_AudioWavEnd_init_default {0}
 #define stackchan_websocket_v1_AudioChunk_init_default {{0, {0}}}
-#define stackchan_websocket_v1_StateCommand_init_default {_stackchan_websocket_v1_StackchanState_MIN, _stackchan_websocket_v1_ListeningPurpose_MIN}
+#define stackchan_websocket_v1_StateCommand_init_default {_stackchan_websocket_v1_StackchanState_MIN}
 #define stackchan_websocket_v1_WakeWordEvent_init_default {0}
 #define stackchan_websocket_v1_StateEvent_init_default {_stackchan_websocket_v1_StackchanState_MIN}
 #define stackchan_websocket_v1_SpeakDoneEvent_init_default {0}
@@ -246,7 +235,7 @@ extern "C" {
 #define stackchan_websocket_v1_AudioWavStart_init_zero {0, 0}
 #define stackchan_websocket_v1_AudioWavEnd_init_zero {0}
 #define stackchan_websocket_v1_AudioChunk_init_zero {{0, {0}}}
-#define stackchan_websocket_v1_StateCommand_init_zero {_stackchan_websocket_v1_StackchanState_MIN, _stackchan_websocket_v1_ListeningPurpose_MIN}
+#define stackchan_websocket_v1_StateCommand_init_zero {_stackchan_websocket_v1_StackchanState_MIN}
 #define stackchan_websocket_v1_WakeWordEvent_init_zero {0}
 #define stackchan_websocket_v1_StateEvent_init_zero {_stackchan_websocket_v1_StackchanState_MIN}
 #define stackchan_websocket_v1_SpeakDoneEvent_init_zero {0}
@@ -261,7 +250,6 @@ extern "C" {
 #define stackchan_websocket_v1_AudioWavStart_channels_tag 2
 #define stackchan_websocket_v1_AudioChunk_pcm_bytes_tag 1
 #define stackchan_websocket_v1_StateCommand_state_tag 1
-#define stackchan_websocket_v1_StateCommand_listening_purpose_tag 2
 #define stackchan_websocket_v1_WakeWordEvent_detected_tag 1
 #define stackchan_websocket_v1_StateEvent_state_tag 1
 #define stackchan_websocket_v1_SpeakDoneEvent_done_tag 1
@@ -361,8 +349,7 @@ X(a, STATIC,   SINGULAR, BYTES,    pcm_bytes,         1)
 #define stackchan_websocket_v1_AudioChunk_DEFAULT NULL
 
 #define stackchan_websocket_v1_StateCommand_FIELDLIST(X, a) \
-X(a, STATIC,   SINGULAR, UENUM,    state,             1) \
-X(a, STATIC,   SINGULAR, UENUM,    listening_purpose,   2)
+X(a, STATIC,   SINGULAR, UENUM,    state,             1)
 #define stackchan_websocket_v1_StateCommand_CALLBACK NULL
 #define stackchan_websocket_v1_StateCommand_DEFAULT NULL
 
@@ -463,7 +450,7 @@ extern const pb_msgdesc_t stackchan_websocket_v1_ServerMetadata_msg;
 #define stackchan_websocket_v1_ServoCommand_size 14
 #define stackchan_websocket_v1_ServoDoneEvent_size 2
 #define stackchan_websocket_v1_SpeakDoneEvent_size 2
-#define stackchan_websocket_v1_StateCommand_size 4
+#define stackchan_websocket_v1_StateCommand_size 2
 #define stackchan_websocket_v1_StateEvent_size   2
 #define stackchan_websocket_v1_WakeWordEvent_size 2
 #define stackchan_websocket_v1_WebSocketMessage_size 4113
diff --git a/firmware/src/display.cpp b/firmware/src/display.cpp
index aeb9676..00ba761 100644
--- a/firmware/src/display.cpp
+++ b/firmware/src/display.cpp
@@ -71,14 +71,12 @@ void Display::init()
   drawFace();
   has_prev_state_ = true;
   prev_state_ = state_.getState();
-  prev_server_wake_word_idle_ = shouldShowServerWakeWordIdle();
 }
 
 void Display::loop()
 {
   StateMachine::State current = state_.getState();
-  bool current_server_wake_word_idle = shouldShowServerWakeWordIdle();
-  if (!has_prev_state_ || current != prev_state_ || current_server_wake_word_idle != prev_server_wake_word_idle_)
+  if (!has_prev_state_ || current != prev_state_)
   {
     GFXModule.fillScreen(TFT_BLACK);
     drawForState(current);
@@ -86,7 +84,6 @@ void Display::loop()
   }
 
   prev_state_ = current;
-  prev_server_wake_word_idle_ = current_server_wake_word_idle;
   has_prev_state_ = true;
 }
 
@@ -123,6 +120,11 @@ void Display::drawForState(StateMachine::State state)
     font_color = TFT_BLACK;
     led_color = Adafruit_NeoPixel::ColorHSV(kLedHueGreen, 255, ledValueFromBrightness());
     break;
+  case StateMachine::ServerWwd:
+    bg_color = TFT_DARKGRAY;
+    font_color = TFT_WHITE;
+    led_color = Adafruit_NeoPixel::ColorHSV(0, 0, 0);
+    break;
   case StateMachine::Disconnected:
     bg_color = TFT_RED;
     font_color = TFT_WHITE;
@@ -141,7 +143,7 @@ void Display::drawForState(StateMachine::State state)
   GFXModule.setTextSize(1);
   GFXModule.setTextColor(font_color, bg_color);
   GFXModule.setCursor(isAtomS3R() ? 4 : 10, bar_y + (isAtomS3R() ? 6 : 2));
-  if (state == StateMachine::Idle && shouldUseServerWakeWord())
+  if (state == StateMachine::ServerWwd)
   {
     GFXModule.printf("Idle(Server-WWD)");
     return;
@@ -177,11 +179,6 @@ bool Display::isAtomS3R() const
 #endif
 }
 
-bool Display::shouldShowServerWakeWordIdle() const
-{
-  return state_.getState() == StateMachine::Idle && shouldUseServerWakeWord();
-}
-
 int32_t Display::statusBarHeight() const
 {
   return isAtomS3R() ? 28 : 20;
diff --git a/firmware/src/main.cpp b/firmware/src/main.cpp
index 67256ad..5263e24 100644
--- a/firmware/src/main.cpp
+++ b/firmware/src/main.cpp
@@ -238,44 +238,38 @@ bool applyRemoteStateCommand(const stackchan_websocket_v1_StateCommand &command)
   switch (command.state)
   {
   case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_IDLE:
-    if (listening.isWakeWordStreaming())
-    {
-      listening.endWakeWordStreaming();
-    }
     stateMachine.setState(StateMachine::Idle);
     return true;
   case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_LISTENING:
-    if (command.listening_purpose == stackchan_websocket_v1_ListeningPurpose_LISTENING_PURPOSE_WAKE_WORD &&
-        shouldUseServerWakeWord() &&
-        stateMachine.getState() == StateMachine::Idle)
-    {
-      if (!listening.beginWakeWordStreaming())
-      {
-        log_w("Failed to start server-side wakeword streaming");
-        return false;
-      }
-      return true;
-    }
-
-    if (listening.isWakeWordStreaming())
-    {
-      listening.endWakeWordStreaming();
-    }
     stateMachine.setState(StateMachine::Listening);
     return true;
   case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_THINKING:
-    if (listening.isWakeWordStreaming())
-    {
-      listening.endWakeWordStreaming();
-    }
     stateMachine.setState(StateMachine::Thinking);
     return true;
   case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING:
-    if (listening.isWakeWordStreaming())
+    stateMachine.setState(StateMachine::Speaking);
+    return true;
+  case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SERVER_WWD:
+    if (!shouldUseServerWakeWord())
     {
-      listening.endWakeWordStreaming();
+      log_w("Server-side wakeword is not available");
+      return false;
     }
-    stateMachine.setState(StateMachine::Speaking);
+    if (stateMachine.getState() == StateMachine::ServerWwd)
+    {
+      return true;
+    }
+    if (stateMachine.getState() != StateMachine::Idle)
+    {
+      log_w("Cannot enter server-side wakeword from state=%u", static_cast<unsigned>(stateMachine.getState()));
+      return false;
+    }
+    if (!listening.beginWakeWordStreaming())
+    {
+      log_w("Failed to start server-side wakeword streaming");
+      return false;
+    }
+    stateMachine.setState(StateMachine::ServerWwd);
     return true;
   default:
     log_w("Unknown remote state");
@@ -546,6 +540,13 @@ void setup()
     listening.end();
   });
 
+  stateMachine.addStateEntryEvent(StateMachine::ServerWwd, [](StateMachine::State, StateMachine::State) {
+    notifyCurrentState(StateMachine::ServerWwd);
+  });
+  stateMachine.addStateExitEvent(StateMachine::ServerWwd, [](StateMachine::State, StateMachine::State) {
+    listening.endWakeWordStreaming();
+  });
+
   stateMachine.addStateEntryEvent(StateMachine::Speaking, [](StateMachine::State, StateMachine::State) {
     notifyCurrentState(StateMachine::Speaking);
     speaking.begin();
@@ -587,6 +588,9 @@ void loop()
   case StateMachine::Listening:
     listening.loop();
     break;
+  case StateMachine::ServerWwd:
+    listening.loop();
+    break;
   case StateMachine::Thinking:
     // Wait for server side command / audio stream.
     break;
diff --git a/firmware/src/state_machine.cpp b/firmware/src/state_machine.cpp
index 2432cd2..196aaad 100644
--- a/firmware/src/state_machine.cpp
+++ b/firmware/src/state_machine.cpp
@@ -13,6 +13,8 @@ const char *stateToString(StateMachine::State s)
 		return "Thinking";
 	case StateMachine::Speaking:
 		return "Speaking";
+	case StateMachine::ServerWwd:
+		return "ServerWwd";
 	case StateMachine::Disconnected:
 		return "Disconnected";
 	default:
@@ -66,6 +68,11 @@ bool StateMachine::isThinking() const
 	return state_ == Thinking;
 }
 
+bool StateMachine::isServerWwd() const
+{
+	return state_ == ServerWwd;
+}
+
 bool StateMachine::isDisconnected() const
 {
 	return state_ == Disconnected;
diff --git a/protobuf/websocket-message.proto b/protobuf/websocket-message.proto
index 10932ac..4d288ef 100644
--- a/protobuf/websocket-message.proto
+++ b/protobuf/websocket-message.proto
@@ -61,12 +61,7 @@ enum StackchanState {
   STACKCHAN_STATE_LISTENING = 1;
   STACKCHAN_STATE_THINKING = 2;
   STACKCHAN_STATE_SPEAKING = 3;
-}
-
-enum ListeningPurpose {
-  LISTENING_PURPOSE_UNSPECIFIED = 0;
-  LISTENING_PURPOSE_SPEECH = 1;
-  LISTENING_PURPOSE_WAKE_WORD = 2;
+  STACKCHAN_STATE_SERVER_WWD = 4;
 }
 
 enum ServoOperation {
@@ -106,7 +101,6 @@ message AudioChunk {
 
 message StateCommand {
   StackchanState state = 1;
-  ListeningPurpose listening_purpose = 2;
 }
 
 message WakeWordEvent {
diff --git a/stackchan_server/generated_protobuf/websocket_message_pb2.py b/stackchan_server/generated_protobuf/websocket_message_pb2.py
index 15b3d61..1237224 100644
--- a/stackchan_server/generated_protobuf/websocket_message_pb2.py
+++ b/stackchan_server/generated_protobuf/websocket_message_pb2.py
@@ -24,27 +24,25 @@
 
 
 
-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x17websocket-message.proto\x12\x16stackchan.websocket.v1\"\x96\x08\n\x10WebSocketMessage\x12\x31\n\x04kind\x18\x01 \x01(\x0e\x32#.stackchan.websocket.v1.MessageKind\x12\x39\n\x0cmessage_type\x18\x02 \x01(\x0e\x32#.stackchan.websocket.v1.MessageType\x12\x0b\n\x03seq\x18\x03 \x01(\r\x12@\n\x0f\x61udio_pcm_start\x18\n \x01(\x0b\x32%.stackchan.websocket.v1.AudioPcmStartH\x00\x12<\n\x0e\x61udio_pcm_data\x18\x0b \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_pcm_end\x18\x0c \x01(\x0b\x32#.stackchan.websocket.v1.AudioPcmEndH\x00\x12@\n\x0f\x61udio_wav_start\x18\x14 \x01(\x0b\x32%.stackchan.websocket.v1.AudioWavStartH\x00\x12<\n\x0e\x61udio_wav_data\x18\x15 \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_wav_end\x18\x16 \x01(\x0b\x32#.stackchan.websocket.v1.AudioWavEndH\x00\x12\x39\n\tstate_cmd\x18\x1e \x01(\x0b\x32$.stackchan.websocket.v1.StateCommandH\x00\x12>\n\rwake_word_evt\x18\x1f \x01(\x0b\x32%.stackchan.websocket.v1.WakeWordEventH\x00\x12\x37\n\tstate_evt\x18  \x01(\x0b\x32\".stackchan.websocket.v1.StateEventH\x00\x12@\n\x0espeak_done_evt\x18! 
\x01(\x0b\x32&.stackchan.websocket.v1.SpeakDoneEventH\x00\x12\x41\n\tservo_cmd\x18\" \x01(\x0b\x32,.stackchan.websocket.v1.ServoCommandSequenceH\x00\x12@\n\x0eservo_done_evt\x18# \x01(\x0b\x32&.stackchan.websocket.v1.ServoDoneEventH\x00\x12\x45\n\x11\x66irmware_metadata\x18$ \x01(\x0b\x32(.stackchan.websocket.v1.FirmwareMetadataH\x00\x12\x41\n\x0fserver_metadata\x18% \x01(\x0b\x32&.stackchan.websocket.v1.ServerMetadataH\x00\x42\x06\n\x04\x62ody\"\x0f\n\rAudioPcmStart\"\r\n\x0b\x41udioPcmEnd\"6\n\rAudioWavStart\x12\x13\n\x0bsample_rate\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\"\r\n\x0b\x41udioWavEnd\"\x1f\n\nAudioChunk\x12\x11\n\tpcm_bytes\x18\x01 \x01(\x0c\"\x8a\x01\n\x0cStateCommand\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\x12\x43\n\x11listening_purpose\x18\x02 \x01(\x0e\x32(.stackchan.websocket.v1.ListeningPurpose\"!\n\rWakeWordEvent\x12\x10\n\x08\x64\x65tected\x18\x01 \x01(\x08\"C\n\nStateEvent\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\"\x1e\n\x0eSpeakDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"N\n\x14ServoCommandSequence\x12\x36\n\x08\x63ommands\x18\x01 \x03(\x0b\x32$.stackchan.websocket.v1.ServoCommand\"f\n\x0cServoCommand\x12\x32\n\x02op\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.ServoOperation\x12\r\n\x05\x61ngle\x18\x02 \x01(\x11\x12\x13\n\x0b\x64uration_ms\x18\x03 \x01(\x11\"\x1e\n\x0eServoDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"\x99\x02\n\x10\x46irmwareMetadata\x12\x37\n\x0b\x64\x65vice_type\x18\x01 \x01(\x0e\x32\".stackchan.websocket.v1.DeviceType\x12\x15\n\rdisplay_width\x18\x02 \x01(\r\x12\x16\n\x0e\x64isplay_height\x18\x03 \x01(\r\x12\x1c\n\x14has_device_wake_word\x18\x04 \x01(\x08\x12\x0f\n\x07has_led\x18\x05 \x01(\x08\x12\x35\n\nservo_type\x18\x06 \x01(\x0e\x32!.stackchan.websocket.v1.ServoType\x12\x1d\n\x15supports_audio_duplex\x18\x07 \x01(\x08\x12\x18\n\x10\x66irmware_version\x18\x08 
\x01(\t\"F\n\x0eServerMetadata\x12\x1c\n\x14has_server_wake_word\x18\x01 \x01(\x08\x12\x16\n\x0eserver_version\x18\x02 \x01(\t*\x80\x03\n\x0bMessageKind\x12\x1c\n\x18MESSAGE_KIND_UNSPECIFIED\x10\x00\x12\x1a\n\x16MESSAGE_KIND_AUDIO_PCM\x10\x01\x12\x1a\n\x16MESSAGE_KIND_AUDIO_WAV\x10\x02\x12\x1a\n\x16MESSAGE_KIND_STATE_CMD\x10\x03\x12\x1e\n\x1aMESSAGE_KIND_WAKE_WORD_EVT\x10\x04\x12\x1a\n\x16MESSAGE_KIND_STATE_EVT\x10\x05\x12\x1f\n\x1bMESSAGE_KIND_SPEAK_DONE_EVT\x10\x06\x12\x1a\n\x16MESSAGE_KIND_SERVO_CMD\x10\x07\x12\x1f\n\x1bMESSAGE_KIND_SERVO_DONE_EVT\x10\x08\x12\"\n\x1eMESSAGE_KIND_FIRMWARE_METADATA\x10\t\x12 \n\x1cMESSAGE_KIND_SERVER_METADATA\x10\n\x12\x1f\n\x1bMESSAGE_KIND_SERVER_WWD_PCM\x10\x0b*p\n\x0bMessageType\x12\x1c\n\x18MESSAGE_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12MESSAGE_TYPE_START\x10\x01\x12\x15\n\x11MESSAGE_TYPE_DATA\x10\x02\x12\x14\n\x10MESSAGE_TYPE_END\x10\x03*\x85\x01\n\x0eStackchanState\x12\x18\n\x14STACKCHAN_STATE_IDLE\x10\x00\x12\x1d\n\x19STACKCHAN_STATE_LISTENING\x10\x01\x12\x1c\n\x18STACKCHAN_STATE_THINKING\x10\x02\x12\x1c\n\x18STACKCHAN_STATE_SPEAKING\x10\x03*t\n\x10ListeningPurpose\x12!\n\x1dLISTENING_PURPOSE_UNSPECIFIED\x10\x00\x12\x1c\n\x18LISTENING_PURPOSE_SPEECH\x10\x01\x12\x1f\n\x1bLISTENING_PURPOSE_WAKE_WORD\x10\x02*c\n\x0eServoOperation\x12\x19\n\x15SERVO_OPERATION_SLEEP\x10\x00\x12\x1a\n\x16SERVO_OPERATION_MOVE_X\x10\x01\x12\x1a\n\x16SERVO_OPERATION_MOVE_Y\x10\x02*\x85\x01\n\nDeviceType\x12\x1b\n\x17\x44\x45VICE_TYPE_UNSPECIFIED\x10\x00\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5STACK_CORES3\x10\x01\x12\x1a\n\x16\x44\x45VICE_TYPE_M5ATOM_S3R\x10\x02\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5ATOM_ECHOS3R\x10\x03*i\n\tServoType\x12\x1a\n\x16SERVO_TYPE_UNSPECIFIED\x10\x00\x12\x13\n\x0fSERVO_TYPE_NONE\x10\x01\x12\x13\n\x0fSERVO_TYPE_SG90\x10\x02\x12\x16\n\x12SERVO_TYPE_SCS0009\x10\x03\x62\x06proto3')
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x17websocket-message.proto\x12\x16stackchan.websocket.v1\"\x96\x08\n\x10WebSocketMessage\x12\x31\n\x04kind\x18\x01 \x01(\x0e\x32#.stackchan.websocket.v1.MessageKind\x12\x39\n\x0cmessage_type\x18\x02 \x01(\x0e\x32#.stackchan.websocket.v1.MessageType\x12\x0b\n\x03seq\x18\x03 \x01(\r\x12@\n\x0f\x61udio_pcm_start\x18\n \x01(\x0b\x32%.stackchan.websocket.v1.AudioPcmStartH\x00\x12<\n\x0e\x61udio_pcm_data\x18\x0b \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_pcm_end\x18\x0c \x01(\x0b\x32#.stackchan.websocket.v1.AudioPcmEndH\x00\x12@\n\x0f\x61udio_wav_start\x18\x14 \x01(\x0b\x32%.stackchan.websocket.v1.AudioWavStartH\x00\x12<\n\x0e\x61udio_wav_data\x18\x15 \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_wav_end\x18\x16 \x01(\x0b\x32#.stackchan.websocket.v1.AudioWavEndH\x00\x12\x39\n\tstate_cmd\x18\x1e \x01(\x0b\x32$.stackchan.websocket.v1.StateCommandH\x00\x12>\n\rwake_word_evt\x18\x1f \x01(\x0b\x32%.stackchan.websocket.v1.WakeWordEventH\x00\x12\x37\n\tstate_evt\x18  \x01(\x0b\x32\".stackchan.websocket.v1.StateEventH\x00\x12@\n\x0espeak_done_evt\x18! 
\x01(\x0b\x32&.stackchan.websocket.v1.SpeakDoneEventH\x00\x12\x41\n\tservo_cmd\x18\" \x01(\x0b\x32,.stackchan.websocket.v1.ServoCommandSequenceH\x00\x12@\n\x0eservo_done_evt\x18# \x01(\x0b\x32&.stackchan.websocket.v1.ServoDoneEventH\x00\x12\x45\n\x11\x66irmware_metadata\x18$ \x01(\x0b\x32(.stackchan.websocket.v1.FirmwareMetadataH\x00\x12\x41\n\x0fserver_metadata\x18% \x01(\x0b\x32&.stackchan.websocket.v1.ServerMetadataH\x00\x42\x06\n\x04\x62ody\"\x0f\n\rAudioPcmStart\"\r\n\x0b\x41udioPcmEnd\"6\n\rAudioWavStart\x12\x13\n\x0bsample_rate\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\"\r\n\x0b\x41udioWavEnd\"\x1f\n\nAudioChunk\x12\x11\n\tpcm_bytes\x18\x01 \x01(\x0c\"E\n\x0cStateCommand\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\"!\n\rWakeWordEvent\x12\x10\n\x08\x64\x65tected\x18\x01 \x01(\x08\"C\n\nStateEvent\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\"\x1e\n\x0eSpeakDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"N\n\x14ServoCommandSequence\x12\x36\n\x08\x63ommands\x18\x01 \x03(\x0b\x32$.stackchan.websocket.v1.ServoCommand\"f\n\x0cServoCommand\x12\x32\n\x02op\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.ServoOperation\x12\r\n\x05\x61ngle\x18\x02 \x01(\x11\x12\x13\n\x0b\x64uration_ms\x18\x03 \x01(\x11\"\x1e\n\x0eServoDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"\x99\x02\n\x10\x46irmwareMetadata\x12\x37\n\x0b\x64\x65vice_type\x18\x01 \x01(\x0e\x32\".stackchan.websocket.v1.DeviceType\x12\x15\n\rdisplay_width\x18\x02 \x01(\r\x12\x16\n\x0e\x64isplay_height\x18\x03 \x01(\r\x12\x1c\n\x14has_device_wake_word\x18\x04 \x01(\x08\x12\x0f\n\x07has_led\x18\x05 \x01(\x08\x12\x35\n\nservo_type\x18\x06 \x01(\x0e\x32!.stackchan.websocket.v1.ServoType\x12\x1d\n\x15supports_audio_duplex\x18\x07 \x01(\x08\x12\x18\n\x10\x66irmware_version\x18\x08 \x01(\t\"F\n\x0eServerMetadata\x12\x1c\n\x14has_server_wake_word\x18\x01 \x01(\x08\x12\x16\n\x0eserver_version\x18\x02 
\x01(\t*\x80\x03\n\x0bMessageKind\x12\x1c\n\x18MESSAGE_KIND_UNSPECIFIED\x10\x00\x12\x1a\n\x16MESSAGE_KIND_AUDIO_PCM\x10\x01\x12\x1a\n\x16MESSAGE_KIND_AUDIO_WAV\x10\x02\x12\x1a\n\x16MESSAGE_KIND_STATE_CMD\x10\x03\x12\x1e\n\x1aMESSAGE_KIND_WAKE_WORD_EVT\x10\x04\x12\x1a\n\x16MESSAGE_KIND_STATE_EVT\x10\x05\x12\x1f\n\x1bMESSAGE_KIND_SPEAK_DONE_EVT\x10\x06\x12\x1a\n\x16MESSAGE_KIND_SERVO_CMD\x10\x07\x12\x1f\n\x1bMESSAGE_KIND_SERVO_DONE_EVT\x10\x08\x12\"\n\x1eMESSAGE_KIND_FIRMWARE_METADATA\x10\t\x12 \n\x1cMESSAGE_KIND_SERVER_METADATA\x10\n\x12\x1f\n\x1bMESSAGE_KIND_SERVER_WWD_PCM\x10\x0b*p\n\x0bMessageType\x12\x1c\n\x18MESSAGE_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12MESSAGE_TYPE_START\x10\x01\x12\x15\n\x11MESSAGE_TYPE_DATA\x10\x02\x12\x14\n\x10MESSAGE_TYPE_END\x10\x03*\xa5\x01\n\x0eStackchanState\x12\x18\n\x14STACKCHAN_STATE_IDLE\x10\x00\x12\x1d\n\x19STACKCHAN_STATE_LISTENING\x10\x01\x12\x1c\n\x18STACKCHAN_STATE_THINKING\x10\x02\x12\x1c\n\x18STACKCHAN_STATE_SPEAKING\x10\x03\x12\x1e\n\x1aSTACKCHAN_STATE_SERVER_WWD\x10\x04*c\n\x0eServoOperation\x12\x19\n\x15SERVO_OPERATION_SLEEP\x10\x00\x12\x1a\n\x16SERVO_OPERATION_MOVE_X\x10\x01\x12\x1a\n\x16SERVO_OPERATION_MOVE_Y\x10\x02*\x85\x01\n\nDeviceType\x12\x1b\n\x17\x44\x45VICE_TYPE_UNSPECIFIED\x10\x00\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5STACK_CORES3\x10\x01\x12\x1a\n\x16\x44\x45VICE_TYPE_M5ATOM_S3R\x10\x02\x12\x1e\n\x1a\x44\x45VICE_TYPE_M5ATOM_ECHOS3R\x10\x03*i\n\tServoType\x12\x1a\n\x16SERVO_TYPE_UNSPECIFIED\x10\x00\x12\x13\n\x0fSERVO_TYPE_NONE\x10\x01\x12\x13\n\x0fSERVO_TYPE_SG90\x10\x02\x12\x16\n\x12SERVO_TYPE_SCS0009\x10\x03\x62\x06proto3')
 
 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
 _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'websocket_message_pb2', _globals)
 if not _descriptor._USE_C_DESCRIPTORS:
   DESCRIPTOR._loaded_options = None
-  _globals['_MESSAGEKIND']._serialized_start=2086
-  _globals['_MESSAGEKIND']._serialized_end=2470
-  _globals['_MESSAGETYPE']._serialized_start=2472
-  _globals['_MESSAGETYPE']._serialized_end=2584
-  _globals['_STACKCHANSTATE']._serialized_start=2587
-  _globals['_STACKCHANSTATE']._serialized_end=2720
-  _globals['_LISTENINGPURPOSE']._serialized_start=2722
-  _globals['_LISTENINGPURPOSE']._serialized_end=2838
-  _globals['_SERVOOPERATION']._serialized_start=2840
-  _globals['_SERVOOPERATION']._serialized_end=2939
-  _globals['_DEVICETYPE']._serialized_start=2942
-  _globals['_DEVICETYPE']._serialized_end=3075
-  _globals['_SERVOTYPE']._serialized_start=3077
-  _globals['_SERVOTYPE']._serialized_end=3182
+  _globals['_MESSAGEKIND']._serialized_start=2016
+  _globals['_MESSAGEKIND']._serialized_end=2400
+  _globals['_MESSAGETYPE']._serialized_start=2402
+  _globals['_MESSAGETYPE']._serialized_end=2514
+  _globals['_STACKCHANSTATE']._serialized_start=2517
+  _globals['_STACKCHANSTATE']._serialized_end=2682
+  _globals['_SERVOOPERATION']._serialized_start=2684
+  _globals['_SERVOOPERATION']._serialized_end=2783
+  _globals['_DEVICETYPE']._serialized_start=2786
+  _globals['_DEVICETYPE']._serialized_end=2919
+  _globals['_SERVOTYPE']._serialized_start=2921
+  _globals['_SERVOTYPE']._serialized_end=3026
   _globals['_WEBSOCKETMESSAGE']._serialized_start=52
   _globals['_WEBSOCKETMESSAGE']._serialized_end=1098
   _globals['_AUDIOPCMSTART']._serialized_start=1100
@@ -57,22 +55,22 @@
   _globals['_AUDIOWAVEND']._serialized_end=1201
   _globals['_AUDIOCHUNK']._serialized_start=1203
   _globals['_AUDIOCHUNK']._serialized_end=1234
-  _globals['_STATECOMMAND']._serialized_start=1237
-  _globals['_STATECOMMAND']._serialized_end=1375
-  _globals['_WAKEWORDEVENT']._serialized_start=1377
-  _globals['_WAKEWORDEVENT']._serialized_end=1410
-  _globals['_STATEEVENT']._serialized_start=1412
-  _globals['_STATEEVENT']._serialized_end=1479
-  _globals['_SPEAKDONEEVENT']._serialized_start=1481
-  _globals['_SPEAKDONEEVENT']._serialized_end=1511
-  _globals['_SERVOCOMMANDSEQUENCE']._serialized_start=1513
-  _globals['_SERVOCOMMANDSEQUENCE']._serialized_end=1591
-  _globals['_SERVOCOMMAND']._serialized_start=1593
-  _globals['_SERVOCOMMAND']._serialized_end=1695
-  _globals['_SERVODONEEVENT']._serialized_start=1697
-  _globals['_SERVODONEEVENT']._serialized_end=1727
-  _globals['_FIRMWAREMETADATA']._serialized_start=1730
-  _globals['_FIRMWAREMETADATA']._serialized_end=2011
-  _globals['_SERVERMETADATA']._serialized_start=2013
-  _globals['_SERVERMETADATA']._serialized_end=2083
+  _globals['_STATECOMMAND']._serialized_start=1236
+  _globals['_STATECOMMAND']._serialized_end=1305
+  _globals['_WAKEWORDEVENT']._serialized_start=1307
+  _globals['_WAKEWORDEVENT']._serialized_end=1340
+  _globals['_STATEEVENT']._serialized_start=1342
+  _globals['_STATEEVENT']._serialized_end=1409
+  _globals['_SPEAKDONEEVENT']._serialized_start=1411
+  _globals['_SPEAKDONEEVENT']._serialized_end=1441
+  _globals['_SERVOCOMMANDSEQUENCE']._serialized_start=1443
+  _globals['_SERVOCOMMANDSEQUENCE']._serialized_end=1521
+  _globals['_SERVOCOMMAND']._serialized_start=1523
+  _globals['_SERVOCOMMAND']._serialized_end=1625
+  _globals['_SERVODONEEVENT']._serialized_start=1627
+  _globals['_SERVODONEEVENT']._serialized_end=1657
+  _globals['_FIRMWAREMETADATA']._serialized_start=1660
+  _globals['_FIRMWAREMETADATA']._serialized_end=1941
+  _globals['_SERVERMETADATA']._serialized_start=1943
+  _globals['_SERVERMETADATA']._serialized_end=2013
 # @@protoc_insertion_point(module_scope)
diff --git a/stackchan_server/protobuf_ws.py b/stackchan_server/protobuf_ws.py
index 443900b..b652a2c 100644
--- a/stackchan_server/protobuf_ws.py
+++ b/stackchan_server/protobuf_ws.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from collections.abc import Sequence
-from enum import IntEnum, StrEnum
+from enum import StrEnum
 from typing import Any, Literal, cast
 
 from .generated_protobuf import websocket_message_pb2 as _ws_pb2
@@ -15,12 +15,6 @@
 ServoCommand = ServoMoveCommand | ServoSleepCommand
 
 
-class ListeningPurpose(IntEnum):
-    UNSPECIFIED = 0
-    SPEECH = 1
-    WAKE_WORD = 2
-
-
 def _ensure_range(value: int, *, minimum: int, maximum: int, label: str) -> int:
     if not minimum <= value <= maximum:
         raise ValueError(f"{label} must be between {minimum} and {maximum}: {value}")
@@ -101,8 +95,6 @@ def encode_audio_wav_end_message(seq: int) -> bytes:
 def encode_state_command_message(
     seq: int,
     state_id: int,
-    *,
-    listening_purpose: int = ListeningPurpose.SPEECH,
 ) -> bytes:
     message = _new_message(
         ws_pb2.MESSAGE_KIND_STATE_CMD,
@@ -110,7 +102,6 @@ def encode_state_command_message(
         seq,
     )
     message.state_cmd.state = int(state_id)
-    message.state_cmd.listening_purpose = int(listening_purpose)
     return message.SerializeToString()
 
 
@@ -185,7 +176,6 @@ def encode_servo_command_message(seq: int, commands: Sequence[ServoCommand]) ->
 
 __all__ = [
     "ServoCommand",
-    "ListeningPurpose",
     "encode_audio_pcm_data_message",
     "encode_audio_pcm_end_message",
     "encode_audio_pcm_start_message",
diff --git a/stackchan_server/ws_proxy.py b/stackchan_server/ws_proxy.py
index f4024ce..7a5e06b 100644
--- a/stackchan_server/ws_proxy.py
+++ b/stackchan_server/ws_proxy.py
@@ -17,7 +17,6 @@
 from .generated_protobuf import websocket_message_pb2 as _ws_pb2
 from .listen import EmptyTranscriptError, ListenHandler, TimeoutError
 from .protobuf_ws import (
-    ListeningPurpose,
     encode_server_metadata_message,
     encode_servo_command_message,
     encode_state_command_message,
@@ -57,6 +56,7 @@ class FirmwareState(IntEnum):
     LISTENING = 1
     THINKING = 2
     SPEAKING = 3
+    SERVER_WWD = 4
 
 
 class ServoMoveType(StrEnum):
@@ -199,13 +199,8 @@ async def speak(self, text: str) -> None:
     async def send_state_command(
         self,
         state_id: int | FirmwareState,
-        *,
-        listening_purpose: ListeningPurpose = ListeningPurpose.SPEECH,
     ) -> None:
-        await self._send_state_command(
-            state_id,
-            listening_purpose=listening_purpose,
-        )
+        await self._send_state_command(state_id)
 
     async def reset_state(self) -> None:
         await self.send_state_command(FirmwareState.IDLE)
@@ -585,14 +580,11 @@ def _handle_servo_done_event(self, message: Any) -> None:
     async def _send_state_command(
         self,
         state_id: int | FirmwareState,
-        *,
-        listening_purpose: ListeningPurpose = ListeningPurpose.SPEECH,
     ) -> None:
         await self._send_ws_bytes(
             encode_state_command_message(
                 self._next_down_seq(),
                 int(state_id),
-                listening_purpose=int(listening_purpose),
             )
         )
 
@@ -624,10 +616,7 @@ async def _run_server_wakeword_detection(self) -> bool:
         should_restart = False
         try:
             await detector.start()
-            await self.send_state_command(
-                FirmwareState.LISTENING,
-                listening_purpose=ListeningPurpose.WAKE_WORD,
-            )
+            await self.send_state_command(FirmwareState.SERVER_WWD)
             detected = await detector.wait_result()
             if detected:
                 self._wakeword_event.set()

From 97ba831d81205561084dd2b40a398f2a7e55664e Mon Sep 17 00:00:00 2001
From: Atsushi Morimoto <74th.tech@gmail.com>
Date: Sat, 9 May 2026 16:51:46 +0900
Subject: [PATCH 08/15] feat: Update ServerWwd state representation to
 "Idle(Server-WWD)" in display and state machine

---
 firmware/src/display.cpp       | 5 -----
 firmware/src/state_machine.cpp | 2 +-
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/firmware/src/display.cpp b/firmware/src/display.cpp
index 00ba761..07f0815 100644
--- a/firmware/src/display.cpp
+++ b/firmware/src/display.cpp
@@ -143,11 +143,6 @@ void Display::drawForState(StateMachine::State state)
   GFXModule.setTextSize(1);
   GFXModule.setTextColor(font_color, bg_color);
   GFXModule.setCursor(isAtomS3R() ? 4 : 10, bar_y + (isAtomS3R() ? 6 : 2));
-  if (state == StateMachine::ServerWwd)
-  {
-    GFXModule.printf("Idle(Server-WWD)");
-    return;
-  }
   GFXModule.printf("%s", stateToString(state));
 }
 
diff --git a/firmware/src/state_machine.cpp b/firmware/src/state_machine.cpp
index 196aaad..ea38cfd 100644
--- a/firmware/src/state_machine.cpp
+++ b/firmware/src/state_machine.cpp
@@ -14,7 +14,7 @@ const char *stateToString(StateMachine::State s)
 	case StateMachine::Speaking:
 		return "Speaking";
 	case StateMachine::ServerWwd:
-		return "ServerWwd";
+		return "Idle(Server-WWD)";
 	case StateMachine::Disconnected:
 		return "Disconnected";
 	default:

From 5dd2c34b425f9f186c7eb785116e4c53e4110234 Mon Sep 17 00:00:00 2001
From: Atsushi Morimoto <74th.tech@gmail.com>
Date: Sat, 9 May 2026 17:01:00 +0900
Subject: [PATCH 09/15] feat: Make server wakeword start/stop helpers private
 and drop request_server_wakeword_detection

---
 stackchan_server/ws_proxy.py | 48 ++++++------------------------------
 1 file changed, 8 insertions(+), 40 deletions(-)

diff --git a/stackchan_server/ws_proxy.py b/stackchan_server/ws_proxy.py
index 7a5e06b..4857f00 100644
--- a/stackchan_server/ws_proxy.py
+++ b/stackchan_server/ws_proxy.py
@@ -171,7 +171,7 @@ def trigger_wakeword(self) -> None:
     async def wait_for_talk_session(self) -> None:
         while True:
             if self._wakeword_event.is_set():
-                await self.stop_server_wakeword_detection()
+                await self._stop_server_wakeword_detection()
                 self._wakeword_event.clear()
                 return
             if self._closed:
@@ -179,7 +179,7 @@ async def wait_for_talk_session(self) -> None:
             await asyncio.sleep(0.05)
 
     async def listen(self) -> str:
-        await self.stop_server_wakeword_detection()
+        await self._stop_server_wakeword_detection()
         return await self._listener.listen(
             send_state_command=self.send_state_command,
             is_closed=lambda: self._closed,
@@ -246,7 +246,7 @@ async def start(self) -> None:
     async def close(self) -> None:
         self._closed = True
         self._cancel_server_wakeword_restart_task()
-        await self.stop_server_wakeword_detection()
+        await self._stop_server_wakeword_detection()
         if self._receiving_task:
             self._receiving_task.cancel()
             with suppress(asyncio.CancelledError):
@@ -262,9 +262,9 @@ async def start_talking(self, text: str) -> None:
 
     async def enable_auto_server_wakeword_detection(self) -> None:
         self._auto_start_server_wakeword = True
-        await self.start_server_wakeword_detection_if_available()
+        await self._start_server_wakeword_detection_if_available()
 
-    async def start_server_wakeword_detection_if_available(self) -> bool:
+    async def _start_server_wakeword_detection_if_available(self) -> bool:
         if (
             self._closed
             or self._server_wakeword_detector is None
@@ -283,7 +283,7 @@ async def start_server_wakeword_detection_if_available(self) -> bool:
         )
         return True
 
-    async def stop_server_wakeword_detection(self) -> None:
+    async def _stop_server_wakeword_detection(self) -> None:
         self._cancel_server_wakeword_restart_task()
         task = self._server_wakeword_task
         if task is None:
@@ -308,38 +308,6 @@ async def stop_server_wakeword_detection(self) -> None:
         except Exception:
             logger.exception("Server-side wake-word detection task failed")
 
-    async def request_server_wakeword_detection(
-        self,
-        *,
-        timeout_seconds: float | None = None,
-    ) -> bool:
-        if self._server_wakeword_detector is None or not self.server_metadata.has_server_wake_word:
-            raise WakeWordDetectionError(
-                "Server-side wake-word detection is not available for this connection"
-            )
-        if self._closed:
-            raise WebSocketDisconnect()
-
-        started = await self.start_server_wakeword_detection_if_available()
-        if not started:
-            raise WakeWordDetectionError(
-                "Server-side wake-word detection could not be started in the current state"
-            )
-
-        task = self._server_wakeword_task
-        if task is None:
-            raise WakeWordDetectionError("Server-side wake-word detection task is unavailable")
-
-        try:
-            if timeout_seconds is None:
-                return await asyncio.shield(task)
-            return await asyncio.wait_for(asyncio.shield(task), timeout=timeout_seconds)
-        except asyncio.TimeoutError as exc:
-            await self.stop_server_wakeword_detection()
-            raise WakeWordDetectionTimeout(
-                "Server-side wake-word detection timed out"
-            ) from exc
-
     async def _receive_loop(self) -> None:
         try:
             while True:
@@ -534,7 +502,7 @@ async def _handle_firmware_metadata(self, message: Any) -> None:
             )
         )
         if self._auto_start_server_wakeword:
-            await self.start_server_wakeword_detection_if_available()
+            await self._start_server_wakeword_detection_if_available()
 
     def _build_server_metadata(
         self, firmware_metadata: FirmwareMetadata
@@ -680,7 +648,7 @@ async def _restart_server_wakeword_detection_after_delay(
             await asyncio.sleep(delay_seconds)
             if self._closed:
                 return
-            await self.start_server_wakeword_detection_if_available()
+            await self._start_server_wakeword_detection_if_available()
         except asyncio.CancelledError:
             raise
         finally:

From b0e7b702c0b724669d4a284dde87f66aad791951 Mon Sep 17 00:00:00 2001
From: Atsushi Morimoto <74th.tech@gmail.com>
Date: Sat, 9 May 2026 17:12:49 +0900
Subject: [PATCH 10/15] feat: Extract server-side wakeword detection from
 ws_proxy.py into ServerWwdController

---
 stackchan_server/server_wwd.py | 278 +++++++++++++++++++++++++++++++++
 stackchan_server/ws_proxy.py   | 265 +++----------------------------
 2 files changed, 303 insertions(+), 240 deletions(-)
 create mode 100644 stackchan_server/server_wwd.py

diff --git a/stackchan_server/server_wwd.py b/stackchan_server/server_wwd.py
new file mode 100644
index 0000000..193c99f
--- /dev/null
+++ b/stackchan_server/server_wwd.py
@@ -0,0 +1,278 @@
+from __future__ import annotations
+
+import asyncio
+from logging import getLogger
+from typing import Any, Awaitable, Callable, Optional
+
+from .wakeup_word_detection import (
+    WakeWordDetectionError,
+    WakeWordDetectionTimeout,
+    create_server_side_wake_word_detector,
+)
+
+logger = getLogger(__name__)
+
+_SERVER_WAKEWORD_RESTART_DELAY_SECONDS = 0.25
+_TRAILING_PCM_DRAIN_SECONDS = 1.0
+
+
+class ServerWwdController:
+    def __init__(
+        self,
+        *,
+        send_state_command: Callable[[int], Awaitable[None]],
+        set_current_state: Callable[[int], None],
+        close_websocket: Callable[[int, str], Awaitable[None]],
+        current_state: Callable[[], int],
+        has_server_wake_word: Callable[[], bool],
+        is_closed: Callable[[], bool],
+        on_detected: Callable[[], None],
+        has_pending_wakeword: Callable[[], bool],
+        server_wwd_state: int,
+        idle_state: int,
+    ) -> None:
+        self._send_state_command = send_state_command
+        self._set_current_state = set_current_state
+        self._close_websocket = close_websocket
+        self._current_state = current_state
+        self._has_server_wake_word = has_server_wake_word
+        self._is_closed = is_closed
+        self._on_detected = on_detected
+        self._has_pending_wakeword = has_pending_wakeword
+        self._server_wwd_state = server_wwd_state
+        self._idle_state = idle_state
+
+        self._detector = create_server_side_wake_word_detector()
+        self._task: Optional[asyncio.Task[bool]] = None
+        self._restart_task: Optional[asyncio.Task[None]] = None
+        self._auto_start = False
+        self._drain_trailing_pcm_until_end = False
+        self._drain_trailing_pcm_deadline: float | None = None
+
+    @property
+    def available(self) -> bool:
+        return self._detector is not None
+
+    @property
+    def auto_start_enabled(self) -> bool:
+        return self._auto_start
+
+    async def enable_auto_detection(self) -> None:
+        self._auto_start = True
+        await self.start_if_available()
+
+    async def start_if_available(self) -> bool:
+        if (
+            self._is_closed()
+            or self._detector is None
+            or not self._has_server_wake_word()
+            or self._current_state() != self._idle_state
+        ):
+            return False
+
+        if self._task is not None and not self._task.done():
+            return True
+
+        self._cancel_restart_task()
+        self._task = asyncio.create_task(
+            self._run_detection(),
+            name="server-side-wakeword-detection",
+        )
+        return True
+
+    async def stop(self) -> None:
+        self._cancel_restart_task()
+        task = self._task
+        if task is None:
+            return
+
+        if task.done():
+            self._task = None
+            try:
+                await task
+            except asyncio.CancelledError:
+                pass
+            except Exception:
+                logger.exception("Server-side wake-word detection task failed")
+            return
+
+        task.cancel()
+        self._task = None
+        try:
+            await task
+        except asyncio.CancelledError:
+            pass
+        except Exception:
+            logger.exception("Server-side wake-word detection task failed")
+
+    async def handle_pcm_message(self, message: Any, *, ws_pb2: Any) -> bool:
+        body_name = message.WhichOneof("body")
+
+        if self._should_drain_trailing_pcm():
+            if (
+                message.message_type == ws_pb2.MESSAGE_TYPE_START
+                and body_name == "audio_pcm_start"
+            ):
+                logger.info(
+                    "Received a new server-side wake-word PCM START while draining trailing audio; resuming normal routing"
+                )
+                self._clear_trailing_pcm_drain()
+            elif (
+                message.message_type == ws_pb2.MESSAGE_TYPE_DATA
+                and body_name == "audio_pcm_data"
+            ):
+                logger.info(
+                    "Discarding trailing server-side wake-word PCM DATA payload_bytes=%d",
+                    len(message.audio_pcm_data.pcm_bytes),
+                )
+                return True
+            elif (
+                message.message_type == ws_pb2.MESSAGE_TYPE_END
+                and body_name == "audio_pcm_end"
+            ):
+                logger.info("Finished draining trailing server-side wake-word PCM")
+                self._clear_trailing_pcm_drain()
+                return True
+
+        detector = self._detector
+        if detector is None or not detector.running:
+            logger.info(
+                "Ignoring server-side wake-word PCM while detector is inactive type=%s body=%s",
+                message.message_type,
+                body_name,
+            )
+            return True
+
+        if (
+            message.message_type == ws_pb2.MESSAGE_TYPE_START
+            and body_name == "audio_pcm_start"
+        ):
+            await detector.handle_start()
+            return True
+
+        if (
+            message.message_type == ws_pb2.MESSAGE_TYPE_DATA
+            and body_name == "audio_pcm_data"
+        ):
+            payload = bytes(message.audio_pcm_data.pcm_bytes)
+            await detector.handle_data(payload)
+            return True
+
+        if (
+            message.message_type == ws_pb2.MESSAGE_TYPE_END
+            and body_name == "audio_pcm_end"
+        ):
+            await detector.handle_end()
+            return True
+
+        await self._close_websocket(1003, "unknown server wake-word PCM protobuf body")
+        return False
+
+    def schedule_restart(
+        self,
+        delay_seconds: float = _SERVER_WAKEWORD_RESTART_DELAY_SECONDS,
+    ) -> None:
+        if not self._auto_start or self._is_closed():
+            return
+
+        self._cancel_restart_task()
+        self._restart_task = asyncio.create_task(
+            self._restart_after_delay(delay_seconds),
+            name="server-side-wakeword-restart",
+        )
+
+    async def _run_detection(self) -> bool:
+        detector = self._detector
+        if detector is None:
+            return False
+
+        detected = False
+        should_restart = False
+        try:
+            await detector.start()
+            await self._send_state_command(self._server_wwd_state)
+            detected = await detector.wait_result()
+            if detected:
+                self._on_detected()
+            return detected
+        except asyncio.CancelledError:
+            raise
+        except WakeWordDetectionTimeout as exc:
+            logger.info("Server-side wake-word detection stopped: %s", exc)
+            return False
+        except WakeWordDetectionError as exc:
+            logger.warning("Server-side wake-word detection stopped: %s", exc)
+            return False
+        except Exception:
+            logger.exception("Server-side wake-word detection failed")
+            return False
+        finally:
+            await detector.stop()
+            self._arm_trailing_pcm_drain()
+            if not self._is_closed():
+                self._set_current_state(self._idle_state)
+                try:
+                    await self._send_state_command(self._idle_state)
+                except Exception:
+                    logger.exception(
+                        "Failed to return firmware to idle after wake-word detection"
+                    )
+            should_restart = (
+                self._auto_start
+                and not detected
+                and not self._has_pending_wakeword()
+                and not self._is_closed()
+            )
+            if self._task is asyncio.current_task():
+                self._task = None
+            if should_restart:
+                self.schedule_restart()
+
+    def _cancel_restart_task(self) -> None:
+        task = self._restart_task
+        if task is None:
+            return
+        self._restart_task = None
+        task.cancel()
+
+    async def _restart_after_delay(self, delay_seconds: float) -> None:
+        try:
+            await asyncio.sleep(delay_seconds)
+            if self._is_closed():
+                return
+            await self.start_if_available()
+        except asyncio.CancelledError:
+            raise
+        finally:
+            if self._restart_task is asyncio.current_task():
+                self._restart_task = None
+
+    def _arm_trailing_pcm_drain(
+        self,
+        timeout_seconds: float = _TRAILING_PCM_DRAIN_SECONDS,
+    ) -> None:
+        loop = asyncio.get_running_loop()
+        self._drain_trailing_pcm_until_end = True
+        self._drain_trailing_pcm_deadline = loop.time() + timeout_seconds
+
+    def _clear_trailing_pcm_drain(self) -> None:
+        self._drain_trailing_pcm_until_end = False
+        self._drain_trailing_pcm_deadline = None
+
+    def _should_drain_trailing_pcm(self) -> bool:
+        if not self._drain_trailing_pcm_until_end:
+            return False
+        deadline = self._drain_trailing_pcm_deadline
+        if deadline is None:
+            return True
+        if asyncio.get_running_loop().time() <= deadline:
+            return True
+
+        logger.info(
+            "Trailing PCM drain window expired before END arrived; resuming normal routing"
+        )
+        self._clear_trailing_pcm_drain()
+        return False
+
+
+__all__ = ["ServerWwdController"]
diff --git a/stackchan_server/ws_proxy.py b/stackchan_server/ws_proxy.py
index 4857f00..bdcd823 100644
--- a/stackchan_server/ws_proxy.py
+++ b/stackchan_server/ws_proxy.py
@@ -22,14 +22,10 @@
     encode_state_command_message,
     parse_websocket_message,
 )
+from .server_wwd import ServerWwdController
 from .speak import SpeakHandler
 from .static import LISTEN_AUDIO_FORMAT
 from .types import SpeechRecognizer, SpeechSynthesizer
-from .wakeup_word_detection import (
-    WakeWordDetectionError,
-    WakeWordDetectionTimeout,
-    create_server_side_wake_word_detector,
-)
 
 logger = getLogger(__name__)
 
@@ -47,8 +43,6 @@
 )  # half interval for the second segment start
 _LISTEN_AUDIO_TIMEOUT_SECONDS = 10.0
 _DEBUG_RECORDING_ENABLED = os.getenv("DEBUG_RECODING") == "1"
-_SERVER_WAKEWORD_RESTART_DELAY_SECONDS = 0.25
-_TRAILING_PCM_DRAIN_SECONDS = 1.0
 
 
 class FirmwareState(IntEnum):
@@ -125,13 +119,6 @@ def __init__(
             recordings_dir=self.recordings_dir,
             debug_recording=self._debug_recording,
         )
-        self._server_wakeword_detector = create_server_side_wake_word_detector()
-        self._server_wakeword_task: Optional[asyncio.Task[bool]] = None
-        self._server_wakeword_restart_task: Optional[asyncio.Task[None]] = None
-        self._auto_start_server_wakeword = False
-        self._drain_trailing_pcm_until_end = False
-        self._drain_trailing_pcm_deadline: float | None = None
-
         self._receiving_task: Optional[asyncio.Task] = None
         self._closed = False
 
@@ -146,6 +133,20 @@ def __init__(
         self._servo_done_counter = 0
         self._servo_sent_counter = 0
         self._pending_servo_wait_targets: deque[int] = deque()
+        self._server_wwd = ServerWwdController(
+            send_state_command=self.send_state_command,
+            set_current_state=lambda state: setattr(
+                self, "_current_firmware_state", FirmwareState(state)
+            ),
+            close_websocket=self.ws.close,
+            current_state=lambda: int(self._current_firmware_state),
+            has_server_wake_word=lambda: self.server_metadata.has_server_wake_word,
+            is_closed=lambda: self._closed,
+            on_detected=self._wakeword_event.set,
+            has_pending_wakeword=self._wakeword_event.is_set,
+            server_wwd_state=int(FirmwareState.SERVER_WWD),
+            idle_state=int(FirmwareState.IDLE),
+        )
 
     @property
     def closed(self) -> bool:
@@ -161,7 +162,7 @@ def receive_task(self) -> Optional[asyncio.Task]:
 
     @property
     def has_server_wakeword_detector(self) -> bool:
-        return self._server_wakeword_detector is not None
+        return self._server_wwd.available
 
     def trigger_wakeword(self) -> None:
         """Web API から擬似的に WAKEWORD_EVT を発火させる。"""
@@ -171,7 +172,7 @@ def trigger_wakeword(self) -> None:
     async def wait_for_talk_session(self) -> None:
         while True:
             if self._wakeword_event.is_set():
-                await self._stop_server_wakeword_detection()
+                await self._server_wwd.stop()
                 self._wakeword_event.clear()
                 return
             if self._closed:
@@ -179,7 +180,7 @@ async def wait_for_talk_session(self) -> None:
             await asyncio.sleep(0.05)
 
     async def listen(self) -> str:
-        await self._stop_server_wakeword_detection()
+        await self._server_wwd.stop()
         return await self._listener.listen(
             send_state_command=self.send_state_command,
             is_closed=lambda: self._closed,
@@ -205,7 +206,7 @@ async def send_state_command(
     async def reset_state(self) -> None:
         await self.send_state_command(FirmwareState.IDLE)
         self._current_firmware_state = FirmwareState.IDLE
-        self._schedule_server_wakeword_restart()
+        self._server_wwd.schedule_restart()
 
     async def move_servo(self, commands: Sequence[ServoCommand]) -> None:
         previous_counter = self._servo_sent_counter
@@ -245,8 +246,7 @@ async def start(self) -> None:
 
     async def close(self) -> None:
         self._closed = True
-        self._cancel_server_wakeword_restart_task()
-        await self._stop_server_wakeword_detection()
+        await self._server_wwd.stop()
         if self._receiving_task:
             self._receiving_task.cancel()
             with suppress(asyncio.CancelledError):
@@ -261,52 +261,7 @@ async def start_talking(self, text: str) -> None:
         await self.speak(text)
 
     async def enable_auto_server_wakeword_detection(self) -> None:
-        self._auto_start_server_wakeword = True
-        await self._start_server_wakeword_detection_if_available()
-
-    async def _start_server_wakeword_detection_if_available(self) -> bool:
-        if (
-            self._closed
-            or self._server_wakeword_detector is None
-            or not self.server_metadata.has_server_wake_word
-            or self.current_state != FirmwareState.IDLE
-        ):
-            return False
-
-        if self._server_wakeword_task is not None and not self._server_wakeword_task.done():
-            return True
-
-        self._cancel_server_wakeword_restart_task()
-        self._server_wakeword_task = asyncio.create_task(
-            self._run_server_wakeword_detection(),
-            name="server-side-wakeword-detection",
-        )
-        return True
-
-    async def _stop_server_wakeword_detection(self) -> None:
-        self._cancel_server_wakeword_restart_task()
-        task = self._server_wakeword_task
-        if task is None:
-            return
-
-        if task.done():
-            self._server_wakeword_task = None
-            try:
-                await task
-            except asyncio.CancelledError:
-                pass
-            except Exception:
-                logger.exception("Server-side wake-word detection task failed")
-            return
-
-        task.cancel()
-        self._server_wakeword_task = None
-        try:
-            await task
-        except asyncio.CancelledError:
-            pass
-        except Exception:
-            logger.exception("Server-side wake-word detection task failed")
+        await self._server_wwd.enable_auto_detection()
 
     async def _receive_loop(self) -> None:
         try:
@@ -324,7 +279,7 @@ async def _receive_loop(self) -> None:
                     break
 
                 if message.kind == ws_pb2.MESSAGE_KIND_SERVER_WWD_PCM:
-                    if not await self._handle_server_wakeword_pcm_message(message):
+                    if not await self._server_wwd.handle_pcm_message(message, ws_pb2=ws_pb2):
                         break
                     continue
 
@@ -360,69 +315,6 @@ async def _receive_loop(self) -> None:
         finally:
             self._closed = True
 
-    async def _handle_server_wakeword_pcm_message(self, message: Any) -> bool:
-        body_name = message.WhichOneof("body")
-
-        if self._should_drain_trailing_pcm():
-            if (
-                message.message_type == ws_pb2.MESSAGE_TYPE_START
-                and body_name == "audio_pcm_start"
-            ):
-                logger.info(
-                    "Received a new server-side wake-word PCM START while draining trailing audio; resuming normal routing"
-                )
-                self._clear_trailing_pcm_drain()
-            elif (
-                message.message_type == ws_pb2.MESSAGE_TYPE_DATA
-                and body_name == "audio_pcm_data"
-            ):
-                logger.info(
-                    "Discarding trailing server-side wake-word PCM DATA payload_bytes=%d",
-                    len(message.audio_pcm_data.pcm_bytes),
-                )
-                return True
-            elif (
-                message.message_type == ws_pb2.MESSAGE_TYPE_END
-                and body_name == "audio_pcm_end"
-            ):
-                logger.info("Finished draining trailing server-side wake-word PCM")
-                self._clear_trailing_pcm_drain()
-                return True
-
-        detector = self._server_wakeword_detector
-        if detector is None or not detector.running:
-            logger.info(
-                "Ignoring server-side wake-word PCM while detector is inactive type=%s body=%s",
-                message.message_type,
-                body_name,
-            )
-            return True
-
-        if (
-            message.message_type == ws_pb2.MESSAGE_TYPE_START
-            and body_name == "audio_pcm_start"
-        ):
-            await detector.handle_start()
-            return True
-
-        if (
-            message.message_type == ws_pb2.MESSAGE_TYPE_DATA
-            and body_name == "audio_pcm_data"
-        ):
-            payload = bytes(message.audio_pcm_data.pcm_bytes)
-            await detector.handle_data(payload)
-            return True
-
-        if (
-            message.message_type == ws_pb2.MESSAGE_TYPE_END
-            and body_name == "audio_pcm_end"
-        ):
-            await detector.handle_end()
-            return True
-
-        await self.ws.close(code=1003, reason="unknown server wake-word PCM protobuf body")
-        return False
-
     async def _handle_audio_pcm_message(self, message: Any) -> bool:
         body_name = message.WhichOneof("body")
 
@@ -501,13 +393,13 @@ async def _handle_firmware_metadata(self, message: Any) -> None:
                 server_version=self.server_metadata.server_version,
             )
         )
-        if self._auto_start_server_wakeword:
-            await self._start_server_wakeword_detection_if_available()
+        if self._server_wwd.auto_start_enabled:
+            await self._server_wwd.start_if_available()
 
     def _build_server_metadata(
         self, firmware_metadata: FirmwareMetadata
     ) -> ServerMetadata:
-        should_use_server_wake_word = self._server_wakeword_detector is not None
+        should_use_server_wake_word = self._server_wwd.available
         return ServerMetadata(
             has_server_wake_word=should_use_server_wake_word,
             server_version=__version__,
@@ -575,113 +467,6 @@ def _raise_websocket_disconnect_from_runtime_error(self, exc: RuntimeError) -> N
         self._closed = True
         raise WebSocketDisconnect() from exc
 
-    async def _run_server_wakeword_detection(self) -> bool:
-        detector = self._server_wakeword_detector
-        if detector is None:
-            return False
-
-        detected = False
-        should_restart = False
-        try:
-            await detector.start()
-            await self.send_state_command(FirmwareState.SERVER_WWD)
-            detected = await detector.wait_result()
-            if detected:
-                self._wakeword_event.set()
-            return detected
-        except asyncio.CancelledError:
-            raise
-        except WakeWordDetectionTimeout as exc:
-            logger.info("Server-side wake-word detection stopped: %s", exc)
-            return False
-        except WakeWordDetectionError as exc:
-            logger.warning("Server-side wake-word detection stopped: %s", exc)
-            return False
-        except Exception:
-            logger.exception("Server-side wake-word detection failed")
-            return False
-        finally:
-            await detector.stop()
-            self._arm_trailing_pcm_drain()
-            if not self._closed:
-                self._current_firmware_state = FirmwareState.IDLE
-                try:
-                    await self.send_state_command(FirmwareState.IDLE)
-                except Exception:
-                    logger.exception("Failed to return firmware to idle after wake-word detection")
-            should_restart = (
-                self._auto_start_server_wakeword
-                and not detected
-                and not self._wakeword_event.is_set()
-                and not self._closed
-            )
-            if self._server_wakeword_task is asyncio.current_task():
-                self._server_wakeword_task = None
-            if should_restart:
-                self._schedule_server_wakeword_restart()
-
-    def _schedule_server_wakeword_restart(
-        self,
-        delay_seconds: float = _SERVER_WAKEWORD_RESTART_DELAY_SECONDS,
-    ) -> None:
-        if not self._auto_start_server_wakeword or self._closed:
-            return
-
-        self._cancel_server_wakeword_restart_task()
-        self._server_wakeword_restart_task = asyncio.create_task(
-            self._restart_server_wakeword_detection_after_delay(delay_seconds),
-            name="server-side-wakeword-restart",
-        )
-
-    def _cancel_server_wakeword_restart_task(self) -> None:
-        task = self._server_wakeword_restart_task
-        if task is None:
-            return
-        self._server_wakeword_restart_task = None
-        task.cancel()
-
-    async def _restart_server_wakeword_detection_after_delay(
-        self,
-        delay_seconds: float,
-    ) -> None:
-        try:
-            await asyncio.sleep(delay_seconds)
-            if self._closed:
-                return
-            await self._start_server_wakeword_detection_if_available()
-        except asyncio.CancelledError:
-            raise
-        finally:
-            if self._server_wakeword_restart_task is asyncio.current_task():
-                self._server_wakeword_restart_task = None
-
-    def _arm_trailing_pcm_drain(
-        self,
-        timeout_seconds: float = _TRAILING_PCM_DRAIN_SECONDS,
-    ) -> None:
-        loop = asyncio.get_running_loop()
-        self._drain_trailing_pcm_until_end = True
-        self._drain_trailing_pcm_deadline = loop.time() + timeout_seconds
-
-    def _clear_trailing_pcm_drain(self) -> None:
-        self._drain_trailing_pcm_until_end = False
-        self._drain_trailing_pcm_deadline = None
-
-    def _should_drain_trailing_pcm(self) -> bool:
-        if not self._drain_trailing_pcm_until_end:
-            return False
-        deadline = self._drain_trailing_pcm_deadline
-        if deadline is None:
-            return True
-        if asyncio.get_running_loop().time() <= deadline:
-            return True
-
-        logger.info(
-            "Trailing PCM drain window expired before END arrived; resuming normal routing"
-        )
-        self._clear_trailing_pcm_drain()
-        return False
-
     async def _wait_for_counter(
         self,
         *,

From 2fc4c1190d84d3845d9acae037e2f69bdf9b8269 Mon Sep 17 00:00:00 2001
From: Atsushi Morimoto <74th.tech@gmail.com>
Date: Sat, 9 May 2026 17:19:28 +0900
Subject: [PATCH 11/15] feat: Remove has_server_wake_word dependency and
 streamline auto detection logic

---
 stackchan_server/server_wwd.py | 4 ----
 stackchan_server/ws_proxy.py   | 3 ++-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/stackchan_server/server_wwd.py b/stackchan_server/server_wwd.py
index 193c99f..5474745 100644
--- a/stackchan_server/server_wwd.py
+++ b/stackchan_server/server_wwd.py
@@ -24,7 +24,6 @@ def __init__(
         set_current_state: Callable[[int], None],
         close_websocket: Callable[[int, str], Awaitable[None]],
         current_state: Callable[[], int],
-        has_server_wake_word: Callable[[], bool],
         is_closed: Callable[[], bool],
         on_detected: Callable[[], None],
         has_pending_wakeword: Callable[[], bool],
@@ -35,7 +34,6 @@ def __init__(
         self._set_current_state = set_current_state
         self._close_websocket = close_websocket
         self._current_state = current_state
-        self._has_server_wake_word = has_server_wake_word
         self._is_closed = is_closed
         self._on_detected = on_detected
         self._has_pending_wakeword = has_pending_wakeword
@@ -59,13 +57,11 @@ def auto_start_enabled(self) -> bool:
 
     async def enable_auto_detection(self) -> None:
         self._auto_start = True
-        await self.start_if_available()
 
     async def start_if_available(self) -> bool:
         if (
             self._is_closed()
             or self._detector is None
-            or not self._has_server_wake_word()
             or self._current_state() != self._idle_state
         ):
             return False
diff --git a/stackchan_server/ws_proxy.py b/stackchan_server/ws_proxy.py
index bdcd823..c098f81 100644
--- a/stackchan_server/ws_proxy.py
+++ b/stackchan_server/ws_proxy.py
@@ -140,7 +140,6 @@ def __init__(
             ),
             close_websocket=self.ws.close,
             current_state=lambda: int(self._current_firmware_state),
-            has_server_wake_word=lambda: self.server_metadata.has_server_wake_word,
             is_closed=lambda: self._closed,
             on_detected=self._wakeword_event.set,
             has_pending_wakeword=self._wakeword_event.is_set,
@@ -262,6 +261,8 @@ async def start_talking(self, text: str) -> None:
 
     async def enable_auto_server_wakeword_detection(self) -> None:
         await self._server_wwd.enable_auto_detection()
+        if self.firmware_metadata is not None:
+            await self._server_wwd.start_if_available()
 
     async def _receive_loop(self) -> None:
         try:

From a29aad296c74de3cb490f85863340bf432af8d76 Mon Sep 17 00:00:00 2001
From: Atsushi Morimoto <74th.tech@gmail.com>
Date: Sat, 9 May 2026 17:20:35 +0900
Subject: [PATCH 12/15] feat: Remove has_pending_wakeword parameter and add
 suppress_restart logic in ServerWwdController

---
 stackchan_server/server_wwd.py | 12 ++++++++----
 stackchan_server/ws_proxy.py   |  1 -
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/stackchan_server/server_wwd.py b/stackchan_server/server_wwd.py
index 5474745..43bb73e 100644
--- a/stackchan_server/server_wwd.py
+++ b/stackchan_server/server_wwd.py
@@ -26,7 +26,6 @@ def __init__(
         current_state: Callable[[], int],
         is_closed: Callable[[], bool],
         on_detected: Callable[[], None],
-        has_pending_wakeword: Callable[[], bool],
         server_wwd_state: int,
         idle_state: int,
     ) -> None:
@@ -36,7 +35,6 @@ def __init__(
         self._current_state = current_state
         self._is_closed = is_closed
         self._on_detected = on_detected
-        self._has_pending_wakeword = has_pending_wakeword
         self._server_wwd_state = server_wwd_state
         self._idle_state = idle_state
 
@@ -44,6 +42,7 @@ def __init__(
         self._task: Optional[asyncio.Task[bool]] = None
         self._restart_task: Optional[asyncio.Task[None]] = None
         self._auto_start = False
+        self._suppress_restart_once = False
         self._drain_trailing_pcm_until_end = False
         self._drain_trailing_pcm_deadline: float | None = None
 
@@ -76,12 +75,15 @@ async def start_if_available(self) -> bool:
         )
         return True
 
-    async def stop(self) -> None:
+    async def stop(self, *, suppress_restart: bool = True) -> None:
         self._cancel_restart_task()
         task = self._task
         if task is None:
             return
 
+        if suppress_restart and not task.done():
+            self._suppress_restart_once = True
+
         if task.done():
             self._task = None
             try:
@@ -213,10 +215,12 @@ async def _run_detection(self) -> bool:
                     logger.exception(
                         "Failed to return firmware to idle after wake-word detection"
                     )
+            suppress_restart = self._suppress_restart_once
+            self._suppress_restart_once = False
             should_restart = (
                 self._auto_start
                 and not detected
-                and not self._has_pending_wakeword()
+                and not suppress_restart
                 and not self._is_closed()
             )
             if self._task is asyncio.current_task():
diff --git a/stackchan_server/ws_proxy.py b/stackchan_server/ws_proxy.py
index c098f81..1c414cc 100644
--- a/stackchan_server/ws_proxy.py
+++ b/stackchan_server/ws_proxy.py
@@ -142,7 +142,6 @@ def __init__(
             current_state=lambda: int(self._current_firmware_state),
             is_closed=lambda: self._closed,
             on_detected=self._wakeword_event.set,
-            has_pending_wakeword=self._wakeword_event.is_set,
             server_wwd_state=int(FirmwareState.SERVER_WWD),
             idle_state=int(FirmwareState.IDLE),
         )

From 8f7e40e87bd18714037ab323dcb82bd72d5b8d4b Mon Sep 17 00:00:00 2001
From: Atsushi Morimoto <74th.tech@gmail.com>
Date: Sat, 9 May 2026 17:24:06 +0900
Subject: [PATCH 13/15] feat: Refactor ServerWwdController initialization and
 state management methods

---
 stackchan_server/server_wwd.py | 27 +++++++++------------------
 stackchan_server/ws_proxy.py   | 17 ++++++++++-------
 2 files changed, 19 insertions(+), 25 deletions(-)

diff --git a/stackchan_server/server_wwd.py b/stackchan_server/server_wwd.py
index 43bb73e..b249857 100644
--- a/stackchan_server/server_wwd.py
+++ b/stackchan_server/server_wwd.py
@@ -20,23 +20,19 @@ class ServerWwdController:
     def __init__(
         self,
         *,
-        send_state_command: Callable[[int], Awaitable[None]],
-        set_current_state: Callable[[int], None],
+        enter_server_wwd: Callable[[], Awaitable[None]],
+        return_to_idle: Callable[[], Awaitable[None]],
         close_websocket: Callable[[int, str], Awaitable[None]],
-        current_state: Callable[[], int],
+        is_idle_state: Callable[[], bool],
         is_closed: Callable[[], bool],
         on_detected: Callable[[], None],
-        server_wwd_state: int,
-        idle_state: int,
     ) -> None:
-        self._send_state_command = send_state_command
-        self._set_current_state = set_current_state
+        self._enter_server_wwd = enter_server_wwd
+        self._return_to_idle = return_to_idle
         self._close_websocket = close_websocket
-        self._current_state = current_state
+        self._is_idle_state = is_idle_state
         self._is_closed = is_closed
         self._on_detected = on_detected
-        self._server_wwd_state = server_wwd_state
-        self._idle_state = idle_state
 
         self._detector = create_server_side_wake_word_detector()
         self._task: Optional[asyncio.Task[bool]] = None
@@ -58,11 +54,7 @@ async def enable_auto_detection(self) -> None:
         self._auto_start = True
 
     async def start_if_available(self) -> bool:
-        if (
-            self._is_closed()
-            or self._detector is None
-            or self._current_state() != self._idle_state
-        ):
+        if self._is_closed() or self._detector is None or not self._is_idle_state():
             return False
 
         if self._task is not None and not self._task.done():
@@ -188,7 +180,7 @@ async def _run_detection(self) -> bool:
         should_restart = False
         try:
             await detector.start()
-            await self._send_state_command(self._server_wwd_state)
+            await self._enter_server_wwd()
             detected = await detector.wait_result()
             if detected:
                 self._on_detected()
@@ -208,9 +200,8 @@ async def _run_detection(self) -> bool:
             await detector.stop()
             self._arm_trailing_pcm_drain()
             if not self._is_closed():
-                self._set_current_state(self._idle_state)
                 try:
-                    await self._send_state_command(self._idle_state)
+                    await self._return_to_idle()
                 except Exception:
                     logger.exception(
                         "Failed to return firmware to idle after wake-word detection"
diff --git a/stackchan_server/ws_proxy.py b/stackchan_server/ws_proxy.py
index 1c414cc..2d8154e 100644
--- a/stackchan_server/ws_proxy.py
+++ b/stackchan_server/ws_proxy.py
@@ -134,16 +134,12 @@ def __init__(
         self._servo_sent_counter = 0
         self._pending_servo_wait_targets: deque[int] = deque()
         self._server_wwd = ServerWwdController(
-            send_state_command=self.send_state_command,
-            set_current_state=lambda state: setattr(
-                self, "_current_firmware_state", FirmwareState(state)
-            ),
+            enter_server_wwd=self._enter_server_wwd_state,
+            return_to_idle=self._return_to_idle_state,
             close_websocket=self.ws.close,
-            current_state=lambda: int(self._current_firmware_state),
+            is_idle_state=lambda: self._current_firmware_state == FirmwareState.IDLE,
             is_closed=lambda: self._closed,
             on_detected=self._wakeword_event.set,
-            server_wwd_state=int(FirmwareState.SERVER_WWD),
-            idle_state=int(FirmwareState.IDLE),
         )
 
     @property
@@ -448,6 +444,13 @@ async def _send_state_command(
             )
         )
 
+    async def _enter_server_wwd_state(self) -> None:  # ServerWwdController hook
+        await self.send_state_command(FirmwareState.SERVER_WWD)
+
+    async def _return_to_idle_state(self) -> None:  # ServerWwdController hook
+        self._current_firmware_state = FirmwareState.IDLE  # set before sending
+        await self.send_state_command(FirmwareState.IDLE)
+
     async def _send_ws_bytes(self, data: bytes) -> None:
         try:
             await self.ws.send_bytes(data)

From d0ad278c22d663c5cd6801dcc07d4dc66009ec00 Mon Sep 17 00:00:00 2001
From: Atsushi Morimoto <74th.tech@gmail.com>
Date: Sun, 10 May 2026 16:30:08 +0900
Subject: [PATCH 14/15] feat: Add workflows for version bump check and release
 management

---
 .../check-version-bump-for-main-pr.yml        |  75 +++++++++
 .github/workflows/release-on-main.yml         | 145 ++++++++++++++++++
 2 files changed, 220 insertions(+)
 create mode 100644 .github/workflows/check-version-bump-for-main-pr.yml
 create mode 100644 .github/workflows/release-on-main.yml

diff --git a/.github/workflows/check-version-bump-for-main-pr.yml b/.github/workflows/check-version-bump-for-main-pr.yml
new file mode 100644
index 0000000..71ef4a8
--- /dev/null
+++ b/.github/workflows/check-version-bump-for-main-pr.yml
@@ -0,0 +1,75 @@
+name: Check Version Bump For Main Or Test PR
+
+on:
+  pull_request:
+    branches:  # base (target) branches this check applies to
+      - main
+      - test
+    types:
+      - opened
+      - synchronize
+      - reopened
+      - edited
+      - ready_for_review
+
+permissions:
+  contents: read  # the job only needs to read repository contents
+
+jobs:
+  check-version-bump:
+    if: github.head_ref == 'develop'  # only PRs whose source branch is develop
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"  # the comparison script imports tomllib (Python 3.11+)
+
+      - name: Compare pyproject.toml versions
+        env:
+          BASE_REF: ${{ github.base_ref }}
+          HEAD_REF: ${{ github.head_ref }}
+        run: |
+          set -euo pipefail
+          git fetch origin "${BASE_REF}" --depth=1
+          python - <<'PY'
+          import os
+          import subprocess
+          import tomllib
+          from pathlib import Path
+          # Return [project].version from pyproject.toml text; abort the job if absent.
+          def read_version(pyproject_text: str) -> str:
+              data = tomllib.loads(pyproject_text)
+              version = data.get("project", {}).get("version")
+              if not version:
+                  raise SystemExit("project.version not found in pyproject.toml")
+              return version
+          # TOML is UTF-8 by specification, so decode both sides explicitly, not via locale.
+          base_ref = os.environ["BASE_REF"]
+          head_ref = os.environ["HEAD_REF"]
+          base_pyproject = subprocess.check_output(
+              ["git", "show", f"origin/{base_ref}:pyproject.toml"],
+              encoding="utf-8",
+          )
+          head_pyproject = Path("pyproject.toml").read_text(encoding="utf-8")
+
+          base_version = read_version(base_pyproject)
+          head_version = read_version(head_pyproject)
+
+          print(f"Base branch version ({base_ref}): {base_version}")
+          print(f"PR branch version ({head_ref}): {head_version}")
+          # Unchanged version: emit a GitHub error annotation and fail the job.
+          if base_version == head_version:
+              print(
+                  f"::error file=pyproject.toml,title=Version bump required::"
+                  f"develop -> {base_ref} PR must update [project].version in pyproject.toml. "
+                  f"{base_ref} is {base_version} and this PR is still {head_version}."
+              )
+              raise SystemExit(1)
+          PY
diff --git a/.github/workflows/release-on-main.yml b/.github/workflows/release-on-main.yml
new file mode 100644
index 0000000..ed6cf0a
--- /dev/null
+++ b/.github/workflows/release-on-main.yml
@@ -0,0 +1,145 @@
+name: Release On Main Or Test
+
+on:
+  push:
+    branches:
+      - main
+      - test
+
+permissions:
+  contents: write  # needed to create the tag ref and the release
+  pull-requests: read
+
+concurrency:
+  group: release-${{ github.ref }}  # one group per pushed ref
+  cancel-in-progress: false  # never cancel a release run that is already in progress
+
+jobs:
+  release:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # full history and tags; a later step lists existing tags
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"  # the version script imports tomllib (Python 3.11+)
+
+      - name: Read release version from pyproject.toml
+        id: version
+        run: |
+          python - <<'PY'
+          import os
+          import tomllib
+          from pathlib import Path
+          # TOML is UTF-8 by specification, so decode explicitly instead of via the locale.
+          data = tomllib.loads(Path("pyproject.toml").read_text(encoding="utf-8"))
+          version = data["project"]["version"]
+          # Expose the version as the step output "tag", read by the later steps.
+          with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as fh:
+              print(f"tag={version}", file=fh)
+
+          print(f"Release version: {version}")
+          PY
+
+      - name: Find previous version tag
+        id: previous
+        shell: bash
+        env:
+          # Pass the tag through the environment instead of expanding the
+          # expression inside the script, so its text is never parsed as shell.
+          CURRENT_TAG: ${{ steps.version.outputs.tag }}
+        run: |
+          set -euo pipefail
+          # Newest version-like tag other than the current one; empty when none exists.
+          previous_tag="$(git tag --list '[0-9]*' --sort=-version:refname | grep -Fxv "${CURRENT_TAG}" | head -n 1 || true)"
+          echo "previous_tag=${previous_tag}" >> "$GITHUB_OUTPUT"
+          echo "Previous tag: ${previous_tag:-none}"
+
+      - name: Create tag if needed
+        uses: actions/github-script@v7
+        env:
+          TAG: ${{ steps.version.outputs.tag }}
+          TARGET_SHA: ${{ github.sha }}
+        with:
+          script: |
+            const { owner, repo } = context.repo;
+            const tag = process.env.TAG;
+            const targetSha = process.env.TARGET_SHA;
+
+            try {  // look up the tag ref; a 404 means it does not exist yet
+              await github.rest.git.getRef({
+                owner,
+                repo,
+                ref: `tags/${tag}`,
+              });
+              core.notice(`Tag ${tag} already exists.`);
+            } catch (error) {
+              if (error.status !== 404) {
+                throw error;  // any other failure is unexpected: fail the step
+              }
+
+              await github.rest.git.createRef({  // create the tag at TARGET_SHA
+                owner,
+                repo,
+                ref: `refs/tags/${tag}`,
+                sha: targetSha,
+              });
+              core.notice(`Created tag ${tag} at ${targetSha}.`);
+            }
+
+      - name: Create GitHub Release if needed
+        uses: actions/github-script@v7
+        env:
+          TAG: ${{ steps.version.outputs.tag }}
+          PREVIOUS_TAG: ${{ steps.previous.outputs.previous_tag }}
+          TARGET_SHA: ${{ github.sha }}
+        with:
+          script: |
+            const { owner, repo } = context.repo;
+            const tag = process.env.TAG;
+            const previousTag = process.env.PREVIOUS_TAG;
+            const targetSha = process.env.TARGET_SHA;
+
+            try {  // skip everything when a release for this tag already exists
+              const existing = await github.rest.repos.getReleaseByTag({
+                owner,
+                repo,
+                tag,
+              });
+              core.notice(`Release for ${tag} already exists: ${existing.data.html_url}`);
+              return;
+            } catch (error) {
+              if (error.status !== 404) {
+                throw error;  // only "release not found" lets the step continue
+              }
+            }
+
+            const notes = await github.request(
+              "POST /repos/{owner}/{repo}/releases/generate-notes",
+              {
+                owner,
+                repo,
+                tag_name: tag,
+                target_commitish: targetSha,
+                ...(previousTag ? { previous_tag_name: previousTag } : {}),  // omitted when no earlier tag was found
+              },
+            );
+
+            const release = await github.rest.repos.createRelease({
+              owner,
+              repo,
+              tag_name: tag,
+              target_commitish: targetSha,
+              name: tag,
+              body: notes.data.body,
+              draft: false,
+              prerelease: false,
+              generate_release_notes: false,  // body already carries the generated notes
+            });
+
+            core.notice(`Created release ${release.data.html_url}`);

From da6e0a7534d5a0f8a652c5652479ec7a7db98fb5 Mon Sep 17 00:00:00 2001
From: Atsushi Morimoto <74th.tech@gmail.com>
Date: Sun, 10 May 2026 17:04:35 +0900
Subject: [PATCH 15/15] bump version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 90638f2..b6d4e0f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "websocket-control-stackchan-server"
-version = "0.1.0"
+version = "0.2.0"
 description = "A WebSocket control interface for StackChan AI agent"
 readme = "README.md"
 requires-python = ">=3.13"