From 03c5b4dd73481ea2f543b226994b03a8bda06297 Mon Sep 17 00:00:00 2001
From: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
Date: Tue, 12 May 2026 14:13:08 +0300
Subject: [PATCH] Audio: MFCC: Add Voice Activity Detection based on Mel
 spectrum

This patch adds a new mfcc_vad module. It operates on the Mel
log spectrum values produced by the MFCC component. The VAD is
very simple and not very selective for voice vs. other signals,
but the continuously updated background noise estimate prevents
stationary noises from triggering the VAD.

The algorithm tracks a per-bin noise floor (instant-down, slow-rise)
and computes an A-weighted energy delta. The weighting emphasizes
speech frequencies. Speech is declared when the delta exceeds a
threshold (0.35 in Q9.23) with a 20-frame hangover to prevent rapid
toggling.

The VAD flag is inserted into the output stream as the first value
after the magic header word in all format paths (S16, S24, S32).

A new Kconfig option CONFIG_COMP_MFCC_VAD (depends on COMP_MFCC,
default y) gates compilation of the VAD code and the stream format
change.

The README.txt file is updated with instructions for running the
example Python script sof_mel_to_text_live_dsp_vad.py. It uses the
MFCC Mel spectrum data and VAD flag stream as audio features for the
Whisper speech-to-text model. The README is converted to Markdown.

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
---
 src/arch/host/configs/library_defconfig       |   1 +
 src/audio/mfcc/CMakeLists.txt                 |   3 +
 src/audio/mfcc/Kconfig                        |  11 +
 src/audio/mfcc/mfcc_common.c                  |  49 +-
 src/audio/mfcc/mfcc_setup.c                   |  20 +
 src/audio/mfcc/mfcc_vad.c                     | 244 ++++++++++
 src/audio/mfcc/tune/README.md                 |  98 ++++
 src/audio/mfcc/tune/README.txt                |  52 --
 src/audio/mfcc/tune/decode_mel.m              |  39 +-
 .../mfcc/tune/sof_mel_to_text_live_dsp_vad.py | 454 ++++++++++++++++++
 src/include/sof/audio/mfcc/mfcc_comp.h        |   8 +
 src/include/sof/audio/mfcc/mfcc_vad.h         | 101 ++++
 12 files changed, 1015 insertions(+), 65 deletions(-)
 create mode 100644 src/audio/mfcc/mfcc_vad.c
 create mode 100644 src/audio/mfcc/tune/README.md
 delete mode 100644 src/audio/mfcc/tune/README.txt
 create mode 100644 src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py
 create mode 100644 src/include/sof/audio/mfcc/mfcc_vad.h

diff --git a/src/arch/host/configs/library_defconfig b/src/arch/host/configs/library_defconfig
index 28c486bec58d..34ea0fe051f3 100644
--- a/src/arch/host/configs/library_defconfig
+++ b/src/arch/host/configs/library_defconfig
@@ -11,6 +11,7 @@ CONFIG_COMP_IIR=y
 CONFIG_COMP_IGO_NR=y
 CONFIG_COMP_LEVEL_MULTIPLIER=y
 CONFIG_COMP_MFCC=y
+CONFIG_COMP_MFCC_VAD=y
 CONFIG_COMP_MODULE_ADAPTER=y
 CONFIG_COMP_MULTIBAND_DRC=y
 CONFIG_COMP_MUX=y
diff --git a/src/audio/mfcc/CMakeLists.txt b/src/audio/mfcc/CMakeLists.txt
index f8af79d1ca8a..433aa824e713 100644
--- a/src/audio/mfcc/CMakeLists.txt
+++ b/src/audio/mfcc/CMakeLists.txt
@@ -5,4 +5,7 @@ if(CONFIG_COMP_MFCC STREQUAL "m" AND DEFINED CONFIG_LLEXT)
   add_dependencies(app mfcc)
 else()
   add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c)
+  if(CONFIG_COMP_MFCC_VAD)
+    add_local_sources(sof mfcc_vad.c)
+  endif()
 endif()
diff --git a/src/audio/mfcc/Kconfig b/src/audio/mfcc/Kconfig
index f56cadb40de2..821a3d22018d 100644
--- a/src/audio/mfcc/Kconfig
+++ b/src/audio/mfcc/Kconfig
@@ -24,3 +24,14 @@ config COMP_MFCC
 	  The characteristic of the audio features are defined in the binary
 	  control blob. Directory tools/tune/mfcc contains a tool to create
 	  the configurations.
+
+config COMP_MFCC_VAD
+	bool "MFCC Voice Activity Detection"
+	depends on COMP_MFCC
+	default y
+	help
+	  This option enables a Voice Activity Detector (VAD) that operates
+	  on the Mel spectrum values produced by the MFCC component. The VAD
+	  flag is inserted into the output stream as the first int32_t value
+	  after the magic header word. The VAD tracks a per-bin noise floor
+	  and detects speech using a weighted energy delta with hangover.
diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c
index 1079864e9259..75e027794449 100644
--- a/src/audio/mfcc/mfcc_common.c
+++ b/src/audio/mfcc/mfcc_common.c
@@ -21,6 +21,10 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#ifdef CONFIG_COMP_MFCC_VAD
+#include <sof/audio/mfcc/mfcc_vad.h>
+#endif
+
 LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL);
 
 /*
@@ -144,6 +148,10 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data *
 					sat_int32(Q_MULTSR_32X32(s, config->mel_scale, 23, 12, 23));
 			}
 
+#ifdef CONFIG_COMP_MFCC_VAD
+			/* Run VAD on the mel log spectrum before further processing */
+			state->vad_flag = mfcc_vad_update(&cd->vad, state->mel_log_32);
+#endif
 			/* Store Q9.7 version in mel_spectra for s16 output mode */
 			for (j = 0; j < state->dct.num_in; j++)
 				state->mel_spectra->data[j] =
@@ -282,10 +290,14 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer
 
 	/* If new output produced, set up pointer into scratch data and mark magic pending */
 	if (num_ceps > 0) {
-		if (state->mel_only)
+		if (state->mel_only) {
 			state->out_data_ptr = state->mel_spectra->data;
-		else
+#ifdef CONFIG_COMP_MFCC_VAD
+			state->vad_pending = true;
+#endif
+		} else {
 			state->out_data_ptr = state->cepstral_coef->data;
+		}
 
 		state->out_remain = num_ceps;
 		state->magic_pending = true;
@@ -301,6 +313,15 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer
 		state->magic_pending = false;
 	}
 
+#ifdef CONFIG_COMP_MFCC_VAD
+	/* Write VAD flag as first value after magic (as two int16_t = one int32_t) */
+	if (state->vad_pending && sink_samples >= 2) {
+		w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, 2, (int16_t *)&state->vad_flag);
+		sink_samples -= 2;
+		state->vad_pending = false;
+	}
+#endif
+
 	/* Write cepstral/mel data from scratch buffer */
 	to_copy = MIN(state->out_remain, sink_samples);
 	if (to_copy > 0) {
@@ -386,6 +407,9 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer
 				state->mel_log_32[k] >>= 8;
 
 			state->out_data_ptr_32 = state->mel_log_32;
+#ifdef CONFIG_COMP_MFCC_VAD
+			state->vad_pending = true;
+#endif
 		} else {
 			state->out_data_ptr = state->cepstral_coef->data;
 		}
@@ -404,6 +428,15 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer
 		state->magic_pending = false;
 	}
 
+#ifdef CONFIG_COMP_MFCC_VAD
+	/* Write VAD flag as first value after magic */
+	if (state->vad_pending && sink_samples >= 1) {
+		w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, 1, &state->vad_flag);
+		sink_samples -= 1;
+		state->vad_pending = false;
+	}
+#endif
+
 	if (state->mel_only) {
 		/* Write 32-bit mel data Q9.15, one value per int32_t */
 		to_copy = MIN(state->out_remain, sink_samples);
@@ -461,6 +494,9 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer
 	if (num_ceps > 0) {
 		if (state->mel_only) {
 			state->out_data_ptr_32 = state->mel_log_32;
+#ifdef CONFIG_COMP_MFCC_VAD
+			state->vad_pending = true;
+#endif
 		} else {
 			state->out_data_ptr = state->cepstral_coef->data;
 		}
@@ -479,6 +515,15 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer
 		state->magic_pending = false;
 	}
 
+#ifdef CONFIG_COMP_MFCC_VAD
+	/* Write VAD flag as first value after magic */
+	if (state->vad_pending && sink_samples >= 1) {
+		w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, 1, &state->vad_flag);
+		sink_samples -= 1;
+		state->vad_pending = false;
+	}
+#endif
+
 	if (state->mel_only) {
 		/* Write 32-bit mel data Q9.23, one value per int32_t */
 		to_copy = MIN(state->out_remain, sink_samples);
diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c
index 1cad4b2b984e..aa83f93d8e3a 100644
--- a/src/audio/mfcc/mfcc_setup.c
+++ b/src/audio/mfcc/mfcc_setup.c
@@ -18,6 +18,10 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#ifdef CONFIG_COMP_MFCC_VAD
+#include <sof/audio/mfcc/mfcc_vad.h>
+#endif
+
 /* Definitions for cepstral lifter */
 #define PI_Q23 Q_CONVERT_FLOAT(3.1415926536, 23)
 #define TWO_PI_Q23 Q_CONVERT_FLOAT(6.2831853072, 23)
@@ -346,10 +350,22 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
 	state->waiting_fill = true;
 	state->prev_samples_valid = false;
 	state->magic_pending = false;
+#ifdef CONFIG_COMP_MFCC_VAD
+	state->vad_pending = false;
+	state->vad_flag = 0;
+#endif
 	state->out_data_ptr = NULL;
 	state->out_data_ptr_32 = NULL;
 	state->out_remain = 0;
 
+#ifdef CONFIG_COMP_MFCC_VAD
+	ret = mfcc_vad_init(&cd->vad, config->num_mel_bins, sample_rate, mod);
+	if (ret < 0) {
+		comp_err(dev, "Failed VAD init");
+		goto free_lifter;
+	}
+#endif
+
 	comp_dbg(dev, "done");
 	return 0;
 
@@ -389,4 +405,8 @@ void mfcc_free_buffers(struct processing_module *mod)
 	mod_free(mod, cd->state.melfb.data);
 	mod_free(mod, cd->state.dct.matrix);
 	mod_free(mod, cd->state.lifter.matrix);
+#ifdef CONFIG_COMP_MFCC_VAD
+	mod_free(mod, cd->vad.noise_floor);
+	mod_free(mod, cd->vad.weights);
+#endif
 }
diff --git a/src/audio/mfcc/mfcc_vad.c b/src/audio/mfcc/mfcc_vad.c
new file mode 100644
index 000000000000..cdcc3d7eaa12
--- /dev/null
+++ b/src/audio/mfcc/mfcc_vad.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: BSD-3-Clause
+//
+// Copyright(c) 2026 Intel Corporation.
+//
+// Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
+
+/**
+ * \file mfcc_vad.c
+ * \brief Voice Activity Detection based on Mel spectrum energy.
+ *
+ * Implements a VAD that tracks per-bin noise floor and computes a
+ * speech-frequency weighted energy above the floor. Speech is declared
+ * when the weighted delta exceeds a threshold, with hangover to prevent
+ * rapid toggling.
+ */
+
+#include <sof/audio/mfcc/mfcc_vad.h>
+
+#ifdef CONFIG_COMP_MFCC_VAD
+
+#include <sof/audio/component.h>
+#include <sof/audio/format.h>
+#include <sof/audio/module_adapter/module/module_interface.h>
+#include <sof/math/auditory.h>
+#include <sof/trace/trace.h>
+#include <errno.h>
+#include <stddef.h>
+
+LOG_MODULE_DECLARE(mfcc, CONFIG_SOF_LOG_LEVEL);
+
+/**
+ * \brief A-weighting table: 1/3 octave band center frequencies in Hz (Q16.0).
+ *
+ * From IEC 61672-1:2013, source:
+ * https://acousticalengineer.com/a-weighting-table/
+ */
+#define A_WEIGHT_TABLE_SIZE	36
+
+static const int16_t a_weight_hz[A_WEIGHT_TABLE_SIZE] = {
+	    6,     8,    10,    13,    16,    20,    25,    32,
+	   40,    50,    63,    80,   100,   125,   160,   200,
+	  250,   315,   400,   500,   630,   800,  1000,  1250,
+	 1600,  2000,  2500,  3150,  4000,  5000,  6300,  8000,
+	10000, 12500, 16000, 20000,
+};
+
+/**
+ * \brief A-weighting linear amplitude, scaled so peak (at 2500 Hz) maps
+ *        to INT16_MAX (32767).  Original dB values converted via
+ *        10^(dB/20) then scaled by 32767 / max.
+ */
+static const int16_t a_weight_lin[A_WEIGHT_TABLE_SIZE] = {
+	    2,     4,     9,    19,    43,    85,   162,   299,
+	  531,   862,  1382,  2140,  3129,  4370,  6172,  8136,
+	10362, 13196, 16234, 19518, 22669, 25730, 28212, 30230,
+	31655, 32392, 32767, 32392, 31655, 30230, 27889, 24856,
+	21156, 17196, 13045,  9670,
+};
+
+/**
+ * \brief Compute A-weighted speech-frequency emphasis weights for Mel bins.
+ *
+ * Weights are linearly interpolated from the A-weighting table at each Mel
+ * bin center frequency, then normalized to Q1.15 to sum to approximately
+ * 2^15. A weight that normalizes to 1.0 is saturated to INT16_MAX.
+ *
+ * \param[out] weights Output weight array with num_mel entries.
+ * \param[in] num_mel Number of Mel bins, nothing is done if not positive.
+ * \param[in] sample_rate Sample rate in Hz, Nyquist is clamped to INT16_MAX.
+ */
+static void mfcc_vad_build_weights(int16_t *weights, int num_mel, int sample_rate)
+{
+	int32_t scaled, num, nyquist;
+	int32_t sum = 0;
+	int16_t f_hz, f0, f1, w, w0, w1, mel_step;
+	int i, j;
+
+	if (num_mel <= 0)
+		return;
+
+	/* The Hz argument is 16 bits, clamp Nyquist instead of letting it wrap */
+	nyquist = sample_rate / 2 > INT16_MAX ? INT16_MAX : sample_rate / 2;
+	mel_step = psy_hz_to_mel((int16_t)nyquist) / (num_mel + 1);
+
+	for (i = 0; i < num_mel; i++) {
+		f_hz = psy_mel_to_hz((int16_t)((i + 1) * mel_step));
+		/* Find the table interval containing f_hz and interpolate */
+		if (f_hz <= a_weight_hz[0]) {
+			w = a_weight_lin[0];
+		} else if (f_hz >= a_weight_hz[A_WEIGHT_TABLE_SIZE - 1]) {
+			w = a_weight_lin[A_WEIGHT_TABLE_SIZE - 1];
+		} else {
+			/* Find j such that a_weight_hz[j] <= f_hz < a_weight_hz[j+1] */
+			for (j = 0; j < A_WEIGHT_TABLE_SIZE - 2; j++) {
+				if (f_hz < a_weight_hz[j + 1])
+					break;
+			}
+
+			/* Linear interpolation: w = w0 + (w1 - w0) * (f - f0) / (f1 - f0) */
+			f0 = a_weight_hz[j];
+			f1 = a_weight_hz[j + 1];
+			w0 = a_weight_lin[j];
+			w1 = a_weight_lin[j + 1];
+			num = (int32_t)(w1 - w0) * (f_hz - f0);
+			w = w0 + (int16_t)(num / (f1 - f0));
+		}
+
+		weights[i] = w;
+		sum += w;
+	}
+
+	/* Normalize weights to sum to 1.0, saturate since 1.0 is above Q1.15 max */
+	for (i = 0; i < num_mel; i++) {
+		scaled = ((int32_t)weights[i] << 16) / sum; /* Q1.16 */
+		weights[i] = sat_int16(Q_SHIFT_RND(scaled, 16, 15)); /* Round to Q1.15 */
+	}
+}
+
+/* Set VAD defaults and allocate per-bin state, returns 0, -EINVAL or -ENOMEM */
+int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int sample_rate,
+		  struct processing_module *mod)
+{
+	if (!vad || num_mel_bins <= 0)
+		return -EINVAL;
+
+	/* A noise floor allocation failure must not leave a stale weights pointer */
+	vad->weights = NULL;
+	vad->num_mel_bins = num_mel_bins;
+	vad->energy_threshold = MFCC_VAD_ENERGY_THRESHOLD;
+	vad->noise_rise_alpha_slow = MFCC_VAD_NOISE_RISE_ALPHA;
+	vad->noise_rise_alpha_fast = MFCC_VAD_NOISE_RISE_ALPHA_FAST;
+	vad->hangover_max = MFCC_VAD_HANGOVER_FRAMES;
+	vad->hangover_counter = 0;
+	vad->init_frames = MFCC_VAD_NOISE_INIT_FRAMES;
+	vad->frame_count = 0;
+	vad->is_speech = false;
+	vad->initialized = false;
+
+	/* Allocate per-bin noise floor */
+	vad->noise_floor = mod_zalloc(mod, num_mel_bins * sizeof(int32_t));
+	if (!vad->noise_floor)
+		return -ENOMEM;
+
+	/* Allocate and compute per-bin weights */
+	vad->weights = mod_zalloc(mod, num_mel_bins * sizeof(int16_t));
+	if (!vad->weights) {
+		mod_free(mod, vad->noise_floor);
+		vad->noise_floor = NULL;
+		return -ENOMEM;
+	}
+
+	mfcc_vad_build_weights(vad->weights, num_mel_bins, sample_rate);
+	return 0;
+}
+
+/* Process one Mel log frame: track noise floor and return 1 for speech, else 0 */
+int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log)
+{
+	int64_t energy_delta = 0;
+	int32_t delta, p;
+	int16_t alpha;
+	int i;
+
+	if (!vad || !mel_log)
+		return 0;
+
+	/* Stop counting after the fast phase, a wrapped counter would re-enter it */
+	if (vad->frame_count <= vad->init_frames)
+		vad->frame_count++;
+
+	/* Initialize noise floor to first frame */
+	if (!vad->initialized) {
+		for (i = 0; i < vad->num_mel_bins; i++)
+			vad->noise_floor[i] = mel_log[i];
+
+		vad->initialized = true;
+	}
+
+	/* Select rise alpha based on convergence phase */
+	if (vad->frame_count <= vad->init_frames)
+		alpha = vad->noise_rise_alpha_fast;
+	else
+		alpha = vad->noise_rise_alpha_slow;
+
+	/* Update noise floor: follow down instantly, rise slowly */
+	for (i = 0; i < vad->num_mel_bins; i++) {
+		if (mel_log[i] < vad->noise_floor[i]) {
+			/* Instant follow-down */
+			vad->noise_floor[i] = mel_log[i];
+		} else {
+			/* Slow rise: floor += alpha * (mel - floor), with
+			 * alpha in Q1.15 and delta in Q9.23 for a Q9.23 result
+			 */
+			delta = mel_log[i] - vad->noise_floor[i];
+			p = (int32_t)Q_MULTSR_32X32((int64_t)alpha, delta, 15, 23, 23);
+			vad->noise_floor[i] += p;
+		}
+	}
+
+	/* Compute weighted energy delta above noise floor.
+	 * energy_delta = sum(weights[i] * (mel[i] - noise_floor[i]))
+	 * weights are Q1.15, mel delta is Q9.23
+	 * Product is Q10.38, accumulate in int64_t then shift to Q9.23
+	 */
+	for (i = 0; i < vad->num_mel_bins; i++) {
+		delta = mel_log[i] - vad->noise_floor[i];
+		if (delta > 0)
+			energy_delta += (int64_t)vad->weights[i] * delta;
+	}
+
+	/* Round accumulated energy from Q10.38 to Q9.23, saturate to int32 */
+	energy_delta = sat_int32(Q_SHIFT_RND(energy_delta, 38, 23));
+
+	/* Energy above threshold re-arms the hangover that holds the flag in gaps */
+	if (energy_delta > vad->energy_threshold) {
+		vad->hangover_counter = vad->hangover_max;
+		vad->is_speech = true;
+	} else if (vad->hangover_counter > 0) {
+		vad->hangover_counter--;
+		vad->is_speech = true;
+	} else {
+		vad->is_speech = false;
+	}
+
+	return vad->is_speech ? 1 : 0;
+}
+
+void mfcc_vad_reset(struct mfcc_vad_state *vad)
+{
+	int i;
+
+	if (!vad)
+		return;
+
+	vad->frame_count = 0;
+	vad->hangover_counter = 0;
+	vad->is_speech = false;
+	vad->initialized = false;
+	/* The noise floor is NULL if its allocation in mfcc_vad_init() failed */
+	for (i = 0; vad->noise_floor && i < vad->num_mel_bins; i++)
+		vad->noise_floor[i] = 0;
+}
+
+#endif /* CONFIG_COMP_MFCC_VAD */
diff --git a/src/audio/mfcc/tune/README.md b/src/audio/mfcc/tune/README.md
new file mode 100644
index 000000000000..5fef841efff1
--- /dev/null
+++ b/src/audio/mfcc/tune/README.md
@@ -0,0 +1,98 @@
+# SOF MFCC Tuning Tools
+
+This directory contains a tool to create a configuration blob for the
+SOF MFCC component. It is run in Matlab or Octave with the command
+`setup_mfcc`. The MFCC configuration parameters can be edited in the
+script.
+
+## Testbench
+
+The configuration can be test run with testbench. First the test topologies
+need to be created with `scripts/build-tools.sh -t`. Next the testbench
+is built with `scripts/rebuild-testbench.sh`.
+
+Once the previous steps are done, a sample wav file can be processed
+with script `run_mfcc.sh`. The script converts the input to raw 16 kHz
+stereo format and runs the testbench for S16, S24, and S32 bit depths,
+producing both cepstral coefficient (MFCC) and Mel spectrogram outputs.
+
+```
+./run_mfcc.sh /usr/share/sounds/alsa/Front_Center.wav
+```
+
+Output files from host testbench:
+
+| File | Content |
+|------|---------|
+| `mfcc_s16.raw`, `mfcc_s24.raw`, `mfcc_s32.raw` | Cepstral coefficients |
+| `mel_s16.raw`, `mel_s24.raw`, `mel_s32.raw` | Mel spectrogram |
+
+If the `XTENSA_PATH` environment variable is set, the script also runs
+the Xtensa build of the testbench (via `xt-run`) and produces additional
+output files prefixed with `xt_`:
+
+| File | Content |
+|------|---------|
+| `xt_mfcc_s16.raw`, `xt_mfcc_s24.raw`, `xt_mfcc_s32.raw` | Cepstral coefficients |
+| `xt_mel_s16.raw`, `xt_mel_s24.raw`, `xt_mel_s32.raw` | Mel spectrogram |
+
+## Decoding and Plotting
+
+All output files can be decoded and plotted at once in Matlab or Octave
+with the `decode_all.m` script:
+
+```matlab
+decode_all
+```
+
+This calls `decode_ceps` for each MFCC file (13 cepstral coefficients) and
+`decode_mel` for each Mel file (80 Mel bins), plotting spectrograms for all
+files that exist including the Xtensa variants.
+
+Individual files can also be decoded manually:
+
+```matlab
+[ceps, t, n] = decode_ceps('mfcc_s16.raw', 13);
+```
+
+In the above it's known from configuration script that MFCC was set up to
+output 13 cepstral coefficients from each FFT → Mel → DCT → Cepstral
+coefficients computation run.
+
+The 80 bands Mel output can be visualized with command:
+
+```matlab
+[mel, t, n] = decode_mel('mel_s16.raw', 80);
+```
+
+## Live Whisper Transcription with DSP VAD
+
+The directory contains a Python script `sof_mel_to_text_live_dsp_vad.py`.
+It can be used with development topologies
+`sof-arl-cs42l43-l0-cs35l56-l23-mfcc.tplg` and
+`sof-mtl-rt713-l0-rt1316-l12-mfcc.tplg`. It captures Mel audio features
+and VAD flags from the default audio device `hw:0,47` (headset microphone).
+The captured frames with detected speech are sent to the Whisper speech
+recognizer model for conversion to text.
+
+### Prerequisites
+
+The script needs OpenVINO. Please follow the install procedure from
+<https://docs.openvino.ai/2025/get-started/install-openvino.html>.
+
+The following Python pip installs are needed into the same OpenVINO venv:
+
+```bash
+pip install openvino openvino-tokenizers openvino-genai
+pip install optimum[intel]
+pip install transformers
+pip install huggingface_hub
+```
+
+### NPU / GPU Support
+
+The script by default runs the Whisper encoder model in the NPU. To
+use the NPU, install the driver from
+<https://github.com/intel/linux-npu-driver/releases>. If the NPU is not
+available, change the encoder to CPU with run option `--encoder-device CPU`.
+With a GPU both `--encoder-device GPU` and `--decoder-device GPU` can be set.
diff --git a/src/audio/mfcc/tune/README.txt b/src/audio/mfcc/tune/README.txt
deleted file mode 100644
index a0c3189e81a3..000000000000
--- a/src/audio/mfcc/tune/README.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-This directory contains a tool to create configuration blob for SOF
-MFCC component. It's simply run in Matlab or Octave with command
-"setup_mfcc". The MFCC configuration parameters can be edited from the
-script.
-
-The configuration can be test run with testbench. First the test topologies
-need to be created with "scripts/build-tools.sh -t". Next the testbench
-is build with "scripts/rebuild-testbench.sh".
-
-Once the previous steps are done, a sample wav file can be processed
-with script run_mfcc.sh. The script converts the input to raw 16 kHz
-stereo format and runs the testbench for S16, S24, and S32 bit depths,
-producing both cepstral coefficient (MFCC) and Mel spectrogram outputs.
-
-./run_mfcc.sh /usr/share/sounds/alsa/Front_Center.wav
-
-Output files from host testbench:
-  mfcc_s16.raw, mfcc_s24.raw, mfcc_s32.raw   - cepstral coefficients
-  mel_s16.raw, mel_s24.raw, mel_s32.raw       - Mel spectrogram
-
-If the XTENSA_PATH environment variable is set, the script also runs
-the Xtensa build of the testbench (via xt-run) and produces additional
-output files prefixed with "xt_":
-  xt_mfcc_s16.raw, xt_mfcc_s24.raw, xt_mfcc_s32.raw
-  xt_mel_s16.raw, xt_mel_s24.raw, xt_mel_s32.raw
-
-All output files can be decoded and plotted at once in Matlab or Octave
-with the decode_all.m script:
-
-decode_all
-
-This calls decode_ceps for each MFCC file (13 cepstral coefficients) and
-decode_mel for each Mel file (80 Mel bins), plotting spectrograms for all
-files that exist including the Xtensa variants.
-
-Individual files can also be decoded manually:
-
-[ceps, t, n] = decode_ceps('mfcc_s16.raw', 13);
-
-In the above it's known from configuration script that MFCC was set up to
-output 13 cepstral coefficients from each FFT -> Mel -> DCT -> Cepstral
-coefficients computation run.
-
-The 80 bands Mel output can be visualized with command:
-
-[mel, t, n] = decode_mel('mel_s16.raw', 80);
-
-Other kind of signals have quite big visual difference in audio features. Try
-e.g. other sound files found in computer.
-
-./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/bark.ogg
-./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/sonar.ogg
diff --git a/src/audio/mfcc/tune/decode_mel.m b/src/audio/mfcc/tune/decode_mel.m
index f6a723aa2040..409fbccd9a52 100644
--- a/src/audio/mfcc/tune/decode_mel.m
+++ b/src/audio/mfcc/tune/decode_mel.m
@@ -1,4 +1,4 @@
-% [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels)
+% [mel, t, n, vad] = decode_mel(fn, num_mel, fmt, num_channels)
 %
 % Input
 %   fn - File with Mel data in .raw or .wav format
@@ -10,17 +10,18 @@
 %   mel - Mel coefficients
 %   t - time vector for plotting
 %   n - mel 1..num_mel vector for plotting
+%   vad - VAD flag per frame from DSP
 
 % SPDX-License-Identifier: BSD-3-Clause
 % Copyright(c) 2026 Intel Corporation.
 
-function [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels)
+function [mel, t, n, vad] = decode_mel(fn, num_mel, fmt, num_channels)
 
 if nargin < 3
 	fmt = 's16';
 end
 if nargin < 4
-	num_channels = 1;
+	num_channels = 2;
 end
 
 % MFCC stream
@@ -74,27 +75,43 @@
     num_frames = num_frames - 1;
 end
 
-t_mel = period_mel / num_channels / fs;
-t = (0:num_frames -1) * t_mel;
-n = 1:num_mel;
+% VAD flag is first int32 after magic, followed by num_mel coefficients
+payload_len = 1 + num_mel;
 
-mel = zeros(num_mel, num_frames);
+payload = zeros(payload_len, num_frames);
 for i = 1:num_frames
 	i1 = idx(i) + num_magic;
-	i2 = i1 + num_mel - 1;
-	mel(:,i) = double(data(i1:i2)) / 2^qformat;
+	i2 = i1 + payload_len - 1;
+	payload(:,i) = double(data(i1:i2));
 end
 
-figure;
+vad = payload(1, :);
+mel = payload(2:payload_len, :) / 2^qformat;
+
+t_mel = period_mel / num_channels / fs;
+t = (0:num_frames -1) * t_mel;
+n = 1:num_mel;
+
+%figure(1);
+figure
 imagesc(t, n, mel);
 axis xy;
 colormap(jet);
 colorbar;
 tstr = sprintf('SOF MFCC Mel coefficients (%s)', fn);
 title(tstr, 'Interpreter', 'None');
-xlabel('Time (s)');
 ylabel('Mel coef #');
 
+figure
+level = sum(mel(:,:));
+plot(t, vad)
+ax = axis();
+axis([ax(1:2) -0.1 1.1]);
+grid on;
+title(tstr, 'Interpreter', 'None');
+xlabel('Time (s)');
+ylabel('VAD flag');
+
 end
 
 function [data, num_channels] = get_file(fn, num_channels, fmt)
diff --git a/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py b/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py
new file mode 100644
index 000000000000..eeafb28f0b75
--- /dev/null
+++ b/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py
@@ -0,0 +1,454 @@
+"""Live SOF mel capture with DSP VAD-triggered Whisper transcription.
+
+Captures mel frames from ALSA with embedded VAD flag from the DSP.
+Frame format: [magic(int32), vad_flag(int32), mel[0..79](int32)]
+When silence of 100ms is detected after speech, sends the buffered mel
+features to Whisper (OpenVINO encoder+decoder) for transcription.
+Capture continues running during Whisper inference.
+
+Usage:
+    python sof_mel_to_text_live_dsp_vad.py [--device hw:0,47] [--model whisper-medium-int4-ov]
+    python sof_mel_to_text_live_dsp_vad.py --plot  # with live spectrogram
+"""
+
+import argparse
+import os
+import struct
+import subprocess
+import threading
+import time
+import numpy as np
+import openvino as ov
+import huggingface_hub as hf_hub
+from pathlib import Path
+
+# Graphics imports deferred until --plot is used
+matplotlib = None
+plt = None
+
+# SOF mel_s32.raw format constants (with DSP VAD flag)
+SOF_MAGIC_S32 = np.int32(0x6D666363)  # ASCII 'mfcc' as int32
+SOF_MAGIC_BYTES = struct.pack('<i', 0x6D666363)
+SOF_NUM_MAGIC = 1
+SOF_NUM_VAD = 1              # VAD flag from DSP (1 = speech, 0 = silence)
+SOF_Q_FORMAT = 23            # Q9.23 fixed-point
+SOF_NUM_MEL = 80
+SOF_FRAME_INTS = SOF_NUM_MAGIC + SOF_NUM_VAD + SOF_NUM_MEL  # 82 int32 per frame
+SOF_FRAME_BYTES = SOF_FRAME_INTS * 4  # 328 bytes per frame
+
+# Speech buffering
+SILENCE_TRIGGER_MS = 100     # ms of silence after speech to trigger transcription
+SILENCE_TRIGGER_FRAMES = SILENCE_TRIGGER_MS // 10  # 10 frames at 10ms/frame
+MIN_SPEECH_MS = 500          # minimum speech duration to send to Whisper
+MIN_SPEECH_FRAMES = MIN_SPEECH_MS // 10  # 50 frames at 10ms/frame
+
+# Whisper model constants
+WHISPER_FEATURE_SIZE = 80
+WHISPER_NB_MAX_FRAMES = 3000  # 30 seconds at 10ms per frame
+
+
+def decode_mel_frame(raw_ints):
+    """Convert raw int32 Q9.23 mel values to float64 mel coefficients."""
+    return raw_ints.astype(np.float64) / (2 ** SOF_Q_FORMAT)
+
+
+# ---------- Optional scrolling plot ----------
+
+SPECTROGRAM_WIDTH = 100
+
+
+class MelPlotter:
+    """Real-time scrolling mel spectrogram with a VAD strip plotted below it."""
+
+    def __init__(self, num_mel=SOF_NUM_MEL, width=SPECTROGRAM_WIDTH):
+        global matplotlib, plt  # fill the module-level placeholders on first use
+        import matplotlib as _mpl
+        _mpl.use('TkAgg')  # backend is selected before pyplot is imported
+        import matplotlib.pyplot as _plt
+        matplotlib = _mpl
+        plt = _plt
+
+        self.num_mel = num_mel
+        self.width = width  # number of frames (columns) kept on screen
+
+        self.mel_buf = np.zeros((num_mel, width), dtype=np.float64)  # one column per frame
+        self.vad_buf = np.zeros(width, dtype=np.float64)  # 1.0 = speech, 0.0 = silent
+        self.x = np.arange(width)
+
+        self.fig, (self.ax_mel, self.ax_vad) = plt.subplots(
+            2, 1, figsize=(10, 5),
+            gridspec_kw={'height_ratios': [5, 1]},
+            sharex=True
+        )
+        self.fig.tight_layout(pad=2.0)
+
+        self.im_mel = self.ax_mel.imshow(
+            self.mel_buf, aspect='auto', origin='lower',
+            interpolation='nearest', cmap='turbo',
+            vmin=-2.0, vmax=2.0  # fixed color scale, values outside are clipped
+        )
+        self.ax_mel.set_ylabel('Mel bin')
+        self.ax_mel.set_title('Mel Spectrogram (scrolling) — DSP VAD')
+
+        self.line_vad, = self.ax_vad.plot(
+            self.x, self.vad_buf, color='green', linewidth=1.5,
+            drawstyle='steps-post')
+        self.ax_vad.set_ylabel('VAD')
+        self.ax_vad.set_xlabel('Frame')
+        self.ax_vad.set_ylim(-0.1, 1.1)
+        self.ax_vad.set_yticks([0, 1])
+        self.ax_vad.set_yticklabels(['Silent', 'Speech'])
+
+        plt.ion()  # interactive mode
+        plt.show(block=False)  # do not block the caller
+        self.fig.canvas.draw()
+        self.fig.canvas.flush_events()
+
+    def update(self, mel_frame, is_speech):  # append one frame and redraw
+        self.mel_buf[:, :-1] = self.mel_buf[:, 1:]  # scroll left by one frame
+        self.mel_buf[:, -1] = mel_frame  # newest frame goes to the last column
+        self.vad_buf[:-1] = self.vad_buf[1:]
+        self.vad_buf[-1] = 1.0 if is_speech else 0.0
+
+        self.im_mel.set_data(self.mel_buf)
+        self.line_vad.set_ydata(self.vad_buf)
+
+        self.fig.canvas.draw_idle()
+        self.fig.canvas.flush_events()
+
+
+# ---------- Whisper inference ----------
+
+class WhisperTranscriber:
+    """Whisper encoder+decoder using OpenVINO, runs in a background thread."""
+
+    def __init__(self, model_path, encoder_device="NPU", decoder_device="CPU"):
+        self.model_path = model_path
+        core = ov.Core()
+        encoder_xml = str(Path(model_path) / "openvino_encoder_model.xml")
+        decoder_xml = str(Path(model_path) / "openvino_decoder_model.xml")
+        # NPU requires static shapes — fix [?,?,3000] to [1,80,3000]
+        encoder_model = core.read_model(encoder_xml)
+        encoder_model.reshape({0: [1, WHISPER_FEATURE_SIZE, WHISPER_NB_MAX_FRAMES]})
+        self.encoder = core.compile_model(encoder_model, encoder_device)
+        self.decoder = core.compile_model(decoder_xml, decoder_device)
+        self._load_tokenizer()
+        self._busy = False
+        self._lock = threading.Lock()
+
+    def _load_tokenizer(self):
+        """Load Whisper tokenizer."""
+        try:
+            from transformers import WhisperTokenizer
+            self.tokenizer = WhisperTokenizer.from_pretrained(self.model_path)
+            self._tokenizer_type = "hf"
+        except ImportError:
+            import openvino_genai as ov_genai
+            self.tokenizer = ov_genai.Tokenizer(self.model_path)
+            self._tokenizer_type = "ov"
+
+    def is_busy(self):
+        with self._lock:
+            return self._busy
+
+    def transcribe_async(self, mel_frames, callback):
+        """Run transcription in a background thread.
+
+        Args:
+            mel_frames: list of np.ndarray [80] mel frames
+            callback: function(text) called with result
+        """
+        with self._lock:
+            if self._busy:
+                return False
+            self._busy = True
+
+        t = threading.Thread(target=self._run, args=(mel_frames, callback),
+                             daemon=True)
+        t.start()
+        return True
+
+    def _run(self, mel_frames, callback):
+        try:
+            text = self._transcribe(mel_frames)
+            callback(text)
+        except Exception as e:
+            print(f"  [Whisper ERROR] {e}", flush=True)
+        finally:
+            with self._lock:
+                self._busy = False
+
+    def _transcribe(self, mel_frames):
+        """Encode mel frames and decode to text."""
+        n_frames = len(mel_frames)
+        if n_frames == 0:
+            return ""
+
+        # Stack frames into [80, n_frames]
+        features = np.column_stack(mel_frames).astype(np.float32)
+
+        # Pad to 3000 frames
+        silence_val = features.min()
+        padded = np.full((WHISPER_FEATURE_SIZE, WHISPER_NB_MAX_FRAMES),
+                         silence_val, dtype=np.float32)
+        n = min(n_frames, WHISPER_NB_MAX_FRAMES)
+        padded[:, :n] = features[:, :n]
+
+        # Encoder
+        t0 = time.time()
+        encoder_input = padded.reshape(1, WHISPER_FEATURE_SIZE, WHISPER_NB_MAX_FRAMES)
+        encoder_req = self.encoder.create_infer_request()
+        encoder_req.set_tensor("input_features", ov.Tensor(encoder_input))
+        encoder_req.infer()
+        hidden_state = encoder_req.get_tensor("last_hidden_state").data.copy()
+        t1 = time.time()
+        print(f"  [Whisper] encoder: {t1-t0:.2f}s", flush=True)
+
+        # Decoder: greedy decode
+        token_ids = self._greedy_decode(hidden_state)
+        t2 = time.time()
+        print(f"  [Whisper] decoder: {t2-t1:.2f}s ({len(token_ids)} tokens)",
+              flush=True)
+
+        # Convert to text
+        text_tokens = [t for t in token_ids if t < 50257]
+        if self._tokenizer_type == "hf":
+            text = self.tokenizer.decode(text_tokens)
+        else:
+            text = self.tokenizer.decode(text_tokens)
+
+        return text.strip()
+
+    def _greedy_decode(self, hidden_state, max_tokens=448):
+        """Greedy decoding loop."""
+        sot_tokens = [50258, 50259, 50359, 50363]
+        eos_token = 50257
+
+        decoder_req = self.decoder.create_infer_request()
+        input_names = [inp.get_any_name() for inp in self.decoder.inputs]
+        has_cache_position = "cache_position" in input_names
+
+        decoder_req.set_tensor("encoder_hidden_states", ov.Tensor(hidden_state))
+
+        # Prefill with SOT tokens
+        input_ids = np.array([sot_tokens], dtype=np.int64)
+        beam_idx = np.array([0], dtype=np.int32)
+
+        decoder_req.set_tensor("input_ids", ov.Tensor(input_ids))
+        if "beam_idx" in input_names:
+            decoder_req.set_tensor("beam_idx", ov.Tensor(beam_idx))
+        if has_cache_position:
+            cache_pos = np.arange(len(sot_tokens), dtype=np.int64).reshape(1, -1)
+            decoder_req.set_tensor("cache_position", ov.Tensor(cache_pos))
+
+        decoder_req.infer()
+        logits = decoder_req.get_tensor("logits").data
+        next_token = int(np.argmax(logits[0, -1, :]))
+
+        generated = [next_token]
+        position = len(sot_tokens)
+
+        for _ in range(max_tokens - 1):
+            if next_token == eos_token:
+                break
+
+            decoder_req.set_tensor("input_ids",
+                                   ov.Tensor(np.array([[next_token]], dtype=np.int64)))
+            if "beam_idx" in input_names:
+                decoder_req.set_tensor("beam_idx", ov.Tensor(beam_idx))
+            if has_cache_position:
+                decoder_req.set_tensor("cache_position",
+                                       ov.Tensor(np.array([[position]], dtype=np.int64)))
+
+            decoder_req.infer()
+            logits = decoder_req.get_tensor("logits").data
+            next_token = int(np.argmax(logits[0, -1, :]))
+            generated.append(next_token)
+            position += 1
+
+        return generated
+
+
+# ---------- Frame parser ----------
+
+def find_frame_in_buffer(buf):
+    """Find the first complete mel frame with DSP VAD flag in a byte buffer.
+
+    Frame layout: [magic(4B), vad_flag(4B), mel[0..79](320B)] = 328 bytes
+    Returns: (vad_flag, mel_ints, remaining_buf) or (None, None, buf)
+    """
+    while True:
+        idx = buf.find(SOF_MAGIC_BYTES)
+        if idx < 0:
+            if len(buf) > 3:
+                buf = buf[-3:]
+            return None, None, buf
+        end = idx + SOF_FRAME_BYTES
+        if end > len(buf):
+            buf = buf[idx:]
+            return None, None, buf
+        # Parse VAD flag (first int32 after magic)
+        vad_flag = struct.unpack_from('<i', buf, idx + 4)[0]
+        # Parse 80 mel coefficients (after magic + VAD)
+        mel_bytes = buf[idx + 8 : end]
+        mel_ints = np.frombuffer(mel_bytes, dtype=np.int32)
+        buf = buf[end:]
+        return vad_flag, mel_ints, buf
+
+
+# ---------- Main capture + transcription loop ----------
+
+def run_capture(device, rate, model_path, encoder_device, decoder_device,
+                enable_plot=False):
+    """Main capture loop: ALSA → DSP VAD → buffer speech → Whisper."""
+
+    plotter = MelPlotter() if enable_plot else None
+    transcriber = WhisperTranscriber(model_path, encoder_device=encoder_device,
+                                     decoder_device=decoder_device)
+
+    cmd = [
+        'arecord', '-D', device, '-f', 'S32_LE', '-c', '2',
+        '-r', str(rate), '-t', 'raw', '--buffer-size', '8192',
+    ]
+
+    print(f"Starting capture: {' '.join(cmd)}")
+    print(f"VAD source: DSP (embedded in stream)")
+    print(f"Silence trigger: {SILENCE_TRIGGER_MS}ms ({SILENCE_TRIGGER_FRAMES} frames)")
+    print(f"Whisper model: {model_path} (encoder: {encoder_device}, decoder: {decoder_device})")
+    print()
+
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    buf = b''
+    read_chunk = SOF_FRAME_BYTES * 4
+    frame_num = 0
+    prev_speech = None
+
+    # Speech buffering state
+    speech_buffer = []         # list of mel frames during speech
+    silence_counter = 0        # consecutive silence frames after speech
+    was_speaking = False       # True if we have buffered speech frames
+
+    def on_transcription(text):
+        if text:
+            print(f"\n  >> \"{text}\"\n", flush=True)
+        else:
+            print("  [Whisper] empty result", flush=True)
+
+    try:
+        while True:
+            data = proc.stdout.read(read_chunk)
+            if not data:
+                rc = proc.poll()
+                if rc is not None:
+                    stderr_out = proc.stderr.read().decode(errors='replace')
+                    print(f"\narecord exited with code {rc}")
+                    if stderr_out:
+                        print(f"stderr: {stderr_out}")
+                    break
+                continue
+
+            buf += data
+
+            while True:
+                vad_flag, frame_ints, buf = find_frame_in_buffer(buf)
+                if frame_ints is None:
+                    break
+
+                frame_num += 1
+                mel = decode_mel_frame(frame_ints)
+                speech = vad_flag != 0
+
+                # Print VAD transitions when not plotting
+                if plotter is None and speech != prev_speech:
+                    t = frame_num * 0.01
+                    tag = "SPEECH" if speech else "SILENCE"
+                    print(f"  [{t:7.2f}s] {tag}", flush=True)
+                prev_speech = speech
+
+                # Update plot
+                if plotter is not None:
+                    plotter.update(mel, speech)
+
+                # --- Speech buffering logic ---
+                if speech:
+                    speech_buffer.append(mel.copy())
+                    silence_counter = 0
+                    was_speaking = True
+                else:
+                    if was_speaking:
+                        silence_counter += 1
+                        if silence_counter >= SILENCE_TRIGGER_FRAMES:
+                            n = len(speech_buffer)
+                            duration = n * 0.01
+                            t = frame_num * 0.01
+
+                            if n < MIN_SPEECH_FRAMES:
+                                # Too short — discard
+                                speech_buffer.clear()
+                                silence_counter = 0
+                                was_speaking = False
+                                continue
+
+                            # Silence threshold reached — send to Whisper
+                            print(f"  [{t:7.2f}s] Transcribing {n} frames "
+                                  f"({duration:.1f}s)...", flush=True)
+
+                            if not transcriber.is_busy():
+                                frames_copy = list(speech_buffer)
+                                transcriber.transcribe_async(
+                                    frames_copy, on_transcription)
+                            else:
+                                print(f"  [{t:7.2f}s] (Whisper busy, "
+                                      f"dropping {n} frames)", flush=True)
+
+                            speech_buffer.clear()
+                            silence_counter = 0
+                            was_speaking = False
+
+    except (KeyboardInterrupt, BrokenPipeError):
+        pass
+    finally:
+        if proc.poll() is None:
+            proc.terminate()
+            try:
+                proc.wait(timeout=3)
+            except subprocess.TimeoutExpired:
+                proc.kill()
+                proc.wait()
+        if plotter is not None:
+            try:
+                plt.close(plotter.fig)
+            except Exception:
+                pass
+        print("\n\nCapture stopped.")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Live SOF mel capture with DSP VAD-triggered Whisper transcription")
+    parser.add_argument('--device', '-D', default='hw:0,47',
+                        help='ALSA capture device (default: hw:0,47)')
+    parser.add_argument('--rate', '-r', type=int, default=16000,
+                        help='Sample rate for arecord (default: 16000)')
+    parser.add_argument('--model', '-m', default='whisper-medium-int4-ov',
+                        help='Path to Whisper OpenVINO model directory')
+    parser.add_argument('--encoder-device', default='NPU',
+                        help='OpenVINO device for encoder (default: NPU)')
+    parser.add_argument('--decoder-device', default='CPU',
+                        help='OpenVINO device for decoder (default: CPU)')
+    parser.add_argument('--plot', action='store_true',
+                        help='Show live scrolling mel spectrogram and VAD plot')
+    args = parser.parse_args()
+    model_id = "OpenVINO/" + os.path.basename(args.model)
+    if not os.path.isdir(args.model):
+        print(f"Downloading model {model_id} ...")
+        hf_hub.snapshot_download(model_id, local_dir=args.model)
+
+    print("=== Live SOF Mel → Whisper Transcription (DSP VAD) ===\n")
+    run_capture(args.device, args.rate, args.model, args.encoder_device,
+                args.decoder_device, enable_plot=args.plot)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h
index 025eef116752..e0617e0f026f 100644
--- a/src/include/sof/audio/mfcc/mfcc_comp.h
+++ b/src/include/sof/audio/mfcc/mfcc_comp.h
@@ -12,6 +12,7 @@
 #include <sof/math/auditory.h>
 #include <sof/math/dct.h>
 #include <sof/math/fft.h>
+#include <sof/audio/mfcc/mfcc_vad.h>
 #include <stddef.h>
 #include <stdint.h>
 
@@ -106,6 +107,10 @@ struct mfcc_state {
 	bool waiting_fill; /**< booleans */
 	bool prev_samples_valid;
 	bool magic_pending; /**< True when magic word not yet written for current output */
+#ifdef CONFIG_COMP_MFCC_VAD
+	bool vad_pending; /**< True when VAD flag not yet written for current output */
+	int32_t vad_flag; /**< Current VAD result: 1 = speech, 0 = silence */
+#endif
 	size_t sample_buffers_size; /**< bytes */
 	int16_t *out_data_ptr; /**< Read pointer into scratch data for multi-period output */
 	int32_t *out_data_ptr_32; /**< Read pointer for 32-bit mel-only output */
@@ -115,6 +120,9 @@ struct mfcc_state {
 /* MFCC component private data */
 struct mfcc_comp_data {
 	struct mfcc_state state;
+#ifdef CONFIG_COMP_MFCC_VAD
+	struct mfcc_vad_state vad;
+#endif
 	struct comp_data_blob_handler *model_handler;
 	struct sof_mfcc_config *config;
 	int max_frames;
diff --git a/src/include/sof/audio/mfcc/mfcc_vad.h b/src/include/sof/audio/mfcc/mfcc_vad.h
new file mode 100644
index 000000000000..6eac1ae08a15
--- /dev/null
+++ b/src/include/sof/audio/mfcc/mfcc_vad.h
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright(c) 2026 Intel Corporation.
+ *
+ * Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
+ */
+
+/**
+ * \file mfcc_vad.h
+ * \brief Voice Activity Detection based on Mel spectrum energy.
+ *
+ * This VAD operates on the Q9.23 Mel log spectrum values produced by
+ * the MFCC component. It tracks a per-bin noise floor that follows
+ * the signal downward instantly and rises slowly, then computes a
+ * speech-weighted energy delta above the floor.
+ */
+
+#ifndef __SOF_AUDIO_MFCC_MFCC_VAD_H__
+#define __SOF_AUDIO_MFCC_MFCC_VAD_H__
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef CONFIG_COMP_MFCC_VAD
+
+struct processing_module;
+
+/**
+ * \brief Number of frames for fast noise floor convergence at startup (~1 s at 10 ms/frame).
+ */
+#define MFCC_VAD_NOISE_INIT_FRAMES	100
+
+/**
+ * \brief Slow noise floor rise coefficient in Q1.15 (0.0010 * 32768 = 32.8, rounded to 33).
+ */
+#define MFCC_VAD_NOISE_RISE_ALPHA	33
+
+/**
+ * \brief Fast noise floor rise coefficient in Q1.15 (0.05 * 32768 = 1638).
+ */
+#define MFCC_VAD_NOISE_RISE_ALPHA_FAST	1638
+
+/**
+ * \brief Energy threshold for speech detection in Q9.23 (0.35 * 2^23 = 2936013).
+ */
+#define MFCC_VAD_ENERGY_THRESHOLD	2936013
+
+/**
+ * \brief Hangover frame count to keep VAD active after last speech detection.
+ */
+#define MFCC_VAD_HANGOVER_FRAMES	20
+
+/**
+ * \brief VAD state structure.
+ */
+struct mfcc_vad_state {
+	int32_t *noise_floor; /**< Per-bin noise floor in Q9.23 */
+	int16_t *weights; /**< Speech-frequency emphasis weights Q1.15 */
+	int32_t energy_threshold; /**< Energy threshold Q9.23 */
+	int16_t noise_rise_alpha_slow; /**< Slow rise alpha Q1.15 */
+	int16_t noise_rise_alpha_fast; /**< Fast rise alpha Q1.15 */
+	int16_t hangover_max; /**< Maximum hangover frames */
+	int16_t hangover_counter; /**< Current hangover counter */
+	int16_t num_mel_bins; /**< Number of Mel bins in use */
+	int16_t init_frames; /**< Number of initial frames for fast convergence */
+	int32_t frame_count; /**< Total frames processed */
+	bool is_speech; /**< Current VAD decision */
+	bool initialized; /**< True after first frame processed */
+};
+
+/**
+ * \brief Initialize VAD state.
+ *
+ * \param[out] vad Pointer to VAD state to initialize.
+ * \param[in] num_mel_bins Number of Mel bins.
+ * \param[in] sample_rate Audio sample rate in Hz.
+ * \param[in] mod Processing module for memory allocation.
+ * \return 0 on success, negative error code on failure.
+ */
+int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int sample_rate,
+		  struct processing_module *mod);
+
+/**
+ * \brief Process one Mel spectrum frame and update VAD decision.
+ *
+ * \param[in,out] vad Pointer to VAD state.
+ * \param[in] mel_log Mel log spectrum in Q9.23, array of num_mel_bins values.
+ * \return 1 if speech detected, 0 if silence.
+ */
+int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log);
+
+/**
+ * \brief Reset VAD state without changing configuration.
+ *
+ * \param[in,out] vad Pointer to VAD state.
+ */
+void mfcc_vad_reset(struct mfcc_vad_state *vad);
+
+#endif /* CONFIG_COMP_MFCC_VAD */
+
+#endif /* __SOF_AUDIO_MFCC_MFCC_VAD_H__ */