Skip to content

Commit cdb1315

Browse files
committed
audio: mfcc: update decode tools and add Python compress scripts
Update Octave decode scripts for int32 Q9.23 output and DTX gap filling. Add DTX blob generation to setup_mfcc.m. Add Python compress capture tools: sof_mel_spectrogram_compress.py, sof_ceps_spectrogram_compress.py, sof_mel_to_text_live_compress.py. Refactor sof_mel_to_text_live_dsp_vad.py to use shared compress capture code. Add README with usage examples. Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
1 parent 9d3282c commit cdb1315

14 files changed

Lines changed: 1227 additions & 212 deletions

src/audio/mfcc/tune/README.md

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,3 +150,40 @@ Whisper model: whisper-medium-int4-ov (encoder: CPU, decoder: CPU)
150150

151151
>> "Hello computer"
152152
```
153+
154+
## Live Spectrogram Viewers
155+
156+
### Mel Spectrogram
157+
158+
The `sof_mel_spectrogram_compress.py` script captures Mel spectrogram
159+
frames from a SOF compress PCM device and displays them as a live
160+
scrolling spectrogram with VAD status. This is a lightweight viewer
161+
that does not run Whisper inference.
162+
163+
```bash
164+
python3 sof_mel_spectrogram_compress.py --card 0 --device 48 --width 300
165+
```
166+
167+
### Cepstral Spectrogram
168+
169+
The `sof_ceps_spectrogram_compress.py` script is similar but displays
170+
cepstral coefficients (MFCC) instead of Mel bands.
171+
172+
```bash
173+
python3 sof_ceps_spectrogram_compress.py --card 0 --device 48 --num-ceps 13 --width 300
174+
```
175+
176+
## Live Whisper Transcription with Compress PCM
177+
178+
The `sof_mel_to_text_live_compress.py` script captures Mel spectrogram
179+
frames from a SOF compress PCM device and performs live Whisper
180+
transcription using OpenVINO. Unlike `sof_mel_to_text_live_dsp_vad.py`,
181+
which uses `arecord`, this script reads directly from the compress PCM
182+
device with DTX-aware frame handling.
183+
184+
```bash
185+
python3 sof_mel_to_text_live_compress.py --card 0 --device 48 --model whisper-medium-int4-ov
186+
```
187+
188+
The same OpenVINO prerequisites and pip packages apply as described above
189+
for `sof_mel_to_text_live_dsp_vad.py`.

src/audio/mfcc/tune/decode_all.m

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,34 +6,33 @@
66
num_ceps = 13;
77
num_mel = 80;
88

9-
% MFCC cepstral output files
9+
% MFCC cepstral output files (all int32 output, Q9.23)
1010
ceps_files = {'mfcc_s16.raw', 'mfcc_s24.raw', 'mfcc_s32.raw'};
1111

12-
% Mel output files with corresponding format
12+
% Mel output files (all int32 output, Q9.23)
1313
mel_files = {'mel_s16.raw', 'mel_s24.raw', 'mel_s32.raw'};
14-
mel_fmts = {'s16', 's24', 's32'};
1514

1615
% Xtensa prefixed variants
1716
xt_ceps_files = {'xt_mfcc_s16.raw', 'xt_mfcc_s24.raw', 'xt_mfcc_s32.raw'};
1817
xt_mel_files = {'xt_mel_s16.raw', 'xt_mel_s24.raw', 'xt_mel_s32.raw'};
1918

2019
all_ceps_files = [ceps_files, xt_ceps_files];
2120
all_mel_files = [mel_files, xt_mel_files];
22-
all_mel_fmts = [mel_fmts, mel_fmts];
2321

2422
for i = 1:length(all_ceps_files)
2523
fn = all_ceps_files{i};
2624
if exist(fn, 'file')
2725
fprintf('Decoding MFCC ceps: %s\n', fn);
28-
[ceps, t, n, vad, energy, noise_energy, frame_num] = decode_ceps(fn, num_ceps);
26+
[ceps, t, n, vad, energy, noise_energy, frame_num] = ...
27+
decode_ceps(fn, num_ceps);
2928
end
3029
end
3130

3231
for i = 1:length(all_mel_files)
3332
fn = all_mel_files{i};
34-
fmt = all_mel_fmts{i};
3533
if exist(fn, 'file')
3634
fprintf('Decoding Mel: %s\n', fn);
37-
[mel, t, n, vad, energy, noise_energy, frame_num] = decode_mel(fn, num_mel, fmt);
35+
[mel, t, n, vad, energy, noise_energy, frame_num] = ...
36+
decode_mel(fn, num_mel);
3837
end
3938
end

src/audio/mfcc/tune/decode_ceps.m

Lines changed: 63 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
% [ceps, t, n, vad, energy, noise_energy, frame_number] = decode_ceps(fn, num_ceps, num_channels)
1+
% [ceps, t, n, vad, energy, noise_energy, frame_number] = decode_ceps(fn, num_ceps, hop, num_channels)
22
%
33
% Input
44
% fn - File with MFCC data in .raw or .wav format
55
% num_ceps - number of cepstral coefficients per frame
6-
% num_channels - needed for .raw format, omit for .wav
6+
% hop - STFT hop in seconds, defaults to 10e-3 for 10 ms
7+
% num_channels - needed for .raw format, omit for .wav, default 1
78
%
89
% Outputs
910
% ceps - cepstral coefficients
@@ -18,65 +19,93 @@
1819
% Copyright(c) 2022-2026 Intel Corporation. All rights reserved.
1920

2021
function [ceps, t, n, vad, energy, noise_energy, frame_number] = ...
21-
decode_ceps(fn, num_ceps, num_channels)
22+
decode_ceps(fn, num_ceps, hop, num_channels)
2223

2324
if nargin < 3
25+
hop = 10e-3;
26+
end
27+
if nargin < 4
2428
num_channels = 1;
2529
end
2630

2731
% MFCC stream
28-
fs = 16e3;
29-
qformat = 7;
30-
magic = [25443 28006]; % ASCII 'mfcc' as int16
31-
num_magic = 2; % magic word is 2 x int16
32+
qformat = 23; % Q9.23 in int32
33+
magic = int32(1835426659); % 0x6D666363 as int32
34+
num_magic = 1; % magic word is 1 x int32
3235

33-
% Load output data
36+
% Load output data (always int32)
3437
[data, num_channels] = get_file(fn, num_channels);
3538

36-
idx1 = find(data == magic(1));
37-
idx = [];
38-
for i = 1:length(idx1)
39-
if data(idx1(i) + 1) == magic(2)
40-
idx = [idx idx1(i)];
41-
end
42-
end
39+
idx = find(data == magic);
4340

4441
if isempty(idx)
4542
error('No magic value markers found from stream');
4643
end
4744

48-
period_ceps = idx(2)-idx(1);
4945
num_frames = length(idx);
5046

5147
% Header after magic is [frame_number, reserved, energy, noise_energy, vad_flag]
52-
% as int32 (10 int16 slots), followed by num_ceps coefficients.
53-
payload_len = 10 + num_ceps; % 5 int32 = 10 int16, then ceps data
48+
% as int32, followed by num_ceps coefficients (int32).
49+
payload_len = 5 + num_ceps;
5450

5551
% The last frame can be incomplete when it spans multiple periods
5652
last = idx(end) + num_magic + payload_len - 1;
5753
if (last > length(data))
5854
num_frames = num_frames - 1;
5955
end
6056

61-
t_ceps = period_ceps / num_channels / fs;
62-
t = (0:num_frames -1) * t_ceps;
63-
n = 1:num_ceps;
64-
6557
payload = zeros(payload_len, num_frames);
6658
for i = 1:num_frames
6759
i1 = idx(i) + num_magic;
6860
i2 = i1 + payload_len - 1;
6961
payload(:,i) = double(data(i1:i2));
7062
end
7163

72-
% Reassemble int32 from pairs of int16 (little-endian).
73-
% Low half must be treated as unsigned with mod() to handle negative int16.
74-
frame_number = mod(payload(1,:), 65536) + payload(2,:) * 65536;
75-
% payload(3:4,:) is reserved, skip
76-
energy = (mod(payload(5,:), 65536) + payload(6,:) * 65536) / 2^23;
77-
noise_energy = (mod(payload(7,:), 65536) + payload(8,:) * 65536) / 2^23;
78-
vad = mod(payload(9,:), 65536) + payload(10,:) * 65536;
79-
ceps = payload(11:payload_len, :) / 2^qformat;
64+
frame_number = payload(1, :);
65+
% payload(2,:) is reserved, skip
66+
energy = payload(3, :) / 2^23;
67+
noise_energy = payload(4, :) / 2^23;
68+
vad = payload(5, :);
69+
ceps = payload(6:payload_len, :) / 2^qformat;
70+
71+
% Fill gaps from DTX-suppressed VAD=0 frames to create continuous timeline.
72+
% Missing frames get VAD = 0 and repeat the last received ceps and energy values (forward fill); the minimum ceps value is only an initial placeholder.
73+
first_frame = frame_number(1);
74+
last_frame = frame_number(end);
75+
total_frames = last_frame - first_frame + 1;
76+
if total_frames > num_frames
77+
ceps_fill = min(ceps(:));
78+
ceps_full = ones(num_ceps, total_frames) * ceps_fill;
79+
vad_full = zeros(1, total_frames);
80+
energy_full = zeros(1, total_frames);
81+
noise_energy_full = zeros(1, total_frames);
82+
frame_number_full = first_frame:last_frame;
83+
has_data = false(1, total_frames);
84+
for i = 1:num_frames
85+
fi = frame_number(i) - first_frame + 1;
86+
ceps_full(:, fi) = ceps(:, i);
87+
vad_full(fi) = vad(i);
88+
energy_full(fi) = energy(i);
89+
noise_energy_full(fi) = noise_energy(i);
90+
has_data(fi) = true;
91+
end
92+
% Forward-fill gaps with last received values
93+
for fi = 2:total_frames
94+
if ~has_data(fi)
95+
ceps_full(:, fi) = ceps_full(:, fi - 1);
96+
energy_full(fi) = energy_full(fi - 1);
97+
noise_energy_full(fi) = noise_energy_full(fi - 1);
98+
end
99+
end
100+
ceps = ceps_full;
101+
vad = vad_full;
102+
energy = energy_full;
103+
noise_energy = noise_energy_full;
104+
frame_number = frame_number_full;
105+
end
106+
107+
t = (frame_number - first_frame) * hop;
108+
n = 1:num_ceps;
80109

81110
figure;
82111
surf(t, n, ceps, 'EdgeColor', 'none');
@@ -96,18 +125,18 @@
96125
switch lower(ext)
97126
case '.raw'
98127
fh = fopen(fn, 'r');
99-
data = fread(fh, 'int16');
128+
data = fread(fh, 'int32');
100129
fclose(fh);
101130
case '.wav'
102131
tmp = audioread(fn, 'native');
103132
t = whos('tmp');
104-
if ~strcmp(t.class, 'int16')
105-
error('Only 16-bit wav file format is supported');
133+
if ~strcmp(t.class, 'int32')
134+
error('Expected 32-bit wav for int32 MFCC output format');
106135
end
107136
s = size(tmp);
108137
num_channels = s(2);
109138
if num_channels > 1
110-
data = int16(zeros(prod(s), 1));
139+
data = int32(zeros(prod(s), 1));
111140
for i = 1:num_channels
112141
data(i:num_channels:end) = tmp(:, i);
113142
end

0 commit comments

Comments
 (0)