|
1 | | -% [ceps, t, n, vad, energy, noise_energy, frame_number] = decode_ceps(fn, num_ceps, num_channels) |
| 1 | +% [ceps, t, n, vad, energy, noise_energy, frame_number] = decode_ceps(fn, num_ceps, hop, num_channels) |
2 | 2 | % |
3 | 3 | % Input |
4 | 4 | % fn - File with MFCC data in .raw or .wav format |
5 | 5 | % num_ceps - number of cepstral coefficients per frame |
6 | | -% num_channels - needed for .raw format, omit for .wav |
| 6 | +% hop - STFT hop in seconds, defaults to 10e-3 for 10 ms |
| 7 | +% num_channels - needed for .raw format, omit for .wav, default 1 |
7 | 8 | % |
8 | 9 | % Outputs |
9 | 10 | % ceps - cepstral coefficients |
|
18 | 19 | % Copyright(c) 2022-2026 Intel Corporation. All rights reserved. |
19 | 20 |
|
20 | 21 | function [ceps, t, n, vad, energy, noise_energy, frame_number] = ... |
21 | | - decode_ceps(fn, num_ceps, num_channels) |
| 22 | + decode_ceps(fn, num_ceps, hop, num_channels) |
22 | 23 |
|
23 | 24 | if nargin < 3 |
| 25 | + hop = 10e-3; |
| 26 | +end |
| 27 | +if nargin < 4 |
24 | 28 | num_channels = 1; |
25 | 29 | end |
26 | 30 |
|
27 | 31 | % MFCC stream |
28 | | -fs = 16e3; |
29 | | -qformat = 7; |
30 | | -magic = [25443 28006]; % ASCII 'mfcc' as int16 |
31 | | -num_magic = 2; % magic word is 2 x int16 |
| 32 | +qformat = 23; % Q9.23 in int32 |
| 33 | +magic = int32(1835426659); % 0x6D666363 as int32 |
| 34 | +num_magic = 1; % magic word is 1 x int32 |
32 | 35 |
|
33 | | -% Load output data |
| 36 | +% Load output data (always int32) |
34 | 37 | [data, num_channels] = get_file(fn, num_channels); |
35 | 38 |
|
36 | | -idx1 = find(data == magic(1)); |
37 | | -idx = []; |
38 | | -for i = 1:length(idx1) |
39 | | - if data(idx1(i) + 1) == magic(2) |
40 | | - idx = [idx idx1(i)]; |
41 | | - end |
42 | | -end |
| 39 | +idx = find(data == magic); |
43 | 40 |
|
44 | 41 | if isempty(idx) |
45 | 42 | error('No magic value markers found from stream'); |
46 | 43 | end |
47 | 44 |
|
48 | | -period_ceps = idx(2)-idx(1); |
49 | 45 | num_frames = length(idx); |
50 | 46 |
|
51 | 47 | % Header after magic is [frame_number, reserved, energy, noise_energy, vad_flag] |
52 | | -% as int32 (10 int16 slots), followed by num_ceps coefficients. |
53 | | -payload_len = 10 + num_ceps; % 5 int32 = 10 int16, then ceps data |
| 48 | +% as int32, followed by num_ceps coefficients (int32). |
| 49 | +payload_len = 5 + num_ceps; |
54 | 50 |
|
55 | 51 | % The last frame can be incomplete because a frame may span multiple periods; drop it if it runs past the end of the data
56 | 52 | last = idx(end) + num_magic + payload_len - 1; |
57 | 53 | if (last > length(data)) |
58 | 54 | num_frames = num_frames - 1; |
59 | 55 | end |
60 | 56 |
|
61 | | -t_ceps = period_ceps / num_channels / fs; |
62 | | -t = (0:num_frames -1) * t_ceps; |
63 | | -n = 1:num_ceps; |
64 | | - |
65 | 57 | payload = zeros(payload_len, num_frames); |
66 | 58 | for i = 1:num_frames |
67 | 59 | i1 = idx(i) + num_magic; |
68 | 60 | i2 = i1 + payload_len - 1; |
69 | 61 | payload(:,i) = double(data(i1:i2)); |
70 | 62 | end |
71 | 63 |
|
72 | | -% Reassemble int32 from pairs of int16 (little-endian). |
73 | | -% Low half must be treated as unsigned with mod() to handle negative int16. |
74 | | -frame_number = mod(payload(1,:), 65536) + payload(2,:) * 65536; |
75 | | -% payload(3:4,:) is reserved, skip |
76 | | -energy = (mod(payload(5,:), 65536) + payload(6,:) * 65536) / 2^23; |
77 | | -noise_energy = (mod(payload(7,:), 65536) + payload(8,:) * 65536) / 2^23; |
78 | | -vad = mod(payload(9,:), 65536) + payload(10,:) * 65536; |
79 | | -ceps = payload(11:payload_len, :) / 2^qformat; |
| 64 | +frame_number = payload(1, :); |
| 65 | +% payload(2,:) is reserved, skip |
| 66 | +energy = payload(3, :) / 2^23; |
| 67 | +noise_energy = payload(4, :) / 2^23; |
| 68 | +vad = payload(5, :); |
| 69 | +ceps = payload(6:payload_len, :) / 2^qformat; |
| 70 | + |
| 71 | +% Fill gaps from DTX-suppressed VAD=0 frames to create continuous timeline.
| 72 | +% Missing frames get VAD=0 and repeat the last received ceps and energy values.
| 73 | +first_frame = frame_number(1); |
| 74 | +last_frame = frame_number(end); |
| 75 | +total_frames = last_frame - first_frame + 1; |
| 76 | +if total_frames > num_frames |
| 77 | + ceps_fill = min(ceps(:)); |
| 78 | + ceps_full = ones(num_ceps, total_frames) * ceps_fill; |
| 79 | + vad_full = zeros(1, total_frames); |
| 80 | + energy_full = zeros(1, total_frames); |
| 81 | + noise_energy_full = zeros(1, total_frames); |
| 82 | + frame_number_full = first_frame:last_frame; |
| 83 | + has_data = false(1, total_frames); |
| 84 | + for i = 1:num_frames |
| 85 | + fi = frame_number(i) - first_frame + 1; |
| 86 | + ceps_full(:, fi) = ceps(:, i); |
| 87 | + vad_full(fi) = vad(i); |
| 88 | + energy_full(fi) = energy(i); |
| 89 | + noise_energy_full(fi) = noise_energy(i); |
| 90 | + has_data(fi) = true; |
| 91 | + end |
| 92 | + % Forward-fill gaps with last received values |
| 93 | + for fi = 2:total_frames |
| 94 | + if ~has_data(fi) |
| 95 | + ceps_full(:, fi) = ceps_full(:, fi - 1); |
| 96 | + energy_full(fi) = energy_full(fi - 1); |
| 97 | + noise_energy_full(fi) = noise_energy_full(fi - 1); |
| 98 | + end |
| 99 | + end |
| 100 | + ceps = ceps_full; |
| 101 | + vad = vad_full; |
| 102 | + energy = energy_full; |
| 103 | + noise_energy = noise_energy_full; |
| 104 | + frame_number = frame_number_full; |
| 105 | +end |
| 106 | + |
| 107 | +t = (frame_number - first_frame) * hop; |
| 108 | +n = 1:num_ceps; |
80 | 109 |
|
81 | 110 | figure; |
82 | 111 | surf(t, n, ceps, 'EdgeColor', 'none'); |
|
96 | 125 | switch lower(ext) |
97 | 126 | case '.raw' |
98 | 127 | fh = fopen(fn, 'r'); |
99 | | - data = fread(fh, 'int16'); |
| 128 | + data = fread(fh, 'int32'); |
100 | 129 | fclose(fh); |
101 | 130 | case '.wav' |
102 | 131 | tmp = audioread(fn, 'native'); |
103 | 132 | t = whos('tmp'); |
104 | | - if ~strcmp(t.class, 'int16') |
105 | | - error('Only 16-bit wav file format is supported'); |
| 133 | + if ~strcmp(t.class, 'int32') |
| 134 | + error('Expected 32-bit wav for int32 MFCC output format'); |
106 | 135 | end |
107 | 136 | s = size(tmp); |
108 | 137 | num_channels = s(2); |
109 | 138 | if num_channels > 1 |
110 | | - data = int16(zeros(prod(s), 1)); |
| 139 | + data = int32(zeros(prod(s), 1)); |
111 | 140 | for i = 1:num_channels |
112 | 141 | data(i:num_channels:end) = tmp(:, i); |
113 | 142 | end |
|
0 commit comments