-
Notifications
You must be signed in to change notification settings - Fork 130
Expand file tree
/
Copy pathconvnet.cpp
More file actions
361 lines (307 loc) · 12.8 KB
/
convnet.cpp
File metadata and controls
361 lines (307 loc) · 12.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
#include <algorithm> // std::max_element
#include <algorithm>
#include <cmath> // pow, tanh, expf
#include <filesystem>
#include <fstream>
#include <stdexcept> // std::runtime_error (thrown in the ConvNet constructor)
#include <string>
#include <unordered_map>
#include <unordered_set>

#include "dsp.h"
#include "registry.h"
#include "convnet.h"
// Build a folded batchnorm from the flat weight buffer.
//
// The serialized order is: running_mean[dim], running_var[dim], weight
// (gamma)[dim], bias (beta)[dim], then a single epsilon. The four parameters
// are folded into an affine transform y = scale * x + loc so inference is a
// single multiply-add per channel. `weights` is advanced past everything read.
nam::convnet::BatchNorm::BatchNorm(const int dim, std::vector<float>::iterator& weights)
{
  // Extract from param buffer
  Eigen::VectorXf running_mean(dim);
  Eigen::VectorXf running_var(dim);
  Eigen::VectorXf _weight(dim);
  Eigen::VectorXf _bias(dim);
  for (int i = 0; i < dim; i++)
    running_mean(i) = *(weights++);
  for (int i = 0; i < dim; i++)
    running_var(i) = *(weights++);
  for (int i = 0; i < dim; i++)
    _weight(i) = *(weights++);
  for (int i = 0; i < dim; i++)
    _bias(i) = *(weights++);
  const float eps = *(weights++);
  // Convert to scale & loc. Use Eigen's vectorized float sqrt rather than the
  // C `sqrt`, which promotes each element to double and back.
  this->scale = (_weight.array() / (running_var.array() + eps).sqrt()).matrix();
  this->loc = _bias - this->scale.cwiseProduct(running_mean);
}
// Apply the folded affine batchnorm y = scale * x + loc, in place, to columns
// [i_start, i_end) of x. Implemented with vectorized column-wise broadcasts
// (resolving the old "use colwise" TODO) instead of a per-column loop.
void nam::convnet::BatchNorm::process_(Eigen::MatrixXf& x, const long i_start, const long i_end) const
{
  const long ncols = i_end - i_start;
  // Element-wise scale of every column, then broadcast-add the offset.
  x.middleCols(i_start, ncols) = (x.middleCols(i_start, ncols).array().colwise() * this->scale.array()).matrix();
  x.middleCols(i_start, ncols).colwise() += this->loc;
}
// Configure this block's conv, optional batchnorm, and activation, consuming
// weights from the shared iterator in serialization order.
void nam::convnet::ConvNetBlock::set_weights_(const int in_channels, const int out_channels, const int _dilation,
                                              const bool batchnorm,
                                              const activations::ActivationConfig& activation_config, const int groups,
                                              std::vector<float>::iterator& weights)
{
  this->_batchnorm = batchnorm;
  // Kernel size is hard-coded to 2 for ConvNet blocks. When batchnorm follows
  // the conv, the conv's own bias is redundant, hence `!batchnorm`.
  this->conv.set_size_and_weights_(in_channels, out_channels, 2, _dilation, !batchnorm, groups, weights);
  if (this->_batchnorm)
    this->batchnorm = BatchNorm(out_channels, weights);
  this->activation = activations::Activation::get_activation(activation_config);
}
void nam::convnet::ConvNetBlock::SetMaxBufferSize(const int maxBufferSize)
{
this->conv.SetMaxBufferSize(maxBufferSize);
const long out_channels = get_out_channels();
this->_output.resize(out_channels, maxBufferSize);
this->_output.setZero();
}
// Run one block: convolution, optional batchnorm, then activation. The result
// lands in this block's own _output buffer (first num_frames columns).
void nam::convnet::ConvNetBlock::Process(const Eigen::MatrixXf& input, const int num_frames)
{
  this->conv.Process(input, num_frames);
  // Snapshot the conv's output into our own buffer so the remaining stages
  // can mutate it in place without touching the conv's state.
  this->_output.leftCols(num_frames) = this->conv.GetOutput().leftCols(num_frames);
  if (this->_batchnorm)
    this->batchnorm.process_(this->_output, 0, num_frames); // columns [0, num_frames)
  this->activation->apply(this->_output.leftCols(num_frames));
}
// Expose the first num_frames columns of the output buffer as a writable view.
Eigen::Block<Eigen::MatrixXf> nam::convnet::ConvNetBlock::GetOutput(const int num_frames)
{
  const long rows = this->_output.rows();
  return this->_output.block(0, 0, rows, num_frames);
}
// Legacy slice-based interface: process input columns [i_start, i_end) and
// write the result into the same column range of `output`.
void nam::convnet::ConvNetBlock::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start,
                                          const long i_end)
{
  const long numFrames = i_end - i_start;
  // Slice out the requested frames and run the convolution over them.
  Eigen::MatrixXf inputSlice = input.middleCols(i_start, numFrames);
  this->conv.Process(inputSlice, (int)numFrames);
  auto convOut = this->conv.GetOutput().leftCols((int)numFrames);
  // batchnorm.process_ takes a MatrixXf& (not a block), so stage the conv
  // output in a temporary, transform it there, then copy it back.
  Eigen::MatrixXf staged = convOut;
  if (this->_batchnorm)
    this->batchnorm.process_(staged, 0, numFrames); // temp is indexed from 0
  this->activation->apply(staged);
  // Mirror the transformed frames into both the conv's buffer and the output.
  convOut = staged;
  output.middleCols(i_start, numFrames) = staged;
}
long nam::convnet::ConvNetBlock::get_out_channels() const
{
return this->conv.get_out_channels();
}
// Load the head's affine parameters from the flat weight buffer.
//
// Serialized layout: the weight matrix row by row (one row per output channel,
// in_channels entries each), followed by one bias per output channel.
nam::convnet::_Head::_Head(const int in_channels, const int out_channels, std::vector<float>::iterator& weights)
{
  this->_weight.resize(out_channels, in_channels);
  for (int row = 0; row < out_channels; row++)
    for (int col = 0; col < in_channels; col++)
      this->_weight(row, col) = *(weights++);
  this->_bias.resize(out_channels);
  for (int row = 0; row < out_channels; row++)
    this->_bias(row) = *(weights++);
}
// Affine head: output = W * input[:, i_start:i_end) + b.
// `output` is resized to (out_channels x length); `input` and `output` are
// assumed distinct (callers pass separate matrices), so noalias() is safe.
void nam::convnet::_Head::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start,
                                   const long i_end) const
{
  const long length = i_end - i_start;
  output.resize(this->_weight.rows(), length);
  // (out_channels x in_channels) * (in_channels x length), then broadcast the
  // per-channel bias across columns.
  output.noalias() = this->_weight * input.middleCols(i_start, length);
  output.colwise() += this->_bias;
}
// Construct a ConvNet from its config values and the flat weight buffer.
// The receptive field handed to Buffer is the largest dilation; the weight
// iterator must be exhausted exactly, or loading fails.
nam::convnet::ConvNet::ConvNet(const int in_channels, const int out_channels, const int channels,
                               const std::vector<int>& dilations, const bool batchnorm,
                               const activations::ActivationConfig& activation_config, std::vector<float>& weights,
                               const double expected_sample_rate, const int groups)
: Buffer(in_channels, out_channels, *std::max_element(dilations.begin(), dilations.end()), expected_sample_rate)
{
  this->_verify_weights(channels, dilations, batchnorm, weights.size());
  // One block per dilation, consuming weights in order. The first block maps
  // in_channels -> channels; every later block maps channels -> channels.
  this->_blocks.resize(dilations.size());
  auto it = weights.begin();
  for (size_t i = 0; i < dilations.size(); i++)
  {
    const int blockIn = (i == 0) ? in_channels : channels;
    this->_blocks[i].set_weights_(blockIn, channels, dilations[i], batchnorm, activation_config, groups, it);
  }
  // The Conv1D layers own their ring buffers; only the head needs a staging
  // matrix here.
  this->_block_vals.resize(1);
  this->_block_vals[0].setZero();
  // A single head maps the last block's channels down to out_channels.
  this->_head = _Head(channels, out_channels, it);
  if (it != weights.end())
    throw std::runtime_error("Didn't touch all the weights when initializing ConvNet");
  // Prewarm long enough to fill every dilated receptive field.
  mPrewarmSamples = 1;
  for (const int d : dilations)
    mPrewarmSamples += d;
}
// Process num_frames samples from the per-channel input arrays into the
// per-channel output arrays. Pipeline: ring-buffer the input, run every
// ConvNetBlock in sequence, stage the last block's output for the head, apply
// the head, then copy the result out and advance the ring buffer.
void nam::convnet::ConvNet::process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames)
{
  // Push the incoming samples into the base-class ring buffer (also keeps
  // _block_vals[0] sized correctly — see _update_buffers_).
  this->_update_buffers_(input, num_frames);
  const int in_channels = NumInputChannels();
  const int out_channels = NumOutputChannels();
  // Stack the ring-buffered input channels into an (in_channels x num_frames)
  // matrix for the first block, reading from the current buffer offset.
  Eigen::MatrixXf input_matrix(in_channels, num_frames);
  const long i_start = this->_input_buffer_offset;
  for (int ch = 0; ch < in_channels; ch++)
  {
    for (int i = 0; i < num_frames; i++)
      input_matrix(ch, i) = this->_input_buffers[ch][i_start + i];
  }
  // Run the blocks in sequence. Each block keeps its own output buffer, so the
  // next block's input is copied out of the previous block's view first.
  for (size_t i = 0; i < this->_blocks.size(); i++)
  {
    Eigen::MatrixXf block_input;
    if (i == 0)
    {
      // First block consumes the stacked input matrix.
      block_input = input_matrix;
    }
    else
    {
      // GetOutput() returns a view into the previous block's buffer; copying
      // it into a dense matrix keeps it stable while this block overwrites
      // nothing it still needs.
      auto prev_output = this->_blocks[i - 1].GetOutput(num_frames);
      block_input = prev_output; // Copy to matrix
    }
    // Conv1D, optional batchnorm, and activation all happen inside the block.
    this->_blocks[i].Process(block_input, num_frames);
  }
  // The head's process_ API indexes into a buffer-length matrix, so stage the
  // last block's output into _block_vals[0] at the current buffer offset.
  const long buffer_size = (long)this->_input_buffers[0].size();
  if (this->_block_vals[0].rows() != this->_blocks.back().get_out_channels()
      || this->_block_vals[0].cols() != buffer_size)
  {
    this->_block_vals[0].resize(this->_blocks.back().get_out_channels(), buffer_size);
  }
  auto last_output = this->_blocks.back().GetOutput(num_frames);
  const long buffer_offset = this->_input_buffer_offset;
  const long buffer_i_end = buffer_offset + num_frames;
  // last_output is (channels x num_frames); place it at column buffer_offset.
  this->_block_vals[0].block(0, buffer_offset, last_output.rows(), num_frames) = last_output;
  // Head: affine map down to out_channels. It resizes _head_output to
  // (out_channels x num_frames) internally.
  this->_head.process_(this->_block_vals[0], this->_head_output, buffer_offset, buffer_i_end);
  // Fan the head output back out to the caller's per-channel arrays.
  for (int ch = 0; ch < out_channels; ch++)
  {
    for (int s = 0; s < num_frames; s++)
      output[ch][s] = this->_head_output(ch, s);
  }
  // Advance the ring buffer for the next call.
  nam::Buffer::_advance_input_buffer_(num_frames);
}
// Intentionally a no-op for now: up-front weight-count validation is not
// implemented. The constructor still catches under-consumption via its
// `it != weights.end()` check after loading, but over-consumption would walk
// past the end of the buffer before reaching that check.
void nam::convnet::ConvNet::_verify_weights(const int channels, const std::vector<int>& dilations, const bool batchnorm,
                                            const size_t actual_weights)
{
  // TODO: compute the expected weight count from channels/dilations/batchnorm
  // and compare against actual_weights.
}
void nam::convnet::ConvNet::SetMaxBufferSize(const int maxBufferSize)
{
nam::Buffer::SetMaxBufferSize(maxBufferSize);
// Reset all ConvNetBlock instances with the new buffer size
for (auto& block : _blocks)
{
block.SetMaxBufferSize(maxBufferSize);
}
}
// Refresh the base-class input buffers, then keep the head's staging matrix
// sized as (last block's channels x input buffer length). The Conv1D layers
// manage their own buffers, so _block_vals only needs this one entry.
void nam::convnet::ConvNet::_update_buffers_(NAM_SAMPLE** input, const int num_frames)
{
  this->Buffer::_update_buffers_(input, num_frames);
  const long wantRows = this->_blocks.back().get_out_channels();
  const long wantCols = (long)this->_input_buffers[0].size(); // all channels share one size
  Eigen::MatrixXf& vals = this->_block_vals[0];
  if (vals.rows() != wantRows || vals.cols() != wantCols)
  {
    // Reallocation leaves garbage, so zero after resizing.
    vals.resize(wantRows, wantCols);
    vals.setZero();
  }
}
// Rewind only the base-class input buffer. The Conv1D layers rewind their own
// ring buffers internally, and _block_vals[0] only ever holds data for the
// current frame range, so neither needs handling here.
void nam::convnet::ConvNet::_rewind_buffers_()
{
  this->Buffer::_rewind_buffers_();
}
// Parse the "config" section of a NAM model JSON into a typed ConvNetConfig.
// Required keys throw (via nlohmann::json) if missing; optional keys fall back
// to backward-compatible defaults.
nam::convnet::ConvNetConfig nam::convnet::parse_config_json(const nlohmann::json& config)
{
  ConvNetConfig c;
  // Required fields.
  c.channels = config["channels"];
  c.dilations = config["dilations"].get<std::vector<int>>();
  c.batchnorm = config["batchnorm"];
  // Activation JSON is converted to a typed config here, at the model-loading
  // boundary, so the rest of the code never touches raw JSON.
  c.activation = activations::ActivationConfig::from_json(config["activation"]);
  c.groups = config.value("groups", 1); // defaults to 1
  // Default to 1 channel in/out for backward compatibility with mono models.
  c.in_channels = config.value("in_channels", 1);
  c.out_channels = config.value("out_channels", 1);
  return c;
}
// Factory: build the runtime ConvNet model from this config's fields, handing
// it the loaded weights and the file's sample rate.
std::unique_ptr<nam::DSP> nam::convnet::ConvNetConfig::create(std::vector<float> weights, double sampleRate)
{
  auto model = std::make_unique<nam::convnet::ConvNet>(in_channels, out_channels, channels, dilations, batchnorm,
                                                       activation, weights, sampleRate, groups);
  return model;
}
// Registry entry point: parse a ConvNet config from JSON.
// Constructs the heap-allocated config directly from the parse result instead
// of default-constructing and then copy-assigning.
std::unique_ptr<nam::ModelConfig> nam::convnet::create_config(const nlohmann::json& config, double sampleRate)
{
  // The sample rate isn't part of the config; it's applied later in
  // ConvNetConfig::create().
  (void)sampleRate;
  return std::make_unique<ConvNetConfig>(parse_config_json(config));
}
namespace
{
// Self-registration: constructing this helper during static initialization
// adds the "ConvNet" architecture to the global config-parser registry, so
// model loading can dispatch to nam::convnet::create_config by name.
static nam::ConfigParserHelper _register_ConvNet("ConvNet", nam::convnet::create_config);
}