From c6640c02b1521c25b32e5ddd6d6a78766cce8607 Mon Sep 17 00:00:00 2001 From: qxip Date: Sat, 28 Feb 2026 19:26:07 +0100 Subject: [PATCH 01/17] Reference audio (WAV/MP3), CI build + test-generation - Add unified audio loader (audio.h, audio_loader.cpp, wav.h): WAV and MP3 to stereo 48kHz float; MP3 via header-only minimp3, no temp files - dit-vae: reference_audio path uses load_audio_48k_stereo; VAE encoder for timbre conditioning (requires VAE GGUF with encoder weights) - Request API: task_type, reference_audio, src_audio, audio_cover_strength, repainting_start/end; docs in README and docs/MODES.md - GitHub Actions: build.yml (Ubuntu + macOS on push/PR); test-generation.yml (manual + release only, short text2music/cover/full-pipeline tests) - CI fixtures: tests/fixtures/ci-text2music.json, ci-cover.json (cover uses text2music WAV as reference) Made-with: Cursor --- .github/workflows/build.yml | 44 + .github/workflows/test-generation.yml | 72 + .gitignore | 2 + CMakeLists.txt | 2 +- README.md | 17 +- audio.h | 17 + audio_loader.cpp | 117 ++ dit-vae.cpp | 65 +- docs/MODES.md | 79 ++ examples/request-reference.json | 13 + examples/test-reference.sh | 21 + request.cpp | 39 +- request.h | 19 +- tests/fixtures/ci-cover.json | 12 + tests/fixtures/ci-text2music.json | 11 + third_party/minimp3.h | 1865 +++++++++++++++++++++++++ vae.h | 180 +++ wav.h | 100 ++ 18 files changed, 2653 insertions(+), 22 deletions(-) create mode 100644 .github/workflows/build.yml create mode 100644 .github/workflows/test-generation.yml create mode 100644 audio.h create mode 100644 audio_loader.cpp create mode 100644 docs/MODES.md create mode 100644 examples/request-reference.json create mode 100755 examples/test-reference.sh create mode 100644 tests/fixtures/ci-cover.json create mode 100644 tests/fixtures/ci-text2music.json create mode 100644 third_party/minimp3.h create mode 100644 wav.h diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 
0000000..fe17246 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,44 @@ +# Validate that the project builds on Ubuntu and macOS (no model download). +name: Build + +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Build (Ubuntu) + if: matrix.os == 'ubuntu-latest' + run: | + sudo apt-get update -qq + sudo apt-get install -y -qq cmake build-essential pkg-config libopenblas-dev + mkdir build && cd build + cmake .. -DGGML_BLAS=ON + cmake --build . --config Release -j$(nproc) + + - name: Build (macOS) + if: matrix.os == 'macos-latest' + run: | + mkdir build && cd build + cmake .. + cmake --build . --config Release -j$(sysctl -n hw.ncpu) + + - name: Smoke test + run: | + ./build/ace-qwen3 --help 2>&1 | head -5 + ./build/dit-vae --help 2>&1 | head -5 + ./build/quantize --help 2>&1 | head -3 diff --git a/.github/workflows/test-generation.yml b/.github/workflows/test-generation.yml new file mode 100644 index 0000000..3c9547c --- /dev/null +++ b/.github/workflows/test-generation.yml @@ -0,0 +1,72 @@ +# Build, download models (cached), and run short generation tests for various modes. +# Runs on release (published) or manual trigger only. Uses short duration (5s) and few steps (4). +name: Test generation + +on: + workflow_dispatch: {} + release: + types: [published] + +jobs: + build-and-test: + runs-on: ubuntu-latest + timeout-minutes: 60 + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Build + run: | + sudo apt-get update -qq + sudo apt-get install -y -qq cmake build-essential pkg-config libopenblas-dev + mkdir build && cd build + cmake .. -DGGML_BLAS=ON + cmake --build . 
--config Release -j$(nproc) + + - name: Cache models + id: cache-models + uses: actions/cache@v4 + with: + path: models + key: acestep-models-q8-${{ hashFiles('models.sh') }} + restore-keys: acestep-models-q8- + + - name: Download models + if: steps.cache-models.outputs.cache-hit != 'true' + run: | + pip install -q hf + ./models.sh + + - name: Test mode text2music (short) + run: | + ./build/dit-vae \ + --request tests/fixtures/ci-text2music.json \ + --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit models/acestep-v15-turbo-Q8_0.gguf \ + --vae models/vae-BF16.gguf + test -f tests/fixtures/ci-text2music0.wav && echo "text2music WAV OK" + + - name: Test mode cover with WAV reference (short) + run: | + ./build/dit-vae \ + --request tests/fixtures/ci-cover.json \ + --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit models/acestep-v15-turbo-Q8_0.gguf \ + --vae models/vae-BF16.gguf + test -f tests/fixtures/ci-cover0.wav && echo "cover WAV OK" + + - name: Test full pipeline (LLM + DiT, short) + run: | + ./build/ace-qwen3 \ + --request tests/fixtures/ci-text2music.json \ + --model models/acestep-5Hz-lm-4B-Q8_0.gguf + test -f request0.json + ./build/dit-vae \ + --request request0.json \ + --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit models/acestep-v15-turbo-Q8_0.gguf \ + --vae models/vae-BF16.gguf + test -f request00.wav && echo "full pipeline WAV OK" diff --git a/.gitignore b/.gitignore index 4ccb4f4..0fa15e6 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,8 @@ build/ *.bf16 tests/*/ +!tests/fixtures/ +!tests/fixtures/*.json checkpoints/ models/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 7721447..b9a4830 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,7 +46,7 @@ macro(link_ggml_backends target) endmacro() # dit-vae: full pipeline (text-enc + cond + dit + vae + wav) -add_executable(dit-vae dit-vae.cpp request.cpp) +add_executable(dit-vae dit-vae.cpp request.cpp audio_loader.cpp) link_ggml_backends(dit-vae) # 
ace-qwen3: LLM inference (CoT + audio codes) diff --git a/README.md b/README.md index 2467b53..55ac16d 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,10 @@ cmake --build . --config Release -j$(nproc) Builds two binaries: `ace-qwen3` (LLM) and `dit-vae` (DiT + VAE). +**CI (GitHub Actions)** +- **Build**: on every push/PR, builds on Ubuntu (BLAS) and macOS (Metal); smoke test runs each binary `--help`. +- **Test generation**: on manual trigger or on a published release (see `test-generation.yml`); builds, caches models, then runs short (5 s, 4 steps) generation for text2music, cover, and full pipeline (LLM → DiT → WAV). See `.github/workflows/`. + ## Models Pre-quantized GGUFs on [Hugging Face](https://huggingface.co/Serveurperso/ACE-Step-1.5-GGUF). @@ -139,10 +143,11 @@ cd examples ./partial.sh # caption + lyrics + duration ./full.sh # all metadata provided ./dit-only.sh # skip LLM, DiT from noise +./test-reference.sh # reference_audio (WAV or MP3) + audio_cover_strength ``` Each example has a `-sft` variant (SFT model, 50 steps, CFG 7.0) -alongside the turbo default (8 steps, no CFG). +alongside the turbo default (8 steps, no CFG). For **reference timbre**, set `reference_audio` to a **WAV or MP3** path; dit-vae loads it (MP3 decoded in memory via header-only minimp3, no temp files), encodes with the VAE encoder (requires a full VAE GGUF that includes encoder weights). ## Generation modes @@ -170,10 +175,11 @@ Run `dit-vae` to decode existing codes. See `examples/dit-only.json`. ## Request JSON reference -All fields with defaults. Only `caption` is required. +All fields with defaults. Only `caption` is required. Built-in modes (text2music, cover, repaint) and audio inputs follow the [ACE-Step 1.5 Tutorial](https://github.com/ace-step/ACE-Step-1.5/blob/main/docs/en/Tutorial.md); see [docs/MODES.md](docs/MODES.md) for what is implemented. ```json { + "task_type": "text2music", "caption": "", "lyrics": "", "instrumental": false, @@ -188,7 +194,12 @@ All fields with defaults. 
Only `caption` is required. "lm_top_p": 0.9, "lm_top_k": 0, "lm_negative_prompt": "", + "reference_audio": "", + "src_audio": "", "audio_codes": "", + "audio_cover_strength": 1.0, + "repainting_start": 0.0, + "repainting_end": 0.0, "inference_steps": 8, "guidance_scale": 7.0, "shift": 3.0 @@ -198,7 +209,7 @@ All fields with defaults. Only `caption` is required. Key fields: `seed` -1 means random (resolved once, then +1 per batch element). `audio_codes` is generated by ace-qwen3 and consumed by dit-vae (comma separated FSQ token IDs). When present, the LLM is -skipped entirely. +skipped entirely (cover-style generation). `reference_audio`: path to a **WAV or MP3** file for global timbre/style (MP3 decoded in memory; encoded via built-in VAE encoder; requires VAE GGUF with encoder weights). `src_audio`: not yet implemented (see docs/MODES.md). Turbo preset: `inference_steps=8, shift=3.0` (no guidance_scale, turbo models don't use CFG). SFT preset: `inference_steps=50, guidance_scale=4.0, shift=6.0`. diff --git a/audio.h b/audio.h new file mode 100644 index 0000000..bd90915 --- /dev/null +++ b/audio.h @@ -0,0 +1,17 @@ +// audio.h: unified reference-audio loader (WAV + MP3 → stereo 48kHz float) +// Header-only for WAV; MP3 implementation in audio_loader.cpp (minimp3, no temp files). + +#pragma once + +#include +#include +#include + +// Load WAV or MP3 file into stereo float32 at 48kHz. +// Out: interleaved L,R,L,R,...; length = num_samples (per channel). +// Returns num_samples (per channel), or -1 on error. +// No temp files; MP3 decoded in memory via minimp3 (header-only dep). 
+int load_audio_48k_stereo(const char * path, std::vector * out); + +// MP3 implementation (in audio_loader.cpp; do not call from other TUs without linking it) +int mp3_load_48k_stereo(const char * path, std::vector * out); diff --git a/audio_loader.cpp b/audio_loader.cpp new file mode 100644 index 0000000..a0e71b5 --- /dev/null +++ b/audio_loader.cpp @@ -0,0 +1,117 @@ +// audio_loader.cpp: MP3 decode for reference audio (minimp3, no deps, no temp files) + +#define MINIMP3_IMPLEMENTATION +#include "third_party/minimp3.h" + +#include "wav.h" +#include "audio.h" +#include +#include +#include +#include + +static bool path_ends_with_ci(const char * path, const char * suffix) { + size_t pl = strlen(path), sl = strlen(suffix); + if (pl < sl) return false; + const char * p = path + pl - sl; + for (size_t i = 0; i < sl; i++) { + char a = (char)(p[i] >= 'A' && p[i] <= 'Z' ? p[i] + 32 : p[i]); + char b = (char)(suffix[i] >= 'A' && suffix[i] <= 'Z' ? suffix[i] + 32 : suffix[i]); + if (a != b) return false; + } + return true; +} + +static void pcm_to_float_stereo_48k( + const int16_t * pcm, size_t num_samples, int channels, unsigned int sample_rate, + std::vector * out) +{ + const float scale = 1.0f / 32768.0f; + out->resize(num_samples * 2); + if (channels == 1) { + for (size_t i = 0; i < num_samples; i++) { + float s = (float)pcm[i] * scale; + (*out)[i * 2] = s; + (*out)[i * 2 + 1] = s; + } + } else { + for (size_t i = 0; i < num_samples * 2; i++) + (*out)[i] = (float)pcm[i] * scale; + } + + if (sample_rate != 48000) { + size_t in_len = num_samples; + size_t out_len = (size_t)((double)in_len * 48000.0 / (double)sample_rate); + std::vector resampled(out_len * 2); + for (size_t i = 0; i < out_len; i++) { + double t = (double)i * (double)in_len / (double)out_len; + size_t i0 = (size_t)t; + size_t i1 = std::min(i0 + 1, in_len - 1); + float w = (float)(t - (double)i0); + for (int c = 0; c < 2; c++) + resampled[i * 2 + c] = (*out)[i0 * 2 + c] * (1.0f - w) + (*out)[i1 * 2 + c] * w; 
+ } + *out = std::move(resampled); + } +} + +int mp3_load_48k_stereo(const char * path, std::vector * out) { + FILE * f = fopen(path, "rb"); + if (!f) return -1; + fseek(f, 0, SEEK_END); + long sz = ftell(f); + fseek(f, 0, SEEK_SET); + if (sz <= 0 || sz > 200 * 1024 * 1024) { + fclose(f); + return -1; + } + std::vector buf((size_t)sz); + if (fread(buf.data(), 1, (size_t)sz, f) != (size_t)sz) { + fclose(f); + return -1; + } + fclose(f); + + mp3dec_t dec; + mp3dec_init(&dec); + mp3dec_frame_info_t info; + std::vector pcm; + const uint8_t * read_pos = buf.data(); + int remaining = (int)buf.size(); + int first_hz = 0, first_ch = 0; + const size_t max_samples = (size_t)(60 * 48000 * 2); + + while (remaining > 0) { + size_t old_size = pcm.size(); + if (old_size + (size_t)MINIMP3_MAX_SAMPLES_PER_FRAME > max_samples) break; + pcm.resize(old_size + (size_t)MINIMP3_MAX_SAMPLES_PER_FRAME); + int frame_samples = mp3dec_decode_frame(&dec, read_pos, remaining, pcm.data() + old_size, &info); + if (frame_samples <= 0) { + pcm.resize(old_size); + read_pos++; + remaining--; + continue; + } + if (first_hz == 0) { + first_hz = info.hz; + first_ch = info.channels; + } + pcm.resize(old_size + (size_t)(frame_samples * info.channels)); + read_pos += info.frame_bytes; + remaining -= info.frame_bytes; + } + + if (pcm.empty() || first_hz == 0) return -1; + size_t num_samples = pcm.size() / (size_t)first_ch; + pcm_to_float_stereo_48k(pcm.data(), num_samples, first_ch, (unsigned)first_hz, out); + return (int)(out->size() / 2); +} + +int load_audio_48k_stereo(const char * path, std::vector * out) { + if (!path || !out) return -1; + if (path_ends_with_ci(path, ".mp3")) + return mp3_load_48k_stereo(path, out); + if (path_ends_with_ci(path, ".wav")) + return wav_load_48k_stereo(path, out); + return -1; +} diff --git a/dit-vae.cpp b/dit-vae.cpp index 608f12c..41582c9 100644 --- a/dit-vae.cpp +++ b/dit-vae.cpp @@ -22,6 +22,7 @@ #include "bpe.h" #include "debug.h" #include "request.h" +#include 
"audio.h" struct Timer { std::chrono::steady_clock::time_point t; @@ -249,6 +250,8 @@ int main(int argc, char ** argv) { int num_steps = req.inference_steps > 0 ? req.inference_steps : 8; float guidance_scale = req.guidance_scale > 0 ? req.guidance_scale : 7.0f; float shift = req.shift > 0 ? req.shift : 1.0f; + float cover_strength = req.audio_cover_strength >= 0 && req.audio_cover_strength <= 1 + ? req.audio_cover_strength : 1.0f; if (is_turbo && guidance_scale > 1.0f) { fprintf(stderr, "[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was %.1f)\n", @@ -386,16 +389,51 @@ int main(int argc, char ** argv) { } fprintf(stderr, "[Load] ConditionEncoder: %.1f ms\n", timer.ms()); - // Silence feats for timbre input: first 750 frames (30s @ 25Hz) + // Timbre input: reference_audio (WAV or MP3 via VAE encoder) or silence (first 750 frames = 30s @ 25Hz) const int S_ref = 750; - std::vector silence_feats(S_ref * 64); - memcpy(silence_feats.data(), silence_full.data(), S_ref * 64 * sizeof(float)); + std::vector timbre_feats(S_ref * 64); + const float * timbre_ptr = silence_full.data(); + int S_ref_actual = S_ref; + if (!req.reference_audio.empty()) { + const std::string & ref_path = req.reference_audio; + if (ref_path.size() >= 4 && ref_path.compare(ref_path.size() - 4, 4, ".wav") == 0) { + std::vector wav_stereo; + int n_samples = load_audio_48k_stereo(ref_path.c_str(), &wav_stereo); + if (n_samples > 0 && have_vae) { + VAEEncoderGGML enc = {}; + if (vae_encoder_load(&enc, vae_gguf)) { + int T_audio = n_samples; + if (T_audio >= 1920) { + int T_lat = T_audio / 1920; + std::vector enc_out((size_t)T_lat * 64); + T_lat = vae_encoder_forward(&enc, wav_stereo.data(), T_audio, enc_out.data()); + if (T_lat > 0) { + size_t copy_frames = (size_t)(T_lat < S_ref ? 
T_lat : S_ref); + memcpy(timbre_feats.data(), enc_out.data(), copy_frames * 64 * sizeof(float)); + if (T_lat < S_ref) + memcpy(timbre_feats.data() + copy_frames * 64, silence_full.data(), + (S_ref - (int)copy_frames) * 64 * sizeof(float)); + S_ref_actual = (int)copy_frames; + if (T_lat > S_ref) S_ref_actual = S_ref; + timbre_ptr = timbre_feats.data(); + fprintf(stderr, "[Timbre] encoded %s -> %d frames (25Hz)\n", ref_path.c_str(), S_ref_actual); + } + } + vae_encoder_free(&enc); + } + } else if (n_samples <= 0) { + fprintf(stderr, "[Timbre] WARNING: cannot load WAV %s, using silence\n", ref_path.c_str()); + } else if (!have_vae) { + fprintf(stderr, "[Timbre] WAV requires --vae (with encoder weights); using silence\n"); + } + } + } timer.reset(); std::vector enc_hidden; cond_ggml_forward(&cond, text_hidden.data(), S_text, lyric_embed.data(), S_lyric, - silence_feats.data(), S_ref, + timbre_ptr, S_ref_actual, enc_hidden, &enc_S); fprintf(stderr, "[Encode] ConditionEncoder: %.1f ms, enc_S=%d\n", timer.ms(), enc_S); @@ -438,15 +476,20 @@ int main(int argc, char ** argv) { } // Build single context: [T, ctx_ch] = src_latents[64] + mask_ones[64] - // src_latents = decoded_codes[0:decoded_T] + silence_latent[0:T-decoded_T] - // Padding reads silence from frame 0 (not from decoded_T), matching reference implementation + // src_latents = blend(decoded_codes, silence) for t context_single(T * ctx_ch); for (int t = 0; t < T; t++) { - const float * src = (t < decoded_T) - ? 
decoded_latents.data() + t * Oc - : silence_full.data() + (t - decoded_T) * Oc; - for (int c = 0; c < Oc; c++) - context_single[t * ctx_ch + c] = src[c]; + for (int c = 0; c < Oc; c++) { + float v; + if (t < decoded_T) { + float dec = decoded_latents[t * Oc + c]; + float sil = silence_full[c]; // frame 0 of silence + v = (1.0f - cover_strength) * sil + cover_strength * dec; + } else { + v = silence_full[(t - decoded_T) * Oc + c]; + } + context_single[t * ctx_ch + c] = v; + } for (int c = 0; c < Oc; c++) context_single[t * ctx_ch + Oc + c] = 1.0f; } diff --git a/docs/MODES.md b/docs/MODES.md new file mode 100644 index 0000000..4149ae7 --- /dev/null +++ b/docs/MODES.md @@ -0,0 +1,79 @@ +# ACE-Step 1.5 built-in modes (acestep.cpp) + +This document maps the [ACE-Step 1.5 Tutorial](https://github.com/ace-step/ACE-Step-1.5/blob/main/docs/en/Tutorial.md) built-in modes to the current C++ implementation. + +## Task types (Tutorial: Input Control) + +| `task_type` | Description | Turbo/SFT | Base only | C++ status | +|---------------|-------------|-----------|-----------|------------| +| **text2music** | Generate from caption/lyrics (and optional reference) | ✅ | — | ✅ **Supported** | +| **cover** | Re-synthesize with structure from source; optional timbre from reference | ✅ | — | ⚠️ **Partial** (see below) | +| **repaint** | Local edit in time range using source as context | ✅ | — | ❌ Not implemented | +| **lego** | Add new tracks to existing audio | — | ✅ | ❌ Base model only | +| **extract** | Extract single track from mix | — | ✅ | ❌ Base model only | +| **complete** | Add accompaniment to single track | — | ✅ | ❌ Base model only | + +We only ship Turbo and SFT DiT weights; **lego**, **extract**, **complete** require the Base DiT and are out of scope for now. + +--- + +## What we support today + +### text2music (default) +- **Input**: `caption`, optional `lyrics`, metadata (bpm, duration, keyscale, …). 
+- **Flow**: LM (optional) → CoT + audio codes → DiT (context = silence) → VAE → WAV. +- **Timbre**: Uses the built-in silence latent from the DiT GGUF by default; set `reference_audio` (WAV or MP3) to condition timbre on a VAE-encoded reference instead (see below). + +### cover (when `audio_codes` are provided) +- **Input**: Same as text2music, plus **precomputed** `audio_codes` (e.g. from a previous run or from Python). +- **Flow**: Skip LM; decode `audio_codes` to latents → DiT context = decoded + silence padding → DiT → VAE → WAV. +- **Limitation**: We do **not** convert a WAV file into `audio_codes`. So “cover from a file” is only possible if you already have codes (e.g. from Python or from a prior `ace-qwen3` run). The request field `src_audio` is accepted in JSON but **not yet used** in the pipeline; `reference_audio` **is** used (global timbre, see below). + +--- + +## Feature status (field by field) + +### reference_audio (global timbre/style) +- **Tutorial**: Load WAV → stereo 48 kHz, pad/repeat to ≥30 s → **VAE encode** → latents → feed as timbre condition into DiT. +- **C++**: Implemented. Set `reference_audio` to a **WAV or MP3 file path**. dit-vae loads the file (WAV: any sample rate resampled to 48 kHz; MP3: decoded in memory via header-only minimp3, no temp files, then resampled to 48 kHz if needed), runs the **VAE encoder** (Oobleck, in C++ in `vae.h`), and feeds the 64-d latents to the CondEncoder timbre path. No Python, no external deps. Requires a **full VAE GGUF** that includes `encoder.*` tensors (decoder-only GGUFs will print a clear error). NOTE(review): dit-vae currently gates this path on a `.wav` suffix before calling the unified loader, so `.mp3` references never reach it — confirm and widen the gate to `.mp3` as well. +- **audio_cover_strength** (0.0–1.0): Implemented. When `audio_codes` are present, context latents are blended with silence: `(1 - strength)*silence + strength*decoded`. + +### src_audio (Cover from file) +- **Tutorial**: Source audio is converted to **semantic codes** (melody, rhythm, chords, etc.); then DiT uses those as in cover mode. +- **C++**: That implies **audio → codes**. Likely path: WAV → VAE encode → **FSQ tokenizer** (latents → 5 Hz codes). 
We have the **FSQ detokenizer** (codes → latents); the tokenizer (encode) side would need to be added. Then: `src_audio` path → load WAV → VAE encode → FSQ encode → `audio_codes` → existing cover path. + +### audio_cover_strength +- **Tutorial**: 0.0–1.0, how strongly generation follows reference/codes. +- **C++**: Implemented. When `audio_codes` are present, the DiT context latents are blended with silence: `(1 - strength)*silence + strength*decoded`. + +### repaint +- **Tutorial**: Specify `repainting_start` / `repainting_end` (seconds); model uses source audio as context and only generates in that interval (3–90 s). +- **C++**: Would require **masked diffusion**: context carries “given” frames; ODE only updates the repaint region. DiT’s context has a 64-channel “mask” that we currently set to 1.0; repaint would set mask per frame and the generation loop would only update unmasked frames. Not implemented. + +--- + +## Request fields (aligned with Tutorial) + +All of these are in `AceRequest` and parsed from / written to JSON. Backend behavior is as above. + +| Field | Type | Purpose | +|-------|------|--------| +| `task_type` | string | `"text2music"` \| `"cover"` \| `"repaint"` \| … | +| `reference_audio` | string | Path to WAV or MP3 for timbre (implemented) | +| `src_audio` | string | Path to WAV for cover/repaint source (not used yet) | +| `audio_codes` | string | Comma-separated FSQ codes; non-empty ⇒ cover path | +| `audio_cover_strength` | float | 0.0–1.0 blend of decoded codes vs silence (implemented) | +| `repainting_start` | float | Start time (s) for repaint (not used yet) | +| `repainting_end` | float | End time (s) for repaint (not used yet) | + +See `request.h` and the README “Request JSON reference” for the full list. + +--- + +## Summary + +- **Fully supported**: text2music; cover when you supply **precomputed** `audio_codes`; `reference_audio` timbre conditioning; `audio_cover_strength` blending. +- **Schema only** (no backend): `task_type`, `src_audio`, `repainting_start`/`repainting_end`. 
+- **reference_audio**: implemented — the VAE encoder output feeds the existing CondEncoder timbre path (requires a VAE GGUF with encoder weights). +- **To support cover from file**: add an FSQ tokenizer (audio→codes) on top of the existing VAE encoder, then reuse the existing cover path. +- **To support repaint**: implement masked DiT generation (context mask + ODE only on repaint interval). diff --git a/examples/request-reference.json b/examples/request-reference.json new file mode 100644 index 0000000..55479ee --- /dev/null +++ b/examples/request-reference.json @@ -0,0 +1,13 @@ +{ + "task_type": "text2music", + "caption": "Calm acoustic guitar, soft male vocal, intimate", + "lyrics": "[Verse]\nQuiet strings and a gentle tune\n[Chorus]\nHold the moment in this room", + "duration": 30, + "seed": 42, + "inference_steps": 8, + "guidance_scale": 1.0, + "shift": 3.0, + "reference_audio": "reference.wav", + "audio_codes": "", + "audio_cover_strength": 1.0 +} diff --git a/examples/test-reference.sh b/examples/test-reference.sh new file mode 100755 index 0000000..073a465 --- /dev/null +++ b/examples/test-reference.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Test reference_audio (WAV) and audio_cover_strength. +# Put a WAV file at reference.wav (or set reference_audio path in request-reference.json). +# Requires: built dit-vae, --vae with encoder weights, and models in ../models/. + +set -eu +cd "$(dirname "$0")" + +if [ ! -f "reference.wav" ]; then + echo "No reference.wav found. Copy a WAV file to reference.wav (stereo 48kHz or any rate; will be resampled)." + echo "Then run: $0" + exit 1 +fi + +../build/dit-vae \ + --request request-reference.json \ + --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit ../models/acestep-v15-turbo-Q8_0.gguf \ + --vae ../models/vae-BF16.gguf + +echo "Done. Check request-reference0.wav (and request-reference1.wav if --batch 2)." 
diff --git a/request.cpp b/request.cpp index 9b20423..a24d838 100644 --- a/request.cpp +++ b/request.cpp @@ -10,8 +10,9 @@ #include #include -// Defaults (aligned with Python GenerationParams) +// Defaults (aligned with Python GenerationParams and ACE-Step 1.5 Tutorial) void request_init(AceRequest * r) { + r->task_type = "text2music"; r->caption = ""; r->lyrics = ""; r->instrumental = false; @@ -26,7 +27,12 @@ void request_init(AceRequest * r) { r->lm_top_p = 0.9f; r->lm_top_k = 0; r->lm_negative_prompt = ""; + r->reference_audio = ""; + r->src_audio = ""; r->audio_codes = ""; + r->audio_cover_strength = 1.0f; + r->repainting_start = 0.0f; + r->repainting_end = 0.0f; r->inference_steps = 8; r->guidance_scale = 1.0f; r->shift = 3.0f; @@ -218,11 +224,14 @@ bool request_parse(AceRequest * r, const char * path) { const std::string & v = kv.value; // strings - if (k == "caption") r->caption = v; + if (k == "task_type") r->task_type = v; + else if (k == "caption") r->caption = v; else if (k == "lyrics") r->lyrics = v; else if (k == "keyscale") r->keyscale = v; else if (k == "timesignature") r->timesignature = v; else if (k == "vocal_language") r->vocal_language = v; + else if (k == "reference_audio") r->reference_audio = v; + else if (k == "src_audio") r->src_audio = v; else if (k == "audio_codes") r->audio_codes = v; else if (k == "lm_negative_prompt") r->lm_negative_prompt = v; @@ -236,6 +245,9 @@ bool request_parse(AceRequest * r, const char * path) { else if (k == "lm_cfg_scale") r->lm_cfg_scale = (float)atof(v.c_str()); else if (k == "lm_top_p") r->lm_top_p = (float)atof(v.c_str()); else if (k == "lm_top_k") r->lm_top_k = atoi(v.c_str()); + else if (k == "audio_cover_strength") r->audio_cover_strength = (float)atof(v.c_str()); + else if (k == "repainting_start") r->repainting_start = (float)atof(v.c_str()); + else if (k == "repainting_end") r->repainting_end = (float)atof(v.c_str()); else if (k == "inference_steps") r->inference_steps = atoi(v.c_str()); else if (k 
== "guidance_scale") r->guidance_scale = (float)atof(v.c_str()); else if (k == "shift") r->shift = (float)atof(v.c_str()); @@ -257,6 +269,7 @@ bool request_write(const AceRequest * r, const char * path) { } fprintf(f, "{\n"); + fprintf(f, " \"task_type\": \"%s\",\n", json_escape(r->task_type).c_str()); fprintf(f, " \"caption\": \"%s\",\n", json_escape(r->caption).c_str()); fprintf(f, " \"lyrics\": \"%s\",\n", json_escape(r->lyrics).c_str()); if (r->instrumental) @@ -272,10 +285,18 @@ bool request_write(const AceRequest * r, const char * path) { fprintf(f, " \"lm_top_p\": %.2f,\n", r->lm_top_p); fprintf(f, " \"lm_top_k\": %d,\n", r->lm_top_k); fprintf(f, " \"lm_negative_prompt\": \"%s\",\n", json_escape(r->lm_negative_prompt).c_str()); + if (!r->reference_audio.empty()) + fprintf(f, " \"reference_audio\": \"%s\",\n", json_escape(r->reference_audio).c_str()); + if (!r->src_audio.empty()) + fprintf(f, " \"src_audio\": \"%s\",\n", json_escape(r->src_audio).c_str()); + fprintf(f, " \"audio_cover_strength\": %.2f,\n", r->audio_cover_strength); + if (r->repainting_start != 0.0f || r->repainting_end != 0.0f) { + fprintf(f, " \"repainting_start\": %.1f,\n", r->repainting_start); + fprintf(f, " \"repainting_end\": %.1f,\n", r->repainting_end); + } fprintf(f, " \"inference_steps\": %d,\n", r->inference_steps); fprintf(f, " \"guidance_scale\": %.1f,\n", r->guidance_scale); fprintf(f, " \"shift\": %.1f,\n", r->shift); - // audio_codes last (no trailing comma) fprintf(f, " \"audio_codes\": \"%s\"\n", json_escape(r->audio_codes).c_str()); fprintf(f, "}\n"); @@ -285,7 +306,7 @@ bool request_write(const AceRequest * r, const char * path) { } void request_dump(const AceRequest * r, FILE * f) { - fprintf(f, "[Request] seed=%lld\n", (long long)r->seed); + fprintf(f, "[Request] task=%s seed=%lld\n", r->task_type.c_str(), (long long)r->seed); fprintf(f, " caption: %.60s%s\n", r->caption.c_str(), r->caption.size() > 60 ? "..." 
: ""); fprintf(f, " lyrics: %zu bytes\n", r->lyrics.size()); @@ -296,6 +317,12 @@ void request_dump(const AceRequest * r, FILE * f) { r->lm_temperature, r->lm_cfg_scale, r->lm_top_p, r->lm_top_k); fprintf(f, " dit: steps=%d guidance=%.1f shift=%.1f\n", r->inference_steps, r->guidance_scale, r->shift); - fprintf(f, " audio_codes: %s\n", - r->audio_codes.empty() ? "(none)" : "(present)"); + if (!r->reference_audio.empty()) + fprintf(f, " reference_audio: %s\n", r->reference_audio.c_str()); + if (!r->src_audio.empty()) + fprintf(f, " src_audio: %s\n", r->src_audio.c_str()); + fprintf(f, " audio_codes: %s cover_strength=%.2f\n", + r->audio_codes.empty() ? "(none)" : "(present)", r->audio_cover_strength); + if (r->repainting_start != 0.0f || r->repainting_end != 0.0f) + fprintf(f, " repaint: %.1f–%.1fs\n", r->repainting_start, r->repainting_end); } diff --git a/request.h b/request.h index 1295b83..e9222a0 100644 --- a/request.h +++ b/request.h @@ -3,13 +3,19 @@ // request.h - AceStep generation request (JSON serialization) // // Pure data container + JSON read/write. Zero business logic. -// Aligned with Python GenerationParams (inference.py:39) and API /release_task. 
+// Aligned with Python GenerationParams and ACE-Step 1.5 Tutorial: +// https://github.com/ace-step/ACE-Step-1.5/blob/main/docs/en/Tutorial.md // #include #include struct AceRequest { + // --- Task & audio inputs (Tutorial: Input Control) --- + // task_type: "text2music" | "cover" | "repaint" | "lego" | "extract" | "complete" + // (lego/extract/complete require Base DiT; we only have Turbo/SFT) + std::string task_type; // "text2music" + // text content std::string caption; // "" std::string lyrics; // "" @@ -32,9 +38,20 @@ struct AceRequest { int lm_top_k; // 0 = disabled (matches Python None) std::string lm_negative_prompt; // "" + // Audio control (Tutorial: reference_audio, src_audio, audio_codes) + // reference_audio: path to WAV for global timbre/style (VAE encode → CondEncoder timbre) + std::string reference_audio; // "" + // src_audio: path to WAV for Cover (encode → codes) or Repaint context + std::string src_audio; // "" // codes (Python-compatible string: "3101,11837,27514,...") // empty = text2music (silence context), non-empty = cover mode std::string audio_codes; // "" + // 0.0–1.0: how strongly generation follows reference/codes (Tutorial: audio_cover_strength) + float audio_cover_strength; // 1.0 + + // Repaint interval (seconds). Only used when task_type == "repaint". 
+ float repainting_start; // 0.0 + float repainting_end; // 0.0 // DiT control (Python: inference_steps, guidance_scale, shift) int inference_steps; // 8 diff --git a/tests/fixtures/ci-cover.json b/tests/fixtures/ci-cover.json new file mode 100644 index 0000000..3d21e1f --- /dev/null +++ b/tests/fixtures/ci-cover.json @@ -0,0 +1,12 @@ +{ + "task_type": "cover", + "caption": "Short CI cover test", + "duration": 5, + "inference_steps": 4, + "guidance_scale": 1, + "shift": 3, + "seed": 42, + "reference_audio": "tests/fixtures/ci-text2music0.wav", + "audio_cover_strength": 0.8, + "audio_codes": "43316,18426,13366,59455,17783,49303,7423,29855,37158,37157,62317,61455,12847,19583,57031,34656,20254,10770,11416,15905,31413,23339,47091,12198,49531,37355,33090,38645,40707,16324,61436,46095,13941,5287,2239,13975,63815,2757,4862,13571,63495,39,29887,49426,12696,50847,40498,61056,25666,12989,23987,54763,25485,31683,28554,25355,16373,28995,2351,1655,7940,55831,34359,15350,15277,11717,20476,52239,5015,19807,24087,3559,20471,34193,32552,60999,29360,25338,38873,16768,17912,27584,24008,1528,449,25563,52684,53223,42183,37215,12343,39431,26055,28148,57286,38382,28863,7191,58397,18991,7695,30716,36784,12687,8707,25649,33718,3202,23035,10747,26354,63965,16260,11223,45679,14343,8679,49351,52927,2535,19207,46447,49615,12694,21110" +} diff --git a/tests/fixtures/ci-text2music.json b/tests/fixtures/ci-text2music.json new file mode 100644 index 0000000..1613723 --- /dev/null +++ b/tests/fixtures/ci-text2music.json @@ -0,0 +1,11 @@ +{ + "task_type": "text2music", + "caption": "Short CI test clip", + "lyrics": "", + "duration": 5, + "inference_steps": 4, + "guidance_scale": 1, + "shift": 3, + "seed": 42, + "audio_codes": "" +} diff --git a/third_party/minimp3.h b/third_party/minimp3.h new file mode 100644 index 0000000..3220ae1 --- /dev/null +++ b/third_party/minimp3.h @@ -0,0 +1,1865 @@ +#ifndef MINIMP3_H +#define MINIMP3_H +/* + https://github.com/lieff/minimp3 + To the extent possible under 
law, the author(s) have dedicated all copyright and related and neighboring rights to this software to the public domain worldwide. + This software is distributed without any warranty. + See . +*/ +#include + +#define MINIMP3_MAX_SAMPLES_PER_FRAME (1152*2) + +typedef struct +{ + int frame_bytes, frame_offset, channels, hz, layer, bitrate_kbps; +} mp3dec_frame_info_t; + +typedef struct +{ + float mdct_overlap[2][9*32], qmf_state[15*2*32]; + int reserv, free_format_bytes; + unsigned char header[4], reserv_buf[511]; +} mp3dec_t; + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +void mp3dec_init(mp3dec_t *dec); +#ifndef MINIMP3_FLOAT_OUTPUT +typedef int16_t mp3d_sample_t; +#else /* MINIMP3_FLOAT_OUTPUT */ +typedef float mp3d_sample_t; +void mp3dec_f32_to_s16(const float *in, int16_t *out, int num_samples); +#endif /* MINIMP3_FLOAT_OUTPUT */ +int mp3dec_decode_frame(mp3dec_t *dec, const uint8_t *mp3, int mp3_bytes, mp3d_sample_t *pcm, mp3dec_frame_info_t *info); + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* MINIMP3_H */ +#if defined(MINIMP3_IMPLEMENTATION) && !defined(_MINIMP3_IMPLEMENTATION_GUARD) +#define _MINIMP3_IMPLEMENTATION_GUARD + +#include +#include + +#define MAX_FREE_FORMAT_FRAME_SIZE 2304 /* more than ISO spec's */ +#ifndef MAX_FRAME_SYNC_MATCHES +#define MAX_FRAME_SYNC_MATCHES 10 +#endif /* MAX_FRAME_SYNC_MATCHES */ + +#define MAX_L3_FRAME_PAYLOAD_BYTES MAX_FREE_FORMAT_FRAME_SIZE /* MUST be >= 320000/8/32000*1152 = 1440 */ + +#define MAX_BITRESERVOIR_BYTES 511 +#define SHORT_BLOCK_TYPE 2 +#define STOP_BLOCK_TYPE 3 +#define MODE_MONO 3 +#define MODE_JOINT_STEREO 1 +#define HDR_SIZE 4 +#define HDR_IS_MONO(h) (((h[3]) & 0xC0) == 0xC0) +#define HDR_IS_MS_STEREO(h) (((h[3]) & 0xE0) == 0x60) +#define HDR_IS_FREE_FORMAT(h) (((h[2]) & 0xF0) == 0) +#define HDR_IS_CRC(h) (!((h[1]) & 1)) +#define HDR_TEST_PADDING(h) ((h[2]) & 0x2) +#define HDR_TEST_MPEG1(h) ((h[1]) & 0x8) +#define HDR_TEST_NOT_MPEG25(h) ((h[1]) & 0x10) +#define 
HDR_TEST_I_STEREO(h) ((h[3]) & 0x10) +#define HDR_TEST_MS_STEREO(h) ((h[3]) & 0x20) +#define HDR_GET_STEREO_MODE(h) (((h[3]) >> 6) & 3) +#define HDR_GET_STEREO_MODE_EXT(h) (((h[3]) >> 4) & 3) +#define HDR_GET_LAYER(h) (((h[1]) >> 1) & 3) +#define HDR_GET_BITRATE(h) ((h[2]) >> 4) +#define HDR_GET_SAMPLE_RATE(h) (((h[2]) >> 2) & 3) +#define HDR_GET_MY_SAMPLE_RATE(h) (HDR_GET_SAMPLE_RATE(h) + (((h[1] >> 3) & 1) + ((h[1] >> 4) & 1))*3) +#define HDR_IS_FRAME_576(h) ((h[1] & 14) == 2) +#define HDR_IS_LAYER_1(h) ((h[1] & 6) == 6) + +#define BITS_DEQUANTIZER_OUT -1 +#define MAX_SCF (255 + BITS_DEQUANTIZER_OUT*4 - 210) +#define MAX_SCFI ((MAX_SCF + 3) & ~3) + +#define MINIMP3_MIN(a, b) ((a) > (b) ? (b) : (a)) +#define MINIMP3_MAX(a, b) ((a) < (b) ? (b) : (a)) + +#if !defined(MINIMP3_NO_SIMD) + +#if !defined(MINIMP3_ONLY_SIMD) && (defined(_M_X64) || defined(__x86_64__) || defined(__aarch64__) || defined(_M_ARM64)) +/* x64 always have SSE2, arm64 always have neon, no need for generic code */ +#define MINIMP3_ONLY_SIMD +#endif /* SIMD checks... 
*/ + +#if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) || ((defined(__i386__) || defined(__x86_64__)) && defined(__SSE2__)) +#if defined(_MSC_VER) +#include +#endif /* defined(_MSC_VER) */ +#include +#define HAVE_SSE 1 +#define HAVE_SIMD 1 +#define VSTORE _mm_storeu_ps +#define VLD _mm_loadu_ps +#define VSET _mm_set1_ps +#define VADD _mm_add_ps +#define VSUB _mm_sub_ps +#define VMUL _mm_mul_ps +#define VMAC(a, x, y) _mm_add_ps(a, _mm_mul_ps(x, y)) +#define VMSB(a, x, y) _mm_sub_ps(a, _mm_mul_ps(x, y)) +#define VMUL_S(x, s) _mm_mul_ps(x, _mm_set1_ps(s)) +#define VREV(x) _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 1, 2, 3)) +typedef __m128 f4; +#if defined(_MSC_VER) || defined(MINIMP3_ONLY_SIMD) +#define minimp3_cpuid __cpuid +#else /* defined(_MSC_VER) || defined(MINIMP3_ONLY_SIMD) */ +static __inline__ __attribute__((always_inline)) void minimp3_cpuid(int CPUInfo[], const int InfoType) +{ +#if defined(__PIC__) + __asm__ __volatile__( +#if defined(__x86_64__) + "push %%rbx\n" + "cpuid\n" + "xchgl %%ebx, %1\n" + "pop %%rbx\n" +#else /* defined(__x86_64__) */ + "xchgl %%ebx, %1\n" + "cpuid\n" + "xchgl %%ebx, %1\n" +#endif /* defined(__x86_64__) */ + : "=a" (CPUInfo[0]), "=r" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3]) + : "a" (InfoType)); +#else /* defined(__PIC__) */ + __asm__ __volatile__( + "cpuid" + : "=a" (CPUInfo[0]), "=b" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3]) + : "a" (InfoType)); +#endif /* defined(__PIC__)*/ +} +#endif /* defined(_MSC_VER) || defined(MINIMP3_ONLY_SIMD) */ +static int have_simd(void) +{ +#ifdef MINIMP3_ONLY_SIMD + return 1; +#else /* MINIMP3_ONLY_SIMD */ + static int g_have_simd; + int CPUInfo[4]; +#ifdef MINIMP3_TEST + static int g_counter; + if (g_counter++ > 100) + return 0; +#endif /* MINIMP3_TEST */ + if (g_have_simd) + goto end; + minimp3_cpuid(CPUInfo, 0); + g_have_simd = 1; + if (CPUInfo[0] > 0) + { + minimp3_cpuid(CPUInfo, 1); + g_have_simd = (CPUInfo[3] & (1 << 26)) + 1; /* SSE2 */ + } +end: + return 
g_have_simd - 1; +#endif /* MINIMP3_ONLY_SIMD */ +} +#elif defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64) +#include +#define HAVE_SSE 0 +#define HAVE_SIMD 1 +#define VSTORE vst1q_f32 +#define VLD vld1q_f32 +#define VSET vmovq_n_f32 +#define VADD vaddq_f32 +#define VSUB vsubq_f32 +#define VMUL vmulq_f32 +#define VMAC(a, x, y) vmlaq_f32(a, x, y) +#define VMSB(a, x, y) vmlsq_f32(a, x, y) +#define VMUL_S(x, s) vmulq_f32(x, vmovq_n_f32(s)) +#define VREV(x) vcombine_f32(vget_high_f32(vrev64q_f32(x)), vget_low_f32(vrev64q_f32(x))) +typedef float32x4_t f4; +static int have_simd() +{ /* TODO: detect neon for !MINIMP3_ONLY_SIMD */ + return 1; +} +#else /* SIMD checks... */ +#define HAVE_SSE 0 +#define HAVE_SIMD 0 +#ifdef MINIMP3_ONLY_SIMD +#error MINIMP3_ONLY_SIMD used, but SSE/NEON not enabled +#endif /* MINIMP3_ONLY_SIMD */ +#endif /* SIMD checks... */ +#else /* !defined(MINIMP3_NO_SIMD) */ +#define HAVE_SIMD 0 +#endif /* !defined(MINIMP3_NO_SIMD) */ + +#if defined(__ARM_ARCH) && (__ARM_ARCH >= 6) && !defined(__aarch64__) && !defined(_M_ARM64) +#define HAVE_ARMV6 1 +static __inline__ __attribute__((always_inline)) int32_t minimp3_clip_int16_arm(int32_t a) +{ + int32_t x = 0; + __asm__ ("ssat %0, #16, %1" : "=r"(x) : "r"(a)); + return x; +} +#else +#define HAVE_ARMV6 0 +#endif + +typedef struct +{ + const uint8_t *buf; + int pos, limit; +} bs_t; + +typedef struct +{ + float scf[3*64]; + uint8_t total_bands, stereo_bands, bitalloc[64], scfcod[64]; +} L12_scale_info; + +typedef struct +{ + uint8_t tab_offset, code_tab_width, band_count; +} L12_subband_alloc_t; + +typedef struct +{ + const uint8_t *sfbtab; + uint16_t part_23_length, big_values, scalefac_compress; + uint8_t global_gain, block_type, mixed_block_flag, n_long_sfb, n_short_sfb; + uint8_t table_select[3], region_count[3], subblock_gain[3]; + uint8_t preflag, scalefac_scale, count1_table, scfsi; +} L3_gr_info_t; + +typedef struct +{ + bs_t bs; + uint8_t maindata[MAX_BITRESERVOIR_BYTES + 
MAX_L3_FRAME_PAYLOAD_BYTES]; + L3_gr_info_t gr_info[4]; + float grbuf[2][576], scf[40], syn[18 + 15][2*32]; + uint8_t ist_pos[2][39]; +} mp3dec_scratch_t; + +static void bs_init(bs_t *bs, const uint8_t *data, int bytes) +{ + bs->buf = data; + bs->pos = 0; + bs->limit = bytes*8; +} + +static uint32_t get_bits(bs_t *bs, int n) +{ + uint32_t next, cache = 0, s = bs->pos & 7; + int shl = n + s; + const uint8_t *p = bs->buf + (bs->pos >> 3); + if ((bs->pos += n) > bs->limit) + return 0; + next = *p++ & (255 >> s); + while ((shl -= 8) > 0) + { + cache |= next << shl; + next = *p++; + } + return cache | (next >> -shl); +} + +static int hdr_valid(const uint8_t *h) +{ + return h[0] == 0xff && + ((h[1] & 0xF0) == 0xf0 || (h[1] & 0xFE) == 0xe2) && + (HDR_GET_LAYER(h) != 0) && + (HDR_GET_BITRATE(h) != 15) && + (HDR_GET_SAMPLE_RATE(h) != 3); +} + +static int hdr_compare(const uint8_t *h1, const uint8_t *h2) +{ + return hdr_valid(h2) && + ((h1[1] ^ h2[1]) & 0xFE) == 0 && + ((h1[2] ^ h2[2]) & 0x0C) == 0 && + !(HDR_IS_FREE_FORMAT(h1) ^ HDR_IS_FREE_FORMAT(h2)); +} + +static unsigned hdr_bitrate_kbps(const uint8_t *h) +{ + static const uint8_t halfrate[2][3][15] = { + { { 0,4,8,12,16,20,24,28,32,40,48,56,64,72,80 }, { 0,4,8,12,16,20,24,28,32,40,48,56,64,72,80 }, { 0,16,24,28,32,40,48,56,64,72,80,88,96,112,128 } }, + { { 0,16,20,24,28,32,40,48,56,64,80,96,112,128,160 }, { 0,16,24,28,32,40,48,56,64,80,96,112,128,160,192 }, { 0,16,32,48,64,80,96,112,128,144,160,176,192,208,224 } }, + }; + return 2*halfrate[!!HDR_TEST_MPEG1(h)][HDR_GET_LAYER(h) - 1][HDR_GET_BITRATE(h)]; +} + +static unsigned hdr_sample_rate_hz(const uint8_t *h) +{ + static const unsigned g_hz[3] = { 44100, 48000, 32000 }; + return g_hz[HDR_GET_SAMPLE_RATE(h)] >> (int)!HDR_TEST_MPEG1(h) >> (int)!HDR_TEST_NOT_MPEG25(h); +} + +static unsigned hdr_frame_samples(const uint8_t *h) +{ + return HDR_IS_LAYER_1(h) ? 
384 : (1152 >> (int)HDR_IS_FRAME_576(h)); +} + +static int hdr_frame_bytes(const uint8_t *h, int free_format_size) +{ + int frame_bytes = hdr_frame_samples(h)*hdr_bitrate_kbps(h)*125/hdr_sample_rate_hz(h); + if (HDR_IS_LAYER_1(h)) + { + frame_bytes &= ~3; /* slot align */ + } + return frame_bytes ? frame_bytes : free_format_size; +} + +static int hdr_padding(const uint8_t *h) +{ + return HDR_TEST_PADDING(h) ? (HDR_IS_LAYER_1(h) ? 4 : 1) : 0; +} + +#ifndef MINIMP3_ONLY_MP3 +static const L12_subband_alloc_t *L12_subband_alloc_table(const uint8_t *hdr, L12_scale_info *sci) +{ + const L12_subband_alloc_t *alloc; + int mode = HDR_GET_STEREO_MODE(hdr); + int nbands, stereo_bands = (mode == MODE_MONO) ? 0 : (mode == MODE_JOINT_STEREO) ? (HDR_GET_STEREO_MODE_EXT(hdr) << 2) + 4 : 32; + + if (HDR_IS_LAYER_1(hdr)) + { + static const L12_subband_alloc_t g_alloc_L1[] = { { 76, 4, 32 } }; + alloc = g_alloc_L1; + nbands = 32; + } else if (!HDR_TEST_MPEG1(hdr)) + { + static const L12_subband_alloc_t g_alloc_L2M2[] = { { 60, 4, 4 }, { 44, 3, 7 }, { 44, 2, 19 } }; + alloc = g_alloc_L2M2; + nbands = 30; + } else + { + static const L12_subband_alloc_t g_alloc_L2M1[] = { { 0, 4, 3 }, { 16, 4, 8 }, { 32, 3, 12 }, { 40, 2, 7 } }; + int sample_rate_idx = HDR_GET_SAMPLE_RATE(hdr); + unsigned kbps = hdr_bitrate_kbps(hdr) >> (int)(mode != MODE_MONO); + if (!kbps) /* free-format */ + { + kbps = 192; + } + + alloc = g_alloc_L2M1; + nbands = 27; + if (kbps < 56) + { + static const L12_subband_alloc_t g_alloc_L2M1_lowrate[] = { { 44, 4, 2 }, { 44, 3, 10 } }; + alloc = g_alloc_L2M1_lowrate; + nbands = sample_rate_idx == 2 ? 
12 : 8; + } else if (kbps >= 96 && sample_rate_idx != 1) + { + nbands = 30; + } + } + + sci->total_bands = (uint8_t)nbands; + sci->stereo_bands = (uint8_t)MINIMP3_MIN(stereo_bands, nbands); + + return alloc; +} + +static void L12_read_scalefactors(bs_t *bs, uint8_t *pba, uint8_t *scfcod, int bands, float *scf) +{ + static const float g_deq_L12[18*3] = { +#define DQ(x) 9.53674316e-07f/x, 7.56931807e-07f/x, 6.00777173e-07f/x + DQ(3),DQ(7),DQ(15),DQ(31),DQ(63),DQ(127),DQ(255),DQ(511),DQ(1023),DQ(2047),DQ(4095),DQ(8191),DQ(16383),DQ(32767),DQ(65535),DQ(3),DQ(5),DQ(9) + }; + int i, m; + for (i = 0; i < bands; i++) + { + float s = 0; + int ba = *pba++; + int mask = ba ? 4 + ((19 >> scfcod[i]) & 3) : 0; + for (m = 4; m; m >>= 1) + { + if (mask & m) + { + int b = get_bits(bs, 6); + s = g_deq_L12[ba*3 - 6 + b % 3]*(1 << 21 >> b/3); + } + *scf++ = s; + } + } +} + +static void L12_read_scale_info(const uint8_t *hdr, bs_t *bs, L12_scale_info *sci) +{ + static const uint8_t g_bitalloc_code_tab[] = { + 0,17, 3, 4, 5,6,7, 8,9,10,11,12,13,14,15,16, + 0,17,18, 3,19,4,5, 6,7, 8, 9,10,11,12,13,16, + 0,17,18, 3,19,4,5,16, + 0,17,18,16, + 0,17,18,19, 4,5,6, 7,8, 9,10,11,12,13,14,15, + 0,17,18, 3,19,4,5, 6,7, 8, 9,10,11,12,13,14, + 0, 2, 3, 4, 5,6,7, 8,9,10,11,12,13,14,15,16 + }; + const L12_subband_alloc_t *subband_alloc = L12_subband_alloc_table(hdr, sci); + + int i, k = 0, ba_bits = 0; + const uint8_t *ba_code_tab = g_bitalloc_code_tab; + + for (i = 0; i < sci->total_bands; i++) + { + uint8_t ba; + if (i == k) + { + k += subband_alloc->band_count; + ba_bits = subband_alloc->code_tab_width; + ba_code_tab = g_bitalloc_code_tab + subband_alloc->tab_offset; + subband_alloc++; + } + ba = ba_code_tab[get_bits(bs, ba_bits)]; + sci->bitalloc[2*i] = ba; + if (i < sci->stereo_bands) + { + ba = ba_code_tab[get_bits(bs, ba_bits)]; + } + sci->bitalloc[2*i + 1] = sci->stereo_bands ? ba : 0; + } + + for (i = 0; i < 2*sci->total_bands; i++) + { + sci->scfcod[i] = sci->bitalloc[i] ? 
HDR_IS_LAYER_1(hdr) ? 2 : get_bits(bs, 2) : 6; + } + + L12_read_scalefactors(bs, sci->bitalloc, sci->scfcod, sci->total_bands*2, sci->scf); + + for (i = sci->stereo_bands; i < sci->total_bands; i++) + { + sci->bitalloc[2*i + 1] = 0; + } +} + +static int L12_dequantize_granule(float *grbuf, bs_t *bs, L12_scale_info *sci, int group_size) +{ + int i, j, k, choff = 576; + for (j = 0; j < 4; j++) + { + float *dst = grbuf + group_size*j; + for (i = 0; i < 2*sci->total_bands; i++) + { + int ba = sci->bitalloc[i]; + if (ba != 0) + { + if (ba < 17) + { + int half = (1 << (ba - 1)) - 1; + for (k = 0; k < group_size; k++) + { + dst[k] = (float)((int)get_bits(bs, ba) - half); + } + } else + { + unsigned mod = (2 << (ba - 17)) + 1; /* 3, 5, 9 */ + unsigned code = get_bits(bs, mod + 2 - (mod >> 3)); /* 5, 7, 10 */ + for (k = 0; k < group_size; k++, code /= mod) + { + dst[k] = (float)((int)(code % mod - mod/2)); + } + } + } + dst += choff; + choff = 18 - choff; + } + } + return group_size*4; +} + +static void L12_apply_scf_384(L12_scale_info *sci, const float *scf, float *dst) +{ + int i, k; + memcpy(dst + 576 + sci->stereo_bands*18, dst + sci->stereo_bands*18, (sci->total_bands - sci->stereo_bands)*18*sizeof(float)); + for (i = 0; i < sci->total_bands; i++, dst += 18, scf += 6) + { + for (k = 0; k < 12; k++) + { + dst[k + 0] *= scf[0]; + dst[k + 576] *= scf[3]; + } + } +} +#endif /* MINIMP3_ONLY_MP3 */ + +static int L3_read_side_info(bs_t *bs, L3_gr_info_t *gr, const uint8_t *hdr) +{ + static const uint8_t g_scf_long[8][23] = { + { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 }, + { 12,12,12,12,12,12,16,20,24,28,32,40,48,56,64,76,90,2,2,2,2,2,0 }, + { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 }, + { 6,6,6,6,6,6,8,10,12,14,16,18,22,26,32,38,46,54,62,70,76,36,0 }, + { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 }, + { 4,4,4,4,4,4,6,6,8,8,10,12,16,20,24,28,34,42,50,54,76,158,0 }, + { 
4,4,4,4,4,4,6,6,6,8,10,12,16,18,22,28,34,40,46,54,54,192,0 }, + { 4,4,4,4,4,4,6,6,8,10,12,16,20,24,30,38,46,56,68,84,102,26,0 } + }; + static const uint8_t g_scf_short[8][40] = { + { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 }, + { 8,8,8,8,8,8,8,8,8,12,12,12,16,16,16,20,20,20,24,24,24,28,28,28,36,36,36,2,2,2,2,2,2,2,2,2,26,26,26,0 }, + { 4,4,4,4,4,4,4,4,4,6,6,6,6,6,6,8,8,8,10,10,10,14,14,14,18,18,18,26,26,26,32,32,32,42,42,42,18,18,18,0 }, + { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,32,32,32,44,44,44,12,12,12,0 }, + { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 }, + { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,22,22,22,30,30,30,56,56,56,0 }, + { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,6,6,6,10,10,10,12,12,12,14,14,14,16,16,16,20,20,20,26,26,26,66,66,66,0 }, + { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,12,12,12,16,16,16,20,20,20,26,26,26,34,34,34,42,42,42,12,12,12,0 } + }; + static const uint8_t g_scf_mixed[8][40] = { + { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 }, + { 12,12,12,4,4,4,8,8,8,12,12,12,16,16,16,20,20,20,24,24,24,28,28,28,36,36,36,2,2,2,2,2,2,2,2,2,26,26,26,0 }, + { 6,6,6,6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,14,14,14,18,18,18,26,26,26,32,32,32,42,42,42,18,18,18,0 }, + { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,32,32,32,44,44,44,12,12,12,0 }, + { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 }, + { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,22,22,22,30,30,30,56,56,56,0 }, + { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,6,6,6,10,10,10,12,12,12,14,14,14,16,16,16,20,20,20,26,26,26,66,66,66,0 }, + { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,8,8,8,12,12,12,16,16,16,20,20,20,26,26,26,34,34,34,42,42,42,12,12,12,0 } + }; + + unsigned tables, scfsi = 0; + int 
main_data_begin, part_23_sum = 0; + int sr_idx = HDR_GET_MY_SAMPLE_RATE(hdr); sr_idx -= (sr_idx != 0); + int gr_count = HDR_IS_MONO(hdr) ? 1 : 2; + + if (HDR_TEST_MPEG1(hdr)) + { + gr_count *= 2; + main_data_begin = get_bits(bs, 9); + scfsi = get_bits(bs, 7 + gr_count); + } else + { + main_data_begin = get_bits(bs, 8 + gr_count) >> gr_count; + } + + do + { + if (HDR_IS_MONO(hdr)) + { + scfsi <<= 4; + } + gr->part_23_length = (uint16_t)get_bits(bs, 12); + part_23_sum += gr->part_23_length; + gr->big_values = (uint16_t)get_bits(bs, 9); + if (gr->big_values > 288) + { + return -1; + } + gr->global_gain = (uint8_t)get_bits(bs, 8); + gr->scalefac_compress = (uint16_t)get_bits(bs, HDR_TEST_MPEG1(hdr) ? 4 : 9); + gr->sfbtab = g_scf_long[sr_idx]; + gr->n_long_sfb = 22; + gr->n_short_sfb = 0; + if (get_bits(bs, 1)) + { + gr->block_type = (uint8_t)get_bits(bs, 2); + if (!gr->block_type) + { + return -1; + } + gr->mixed_block_flag = (uint8_t)get_bits(bs, 1); + gr->region_count[0] = 7; + gr->region_count[1] = 255; + if (gr->block_type == SHORT_BLOCK_TYPE) + { + scfsi &= 0x0F0F; + if (!gr->mixed_block_flag) + { + gr->region_count[0] = 8; + gr->sfbtab = g_scf_short[sr_idx]; + gr->n_long_sfb = 0; + gr->n_short_sfb = 39; + } else + { + gr->sfbtab = g_scf_mixed[sr_idx]; + gr->n_long_sfb = HDR_TEST_MPEG1(hdr) ? 8 : 6; + gr->n_short_sfb = 30; + } + } + tables = get_bits(bs, 10); + tables <<= 5; + gr->subblock_gain[0] = (uint8_t)get_bits(bs, 3); + gr->subblock_gain[1] = (uint8_t)get_bits(bs, 3); + gr->subblock_gain[2] = (uint8_t)get_bits(bs, 3); + } else + { + gr->block_type = 0; + gr->mixed_block_flag = 0; + tables = get_bits(bs, 15); + gr->region_count[0] = (uint8_t)get_bits(bs, 4); + gr->region_count[1] = (uint8_t)get_bits(bs, 3); + gr->region_count[2] = 255; + } + gr->table_select[0] = (uint8_t)(tables >> 10); + gr->table_select[1] = (uint8_t)((tables >> 5) & 31); + gr->table_select[2] = (uint8_t)((tables) & 31); + gr->preflag = HDR_TEST_MPEG1(hdr) ? 
get_bits(bs, 1) : (gr->scalefac_compress >= 500); + gr->scalefac_scale = (uint8_t)get_bits(bs, 1); + gr->count1_table = (uint8_t)get_bits(bs, 1); + gr->scfsi = (uint8_t)((scfsi >> 12) & 15); + scfsi <<= 4; + gr++; + } while(--gr_count); + + if (part_23_sum + bs->pos > bs->limit + main_data_begin*8) + { + return -1; + } + + return main_data_begin; +} + +static void L3_read_scalefactors(uint8_t *scf, uint8_t *ist_pos, const uint8_t *scf_size, const uint8_t *scf_count, bs_t *bitbuf, int scfsi) +{ + int i, k; + for (i = 0; i < 4 && scf_count[i]; i++, scfsi *= 2) + { + int cnt = scf_count[i]; + if (scfsi & 8) + { + memcpy(scf, ist_pos, cnt); + } else + { + int bits = scf_size[i]; + if (!bits) + { + memset(scf, 0, cnt); + memset(ist_pos, 0, cnt); + } else + { + int max_scf = (scfsi < 0) ? (1 << bits) - 1 : -1; + for (k = 0; k < cnt; k++) + { + int s = get_bits(bitbuf, bits); + ist_pos[k] = (s == max_scf ? -1 : s); + scf[k] = s; + } + } + } + ist_pos += cnt; + scf += cnt; + } + scf[0] = scf[1] = scf[2] = 0; +} + +static float L3_ldexp_q2(float y, int exp_q2) +{ + static const float g_expfrac[4] = { 9.31322575e-10f,7.83145814e-10f,6.58544508e-10f,5.53767716e-10f }; + int e; + do + { + e = MINIMP3_MIN(30*4, exp_q2); + y *= g_expfrac[e & 3]*(1 << 30 >> (e >> 2)); + } while ((exp_q2 -= e) > 0); + return y; +} + +static void L3_decode_scalefactors(const uint8_t *hdr, uint8_t *ist_pos, bs_t *bs, const L3_gr_info_t *gr, float *scf, int ch) +{ + static const uint8_t g_scf_partitions[3][28] = { + { 6,5,5, 5,6,5,5,5,6,5, 7,3,11,10,0,0, 7, 7, 7,0, 6, 6,6,3, 8, 8,5,0 }, + { 8,9,6,12,6,9,9,9,6,9,12,6,15,18,0,0, 6,15,12,0, 6,12,9,6, 6,18,9,0 }, + { 9,9,6,12,9,9,9,9,9,9,12,6,18,18,0,0,12,12,12,0,12, 9,9,6,15,12,9,0 } + }; + const uint8_t *scf_partition = g_scf_partitions[!!gr->n_short_sfb + !gr->n_long_sfb]; + uint8_t scf_size[4], iscf[40]; + int i, scf_shift = gr->scalefac_scale + 1, gain_exp, scfsi = gr->scfsi; + float gain; + + if (HDR_TEST_MPEG1(hdr)) + { + static const uint8_t 
g_scfc_decode[16] = { 0,1,2,3, 12,5,6,7, 9,10,11,13, 14,15,18,19 }; + int part = g_scfc_decode[gr->scalefac_compress]; + scf_size[1] = scf_size[0] = (uint8_t)(part >> 2); + scf_size[3] = scf_size[2] = (uint8_t)(part & 3); + } else + { + static const uint8_t g_mod[6*4] = { 5,5,4,4,5,5,4,1,4,3,1,1,5,6,6,1,4,4,4,1,4,3,1,1 }; + int k, modprod, sfc, ist = HDR_TEST_I_STEREO(hdr) && ch; + sfc = gr->scalefac_compress >> ist; + for (k = ist*3*4; sfc >= 0; sfc -= modprod, k += 4) + { + for (modprod = 1, i = 3; i >= 0; i--) + { + scf_size[i] = (uint8_t)(sfc / modprod % g_mod[k + i]); + modprod *= g_mod[k + i]; + } + } + scf_partition += k; + scfsi = -16; + } + L3_read_scalefactors(iscf, ist_pos, scf_size, scf_partition, bs, scfsi); + + if (gr->n_short_sfb) + { + int sh = 3 - scf_shift; + for (i = 0; i < gr->n_short_sfb; i += 3) + { + iscf[gr->n_long_sfb + i + 0] += gr->subblock_gain[0] << sh; + iscf[gr->n_long_sfb + i + 1] += gr->subblock_gain[1] << sh; + iscf[gr->n_long_sfb + i + 2] += gr->subblock_gain[2] << sh; + } + } else if (gr->preflag) + { + static const uint8_t g_preamp[10] = { 1,1,1,1,2,2,3,3,3,2 }; + for (i = 0; i < 10; i++) + { + iscf[11 + i] += g_preamp[i]; + } + } + + gain_exp = gr->global_gain + BITS_DEQUANTIZER_OUT*4 - 210 - (HDR_IS_MS_STEREO(hdr) ? 
2 : 0); + gain = L3_ldexp_q2(1 << (MAX_SCFI/4), MAX_SCFI - gain_exp); + for (i = 0; i < (int)(gr->n_long_sfb + gr->n_short_sfb); i++) + { + scf[i] = L3_ldexp_q2(gain, iscf[i] << scf_shift); + } +} + +static const float g_pow43[129 + 16] = { + 0,-1,-2.519842f,-4.326749f,-6.349604f,-8.549880f,-10.902724f,-13.390518f,-16.000000f,-18.720754f,-21.544347f,-24.463781f,-27.473142f,-30.567351f,-33.741992f,-36.993181f, + 0,1,2.519842f,4.326749f,6.349604f,8.549880f,10.902724f,13.390518f,16.000000f,18.720754f,21.544347f,24.463781f,27.473142f,30.567351f,33.741992f,36.993181f,40.317474f,43.711787f,47.173345f,50.699631f,54.288352f,57.937408f,61.644865f,65.408941f,69.227979f,73.100443f,77.024898f,81.000000f,85.024491f,89.097188f,93.216975f,97.382800f,101.593667f,105.848633f,110.146801f,114.487321f,118.869381f,123.292209f,127.755065f,132.257246f,136.798076f,141.376907f,145.993119f,150.646117f,155.335327f,160.060199f,164.820202f,169.614826f,174.443577f,179.305980f,184.201575f,189.129918f,194.090580f,199.083145f,204.107210f,209.162385f,214.248292f,219.364564f,224.510845f,229.686789f,234.892058f,240.126328f,245.389280f,250.680604f,256.000000f,261.347174f,266.721841f,272.123723f,277.552547f,283.008049f,288.489971f,293.998060f,299.532071f,305.091761f,310.676898f,316.287249f,321.922592f,327.582707f,333.267377f,338.976394f,344.709550f,350.466646f,356.247482f,362.051866f,367.879608f,373.730522f,379.604427f,385.501143f,391.420496f,397.362314f,403.326427f,409.312672f,415.320884f,421.350905f,427.402579f,433.475750f,439.570269f,445.685987f,451.822757f,457.980436f,464.158883f,470.357960f,476.577530f,482.817459f,489.077615f,495.357868f,501.658090f,507.978156f,514.317941f,520.677324f,527.056184f,533.454404f,539.871867f,546.308458f,552.764065f,559.238575f,565.731879f,572.243870f,578.774440f,585.323483f,591.890898f,598.476581f,605.080431f,611.702349f,618.342238f,625.000000f,631.675540f,638.368763f,645.079578f +}; + +static float L3_pow_43(int x) +{ + float frac; + int sign, mult = 256; + + if (x < 
129) + { + return g_pow43[16 + x]; + } + + if (x < 1024) + { + mult = 16; + x <<= 3; + } + + sign = 2*x & 64; + frac = (float)((x & 63) - sign) / ((x & ~63) + sign); + return g_pow43[16 + ((x + sign) >> 6)]*(1.f + frac*((4.f/3) + frac*(2.f/9)))*mult; +} + +static void L3_huffman(float *dst, bs_t *bs, const L3_gr_info_t *gr_info, const float *scf, int layer3gr_limit) +{ + static const int16_t tabs[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 785,785,785,785,784,784,784,784,513,513,513,513,513,513,513,513,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256, + -255,1313,1298,1282,785,785,785,785,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,290,288, + -255,1313,1298,1282,769,769,769,769,529,529,529,529,529,529,529,529,528,528,528,528,528,528,528,528,512,512,512,512,512,512,512,512,290,288, + -253,-318,-351,-367,785,785,785,785,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,819,818,547,547,275,275,275,275,561,560,515,546,289,274,288,258, + -254,-287,1329,1299,1314,1312,1057,1057,1042,1042,1026,1026,784,784,784,784,529,529,529,529,529,529,529,529,769,769,769,769,768,768,768,768,563,560,306,306,291,259, + -252,-413,-477,-542,1298,-575,1041,1041,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-383,-399,1107,1092,1106,1061,849,849,789,789,1104,1091,773,773,1076,1075,341,340,325,309,834,804,577,577,532,532,516,516,832,818,803,816,561,561,531,531,515,546,289,289,288,258, + -252,-429,-493,-559,1057,1057,1042,1042,529,529,529,529,529,529,529,529,784,784,784,784,769,769,769,769,512,512,512,512,512,512,512,512,-382,1077,-415,1106,1061,1104,849,849,789,789,1091,1076,1029,1075,834,834,597,581,340,340,339,324,804,833,532,532,832,772,818,803,817,787,816,771,290,290,290,290,288,258, + 
-253,-349,-414,-447,-463,1329,1299,-479,1314,1312,1057,1057,1042,1042,1026,1026,785,785,785,785,784,784,784,784,769,769,769,769,768,768,768,768,-319,851,821,-335,836,850,805,849,341,340,325,336,533,533,579,579,564,564,773,832,578,548,563,516,321,276,306,291,304,259, + -251,-572,-733,-830,-863,-879,1041,1041,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-511,-527,-543,1396,1351,1381,1366,1395,1335,1380,-559,1334,1138,1138,1063,1063,1350,1392,1031,1031,1062,1062,1364,1363,1120,1120,1333,1348,881,881,881,881,375,374,359,373,343,358,341,325,791,791,1123,1122,-703,1105,1045,-719,865,865,790,790,774,774,1104,1029,338,293,323,308,-799,-815,833,788,772,818,803,816,322,292,307,320,561,531,515,546,289,274,288,258, + -251,-525,-605,-685,-765,-831,-846,1298,1057,1057,1312,1282,785,785,785,785,784,784,784,784,769,769,769,769,512,512,512,512,512,512,512,512,1399,1398,1383,1367,1382,1396,1351,-511,1381,1366,1139,1139,1079,1079,1124,1124,1364,1349,1363,1333,882,882,882,882,807,807,807,807,1094,1094,1136,1136,373,341,535,535,881,775,867,822,774,-591,324,338,-671,849,550,550,866,864,609,609,293,336,534,534,789,835,773,-751,834,804,308,307,833,788,832,772,562,562,547,547,305,275,560,515,290,290, + -252,-397,-477,-557,-622,-653,-719,-735,-750,1329,1299,1314,1057,1057,1042,1042,1312,1282,1024,1024,785,785,785,785,784,784,784,784,769,769,769,769,-383,1127,1141,1111,1126,1140,1095,1110,869,869,883,883,1079,1109,882,882,375,374,807,868,838,881,791,-463,867,822,368,263,852,837,836,-543,610,610,550,550,352,336,534,534,865,774,851,821,850,805,593,533,579,564,773,832,578,578,548,548,577,577,307,276,306,291,516,560,259,259, + 
-250,-2107,-2507,-2764,-2909,-2974,-3007,-3023,1041,1041,1040,1040,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-767,-1052,-1213,-1277,-1358,-1405,-1469,-1535,-1550,-1582,-1614,-1647,-1662,-1694,-1726,-1759,-1774,-1807,-1822,-1854,-1886,1565,-1919,-1935,-1951,-1967,1731,1730,1580,1717,-1983,1729,1564,-1999,1548,-2015,-2031,1715,1595,-2047,1714,-2063,1610,-2079,1609,-2095,1323,1323,1457,1457,1307,1307,1712,1547,1641,1700,1699,1594,1685,1625,1442,1442,1322,1322,-780,-973,-910,1279,1278,1277,1262,1276,1261,1275,1215,1260,1229,-959,974,974,989,989,-943,735,478,478,495,463,506,414,-1039,1003,958,1017,927,942,987,957,431,476,1272,1167,1228,-1183,1256,-1199,895,895,941,941,1242,1227,1212,1135,1014,1014,490,489,503,487,910,1013,985,925,863,894,970,955,1012,847,-1343,831,755,755,984,909,428,366,754,559,-1391,752,486,457,924,997,698,698,983,893,740,740,908,877,739,739,667,667,953,938,497,287,271,271,683,606,590,712,726,574,302,302,738,736,481,286,526,725,605,711,636,724,696,651,589,681,666,710,364,467,573,695,466,466,301,465,379,379,709,604,665,679,316,316,634,633,436,436,464,269,424,394,452,332,438,363,347,408,393,448,331,422,362,407,392,421,346,406,391,376,375,359,1441,1306,-2367,1290,-2383,1337,-2399,-2415,1426,1321,-2431,1411,1336,-2447,-2463,-2479,1169,1169,1049,1049,1424,1289,1412,1352,1319,-2495,1154,1154,1064,1064,1153,1153,416,390,360,404,403,389,344,374,373,343,358,372,327,357,342,311,356,326,1395,1394,1137,1137,1047,1047,1365,1392,1287,1379,1334,1364,1349,1378,1318,1363,792,792,792,792,1152,1152,1032,1032,1121,1121,1046,1046,1120,1120,1030,1030,-2895,1106,1061,1104,849,849,789,789,1091,1076,1029,1090,1060,1075,833,833,309,324,532,532,832,772,818,803,561,561,531,560,515,546,289,274,288,258, + 
-250,-1179,-1579,-1836,-1996,-2124,-2253,-2333,-2413,-2477,-2542,-2574,-2607,-2622,-2655,1314,1313,1298,1312,1282,785,785,785,785,1040,1040,1025,1025,768,768,768,768,-766,-798,-830,-862,-895,-911,-927,-943,-959,-975,-991,-1007,-1023,-1039,-1055,-1070,1724,1647,-1103,-1119,1631,1767,1662,1738,1708,1723,-1135,1780,1615,1779,1599,1677,1646,1778,1583,-1151,1777,1567,1737,1692,1765,1722,1707,1630,1751,1661,1764,1614,1736,1676,1763,1750,1645,1598,1721,1691,1762,1706,1582,1761,1566,-1167,1749,1629,767,766,751,765,494,494,735,764,719,749,734,763,447,447,748,718,477,506,431,491,446,476,461,505,415,430,475,445,504,399,460,489,414,503,383,474,429,459,502,502,746,752,488,398,501,473,413,472,486,271,480,270,-1439,-1455,1357,-1471,-1487,-1503,1341,1325,-1519,1489,1463,1403,1309,-1535,1372,1448,1418,1476,1356,1462,1387,-1551,1475,1340,1447,1402,1386,-1567,1068,1068,1474,1461,455,380,468,440,395,425,410,454,364,467,466,464,453,269,409,448,268,432,1371,1473,1432,1417,1308,1460,1355,1446,1459,1431,1083,1083,1401,1416,1458,1445,1067,1067,1370,1457,1051,1051,1291,1430,1385,1444,1354,1415,1400,1443,1082,1082,1173,1113,1186,1066,1185,1050,-1967,1158,1128,1172,1097,1171,1081,-1983,1157,1112,416,266,375,400,1170,1142,1127,1065,793,793,1169,1033,1156,1096,1141,1111,1155,1080,1126,1140,898,898,808,808,897,897,792,792,1095,1152,1032,1125,1110,1139,1079,1124,882,807,838,881,853,791,-2319,867,368,263,822,852,837,866,806,865,-2399,851,352,262,534,534,821,836,594,594,549,549,593,593,533,533,848,773,579,579,564,578,548,563,276,276,577,576,306,291,516,560,305,305,275,259, + 
-251,-892,-2058,-2620,-2828,-2957,-3023,-3039,1041,1041,1040,1040,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-511,-527,-543,-559,1530,-575,-591,1528,1527,1407,1526,1391,1023,1023,1023,1023,1525,1375,1268,1268,1103,1103,1087,1087,1039,1039,1523,-604,815,815,815,815,510,495,509,479,508,463,507,447,431,505,415,399,-734,-782,1262,-815,1259,1244,-831,1258,1228,-847,-863,1196,-879,1253,987,987,748,-767,493,493,462,477,414,414,686,669,478,446,461,445,474,429,487,458,412,471,1266,1264,1009,1009,799,799,-1019,-1276,-1452,-1581,-1677,-1757,-1821,-1886,-1933,-1997,1257,1257,1483,1468,1512,1422,1497,1406,1467,1496,1421,1510,1134,1134,1225,1225,1466,1451,1374,1405,1252,1252,1358,1480,1164,1164,1251,1251,1238,1238,1389,1465,-1407,1054,1101,-1423,1207,-1439,830,830,1248,1038,1237,1117,1223,1148,1236,1208,411,426,395,410,379,269,1193,1222,1132,1235,1221,1116,976,976,1192,1162,1177,1220,1131,1191,963,963,-1647,961,780,-1663,558,558,994,993,437,408,393,407,829,978,813,797,947,-1743,721,721,377,392,844,950,828,890,706,706,812,859,796,960,948,843,934,874,571,571,-1919,690,555,689,421,346,539,539,944,779,918,873,932,842,903,888,570,570,931,917,674,674,-2575,1562,-2591,1609,-2607,1654,1322,1322,1441,1441,1696,1546,1683,1593,1669,1624,1426,1426,1321,1321,1639,1680,1425,1425,1305,1305,1545,1668,1608,1623,1667,1592,1638,1666,1320,1320,1652,1607,1409,1409,1304,1304,1288,1288,1664,1637,1395,1395,1335,1335,1622,1636,1394,1394,1319,1319,1606,1621,1392,1392,1137,1137,1137,1137,345,390,360,375,404,373,1047,-2751,-2767,-2783,1062,1121,1046,-2799,1077,-2815,1106,1061,789,789,1105,1104,263,355,310,340,325,354,352,262,339,324,1091,1076,1029,1090,1060,1075,833,833,788,788,1088,1028,818,818,803,803,561,561,531,531,816,771,546,546,289,274,288,258, + 
-253,-317,-381,-446,-478,-509,1279,1279,-811,-1179,-1451,-1756,-1900,-2028,-2189,-2253,-2333,-2414,-2445,-2511,-2526,1313,1298,-2559,1041,1041,1040,1040,1025,1025,1024,1024,1022,1007,1021,991,1020,975,1019,959,687,687,1018,1017,671,671,655,655,1016,1015,639,639,758,758,623,623,757,607,756,591,755,575,754,559,543,543,1009,783,-575,-621,-685,-749,496,-590,750,749,734,748,974,989,1003,958,988,973,1002,942,987,957,972,1001,926,986,941,971,956,1000,910,985,925,999,894,970,-1071,-1087,-1102,1390,-1135,1436,1509,1451,1374,-1151,1405,1358,1480,1420,-1167,1507,1494,1389,1342,1465,1435,1450,1326,1505,1310,1493,1373,1479,1404,1492,1464,1419,428,443,472,397,736,526,464,464,486,457,442,471,484,482,1357,1449,1434,1478,1388,1491,1341,1490,1325,1489,1463,1403,1309,1477,1372,1448,1418,1433,1476,1356,1462,1387,-1439,1475,1340,1447,1402,1474,1324,1461,1371,1473,269,448,1432,1417,1308,1460,-1711,1459,-1727,1441,1099,1099,1446,1386,1431,1401,-1743,1289,1083,1083,1160,1160,1458,1445,1067,1067,1370,1457,1307,1430,1129,1129,1098,1098,268,432,267,416,266,400,-1887,1144,1187,1082,1173,1113,1186,1066,1050,1158,1128,1143,1172,1097,1171,1081,420,391,1157,1112,1170,1142,1127,1065,1169,1049,1156,1096,1141,1111,1155,1080,1126,1154,1064,1153,1140,1095,1048,-2159,1125,1110,1137,-2175,823,823,1139,1138,807,807,384,264,368,263,868,838,853,791,867,822,852,837,866,806,865,790,-2319,851,821,836,352,262,850,805,849,-2399,533,533,835,820,336,261,578,548,563,577,532,532,832,772,562,562,547,547,305,275,560,515,290,290,288,258 }; + static const uint8_t tab32[] = { 130,162,193,209,44,28,76,140,9,9,9,9,9,9,9,9,190,254,222,238,126,94,157,157,109,61,173,205 }; + static const uint8_t tab33[] = { 252,236,220,204,188,172,156,140,124,108,92,76,60,44,28,12 }; + static const int16_t tabindex[2*16] = { 0,32,64,98,0,132,180,218,292,364,426,538,648,746,0,1126,1460,1460,1460,1460,1460,1460,1460,1460,1842,1842,1842,1842,1842,1842,1842,1842 }; + static const uint8_t g_linbits[] = { 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,6,8,10,13,4,5,6,7,8,9,11,13 }; + +#define PEEK_BITS(n) (bs_cache >> (32 - n)) +#define FLUSH_BITS(n) { bs_cache <<= (n); bs_sh += (n); } +#define CHECK_BITS while (bs_sh >= 0) { bs_cache |= (uint32_t)*bs_next_ptr++ << bs_sh; bs_sh -= 8; } +#define BSPOS ((bs_next_ptr - bs->buf)*8 - 24 + bs_sh) + + float one = 0.0f; + int ireg = 0, big_val_cnt = gr_info->big_values; + const uint8_t *sfb = gr_info->sfbtab; + const uint8_t *bs_next_ptr = bs->buf + bs->pos/8; + uint32_t bs_cache = (((bs_next_ptr[0]*256u + bs_next_ptr[1])*256u + bs_next_ptr[2])*256u + bs_next_ptr[3]) << (bs->pos & 7); + int pairs_to_decode, np, bs_sh = (bs->pos & 7) - 8; + bs_next_ptr += 4; + + while (big_val_cnt > 0) + { + int tab_num = gr_info->table_select[ireg]; + int sfb_cnt = gr_info->region_count[ireg++]; + const int16_t *codebook = tabs + tabindex[tab_num]; + int linbits = g_linbits[tab_num]; + if (linbits) + { + do + { + np = *sfb++ / 2; + pairs_to_decode = MINIMP3_MIN(big_val_cnt, np); + one = *scf++; + do + { + int j, w = 5; + int leaf = codebook[PEEK_BITS(w)]; + while (leaf < 0) + { + FLUSH_BITS(w); + w = leaf & 7; + leaf = codebook[PEEK_BITS(w) - (leaf >> 3)]; + } + FLUSH_BITS(leaf >> 8); + + for (j = 0; j < 2; j++, dst++, leaf >>= 4) + { + int lsb = leaf & 0x0F; + if (lsb == 15) + { + lsb += PEEK_BITS(linbits); + FLUSH_BITS(linbits); + CHECK_BITS; + *dst = one*L3_pow_43(lsb)*((int32_t)bs_cache < 0 ? -1: 1); + } else + { + *dst = g_pow43[16 + lsb - 16*(bs_cache >> 31)]*one; + } + FLUSH_BITS(lsb ? 
1 : 0); + } + CHECK_BITS; + } while (--pairs_to_decode); + } while ((big_val_cnt -= np) > 0 && --sfb_cnt >= 0); + } else + { + do + { + np = *sfb++ / 2; + pairs_to_decode = MINIMP3_MIN(big_val_cnt, np); + one = *scf++; + do + { + int j, w = 5; + int leaf = codebook[PEEK_BITS(w)]; + while (leaf < 0) + { + FLUSH_BITS(w); + w = leaf & 7; + leaf = codebook[PEEK_BITS(w) - (leaf >> 3)]; + } + FLUSH_BITS(leaf >> 8); + + for (j = 0; j < 2; j++, dst++, leaf >>= 4) + { + int lsb = leaf & 0x0F; + *dst = g_pow43[16 + lsb - 16*(bs_cache >> 31)]*one; + FLUSH_BITS(lsb ? 1 : 0); + } + CHECK_BITS; + } while (--pairs_to_decode); + } while ((big_val_cnt -= np) > 0 && --sfb_cnt >= 0); + } + } + + for (np = 1 - big_val_cnt;; dst += 4) + { + const uint8_t *codebook_count1 = (gr_info->count1_table) ? tab33 : tab32; + int leaf = codebook_count1[PEEK_BITS(4)]; + if (!(leaf & 8)) + { + leaf = codebook_count1[(leaf >> 3) + (bs_cache << 4 >> (32 - (leaf & 3)))]; + } + FLUSH_BITS(leaf & 7); + if (BSPOS > layer3gr_limit) + { + break; + } +#define RELOAD_SCALEFACTOR if (!--np) { np = *sfb++/2; if (!np) break; one = *scf++; } +#define DEQ_COUNT1(s) if (leaf & (128 >> s)) { dst[s] = ((int32_t)bs_cache < 0) ? -one : one; FLUSH_BITS(1) } + RELOAD_SCALEFACTOR; + DEQ_COUNT1(0); + DEQ_COUNT1(1); + RELOAD_SCALEFACTOR; + DEQ_COUNT1(2); + DEQ_COUNT1(3); + CHECK_BITS; + } + + bs->pos = layer3gr_limit; +} + +static void L3_midside_stereo(float *left, int n) +{ + int i = 0; + float *right = left + 576; +#if HAVE_SIMD + if (have_simd()) + { + for (; i < n - 3; i += 4) + { + f4 vl = VLD(left + i); + f4 vr = VLD(right + i); + VSTORE(left + i, VADD(vl, vr)); + VSTORE(right + i, VSUB(vl, vr)); + } +#ifdef __GNUC__ + /* Workaround for spurious -Waggressive-loop-optimizations warning from gcc. 
+ * For more info see: https://github.com/lieff/minimp3/issues/88 + */ + if (__builtin_constant_p(n % 4 == 0) && n % 4 == 0) + return; +#endif + } +#endif /* HAVE_SIMD */ + for (; i < n; i++) + { + float a = left[i]; + float b = right[i]; + left[i] = a + b; + right[i] = a - b; + } +} + +static void L3_intensity_stereo_band(float *left, int n, float kl, float kr) +{ + int i; + for (i = 0; i < n; i++) + { + left[i + 576] = left[i]*kr; + left[i] = left[i]*kl; + } +} + +static void L3_stereo_top_band(const float *right, const uint8_t *sfb, int nbands, int max_band[3]) +{ + int i, k; + + max_band[0] = max_band[1] = max_band[2] = -1; + + for (i = 0; i < nbands; i++) + { + for (k = 0; k < sfb[i]; k += 2) + { + if (right[k] != 0 || right[k + 1] != 0) + { + max_band[i % 3] = i; + break; + } + } + right += sfb[i]; + } +} + +static void L3_stereo_process(float *left, const uint8_t *ist_pos, const uint8_t *sfb, const uint8_t *hdr, int max_band[3], int mpeg2_sh) +{ + static const float g_pan[7*2] = { 0,1,0.21132487f,0.78867513f,0.36602540f,0.63397460f,0.5f,0.5f,0.63397460f,0.36602540f,0.78867513f,0.21132487f,1,0 }; + unsigned i, max_pos = HDR_TEST_MPEG1(hdr) ? 7 : 64; + + for (i = 0; sfb[i]; i++) + { + unsigned ipos = ist_pos[i]; + if ((int)i > max_band[i % 3] && ipos < max_pos) + { + float kl, kr, s = HDR_TEST_MS_STEREO(hdr) ? 1.41421356f : 1; + if (HDR_TEST_MPEG1(hdr)) + { + kl = g_pan[2*ipos]; + kr = g_pan[2*ipos + 1]; + } else + { + kl = 1; + kr = L3_ldexp_q2(1, (ipos + 1) >> 1 << mpeg2_sh); + if (ipos & 1) + { + kl = kr; + kr = 1; + } + } + L3_intensity_stereo_band(left, sfb[i], kl*s, kr*s); + } else if (HDR_TEST_MS_STEREO(hdr)) + { + L3_midside_stereo(left, sfb[i]); + } + left += sfb[i]; + } +} + +static void L3_intensity_stereo(float *left, uint8_t *ist_pos, const L3_gr_info_t *gr, const uint8_t *hdr) +{ + int max_band[3], n_sfb = gr->n_long_sfb + gr->n_short_sfb; + int i, max_blocks = gr->n_short_sfb ? 
3 : 1; + + L3_stereo_top_band(left + 576, gr->sfbtab, n_sfb, max_band); + if (gr->n_long_sfb) + { + max_band[0] = max_band[1] = max_band[2] = MINIMP3_MAX(MINIMP3_MAX(max_band[0], max_band[1]), max_band[2]); + } + for (i = 0; i < max_blocks; i++) + { + int default_pos = HDR_TEST_MPEG1(hdr) ? 3 : 0; + int itop = n_sfb - max_blocks + i; + int prev = itop - max_blocks; + ist_pos[itop] = max_band[i] >= prev ? default_pos : ist_pos[prev]; + } + L3_stereo_process(left, ist_pos, gr->sfbtab, hdr, max_band, gr[1].scalefac_compress & 1); +} + +static void L3_reorder(float *grbuf, float *scratch, const uint8_t *sfb) +{ + int i, len; + float *src = grbuf, *dst = scratch; + + for (;0 != (len = *sfb); sfb += 3, src += 2*len) + { + for (i = 0; i < len; i++, src++) + { + *dst++ = src[0*len]; + *dst++ = src[1*len]; + *dst++ = src[2*len]; + } + } + memcpy(grbuf, scratch, (dst - scratch)*sizeof(float)); +} + +static void L3_antialias(float *grbuf, int nbands) +{ + static const float g_aa[2][8] = { + {0.85749293f,0.88174200f,0.94962865f,0.98331459f,0.99551782f,0.99916056f,0.99989920f,0.99999316f}, + {0.51449576f,0.47173197f,0.31337745f,0.18191320f,0.09457419f,0.04096558f,0.01419856f,0.00369997f} + }; + + for (; nbands > 0; nbands--, grbuf += 18) + { + int i = 0; +#if HAVE_SIMD + if (have_simd()) for (; i < 8; i += 4) + { + f4 vu = VLD(grbuf + 18 + i); + f4 vd = VLD(grbuf + 14 - i); + f4 vc0 = VLD(g_aa[0] + i); + f4 vc1 = VLD(g_aa[1] + i); + vd = VREV(vd); + VSTORE(grbuf + 18 + i, VSUB(VMUL(vu, vc0), VMUL(vd, vc1))); + vd = VADD(VMUL(vu, vc1), VMUL(vd, vc0)); + VSTORE(grbuf + 14 - i, VREV(vd)); + } +#endif /* HAVE_SIMD */ +#ifndef MINIMP3_ONLY_SIMD + for(; i < 8; i++) + { + float u = grbuf[18 + i]; + float d = grbuf[17 - i]; + grbuf[18 + i] = u*g_aa[0][i] - d*g_aa[1][i]; + grbuf[17 - i] = u*g_aa[1][i] + d*g_aa[0][i]; + } +#endif /* MINIMP3_ONLY_SIMD */ + } +} + +static void L3_dct3_9(float *y) +{ + float s0, s1, s2, s3, s4, s5, s6, s7, s8, t0, t2, t4; + + s0 = y[0]; s2 = y[2]; s4 = 
y[4]; s6 = y[6]; s8 = y[8]; + t0 = s0 + s6*0.5f; + s0 -= s6; + t4 = (s4 + s2)*0.93969262f; + t2 = (s8 + s2)*0.76604444f; + s6 = (s4 - s8)*0.17364818f; + s4 += s8 - s2; + + s2 = s0 - s4*0.5f; + y[4] = s4 + s0; + s8 = t0 - t2 + s6; + s0 = t0 - t4 + t2; + s4 = t0 + t4 - s6; + + s1 = y[1]; s3 = y[3]; s5 = y[5]; s7 = y[7]; + + s3 *= 0.86602540f; + t0 = (s5 + s1)*0.98480775f; + t4 = (s5 - s7)*0.34202014f; + t2 = (s1 + s7)*0.64278761f; + s1 = (s1 - s5 - s7)*0.86602540f; + + s5 = t0 - s3 - t2; + s7 = t4 - s3 - t0; + s3 = t4 + s3 - t2; + + y[0] = s4 - s7; + y[1] = s2 + s1; + y[2] = s0 - s3; + y[3] = s8 + s5; + y[5] = s8 - s5; + y[6] = s0 + s3; + y[7] = s2 - s1; + y[8] = s4 + s7; +} + +static void L3_imdct36(float *grbuf, float *overlap, const float *window, int nbands) +{ + int i, j; + static const float g_twid9[18] = { + 0.73727734f,0.79335334f,0.84339145f,0.88701083f,0.92387953f,0.95371695f,0.97629601f,0.99144486f,0.99904822f,0.67559021f,0.60876143f,0.53729961f,0.46174861f,0.38268343f,0.30070580f,0.21643961f,0.13052619f,0.04361938f + }; + + for (j = 0; j < nbands; j++, grbuf += 18, overlap += 9) + { + float co[9], si[9]; + co[0] = -grbuf[0]; + si[0] = grbuf[17]; + for (i = 0; i < 4; i++) + { + si[8 - 2*i] = grbuf[4*i + 1] - grbuf[4*i + 2]; + co[1 + 2*i] = grbuf[4*i + 1] + grbuf[4*i + 2]; + si[7 - 2*i] = grbuf[4*i + 4] - grbuf[4*i + 3]; + co[2 + 2*i] = -(grbuf[4*i + 3] + grbuf[4*i + 4]); + } + L3_dct3_9(co); + L3_dct3_9(si); + + si[1] = -si[1]; + si[3] = -si[3]; + si[5] = -si[5]; + si[7] = -si[7]; + + i = 0; + +#if HAVE_SIMD + if (have_simd()) for (; i < 8; i += 4) + { + f4 vovl = VLD(overlap + i); + f4 vc = VLD(co + i); + f4 vs = VLD(si + i); + f4 vr0 = VLD(g_twid9 + i); + f4 vr1 = VLD(g_twid9 + 9 + i); + f4 vw0 = VLD(window + i); + f4 vw1 = VLD(window + 9 + i); + f4 vsum = VADD(VMUL(vc, vr1), VMUL(vs, vr0)); + VSTORE(overlap + i, VSUB(VMUL(vc, vr0), VMUL(vs, vr1))); + VSTORE(grbuf + i, VSUB(VMUL(vovl, vw0), VMUL(vsum, vw1))); + vsum = VADD(VMUL(vovl, vw1), VMUL(vsum, 
vw0)); + VSTORE(grbuf + 14 - i, VREV(vsum)); + } +#endif /* HAVE_SIMD */ + for (; i < 9; i++) + { + float ovl = overlap[i]; + float sum = co[i]*g_twid9[9 + i] + si[i]*g_twid9[0 + i]; + overlap[i] = co[i]*g_twid9[0 + i] - si[i]*g_twid9[9 + i]; + grbuf[i] = ovl*window[0 + i] - sum*window[9 + i]; + grbuf[17 - i] = ovl*window[9 + i] + sum*window[0 + i]; + } + } +} + +static void L3_idct3(float x0, float x1, float x2, float *dst) +{ + float m1 = x1*0.86602540f; + float a1 = x0 - x2*0.5f; + dst[1] = x0 + x2; + dst[0] = a1 + m1; + dst[2] = a1 - m1; +} + +static void L3_imdct12(float *x, float *dst, float *overlap) +{ + static const float g_twid3[6] = { 0.79335334f,0.92387953f,0.99144486f, 0.60876143f,0.38268343f,0.13052619f }; + float co[3], si[3]; + int i; + + L3_idct3(-x[0], x[6] + x[3], x[12] + x[9], co); + L3_idct3(x[15], x[12] - x[9], x[6] - x[3], si); + si[1] = -si[1]; + + for (i = 0; i < 3; i++) + { + float ovl = overlap[i]; + float sum = co[i]*g_twid3[3 + i] + si[i]*g_twid3[0 + i]; + overlap[i] = co[i]*g_twid3[0 + i] - si[i]*g_twid3[3 + i]; + dst[i] = ovl*g_twid3[2 - i] - sum*g_twid3[5 - i]; + dst[5 - i] = ovl*g_twid3[5 - i] + sum*g_twid3[2 - i]; + } +} + +static void L3_imdct_short(float *grbuf, float *overlap, int nbands) +{ + for (;nbands > 0; nbands--, overlap += 9, grbuf += 18) + { + float tmp[18]; + memcpy(tmp, grbuf, sizeof(tmp)); + memcpy(grbuf, overlap, 6*sizeof(float)); + L3_imdct12(tmp, grbuf + 6, overlap + 6); + L3_imdct12(tmp + 1, grbuf + 12, overlap + 6); + L3_imdct12(tmp + 2, overlap, overlap + 6); + } +} + +static void L3_change_sign(float *grbuf) +{ + int b, i; + for (b = 0, grbuf += 18; b < 32; b += 2, grbuf += 36) + for (i = 1; i < 18; i += 2) + grbuf[i] = -grbuf[i]; +} + +static void L3_imdct_gr(float *grbuf, float *overlap, unsigned block_type, unsigned n_long_bands) +{ + static const float g_mdct_window[2][18] = { + { 
0.99904822f,0.99144486f,0.97629601f,0.95371695f,0.92387953f,0.88701083f,0.84339145f,0.79335334f,0.73727734f,0.04361938f,0.13052619f,0.21643961f,0.30070580f,0.38268343f,0.46174861f,0.53729961f,0.60876143f,0.67559021f }, + { 1,1,1,1,1,1,0.99144486f,0.92387953f,0.79335334f,0,0,0,0,0,0,0.13052619f,0.38268343f,0.60876143f } + }; + if (n_long_bands) + { + L3_imdct36(grbuf, overlap, g_mdct_window[0], n_long_bands); + grbuf += 18*n_long_bands; + overlap += 9*n_long_bands; + } + if (block_type == SHORT_BLOCK_TYPE) + L3_imdct_short(grbuf, overlap, 32 - n_long_bands); + else + L3_imdct36(grbuf, overlap, g_mdct_window[block_type == STOP_BLOCK_TYPE], 32 - n_long_bands); +} + +static void L3_save_reservoir(mp3dec_t *h, mp3dec_scratch_t *s) +{ + int pos = (s->bs.pos + 7)/8u; + int remains = s->bs.limit/8u - pos; + if (remains > MAX_BITRESERVOIR_BYTES) + { + pos += remains - MAX_BITRESERVOIR_BYTES; + remains = MAX_BITRESERVOIR_BYTES; + } + if (remains > 0) + { + memmove(h->reserv_buf, s->maindata + pos, remains); + } + h->reserv = remains; +} + +static int L3_restore_reservoir(mp3dec_t *h, bs_t *bs, mp3dec_scratch_t *s, int main_data_begin) +{ + int frame_bytes = (bs->limit - bs->pos)/8; + int bytes_have = MINIMP3_MIN(h->reserv, main_data_begin); + memcpy(s->maindata, h->reserv_buf + MINIMP3_MAX(0, h->reserv - main_data_begin), MINIMP3_MIN(h->reserv, main_data_begin)); + memcpy(s->maindata + bytes_have, bs->buf + bs->pos/8, frame_bytes); + bs_init(&s->bs, s->maindata, bytes_have + frame_bytes); + return h->reserv >= main_data_begin; +} + +static void L3_decode(mp3dec_t *h, mp3dec_scratch_t *s, L3_gr_info_t *gr_info, int nch) +{ + int ch; + + for (ch = 0; ch < nch; ch++) + { + int layer3gr_limit = s->bs.pos + gr_info[ch].part_23_length; + L3_decode_scalefactors(h->header, s->ist_pos[ch], &s->bs, gr_info + ch, s->scf, ch); + L3_huffman(s->grbuf[ch], &s->bs, gr_info + ch, s->scf, layer3gr_limit); + } + + if (HDR_TEST_I_STEREO(h->header)) + { + L3_intensity_stereo(s->grbuf[0], 
s->ist_pos[1], gr_info, h->header); + } else if (HDR_IS_MS_STEREO(h->header)) + { + L3_midside_stereo(s->grbuf[0], 576); + } + + for (ch = 0; ch < nch; ch++, gr_info++) + { + int aa_bands = 31; + int n_long_bands = (gr_info->mixed_block_flag ? 2 : 0) << (int)(HDR_GET_MY_SAMPLE_RATE(h->header) == 2); + + if (gr_info->n_short_sfb) + { + aa_bands = n_long_bands - 1; + L3_reorder(s->grbuf[ch] + n_long_bands*18, s->syn[0], gr_info->sfbtab + gr_info->n_long_sfb); + } + + L3_antialias(s->grbuf[ch], aa_bands); + L3_imdct_gr(s->grbuf[ch], h->mdct_overlap[ch], gr_info->block_type, n_long_bands); + L3_change_sign(s->grbuf[ch]); + } +} + +static void mp3d_DCT_II(float *grbuf, int n) +{ + static const float g_sec[24] = { + 10.19000816f,0.50060302f,0.50241929f,3.40760851f,0.50547093f,0.52249861f,2.05778098f,0.51544732f,0.56694406f,1.48416460f,0.53104258f,0.64682180f,1.16943991f,0.55310392f,0.78815460f,0.97256821f,0.58293498f,1.06067765f,0.83934963f,0.62250412f,1.72244716f,0.74453628f,0.67480832f,5.10114861f + }; + int i, k = 0; +#if HAVE_SIMD + if (have_simd()) for (; k < n; k += 4) + { + f4 t[4][8], *x; + float *y = grbuf + k; + + for (x = t[0], i = 0; i < 8; i++, x++) + { + f4 x0 = VLD(&y[i*18]); + f4 x1 = VLD(&y[(15 - i)*18]); + f4 x2 = VLD(&y[(16 + i)*18]); + f4 x3 = VLD(&y[(31 - i)*18]); + f4 t0 = VADD(x0, x3); + f4 t1 = VADD(x1, x2); + f4 t2 = VMUL_S(VSUB(x1, x2), g_sec[3*i + 0]); + f4 t3 = VMUL_S(VSUB(x0, x3), g_sec[3*i + 1]); + x[0] = VADD(t0, t1); + x[8] = VMUL_S(VSUB(t0, t1), g_sec[3*i + 2]); + x[16] = VADD(t3, t2); + x[24] = VMUL_S(VSUB(t3, t2), g_sec[3*i + 2]); + } + for (x = t[0], i = 0; i < 4; i++, x += 8) + { + f4 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7], xt; + xt = VSUB(x0, x7); x0 = VADD(x0, x7); + x7 = VSUB(x1, x6); x1 = VADD(x1, x6); + x6 = VSUB(x2, x5); x2 = VADD(x2, x5); + x5 = VSUB(x3, x4); x3 = VADD(x3, x4); + x4 = VSUB(x0, x3); x0 = VADD(x0, x3); + x3 = VSUB(x1, x2); x1 = VADD(x1, x2); + x[0] = VADD(x0, x1); + 
x[4] = VMUL_S(VSUB(x0, x1), 0.70710677f); + x5 = VADD(x5, x6); + x6 = VMUL_S(VADD(x6, x7), 0.70710677f); + x7 = VADD(x7, xt); + x3 = VMUL_S(VADD(x3, x4), 0.70710677f); + x5 = VSUB(x5, VMUL_S(x7, 0.198912367f)); /* rotate by PI/8 */ + x7 = VADD(x7, VMUL_S(x5, 0.382683432f)); + x5 = VSUB(x5, VMUL_S(x7, 0.198912367f)); + x0 = VSUB(xt, x6); xt = VADD(xt, x6); + x[1] = VMUL_S(VADD(xt, x7), 0.50979561f); + x[2] = VMUL_S(VADD(x4, x3), 0.54119611f); + x[3] = VMUL_S(VSUB(x0, x5), 0.60134488f); + x[5] = VMUL_S(VADD(x0, x5), 0.89997619f); + x[6] = VMUL_S(VSUB(x4, x3), 1.30656302f); + x[7] = VMUL_S(VSUB(xt, x7), 2.56291556f); + } + + if (k > n - 3) + { +#if HAVE_SSE +#define VSAVE2(i, v) _mm_storel_pi((__m64 *)(void*)&y[i*18], v) +#else /* HAVE_SSE */ +#define VSAVE2(i, v) vst1_f32((float32_t *)&y[i*18], vget_low_f32(v)) +#endif /* HAVE_SSE */ + for (i = 0; i < 7; i++, y += 4*18) + { + f4 s = VADD(t[3][i], t[3][i + 1]); + VSAVE2(0, t[0][i]); + VSAVE2(1, VADD(t[2][i], s)); + VSAVE2(2, VADD(t[1][i], t[1][i + 1])); + VSAVE2(3, VADD(t[2][1 + i], s)); + } + VSAVE2(0, t[0][7]); + VSAVE2(1, VADD(t[2][7], t[3][7])); + VSAVE2(2, t[1][7]); + VSAVE2(3, t[3][7]); + } else + { +#define VSAVE4(i, v) VSTORE(&y[i*18], v) + for (i = 0; i < 7; i++, y += 4*18) + { + f4 s = VADD(t[3][i], t[3][i + 1]); + VSAVE4(0, t[0][i]); + VSAVE4(1, VADD(t[2][i], s)); + VSAVE4(2, VADD(t[1][i], t[1][i + 1])); + VSAVE4(3, VADD(t[2][1 + i], s)); + } + VSAVE4(0, t[0][7]); + VSAVE4(1, VADD(t[2][7], t[3][7])); + VSAVE4(2, t[1][7]); + VSAVE4(3, t[3][7]); + } + } else +#endif /* HAVE_SIMD */ +#ifdef MINIMP3_ONLY_SIMD + {} /* for HAVE_SIMD=1, MINIMP3_ONLY_SIMD=1 case we do not need non-intrinsic "else" branch */ +#else /* MINIMP3_ONLY_SIMD */ + for (; k < n; k++) + { + float t[4][8], *x, *y = grbuf + k; + + for (x = t[0], i = 0; i < 8; i++, x++) + { + float x0 = y[i*18]; + float x1 = y[(15 - i)*18]; + float x2 = y[(16 + i)*18]; + float x3 = y[(31 - i)*18]; + float t0 = x0 + x3; + float t1 = x1 + x2; + float t2 = (x1 - 
x2)*g_sec[3*i + 0]; + float t3 = (x0 - x3)*g_sec[3*i + 1]; + x[0] = t0 + t1; + x[8] = (t0 - t1)*g_sec[3*i + 2]; + x[16] = t3 + t2; + x[24] = (t3 - t2)*g_sec[3*i + 2]; + } + for (x = t[0], i = 0; i < 4; i++, x += 8) + { + float x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7], xt; + xt = x0 - x7; x0 += x7; + x7 = x1 - x6; x1 += x6; + x6 = x2 - x5; x2 += x5; + x5 = x3 - x4; x3 += x4; + x4 = x0 - x3; x0 += x3; + x3 = x1 - x2; x1 += x2; + x[0] = x0 + x1; + x[4] = (x0 - x1)*0.70710677f; + x5 = x5 + x6; + x6 = (x6 + x7)*0.70710677f; + x7 = x7 + xt; + x3 = (x3 + x4)*0.70710677f; + x5 -= x7*0.198912367f; /* rotate by PI/8 */ + x7 += x5*0.382683432f; + x5 -= x7*0.198912367f; + x0 = xt - x6; xt += x6; + x[1] = (xt + x7)*0.50979561f; + x[2] = (x4 + x3)*0.54119611f; + x[3] = (x0 - x5)*0.60134488f; + x[5] = (x0 + x5)*0.89997619f; + x[6] = (x4 - x3)*1.30656302f; + x[7] = (xt - x7)*2.56291556f; + + } + for (i = 0; i < 7; i++, y += 4*18) + { + y[0*18] = t[0][i]; + y[1*18] = t[2][i] + t[3][i] + t[3][i + 1]; + y[2*18] = t[1][i] + t[1][i + 1]; + y[3*18] = t[2][i + 1] + t[3][i] + t[3][i + 1]; + } + y[0*18] = t[0][7]; + y[1*18] = t[2][7] + t[3][7]; + y[2*18] = t[1][7]; + y[3*18] = t[3][7]; + } +#endif /* MINIMP3_ONLY_SIMD */ +} + +#ifndef MINIMP3_FLOAT_OUTPUT +static int16_t mp3d_scale_pcm(float sample) +{ +#if HAVE_ARMV6 + int32_t s32 = (int32_t)(sample + .5f); + s32 -= (s32 < 0); + int16_t s = (int16_t)minimp3_clip_int16_arm(s32); +#else + if (sample >= 32766.5) return (int16_t) 32767; + if (sample <= -32767.5) return (int16_t)-32768; + int16_t s = (int16_t)(sample + .5f); + s -= (s < 0); /* away from zero, to be compliant */ +#endif + return s; +} +#else /* MINIMP3_FLOAT_OUTPUT */ +static float mp3d_scale_pcm(float sample) +{ + return sample*(1.f/32768.f); +} +#endif /* MINIMP3_FLOAT_OUTPUT */ + +static void mp3d_synth_pair(mp3d_sample_t *pcm, int nch, const float *z) +{ + float a; + a = (z[14*64] - z[ 0]) * 29; + a += (z[ 1*64] + z[13*64]) * 
213; + a += (z[12*64] - z[ 2*64]) * 459; + a += (z[ 3*64] + z[11*64]) * 2037; + a += (z[10*64] - z[ 4*64]) * 5153; + a += (z[ 5*64] + z[ 9*64]) * 6574; + a += (z[ 8*64] - z[ 6*64]) * 37489; + a += z[ 7*64] * 75038; + pcm[0] = mp3d_scale_pcm(a); + + z += 2; + a = z[14*64] * 104; + a += z[12*64] * 1567; + a += z[10*64] * 9727; + a += z[ 8*64] * 64019; + a += z[ 6*64] * -9975; + a += z[ 4*64] * -45; + a += z[ 2*64] * 146; + a += z[ 0*64] * -5; + pcm[16*nch] = mp3d_scale_pcm(a); +} + +static void mp3d_synth(float *xl, mp3d_sample_t *dstl, int nch, float *lins) +{ + int i; + float *xr = xl + 576*(nch - 1); + mp3d_sample_t *dstr = dstl + (nch - 1); + + static const float g_win[] = { + -1,26,-31,208,218,401,-519,2063,2000,4788,-5517,7134,5959,35640,-39336,74992, + -1,24,-35,202,222,347,-581,2080,1952,4425,-5879,7640,5288,33791,-41176,74856, + -1,21,-38,196,225,294,-645,2087,1893,4063,-6237,8092,4561,31947,-43006,74630, + -1,19,-41,190,227,244,-711,2085,1822,3705,-6589,8492,3776,30112,-44821,74313, + -1,17,-45,183,228,197,-779,2075,1739,3351,-6935,8840,2935,28289,-46617,73908, + -1,16,-49,176,228,153,-848,2057,1644,3004,-7271,9139,2037,26482,-48390,73415, + -2,14,-53,169,227,111,-919,2032,1535,2663,-7597,9389,1082,24694,-50137,72835, + -2,13,-58,161,224,72,-991,2001,1414,2330,-7910,9592,70,22929,-51853,72169, + -2,11,-63,154,221,36,-1064,1962,1280,2006,-8209,9750,-998,21189,-53534,71420, + -2,10,-68,147,215,2,-1137,1919,1131,1692,-8491,9863,-2122,19478,-55178,70590, + -3,9,-73,139,208,-29,-1210,1870,970,1388,-8755,9935,-3300,17799,-56778,69679, + -3,8,-79,132,200,-57,-1283,1817,794,1095,-8998,9966,-4533,16155,-58333,68692, + -4,7,-85,125,189,-83,-1356,1759,605,814,-9219,9959,-5818,14548,-59838,67629, + -4,7,-91,117,177,-106,-1428,1698,402,545,-9416,9916,-7154,12980,-61289,66494, + -5,6,-97,111,163,-127,-1498,1634,185,288,-9585,9838,-8540,11455,-62684,65290 + }; + float *zlin = lins + 15*64; + const float *w = g_win; + + zlin[4*15] = xl[18*16]; + zlin[4*15 + 1] = xr[18*16]; 
+ zlin[4*15 + 2] = xl[0]; + zlin[4*15 + 3] = xr[0]; + + zlin[4*31] = xl[1 + 18*16]; + zlin[4*31 + 1] = xr[1 + 18*16]; + zlin[4*31 + 2] = xl[1]; + zlin[4*31 + 3] = xr[1]; + + mp3d_synth_pair(dstr, nch, lins + 4*15 + 1); + mp3d_synth_pair(dstr + 32*nch, nch, lins + 4*15 + 64 + 1); + mp3d_synth_pair(dstl, nch, lins + 4*15); + mp3d_synth_pair(dstl + 32*nch, nch, lins + 4*15 + 64); + +#if HAVE_SIMD + if (have_simd()) for (i = 14; i >= 0; i--) + { +#define VLOAD(k) f4 w0 = VSET(*w++); f4 w1 = VSET(*w++); f4 vz = VLD(&zlin[4*i - 64*k]); f4 vy = VLD(&zlin[4*i - 64*(15 - k)]); +#define V0(k) { VLOAD(k) b = VADD(VMUL(vz, w1), VMUL(vy, w0)) ; a = VSUB(VMUL(vz, w0), VMUL(vy, w1)); } +#define V1(k) { VLOAD(k) b = VADD(b, VADD(VMUL(vz, w1), VMUL(vy, w0))); a = VADD(a, VSUB(VMUL(vz, w0), VMUL(vy, w1))); } +#define V2(k) { VLOAD(k) b = VADD(b, VADD(VMUL(vz, w1), VMUL(vy, w0))); a = VADD(a, VSUB(VMUL(vy, w1), VMUL(vz, w0))); } + f4 a, b; + zlin[4*i] = xl[18*(31 - i)]; + zlin[4*i + 1] = xr[18*(31 - i)]; + zlin[4*i + 2] = xl[1 + 18*(31 - i)]; + zlin[4*i + 3] = xr[1 + 18*(31 - i)]; + zlin[4*i + 64] = xl[1 + 18*(1 + i)]; + zlin[4*i + 64 + 1] = xr[1 + 18*(1 + i)]; + zlin[4*i - 64 + 2] = xl[18*(1 + i)]; + zlin[4*i - 64 + 3] = xr[18*(1 + i)]; + + V0(0) V2(1) V1(2) V2(3) V1(4) V2(5) V1(6) V2(7) + + { +#ifndef MINIMP3_FLOAT_OUTPUT +#if HAVE_SSE + static const f4 g_max = { 32767.0f, 32767.0f, 32767.0f, 32767.0f }; + static const f4 g_min = { -32768.0f, -32768.0f, -32768.0f, -32768.0f }; + __m128i pcm8 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(a, g_max), g_min)), + _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(b, g_max), g_min))); + dstr[(15 - i)*nch] = _mm_extract_epi16(pcm8, 1); + dstr[(17 + i)*nch] = _mm_extract_epi16(pcm8, 5); + dstl[(15 - i)*nch] = _mm_extract_epi16(pcm8, 0); + dstl[(17 + i)*nch] = _mm_extract_epi16(pcm8, 4); + dstr[(47 - i)*nch] = _mm_extract_epi16(pcm8, 3); + dstr[(49 + i)*nch] = _mm_extract_epi16(pcm8, 7); + dstl[(47 - i)*nch] = _mm_extract_epi16(pcm8, 2); + 
dstl[(49 + i)*nch] = _mm_extract_epi16(pcm8, 6); +#else /* HAVE_SSE */ + int16x4_t pcma, pcmb; + a = VADD(a, VSET(0.5f)); + b = VADD(b, VSET(0.5f)); + pcma = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(a), vreinterpretq_s32_u32(vcltq_f32(a, VSET(0))))); + pcmb = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(b), vreinterpretq_s32_u32(vcltq_f32(b, VSET(0))))); + vst1_lane_s16(dstr + (15 - i)*nch, pcma, 1); + vst1_lane_s16(dstr + (17 + i)*nch, pcmb, 1); + vst1_lane_s16(dstl + (15 - i)*nch, pcma, 0); + vst1_lane_s16(dstl + (17 + i)*nch, pcmb, 0); + vst1_lane_s16(dstr + (47 - i)*nch, pcma, 3); + vst1_lane_s16(dstr + (49 + i)*nch, pcmb, 3); + vst1_lane_s16(dstl + (47 - i)*nch, pcma, 2); + vst1_lane_s16(dstl + (49 + i)*nch, pcmb, 2); +#endif /* HAVE_SSE */ + +#else /* MINIMP3_FLOAT_OUTPUT */ + + static const f4 g_scale = { 1.0f/32768.0f, 1.0f/32768.0f, 1.0f/32768.0f, 1.0f/32768.0f }; + a = VMUL(a, g_scale); + b = VMUL(b, g_scale); +#if HAVE_SSE + _mm_store_ss(dstr + (15 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); + _mm_store_ss(dstr + (17 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); + _mm_store_ss(dstl + (15 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0))); + _mm_store_ss(dstl + (17 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 0, 0, 0))); + _mm_store_ss(dstr + (47 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3))); + _mm_store_ss(dstr + (49 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 3, 3))); + _mm_store_ss(dstl + (47 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2))); + _mm_store_ss(dstl + (49 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 2, 2))); +#else /* HAVE_SSE */ + vst1q_lane_f32(dstr + (15 - i)*nch, a, 1); + vst1q_lane_f32(dstr + (17 + i)*nch, b, 1); + vst1q_lane_f32(dstl + (15 - i)*nch, a, 0); + vst1q_lane_f32(dstl + (17 + i)*nch, b, 0); + vst1q_lane_f32(dstr + (47 - i)*nch, a, 3); + vst1q_lane_f32(dstr + (49 + i)*nch, b, 3); + vst1q_lane_f32(dstl + (47 - i)*nch, a, 2); + vst1q_lane_f32(dstl + (49 + i)*nch, b, 2); +#endif /* 
HAVE_SSE */ +#endif /* MINIMP3_FLOAT_OUTPUT */ + } + } else +#endif /* HAVE_SIMD */ +#ifdef MINIMP3_ONLY_SIMD + {} /* for HAVE_SIMD=1, MINIMP3_ONLY_SIMD=1 case we do not need non-intrinsic "else" branch */ +#else /* MINIMP3_ONLY_SIMD */ + for (i = 14; i >= 0; i--) + { +#define LOAD(k) float w0 = *w++; float w1 = *w++; float *vz = &zlin[4*i - k*64]; float *vy = &zlin[4*i - (15 - k)*64]; +#define S0(k) { int j; LOAD(k); for (j = 0; j < 4; j++) b[j] = vz[j]*w1 + vy[j]*w0, a[j] = vz[j]*w0 - vy[j]*w1; } +#define S1(k) { int j; LOAD(k); for (j = 0; j < 4; j++) b[j] += vz[j]*w1 + vy[j]*w0, a[j] += vz[j]*w0 - vy[j]*w1; } +#define S2(k) { int j; LOAD(k); for (j = 0; j < 4; j++) b[j] += vz[j]*w1 + vy[j]*w0, a[j] += vy[j]*w1 - vz[j]*w0; } + float a[4], b[4]; + + zlin[4*i] = xl[18*(31 - i)]; + zlin[4*i + 1] = xr[18*(31 - i)]; + zlin[4*i + 2] = xl[1 + 18*(31 - i)]; + zlin[4*i + 3] = xr[1 + 18*(31 - i)]; + zlin[4*(i + 16)] = xl[1 + 18*(1 + i)]; + zlin[4*(i + 16) + 1] = xr[1 + 18*(1 + i)]; + zlin[4*(i - 16) + 2] = xl[18*(1 + i)]; + zlin[4*(i - 16) + 3] = xr[18*(1 + i)]; + + S0(0) S2(1) S1(2) S2(3) S1(4) S2(5) S1(6) S2(7) + + dstr[(15 - i)*nch] = mp3d_scale_pcm(a[1]); + dstr[(17 + i)*nch] = mp3d_scale_pcm(b[1]); + dstl[(15 - i)*nch] = mp3d_scale_pcm(a[0]); + dstl[(17 + i)*nch] = mp3d_scale_pcm(b[0]); + dstr[(47 - i)*nch] = mp3d_scale_pcm(a[3]); + dstr[(49 + i)*nch] = mp3d_scale_pcm(b[3]); + dstl[(47 - i)*nch] = mp3d_scale_pcm(a[2]); + dstl[(49 + i)*nch] = mp3d_scale_pcm(b[2]); + } +#endif /* MINIMP3_ONLY_SIMD */ +} + +static void mp3d_synth_granule(float *qmf_state, float *grbuf, int nbands, int nch, mp3d_sample_t *pcm, float *lins) +{ + int i; + for (i = 0; i < nch; i++) + { + mp3d_DCT_II(grbuf + 576*i, nbands); + } + + memcpy(lins, qmf_state, sizeof(float)*15*64); + + for (i = 0; i < nbands; i += 2) + { + mp3d_synth(grbuf + i, pcm + 32*nch*i, nch, lins + i*64); + } +#ifndef MINIMP3_NONSTANDARD_BUT_LOGICAL + if (nch == 1) + { + for (i = 0; i < 15*64; i += 2) + { + qmf_state[i] = 
lins[nbands*64 + i]; + } + } else +#endif /* MINIMP3_NONSTANDARD_BUT_LOGICAL */ + { + memcpy(qmf_state, lins + nbands*64, sizeof(float)*15*64); + } +} + +static int mp3d_match_frame(const uint8_t *hdr, int mp3_bytes, int frame_bytes) +{ + int i, nmatch; + for (i = 0, nmatch = 0; nmatch < MAX_FRAME_SYNC_MATCHES; nmatch++) + { + i += hdr_frame_bytes(hdr + i, frame_bytes) + hdr_padding(hdr + i); + if (i + HDR_SIZE > mp3_bytes) + return nmatch > 0; + if (!hdr_compare(hdr, hdr + i)) + return 0; + } + return 1; +} + +static int mp3d_find_frame(const uint8_t *mp3, int mp3_bytes, int *free_format_bytes, int *ptr_frame_bytes) +{ + int i, k; + for (i = 0; i < mp3_bytes - HDR_SIZE; i++, mp3++) + { + if (hdr_valid(mp3)) + { + int frame_bytes = hdr_frame_bytes(mp3, *free_format_bytes); + int frame_and_padding = frame_bytes + hdr_padding(mp3); + + for (k = HDR_SIZE; !frame_bytes && k < MAX_FREE_FORMAT_FRAME_SIZE && i + 2*k < mp3_bytes - HDR_SIZE; k++) + { + if (hdr_compare(mp3, mp3 + k)) + { + int fb = k - hdr_padding(mp3); + int nextfb = fb + hdr_padding(mp3 + k); + if (i + k + nextfb + HDR_SIZE > mp3_bytes || !hdr_compare(mp3, mp3 + k + nextfb)) + continue; + frame_and_padding = k; + frame_bytes = fb; + *free_format_bytes = fb; + } + } + if ((frame_bytes && i + frame_and_padding <= mp3_bytes && + mp3d_match_frame(mp3, mp3_bytes - i, frame_bytes)) || + (!i && frame_and_padding == mp3_bytes)) + { + *ptr_frame_bytes = frame_and_padding; + return i; + } + *free_format_bytes = 0; + } + } + *ptr_frame_bytes = 0; + return mp3_bytes; +} + +void mp3dec_init(mp3dec_t *dec) +{ + dec->header[0] = 0; +} + +int mp3dec_decode_frame(mp3dec_t *dec, const uint8_t *mp3, int mp3_bytes, mp3d_sample_t *pcm, mp3dec_frame_info_t *info) +{ + int i = 0, igr, frame_size = 0, success = 1; + const uint8_t *hdr; + bs_t bs_frame[1]; + mp3dec_scratch_t scratch; + + if (mp3_bytes > 4 && dec->header[0] == 0xff && hdr_compare(dec->header, mp3)) + { + frame_size = hdr_frame_bytes(mp3, dec->free_format_bytes) + 
hdr_padding(mp3); + if (frame_size != mp3_bytes && (frame_size + HDR_SIZE > mp3_bytes || !hdr_compare(mp3, mp3 + frame_size))) + { + frame_size = 0; + } + } + if (!frame_size) + { + memset(dec, 0, sizeof(mp3dec_t)); + i = mp3d_find_frame(mp3, mp3_bytes, &dec->free_format_bytes, &frame_size); + if (!frame_size || i + frame_size > mp3_bytes) + { + info->frame_bytes = i; + return 0; + } + } + + hdr = mp3 + i; + memcpy(dec->header, hdr, HDR_SIZE); + info->frame_bytes = i + frame_size; + info->frame_offset = i; + info->channels = HDR_IS_MONO(hdr) ? 1 : 2; + info->hz = hdr_sample_rate_hz(hdr); + info->layer = 4 - HDR_GET_LAYER(hdr); + info->bitrate_kbps = hdr_bitrate_kbps(hdr); + + if (!pcm) + { + return hdr_frame_samples(hdr); + } + + bs_init(bs_frame, hdr + HDR_SIZE, frame_size - HDR_SIZE); + if (HDR_IS_CRC(hdr)) + { + get_bits(bs_frame, 16); + } + + if (info->layer == 3) + { + int main_data_begin = L3_read_side_info(bs_frame, scratch.gr_info, hdr); + if (main_data_begin < 0 || bs_frame->pos > bs_frame->limit) + { + mp3dec_init(dec); + return 0; + } + success = L3_restore_reservoir(dec, bs_frame, &scratch, main_data_begin); + if (success) + { + for (igr = 0; igr < (HDR_TEST_MPEG1(hdr) ? 
2 : 1); igr++, pcm += 576*info->channels) + { + memset(scratch.grbuf[0], 0, 576*2*sizeof(float)); + L3_decode(dec, &scratch, scratch.gr_info + igr*info->channels, info->channels); + mp3d_synth_granule(dec->qmf_state, scratch.grbuf[0], 18, info->channels, pcm, scratch.syn[0]); + } + } + L3_save_reservoir(dec, &scratch); + } else + { +#ifdef MINIMP3_ONLY_MP3 + return 0; +#else /* MINIMP3_ONLY_MP3 */ + L12_scale_info sci[1]; + L12_read_scale_info(hdr, bs_frame, sci); + + memset(scratch.grbuf[0], 0, 576*2*sizeof(float)); + for (i = 0, igr = 0; igr < 3; igr++) + { + if (12 == (i += L12_dequantize_granule(scratch.grbuf[0] + i, bs_frame, sci, info->layer | 1))) + { + i = 0; + L12_apply_scf_384(sci, sci->scf + igr, scratch.grbuf[0]); + mp3d_synth_granule(dec->qmf_state, scratch.grbuf[0], 12, info->channels, pcm, scratch.syn[0]); + memset(scratch.grbuf[0], 0, 576*2*sizeof(float)); + pcm += 384*info->channels; + } + if (bs_frame->pos > bs_frame->limit) + { + mp3dec_init(dec); + return 0; + } + } +#endif /* MINIMP3_ONLY_MP3 */ + } + return success*hdr_frame_samples(dec->header); +} + +#ifdef MINIMP3_FLOAT_OUTPUT +void mp3dec_f32_to_s16(const float *in, int16_t *out, int num_samples) +{ + int i = 0; +#if HAVE_SIMD + int aligned_count = num_samples & ~7; + for(; i < aligned_count; i += 8) + { + static const f4 g_scale = { 32768.0f, 32768.0f, 32768.0f, 32768.0f }; + f4 a = VMUL(VLD(&in[i ]), g_scale); + f4 b = VMUL(VLD(&in[i+4]), g_scale); +#if HAVE_SSE + static const f4 g_max = { 32767.0f, 32767.0f, 32767.0f, 32767.0f }; + static const f4 g_min = { -32768.0f, -32768.0f, -32768.0f, -32768.0f }; + __m128i pcm8 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(a, g_max), g_min)), + _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(b, g_max), g_min))); + out[i ] = _mm_extract_epi16(pcm8, 0); + out[i+1] = _mm_extract_epi16(pcm8, 1); + out[i+2] = _mm_extract_epi16(pcm8, 2); + out[i+3] = _mm_extract_epi16(pcm8, 3); + out[i+4] = _mm_extract_epi16(pcm8, 4); + out[i+5] = 
_mm_extract_epi16(pcm8, 5); + out[i+6] = _mm_extract_epi16(pcm8, 6); + out[i+7] = _mm_extract_epi16(pcm8, 7); +#else /* HAVE_SSE */ + int16x4_t pcma, pcmb; + a = VADD(a, VSET(0.5f)); + b = VADD(b, VSET(0.5f)); + pcma = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(a), vreinterpretq_s32_u32(vcltq_f32(a, VSET(0))))); + pcmb = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(b), vreinterpretq_s32_u32(vcltq_f32(b, VSET(0))))); + vst1_lane_s16(out+i , pcma, 0); + vst1_lane_s16(out+i+1, pcma, 1); + vst1_lane_s16(out+i+2, pcma, 2); + vst1_lane_s16(out+i+3, pcma, 3); + vst1_lane_s16(out+i+4, pcmb, 0); + vst1_lane_s16(out+i+5, pcmb, 1); + vst1_lane_s16(out+i+6, pcmb, 2); + vst1_lane_s16(out+i+7, pcmb, 3); +#endif /* HAVE_SSE */ + } +#endif /* HAVE_SIMD */ + for(; i < num_samples; i++) + { + float sample = in[i] * 32768.0f; + if (sample >= 32766.5) + out[i] = (int16_t) 32767; + else if (sample <= -32767.5) + out[i] = (int16_t)-32768; + else + { + int16_t s = (int16_t)(sample + .5f); + s -= (s < 0); /* away from zero, to be compliant */ + out[i] = s; + } + } +} +#endif /* MINIMP3_FLOAT_OUTPUT */ +#endif /* MINIMP3_IMPLEMENTATION && !_MINIMP3_IMPLEMENTATION_GUARD */ diff --git a/vae.h b/vae.h index 58fd558..4713581 100644 --- a/vae.h +++ b/vae.h @@ -556,3 +556,183 @@ static void vae_ggml_free(VAEGGML * m) { if (m->cpu_backend) ggml_backend_free(m->cpu_backend); *m = {}; } + +// --------------------------------------------------------------------------- +// VAE Encoder (audio -> 64-d latents @ 25Hz for reference timbre) +// Oobleck encoder: conv1(2->128) -> 5 blocks (stride 2,4,4,8,8) -> snake -> conv2(2048->128) +// Output 128 = mean(64) + scale(64); we use mean only. +// Requires encoder.* tensors in the same VAE GGUF (full autoencoder export). 
+// --------------------------------------------------------------------------- +struct VAEEncoderBlock { + VAEResUnit ru[3]; + struct ggml_tensor * sa, * sb; + struct ggml_tensor * c1w, * c1b; + int in_ch, out_ch, stride; +}; + +struct VAEEncoderGGML { + struct ggml_tensor * c1w, * c1b; + VAEEncoderBlock blk[5]; + struct ggml_tensor * sa, * sb; + struct ggml_tensor * c2w, * c2b; + + ggml_backend_t backend; + ggml_backend_t cpu_backend; + ggml_backend_sched_t sched; + ggml_backend_buffer_t buf; + struct ggml_context * weight_ctx; + bool has_encoder; +}; + +static bool vae_encoder_load(VAEEncoderGGML * m, const char * path) { + m->has_encoder = false; + GGUFModel gf = {}; + if (!gf_load(&gf, path)) { + fprintf(stderr, "[VAE Encoder] cannot load %s\n", path); + return false; + } + if (!ggml_get_tensor(gf.meta, "encoder.conv1.weight_v")) { + gf_close(&gf); + fprintf(stderr, "[VAE Encoder] no encoder.* in %s (decoder-only GGUF). Use a full VAE GGUF for reference_audio WAV.\n", path); + return false; + } + + static const int enc_strides[] = {2, 4, 4, 8, 8}; + static const int enc_in_ch[] = {128, 256, 512, 1024, 2048}; + static const int enc_out_ch[] = {256, 512, 1024, 2048, 2048}; + static const int dilations[] = {1, 3, 9}; + + size_t ctx_size = ggml_tensor_overhead() * 200; + struct ggml_init_params p = { ctx_size, NULL, true }; + m->weight_ctx = ggml_init(p); + struct ggml_context * ctx = m->weight_ctx; + + m->c1w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 7, 2, 128); + m->c1b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128); + + for (int i = 0; i < 5; i++) { + VAEEncoderBlock & b = m->blk[i]; + b.in_ch = enc_in_ch[i]; + b.out_ch = enc_out_ch[i]; + b.stride = enc_strides[i]; + int C = b.in_ch; + b.sa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + b.sb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + b.c1w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 2 * b.stride, C, b.out_ch); + b.c1b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, b.out_ch); + for (int r = 0; r < 3; r++) { + 
VAEResUnit & ru = b.ru[r]; + ru.dilation = dilations[r]; + ru.s1a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.s1b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.c1w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 7, C, C); + ru.c1b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, C); + ru.s2a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.s2b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.c2w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 1, C, C); + ru.c2b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, C); + } + } + m->sa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 2048); + m->sb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 2048); + m->c2w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 3, 2048, 128); + m->c2b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128); + + BackendPair bp = backend_init("VAE-Encoder"); + m->backend = bp.backend; + m->cpu_backend = bp.cpu_backend; + m->sched = backend_sched_new(bp, 8192); + m->buf = ggml_backend_alloc_ctx_tensors(ctx, m->backend); + + vae_fuse_wn(m->c1w, gf, "encoder.conv1"); + vae_load_bias(m->c1b, gf, "encoder.conv1.bias"); + + for (int i = 0; i < 5; i++) { + VAEEncoderBlock & b = m->blk[i]; + std::string pfx = "encoder.block." 
+ std::to_string(i); + for (int r = 0; r < 3; r++) { + std::string rp = pfx + ".res_unit" + std::to_string(r + 1); + vae_load_snake(b.ru[r].s1a, gf, rp + ".snake1.alpha"); + vae_load_snake_inv(b.ru[r].s1b, gf, rp + ".snake1.beta"); + vae_fuse_wn(b.ru[r].c1w, gf, rp + ".conv1"); + vae_load_bias(b.ru[r].c1b, gf, rp + ".conv1.bias"); + vae_load_snake(b.ru[r].s2a, gf, rp + ".snake2.alpha"); + vae_load_snake_inv(b.ru[r].s2b, gf, rp + ".snake2.beta"); + vae_fuse_wn(b.ru[r].c2w, gf, rp + ".conv2"); + vae_load_bias(b.ru[r].c2b, gf, rp + ".conv2.bias"); + } + vae_load_snake(b.sa, gf, pfx + ".snake1.alpha"); + vae_load_snake_inv(b.sb, gf, pfx + ".snake1.beta"); + vae_fuse_wn(b.c1w, gf, pfx + ".conv1"); + vae_load_bias(b.c1b, gf, pfx + ".conv1.bias"); + } + vae_load_snake(m->sa, gf, "encoder.snake1.alpha"); + vae_load_snake_inv(m->sb, gf, "encoder.snake1.beta"); + vae_fuse_wn(m->c2w, gf, "encoder.conv2"); + vae_load_bias(m->c2b, gf, "encoder.conv2.bias"); + + gf_close(&gf); + m->has_encoder = true; + fprintf(stderr, "[VAE Encoder] loaded (2->128->...->2048->128, 64-d mean)\n"); + return true; +} + +static int vae_encoder_forward(VAEEncoderGGML * m, const float * audio, int T_audio, + float * latent_out) { + if (!m->has_encoder || T_audio < 1920) return -1; + int T_latent = T_audio / 1920; + + ggml_backend_sched_reset(m->sched); + size_t ctx_size = 4096 * ggml_tensor_overhead() + ggml_graph_overhead(); + struct ggml_init_params gp = { ctx_size, NULL, true }; + struct ggml_context * ctx = ggml_init(gp); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 4096, false); + + struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, T_audio, 2); + ggml_set_name(x, "audio_in"); + ggml_set_input(x); + x = vae_conv1d(ctx, m->c1w, m->c1b, x, 1, 3, 1); + + for (int i = 0; i < 5; i++) { + VAEEncoderBlock & b = m->blk[i]; + for (int r = 0; r < 3; r++) + x = vae_res_unit(ctx, &b.ru[r], x); + x = vae_snake(ctx, x, b.sa, b.sb); + int pad = (int)((float)b.stride / 2.0f + 0.5f); + x = 
vae_conv1d(ctx, b.c1w, b.c1b, x, b.stride, pad, 1); + } + + x = vae_snake(ctx, x, m->sa, m->sb); + x = vae_conv1d(ctx, m->c2w, m->c2b, x, 1, 1, 1); + ggml_set_name(x, "enc_out"); + ggml_set_output(x); + + ggml_build_forward_expand(gf, x); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + ggml_free(ctx); + return -1; + } + ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "audio_in"), audio, 0, (size_t)T_audio * 2 * sizeof(float)); + ggml_backend_sched_graph_compute(m->sched, gf); + ggml_backend_sched_synchronize(m->sched); + + struct ggml_tensor * out = ggml_graph_get_tensor(gf, "enc_out"); + std::vector tmp(128 * T_latent); + ggml_backend_tensor_get(out, tmp.data(), 0, 128 * (size_t)T_latent * sizeof(float)); + for (int t = 0; t < T_latent; t++) + for (int c = 0; c < 64; c++) + latent_out[t * 64 + c] = tmp[t * 128 + c]; + + ggml_backend_sched_reset(m->sched); + ggml_free(ctx); + return T_latent; +} + +static void vae_encoder_free(VAEEncoderGGML * m) { + if (m->sched) ggml_backend_sched_free(m->sched); + if (m->buf) ggml_backend_buffer_free(m->buf); + if (m->weight_ctx) ggml_free(m->weight_ctx); + if (m->backend && m->backend != m->cpu_backend) ggml_backend_free(m->backend); + if (m->cpu_backend) ggml_backend_free(m->cpu_backend); + *m = {}; +} diff --git a/wav.h b/wav.h new file mode 100644 index 0000000..ded473d --- /dev/null +++ b/wav.h @@ -0,0 +1,100 @@ +// wav.h: minimal WAV loader for reference audio (stereo 48kHz float out) +// No Python or external deps. Handles 16-bit PCM, mono/stereo, resamples to 48kHz if needed. + +#pragma once + +#include +#include +#include +#include + +// Load WAV file into stereo float32 at 48kHz. +// Out: interleaved L,R,L,R,... length = num_samples (both channels). +// Returns num_samples (per channel), or -1 on error. 
+static int wav_load_48k_stereo(const char * path, std::vector * out) { + FILE * f = fopen(path, "rb"); + if (!f) return -1; + + char riff[4], fmt[4]; + if (fread(riff, 1, 4, f) != 4 || memcmp(riff, "RIFF", 4) != 0) { + fclose(f); + return -1; + } + uint32_t file_len; + if (fread(&file_len, 4, 1, f) != 1) { fclose(f); return -1; } + if (fread(fmt, 1, 4, f) != 4 || memcmp(fmt, "WAVE", 4) != 0) { + fclose(f); + return -1; + } + + uint16_t channels = 2, bits = 16; + uint32_t sample_rate = 48000; + bool found_fmt = false; + + while (1) { + char chunk_id[4]; + if (fread(chunk_id, 1, 4, f) != 4) break; + uint32_t chunk_size; + if (fread(&chunk_size, 4, 1, f) != 1) break; + long chunk_start = ftell(f); + + if (memcmp(chunk_id, "fmt ", 4) == 0 && chunk_size >= 16) { + uint16_t fmt_tag, block_align; + uint32_t byte_rate; + if (fread(&fmt_tag, 2, 1, f) != 1) break; + if (fread(&channels, 2, 1, f) != 1) break; + if (fread(&sample_rate, 4, 1, f) != 1) break; + if (fread(&byte_rate, 4, 1, f) != 1) break; + if (fread(&block_align, 2, 1, f) != 1) break; + if (fread(&bits, 2, 1, f) != 1) break; + found_fmt = true; + } else if (memcmp(chunk_id, "data", 4) == 0 && found_fmt) { + size_t num_bytes = chunk_size; + size_t num_samples = num_bytes / (channels * (bits / 8)); + if (num_samples == 0) { fclose(f); return -1; } + + std::vector raw(num_samples * channels); + if (fread(raw.data(), 2, raw.size(), f) != raw.size()) { + fclose(f); + return -1; + } + + out->resize(num_samples * 2); + float scale = 1.0f / 32768.0f; + if (channels == 1) { + for (size_t i = 0; i < num_samples; i++) { + float s = (float)raw[i] * scale; + (*out)[i * 2] = s; + (*out)[i * 2 + 1] = s; + } + } else { + for (size_t i = 0; i < num_samples * 2; i++) + (*out)[i] = (float)raw[i] * scale; + } + + fclose(f); + + // Resample to 48kHz if needed (linear interpolation) + if (sample_rate != 48000) { + size_t in_len = num_samples; + size_t out_len = (size_t)((double)in_len * 48000.0 / (double)sample_rate); + std::vector 
resampled(out_len * 2); + for (size_t i = 0; i < out_len; i++) { + double t = (double)i * (double)in_len / (double)out_len; + size_t i0 = (size_t)t; + size_t i1 = std::min(i0 + 1, in_len - 1); + float w = (float)(t - (double)i0); + for (int c = 0; c < 2; c++) + resampled[i * 2 + c] = (*out)[i0 * 2 + c] * (1.0f - w) + (*out)[i1 * 2 + c] * w; + } + *out = std::move(resampled); + return (int)out_len; + } + return (int)num_samples; + } + + fseek(f, chunk_start + (long)chunk_size, SEEK_SET); + } + fclose(f); + return -1; +} From 0d22a861b4c1a0ac1459a5e2d35e323a3db15bd3 Mon Sep 17 00:00:00 2001 From: qxip Date: Sat, 28 Feb 2026 19:33:38 +0100 Subject: [PATCH 02/17] Add cover examples, harden test-generation cache, document examples - examples/cover.json + cover.sh: cover mode (precomputed audio_codes, no LLM) - examples/cover-reference.json + cover-reference.sh: cover + reference_audio (WAV/MP3) - README: list cover and cover-reference in examples - test-generation: cache key with runner.os, restore-keys per GitHub docs Made-with: Cursor --- .github/workflows/test-generation.yml | 7 +++++-- README.md | 2 ++ examples/cover-reference.json | 16 ++++++++++++++++ examples/cover-reference.sh | 20 ++++++++++++++++++++ examples/cover.json | 15 +++++++++++++++ examples/cover.sh | 15 +++++++++++++++ 6 files changed, 73 insertions(+), 2 deletions(-) create mode 100644 examples/cover-reference.json create mode 100755 examples/cover-reference.sh create mode 100644 examples/cover.json create mode 100755 examples/cover.sh diff --git a/.github/workflows/test-generation.yml b/.github/workflows/test-generation.yml index 3c9547c..18afb24 100644 --- a/.github/workflows/test-generation.yml +++ b/.github/workflows/test-generation.yml @@ -26,13 +26,16 @@ jobs: cmake .. -DGGML_BLAS=ON cmake --build . 
--config Release -j$(nproc) + # Restore/save model artifacts (see https://docs.github.com/en/actions/reference/workflows-and-actions/dependency-caching) - name: Cache models id: cache-models uses: actions/cache@v4 with: path: models - key: acestep-models-q8-${{ hashFiles('models.sh') }} - restore-keys: acestep-models-q8- + key: ${{ runner.os }}-acestep-models-q8-${{ hashFiles('models.sh') }} + restore-keys: | + ${{ runner.os }}-acestep-models-q8- + ${{ runner.os }}-acestep-models- - name: Download models if: steps.cache-models.outputs.cache-hit != 'true' diff --git a/README.md b/README.md index 55ac16d..87a0e34 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,8 @@ cd examples ./partial.sh # caption + lyrics + duration ./full.sh # all metadata provided ./dit-only.sh # skip LLM, DiT from noise +./cover.sh # cover mode: decode precomputed audio_codes (no LLM) +./cover-reference.sh # cover + reference_audio for timbre (WAV/MP3; needs reference.wav or .mp3) ./test-reference.sh # reference_audio (WAV or MP3) + audio_cover_strength ``` diff --git a/examples/cover-reference.json b/examples/cover-reference.json new file mode 100644 index 0000000..313d419 --- /dev/null +++ b/examples/cover-reference.json @@ -0,0 +1,16 @@ +{ + "task_type": "cover", + "caption": "Cover with timbre from reference WAV/MP3", + "duration": 10, + "bpm": 83, + "keyscale": "G major", + "timesignature": "4", + "vocal_language": "fr", + "inference_steps": 8, + "guidance_scale": 1, + "shift": 3, + "seed": 42, + "audio_cover_strength": 0.9, + "audio_codes": 
"43316,18426,13366,59455,17783,49303,7423,29855,37158,37157,62317,61455,12847,19583,57031,34656,20254,10770,11416,15905,31413,23339,47091,12198,49531,37355,33090,38645,40707,16324,61436,46095,13941,5287,2239,13975,63815,2757,4862,13571,63495,39,29887,49426,12696,50847,40498,61056,25666,12989,23987,54763,25485,31683,28554,25355,16373,28995,2351,1655,7940,55831,34359,15350,15277,11717,20476,52239,5015,19807,24087,3559,20471,34193,32552,60999,29360,25338,38873,16768,17912,27584,24008,1528,449,25563,52684,53223,42183,37215,12343,39431,26055,28148,57286,38382,28863,7191,58397,18991,7695,30716,36784,12687,8707,25649,33718,3202,23035,10747,26354,63965,16260,11223,45679,14343,8679,49351,52927,2535,19207,46447,49615,12694,21110,46597,60991,27711,49751,54656,30448,33125,13585,29256,5161,42434,11753,39402,60354,21953,39532,14282,52160,34248,16304,4671,14172,5127,25991,15343,8583,61902,16328,31700,48415,28879,11215,52715,25541,11203,7695,63951,33803,40453,17750,28006,8231,40464,3136,51006,23839,18711,18711,18711,18711,3343,3279,2823,16071,3271,2823,2319,55815,40260,16215,12047,16631,26927,21863,20060,10166,51070,39,12099,63440,18418,25271,10792,2128,44166,53750,41263,44247,61287,42303,27614,21997,24879,38799,12648,38341,36833,19408,11769,2979,63979,44239,25559,27591,17626,44087,33796,4901,53176,57399,37180,38024,9216,63485,2005,13656,15914,45576,29194,45624,62332,53237,63988,40332,20486,31367,10951,46207,22231,63479,38877,17262,49335,42045,57388", + "reference_audio": "reference.wav" +} \ No newline at end of file diff --git a/examples/cover-reference.sh b/examples/cover-reference.sh new file mode 100755 index 0000000..2687b09 --- /dev/null +++ b/examples/cover-reference.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Cover mode with reference timbre: audio_codes + reference_audio (WAV or MP3). +# Put a WAV/MP3 at reference.wav (or reference.mp3) or set reference_audio in cover-reference.json. +# Requires VAE GGUF with encoder weights (same as request-reference / test-reference). 
+set -eu +cd "$(dirname "$0")" + +if [ ! -f "reference.wav" ] && [ ! -f "reference.mp3" ]; then + echo "No reference.wav or reference.mp3 found. Copy a file to reference.wav (or .mp3), or set reference_audio in cover-reference.json." + echo "Then run: $0" + exit 1 +fi + +../build/dit-vae \ + --request cover-reference.json \ + --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit ../models/acestep-v15-turbo-Q8_0.gguf \ + --vae ../models/vae-BF16.gguf + +echo "Done. Check cover-reference0.wav" diff --git a/examples/cover.json b/examples/cover.json new file mode 100644 index 0000000..82cbaaa --- /dev/null +++ b/examples/cover.json @@ -0,0 +1,15 @@ +{ + "task_type": "cover", + "caption": "Re-synthesize from precomputed codes (e.g. from a previous ace-qwen3 run)", + "duration": 10, + "bpm": 83, + "keyscale": "G major", + "timesignature": "4", + "vocal_language": "fr", + "inference_steps": 8, + "guidance_scale": 1, + "shift": 3, + "seed": 42, + "audio_cover_strength": 0.9, + "audio_codes": 
"43316,18426,13366,59455,17783,49303,7423,29855,37158,37157,62317,61455,12847,19583,57031,34656,20254,10770,11416,15905,31413,23339,47091,12198,49531,37355,33090,38645,40707,16324,61436,46095,13941,5287,2239,13975,63815,2757,4862,13571,63495,39,29887,49426,12696,50847,40498,61056,25666,12989,23987,54763,25485,31683,28554,25355,16373,28995,2351,1655,7940,55831,34359,15350,15277,11717,20476,52239,5015,19807,24087,3559,20471,34193,32552,60999,29360,25338,38873,16768,17912,27584,24008,1528,449,25563,52684,53223,42183,37215,12343,39431,26055,28148,57286,38382,28863,7191,58397,18991,7695,30716,36784,12687,8707,25649,33718,3202,23035,10747,26354,63965,16260,11223,45679,14343,8679,49351,52927,2535,19207,46447,49615,12694,21110,46597,60991,27711,49751,54656,30448,33125,13585,29256,5161,42434,11753,39402,60354,21953,39532,14282,52160,34248,16304,4671,14172,5127,25991,15343,8583,61902,16328,31700,48415,28879,11215,52715,25541,11203,7695,63951,33803,40453,17750,28006,8231,40464,3136,51006,23839,18711,18711,18711,18711,3343,3279,2823,16071,3271,2823,2319,55815,40260,16215,12047,16631,26927,21863,20060,10166,51070,39,12099,63440,18418,25271,10792,2128,44166,53750,41263,44247,61287,42303,27614,21997,24879,38799,12648,38341,36833,19408,11769,2979,63979,44239,25559,27591,17626,44087,33796,4901,53176,57399,37180,38024,9216,63485,2005,13656,15914,45576,29194,45624,62332,53237,63988,40332,20486,31367,10951,46207,22231,63479,38877,17262,49335,42045,57388" +} \ No newline at end of file diff --git a/examples/cover.sh b/examples/cover.sh new file mode 100755 index 0000000..14d340e --- /dev/null +++ b/examples/cover.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Cover mode: decode precomputed audio_codes to WAV (no LLM). 
+# Use cover.json as-is, or replace audio_codes with output from a previous run: +# ../build/ace-qwen3 --request simple.json --model ../models/acestep-5Hz-lm-4B-Q8_0.gguf +# # then use simple0.json as input, or copy its audio_codes into cover.json +set -eu +cd "$(dirname "$0")" + +../build/dit-vae \ + --request cover.json \ + --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit ../models/acestep-v15-turbo-Q8_0.gguf \ + --vae ../models/vae-BF16.gguf + +echo "Done. Check cover0.wav" From a637e598ae959a8915176113d43c4e84235f5ddf Mon Sep 17 00:00:00 2001 From: qxip Date: Sat, 28 Feb 2026 20:10:34 +0100 Subject: [PATCH 03/17] VAE encoder fix, local test script, full-pipeline output path - vae.h: use encoder output tensor length (out->ne[0]) for read size instead of T_audio/1920 to fix tensor read out of bounds on reference_audio encode - dit-vae: support WAV and MP3 for reference_audio (single load path) - tests/run-generation-tests.sh: local run of same three steps as CI; step 3 copies fixture to request.json so ace-qwen3 writes request0.json - test-generation.yml: same request.json copy for full-pipeline step - README: validate locally first, then CI - .gitignore: request.json, request0.json, tests/fixtures/*0.json Made-with: Cursor --- .github/workflows/test-generation.yml | 6 +- .gitignore | 3 + README.md | 2 +- dit-vae.cpp | 51 ++++++++-------- tests/run-generation-tests.sh | 83 +++++++++++++++++++++++++++ vae.h | 8 ++- 6 files changed, 120 insertions(+), 33 deletions(-) create mode 100755 tests/run-generation-tests.sh diff --git a/.github/workflows/test-generation.yml b/.github/workflows/test-generation.yml index 18afb24..c63b3c0 100644 --- a/.github/workflows/test-generation.yml +++ b/.github/workflows/test-generation.yml @@ -1,5 +1,6 @@ # Build, download models (cached), and run short generation tests for various modes. -# Runs on release (published) or manual trigger only. Uses short duration (5s) and few steps (4). 
+# Validate locally first: from repo root run tests/run-generation-tests.sh (after build + ./models.sh). +# CI runs the same steps. Trigger: release (published) or workflow_dispatch only. name: Test generation on: @@ -63,8 +64,9 @@ jobs: - name: Test full pipeline (LLM + DiT, short) run: | + cp tests/fixtures/ci-text2music.json request.json ./build/ace-qwen3 \ - --request tests/fixtures/ci-text2music.json \ + --request request.json \ --model models/acestep-5Hz-lm-4B-Q8_0.gguf test -f request0.json ./build/dit-vae \ diff --git a/.gitignore b/.gitignore index 0fa15e6..91ddabb 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,9 @@ tests/*/ !tests/fixtures/ !tests/fixtures/*.json +request.json +request0.json +tests/fixtures/*0.json checkpoints/ models/ __pycache__/ diff --git a/README.md b/README.md index 87a0e34..178fe39 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ Builds two binaries: `ace-qwen3` (LLM) and `dit-vae` (DiT + VAE). **CI (GitHub Actions)** - **Build**: on every push/PR, builds on Ubuntu (BLAS) and macOS (Metal); smoke test runs each binary `--help`. -- **Test generation**: on manual trigger or push to `main`; builds, caches models, then runs short (5 s, 4 steps) generation for text2music, cover, and full pipeline (LLM → DiT → WAV). See `.github/workflows/`. +- **Test generation**: on release or manual trigger only; runs the same checks as **local** `tests/run-generation-tests.sh`. Validate locally first (build + `./models.sh`, then `tests/run-generation-tests.sh`), then use CI to confirm. See `.github/workflows/`. 
## Models diff --git a/dit-vae.cpp b/dit-vae.cpp index 41582c9..5f57194 100644 --- a/dit-vae.cpp +++ b/dit-vae.cpp @@ -396,36 +396,33 @@ int main(int argc, char ** argv) { int S_ref_actual = S_ref; if (!req.reference_audio.empty()) { const std::string & ref_path = req.reference_audio; - if (ref_path.size() >= 4 && ref_path.compare(ref_path.size() - 4, 4, ".wav") == 0) { - std::vector wav_stereo; - int n_samples = load_audio_48k_stereo(ref_path.c_str(), &wav_stereo); - if (n_samples > 0 && have_vae) { - VAEEncoderGGML enc = {}; - if (vae_encoder_load(&enc, vae_gguf)) { - int T_audio = n_samples; - if (T_audio >= 1920) { - int T_lat = T_audio / 1920; - std::vector enc_out((size_t)T_lat * 64); - T_lat = vae_encoder_forward(&enc, wav_stereo.data(), T_audio, enc_out.data()); - if (T_lat > 0) { - size_t copy_frames = (size_t)(T_lat < S_ref ? T_lat : S_ref); - memcpy(timbre_feats.data(), enc_out.data(), copy_frames * 64 * sizeof(float)); - if (T_lat < S_ref) - memcpy(timbre_feats.data() + copy_frames * 64, silence_full.data(), - (S_ref - (int)copy_frames) * 64 * sizeof(float)); - S_ref_actual = (int)copy_frames; - if (T_lat > S_ref) S_ref_actual = S_ref; - timbre_ptr = timbre_feats.data(); - fprintf(stderr, "[Timbre] encoded %s -> %d frames (25Hz)\n", ref_path.c_str(), S_ref_actual); - } + std::vector wav_stereo; + int n_samples = load_audio_48k_stereo(ref_path.c_str(), &wav_stereo); + if (n_samples > 0 && have_vae) { + VAEEncoderGGML enc = {}; + if (vae_encoder_load(&enc, vae_gguf)) { + int T_audio = n_samples; + if (T_audio >= 1920) { + std::vector enc_out((size_t)S_ref * 64); + int T_lat = vae_encoder_forward(&enc, wav_stereo.data(), T_audio, enc_out.data()); + if (T_lat > 0) { + size_t copy_frames = (size_t)(T_lat < S_ref ? 
T_lat : S_ref); + memcpy(timbre_feats.data(), enc_out.data(), copy_frames * 64 * sizeof(float)); + if (T_lat < S_ref) + memcpy(timbre_feats.data() + copy_frames * 64, silence_full.data(), + (S_ref - (int)copy_frames) * 64 * sizeof(float)); + S_ref_actual = (int)copy_frames; + if (T_lat > S_ref) S_ref_actual = S_ref; + timbre_ptr = timbre_feats.data(); + fprintf(stderr, "[Timbre] encoded %s -> %d frames (25Hz)\n", ref_path.c_str(), S_ref_actual); } - vae_encoder_free(&enc); } - } else if (n_samples <= 0) { - fprintf(stderr, "[Timbre] WARNING: cannot load WAV %s, using silence\n", ref_path.c_str()); - } else if (!have_vae) { - fprintf(stderr, "[Timbre] WAV requires --vae (with encoder weights); using silence\n"); + vae_encoder_free(&enc); } + } else if (n_samples <= 0) { + fprintf(stderr, "[Timbre] WARNING: cannot load audio %s (use .wav or .mp3), using silence\n", ref_path.c_str()); + } else if (!have_vae) { + fprintf(stderr, "[Timbre] reference_audio requires --vae (with encoder weights); using silence\n"); } } diff --git a/tests/run-generation-tests.sh b/tests/run-generation-tests.sh new file mode 100755 index 0000000..666698c --- /dev/null +++ b/tests/run-generation-tests.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +# Run the same generation tests as the GitHub Action (test-generation.yml). +# Use this to validate locally before pushing. No assumptions: build and models required. +# +# From repo root: +# ./models.sh # once: download Q8_0 + VAE into models/ +# mkdir -p build && cd build && cmake .. && cmake --build . --config Release +# cd .. && tests/run-generation-tests.sh + +set -e +cd "$(dirname "$0")/.." +REPO_ROOT="$PWD" + +# --- Build --- +if [ ! -f build/dit-vae ] || [ ! -f build/ace-qwen3 ]; then + echo "Missing build/dit-vae or build/ace-qwen3. Build first:" + echo " mkdir -p build && cd build && cmake .. && cmake --build . 
--config Release" + exit 1 +fi + +# --- Models --- +TEXT_ENC="models/Qwen3-Embedding-0.6B-Q8_0.gguf" +DIT="models/acestep-v15-turbo-Q8_0.gguf" +VAE="models/vae-BF16.gguf" +LM="models/acestep-5Hz-lm-4B-Q8_0.gguf" +for f in "$TEXT_ENC" "$DIT" "$VAE"; do + if [ ! -f "$f" ]; then + echo "Missing $f. Download models once: ./models.sh" + exit 1 + fi +done + +echo "[1/3] Test mode text2music (short)" +./build/dit-vae \ + --request tests/fixtures/ci-text2music.json \ + --text-encoder "$TEXT_ENC" \ + --dit "$DIT" \ + --vae "$VAE" +if [ ! -f tests/fixtures/ci-text2music0.wav ]; then + echo "FAIL: tests/fixtures/ci-text2music0.wav not created" + exit 1 +fi +echo " text2music WAV OK" + +echo "[2/3] Test mode cover with WAV reference (short)" +./build/dit-vae \ + --request tests/fixtures/ci-cover.json \ + --text-encoder "$TEXT_ENC" \ + --dit "$DIT" \ + --vae "$VAE" +if [ ! -f tests/fixtures/ci-cover0.wav ]; then + echo "FAIL: tests/fixtures/ci-cover0.wav not created" + exit 1 +fi +echo " cover WAV OK" + +echo "[3/3] Test full pipeline (LLM + DiT, short)" +if [ ! -f "$LM" ]; then + echo "Missing $LM; skipping full pipeline. Run ./models.sh to include LM." + exit 1 +fi +# ace-qwen3 names output from input path (e.g. request.json -> request0.json) +cp tests/fixtures/ci-text2music.json request.json +./build/ace-qwen3 \ + --request request.json \ + --model "$LM" +if [ ! -f request0.json ]; then + echo "FAIL: request0.json not created by ace-qwen3" + exit 1 +fi +./build/dit-vae \ + --request request0.json \ + --text-encoder "$TEXT_ENC" \ + --dit "$DIT" \ + --vae "$VAE" +if [ ! -f request00.wav ]; then + echo "FAIL: request00.wav not created" + exit 1 +fi +echo " full pipeline WAV OK" + +echo "" +echo "All generation tests passed locally. Safe to rely on CI for the same checks." 
diff --git a/vae.h b/vae.h index 4713581..92ff718 100644 --- a/vae.h +++ b/vae.h @@ -680,7 +680,6 @@ static bool vae_encoder_load(VAEEncoderGGML * m, const char * path) { static int vae_encoder_forward(VAEEncoderGGML * m, const float * audio, int T_audio, float * latent_out) { if (!m->has_encoder || T_audio < 1920) return -1; - int T_latent = T_audio / 1920; ggml_backend_sched_reset(m->sched); size_t ctx_size = 4096 * ggml_tensor_overhead() + ggml_graph_overhead(); @@ -717,8 +716,11 @@ static int vae_encoder_forward(VAEEncoderGGML * m, const float * audio, int T_au ggml_backend_sched_synchronize(m->sched); struct ggml_tensor * out = ggml_graph_get_tensor(gf, "enc_out"); - std::vector tmp(128 * T_latent); - ggml_backend_tensor_get(out, tmp.data(), 0, 128 * (size_t)T_latent * sizeof(float)); + // Encoder strides 2,4,4,8,8 give T_out != T_audio/1920; use actual output shape to avoid read out of bounds + int T_latent = (int)out->ne[0]; + size_t nbytes = (size_t)T_latent * 128 * sizeof(float); + std::vector tmp((size_t)T_latent * 128); + ggml_backend_tensor_get(out, tmp.data(), 0, nbytes); for (int t = 0; t < T_latent; t++) for (int c = 0; c < 64; c++) latent_out[t * 64 + c] = tmp[t * 128 + c]; From 9f23438ba155bb718f2bc4a38afc880c72b073d4 Mon Sep 17 00:00:00 2001 From: qxip Date: Sat, 28 Feb 2026 20:36:33 +0100 Subject: [PATCH 04/17] CI: run example scripts with short fixtures - Add examples/run-examples-ci.sh to run all 7 example scripts in order - Add short-duration CI fixtures: ci-dit-only, ci-partial, ci-full, ci-request-reference - Replace inline test-generation steps with single step running run-examples-ci.sh Made-with: Cursor --- .github/workflows/test-generation.yml | 39 +++----------------- examples/run-examples-ci.sh | 47 ++++++++++++++++++++++++ tests/fixtures/ci-dit-only.json | 11 ++++++ tests/fixtures/ci-full.json | 11 ++++++ tests/fixtures/ci-partial.json | 8 ++++ tests/fixtures/ci-request-reference.json | 13 +++++++ 6 files changed, 95 insertions(+), 34 
deletions(-) create mode 100755 examples/run-examples-ci.sh create mode 100644 tests/fixtures/ci-dit-only.json create mode 100644 tests/fixtures/ci-full.json create mode 100644 tests/fixtures/ci-partial.json create mode 100644 tests/fixtures/ci-request-reference.json diff --git a/.github/workflows/test-generation.yml b/.github/workflows/test-generation.yml index c63b3c0..36cefd1 100644 --- a/.github/workflows/test-generation.yml +++ b/.github/workflows/test-generation.yml @@ -1,6 +1,6 @@ -# Build, download models (cached), and run short generation tests for various modes. -# Validate locally first: from repo root run tests/run-generation-tests.sh (after build + ./models.sh). -# CI runs the same steps. Trigger: release (published) or workflow_dispatch only. +# Build, download models (cached), and run all example scripts with short CI fixtures. +# Validate locally: from repo root run ./examples/run-examples-ci.sh (after build + ./models.sh). +# Trigger: release (published) or workflow_dispatch only. 
name: Test generation on: @@ -44,34 +44,5 @@ jobs: pip install -q hf ./models.sh - - name: Test mode text2music (short) - run: | - ./build/dit-vae \ - --request tests/fixtures/ci-text2music.json \ - --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \ - --dit models/acestep-v15-turbo-Q8_0.gguf \ - --vae models/vae-BF16.gguf - test -f tests/fixtures/ci-text2music0.wav && echo "text2music WAV OK" - - - name: Test mode cover with WAV reference (short) - run: | - ./build/dit-vae \ - --request tests/fixtures/ci-cover.json \ - --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \ - --dit models/acestep-v15-turbo-Q8_0.gguf \ - --vae models/vae-BF16.gguf - test -f tests/fixtures/ci-cover0.wav && echo "cover WAV OK" - - - name: Test full pipeline (LLM + DiT, short) - run: | - cp tests/fixtures/ci-text2music.json request.json - ./build/ace-qwen3 \ - --request request.json \ - --model models/acestep-5Hz-lm-4B-Q8_0.gguf - test -f request0.json - ./build/dit-vae \ - --request request0.json \ - --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \ - --dit models/acestep-v15-turbo-Q8_0.gguf \ - --vae models/vae-BF16.gguf - test -f request00.wav && echo "full pipeline WAV OK" + - name: Run examples (CI fixtures, short duration) + run: chmod +x examples/run-examples-ci.sh && ./examples/run-examples-ci.sh diff --git a/examples/run-examples-ci.sh b/examples/run-examples-ci.sh new file mode 100755 index 0000000..362f92a --- /dev/null +++ b/examples/run-examples-ci.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Run all example scripts with short-duration CI fixtures (from repo root). +# Prereqs: build/ and models/ present; run after build and ./models.sh. +set -eu +cd "$(dirname "$0")/.." 
+EXAMPLES=examples +cd "$EXAMPLES" + +run() { echo "== $*" && "$@"; } + +# 1) DiT-only (no LLM), 5s +run cp ../tests/fixtures/ci-dit-only.json dit-only.json +run ./dit-only.sh +test -f dit-only0.wav && echo "dit-only OK" + +# 2) Cover from precomputed audio_codes (existing cover.json, 10s) +run ./cover.sh +test -f cover0.wav && echo "cover OK" + +# 3) reference.wav for cover-reference and test-reference +run cp cover0.wav reference.wav + +# 4) Cover + reference timbre +run ./cover-reference.sh +test -f cover-reference0.wav && echo "cover-reference OK" + +# 5) text2music with reference_audio +run cp ../tests/fixtures/ci-request-reference.json request-reference.json +run ./test-reference.sh +test -f request-reference0.wav && echo "test-reference OK" + +# 6) Simple (caption only, LLM fills), 5s +run cp ../tests/fixtures/ci-text2music.json simple.json +run ./simple.sh +test -f simple00.wav && echo "simple OK" + +# 7) Partial (caption + lyrics + duration), 5s +run cp ../tests/fixtures/ci-partial.json partial.json +run ./partial.sh +test -f partial00.wav && echo "partial OK" + +# 8) Full (all metadata), 5s +run cp ../tests/fixtures/ci-full.json full.json +run ./full.sh +test -f full00.wav && echo "full OK" + +echo "All example scripts passed." 
diff --git a/tests/fixtures/ci-dit-only.json b/tests/fixtures/ci-dit-only.json new file mode 100644 index 0000000..0a83cb8 --- /dev/null +++ b/tests/fixtures/ci-dit-only.json @@ -0,0 +1,11 @@ +{ + "caption": "Short CI clip", + "lyrics": "", + "bpm": 90, + "duration": 5, + "keyscale": "C minor", + "timesignature": "4", + "vocal_language": "en", + "inference_steps": 4, + "shift": 3 +} diff --git a/tests/fixtures/ci-full.json b/tests/fixtures/ci-full.json new file mode 100644 index 0000000..3a37bfc --- /dev/null +++ b/tests/fixtures/ci-full.json @@ -0,0 +1,11 @@ +{ + "caption": "Short CI house clip", + "lyrics": "[Intro]\n\n[Verse 1]\nTest\n\n[Outro]\nDone", + "bpm": 120, + "duration": 5, + "keyscale": "C major", + "timesignature": "4", + "vocal_language": "fr", + "inference_steps": 4, + "shift": 3 +} diff --git a/tests/fixtures/ci-partial.json b/tests/fixtures/ci-partial.json new file mode 100644 index 0000000..19ae9db --- /dev/null +++ b/tests/fixtures/ci-partial.json @@ -0,0 +1,8 @@ +{ + "caption": "Short CI hip hop clip", + "lyrics": "[Intro]\nYeah\n\n[Verse 1]\nOne two\n\n[Chorus]\nTest\n\n[Outro]\nDone", + "duration": 5, + "vocal_language": "fr", + "inference_steps": 4, + "shift": 3 +} diff --git a/tests/fixtures/ci-request-reference.json b/tests/fixtures/ci-request-reference.json new file mode 100644 index 0000000..141d77a --- /dev/null +++ b/tests/fixtures/ci-request-reference.json @@ -0,0 +1,13 @@ +{ + "task_type": "text2music", + "caption": "Short CI reference test", + "lyrics": "[Verse]\nTest\n[Chorus]\nRef", + "duration": 5, + "seed": 42, + "inference_steps": 4, + "guidance_scale": 1, + "shift": 3, + "reference_audio": "reference.wav", + "audio_codes": "", + "audio_cover_strength": 1 +} From 14b108c3df1c5ae91463aaf7870bd0d4588d64a3 Mon Sep 17 00:00:00 2001 From: qxip Date: Sat, 28 Feb 2026 20:38:43 +0100 Subject: [PATCH 05/17] CI: upload generated WAVs as artifact for inspection Made-with: Cursor --- .github/workflows/test-generation.yml | 13 +++++++++++++ 
1 file changed, 13 insertions(+) diff --git a/.github/workflows/test-generation.yml b/.github/workflows/test-generation.yml index 36cefd1..2f84eed 100644 --- a/.github/workflows/test-generation.yml +++ b/.github/workflows/test-generation.yml @@ -46,3 +46,16 @@ jobs: - name: Run examples (CI fixtures, short duration) run: chmod +x examples/run-examples-ci.sh && ./examples/run-examples-ci.sh + + - name: Upload generated audio + uses: actions/upload-artifact@v4 + with: + name: generated-audio + path: | + examples/dit-only0.wav + examples/cover0.wav + examples/cover-reference0.wav + examples/request-reference0.wav + examples/simple00.wav + examples/partial00.wav + examples/full00.wav From 6e49bbddde8706312f12f14c0f68481337a2ebf7 Mon Sep 17 00:00:00 2001 From: qxip Date: Sat, 28 Feb 2026 21:21:54 +0100 Subject: [PATCH 06/17] LoRA: adapter loading + example + README - Add safetensors reader and dit_ggml_load_lora (PEFT adapter_model.safetensors) - Apply LoRA at linear layers in DiT (self/cross-attn, MLP) when base weights are separate - CLI: --lora and --lora-scale in dit-vae - Example: examples/lora.sh + lora.json (duckdbot/acestep-lora-cryda) - README: LoRA section, example list, dit-vae options Made-with: Cursor --- CMakeLists.txt | 4 +- README.md | 7 ++ examples/lora.json | 11 +++ examples/lora.sh | 31 +++++++ src/dit-graph.h | 47 +++++++---- src/dit-lora.cpp | 198 +++++++++++++++++++++++++++++++++++++++++++++ src/dit.h | 20 +++++ src/safetensors.h | 107 ++++++++++++++++++++++++ tools/dit-vae.cpp | 23 +++++- 9 files changed, 429 insertions(+), 19 deletions(-) create mode 100644 examples/lora.json create mode 100755 examples/lora.sh create mode 100644 src/dit-lora.cpp create mode 100644 src/safetensors.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 551a072..503145a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,8 +53,8 @@ add_library(acestep-core STATIC link_ggml_backends(acestep-core) target_include_directories(acestep-core PRIVATE 
${CMAKE_CURRENT_SOURCE_DIR}) -# dit-vae: full pipeline (text-enc + cond + dit + vae + wav) -add_executable(dit-vae tools/dit-vae.cpp) +# dit-vae: full pipeline (text-enc + cond + dit + vae + wav) + LoRA support +add_executable(dit-vae tools/dit-vae.cpp src/dit-lora.cpp) target_link_libraries(dit-vae PRIVATE acestep-core) link_ggml_backends(dit-vae) diff --git a/README.md b/README.md index 178fe39..9ad2a2b 100644 --- a/README.md +++ b/README.md @@ -146,11 +146,14 @@ cd examples ./cover.sh # cover mode: decode precomputed audio_codes (no LLM) ./cover-reference.sh # cover + reference_audio for timbre (WAV/MP3; needs reference.wav or .mp3) ./test-reference.sh # reference_audio (WAV or MP3) + audio_cover_strength +./lora.sh # DiT + LoRA adapter (e.g. duckdbot/acestep-lora-cryda; put adapter in examples/lora/) ``` Each example has a `-sft` variant (SFT model, 50 steps, CFG 7.0) alongside the turbo default (8 steps, no CFG). For **reference timbre**, set `reference_audio` to a **WAV or MP3** path; dit-vae loads it (MP3 decoded in memory via header-only minimp3, no temp files), encodes with the VAE encoder (requires a full VAE GGUF that includes encoder weights). +**LoRA adapters**: use `--lora ` and optional `--lora-scale ` with dit-vae to run the DiT with a PEFT-style LoRA (e.g. [duckdbot/acestep-lora-cryda](https://huggingface.co/duckdbot/acestep-lora-cryda)). Adapter must be `adapter_model.safetensors` (safetensors with `lora_A` / `lora_B` keys matching `decoder.layers.*`). Put the file in `examples/lora/` and run `./lora.sh`, or pass the path explicitly. + ## Generation modes The LLM fills what's missing in the JSON and generates audio codes. @@ -254,6 +257,10 @@ Required: --dit DiT GGUF file --vae VAE GGUF file +LoRA: + --lora LoRA adapter (adapter_model.safetensors) + --lora-scale LoRA scale, e.g. 
alpha/rank (default: 1.0) + Batch: --batch DiT variations per request (default: 1, max 9) diff --git a/examples/lora.json b/examples/lora.json new file mode 100644 index 0000000..8317521 --- /dev/null +++ b/examples/lora.json @@ -0,0 +1,11 @@ +{ + "task_type": "text2music", + "caption": "Emotional vocal track with soft synths", + "lyrics": "", + "duration": 10, + "inference_steps": 8, + "guidance_scale": 1, + "shift": 3, + "seed": 42, + "vocal_language": "en" +} diff --git a/examples/lora.sh b/examples/lora.sh new file mode 100755 index 0000000..9c25d33 --- /dev/null +++ b/examples/lora.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# LoRA example: generate with a PEFT LoRA adapter (e.g. duckdbot/acestep-lora-cryda). +# Requires adapter_model.safetensors in lora/ (download once; see below). +set -eu +cd "$(dirname "$0")" + +ADAPTER="lora/adapter_model.safetensors" +if [ ! -f "$ADAPTER" ]; then + echo "LoRA adapter not found at $ADAPTER" + echo "Download once (e.g. from Hugging Face):" + echo " mkdir -p lora" + echo " curl -L -o $ADAPTER 'https://huggingface.co/duckdbot/acestep-lora-cryda/resolve/main/adapter_model.safetensors'" + echo "Or: pip install hf && huggingface-cli download duckdbot/acestep-lora-cryda adapter_model.safetensors --local-dir lora" + exit 1 +fi + +# LLM: fill lyrics + codes +../build/ace-qwen3 \ + --request lora.json \ + --model ../models/acestep-5Hz-lm-4B-Q8_0.gguf + +# DiT+VAE with LoRA (scale = alpha/rank; 1.0 is typical) +../build/dit-vae \ + --request lora0.json \ + --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit ../models/acestep-v15-turbo-Q8_0.gguf \ + --vae ../models/vae-BF16.gguf \ + --lora "$ADAPTER" \ + --lora-scale 1.0 + +echo "Done. 
Check lora00.wav" diff --git a/src/dit-graph.h b/src/dit-graph.h index 2a92324..ab5839e 100644 --- a/src/dit-graph.h +++ b/src/dit-graph.h @@ -44,6 +44,23 @@ static struct ggml_tensor * dit_ggml_linear( return ggml_mul_mat(ctx, weight, input); } +// Linear with optional LoRA: out = W@x + scale * (B@(A@x)). lora_a/lora_b may be NULL. +static struct ggml_tensor * dit_ggml_linear_lora( + struct ggml_context * ctx, + struct ggml_tensor * weight, + struct ggml_tensor * lora_a, // [in, r] + struct ggml_tensor * lora_b, // [r, out] + float lora_scale, + struct ggml_tensor * input) { + struct ggml_tensor * out = ggml_mul_mat(ctx, weight, input); + if (lora_a && lora_b && lora_scale != 0.0f) { + struct ggml_tensor * ax = ggml_mul_mat(ctx, lora_a, input); + struct ggml_tensor * bax = ggml_mul_mat(ctx, lora_b, ax); + out = ggml_add(ctx, out, ggml_scale(ctx, bax, lora_scale)); + } + return out; +} + // Helper: Linear layer with bias static struct ggml_tensor * dit_ggml_linear_bias( struct ggml_context * ctx, @@ -164,6 +181,7 @@ static struct ggml_tensor * dit_ggml_build_self_attn( struct ggml_tensor * q, * k, * v; int q_dim = Nh * D; int kv_dim = Nkv * D; + float lora_scale = m->lora_scale; if (ly->sa_qkv) { struct ggml_tensor * qkv = dit_ggml_linear(ctx, ly->sa_qkv, norm_sa); q = ggml_cont(ctx, ggml_view_3d(ctx, qkv, q_dim, S, N, qkv->nb[1], qkv->nb[2], 0)); @@ -173,11 +191,11 @@ static struct ggml_tensor * dit_ggml_build_self_attn( struct ggml_tensor * qk = dit_ggml_linear(ctx, ly->sa_qk, norm_sa); q = ggml_cont(ctx, ggml_view_3d(ctx, qk, q_dim, S, N, qk->nb[1], qk->nb[2], 0)); k = ggml_cont(ctx, ggml_view_3d(ctx, qk, kv_dim, S, N, qk->nb[1], qk->nb[2], (size_t)q_dim * qk->nb[0])); - v = dit_ggml_linear(ctx, ly->sa_v_proj, norm_sa); + v = dit_ggml_linear_lora(ctx, ly->sa_v_proj, ly->lora_sa_v_a, ly->lora_sa_v_b, lora_scale, norm_sa); } else { - q = dit_ggml_linear(ctx, ly->sa_q_proj, norm_sa); - k = dit_ggml_linear(ctx, ly->sa_k_proj, norm_sa); - v = dit_ggml_linear(ctx, 
ly->sa_v_proj, norm_sa); + q = dit_ggml_linear_lora(ctx, ly->sa_q_proj, ly->lora_sa_q_a, ly->lora_sa_q_b, lora_scale, norm_sa); + k = dit_ggml_linear_lora(ctx, ly->sa_k_proj, ly->lora_sa_k_a, ly->lora_sa_k_b, lora_scale, norm_sa); + v = dit_ggml_linear_lora(ctx, ly->sa_v_proj, ly->lora_sa_v_a, ly->lora_sa_v_b, lora_scale, norm_sa); } // 2) Reshape to heads: [Nh*D, S, N] -> [D, Nh, S, N] @@ -239,7 +257,7 @@ static struct ggml_tensor * dit_ggml_build_self_attn( } // 8) O projection: [Nh*D, S, N] -> [H, S, N] - struct ggml_tensor * out = dit_ggml_linear(ctx, ly->sa_o_proj, attn); + struct ggml_tensor * out = dit_ggml_linear_lora(ctx, ly->sa_o_proj, ly->lora_sa_o_a, ly->lora_sa_o_b, m->lora_scale, attn); return out; } @@ -253,20 +271,21 @@ static struct ggml_tensor * dit_ggml_build_mlp( struct ggml_tensor * norm_ffn, int S) { + float lora_scale = m->lora_scale; struct ggml_tensor * ff; if (ly->gate_up) { // Fused: single matmul [H, 2*I] x [H, S, N] -> [2*I, S, N], then swiglu splits ne[0] struct ggml_tensor * gu = dit_ggml_linear(ctx, ly->gate_up, norm_ffn); ff = ggml_swiglu(ctx, gu); } else { - // Separate: two matmuls + split swiglu - struct ggml_tensor * gate = dit_ggml_linear(ctx, ly->gate_proj, norm_ffn); - struct ggml_tensor * up = dit_ggml_linear(ctx, ly->up_proj, norm_ffn); + // Separate: two matmuls + split swiglu (with optional LoRA) + struct ggml_tensor * gate = dit_ggml_linear_lora(ctx, ly->gate_proj, ly->lora_gate_a, ly->lora_gate_b, lora_scale, norm_ffn); + struct ggml_tensor * up = dit_ggml_linear_lora(ctx, ly->up_proj, ly->lora_up_a, ly->lora_up_b, lora_scale, norm_ffn); ff = ggml_swiglu_split(ctx, gate, up); } // Down projection: [I, S] -> [H, S] - return dit_ggml_linear(ctx, ly->down_proj, ff); + return dit_ggml_linear_lora(ctx, ly->down_proj, ly->lora_down_a, ly->lora_down_b, lora_scale, ff); } // Build cross-attention sub-graph for a single layer. 
@@ -305,14 +324,14 @@ static struct ggml_tensor * dit_ggml_build_cross_attn( v = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], (size_t)kv_dim * kv->nb[0])); } else if (ly->ca_kv) { // Q separate, K+V fused - q = dit_ggml_linear(ctx, ly->ca_q_proj, norm_ca); + q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, m->lora_scale, norm_ca); struct ggml_tensor * kv = ggml_mul_mat(ctx, ly->ca_kv, enc); k = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], 0)); v = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], (size_t)kv_dim * kv->nb[0])); } else { - q = dit_ggml_linear(ctx, ly->ca_q_proj, norm_ca); - k = dit_ggml_linear(ctx, ly->ca_k_proj, enc); - v = dit_ggml_linear(ctx, ly->ca_v_proj, enc); + q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, m->lora_scale, norm_ca); + k = dit_ggml_linear_lora(ctx, ly->ca_k_proj, ly->lora_ca_k_a, ly->lora_ca_k_b, m->lora_scale, enc); + v = dit_ggml_linear_lora(ctx, ly->ca_v_proj, ly->lora_ca_v_a, ly->lora_ca_v_b, m->lora_scale, enc); } // reshape to [D, heads, seq, N] then permute to [D, seq, heads, N] @@ -342,7 +361,7 @@ static struct ggml_tensor * dit_ggml_build_cross_attn( attn = ggml_reshape_3d(ctx, attn, Nh * D, S, N); // O projection - return dit_ggml_linear(ctx, ly->ca_o_proj, attn); + return dit_ggml_linear_lora(ctx, ly->ca_o_proj, ly->lora_ca_o_a, ly->lora_ca_o_b, m->lora_scale, attn); } // Build one full DiT layer (AdaLN + self-attn + cross-attn + FFN + gated residuals) diff --git a/src/dit-lora.cpp b/src/dit-lora.cpp new file mode 100644 index 0000000..881d941 --- /dev/null +++ b/src/dit-lora.cpp @@ -0,0 +1,198 @@ +// dit-lora.cpp: Load LoRA adapters from safetensors into DiT (ACE-Step). +// Compatible with PEFT adapter_model.safetensors (lora_A / lora_B per target layer). 
+ +#include "dit.h" +#include "safetensors.h" +#include +#include +#include +#include + +// Normalize adapter key to base name: decoder.layers.N. +// e.g. "base_model.model.model.decoder.layers.0.self_attn.q_proj.lora_A.default" -> "decoder.layers.0.self_attn.q_proj" +static std::string lora_key_to_base(const std::string & key) { + std::string s = key; + const char * prefixes[] = { "base_model.model.model.", "base_model.model." }; + for (const char * p : prefixes) { + size_t pl = strlen(p); + if (s.size() >= pl && s.compare(0, pl, p) == 0) { + s = s.substr(pl); + break; + } + } + if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_A.default") == 0) + s = s.substr(0, s.size() - 14); + else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_B.default") == 0) + s = s.substr(0, s.size() - 14); + else if (s.size() > 7 && s.compare(s.size() - 7, 7, ".lora_A") == 0) + s = s.substr(0, s.size() - 7); + else if (s.size() > 7 && s.compare(s.size() - 7, 7, ".lora_B") == 0) + s = s.substr(0, s.size() - 7); + return s; +} + +static bool is_lora_a(const std::string & key) { + return key.find("lora_A") != std::string::npos; +} + +// Slot index for layer: 0=sa_q, 1=sa_k, 2=sa_v, 3=sa_o, 4=ca_q, 5=ca_k, 6=ca_v, 7=ca_o, 8=gate, 9=up, 10=down +enum LoraSlot { + SA_Q, SA_K, SA_V, SA_O, CA_Q, CA_K, CA_V, CA_O, GATE, UP, DOWN, N_SLOTS +}; + +static bool parse_base_name(const std::string & base, int * layer_idx, LoraSlot * slot) { + int L = -1; + if (sscanf(base.c_str(), "decoder.layers.%d.self_attn.q_proj", &L) == 1) { *layer_idx = L; *slot = SA_Q; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.self_attn.k_proj", &L) == 1) { *layer_idx = L; *slot = SA_K; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.self_attn.v_proj", &L) == 1) { *layer_idx = L; *slot = SA_V; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.self_attn.o_proj", &L) == 1) { *layer_idx = L; *slot = SA_O; return true; } + if (sscanf(base.c_str(), 
"decoder.layers.%d.cross_attn.q_proj", &L) == 1) { *layer_idx = L; *slot = CA_Q; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.cross_attn.k_proj", &L) == 1) { *layer_idx = L; *slot = CA_K; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.cross_attn.v_proj", &L) == 1) { *layer_idx = L; *slot = CA_V; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.cross_attn.o_proj", &L) == 1) { *layer_idx = L; *slot = CA_O; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.mlp.gate_proj", &L) == 1) { *layer_idx = L; *slot = GATE; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.mlp.up_proj", &L) == 1) { *layer_idx = L; *slot = UP; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.mlp.down_proj", &L) == 1) { *layer_idx = L; *slot = DOWN; return true; } + return false; +} + +static struct ggml_tensor ** slot_to_ptr(DiTGGMLLayer * ly, LoraSlot slot, bool is_b) { + if (is_b) { + switch (slot) { + case SA_Q: return &ly->lora_sa_q_b; case SA_K: return &ly->lora_sa_k_b; case SA_V: return &ly->lora_sa_v_b; case SA_O: return &ly->lora_sa_o_b; + case CA_Q: return &ly->lora_ca_q_b; case CA_K: return &ly->lora_ca_k_b; case CA_V: return &ly->lora_ca_v_b; case CA_O: return &ly->lora_ca_o_b; + case GATE: return &ly->lora_gate_b; case UP: return &ly->lora_up_b; case DOWN: return &ly->lora_down_b; + default: return nullptr; + } + } else { + switch (slot) { + case SA_Q: return &ly->lora_sa_q_a; case SA_K: return &ly->lora_sa_k_a; case SA_V: return &ly->lora_sa_v_a; case SA_O: return &ly->lora_sa_o_a; + case CA_Q: return &ly->lora_ca_q_a; case CA_K: return &ly->lora_ca_k_a; case CA_V: return &ly->lora_ca_v_a; case CA_O: return &ly->lora_ca_o_a; + case GATE: return &ly->lora_gate_a; case UP: return &ly->lora_up_a; case DOWN: return &ly->lora_down_a; + default: return nullptr; + } + } +} + +bool dit_ggml_load_lora(DiTGGML * m, const char * lora_path, float scale) { + FILE * fp = fopen(lora_path, "rb"); + if (!fp) { + 
fprintf(stderr, "[LoRA] cannot open %s\n", lora_path); + return false; + } + uint8_t h8[8]; + if (fread(h8, 1, 8, fp) != 8) { + fclose(fp); + return false; + } + uint64_t header_len = (uint64_t)h8[0] | ((uint64_t)h8[1] << 8) | ((uint64_t)h8[2] << 16) | ((uint64_t)h8[3] << 24) + | ((uint64_t)h8[4] << 32) | ((uint64_t)h8[5] << 40) | ((uint64_t)h8[6] << 48) | ((uint64_t)h8[7] << 56); + uint64_t data_section_start = 8 + header_len; + + std::unordered_map tensors; + int n = safetensors_parse_lora(fp, &tensors); + if (n == 0) { + fclose(fp); + fprintf(stderr, "[LoRA] no LoRA tensors found in %s\n", lora_path); + return false; + } + + // Count pairs we will load: for each lora_A key, find the matching lora_B (same base name) + std::unordered_map> pairs; // base -> (key_a, key_b) + std::unordered_map base_to_b; + for (const auto & kv : tensors) { + std::string base = lora_key_to_base(kv.first); + if (base.empty()) continue; + if (is_lora_a(kv.first)) + base_to_b[base] = ""; // mark base as having A; we'll find B next + } + for (const auto & kv : tensors) { + std::string base = lora_key_to_base(kv.first); + if (base.empty()) continue; + if (base_to_b.count(base) && kv.first.find("lora_B") != std::string::npos) + base_to_b[base] = kv.first; + } + for (const auto & kv : tensors) { + if (!is_lora_a(kv.first)) continue; + std::string base = lora_key_to_base(kv.first); + auto it = base_to_b.find(base); + if (it != base_to_b.end() && !it->second.empty()) + pairs[base] = { kv.first, it->second }; + } + + int n_pairs = (int)pairs.size(); + wctx_init(&m->lora_wctx, n_pairs * 2); // A and B per pair + + fseek(fp, (long)data_section_start, SEEK_SET); + + for (const auto & p : pairs) { + const std::string & base = p.first; + const std::string & key_a = p.second.first; + const std::string & key_b = p.second.second; + int layer_idx = 0; + LoraSlot slot = N_SLOTS; + if (!parse_base_name(base, &layer_idx, &slot) || layer_idx < 0 || layer_idx >= m->cfg.n_layers) continue; + + DiTGGMLLayer * 
ly = &m->layers[layer_idx]; + SafeTensorInfo & info_a = tensors[key_a]; + SafeTensorInfo & info_b = tensors[key_b]; + if (info_a.n_dims != 2 || info_b.n_dims != 2) continue; + // A_pt [r, in], B_pt [out, r]. We need A_ggml [r, in] for mul_mat(A,x)=[r,S], B_ggml [out, r] for mul_mat(B, Ax)=[out,S]. + // ggml layout: ne[0]=cols, ne[1]=rows. So A: [r, in] -> ne[0]=in, ne[1]=r. B: [out, r] -> ne[0]=r, ne[1]=out. + int64_t r = info_a.shape[0], in_dim = info_a.shape[1]; + int64_t out_dim = info_b.shape[0]; + if (info_b.shape[1] != r) continue; + + struct ggml_tensor * ta = ggml_new_tensor_2d(m->lora_wctx.ctx, GGML_TYPE_F32, (int64_t)in_dim, (int64_t)r); + struct ggml_tensor * tb = ggml_new_tensor_2d(m->lora_wctx.ctx, GGML_TYPE_F32, (int64_t)r, (int64_t)out_dim); + ggml_set_name(ta, key_a.c_str()); + ggml_set_name(tb, key_b.c_str()); + + // Copy A: file is row-major [r, in], we need ggml col-major [in, r] (transpose) + size_t na = (size_t)(r * in_dim); + m->lora_wctx.staging.emplace_back(na); + float * buf_a = m->lora_wctx.staging.back().data(); + if (!safetensors_read_tensor_data(fp, data_section_start, info_a.data_start, info_a.data_end, buf_a)) { + fclose(fp); + wctx_free(&m->lora_wctx); + return false; + } + m->lora_wctx.staging.emplace_back(na); + float * transposed_a = m->lora_wctx.staging.back().data(); + for (int64_t i = 0; i < r; i++) + for (int64_t j = 0; j < in_dim; j++) + transposed_a[(size_t)(j * r + i)] = buf_a[(size_t)(i * in_dim + j)]; + m->lora_wctx.pending.push_back({ ta, transposed_a, na * sizeof(float), 0 }); + + size_t nb = (size_t)(out_dim * r); + m->lora_wctx.staging.emplace_back(nb); + float * buf_b = m->lora_wctx.staging.back().data(); + if (!safetensors_read_tensor_data(fp, data_section_start, info_b.data_start, info_b.data_end, buf_b)) { + fclose(fp); + wctx_free(&m->lora_wctx); + return false; + } + m->lora_wctx.staging.emplace_back(nb); + float * transposed_b = m->lora_wctx.staging.back().data(); + for (int64_t i = 0; i < out_dim; i++) + for 
(int64_t j = 0; j < r; j++) + transposed_b[(size_t)(j * out_dim + i)] = buf_b[(size_t)(i * r + j)]; + m->lora_wctx.pending.push_back({ tb, transposed_b, nb * sizeof(float), 0 }); + + struct ggml_tensor ** pa = slot_to_ptr(ly, slot, false); + struct ggml_tensor ** pb = slot_to_ptr(ly, slot, true); + if (pa) *pa = ta; + if (pb) *pb = tb; + } + fclose(fp); + fp = nullptr; + + if (!wctx_alloc(&m->lora_wctx, m->backend)) { + fprintf(stderr, "[LoRA] failed to allocate LoRA tensors on backend\n"); + wctx_free(&m->lora_wctx); + return false; + } + m->lora_scale = scale; + fprintf(stderr, "[LoRA] loaded %d adapter pairs from %s (scale=%.4f)\n", n_pairs, lora_path, scale); + return true; +} diff --git a/src/dit.h b/src/dit.h index 524dd76..9c842b0 100644 --- a/src/dit.h +++ b/src/dit.h @@ -81,6 +81,19 @@ struct DiTGGMLLayer { // AdaLN scale-shift table: [6*hidden] (6 rows of [hidden]) struct ggml_tensor * scale_shift_table; // [hidden, 6] in ggml layout + // Optional LoRA adapters (F32, applied when base projection is separate) + struct ggml_tensor * lora_sa_q_a, * lora_sa_q_b; + struct ggml_tensor * lora_sa_k_a, * lora_sa_k_b; + struct ggml_tensor * lora_sa_v_a, * lora_sa_v_b; + struct ggml_tensor * lora_sa_o_a, * lora_sa_o_b; + struct ggml_tensor * lora_ca_q_a, * lora_ca_q_b; + struct ggml_tensor * lora_ca_k_a, * lora_ca_k_b; + struct ggml_tensor * lora_ca_v_a, * lora_ca_v_b; + struct ggml_tensor * lora_ca_o_a, * lora_ca_o_b; + struct ggml_tensor * lora_gate_a, * lora_gate_b; + struct ggml_tensor * lora_up_a, * lora_up_b; + struct ggml_tensor * lora_down_a, * lora_down_b; + int layer_type; // 0=sliding, 1=full }; @@ -122,6 +135,8 @@ struct DiTGGML { // Weight storage WeightCtx wctx; + WeightCtx lora_wctx; // optional LoRA adapter tensors (when lora_scale > 0) + float lora_scale; // alpha/rank for LoRA (0 = no LoRA) // Pre-allocated constant for AdaLN (1+scale) fusion struct ggml_tensor * scalar_one; // [1] = 1.0f, broadcast in ggml_add @@ -389,10 +404,15 @@ static void 
dit_ggml_init_backend(DiTGGML * m) { m->use_flash_attn = (bp.backend != bp.cpu_backend); } +// Load LoRA adapter from safetensors (e.g. adapter_model.safetensors). +// scale = alpha/rank (typical 1.0). Call after dit_ggml_load. Returns false on error. +bool dit_ggml_load_lora(DiTGGML * m, const char * lora_path, float scale); + static void dit_ggml_free(DiTGGML * m) { if (m->sched) ggml_backend_sched_free(m->sched); if (m->backend && m->backend != m->cpu_backend) ggml_backend_free(m->backend); if (m->cpu_backend) ggml_backend_free(m->cpu_backend); wctx_free(&m->wctx); + if (m->lora_wctx.ctx) wctx_free(&m->lora_wctx); *m = {}; } diff --git a/src/safetensors.h b/src/safetensors.h new file mode 100644 index 0000000..74d5967 --- /dev/null +++ b/src/safetensors.h @@ -0,0 +1,107 @@ +#pragma once +// safetensors.h: minimal reader for LoRA adapter_model.safetensors +// +// Format: 8-byte header length (LE uint64), then JSON header, then raw tensor data. +// We only parse keys that look like "*lora_A*" / "*lora_B*" and extract shape + data_offsets. + +#include +#include +#include +#include +#include +#include +#include + +struct SafeTensorInfo { + std::string dtype; // "F32", "F16", "BF16" + int64_t shape[2]; // [dim0, dim1] from JSON + int n_dims; + uint64_t data_start; // byte offset in file (after header) + uint64_t data_end; +}; + +// Open file, read header, parse tensor metadata for LoRA tensors. +// Returns number of LoRA tensors found; fills *out with tensor name -> info. +// Caller must fclose(fp) and free the map; file position is left at start of data section. 
+static int safetensors_parse_lora(FILE * fp, std::unordered_map * out) { + out->clear(); + uint64_t header_len = 0; + uint8_t h8[8]; + if (fread(h8, 1, 8, fp) != 8) return 0; + header_len = (uint64_t)h8[0] | ((uint64_t)h8[1] << 8) | ((uint64_t)h8[2] << 16) | ((uint64_t)h8[3] << 24) + | ((uint64_t)h8[4] << 32) | ((uint64_t)h8[5] << 40) | ((uint64_t)h8[6] << 48) | ((uint64_t)h8[7] << 56); + if (header_len == 0 || header_len > 10 * 1024 * 1024) return 0; // cap 10MB header + std::vector buf(header_len + 1); + if (fread(buf.data(), 1, header_len, fp) != header_len) return 0; + buf[header_len] = '\0'; + const char * json = buf.data(); + + // Find each key that contains "lora_A" or "lora_B" + const char * p = json; + int count = 0; + while ((p = strstr(p, "\"")) != nullptr) { + const char * key_start = p + 1; + p = strchr(key_start, '"'); + if (!p) break; + std::string key(key_start, (size_t)(p - key_start)); + p++; + if (key.find("lora_A") == std::string::npos && key.find("lora_B") == std::string::npos) { + continue; + } + // Find the value object for this key: skip ": + while (*p && (*p == ' ' || *p == ':')) p++; + if (*p != '{') continue; + const char * obj = p; + SafeTensorInfo info = {}; + info.shape[0] = info.shape[1] = 1; + info.n_dims = 0; + // "shape":[n,m] or [n] + const char * sh = strstr(obj, "\"shape\""); + if (sh) { + const char * br = strchr(sh, '['); + if (br) { + long long a = 0, b = 0; + int n = sscanf(br, "[%lld,%lld]", &a, &b); + if (n >= 1) { info.shape[0] = (int64_t)a; info.n_dims = 1; } + if (n >= 2) { info.shape[1] = (int64_t)b; info.n_dims = 2; } + } + } + const char * dt = strstr(obj, "\"dtype\""); + if (dt) { + const char * q = strchr(dt, '"'); + if (q) q = strchr(q + 1, '"'); + if (q) { + const char * start = q + 1; + const char * end = strchr(start, '"'); + if (end) info.dtype = std::string(start, end - start); + } + } + const char * off = strstr(obj, "\"data_offsets\""); + if (off) { + const char * br = strchr(off, '['); + if (br) { + 
uint64_t s = 0, e = 0; + if (sscanf(br, "[%llu,%llu]", (unsigned long long*)&s, (unsigned long long*)&e) == 2) { + info.data_start = s; + info.data_end = e; + } + } + } + if (info.dtype.empty() || info.n_dims == 0) continue; + (*out)[key] = info; + count++; + } + return count; +} + +// Read raw tensor data from file. File must be positioned at start of data section +// (i.e. after the 8-byte header length + header bytes). +// data_offset in the JSON is relative to the start of the data section. +static bool safetensors_read_tensor_data(FILE * fp, uint64_t data_section_start, + uint64_t tensor_start, uint64_t tensor_end, void * out_buf) { + uint64_t off = data_section_start + tensor_start; + uint64_t nbytes = tensor_end - tensor_start; + if (fseek(fp, (long)off, SEEK_SET) != 0) return false; + if (fread(out_buf, 1, nbytes, fp) != nbytes) return false; + return true; +} diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp index cac80a5..d889da1 100644 --- a/tools/dit-vae.cpp +++ b/tools/dit-vae.cpp @@ -67,6 +67,9 @@ static void print_usage(const char * prog) { " --text-encoder Text encoder GGUF file\n" " --dit DiT GGUF file\n" " --vae VAE GGUF file\n\n" + "LoRA:\n" + " --lora LoRA adapter (adapter_model.safetensors)\n" + " --lora-scale LoRA scale, e.g. alpha/rank (default: 1.0)\n\n" "Batch:\n" " --batch DiT variations per request (default: 1, max 9)\n\n" "Output naming: input.json -> input0.wav, input1.wav, ... 
(last digit = batch index)\n\n" @@ -99,9 +102,11 @@ int main(int argc, char ** argv) { const char * dit_gguf = NULL; const char * vae_gguf = NULL; const char * dump_dir = NULL; - int batch_n = 1; - int vae_chunk = 256; - int vae_overlap = 64; + const char * lora_path = NULL; + float lora_scale = 1.0f; + int batch_n = 1; + int vae_chunk = 256; + int vae_overlap = 64; for (int i = 1; i < argc; i++) { if (strcmp(argv[i], "--request") == 0) { @@ -116,6 +121,8 @@ int main(int argc, char ** argv) { else if (strcmp(argv[i], "--batch") == 0 && i+1 < argc) batch_n = atoi(argv[++i]); else if (strcmp(argv[i], "--vae-chunk") == 0 && i+1 < argc) vae_chunk = atoi(argv[++i]); else if (strcmp(argv[i], "--vae-overlap") == 0 && i+1 < argc) vae_overlap = atoi(argv[++i]); + else if (strcmp(argv[i], "--lora") == 0 && i+1 < argc) lora_path = argv[++i]; + else if (strcmp(argv[i], "--lora-scale") == 0 && i+1 < argc) lora_scale = (float)atof(argv[++i]); else if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) { print_usage(argv[0]); return 0; } else { @@ -161,6 +168,16 @@ int main(int argc, char ** argv) { } fprintf(stderr, "[Load] DiT weight load: %.1f ms\n", timer.ms()); + if (lora_path) { + timer.reset(); + if (!dit_ggml_load_lora(&model, lora_path, lora_scale)) { + fprintf(stderr, "FATAL: failed to load LoRA from %s\n", lora_path); + dit_ggml_free(&model); + return 1; + } + fprintf(stderr, "[Load] LoRA: %.1f ms\n", timer.ms()); + } + // Read DiT GGUF metadata + silence_latent tensor (once) bool is_turbo = false; std::vector silence_full; // [15000, 64] f32 From 658ca3035663c0051252c618a94348ec8729af53 Mon Sep 17 00:00:00 2001 From: qxip Date: Sat, 28 Feb 2026 21:27:23 +0100 Subject: [PATCH 07/17] Fix VAE encoder double free and enc_out buffer size in reference_audio path - vae_encoder_free: reset sched, free in order (sched, weight_ctx, buf, backends), null ptrs - dit-vae: size enc_out to max_lat = (T_audio/2048)+1 to avoid encoder write overflow Made-with: Cursor --- 
src/vae.h | 28 +++++++++++++++++++++++----- tools/dit-vae.cpp | 4 +++- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/vae.h b/src/vae.h index f97da51..b4335e1 100644 --- a/src/vae.h +++ b/src/vae.h @@ -731,10 +731,28 @@ static int vae_encoder_forward(VAEEncoderGGML * m, const float * audio, int T_au } static void vae_encoder_free(VAEEncoderGGML * m) { - if (m->sched) ggml_backend_sched_free(m->sched); - if (m->buf) ggml_backend_buffer_free(m->buf); - if (m->weight_ctx) ggml_free(m->weight_ctx); - if (m->backend && m->backend != m->cpu_backend) ggml_backend_free(m->backend); - if (m->cpu_backend) ggml_backend_free(m->cpu_backend); + // Order: reset sched, free sched (drops refs to graph/alloc), free weight_ctx (tensor metadata), + // then buffer (tensor data), then backends. Avoids double free on some GGML backends. + if (m->sched) { + ggml_backend_sched_reset(m->sched); + ggml_backend_sched_free(m->sched); + m->sched = NULL; + } + if (m->weight_ctx) { + ggml_free(m->weight_ctx); + m->weight_ctx = NULL; + } + if (m->buf) { + ggml_backend_buffer_free(m->buf); + m->buf = NULL; + } + if (m->backend && m->backend != m->cpu_backend) { + ggml_backend_free(m->backend); + m->backend = NULL; + } + if (m->cpu_backend) { + ggml_backend_free(m->cpu_backend); + m->cpu_backend = NULL; + } *m = {}; } diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp index cac80a5..b24c967 100644 --- a/tools/dit-vae.cpp +++ b/tools/dit-vae.cpp @@ -393,7 +393,9 @@ int main(int argc, char ** argv) { if (vae_encoder_load(&enc, vae_gguf)) { int T_audio = n_samples; if (T_audio >= 1920) { - std::vector enc_out((size_t)S_ref * 64); + // Encoder strides 2,4,4,8,8 -> max latent frames = T_audio/2048 + 1 + size_t max_lat = (size_t)(T_audio / 2048) + 1; + std::vector enc_out(max_lat * 64); int T_lat = vae_encoder_forward(&enc, wav_stereo.data(), T_audio, enc_out.data()); if (T_lat > 0) { size_t copy_frames = (size_t)(T_lat < S_ref ? 
T_lat : S_ref); From 907a068a24dba3208671edd45775a7cd1334d9ad Mon Sep 17 00:00:00 2001 From: qxip Date: Sat, 28 Feb 2026 22:57:57 +0100 Subject: [PATCH 08/17] LoRA: apply on fused layers, add custom_tag/genre, update example - dit-graph.h: apply LoRA deltas when base uses fused QKV/gate_up/ca_qkv so self-attn, MLP, and cross-attn all use adapters (fixes no audible effect) - dit-lora.cpp: fix safetensors parse (rewind fp before parse); normalize keys for base_model.model.layers.* and .lora_A.weight/.lora_B.weight - request: add custom_tag (LoRA trigger) and genre; parse language, is_instrumental, formatted_lyrics - dit-vae: append custom_tag to caption for condition encoder when set - examples/lora.json: nu-disco example with custom_tag crydamoure Made-with: Cursor --- examples/lora.json | 16 +++++++++++----- src/dit-graph.h | 48 ++++++++++++++++++++++++++++++++++++++++++++-- src/dit-lora.cpp | 20 +++++++++++-------- src/request.cpp | 13 +++++++++++++ src/request.h | 6 +++++- tools/dit-vae.cpp | 7 +++++-- 6 files changed, 92 insertions(+), 18 deletions(-) diff --git a/examples/lora.json b/examples/lora.json index 8317521..c872efb 100644 --- a/examples/lora.json +++ b/examples/lora.json @@ -1,11 +1,17 @@ { "task_type": "text2music", - "caption": "Emotional vocal track with soft synths", - "lyrics": "", - "duration": 10, + "caption": "An energetic nu-disco track built on a foundation of a tight, funky slap bassline and a crisp, four-on-the-floor drum machine beat. The song opens with a distinctive, filtered wah-wah guitar riff that serves as a recurring motif. The arrangement is layered with shimmering synth pads, punchy synth stabs, and subtle arpeggiated synth textures that add movement. 
The track progresses through dynamic sections, including a brief atmospheric breakdown before rebuilding the main groove.", + "genre": "Nu-disco", + "lyrics": "[Instrumental]", + "bpm": 115, + "keyscale": "C# major", + "timesignature": "4", + "duration": 256, + "language": "unknown", + "instrumental": true, + "custom_tag": "crydamoure", "inference_steps": 8, "guidance_scale": 1, "shift": 3, - "seed": 42, - "vocal_language": "en" + "seed": -1 } diff --git a/src/dit-graph.h b/src/dit-graph.h index ab5839e..1241bc5 100644 --- a/src/dit-graph.h +++ b/src/dit-graph.h @@ -187,10 +187,25 @@ static struct ggml_tensor * dit_ggml_build_self_attn( q = ggml_cont(ctx, ggml_view_3d(ctx, qkv, q_dim, S, N, qkv->nb[1], qkv->nb[2], 0)); k = ggml_cont(ctx, ggml_view_3d(ctx, qkv, kv_dim, S, N, qkv->nb[1], qkv->nb[2], (size_t)q_dim * qkv->nb[0])); v = ggml_cont(ctx, ggml_view_3d(ctx, qkv, kv_dim, S, N, qkv->nb[1], qkv->nb[2], (size_t)(q_dim + kv_dim) * qkv->nb[0])); + // LoRA on fused path: add scale * (B @ (A @ x)) per projection when adapters are loaded + if (lora_scale != 0.0f) { + if (ly->lora_sa_q_a && ly->lora_sa_q_b) + q = ggml_add(ctx, q, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_q_b, ggml_mul_mat(ctx, ly->lora_sa_q_a, norm_sa)), lora_scale)); + if (ly->lora_sa_k_a && ly->lora_sa_k_b) + k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_k_b, ggml_mul_mat(ctx, ly->lora_sa_k_a, norm_sa)), lora_scale)); + if (ly->lora_sa_v_a && ly->lora_sa_v_b) + v = ggml_add(ctx, v, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_v_b, ggml_mul_mat(ctx, ly->lora_sa_v_a, norm_sa)), lora_scale)); + } } else if (ly->sa_qk) { struct ggml_tensor * qk = dit_ggml_linear(ctx, ly->sa_qk, norm_sa); q = ggml_cont(ctx, ggml_view_3d(ctx, qk, q_dim, S, N, qk->nb[1], qk->nb[2], 0)); k = ggml_cont(ctx, ggml_view_3d(ctx, qk, kv_dim, S, N, qk->nb[1], qk->nb[2], (size_t)q_dim * qk->nb[0])); + if (lora_scale != 0.0f) { + if (ly->lora_sa_q_a && ly->lora_sa_q_b) + q = ggml_add(ctx, q, ggml_scale(ctx, 
ggml_mul_mat(ctx, ly->lora_sa_q_b, ggml_mul_mat(ctx, ly->lora_sa_q_a, norm_sa)), lora_scale)); + if (ly->lora_sa_k_a && ly->lora_sa_k_b) + k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_k_b, ggml_mul_mat(ctx, ly->lora_sa_k_a, norm_sa)), lora_scale)); + } v = dit_ggml_linear_lora(ctx, ly->sa_v_proj, ly->lora_sa_v_a, ly->lora_sa_v_b, lora_scale, norm_sa); } else { q = dit_ggml_linear_lora(ctx, ly->sa_q_proj, ly->lora_sa_q_a, ly->lora_sa_q_b, lora_scale, norm_sa); @@ -271,12 +286,25 @@ static struct ggml_tensor * dit_ggml_build_mlp( struct ggml_tensor * norm_ffn, int S) { + DiTGGMLConfig & c = m->cfg; + int I = c.intermediate_size; + int N = (int)norm_ffn->ne[2]; float lora_scale = m->lora_scale; struct ggml_tensor * ff; if (ly->gate_up) { // Fused: single matmul [H, 2*I] x [H, S, N] -> [2*I, S, N], then swiglu splits ne[0] struct ggml_tensor * gu = dit_ggml_linear(ctx, ly->gate_up, norm_ffn); - ff = ggml_swiglu(ctx, gu); + if (lora_scale != 0.0f && ((ly->lora_gate_a && ly->lora_gate_b) || (ly->lora_up_a && ly->lora_up_b))) { + struct ggml_tensor * gate = ggml_cont(ctx, ggml_view_3d(ctx, gu, I, S, N, gu->nb[1], gu->nb[2], 0)); + struct ggml_tensor * up = ggml_cont(ctx, ggml_view_3d(ctx, gu, I, S, N, gu->nb[1], gu->nb[2], (size_t)I * gu->nb[0])); + if (ly->lora_gate_a && ly->lora_gate_b) + gate = ggml_add(ctx, gate, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_gate_b, ggml_mul_mat(ctx, ly->lora_gate_a, norm_ffn)), lora_scale)); + if (ly->lora_up_a && ly->lora_up_b) + up = ggml_add(ctx, up, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_up_b, ggml_mul_mat(ctx, ly->lora_up_a, norm_ffn)), lora_scale)); + ff = ggml_swiglu_split(ctx, gate, up); + } else { + ff = ggml_swiglu(ctx, gu); + } } else { // Separate: two matmuls + split swiglu (with optional LoRA) struct ggml_tensor * gate = dit_ggml_linear_lora(ctx, ly->gate_proj, ly->lora_gate_a, ly->lora_gate_b, lora_scale, norm_ffn); @@ -311,6 +339,7 @@ static struct ggml_tensor * dit_ggml_build_cross_attn( // Q 
from hidden, KV from encoder (full fused, Q+KV partial, separate) int q_dim = Nh * D; int kv_dim = Nkv * D; + float lora_scale = m->lora_scale; struct ggml_tensor * q, * k, * v; if (ly->ca_qkv) { // Full QKV fused: split Q from hidden, KV from enc via weight views @@ -322,12 +351,27 @@ static struct ggml_tensor * dit_ggml_build_cross_attn( struct ggml_tensor * kv = ggml_mul_mat(ctx, w_kv, enc); k = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], 0)); v = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], (size_t)kv_dim * kv->nb[0])); + // LoRA on fused path: add scale * (B @ (A @ x)) for Q (from norm_ca), K/V (from enc) + if (lora_scale != 0.0f) { + if (ly->lora_ca_q_a && ly->lora_ca_q_b) + q = ggml_add(ctx, q, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_q_b, ggml_mul_mat(ctx, ly->lora_ca_q_a, norm_ca)), lora_scale)); + if (ly->lora_ca_k_a && ly->lora_ca_k_b) + k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_k_b, ggml_mul_mat(ctx, ly->lora_ca_k_a, enc)), lora_scale)); + if (ly->lora_ca_v_a && ly->lora_ca_v_b) + v = ggml_add(ctx, v, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_v_b, ggml_mul_mat(ctx, ly->lora_ca_v_a, enc)), lora_scale)); + } } else if (ly->ca_kv) { // Q separate, K+V fused - q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, m->lora_scale, norm_ca); + q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, lora_scale, norm_ca); struct ggml_tensor * kv = ggml_mul_mat(ctx, ly->ca_kv, enc); k = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], 0)); v = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], (size_t)kv_dim * kv->nb[0])); + if (lora_scale != 0.0f) { + if (ly->lora_ca_k_a && ly->lora_ca_k_b) + k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_k_b, ggml_mul_mat(ctx, ly->lora_ca_k_a, enc)), lora_scale)); + if (ly->lora_ca_v_a && ly->lora_ca_v_b) + v = 
ggml_add(ctx, v, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_v_b, ggml_mul_mat(ctx, ly->lora_ca_v_a, enc)), lora_scale)); + } } else { q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, m->lora_scale, norm_ca); k = dit_ggml_linear_lora(ctx, ly->ca_k_proj, ly->lora_ca_k_a, ly->lora_ca_k_b, m->lora_scale, enc); diff --git a/src/dit-lora.cpp b/src/dit-lora.cpp index 881d941..b14e090 100644 --- a/src/dit-lora.cpp +++ b/src/dit-lora.cpp @@ -9,7 +9,7 @@ #include // Normalize adapter key to base name: decoder.layers.N. -// e.g. "base_model.model.model.decoder.layers.0.self_attn.q_proj.lora_A.default" -> "decoder.layers.0.self_attn.q_proj" +// Handles: base_model.model.model., base_model.model.; decoder.layers. or layers.; .lora_A.default/.lora_B.default or .lora_A.weight/.lora_B.weight static std::string lora_key_to_base(const std::string & key) { std::string s = key; const char * prefixes[] = { "base_model.model.model.", "base_model.model." }; @@ -20,14 +20,22 @@ static std::string lora_key_to_base(const std::string & key) { break; } } + // PEFT-style suffix if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_A.default") == 0) s = s.substr(0, s.size() - 14); else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_B.default") == 0) s = s.substr(0, s.size() - 14); + else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_A.weight") == 0) + s = s.substr(0, s.size() - 14); + else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_B.weight") == 0) + s = s.substr(0, s.size() - 14); else if (s.size() > 7 && s.compare(s.size() - 7, 7, ".lora_A") == 0) s = s.substr(0, s.size() - 7); else if (s.size() > 7 && s.compare(s.size() - 7, 7, ".lora_B") == 0) s = s.substr(0, s.size() - 7); + // HuggingFace adapter: layers.N -> decoder.layers.N for our DiT naming + if (s.size() >= 7 && s.compare(0, 7, "layers.") == 0) + s = "decoder." 
+ s; return s; } @@ -80,17 +88,13 @@ bool dit_ggml_load_lora(DiTGGML * m, const char * lora_path, float scale) { fprintf(stderr, "[LoRA] cannot open %s\n", lora_path); return false; } - uint8_t h8[8]; - if (fread(h8, 1, 8, fp) != 8) { + std::unordered_map tensors; + if (fseek(fp, 0, SEEK_SET) != 0) { fclose(fp); return false; } - uint64_t header_len = (uint64_t)h8[0] | ((uint64_t)h8[1] << 8) | ((uint64_t)h8[2] << 16) | ((uint64_t)h8[3] << 24) - | ((uint64_t)h8[4] << 32) | ((uint64_t)h8[5] << 40) | ((uint64_t)h8[6] << 48) | ((uint64_t)h8[7] << 56); - uint64_t data_section_start = 8 + header_len; - - std::unordered_map tensors; int n = safetensors_parse_lora(fp, &tensors); + uint64_t data_section_start = (uint64_t)ftell(fp); if (n == 0) { fclose(fp); fprintf(stderr, "[LoRA] no LoRA tensors found in %s\n", lora_path); diff --git a/src/request.cpp b/src/request.cpp index a24d838..f85873a 100644 --- a/src/request.cpp +++ b/src/request.cpp @@ -16,6 +16,8 @@ void request_init(AceRequest * r) { r->caption = ""; r->lyrics = ""; r->instrumental = false; + r->custom_tag = ""; + r->genre = ""; r->bpm = 0; r->duration = -1.0f; r->keyscale = ""; @@ -227,7 +229,11 @@ bool request_parse(AceRequest * r, const char * path) { if (k == "task_type") r->task_type = v; else if (k == "caption") r->caption = v; else if (k == "lyrics") r->lyrics = v; + else if (k == "custom_tag") r->custom_tag = v; + else if (k == "genre") r->genre = v; else if (k == "keyscale") r->keyscale = v; + else if (k == "formatted_lyrics") r->lyrics = v; // alias for lyrics + else if (k == "language") r->vocal_language = v; // alias for vocal_language else if (k == "timesignature") r->timesignature = v; else if (k == "vocal_language") r->vocal_language = v; else if (k == "reference_audio") r->reference_audio = v; @@ -254,6 +260,7 @@ bool request_parse(AceRequest * r, const char * path) { // bools else if (k == "instrumental") r->instrumental = (v == "true"); + else if (k == "is_instrumental") r->instrumental = (v == 
"true"); // unknown keys: silently ignored (forward compat) } @@ -274,6 +281,10 @@ bool request_write(const AceRequest * r, const char * path) { fprintf(f, " \"lyrics\": \"%s\",\n", json_escape(r->lyrics).c_str()); if (r->instrumental) fprintf(f, " \"instrumental\": true,\n"); + if (!r->custom_tag.empty()) + fprintf(f, " \"custom_tag\": \"%s\",\n", json_escape(r->custom_tag).c_str()); + if (!r->genre.empty()) + fprintf(f, " \"genre\": \"%s\",\n", json_escape(r->genre).c_str()); fprintf(f, " \"bpm\": %d,\n", r->bpm); fprintf(f, " \"duration\": %.1f,\n", r->duration); fprintf(f, " \"keyscale\": \"%s\",\n", json_escape(r->keyscale).c_str()); @@ -310,6 +321,8 @@ void request_dump(const AceRequest * r, FILE * f) { fprintf(f, " caption: %.60s%s\n", r->caption.c_str(), r->caption.size() > 60 ? "..." : ""); fprintf(f, " lyrics: %zu bytes\n", r->lyrics.size()); + if (!r->custom_tag.empty()) + fprintf(f, " custom_tag: %s\n", r->custom_tag.c_str()); fprintf(f, " bpm=%d dur=%.0f key=%s ts=%s lang=%s\n", r->bpm, r->duration, r->keyscale.c_str(), r->timesignature.c_str(), r->vocal_language.c_str()); diff --git a/src/request.h b/src/request.h index e9222a0..ef4b41f 100644 --- a/src/request.h +++ b/src/request.h @@ -19,7 +19,11 @@ struct AceRequest { // text content std::string caption; // "" std::string lyrics; // "" - bool instrumental; // false + bool instrumental; // false + + // LoRA / style trigger (appended to caption for condition encoder when set) + std::string custom_tag; // "" e.g. "crydamoure" + std::string genre; // "" e.g. 
"Nu-disco" // metadata (user-provided or LLM-enriched) int bpm; // 0 = unset diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp index d889da1..fd5fe47 100644 --- a/tools/dit-vae.cpp +++ b/tools/dit-vae.cpp @@ -243,8 +243,11 @@ int main(int argc, char ** argv) { continue; } - // Extract params - const char * caption = req.caption.c_str(); + // Extract params (append custom_tag to caption for LoRA/condition so trigger is in text) + std::string caption_for_cond = req.caption; + if (!req.custom_tag.empty()) + caption_for_cond += ", " + req.custom_tag; + const char * caption = caption_for_cond.c_str(); const char * lyrics = req.lyrics.c_str(); char bpm_str[16] = "N/A"; if (req.bpm > 0) snprintf(bpm_str, sizeof(bpm_str), "%d", req.bpm); From acd84020a3f43a0ebfd0dbec323dca7f2781cafa Mon Sep 17 00:00:00 2001 From: Lorenzo Mangani Date: Sat, 28 Feb 2026 23:23:08 +0100 Subject: [PATCH 09/17] Fix formatting and clarify LoRA adapter instructions --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9ad2a2b..d809146 100644 --- a/README.md +++ b/README.md @@ -146,13 +146,13 @@ cd examples ./cover.sh # cover mode: decode precomputed audio_codes (no LLM) ./cover-reference.sh # cover + reference_audio for timbre (WAV/MP3; needs reference.wav or .mp3) ./test-reference.sh # reference_audio (WAV or MP3) + audio_cover_strength -./lora.sh # DiT + LoRA adapter (e.g. duckdbot/acestep-lora-cryda; put adapter in examples/lora/) +./lora.sh # DiT + LoRA adapter ``` Each example has a `-sft` variant (SFT model, 50 steps, CFG 7.0) alongside the turbo default (8 steps, no CFG). For **reference timbre**, set `reference_audio` to a **WAV or MP3** path; dit-vae loads it (MP3 decoded in memory via header-only minimp3, no temp files), encodes with the VAE encoder (requires a full VAE GGUF that includes encoder weights). -**LoRA adapters**: use `--lora ` and optional `--lora-scale ` with dit-vae to run the DiT with a PEFT-style LoRA (e.g. 
[duckdbot/acestep-lora-cryda](https://huggingface.co/duckdbot/acestep-lora-cryda)). Adapter must be `adapter_model.safetensors` (safetensors with `lora_A` / `lora_B` keys matching `decoder.layers.*`). Put the file in `examples/lora/` and run `./lora.sh`, or pass the path explicitly. +**LoRA adapters**: use `--lora ` and optional `--lora-scale ` with dit-vae to run the DiT with PEFT-style Ace-Step LoRAs. ## Generation modes @@ -214,7 +214,7 @@ All fields with defaults. Only `caption` is required. Built-in modes (text2music Key fields: `seed` -1 means random (resolved once, then +1 per batch element). `audio_codes` is generated by ace-qwen3 and consumed by dit-vae (comma separated FSQ token IDs). When present, the LLM is -skipped entirely (cover-style generation). `reference_audio`: path to a **WAV or MP3** file for global timbre/style (MP3 decoded in memory; encoded via built-in VAE encoder; requires VAE GGUF with encoder weights). `src_audio`: not yet implemented (see docs/MODES.md). +skipped entirely (cover-style generation). `reference_audio`: path to a **WAV or MP3** file for global timbre/style. `src_audio`: not yet implemented (see docs/MODES.md). Turbo preset: `inference_steps=8, shift=3.0` (no guidance_scale, turbo models don't use CFG). SFT preset: `inference_steps=50, guidance_scale=4.0, shift=6.0`. From 9b087b2ee191ecfce4c97bbb09cdae3cb93a7aa7 Mon Sep 17 00:00:00 2001 From: Lorenzo Mangani Date: Sat, 28 Feb 2026 23:24:06 +0100 Subject: [PATCH 10/17] Remove download instructions for LoRA adapter Removed instructions for downloading LoRA adapter from Hugging Face. --- examples/lora.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/examples/lora.sh b/examples/lora.sh index 9c25d33..db7ce2b 100755 --- a/examples/lora.sh +++ b/examples/lora.sh @@ -7,10 +7,6 @@ cd "$(dirname "$0")" ADAPTER="lora/adapter_model.safetensors" if [ ! -f "$ADAPTER" ]; then echo "LoRA adapter not found at $ADAPTER" - echo "Download once (e.g. 
from Hugging Face):" - echo " mkdir -p lora" - echo " curl -L -o $ADAPTER 'https://huggingface.co/duckdbot/acestep-lora-cryda/resolve/main/adapter_model.safetensors'" - echo "Or: pip install hf && huggingface-cli download duckdbot/acestep-lora-cryda adapter_model.safetensors --local-dir lora" exit 1 fi From fc2408ae7cda0e0fdd5b5fb59318d4db88545f69 Mon Sep 17 00:00:00 2001 From: Lorenzo Mangani Date: Sat, 28 Feb 2026 23:24:56 +0100 Subject: [PATCH 11/17] Update comments for custom_tag and genre fields --- src/request.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/request.h b/src/request.h index ef4b41f..ba85821 100644 --- a/src/request.h +++ b/src/request.h @@ -22,8 +22,8 @@ struct AceRequest { bool instrumental; // false // LoRA / style trigger (appended to caption for condition encoder when set) - std::string custom_tag; // "" e.g. "crydamoure" - std::string genre; // "" e.g. "Nu-disco" + std::string custom_tag; // "" LoRA trigger word + std::string genre; // "" LoRA genre // metadata (user-provided or LLM-enriched) int bpm; // 0 = unset From e860c79d45ce7329dd7f6557ae2ea44a337967cf Mon Sep 17 00:00:00 2001 From: qxip Date: Sat, 28 Feb 2026 23:46:24 +0100 Subject: [PATCH 12/17] Cover from file (src_audio), docs, README strength clarification - src_audio: load WAV/MP3, VAE encode, FSQ nearest-codeword encode to codes (fsq-detok.h: codeword table + latent_frames_to_codes; dit-vae: wire path) - reference_audio + cover (audio_codes/src_audio) fully supported without Python - MODES.md: cover and reference_audio marked supported; request table updated - README: clarify audio_cover_strength vs guidance_scale vs reference_audio (audio_cover_strength = cover blend; reference_audio = no strength knob; guidance_scale = DiT CFG, separate) Made-with: Cursor --- README.md | 7 ++++++- docs/MODES.md | 38 +++++++++++--------------------------- src/fsq-detok.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ tools/dit-vae.cpp | 33 
++++++++++++++++++++++++++++++++- 4 files changed, 94 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index d809146..8ad6ebf 100644 --- a/README.md +++ b/README.md @@ -214,7 +214,12 @@ All fields with defaults. Only `caption` is required. Built-in modes (text2music Key fields: `seed` -1 means random (resolved once, then +1 per batch element). `audio_codes` is generated by ace-qwen3 and consumed by dit-vae (comma separated FSQ token IDs). When present, the LLM is -skipped entirely (cover-style generation). `reference_audio`: path to a **WAV or MP3** file for global timbre/style. `src_audio`: not yet implemented (see docs/MODES.md). +skipped entirely (cover-style generation). `reference_audio`: path to a **WAV or MP3** file for global timbre/style (MP3 decoded in memory; encoded via built-in VAE encoder; requires VAE GGUF with encoder weights). `src_audio`: path to a **WAV or MP3** for cover source; dit-vae encodes it (VAE + FSQ nearest-codeword) to codes internally, no Python required (see docs/MODES.md). + +**Reference and cover strength (not the same as guidance_scale):** +- **`audio_cover_strength`** (0.0–1.0): Controls how strongly the **cover/source** (from `audio_codes` or `src_audio`) influences the DiT context. The context is blended with silence: `(1 - audio_cover_strength)*silence + audio_cover_strength*decoded`. Use 1.0 for full cover influence, lower values to soften it. Only applies when cover context is present. +- **`reference_audio`**: Timbre from the reference file is applied at full strength; there is no separate strength parameter for reference timbre. +- **`guidance_scale`**: This is **DiT classifier-free guidance** (conditioned vs unconditioned prediction), not reference or cover strength. Turbo models ignore it (forced to 1.0). Turbo preset: `inference_steps=8, shift=3.0` (no guidance_scale, turbo models don't use CFG). SFT preset: `inference_steps=50, guidance_scale=4.0, shift=6.0`. 
diff --git a/docs/MODES.md b/docs/MODES.md index 4149ae7..ae0b616 100644 --- a/docs/MODES.md +++ b/docs/MODES.md @@ -7,7 +7,7 @@ This document maps the [ACE-Step 1.5 Tutorial](https://github.com/ace-step/ACE-S | `task_type` | Description | Turbo/SFT | Base only | C++ status | |---------------|-------------|-----------|-----------|------------| | **text2music** | Generate from caption/lyrics (and optional reference) | ✅ | — | ✅ **Supported** | -| **cover** | Re-synthesize with structure from source; optional timbre from reference | ✅ | — | ⚠️ **Partial** (see below) | +| **cover** | Re-synthesize with structure from source; optional timbre from reference | ✅ | — | ✅ **Supported** (audio_codes or src_audio WAV/MP3) | | **repaint** | Local edit in time range using source as context | ✅ | — | ❌ Not implemented | | **lego** | Add new tracks to existing audio | — | ✅ | ❌ Base model only | | **extract** | Extract single track from mix | — | ✅ | ❌ Base model only | @@ -22,30 +22,16 @@ We only ship Turbo and SFT DiT weights; **lego**, **extract**, **complete** requ ### text2music (default) - **Input**: `caption`, optional `lyrics`, metadata (bpm, duration, keyscale, …). - **Flow**: LM (optional) → CoT + audio codes → DiT (context = silence) → VAE → WAV. -- **Timbre**: Always uses built-in silence latent from the DiT GGUF (no user reference yet). - -### cover (when `audio_codes` are provided) -- **Input**: Same as text2music, plus **precomputed** `audio_codes` (e.g. from a previous run or from Python). -- **Flow**: Skip LM; decode `audio_codes` to latents → DiT context = decoded + silence padding → DiT → VAE → WAV. -- **Limitation**: We do **not** convert a WAV file into `audio_codes`. So “cover from a file” is only possible if you already have codes (e.g. from Python or from a prior `ace-qwen3` run). The request fields `reference_audio` and `src_audio` are accepted in JSON but **not yet used** in the pipeline. 
+- **Timbre**: Optional **reference_audio** (WAV/MP3) → VAE encode → CondEncoder timbre; else built-in silence. +### cover (when `audio_codes` or `src_audio` are provided) +- **Input**: Same as text2music, plus either **precomputed** `audio_codes` or **`src_audio`** (WAV/MP3 path). Optional **reference_audio** for timbre. +- **Flow**: If `src_audio` set and no `audio_codes`: load WAV/MP3 → VAE encode → FSQ nearest-codeword encode → codes. Then decode codes to latents → DiT context (blend with silence) → DiT → VAE → WAV. No Python. +- **reference_audio** and **audio_cover_strength**: Implemented (timbre; blend). --- ## What’s not implemented yet -### reference_audio (global timbre/style) -- **Tutorial**: Load WAV → stereo 48 kHz, pad/repeat to ≥30 s → **VAE encode** → latents → feed as timbre condition into DiT. -- **C++**: Implemented. Set `reference_audio` to a **WAV or MP3 file path**. dit-vae loads the file (WAV: any sample rate resampled to 48 kHz; MP3: decoded in memory via header-only minimp3, no temp files, then resampled to 48 kHz if needed), runs the **VAE encoder** (Oobleck, in C++ in `vae.h`), and feeds the 64-d latents to the CondEncoder timbre path. No Python, no external deps. Requires a **full VAE GGUF** that includes `encoder.*` tensors (decoder-only GGUFs will print a clear error). -- **audio_cover_strength** (0.0–1.0): Implemented. When `audio_codes` are present, context latents are blended with silence: `(1 - strength)*silence + strength*decoded`. - -### src_audio (Cover from file) -- **Tutorial**: Source audio is converted to **semantic codes** (melody, rhythm, chords, etc.); then DiT uses those as in cover mode. -- **C++**: That implies **audio → codes**. Likely path: WAV → VAE encode → **FSQ tokenizer** (latents → 5 Hz codes). We have the **FSQ detokenizer** (codes → latents); the tokenizer (encode) side would need to be added. Then: `src_audio` path → load WAV → VAE encode → FSQ encode → `audio_codes` → existing cover path. 
- -### audio_cover_strength -- **Tutorial**: 0.0–1.0, how strongly generation follows reference/codes. -- **C++**: Field is in the request and parsed; no blending logic in the DiT/context path yet. - ### repaint - **Tutorial**: Specify `repainting_start` / `repainting_end` (seconds); model uses source audio as context and only generates in that interval (3–90 s). - **C++**: Would require **masked diffusion**: context carries “given” frames; ODE only updates the repaint region. DiT’s context has a 64-channel “mask” that we currently set to 1.0; repaint would set mask per frame and the generation loop would only update unmasked frames. Not implemented. @@ -60,9 +46,9 @@ All of these are in `AceRequest` and parsed from / written to JSON. Backend beha |-------|------|--------| | `task_type` | string | `"text2music"` \| `"cover"` \| `"repaint"` \| … | | `reference_audio` | string | Path to WAV or MP3 for timbre (implemented) | -| `src_audio` | string | Path to WAV for cover/repaint source (not used yet) | -| `audio_codes` | string | Comma-separated FSQ codes; non-empty ⇒ cover path | -| `audio_cover_strength` | float | 0.0–1.0 (parsed, not used yet) | +| `src_audio` | string | Path to WAV or MP3 for cover source; encoded to codes internally (implemented) | +| `audio_codes` | string | Comma-separated FSQ codes; non-empty ⇒ cover path (or from `src_audio`) | +| `audio_cover_strength` | float | 0.0–1.0 blend of decoded context with silence (implemented) | | `repainting_start` | float | Start time (s) for repaint (not used yet) | | `repainting_end` | float | End time (s) for repaint (not used yet) | @@ -72,8 +58,6 @@ See `request.h` and the README “Request JSON reference” for the full list. ## Summary -- **Fully supported**: text2music; cover when you supply **precomputed** `audio_codes`. -- **Schema only** (no backend): `task_type`, `reference_audio`, `src_audio`, `audio_cover_strength`, `repainting_start`/`repainting_end`. 
-- **To support reference_audio**: add VAE encoder, then feed its output into the existing CondEncoder timbre path. -- **To support cover from file**: add VAE encoder + FSQ tokenizer (or equivalent audio→codes), then reuse existing cover path. +- **Fully supported**: text2music (with optional reference_audio for timbre); cover from **precomputed** `audio_codes` or from **WAV/MP3** via `src_audio` (VAE encode + FSQ nearest-codeword encode); reference_audio (timbre); audio_cover_strength (blend). +- **Schema only** (no backend): `repainting_start`/`repainting_end`. - **To support repaint**: implement masked DiT generation (context mask + ODE only on repaint interval). diff --git a/src/fsq-detok.h b/src/fsq-detok.h index 0d4e33c..29eef5f 100644 --- a/src/fsq-detok.h +++ b/src/fsq-detok.h @@ -10,10 +10,13 @@ #pragma once #include "qwen3-enc.h" +#include // FSQ constants static const int FSQ_NDIMS = 6; static const int FSQ_LEVELS[6] = {8, 8, 8, 5, 5, 5}; +static const int FSQ_N_CODES = 8 * 8 * 8 * 5 * 5 * 5; // 8000 +static const int FSQ_FRAMES_PER_CODE = 5; // FSQ decode: integer index -> 6 normalized float values // Each dimension: level_idx / ((L-1)/2) - 1.0 (maps to [-1, 1]) @@ -214,6 +217,48 @@ static int detok_ggml_decode(DetokGGML * m, const int * codes, int T_5Hz, return T_25Hz; } +// Build codeword table for latent->code (cover from file): for each code 0..FSQ_N_CODES-1, +// decode to 5*64 floats. table_out must be at least FSQ_N_CODES * FSQ_FRAMES_PER_CODE * 64 floats. +static void detok_ggml_build_codeword_table(DetokGGML * m, float * table_out) { + const int chunk = FSQ_FRAMES_PER_CODE * 64; + for (int i = 0; i < FSQ_N_CODES; i++) { + int n = detok_ggml_decode(m, &i, 1, table_out + (size_t)i * chunk); + (void)n; + } +} + +// Encode latent frames to 5Hz codes by nearest codeword. T_latent = number of 25Hz frames (64-d each). +// Groups frames in chunks of 5; for each chunk finds the code whose codeword minimizes L2 distance. 
+// codeword_table from detok_ggml_build_codeword_table (FSQ_N_CODES * 5 * 64 floats). +// Pads last chunk with zeros if T_latent not divisible by 5. +static void latent_frames_to_codes(int T_latent, const float * latent_64d, + const float * codeword_table, + std::vector * out_codes) { + out_codes->clear(); + const int chunk_frames = FSQ_FRAMES_PER_CODE; + const int chunk_size = chunk_frames * 64; + int n_chunks = T_latent / chunk_frames; + if (n_chunks <= 0) return; + for (int g = 0; g < n_chunks; g++) { + const float * chunk = latent_64d + (size_t)g * chunk_size; + int best = 0; + float best_d2 = 1e30f; + for (int i = 0; i < FSQ_N_CODES; i++) { + const float * cw = codeword_table + (size_t)i * chunk_size; + float d2 = 0.0f; + for (int j = 0; j < chunk_size; j++) { + float d = chunk[j] - cw[j]; + d2 += d * d; + } + if (d2 < best_d2) { + best_d2 = d2; + best = i; + } + } + out_codes->push_back(best); + } +} + // Free static void detok_ggml_free(DetokGGML * m) { if (m->sched) ggml_backend_sched_free(m->sched); diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp index 7b5f188..bfe274d 100644 --- a/tools/dit-vae.cpp +++ b/tools/dit-vae.cpp @@ -277,8 +277,39 @@ int main(int argc, char ** argv) { fprintf(stderr, "[Pipeline] seed=%lld, steps=%d, guidance=%.1f, shift=%.1f, duration=%.1fs\n", seed, num_steps, guidance_scale, shift, duration); - // Parse audio codes from request + // Parse audio codes from request (or produce from src_audio WAV/MP3) std::vector codes_vec = parse_codes_string(req.audio_codes); + if (codes_vec.empty() && !req.src_audio.empty() && have_vae) { + const std::string & src_path = req.src_audio; + std::vector wav_stereo; + int n_samples = load_audio_48k_stereo(src_path.c_str(), &wav_stereo); + if (n_samples > 0) { + int T_audio = n_samples; + if (T_audio >= 1920) { + VAEEncoderGGML enc = {}; + if (vae_encoder_load(&enc, vae_gguf)) { + size_t max_lat = (size_t)(T_audio / 2048) + 1; + std::vector enc_out(max_lat * 64); + int T_lat = 
vae_encoder_forward(&enc, wav_stereo.data(), T_audio, enc_out.data()); + vae_encoder_free(&enc); + if (T_lat >= FSQ_FRAMES_PER_CODE) { + DetokGGML detok = {}; + if (detok_ggml_load(&detok, dit_gguf, model.backend, model.cpu_backend)) { + std::vector codeword_table((size_t)FSQ_N_CODES * FSQ_FRAMES_PER_CODE * 64); + fprintf(stderr, "[Cover] building FSQ codeword table (8000 codes)...\n"); + detok_ggml_build_codeword_table(&detok, codeword_table.data()); + latent_frames_to_codes(T_lat, enc_out.data(), codeword_table.data(), &codes_vec); + fprintf(stderr, "[Cover] encoded %s -> %zu codes (%.1fs @ 5Hz)\n", + src_path.c_str(), codes_vec.size(), (float)codes_vec.size() / 5.0f); + detok_ggml_free(&detok); + } + } + } + } + } else { + fprintf(stderr, "[Cover] WARNING: cannot load src_audio %s (use .wav or .mp3), skipping cover-from-file\n", src_path.c_str()); + } + } if (!codes_vec.empty()) fprintf(stderr, "[Pipeline] %zu audio codes (%.1fs @ 5Hz)\n", codes_vec.size(), (float)codes_vec.size() / 5.0f); From d4d3e3b6df669a9da06f7f230597979cbd8f5842 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Mar 2026 22:39:35 +0000 Subject: [PATCH 13/17] Initial plan From b237e8e285b2f12929b6ab2c8720ffc6bbb8de49 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Mar 2026 22:58:27 +0000 Subject: [PATCH 14/17] Resolve all 16 merge conflicts: add upstream features, preserve fork additions Co-authored-by: lmangani <1423657+lmangani@users.noreply.github.com> --- _codeql_detected_source_root | 1 + 1 file changed, 1 insertion(+) create mode 120000 _codeql_detected_source_root diff --git a/_codeql_detected_source_root b/_codeql_detected_source_root new file mode 120000 index 0000000..945c9b4 --- /dev/null +++ b/_codeql_detected_source_root @@ -0,0 +1 @@ +. 
\ No newline at end of file From 8ddc5d2ac8975dcc30f5551ca09f7ccaee803703 Mon Sep 17 00:00:00 2001 From: Lorenzo Mangani Date: Mon, 2 Mar 2026 00:04:05 +0100 Subject: [PATCH 15/17] Delete _codeql_detected_source_root --- _codeql_detected_source_root | 1 - 1 file changed, 1 deletion(-) delete mode 120000 _codeql_detected_source_root diff --git a/_codeql_detected_source_root b/_codeql_detected_source_root deleted file mode 120000 index 945c9b4..0000000 --- a/_codeql_detected_source_root +++ /dev/null @@ -1 +0,0 @@ -. \ No newline at end of file From 71695c517324b6f6aa49e89a3e10fef906dfabf3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 7 Mar 2026 10:05:16 +0000 Subject: [PATCH 16/17] Initial plan From bbe369fa15c5743cb259ebd3a00e2f0d218d783e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 7 Mar 2026 10:09:04 +0000 Subject: [PATCH 17/17] Reset master to upstream (ServeurpersoCom/acestep.cpp@aeff0b8) --- .github/workflows/build.yml | 44 - .github/workflows/test-generation.yml | 61 - .gitignore | 5 - CMakeLists.txt | 17 +- README.md | 267 +++- build.sh | 8 - docs/MODES.md | 63 - examples/cover-reference.json | 16 - examples/cover-reference.sh | 20 - examples/cover.json | 15 - examples/cover.sh | 15 - examples/lora.json | 17 - examples/lora.sh | 27 - examples/request-reference.json | 13 - examples/run-examples-ci.sh | 47 - examples/test-reference.sh | 21 - src/audio.h | 17 - src/audio_loader.cpp | 117 -- src/backend.h | 16 +- src/cond-enc.h | 5 +- src/debug.h | 2 - src/dit-graph.h | 96 +- src/dit-lora.cpp | 202 --- src/dit-sampler.h | 4 - src/dit.h | 24 - src/fsq-detok.h | 49 +- src/gguf-weights.h | 1 - src/metadata-fsm.h | 2 - src/prompt.h | 1 - src/qwen3-enc.h | 11 +- src/qwen3-lm.h | 13 +- src/request.cpp | 62 +- src/request.h | 27 +- src/safetensors.h | 107 -- src/vae-enc.h | 391 +++++ src/vae.h | 207 +-- src/wav.h | 101 -- tests/BF16.log | 
130 -- tests/CPU-BF16.log | 113 +- tests/CPU-Q4_K_M.log | 115 +- tests/CPU-Q5_K_M.log | 113 +- tests/CPU-Q6_K.log | 113 +- tests/CPU-Q8_0.log | 113 +- tests/CPU_BF16.log | 130 -- tests/CPU_Q4_K_M.log | 130 -- tests/CPU_Q5_K_M.log | 130 -- tests/CPU_Q6_K.log | 130 -- tests/CPU_Q8_0.log | 130 -- tests/CUDA-BF16.log | 115 +- tests/CUDA-Q4_K_M.log | 115 +- tests/CUDA-Q5_K_M.log | 113 +- tests/CUDA-Q6_K.log | 115 +- tests/CUDA-Q8_0.log | 113 +- tests/Metal_Q4_K_M.log | 835 ---------- tests/Metal_Q5_K_M.log | 835 ---------- tests/Metal_Q6_K.log | 819 ---------- tests/Metal_Q8_0.log | 823 ---------- tests/Q4_K_M.log | 130 -- tests/Q5_K_M.log | 130 -- tests/Q6_K.log | 130 -- tests/Q8_0.log | 130 -- tests/Vulkan-BF16.log | 229 ++- tests/Vulkan-Q4_K_M.log | 185 ++- tests/Vulkan-Q5_K_M.log | 175 +- tests/Vulkan-Q6_K.log | 229 ++- tests/Vulkan-Q8_0.log | 227 ++- tests/Vulkan_BF16.log | 130 -- tests/Vulkan_Q4_K_M.log | 130 -- tests/Vulkan_Q5_K_M.log | 130 -- tests/Vulkan_Q6_K.log | 130 -- tests/Vulkan_Q8_0.log | 130 -- tests/fixtures/ci-cover.json | 12 - tests/fixtures/ci-dit-only.json | 11 - tests/fixtures/ci-full.json | 11 - tests/fixtures/ci-partial.json | 8 - tests/fixtures/ci-request-reference.json | 13 - tests/fixtures/ci-text2music.json | 11 - tests/run-generation-tests.sh | 83 - third_party/minimp3.h | 1865 ---------------------- tools/ace-qwen3.cpp | 3 +- tools/dit-vae.cpp | 136 +- tools/neural-codec.cpp | 522 ++++++ tools/quantize.cpp | 1 - 83 files changed, 2313 insertions(+), 9919 deletions(-) delete mode 100644 .github/workflows/build.yml delete mode 100644 .github/workflows/test-generation.yml delete mode 100755 build.sh delete mode 100644 docs/MODES.md delete mode 100644 examples/cover-reference.json delete mode 100755 examples/cover-reference.sh delete mode 100644 examples/cover.json delete mode 100755 examples/cover.sh delete mode 100644 examples/lora.json delete mode 100755 examples/lora.sh delete mode 100644 examples/request-reference.json delete mode 100755 
examples/run-examples-ci.sh delete mode 100755 examples/test-reference.sh delete mode 100644 src/audio.h delete mode 100644 src/audio_loader.cpp delete mode 100644 src/dit-lora.cpp delete mode 100644 src/safetensors.h create mode 100644 src/vae-enc.h delete mode 100644 src/wav.h delete mode 100644 tests/BF16.log delete mode 100644 tests/CPU_BF16.log delete mode 100644 tests/CPU_Q4_K_M.log delete mode 100644 tests/CPU_Q5_K_M.log delete mode 100644 tests/CPU_Q6_K.log delete mode 100644 tests/CPU_Q8_0.log delete mode 100644 tests/Metal_Q4_K_M.log delete mode 100644 tests/Metal_Q5_K_M.log delete mode 100644 tests/Metal_Q6_K.log delete mode 100644 tests/Metal_Q8_0.log delete mode 100644 tests/Q4_K_M.log delete mode 100644 tests/Q5_K_M.log delete mode 100644 tests/Q6_K.log delete mode 100644 tests/Q8_0.log delete mode 100644 tests/Vulkan_BF16.log delete mode 100644 tests/Vulkan_Q4_K_M.log delete mode 100644 tests/Vulkan_Q5_K_M.log delete mode 100644 tests/Vulkan_Q6_K.log delete mode 100644 tests/Vulkan_Q8_0.log delete mode 100644 tests/fixtures/ci-cover.json delete mode 100644 tests/fixtures/ci-dit-only.json delete mode 100644 tests/fixtures/ci-full.json delete mode 100644 tests/fixtures/ci-partial.json delete mode 100644 tests/fixtures/ci-request-reference.json delete mode 100644 tests/fixtures/ci-text2music.json delete mode 100755 tests/run-generation-tests.sh delete mode 100644 third_party/minimp3.h create mode 100644 tools/neural-codec.cpp diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index fe17246..0000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,44 +0,0 @@ -# Validate that the project builds on Ubuntu and macOS (no model download). 
-name: Build - -on: - push: - branches: [main, master] - pull_request: - branches: [main, master] - -jobs: - build: - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest, macos-latest] - - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Build (Ubuntu) - if: matrix.os == 'ubuntu-latest' - run: | - sudo apt-get update -qq - sudo apt-get install -y -qq cmake build-essential pkg-config libopenblas-dev - mkdir build && cd build - cmake .. -DGGML_BLAS=ON - cmake --build . --config Release -j$(nproc) - - - name: Build (macOS) - if: matrix.os == 'macos-latest' - run: | - mkdir build && cd build - cmake .. - cmake --build . --config Release -j$(sysctl -n hw.ncpu) - - - name: Smoke test - run: | - ./build/ace-qwen3 --help 2>&1 | head -5 - ./build/dit-vae --help 2>&1 | head -5 - ./build/quantize --help 2>&1 | head -3 diff --git a/.github/workflows/test-generation.yml b/.github/workflows/test-generation.yml deleted file mode 100644 index 2f84eed..0000000 --- a/.github/workflows/test-generation.yml +++ /dev/null @@ -1,61 +0,0 @@ -# Build, download models (cached), and run all example scripts with short CI fixtures. -# Validate locally: from repo root run ./examples/run-examples-ci.sh (after build + ./models.sh). -# Trigger: release (published) or workflow_dispatch only. -name: Test generation - -on: - workflow_dispatch: {} - release: - types: [published] - -jobs: - build-and-test: - runs-on: ubuntu-latest - timeout-minutes: 60 - - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Build - run: | - sudo apt-get update -qq - sudo apt-get install -y -qq cmake build-essential pkg-config libopenblas-dev - mkdir build && cd build - cmake .. -DGGML_BLAS=ON - cmake --build . 
--config Release -j$(nproc) - - # Restore/save model artifacts (see https://docs.github.com/en/actions/reference/workflows-and-actions/dependency-caching) - - name: Cache models - id: cache-models - uses: actions/cache@v4 - with: - path: models - key: ${{ runner.os }}-acestep-models-q8-${{ hashFiles('models.sh') }} - restore-keys: | - ${{ runner.os }}-acestep-models-q8- - ${{ runner.os }}-acestep-models- - - - name: Download models - if: steps.cache-models.outputs.cache-hit != 'true' - run: | - pip install -q hf - ./models.sh - - - name: Run examples (CI fixtures, short duration) - run: chmod +x examples/run-examples-ci.sh && ./examples/run-examples-ci.sh - - - name: Upload generated audio - uses: actions/upload-artifact@v4 - with: - name: generated-audio - path: | - examples/dit-only0.wav - examples/cover0.wav - examples/cover-reference0.wav - examples/request-reference0.wav - examples/simple00.wav - examples/partial00.wav - examples/full00.wav diff --git a/.gitignore b/.gitignore index 91ddabb..4ccb4f4 100644 --- a/.gitignore +++ b/.gitignore @@ -3,12 +3,7 @@ build/ *.bf16 tests/*/ -!tests/fixtures/ -!tests/fixtures/*.json -request.json -request0.json -tests/fixtures/*0.json checkpoints/ models/ __pycache__/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 99903d6..d7af387 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,12 @@ add_compile_definitions(GGML_MAX_NAME=128) # CUDA architectures: cover Turing to Blackwell for distributed binaries. # Users can override with -DCMAKE_CUDA_ARCHITECTURES=native for local builds. 
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real;120a-real;121a-real") + find_package(CUDAToolkit QUIET) + if(CUDAToolkit_FOUND AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8") + set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real;120a-real;121a-real") + else() + set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real") + endif() endif() # ggml as subdirectory, inherits GGML_CUDA, GGML_METAL, etc. from cmake flags @@ -54,13 +59,11 @@ endmacro() # Core library (shared between binaries) add_library(acestep-core STATIC src/request.cpp - src/audio_loader.cpp ) link_ggml_backends(acestep-core) -target_include_directories(acestep-core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) -# dit-vae: full pipeline (text-enc + cond + dit + vae + wav) + LoRA support -add_executable(dit-vae tools/dit-vae.cpp src/dit-lora.cpp) +# dit-vae: full pipeline (text-enc + cond + dit + vae + wav) +add_executable(dit-vae tools/dit-vae.cpp) target_link_libraries(dit-vae PRIVATE acestep-core) link_ggml_backends(dit-vae) @@ -72,3 +75,7 @@ link_ggml_backends(ace-qwen3) # quantize: GGUF requantizer (BF16 -> K-quants) add_executable(quantize tools/quantize.cpp) link_ggml_backends(quantize) + +# neural-codec: Oobleck VAE neural audio codec (encode/decode WAV <-> latent) +add_executable(neural-codec tools/neural-codec.cpp) +link_ggml_backends(neural-codec) diff --git a/README.md b/README.md index 25449d0..84f39dd 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # acestep.cpp Portable C++17 implementation of ACE-Step 1.5 music generation using GGML. -Text + lyrics in, stereo 48kHz WAV out. Runs on CPU, CUDA, Metal, Vulkan. +Text + lyrics in, stereo 48kHz WAV out. Runs on CPU, CUDA, ROCm, Metal, Vulkan. ## Build @@ -16,6 +16,9 @@ cmake .. # Linux with NVIDIA GPU cmake .. -DGGML_CUDA=ON +# Linux with AMD GPU (ROCm) +cmake .. -DGGML_HIP=ON + # Linux with Vulkan cmake .. -DGGML_VULKAN=ON @@ -29,7 +32,7 @@ cmake .. 
-DGGML_CUDA=ON -DGGML_BLAS=ON cmake --build . --config Release -j$(nproc) ``` -Builds two binaries: `ace-qwen3` (LLM) and `dit-vae` (DiT + VAE). +Builds three binaries: `ace-qwen3` (LLM), `dit-vae` (DiT + VAE) and `neural-codec` (VAE encode/decode). ## Models @@ -94,13 +97,13 @@ EOF # LLM: request.json -> request0.json (enriched with lyrics + codes) ./build/ace-qwen3 \ --request /tmp/request.json \ - --model models/acestep-5Hz-lm-4B-BF16.gguf + --model models/acestep-5Hz-lm-4B-Q8_0.gguf # DiT+VAE: request0.json -> request00.wav ./build/dit-vae \ --request /tmp/request0.json \ - --text-encoder models/Qwen3-Embedding-0.6B-BF16.gguf \ - --dit models/acestep-v15-turbo-BF16.gguf \ + --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit models/acestep-v15-turbo-Q8_0.gguf \ --vae models/vae-BF16.gguf ``` @@ -111,7 +114,7 @@ Generate multiple songs at once with `--batch`: # -> request0.json, request1.json (different lyrics/codes, seeds auto+0, auto+1) ./build/ace-qwen3 \ --request /tmp/request.json \ - --model models/acestep-5Hz-lm-4B-BF16.gguf \ + --model models/acestep-5Hz-lm-4B-Q8_0.gguf \ --batch 2 # DiT+VAE: (2 DiT variations of LM output 1 and 2) @@ -119,8 +122,8 @@ Generate multiple songs at once with `--batch`: # -> request1.json -> request10.wav, request11.wav ./build/dit-vae \ --request /tmp/request0.json /tmp/request1.json \ - --text-encoder models/Qwen3-Embedding-0.6B-BF16.gguf \ - --dit models/acestep-v15-turbo-BF16.gguf \ + --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit models/acestep-v15-turbo-Q8_0.gguf \ --vae models/vae-BF16.gguf \ --batch 2 ``` @@ -151,34 +154,43 @@ Empty field = "fill it". Filled = "don't touch". All modes always output numbered files (`request0.json` .. `requestN-1.json`). The input JSON is never modified. -**Caption only**: the LLM generates lyrics, metadata (bpm, key, time -signature, duration) and audio codes. 
With `--batch N`, each element -generates its own lyrics and metadata from a different seed, producing -N completely different songs. See `examples/simple.json`. +**Caption only** (`lyrics=""`): two LLM passes. Phase 1 uses the "Expand" +prompt to generate lyrics and metadata (bpm, keyscale, timesignature, +duration) via CoT. Phase 2 reinjects the CoT and generates audio codes using +the "Generate tokens" prompt. CFG is forced to 1.0 in phase 1 (free +sampling); `lm_cfg_scale` only applies in phase 2. With `--batch N`, each +element runs its own phase 1 from a different seed, producing N completely +different songs. See `examples/simple.json`. -**Caption + lyrics (+ optional metadata)**: the LLM fills missing -metadata via CoT, then generates audio codes. User provided fields -are preserved. See `examples/partial.json`. +**Caption + lyrics (+ optional metadata)**: single LLM pass. The "Generate +tokens" prompt is used directly. Missing metadata is filled via CoT, then +audio codes are generated. User-provided fields are never overwritten. +`lm_cfg_scale` applies to both CoT and code generation. See +`examples/partial.json`. **Everything provided** (caption, lyrics, bpm, duration, keyscale, timesignature): the LLM skips CoT and generates audio codes directly. With `--batch N`, all elements share the same prompt (single prefill, KV cache copied). See `examples/full.json`. +**Instrumental** (`lyrics="[Instrumental]"`): treated as "lyrics provided", +so the single-pass "Generate tokens" path is used. No lyrics generation. +The DiT was trained with this exact string as the no-vocal condition. + **Passthrough** (`audio_codes` present): LLM is skipped entirely. Run `dit-vae` to decode existing codes. See `examples/dit-only.json`. ## Request JSON reference -All fields with defaults. Only `caption` is required. +Only `caption` is required. All other fields default to "unset" which means +the LLM fills them, or a sensible runtime default is applied. 
```json { "caption": "", "lyrics": "", - "instrumental": false, "bpm": 0, - "duration": -1, + "duration": 0, "keyscale": "", "timesignature": "", "vocal_language": "unknown", @@ -190,18 +202,98 @@ All fields with defaults. Only `caption` is required. "lm_negative_prompt": "", "audio_codes": "", "inference_steps": 8, - "guidance_scale": 7.0, + "guidance_scale": 0.0, "shift": 3.0 } ``` -Key fields: `seed` -1 means random (resolved once, then +1 per batch -element). `audio_codes` is generated by ace-qwen3 and consumed by -dit-vae (comma separated FSQ token IDs). When present, the LLM is -skipped entirely. +### Text conditioning (ace-qwen3 + dit-vae) + +**`caption`** (string, required) +Natural language description of the music style, mood, instruments, etc. +Fed to both the LLM and the DiT text encoder. + +**`lyrics`** (string, default `""`) +Controls vocal generation. Three valid states: +- `""`: LLM generates lyrics from the caption (phase 1 "Expand" prompt). +- `"[Instrumental]"`: no vocals. Passed directly to the DiT, LLM skips lyrics generation. +- Any other string: user-provided lyrics used as-is, LLM only fills missing metadata. + +There is no `instrumental` flag. This field is the single source of truth for +vocal content. + +### Metadata (LLM-filled if unset) + +**`bpm`** (int, default `0` = unset) +Beats per minute. LLM generates one if 0. + +**`duration`** (float seconds, default `0` = unset) +Target audio duration. `0` means the LLM picks it. Clamped to [1, 600]s after +generation. `1` means 1 second. + +**`keyscale`** (string, default `""` = unset) +Musical key and scale, e.g. `"C major"`, `"F# minor"`. LLM fills if empty. + +**`timesignature`** (string, default `""` = unset) +Time signature numerator as a string, e.g. `"4"` for 4/4, `"3"` for 3/4. +LLM fills if empty. + +**`vocal_language`** (string, default `"unknown"`) +BCP-47 language code for lyrics, e.g. `"en"`, `"fr"`, `"ja"`. 
When set and +lyrics are being generated, the FSM constrains the LLM output to that language. +`"unknown"` lets the LLM decide. + +### Generation control + +**`seed`** (int64, default `-1` = random) +RNG seed. Resolved once at startup to a random value if -1. Batch elements +use `seed+0`, `seed+1`, ... `seed+N-1`. + +**`audio_codes`** (string, default `""`) +Comma-separated FSQ token IDs produced by ace-qwen3. When non-empty, the +entire LLM pass is skipped and dit-vae decodes these codes directly +(passthrough / cover mode). + +### LM sampling (ace-qwen3) + +**`lm_temperature`** (float, default `0.85`) +Sampling temperature for both phase 1 (lyrics/metadata) and phase 2 (audio +codes). Lower = more deterministic. + +**`lm_cfg_scale`** (float, default `2.0`) +Classifier-Free Guidance scale for the LM. Only active in phase 2 (audio +code generation) and in phase 1 when lyrics are already provided. When +`lyrics` is empty, phase 1 always runs with `cfg=1.0` (free sampling). +`1.0` disables CFG. -Turbo preset: `inference_steps=8, shift=3.0` (no guidance_scale, turbo models don't use CFG). -SFT preset: `inference_steps=50, guidance_scale=4.0, shift=6.0`. +**`lm_top_p`** (float, default `0.9`) +Nucleus sampling cutoff. `1.0` disables. When `top_k=0`, an internal +pre-filter of 256 tokens is applied before top_p for performance. + +**`lm_top_k`** (int, default `0` = disabled) +Top-K sampling. `0` disables hard top-K (top_p still applies). + +**`lm_negative_prompt`** (string, default `""`) +Negative caption for CFG in phase 2. Empty string falls back to a +caption-less unconditional prompt. + +### DiT flow matching (dit-vae) + +**`inference_steps`** (int, default `8`) +Number of diffusion denoising steps. Turbo preset: `8`. SFT preset: `50`. + +**`guidance_scale`** (float, default `0.0` = auto) +CFG scale for the DiT. `0.0` is resolved at runtime: +- Turbo models: forced to `1.0` (CFG disabled, turbo was trained without it). +- SFT/base models: `7.0`. 
+Any value > 1.0 on a turbo model is overridden to 1.0 with a warning. + +**`shift`** (float, default `3.0`) +Flow-matching schedule shift. Controls the timestep distribution. +`shift = s*t / (1 + (s-1)*t)`. Turbo preset: `3.0`. SFT preset: `6.0`. + +Turbo preset: `inference_steps=8, shift=3.0` (guidance_scale auto-resolved to 1.0). +SFT preset: `inference_steps=50, guidance_scale=7.0, shift=6.0`. ## ace-qwen3 reference @@ -258,6 +350,71 @@ Debug: Models are loaded once and reused across all requests. +## neural-codec + +GGML-native neural audio codec based on the Oobleck VAE encoder and decoder. +Serves two purposes: validating the precision of the full VAE chain (encode + +decode roundtrip), and compressing music at ~850 B/s with no perceptible +difference from the original. + +``` +Usage: neural-codec --vae --encode|--decode -i [-o ] [--q8|--q4] + +Required: + --vae VAE GGUF file + --encode | --decode Encode WAV to latent, or decode latent to WAV + -i Input (WAV for encode, latent for decode) + +Output: + -o Output file (auto-named if omitted) + --q8 Quantize latent to int8 (~13 kbit/s) + --q4 Quantize latent to int4 (~6.8 kbit/s) + +Output naming: song.wav -> song.latent (f32) or song.nac8 (Q8) or song.nac4 (Q4) + song.latent -> song.wav + +VAE tiling (memory control): + --vae-chunk Latent frames per tile (default: 256) + --vae-overlap Overlap frames per side (default: 64) + +Latent formats (decode auto-detects): + f32: flat [T, 64] f32, no header. ~51 kbit/s. + NAC8: header + per-frame Q8. ~13 kbit/s. + NAC4: header + per-frame Q4. ~6.8 kbit/s. +``` + +The encoder is the symmetric mirror of the decoder: same snake activations, +same residual units, strided conv1d for downsampling instead of transposed +conv1d for upsampling. No new GGML ops. Downsample 2x4x4x6x10 = 1920x. + +48kHz stereo audio is compressed to 64-dimensional latent frames at 25 Hz. 
+Three output formats, decode auto-detects from file content: + +| Format | Frame size | Bitrate | 3 min song | vs f32 (cossim) | +|--------|-----------|---------|------------|-----------------| +| f32 | 256B | 51 kbit/s | 1.1 MB | baseline | +| NAC8 | 66B | 13 kbit/s | 290 KB | 0.9999 | +| NAC4 | 34B | 6.8 kbit/s | 150 KB | 0.989 | + +NAC = Neural Audio Codec. The NAC8 and NAC4 file formats are headerless +except for a 4-byte magic (`NAC8` or `NAC4`) and a uint32 frame count. +Q8 quantization error is 39 dB below the VAE reconstruction error (free). +Q4 quantization error is 16 dB below the VAE reconstruction error (inaudible +on most material). + +```bash +# encode (Q4: 6.8 kbit/s, ~150 KB for 3 minutes) +neural-codec --vae models/vae-BF16.gguf --encode --q4 -i song.wav -o song.nac4 + +# encode (Q8: 13 kbit/s, ~290 KB for 3 minutes) +neural-codec --vae models/vae-BF16.gguf --encode --q8 -i song.wav -o song.nac8 + +# decode (auto-detects format) +neural-codec --vae models/vae-BF16.gguf --decode -i song.nac4 -o song_decoded.wav + +# roundtrip validation: compare song.wav and song_decoded.wav with your ears +``` + ## Architecture ``` @@ -278,6 +435,39 @@ dit-vae WAV stereo 48kHz ``` +## Roadmap + +This project started from a simple idea: a Telegram bot using llama.cpp to +prompt a music generator, and the desire to make GGML sing. No more, no less. +No cloud, no black box, scriptable and nothing between you and the model. + +### LLM modes +- [ ] Remaining modes: Understand, Rewrite (single-pass, no audio codes) +- [ ] Reference audio input: repaint and cover tasks (src_audio + cover_strength) + +### Audio I/O +Current: raw PCM f32 WAV via hand-rolled writer, no external deps. 
+Trade-off to document: +- **Keep as-is**: zero dependencies, clean licensing, works everywhere +- **ffmpeg pipe**: trivial bash wrapper handles any codec/format, no C++ codec hell + - pro: MP3/FLAC/OGG out of the box, input resampling for reference audio + - con: runtime dependency, not embedded +Conclusion pending. Likely ffmpeg as optional external pipe, documented in README. + +### API and interface +- [ ] JSON HTTP server (minimal, well-documented, stable contract) +- [ ] Web interface on top - vibecodeable by anyone, API stays simple +Goal: document the internals and how the model actually works, +not reproduce the Python spaghetti. Expert-first, no commercial fluff. + +### Documentation +Current README is technical study + API reference, intentional. +- [ ] Split when a user-facing interface exists: README (user) + ARCHITECTURE.md (internals) + +### Future models +- [ ] ACE-Step 2.0: evaluate architecture delta, add headers/weights as needed +No commitment, easy to adapt by adding headers or new compilation units as needed. + ## LM specifics ace-qwen3 is not a general-purpose chat engine. It is a two-phase autoregressive @@ -318,7 +508,7 @@ python3 debug-dit-cossim.py # DiT: per-layer cossim GGML vs Python (turbo/ ## Patched GGML fork Uses a patched GGML fork (submodule) with two new ops, a Metal im2col optimization, and -a CUDA bugfix for the Oobleck VAE decoder. All backends: CPU, CUDA, Metal, Vulkan. +a CUDA bugfix for the Oobleck VAE decoder. All backends: CPU, CUDA, ROCm, Metal, Vulkan. F32/F16/BF16 data types. The DiT uses only standard GGML ops and needs no patches. The VAE reconstructs audio from latent space through 5 upsampling blocks (total 1920x), @@ -373,6 +563,19 @@ Upstream `im2col_kernel` uses OW directly as grid dimension Y, which exceeds the times per tile at output widths up to 491520. Fixed with a grid-stride loop on OW and `MIN(OW, MAX_GRIDDIM_Z)` clamping. 
+### Upstream divergence + +The GGML submodule diverges from upstream only by the addition of +`GGML_OP_SNAKE` and `GGML_OP_COL2IM_1D`. No existing upstream kernel is +modified. These ops are required; the VAE does not work without them. + +An earlier approach patched the upstream naive ops instead of adding custom +ones. Those patches were dropped. They are documented here in case someone +wants to study the naive path: + +- `conv_transpose_1d`: bounded loop replacing O(T_in) brute-force, CUDA and Metal +- `im2col`: grid-stride loop on OW to fix gridDim.y overflow for large tensors + ## Acknowledgements Independent implementation based on ACE-Step 1.5 by ACE Studio and StepFun. @@ -387,3 +590,15 @@ All model weights are theirs, this is just a native backend. note={GitHub repository} } ``` + +## Samples + +https://github.com/user-attachments/assets/9a50c1f4-9ec0-474a-bd14-e8c6b00622a1 + +https://github.com/user-attachments/assets/fb606249-0269-4153-b651-bf78e05baf22 + +https://github.com/user-attachments/assets/e0580468-5e33-4a1f-a0f4-b914e4b9a8c2 + +https://github.com/user-attachments/assets/292a31f1-f97e-4060-9207-ed8364d9a794 + +https://github.com/user-attachments/assets/34b1b781-a5bc-46c4-90a6-615a10bc2c6a diff --git a/build.sh b/build.sh deleted file mode 100755 index 67f711f..0000000 --- a/build.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -rm -rf build -mkdir build -cd build - -cmake .. -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -cmake --build . --config Release -j "$(nproc)" diff --git a/docs/MODES.md b/docs/MODES.md deleted file mode 100644 index ae0b616..0000000 --- a/docs/MODES.md +++ /dev/null @@ -1,63 +0,0 @@ -# ACE-Step 1.5 built-in modes (acestep.cpp) - -This document maps the [ACE-Step 1.5 Tutorial](https://github.com/ace-step/ACE-Step-1.5/blob/main/docs/en/Tutorial.md) built-in modes to the current C++ implementation. 
- -## Task types (Tutorial: Input Control) - -| `task_type` | Description | Turbo/SFT | Base only | C++ status | -|---------------|-------------|-----------|-----------|------------| -| **text2music** | Generate from caption/lyrics (and optional reference) | ✅ | — | ✅ **Supported** | -| **cover** | Re-synthesize with structure from source; optional timbre from reference | ✅ | — | ✅ **Supported** (audio_codes or src_audio WAV/MP3) | -| **repaint** | Local edit in time range using source as context | ✅ | — | ❌ Not implemented | -| **lego** | Add new tracks to existing audio | — | ✅ | ❌ Base model only | -| **extract** | Extract single track from mix | — | ✅ | ❌ Base model only | -| **complete** | Add accompaniment to single track | — | ✅ | ❌ Base model only | - -We only ship Turbo and SFT DiT weights; **lego**, **extract**, **complete** require the Base DiT and are out of scope for now. - ---- - -## What we support today - -### text2music (default) -- **Input**: `caption`, optional `lyrics`, metadata (bpm, duration, keyscale, …). -- **Flow**: LM (optional) → CoT + audio codes → DiT (context = silence) → VAE → WAV. -- **Timbre**: Optional **reference_audio** (WAV/MP3) → VAE encode → CondEncoder timbre; else built-in silence. - -### cover (when `audio_codes` or `src_audio` are provided) -- **Input**: Same as text2music, plus either **precomputed** `audio_codes` or **`src_audio`** (WAV/MP3 path). Optional **reference_audio** for timbre. -- **Flow**: If `src_audio` set and no `audio_codes`: load WAV/MP3 → VAE encode → FSQ nearest-codeword encode → codes. Then decode codes to latents → DiT context (blend with silence) → DiT → VAE → WAV. No Python. -- **reference_audio** and **audio_cover_strength**: Implemented (timbre; blend). ---- - -## What’s not implemented yet - -### repaint -- **Tutorial**: Specify `repainting_start` / `repainting_end` (seconds); model uses source audio as context and only generates in that interval (3–90 s). 
-- **C++**: Would require **masked diffusion**: context carries “given” frames; ODE only updates the repaint region. DiT’s context has a 64-channel “mask” that we currently set to 1.0; repaint would set mask per frame and the generation loop would only update unmasked frames. Not implemented. - ---- - -## Request fields (aligned with Tutorial) - -All of these are in `AceRequest` and parsed from / written to JSON. Backend behavior is as above. - -| Field | Type | Purpose | -|-------|------|--------| -| `task_type` | string | `"text2music"` \| `"cover"` \| `"repaint"` \| … | -| `reference_audio` | string | Path to WAV or MP3 for timbre (implemented) | -| `src_audio` | string | Path to WAV or MP3 for cover source; encoded to codes internally (implemented) | -| `audio_codes` | string | Comma-separated FSQ codes; non-empty ⇒ cover path (or from `src_audio`) | -| `audio_cover_strength` | float | 0.0–1.0 blend of decoded context with silence (implemented) | -| `repainting_start` | float | Start time (s) for repaint (not used yet) | -| `repainting_end` | float | End time (s) for repaint (not used yet) | - -See `request.h` and the README “Request JSON reference” for the full list. - ---- - -## Summary - -- **Fully supported**: text2music (with optional reference_audio for timbre); cover from **precomputed** `audio_codes` or from **WAV/MP3** via `src_audio` (VAE encode + FSQ nearest-codeword encode); reference_audio (timbre); audio_cover_strength (blend). -- **Schema only** (no backend): `repainting_start`/`repainting_end`. -- **To support repaint**: implement masked DiT generation (context mask + ODE only on repaint interval). 
diff --git a/examples/cover-reference.json b/examples/cover-reference.json deleted file mode 100644 index 313d419..0000000 --- a/examples/cover-reference.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "task_type": "cover", - "caption": "Cover with timbre from reference WAV/MP3", - "duration": 10, - "bpm": 83, - "keyscale": "G major", - "timesignature": "4", - "vocal_language": "fr", - "inference_steps": 8, - "guidance_scale": 1, - "shift": 3, - "seed": 42, - "audio_cover_strength": 0.9, - "audio_codes": "43316,18426,13366,59455,17783,49303,7423,29855,37158,37157,62317,61455,12847,19583,57031,34656,20254,10770,11416,15905,31413,23339,47091,12198,49531,37355,33090,38645,40707,16324,61436,46095,13941,5287,2239,13975,63815,2757,4862,13571,63495,39,29887,49426,12696,50847,40498,61056,25666,12989,23987,54763,25485,31683,28554,25355,16373,28995,2351,1655,7940,55831,34359,15350,15277,11717,20476,52239,5015,19807,24087,3559,20471,34193,32552,60999,29360,25338,38873,16768,17912,27584,24008,1528,449,25563,52684,53223,42183,37215,12343,39431,26055,28148,57286,38382,28863,7191,58397,18991,7695,30716,36784,12687,8707,25649,33718,3202,23035,10747,26354,63965,16260,11223,45679,14343,8679,49351,52927,2535,19207,46447,49615,12694,21110,46597,60991,27711,49751,54656,30448,33125,13585,29256,5161,42434,11753,39402,60354,21953,39532,14282,52160,34248,16304,4671,14172,5127,25991,15343,8583,61902,16328,31700,48415,28879,11215,52715,25541,11203,7695,63951,33803,40453,17750,28006,8231,40464,3136,51006,23839,18711,18711,18711,18711,3343,3279,2823,16071,3271,2823,2319,55815,40260,16215,12047,16631,26927,21863,20060,10166,51070,39,12099,63440,18418,25271,10792,2128,44166,53750,41263,44247,61287,42303,27614,21997,24879,38799,12648,38341,36833,19408,11769,2979,63979,44239,25559,27591,17626,44087,33796,4901,53176,57399,37180,38024,9216,63485,2005,13656,15914,45576,29194,45624,62332,53237,63988,40332,20486,31367,10951,46207,22231,63479,38877,17262,49335,42045,57388", - "reference_audio": "reference.wav" 
-} \ No newline at end of file diff --git a/examples/cover-reference.sh b/examples/cover-reference.sh deleted file mode 100755 index 2687b09..0000000 --- a/examples/cover-reference.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -# Cover mode with reference timbre: audio_codes + reference_audio (WAV or MP3). -# Put a WAV/MP3 at reference.wav (or reference.mp3) or set reference_audio in cover-reference.json. -# Requires VAE GGUF with encoder weights (same as request-reference / test-reference). -set -eu -cd "$(dirname "$0")" - -if [ ! -f "reference.wav" ] && [ ! -f "reference.mp3" ]; then - echo "No reference.wav or reference.mp3 found. Copy a file to reference.wav (or .mp3), or set reference_audio in cover-reference.json." - echo "Then run: $0" - exit 1 -fi - -../build/dit-vae \ - --request cover-reference.json \ - --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ - --dit ../models/acestep-v15-turbo-Q8_0.gguf \ - --vae ../models/vae-BF16.gguf - -echo "Done. Check cover-reference0.wav" diff --git a/examples/cover.json b/examples/cover.json deleted file mode 100644 index 82cbaaa..0000000 --- a/examples/cover.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "task_type": "cover", - "caption": "Re-synthesize from precomputed codes (e.g. 
from a previous ace-qwen3 run)", - "duration": 10, - "bpm": 83, - "keyscale": "G major", - "timesignature": "4", - "vocal_language": "fr", - "inference_steps": 8, - "guidance_scale": 1, - "shift": 3, - "seed": 42, - "audio_cover_strength": 0.9, - "audio_codes": "43316,18426,13366,59455,17783,49303,7423,29855,37158,37157,62317,61455,12847,19583,57031,34656,20254,10770,11416,15905,31413,23339,47091,12198,49531,37355,33090,38645,40707,16324,61436,46095,13941,5287,2239,13975,63815,2757,4862,13571,63495,39,29887,49426,12696,50847,40498,61056,25666,12989,23987,54763,25485,31683,28554,25355,16373,28995,2351,1655,7940,55831,34359,15350,15277,11717,20476,52239,5015,19807,24087,3559,20471,34193,32552,60999,29360,25338,38873,16768,17912,27584,24008,1528,449,25563,52684,53223,42183,37215,12343,39431,26055,28148,57286,38382,28863,7191,58397,18991,7695,30716,36784,12687,8707,25649,33718,3202,23035,10747,26354,63965,16260,11223,45679,14343,8679,49351,52927,2535,19207,46447,49615,12694,21110,46597,60991,27711,49751,54656,30448,33125,13585,29256,5161,42434,11753,39402,60354,21953,39532,14282,52160,34248,16304,4671,14172,5127,25991,15343,8583,61902,16328,31700,48415,28879,11215,52715,25541,11203,7695,63951,33803,40453,17750,28006,8231,40464,3136,51006,23839,18711,18711,18711,18711,3343,3279,2823,16071,3271,2823,2319,55815,40260,16215,12047,16631,26927,21863,20060,10166,51070,39,12099,63440,18418,25271,10792,2128,44166,53750,41263,44247,61287,42303,27614,21997,24879,38799,12648,38341,36833,19408,11769,2979,63979,44239,25559,27591,17626,44087,33796,4901,53176,57399,37180,38024,9216,63485,2005,13656,15914,45576,29194,45624,62332,53237,63988,40332,20486,31367,10951,46207,22231,63479,38877,17262,49335,42045,57388" -} \ No newline at end of file diff --git a/examples/cover.sh b/examples/cover.sh deleted file mode 100755 index 14d340e..0000000 --- a/examples/cover.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -# Cover mode: decode precomputed audio_codes to WAV (no LLM). 
-# Use cover.json as-is, or replace audio_codes with output from a previous run: -# ../build/ace-qwen3 --request simple.json --model ../models/acestep-5Hz-lm-4B-Q8_0.gguf -# # then use simple0.json as input, or copy its audio_codes into cover.json -set -eu -cd "$(dirname "$0")" - -../build/dit-vae \ - --request cover.json \ - --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ - --dit ../models/acestep-v15-turbo-Q8_0.gguf \ - --vae ../models/vae-BF16.gguf - -echo "Done. Check cover0.wav" diff --git a/examples/lora.json b/examples/lora.json deleted file mode 100644 index c872efb..0000000 --- a/examples/lora.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "task_type": "text2music", - "caption": "An energetic nu-disco track built on a foundation of a tight, funky slap bassline and a crisp, four-on-the-floor drum machine beat. The song opens with a distinctive, filtered wah-wah guitar riff that serves as a recurring motif. The arrangement is layered with shimmering synth pads, punchy synth stabs, and subtle arpeggiated synth textures that add movement. The track progresses through dynamic sections, including a brief atmospheric breakdown before rebuilding the main groove.", - "genre": "Nu-disco", - "lyrics": "[Instrumental]", - "bpm": 115, - "keyscale": "C# major", - "timesignature": "4", - "duration": 256, - "language": "unknown", - "instrumental": true, - "custom_tag": "crydamoure", - "inference_steps": 8, - "guidance_scale": 1, - "shift": 3, - "seed": -1 -} diff --git a/examples/lora.sh b/examples/lora.sh deleted file mode 100755 index db7ce2b..0000000 --- a/examples/lora.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# LoRA example: generate with a PEFT LoRA adapter (e.g. duckdbot/acestep-lora-cryda). -# Requires adapter_model.safetensors in lora/ (download once; see below). -set -eu -cd "$(dirname "$0")" - -ADAPTER="lora/adapter_model.safetensors" -if [ ! 
-f "$ADAPTER" ]; then - echo "LoRA adapter not found at $ADAPTER" - exit 1 -fi - -# LLM: fill lyrics + codes -../build/ace-qwen3 \ - --request lora.json \ - --model ../models/acestep-5Hz-lm-4B-Q8_0.gguf - -# DiT+VAE with LoRA (scale = alpha/rank; 1.0 is typical) -../build/dit-vae \ - --request lora0.json \ - --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ - --dit ../models/acestep-v15-turbo-Q8_0.gguf \ - --vae ../models/vae-BF16.gguf \ - --lora "$ADAPTER" \ - --lora-scale 1.0 - -echo "Done. Check lora00.wav" diff --git a/examples/request-reference.json b/examples/request-reference.json deleted file mode 100644 index 55479ee..0000000 --- a/examples/request-reference.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "task_type": "text2music", - "caption": "Calm acoustic guitar, soft male vocal, intimate", - "lyrics": "[Verse]\nQuiet strings and a gentle tune\n[Chorus]\nHold the moment in this room", - "duration": 30, - "seed": 42, - "inference_steps": 8, - "guidance_scale": 1.0, - "shift": 3.0, - "reference_audio": "reference.wav", - "audio_codes": "", - "audio_cover_strength": 1.0 -} diff --git a/examples/run-examples-ci.sh b/examples/run-examples-ci.sh deleted file mode 100755 index 362f92a..0000000 --- a/examples/run-examples-ci.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -# Run all example scripts with short-duration CI fixtures (from repo root). -# Prereqs: build/ and models/ present; run after build and ./models.sh. -set -eu -cd "$(dirname "$0")/.." 
-EXAMPLES=examples -cd "$EXAMPLES" - -run() { echo "== $*" && "$@"; } - -# 1) DiT-only (no LLM), 5s -run cp ../tests/fixtures/ci-dit-only.json dit-only.json -run ./dit-only.sh -test -f dit-only0.wav && echo "dit-only OK" - -# 2) Cover from precomputed audio_codes (existing cover.json, 10s) -run ./cover.sh -test -f cover0.wav && echo "cover OK" - -# 3) reference.wav for cover-reference and test-reference -run cp cover0.wav reference.wav - -# 4) Cover + reference timbre -run ./cover-reference.sh -test -f cover-reference0.wav && echo "cover-reference OK" - -# 5) text2music with reference_audio -run cp ../tests/fixtures/ci-request-reference.json request-reference.json -run ./test-reference.sh -test -f request-reference0.wav && echo "test-reference OK" - -# 6) Simple (caption only, LLM fills), 5s -run cp ../tests/fixtures/ci-text2music.json simple.json -run ./simple.sh -test -f simple00.wav && echo "simple OK" - -# 7) Partial (caption + lyrics + duration), 5s -run cp ../tests/fixtures/ci-partial.json partial.json -run ./partial.sh -test -f partial00.wav && echo "partial OK" - -# 8) Full (all metadata), 5s -run cp ../tests/fixtures/ci-full.json full.json -run ./full.sh -test -f full00.wav && echo "full OK" - -echo "All example scripts passed." diff --git a/examples/test-reference.sh b/examples/test-reference.sh deleted file mode 100755 index 073a465..0000000 --- a/examples/test-reference.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# Test reference_audio (WAV) and audio_cover_strength. -# Put a WAV file at reference.wav (or set reference_audio path in request-reference.json). -# Requires: built dit-vae, --vae with encoder weights, and models in ../models/. - -set -eu -cd "$(dirname "$0")" - -if [ ! -f "reference.wav" ]; then - echo "No reference.wav found. Copy a WAV file to reference.wav (stereo 48kHz or any rate; will be resampled)." 
- echo "Then run: $0" - exit 1 -fi - -../build/dit-vae \ - --request request-reference.json \ - --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ - --dit ../models/acestep-v15-turbo-Q8_0.gguf \ - --vae ../models/vae-BF16.gguf - -echo "Done. Check request-reference0.wav (and request-reference1.wav if --batch 2)." diff --git a/src/audio.h b/src/audio.h deleted file mode 100644 index bd90915..0000000 --- a/src/audio.h +++ /dev/null @@ -1,17 +0,0 @@ -// audio.h: unified reference-audio loader (WAV + MP3 → stereo 48kHz float) -// Header-only for WAV; MP3 implementation in audio_loader.cpp (minimp3, no temp files). - -#pragma once - -#include -#include -#include - -// Load WAV or MP3 file into stereo float32 at 48kHz. -// Out: interleaved L,R,L,R,...; length = num_samples (per channel). -// Returns num_samples (per channel), or -1 on error. -// No temp files; MP3 decoded in memory via minimp3 (header-only dep). -int load_audio_48k_stereo(const char * path, std::vector * out); - -// MP3 implementation (in audio_loader.cpp; do not call from other TUs without linking it) -int mp3_load_48k_stereo(const char * path, std::vector * out); diff --git a/src/audio_loader.cpp b/src/audio_loader.cpp deleted file mode 100644 index a0e71b5..0000000 --- a/src/audio_loader.cpp +++ /dev/null @@ -1,117 +0,0 @@ -// audio_loader.cpp: MP3 decode for reference audio (minimp3, no deps, no temp files) - -#define MINIMP3_IMPLEMENTATION -#include "third_party/minimp3.h" - -#include "wav.h" -#include "audio.h" -#include -#include -#include -#include - -static bool path_ends_with_ci(const char * path, const char * suffix) { - size_t pl = strlen(path), sl = strlen(suffix); - if (pl < sl) return false; - const char * p = path + pl - sl; - for (size_t i = 0; i < sl; i++) { - char a = (char)(p[i] >= 'A' && p[i] <= 'Z' ? p[i] + 32 : p[i]); - char b = (char)(suffix[i] >= 'A' && suffix[i] <= 'Z' ? 
suffix[i] + 32 : suffix[i]); - if (a != b) return false; - } - return true; -} - -static void pcm_to_float_stereo_48k( - const int16_t * pcm, size_t num_samples, int channels, unsigned int sample_rate, - std::vector * out) -{ - const float scale = 1.0f / 32768.0f; - out->resize(num_samples * 2); - if (channels == 1) { - for (size_t i = 0; i < num_samples; i++) { - float s = (float)pcm[i] * scale; - (*out)[i * 2] = s; - (*out)[i * 2 + 1] = s; - } - } else { - for (size_t i = 0; i < num_samples * 2; i++) - (*out)[i] = (float)pcm[i] * scale; - } - - if (sample_rate != 48000) { - size_t in_len = num_samples; - size_t out_len = (size_t)((double)in_len * 48000.0 / (double)sample_rate); - std::vector resampled(out_len * 2); - for (size_t i = 0; i < out_len; i++) { - double t = (double)i * (double)in_len / (double)out_len; - size_t i0 = (size_t)t; - size_t i1 = std::min(i0 + 1, in_len - 1); - float w = (float)(t - (double)i0); - for (int c = 0; c < 2; c++) - resampled[i * 2 + c] = (*out)[i0 * 2 + c] * (1.0f - w) + (*out)[i1 * 2 + c] * w; - } - *out = std::move(resampled); - } -} - -int mp3_load_48k_stereo(const char * path, std::vector * out) { - FILE * f = fopen(path, "rb"); - if (!f) return -1; - fseek(f, 0, SEEK_END); - long sz = ftell(f); - fseek(f, 0, SEEK_SET); - if (sz <= 0 || sz > 200 * 1024 * 1024) { - fclose(f); - return -1; - } - std::vector buf((size_t)sz); - if (fread(buf.data(), 1, (size_t)sz, f) != (size_t)sz) { - fclose(f); - return -1; - } - fclose(f); - - mp3dec_t dec; - mp3dec_init(&dec); - mp3dec_frame_info_t info; - std::vector pcm; - const uint8_t * read_pos = buf.data(); - int remaining = (int)buf.size(); - int first_hz = 0, first_ch = 0; - const size_t max_samples = (size_t)(60 * 48000 * 2); - - while (remaining > 0) { - size_t old_size = pcm.size(); - if (old_size + (size_t)MINIMP3_MAX_SAMPLES_PER_FRAME > max_samples) break; - pcm.resize(old_size + (size_t)MINIMP3_MAX_SAMPLES_PER_FRAME); - int frame_samples = mp3dec_decode_frame(&dec, read_pos, 
remaining, pcm.data() + old_size, &info); - if (frame_samples <= 0) { - pcm.resize(old_size); - read_pos++; - remaining--; - continue; - } - if (first_hz == 0) { - first_hz = info.hz; - first_ch = info.channels; - } - pcm.resize(old_size + (size_t)(frame_samples * info.channels)); - read_pos += info.frame_bytes; - remaining -= info.frame_bytes; - } - - if (pcm.empty() || first_hz == 0) return -1; - size_t num_samples = pcm.size() / (size_t)first_ch; - pcm_to_float_stereo_48k(pcm.data(), num_samples, first_ch, (unsigned)first_hz, out); - return (int)(out->size() / 2); -} - -int load_audio_48k_stereo(const char * path, std::vector * out) { - if (!path || !out) return -1; - if (path_ends_with_ci(path, ".mp3")) - return mp3_load_48k_stereo(path, out); - if (path_ends_with_ci(path, ".wav")) - return wav_load_48k_stereo(path, out); - return -1; -} diff --git a/src/backend.h b/src/backend.h index 4b8566b..df33975 100644 --- a/src/backend.h +++ b/src/backend.h @@ -13,6 +13,7 @@ extern "C" int cudaDeviceGetAttribute(int *, int, int); #endif #include +#include #include #include @@ -41,6 +42,10 @@ static BackendPair backend_init(const char * label) { ggml_backend_load_all(); BackendPair bp = {}; bp.backend = ggml_backend_init_best(); + if (!bp.backend) { + fprintf(stderr, "[Load] FATAL: no backend available\n"); + exit(1); + } int n_threads = (int)std::thread::hardware_concurrency() / 2; if (n_threads < 1) n_threads = 1; // [GGML] If best backend is already CPU, reuse it (avoid 2 CPU instances @@ -51,6 +56,10 @@ static BackendPair backend_init(const char * label) { ggml_backend_cpu_set_n_threads(bp.backend, n_threads); } else { bp.cpu_backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL); + if (!bp.cpu_backend) { + fprintf(stderr, "[Load] FATAL: failed to init CPU backend\n"); + exit(1); + } ggml_backend_cpu_set_n_threads(bp.cpu_backend, n_threads); } fprintf(stderr, "[Load] %s backend: %s (CPU threads: %d)\n", @@ -87,5 +96,10 @@ static void 
backend_release(ggml_backend_t backend, ggml_backend_t cpu_backend) static ggml_backend_sched_t backend_sched_new(BackendPair bp, int max_nodes) { ggml_backend_t backends[2] = { bp.backend, bp.cpu_backend }; int n = (bp.backend == bp.cpu_backend) ? 1 : 2; - return ggml_backend_sched_new(backends, NULL, n, max_nodes, false, true); + ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, n, max_nodes, false, true); + if (!sched) { + fprintf(stderr, "[Load] FATAL: failed to create scheduler\n"); + exit(1); + } + return sched; } diff --git a/src/cond-enc.h b/src/cond-enc.h index e85b6fd..ba53163 100644 --- a/src/cond-enc.h +++ b/src/cond-enc.h @@ -270,7 +270,10 @@ static void cond_ggml_forward(CondGGML * m, if (timbre_out) ggml_build_forward_expand(gf, timbre_out); // Allocate and set inputs - ggml_backend_sched_alloc_graph(m->sched, gf); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + fprintf(stderr, "[CondEncoder] FATAL: failed to allocate graph\n"); + exit(1); + } ggml_backend_tensor_set(t_lyric_in, lyric_embed, 0, 1024 * S_lyric * sizeof(float)); ggml_backend_tensor_set(t_text_in, text_hidden, 0, 1024 * S_text * sizeof(float)); diff --git a/src/debug.h b/src/debug.h index dc7a626..a32cd11 100644 --- a/src/debug.h +++ b/src/debug.h @@ -6,8 +6,6 @@ #include #include #include -#include -#include #include struct DebugDumper { diff --git a/src/dit-graph.h b/src/dit-graph.h index 1241bc5..abe64d0 100644 --- a/src/dit-graph.h +++ b/src/dit-graph.h @@ -10,10 +10,7 @@ #include "dit.h" -#include -#include #include -#include // Helper: ensure tensor is f32 (cast if bf16/f16) static struct ggml_tensor * dit_ggml_f32( @@ -44,23 +41,6 @@ static struct ggml_tensor * dit_ggml_linear( return ggml_mul_mat(ctx, weight, input); } -// Linear with optional LoRA: out = W@x + scale * (B@(A@x)). lora_a/lora_b may be NULL. 
-static struct ggml_tensor * dit_ggml_linear_lora( - struct ggml_context * ctx, - struct ggml_tensor * weight, - struct ggml_tensor * lora_a, // [in, r] - struct ggml_tensor * lora_b, // [r, out] - float lora_scale, - struct ggml_tensor * input) { - struct ggml_tensor * out = ggml_mul_mat(ctx, weight, input); - if (lora_a && lora_b && lora_scale != 0.0f) { - struct ggml_tensor * ax = ggml_mul_mat(ctx, lora_a, input); - struct ggml_tensor * bax = ggml_mul_mat(ctx, lora_b, ax); - out = ggml_add(ctx, out, ggml_scale(ctx, bax, lora_scale)); - } - return out; -} - // Helper: Linear layer with bias static struct ggml_tensor * dit_ggml_linear_bias( struct ggml_context * ctx, @@ -181,36 +161,20 @@ static struct ggml_tensor * dit_ggml_build_self_attn( struct ggml_tensor * q, * k, * v; int q_dim = Nh * D; int kv_dim = Nkv * D; - float lora_scale = m->lora_scale; if (ly->sa_qkv) { struct ggml_tensor * qkv = dit_ggml_linear(ctx, ly->sa_qkv, norm_sa); q = ggml_cont(ctx, ggml_view_3d(ctx, qkv, q_dim, S, N, qkv->nb[1], qkv->nb[2], 0)); k = ggml_cont(ctx, ggml_view_3d(ctx, qkv, kv_dim, S, N, qkv->nb[1], qkv->nb[2], (size_t)q_dim * qkv->nb[0])); v = ggml_cont(ctx, ggml_view_3d(ctx, qkv, kv_dim, S, N, qkv->nb[1], qkv->nb[2], (size_t)(q_dim + kv_dim) * qkv->nb[0])); - // LoRA on fused path: add scale * (B @ (A @ x)) per projection when adapters are loaded - if (lora_scale != 0.0f) { - if (ly->lora_sa_q_a && ly->lora_sa_q_b) - q = ggml_add(ctx, q, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_q_b, ggml_mul_mat(ctx, ly->lora_sa_q_a, norm_sa)), lora_scale)); - if (ly->lora_sa_k_a && ly->lora_sa_k_b) - k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_k_b, ggml_mul_mat(ctx, ly->lora_sa_k_a, norm_sa)), lora_scale)); - if (ly->lora_sa_v_a && ly->lora_sa_v_b) - v = ggml_add(ctx, v, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_v_b, ggml_mul_mat(ctx, ly->lora_sa_v_a, norm_sa)), lora_scale)); - } } else if (ly->sa_qk) { struct ggml_tensor * qk = dit_ggml_linear(ctx, 
ly->sa_qk, norm_sa); q = ggml_cont(ctx, ggml_view_3d(ctx, qk, q_dim, S, N, qk->nb[1], qk->nb[2], 0)); k = ggml_cont(ctx, ggml_view_3d(ctx, qk, kv_dim, S, N, qk->nb[1], qk->nb[2], (size_t)q_dim * qk->nb[0])); - if (lora_scale != 0.0f) { - if (ly->lora_sa_q_a && ly->lora_sa_q_b) - q = ggml_add(ctx, q, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_q_b, ggml_mul_mat(ctx, ly->lora_sa_q_a, norm_sa)), lora_scale)); - if (ly->lora_sa_k_a && ly->lora_sa_k_b) - k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_k_b, ggml_mul_mat(ctx, ly->lora_sa_k_a, norm_sa)), lora_scale)); - } - v = dit_ggml_linear_lora(ctx, ly->sa_v_proj, ly->lora_sa_v_a, ly->lora_sa_v_b, lora_scale, norm_sa); + v = dit_ggml_linear(ctx, ly->sa_v_proj, norm_sa); } else { - q = dit_ggml_linear_lora(ctx, ly->sa_q_proj, ly->lora_sa_q_a, ly->lora_sa_q_b, lora_scale, norm_sa); - k = dit_ggml_linear_lora(ctx, ly->sa_k_proj, ly->lora_sa_k_a, ly->lora_sa_k_b, lora_scale, norm_sa); - v = dit_ggml_linear_lora(ctx, ly->sa_v_proj, ly->lora_sa_v_a, ly->lora_sa_v_b, lora_scale, norm_sa); + q = dit_ggml_linear(ctx, ly->sa_q_proj, norm_sa); + k = dit_ggml_linear(ctx, ly->sa_k_proj, norm_sa); + v = dit_ggml_linear(ctx, ly->sa_v_proj, norm_sa); } // 2) Reshape to heads: [Nh*D, S, N] -> [D, Nh, S, N] @@ -272,7 +236,7 @@ static struct ggml_tensor * dit_ggml_build_self_attn( } // 8) O projection: [Nh*D, S, N] -> [H, S, N] - struct ggml_tensor * out = dit_ggml_linear_lora(ctx, ly->sa_o_proj, ly->lora_sa_o_a, ly->lora_sa_o_b, m->lora_scale, attn); + struct ggml_tensor * out = dit_ggml_linear(ctx, ly->sa_o_proj, attn); return out; } @@ -286,34 +250,20 @@ static struct ggml_tensor * dit_ggml_build_mlp( struct ggml_tensor * norm_ffn, int S) { - DiTGGMLConfig & c = m->cfg; - int I = c.intermediate_size; - int N = (int)norm_ffn->ne[2]; - float lora_scale = m->lora_scale; struct ggml_tensor * ff; if (ly->gate_up) { // Fused: single matmul [H, 2*I] x [H, S, N] -> [2*I, S, N], then swiglu splits ne[0] struct ggml_tensor 
* gu = dit_ggml_linear(ctx, ly->gate_up, norm_ffn); - if (lora_scale != 0.0f && ((ly->lora_gate_a && ly->lora_gate_b) || (ly->lora_up_a && ly->lora_up_b))) { - struct ggml_tensor * gate = ggml_cont(ctx, ggml_view_3d(ctx, gu, I, S, N, gu->nb[1], gu->nb[2], 0)); - struct ggml_tensor * up = ggml_cont(ctx, ggml_view_3d(ctx, gu, I, S, N, gu->nb[1], gu->nb[2], (size_t)I * gu->nb[0])); - if (ly->lora_gate_a && ly->lora_gate_b) - gate = ggml_add(ctx, gate, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_gate_b, ggml_mul_mat(ctx, ly->lora_gate_a, norm_ffn)), lora_scale)); - if (ly->lora_up_a && ly->lora_up_b) - up = ggml_add(ctx, up, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_up_b, ggml_mul_mat(ctx, ly->lora_up_a, norm_ffn)), lora_scale)); - ff = ggml_swiglu_split(ctx, gate, up); - } else { - ff = ggml_swiglu(ctx, gu); - } + ff = ggml_swiglu(ctx, gu); } else { - // Separate: two matmuls + split swiglu (with optional LoRA) - struct ggml_tensor * gate = dit_ggml_linear_lora(ctx, ly->gate_proj, ly->lora_gate_a, ly->lora_gate_b, lora_scale, norm_ffn); - struct ggml_tensor * up = dit_ggml_linear_lora(ctx, ly->up_proj, ly->lora_up_a, ly->lora_up_b, lora_scale, norm_ffn); + // Separate: two matmuls + split swiglu + struct ggml_tensor * gate = dit_ggml_linear(ctx, ly->gate_proj, norm_ffn); + struct ggml_tensor * up = dit_ggml_linear(ctx, ly->up_proj, norm_ffn); ff = ggml_swiglu_split(ctx, gate, up); } // Down projection: [I, S] -> [H, S] - return dit_ggml_linear_lora(ctx, ly->down_proj, ly->lora_down_a, ly->lora_down_b, lora_scale, ff); + return dit_ggml_linear(ctx, ly->down_proj, ff); } // Build cross-attention sub-graph for a single layer. 
@@ -339,7 +289,6 @@ static struct ggml_tensor * dit_ggml_build_cross_attn( // Q from hidden, KV from encoder (full fused, Q+KV partial, separate) int q_dim = Nh * D; int kv_dim = Nkv * D; - float lora_scale = m->lora_scale; struct ggml_tensor * q, * k, * v; if (ly->ca_qkv) { // Full QKV fused: split Q from hidden, KV from enc via weight views @@ -351,31 +300,16 @@ static struct ggml_tensor * dit_ggml_build_cross_attn( struct ggml_tensor * kv = ggml_mul_mat(ctx, w_kv, enc); k = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], 0)); v = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], (size_t)kv_dim * kv->nb[0])); - // LoRA on fused path: add scale * (B @ (A @ x)) for Q (from norm_ca), K/V (from enc) - if (lora_scale != 0.0f) { - if (ly->lora_ca_q_a && ly->lora_ca_q_b) - q = ggml_add(ctx, q, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_q_b, ggml_mul_mat(ctx, ly->lora_ca_q_a, norm_ca)), lora_scale)); - if (ly->lora_ca_k_a && ly->lora_ca_k_b) - k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_k_b, ggml_mul_mat(ctx, ly->lora_ca_k_a, enc)), lora_scale)); - if (ly->lora_ca_v_a && ly->lora_ca_v_b) - v = ggml_add(ctx, v, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_v_b, ggml_mul_mat(ctx, ly->lora_ca_v_a, enc)), lora_scale)); - } } else if (ly->ca_kv) { // Q separate, K+V fused - q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, lora_scale, norm_ca); + q = dit_ggml_linear(ctx, ly->ca_q_proj, norm_ca); struct ggml_tensor * kv = ggml_mul_mat(ctx, ly->ca_kv, enc); k = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], 0)); v = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], (size_t)kv_dim * kv->nb[0])); - if (lora_scale != 0.0f) { - if (ly->lora_ca_k_a && ly->lora_ca_k_b) - k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_k_b, ggml_mul_mat(ctx, ly->lora_ca_k_a, enc)), lora_scale)); - if (ly->lora_ca_v_a 
&& ly->lora_ca_v_b) - v = ggml_add(ctx, v, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_v_b, ggml_mul_mat(ctx, ly->lora_ca_v_a, enc)), lora_scale)); - } } else { - q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, m->lora_scale, norm_ca); - k = dit_ggml_linear_lora(ctx, ly->ca_k_proj, ly->lora_ca_k_a, ly->lora_ca_k_b, m->lora_scale, enc); - v = dit_ggml_linear_lora(ctx, ly->ca_v_proj, ly->lora_ca_v_a, ly->lora_ca_v_b, m->lora_scale, enc); + q = dit_ggml_linear(ctx, ly->ca_q_proj, norm_ca); + k = dit_ggml_linear(ctx, ly->ca_k_proj, enc); + v = dit_ggml_linear(ctx, ly->ca_v_proj, enc); } // reshape to [D, heads, seq, N] then permute to [D, seq, heads, N] @@ -405,7 +339,7 @@ static struct ggml_tensor * dit_ggml_build_cross_attn( attn = ggml_reshape_3d(ctx, attn, Nh * D, S, N); // O projection - return dit_ggml_linear_lora(ctx, ly->ca_o_proj, ly->lora_ca_o_a, ly->lora_ca_o_b, m->lora_scale, attn); + return dit_ggml_linear(ctx, ly->ca_o_proj, attn); } // Build one full DiT layer (AdaLN + self-attn + cross-attn + FFN + gated residuals) diff --git a/src/dit-lora.cpp b/src/dit-lora.cpp deleted file mode 100644 index b14e090..0000000 --- a/src/dit-lora.cpp +++ /dev/null @@ -1,202 +0,0 @@ -// dit-lora.cpp: Load LoRA adapters from safetensors into DiT (ACE-Step). -// Compatible with PEFT adapter_model.safetensors (lora_A / lora_B per target layer). - -#include "dit.h" -#include "safetensors.h" -#include -#include -#include -#include - -// Normalize adapter key to base name: decoder.layers.N. -// Handles: base_model.model.model., base_model.model.; decoder.layers. or layers.; .lora_A.default/.lora_B.default or .lora_A.weight/.lora_B.weight -static std::string lora_key_to_base(const std::string & key) { - std::string s = key; - const char * prefixes[] = { "base_model.model.model.", "base_model.model." 
}; - for (const char * p : prefixes) { - size_t pl = strlen(p); - if (s.size() >= pl && s.compare(0, pl, p) == 0) { - s = s.substr(pl); - break; - } - } - // PEFT-style suffix - if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_A.default") == 0) - s = s.substr(0, s.size() - 14); - else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_B.default") == 0) - s = s.substr(0, s.size() - 14); - else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_A.weight") == 0) - s = s.substr(0, s.size() - 14); - else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_B.weight") == 0) - s = s.substr(0, s.size() - 14); - else if (s.size() > 7 && s.compare(s.size() - 7, 7, ".lora_A") == 0) - s = s.substr(0, s.size() - 7); - else if (s.size() > 7 && s.compare(s.size() - 7, 7, ".lora_B") == 0) - s = s.substr(0, s.size() - 7); - // HuggingFace adapter: layers.N -> decoder.layers.N for our DiT naming - if (s.size() >= 7 && s.compare(0, 7, "layers.") == 0) - s = "decoder." + s; - return s; -} - -static bool is_lora_a(const std::string & key) { - return key.find("lora_A") != std::string::npos; -} - -// Slot index for layer: 0=sa_q, 1=sa_k, 2=sa_v, 3=sa_o, 4=ca_q, 5=ca_k, 6=ca_v, 7=ca_o, 8=gate, 9=up, 10=down -enum LoraSlot { - SA_Q, SA_K, SA_V, SA_O, CA_Q, CA_K, CA_V, CA_O, GATE, UP, DOWN, N_SLOTS -}; - -static bool parse_base_name(const std::string & base, int * layer_idx, LoraSlot * slot) { - int L = -1; - if (sscanf(base.c_str(), "decoder.layers.%d.self_attn.q_proj", &L) == 1) { *layer_idx = L; *slot = SA_Q; return true; } - if (sscanf(base.c_str(), "decoder.layers.%d.self_attn.k_proj", &L) == 1) { *layer_idx = L; *slot = SA_K; return true; } - if (sscanf(base.c_str(), "decoder.layers.%d.self_attn.v_proj", &L) == 1) { *layer_idx = L; *slot = SA_V; return true; } - if (sscanf(base.c_str(), "decoder.layers.%d.self_attn.o_proj", &L) == 1) { *layer_idx = L; *slot = SA_O; return true; } - if (sscanf(base.c_str(), "decoder.layers.%d.cross_attn.q_proj", &L) == 1) { 
*layer_idx = L; *slot = CA_Q; return true; } - if (sscanf(base.c_str(), "decoder.layers.%d.cross_attn.k_proj", &L) == 1) { *layer_idx = L; *slot = CA_K; return true; } - if (sscanf(base.c_str(), "decoder.layers.%d.cross_attn.v_proj", &L) == 1) { *layer_idx = L; *slot = CA_V; return true; } - if (sscanf(base.c_str(), "decoder.layers.%d.cross_attn.o_proj", &L) == 1) { *layer_idx = L; *slot = CA_O; return true; } - if (sscanf(base.c_str(), "decoder.layers.%d.mlp.gate_proj", &L) == 1) { *layer_idx = L; *slot = GATE; return true; } - if (sscanf(base.c_str(), "decoder.layers.%d.mlp.up_proj", &L) == 1) { *layer_idx = L; *slot = UP; return true; } - if (sscanf(base.c_str(), "decoder.layers.%d.mlp.down_proj", &L) == 1) { *layer_idx = L; *slot = DOWN; return true; } - return false; -} - -static struct ggml_tensor ** slot_to_ptr(DiTGGMLLayer * ly, LoraSlot slot, bool is_b) { - if (is_b) { - switch (slot) { - case SA_Q: return &ly->lora_sa_q_b; case SA_K: return &ly->lora_sa_k_b; case SA_V: return &ly->lora_sa_v_b; case SA_O: return &ly->lora_sa_o_b; - case CA_Q: return &ly->lora_ca_q_b; case CA_K: return &ly->lora_ca_k_b; case CA_V: return &ly->lora_ca_v_b; case CA_O: return &ly->lora_ca_o_b; - case GATE: return &ly->lora_gate_b; case UP: return &ly->lora_up_b; case DOWN: return &ly->lora_down_b; - default: return nullptr; - } - } else { - switch (slot) { - case SA_Q: return &ly->lora_sa_q_a; case SA_K: return &ly->lora_sa_k_a; case SA_V: return &ly->lora_sa_v_a; case SA_O: return &ly->lora_sa_o_a; - case CA_Q: return &ly->lora_ca_q_a; case CA_K: return &ly->lora_ca_k_a; case CA_V: return &ly->lora_ca_v_a; case CA_O: return &ly->lora_ca_o_a; - case GATE: return &ly->lora_gate_a; case UP: return &ly->lora_up_a; case DOWN: return &ly->lora_down_a; - default: return nullptr; - } - } -} - -bool dit_ggml_load_lora(DiTGGML * m, const char * lora_path, float scale) { - FILE * fp = fopen(lora_path, "rb"); - if (!fp) { - fprintf(stderr, "[LoRA] cannot open %s\n", lora_path); - return 
false; - } - std::unordered_map tensors; - if (fseek(fp, 0, SEEK_SET) != 0) { - fclose(fp); - return false; - } - int n = safetensors_parse_lora(fp, &tensors); - uint64_t data_section_start = (uint64_t)ftell(fp); - if (n == 0) { - fclose(fp); - fprintf(stderr, "[LoRA] no LoRA tensors found in %s\n", lora_path); - return false; - } - - // Count pairs we will load: for each lora_A key, find the matching lora_B (same base name) - std::unordered_map> pairs; // base -> (key_a, key_b) - std::unordered_map base_to_b; - for (const auto & kv : tensors) { - std::string base = lora_key_to_base(kv.first); - if (base.empty()) continue; - if (is_lora_a(kv.first)) - base_to_b[base] = ""; // mark base as having A; we'll find B next - } - for (const auto & kv : tensors) { - std::string base = lora_key_to_base(kv.first); - if (base.empty()) continue; - if (base_to_b.count(base) && kv.first.find("lora_B") != std::string::npos) - base_to_b[base] = kv.first; - } - for (const auto & kv : tensors) { - if (!is_lora_a(kv.first)) continue; - std::string base = lora_key_to_base(kv.first); - auto it = base_to_b.find(base); - if (it != base_to_b.end() && !it->second.empty()) - pairs[base] = { kv.first, it->second }; - } - - int n_pairs = (int)pairs.size(); - wctx_init(&m->lora_wctx, n_pairs * 2); // A and B per pair - - fseek(fp, (long)data_section_start, SEEK_SET); - - for (const auto & p : pairs) { - const std::string & base = p.first; - const std::string & key_a = p.second.first; - const std::string & key_b = p.second.second; - int layer_idx = 0; - LoraSlot slot = N_SLOTS; - if (!parse_base_name(base, &layer_idx, &slot) || layer_idx < 0 || layer_idx >= m->cfg.n_layers) continue; - - DiTGGMLLayer * ly = &m->layers[layer_idx]; - SafeTensorInfo & info_a = tensors[key_a]; - SafeTensorInfo & info_b = tensors[key_b]; - if (info_a.n_dims != 2 || info_b.n_dims != 2) continue; - // A_pt [r, in], B_pt [out, r]. We need A_ggml [r, in] for mul_mat(A,x)=[r,S], B_ggml [out, r] for mul_mat(B, Ax)=[out,S]. 
- // ggml layout: ne[0]=cols, ne[1]=rows. So A: [r, in] -> ne[0]=in, ne[1]=r. B: [out, r] -> ne[0]=r, ne[1]=out. - int64_t r = info_a.shape[0], in_dim = info_a.shape[1]; - int64_t out_dim = info_b.shape[0]; - if (info_b.shape[1] != r) continue; - - struct ggml_tensor * ta = ggml_new_tensor_2d(m->lora_wctx.ctx, GGML_TYPE_F32, (int64_t)in_dim, (int64_t)r); - struct ggml_tensor * tb = ggml_new_tensor_2d(m->lora_wctx.ctx, GGML_TYPE_F32, (int64_t)r, (int64_t)out_dim); - ggml_set_name(ta, key_a.c_str()); - ggml_set_name(tb, key_b.c_str()); - - // Copy A: file is row-major [r, in], we need ggml col-major [in, r] (transpose) - size_t na = (size_t)(r * in_dim); - m->lora_wctx.staging.emplace_back(na); - float * buf_a = m->lora_wctx.staging.back().data(); - if (!safetensors_read_tensor_data(fp, data_section_start, info_a.data_start, info_a.data_end, buf_a)) { - fclose(fp); - wctx_free(&m->lora_wctx); - return false; - } - m->lora_wctx.staging.emplace_back(na); - float * transposed_a = m->lora_wctx.staging.back().data(); - for (int64_t i = 0; i < r; i++) - for (int64_t j = 0; j < in_dim; j++) - transposed_a[(size_t)(j * r + i)] = buf_a[(size_t)(i * in_dim + j)]; - m->lora_wctx.pending.push_back({ ta, transposed_a, na * sizeof(float), 0 }); - - size_t nb = (size_t)(out_dim * r); - m->lora_wctx.staging.emplace_back(nb); - float * buf_b = m->lora_wctx.staging.back().data(); - if (!safetensors_read_tensor_data(fp, data_section_start, info_b.data_start, info_b.data_end, buf_b)) { - fclose(fp); - wctx_free(&m->lora_wctx); - return false; - } - m->lora_wctx.staging.emplace_back(nb); - float * transposed_b = m->lora_wctx.staging.back().data(); - for (int64_t i = 0; i < out_dim; i++) - for (int64_t j = 0; j < r; j++) - transposed_b[(size_t)(j * out_dim + i)] = buf_b[(size_t)(i * r + j)]; - m->lora_wctx.pending.push_back({ tb, transposed_b, nb * sizeof(float), 0 }); - - struct ggml_tensor ** pa = slot_to_ptr(ly, slot, false); - struct ggml_tensor ** pb = slot_to_ptr(ly, slot, true); - 
if (pa) *pa = ta; - if (pb) *pb = tb; - } - fclose(fp); - fp = nullptr; - - if (!wctx_alloc(&m->lora_wctx, m->backend)) { - fprintf(stderr, "[LoRA] failed to allocate LoRA tensors on backend\n"); - wctx_free(&m->lora_wctx); - return false; - } - m->lora_scale = scale; - fprintf(stderr, "[LoRA] loaded %d adapter pairs from %s (scale=%.4f)\n", n_pairs, lora_path, scale); - return true; -} diff --git a/src/dit-sampler.h b/src/dit-sampler.h index 92540a8..31d9817 100644 --- a/src/dit-sampler.h +++ b/src/dit-sampler.h @@ -8,12 +8,8 @@ #include "dit-graph.h" #include "debug.h" -#include "ggml.h" -#include "ggml-backend.h" -#include "ggml-alloc.h" #include -#include #include #include #include diff --git a/src/dit.h b/src/dit.h index 872bf06..cd2936e 100644 --- a/src/dit.h +++ b/src/dit.h @@ -10,17 +10,13 @@ #include "ggml.h" #include "ggml-backend.h" -#include "ggml-alloc.h" #include "gguf-weights.h" #include "backend.h" -#include "debug.h" #include #include -#include #include -#include // Config (mirrors dit.cuh DiTConfig) struct DiTGGMLConfig { @@ -81,19 +77,6 @@ struct DiTGGMLLayer { // AdaLN scale-shift table: [6*hidden] (6 rows of [hidden]) struct ggml_tensor * scale_shift_table; // [hidden, 6] in ggml layout - // Optional LoRA adapters (F32, applied when base projection is separate) - struct ggml_tensor * lora_sa_q_a, * lora_sa_q_b; - struct ggml_tensor * lora_sa_k_a, * lora_sa_k_b; - struct ggml_tensor * lora_sa_v_a, * lora_sa_v_b; - struct ggml_tensor * lora_sa_o_a, * lora_sa_o_b; - struct ggml_tensor * lora_ca_q_a, * lora_ca_q_b; - struct ggml_tensor * lora_ca_k_a, * lora_ca_k_b; - struct ggml_tensor * lora_ca_v_a, * lora_ca_v_b; - struct ggml_tensor * lora_ca_o_a, * lora_ca_o_b; - struct ggml_tensor * lora_gate_a, * lora_gate_b; - struct ggml_tensor * lora_up_a, * lora_up_b; - struct ggml_tensor * lora_down_a, * lora_down_b; - int layer_type; // 0=sliding, 1=full }; @@ -135,8 +118,6 @@ struct DiTGGML { // Weight storage WeightCtx wctx; - WeightCtx lora_wctx; // 
optional LoRA adapter tensors (when lora_scale > 0) - float lora_scale; // alpha/rank for LoRA (0 = no LoRA) // Pre-allocated constant for AdaLN (1+scale) fusion struct ggml_tensor * scalar_one; // [1] = 1.0f, broadcast in ggml_add @@ -404,14 +385,9 @@ static void dit_ggml_init_backend(DiTGGML * m) { m->use_flash_attn = (bp.backend != bp.cpu_backend); } -// Load LoRA adapter from safetensors (e.g. adapter_model.safetensors). -// scale = alpha/rank (typical 1.0). Call after dit_ggml_load. Returns false on error. -bool dit_ggml_load_lora(DiTGGML * m, const char * lora_path, float scale); - static void dit_ggml_free(DiTGGML * m) { if (m->sched) ggml_backend_sched_free(m->sched); backend_release(m->backend, m->cpu_backend); wctx_free(&m->wctx); - if (m->lora_wctx.ctx) wctx_free(&m->lora_wctx); *m = {}; } diff --git a/src/fsq-detok.h b/src/fsq-detok.h index 7430db7..5cc3d7c 100644 --- a/src/fsq-detok.h +++ b/src/fsq-detok.h @@ -10,13 +10,10 @@ #pragma once #include "qwen3-enc.h" -#include // FSQ constants static const int FSQ_NDIMS = 6; static const int FSQ_LEVELS[6] = {8, 8, 8, 5, 5, 5}; -static const int FSQ_N_CODES = 8 * 8 * 8 * 5 * 5 * 5; // 8000 -static const int FSQ_FRAMES_PER_CODE = 5; // FSQ decode: integer index -> 6 normalized float values // Each dimension: level_idx / ((L-1)/2) - 1.0 (maps to [-1, 1]) @@ -113,6 +110,10 @@ static bool detok_ggml_load(DetokGGML * m, const char * gguf_path, ggml_backend_t backends[2] = { backend, cpu_backend }; int n = (backend == cpu_backend) ? 
1 : 2; m->sched = ggml_backend_sched_new(backends, NULL, n, 4096, false, true); + if (!m->sched) { + fprintf(stderr, "[FSQ] FATAL: failed to create scheduler\n"); + return false; + } fprintf(stderr, "[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)\n"); return true; @@ -220,48 +221,6 @@ static int detok_ggml_decode(DetokGGML * m, const int * codes, int T_5Hz, return T_25Hz; } -// Build codeword table for latent->code (cover from file): for each code 0..FSQ_N_CODES-1, -// decode to 5*64 floats. table_out must be at least FSQ_N_CODES * FSQ_FRAMES_PER_CODE * 64 floats. -static void detok_ggml_build_codeword_table(DetokGGML * m, float * table_out) { - const int chunk = FSQ_FRAMES_PER_CODE * 64; - for (int i = 0; i < FSQ_N_CODES; i++) { - int n = detok_ggml_decode(m, &i, 1, table_out + (size_t)i * chunk); - (void)n; - } -} - -// Encode latent frames to 5Hz codes by nearest codeword. T_latent = number of 25Hz frames (64-d each). -// Groups frames in chunks of 5; for each chunk finds the code whose codeword minimizes L2 distance. -// codeword_table from detok_ggml_build_codeword_table (FSQ_N_CODES * 5 * 64 floats). -// Pads last chunk with zeros if T_latent not divisible by 5. 
-static void latent_frames_to_codes(int T_latent, const float * latent_64d, - const float * codeword_table, - std::vector * out_codes) { - out_codes->clear(); - const int chunk_frames = FSQ_FRAMES_PER_CODE; - const int chunk_size = chunk_frames * 64; - int n_chunks = T_latent / chunk_frames; - if (n_chunks <= 0) return; - for (int g = 0; g < n_chunks; g++) { - const float * chunk = latent_64d + (size_t)g * chunk_size; - int best = 0; - float best_d2 = 1e30f; - for (int i = 0; i < FSQ_N_CODES; i++) { - const float * cw = codeword_table + (size_t)i * chunk_size; - float d2 = 0.0f; - for (int j = 0; j < chunk_size; j++) { - float d = chunk[j] - cw[j]; - d2 += d * d; - } - if (d2 < best_d2) { - best_d2 = d2; - best = i; - } - } - out_codes->push_back(best); - } -} - // Free static void detok_ggml_free(DetokGGML * m) { if (m->sched) ggml_backend_sched_free(m->sched); diff --git a/src/gguf-weights.h b/src/gguf-weights.h index ac5f22d..be5bede 100644 --- a/src/gguf-weights.h +++ b/src/gguf-weights.h @@ -18,7 +18,6 @@ #include #include -#include #include #ifdef _WIN32 diff --git a/src/metadata-fsm.h b/src/metadata-fsm.h index becbe1c..69ae125 100644 --- a/src/metadata-fsm.h +++ b/src/metadata-fsm.h @@ -11,10 +11,8 @@ #include #include #include -#include #include #include -#include // Prefix tree for FSM constrained decoding struct PrefixTree { diff --git a/src/prompt.h b/src/prompt.h index 99782d8..c568f5f 100644 --- a/src/prompt.h +++ b/src/prompt.h @@ -8,7 +8,6 @@ #include #include -#include #include #include diff --git a/src/qwen3-enc.h b/src/qwen3-enc.h index b8ea213..ee9b628 100644 --- a/src/qwen3-enc.h +++ b/src/qwen3-enc.h @@ -17,7 +17,6 @@ #include "gguf-weights.h" #include #include -#include #include #include @@ -404,7 +403,10 @@ static void qwen3_forward(Qwen3GGML * m, const int * token_ids, int S, float * o ggml_build_forward_expand(gf, out); // Allocate - ggml_backend_sched_alloc_graph(m->sched, gf); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + 
fprintf(stderr, "[TextEncoder] FATAL: failed to allocate graph (%d tokens)\n", S); + exit(1); + } // Set inputs ggml_backend_tensor_set(t_ids, token_ids, 0, S * sizeof(int)); @@ -455,7 +457,10 @@ static void qwen3_embed_lookup(Qwen3GGML * m, const int * token_ids, int S, floa ggml_set_output(out); ggml_build_forward_expand(gf, out); - ggml_backend_sched_alloc_graph(m->sched, gf); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + fprintf(stderr, "[TextEncoder] FATAL: failed to allocate graph (embed lookup, %d tokens)\n", S); + exit(1); + } ggml_backend_tensor_set(t_ids, token_ids, 0, S * sizeof(int)); ggml_backend_sched_graph_compute(m->sched, gf); ggml_backend_tensor_get(out, output, 0, (size_t)H * S * sizeof(float)); diff --git a/src/qwen3-lm.h b/src/qwen3-lm.h index 3bbd514..5f5e290 100644 --- a/src/qwen3-lm.h +++ b/src/qwen3-lm.h @@ -4,14 +4,11 @@ #pragma once #include "qwen3-enc.h" // Qwen3Layer, Qwen3Config, layer build helpers -#include "ggml-alloc.h" -#include "bpe.h" #include #include #include #include -#include #include // LM config (superset of encoder config) @@ -450,7 +447,10 @@ static void qw3lm_forward(Qwen3LM * m, const int * token_ids, int n_tokens, ggml_build_forward_expand(gf, lgt); // Schedule + allocate - ggml_backend_sched_alloc_graph(m->sched, gf); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + fprintf(stderr, "[LM] FATAL: failed to allocate graph (prefill, %d tokens)\n", n_tokens); + exit(1); + } // Set token IDs ggml_backend_tensor_set(token_ids_t, token_ids, 0, n_tokens * sizeof(int)); @@ -678,7 +678,10 @@ static void qw3lm_forward_batch(Qwen3LM * m, const int * token_ids, ggml_build_forward_expand(gf, lgt); // Allocate - ggml_backend_sched_alloc_graph(m->sched, gf); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + fprintf(stderr, "[LM] FATAL: failed to allocate graph (batch decode, N=%d)\n", N); + exit(1); + } // Set token IDs ggml_backend_tensor_set(token_ids_t, token_ids, 0, N * sizeof(int)); diff --git 
a/src/request.cpp b/src/request.cpp index f85873a..c851eb3 100644 --- a/src/request.cpp +++ b/src/request.cpp @@ -6,20 +6,16 @@ #include "request.h" #include -#include #include #include -// Defaults (aligned with Python GenerationParams and ACE-Step 1.5 Tutorial) +// Defaults (aligned with Python GenerationParams) void request_init(AceRequest * r) { - r->task_type = "text2music"; r->caption = ""; r->lyrics = ""; - r->instrumental = false; - r->custom_tag = ""; - r->genre = ""; + r->bpm = 0; - r->duration = -1.0f; + r->duration = 0.0f; r->keyscale = ""; r->timesignature = ""; r->vocal_language = "unknown"; @@ -29,14 +25,9 @@ void request_init(AceRequest * r) { r->lm_top_p = 0.9f; r->lm_top_k = 0; r->lm_negative_prompt = ""; - r->reference_audio = ""; - r->src_audio = ""; r->audio_codes = ""; - r->audio_cover_strength = 1.0f; - r->repainting_start = 0.0f; - r->repainting_end = 0.0f; r->inference_steps = 8; - r->guidance_scale = 1.0f; + r->guidance_scale = 0.0f; r->shift = 3.0f; } @@ -226,18 +217,11 @@ bool request_parse(AceRequest * r, const char * path) { const std::string & v = kv.value; // strings - if (k == "task_type") r->task_type = v; - else if (k == "caption") r->caption = v; + if (k == "caption") r->caption = v; else if (k == "lyrics") r->lyrics = v; - else if (k == "custom_tag") r->custom_tag = v; - else if (k == "genre") r->genre = v; else if (k == "keyscale") r->keyscale = v; - else if (k == "formatted_lyrics") r->lyrics = v; // alias for lyrics - else if (k == "language") r->vocal_language = v; // alias for vocal_language else if (k == "timesignature") r->timesignature = v; else if (k == "vocal_language") r->vocal_language = v; - else if (k == "reference_audio") r->reference_audio = v; - else if (k == "src_audio") r->src_audio = v; else if (k == "audio_codes") r->audio_codes = v; else if (k == "lm_negative_prompt") r->lm_negative_prompt = v; @@ -251,16 +235,11 @@ bool request_parse(AceRequest * r, const char * path) { else if (k == "lm_cfg_scale") 
r->lm_cfg_scale = (float)atof(v.c_str()); else if (k == "lm_top_p") r->lm_top_p = (float)atof(v.c_str()); else if (k == "lm_top_k") r->lm_top_k = atoi(v.c_str()); - else if (k == "audio_cover_strength") r->audio_cover_strength = (float)atof(v.c_str()); - else if (k == "repainting_start") r->repainting_start = (float)atof(v.c_str()); - else if (k == "repainting_end") r->repainting_end = (float)atof(v.c_str()); else if (k == "inference_steps") r->inference_steps = atoi(v.c_str()); else if (k == "guidance_scale") r->guidance_scale = (float)atof(v.c_str()); else if (k == "shift") r->shift = (float)atof(v.c_str()); // bools - else if (k == "instrumental") r->instrumental = (v == "true"); - else if (k == "is_instrumental") r->instrumental = (v == "true"); // unknown keys: silently ignored (forward compat) } @@ -276,15 +255,8 @@ bool request_write(const AceRequest * r, const char * path) { } fprintf(f, "{\n"); - fprintf(f, " \"task_type\": \"%s\",\n", json_escape(r->task_type).c_str()); fprintf(f, " \"caption\": \"%s\",\n", json_escape(r->caption).c_str()); fprintf(f, " \"lyrics\": \"%s\",\n", json_escape(r->lyrics).c_str()); - if (r->instrumental) - fprintf(f, " \"instrumental\": true,\n"); - if (!r->custom_tag.empty()) - fprintf(f, " \"custom_tag\": \"%s\",\n", json_escape(r->custom_tag).c_str()); - if (!r->genre.empty()) - fprintf(f, " \"genre\": \"%s\",\n", json_escape(r->genre).c_str()); fprintf(f, " \"bpm\": %d,\n", r->bpm); fprintf(f, " \"duration\": %.1f,\n", r->duration); fprintf(f, " \"keyscale\": \"%s\",\n", json_escape(r->keyscale).c_str()); @@ -296,18 +268,10 @@ bool request_write(const AceRequest * r, const char * path) { fprintf(f, " \"lm_top_p\": %.2f,\n", r->lm_top_p); fprintf(f, " \"lm_top_k\": %d,\n", r->lm_top_k); fprintf(f, " \"lm_negative_prompt\": \"%s\",\n", json_escape(r->lm_negative_prompt).c_str()); - if (!r->reference_audio.empty()) - fprintf(f, " \"reference_audio\": \"%s\",\n", json_escape(r->reference_audio).c_str()); - if 
(!r->src_audio.empty()) - fprintf(f, " \"src_audio\": \"%s\",\n", json_escape(r->src_audio).c_str()); - fprintf(f, " \"audio_cover_strength\": %.2f,\n", r->audio_cover_strength); - if (r->repainting_start != 0.0f || r->repainting_end != 0.0f) { - fprintf(f, " \"repainting_start\": %.1f,\n", r->repainting_start); - fprintf(f, " \"repainting_end\": %.1f,\n", r->repainting_end); - } fprintf(f, " \"inference_steps\": %d,\n", r->inference_steps); fprintf(f, " \"guidance_scale\": %.1f,\n", r->guidance_scale); fprintf(f, " \"shift\": %.1f,\n", r->shift); + // audio_codes last (no trailing comma) fprintf(f, " \"audio_codes\": \"%s\"\n", json_escape(r->audio_codes).c_str()); fprintf(f, "}\n"); @@ -317,12 +281,10 @@ bool request_write(const AceRequest * r, const char * path) { } void request_dump(const AceRequest * r, FILE * f) { - fprintf(f, "[Request] task=%s seed=%lld\n", r->task_type.c_str(), (long long)r->seed); + fprintf(f, "[Request] seed=%lld\n", (long long)r->seed); fprintf(f, " caption: %.60s%s\n", r->caption.c_str(), r->caption.size() > 60 ? "..." : ""); fprintf(f, " lyrics: %zu bytes\n", r->lyrics.size()); - if (!r->custom_tag.empty()) - fprintf(f, " custom_tag: %s\n", r->custom_tag.c_str()); fprintf(f, " bpm=%d dur=%.0f key=%s ts=%s lang=%s\n", r->bpm, r->duration, r->keyscale.c_str(), r->timesignature.c_str(), r->vocal_language.c_str()); @@ -330,12 +292,6 @@ void request_dump(const AceRequest * r, FILE * f) { r->lm_temperature, r->lm_cfg_scale, r->lm_top_p, r->lm_top_k); fprintf(f, " dit: steps=%d guidance=%.1f shift=%.1f\n", r->inference_steps, r->guidance_scale, r->shift); - if (!r->reference_audio.empty()) - fprintf(f, " reference_audio: %s\n", r->reference_audio.c_str()); - if (!r->src_audio.empty()) - fprintf(f, " src_audio: %s\n", r->src_audio.c_str()); - fprintf(f, " audio_codes: %s cover_strength=%.2f\n", - r->audio_codes.empty() ? 
"(none)" : "(present)", r->audio_cover_strength); - if (r->repainting_start != 0.0f || r->repainting_end != 0.0f) - fprintf(f, " repaint: %.1f–%.1fs\n", r->repainting_start, r->repainting_end); + fprintf(f, " audio_codes: %s\n", + r->audio_codes.empty() ? "(none)" : "(present)"); } diff --git a/src/request.h b/src/request.h index ba85821..d1748b5 100644 --- a/src/request.h +++ b/src/request.h @@ -3,31 +3,21 @@ // request.h - AceStep generation request (JSON serialization) // // Pure data container + JSON read/write. Zero business logic. -// Aligned with Python GenerationParams and ACE-Step 1.5 Tutorial: -// https://github.com/ace-step/ACE-Step-1.5/blob/main/docs/en/Tutorial.md +// Aligned with Python GenerationParams (inference.py:39) and API /release_task. // +#include #include #include struct AceRequest { - // --- Task & audio inputs (Tutorial: Input Control) --- - // task_type: "text2music" | "cover" | "repaint" | "lego" | "extract" | "complete" - // (lego/extract/complete require Base DiT; we only have Turbo/SFT) - std::string task_type; // "text2music" - // text content std::string caption; // "" std::string lyrics; // "" - bool instrumental; // false - - // LoRA / style trigger (appended to caption for condition encoder when set) - std::string custom_tag; // "" LoRA trigger word - std::string genre; // "" LoRA genre // metadata (user-provided or LLM-enriched) int bpm; // 0 = unset - float duration; // -1 = unset + float duration; // 0 = unset std::string keyscale; // "" = unset std::string timesignature; // "" = unset std::string vocal_language; // "unknown" @@ -42,20 +32,9 @@ struct AceRequest { int lm_top_k; // 0 = disabled (matches Python None) std::string lm_negative_prompt; // "" - // Audio control (Tutorial: reference_audio, src_audio, audio_codes) - // reference_audio: path to WAV for global timbre/style (VAE encode → CondEncoder timbre) - std::string reference_audio; // "" - // src_audio: path to WAV for Cover (encode → codes) or Repaint context - 
std::string src_audio; // "" // codes (Python-compatible string: "3101,11837,27514,...") // empty = text2music (silence context), non-empty = cover mode std::string audio_codes; // "" - // 0.0–1.0: how strongly generation follows reference/codes (Tutorial: audio_cover_strength) - float audio_cover_strength; // 1.0 - - // Repaint interval (seconds). Only used when task_type == "repaint". - float repainting_start; // 0.0 - float repainting_end; // 0.0 // DiT control (Python: inference_steps, guidance_scale, shift) int inference_steps; // 8 diff --git a/src/safetensors.h b/src/safetensors.h deleted file mode 100644 index 74d5967..0000000 --- a/src/safetensors.h +++ /dev/null @@ -1,107 +0,0 @@ -#pragma once -// safetensors.h: minimal reader for LoRA adapter_model.safetensors -// -// Format: 8-byte header length (LE uint64), then JSON header, then raw tensor data. -// We only parse keys that look like "*lora_A*" / "*lora_B*" and extract shape + data_offsets. - -#include -#include -#include -#include -#include -#include -#include - -struct SafeTensorInfo { - std::string dtype; // "F32", "F16", "BF16" - int64_t shape[2]; // [dim0, dim1] from JSON - int n_dims; - uint64_t data_start; // byte offset in file (after header) - uint64_t data_end; -}; - -// Open file, read header, parse tensor metadata for LoRA tensors. -// Returns number of LoRA tensors found; fills *out with tensor name -> info. -// Caller must fclose(fp) and free the map; file position is left at start of data section. 
-static int safetensors_parse_lora(FILE * fp, std::unordered_map * out) { - out->clear(); - uint64_t header_len = 0; - uint8_t h8[8]; - if (fread(h8, 1, 8, fp) != 8) return 0; - header_len = (uint64_t)h8[0] | ((uint64_t)h8[1] << 8) | ((uint64_t)h8[2] << 16) | ((uint64_t)h8[3] << 24) - | ((uint64_t)h8[4] << 32) | ((uint64_t)h8[5] << 40) | ((uint64_t)h8[6] << 48) | ((uint64_t)h8[7] << 56); - if (header_len == 0 || header_len > 10 * 1024 * 1024) return 0; // cap 10MB header - std::vector buf(header_len + 1); - if (fread(buf.data(), 1, header_len, fp) != header_len) return 0; - buf[header_len] = '\0'; - const char * json = buf.data(); - - // Find each key that contains "lora_A" or "lora_B" - const char * p = json; - int count = 0; - while ((p = strstr(p, "\"")) != nullptr) { - const char * key_start = p + 1; - p = strchr(key_start, '"'); - if (!p) break; - std::string key(key_start, (size_t)(p - key_start)); - p++; - if (key.find("lora_A") == std::string::npos && key.find("lora_B") == std::string::npos) { - continue; - } - // Find the value object for this key: skip ": - while (*p && (*p == ' ' || *p == ':')) p++; - if (*p != '{') continue; - const char * obj = p; - SafeTensorInfo info = {}; - info.shape[0] = info.shape[1] = 1; - info.n_dims = 0; - // "shape":[n,m] or [n] - const char * sh = strstr(obj, "\"shape\""); - if (sh) { - const char * br = strchr(sh, '['); - if (br) { - long long a = 0, b = 0; - int n = sscanf(br, "[%lld,%lld]", &a, &b); - if (n >= 1) { info.shape[0] = (int64_t)a; info.n_dims = 1; } - if (n >= 2) { info.shape[1] = (int64_t)b; info.n_dims = 2; } - } - } - const char * dt = strstr(obj, "\"dtype\""); - if (dt) { - const char * q = strchr(dt, '"'); - if (q) q = strchr(q + 1, '"'); - if (q) { - const char * start = q + 1; - const char * end = strchr(start, '"'); - if (end) info.dtype = std::string(start, end - start); - } - } - const char * off = strstr(obj, "\"data_offsets\""); - if (off) { - const char * br = strchr(off, '['); - if (br) { - 
uint64_t s = 0, e = 0; - if (sscanf(br, "[%llu,%llu]", (unsigned long long*)&s, (unsigned long long*)&e) == 2) { - info.data_start = s; - info.data_end = e; - } - } - } - if (info.dtype.empty() || info.n_dims == 0) continue; - (*out)[key] = info; - count++; - } - return count; -} - -// Read raw tensor data from file. File must be positioned at start of data section -// (i.e. after the 8-byte header length + header bytes). -// data_offset in the JSON is relative to the start of the data section. -static bool safetensors_read_tensor_data(FILE * fp, uint64_t data_section_start, - uint64_t tensor_start, uint64_t tensor_end, void * out_buf) { - uint64_t off = data_section_start + tensor_start; - uint64_t nbytes = tensor_end - tensor_start; - if (fseek(fp, (long)off, SEEK_SET) != 0) return false; - if (fread(out_buf, 1, nbytes, fp) != nbytes) return false; - return true; -} diff --git a/src/vae-enc.h b/src/vae-enc.h new file mode 100644 index 0000000..f5c67f2 --- /dev/null +++ b/src/vae-enc.h @@ -0,0 +1,391 @@ +// vae-enc.h: AutoencoderOobleck encoder (audio -> latent) via ggml +// +// Mirror of vae.h decoder. Reuses VAEResUnit, load helpers, graph ops. +// Architecture: conv1(2->128,k=7) -> 5x(3xresunit+snake+strided_conv) -> snake+conv2(2048->128,k=3) +// Output 128ch = mean[64] + scale[64]. Deterministic encode returns mean. 
+// Downsample: 2x4x4x6x10 = 1920x (matches decoder upsample) + +#pragma once +#include "vae.h" + +// Encoder block: 3xResUnit(in_ch) -> snake(in_ch) -> strided Conv1d(in_ch -> out_ch) +// Decoder block is the mirror: snake(in_ch) -> ConvT(in_ch -> out_ch) -> 3xResUnit(out_ch) +struct VAEEncBlock { + VAEResUnit ru[3]; + struct ggml_tensor * sa, * sb; // snake [1, in_ch] + struct ggml_tensor * dw, * db; // strided conv [K, in_ch, out_ch], bias [out_ch] + int in_ch, out_ch, stride, kernel, padding; +}; + +struct VAEEncoder { + struct ggml_tensor * c1w, * c1b; // conv1 [7, 2, 128], bias [128] + VAEEncBlock blk[5]; + struct ggml_tensor * sa, * sb; // final snake [1, 2048] + struct ggml_tensor * c2w, * c2b; // conv2 [3, 2048, 128], bias [128] + + ggml_backend_t backend; + ggml_backend_t cpu_backend; + ggml_backend_sched_t sched; + ggml_backend_buffer_t buf; + struct ggml_context * weight_ctx; + + // graph cache (rebuilt when T_audio changes) + struct ggml_context * graph_ctx; + uint8_t * graph_buf; + struct ggml_cgraph * graph; + struct ggml_tensor * graph_input; // [T_audio, 2] + struct ggml_tensor * graph_output; // [T_latent, 128] + int graph_T; // cached T_audio (0 = no cache) + + std::vector scratch_in; // transposed input [2 * T_audio] +}; + +// Load encoder weights from the same VAE GGUF (encoder.* tensors) +static void vae_enc_load(VAEEncoder * m, const char * path) { + GGUFModel gf = {}; + if (!gf_load(&gf, path)) { + fprintf(stderr, "[VAE-Enc] FATAL: cannot load %s\n", path); + exit(1); + } + + // Encoder channel layout (mirror of decoder, bottom-up): + // conv1: 2 -> 128 + // block: [128->128, 128->256, 256->512, 512->1024, 1024->2048] + // conv2: 2048 -> 128 (split: mean[64] + scale[64]) + // ResUnits run at in_ch (before downsample), unlike decoder (at out_ch, after upsample). 
+ static const int in_ch[] = {128, 128, 256, 512, 1024}; + static const int out_ch[] = {128, 256, 512, 1024, 2048}; + static const int strides[] = { 2, 4, 4, 6, 10}; + static const int dilations[] = {1, 3, 9}; + + // Phase 1: create weight tensors + size_t ctx_size = ggml_tensor_overhead() * 256; + struct ggml_init_params p = { ctx_size, NULL, true }; + m->weight_ctx = ggml_init(p); + struct ggml_context * ctx = m->weight_ctx; + + m->c1w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 7, 2, 128); + m->c1b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128); + + for (int i = 0; i < 5; i++) { + VAEEncBlock & b = m->blk[i]; + b.in_ch = in_ch[i]; + b.out_ch = out_ch[i]; + b.stride = strides[i]; + b.kernel = strides[i] * 2; + b.padding = (strides[i] + 1) / 2; // ceil(stride / 2) + int C = in_ch[i]; // res_units + snake at in_ch + + // 3 res units at in_ch + for (int r = 0; r < 3; r++) { + VAEResUnit & ru = b.ru[r]; + ru.dilation = dilations[r]; + ru.s1a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.s1b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.c1w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 7, C, C); + ru.c1b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, C); + ru.s2a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.s2b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.c2w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 1, C, C); + ru.c2b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, C); + } + + // snake at in_ch (before downsample conv) + b.sa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + b.sb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + + // strided conv1d: [K, in_ch, out_ch] + b.dw = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, b.kernel, in_ch[i], out_ch[i]); + b.db = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_ch[i]); + } + + m->sa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 2048); + m->sb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 2048); + m->c2w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 3, 2048, 128); + m->c2b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128); + + // Phase 
2: allocate backend buffer + BackendPair bp = backend_init("VAE-Enc"); + m->backend = bp.backend; + m->cpu_backend = bp.cpu_backend; + m->sched = backend_sched_new(bp, 8192); + m->buf = ggml_backend_alloc_ctx_tensors(ctx, m->backend); + if (!m->buf) { + fprintf(stderr, "[VAE-Enc] FATAL: failed to allocate weight buffer\n"); + exit(1); + } + fprintf(stderr, "[VAE-Enc] Backend: %s, Weight buffer: %.1f MB\n", + ggml_backend_name(m->backend), + (float)ggml_backend_buffer_get_size(m->buf) / (1024 * 1024)); + + // Phase 3: load and fuse weights + vae_fuse_wn(m->c1w, gf, "encoder.conv1"); + vae_load_bias(m->c1b, gf, "encoder.conv1.bias"); + + for (int i = 0; i < 5; i++) { + VAEEncBlock & b = m->blk[i]; + std::string blk_pfx = "encoder.block." + std::to_string(i); + + // res_units first (same load pattern as decoder) + for (int r = 0; r < 3; r++) { + VAEResUnit & ru = b.ru[r]; + std::string rp = blk_pfx + ".res_unit" + std::to_string(r + 1); + vae_load_snake(ru.s1a, gf, rp + ".snake1.alpha"); + vae_load_snake_inv(ru.s1b, gf, rp + ".snake1.beta"); + vae_fuse_wn(ru.c1w, gf, rp + ".conv1"); + vae_load_bias(ru.c1b, gf, rp + ".conv1.bias"); + vae_load_snake(ru.s2a, gf, rp + ".snake2.alpha"); + vae_load_snake_inv(ru.s2b, gf, rp + ".snake2.beta"); + vae_fuse_wn(ru.c2w, gf, rp + ".conv2"); + vae_load_bias(ru.c2b, gf, rp + ".conv2.bias"); + } + + // snake + strided downsample conv (regular conv1d, NOT transposed) + vae_load_snake(b.sa, gf, blk_pfx + ".snake1.alpha"); + vae_load_snake_inv(b.sb, gf, blk_pfx + ".snake1.beta"); + vae_fuse_wn(b.dw, gf, blk_pfx + ".conv1"); + vae_load_bias(b.db, gf, blk_pfx + ".conv1.bias"); + } + + vae_load_snake(m->sa, gf, "encoder.snake1.alpha"); + vae_load_snake_inv(m->sb, gf, "encoder.snake1.beta"); + vae_fuse_wn(m->c2w, gf, "encoder.conv2"); + vae_load_bias(m->c2b, gf, "encoder.conv2.bias"); + + fprintf(stderr, "[VAE-Enc] Loaded: 5 blocks, downsample=1920x, F32 activations\n"); + gf_close(&gf); +} + +// Build encoder graph: audio [T_audio, 2] -> 
[T_latent, 128] +static struct ggml_tensor * vae_enc_build_graph( + struct ggml_context * ctx, + VAEEncoder * m, + struct ggml_tensor * audio) { // [T, 2] + + // conv1: [T, 2] -> [T, 128] + struct ggml_tensor * x = vae_conv1d(ctx, m->c1w, m->c1b, audio, 1, 3, 1); + + // 5 encoder blocks: resunits(in_ch) -> snake(in_ch) -> strided conv(in_ch -> out_ch) + for (int i = 0; i < 5; i++) { + VAEEncBlock & b = m->blk[i]; + for (int r = 0; r < 3; r++) + x = vae_res_unit(ctx, &b.ru[r], x); + x = vae_snake(ctx, x, b.sa, b.sb); + x = vae_conv1d(ctx, b.dw, b.db, x, b.stride, b.padding, 1); + } + + // Final: snake(2048) -> conv2(2048 -> 128, k=3, pad=1) + x = vae_snake(ctx, x, m->sa, m->sb); + x = vae_conv1d(ctx, m->c2w, m->c2b, x, 1, 1, 1); + + return x; // [T_latent, 128] +} + +// Core compute: build/cache graph, set input, run. Returns T_latent or -1. +// Output stays in m->graph_output for caller to read. +static int vae_enc_compute( + VAEEncoder * m, + const float * audio, // [T_audio, 2] time-major interleaved stereo + int T_audio) { + + // Rebuild graph when T_audio changes + if (m->graph_T != T_audio) { + if (m->graph_ctx) { + ggml_backend_sched_reset(m->sched); + ggml_free(m->graph_ctx); + free(m->graph_buf); + } + + size_t ctx_size = ggml_tensor_overhead() * 1024 + ggml_graph_overhead_custom(8192, false); + m->graph_buf = (uint8_t *)malloc(ctx_size); + struct ggml_init_params p = { ctx_size, m->graph_buf, true }; + struct ggml_context * ctx = ggml_init(p); + + m->graph_input = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, T_audio, 2); + ggml_set_name(m->graph_input, "enc_input"); + ggml_set_input(m->graph_input); + + m->graph_output = vae_enc_build_graph(ctx, m, m->graph_input); + ggml_set_name(m->graph_output, "enc_output"); + ggml_set_output(m->graph_output); + + m->graph = ggml_new_graph_custom(ctx, 8192, false); + ggml_build_forward_expand(m->graph, m->graph_output); + + if (!ggml_backend_sched_alloc_graph(m->sched, m->graph)) { + fprintf(stderr, "[VAE-Enc] FATAL: graph 
alloc failed for T=%d\n", T_audio); + ggml_free(ctx); + free(m->graph_buf); + m->graph_ctx = NULL; + m->graph_buf = NULL; + m->graph_T = 0; + return -1; + } + + m->graph_ctx = ctx; + m->graph_T = T_audio; + fprintf(stderr, "[VAE-Enc] Graph: %d nodes, T_audio=%d\n", + ggml_graph_n_nodes(m->graph), T_audio); + } + + // Transpose: [T, 2] time-major -> ggml [T, 2] channel-contiguous + // ggml ne[0]=T is the contiguous dim, so we write all T samples per channel + size_t in_size = (size_t)2 * T_audio; + if (m->scratch_in.size() < in_size) + m->scratch_in.resize(in_size); + for (int c = 0; c < 2; c++) + for (int t = 0; t < T_audio; t++) + m->scratch_in[c * T_audio + t] = audio[t * 2 + c]; + ggml_backend_tensor_set(m->graph_input, + m->scratch_in.data(), 0, in_size * sizeof(float)); + + ggml_backend_sched_graph_compute(m->sched, m->graph); + + return (int)m->graph_output->ne[0]; // T_latent +} + +// Encode API: audio [T_audio, 2] -> latent_out [T_latent, 64] (mean only, deterministic) +// Returns T_latent (or -1 on error). +// latent_out must hold at least (T_audio / 1920) * 64 floats. +static int vae_enc_encode( + VAEEncoder * m, + const float * audio, // [T_audio, 2] interleaved stereo + int T_audio, + float * latent_out, // [T_latent, 64] output, time-major + int max_T_latent) { + + int T_latent = vae_enc_compute(m, audio, T_audio); + if (T_latent < 0) return -1; + + if (T_latent > max_T_latent) { + fprintf(stderr, "[VAE-Enc] T_latent %d exceeds max %d\n", T_latent, max_T_latent); + return -1; + } + + // Graph output is [ne0=T_latent, ne1=128] in ggml, channel-contiguous. + // Channels 0..63 = mean, 64..127 = scale. We only read mean. + // ggml layout: data[c * T_latent + t] for channel c, time t. 
+ // We write time-major: latent_out[t * 64 + c] = data[c * T_latent + t] + // + // Read the full 128ch output once, extract mean channels 0..63 + size_t out_bytes = (size_t)128 * T_latent * sizeof(float); + std::vector raw(128 * T_latent); + ggml_backend_tensor_get(m->graph_output, raw.data(), 0, out_bytes); + + for (int t = 0; t < T_latent; t++) + for (int c = 0; c < 64; c++) + latent_out[t * 64 + c] = raw[c * T_latent + t]; + + fprintf(stderr, "[VAE-Enc] Encode: T_audio=%d -> T_latent=%d (%.2fs @ 48kHz)\n", + T_audio, T_latent, (float)T_audio / 48000.0f); + + return T_latent; +} + +// Tiled encode for long audio (same chunking strategy as decoder) +// chunk_size: latent frames per tile, overlap: context frames on each side +static int vae_enc_encode_tiled( + VAEEncoder * m, + const float * audio, // [T_audio, 2] interleaved stereo + int T_audio, + float * latent_out, // [T_latent, 64] output, time-major + int max_T_latent, + int chunk_size = 256, + int overlap = 64) { + + // Work in audio-sample space. Each latent frame = 1920 audio samples. 
+ int audio_chunk = chunk_size * 1920; + int audio_overlap = overlap * 1920; + + // Shrink overlap until stride is positive + while (audio_chunk - 2 * audio_overlap <= 0 && audio_overlap > 0) + audio_overlap /= 2; + + // Short audio: encode directly + if (T_audio <= audio_chunk) + return vae_enc_encode(m, audio, T_audio, latent_out, max_T_latent); + + int audio_stride = audio_chunk - 2 * audio_overlap; + int num_steps = (T_audio + audio_stride - 1) / audio_stride; + + fprintf(stderr, "[VAE-Enc] Tiled encode: %d tiles (chunk=%d, overlap=%d, stride=%d audio samples)\n", + num_steps, audio_chunk, audio_overlap, audio_stride); + + float downsample_factor = 0.0f; + int latent_write_pos = 0; + + for (int i = 0; i < num_steps; i++) { + // Core range in audio samples (the part we keep) + int core_start = i * audio_stride; + int core_end = core_start + audio_stride; + if (core_end > T_audio) core_end = T_audio; + + // Window with overlap context + int win_start = core_start - audio_overlap; + if (win_start < 0) win_start = 0; + int win_end = core_end + audio_overlap; + if (win_end > T_audio) win_end = T_audio; + int win_len = win_end - win_start; + + // Encode this window + int tile_T = vae_enc_compute(m, audio + win_start * 2, win_len); + if (tile_T < 0) { + fprintf(stderr, "[VAE-Enc] FATAL: tile %d encode failed\n", i); + return -1; + } + + // Determine downsample factor from first tile + if (i == 0) { + downsample_factor = (float)tile_T / (float)win_len; + fprintf(stderr, "[VAE-Enc] Downsample factor: %.6f (expected ~1/1920)\n", + downsample_factor); + } + + // Trim in latent frames (mirror of decoder trim logic) + int added_start = core_start - win_start; + int trim_start = (int)roundf((float)added_start * downsample_factor); + int added_end = win_end - core_end; + int trim_end = (int)roundf((float)added_end * downsample_factor); + + int end_idx = (trim_end > 0) ? 
(tile_T - trim_end) : tile_T; + int core_len = end_idx - trim_start; + if (core_len <= 0) continue; + + if (latent_write_pos + core_len > max_T_latent) { + fprintf(stderr, "[VAE-Enc] FATAL: tiled output exceeds max_T_latent\n"); + return -1; + } + + // Read tile output [ne0=tile_T, ne1=128], extract mean (ch 0..63), transpose + // Only read the first 64 channels (mean), skip scale channels 64..127 + size_t out_bytes = (size_t)128 * tile_T * sizeof(float); + std::vector raw(128 * tile_T); + ggml_backend_tensor_get(m->graph_output, raw.data(), 0, out_bytes); + + for (int t = 0; t < core_len; t++) + for (int c = 0; c < 64; c++) + latent_out[(latent_write_pos + t) * 64 + c] = + raw[c * tile_T + (trim_start + t)]; + + latent_write_pos += core_len; + } + + fprintf(stderr, "[VAE-Enc] Tiled encode done: %d tiles -> T_latent=%d (%.2fs @ 48kHz)\n", + num_steps, latent_write_pos, (float)T_audio / 48000.0f); + + return latent_write_pos; +} + +// Free all resources +static void vae_enc_free(VAEEncoder * m) { + if (m->graph_ctx) { + ggml_backend_sched_reset(m->sched); + ggml_free(m->graph_ctx); + free(m->graph_buf); + } + if (m->sched) ggml_backend_sched_free(m->sched); + if (m->buf) ggml_backend_buffer_free(m->buf); + if (m->weight_ctx) ggml_free(m->weight_ctx); + backend_release(m->backend, m->cpu_backend); + *m = {}; +} diff --git a/src/vae.h b/src/vae.h index 20e2b6f..fbf6d5f 100644 --- a/src/vae.h +++ b/src/vae.h @@ -14,7 +14,6 @@ #include "backend.h" #include #include -#include #include #include @@ -210,12 +209,16 @@ static void vae_ggml_load(VAEGGML * m, const char * path) { m->sb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 128); m->c2w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 7, 128, 2); - // Phase 2: allocate backend buffer (im2col grid Y fix enables long-sequence conv1d) + // Phase 2: allocate backend buffer BackendPair bp = backend_init("VAE"); m->backend = bp.backend; m->cpu_backend = bp.cpu_backend; m->sched = backend_sched_new(bp, 8192); m->buf = 
ggml_backend_alloc_ctx_tensors(ctx, m->backend); + if (!m->buf) { + fprintf(stderr, "[VAE] FATAL: failed to allocate weight buffer\n"); + exit(1); + } fprintf(stderr, "[VAE] Backend: %s, Weight buffer: %.1f MB\n", ggml_backend_name(m->backend), (float)ggml_backend_buffer_get_size(m->buf) / (1024 * 1024)); @@ -555,203 +558,3 @@ static void vae_ggml_free(VAEGGML * m) { backend_release(m->backend, m->cpu_backend); *m = {}; } - -// --------------------------------------------------------------------------- -// VAE Encoder (audio -> 64-d latents @ 25Hz for reference timbre) -// Oobleck encoder: conv1(2->128) -> 5 blocks (stride 2,4,4,8,8) -> snake -> conv2(2048->128) -// Output 128 = mean(64) + scale(64); we use mean only. -// Requires encoder.* tensors in the same VAE GGUF (full autoencoder export). -// --------------------------------------------------------------------------- -struct VAEEncoderBlock { - VAEResUnit ru[3]; - struct ggml_tensor * sa, * sb; - struct ggml_tensor * c1w, * c1b; - int in_ch, out_ch, stride; -}; - -struct VAEEncoderGGML { - struct ggml_tensor * c1w, * c1b; - VAEEncoderBlock blk[5]; - struct ggml_tensor * sa, * sb; - struct ggml_tensor * c2w, * c2b; - - ggml_backend_t backend; - ggml_backend_t cpu_backend; - ggml_backend_sched_t sched; - ggml_backend_buffer_t buf; - struct ggml_context * weight_ctx; - bool has_encoder; -}; - -static bool vae_encoder_load(VAEEncoderGGML * m, const char * path) { - m->has_encoder = false; - GGUFModel gf = {}; - if (!gf_load(&gf, path)) { - fprintf(stderr, "[VAE Encoder] cannot load %s\n", path); - return false; - } - if (!ggml_get_tensor(gf.meta, "encoder.conv1.weight_v")) { - gf_close(&gf); - fprintf(stderr, "[VAE Encoder] no encoder.* in %s (decoder-only GGUF). 
Use a full VAE GGUF for reference_audio WAV.\n", path); - return false; - } - - static const int enc_strides[] = {2, 4, 4, 8, 8}; - static const int enc_in_ch[] = {128, 256, 512, 1024, 2048}; - static const int enc_out_ch[] = {256, 512, 1024, 2048, 2048}; - static const int dilations[] = {1, 3, 9}; - - size_t ctx_size = ggml_tensor_overhead() * 200; - struct ggml_init_params p = { ctx_size, NULL, true }; - m->weight_ctx = ggml_init(p); - struct ggml_context * ctx = m->weight_ctx; - - m->c1w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 7, 2, 128); - m->c1b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128); - - for (int i = 0; i < 5; i++) { - VAEEncoderBlock & b = m->blk[i]; - b.in_ch = enc_in_ch[i]; - b.out_ch = enc_out_ch[i]; - b.stride = enc_strides[i]; - int C = b.in_ch; - b.sa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); - b.sb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); - b.c1w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 2 * b.stride, C, b.out_ch); - b.c1b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, b.out_ch); - for (int r = 0; r < 3; r++) { - VAEResUnit & ru = b.ru[r]; - ru.dilation = dilations[r]; - ru.s1a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); - ru.s1b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); - ru.c1w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 7, C, C); - ru.c1b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, C); - ru.s2a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); - ru.s2b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); - ru.c2w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 1, C, C); - ru.c2b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, C); - } - } - m->sa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 2048); - m->sb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 2048); - m->c2w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 3, 2048, 128); - m->c2b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128); - - BackendPair bp = backend_init("VAE-Encoder"); - m->backend = bp.backend; - m->cpu_backend = bp.cpu_backend; - m->sched = backend_sched_new(bp, 8192); - m->buf = 
ggml_backend_alloc_ctx_tensors(ctx, m->backend); - - vae_fuse_wn(m->c1w, gf, "encoder.conv1"); - vae_load_bias(m->c1b, gf, "encoder.conv1.bias"); - - for (int i = 0; i < 5; i++) { - VAEEncoderBlock & b = m->blk[i]; - std::string pfx = "encoder.block." + std::to_string(i); - for (int r = 0; r < 3; r++) { - std::string rp = pfx + ".res_unit" + std::to_string(r + 1); - vae_load_snake(b.ru[r].s1a, gf, rp + ".snake1.alpha"); - vae_load_snake_inv(b.ru[r].s1b, gf, rp + ".snake1.beta"); - vae_fuse_wn(b.ru[r].c1w, gf, rp + ".conv1"); - vae_load_bias(b.ru[r].c1b, gf, rp + ".conv1.bias"); - vae_load_snake(b.ru[r].s2a, gf, rp + ".snake2.alpha"); - vae_load_snake_inv(b.ru[r].s2b, gf, rp + ".snake2.beta"); - vae_fuse_wn(b.ru[r].c2w, gf, rp + ".conv2"); - vae_load_bias(b.ru[r].c2b, gf, rp + ".conv2.bias"); - } - vae_load_snake(b.sa, gf, pfx + ".snake1.alpha"); - vae_load_snake_inv(b.sb, gf, pfx + ".snake1.beta"); - vae_fuse_wn(b.c1w, gf, pfx + ".conv1"); - vae_load_bias(b.c1b, gf, pfx + ".conv1.bias"); - } - vae_load_snake(m->sa, gf, "encoder.snake1.alpha"); - vae_load_snake_inv(m->sb, gf, "encoder.snake1.beta"); - vae_fuse_wn(m->c2w, gf, "encoder.conv2"); - vae_load_bias(m->c2b, gf, "encoder.conv2.bias"); - - gf_close(&gf); - m->has_encoder = true; - fprintf(stderr, "[VAE Encoder] loaded (2->128->...->2048->128, 64-d mean)\n"); - return true; -} - -static int vae_encoder_forward(VAEEncoderGGML * m, const float * audio, int T_audio, - float * latent_out) { - if (!m->has_encoder || T_audio < 1920) return -1; - - ggml_backend_sched_reset(m->sched); - size_t ctx_size = 4096 * ggml_tensor_overhead() + ggml_graph_overhead(); - struct ggml_init_params gp = { ctx_size, NULL, true }; - struct ggml_context * ctx = ggml_init(gp); - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 4096, false); - - struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, T_audio, 2); - ggml_set_name(x, "audio_in"); - ggml_set_input(x); - x = vae_conv1d(ctx, m->c1w, m->c1b, x, 1, 3, 1); - - for 
(int i = 0; i < 5; i++) { - VAEEncoderBlock & b = m->blk[i]; - for (int r = 0; r < 3; r++) - x = vae_res_unit(ctx, &b.ru[r], x); - x = vae_snake(ctx, x, b.sa, b.sb); - int pad = (int)((float)b.stride / 2.0f + 0.5f); - x = vae_conv1d(ctx, b.c1w, b.c1b, x, b.stride, pad, 1); - } - - x = vae_snake(ctx, x, m->sa, m->sb); - x = vae_conv1d(ctx, m->c2w, m->c2b, x, 1, 1, 1); - ggml_set_name(x, "enc_out"); - ggml_set_output(x); - - ggml_build_forward_expand(gf, x); - if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { - ggml_free(ctx); - return -1; - } - ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "audio_in"), audio, 0, (size_t)T_audio * 2 * sizeof(float)); - ggml_backend_sched_graph_compute(m->sched, gf); - ggml_backend_sched_synchronize(m->sched); - - struct ggml_tensor * out = ggml_graph_get_tensor(gf, "enc_out"); - // Encoder strides 2,4,4,8,8 give T_out != T_audio/1920; use actual output shape to avoid read out of bounds - int T_latent = (int)out->ne[0]; - size_t nbytes = (size_t)T_latent * 128 * sizeof(float); - std::vector tmp((size_t)T_latent * 128); - ggml_backend_tensor_get(out, tmp.data(), 0, nbytes); - for (int t = 0; t < T_latent; t++) - for (int c = 0; c < 64; c++) - latent_out[t * 64 + c] = tmp[t * 128 + c]; - - ggml_backend_sched_reset(m->sched); - ggml_free(ctx); - return T_latent; -} - -static void vae_encoder_free(VAEEncoderGGML * m) { - // Order: reset sched, free sched (drops refs to graph/alloc), free weight_ctx (tensor metadata), - // then buffer (tensor data), then backends. Avoids double free on some GGML backends. 
- if (m->sched) { - ggml_backend_sched_reset(m->sched); - ggml_backend_sched_free(m->sched); - m->sched = NULL; - } - if (m->weight_ctx) { - ggml_free(m->weight_ctx); - m->weight_ctx = NULL; - } - if (m->buf) { - ggml_backend_buffer_free(m->buf); - m->buf = NULL; - } - if (m->backend && m->backend != m->cpu_backend) { - ggml_backend_free(m->backend); - m->backend = NULL; - } - if (m->cpu_backend) { - ggml_backend_free(m->cpu_backend); - m->cpu_backend = NULL; - } - *m = {}; -} diff --git a/src/wav.h b/src/wav.h deleted file mode 100644 index e7f0f02..0000000 --- a/src/wav.h +++ /dev/null @@ -1,101 +0,0 @@ -// wav.h: minimal WAV loader for reference audio (stereo 48kHz float out) -// No Python or external deps. Handles 16-bit PCM, mono/stereo, resamples to 48kHz if needed. - -#pragma once - -#include -#include -#include -#include -#include - -// Load WAV file into stereo float32 at 48kHz. -// Out: interleaved L,R,L,R,... length = num_samples (both channels). -// Returns num_samples (per channel), or -1 on error. 
-static int wav_load_48k_stereo(const char * path, std::vector * out) { - FILE * f = fopen(path, "rb"); - if (!f) return -1; - - char riff[4], fmt[4]; - if (fread(riff, 1, 4, f) != 4 || memcmp(riff, "RIFF", 4) != 0) { - fclose(f); - return -1; - } - uint32_t file_len; - if (fread(&file_len, 4, 1, f) != 1) { fclose(f); return -1; } - if (fread(fmt, 1, 4, f) != 4 || memcmp(fmt, "WAVE", 4) != 0) { - fclose(f); - return -1; - } - - uint16_t channels = 2, bits = 16; - uint32_t sample_rate = 48000; - bool found_fmt = false; - - while (1) { - char chunk_id[4]; - if (fread(chunk_id, 1, 4, f) != 4) break; - uint32_t chunk_size; - if (fread(&chunk_size, 4, 1, f) != 1) break; - long chunk_start = ftell(f); - - if (memcmp(chunk_id, "fmt ", 4) == 0 && chunk_size >= 16) { - uint16_t fmt_tag, block_align; - uint32_t byte_rate; - if (fread(&fmt_tag, 2, 1, f) != 1) break; - if (fread(&channels, 2, 1, f) != 1) break; - if (fread(&sample_rate, 4, 1, f) != 1) break; - if (fread(&byte_rate, 4, 1, f) != 1) break; - if (fread(&block_align, 2, 1, f) != 1) break; - if (fread(&bits, 2, 1, f) != 1) break; - found_fmt = true; - } else if (memcmp(chunk_id, "data", 4) == 0 && found_fmt) { - size_t num_bytes = chunk_size; - size_t num_samples = num_bytes / (channels * (bits / 8)); - if (num_samples == 0) { fclose(f); return -1; } - - std::vector raw(num_samples * channels); - if (fread(raw.data(), 2, raw.size(), f) != raw.size()) { - fclose(f); - return -1; - } - - out->resize(num_samples * 2); - float scale = 1.0f / 32768.0f; - if (channels == 1) { - for (size_t i = 0; i < num_samples; i++) { - float s = (float)raw[i] * scale; - (*out)[i * 2] = s; - (*out)[i * 2 + 1] = s; - } - } else { - for (size_t i = 0; i < num_samples * 2; i++) - (*out)[i] = (float)raw[i] * scale; - } - - fclose(f); - - // Resample to 48kHz if needed (linear interpolation) - if (sample_rate != 48000) { - size_t in_len = num_samples; - size_t out_len = (size_t)((double)in_len * 48000.0 / (double)sample_rate); - std::vector 
resampled(out_len * 2); - for (size_t i = 0; i < out_len; i++) { - double t = (double)i * (double)in_len / (double)out_len; - size_t i0 = (size_t)t; - size_t i1 = std::min(i0 + 1, in_len - 1); - float w = (float)(t - (double)i0); - for (int c = 0; c < 2; c++) - resampled[i * 2 + c] = (*out)[i0 * 2 + c] * (1.0f - w) + (*out)[i1 * 2 + c] * w; - } - *out = std::move(resampled); - return (int)out_len; - } - return (int)num_samples; - } - - fseek(f, chunk_start + (long)chunk_size, SEEK_SET); - } - fclose(f); - return -1; -} diff --git a/tests/BF16.log b/tests/BF16.log deleted file mode 100644 index 7ea7d57..0000000 --- a/tests/BF16.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf -[GGML] Running acestep-v15-turbo-BF16.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999805 - lyric_embed 1.000000 - enc_hidden 0.999830 - detok_output 0.999996 - context 0.999998 - noise 1.000000 - temb_t 0.999999 - hidden_after_proj_in 0.999988 - enc_after_cond_emb 0.999818 - layer0_sa_output 0.999951 - hidden_after_layer0 0.999978 - hidden_after_layer6 0.999916 - hidden_after_layer12 0.999234 - hidden_after_layer18 0.996570 - hidden_after_layer23 0.993528 - dit_step0_vt 0.974876 - dit_step0_xt 0.999945 - dit_step1_vt 0.980053 - dit_step1_xt 0.999834 - dit_step2_vt 0.981541 - dit_step2_xt 0.999553 - dit_step3_vt 0.982418 - dit_step3_xt 0.998924 - dit_step4_vt 0.980811 - dit_step4_xt 0.997503 - dit_step5_vt 0.977877 - dit_step5_xt 0.994298 - dit_step6_vt 0.974930 - dit_step6_xt 0.988188 - dit_step7_vt 0.969375 - dit_x0 0.979213 - vae_audio 0.901377 - vae_audio (STFT cosine) 0.975525 
-[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999945 0.135628 0.006709 -0.002312 0.972932 -0.002342 0.972003 - dit_step1_xt 0.999834 0.266762 0.011267 -0.005306 0.942657 -0.005313 0.941730 - dit_step2_xt 0.999553 0.453190 0.017486 -0.009350 0.909152 -0.009311 0.908527 - dit_step3_xt 0.998924 0.643865 0.025962 -0.014715 0.873769 -0.014577 0.873624 - dit_step4_xt 0.997503 0.790038 0.037807 -0.021768 0.841938 -0.021660 0.841995 - dit_step5_xt 0.994298 1.239881 0.055598 -0.031834 0.825214 -0.032109 0.824593 - dit_step6_xt 0.988188 2.076383 0.082565 -0.046121 0.856115 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-BF16.gguf -[GGML] Running acestep-v15-sft-BF16.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999805 - lyric_embed 1.000000 - enc_hidden 0.999830 - detok_output 0.999996 - context 0.999998 - noise 1.000000 - temb_t 0.999997 - hidden_after_proj_in 0.999987 - enc_after_cond_emb 0.999820 - layer0_sa_output 0.999942 - hidden_after_layer0 0.999980 - hidden_after_layer6 0.999847 - hidden_after_layer12 0.999483 - hidden_after_layer18 0.998723 - hidden_after_layer23 0.998976 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998938 - dit_step0_vt_uncond 0.998662 - dit_step0_vt 0.995622 - dit_step0_xt 0.999998 - dit_step5_vt_cond 0.999492 - dit_step5_vt 0.993792 - dit_step5_xt 0.999962 - dit_step10_vt_cond 0.998783 - dit_step10_vt 0.993293 - dit_step10_xt 0.999885 - dit_step15_vt_cond 0.997654 - dit_step15_vt 0.987992 - dit_step15_xt 0.999675 - dit_step20_vt_cond 0.995364 - dit_step20_vt 0.980590 - dit_step20_xt 0.999177 
- dit_step25_vt_cond 0.990719 - dit_step25_vt 0.970351 - dit_step25_xt 0.998116 - dit_step30_vt_cond 0.985676 - dit_step30_vt 0.965303 - dit_step30_xt 0.996402 - dit_step35_vt_cond 0.981229 - dit_step35_vt 0.957586 - dit_step35_xt 0.994272 - dit_step40_vt_cond 0.978699 - dit_step40_vt 0.951774 - dit_step40_xt 0.992207 - dit_step45_vt_cond 0.981165 - dit_step45_vt 0.954789 - dit_step45_xt 0.990734 - dit_step49_vt_cond 0.983553 - dit_step49_vt 0.924041 - dit_x0 0.990243 - vae_audio 0.956370 - vae_audio (STFT cosine) 0.981929 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999998 0.038950 0.002063 -0.001725 0.980009 -0.001741 0.980402 - dit_step5_xt 0.999962 0.130437 0.005829 -0.006903 0.888898 -0.007143 0.887999 - dit_step10_xt 0.999885 0.226949 0.009019 -0.012332 0.810283 -0.012603 0.811299 - dit_step15_xt 0.999675 0.364782 0.013694 -0.017622 0.745056 -0.018114 0.745268 - dit_step20_xt 0.999177 0.445386 0.020236 -0.023046 0.699325 -0.023808 0.699582 - dit_step25_xt 0.998116 0.652368 0.029048 -0.028568 0.677830 -0.029311 0.679278 - dit_step30_xt 0.996402 1.067296 0.039895 -0.034151 0.683829 -0.035027 0.685262 - dit_step35_xt 0.994272 1.703333 0.052370 -0.039663 0.716078 -0.040716 0.717195 - dit_step40_xt 0.992207 2.069015 0.065941 -0.045141 0.769969 -0.046462 0.771853 - dit_step45_xt 0.990734 2.329453 0.078903 -0.051095 0.841302 -0.052475 0.843036 diff --git a/tests/CPU-BF16.log b/tests/CPU-BF16.log index b20ebae..74300ed 100644 --- a/tests/CPU-BF16.log +++ b/tests/CPU-BF16.log @@ -7,36 +7,34 @@ [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 3007.9 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 464.0 ms +[Load] DiT weight load: 301.5 ms [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE 
backend: CPU (CPU threads: 16) +[Load] VAE backend: CPU (shared) [VAE] Backend: CPU, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 651.3 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 666.9 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 31.9 ms +[Load] BPE tokenizer: 30.9 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CPU (CPU threads: 16) +[Load] TextEncoder backend: CPU (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 226.8 ms -[Encode] TextEncoder (70 tokens): 59.7 ms +[Load] TextEncoder: 121.5 ms +[Encode] TextEncoder (70 tokens): 58.0 ms [Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.7 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CPU (CPU threads: 16) +[Load] CondEncoder backend: CPU (shared) [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -46,18 +44,18 @@ [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 1160.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 230.8 ms 
+[Load] ConditionEncoder: 111.5 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 274.9 ms, enc_S=238 +[Encode] ConditionEncoder: 268.3 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.758296 -0.049593 -0.132844 0.058496 [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [WeightCtx] Loaded 30 tensors, 200.3 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 34.6 ms +[Load] Detokenizer: 23.6 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 958.8 ms +[Context] Detokenizer: 889.4 ms [Debug] detok_output: [2170, 64] first4: -0.124160 1.435260 0.310138 -0.624584 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -112,35 +110,32 @@ [Debug] dit_step7_vt: [2170, 64] first4: 0.002176 0.183052 -1.467304 3.113325 [Debug] dit_x0: [2170, 64] first4: 0.083178 1.441022 0.423316 -1.927701 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 18721.5 ms (18721.5 ms/sample) +[DiT] Total generation: 17583.4 ms (17583.4 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.083178 1.441022 0.423316 -1.927701 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 51818.0 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000519 0.001024 0.000897 0.001200 +[VAE Batch0] Decode: 46859.3 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000480 0.000983 0.000816 0.001189 [VAE Batch0] Wrote 
ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:57:38.585 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:57:38.585 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:57:38.585 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:57:38.586 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:57:38.586 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:57:39.413 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:57:40.961 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:57:40.961 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:57:40.966 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:57:41.132 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:49:02.827 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. 
+2026-03-04 21:49:02.916 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:49:04.251 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:49:04.252 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:49:04.253 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:49:04.259 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:49:04.454 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -154,8 +149,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:49:04.456 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -182,25 +177,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:57:41.140 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:57:41.153 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:57:41.153 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:57:41.175 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:57:41.483 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:57:41.483 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:57:41.483 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00688624382019043, 'diffusion_time_cost': 0.30014586448669434, 'diffusion_per_step_time_cost': 0.03751823306083679, 'total_time_cost': 0.30703210830688477, 'offload_time_cost': 0.0} -2026-03-01 19:57:41.498 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:57:41.500 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:57:41.500 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.44 GB -2026-03-01 19:57:41.500 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:57:41.500 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.44 GB -2026-03-01 19:57:41.500 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.44 GB -2026-03-01 19:57:41.500 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:57:41.775 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:57:41.777 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:57:41.780 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:49:04.463 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:49:04.478 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:49:04.478 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:49:04.514 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:49:04.845 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:49:04.846 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:49:04.846 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.007018327713012695, 'diffusion_time_cost': 0.32423973083496094, 'diffusion_per_step_time_cost': 0.04052996635437012, 'total_time_cost': 0.33125805854797363, 'offload_time_cost': 0.0} +2026-03-04 21:49:04.860 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:49:04.862 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:49:04.862 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.77 GB +2026-03-04 21:49:04.862 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:49:04.862 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.77 GB +2026-03-04 21:49:04.862 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.77 GB +2026-03-04 21:49:04.862 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:49:05.138 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:49:05.140 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:49:05.142 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf [GGML] Running acestep-v15-turbo-BF16.gguf... 
@@ -244,8 +239,8 @@ Using precomputed LM hints dit_step6_xt 0.988142 dit_step7_vt 0.969102 dit_x0 0.979106 - vae_audio 0.901370 - vae_audio (STFT cosine) 0.975816 + vae_audio 0.901389 + vae_audio (STFT cosine) 0.975826 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999946 0.136541 0.006626 -0.002312 0.972951 -0.002342 0.972003 diff --git a/tests/CPU-Q4_K_M.log b/tests/CPU-Q4_K_M.log index 508a20c..540c4c4 100644 --- a/tests/CPU-Q4_K_M.log +++ b/tests/CPU-Q4_K_M.log @@ -1,5 +1,5 @@ [Load] DiT backend: CPU (CPU threads: 16) -[Load] Backend init: 6.3 ms +[Load] Backend init: 1.6 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K fused, V separate [DiT] Cross-attn: all separate @@ -7,36 +7,34 @@ [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 895.6 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 118.4 ms +[Load] DiT weight load: 121.8 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CPU (CPU threads: 16) +[Load] VAE backend: CPU (shared) [VAE] Backend: CPU, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 696.2 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 699.6 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 33.0 ms +[Load] BPE tokenizer: 33.3 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] 
TextEncoder backend: CPU (CPU threads: 16) +[Load] TextEncoder backend: CPU (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 148.2 ms -[Encode] TextEncoder (70 tokens): 58.0 ms +[Load] TextEncoder: 122.9 ms +[Encode] TextEncoder (70 tokens): 60.4 ms [Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.6 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CPU (CPU threads: 16) +[Load] CondEncoder backend: CPU (shared) [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -46,18 +44,18 @@ [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 352.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 37.5 ms +[Load] ConditionEncoder: 34.8 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 294.2 ms, enc_S=238 +[Encode] ConditionEncoder: 300.3 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.759313 -0.049345 -0.129442 0.055759 [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 64.7 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 10.1 ms +[Load] Detokenizer: 9.1 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 354.8 ms +[Context] Detokenizer: 361.0 ms [Debug] 
detok_output: [2170, 64] first4: -0.106265 1.448869 0.309591 -0.650098 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -112,35 +110,32 @@ [Debug] dit_step7_vt: [2170, 64] first4: -0.463452 0.896626 -1.673395 3.222673 [Debug] dit_x0: [2170, 64] first4: 0.290887 1.122067 0.588729 -1.917174 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 21769.5 ms (21769.5 ms/sample) +[DiT] Total generation: 21823.6 ms (21823.6 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.290887 1.122067 0.588729 -1.917174 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 52184.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000272 0.000786 0.000556 0.000990 +[VAE Batch0] Decode: 47904.5 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000330 0.000828 0.000665 0.001038 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 20:03:15.903 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 20:03:15.903 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 20:03:15.903 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 20:03:15.903 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. 
-2026-03-01 20:03:15.904 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 20:03:16.714 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 20:03:18.309 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 20:03:18.309 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 20:03:18.315 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 20:03:18.480 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:54:26.607 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:54:26.698 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:54:28.050 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:54:28.050 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... 
+2026-03-04 21:54:28.054 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:54:28.059 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:54:28.263 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -154,8 +149,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 20:03:18.482 | 
INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -182,25 +177,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 20:03:18.488 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 20:03:18.501 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 20:03:18.501 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 20:03:18.540 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... 
-2026-03-01 20:03:18.854 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 20:03:18.855 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 20:03:18.855 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006970643997192383, 'diffusion_time_cost': 0.3072662353515625, 'diffusion_per_step_time_cost': 0.03840827941894531, 'total_time_cost': 0.3142368793487549, 'offload_time_cost': 0.0} -2026-03-01 20:03:18.869 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 20:03:18.872 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 20:03:18.872 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 20:03:18.872 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 20:03:18.872 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 20:03:18.872 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 20:03:18.872 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 20:03:19.148 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 20:03:19.151 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 20:03:19.154 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:54:28.272 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:54:28.288 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:54:28.288 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:54:28.323 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:54:28.640 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:54:28.641 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:54:28.641 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.0070536136627197266, 'diffusion_time_cost': 0.30983686447143555, 'diffusion_per_step_time_cost': 0.03872960805892944, 'total_time_cost': 0.3168904781341553, 'offload_time_cost': 0.0} +2026-03-04 21:54:28.655 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:54:28.666 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:54:28.666 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:54:28.666 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:54:28.666 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:54:28.666 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:54:28.666 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:54:28.949 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:54:28.951 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:54:28.952 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf [GGML] Running acestep-v15-turbo-Q4_K_M.gguf... 
@@ -244,8 +239,8 @@ Using precomputed LM hints dit_step6_xt 0.977196 dit_step7_vt 0.939970 dit_x0 0.959881 - vae_audio 0.834993 - vae_audio (STFT cosine) 0.955098 + vae_audio 0.834992 + vae_audio (STFT cosine) 0.955102 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999883 0.167680 0.010319 -0.002256 0.973185 -0.002342 0.972003 diff --git a/tests/CPU-Q5_K_M.log b/tests/CPU-Q5_K_M.log index e0d9936..6722100 100644 --- a/tests/CPU-Q5_K_M.log +++ b/tests/CPU-Q5_K_M.log @@ -7,36 +7,34 @@ [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1061.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 140.3 ms +[Load] DiT weight load: 110.6 ms [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CPU (CPU threads: 16) +[Load] VAE backend: CPU (shared) [VAE] Backend: CPU, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 699.1 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 698.0 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 33.4 ms +[Load] BPE tokenizer: 33.1 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CPU (CPU threads: 16) +[Load] TextEncoder backend: CPU (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused 
[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 149.7 ms -[Encode] TextEncoder (70 tokens): 57.3 ms +[Load] TextEncoder: 123.1 ms +[Encode] TextEncoder (70 tokens): 57.9 ms [Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.5 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CPU (CPU threads: 16) +[Load] CondEncoder backend: CPU (shared) [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -46,18 +44,18 @@ [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 412.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 45.1 ms +[Load] ConditionEncoder: 41.0 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 387.5 ms, enc_S=238 +[Encode] ConditionEncoder: 388.3 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.760901 -0.053445 -0.132760 0.058505 [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 73.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 11.3 ms +[Load] Detokenizer: 10.3 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 447.0 ms +[Context] Detokenizer: 446.1 ms [Debug] detok_output: [2170, 64] first4: -0.129311 1.458194 0.298132 -0.651512 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -112,35 +110,32 @@ [Debug] dit_step7_vt: [2170, 64] first4: -0.003599 
0.325174 -1.377289 3.053612 [Debug] dit_x0: [2170, 64] first4: 0.058232 1.415164 0.443289 -1.901864 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 27970.1 ms (27970.1 ms/sample) +[DiT] Total generation: 28035.0 ms (28035.0 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.058232 1.415164 0.443289 -1.901864 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 51966.1 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000740 0.001305 0.001083 0.001434 +[VAE Batch0] Decode: 47798.0 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000762 0.001320 0.001139 0.001557 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 20:01:55.226 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 20:01:55.226 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 20:01:55.226 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 20:01:55.226 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 20:01:55.226 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. 
-2026-03-01 20:01:56.032 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 20:01:57.576 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 20:01:57.577 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 20:01:57.581 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 20:01:57.747 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:53:09.193 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:53:09.323 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:53:10.674 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:53:10.674 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:53:10.676 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). 
+2026-03-04 21:53:10.682 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:53:10.881 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -154,8 +149,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 20:01:57.749 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -182,25 +177,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 20:01:57.755 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 20:01:57.768 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 20:01:57.768 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 20:01:57.801 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 20:01:58.109 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 20:01:58.109 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 20:01:58.109 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.007002353668212891, 'diffusion_time_cost': 0.30033254623413086, 'diffusion_per_step_time_cost': 0.03754156827926636, 'total_time_cost': 0.30733489990234375, 'offload_time_cost': 0.0} -2026-03-01 20:01:58.124 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 20:01:58.126 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 20:01:58.126 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 20:01:58.126 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 20:01:58.126 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 20:01:58.126 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 20:01:58.126 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 20:01:58.401 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 20:01:58.403 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 20:01:58.406 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:53:10.890 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:53:10.930 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:53:10.930 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:53:10.966 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:53:11.283 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:53:11.284 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:53:11.284 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006951332092285156, 'diffusion_time_cost': 0.3100306987762451, 'diffusion_per_step_time_cost': 0.03875383734703064, 'total_time_cost': 0.3169820308685303, 'offload_time_cost': 0.0} +2026-03-04 21:53:11.298 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:53:11.300 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:53:11.300 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.77 GB +2026-03-04 21:53:11.300 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:53:11.300 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.77 GB +2026-03-04 21:53:11.300 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.77 GB +2026-03-04 21:53:11.300 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:53:11.575 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:53:11.577 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:53:11.579 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf [GGML] Running acestep-v15-turbo-Q5_K_M.gguf... 
@@ -244,8 +239,8 @@ Using precomputed LM hints dit_step6_xt 0.983513 dit_step7_vt 0.954349 dit_x0 0.970379 - vae_audio 0.874800 - vae_audio (STFT cosine) 0.967703 + vae_audio 0.874850 + vae_audio (STFT cosine) 0.967714 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999926 0.135378 0.008030 -0.002303 0.973012 -0.002342 0.972003 diff --git a/tests/CPU-Q6_K.log b/tests/CPU-Q6_K.log index 7d4c411..75b4fd2 100644 --- a/tests/CPU-Q6_K.log +++ b/tests/CPU-Q6_K.log @@ -7,36 +7,34 @@ [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1237.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 169.4 ms +[Load] DiT weight load: 150.0 ms [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CPU (CPU threads: 16) +[Load] VAE backend: CPU (shared) [VAE] Backend: CPU, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 699.2 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 689.7 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.5 ms +[Load] BPE tokenizer: 33.1 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CPU (CPU threads: 16) +[Load] TextEncoder backend: CPU (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] 
Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 148.3 ms -[Encode] TextEncoder (70 tokens): 57.5 ms +[Load] TextEncoder: 124.2 ms +[Encode] TextEncoder (70 tokens): 58.0 ms [Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.6 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CPU (CPU threads: 16) +[Load] CondEncoder backend: CPU (shared) [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -46,18 +44,18 @@ [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 476.3 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 52.6 ms +[Load] ConditionEncoder: 47.5 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 348.9 ms, enc_S=238 +[Encode] ConditionEncoder: 349.5 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.761694 -0.052035 -0.131773 0.058231 [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 82.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 12.3 ms +[Load] Detokenizer: 11.0 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 414.3 ms +[Context] Detokenizer: 417.1 ms [Debug] detok_output: [2170, 64] first4: -0.151355 1.462444 0.326907 -0.627213 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -112,35 +110,32 @@ [Debug] dit_step7_vt: [2170, 64] first4: 0.118016 0.207620 -1.266971 
2.955565 [Debug] dit_x0: [2170, 64] first4: 0.004752 1.435176 0.398691 -1.887822 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 25398.3 ms (25398.3 ms/sample) +[DiT] Total generation: 25477.6 ms (25477.6 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.004752 1.435176 0.398691 -1.887822 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 52074.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000467 0.001015 0.000873 0.001303 +[VAE Batch0] Decode: 47852.2 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000553 0.001102 0.000938 0.001323 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 20:00:28.298 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 20:00:28.298 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 20:00:28.298 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 20:00:28.298 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 20:00:28.298 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 20:00:29.103 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! 
Use `dtype` instead! -2026-03-01 20:00:30.690 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 20:00:30.690 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 20:00:30.695 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 20:00:30.860 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:51:45.520 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:51:45.634 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:51:46.994 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:51:46.994 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:51:46.995 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:51:47.001 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... 
+2026-03-04 21:51:47.198 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -154,8 +149,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:51:47.201 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -182,25 +177,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 20:00:30.869 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 20:00:30.881 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 20:00:30.882 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 20:00:30.914 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 20:00:31.231 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 20:00:31.232 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 20:00:31.232 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006938934326171875, 'diffusion_time_cost': 0.31071925163269043, 'diffusion_per_step_time_cost': 0.038839906454086304, 'total_time_cost': 0.3176581859588623, 'offload_time_cost': 0.0} -2026-03-01 20:00:31.246 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 20:00:31.249 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 20:00:31.249 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.40 GB -2026-03-01 20:00:31.249 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 20:00:31.249 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.40 GB -2026-03-01 20:00:31.249 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.40 GB -2026-03-01 20:00:31.249 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 20:00:31.524 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 20:00:31.527 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 20:00:31.531 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:51:47.208 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:51:47.224 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:51:47.224 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:51:47.259 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:51:47.579 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:51:47.579 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:51:47.579 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.007021188735961914, 'diffusion_time_cost': 0.31169986724853516, 'diffusion_per_step_time_cost': 0.038962483406066895, 'total_time_cost': 0.31872105598449707, 'offload_time_cost': 0.0} +2026-03-04 21:51:47.593 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:51:47.595 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:51:47.595 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.77 GB +2026-03-04 21:51:47.596 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:51:47.596 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.77 GB +2026-03-04 21:51:47.596 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.77 GB +2026-03-04 21:51:47.596 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:51:47.870 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:51:47.872 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:51:47.874 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf [GGML] Running acestep-v15-turbo-Q6_K.gguf... 
@@ -244,8 +239,8 @@ Using precomputed LM hints dit_step6_xt 0.984569 dit_step7_vt 0.958147 dit_x0 0.972312 - vae_audio 0.891761 - vae_audio (STFT cosine) 0.969080 + vae_audio 0.891790 + vae_audio (STFT cosine) 0.969088 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999936 0.151952 0.007283 -0.002271 0.972870 -0.002342 0.972003 diff --git a/tests/CPU-Q8_0.log b/tests/CPU-Q8_0.log index 76183ea..3eb253c 100644 --- a/tests/CPU-Q8_0.log +++ b/tests/CPU-Q8_0.log @@ -7,36 +7,34 @@ [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1600.7 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 188.0 ms +[Load] DiT weight load: 178.6 ms [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CPU (CPU threads: 16) +[Load] VAE backend: CPU (shared) [VAE] Backend: CPU, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 690.8 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 692.2 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.8 ms +[Load] BPE tokenizer: 32.4 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CPU (CPU threads: 16) +[Load] TextEncoder backend: CPU (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] 
Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 160.0 ms -[Encode] TextEncoder (70 tokens): 57.9 ms +[Load] TextEncoder: 123.5 ms +[Encode] TextEncoder (70 tokens): 58.2 ms [Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 13.0 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CPU (CPU threads: 16) +[Load] CondEncoder backend: CPU (shared) [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -46,18 +44,18 @@ [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 616.6 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 126.4 ms +[Load] ConditionEncoder: 65.1 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 390.3 ms, enc_S=238 +[Encode] ConditionEncoder: 373.4 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.758873 -0.049568 -0.132802 0.057792 [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 106.5 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 13.6 ms +[Load] Detokenizer: 14.3 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 447.8 ms +[Context] Detokenizer: 448.5 ms [Debug] detok_output: [2170, 64] first4: -0.126218 1.441045 0.305219 -0.629688 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -112,35 +110,32 @@ [Debug] dit_step7_vt: [2170, 64] first4: -0.037024 0.233524 -1.487499 
3.098410 [Debug] dit_x0: [2170, 64] first4: 0.094459 1.422387 0.433039 -1.914712 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 26043.3 ms (26043.3 ms/sample) +[DiT] Total generation: 26009.5 ms (26009.5 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.094459 1.422387 0.433039 -1.914712 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 52114.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000455 0.000930 0.000816 0.001121 +[VAE Batch0] Decode: 47762.1 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000441 0.000946 0.000788 0.001168 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:59:03.882 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:59:03.882 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:59:03.882 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:59:03.883 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:59:03.883 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:59:04.691 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! 
Use `dtype` instead! -2026-03-01 19:59:06.262 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:59:06.262 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:59:06.268 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:59:06.433 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:50:24.424 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:50:24.514 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:50:25.858 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:50:25.858 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:50:25.860 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:50:25.865 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... 
+2026-03-04 21:50:26.063 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -154,8 +149,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:50:26.065 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -182,25 +177,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:59:06.443 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:59:06.457 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:59:06.457 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:59:06.478 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:59:06.802 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:59:06.803 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:59:06.803 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006929874420166016, 'diffusion_time_cost': 0.3164329528808594, 'diffusion_per_step_time_cost': 0.03955411911010742, 'total_time_cost': 0.3233628273010254, 'offload_time_cost': 0.0} -2026-03-01 19:59:06.817 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:59:06.819 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:59:06.819 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:59:06.819 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:59:06.819 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:59:06.819 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:59:06.819 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:59:07.095 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:59:07.098 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:59:07.101 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:50:26.073 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:50:26.088 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:50:26.088 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:50:26.120 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:50:26.438 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:50:26.438 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:50:26.438 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.007014036178588867, 'diffusion_time_cost': 0.30962181091308594, 'diffusion_per_step_time_cost': 0.03870272636413574, 'total_time_cost': 0.3166358470916748, 'offload_time_cost': 0.0} +2026-03-04 21:50:26.452 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:50:26.455 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:50:26.455 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.86 GB +2026-03-04 21:50:26.455 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:50:26.455 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.86 GB +2026-03-04 21:50:26.455 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.86 GB +2026-03-04 21:50:26.455 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:50:26.730 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:50:26.732 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:50:26.734 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf [GGML] Running acestep-v15-turbo-Q8_0.gguf... @@ -244,8 +239,8 @@ Using precomputed LM hints dit_step6_xt 0.988647 dit_step7_vt 0.970238 dit_x0 0.980014 - vae_audio 0.903408 - vae_audio (STFT cosine) 0.976427 + vae_audio 0.903437 + vae_audio (STFT cosine) 0.976438 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999946 0.139652 0.006645 -0.002330 0.972930 -0.002342 0.972003 diff --git a/tests/CPU_BF16.log b/tests/CPU_BF16.log deleted file mode 100644 index fcae074..0000000 --- a/tests/CPU_BF16.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf -[GGML] Running acestep-v15-turbo-BF16.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... 
-[Python] Generating (acestep-v15-turbo, 8 steps)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999816 - lyric_embed 1.000000 - enc_hidden 0.999841 - detok_output 0.999995 - context 0.999997 - noise 1.000000 - temb_t 0.999999 - hidden_after_proj_in 0.999988 - enc_after_cond_emb 0.999832 - layer0_sa_output 0.999960 - hidden_after_layer0 0.999982 - hidden_after_layer6 0.999924 - hidden_after_layer12 0.999332 - hidden_after_layer18 0.996692 - hidden_after_layer23 0.993786 - dit_step0_vt 0.975712 - dit_step0_xt 0.999946 - dit_step1_vt 0.979525 - dit_step1_xt 0.999833 - dit_step2_vt 0.981808 - dit_step2_xt 0.999552 - dit_step3_vt 0.982382 - dit_step3_xt 0.998917 - dit_step4_vt 0.980777 - dit_step4_xt 0.997480 - dit_step5_vt 0.978078 - dit_step5_xt 0.994264 - dit_step6_vt 0.974849 - dit_step6_xt 0.988142 - dit_step7_vt 0.969102 - dit_x0 0.979106 - vae_audio 0.901370 - vae_audio (STFT cosine) 0.975816 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999946 0.136541 0.006626 -0.002312 0.972951 -0.002342 0.972003 - dit_step1_xt 0.999833 0.265486 0.011288 -0.005309 0.942692 -0.005313 0.941730 - dit_step2_xt 0.999552 0.451896 0.017477 -0.009347 0.909217 -0.009311 0.908527 - dit_step3_xt 0.998917 0.642624 0.025957 -0.014710 0.873863 -0.014577 0.873624 - dit_step4_xt 0.997480 0.778374 0.037868 -0.021751 0.842047 -0.021660 0.841995 - dit_step5_xt 0.994264 1.244624 0.055630 -0.031814 0.825360 -0.032109 0.824593 - dit_step6_xt 0.988142 2.080976 0.082605 -0.046091 0.856212 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-BF16.gguf -[GGML] Running acestep-v15-sft-BF16.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... 
-[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999816 - lyric_embed 1.000000 - enc_hidden 0.999841 - detok_output 0.999995 - context 0.999997 - noise 1.000000 - temb_t 0.999998 - hidden_after_proj_in 0.999988 - enc_after_cond_emb 0.999834 - layer0_sa_output 0.999959 - hidden_after_layer0 0.999984 - hidden_after_layer6 0.999851 - hidden_after_layer12 0.999471 - hidden_after_layer18 0.998749 - hidden_after_layer23 0.998994 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998963 - dit_step0_vt_uncond 0.998717 - dit_step0_vt 0.995766 - dit_step0_xt 0.999998 - dit_step5_vt_cond 0.999507 - dit_step5_vt 0.993884 - dit_step5_xt 0.999963 - dit_step10_vt_cond 0.998797 - dit_step10_vt 0.993423 - dit_step10_xt 0.999887 - dit_step15_vt_cond 0.997670 - dit_step15_vt 0.988372 - dit_step15_xt 0.999682 - dit_step20_vt_cond 0.995498 - dit_step20_vt 0.982137 - dit_step20_xt 0.999190 - dit_step25_vt_cond 0.991181 - dit_step25_vt 0.972161 - dit_step25_xt 0.998167 - dit_step30_vt_cond 0.986183 - dit_step30_vt 0.967394 - dit_step30_xt 0.996519 - dit_step35_vt_cond 0.981815 - dit_step35_vt 0.959696 - dit_step35_xt 0.994436 - dit_step40_vt_cond 0.979298 - dit_step40_vt 0.954151 - dit_step40_xt 0.992400 - dit_step45_vt_cond 0.981642 - dit_step45_vt 0.955459 - dit_step45_xt 0.990953 - dit_step49_vt_cond 0.982680 - dit_step49_vt 0.941788 - dit_x0 0.990427 - vae_audio 0.960778 - vae_audio (STFT cosine) 0.984703 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999998 0.038465 0.002037 -0.001739 0.980023 -0.001741 0.980402 - dit_step5_xt 0.999963 0.130767 0.005794 -0.006951 0.888986 -0.007143 0.887999 - dit_step10_xt 0.999887 0.230145 0.008907 -0.012421 
0.810420 -0.012603 0.811299 - dit_step15_xt 0.999682 0.369882 0.013468 -0.017757 0.745283 -0.018114 0.745268 - dit_step20_xt 0.999190 0.439784 0.019899 -0.023189 0.699688 -0.023808 0.699582 - dit_step25_xt 0.998167 0.657918 0.028642 -0.028736 0.678283 -0.029311 0.679278 - dit_step30_xt 0.996519 1.070616 0.039415 -0.034342 0.684394 -0.035027 0.685262 - dit_step35_xt 0.994436 1.684599 0.051968 -0.039891 0.716568 -0.040716 0.717195 - dit_step40_xt 0.992400 2.115248 0.065570 -0.045402 0.770424 -0.046462 0.771853 - dit_step45_xt 0.990953 2.369087 0.078496 -0.051406 0.841668 -0.052475 0.843036 diff --git a/tests/CPU_Q4_K_M.log b/tests/CPU_Q4_K_M.log deleted file mode 100644 index 44fd5b2..0000000 --- a/tests/CPU_Q4_K_M.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf -[GGML] Running acestep-v15-turbo-Q4_K_M.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999816 - lyric_embed 1.000000 - enc_hidden 0.997095 - detok_output 0.999577 - context 0.999730 - noise 1.000000 - temb_t 0.999896 - hidden_after_proj_in 0.999903 - enc_after_cond_emb 0.997571 - layer0_sa_output 0.998370 - hidden_after_layer0 0.999619 - hidden_after_layer6 0.999177 - hidden_after_layer12 0.995111 - hidden_after_layer18 0.991459 - hidden_after_layer23 0.985217 - dit_step0_vt 0.946613 - dit_step0_xt 0.999883 - dit_step1_vt 0.947613 - dit_step1_xt 0.999611 - dit_step2_vt 0.958491 - dit_step2_xt 0.999010 - dit_step3_vt 0.962965 - dit_step3_xt 0.997773 - dit_step4_vt 0.960997 - dit_step4_xt 0.994989 - dit_step5_vt 0.957636 - dit_step5_xt 0.988832 - dit_step6_vt 0.952016 - dit_step6_xt 0.977196 - dit_step7_vt 0.939970 - dit_x0 0.959881 - vae_audio 0.834993 - vae_audio (STFT cosine) 0.955098 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999883 0.167680 0.010319 -0.002256 0.973185 -0.002342 0.972003 - dit_step1_xt 0.999611 0.268237 0.018204 -0.005104 0.943179 -0.005313 0.941730 - dit_step2_xt 0.999010 0.434671 0.027774 -0.009029 0.910147 -0.009311 0.908527 - dit_step3_xt 0.997773 0.601206 0.039926 -0.014325 0.875171 -0.014577 0.873624 - dit_step4_xt 0.994989 0.892883 0.057385 -0.021274 0.843615 -0.021660 0.841995 - dit_step5_xt 0.988832 1.381146 0.083605 -0.031218 0.827061 -0.032109 0.824593 - dit_step6_xt 0.977196 2.021005 0.123750 -0.045473 0.858175 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q4_K_M.gguf -[GGML] Running acestep-v15-sft-Q4_K_M.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999816 - lyric_embed 1.000000 - enc_hidden 0.997095 - detok_output 0.999577 - context 0.999730 - noise 1.000000 - temb_t 0.999645 - hidden_after_proj_in 0.999904 - enc_after_cond_emb 0.997560 - layer0_sa_output 0.998513 - hidden_after_layer0 0.999624 - hidden_after_layer6 0.999091 - hidden_after_layer12 0.997675 - hidden_after_layer18 0.996682 - hidden_after_layer23 0.996897 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.996806 - dit_step0_vt_uncond 0.996163 - dit_step0_vt 0.990085 - dit_step0_xt 0.999995 - dit_step5_vt_cond 0.995410 - dit_step5_vt 0.978964 - dit_step5_xt 0.999822 - dit_step10_vt_cond 0.991521 - dit_step10_vt 0.970202 - dit_step10_xt 0.999221 - dit_step15_vt_cond 0.981975 - dit_step15_vt 0.945173 - dit_step15_xt 0.997485 - dit_step20_vt_cond 0.967221 - dit_step20_vt 0.918272 - dit_step20_xt 0.993402 - dit_step25_vt_cond 0.950021 - dit_step25_vt 0.894843 - dit_step25_xt 0.986289 - dit_step30_vt_cond 0.929833 - dit_step30_vt 0.870341 - dit_step30_xt 0.976182 - dit_step35_vt_cond 0.909548 - dit_step35_vt 0.845635 - dit_step35_xt 0.964963 - dit_step40_vt_cond 0.897534 - dit_step40_vt 0.827777 - dit_step40_xt 0.954875 - dit_step45_vt_cond 0.908619 - dit_step45_vt 0.841100 - dit_step45_xt 0.948114 - dit_step49_vt_cond 0.927278 - dit_step49_vt 0.867932 - dit_x0 0.945906 - vae_audio 0.825297 - vae_audio (STFT cosine) 0.924406 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999995 0.035570 0.002883 -0.001844 0.980345 -0.001741 0.980402 - dit_step5_xt 0.999822 0.188835 0.013032 -0.007303 0.890510 -0.007143 0.887999 - dit_step10_xt 0.999221 0.527206 0.024125 -0.012987 0.812393 -0.012603 0.811299 - dit_step15_xt 0.997485 
0.839391 0.039117 -0.018648 0.747696 -0.018114 0.745268 - dit_step20_xt 0.993402 1.146206 0.058860 -0.024311 0.701939 -0.023808 0.699582 - dit_step25_xt 0.986289 1.528936 0.081899 -0.030231 0.679540 -0.029311 0.679278 - dit_step30_xt 0.976182 1.891257 0.108598 -0.036282 0.684111 -0.035027 0.685262 - dit_step35_xt 0.964963 2.208873 0.137902 -0.042366 0.714637 -0.040716 0.717195 - dit_step40_xt 0.954875 2.494038 0.168832 -0.048453 0.767102 -0.046462 0.771853 - dit_step45_xt 0.948114 2.800970 0.198350 -0.054785 0.837697 -0.052475 0.843036 diff --git a/tests/CPU_Q5_K_M.log b/tests/CPU_Q5_K_M.log deleted file mode 100644 index 4732362..0000000 --- a/tests/CPU_Q5_K_M.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf -[GGML] Running acestep-v15-turbo-Q5_K_M.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999816 - lyric_embed 1.000000 - enc_hidden 0.999099 - detok_output 0.999843 - context 0.999900 - noise 1.000000 - temb_t 0.999968 - hidden_after_proj_in 0.999954 - enc_after_cond_emb 0.999196 - layer0_sa_output 0.999388 - hidden_after_layer0 0.999773 - hidden_after_layer6 0.999687 - hidden_after_layer12 0.998560 - hidden_after_layer18 0.995178 - hidden_after_layer23 0.990907 - dit_step0_vt 0.966084 - dit_step0_xt 0.999926 - dit_step1_vt 0.972329 - dit_step1_xt 0.999780 - dit_step2_vt 0.971107 - dit_step2_xt 0.999383 - dit_step3_vt 0.973886 - dit_step3_xt 0.998543 - dit_step4_vt 0.971976 - dit_step4_xt 0.996642 - dit_step5_vt 0.967575 - dit_step5_xt 0.992211 - dit_step6_vt 0.962964 - dit_step6_xt 0.983513 - dit_step7_vt 0.954349 - dit_x0 0.970379 - vae_audio 0.874800 
- vae_audio (STFT cosine) 0.967703 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999926 0.135378 0.008030 -0.002303 0.973012 -0.002342 0.972003 - dit_step1_xt 0.999780 0.276712 0.013491 -0.005310 0.942849 -0.005313 0.941730 - dit_step2_xt 0.999383 0.460420 0.021261 -0.009337 0.909465 -0.009311 0.908527 - dit_step3_xt 0.998543 0.681684 0.031463 -0.014739 0.874175 -0.014577 0.873624 - dit_step4_xt 0.996642 0.853164 0.045737 -0.021967 0.842445 -0.021660 0.841995 - dit_step5_xt 0.992211 1.314129 0.067657 -0.032346 0.825989 -0.032109 0.824593 - dit_step6_xt 0.983513 2.191432 0.101363 -0.046949 0.857195 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q5_K_M.gguf -[GGML] Running acestep-v15-sft-Q5_K_M.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999816 - lyric_embed 1.000000 - enc_hidden 0.999099 - detok_output 0.999843 - context 0.999900 - noise 1.000000 - temb_t 0.999877 - hidden_after_proj_in 0.999954 - enc_after_cond_emb 0.999196 - layer0_sa_output 0.999446 - hidden_after_layer0 0.999823 - hidden_after_layer6 0.999554 - hidden_after_layer12 0.998967 - hidden_after_layer18 0.997974 - hidden_after_layer23 0.998436 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998372 - dit_step0_vt_uncond 0.998354 - dit_step0_vt 0.994379 - dit_step0_xt 0.999997 - dit_step5_vt_cond 0.998658 - dit_step5_vt 0.988358 - dit_step5_xt 0.999933 - dit_step10_vt_cond 0.997095 - dit_step10_vt 0.985993 - dit_step10_xt 0.999758 - dit_step15_vt_cond 0.993108 - dit_step15_vt 0.970538 - dit_step15_xt 0.999209 - dit_step20_vt_cond 0.985753 - 
dit_step20_vt 0.954524 - dit_step20_xt 0.997715 - dit_step25_vt_cond 0.976423 - dit_step25_vt 0.938088 - dit_step25_xt 0.994906 - dit_step30_vt_cond 0.965769 - dit_step30_vt 0.925268 - dit_step30_xt 0.990600 - dit_step35_vt_cond 0.955274 - dit_step35_vt 0.909442 - dit_step35_xt 0.985533 - dit_step40_vt_cond 0.949378 - dit_step40_vt 0.894016 - dit_step40_xt 0.980757 - dit_step45_vt_cond 0.956168 - dit_step45_vt 0.901535 - dit_step45_xt 0.977447 - dit_step49_vt_cond 0.966288 - dit_step49_vt 0.914297 - dit_x0 0.976302 - vae_audio 0.889659 - vae_audio (STFT cosine) 0.945409 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999997 0.037808 0.002296 -0.001776 0.980078 -0.001741 0.980402 - dit_step5_xt 0.999933 0.104447 0.007971 -0.006973 0.889460 -0.007143 0.887999 - dit_step10_xt 0.999758 0.210002 0.013370 -0.012530 0.810881 -0.012603 0.811299 - dit_step15_xt 0.999209 0.418503 0.021538 -0.017971 0.745622 -0.018114 0.745268 - dit_step20_xt 0.997715 0.623172 0.033317 -0.023695 0.699368 -0.023808 0.699582 - dit_step25_xt 0.994906 0.874752 0.047642 -0.029485 0.676770 -0.029311 0.679278 - dit_step30_xt 0.990600 1.161649 0.065018 -0.035311 0.680992 -0.035027 0.685262 - dit_step35_xt 0.985533 1.453686 0.084547 -0.041122 0.711332 -0.040716 0.717195 - dit_step40_xt 0.980757 1.810532 0.105436 -0.046941 0.764001 -0.046462 0.771853 - dit_step45_xt 0.977447 2.167346 0.125231 -0.053123 0.834843 -0.052475 0.843036 diff --git a/tests/CPU_Q6_K.log b/tests/CPU_Q6_K.log deleted file mode 100644 index 93d1e05..0000000 --- a/tests/CPU_Q6_K.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf -[GGML] Running acestep-v15-turbo-Q6_K.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999816 - lyric_embed 1.000000 - enc_hidden 0.999634 - detok_output 0.999927 - context 0.999954 - noise 1.000000 - temb_t 0.999986 - hidden_after_proj_in 0.999975 - enc_after_cond_emb 0.999619 - layer0_sa_output 0.999718 - hidden_after_layer0 0.999827 - hidden_after_layer6 0.999788 - hidden_after_layer12 0.998843 - hidden_after_layer18 0.995848 - hidden_after_layer23 0.992196 - dit_step0_vt 0.971124 - dit_step0_xt 0.999936 - dit_step1_vt 0.975111 - dit_step1_xt 0.999802 - dit_step2_vt 0.978218 - dit_step2_xt 0.999477 - dit_step3_vt 0.977576 - dit_step3_xt 0.998723 - dit_step4_vt 0.973938 - dit_step4_xt 0.996945 - dit_step5_vt 0.969356 - dit_step5_xt 0.992753 - dit_step6_vt 0.965671 - dit_step6_xt 0.984569 - dit_step7_vt 0.958147 - dit_x0 0.972312 - vae_audio 0.891761 - vae_audio (STFT cosine) 0.969080 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999936 0.151952 0.007283 -0.002271 0.972870 -0.002342 0.972003 - dit_step1_xt 0.999802 0.296519 0.012516 -0.005212 0.942575 -0.005313 0.941730 - dit_step2_xt 0.999477 0.478400 0.019283 -0.009184 0.908992 -0.009311 0.908527 - dit_step3_xt 0.998723 0.734609 0.028810 -0.014535 0.873457 -0.014577 0.873624 - dit_step4_xt 0.996945 1.045720 0.042804 -0.021712 0.841447 -0.021660 0.841995 - dit_step5_xt 0.992753 1.512605 0.064324 -0.032020 0.824620 -0.032109 0.824593 - dit_step6_xt 0.984569 2.166596 0.096699 -0.046604 0.855715 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q6_K.gguf -[GGML] Running acestep-v15-sft-Q6_K.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999816 - lyric_embed 1.000000 - enc_hidden 0.999634 - detok_output 0.999927 - context 0.999954 - noise 1.000000 - temb_t 0.999952 - hidden_after_proj_in 0.999974 - enc_after_cond_emb 0.999624 - layer0_sa_output 0.999731 - hidden_after_layer0 0.999858 - hidden_after_layer6 0.999745 - hidden_after_layer12 0.999282 - hidden_after_layer18 0.998391 - hidden_after_layer23 0.998703 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998624 - dit_step0_vt_uncond 0.998134 - dit_step0_vt 0.994531 - dit_step0_xt 0.999997 - dit_step5_vt_cond 0.999105 - dit_step5_vt 0.991049 - dit_step5_xt 0.999950 - dit_step10_vt_cond 0.997890 - dit_step10_vt 0.988681 - dit_step10_xt 0.999825 - dit_step15_vt_cond 0.995763 - dit_step15_vt 0.978576 - dit_step15_xt 0.999458 - dit_step20_vt_cond 0.991824 - dit_step20_vt 0.966730 - dit_step20_xt 0.998566 - dit_step25_vt_cond 0.986001 - dit_step25_vt 0.952775 - dit_step25_xt 0.996897 - dit_step30_vt_cond 0.979821 - dit_step30_vt 0.943526 - dit_step30_xt 0.994344 - dit_step35_vt_cond 0.973662 - dit_step35_vt 0.929345 - dit_step35_xt 0.991309 - dit_step40_vt_cond 0.969585 - dit_step40_vt 0.918968 - dit_step40_xt 0.988416 - dit_step45_vt_cond 0.972816 - dit_step45_vt 0.918164 - dit_step45_xt 0.986334 - dit_step49_vt_cond 0.976204 - dit_step49_vt 0.909094 - dit_x0 0.985561 - vae_audio 0.940827 - vae_audio (STFT cosine) 0.976287 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999997 0.037619 0.002240 -0.001750 0.980170 -0.001741 0.980402 - dit_step5_xt 0.999950 0.129572 0.006928 -0.006971 0.889777 -0.007143 0.887999 - dit_step10_xt 0.999825 0.192490 0.011325 -0.012410 0.811294 -0.012603 0.811299 - dit_step15_xt 0.999458 
0.319211 0.017944 -0.017698 0.745779 -0.018114 0.745268 - dit_step20_xt 0.998566 0.553748 0.026838 -0.023098 0.699443 -0.023808 0.699582 - dit_step25_xt 0.996897 0.760972 0.037747 -0.028532 0.677161 -0.029311 0.679278 - dit_step30_xt 0.994344 1.235259 0.050893 -0.033936 0.681526 -0.035027 0.685262 - dit_step35_xt 0.991309 1.863492 0.065806 -0.039291 0.711899 -0.040716 0.717195 - dit_step40_xt 0.988416 2.112072 0.082079 -0.044606 0.764056 -0.046462 0.771853 - dit_step45_xt 0.986334 2.338981 0.097741 -0.050358 0.834033 -0.052475 0.843036 diff --git a/tests/CPU_Q8_0.log b/tests/CPU_Q8_0.log deleted file mode 100644 index f4a9086..0000000 --- a/tests/CPU_Q8_0.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf -[GGML] Running acestep-v15-turbo-Q8_0.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999816 - lyric_embed 1.000000 - enc_hidden 0.999814 - detok_output 0.999983 - context 0.999990 - noise 1.000000 - temb_t 0.999997 - hidden_after_proj_in 0.999985 - enc_after_cond_emb 0.999791 - layer0_sa_output 0.999925 - hidden_after_layer0 0.999955 - hidden_after_layer6 0.999892 - hidden_after_layer12 0.999219 - hidden_after_layer18 0.996644 - hidden_after_layer23 0.993707 - dit_step0_vt 0.975605 - dit_step0_xt 0.999946 - dit_step1_vt 0.978928 - dit_step1_xt 0.999831 - dit_step2_vt 0.981129 - dit_step2_xt 0.999551 - dit_step3_vt 0.982813 - dit_step3_xt 0.998932 - dit_step4_vt 0.981292 - dit_step4_xt 0.997544 - dit_step5_vt 0.979091 - dit_step5_xt 0.994467 - dit_step6_vt 0.976152 - dit_step6_xt 0.988647 - dit_step7_vt 0.970238 - dit_x0 0.980014 - vae_audio 0.903408 - 
vae_audio (STFT cosine) 0.976427 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999946 0.139652 0.006645 -0.002330 0.972930 -0.002342 0.972003 - dit_step1_xt 0.999831 0.267117 0.011368 -0.005325 0.942659 -0.005313 0.941730 - dit_step2_xt 0.999551 0.452101 0.017578 -0.009369 0.909163 -0.009311 0.908527 - dit_step3_xt 0.998932 0.629880 0.025911 -0.014735 0.873792 -0.014577 0.873624 - dit_step4_xt 0.997544 0.759572 0.037583 -0.021796 0.841987 -0.021660 0.841995 - dit_step5_xt 0.994467 1.235701 0.054893 -0.031886 0.825306 -0.032109 0.824593 - dit_step6_xt 0.988647 2.096131 0.081207 -0.046181 0.856264 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q8_0.gguf -[GGML] Running acestep-v15-sft-Q8_0.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999816 - lyric_embed 1.000000 - enc_hidden 0.999814 - detok_output 0.999983 - context 0.999990 - noise 1.000000 - temb_t 0.999991 - hidden_after_proj_in 0.999986 - enc_after_cond_emb 0.999795 - layer0_sa_output 0.999912 - hidden_after_layer0 0.999958 - hidden_after_layer6 0.999824 - hidden_after_layer12 0.999445 - hidden_after_layer18 0.998719 - hidden_after_layer23 0.998974 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998922 - dit_step0_vt_uncond 0.998427 - dit_step0_vt 0.995455 - dit_step0_xt 0.999998 - dit_step5_vt_cond 0.999446 - dit_step5_vt 0.993188 - dit_step5_xt 0.999961 - dit_step10_vt_cond 0.998529 - dit_step10_vt 0.992281 - dit_step10_xt 0.999875 - dit_step15_vt_cond 0.996311 - dit_step15_vt 0.982856 - dit_step15_xt 0.999609 - dit_step20_vt_cond 0.992095 - dit_step20_vt 
0.974098 - dit_step20_xt 0.998863 - dit_step25_vt_cond 0.986516 - dit_step25_vt 0.962299 - dit_step25_xt 0.997338 - dit_step30_vt_cond 0.980702 - dit_step30_vt 0.955880 - dit_step30_xt 0.995005 - dit_step35_vt_cond 0.975404 - dit_step35_vt 0.945189 - dit_step35_xt 0.992202 - dit_step40_vt_cond 0.972588 - dit_step40_vt 0.935722 - dit_step40_xt 0.989533 - dit_step45_vt_cond 0.975984 - dit_step45_vt 0.937094 - dit_step45_xt 0.987666 - dit_step49_vt_cond 0.978734 - dit_step49_vt 0.917631 - dit_x0 0.986993 - vae_audio 0.937093 - vae_audio (STFT cosine) 0.971416 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999998 0.038134 0.002096 -0.001710 0.980019 -0.001741 0.980402 - dit_step5_xt 0.999961 0.137689 0.005996 -0.006894 0.889095 -0.007143 0.887999 - dit_step10_xt 0.999875 0.219306 0.009469 -0.012337 0.810457 -0.012603 0.811299 - dit_step15_xt 0.999609 0.356501 0.014905 -0.017570 0.745282 -0.018114 0.745268 - dit_step20_xt 0.998863 0.570726 0.023002 -0.022897 0.699575 -0.023808 0.699582 - dit_step25_xt 0.997338 0.870836 0.033418 -0.028306 0.678021 -0.029311 0.679278 - dit_step30_xt 0.995005 1.126647 0.045749 -0.033772 0.683965 -0.035027 0.685262 - dit_step35_xt 0.992202 1.561250 0.059823 -0.039172 0.715848 -0.040716 0.717195 - dit_step40_xt 0.989533 1.985042 0.074909 -0.044584 0.769539 -0.046462 0.771853 - dit_step45_xt 0.987666 2.384698 0.089346 -0.050474 0.840839 -0.052475 0.843036 diff --git a/tests/CUDA-BF16.log b/tests/CUDA-BF16.log index d73a934..5ed30ff 100644 --- a/tests/CUDA-BF16.log +++ b/tests/CUDA-BF16.log @@ -1,7 +1,7 @@ ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes [Load] DiT backend: CUDA0 (CPU threads: 16) -[Load] Backend init: 70.8 ms +[Load] Backend init: 32.6 ms [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 
+9,34 @@ ggml_cuda_init: found 1 CUDA devices: [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 3007.9 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 375.6 ms +[Load] DiT weight load: 310.9 ms [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CUDA0 (CPU threads: 16) +[Load] VAE backend: CUDA0 (shared) [VAE] Backend: CUDA0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 661.0 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 653.9 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.8 ms +[Load] BPE tokenizer: 30.9 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[Load] TextEncoder backend: CUDA0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 128.5 ms -[Encode] TextEncoder (70 tokens): 50.6 ms +[Load] TextEncoder: 102.3 ms +[Encode] TextEncoder (70 tokens): 50.4 ms [Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.5 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 
1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[Load] CondEncoder backend: CUDA0 (shared) [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_cuda_init: found 1 CUDA devices: [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 1160.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 127.1 ms +[Load] ConditionEncoder: 90.6 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 7.9 ms, enc_S=238 +[Encode] ConditionEncoder: 8.2 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.758648 -0.049409 -0.132412 0.058372 [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [WeightCtx] Loaded 30 tensors, 200.3 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 24.2 ms +[Load] Detokenizer: 17.6 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 141.9 ms +[Context] Detokenizer: 140.1 ms [Debug] detok_output: [2170, 64] first4: -0.124204 1.435425 0.309963 -0.624679 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -114,35 +112,32 @@ ggml_cuda_init: found 1 CUDA devices: [Debug] dit_step7_vt: [2170, 64] first4: -0.004009 0.190141 -1.466879 3.103273 [Debug] dit_x0: [2170, 64] first4: 0.085060 1.438241 0.424145 -1.920485 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 248.3 ms (248.3 ms/sample) +[DiT] Total generation: 243.9 ms (243.9 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.085060 1.438241 0.424145 -1.920485 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 
nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 812.8 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000547 0.000898 0.000798 0.001064 +[VAE Batch0] Decode: 615.3 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000498 0.000900 0.000800 0.001124 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:54:08.539 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:54:08.540 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:54:08.540 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:54:08.540 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:54:08.540 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:54:09.277 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:54:10.804 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:54:10.804 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... 
-2026-03-01 19:54:10.810 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:54:10.970 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:45:24.010 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:45:24.091 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:45:25.418 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:45:25.418 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:45:25.421 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:45:25.426 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:45:25.618 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:45:25.621 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:54:10.978 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:54:10.991 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:54:10.991 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:54:11.023 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:54:11.329 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:54:11.330 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:54:11.330 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.0068187713623046875, 'diffusion_time_cost': 0.2986173629760742, 'diffusion_per_step_time_cost': 0.03732717037200928, 'total_time_cost': 0.3054361343383789, 'offload_time_cost': 0.0} -2026-03-01 19:54:11.344 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:54:11.349 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:54:11.349 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:54:11.349 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:54:11.349 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:54:11.349 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:54:11.349 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:54:11.625 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:54:11.628 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:54:11.632 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:45:25.628 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:45:25.643 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:45:25.643 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:45:25.674 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:45:25.993 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:45:25.994 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:45:25.994 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006845712661743164, 'diffusion_time_cost': 0.3112342357635498, 'diffusion_per_step_time_cost': 0.038904279470443726, 'total_time_cost': 0.31807994842529297, 'offload_time_cost': 0.0} +2026-03-04 21:45:26.008 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:45:26.010 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:45:26.010 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:45:26.010 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:45:26.010 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:45:26.010 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:45:26.010 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:45:26.284 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:45:26.286 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:45:26.288 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf [GGML] Running acestep-v15-turbo-BF16.gguf... 
@@ -246,8 +241,8 @@ Using precomputed LM hints dit_step6_xt 0.988188 dit_step7_vt 0.969375 dit_x0 0.979213 - vae_audio 0.901377 - vae_audio (STFT cosine) 0.975525 + vae_audio 0.901411 + vae_audio (STFT cosine) 0.975533 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999945 0.135628 0.006709 -0.002312 0.972932 -0.002342 0.972003 diff --git a/tests/CUDA-Q4_K_M.log b/tests/CUDA-Q4_K_M.log index 189cb71..403d030 100644 --- a/tests/CUDA-Q4_K_M.log +++ b/tests/CUDA-Q4_K_M.log @@ -1,7 +1,7 @@ ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes [Load] DiT backend: CUDA0 (CPU threads: 16) -[Load] Backend init: 11.2 ms +[Load] Backend init: 9.6 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K fused, V separate [DiT] Cross-attn: all separate @@ -9,36 +9,34 @@ ggml_cuda_init: found 1 CUDA devices: [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 895.6 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 403.0 ms +[Load] DiT weight load: 141.8 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CUDA0 (CPU threads: 16) +[Load] VAE backend: CUDA0 (shared) [VAE] Backend: CUDA0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 655.9 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 652.4 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] 
T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 31.4 ms +[Load] BPE tokenizer: 32.6 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[Load] TextEncoder backend: CUDA0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 126.3 ms -[Encode] TextEncoder (70 tokens): 52.7 ms +[Load] TextEncoder: 103.0 ms +[Encode] TextEncoder (70 tokens): 50.9 ms [Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.1 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[Load] CondEncoder backend: CUDA0 (shared) [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -48,18 +46,18 @@ ggml_cuda_init: found 1 CUDA devices: [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 352.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 118.9 ms +[Load] ConditionEncoder: 29.8 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 12.7 ms, enc_S=238 +[Encode] ConditionEncoder: 13.3 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.759848 -0.046220 -0.129361 0.057668 [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 64.7 MB into backend [Load] Detokenizer: 
FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 22.1 ms +[Load] Detokenizer: 6.3 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 124.0 ms +[Context] Detokenizer: 124.2 ms [Debug] detok_output: [2170, 64] first4: -0.098446 1.438721 0.299255 -0.646500 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -114,35 +112,32 @@ ggml_cuda_init: found 1 CUDA devices: [Debug] dit_step7_vt: [2170, 64] first4: -0.488470 0.849564 -1.659694 3.185843 [Debug] dit_x0: [2170, 64] first4: 0.317955 1.165446 0.587176 -1.877443 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 249.1 ms (249.1 ms/sample) +[DiT] Total generation: 249.0 ms (249.0 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.317955 1.165446 0.587176 -1.877443 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 820.0 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000325 0.000812 0.000671 0.000911 +[VAE Batch0] Decode: 616.0 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000379 0.000847 0.000704 0.001000 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:54:39.264 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:54:39.265 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:54:39.265 | WARNING | acestep.training.data_module::25 - Lightning not installed. 
Training module will not be available. -2026-03-01 19:54:39.265 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:54:39.265 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:54:40.025 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:54:41.587 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:54:41.587 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:54:41.592 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:54:41.751 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:45:55.364 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:45:55.452 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:45:56.779 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... 
+2026-03-04 21:45:56.779 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:45:56.781 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:45:56.786 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:45:56.978 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:45:56.980 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:45:56.980 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:45:56.980 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:45:56.980 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An 
upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:45:56.981 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:45:56.981 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:45:56.981 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:54:41.759 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:54:41.771 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:54:41.772 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... 
(DiT backend: PyTorch (cuda)) -2026-03-01 19:54:41.805 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:54:42.113 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:54:42.114 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:54:42.114 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006765604019165039, 'diffusion_time_cost': 0.3010725975036621, 'diffusion_per_step_time_cost': 0.037634074687957764, 'total_time_cost': 0.30783820152282715, 'offload_time_cost': 0.0} -2026-03-01 19:54:42.128 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:54:42.131 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:54:42.131 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:54:42.131 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:54:42.131 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:54:42.131 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:54:42.131 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:54:42.405 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:54:42.408 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:54:42.411 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:45:56.987 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:45:57.002 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:45:57.002 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:45:57.032 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:45:57.348 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:45:57.349 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:45:57.349 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006890535354614258, 'diffusion_time_cost': 0.30885934829711914, 'diffusion_per_step_time_cost': 0.03860741853713989, 'total_time_cost': 0.3157498836517334, 'offload_time_cost': 0.0} +2026-03-04 21:45:57.363 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:45:57.366 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:45:57.366 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.78 GB +2026-03-04 21:45:57.366 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:45:57.366 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.78 GB +2026-03-04 21:45:57.366 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.78 GB +2026-03-04 21:45:57.366 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:45:57.640 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:45:57.642 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:45:57.644 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf [GGML] Running acestep-v15-turbo-Q4_K_M.gguf... 
@@ -246,8 +241,8 @@ Using precomputed LM hints dit_step6_xt 0.976494 dit_step7_vt 0.938658 dit_x0 0.958725 - vae_audio 0.837763 - vae_audio (STFT cosine) 0.954448 + vae_audio 0.837780 + vae_audio (STFT cosine) 0.954457 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999885 0.165835 0.010206 -0.002260 0.973133 -0.002342 0.972003 diff --git a/tests/CUDA-Q5_K_M.log b/tests/CUDA-Q5_K_M.log index 00b9652..4e72f4f 100644 --- a/tests/CUDA-Q5_K_M.log +++ b/tests/CUDA-Q5_K_M.log @@ -1,7 +1,7 @@ ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes [Load] DiT backend: CUDA0 (CPU threads: 16) -[Load] Backend init: 25.7 ms +[Load] Backend init: 9.6 ms [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K fused, V separate [DiT] Cross-attn: all separate @@ -9,36 +9,34 @@ ggml_cuda_init: found 1 CUDA devices: [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1061.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 465.4 ms +[Load] DiT weight load: 152.8 ms [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CUDA0 (CPU threads: 16) +[Load] VAE backend: CUDA0 (shared) [VAE] Backend: CUDA0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 656.4 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 655.0 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] 
T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges [Load] BPE tokenizer: 31.3 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[Load] TextEncoder backend: CUDA0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 127.3 ms -[Encode] TextEncoder (70 tokens): 49.5 ms +[Load] TextEncoder: 102.1 ms +[Encode] TextEncoder (70 tokens): 70.3 ms [Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.4 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[Load] CondEncoder backend: CUDA0 (shared) [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -48,18 +46,18 @@ ggml_cuda_init: found 1 CUDA devices: [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 412.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 138.7 ms +[Load] ConditionEncoder: 34.4 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 13.1 ms, enc_S=238 +[Encode] ConditionEncoder: 13.5 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.760389 -0.050879 -0.130835 0.059141 [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 73.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 
2048->64) -[Load] Detokenizer: 24.2 ms +[Load] Detokenizer: 6.8 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 121.7 ms +[Context] Detokenizer: 124.1 ms [Debug] detok_output: [2170, 64] first4: -0.125017 1.460327 0.292545 -0.654237 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -114,35 +112,32 @@ ggml_cuda_init: found 1 CUDA devices: [Debug] dit_step7_vt: [2170, 64] first4: 0.031181 0.378487 -1.509792 3.095486 [Debug] dit_x0: [2170, 64] first4: 0.032336 1.392616 0.498835 -1.905283 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 251.1 ms (251.1 ms/sample) +[DiT] Total generation: 261.4 ms (261.4 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.032336 1.392616 0.498835 -1.905283 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 804.2 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000692 0.001098 0.000938 0.001230 +[VAE Batch0] Decode: 614.5 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000681 0.001094 0.000878 0.001246 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:54:31.395 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:54:31.395 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:54:31.395 | WARNING | acestep.training.data_module::25 - Lightning not installed. 
Training module will not be available. -2026-03-01 19:54:31.395 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:54:31.395 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:54:32.168 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:54:33.881 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:54:33.882 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:54:33.887 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:54:34.060 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:45:47.565 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:45:47.662 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:45:48.979 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... 
+2026-03-04 21:45:48.979 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:45:48.981 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:45:48.987 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:45:49.182 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An 
upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:54:34.068 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:54:34.081 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:54:34.081 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... 
(DiT backend: PyTorch (cuda)) -2026-03-01 19:54:34.105 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:54:34.415 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:54:34.416 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:54:34.416 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006921052932739258, 'diffusion_time_cost': 0.3029003143310547, 'diffusion_per_step_time_cost': 0.037862539291381836, 'total_time_cost': 0.30982136726379395, 'offload_time_cost': 0.0} -2026-03-01 19:54:34.431 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:54:34.436 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:54:34.436 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:54:34.436 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:54:34.436 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:54:34.436 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:54:34.436 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:54:34.714 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:54:34.716 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:54:34.720 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:45:49.211 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:45:49.226 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:45:49.226 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:45:49.260 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:45:49.577 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:45:49.577 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:45:49.578 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00789022445678711, 'diffusion_time_cost': 0.30838513374328613, 'diffusion_per_step_time_cost': 0.03854814171791077, 'total_time_cost': 0.31627535820007324, 'offload_time_cost': 0.0} +2026-03-04 21:45:49.591 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:45:49.594 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:45:49.594 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:45:49.594 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:45:49.594 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:45:49.594 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:45:49.594 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:45:49.873 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:45:49.875 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:45:49.877 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf [GGML] Running acestep-v15-turbo-Q5_K_M.gguf... 
@@ -246,8 +241,8 @@ Using precomputed LM hints dit_step6_xt 0.983446 dit_step7_vt 0.953383 dit_x0 0.970119 - vae_audio 0.883226 - vae_audio (STFT cosine) 0.968463 + vae_audio 0.883245 + vae_audio (STFT cosine) 0.968470 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999930 0.139407 0.007818 -0.002306 0.973025 -0.002342 0.972003 diff --git a/tests/CUDA-Q6_K.log b/tests/CUDA-Q6_K.log index 10b9a7a..4950234 100644 --- a/tests/CUDA-Q6_K.log +++ b/tests/CUDA-Q6_K.log @@ -1,7 +1,7 @@ ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes [Load] DiT backend: CUDA0 (CPU threads: 16) -[Load] Backend init: 9.5 ms +[Load] Backend init: 10.2 ms [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_cuda_init: found 1 CUDA devices: [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1237.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 514.8 ms +[Load] DiT weight load: 176.0 ms [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CUDA0 (CPU threads: 16) +[Load] VAE backend: CUDA0 (shared) [VAE] Backend: CUDA0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 657.3 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 655.0 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] 
Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 30.7 ms +[Load] BPE tokenizer: 31.5 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[Load] TextEncoder backend: CUDA0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 125.7 ms -[Encode] TextEncoder (70 tokens): 49.2 ms +[Load] TextEncoder: 102.6 ms +[Encode] TextEncoder (70 tokens): 51.1 ms [Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.3 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[Load] CondEncoder backend: CUDA0 (shared) [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_cuda_init: found 1 CUDA devices: [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 476.3 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 145.8 ms +[Load] ConditionEncoder: 40.6 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 11.0 ms, enc_S=238 +[Encode] ConditionEncoder: 10.8 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.760759 -0.050104 -0.133269 0.058044 [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 82.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 
2048->64) -[Load] Detokenizer: 26.4 ms +[Load] Detokenizer: 7.8 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 123.5 ms +[Context] Detokenizer: 123.6 ms [Debug] detok_output: [2170, 64] first4: -0.140341 1.456987 0.310602 -0.632665 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -114,35 +112,32 @@ ggml_cuda_init: found 1 CUDA devices: [Debug] dit_step7_vt: [2170, 64] first4: 0.081321 0.135461 -1.397063 2.986206 [Debug] dit_x0: [2170, 64] first4: 0.028793 1.462229 0.417478 -1.887184 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 273.2 ms (273.2 ms/sample) +[DiT] Total generation: 270.6 ms (270.6 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.028793 1.462229 0.417478 -1.887184 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 804.3 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000481 0.000872 0.000838 0.001216 +[VAE Batch0] Decode: 616.4 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000531 0.001035 0.000900 0.001303 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:54:23.682 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:54:23.683 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:54:23.683 | WARNING | acestep.training.data_module::25 - Lightning not installed. 
Training module will not be available. -2026-03-01 19:54:23.683 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:54:23.683 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:54:24.419 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:54:25.992 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:54:25.992 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:54:25.998 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:54:26.157 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:45:39.727 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:45:39.815 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:45:41.135 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... 
+2026-03-04 21:45:41.135 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:45:41.137 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:45:41.142 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:45:41.335 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An 
upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:54:26.166 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:54:26.178 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:54:26.178 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... 
(DiT backend: PyTorch (cuda)) -2026-03-01 19:54:26.214 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:54:26.528 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:54:26.528 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:54:26.528 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00680994987487793, 'diffusion_time_cost': 0.30716919898986816, 'diffusion_per_step_time_cost': 0.03839614987373352, 'total_time_cost': 0.3139791488647461, 'offload_time_cost': 0.0} -2026-03-01 19:54:26.543 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:54:26.545 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:54:26.545 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.44 GB -2026-03-01 19:54:26.545 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:54:26.545 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.44 GB -2026-03-01 19:54:26.545 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.44 GB -2026-03-01 19:54:26.545 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:54:26.821 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:54:26.824 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:54:26.828 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:45:41.345 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:45:41.359 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:45:41.359 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:45:41.390 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:45:41.705 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:45:41.706 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:45:41.706 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006890773773193359, 'diffusion_time_cost': 0.30776047706604004, 'diffusion_per_step_time_cost': 0.038470059633255005, 'total_time_cost': 0.3146512508392334, 'offload_time_cost': 0.0} +2026-03-04 21:45:41.720 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:45:41.722 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:45:41.723 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:45:41.723 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:45:41.723 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:45:41.723 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:45:41.723 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:45:41.997 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:45:41.999 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:45:42.001 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf [GGML] Running acestep-v15-turbo-Q6_K.gguf... 
@@ -246,8 +241,8 @@ Using precomputed LM hints dit_step6_xt 0.985862 dit_step7_vt 0.962454 dit_x0 0.974866 - vae_audio 0.893678 - vae_audio (STFT cosine) 0.969663 + vae_audio 0.893720 + vae_audio (STFT cosine) 0.969672 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999937 0.147590 0.007252 -0.002265 0.972930 -0.002342 0.972003 diff --git a/tests/CUDA-Q8_0.log b/tests/CUDA-Q8_0.log index 3a84ce1..2744819 100644 --- a/tests/CUDA-Q8_0.log +++ b/tests/CUDA-Q8_0.log @@ -1,7 +1,7 @@ ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes [Load] DiT backend: CUDA0 (CPU threads: 16) -[Load] Backend init: 9.5 ms +[Load] Backend init: 9.7 ms [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_cuda_init: found 1 CUDA devices: [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1600.7 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 221.9 ms +[Load] DiT weight load: 201.4 ms [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CUDA0 (CPU threads: 16) +[Load] VAE backend: CUDA0 (shared) [VAE] Backend: CUDA0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 658.9 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 655.8 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] 
Loaded from GGUF: 151643 vocab, 151387 merges [Load] BPE tokenizer: 31.2 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[Load] TextEncoder backend: CUDA0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 127.0 ms -[Encode] TextEncoder (70 tokens): 68.2 ms +[Load] TextEncoder: 102.2 ms +[Encode] TextEncoder (70 tokens): 57.4 ms [Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.3 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[Load] CondEncoder backend: CUDA0 (shared) [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_cuda_init: found 1 CUDA devices: [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 616.6 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 65.2 ms +[Load] ConditionEncoder: 52.3 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 8.9 ms, enc_S=238 +[Encode] ConditionEncoder: 9.0 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.759220 -0.049559 -0.133467 0.058389 [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 106.5 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 12.1 ms 
+[Load] Detokenizer: 9.2 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 104.8 ms +[Context] Detokenizer: 103.8 ms [Debug] detok_output: [2170, 64] first4: -0.120490 1.436288 0.301594 -0.632564 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -114,35 +112,32 @@ ggml_cuda_init: found 1 CUDA devices: [Debug] dit_step7_vt: [2170, 64] first4: -0.007394 0.229067 -1.488817 3.083439 [Debug] dit_x0: [2170, 64] first4: 0.087028 1.415554 0.432225 -1.919150 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 242.9 ms (242.9 ms/sample) +[DiT] Total generation: 236.6 ms (236.6 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.087028 1.415554 0.432225 -1.919150 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 822.6 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000524 0.000859 0.000752 0.001056 +[VAE Batch0] Decode: 618.6 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000531 0.000916 0.000781 0.001161 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:54:15.905 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:54:15.906 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:54:15.906 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. 
-2026-03-01 19:54:15.906 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:54:15.906 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:54:16.672 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:54:18.198 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:54:18.198 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:54:18.207 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:54:18.371 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:45:31.851 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:45:31.953 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:45:33.265 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:45:33.265 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... 
+2026-03-04 21:45:33.269 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:45:33.275 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:45:33.468 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:54:18.373 | 
INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:54:18.380 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:54:18.392 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:54:18.392 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:54:18.418 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... 
-2026-03-01 19:54:18.724 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:54:18.724 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:54:18.724 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006882190704345703, 'diffusion_time_cost': 0.298403263092041, 'diffusion_per_step_time_cost': 0.03730040788650513, 'total_time_cost': 0.3052854537963867, 'offload_time_cost': 0.0} -2026-03-01 19:54:18.739 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:54:18.741 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:54:18.741 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.44 GB -2026-03-01 19:54:18.741 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:54:18.741 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.44 GB -2026-03-01 19:54:18.741 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.44 GB -2026-03-01 19:54:18.741 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:54:19.031 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:54:19.034 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:54:19.037 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:45:33.490 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:45:33.505 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:45:33.505 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:45:33.539 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:45:33.854 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:45:33.855 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:45:33.855 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.0069425106048583984, 'diffusion_time_cost': 0.30779337882995605, 'diffusion_per_step_time_cost': 0.03847417235374451, 'total_time_cost': 0.31473588943481445, 'offload_time_cost': 0.0} +2026-03-04 21:45:33.869 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:45:33.871 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:45:33.871 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.78 GB +2026-03-04 21:45:33.871 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:45:33.871 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.78 GB +2026-03-04 21:45:33.871 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.78 GB +2026-03-04 21:45:33.871 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:45:34.145 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:45:34.147 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:45:34.149 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf [GGML] Running acestep-v15-turbo-Q8_0.gguf... 
@@ -246,8 +241,8 @@ Using precomputed LM hints dit_step6_xt 0.988641 dit_step7_vt 0.970144 dit_x0 0.979969 - vae_audio 0.905525 - vae_audio (STFT cosine) 0.976530 + vae_audio 0.905563 + vae_audio (STFT cosine) 0.976538 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999948 0.134961 0.006551 -0.002307 0.972901 -0.002342 0.972003 diff --git a/tests/Metal_Q4_K_M.log b/tests/Metal_Q4_K_M.log deleted file mode 100644 index e1ad24f..0000000 --- a/tests/Metal_Q4_K_M.log +++ /dev/null @@ -1,835 +0,0 @@ -ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices -ggml_metal_library_init: using embedded metal library -ggml_metal_library_init: loaded in 0.006 sec -ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s) -ggml_metal_device_init: GPU name: MTL0 -ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) -ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) -ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) -ggml_metal_device_init: simdgroup reduction = true -ggml_metal_device_init: simdgroup matrix mul. 
= true -ggml_metal_device_init: has unified memory = true -ggml_metal_device_init: has bfloat = true -ggml_metal_device_init: has tensor = false -ggml_metal_device_init: use residency sets = true -ggml_metal_device_init: use shared buffers = true -ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] DiT backend: MTL0 (CPU threads: 5) -[Load] Backend init: 20.9 ms -[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 -[DiT] Self-attn: Q+K fused, V separate -[DiT] Cross-attn: all separate -[DiT] MLP: gate+up fused -[Load] null_condition_emb found (CFG available) -[WeightCtx] Loaded 478 tensors, 895.6 MB into backend -[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 1421.5 ms -[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 -[Load] silence_latent: [15000, 64] from GGUF -[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] VAE backend: MTL0 (CPU threads: 5) -[VAE] Backend: MTL0, Weight buffer: 255.7 MB -[VAE] Loaded: 5 blocks, upsample=1920x -[Load] VAE weights: 337.8 ms -[Request 1/1] ggml-turbo/request0.json (batch=1) -[Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) -[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s -[Pipeline] 434 audio codes (86.8s @ 5Hz) -[Pipeline] T=2170, S=1085 -[BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE 
tokenizer: 42.3 ms -[Pipeline] caption: 70 tokens, lyrics: 167 tokens -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] TextEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 28L, H=1024, Nh=16/8 -[Load] TextEncoder: 593.9 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' -ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x11de0dee0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x11de0e340 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x11de0ebb0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' -ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x11de0f030 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x11de0f8a0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' 
-ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x11de0fed0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x11de107b0 | th_max = 576 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x11de11170 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' -ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x11de10350 | th_max = 1024 | th_width = 32 -[Encode] TextEncoder (70 tokens): 44.4 ms -[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 33.8 ms -[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] CondEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 -[WeightCtx] Loaded 140 tensors, 352.5 MB into backend -[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 543.9 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 
'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1 0x11de1b4b0 | th_max = 896 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q4_K_f32', name = 'kernel_mul_mm_q4_K_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q4_K_f32_bci=0_bco=1 0x11de1ba60 | th_max = 896 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x11de1bea0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x11de1c500 | th_max = 640 | th_width = 32 -[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 149.3 ms, enc_S=238 -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Debug] enc_hidden: [238, 2048] first4: 1.751263 -0.045978 -0.129705 0.058765 -[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 -[WeightCtx] Loaded 30 tensors, 64.7 MB into backend -[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 113.4 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x11de10d70 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 
'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x11de0aff0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2 0x11de0b950 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' -ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x11de1c9a0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q4_K_f32_r1_5', name = 'kernel_mul_mv_ext_q4_K_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q4_K_f32_r1_5_nsg=2_nxpsg=8 0x11de1d9f0 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8 0x11de1dfa0 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x11de1e320 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x11de1e580 | th_max = 448 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 
'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x11de1ef20 | th_max = 1024 | th_width = 32 -[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 1044.0 ms -[Debug] detok_output: [2170, 64] first4: -0.105288 1.440285 0.304742 -0.636920 -[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 -[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 -[Debug] context: [2170, 128] first4: -0.105288 1.440285 0.304742 -0.636920 -[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 -[DiT] Batch N=1, T=2170, S=1085, enc_S=238 -[DiT] Graph: 1775 nodes -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x11f008d70 | th_max = 832 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' -ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x11f009830 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' -ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x11f009c40 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q4_K_f32', name = 'kernel_mul_mv_q4_K_f32_nsg=2' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q4_K_f32_nsg=2 0x11f00ac80 | th_max = 768 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x11f00b000 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling 
pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x11f00b6c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x11f00b920 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x11f00bec0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x11f00c450 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x11f00cd60 | th_max = 640 | th_width = 32 -[Debug] tproj: [12288] first4: 0.260912 -0.160417 -0.090199 0.048634 -[Debug] temb: [2048] first4: 0.000215 -0.133911 -0.034469 0.065007 -[Debug] temb_t: [2048] first4: 0.000971 0.025677 -0.052124 0.063327 -[Debug] temb_r: [2048] first4: -0.000756 -0.159588 0.017655 0.001680 -[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 -[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 -[Debug] temb_lin1_t: [2048] first4: -0.049286 -0.053324 -0.012254 -0.047666 -[Debug] temb_lin1_r: [2048] first4: -0.015463 -0.031534 
-0.021259 0.006135 -[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.050396 -0.992003 0.526498 0.458000 -[Debug] proj_in_input: [192, 2170] first4: -0.105288 1.440285 0.304742 -0.636920 -[Debug] enc_after_cond_emb: [2048, 238] first4: -0.174268 0.781178 0.275122 -0.515942 -[Debug] layer0_sa_input: [2048, 1085] first4: -0.726228 -0.772737 -0.041859 0.262417 -[Debug] layer0_q_after_rope: [128, 16] first4: -12.136272 0.820533 1.509364 1.799582 -[Debug] layer0_k_after_rope: [128, 8] first4: -0.174268 0.781178 0.275122 -0.515942 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.599759 0.160940 -0.480259 0.455996 -[Debug] layer0_attn_out: [2048, 1085] first4: -12.315464 1.144032 1.760677 1.796125 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.579560 -1.062863 0.061853 0.466855 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.703488 -0.838320 -0.450424 0.503514 -[Debug] hidden_after_layer0: [2048, 1085] first4: -8.870923 0.423529 48.381233 -0.778579 -[Debug] hidden_after_layer6: [2048, 1085] first4: -21.397562 -1.526012 29.991730 -3.928804 -[Debug] hidden_after_layer12: [2048, 1085] first4: -17.419617 -13.309786 66.317848 28.914410 -[Debug] hidden_after_layer18: [2048, 1085] first4: -16.562674 9.657765 55.222641 17.661957 -[Debug] hidden_after_layer23: [2048, 1085] first4: -19.112629 7.039753 181.464966 133.927719 -[Debug] dit_step0_vt: [2170, 64] first4: -0.112419 1.107940 0.244994 2.200569 -[Debug] dit_step0_xt: [2170, 64] first4: 0.199446 2.105889 -0.183011 0.747630 -[DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: -0.082195 1.204432 -0.273788 1.824850 -[Debug] dit_step1_xt: [2170, 64] first4: 0.203929 2.040193 -0.168077 0.648093 -[DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: -0.043690 1.209513 0.074423 2.191977 -[Debug] dit_step2_xt: [2170, 64] first4: 0.206842 1.959559 -0.173039 0.501961 -[DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: 0.238132 1.171738 0.272480 2.506455 -[Debug] 
dit_step3_xt: [2170, 64] first4: 0.186998 1.861914 -0.195745 0.293090 -[DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.293275 1.147521 0.096848 2.639339 -[Debug] dit_step4_xt: [2170, 64] first4: 0.155575 1.738965 -0.206122 0.010304 -[DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.200179 1.089847 -0.403776 2.739777 -[Debug] dit_step5_xt: [2170, 64] first4: 0.126978 1.583273 -0.148440 -0.381093 -[DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: -0.078240 0.999644 -1.058107 2.768797 -[Debug] dit_step6_xt: [2170, 64] first4: 0.142626 1.383344 0.063182 -0.934852 -[DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: -0.417903 0.862772 -1.662739 3.246292 -[Debug] dit_x0: [2170, 64] first4: 0.267997 1.124512 0.562003 -1.908740 -[DiT] step 8/8 t=0.300 -[DiT] Total generation: 7809.5 ms (7809.5 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.267997 1.124512 0.562003 -1.908740 -[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 474 nodes, T_latent=192 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' -ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x11de1ab80 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x11de1ceb0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x11de1f410 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: 
loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x11de1f670 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x11de1fa20 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x11de20200 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32 0x11de20760 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x11de216c0 | th_max = 1024 | th_width = 32 -[VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 474 nodes, T_latent=256 -[VAE] Graph: 474 nodes, T_latent=186 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x11de21920 | th_max = 896 | th_width = 32 -[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 609663.4 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000307 0.000830 0.000664 0.001050 -[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Request 1/1] Done -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Pipeline] All done -ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices -ggml_metal_library_init: using embedded metal library -ggml_metal_library_init: loaded in 0.006 sec -ggml_metal_rsets_init: 
creating a residency set collection (keep_alive = 180 s) -ggml_metal_device_init: GPU name: MTL0 -ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) -ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) -ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) -ggml_metal_device_init: simdgroup reduction = true -ggml_metal_device_init: simdgroup matrix mul. = true -ggml_metal_device_init: has unified memory = true -ggml_metal_device_init: has bfloat = true -ggml_metal_device_init: has tensor = false -ggml_metal_device_init: use residency sets = true -ggml_metal_device_init: use shared buffers = true -ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] DiT backend: MTL0 (CPU threads: 5) -[Load] Backend init: 18.8 ms -[GGUF] ../models/acestep-v15-sft-Q4_K_M.gguf: 678 tensors, data at offset 56800 -[DiT] Self-attn: Q+K fused, V separate -[DiT] Cross-attn: all separate -[DiT] MLP: gate+up fused -[Load] null_condition_emb found (CFG available) -[WeightCtx] Loaded 478 tensors, 895.6 MB into backend -[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 1269.3 ms -[GGUF] ../models/acestep-v15-sft-Q4_K_M.gguf: 678 tensors, data at offset 56800 -[Load] silence_latent: [15000, 64] from GGUF -[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] VAE backend: MTL0 (CPU threads: 5) -[VAE] Backend: MTL0, Weight buffer: 255.7 MB -[VAE] Loaded: 5 blocks, upsample=1920x -[Load] VAE weights: 272.1 ms 
-[Request 1/1] ggml-sft/request0.json (batch=1) -[Request] parsed ggml-sft/request0.json (18 fields) -[Pipeline] seed=42, steps=50, guidance=7.0, shift=1.0, duration=88.0s -[Pipeline] 434 audio codes (86.8s @ 5Hz) -[Pipeline] T=2170, S=1085 -[BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 41.8 ms -[Pipeline] caption: 70 tokens, lyrics: 167 tokens -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] TextEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 28L, H=1024, Nh=16/8 -[Load] TextEncoder: 231.9 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' -ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x15570a490 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x15570a8f0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x15570b160 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' -ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x15570b5e0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 
'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x15570be50 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x15570c480 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x15570cd60 | th_max = 576 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x15570d170 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' -ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x15570d3d0 | th_max = 1024 | th_width = 32 -[Encode] TextEncoder (70 tokens): 48.9 ms -[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 33.9 ms -[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] CondEncoder backend: MTL0 
(CPU threads: 5) -[GGUF] ../models/acestep-v15-sft-Q4_K_M.gguf: 678 tensors, data at offset 56800 -[WeightCtx] Loaded 140 tensors, 352.5 MB into backend -[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 601.2 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1 0x155717100 | th_max = 896 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q4_K_f32', name = 'kernel_mul_mm_q4_K_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q4_K_f32_bci=0_bco=1 0x1557176b0 | th_max = 896 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x155717a30 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x155718090 | th_max = 640 | th_width = 32 -[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 151.9 ms, enc_S=238 -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Debug] enc_hidden: [238, 2048] first4: 1.751314 -0.046022 -0.129862 0.058756 -[GGUF] ../models/acestep-v15-sft-Q4_K_M.gguf: 678 tensors, data at offset 56800 -[WeightCtx] Loaded 30 tensors, 64.7 MB into backend -[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 101.7 ms -ggml_metal_library_compile_pipeline: compiling pipeline: 
base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x15570ebf0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x155707790 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2 0x155707dc0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' -ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x1557074e0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q4_K_f32_r1_5', name = 'kernel_mul_mv_ext_q4_K_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q4_K_f32_r1_5_nsg=2_nxpsg=8 0x1557192f0 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8 0x1557198a0 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x155719c20 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 
'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x155719e80 | th_max = 448 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x15571a8c0 | th_max = 1024 | th_width = 32 -[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 1040.2 ms -[Debug] detok_output: [2170, 64] first4: -0.105274 1.439665 0.307319 -0.637002 -[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 -[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 -[Debug] context: [2170, 128] first4: -0.105274 1.439665 0.307319 -0.637002 -[DiT] Starting: T=2170, S=1085, enc_S=238, steps=50, batch=1 -[DiT] Batch N=1, T=2170, S=1085, enc_S=238 -[DiT] Graph: 1775 nodes -[Debug] null_condition_emb: [2048] first4: 0.018066 -0.000360 0.005096 -0.000683 -[Debug] null_enc_hidden: [238, 2048] first4: 0.018066 -0.000360 0.005096 -0.000683 -[DiT] CFG enabled: guidance_scale=7.0, 2x forward per step, N=1 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x15560cd80 | th_max = 832 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' -ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x15560d720 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' 
-ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x15560db30 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q4_K_f32', name = 'kernel_mul_mv_q4_K_f32_nsg=2' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q4_K_f32_nsg=2 0x15560eb70 | th_max = 768 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x15560eef0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x15560f5b0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x15560f810 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x15560fdb0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x155610340 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded 
kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x155610d60 | th_max = 640 | th_width = 32 -[Debug] tproj: [12288] first4: 0.154826 -0.114975 -0.093002 0.082122 -[Debug] temb: [2048] first4: -0.003593 -0.176168 0.003892 -0.001352 -[Debug] temb_t: [2048] first4: -0.002002 0.003482 -0.013423 -0.001611 -[Debug] temb_r: [2048] first4: -0.001591 -0.179650 0.017315 0.000259 -[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 -[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 -[Debug] temb_lin1_t: [2048] first4: -0.026166 0.013606 0.032789 -0.028782 -[Debug] temb_lin1_r: [2048] first4: -0.001795 -0.011535 -0.006725 -0.011136 -[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.101326 -0.948224 0.490204 0.449757 -[Debug] proj_in_input: [192, 2170] first4: -0.105274 1.439665 0.307319 -0.637002 -[Debug] enc_after_cond_emb: [2048, 238] first4: -0.189214 0.805478 0.284418 -0.472295 -[Debug] layer0_sa_input: [2048, 1085] first4: -0.923880 -0.725952 -0.044805 0.297821 -[Debug] layer0_q_after_rope: [128, 16] first4: -12.125128 0.516320 1.460617 1.783048 -[Debug] layer0_k_after_rope: [128, 8] first4: -0.189214 0.805478 0.284418 -0.472295 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.642741 0.751692 -0.708185 0.515940 -[Debug] layer0_attn_out: [2048, 1085] first4: -11.610563 1.032188 1.685498 1.814675 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.803507 -1.373816 -0.306776 0.394307 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.998315 -1.012332 -0.558752 0.397301 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.408201 1.261657 57.661659 -1.674409 -[Debug] hidden_after_layer6: [2048, 1085] first4: -13.125732 4.401457 57.923130 1.593087 -[Debug] hidden_after_layer12: [2048, 1085] first4: -12.760151 8.784775 -27.576780 1.266083 -[Debug] hidden_after_layer18: [2048, 1085] first4: -3.029438 18.924910 -37.522003 -17.408060 -[Debug] hidden_after_layer23: 
[2048, 1085] first4: 25.718348 50.253456 58.487469 -24.616550 -[Debug] dit_step0_vt_cond: [2170, 64] first4: -0.549879 2.587143 -0.199758 1.525680 -[Debug] dit_step0_vt_uncond: [2170, 64] first4: -0.221552 2.068977 -0.854510 1.731250 -[Debug] dit_step0_vt: [2170, 64] first4: -0.770128 3.170936 0.103367 1.213956 -[Debug] dit_step0_xt: [2170, 64] first4: 0.209738 2.092831 -0.173942 0.823377 -[DiT] step 1/50 t=1.000 -[Debug] dit_step1_vt_cond: [2170, 64] first4: -0.553963 2.540515 -0.004453 1.412831 -[Debug] dit_step1_vt_uncond: [2170, 64] first4: -0.507386 2.385438 -0.093360 1.515296 -[Debug] dit_step1_vt: [2170, 64] first4: -0.244245 1.996188 -0.241419 1.557151 -[Debug] dit_step1_xt: [2170, 64] first4: 0.214623 2.052907 -0.169114 0.792234 -[DiT] step 2/50 t=0.980 -[Debug] dit_step2_vt_cond: [2170, 64] first4: -0.537810 2.506870 -0.002615 1.406658 -[Debug] dit_step2_vt_uncond: [2170, 64] first4: -0.493937 2.362162 -0.101829 1.455003 -[Debug] dit_step2_vt: [2170, 64] first4: -0.705231 2.991064 0.252674 1.183649 -[Debug] dit_step2_xt: [2170, 64] first4: 0.228728 1.993086 -0.174167 0.768561 -[DiT] step 3/50 t=0.960 -[Debug] dit_step3_vt_cond: [2170, 64] first4: -0.501613 2.438805 -0.019274 1.410215 -[Debug] dit_step3_vt_uncond: [2170, 64] first4: -0.521661 2.364079 -0.095044 1.376828 -[Debug] dit_step3_vt: [2170, 64] first4: -0.201260 2.055526 -0.239553 1.689172 -[Debug] dit_step3_xt: [2170, 64] first4: 0.232753 1.951976 -0.169376 0.734778 -[DiT] step 4/50 t=0.940 -[Debug] dit_step4_vt_cond: [2170, 64] first4: -0.465795 2.359768 -0.032364 1.399407 -[Debug] dit_step4_vt_uncond: [2170, 64] first4: -0.532122 2.334485 -0.099644 1.341739 -[Debug] dit_step4_vt: [2170, 64] first4: -0.511269 2.624130 0.214392 1.268924 -[Debug] dit_step4_xt: [2170, 64] first4: 0.242979 1.899493 -0.173664 0.709399 -[DiT] step 5/50 t=0.920 -[Debug] dit_step5_vt_cond: [2170, 64] first4: -0.416940 2.273875 -0.055556 1.387350 -[Debug] dit_step5_vt_uncond: [2170, 64] first4: -0.572103 2.299005 
-0.092359 1.353066 -[Debug] dit_step5_vt: [2170, 64] first4: 0.057514 1.863401 -0.254107 1.537004 -[Debug] dit_step5_xt: [2170, 64] first4: 0.241828 1.862225 -0.168582 0.678659 -[DiT] step 6/50 t=0.900 -[Debug] dit_step6_vt_cond: [2170, 64] first4: -0.399254 2.210152 -0.071076 1.369134 -[Debug] dit_step6_vt_uncond: [2170, 64] first4: -0.539425 2.227666 -0.114236 1.361075 -[Debug] dit_step6_vt: [2170, 64] first4: -0.380751 2.356979 0.167812 1.213706 -[Debug] dit_step6_xt: [2170, 64] first4: 0.249443 1.815086 -0.171938 0.654385 -[DiT] step 7/50 t=0.880 -[Debug] dit_step7_vt_cond: [2170, 64] first4: -0.339429 2.118412 -0.091855 1.350106 -[Debug] dit_step7_vt_uncond: [2170, 64] first4: -0.475619 2.122700 -0.120878 1.360558 -[Debug] dit_step7_vt: [2170, 64] first4: 0.052995 1.858614 -0.256165 1.379718 -[Debug] dit_step7_xt: [2170, 64] first4: 0.248383 1.777913 -0.166815 0.626791 -[DiT] step 8/50 t=0.860 -[Debug] dit_step8_vt_cond: [2170, 64] first4: -0.274483 2.023758 -0.093816 1.332238 -[Debug] dit_step8_vt_uncond: [2170, 64] first4: -0.393477 2.023790 -0.130114 1.332444 -[Debug] dit_step8_vt: [2170, 64] first4: -0.218486 2.105614 0.134615 1.230365 -[Debug] dit_step8_xt: [2170, 64] first4: 0.252753 1.735801 -0.169507 0.602183 -[DiT] step 9/50 t=0.840 -[Debug] dit_step9_vt_cond: [2170, 64] first4: -0.208702 1.940720 -0.100297 1.317338 -[Debug] dit_step9_vt_uncond: [2170, 64] first4: -0.302713 1.942014 -0.150935 1.306566 -[Debug] dit_step9_vt: [2170, 64] first4: 0.068625 1.756381 -0.163156 1.360642 -[Debug] dit_step9_xt: [2170, 64] first4: 0.251381 1.700673 -0.166244 0.574971 -[DiT] step 10/50 t=0.820 -[Debug] dit_step10_vt_cond: [2170, 64] first4: -0.162154 1.880021 -0.110640 1.303073 -[Debug] dit_step10_vt_uncond: [2170, 64] first4: -0.210358 1.886162 -0.152094 1.303815 -[Debug] dit_step10_vt: [2170, 64] first4: -0.200484 1.879984 0.061434 1.187651 -[Debug] dit_step10_xt: [2170, 64] first4: 0.255390 1.663074 -0.167473 0.551217 -[DiT] step 11/50 t=0.800 -[Debug] 
dit_step11_vt_cond: [2170, 64] first4: -0.132763 1.841353 -0.152935 1.280443 -[Debug] dit_step11_vt_uncond: [2170, 64] first4: -0.156466 1.839952 -0.166283 1.309973 -[Debug] dit_step11_vt: [2170, 64] first4: -0.006319 1.715424 -0.248815 1.180641 -[Debug] dit_step11_xt: [2170, 64] first4: 0.255517 1.628765 -0.162497 0.527605 -[DiT] step 12/50 t=0.780 -[Debug] dit_step12_vt_cond: [2170, 64] first4: -0.108732 1.804132 -0.204569 1.271017 -[Debug] dit_step12_vt_uncond: [2170, 64] first4: -0.137749 1.799717 -0.174060 1.349185 -[Debug] dit_step12_vt: [2170, 64] first4: -0.093850 1.775385 -0.218540 0.972914 -[Debug] dit_step12_xt: [2170, 64] first4: 0.257394 1.593257 -0.158126 0.508146 -[DiT] step 13/50 t=0.760 -[Debug] dit_step13_vt_cond: [2170, 64] first4: -0.084325 1.755919 -0.251734 1.253830 -[Debug] dit_step13_vt_uncond: [2170, 64] first4: -0.116151 1.744928 -0.223829 1.345488 -[Debug] dit_step13_vt: [2170, 64] first4: 0.034148 1.681178 -0.334965 1.042164 -[Debug] dit_step13_xt: [2170, 64] first4: 0.256711 1.559634 -0.151426 0.487303 -[DiT] step 14/50 t=0.740 -[Debug] dit_step14_vt_cond: [2170, 64] first4: -0.062454 1.706585 -0.275264 1.242871 -[Debug] dit_step14_vt_uncond: [2170, 64] first4: -0.092396 1.687153 -0.270903 1.319513 -[Debug] dit_step14_vt: [2170, 64] first4: -0.030339 1.704105 -0.218537 1.004399 -[Debug] dit_step14_xt: [2170, 64] first4: 0.257318 1.525552 -0.147056 0.467215 -[DiT] step 15/50 t=0.720 -[Debug] dit_step15_vt_cond: [2170, 64] first4: -0.039531 1.653934 -0.274129 1.244472 -[Debug] dit_step15_vt_uncond: [2170, 64] first4: -0.065533 1.623524 -0.308950 1.280105 -[Debug] dit_step15_vt: [2170, 64] first4: 0.042593 1.646848 -0.174753 1.192683 -[Debug] dit_step15_xt: [2170, 64] first4: 0.256466 1.492615 -0.143561 0.443362 -[DiT] step 16/50 t=0.700 -[Debug] dit_step16_vt_cond: [2170, 64] first4: -0.024221 1.582624 -0.288380 1.229998 -[Debug] dit_step16_vt_uncond: [2170, 64] first4: -0.041512 1.552975 -0.330420 1.243577 -[Debug] dit_step16_vt: [2170, 
64] first4: -0.014702 1.584471 -0.181940 1.121346 -[Debug] dit_step16_xt: [2170, 64] first4: 0.256760 1.460925 -0.139922 0.420935 -[DiT] step 17/50 t=0.680 -[Debug] dit_step17_vt_cond: [2170, 64] first4: -0.016144 1.507916 -0.306446 1.209517 -[Debug] dit_step17_vt_uncond: [2170, 64] first4: -0.023216 1.483080 -0.342848 1.208134 -[Debug] dit_step17_vt: [2170, 64] first4: 0.010192 1.492126 -0.218166 1.213425 -[Debug] dit_step17_xt: [2170, 64] first4: 0.256556 1.431083 -0.135559 0.396666 -[DiT] step 18/50 t=0.660 -[Debug] dit_step18_vt_cond: [2170, 64] first4: -0.011327 1.429419 -0.322466 1.189975 -[Debug] dit_step18_vt_uncond: [2170, 64] first4: -0.006504 1.414708 -0.351011 1.186830 -[Debug] dit_step18_vt: [2170, 64] first4: -0.055648 1.401301 -0.242752 1.127735 -[Debug] dit_step18_xt: [2170, 64] first4: 0.257669 1.403057 -0.130704 0.374111 -[DiT] step 19/50 t=0.640 -[Debug] dit_step19_vt_cond: [2170, 64] first4: -0.008919 1.352955 -0.336887 1.164963 -[Debug] dit_step19_vt_uncond: [2170, 64] first4: 0.006420 1.358623 -0.354804 1.168313 -[Debug] dit_step19_vt: [2170, 64] first4: -0.054127 1.236317 -0.295143 1.130394 -[Debug] dit_step19_xt: [2170, 64] first4: 0.258751 1.378330 -0.124801 0.351504 -[DiT] step 20/50 t=0.620 -[Debug] dit_step20_vt_cond: [2170, 64] first4: -0.004449 1.272026 -0.345863 1.142193 -[Debug] dit_step20_vt_uncond: [2170, 64] first4: 0.019787 1.305161 -0.354228 1.148333 -[Debug] dit_step20_vt: [2170, 64] first4: -0.100401 1.079987 -0.320124 1.076506 -[Debug] dit_step20_xt: [2170, 64] first4: 0.260759 1.356731 -0.118398 0.329973 -[DiT] step 21/50 t=0.600 -[Debug] dit_step21_vt_cond: [2170, 64] first4: -0.002161 1.194354 -0.356476 1.115376 -[Debug] dit_step21_vt_uncond: [2170, 64] first4: 0.027066 1.258520 -0.355503 1.123235 -[Debug] dit_step21_vt: [2170, 64] first4: -0.089629 0.890893 -0.367078 1.066256 -[Debug] dit_step21_xt: [2170, 64] first4: 0.262552 1.338913 -0.111057 0.308648 -[DiT] step 22/50 t=0.580 -[Debug] dit_step22_vt_cond: [2170, 64] 
first4: 0.001542 1.116787 -0.366798 1.082653 -[Debug] dit_step22_vt_uncond: [2170, 64] first4: 0.034784 1.215104 -0.359348 1.094688 -[Debug] dit_step22_vt: [2170, 64] first4: -0.114017 0.710875 -0.381058 1.001636 -[Debug] dit_step22_xt: [2170, 64] first4: 0.264832 1.324695 -0.103435 0.288616 -[DiT] step 23/50 t=0.560 -[Debug] dit_step23_vt_cond: [2170, 64] first4: 0.004356 1.043939 -0.376088 1.054782 -[Debug] dit_step23_vt_uncond: [2170, 64] first4: 0.040331 1.176215 -0.358597 1.069999 -[Debug] dit_step23_vt: [2170, 64] first4: -0.106657 0.513238 -0.439613 0.976581 -[Debug] dit_step23_xt: [2170, 64] first4: 0.266965 1.314431 -0.094643 0.269084 -[DiT] step 24/50 t=0.540 -[Debug] dit_step24_vt_cond: [2170, 64] first4: 0.004404 0.961254 -0.387939 1.015311 -[Debug] dit_step24_vt_uncond: [2170, 64] first4: 0.043793 1.129819 -0.356263 1.035491 -[Debug] dit_step24_vt: [2170, 64] first4: -0.131273 0.309370 -0.487982 0.900439 -[Debug] dit_step24_xt: [2170, 64] first4: 0.269591 1.308243 -0.084884 0.251075 -[DiT] step 25/50 t=0.520 -[Debug] dit_step25_vt_cond: [2170, 64] first4: -0.001606 0.858703 -0.396162 0.970976 -[Debug] dit_step25_vt_uncond: [2170, 64] first4: 0.045187 1.067146 -0.350258 0.994534 -[Debug] dit_step25_vt: [2170, 64] first4: -0.160841 0.082930 -0.542274 0.862474 -[Debug] dit_step25_xt: [2170, 64] first4: 0.272808 1.306585 -0.074038 0.233826 -[DiT] step 26/50 t=0.500 -[Debug] dit_step26_vt_cond: [2170, 64] first4: -0.011834 0.743138 -0.406478 0.912916 -[Debug] dit_step26_vt_uncond: [2170, 64] first4: 0.044098 0.988983 -0.348666 0.943761 -[Debug] dit_step26_vt: [2170, 64] first4: -0.203731 -0.135469 -0.575882 0.759197 -[Debug] dit_step26_xt: [2170, 64] first4: 0.276882 1.309294 -0.062520 0.218642 -[DiT] step 27/50 t=0.480 -[Debug] dit_step27_vt_cond: [2170, 64] first4: -0.028043 0.640231 -0.413465 0.856122 -[Debug] dit_step27_vt_uncond: [2170, 64] first4: 0.038067 0.910543 -0.350117 0.887872 -[Debug] dit_step27_vt: [2170, 64] first4: -0.249926 -0.275849 
-0.588337 0.733838 -[Debug] dit_step27_xt: [2170, 64] first4: 0.281881 1.314811 -0.050754 0.203965 -[DiT] step 28/50 t=0.460 -[Debug] dit_step28_vt_cond: [2170, 64] first4: -0.048697 0.519480 -0.427048 0.785924 -[Debug] dit_step28_vt_uncond: [2170, 64] first4: 0.029577 0.811304 -0.356754 0.820204 -[Debug] dit_step28_vt: [2170, 64] first4: -0.313111 -0.465662 -0.625360 0.626629 -[Debug] dit_step28_xt: [2170, 64] first4: 0.288143 1.324124 -0.038247 0.191432 -[DiT] step 29/50 t=0.440 -[Debug] dit_step29_vt_cond: [2170, 64] first4: -0.073682 0.390412 -0.435695 0.713586 -[Debug] dit_step29_vt_uncond: [2170, 64] first4: 0.022755 0.688592 -0.366629 0.750458 -[Debug] dit_step29_vt: [2170, 64] first4: -0.404692 -0.558608 -0.601264 0.570632 -[Debug] dit_step29_xt: [2170, 64] first4: 0.296237 1.335296 -0.026221 0.180020 -[DiT] step 30/50 t=0.420 -[Debug] dit_step30_vt_cond: [2170, 64] first4: -0.100612 0.256910 -0.442863 0.643070 -[Debug] dit_step30_vt_uncond: [2170, 64] first4: 0.014270 0.550700 -0.380145 0.680719 -[Debug] dit_step30_vt: [2170, 64] first4: -0.477652 -0.675684 -0.591087 0.486411 -[Debug] dit_step30_xt: [2170, 64] first4: 0.305790 1.348810 -0.014400 0.170292 -[DiT] step 31/50 t=0.400 -[Debug] dit_step31_vt_cond: [2170, 64] first4: -0.127005 0.130974 -0.446946 0.576489 -[Debug] dit_step31_vt_uncond: [2170, 64] first4: 0.003612 0.415976 -0.399074 0.614345 -[Debug] dit_step31_vt: [2170, 64] first4: -0.549710 -0.743030 -0.526327 0.431312 -[Debug] dit_step31_xt: [2170, 64] first4: 0.316784 1.363671 -0.003873 0.161665 -[DiT] step 32/50 t=0.380 -[Debug] dit_step32_vt_cond: [2170, 64] first4: -0.154932 -0.000795 -0.447535 0.511295 -[Debug] dit_step32_vt_uncond: [2170, 64] first4: -0.007317 0.275916 -0.413101 0.549311 -[Debug] dit_step32_vt: [2170, 64] first4: -0.628125 -0.848536 -0.505066 0.360242 -[Debug] dit_step32_xt: [2170, 64] first4: 0.329347 1.380641 0.006228 0.154460 -[DiT] step 33/50 t=0.360 -[Debug] dit_step33_vt_cond: [2170, 64] first4: -0.183072 -0.130801 
-0.438493 0.449678 -[Debug] dit_step33_vt_uncond: [2170, 64] first4: -0.021971 0.136892 -0.420384 0.490091 -[Debug] dit_step33_vt: [2170, 64] first4: -0.685087 -0.931651 -0.428386 0.294226 -[Debug] dit_step33_xt: [2170, 64] first4: 0.343048 1.399274 0.014796 0.148576 -[DiT] step 34/50 t=0.340 -[Debug] dit_step34_vt_cond: [2170, 64] first4: -0.207282 -0.251064 -0.429462 0.399560 -[Debug] dit_step34_vt_uncond: [2170, 64] first4: -0.035614 0.010201 -0.426610 0.442224 -[Debug] dit_step34_vt: [2170, 64] first4: -0.740469 -1.039289 -0.393755 0.238626 -[Debug] dit_step34_xt: [2170, 64] first4: 0.357858 1.420060 0.022671 0.143803 -[DiT] step 35/50 t=0.320 -[Debug] dit_step35_vt_cond: [2170, 64] first4: -0.234011 -0.373429 -0.414613 0.349351 -[Debug] dit_step35_vt_uncond: [2170, 64] first4: -0.051328 -0.116322 -0.423153 0.392585 -[Debug] dit_step35_vt: [2170, 64] first4: -0.800518 -1.139187 -0.342183 0.192528 -[Debug] dit_step35_xt: [2170, 64] first4: 0.373868 1.442844 0.029515 0.139953 -[DiT] step 36/50 t=0.300 -[Debug] dit_step36_vt_cond: [2170, 64] first4: -0.261591 -0.503509 -0.392160 0.303680 -[Debug] dit_step36_vt_uncond: [2170, 64] first4: -0.072050 -0.249828 -0.410849 0.351470 -[Debug] dit_step36_vt: [2170, 64] first4: -0.838416 -1.260836 -0.298992 0.122180 -[Debug] dit_step36_xt: [2170, 64] first4: 0.390637 1.468061 0.035495 0.137509 -[DiT] step 37/50 t=0.280 -[Debug] dit_step37_vt_cond: [2170, 64] first4: -0.290611 -0.615966 -0.361295 0.261135 -[Debug] dit_step37_vt_uncond: [2170, 64] first4: -0.095822 -0.367916 -0.388325 0.310791 -[Debug] dit_step37_vt: [2170, 64] first4: -0.893251 -1.349895 -0.245346 0.089192 -[Debug] dit_step37_xt: [2170, 64] first4: 0.408502 1.495059 0.040402 0.135725 -[DiT] step 38/50 t=0.260 -[Debug] dit_step38_vt_cond: [2170, 64] first4: -0.316862 -0.724614 -0.326989 0.221074 -[Debug] dit_step38_vt_uncond: [2170, 64] first4: -0.120406 -0.482601 -0.361356 0.272140 -[Debug] dit_step38_vt: [2170, 64] first4: -0.917953 -1.452874 -0.195436 
0.033767 -[Debug] dit_step38_xt: [2170, 64] first4: 0.426861 1.524116 0.044310 0.135050 -[DiT] step 39/50 t=0.240 -[Debug] dit_step39_vt_cond: [2170, 64] first4: -0.344701 -0.840724 -0.280406 0.181682 -[Debug] dit_step39_vt_uncond: [2170, 64] first4: -0.151500 -0.605403 -0.318787 0.232017 -[Debug] dit_step39_vt: [2170, 64] first4: -0.945851 -1.537027 -0.144223 0.008566 -[Debug] dit_step39_xt: [2170, 64] first4: 0.445778 1.554857 0.047195 0.134879 -[DiT] step 40/50 t=0.220 -[Debug] dit_step40_vt_cond: [2170, 64] first4: -0.369051 -0.939547 -0.228334 0.139823 -[Debug] dit_step40_vt_uncond: [2170, 64] first4: -0.182335 -0.713639 -0.270236 0.191739 -[Debug] dit_step40_vt: [2170, 64] first4: -0.958883 -1.593756 -0.082150 -0.057526 -[Debug] dit_step40_xt: [2170, 64] first4: 0.464955 1.586732 0.048838 0.136029 -[DiT] step 41/50 t=0.200 -[Debug] dit_step41_vt_cond: [2170, 64] first4: -0.388759 -1.034758 -0.170808 0.098079 -[Debug] dit_step41_vt_uncond: [2170, 64] first4: -0.215027 -0.818665 -0.212783 0.153622 -[Debug] dit_step41_vt: [2170, 64] first4: -0.929079 -1.656826 -0.036355 -0.101313 -[Debug] dit_step41_xt: [2170, 64] first4: 0.483537 1.619868 0.049565 0.138056 -[DiT] step 42/50 t=0.180 -[Debug] dit_step42_vt_cond: [2170, 64] first4: -0.404481 -1.121373 -0.110304 0.048469 -[Debug] dit_step42_vt_uncond: [2170, 64] first4: -0.250394 -0.918649 -0.148512 0.113292 -[Debug] dit_step42_vt: [2170, 64] first4: -0.870129 -1.689520 0.009394 -0.198920 -[Debug] dit_step42_xt: [2170, 64] first4: 0.500939 1.653659 0.049377 0.142034 -[DiT] step 43/50 t=0.160 -[Debug] dit_step43_vt_cond: [2170, 64] first4: -0.416518 -1.199422 -0.047277 -0.004303 -[Debug] dit_step43_vt_uncond: [2170, 64] first4: -0.285961 -1.014739 -0.080642 0.076449 -[Debug] dit_step43_vt: [2170, 64] first4: -0.799869 -1.709703 0.054214 -0.305297 -[Debug] dit_step43_xt: [2170, 64] first4: 0.516937 1.687853 0.048293 0.148140 -[DiT] step 44/50 t=0.140 -[Debug] dit_step44_vt_cond: [2170, 64] first4: -0.422798 -1.270758 
0.022277 -0.058297 -[Debug] dit_step44_vt_uncond: [2170, 64] first4: -0.318056 -1.108378 -0.007512 0.042141 -[Debug] dit_step44_vt: [2170, 64] first4: -0.718613 -1.710690 0.113612 -0.432909 -[Debug] dit_step44_xt: [2170, 64] first4: 0.531309 1.722067 0.046020 0.156798 -[DiT] step 45/50 t=0.120 -[Debug] dit_step45_vt_cond: [2170, 64] first4: -0.430461 -1.334901 0.090295 -0.107751 -[Debug] dit_step45_vt_uncond: [2170, 64] first4: -0.346132 -1.190932 0.060499 0.012419 -[Debug] dit_step45_vt: [2170, 64] first4: -0.676233 -1.740750 0.184198 -0.543741 -[Debug] dit_step45_xt: [2170, 64] first4: 0.544834 1.756882 0.042336 0.167673 -[DiT] step 46/50 t=0.100 -[Debug] dit_step46_vt_cond: [2170, 64] first4: -0.442548 -1.408986 0.177202 -0.124432 -[Debug] dit_step46_vt_uncond: [2170, 64] first4: -0.366053 -1.276834 0.124694 -0.015436 -[Debug] dit_step46_vt: [2170, 64] first4: -0.689058 -1.805405 0.393872 -0.448936 -[Debug] dit_step46_xt: [2170, 64] first4: 0.558615 1.792990 0.034459 0.176652 -[DiT] step 47/50 t=0.080 -[Debug] dit_step47_vt_cond: [2170, 64] first4: -0.439555 -1.466634 0.226367 -0.147289 -[Debug] dit_step47_vt_uncond: [2170, 64] first4: -0.380429 -1.352640 0.169813 -0.038167 -[Debug] dit_step47_vt: [2170, 64] first4: -0.594441 -1.800792 0.366639 -0.559854 -[Debug] dit_step47_xt: [2170, 64] first4: 0.570504 1.829005 0.027126 0.187849 -[DiT] step 48/50 t=0.060 -[Debug] dit_step48_vt_cond: [2170, 64] first4: -0.421519 -1.502992 0.243896 -0.165260 -[Debug] dit_step48_vt_uncond: [2170, 64] first4: -0.386849 -1.417176 0.200885 -0.065191 -[Debug] dit_step48_vt: [2170, 64] first4: -0.516278 -1.762812 0.360980 -0.463950 -[Debug] dit_step48_xt: [2170, 64] first4: 0.580829 1.864262 0.019907 0.197128 -[DiT] step 49/50 t=0.040 -[Debug] dit_step49_vt_cond: [2170, 64] first4: -0.442348 -1.531937 0.237906 -0.192473 -[Debug] dit_step49_vt_uncond: [2170, 64] first4: -0.399571 -1.435245 0.199709 -0.084932 -[Debug] dit_step49_vt: [2170, 64] first4: -0.632891 -1.901084 0.347748 
-0.622644 -[Debug] dit_x0: [2170, 64] first4: 0.593487 1.902283 0.012952 0.209581 -[DiT] step 50/50 t=0.020 -[DiT] Total generation: 97237.2 ms (97237.2 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.593487 1.902283 0.012952 0.209581 -[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 474 nodes, T_latent=192 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' -ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x1556105a0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x1556166d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x155616930 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x155616fc0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x155617400 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x155617a00 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' -ggml_metal_library_compile_pipeline: 
loaded kernel_conv_transpose_1d_f32_f32 0x155617f60 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x155618e40 | th_max = 1024 | th_width = 32 -[VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 474 nodes, T_latent=256 -[VAE] Graph: 474 nodes, T_latent=186 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x10b705130 | th_max = 896 | th_width = 32 -[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 609639.3 ms -[Debug] vae_audio: [2, 4166400] first4: -0.001780 -0.001606 -0.001703 -0.001406 -[VAE Batch0] Wrote ggml-sft/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Request 1/1] Done -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Pipeline] All done -[Request] Loaded request0.json -[Noise] Reusing existing rng_philox_seed42.bf16 -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf -[GGML] Running acestep-v15-turbo-Q4_K_M.gguf... 
-[GGML] Done, 47 dump files -[Turbo] Reusing existing Python dumps: python-turbo -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999813 - lyric_embed 1.000000 - enc_hidden 0.997096 - detok_output 0.999629 - context 0.999763 - noise 1.000000 - temb_t 0.999906 - hidden_after_proj_in 0.999918 - enc_after_cond_emb 0.997606 - layer0_sa_output 0.998452 - hidden_after_layer0 0.999696 - hidden_after_layer6 0.999330 - hidden_after_layer12 0.995408 - hidden_after_layer18 0.991270 - hidden_after_layer23 0.984826 - dit_step0_vt 0.944528 - dit_step0_xt 0.999878 - dit_step1_vt 0.947871 - dit_step1_xt 0.999609 - dit_step2_vt 0.956355 - dit_step2_xt 0.998980 - dit_step3_vt 0.961293 - dit_step3_xt 0.997669 - dit_step4_vt 0.958834 - dit_step4_xt 0.994713 - dit_step5_vt 0.956132 - dit_step5_xt 0.988221 - dit_step6_vt 0.950838 - dit_step6_xt 0.976124 - dit_step7_vt 0.938802 - dit_x0 0.958347 - vae_audio 0.832313 - vae_audio (log spectral) 0.999533 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999877 0.165977 0.010464 -0.002251 0.973155 -0.002342 0.972003 - dit_step1_xt 0.999608 0.266862 0.018170 -0.005108 0.943161 -0.005313 0.941730 - dit_step2_xt 0.998979 0.448963 0.028101 -0.009001 0.910184 -0.009311 0.908527 - dit_step3_xt 0.997667 0.610427 0.040689 -0.014279 0.875248 -0.014577 0.873624 - dit_step4_xt 0.994712 0.903635 0.058677 -0.021196 0.843722 -0.021660 0.841995 - dit_step5_xt 0.988220 1.370464 0.085448 -0.031128 0.827283 -0.032109 0.824593 - dit_step6_xt 0.976123 1.998804 0.126069 -0.045345 0.858424 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q4_K_M.gguf -[GGML] Running acestep-v15-sft-Q4_K_M.gguf... 
-[GGML] Done, 233 dump files -[SFT] Reusing existing Python dumps: python-sft -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999813 - lyric_embed 1.000000 - enc_hidden 0.997097 - detok_output 0.999629 - context 0.999763 - noise 1.000000 - temb_t 0.999673 - hidden_after_proj_in 0.999917 - enc_after_cond_emb 0.997598 - layer0_sa_output 0.998569 - hidden_after_layer0 0.999686 - hidden_after_layer6 0.999172 - hidden_after_layer12 0.997776 - hidden_after_layer18 0.996818 - hidden_after_layer23 0.997039 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.996934 - dit_step0_vt_uncond 0.996212 - dit_step0_vt 0.990566 - dit_step0_xt 0.999995 - dit_step5_vt_cond 0.995434 - dit_step5_vt 0.980046 - dit_step5_xt 0.999823 - dit_step10_vt_cond 0.991133 - dit_step10_vt 0.971906 - dit_step10_xt 0.999207 - dit_step15_vt_cond 0.982704 - dit_step15_vt 0.950629 - dit_step15_xt 0.997454 - dit_step20_vt_cond 0.968600 - dit_step20_vt 0.929360 - dit_step20_xt 0.993412 - dit_step25_vt_cond 0.951686 - dit_step25_vt 0.903442 - dit_step25_xt 0.986280 - dit_step30_vt_cond 0.931805 - dit_step30_vt 0.881992 - dit_step30_xt 0.976117 - dit_step35_vt_cond 0.911309 - dit_step35_vt 0.858516 - dit_step35_xt 0.964745 - dit_step40_vt_cond 0.898448 - dit_step40_vt 0.843064 - dit_step40_xt 0.954421 - dit_step45_vt_cond 0.908747 - dit_step45_vt 0.865504 - dit_step45_xt 0.947533 - dit_step49_vt_cond 0.927312 - dit_step49_vt 0.885368 - dit_x0 0.945292 - vae_audio 0.825801 - vae_audio (log spectral) 0.999459 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999994 0.035677 0.002825 -0.001840 0.980345 -0.001741 0.980402 - dit_step5_xt 0.999822 0.191921 0.012992 -0.007283 0.890515 -0.007143 0.887999 - dit_step10_xt 0.999206 0.526469 0.024282 -0.012946 0.812557 -0.012603 0.811299 - dit_step15_xt 0.997453 0.836399 0.039177 -0.018559 0.748243 -0.018114 0.745269 - dit_step20_xt 0.993411 1.152330 
0.058726 -0.024275 0.703300 -0.023808 0.699582 - dit_step25_xt 0.986279 1.542745 0.081991 -0.030177 0.682229 -0.029311 0.679278 - dit_step30_xt 0.976117 1.915049 0.109049 -0.036245 0.688533 -0.035027 0.685262 - dit_step35_xt 0.964744 2.242426 0.138946 -0.042318 0.720837 -0.040716 0.717196 - dit_step40_xt 0.954421 2.562076 0.170565 -0.048389 0.775001 -0.046462 0.771853 - dit_step45_xt 0.947532 2.889421 0.200672 -0.054787 0.846930 -0.052475 0.843036 diff --git a/tests/Metal_Q5_K_M.log b/tests/Metal_Q5_K_M.log deleted file mode 100644 index a25afc6..0000000 --- a/tests/Metal_Q5_K_M.log +++ /dev/null @@ -1,835 +0,0 @@ -ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices -ggml_metal_library_init: using embedded metal library -ggml_metal_library_init: loaded in 0.007 sec -ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s) -ggml_metal_device_init: GPU name: MTL0 -ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) -ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) -ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) -ggml_metal_device_init: simdgroup reduction = true -ggml_metal_device_init: simdgroup matrix mul. 
= true -ggml_metal_device_init: has unified memory = true -ggml_metal_device_init: has bfloat = true -ggml_metal_device_init: has tensor = false -ggml_metal_device_init: use residency sets = true -ggml_metal_device_init: use shared buffers = true -ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] DiT backend: MTL0 (CPU threads: 5) -[Load] Backend init: 21.7 ms -[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 -[DiT] Self-attn: Q+K fused, V separate -[DiT] Cross-attn: all separate -[DiT] MLP: gate+up fused -[Load] null_condition_emb found (CFG available) -[WeightCtx] Loaded 478 tensors, 1061.2 MB into backend -[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 1538.7 ms -[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 -[Load] silence_latent: [15000, 64] from GGUF -[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] VAE backend: MTL0 (CPU threads: 5) -[VAE] Backend: MTL0, Weight buffer: 255.7 MB -[VAE] Loaded: 5 blocks, upsample=1920x -[Load] VAE weights: 275.7 ms -[Request 1/1] ggml-turbo/request0.json (batch=1) -[Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) -[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s -[Pipeline] 434 audio codes (86.8s @ 5Hz) -[Pipeline] T=2170, S=1085 -[BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE 
tokenizer: 42.1 ms -[Pipeline] caption: 70 tokens, lyrics: 167 tokens -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] TextEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 28L, H=1024, Nh=16/8 -[Load] TextEncoder: 230.3 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' -ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x11cf0b930 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x11cf0bd90 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x11cf0c600 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' -ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x11cf0ca80 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x11cf0d2f0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' 
-ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x11cf0d920 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x11cf0e200 | th_max = 576 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x11cf0e610 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' -ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x11cf0e870 | th_max = 1024 | th_width = 32 -[Encode] TextEncoder (70 tokens): 44.0 ms -[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 33.7 ms -[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] CondEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 -[WeightCtx] Loaded 140 tensors, 412.5 MB into backend -[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 572.5 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 
'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1 0x11ce0c140 | th_max = 896 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q5_K_f32', name = 'kernel_mul_mm_q5_K_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q5_K_f32_bci=0_bco=1 0x11cf17e80 | th_max = 832 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x11cf18860 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x11cf18ec0 | th_max = 640 | th_width = 32 -[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 158.6 ms, enc_S=238 -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Debug] enc_hidden: [238, 2048] first4: 1.751803 -0.051174 -0.133188 0.058982 -[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 -[WeightCtx] Loaded 30 tensors, 73.2 MB into backend -[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 113.6 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x11ce0cf30 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 
'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x11ce0d840 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2 0x11ce0ddf0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' -ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x11ce0e050 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q5_K_f32_r1_5', name = 'kernel_mul_mv_ext_q5_K_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q5_K_f32_r1_5_nsg=2_nxpsg=8 0x11ce0ea30 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8 0x11ce0efe0 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x11ce0f360 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x11ce0f5c0 | th_max = 448 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 
'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x11ce10000 | th_max = 1024 | th_width = 32 -[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 1065.0 ms -[Debug] detok_output: [2170, 64] first4: -0.124883 1.453879 0.292856 -0.646204 -[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 -[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 -[Debug] context: [2170, 128] first4: -0.124883 1.453879 0.292856 -0.646204 -[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 -[DiT] Batch N=1, T=2170, S=1085, enc_S=238 -[DiT] Graph: 1775 nodes -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x11cf09240 | th_max = 832 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' -ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x11cf19120 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' -ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x11cf19380 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q5_K_f32', name = 'kernel_mul_mv_q5_K_f32_nsg=2' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q5_K_f32_nsg=2 0x11cf1a3c0 | th_max = 576 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x11cf1a740 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling 
pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x11cf1ae00 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x11cf1b060 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x11cf1b600 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x11cf1bb90 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x11cf1c5b0 | th_max = 640 | th_width = 32 -[Debug] tproj: [12288] first4: 0.260409 -0.161609 -0.102203 0.051602 -[Debug] temb: [2048] first4: -0.000151 -0.132293 -0.035516 0.064751 -[Debug] temb_t: [2048] first4: 0.000578 0.026708 -0.052786 0.063514 -[Debug] temb_r: [2048] first4: -0.000729 -0.159001 0.017269 0.001237 -[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 -[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 -[Debug] temb_lin1_t: [2048] first4: -0.051153 -0.053631 -0.012192 -0.039024 -[Debug] temb_lin1_r: [2048] first4: -0.016165 -0.021121 
-0.015801 -0.000525 -[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.044511 -0.951831 0.540187 0.457322 -[Debug] proj_in_input: [192, 2170] first4: -0.124883 1.453879 0.292856 -0.646204 -[Debug] enc_after_cond_emb: [2048, 238] first4: -0.153168 0.787275 0.319340 -0.492001 -[Debug] layer0_sa_input: [2048, 1085] first4: -0.722961 -0.753736 -0.051927 0.265661 -[Debug] layer0_q_after_rope: [128, 16] first4: -12.602057 0.798570 1.518488 1.778495 -[Debug] layer0_k_after_rope: [128, 8] first4: -0.153168 0.787275 0.319340 -0.492001 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.498292 0.150378 -0.398807 0.484326 -[Debug] layer0_attn_out: [2048, 1085] first4: -12.773369 1.105118 1.773309 1.768943 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.542001 -1.018193 0.152304 0.468235 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.605642 -0.786551 -0.346129 0.499558 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.171107 0.593998 51.751106 -0.883031 -[Debug] hidden_after_layer6: [2048, 1085] first4: -20.936150 0.582827 29.989494 -4.872031 -[Debug] hidden_after_layer12: [2048, 1085] first4: -18.277052 -17.088211 71.559052 24.992846 -[Debug] hidden_after_layer18: [2048, 1085] first4: -25.915581 10.692349 65.928192 19.066517 -[Debug] hidden_after_layer23: [2048, 1085] first4: -6.799564 38.425339 203.467468 153.140854 -[Debug] dit_step0_vt: [2170, 64] first4: 0.015160 1.163890 0.353989 2.352075 -[Debug] dit_step0_xt: [2170, 64] first4: 0.193647 2.103346 -0.187965 0.740744 -[DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: -0.238755 1.372093 -0.135596 1.879695 -[Debug] dit_step1_xt: [2170, 64] first4: 0.206670 2.028504 -0.180569 0.638215 -[DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: -0.034453 1.243445 0.102498 2.382742 -[Debug] dit_step2_xt: [2170, 64] first4: 0.208967 1.945608 -0.187402 0.479365 -[DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: 0.286766 1.110088 0.263285 2.616079 -[Debug] 
dit_step3_xt: [2170, 64] first4: 0.185070 1.853101 -0.209343 0.261359 -[DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.359540 0.909711 0.077998 2.701789 -[Debug] dit_step4_xt: [2170, 64] first4: 0.146547 1.755632 -0.217700 -0.028118 -[DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.361413 0.800675 -0.393300 2.731152 -[Debug] dit_step5_xt: [2170, 64] first4: 0.094917 1.641250 -0.161514 -0.418283 -[DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: 0.247442 0.617176 -0.960503 2.789753 -[Debug] dit_step6_xt: [2170, 64] first4: 0.045429 1.517814 0.030587 -0.976234 -[DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: 0.019480 0.316869 -1.427597 3.153955 -[Debug] dit_x0: [2170, 64] first4: 0.039585 1.422753 0.458866 -1.922420 -[DiT] step 8/8 t=0.300 -[DiT] Total generation: 8546.5 ms (8546.5 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.039585 1.422753 0.458866 -1.922420 -[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 474 nodes, T_latent=192 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' -ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x11ce0b610 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x11ce10380 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x11ce10870 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: 
loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x11ce10ad0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x11ce10e80 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x11ce11660 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32 0x11ce11bc0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x11ce12b20 | th_max = 1024 | th_width = 32 -[VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 474 nodes, T_latent=256 -[VAE] Graph: 474 nodes, T_latent=186 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x11ce12d80 | th_max = 896 | th_width = 32 -[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 609611.2 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000665 0.001184 0.001013 0.001406 -[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Request 1/1] Done -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Pipeline] All done -ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices -ggml_metal_library_init: using embedded metal library -ggml_metal_library_init: loaded in 0.006 sec -ggml_metal_rsets_init: 
creating a residency set collection (keep_alive = 180 s) -ggml_metal_device_init: GPU name: MTL0 -ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) -ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) -ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) -ggml_metal_device_init: simdgroup reduction = true -ggml_metal_device_init: simdgroup matrix mul. = true -ggml_metal_device_init: has unified memory = true -ggml_metal_device_init: has bfloat = true -ggml_metal_device_init: has tensor = false -ggml_metal_device_init: use residency sets = true -ggml_metal_device_init: use shared buffers = true -ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] DiT backend: MTL0 (CPU threads: 5) -[Load] Backend init: 21.5 ms -[GGUF] ../models/acestep-v15-sft-Q5_K_M.gguf: 678 tensors, data at offset 56800 -[DiT] Self-attn: Q+K fused, V separate -[DiT] Cross-attn: all separate -[DiT] MLP: gate+up fused -[Load] null_condition_emb found (CFG available) -[WeightCtx] Loaded 478 tensors, 1061.2 MB into backend -[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 1513.9 ms -[GGUF] ../models/acestep-v15-sft-Q5_K_M.gguf: 678 tensors, data at offset 56800 -[Load] silence_latent: [15000, 64] from GGUF -[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] VAE backend: MTL0 (CPU threads: 5) -[VAE] Backend: MTL0, Weight buffer: 255.7 MB -[VAE] Loaded: 5 blocks, upsample=1920x -[Load] VAE weights: 272.4 ms 
-[Request 1/1] ggml-sft/request0.json (batch=1) -[Request] parsed ggml-sft/request0.json (18 fields) -[Pipeline] seed=42, steps=50, guidance=7.0, shift=1.0, duration=88.0s -[Pipeline] 434 audio codes (86.8s @ 5Hz) -[Pipeline] T=2170, S=1085 -[BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 42.2 ms -[Pipeline] caption: 70 tokens, lyrics: 167 tokens -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] TextEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 28L, H=1024, Nh=16/8 -[Load] TextEncoder: 225.9 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' -ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x12de0dc30 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x12de0e090 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x12de0e900 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' -ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x12de0ed80 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 
'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x12de0f5f0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x12de0fc20 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x12de10500 | th_max = 576 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x12de10910 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' -ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x12de10b70 | th_max = 1024 | th_width = 32 -[Encode] TextEncoder (70 tokens): 49.1 ms -[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 42.5 ms -[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] CondEncoder backend: MTL0 
(CPU threads: 5) -[GGUF] ../models/acestep-v15-sft-Q5_K_M.gguf: 678 tensors, data at offset 56800 -[WeightCtx] Loaded 140 tensors, 412.5 MB into backend -[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 760.1 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1 0x103e0be70 | th_max = 896 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q5_K_f32', name = 'kernel_mul_mm_q5_K_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q5_K_f32_bci=0_bco=1 0x103e0c420 | th_max = 832 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x103e0c860 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x103e0cfd0 | th_max = 640 | th_width = 32 -[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 162.8 ms, enc_S=238 -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Debug] enc_hidden: [238, 2048] first4: 1.751803 -0.051174 -0.133188 0.058982 -[GGUF] ../models/acestep-v15-sft-Q5_K_M.gguf: 678 tensors, data at offset 56800 -[WeightCtx] Loaded 30 tensors, 73.2 MB into backend -[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 115.2 ms -ggml_metal_library_compile_pipeline: compiling pipeline: 
base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x103f05ca0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x103f065b0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2 0x103f06b60 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' -ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x103f06dc0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q5_K_f32_r1_5', name = 'kernel_mul_mv_ext_q5_K_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q5_K_f32_r1_5_nsg=2_nxpsg=8 0x103f07ca0 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8 0x103f08250 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x103f085d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 
'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x103f08830 | th_max = 448 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x103f091d0 | th_max = 1024 | th_width = 32 -[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 1055.5 ms -[Debug] detok_output: [2170, 64] first4: -0.124883 1.453879 0.292856 -0.646204 -[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 -[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 -[Debug] context: [2170, 128] first4: -0.124883 1.453879 0.292856 -0.646204 -[DiT] Starting: T=2170, S=1085, enc_S=238, steps=50, batch=1 -[DiT] Batch N=1, T=2170, S=1085, enc_S=238 -[DiT] Graph: 1775 nodes -[Debug] null_condition_emb: [2048] first4: 0.018066 -0.000360 0.005096 -0.000683 -[Debug] null_enc_hidden: [238, 2048] first4: 0.018066 -0.000360 0.005096 -0.000683 -[DiT] CFG enabled: guidance_scale=7.0, 2x forward per step, N=1 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x103e0dfd0 | th_max = 832 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' -ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x103e0e530 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' 
-ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x103e0e940 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q5_K_f32', name = 'kernel_mul_mv_q5_K_f32_nsg=2' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q5_K_f32_nsg=2 0x103e0f980 | th_max = 576 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x103e0fd00 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x103e103c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x103e10620 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x103e10bc0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x103e11150 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded 
kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x103e11b70 | th_max = 640 | th_width = 32 -[Debug] tproj: [12288] first4: 0.155137 -0.118305 -0.084248 0.082623 -[Debug] temb: [2048] first4: -0.002843 -0.176820 0.004745 -0.001924 -[Debug] temb_t: [2048] first4: -0.001351 0.003023 -0.012552 -0.001712 -[Debug] temb_r: [2048] first4: -0.001491 -0.179843 0.017298 -0.000212 -[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 -[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 -[Debug] temb_lin1_t: [2048] first4: -0.034754 0.028817 0.027120 -0.031729 -[Debug] temb_lin1_r: [2048] first4: -0.002680 0.004202 0.000655 -0.002088 -[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.095122 -0.908724 0.502637 0.450925 -[Debug] proj_in_input: [192, 2170] first4: -0.124883 1.453879 0.292856 -0.646204 -[Debug] enc_after_cond_emb: [2048, 238] first4: -0.173051 0.876023 0.351566 -0.532545 -[Debug] layer0_sa_input: [2048, 1085] first4: -0.920384 -0.707757 -0.034391 0.299813 -[Debug] layer0_q_after_rope: [128, 16] first4: -12.596228 0.535827 1.482060 1.773901 -[Debug] layer0_k_after_rope: [128, 8] first4: -0.173051 0.876023 0.351566 -0.532545 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.637092 0.777700 -0.593592 0.534410 -[Debug] layer0_attn_out: [2048, 1085] first4: -12.115236 1.011131 1.711030 1.787191 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.803575 -1.350455 -0.166552 0.391822 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.983645 -0.994904 -0.442195 0.398258 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.639871 1.314413 59.265984 -1.678902 -[Debug] hidden_after_layer6: [2048, 1085] first4: -15.443645 3.665146 59.001129 -0.430717 -[Debug] hidden_after_layer12: [2048, 1085] first4: -13.175318 0.713974 -24.768734 -0.709223 -[Debug] hidden_after_layer18: [2048, 1085] first4: -5.504215 14.850023 -44.686668 -9.688757 -[Debug] hidden_after_layer23: 
[2048, 1085] first4: 32.081551 63.384781 49.094582 -14.152830 -[Debug] dit_step0_vt_cond: [2170, 64] first4: -0.659668 2.541722 -0.175072 1.431705 -[Debug] dit_step0_vt_uncond: [2170, 64] first4: -0.363007 2.087823 -0.714051 1.721254 -[Debug] dit_step0_vt: [2170, 64] first4: -0.859789 3.013237 0.095956 1.039126 -[Debug] dit_step0_xt: [2170, 64] first4: 0.211532 2.095985 -0.173794 0.826874 -[DiT] step 1/50 t=1.000 -[Debug] dit_step1_vt_cond: [2170, 64] first4: -0.721169 2.535385 -0.028817 1.265576 -[Debug] dit_step1_vt_uncond: [2170, 64] first4: -0.587318 2.330142 -0.122353 1.467132 -[Debug] dit_step1_vt: [2170, 64] first4: -0.560584 2.181154 -0.231214 1.319647 -[Debug] dit_step1_xt: [2170, 64] first4: 0.222743 2.052362 -0.169170 0.800481 -[DiT] step 2/50 t=0.980 -[Debug] dit_step2_vt_cond: [2170, 64] first4: -0.714072 2.499362 0.020078 1.278304 -[Debug] dit_step2_vt_uncond: [2170, 64] first4: -0.600958 2.309783 -0.102115 1.400073 -[Debug] dit_step2_vt: [2170, 64] first4: -0.861209 2.897427 0.258299 0.998091 -[Debug] dit_step2_xt: [2170, 64] first4: 0.239968 1.994414 -0.174336 0.780519 -[DiT] step 3/50 t=0.960 -[Debug] dit_step3_vt_cond: [2170, 64] first4: -0.674846 2.427423 0.055491 1.297147 -[Debug] dit_step3_vt_uncond: [2170, 64] first4: -0.635843 2.332050 -0.021028 1.341071 -[Debug] dit_step3_vt: [2170, 64] first4: -0.498860 2.133466 -0.141608 1.438968 -[Debug] dit_step3_xt: [2170, 64] first4: 0.249945 1.951744 -0.171504 0.751740 -[DiT] step 4/50 t=0.940 -[Debug] dit_step4_vt_cond: [2170, 64] first4: -0.605752 2.344337 0.051579 1.306835 -[Debug] dit_step4_vt_uncond: [2170, 64] first4: -0.617271 2.293072 -0.026830 1.327208 -[Debug] dit_step4_vt: [2170, 64] first4: -0.645479 2.581793 0.280999 1.082338 -[Debug] dit_step4_xt: [2170, 64] first4: 0.262854 1.900108 -0.177124 0.730093 -[DiT] step 5/50 t=0.920 -[Debug] dit_step5_vt_cond: [2170, 64] first4: -0.529691 2.265976 0.024912 1.308485 -[Debug] dit_step5_vt_uncond: [2170, 64] first4: -0.613722 2.254884 0.007785 
1.353816 -[Debug] dit_step5_vt: [2170, 64] first4: -0.220550 1.958779 -0.197940 1.314650 -[Debug] dit_step5_xt: [2170, 64] first4: 0.267265 1.860933 -0.173165 0.703800 -[DiT] step 6/50 t=0.900 -[Debug] dit_step6_vt_cond: [2170, 64] first4: -0.492980 2.220784 0.013978 1.314708 -[Debug] dit_step6_vt_uncond: [2170, 64] first4: -0.574326 2.194118 0.017516 1.377784 -[Debug] dit_step6_vt: [2170, 64] first4: -0.484828 2.424557 0.188767 1.065213 -[Debug] dit_step6_xt: [2170, 64] first4: 0.276962 1.812442 -0.176940 0.682496 -[DiT] step 7/50 t=0.880 -[Debug] dit_step7_vt_cond: [2170, 64] first4: -0.442502 2.140999 -0.004548 1.307788 -[Debug] dit_step7_vt_uncond: [2170, 64] first4: -0.527369 2.112375 0.025084 1.375065 -[Debug] dit_step7_vt: [2170, 64] first4: -0.162851 1.943518 -0.225018 1.235321 -[Debug] dit_step7_xt: [2170, 64] first4: 0.280219 1.773571 -0.172440 0.657789 -[DiT] step 8/50 t=0.860 -[Debug] dit_step8_vt_cond: [2170, 64] first4: -0.402943 2.052812 -0.006712 1.290008 -[Debug] dit_step8_vt_uncond: [2170, 64] first4: -0.459439 2.025463 0.030814 1.342298 -[Debug] dit_step8_vt: [2170, 64] first4: -0.414733 2.160834 0.094455 1.088282 -[Debug] dit_step8_xt: [2170, 64] first4: 0.288514 1.730355 -0.174329 0.636023 -[DiT] step 9/50 t=0.840 -[Debug] dit_step9_vt_cond: [2170, 64] first4: -0.369760 1.969441 -0.010690 1.271576 -[Debug] dit_step9_vt_uncond: [2170, 64] first4: -0.396671 1.951135 0.030780 1.310076 -[Debug] dit_step9_vt: [2170, 64] first4: -0.235488 1.803751 -0.198291 1.206838 -[Debug] dit_step9_xt: [2170, 64] first4: 0.293223 1.694280 -0.170363 0.611887 -[DiT] step 10/50 t=0.820 -[Debug] dit_step10_vt_cond: [2170, 64] first4: -0.344175 1.892885 0.003662 1.257559 -[Debug] dit_step10_vt_uncond: [2170, 64] first4: -0.342085 1.891713 0.047752 1.301667 -[Debug] dit_step10_vt: [2170, 64] first4: -0.420278 1.858245 0.037965 1.052360 -[Debug] dit_step10_xt: [2170, 64] first4: 0.301629 1.657115 -0.171122 0.590840 -[DiT] step 11/50 t=0.800 -[Debug] dit_step11_vt_cond: 
[2170, 64] first4: -0.321399 1.822005 0.005853 1.241717 -[Debug] dit_step11_vt_uncond: [2170, 64] first4: -0.313690 1.840958 0.057351 1.308882 -[Debug] dit_step11_vt: [2170, 64] first4: -0.256015 1.588119 -0.173446 1.051672 -[Debug] dit_step11_xt: [2170, 64] first4: 0.306749 1.625353 -0.167653 0.569806 -[DiT] step 12/50 t=0.780 -[Debug] dit_step12_vt_cond: [2170, 64] first4: -0.300376 1.758913 0.017258 1.234417 -[Debug] dit_step12_vt_uncond: [2170, 64] first4: -0.284777 1.790303 0.055927 1.316604 -[Debug] dit_step12_vt: [2170, 64] first4: -0.382294 1.631086 0.039774 0.961382 -[Debug] dit_step12_xt: [2170, 64] first4: 0.314395 1.592731 -0.168449 0.550578 -[DiT] step 13/50 t=0.760 -[Debug] dit_step13_vt_cond: [2170, 64] first4: -0.282512 1.699357 0.023821 1.222209 -[Debug] dit_step13_vt_uncond: [2170, 64] first4: -0.256755 1.739260 0.045941 1.313994 -[Debug] dit_step13_vt: [2170, 64] first4: -0.292858 1.451734 -0.071923 0.964463 -[Debug] dit_step13_xt: [2170, 64] first4: 0.320252 1.563696 -0.167011 0.531289 -[DiT] step 14/50 t=0.740 -[Debug] dit_step14_vt_cond: [2170, 64] first4: -0.268355 1.643249 0.035884 1.219110 -[Debug] dit_step14_vt_uncond: [2170, 64] first4: -0.234205 1.685310 0.038125 1.306284 -[Debug] dit_step14_vt: [2170, 64] first4: -0.380481 1.484198 0.105917 0.956716 -[Debug] dit_step14_xt: [2170, 64] first4: 0.327862 1.534012 -0.169129 0.512155 -[DiT] step 15/50 t=0.720 -[Debug] dit_step15_vt_cond: [2170, 64] first4: -0.257674 1.591244 0.046474 1.215229 -[Debug] dit_step15_vt_uncond: [2170, 64] first4: -0.215398 1.632289 0.020497 1.288409 -[Debug] dit_step15_vt: [2170, 64] first4: -0.335985 1.372340 0.098681 1.003970 -[Debug] dit_step15_xt: [2170, 64] first4: 0.334582 1.506565 -0.171102 0.492076 -[DiT] step 16/50 t=0.700 -[Debug] dit_step16_vt_cond: [2170, 64] first4: -0.250808 1.534114 0.052932 1.211927 -[Debug] dit_step16_vt_uncond: [2170, 64] first4: -0.205743 1.580690 0.005785 1.270423 -[Debug] dit_step16_vt: [2170, 64] first4: -0.382949 1.342146 
0.222679 1.022127 -[Debug] dit_step16_xt: [2170, 64] first4: 0.342241 1.479722 -0.175556 0.471633 -[DiT] step 17/50 t=0.680 -[Debug] dit_step17_vt_cond: [2170, 64] first4: -0.246631 1.471787 0.045976 1.197702 -[Debug] dit_step17_vt_uncond: [2170, 64] first4: -0.202947 1.526995 -0.016021 1.248045 -[Debug] dit_step17_vt: [2170, 64] first4: -0.343461 1.218771 0.212458 1.019693 -[Debug] dit_step17_xt: [2170, 64] first4: 0.349110 1.455347 -0.179805 0.451239 -[DiT] step 18/50 t=0.660 -[Debug] dit_step18_vt_cond: [2170, 64] first4: -0.243157 1.409443 0.036330 1.184456 -[Debug] dit_step18_vt_uncond: [2170, 64] first4: -0.204529 1.477009 -0.037415 1.231383 -[Debug] dit_step18_vt: [2170, 64] first4: -0.354021 1.150632 0.298290 1.001465 -[Debug] dit_step18_xt: [2170, 64] first4: 0.356190 1.432334 -0.185771 0.431210 -[DiT] step 19/50 t=0.640 -[Debug] dit_step19_vt_cond: [2170, 64] first4: -0.237715 1.343371 0.017683 1.161345 -[Debug] dit_step19_vt_uncond: [2170, 64] first4: -0.204017 1.427205 -0.062926 1.210816 -[Debug] dit_step19_vt: [2170, 64] first4: -0.318427 0.994659 0.274882 0.959663 -[Debug] dit_step19_xt: [2170, 64] first4: 0.362559 1.412441 -0.191269 0.412017 -[DiT] step 20/50 t=0.620 -[Debug] dit_step20_vt_cond: [2170, 64] first4: -0.232230 1.263409 -0.001007 1.136674 -[Debug] dit_step20_vt_uncond: [2170, 64] first4: -0.200904 1.370160 -0.090913 1.187652 -[Debug] dit_step20_vt: [2170, 64] first4: -0.327181 0.859238 0.330215 0.939772 -[Debug] dit_step20_xt: [2170, 64] first4: 0.369102 1.395257 -0.197873 0.393221 -[DiT] step 21/50 t=0.600 -[Debug] dit_step21_vt_cond: [2170, 64] first4: -0.230263 1.180950 -0.026976 1.107196 -[Debug] dit_step21_vt_uncond: [2170, 64] first4: -0.199895 1.312066 -0.122788 1.160427 -[Debug] dit_step21_vt: [2170, 64] first4: -0.309986 0.682107 0.288212 0.884258 -[Debug] dit_step21_xt: [2170, 64] first4: 0.375302 1.381614 -0.203637 0.375536 -[DiT] step 22/50 t=0.580 -[Debug] dit_step22_vt_cond: [2170, 64] first4: -0.226975 1.087024 -0.049604 
1.072087 -[Debug] dit_step22_vt_uncond: [2170, 64] first4: -0.197016 1.244136 -0.149074 1.129456 -[Debug] dit_step22_vt: [2170, 64] first4: -0.315666 0.502494 0.302365 0.836432 -[Debug] dit_step22_xt: [2170, 64] first4: 0.381615 1.371564 -0.209685 0.358807 -[DiT] step 23/50 t=0.560 -[Debug] dit_step23_vt_cond: [2170, 64] first4: -0.229945 0.987764 -0.068857 1.041486 -[Debug] dit_step23_vt_uncond: [2170, 64] first4: -0.196528 1.172394 -0.166018 1.101852 -[Debug] dit_step23_vt: [2170, 64] first4: -0.331807 0.301117 0.239278 0.795295 -[Debug] dit_step23_xt: [2170, 64] first4: 0.388252 1.365542 -0.214470 0.342901 -[DiT] step 24/50 t=0.540 -[Debug] dit_step24_vt_cond: [2170, 64] first4: -0.234566 0.878375 -0.086414 0.999502 -[Debug] dit_step24_vt_uncond: [2170, 64] first4: -0.196025 1.088575 -0.178235 1.066777 -[Debug] dit_step24_vt: [2170, 64] first4: -0.353102 0.129063 0.219470 0.719129 -[Debug] dit_step24_xt: [2170, 64] first4: 0.395314 1.362961 -0.218859 0.328519 -[DiT] step 25/50 t=0.520 -[Debug] dit_step25_vt_cond: [2170, 64] first4: -0.244918 0.762549 -0.099009 0.962606 -[Debug] dit_step25_vt_uncond: [2170, 64] first4: -0.200310 0.995220 -0.184174 1.033471 -[Debug] dit_step25_vt: [2170, 64] first4: -0.384523 -0.079339 0.165061 0.686562 -[Debug] dit_step25_xt: [2170, 64] first4: 0.403004 1.364548 -0.222161 0.314788 -[DiT] step 26/50 t=0.500 -[Debug] dit_step26_vt_cond: [2170, 64] first4: -0.256133 0.634451 -0.110018 0.919318 -[Debug] dit_step26_vt_uncond: [2170, 64] first4: -0.204624 0.887617 -0.187237 0.997615 -[Debug] dit_step26_vt: [2170, 64] first4: -0.416561 -0.257153 0.136664 0.594071 -[Debug] dit_step26_xt: [2170, 64] first4: 0.411335 1.369691 -0.224894 0.302906 -[DiT] step 27/50 t=0.480 -[Debug] dit_step27_vt_cond: [2170, 64] first4: -0.269545 0.510911 -0.116178 0.879919 -[Debug] dit_step27_vt_uncond: [2170, 64] first4: -0.210466 0.778243 -0.183774 0.961990 -[Debug] dit_step27_vt: [2170, 64] first4: -0.454081 -0.397413 0.085143 0.568775 -[Debug] 
dit_step27_xt: [2170, 64] first4: 0.420417 1.377639 -0.226597 0.291531 -[DiT] step 28/50 t=0.460 -[Debug] dit_step28_vt_cond: [2170, 64] first4: -0.282172 0.372450 -0.120048 0.831178 -[Debug] dit_step28_vt_uncond: [2170, 64] first4: -0.216011 0.655762 -0.179339 0.918627 -[Debug] dit_step28_vt: [2170, 64] first4: -0.483525 -0.575904 0.063843 0.478002 -[Debug] dit_step28_xt: [2170, 64] first4: 0.430087 1.389157 -0.227874 0.281971 -[DiT] step 29/50 t=0.440 -[Debug] dit_step29_vt_cond: [2170, 64] first4: -0.293168 0.231214 -0.124586 0.781733 -[Debug] dit_step29_vt_uncond: [2170, 64] first4: -0.219691 0.525349 -0.170166 0.873745 -[Debug] dit_step29_vt: [2170, 64] first4: -0.518206 -0.710899 -0.005456 0.435657 -[Debug] dit_step29_xt: [2170, 64] first4: 0.440451 1.403375 -0.227765 0.273257 -[DiT] step 30/50 t=0.420 -[Debug] dit_step30_vt_cond: [2170, 64] first4: -0.302506 0.086124 -0.124549 0.731757 -[Debug] dit_step30_vt_uncond: [2170, 64] first4: -0.221950 0.385721 -0.158065 0.830404 -[Debug] dit_step30_vt: [2170, 64] first4: -0.544895 -0.854978 -0.023964 0.343625 -[Debug] dit_step30_xt: [2170, 64] first4: 0.451349 1.420475 -0.227285 0.266385 -[DiT] step 31/50 t=0.400 -[Debug] dit_step31_vt_cond: [2170, 64] first4: -0.310194 -0.052540 -0.126955 0.680247 -[Debug] dit_step31_vt_uncond: [2170, 64] first4: -0.222645 0.246419 -0.151881 0.786437 -[Debug] dit_step31_vt: [2170, 64] first4: -0.574628 -0.958966 -0.058730 0.282347 -[Debug] dit_step31_xt: [2170, 64] first4: 0.462842 1.439654 -0.226111 0.260738 -[DiT] step 32/50 t=0.380 -[Debug] dit_step32_vt_cond: [2170, 64] first4: -0.314911 -0.194979 -0.126385 0.629081 -[Debug] dit_step32_vt_uncond: [2170, 64] first4: -0.220352 0.102109 -0.144540 0.743076 -[Debug] dit_step32_vt: [2170, 64] first4: -0.598710 -1.085776 -0.069698 0.196720 -[Debug] dit_step32_xt: [2170, 64] first4: 0.474816 1.461369 -0.224717 0.256804 -[DiT] step 33/50 t=0.360 -[Debug] dit_step33_vt_cond: [2170, 64] first4: -0.318386 -0.334863 -0.120002 0.581452 
-[Debug] dit_step33_vt_uncond: [2170, 64] first4: -0.218793 -0.045891 -0.135709 0.701808 -[Debug] dit_step33_vt: [2170, 64] first4: -0.615695 -1.169131 -0.067985 0.140436 -[Debug] dit_step33_xt: [2170, 64] first4: 0.487130 1.484752 -0.223357 0.253995 -[DiT] step 34/50 t=0.340 -[Debug] dit_step34_vt_cond: [2170, 64] first4: -0.319302 -0.465754 -0.110709 0.538110 -[Debug] dit_step34_vt_uncond: [2170, 64] first4: -0.211723 -0.189661 -0.128844 0.664668 -[Debug] dit_step34_vt: [2170, 64] first4: -0.653452 -1.253376 -0.044227 0.068009 -[Debug] dit_step34_xt: [2170, 64] first4: 0.500199 1.509819 -0.222472 0.252635 -[DiT] step 35/50 t=0.320 -[Debug] dit_step35_vt_cond: [2170, 64] first4: -0.319083 -0.598486 -0.097674 0.492459 -[Debug] dit_step35_vt_uncond: [2170, 64] first4: -0.205768 -0.335506 -0.119003 0.623259 -[Debug] dit_step35_vt: [2170, 64] first4: -0.661011 -1.330961 -0.027972 0.020086 -[Debug] dit_step35_xt: [2170, 64] first4: 0.513419 1.536439 -0.221913 0.252233 -[DiT] step 36/50 t=0.300 -[Debug] dit_step36_vt_cond: [2170, 64] first4: -0.318070 -0.730204 -0.080886 0.449659 -[Debug] dit_step36_vt_uncond: [2170, 64] first4: -0.199361 -0.482989 -0.108305 0.581833 -[Debug] dit_step36_vt: [2170, 64] first4: -0.690251 -1.401439 0.013562 -0.032488 -[Debug] dit_step36_xt: [2170, 64] first4: 0.527224 1.564467 -0.222184 0.252883 -[DiT] step 37/50 t=0.280 -[Debug] dit_step37_vt_cond: [2170, 64] first4: -0.316356 -0.845106 -0.054751 0.408449 -[Debug] dit_step37_vt_uncond: [2170, 64] first4: -0.194137 -0.614114 -0.090670 0.540072 -[Debug] dit_step37_vt: [2170, 64] first4: -0.694876 -1.462489 0.059738 -0.070205 -[Debug] dit_step37_xt: [2170, 64] first4: 0.541122 1.593717 -0.223379 0.254287 -[DiT] step 38/50 t=0.260 -[Debug] dit_step38_vt_cond: [2170, 64] first4: -0.317038 -0.954626 -0.027545 0.366891 -[Debug] dit_step38_vt_uncond: [2170, 64] first4: -0.190420 -0.739252 -0.071602 0.495985 -[Debug] dit_step38_vt: [2170, 64] first4: -0.728120 -1.522043 0.107842 -0.107965 -[Debug] 
dit_step38_xt: [2170, 64] first4: 0.555684 1.624158 -0.225536 0.256446 -[DiT] step 39/50 t=0.240 -[Debug] dit_step39_vt_cond: [2170, 64] first4: -0.317547 -1.061431 0.009151 0.320700 -[Debug] dit_step39_vt_uncond: [2170, 64] first4: -0.189703 -0.862508 -0.047778 0.446086 -[Debug] dit_step39_vt: [2170, 64] first4: -0.718521 -1.567608 0.187925 -0.137347 -[Debug] dit_step39_xt: [2170, 64] first4: 0.570055 1.655510 -0.229294 0.259193 -[DiT] step 40/50 t=0.220 -[Debug] dit_step40_vt_cond: [2170, 64] first4: -0.321349 -1.147692 0.054264 0.273095 -[Debug] dit_step40_vt_uncond: [2170, 64] first4: -0.194555 -0.963666 -0.014199 0.392062 -[Debug] dit_step40_vt: [2170, 64] first4: -0.725549 -1.599213 0.254155 -0.152648 -[Debug] dit_step40_xt: [2170, 64] first4: 0.584566 1.687495 -0.234377 0.262246 -[DiT] step 41/50 t=0.200 -[Debug] dit_step41_vt_cond: [2170, 64] first4: -0.323505 -1.229047 0.104885 0.223181 -[Debug] dit_step41_vt_uncond: [2170, 64] first4: -0.200140 -1.061003 0.024296 0.333073 -[Debug] dit_step41_vt: [2170, 64] first4: -0.706612 -1.620246 0.347929 -0.161344 -[Debug] dit_step41_xt: [2170, 64] first4: 0.598698 1.719899 -0.241336 0.265473 -[DiT] step 42/50 t=0.180 -[Debug] dit_step42_vt_cond: [2170, 64] first4: -0.325457 -1.305037 0.162371 0.163510 -[Debug] dit_step42_vt_uncond: [2170, 64] first4: -0.210729 -1.152184 0.073317 0.265414 -[Debug] dit_step42_vt: [2170, 64] first4: -0.675503 -1.653079 0.417194 -0.191713 -[Debug] dit_step42_xt: [2170, 64] first4: 0.612208 1.752961 -0.249680 0.269307 -[DiT] step 43/50 t=0.160 -[Debug] dit_step43_vt_cond: [2170, 64] first4: -0.327040 -1.367895 0.222307 0.103005 -[Debug] dit_step43_vt_uncond: [2170, 64] first4: -0.223734 -1.229896 0.124878 0.195856 -[Debug] dit_step43_vt: [2170, 64] first4: -0.637198 -1.669221 0.519679 -0.207779 -[Debug] dit_step43_xt: [2170, 64] first4: 0.624952 1.786345 -0.260074 0.273463 -[DiT] step 44/50 t=0.140 -[Debug] dit_step44_vt_cond: [2170, 64] first4: -0.324606 -1.422529 0.282540 0.041568 
-[Debug] dit_step44_vt_uncond: [2170, 64] first4: -0.235813 -1.298376 0.179092 0.128338 -[Debug] dit_step44_vt: [2170, 64] first4: -0.582920 -1.697035 0.593491 -0.255212 -[Debug] dit_step44_xt: [2170, 64] first4: 0.636610 1.820286 -0.271943 0.278567 -[DiT] step 45/50 t=0.120 -[Debug] dit_step45_vt_cond: [2170, 64] first4: -0.322754 -1.471227 0.337328 -0.019488 -[Debug] dit_step45_vt_uncond: [2170, 64] first4: -0.253130 -1.357642 0.232497 0.062397 -[Debug] dit_step45_vt: [2170, 64] first4: -0.511094 -1.728875 0.663531 -0.289321 -[Debug] dit_step45_xt: [2170, 64] first4: 0.646832 1.854864 -0.285214 0.284353 -[DiT] step 46/50 t=0.100 -[Debug] dit_step46_vt_cond: [2170, 64] first4: -0.317589 -1.518562 0.387160 -0.074592 -[Debug] dit_step46_vt_uncond: [2170, 64] first4: -0.269982 -1.418198 0.282772 0.001510 -[Debug] dit_step46_vt: [2170, 64] first4: -0.434923 -1.750584 0.706043 -0.325901 -[Debug] dit_step46_xt: [2170, 64] first4: 0.655531 1.889875 -0.299335 0.290871 -[DiT] step 47/50 t=0.080 -[Debug] dit_step47_vt_cond: [2170, 64] first4: -0.316530 -1.561702 0.429371 -0.113134 -[Debug] dit_step47_vt_uncond: [2170, 64] first4: -0.290275 -1.474672 0.330953 -0.045588 -[Debug] dit_step47_vt: [2170, 64] first4: -0.369515 -1.780826 0.721569 -0.327625 -[Debug] dit_step47_xt: [2170, 64] first4: 0.662921 1.925492 -0.313766 0.297424 -[DiT] step 48/50 t=0.060 -[Debug] dit_step48_vt_cond: [2170, 64] first4: -0.304095 -1.593375 0.469693 -0.135493 -[Debug] dit_step48_vt_uncond: [2170, 64] first4: -0.298372 -1.526686 0.379661 -0.093868 -[Debug] dit_step48_vt: [2170, 64] first4: -0.296147 -1.763528 0.744123 -0.229345 -[Debug] dit_step48_xt: [2170, 64] first4: 0.668844 1.960763 -0.328649 0.302011 -[DiT] step 49/50 t=0.040 -[Debug] dit_step49_vt_cond: [2170, 64] first4: -0.330730 -1.622756 0.480628 -0.154374 -[Debug] dit_step49_vt_uncond: [2170, 64] first4: -0.324303 -1.543248 0.396626 -0.105187 -[Debug] dit_step49_vt: [2170, 64] first4: -0.371182 -1.888395 0.739232 -0.345413 -[Debug] 
dit_x0: [2170, 64] first4: 0.676268 1.998530 -0.343433 0.308919 -[DiT] step 50/50 t=0.020 -[DiT] Total generation: 106456.5 ms (106456.5 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.676268 1.998530 -0.343433 0.308919 -[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 474 nodes, T_latent=192 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' -ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x103f04c20 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x103f072d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x103f09950 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x103f05240 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x103f09cd0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x103f0a8c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' -ggml_metal_library_compile_pipeline: loaded 
kernel_conv_transpose_1d_f32_f32 0x103f0ab20 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x103f0beb0 | th_max = 1024 | th_width = 32 -[VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 474 nodes, T_latent=256 -[VAE] Graph: 474 nodes, T_latent=186 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x103f04080 | th_max = 896 | th_width = 32 -[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 609604.9 ms -[Debug] vae_audio: [2, 4166400] first4: -0.002491 -0.002402 -0.002394 -0.002024 -[VAE Batch0] Wrote ggml-sft/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Request 1/1] Done -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Pipeline] All done -[Request] Loaded request0.json -[Noise] Reusing existing rng_philox_seed42.bf16 -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf -[GGML] Running acestep-v15-turbo-Q5_K_M.gguf... 
-[GGML] Done, 47 dump files -[Turbo] Reusing existing Python dumps: python-turbo -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999813 - lyric_embed 1.000000 - enc_hidden 0.999083 - detok_output 0.999884 - context 0.999927 - noise 1.000000 - temb_t 0.999972 - hidden_after_proj_in 0.999966 - enc_after_cond_emb 0.999209 - layer0_sa_output 0.999459 - hidden_after_layer0 0.999838 - hidden_after_layer6 0.999790 - hidden_after_layer12 0.998662 - hidden_after_layer18 0.995505 - hidden_after_layer23 0.991560 - dit_step0_vt 0.968885 - dit_step0_xt 0.999932 - dit_step1_vt 0.972718 - dit_step1_xt 0.999793 - dit_step2_vt 0.970980 - dit_step2_xt 0.999392 - dit_step3_vt 0.974057 - dit_step3_xt 0.998550 - dit_step4_vt 0.972601 - dit_step4_xt 0.996666 - dit_step5_vt 0.967840 - dit_step5_xt 0.992262 - dit_step6_vt 0.963419 - dit_step6_xt 0.983648 - dit_step7_vt 0.954759 - dit_x0 0.970661 - vae_audio 0.881689 - vae_audio (log spectral) 0.999788 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999930 0.140512 0.007718 -0.002317 0.973035 -0.002342 0.972003 - dit_step1_xt 0.999791 0.264415 0.013154 -0.005313 0.942911 -0.005313 0.941730 - dit_step2_xt 0.999391 0.457878 0.021002 -0.009296 0.909537 -0.009311 0.908527 - dit_step3_xt 0.998548 0.672565 0.031169 -0.014659 0.874300 -0.014577 0.873624 - dit_step4_xt 0.996664 0.977397 0.045289 -0.021867 0.842610 -0.021660 0.841995 - dit_step5_xt 0.992261 1.456099 0.067099 -0.032222 0.826249 -0.032109 0.824593 - dit_step6_xt 0.983647 2.128287 0.100579 -0.046802 0.857538 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q5_K_M.gguf -[GGML] Running acestep-v15-sft-Q5_K_M.gguf... 
-[GGML] Done, 233 dump files -[SFT] Reusing existing Python dumps: python-sft -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999813 - lyric_embed 1.000000 - enc_hidden 0.999083 - detok_output 0.999884 - context 0.999927 - noise 1.000000 - temb_t 0.999900 - hidden_after_proj_in 0.999966 - enc_after_cond_emb 0.999209 - layer0_sa_output 0.999536 - hidden_after_layer0 0.999891 - hidden_after_layer6 0.999626 - hidden_after_layer12 0.998995 - hidden_after_layer18 0.998026 - hidden_after_layer23 0.998535 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998436 - dit_step0_vt_uncond 0.998344 - dit_step0_vt 0.994668 - dit_step0_xt 0.999997 - dit_step5_vt_cond 0.998676 - dit_step5_vt 0.989830 - dit_step5_xt 0.999935 - dit_step10_vt_cond 0.996806 - dit_step10_vt 0.987585 - dit_step10_xt 0.999744 - dit_step15_vt_cond 0.992244 - dit_step15_vt 0.973038 - dit_step15_xt 0.999108 - dit_step20_vt_cond 0.984474 - dit_step20_vt 0.958153 - dit_step20_xt 0.997397 - dit_step25_vt_cond 0.974096 - dit_step25_vt 0.945640 - dit_step25_xt 0.994154 - dit_step30_vt_cond 0.962790 - dit_step30_vt 0.934107 - dit_step30_xt 0.989253 - dit_step35_vt_cond 0.951958 - dit_step35_vt 0.920426 - dit_step35_xt 0.983572 - dit_step40_vt_cond 0.945880 - dit_step40_vt 0.910054 - dit_step40_xt 0.978292 - dit_step45_vt_cond 0.952542 - dit_step45_vt 0.924831 - dit_step45_xt 0.974685 - dit_step49_vt_cond 0.963084 - dit_step49_vt 0.916267 - dit_x0 0.973449 - vae_audio 0.878623 - vae_audio (log spectral) 0.999566 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999996 0.038422 0.002252 -0.001777 0.980099 -0.001741 0.980402 - dit_step5_xt 0.999933 0.110159 0.007862 -0.006926 0.889483 -0.007143 0.887999 - dit_step10_xt 0.999743 0.216004 0.013603 -0.012446 0.811152 -0.012603 0.811299 - dit_step15_xt 0.999108 0.433603 0.022505 -0.017845 0.746187 -0.018114 0.745269 - dit_step20_xt 0.997397 0.645668 
0.035020 -0.023481 0.700583 -0.023808 0.699582 - dit_step25_xt 0.994154 0.894286 0.050478 -0.029250 0.679073 -0.029311 0.679278 - dit_step30_xt 0.989253 1.155794 0.069043 -0.035037 0.684973 -0.035027 0.685262 - dit_step35_xt 0.983572 1.518599 0.089822 -0.040808 0.717172 -0.040716 0.717196 - dit_step40_xt 0.978291 1.917882 0.111854 -0.046581 0.771460 -0.046462 0.771853 - dit_step45_xt 0.974684 2.279520 0.132709 -0.052804 0.843506 -0.052475 0.843036 diff --git a/tests/Metal_Q6_K.log b/tests/Metal_Q6_K.log deleted file mode 100644 index 19a2f5c..0000000 --- a/tests/Metal_Q6_K.log +++ /dev/null @@ -1,819 +0,0 @@ -ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices -ggml_metal_library_init: using embedded metal library -ggml_metal_library_init: loaded in 0.006 sec -ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s) -ggml_metal_device_init: GPU name: MTL0 -ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) -ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) -ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) -ggml_metal_device_init: simdgroup reduction = true -ggml_metal_device_init: simdgroup matrix mul. 
= true -ggml_metal_device_init: has unified memory = true -ggml_metal_device_init: has bfloat = true -ggml_metal_device_init: has tensor = false -ggml_metal_device_init: use residency sets = true -ggml_metal_device_init: use shared buffers = true -ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] DiT backend: MTL0 (CPU threads: 5) -[Load] Backend init: 18.8 ms -[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 -[DiT] MLP: gate+up fused -[Load] null_condition_emb found (CFG available) -[WeightCtx] Loaded 478 tensors, 1237.2 MB into backend -[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 1759.5 ms -[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 -[Load] silence_latent: [15000, 64] from GGUF -[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] VAE backend: MTL0 (CPU threads: 5) -[VAE] Backend: MTL0, Weight buffer: 255.7 MB -[VAE] Loaded: 5 blocks, upsample=1920x -[Load] VAE weights: 335.9 ms -[Request 1/1] ggml-turbo/request0.json (batch=1) -[Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) -[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s -[Pipeline] 434 audio codes (86.8s @ 5Hz) -[Pipeline] T=2170, S=1085 -[BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 42.1 ms -[Pipeline] caption: 70 tokens, lyrics: 167 tokens 
-ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] TextEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 28L, H=1024, Nh=16/8 -[Load] TextEncoder: 294.2 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' -ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x13a80b9e0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x13a80be40 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x13a80c6b0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' -ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x13a80cb30 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x13a80d3a0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded 
kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x13a80d9d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x13a80e2b0 | th_max = 576 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x13a80e6c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' -ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x13a80e920 | th_max = 1024 | th_width = 32 -[Encode] TextEncoder (70 tokens): 46.1 ms -[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 33.8 ms -[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] CondEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 -[WeightCtx] Loaded 140 tensors, 476.3 MB into backend -[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 652.0 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q6_K_f32', name = 
'kernel_mul_mm_q6_K_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1 0x13a818c40 | th_max = 896 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x13a819080 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x13a8197f0 | th_max = 640 | th_width = 32 -[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 146.8 ms, enc_S=238 -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Debug] enc_hidden: [238, 2048] first4: 1.752129 -0.050073 -0.134015 0.059631 -[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 -[WeightCtx] Loaded 30 tensors, 82.2 MB into backend -[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 125.6 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x13a817f60 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x13a818470 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 
'kernel_mul_mv_q6_K_f32_nsg=2' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2 0x13a808aa0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' -ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x13a808d00 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8 0x13a81a190 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x13a81a3f0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x13a81adc0 | th_max = 448 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x13a81b250 | th_max = 1024 | th_width = 32 -[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 1009.6 ms -[Debug] detok_output: [2170, 64] first4: -0.141063 1.454431 0.315142 -0.623566 -[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 -[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 -[Debug] context: [2170, 
128] first4: -0.141063 1.454431 0.315142 -0.623566 -[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 -[DiT] Batch N=1, T=2170, S=1085, enc_S=238 -[DiT] Graph: 1841 nodes -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x13970a020 | th_max = 832 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' -ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x13970a280 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' -ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x13970a4e0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x13970b610 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x13970be80 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x13970c0e0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x13970c7b0 | th_max = 1024 | th_width = 32 
-ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x13970cc30 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x13970d300 | th_max = 640 | th_width = 32 -[Debug] tproj: [12288] first4: 0.261089 -0.161223 -0.098728 0.051901 -[Debug] temb: [2048] first4: 0.000236 -0.132397 -0.035348 0.064653 -[Debug] temb_t: [2048] first4: 0.001398 0.026957 -0.052741 0.063660 -[Debug] temb_r: [2048] first4: -0.001162 -0.159353 0.017394 0.000993 -[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 -[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 -[Debug] temb_lin1_t: [2048] first4: -0.049071 -0.051112 -0.017769 -0.037193 -[Debug] temb_lin1_r: [2048] first4: -0.014408 -0.020609 -0.015729 0.003875 -[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.037692 -0.956719 0.540867 0.451860 -[Debug] proj_in_input: [192, 2170] first4: -0.141063 1.454431 0.315142 -0.623566 -[Debug] enc_after_cond_emb: [2048, 238] first4: -0.167564 0.852700 0.309671 -0.538299 -[Debug] layer0_sa_input: [2048, 1085] first4: -0.716202 -0.756050 -0.048455 0.263529 -[Debug] layer0_q_after_rope: [128, 16] first4: -0.167564 0.852700 0.309671 -0.538299 -[Debug] layer0_k_after_rope: [128, 8] first4: -1.214772 -0.856039 -1.908578 -2.256124 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.502833 0.209946 -0.367812 0.520536 -[Debug] layer0_attn_out: [2048, 1085] first4: -1.134820 -0.084089 -34.867664 -0.724257 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.540231 -1.049932 0.181504 0.461969 
-[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.609974 -0.819551 -0.333653 0.497179 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.196066 0.534182 52.334564 -0.880322 -[Debug] hidden_after_layer6: [2048, 1085] first4: -21.325979 -0.207006 34.129318 -4.337931 -[Debug] hidden_after_layer12: [2048, 1085] first4: -15.411194 -16.311844 76.549057 29.816362 -[Debug] hidden_after_layer18: [2048, 1085] first4: -28.025963 13.209218 65.994347 20.087559 -[Debug] hidden_after_layer23: [2048, 1085] first4: -19.575611 48.863052 201.092041 136.881271 -[Debug] dit_step0_vt: [2170, 64] first4: 0.099154 1.132388 0.349667 2.375307 -[Debug] dit_step0_xt: [2170, 64] first4: 0.189829 2.104778 -0.187769 0.739688 -[DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: -0.140258 1.329038 -0.173978 1.924904 -[Debug] dit_step1_xt: [2170, 64] first4: 0.197479 2.032285 -0.178279 0.634693 -[DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: 0.064148 1.236530 0.118618 2.406788 -[Debug] dit_step2_xt: [2170, 64] first4: 0.193203 1.949849 -0.186187 0.474240 -[DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: 0.305678 1.101620 0.246811 2.656265 -[Debug] dit_step3_xt: [2170, 64] first4: 0.167730 1.858048 -0.206755 0.252885 -[DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.333444 1.032630 0.077940 2.735898 -[Debug] dit_step4_xt: [2170, 64] first4: 0.132004 1.747409 -0.215105 -0.040247 -[DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.310135 0.905818 -0.324717 2.786166 -[Debug] dit_step5_xt: [2170, 64] first4: 0.087699 1.618006 -0.168717 -0.438271 -[DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: 0.184311 0.624224 -0.863634 2.781863 -[Debug] dit_step6_xt: [2170, 64] first4: 0.050836 1.493161 0.004009 -0.994643 -[DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: 0.049488 0.168290 -1.435298 3.015385 -[Debug] dit_x0: [2170, 64] first4: 0.035990 1.442675 0.434599 -1.899259 -[DiT] step 8/8 
t=0.300 -[DiT] Total generation: 8015.7 ms (8015.7 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.035990 1.442675 0.434599 -1.899259 -[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 474 nodes, T_latent=192 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' -ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x13a81b7d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x13a81c1a0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x13a81c400 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x13a81ca90 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x13a81ccf0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x13a81d4d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32 0x13a81da30 | th_max = 1024 | th_width = 32 
-ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x13a81e910 | th_max = 1024 | th_width = 32 -[VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 474 nodes, T_latent=256 -[VAE] Graph: 474 nodes, T_latent=186 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x13a81eb70 | th_max = 896 | th_width = 32 -[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 609630.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000453 0.000980 0.000826 0.001209 -[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Request 1/1] Done -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Pipeline] All done -ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices -ggml_metal_library_init: using embedded metal library -ggml_metal_library_init: loaded in 0.006 sec -ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s) -ggml_metal_device_init: GPU name: MTL0 -ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) -ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) -ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) -ggml_metal_device_init: simdgroup reduction = true -ggml_metal_device_init: simdgroup matrix mul. 
= true -ggml_metal_device_init: has unified memory = true -ggml_metal_device_init: has bfloat = true -ggml_metal_device_init: has tensor = false -ggml_metal_device_init: use residency sets = true -ggml_metal_device_init: use shared buffers = true -ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] DiT backend: MTL0 (CPU threads: 5) -[Load] Backend init: 20.5 ms -[GGUF] ../models/acestep-v15-sft-Q6_K.gguf: 678 tensors, data at offset 56800 -[DiT] MLP: gate+up fused -[Load] null_condition_emb found (CFG available) -[WeightCtx] Loaded 478 tensors, 1237.2 MB into backend -[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 1781.2 ms -[GGUF] ../models/acestep-v15-sft-Q6_K.gguf: 678 tensors, data at offset 56800 -[Load] silence_latent: [15000, 64] from GGUF -[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] VAE backend: MTL0 (CPU threads: 5) -[VAE] Backend: MTL0, Weight buffer: 255.7 MB -[VAE] Loaded: 5 blocks, upsample=1920x -[Load] VAE weights: 275.0 ms -[Request 1/1] ggml-sft/request0.json (batch=1) -[Request] parsed ggml-sft/request0.json (18 fields) -[Pipeline] seed=42, steps=50, guidance=7.0, shift=1.0, duration=88.0s -[Pipeline] 434 audio codes (86.8s @ 5Hz) -[Pipeline] T=2170, S=1085 -[BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 42.4 ms -[Pipeline] caption: 70 tokens, lyrics: 167 tokens -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: 
picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] TextEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 28L, H=1024, Nh=16/8 -[Load] TextEncoder: 361.7 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' -ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x106006bc0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x106007320 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x106008160 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' -ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x1060083c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x106008e40 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x1060093a0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: 
compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x106009600 | th_max = 576 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x10600a090 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' -ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x10600a2f0 | th_max = 1024 | th_width = 32 -[Encode] TextEncoder (70 tokens): 45.5 ms -[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 35.7 ms -[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] CondEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/acestep-v15-sft-Q6_K.gguf: 678 tensors, data at offset 56800 -[WeightCtx] Loaded 140 tensors, 476.3 MB into backend -[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 850.5 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1 0x120606720 | th_max = 
896 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x120606ba0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x1206071d0 | th_max = 640 | th_width = 32 -[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 151.8 ms, enc_S=238 -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Debug] enc_hidden: [238, 2048] first4: 1.752129 -0.050073 -0.134015 0.059631 -[GGUF] ../models/acestep-v15-sft-Q6_K.gguf: 678 tensors, data at offset 56800 -[WeightCtx] Loaded 30 tensors, 82.2 MB into backend -[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 134.7 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x106012a70 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x106013260 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2 0x106013b60 | th_max = 1024 | th_width = 32 
-ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' -ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x1060134c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8 0x1206079b0 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x120608040 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x1206082a0 | th_max = 448 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x120608730 | th_max = 1024 | th_width = 32 -[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 1004.2 ms -[Debug] detok_output: [2170, 64] first4: -0.141063 1.454431 0.315142 -0.623566 -[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 -[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 -[Debug] context: [2170, 128] first4: -0.141063 1.454431 0.315142 -0.623566 -[DiT] Starting: T=2170, S=1085, enc_S=238, steps=50, batch=1 -[DiT] Batch N=1, T=2170, S=1085, 
enc_S=238 -[DiT] Graph: 1841 nodes -[Debug] null_condition_emb: [2048] first4: 0.018066 -0.000360 0.005096 -0.000683 -[Debug] null_enc_hidden: [238, 2048] first4: 0.018066 -0.000360 0.005096 -0.000683 -[DiT] CFG enabled: guidance_scale=7.0, 2x forward per step, N=1 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x120608bb0 | th_max = 832 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' -ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x120608e10 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' -ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x120609070 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x120609f80 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x12060a7f0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x12060aa50 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded 
kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x12060b0d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x12060b550 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x12060bc10 | th_max = 640 | th_width = 32 -[Debug] tproj: [12288] first4: 0.153861 -0.117528 -0.090110 0.080834 -[Debug] temb: [2048] first4: -0.002466 -0.176370 0.004369 -0.002069 -[Debug] temb_t: [2048] first4: -0.000999 0.003474 -0.013219 -0.002130 -[Debug] temb_r: [2048] first4: -0.001467 -0.179844 0.017589 0.000062 -[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 -[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 -[Debug] temb_lin1_t: [2048] first4: -0.041140 0.030274 0.027836 -0.025460 -[Debug] temb_lin1_r: [2048] first4: 0.004272 0.006720 0.000208 -0.005103 -[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.088484 -0.913424 0.502796 0.445566 -[Debug] proj_in_input: [192, 2170] first4: -0.141063 1.454431 0.315142 -0.623566 -[Debug] enc_after_cond_emb: [2048, 238] first4: -0.194042 0.920094 0.309464 -0.544236 -[Debug] layer0_sa_input: [2048, 1085] first4: -0.914448 -0.710483 -0.040214 0.295227 -[Debug] layer0_q_after_rope: [128, 16] first4: -0.194042 0.920094 0.309464 -0.544236 -[Debug] layer0_k_after_rope: [128, 8] first4: -1.341203 -0.993715 -1.828661 -2.252987 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.651907 0.800758 -0.600550 0.531539 -[Debug] layer0_attn_out: [2048, 1085] first4: -1.094031 -0.025671 -33.031021 -0.629337 
-[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.804741 -1.365866 -0.176846 0.385942 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.975780 -1.029709 -0.454110 0.391604 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.417660 1.271689 57.716125 -1.718801 -[Debug] hidden_after_layer6: [2048, 1085] first4: -17.205166 2.402088 59.038250 -1.336451 -[Debug] hidden_after_layer12: [2048, 1085] first4: -10.471869 6.708532 -25.396618 -2.966099 -[Debug] hidden_after_layer18: [2048, 1085] first4: -4.594971 20.646416 -42.849018 -14.024486 -[Debug] hidden_after_layer23: [2048, 1085] first4: 34.838955 64.575096 51.865501 -11.288853 -[Debug] dit_step0_vt_cond: [2170, 64] first4: -0.621944 2.533786 -0.220703 1.441472 -[Debug] dit_step0_vt_uncond: [2170, 64] first4: -0.335422 2.090178 -0.712111 1.749312 -[Debug] dit_step0_vt: [2170, 64] first4: -0.817132 3.005553 0.060651 1.037697 -[Debug] dit_step0_xt: [2170, 64] first4: 0.210679 2.096139 -0.173088 0.826902 -[DiT] step 1/50 t=1.000 -[Debug] dit_step1_vt_cond: [2170, 64] first4: -0.663130 2.509250 -0.047312 1.284879 -[Debug] dit_step1_vt_uncond: [2170, 64] first4: -0.535073 2.303810 -0.132373 1.511020 -[Debug] dit_step1_vt: [2170, 64] first4: -0.505551 2.169540 -0.259578 1.319904 -[Debug] dit_step1_xt: [2170, 64] first4: 0.220790 2.052748 -0.167896 0.800504 -[DiT] step 2/50 t=0.980 -[Debug] dit_step2_vt_cond: [2170, 64] first4: -0.658958 2.478008 -0.004274 1.297306 -[Debug] dit_step2_vt_uncond: [2170, 64] first4: -0.555012 2.295219 -0.109707 1.445456 -[Debug] dit_step2_vt: [2170, 64] first4: -0.800255 2.877401 0.236257 0.991010 -[Debug] dit_step2_xt: [2170, 64] first4: 0.236795 1.995200 -0.172622 0.780684 -[DiT] step 3/50 t=0.960 -[Debug] dit_step3_vt_cond: [2170, 64] first4: -0.623685 2.405478 0.016157 1.310913 -[Debug] dit_step3_vt_uncond: [2170, 64] first4: -0.584147 2.331472 -0.052434 1.362097 -[Debug] dit_step3_vt: [2170, 64] first4: -0.460909 2.085223 -0.179917 1.477093 -[Debug] dit_step3_xt: [2170, 64] 
first4: 0.246013 1.953496 -0.169023 0.751142 -[DiT] step 4/50 t=0.940 -[Debug] dit_step4_vt_cond: [2170, 64] first4: -0.573136 2.336644 0.011644 1.310671 -[Debug] dit_step4_vt_uncond: [2170, 64] first4: -0.579254 2.305885 -0.063045 1.333517 -[Debug] dit_step4_vt: [2170, 64] first4: -0.615409 2.553339 0.248993 1.073298 -[Debug] dit_step4_xt: [2170, 64] first4: 0.258321 1.902429 -0.174003 0.729676 -[DiT] step 5/50 t=0.920 -[Debug] dit_step5_vt_cond: [2170, 64] first4: -0.517031 2.271256 -0.017464 1.306595 -[Debug] dit_step5_vt_uncond: [2170, 64] first4: -0.593495 2.274611 -0.028605 1.340369 -[Debug] dit_step5_vt: [2170, 64] first4: -0.226837 1.944980 -0.246283 1.356041 -[Debug] dit_step5_xt: [2170, 64] first4: 0.262858 1.863529 -0.169077 0.702555 -[DiT] step 6/50 t=0.900 -[Debug] dit_step6_vt_cond: [2170, 64] first4: -0.492334 2.237296 -0.030848 1.308453 -[Debug] dit_step6_vt_uncond: [2170, 64] first4: -0.559225 2.211779 -0.028865 1.356664 -[Debug] dit_step6_vt: [2170, 64] first4: -0.506265 2.451765 0.152689 1.063906 -[Debug] dit_step6_xt: [2170, 64] first4: 0.272983 1.814494 -0.172131 0.681277 -[DiT] step 7/50 t=0.880 -[Debug] dit_step7_vt_cond: [2170, 64] first4: -0.448929 2.166704 -0.051454 1.296641 -[Debug] dit_step7_vt_uncond: [2170, 64] first4: -0.516741 2.132911 -0.036027 1.347799 -[Debug] dit_step7_vt: [2170, 64] first4: -0.201648 1.970928 -0.247107 1.267663 -[Debug] dit_step7_xt: [2170, 64] first4: 0.277016 1.775075 -0.167189 0.655924 -[DiT] step 8/50 t=0.860 -[Debug] dit_step8_vt_cond: [2170, 64] first4: -0.412205 2.087660 -0.059856 1.279323 -[Debug] dit_step8_vt_uncond: [2170, 64] first4: -0.450006 2.049930 -0.047531 1.315284 -[Debug] dit_step8_vt: [2170, 64] first4: -0.455255 2.220170 0.079066 1.097043 -[Debug] dit_step8_xt: [2170, 64] first4: 0.286121 1.730672 -0.168770 0.633983 -[DiT] step 9/50 t=0.840 -[Debug] dit_step9_vt_cond: [2170, 64] first4: -0.378611 2.005139 -0.068357 1.261302 -[Debug] dit_step9_vt_uncond: [2170, 64] first4: -0.379138 1.976703 
-0.055227 1.283708 -[Debug] dit_step9_vt: [2170, 64] first4: -0.302897 1.841593 -0.214058 1.239796 -[Debug] dit_step9_xt: [2170, 64] first4: 0.292179 1.693840 -0.164489 0.609187 -[DiT] step 10/50 t=0.820 -[Debug] dit_step10_vt_cond: [2170, 64] first4: -0.355196 1.928296 -0.064251 1.252096 -[Debug] dit_step10_vt_uncond: [2170, 64] first4: -0.318644 1.912086 -0.042776 1.279346 -[Debug] dit_step10_vt: [2170, 64] first4: -0.498521 1.941743 -0.002070 1.076981 -[Debug] dit_step10_xt: [2170, 64] first4: 0.302150 1.655005 -0.164448 0.587648 -[DiT] step 11/50 t=0.800 -[Debug] dit_step11_vt_cond: [2170, 64] first4: -0.335946 1.853130 -0.070414 1.240474 -[Debug] dit_step11_vt_uncond: [2170, 64] first4: -0.288372 1.862547 -0.022430 1.290233 -[Debug] dit_step11_vt: [2170, 64] first4: -0.354338 1.618477 -0.262413 1.094882 -[Debug] dit_step11_xt: [2170, 64] first4: 0.309236 1.622636 -0.159200 0.565750 -[DiT] step 12/50 t=0.780 -[Debug] dit_step12_vt_cond: [2170, 64] first4: -0.320539 1.785762 -0.070432 1.238213 -[Debug] dit_step12_vt_uncond: [2170, 64] first4: -0.262826 1.816614 -0.015609 1.306050 -[Debug] dit_step12_vt: [2170, 64] first4: -0.491342 1.658213 -0.091632 0.993836 -[Debug] dit_step12_xt: [2170, 64] first4: 0.319063 1.589471 -0.157367 0.545873 -[DiT] step 13/50 t=0.760 -[Debug] dit_step13_vt_cond: [2170, 64] first4: -0.304664 1.719325 -0.071341 1.231198 -[Debug] dit_step13_vt_uncond: [2170, 64] first4: -0.239698 1.764744 -0.021893 1.311546 -[Debug] dit_step13_vt: [2170, 64] first4: -0.399484 1.443110 -0.224588 1.008114 -[Debug] dit_step13_xt: [2170, 64] first4: 0.327053 1.560609 -0.152875 0.525711 -[DiT] step 14/50 t=0.740 -[Debug] dit_step14_vt_cond: [2170, 64] first4: -0.287200 1.663083 -0.055876 1.237650 -[Debug] dit_step14_vt_uncond: [2170, 64] first4: -0.221389 1.709672 -0.033314 1.310252 -[Debug] dit_step14_vt: [2170, 64] first4: -0.462595 1.498639 -0.013459 1.015139 -[Debug] dit_step14_xt: [2170, 64] first4: 0.336305 1.530637 -0.152606 0.505408 -[DiT] step 
15/50 t=0.720 -[Debug] dit_step15_vt_cond: [2170, 64] first4: -0.268932 1.604917 -0.048117 1.238302 -[Debug] dit_step15_vt_uncond: [2170, 64] first4: -0.201852 1.656007 -0.050695 1.299129 -[Debug] dit_step15_vt: [2170, 64] first4: -0.405852 1.327548 -0.070650 1.046717 -[Debug] dit_step15_xt: [2170, 64] first4: 0.344422 1.504086 -0.151193 0.484474 -[DiT] step 16/50 t=0.700 -[Debug] dit_step16_vt_cond: [2170, 64] first4: -0.252512 1.546370 -0.039756 1.239714 -[Debug] dit_step16_vt_uncond: [2170, 64] first4: -0.180964 1.599575 -0.068729 1.281288 -[Debug] dit_step16_vt: [2170, 64] first4: -0.465245 1.345990 0.113584 1.099017 -[Debug] dit_step16_xt: [2170, 64] first4: 0.353727 1.477166 -0.153465 0.462493 -[DiT] step 17/50 t=0.680 -[Debug] dit_step17_vt_cond: [2170, 64] first4: -0.235390 1.484958 -0.035930 1.232975 -[Debug] dit_step17_vt_uncond: [2170, 64] first4: -0.159705 1.538287 -0.087686 1.257131 -[Debug] dit_step17_vt: [2170, 64] first4: -0.434245 1.219246 0.101116 1.128435 -[Debug] dit_step17_xt: [2170, 64] first4: 0.362412 1.452781 -0.155487 0.439925 -[DiT] step 18/50 t=0.660 -[Debug] dit_step18_vt_cond: [2170, 64] first4: -0.216501 1.424307 -0.036471 1.226410 -[Debug] dit_step18_vt_uncond: [2170, 64] first4: -0.139621 1.481410 -0.105683 1.236565 -[Debug] dit_step18_vt: [2170, 64] first4: -0.455215 1.183846 0.234644 1.168681 -[Debug] dit_step18_xt: [2170, 64] first4: 0.371516 1.429104 -0.160180 0.416551 -[DiT] step 19/50 t=0.640 -[Debug] dit_step19_vt_cond: [2170, 64] first4: -0.193781 1.365146 -0.052930 1.209294 -[Debug] dit_step19_vt_uncond: [2170, 64] first4: -0.117725 1.431031 -0.131161 1.210289 -[Debug] dit_step19_vt: [2170, 64] first4: -0.413012 1.059516 0.190559 1.170309 -[Debug] dit_step19_xt: [2170, 64] first4: 0.379776 1.407914 -0.163991 0.393145 -[DiT] step 20/50 t=0.620 -[Debug] dit_step20_vt_cond: [2170, 64] first4: -0.171132 1.297945 -0.074904 1.190113 -[Debug] dit_step20_vt_uncond: [2170, 64] first4: -0.094949 1.381890 -0.154719 1.182902 -[Debug] 
dit_step20_vt: [2170, 64] first4: -0.416916 0.961257 0.217113 1.187745 -[Debug] dit_step20_xt: [2170, 64] first4: 0.388114 1.388689 -0.168333 0.369390 -[DiT] step 21/50 t=0.600 -[Debug] dit_step21_vt_cond: [2170, 64] first4: -0.156818 1.217449 -0.102774 1.165961 -[Debug] dit_step21_vt_uncond: [2170, 64] first4: -0.076230 1.330654 -0.181784 1.153171 -[Debug] dit_step21_vt: [2170, 64] first4: -0.413541 0.733536 0.153392 1.169193 -[Debug] dit_step21_xt: [2170, 64] first4: 0.396385 1.374018 -0.171401 0.346006 -[DiT] step 22/50 t=0.580 -[Debug] dit_step22_vt_cond: [2170, 64] first4: -0.149490 1.124467 -0.129160 1.136317 -[Debug] dit_step22_vt_uncond: [2170, 64] first4: -0.066301 1.272416 -0.203485 1.126914 -[Debug] dit_step22_vt: [2170, 64] first4: -0.414485 0.550591 0.128185 1.113165 -[Debug] dit_step22_xt: [2170, 64] first4: 0.404675 1.363006 -0.173965 0.323743 -[DiT] step 23/50 t=0.560 -[Debug] dit_step23_vt_cond: [2170, 64] first4: -0.151829 1.034549 -0.154075 1.109805 -[Debug] dit_step23_vt_uncond: [2170, 64] first4: -0.064656 1.215351 -0.221178 1.104513 -[Debug] dit_step23_vt: [2170, 64] first4: -0.436045 0.335528 0.059138 1.081705 -[Debug] dit_step23_xt: [2170, 64] first4: 0.413396 1.356296 -0.175148 0.302109 -[DiT] step 24/50 t=0.540 -[Debug] dit_step24_vt_cond: [2170, 64] first4: -0.158201 0.930576 -0.174559 1.077020 -[Debug] dit_step24_vt_uncond: [2170, 64] first4: -0.070645 1.149243 -0.231212 1.077814 -[Debug] dit_step24_vt: [2170, 64] first4: -0.428632 0.115268 0.009393 1.014248 -[Debug] dit_step24_xt: [2170, 64] first4: 0.421968 1.353990 -0.175335 0.281824 -[DiT] step 25/50 t=0.520 -[Debug] dit_step25_vt_cond: [2170, 64] first4: -0.172706 0.820171 -0.187534 1.046402 -[Debug] dit_step25_vt_uncond: [2170, 64] first4: -0.084898 1.070370 -0.233334 1.052725 -[Debug] dit_step25_vt: [2170, 64] first4: -0.451610 -0.083828 -0.046615 0.975977 -[Debug] dit_step25_xt: [2170, 64] first4: 0.431001 1.355667 -0.174403 0.262304 -[DiT] step 26/50 t=0.500 -[Debug] 
dit_step26_vt_cond: [2170, 64] first4: -0.189538 0.693448 -0.199819 1.008348 -[Debug] dit_step26_vt_uncond: [2170, 64] first4: -0.101718 0.976297 -0.234447 1.021465 -[Debug] dit_step26_vt: [2170, 64] first4: -0.457462 -0.318421 -0.092668 0.902675 -[Debug] dit_step26_xt: [2170, 64] first4: 0.440150 1.362035 -0.172550 0.244251 -[DiT] step 27/50 t=0.480 -[Debug] dit_step27_vt_cond: [2170, 64] first4: -0.208769 0.570284 -0.208205 0.971380 -[Debug] dit_step27_vt_uncond: [2170, 64] first4: -0.121884 0.876404 -0.234553 0.991057 -[Debug] dit_step27_vt: [2170, 64] first4: -0.474894 -0.475314 -0.124745 0.852438 -[Debug] dit_step27_xt: [2170, 64] first4: 0.449648 1.371541 -0.170055 0.227202 -[DiT] step 28/50 t=0.460 -[Debug] dit_step28_vt_cond: [2170, 64] first4: -0.231451 0.438197 -0.217041 0.925335 -[Debug] dit_step28_vt_uncond: [2170, 64] first4: -0.142976 0.763591 -0.236660 0.951937 -[Debug] dit_step28_vt: [2170, 64] first4: -0.503044 -0.662942 -0.154526 0.775360 -[Debug] dit_step28_xt: [2170, 64] first4: 0.459709 1.384800 -0.166964 0.211695 -[DiT] step 29/50 t=0.440 -[Debug] dit_step29_vt_cond: [2170, 64] first4: -0.255602 0.305935 -0.225167 0.878463 -[Debug] dit_step29_vt_uncond: [2170, 64] first4: -0.161997 0.641782 -0.240440 0.915606 -[Debug] dit_step29_vt: [2170, 64] first4: -0.546771 -0.770888 -0.172051 0.693937 -[Debug] dit_step29_xt: [2170, 64] first4: 0.470644 1.400218 -0.163523 0.197816 -[DiT] step 30/50 t=0.420 -[Debug] dit_step30_vt_cond: [2170, 64] first4: -0.279856 0.166208 -0.230015 0.829064 -[Debug] dit_step30_vt_uncond: [2170, 64] first4: -0.179567 0.505654 -0.246080 0.880209 -[Debug] dit_step30_vt: [2170, 64] first4: -0.586251 -0.913478 -0.165068 0.591419 -[Debug] dit_step30_xt: [2170, 64] first4: 0.482369 1.418488 -0.160222 0.185988 -[DiT] step 31/50 t=0.400 -[Debug] dit_step31_vt_cond: [2170, 64] first4: -0.303020 0.026116 -0.234342 0.778238 -[Debug] dit_step31_vt_uncond: [2170, 64] first4: -0.194565 0.361688 -0.252981 0.842483 -[Debug] dit_step31_vt: 
[2170, 64] first4: -0.634687 -1.004404 -0.163206 0.505303 -[Debug] dit_step31_xt: [2170, 64] first4: 0.495063 1.438576 -0.156958 0.175882 -[DiT] step 32/50 t=0.380 -[Debug] dit_step32_vt_cond: [2170, 64] first4: -0.323948 -0.114179 -0.233989 0.723499 -[Debug] dit_step32_vt_uncond: [2170, 64] first4: -0.205043 0.215050 -0.257018 0.804404 -[Debug] dit_step32_vt: [2170, 64] first4: -0.683278 -1.112943 -0.143677 0.386194 -[Debug] dit_step32_xt: [2170, 64] first4: 0.508728 1.460835 -0.154084 0.168158 -[DiT] step 33/50 t=0.360 -[Debug] dit_step33_vt_cond: [2170, 64] first4: -0.343586 -0.257977 -0.227555 0.671680 -[Debug] dit_step33_vt_uncond: [2170, 64] first4: -0.213907 0.060018 -0.258127 0.766574 -[Debug] dit_step33_vt: [2170, 64] first4: -0.732613 -1.188033 -0.112312 0.307609 -[Debug] dit_step33_xt: [2170, 64] first4: 0.523381 1.484595 -0.151838 0.162006 -[DiT] step 34/50 t=0.340 -[Debug] dit_step34_vt_cond: [2170, 64] first4: -0.359997 -0.390317 -0.220456 0.624948 -[Debug] dit_step34_vt_uncond: [2170, 64] first4: -0.218993 -0.085222 -0.256678 0.732610 -[Debug] dit_step34_vt: [2170, 64] first4: -0.782051 -1.277164 -0.095120 0.209992 -[Debug] dit_step34_xt: [2170, 64] first4: 0.539022 1.510138 -0.149936 0.157806 -[DiT] step 35/50 t=0.320 -[Debug] dit_step35_vt_cond: [2170, 64] first4: -0.376117 -0.519578 -0.205748 0.575179 -[Debug] dit_step35_vt_uncond: [2170, 64] first4: -0.225790 -0.227651 -0.246590 0.695063 -[Debug] dit_step35_vt: [2170, 64] first4: -0.824555 -1.344921 -0.065901 0.128571 -[Debug] dit_step35_xt: [2170, 64] first4: 0.555513 1.537037 -0.148618 0.155234 -[DiT] step 36/50 t=0.300 -[Debug] dit_step36_vt_cond: [2170, 64] first4: -0.388908 -0.645821 -0.189103 0.531176 -[Debug] dit_step36_vt_uncond: [2170, 64] first4: -0.231184 -0.367522 -0.234059 0.657502 -[Debug] dit_step36_vt: [2170, 64] first4: -0.864100 -1.422112 -0.041499 0.066854 -[Debug] dit_step36_xt: [2170, 64] first4: 0.572795 1.565479 -0.147788 0.153897 -[DiT] step 37/50 t=0.280 -[Debug] 
dit_step37_vt_cond: [2170, 64] first4: -0.403025 -0.763416 -0.163423 0.484469 -[Debug] dit_step37_vt_uncond: [2170, 64] first4: -0.241143 -0.498980 -0.214114 0.616995 -[Debug] dit_step37_vt: [2170, 64] first4: -0.892219 -1.488433 0.005198 -0.010545 -[Debug] dit_step37_xt: [2170, 64] first4: 0.590639 1.595248 -0.147892 0.154108 -[DiT] step 38/50 t=0.260 -[Debug] dit_step38_vt_cond: [2170, 64] first4: -0.413133 -0.876420 -0.134134 0.440490 -[Debug] dit_step38_vt_uncond: [2170, 64] first4: -0.251082 -0.625665 -0.187976 0.573381 -[Debug] dit_step38_vt: [2170, 64] first4: -0.908542 -1.561219 0.031824 -0.045067 -[Debug] dit_step38_xt: [2170, 64] first4: 0.608810 1.626472 -0.148528 0.155009 -[DiT] step 39/50 t=0.240 -[Debug] dit_step39_vt_cond: [2170, 64] first4: -0.423588 -0.987294 -0.096889 0.393114 -[Debug] dit_step39_vt_uncond: [2170, 64] first4: -0.265262 -0.752590 -0.154346 0.525557 -[Debug] dit_step39_vt: [2170, 64] first4: -0.912287 -1.609328 0.086172 -0.106501 -[Debug] dit_step39_xt: [2170, 64] first4: 0.627056 1.658659 -0.150252 0.157140 -[DiT] step 40/50 t=0.220 -[Debug] dit_step40_vt_cond: [2170, 64] first4: -0.432563 -1.079476 -0.052369 0.343707 -[Debug] dit_step40_vt_uncond: [2170, 64] first4: -0.280351 -0.859098 -0.115334 0.472040 -[Debug] dit_step40_vt: [2170, 64] first4: -0.898876 -1.653373 0.143320 -0.117817 -[Debug] dit_step40_xt: [2170, 64] first4: 0.645033 1.691726 -0.153118 0.159496 -[DiT] step 41/50 t=0.200 -[Debug] dit_step41_vt_cond: [2170, 64] first4: -0.437202 -1.169207 -0.002996 0.295201 -[Debug] dit_step41_vt_uncond: [2170, 64] first4: -0.297226 -0.965837 -0.069953 0.414322 -[Debug] dit_step41_vt: [2170, 64] first4: -0.848467 -1.668746 0.201099 -0.128392 -[Debug] dit_step41_xt: [2170, 64] first4: 0.662003 1.725101 -0.157140 0.162064 -[DiT] step 42/50 t=0.180 -[Debug] dit_step42_vt_cond: [2170, 64] first4: -0.440767 -1.252195 0.052645 0.236928 -[Debug] dit_step42_vt_uncond: [2170, 64] first4: -0.315933 -1.066464 -0.015775 0.349839 -[Debug] 
dit_step42_vt: [2170, 64] first4: -0.805727 -1.702440 0.256392 -0.165544 -[Debug] dit_step42_xt: [2170, 64] first4: 0.678117 1.759150 -0.162268 0.165375 -[DiT] step 43/50 t=0.160 -[Debug] dit_step43_vt_cond: [2170, 64] first4: -0.441682 -1.322971 0.111570 0.178405 -[Debug] dit_step43_vt_uncond: [2170, 64] first4: -0.332472 -1.154346 0.037873 0.279796 -[Debug] dit_step43_vt: [2170, 64] first4: -0.752842 -1.718463 0.345657 -0.158267 -[Debug] dit_step43_xt: [2170, 64] first4: 0.693174 1.793519 -0.169181 0.168540 -[DiT] step 44/50 t=0.140 -[Debug] dit_step44_vt_cond: [2170, 64] first4: -0.436481 -1.389212 0.170327 0.116380 -[Debug] dit_step44_vt_uncond: [2170, 64] first4: -0.347603 -1.238272 0.093730 0.210065 -[Debug] dit_step44_vt: [2170, 64] first4: -0.673781 -1.742915 0.402893 -0.211575 -[Debug] dit_step44_xt: [2170, 64] first4: 0.706650 1.828377 -0.177239 0.172771 -[DiT] step 45/50 t=0.120 -[Debug] dit_step45_vt_cond: [2170, 64] first4: -0.432089 -1.448218 0.222042 0.052259 -[Debug] dit_step45_vt_uncond: [2170, 64] first4: -0.365667 -1.310638 0.145240 0.138891 -[Debug] dit_step45_vt: [2170, 64] first4: -0.598716 -1.780947 0.463354 -0.230193 -[Debug] dit_step45_xt: [2170, 64] first4: 0.718624 1.863996 -0.186506 0.177375 -[DiT] step 46/50 t=0.100 -[Debug] dit_step46_vt_cond: [2170, 64] first4: -0.419096 -1.509236 0.267908 -0.006720 -[Debug] dit_step46_vt_uncond: [2170, 64] first4: -0.381439 -1.387897 0.192344 0.071553 -[Debug] dit_step46_vt: [2170, 64] first4: -0.477449 -1.801432 0.493477 -0.266383 -[Debug] dit_step46_xt: [2170, 64] first4: 0.728173 1.900025 -0.196375 0.182703 -[DiT] step 47/50 t=0.080 -[Debug] dit_step47_vt_cond: [2170, 64] first4: -0.407120 -1.565253 0.302819 -0.051005 -[Debug] dit_step47_vt_uncond: [2170, 64] first4: -0.401163 -1.460867 0.230993 0.012604 -[Debug] dit_step47_vt: [2170, 64] first4: -0.369522 -1.829498 0.516088 -0.236691 -[Debug] dit_step47_xt: [2170, 64] first4: 0.735563 1.936615 -0.206697 0.187437 -[DiT] step 48/50 t=0.060 -[Debug] 
dit_step48_vt_cond: [2170, 64] first4: -0.382883 -1.607141 0.333374 -0.083609 -[Debug] dit_step48_vt_uncond: [2170, 64] first4: -0.407261 -1.531759 0.269964 -0.041764 -[Debug] dit_step48_vt: [2170, 64] first4: -0.250388 -1.766134 0.508857 -0.194591 -[Debug] dit_step48_xt: [2170, 64] first4: 0.740571 1.971938 -0.216874 0.191329 -[DiT] step 49/50 t=0.040 -[Debug] dit_step49_vt_cond: [2170, 64] first4: -0.416988 -1.643981 0.337042 -0.115695 -[Debug] dit_step49_vt_uncond: [2170, 64] first4: -0.434090 -1.549805 0.279877 -0.060649 -[Debug] dit_step49_vt: [2170, 64] first4: -0.398854 -1.970749 0.508508 -0.360412 -[Debug] dit_x0: [2170, 64] first4: 0.748548 2.011353 -0.227044 0.198537 -[DiT] step 50/50 t=0.020 -[DiT] Total generation: 99823.1 ms (99823.1 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.748548 2.011353 -0.227044 0.198537 -[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 474 nodes, T_latent=192 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' -ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x12060b7b0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x1206126e0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x120612940 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x120612fd0 | th_max = 1024 | th_width = 32 
-ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x120613410 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x120613a10 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32 0x120613f70 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x120614e50 | th_max = 1024 | th_width = 32 -[VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 474 nodes, T_latent=256 -[VAE] Graph: 474 nodes, T_latent=186 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x10600ac10 | th_max = 896 | th_width = 32 -[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 609612.5 ms -[Debug] vae_audio: [2, 4166400] first4: -0.003173 -0.003180 -0.003117 -0.002677 -[VAE Batch0] Wrote ggml-sft/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Request 1/1] Done -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Pipeline] All done -[Request] Loaded request0.json -[Noise] Reusing existing rng_philox_seed42.bf16 -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf -[GGML] Running acestep-v15-turbo-Q6_K.gguf... 
-[GGML] Done, 47 dump files -[Turbo] Reusing existing Python dumps: python-turbo -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999813 - lyric_embed 1.000000 - enc_hidden 0.999631 - detok_output 0.999969 - context 0.999982 - noise 1.000000 - temb_t 0.999991 - hidden_after_proj_in 0.999987 - enc_after_cond_emb 0.999648 - layer0_sa_output 0.999791 - hidden_after_layer0 0.999898 - hidden_after_layer6 0.999877 - hidden_after_layer12 0.998721 - hidden_after_layer18 0.995721 - hidden_after_layer23 0.992012 - dit_step0_vt 0.970006 - dit_step0_xt 0.999934 - dit_step1_vt 0.973568 - dit_step1_xt 0.999795 - dit_step2_vt 0.976942 - dit_step2_xt 0.999458 - dit_step3_vt 0.977714 - dit_step3_xt 0.998700 - dit_step4_vt 0.976433 - dit_step4_xt 0.997003 - dit_step5_vt 0.973498 - dit_step5_xt 0.993187 - dit_step6_vt 0.970259 - dit_step6_xt 0.985910 - dit_step7_vt 0.963169 - dit_x0 0.975098 - vae_audio 0.894235 - vae_audio (log spectral) 0.999805 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999933 0.149343 0.007343 -0.002290 0.972926 -0.002342 0.972003 - dit_step1_xt 0.999794 0.293469 0.012678 -0.005225 0.942670 -0.005313 0.941730 - dit_step2_xt 0.999456 0.477608 0.019544 -0.009177 0.909085 -0.009311 0.908527 - dit_step3_xt 0.998699 0.734476 0.028962 -0.014472 0.873547 -0.014577 0.873624 - dit_step4_xt 0.997001 1.052176 0.042099 -0.021523 0.841546 -0.021660 0.841995 - dit_step5_xt 0.993185 1.523836 0.061805 -0.031603 0.824654 -0.032109 0.824593 - dit_step6_xt 0.985908 2.172513 0.091680 -0.045910 0.855362 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q6_K.gguf -[GGML] Running acestep-v15-sft-Q6_K.gguf... 
-[GGML] Done, 233 dump files -[SFT] Reusing existing Python dumps: python-sft -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999813 - lyric_embed 1.000000 - enc_hidden 0.999631 - detok_output 0.999969 - context 0.999982 - noise 1.000000 - temb_t 0.999973 - hidden_after_proj_in 0.999987 - enc_after_cond_emb 0.999652 - layer0_sa_output 0.999803 - hidden_after_layer0 0.999920 - hidden_after_layer6 0.999785 - hidden_after_layer12 0.999350 - hidden_after_layer18 0.998528 - hidden_after_layer23 0.998828 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998729 - dit_step0_vt_uncond 0.998412 - dit_step0_vt 0.995061 - dit_step0_xt 0.999998 - dit_step5_vt_cond 0.999147 - dit_step5_vt 0.992746 - dit_step5_xt 0.999953 - dit_step10_vt_cond 0.997986 - dit_step10_vt 0.991731 - dit_step10_xt 0.999832 - dit_step15_vt_cond 0.995896 - dit_step15_vt 0.984377 - dit_step15_xt 0.999476 - dit_step20_vt_cond 0.992581 - dit_step20_vt 0.975428 - dit_step20_xt 0.998605 - dit_step25_vt_cond 0.986288 - dit_step25_vt 0.963191 - dit_step25_xt 0.996899 - dit_step30_vt_cond 0.979478 - dit_step30_vt 0.956129 - dit_step30_xt 0.994252 - dit_step35_vt_cond 0.972963 - dit_step35_vt 0.947489 - dit_step35_xt 0.991078 - dit_step40_vt_cond 0.968903 - dit_step40_vt 0.939482 - dit_step40_xt 0.987990 - dit_step45_vt_cond 0.973091 - dit_step45_vt 0.949768 - dit_step45_xt 0.985825 - dit_step49_vt_cond 0.979346 - dit_step49_vt 0.959720 - dit_x0 0.985104 - vae_audio 0.940564 - vae_audio (log spectral) 0.999648 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999996 0.039016 0.002154 -0.001750 0.980178 -0.001741 0.980402 - dit_step5_xt 0.999952 0.136674 0.006709 -0.006940 0.889822 -0.007143 0.887999 - dit_step10_xt 0.999831 0.203842 0.011045 -0.012357 0.811533 -0.012603 0.811299 - dit_step15_xt 0.999475 0.335757 0.017566 -0.017603 0.746439 -0.018114 0.745269 - dit_step20_xt 0.998605 0.555654 
0.026541 -0.022932 0.700822 -0.023808 0.699582 - dit_step25_xt 0.996899 0.830926 0.037973 -0.028358 0.679564 -0.029311 0.679278 - dit_step30_xt 0.994252 1.135793 0.051746 -0.033803 0.685565 -0.035027 0.685262 - dit_step35_xt 0.991078 1.467212 0.067373 -0.039173 0.717556 -0.040716 0.717196 - dit_step40_xt 0.987990 1.880554 0.084328 -0.044527 0.771174 -0.046462 0.771853 - dit_step45_xt 0.985824 2.238589 0.100473 -0.050335 0.842316 -0.052475 0.843036 diff --git a/tests/Metal_Q8_0.log b/tests/Metal_Q8_0.log deleted file mode 100644 index 3ddbb85..0000000 --- a/tests/Metal_Q8_0.log +++ /dev/null @@ -1,823 +0,0 @@ -ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices -ggml_metal_library_init: using embedded metal library -ggml_metal_library_init: loaded in 0.006 sec -ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s) -ggml_metal_device_init: GPU name: MTL0 -ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) -ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) -ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) -ggml_metal_device_init: simdgroup reduction = true -ggml_metal_device_init: simdgroup matrix mul. 
= true -ggml_metal_device_init: has unified memory = true -ggml_metal_device_init: has bfloat = true -ggml_metal_device_init: has tensor = false -ggml_metal_device_init: use residency sets = true -ggml_metal_device_init: use shared buffers = true -ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] DiT backend: MTL0 (CPU threads: 5) -[Load] Backend init: 21.3 ms -[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 -[DiT] MLP: gate+up fused -[Load] null_condition_emb found (CFG available) -[WeightCtx] Loaded 478 tensors, 1600.7 MB into backend -[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 1779.3 ms -[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 -[Load] silence_latent: [15000, 64] from GGUF -[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] VAE backend: MTL0 (CPU threads: 5) -[VAE] Backend: MTL0, Weight buffer: 255.7 MB -[VAE] Loaded: 5 blocks, upsample=1920x -[Load] VAE weights: 272.0 ms -[Request 1/1] ggml-turbo/request0.json (batch=1) -[Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) -[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s -[Pipeline] 434 audio codes (86.8s @ 5Hz) -[Pipeline] T=2170, S=1085 -[BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 41.5 ms -[Pipeline] caption: 70 tokens, lyrics: 167 tokens 
-ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] TextEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 28L, H=1024, Nh=16/8 -[Load] TextEncoder: 228.7 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' -ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x14170b900 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x14170bd60 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x14170c5d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' -ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x14170ca50 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x14170d2c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded 
kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x14170d8f0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x14170e1d0 | th_max = 576 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x14170e5e0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' -ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x14170e840 | th_max = 1024 | th_width = 32 -[Encode] TextEncoder (70 tokens): 44.9 ms -[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 34.4 ms -[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] CondEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 -[WeightCtx] Loaded 140 tensors, 616.6 MB into backend -[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 787.4 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q8_0_f32', name = 
'kernel_mul_mm_q8_0_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q8_0_f32_bci=0_bco=1 0x141718bd0 | th_max = 896 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x141719010 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x141719670 | th_max = 640 | th_width = 32 -[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 132.0 ms, enc_S=238 -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Debug] enc_hidden: [238, 2048] first4: 1.750375 -0.049236 -0.134516 0.059443 -[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 -[WeightCtx] Loaded 30 tensors, 106.5 MB into backend -[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 165.9 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x14160b720 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x14160c030 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q8_0_f32', name = 
'kernel_mul_mv_q8_0_f32_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q8_0_f32_nsg=4 0x14160c760 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' -ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x14160c9c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_q8_0_f32', name = 'kernel_cpy_q8_0_f32' -ggml_metal_library_compile_pipeline: loaded kernel_cpy_q8_0_f32 0x14160ce00 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q8_0_f32_r1_5', name = 'kernel_mul_mv_ext_q8_0_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q8_0_f32_r1_5_nsg=2_nxpsg=8 0x14160da60 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x14160dde0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x14160e040 | th_max = 448 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x14160ea80 | th_max = 1024 | th_width = 32 -[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 664.8 ms -[Debug] detok_output: 
[2170, 64] first4: -0.124953 1.437660 0.307949 -0.624704 -[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 -[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 -[Debug] context: [2170, 128] first4: -0.124953 1.437660 0.307949 -0.624704 -[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 -[DiT] Batch N=1, T=2170, S=1085, enc_S=238 -[DiT] Graph: 1841 nodes -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x14160f030 | th_max = 832 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' -ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x14160aa00 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' -ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x14160f9a0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x1416102f0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x1416109b0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x1416113c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 
'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x1416118b0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x141610670 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x141611dc0 | th_max = 640 | th_width = 32 -[Debug] tproj: [12288] first4: 0.260124 -0.161873 -0.097043 0.052039 -[Debug] temb: [2048] first4: 0.000130 -0.132501 -0.035452 0.064788 -[Debug] temb_t: [2048] first4: 0.001146 0.026826 -0.052770 0.063722 -[Debug] temb_r: [2048] first4: -0.001015 -0.159327 0.017318 0.001066 -[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 -[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 -[Debug] temb_lin1_t: [2048] first4: -0.048950 -0.051683 -0.015299 -0.038721 -[Debug] temb_lin1_r: [2048] first4: -0.013066 -0.018836 -0.015732 0.008463 -[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.039670 -0.968864 0.535370 0.447502 -[Debug] proj_in_input: [192, 2170] first4: -0.124953 1.437660 0.307949 -0.624704 -[Debug] enc_after_cond_emb: [2048, 238] first4: -0.166626 0.855863 0.327675 -0.524847 -[Debug] layer0_sa_input: [2048, 1085] first4: -0.719151 -0.764016 -0.047420 0.261850 -[Debug] layer0_q_after_rope: [128, 16] first4: -0.166626 0.855863 0.327675 -0.524847 -[Debug] layer0_k_after_rope: [128, 8] first4: -1.224136 -0.868663 -1.921617 -2.258156 -[Debug] layer0_sa_output: [2048, 1085] 
first4: -1.509784 0.173032 -0.350482 0.513236 -[Debug] layer0_attn_out: [2048, 1085] first4: -1.198180 -0.062361 -34.349155 -0.672093 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.543300 -1.045637 0.193163 0.457042 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.610051 -0.820831 -0.300355 0.492100 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.086482 0.559607 52.406876 -0.905369 -[Debug] hidden_after_layer6: [2048, 1085] first4: -21.433996 -0.091888 33.781910 -4.433238 -[Debug] hidden_after_layer12: [2048, 1085] first4: -15.201079 -18.070684 72.561172 28.713606 -[Debug] hidden_after_layer18: [2048, 1085] first4: -26.474438 14.961594 62.515419 20.237282 -[Debug] hidden_after_layer23: [2048, 1085] first4: -7.978052 44.256046 198.826355 145.129532 -[Debug] dit_step0_vt: [2170, 64] first4: 0.022187 1.144711 0.357881 2.375370 -[Debug] dit_step0_xt: [2170, 64] first4: 0.193327 2.104218 -0.188142 0.739685 -[DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: -0.218329 1.318116 -0.102946 1.902612 -[Debug] dit_step1_xt: [2170, 64] first4: 0.205236 2.032320 -0.182527 0.635906 -[DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: 0.013556 1.214101 0.135533 2.387155 -[Debug] dit_step2_xt: [2170, 64] first4: 0.204333 1.951380 -0.191563 0.476762 -[DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: 0.268002 1.073703 0.267121 2.643928 -[Debug] dit_step3_xt: [2170, 64] first4: 0.181999 1.861905 -0.213823 0.256435 -[DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.304715 1.021377 0.118111 2.720495 -[Debug] dit_step4_xt: [2170, 64] first4: 0.149351 1.752472 -0.226477 -0.035047 -[DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.279773 0.924189 -0.283976 2.780081 -[Debug] dit_step5_xt: [2170, 64] first4: 0.109384 1.620445 -0.185910 -0.432201 -[DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: 0.163348 0.641980 -0.841978 2.816087 -[Debug] dit_step6_xt: [2170, 64] first4: 
0.076714 1.492049 -0.017514 -0.995418 -[DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: -0.026257 0.197844 -1.519455 3.080479 -[Debug] dit_x0: [2170, 64] first4: 0.084591 1.432696 0.438323 -1.919562 -[DiT] step 8/8 t=0.300 -[DiT] Total generation: 7097.2 ms (7097.2 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.084591 1.432696 0.438323 -1.919562 -[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 474 nodes, T_latent=192 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' -ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x141717870 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x1417100a0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x1417089d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x141708e50 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x141719b10 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x14171a1d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling 
pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32 0x14171a730 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x14171b660 | th_max = 1024 | th_width = 32 -[VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 474 nodes, T_latent=256 -[VAE] Graph: 474 nodes, T_latent=186 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x141618f80 | th_max = 896 | th_width = 32 -[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 609553.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000462 0.000971 0.000803 0.001170 -[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Request 1/1] Done -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Pipeline] All done -ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices -ggml_metal_library_init: using embedded metal library -ggml_metal_library_init: loaded in 0.006 sec -ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s) -ggml_metal_device_init: GPU name: MTL0 -ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) -ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) -ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) -ggml_metal_device_init: simdgroup reduction = true -ggml_metal_device_init: simdgroup matrix mul. 
= true -ggml_metal_device_init: has unified memory = true -ggml_metal_device_init: has bfloat = true -ggml_metal_device_init: has tensor = false -ggml_metal_device_init: use residency sets = true -ggml_metal_device_init: use shared buffers = true -ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] DiT backend: MTL0 (CPU threads: 5) -[Load] Backend init: 20.2 ms -[GGUF] ../models/acestep-v15-sft-Q8_0.gguf: 678 tensors, data at offset 56800 -[DiT] MLP: gate+up fused -[Load] null_condition_emb found (CFG available) -[WeightCtx] Loaded 478 tensors, 1600.7 MB into backend -[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 2506.1 ms -[GGUF] ../models/acestep-v15-sft-Q8_0.gguf: 678 tensors, data at offset 56800 -[Load] silence_latent: [15000, 64] from GGUF -[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] VAE backend: MTL0 (CPU threads: 5) -[VAE] Backend: MTL0, Weight buffer: 255.7 MB -[VAE] Loaded: 5 blocks, upsample=1920x -[Load] VAE weights: 340.1 ms -[Request 1/1] ggml-sft/request0.json (batch=1) -[Request] parsed ggml-sft/request0.json (18 fields) -[Pipeline] seed=42, steps=50, guidance=7.0, shift=1.0, duration=88.0s -[Pipeline] 434 audio codes (86.8s @ 5Hz) -[Pipeline] T=2170, S=1085 -[BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 40.9 ms -[Pipeline] caption: 70 tokens, lyrics: 167 tokens -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: 
picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] TextEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 28L, H=1024, Nh=16/8 -[Load] TextEncoder: 238.6 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' -ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x13060e0d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x13060e830 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x13060f670 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' -ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x13060f8d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x130610350 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x1306108b0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: 
compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x130610b10 | th_max = 576 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x1306115a0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' -ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x130611800 | th_max = 1024 | th_width = 32 -[Encode] TextEncoder (70 tokens): 49.2 ms -[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 34.1 ms -[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] CondEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/acestep-v15-sft-Q8_0.gguf: 678 tensors, data at offset 56800 -[WeightCtx] Loaded 140 tensors, 616.6 MB into backend -[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 615.4 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q8_0_f32', name = 'kernel_mul_mm_q8_0_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q8_0_f32_bci=0_bco=1 0x130709710 | th_max = 
896 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x130709b90 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x13070a1c0 | th_max = 640 | th_width = 32 -[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 129.6 ms, enc_S=238 -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Debug] enc_hidden: [238, 2048] first4: 1.750375 -0.049236 -0.134516 0.059443 -[GGUF] ../models/acestep-v15-sft-Q8_0.gguf: 678 tensors, data at offset 56800 -[WeightCtx] Loaded 30 tensors, 106.5 MB into backend -[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 77.5 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x130708890 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x13070ab80 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q8_0_f32', name = 'kernel_mul_mv_q8_0_f32_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q8_0_f32_nsg=4 0x13070ade0 | th_max = 1024 | th_width = 32 
-ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' -ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x13070b260 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_q8_0_f32', name = 'kernel_cpy_q8_0_f32' -ggml_metal_library_compile_pipeline: loaded kernel_cpy_q8_0_f32 0x13070b970 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q8_0_f32_r1_5', name = 'kernel_mul_mv_ext_q8_0_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q8_0_f32_r1_5_nsg=2_nxpsg=8 0x13070c5e0 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x13070c840 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x13070cc50 | th_max = 448 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x13070d590 | th_max = 1024 | th_width = 32 -[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 663.7 ms -[Debug] detok_output: [2170, 64] first4: -0.124953 1.437660 0.307949 -0.624704 -[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 -[Debug] noise: [2170, 
64] first4: 0.194336 2.156250 -0.171875 0.847656 -[Debug] context: [2170, 128] first4: -0.124953 1.437660 0.307949 -0.624704 -[DiT] Starting: T=2170, S=1085, enc_S=238, steps=50, batch=1 -[DiT] Batch N=1, T=2170, S=1085, enc_S=238 -[DiT] Graph: 1841 nodes -[Debug] null_condition_emb: [2048] first4: 0.018066 -0.000360 0.005096 -0.000683 -[Debug] null_enc_hidden: [238, 2048] first4: 0.018066 -0.000360 0.005096 -0.000683 -[DiT] CFG enabled: guidance_scale=7.0, 2x forward per step, N=1 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x130612120 | th_max = 832 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' -ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x13061a740 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' -ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x13061af30 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x13061b880 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x13061bf40 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x13061c950 | th_max = 1024 | th_width = 32 
-ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x13061ce40 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x13061bc00 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x13061d350 | th_max = 640 | th_width = 32 -[Debug] tproj: [12288] first4: 0.154891 -0.116111 -0.086457 0.081949 -[Debug] temb: [2048] first4: -0.002756 -0.176432 0.004178 -0.001982 -[Debug] temb_t: [2048] first4: -0.001185 0.003330 -0.013113 -0.002073 -[Debug] temb_r: [2048] first4: -0.001571 -0.179762 0.017291 0.000091 -[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 -[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 -[Debug] temb_lin1_t: [2048] first4: -0.038370 0.029875 0.028026 -0.024772 -[Debug] temb_lin1_r: [2048] first4: 0.001863 0.003353 -0.000552 -0.000197 -[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.090297 -0.925707 0.497575 0.441158 -[Debug] proj_in_input: [192, 2170] first4: -0.124953 1.437660 0.307949 -0.624704 -[Debug] enc_after_cond_emb: [2048, 238] first4: -0.185352 0.911478 0.323017 -0.548477 -[Debug] layer0_sa_input: [2048, 1085] first4: -0.915448 -0.716620 -0.037252 0.294421 -[Debug] layer0_q_after_rope: [128, 16] first4: -0.185352 0.911478 0.323017 -0.548477 -[Debug] layer0_k_after_rope: [128, 8] first4: -1.352387 -0.993045 
-1.790654 -2.255961 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.646956 0.807681 -0.548860 0.541048 -[Debug] layer0_attn_out: [2048, 1085] first4: -1.095906 0.020644 -33.592499 -0.642108 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.803416 -1.383010 -0.122776 0.380531 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.967658 -1.044807 -0.403243 0.382801 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.409370 1.156843 57.423218 -1.617135 -[Debug] hidden_after_layer6: [2048, 1085] first4: -17.478519 4.365310 57.869217 -2.590640 -[Debug] hidden_after_layer12: [2048, 1085] first4: -10.105301 4.450487 -23.010748 -1.911694 -[Debug] hidden_after_layer18: [2048, 1085] first4: -3.554647 18.982800 -38.417198 -11.175929 -[Debug] hidden_after_layer23: [2048, 1085] first4: 34.303474 64.211922 58.212040 -9.279413 -[Debug] dit_step0_vt_cond: [2170, 64] first4: -0.583175 2.543502 -0.173930 1.461996 -[Debug] dit_step0_vt_uncond: [2170, 64] first4: -0.311259 2.094594 -0.691959 1.747500 -[Debug] dit_step0_vt: [2170, 64] first4: -0.765032 3.014946 0.101383 1.079777 -[Debug] dit_step0_xt: [2170, 64] first4: 0.209637 2.095951 -0.173903 0.826061 -[DiT] step 1/50 t=1.000 -[Debug] dit_step1_vt_cond: [2170, 64] first4: -0.635693 2.541408 0.005301 1.299802 -[Debug] dit_step1_vt_uncond: [2170, 64] first4: -0.555907 2.342016 -0.102693 1.478369 -[Debug] dit_step1_vt: [2170, 64] first4: -0.416332 2.156285 -0.189682 1.386406 -[Debug] dit_step1_xt: [2170, 64] first4: 0.217963 2.052825 -0.170109 0.798333 -[DiT] step 2/50 t=0.980 -[Debug] dit_step2_vt_cond: [2170, 64] first4: -0.628899 2.502596 0.041825 1.309368 -[Debug] dit_step2_vt_uncond: [2170, 64] first4: -0.545262 2.302532 -0.090023 1.423282 -[Debug] dit_step2_vt: [2170, 64] first4: -0.777333 2.936136 0.278430 1.015660 -[Debug] dit_step2_xt: [2170, 64] first4: 0.233510 1.994103 -0.175678 0.778019 -[DiT] step 3/50 t=0.960 -[Debug] dit_step3_vt_cond: [2170, 64] first4: -0.587814 2.422845 0.053595 1.321287 -[Debug] 
dit_step3_vt_uncond: [2170, 64] first4: -0.577692 2.332772 -0.037127 1.357753 -[Debug] dit_step3_vt: [2170, 64] first4: -0.357759 2.100169 -0.131042 1.500563 -[Debug] dit_step3_xt: [2170, 64] first4: 0.240665 1.952099 -0.173057 0.748008 -[DiT] step 4/50 t=0.940 -[Debug] dit_step4_vt_cond: [2170, 64] first4: -0.531520 2.346682 0.048538 1.319634 -[Debug] dit_step4_vt_uncond: [2170, 64] first4: -0.572059 2.299766 -0.047866 1.336576 -[Debug] dit_step4_vt: [2170, 64] first4: -0.562290 2.601383 0.287872 1.079526 -[Debug] dit_step4_xt: [2170, 64] first4: 0.251911 1.900071 -0.178814 0.726417 -[DiT] step 5/50 t=0.920 -[Debug] dit_step5_vt_cond: [2170, 64] first4: -0.471980 2.275284 0.016048 1.313642 -[Debug] dit_step5_vt_uncond: [2170, 64] first4: -0.581892 2.263602 -0.014694 1.345006 -[Debug] dit_step5_vt: [2170, 64] first4: -0.118539 1.956480 -0.197979 1.364062 -[Debug] dit_step5_xt: [2170, 64] first4: 0.254282 1.860942 -0.174855 0.699136 -[DiT] step 6/50 t=0.900 -[Debug] dit_step6_vt_cond: [2170, 64] first4: -0.438105 2.231154 0.001016 1.310053 -[Debug] dit_step6_vt_uncond: [2170, 64] first4: -0.544084 2.196794 -0.016910 1.357073 -[Debug] dit_step6_vt: [2170, 64] first4: -0.411409 2.467071 0.199489 1.062841 -[Debug] dit_step6_xt: [2170, 64] first4: 0.262510 1.811600 -0.178844 0.677879 -[DiT] step 7/50 t=0.880 -[Debug] dit_step7_vt_cond: [2170, 64] first4: -0.388773 2.150816 -0.023557 1.299900 -[Debug] dit_step7_vt_uncond: [2170, 64] first4: -0.488913 2.112037 -0.027399 1.344294 -[Debug] dit_step7_vt: [2170, 64] first4: -0.088803 1.961096 -0.200603 1.289382 -[Debug] dit_step7_xt: [2170, 64] first4: 0.264286 1.772379 -0.174832 0.652092 -[DiT] step 8/50 t=0.860 -[Debug] dit_step8_vt_cond: [2170, 64] first4: -0.350721 2.065278 -0.035497 1.282128 -[Debug] dit_step8_vt_uncond: [2170, 64] first4: -0.410401 2.018277 -0.041600 1.310604 -[Debug] dit_step8_vt: [2170, 64] first4: -0.374781 2.238836 0.127401 1.108719 -[Debug] dit_step8_xt: [2170, 64] first4: 0.271781 1.727602 
-0.177380 0.629917 -[DiT] step 9/50 t=0.840 -[Debug] dit_step9_vt_cond: [2170, 64] first4: -0.318524 1.978134 -0.051657 1.264737 -[Debug] dit_step9_vt_uncond: [2170, 64] first4: -0.341550 1.941071 -0.051996 1.283970 -[Debug] dit_step9_vt: [2170, 64] first4: -0.191845 1.828466 -0.186632 1.247382 -[Debug] dit_step9_xt: [2170, 64] first4: 0.275618 1.691033 -0.173648 0.604970 -[DiT] step 10/50 t=0.820 -[Debug] dit_step10_vt_cond: [2170, 64] first4: -0.295512 1.899216 -0.053004 1.254855 -[Debug] dit_step10_vt_uncond: [2170, 64] first4: -0.287151 1.875865 -0.046272 1.283265 -[Debug] dit_step10_vt: [2170, 64] first4: -0.389814 1.941252 0.040587 1.075441 -[Debug] dit_step10_xt: [2170, 64] first4: 0.283415 1.652207 -0.174460 0.583461 -[DiT] step 11/50 t=0.800 -[Debug] dit_step11_vt_cond: [2170, 64] first4: -0.275416 1.819571 -0.061343 1.243315 -[Debug] dit_step11_vt_uncond: [2170, 64] first4: -0.256136 1.821522 -0.037205 1.292460 -[Debug] dit_step11_vt: [2170, 64] first4: -0.238914 1.604097 -0.208856 1.109252 -[Debug] dit_step11_xt: [2170, 64] first4: 0.288193 1.620125 -0.170282 0.561276 -[DiT] step 12/50 t=0.780 -[Debug] dit_step12_vt_cond: [2170, 64] first4: -0.256205 1.750170 -0.060178 1.242222 -[Debug] dit_step12_vt_uncond: [2170, 64] first4: -0.227482 1.778021 -0.026911 1.317358 -[Debug] dit_step12_vt: [2170, 64] first4: -0.364171 1.624805 -0.047159 0.969305 -[Debug] dit_step12_xt: [2170, 64] first4: 0.295476 1.587629 -0.169339 0.541890 -[DiT] step 13/50 t=0.760 -[Debug] dit_step13_vt_cond: [2170, 64] first4: -0.235268 1.684496 -0.057297 1.240694 -[Debug] dit_step13_vt_uncond: [2170, 64] first4: -0.197700 1.726529 -0.035090 1.325942 -[Debug] dit_step13_vt: [2170, 64] first4: -0.273492 1.425781 -0.144172 1.021587 -[Debug] dit_step13_xt: [2170, 64] first4: 0.300946 1.559114 -0.166456 0.521458 -[DiT] step 14/50 t=0.740 -[Debug] dit_step14_vt_cond: [2170, 64] first4: -0.215348 1.630674 -0.040594 1.249328 -[Debug] dit_step14_vt_uncond: [2170, 64] first4: -0.174794 1.672403 
-0.046957 1.327406 -[Debug] dit_step14_vt: [2170, 64] first4: -0.334811 1.472660 0.055936 0.996336 -[Debug] dit_step14_xt: [2170, 64] first4: 0.307642 1.529660 -0.167575 0.501531 -[DiT] step 15/50 t=0.720 -[Debug] dit_step15_vt_cond: [2170, 64] first4: -0.197163 1.578918 -0.035665 1.254164 -[Debug] dit_step15_vt_uncond: [2170, 64] first4: -0.153038 1.617873 -0.066125 1.317649 -[Debug] dit_step15_vt: [2170, 64] first4: -0.277163 1.353290 0.013043 1.079428 -[Debug] dit_step15_xt: [2170, 64] first4: 0.313186 1.502595 -0.167835 0.479943 -[DiT] step 16/50 t=0.700 -[Debug] dit_step16_vt_cond: [2170, 64] first4: -0.178978 1.530298 -0.037333 1.260345 -[Debug] dit_step16_vt_uncond: [2170, 64] first4: -0.134279 1.564621 -0.087480 1.303262 -[Debug] dit_step16_vt: [2170, 64] first4: -0.306646 1.373590 0.150592 1.110865 -[Debug] dit_step16_xt: [2170, 64] first4: 0.319319 1.475123 -0.170847 0.457726 -[DiT] step 17/50 t=0.680 -[Debug] dit_step17_vt_cond: [2170, 64] first4: -0.160694 1.478322 -0.055287 1.256185 -[Debug] dit_step17_vt_uncond: [2170, 64] first4: -0.120116 1.510010 -0.115773 1.278657 -[Debug] dit_step17_vt: [2170, 64] first4: -0.246079 1.282344 0.092105 1.178406 -[Debug] dit_step17_xt: [2170, 64] first4: 0.324240 1.449476 -0.172689 0.434157 -[DiT] step 18/50 t=0.660 -[Debug] dit_step18_vt_cond: [2170, 64] first4: -0.143064 1.420482 -0.082808 1.247799 -[Debug] dit_step18_vt_uncond: [2170, 64] first4: -0.106988 1.460190 -0.151634 1.257033 -[Debug] dit_step18_vt: [2170, 64] first4: -0.255780 1.219606 0.175261 1.179049 -[Debug] dit_step18_xt: [2170, 64] first4: 0.329356 1.425084 -0.176195 0.410576 -[DiT] step 19/50 t=0.640 -[Debug] dit_step19_vt_cond: [2170, 64] first4: -0.128746 1.354206 -0.117595 1.227311 -[Debug] dit_step19_vt_uncond: [2170, 64] first4: -0.089307 1.410491 -0.191474 1.230714 -[Debug] dit_step19_vt: [2170, 64] first4: -0.249608 1.072434 0.109807 1.183548 -[Debug] dit_step19_xt: [2170, 64] first4: 0.334348 1.403635 -0.178391 0.386906 -[DiT] step 20/50 
t=0.620 -[Debug] dit_step20_vt_cond: [2170, 64] first4: -0.119003 1.272067 -0.154096 1.204746 -[Debug] dit_step20_vt_uncond: [2170, 64] first4: -0.074174 1.352716 -0.224701 1.203207 -[Debug] dit_step20_vt: [2170, 64] first4: -0.270581 0.922657 0.097471 1.173580 -[Debug] dit_step20_xt: [2170, 64] first4: 0.339760 1.385182 -0.180340 0.363434 -[DiT] step 21/50 t=0.600 -[Debug] dit_step21_vt_cond: [2170, 64] first4: -0.119830 1.180927 -0.187708 1.173581 -[Debug] dit_step21_vt_uncond: [2170, 64] first4: -0.067616 1.288429 -0.252699 1.168760 -[Debug] dit_step21_vt: [2170, 64] first4: -0.293490 0.743300 0.018199 1.149608 -[Debug] dit_step21_xt: [2170, 64] first4: 0.345629 1.370316 -0.180704 0.340442 -[DiT] step 22/50 t=0.580 -[Debug] dit_step22_vt_cond: [2170, 64] first4: -0.122855 1.082593 -0.215883 1.140040 -[Debug] dit_step22_vt_uncond: [2170, 64] first4: -0.066099 1.222055 -0.270495 1.136248 -[Debug] dit_step22_vt: [2170, 64] first4: -0.307841 0.532235 -0.036651 1.101102 -[Debug] dit_step22_xt: [2170, 64] first4: 0.351786 1.359671 -0.179971 0.318420 -[DiT] step 23/50 t=0.560 -[Debug] dit_step23_vt_cond: [2170, 64] first4: -0.132629 0.986949 -0.241285 1.111620 -[Debug] dit_step23_vt_uncond: [2170, 64] first4: -0.070734 1.155219 -0.283975 1.102751 -[Debug] dit_step23_vt: [2170, 64] first4: -0.338930 0.377693 -0.112170 1.114765 -[Debug] dit_step23_xt: [2170, 64] first4: 0.358565 1.352118 -0.177728 0.296124 -[DiT] step 24/50 t=0.540 -[Debug] dit_step24_vt_cond: [2170, 64] first4: -0.144419 0.884916 -0.261786 1.077981 -[Debug] dit_step24_vt_uncond: [2170, 64] first4: -0.082416 1.084641 -0.291092 1.068368 -[Debug] dit_step24_vt: [2170, 64] first4: -0.333728 0.169552 -0.174635 1.048605 -[Debug] dit_step24_xt: [2170, 64] first4: 0.365239 1.348727 -0.174235 0.275152 -[DiT] step 25/50 t=0.520 -[Debug] dit_step25_vt_cond: [2170, 64] first4: -0.162952 0.778074 -0.275456 1.049087 -[Debug] dit_step25_vt_uncond: [2170, 64] first4: -0.097955 1.005611 -0.292343 1.037690 -[Debug] 
dit_step25_vt: [2170, 64] first4: -0.381506 0.021539 -0.229765 1.053622 -[Debug] dit_step25_xt: [2170, 64] first4: 0.372869 1.348296 -0.169640 0.254080 -[DiT] step 26/50 t=0.500 -[Debug] dit_step26_vt_cond: [2170, 64] first4: -0.181652 0.659926 -0.286432 1.013943 -[Debug] dit_step26_vt_uncond: [2170, 64] first4: -0.115043 0.914795 -0.293755 1.005103 -[Debug] dit_step26_vt: [2170, 64] first4: -0.384189 -0.243706 -0.263819 0.974856 -[Debug] dit_step26_xt: [2170, 64] first4: 0.380553 1.353170 -0.164363 0.234583 -[DiT] step 27/50 t=0.480 -[Debug] dit_step27_vt_cond: [2170, 64] first4: -0.201740 0.544023 -0.293109 0.975384 -[Debug] dit_step27_vt_uncond: [2170, 64] first4: -0.133396 0.820908 -0.294792 0.973169 -[Debug] dit_step27_vt: [2170, 64] first4: -0.422354 -0.384602 -0.284394 0.929557 -[Debug] dit_step27_xt: [2170, 64] first4: 0.389000 1.360862 -0.158675 0.215992 -[DiT] step 28/50 t=0.460 -[Debug] dit_step28_vt_cond: [2170, 64] first4: -0.224075 0.417509 -0.297748 0.929072 -[Debug] dit_step28_vt_uncond: [2170, 64] first4: -0.152221 0.713812 -0.298235 0.936752 -[Debug] dit_step28_vt: [2170, 64] first4: -0.444730 -0.577905 -0.283219 0.830288 -[Debug] dit_step28_xt: [2170, 64] first4: 0.397895 1.372420 -0.153011 0.199386 -[DiT] step 29/50 t=0.440 -[Debug] dit_step29_vt_cond: [2170, 64] first4: -0.245692 0.290209 -0.302133 0.880189 -[Debug] dit_step29_vt_uncond: [2170, 64] first4: -0.168230 0.597160 -0.303812 0.900573 -[Debug] dit_step29_vt: [2170, 64] first4: -0.492967 -0.691559 -0.283257 0.750519 -[Debug] dit_step29_xt: [2170, 64] first4: 0.407754 1.386251 -0.147346 0.184375 -[DiT] step 30/50 t=0.420 -[Debug] dit_step30_vt_cond: [2170, 64] first4: -0.267059 0.157186 -0.303854 0.831628 -[Debug] dit_step30_vt_uncond: [2170, 64] first4: -0.182529 0.468759 -0.310389 0.867764 -[Debug] dit_step30_vt: [2170, 64] first4: -0.526538 -0.832075 -0.259881 0.638692 -[Debug] dit_step30_xt: [2170, 64] first4: 0.418285 1.402893 -0.142148 0.171602 -[DiT] step 31/50 t=0.400 -[Debug] 
dit_step31_vt_cond: [2170, 64] first4: -0.287749 0.019218 -0.305370 0.782136 -[Debug] dit_step31_vt_uncond: [2170, 64] first4: -0.194848 0.329909 -0.318059 0.834346 -[Debug] dit_step31_vt: [2170, 64] first4: -0.575041 -0.931991 -0.245392 0.549215 -[Debug] dit_step31_xt: [2170, 64] first4: 0.429786 1.421533 -0.137240 0.160617 -[DiT] step 32/50 t=0.380 -[Debug] dit_step32_vt_cond: [2170, 64] first4: -0.304426 -0.117694 -0.305028 0.730304 -[Debug] dit_step32_vt_uncond: [2170, 64] first4: -0.204242 0.187789 -0.324302 0.800084 -[Debug] dit_step32_vt: [2170, 64] first4: -0.600474 -1.036970 -0.221470 0.433362 -[Debug] dit_step32_xt: [2170, 64] first4: 0.441795 1.442272 -0.132811 0.151950 -[DiT] step 33/50 t=0.360 -[Debug] dit_step33_vt_cond: [2170, 64] first4: -0.319238 -0.254083 -0.299488 0.682753 -[Debug] dit_step33_vt_uncond: [2170, 64] first4: -0.210244 0.042433 -0.326321 0.765284 -[Debug] dit_step33_vt: [2170, 64] first4: -0.649450 -1.121861 -0.194288 0.366186 -[Debug] dit_step33_xt: [2170, 64] first4: 0.454784 1.464709 -0.128925 0.144626 -[DiT] step 34/50 t=0.340 -[Debug] dit_step34_vt_cond: [2170, 64] first4: -0.329936 -0.381831 -0.294650 0.641700 -[Debug] dit_step34_vt_uncond: [2170, 64] first4: -0.211479 -0.098170 -0.329415 0.733958 -[Debug] dit_step34_vt: [2170, 64] first4: -0.683049 -1.194662 -0.165597 0.286680 -[Debug] dit_step34_xt: [2170, 64] first4: 0.468445 1.488602 -0.125613 0.138893 -[DiT] step 35/50 t=0.320 -[Debug] dit_step35_vt_cond: [2170, 64] first4: -0.339163 -0.510193 -0.285358 0.597418 -[Debug] dit_step35_vt_uncond: [2170, 64] first4: -0.213347 -0.237217 -0.326480 0.696784 -[Debug] dit_step35_vt: [2170, 64] first4: -0.716524 -1.285484 -0.142755 0.230145 -[Debug] dit_step35_xt: [2170, 64] first4: 0.482776 1.514312 -0.122758 0.134290 -[DiT] step 36/50 t=0.300 -[Debug] dit_step36_vt_cond: [2170, 64] first4: -0.344282 -0.639077 -0.274660 0.557109 -[Debug] dit_step36_vt_uncond: [2170, 64] first4: -0.213018 -0.378000 -0.321460 0.659633 -[Debug] 
dit_step36_vt: [2170, 64] first4: -0.737407 -1.359316 -0.114364 0.177555 -[Debug] dit_step36_xt: [2170, 64] first4: 0.497524 1.541498 -0.120471 0.130739 -[DiT] step 37/50 t=0.280 -[Debug] dit_step37_vt_cond: [2170, 64] first4: -0.350530 -0.754109 -0.251615 0.515208 -[Debug] dit_step37_vt_uncond: [2170, 64] first4: -0.215775 -0.505277 -0.306489 0.618527 -[Debug] dit_step37_vt: [2170, 64] first4: -0.758482 -1.436458 -0.064580 0.134758 -[Debug] dit_step37_xt: [2170, 64] first4: 0.512694 1.570228 -0.119179 0.128044 -[DiT] step 38/50 t=0.260 -[Debug] dit_step38_vt_cond: [2170, 64] first4: -0.351878 -0.862931 -0.225801 0.473748 -[Debug] dit_step38_vt_uncond: [2170, 64] first4: -0.216566 -0.629070 -0.288643 0.572433 -[Debug] dit_step38_vt: [2170, 64] first4: -0.762284 -1.485620 -0.020081 0.114390 -[Debug] dit_step38_xt: [2170, 64] first4: 0.527939 1.599940 -0.118778 0.125756 -[DiT] step 39/50 t=0.240 -[Debug] dit_step39_vt_cond: [2170, 64] first4: -0.353231 -0.974004 -0.188379 0.427506 -[Debug] dit_step39_vt_uncond: [2170, 64] first4: -0.220991 -0.753843 -0.259805 0.519853 -[Debug] dit_step39_vt: [2170, 64] first4: -0.758813 -1.561977 0.045217 0.080611 -[Debug] dit_step39_xt: [2170, 64] first4: 0.543115 1.631179 -0.119682 0.124144 -[DiT] step 40/50 t=0.220 -[Debug] dit_step40_vt_cond: [2170, 64] first4: -0.356234 -1.066793 -0.144915 0.378638 -[Debug] dit_step40_vt_uncond: [2170, 64] first4: -0.228754 -0.860472 -0.222172 0.464689 -[Debug] dit_step40_vt: [2170, 64] first4: -0.749040 -1.587260 0.097200 0.055798 -[Debug] dit_step40_xt: [2170, 64] first4: 0.558096 1.662925 -0.121626 0.123028 -[DiT] step 41/50 t=0.200 -[Debug] dit_step41_vt_cond: [2170, 64] first4: -0.355270 -1.157881 -0.092032 0.327957 -[Debug] dit_step41_vt_uncond: [2170, 64] first4: -0.236702 -0.967551 -0.178051 0.403448 -[Debug] dit_step41_vt: [2170, 64] first4: -0.709683 -1.623054 0.190026 0.058188 -[Debug] dit_step41_xt: [2170, 64] first4: 0.572290 1.695386 -0.125427 0.121864 -[DiT] step 42/50 t=0.180 
-[Debug] dit_step42_vt_cond: [2170, 64] first4: -0.352479 -1.244785 -0.033309 0.267733 -[Debug] dit_step42_vt_uncond: [2170, 64] first4: -0.244508 -1.070808 -0.121544 0.336131 -[Debug] dit_step42_vt: [2170, 64] first4: -0.676820 -1.651634 0.233357 0.014431 -[Debug] dit_step42_xt: [2170, 64] first4: 0.585826 1.728418 -0.130094 0.121575 -[DiT] step 43/50 t=0.160 -[Debug] dit_step43_vt_cond: [2170, 64] first4: -0.345241 -1.321976 0.026233 0.204406 -[Debug] dit_step43_vt_uncond: [2170, 64] first4: -0.251274 -1.163180 -0.061907 0.264783 -[Debug] dit_step43_vt: [2170, 64] first4: -0.615402 -1.690314 0.311200 0.000887 -[Debug] dit_step43_xt: [2170, 64] first4: 0.598134 1.762225 -0.136318 0.121558 -[DiT] step 44/50 t=0.140 -[Debug] dit_step44_vt_cond: [2170, 64] first4: -0.330899 -1.390417 0.088632 0.138209 -[Debug] dit_step44_vt_uncond: [2170, 64] first4: -0.253711 -1.249830 -0.000380 0.191808 -[Debug] dit_step44_vt: [2170, 64] first4: -0.551020 -1.700038 0.375316 -0.049211 -[Debug] dit_step44_xt: [2170, 64] first4: 0.609155 1.796225 -0.143824 0.122542 -[DiT] step 45/50 t=0.120 -[Debug] dit_step45_vt_cond: [2170, 64] first4: -0.315962 -1.445453 0.152387 0.071900 -[Debug] dit_step45_vt_uncond: [2170, 64] first4: -0.260821 -1.326647 0.053483 0.118309 -[Debug] dit_step45_vt: [2170, 64] first4: -0.444048 -1.697294 0.502791 -0.074117 -[Debug] dit_step45_xt: [2170, 64] first4: 0.618036 1.830171 -0.153880 0.124024 -[DiT] step 46/50 t=0.100 -[Debug] dit_step46_vt_cond: [2170, 64] first4: -0.294778 -1.496297 0.212962 0.010717 -[Debug] dit_step46_vt_uncond: [2170, 64] first4: -0.258755 -1.395077 0.130780 0.026842 -[Debug] dit_step46_vt: [2170, 64] first4: -0.331878 -1.748994 0.398840 0.016598 -[Debug] dit_step46_xt: [2170, 64] first4: 0.624673 1.865151 -0.161857 0.123692 -[DiT] step 47/50 t=0.080 -[Debug] dit_step47_vt_cond: [2170, 64] first4: -0.281226 -1.541478 0.262625 -0.022201 -[Debug] dit_step47_vt_uncond: [2170, 64] first4: -0.279224 -1.453849 0.178946 -0.018997 -[Debug] 
dit_step47_vt: [2170, 64] first4: -0.212368 -1.759161 0.579829 -0.049871 -[Debug] dit_step47_xt: [2170, 64] first4: 0.628921 1.900334 -0.173453 0.124690 -[DiT] step 48/50 t=0.060 -[Debug] dit_step48_vt_cond: [2170, 64] first4: -0.256958 -1.573266 0.310890 -0.048733 -[Debug] dit_step48_vt_uncond: [2170, 64] first4: -0.281065 -1.517397 0.260529 -0.069026 -[Debug] dit_step48_vt: [2170, 64] first4: -0.191601 -1.679712 0.330919 0.028046 -[Debug] dit_step48_xt: [2170, 64] first4: 0.632753 1.933929 -0.180072 0.124129 -[DiT] step 49/50 t=0.040 -[Debug] dit_step49_vt_cond: [2170, 64] first4: -0.282571 -1.604237 0.314485 -0.067221 -[Debug] dit_step49_vt_uncond: [2170, 64] first4: -0.305600 -1.528070 0.251822 -0.083237 -[Debug] dit_step49_vt: [2170, 64] first4: -0.212302 -1.898327 0.637213 -0.078416 -[Debug] dit_x0: [2170, 64] first4: 0.636999 1.971895 -0.192816 0.125697 -[DiT] step 50/50 t=0.020 -[DiT] Total generation: 88329.8 ms (88329.8 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.636999 1.971895 -0.192816 0.125697 -[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 474 nodes, T_latent=192 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' -ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x13070d7f0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x13070e360 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x13070e5c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 
'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x13070eb20 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x13070eea0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x13070f4a0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32 0x13070f8c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x130710c50 | th_max = 1024 | th_width = 32 -[VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 474 nodes, T_latent=256 -[VAE] Graph: 474 nodes, T_latent=186 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x130710eb0 | th_max = 896 | th_width = 32 -[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 609578.6 ms -[Debug] vae_audio: [2, 4166400] first4: -0.002759 -0.002685 -0.002611 -0.002214 -[VAE Batch0] Wrote ggml-sft/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Request 1/1] Done -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Pipeline] All done -[Request] Loaded request0.json -[Noise] Reusing existing rng_philox_seed42.bf16 -[Turbo] steps=8, shift=3.0 | 
acestep-v15-turbo-Q8_0.gguf -[GGML] Running acestep-v15-turbo-Q8_0.gguf... -[GGML] Done, 47 dump files -[Turbo] Reusing existing Python dumps: python-turbo -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999813 - lyric_embed 1.000000 - enc_hidden 0.999792 - detok_output 0.999991 - context 0.999996 - noise 1.000000 - temb_t 0.999998 - hidden_after_proj_in 0.999992 - enc_after_cond_emb 0.999776 - layer0_sa_output 0.999956 - hidden_after_layer0 0.999975 - hidden_after_layer6 0.999915 - hidden_after_layer12 0.999185 - hidden_after_layer18 0.996490 - hidden_after_layer23 0.993375 - dit_step0_vt 0.974442 - dit_step0_xt 0.999944 - dit_step1_vt 0.976442 - dit_step1_xt 0.999818 - dit_step2_vt 0.978398 - dit_step2_xt 0.999498 - dit_step3_vt 0.979729 - dit_step3_xt 0.998787 - dit_step4_vt 0.979038 - dit_step4_xt 0.997189 - dit_step5_vt 0.976705 - dit_step5_xt 0.993692 - dit_step6_vt 0.973710 - dit_step6_xt 0.987147 - dit_step7_vt 0.967471 - dit_x0 0.977589 - vae_audio 0.899969 - vae_audio (log spectral) 0.999797 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999943 0.142558 0.006797 -0.002331 0.972917 -0.002342 0.972003 - dit_step1_xt 0.999817 0.272611 0.011800 -0.005326 0.942672 -0.005313 0.941730 - dit_step2_xt 0.999496 0.463653 0.018437 -0.009355 0.909197 -0.009311 0.908527 - dit_step3_xt 0.998785 0.659420 0.027271 -0.014709 0.873849 -0.014577 0.873624 - dit_step4_xt 0.997188 0.977590 0.039587 -0.021771 0.842053 -0.021660 0.841995 - dit_step5_xt 0.993691 1.450203 0.057714 -0.031846 0.825442 -0.032109 0.824593 - dit_step6_xt 0.987145 2.144326 0.085260 -0.046128 0.856513 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q8_0.gguf -[GGML] Running acestep-v15-sft-Q8_0.gguf... 
-[GGML] Done, 233 dump files -[SFT] Reusing existing Python dumps: python-sft -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999813 - lyric_embed 1.000000 - enc_hidden 0.999792 - detok_output 0.999991 - context 0.999996 - noise 1.000000 - temb_t 0.999994 - hidden_after_proj_in 0.999993 - enc_after_cond_emb 0.999779 - layer0_sa_output 0.999948 - hidden_after_layer0 0.999975 - hidden_after_layer6 0.999842 - hidden_after_layer12 0.999467 - hidden_after_layer18 0.998721 - hidden_after_layer23 0.998987 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998936 - dit_step0_vt_uncond 0.998589 - dit_step0_vt 0.995617 - dit_step0_xt 0.999998 - dit_step5_vt_cond 0.999453 - dit_step5_vt 0.993749 - dit_step5_xt 0.999962 - dit_step10_vt_cond 0.998786 - dit_step10_vt 0.993632 - dit_step10_xt 0.999883 - dit_step15_vt_cond 0.996925 - dit_step15_vt 0.985179 - dit_step15_xt 0.999650 - dit_step20_vt_cond 0.993789 - dit_step20_vt 0.978156 - dit_step20_xt 0.998994 - dit_step25_vt_cond 0.988666 - dit_step25_vt 0.968588 - dit_step25_xt 0.997635 - dit_step30_vt_cond 0.983353 - dit_step30_vt 0.963692 - dit_step30_xt 0.995502 - dit_step35_vt_cond 0.978311 - dit_step35_vt 0.954994 - dit_step35_xt 0.992900 - dit_step40_vt_cond 0.975242 - dit_step40_vt 0.949054 - dit_step40_xt 0.990408 - dit_step45_vt_cond 0.977875 - dit_step45_vt 0.949872 - dit_step45_xt 0.988656 - dit_step49_vt_cond 0.980007 - dit_step49_vt 0.943555 - dit_x0 0.988056 - vae_audio 0.945079 - vae_audio (log spectral) 0.999659 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999997 0.038313 0.002069 -0.001710 0.980019 -0.001741 0.980402 - dit_step5_xt 0.999960 0.128136 0.005945 -0.006874 0.889005 -0.007143 0.887999 - dit_step10_xt 0.999882 0.212035 0.009258 -0.012273 0.810355 -0.012603 0.811299 - dit_step15_xt 0.999649 0.310457 0.014288 -0.017479 0.745211 -0.018114 0.745269 - dit_step20_xt 0.998994 0.579346 
0.021839 -0.022740 0.699641 -0.023808 0.699582 - dit_step25_xt 0.997635 0.874619 0.031657 -0.028120 0.678310 -0.029311 0.679278 - dit_step30_xt 0.995501 1.140020 0.043494 -0.033543 0.684534 -0.035027 0.685262 - dit_step35_xt 0.992900 1.761304 0.057050 -0.038898 0.716898 -0.040716 0.717196 - dit_step40_xt 0.990407 2.128224 0.071732 -0.044231 0.770985 -0.046462 0.771853 - dit_step45_xt 0.988655 2.420490 0.085663 -0.050087 0.842526 -0.052475 0.843036 diff --git a/tests/Q4_K_M.log b/tests/Q4_K_M.log deleted file mode 100644 index d76238c..0000000 --- a/tests/Q4_K_M.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf -[GGML] Running acestep-v15-turbo-Q4_K_M.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999805 - lyric_embed 1.000000 - enc_hidden 0.997032 - detok_output 0.999610 - context 0.999750 - noise 1.000000 - temb_t 0.999902 - hidden_after_proj_in 0.999908 - enc_after_cond_emb 0.997517 - layer0_sa_output 0.998371 - hidden_after_layer0 0.999675 - hidden_after_layer6 0.999257 - hidden_after_layer12 0.995500 - hidden_after_layer18 0.991597 - hidden_after_layer23 0.985460 - dit_step0_vt 0.947383 - dit_step0_xt 0.999885 - dit_step1_vt 0.947784 - dit_step1_xt 0.999617 - dit_step2_vt 0.957305 - dit_step2_xt 0.999014 - dit_step3_vt 0.961931 - dit_step3_xt 0.997757 - dit_step4_vt 0.959773 - dit_step4_xt 0.994900 - dit_step5_vt 0.956611 - dit_step5_xt 0.988539 - dit_step6_vt 0.950669 - dit_step6_xt 0.976494 - dit_step7_vt 0.938658 - dit_x0 0.958725 - vae_audio 0.837763 - vae_audio (STFT cosine) 0.954448 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err 
mean_A std_A mean_B std_B - dit_step0_xt 0.999885 0.165835 0.010206 -0.002260 0.973133 -0.002342 0.972003 - dit_step1_xt 0.999617 0.269038 0.018058 -0.005119 0.943095 -0.005313 0.941730 - dit_step2_xt 0.999014 0.433553 0.027847 -0.009033 0.910111 -0.009311 0.908527 - dit_step3_xt 0.997757 0.593449 0.040253 -0.014301 0.875156 -0.014577 0.873624 - dit_step4_xt 0.994900 0.889597 0.058068 -0.021205 0.843622 -0.021660 0.841995 - dit_step5_xt 0.988539 1.371047 0.084767 -0.031100 0.827136 -0.032109 0.824593 - dit_step6_xt 0.976494 1.997185 0.125556 -0.045244 0.858177 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q4_K_M.gguf -[GGML] Running acestep-v15-sft-Q4_K_M.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999805 - lyric_embed 1.000000 - enc_hidden 0.997032 - detok_output 0.999610 - context 0.999750 - noise 1.000000 - temb_t 0.999669 - hidden_after_proj_in 0.999909 - enc_after_cond_emb 0.997507 - layer0_sa_output 0.998509 - hidden_after_layer0 0.999683 - hidden_after_layer6 0.999144 - hidden_after_layer12 0.997681 - hidden_after_layer18 0.996675 - hidden_after_layer23 0.996878 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.996752 - dit_step0_vt_uncond 0.996146 - dit_step0_vt 0.989964 - dit_step0_xt 0.999995 - dit_step5_vt_cond 0.995283 - dit_step5_vt 0.977862 - dit_step5_xt 0.999822 - dit_step10_vt_cond 0.991380 - dit_step10_vt 0.969437 - dit_step10_xt 0.999216 - dit_step15_vt_cond 0.982929 - dit_step15_vt 0.945354 - dit_step15_xt 0.997510 - dit_step20_vt_cond 0.968161 - dit_step20_vt 0.918017 - dit_step20_xt 0.993520 - dit_step25_vt_cond 0.951227 - dit_step25_vt 0.894209 - 
dit_step25_xt 0.986602 - dit_step30_vt_cond 0.931041 - dit_step30_vt 0.870642 - dit_step30_xt 0.976800 - dit_step35_vt_cond 0.910848 - dit_step35_vt 0.844696 - dit_step35_xt 0.965863 - dit_step40_vt_cond 0.899076 - dit_step40_vt 0.824961 - dit_step40_xt 0.956007 - dit_step45_vt_cond 0.909967 - dit_step45_vt 0.832581 - dit_step45_xt 0.949409 - dit_step49_vt_cond 0.928566 - dit_step49_vt 0.867519 - dit_x0 0.947240 - vae_audio 0.830949 - vae_audio (STFT cosine) 0.926924 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999995 0.037971 0.002890 -0.001839 0.980350 -0.001741 0.980402 - dit_step5_xt 0.999822 0.197493 0.013061 -0.007274 0.890479 -0.007143 0.887999 - dit_step10_xt 0.999216 0.534656 0.024200 -0.012937 0.812358 -0.012603 0.811299 - dit_step15_xt 0.997510 0.842267 0.038950 -0.018571 0.747602 -0.018114 0.745268 - dit_step20_xt 0.993520 1.160067 0.058410 -0.024329 0.702011 -0.023808 0.699582 - dit_step25_xt 0.986602 1.554590 0.081033 -0.030223 0.679448 -0.029311 0.679278 - dit_step30_xt 0.976800 1.927341 0.107204 -0.036251 0.683778 -0.035027 0.685262 - dit_step35_xt 0.965863 2.255865 0.136115 -0.042287 0.714074 -0.040716 0.717195 - dit_step40_xt 0.956007 2.590231 0.166595 -0.048296 0.766380 -0.046462 0.771853 - dit_step45_xt 0.949409 2.912931 0.195670 -0.054552 0.836735 -0.052475 0.843036 diff --git a/tests/Q5_K_M.log b/tests/Q5_K_M.log deleted file mode 100644 index 5989b97..0000000 --- a/tests/Q5_K_M.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf -[GGML] Running acestep-v15-turbo-Q5_K_M.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999805 - lyric_embed 1.000000 - enc_hidden 0.999038 - detok_output 0.999875 - context 0.999920 - noise 1.000000 - temb_t 0.999972 - hidden_after_proj_in 0.999960 - enc_after_cond_emb 0.999148 - layer0_sa_output 0.999386 - hidden_after_layer0 0.999829 - hidden_after_layer6 0.999741 - hidden_after_layer12 0.998654 - hidden_after_layer18 0.995432 - hidden_after_layer23 0.991374 - dit_step0_vt 0.968035 - dit_step0_xt 0.999930 - dit_step1_vt 0.971217 - dit_step1_xt 0.999785 - dit_step2_vt 0.970740 - dit_step2_xt 0.999391 - dit_step3_vt 0.973678 - dit_step3_xt 0.998557 - dit_step4_vt 0.972169 - dit_step4_xt 0.996665 - dit_step5_vt 0.967356 - dit_step5_xt 0.992218 - dit_step6_vt 0.962469 - dit_step6_xt 0.983446 - dit_step7_vt 0.953383 - dit_x0 0.970119 - vae_audio 0.883226 - vae_audio (STFT cosine) 0.968463 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999930 0.139407 0.007818 -0.002306 0.973025 -0.002342 0.972003 - dit_step1_xt 0.999785 0.264377 0.013418 -0.005299 0.942885 -0.005313 0.941730 - dit_step2_xt 0.999391 0.455966 0.021259 -0.009285 0.909477 -0.009311 0.908527 - dit_step3_xt 0.998557 0.657160 0.031461 -0.014661 0.874187 -0.014577 0.873624 - dit_step4_xt 0.996665 0.973354 0.045708 -0.021890 0.842366 -0.021660 0.841995 - dit_step5_xt 0.992218 1.446589 0.067697 -0.032248 0.825911 -0.032109 0.824593 - dit_step6_xt 0.983446 2.092730 0.101558 -0.046788 0.857148 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q5_K_M.gguf -[GGML] Running acestep-v15-sft-Q5_K_M.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999805 - lyric_embed 1.000000 - enc_hidden 0.999038 - detok_output 0.999875 - context 0.999920 - noise 1.000000 - temb_t 0.999900 - hidden_after_proj_in 0.999961 - enc_after_cond_emb 0.999149 - layer0_sa_output 0.999452 - hidden_after_layer0 0.999863 - hidden_after_layer6 0.999565 - hidden_after_layer12 0.998948 - hidden_after_layer18 0.997903 - hidden_after_layer23 0.998403 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998327 - dit_step0_vt_uncond 0.998326 - dit_step0_vt 0.994229 - dit_step0_xt 0.999997 - dit_step5_vt_cond 0.998595 - dit_step5_vt 0.987922 - dit_step5_xt 0.999930 - dit_step10_vt_cond 0.997050 - dit_step10_vt 0.985303 - dit_step10_xt 0.999749 - dit_step15_vt_cond 0.992839 - dit_step15_vt 0.969921 - dit_step15_xt 0.999178 - dit_step20_vt_cond 0.985993 - dit_step20_vt 0.954166 - dit_step20_xt 0.997691 - dit_step25_vt_cond 0.977103 - dit_step25_vt 0.938414 - dit_step25_xt 0.994921 - dit_step30_vt_cond 0.966556 - dit_step30_vt 0.922758 - dit_step30_xt 0.990726 - dit_step35_vt_cond 0.956566 - dit_step35_vt 0.906167 - dit_step35_xt 0.985856 - dit_step40_vt_cond 0.951093 - dit_step40_vt 0.892482 - dit_step40_xt 0.981314 - dit_step45_vt_cond 0.957449 - dit_step45_vt 0.895800 - dit_step45_xt 0.978161 - dit_step49_vt_cond 0.967216 - dit_step49_vt 0.914978 - dit_x0 0.977077 - vae_audio 0.891856 - vae_audio (STFT cosine) 0.946058 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999997 0.038463 0.002320 -0.001770 0.980102 -0.001741 0.980402 - dit_step5_xt 0.999930 0.110477 0.008105 -0.006919 0.889608 -0.007143 0.887999 - dit_step10_xt 0.999749 0.218324 0.013563 -0.012429 0.811137 -0.012603 0.811299 - dit_step15_xt 0.999178 
0.406292 0.021833 -0.017883 0.745846 -0.018114 0.745268 - dit_step20_xt 0.997691 0.617228 0.033331 -0.023467 0.699845 -0.023808 0.699582 - dit_step25_xt 0.994921 0.873662 0.047346 -0.029215 0.677264 -0.029311 0.679278 - dit_step30_xt 0.990726 1.146449 0.064421 -0.034956 0.681324 -0.035027 0.685262 - dit_step35_xt 0.985856 1.448653 0.083553 -0.040671 0.711562 -0.040716 0.717195 - dit_step40_xt 0.981314 1.836126 0.103939 -0.046406 0.764127 -0.046462 0.771853 - dit_step45_xt 0.978161 2.180611 0.123396 -0.052503 0.834743 -0.052475 0.843036 diff --git a/tests/Q6_K.log b/tests/Q6_K.log deleted file mode 100644 index 6cd4c1c..0000000 --- a/tests/Q6_K.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf -[GGML] Running acestep-v15-turbo-Q6_K.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999805 - lyric_embed 1.000000 - enc_hidden 0.999638 - detok_output 0.999962 - context 0.999976 - noise 1.000000 - temb_t 0.999990 - hidden_after_proj_in 0.999980 - enc_after_cond_emb 0.999648 - layer0_sa_output 0.999763 - hidden_after_layer0 0.999888 - hidden_after_layer6 0.999853 - hidden_after_layer12 0.998917 - hidden_after_layer18 0.995924 - hidden_after_layer23 0.992281 - dit_step0_vt 0.971207 - dit_step0_xt 0.999937 - dit_step1_vt 0.975354 - dit_step1_xt 0.999803 - dit_step2_vt 0.978312 - dit_step2_xt 0.999479 - dit_step3_vt 0.977879 - dit_step3_xt 0.998730 - dit_step4_vt 0.976291 - dit_step4_xt 0.997040 - dit_step5_vt 0.973193 - dit_step5_xt 0.993208 - dit_step6_vt 0.969738 - dit_step6_xt 0.985862 - dit_step7_vt 0.962454 - dit_x0 0.974866 - vae_audio 0.893678 - vae_audio (STFT 
cosine) 0.969663 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999937 0.147590 0.007252 -0.002265 0.972930 -0.002342 0.972003 - dit_step1_xt 0.999803 0.291665 0.012432 -0.005192 0.942660 -0.005313 0.941730 - dit_step2_xt 0.999479 0.474224 0.019215 -0.009147 0.909068 -0.009311 0.908527 - dit_step3_xt 0.998730 0.730810 0.028734 -0.014438 0.873565 -0.014577 0.873624 - dit_step4_xt 0.997040 1.058607 0.042049 -0.021507 0.841532 -0.021660 0.841995 - dit_step5_xt 0.993208 1.534989 0.062024 -0.031604 0.824595 -0.032109 0.824593 - dit_step6_xt 0.985862 2.188862 0.092252 -0.045920 0.855268 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q6_K.gguf -[GGML] Running acestep-v15-sft-Q6_K.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999805 - lyric_embed 1.000000 - enc_hidden 0.999638 - detok_output 0.999962 - context 0.999976 - noise 1.000000 - temb_t 0.999970 - hidden_after_proj_in 0.999981 - enc_after_cond_emb 0.999651 - layer0_sa_output 0.999771 - hidden_after_layer0 0.999913 - hidden_after_layer6 0.999782 - hidden_after_layer12 0.999350 - hidden_after_layer18 0.998535 - hidden_after_layer23 0.998814 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998712 - dit_step0_vt_uncond 0.998275 - dit_step0_vt 0.994897 - dit_step0_xt 0.999997 - dit_step5_vt_cond 0.999148 - dit_step5_vt 0.992272 - dit_step5_xt 0.999951 - dit_step10_vt_cond 0.997802 - dit_step10_vt 0.990167 - dit_step10_xt 0.999821 - dit_step15_vt_cond 0.995510 - dit_step15_vt 0.980612 - dit_step15_xt 0.999436 - dit_step20_vt_cond 0.991103 - dit_step20_vt 0.969601 - 
dit_step20_xt 0.998471 - dit_step25_vt_cond 0.984595 - dit_step25_vt 0.957457 - dit_step25_xt 0.996593 - dit_step30_vt_cond 0.977649 - dit_step30_vt 0.948797 - dit_step30_xt 0.993770 - dit_step35_vt_cond 0.970853 - dit_step35_vt 0.937303 - dit_step35_xt 0.990429 - dit_step40_vt_cond 0.966727 - dit_step40_vt 0.927488 - dit_step40_xt 0.987201 - dit_step45_vt_cond 0.971343 - dit_step45_vt 0.937992 - dit_step45_xt 0.984913 - dit_step49_vt_cond 0.978000 - dit_step49_vt 0.949509 - dit_x0 0.984147 - vae_audio 0.935392 - vae_audio (STFT cosine) 0.974483 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999997 0.038602 0.002180 -0.001744 0.980167 -0.001741 0.980402 - dit_step5_xt 0.999951 0.145112 0.006817 -0.006930 0.889866 -0.007143 0.887999 - dit_step10_xt 0.999821 0.208421 0.011339 -0.012339 0.811560 -0.012603 0.811299 - dit_step15_xt 0.999436 0.337160 0.018157 -0.017579 0.746441 -0.018114 0.745268 - dit_step20_xt 0.998471 0.561928 0.027460 -0.022914 0.700716 -0.023808 0.699582 - dit_step25_xt 0.996593 0.825034 0.039088 -0.028344 0.679138 -0.029311 0.679278 - dit_step30_xt 0.993770 1.225392 0.052945 -0.033832 0.684642 -0.035027 0.685262 - dit_step35_xt 0.990429 1.650381 0.068602 -0.039215 0.716082 -0.040716 0.717195 - dit_step40_xt 0.987201 2.085848 0.085637 -0.044595 0.769111 -0.046462 0.771853 - dit_step45_xt 0.984913 2.477617 0.101990 -0.050396 0.839899 -0.052475 0.843036 diff --git a/tests/Q8_0.log b/tests/Q8_0.log deleted file mode 100644 index 27ba118..0000000 --- a/tests/Q8_0.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf -[GGML] Running acestep-v15-turbo-Q8_0.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999805 - lyric_embed 1.000000 - enc_hidden 0.999784 - detok_output 0.999983 - context 0.999990 - noise 1.000000 - temb_t 0.999997 - hidden_after_proj_in 0.999986 - enc_after_cond_emb 0.999765 - layer0_sa_output 0.999924 - hidden_after_layer0 0.999957 - hidden_after_layer6 0.999892 - hidden_after_layer12 0.999346 - hidden_after_layer18 0.996758 - hidden_after_layer23 0.993881 - dit_step0_vt 0.976421 - dit_step0_xt 0.999948 - dit_step1_vt 0.979128 - dit_step1_xt 0.999834 - dit_step2_vt 0.982059 - dit_step2_xt 0.999561 - dit_step3_vt 0.983029 - dit_step3_xt 0.998948 - dit_step4_vt 0.981353 - dit_step4_xt 0.997565 - dit_step5_vt 0.978860 - dit_step5_xt 0.994480 - dit_step6_vt 0.976051 - dit_step6_xt 0.988641 - dit_step7_vt 0.970144 - dit_x0 0.979969 - vae_audio 0.905525 - vae_audio (STFT cosine) 0.976530 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999948 0.134961 0.006551 -0.002307 0.972901 -0.002342 0.972003 - dit_step1_xt 0.999834 0.262688 0.011280 -0.005306 0.942604 -0.005313 0.941730 - dit_step2_xt 0.999561 0.448301 0.017428 -0.009351 0.909110 -0.009311 0.908527 - dit_step3_xt 0.998948 0.617858 0.025766 -0.014708 0.873709 -0.014577 0.873624 - dit_step4_xt 0.997565 0.740504 0.037507 -0.021763 0.841873 -0.021660 0.841995 - dit_step5_xt 0.994480 1.211945 0.054863 -0.031844 0.825164 -0.032109 0.824593 - dit_step6_xt 0.988641 2.056566 0.081142 -0.046105 0.856063 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q8_0.gguf -[GGML] Running acestep-v15-sft-Q8_0.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999805 - lyric_embed 1.000000 - enc_hidden 0.999784 - detok_output 0.999983 - context 0.999990 - noise 1.000000 - temb_t 0.999991 - hidden_after_proj_in 0.999986 - enc_after_cond_emb 0.999768 - layer0_sa_output 0.999913 - hidden_after_layer0 0.999961 - hidden_after_layer6 0.999814 - hidden_after_layer12 0.999441 - hidden_after_layer18 0.998694 - hidden_after_layer23 0.998948 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998899 - dit_step0_vt_uncond 0.998530 - dit_step0_vt 0.995437 - dit_step0_xt 0.999998 - dit_step5_vt_cond 0.999435 - dit_step5_vt 0.993135 - dit_step5_xt 0.999959 - dit_step10_vt_cond 0.998667 - dit_step10_vt 0.992381 - dit_step10_xt 0.999876 - dit_step15_vt_cond 0.996784 - dit_step15_vt 0.983109 - dit_step15_xt 0.999626 - dit_step20_vt_cond 0.993660 - dit_step20_vt 0.976141 - dit_step20_xt 0.998967 - dit_step25_vt_cond 0.989047 - dit_step25_vt 0.965619 - dit_step25_xt 0.997655 - dit_step30_vt_cond 0.983970 - dit_step30_vt 0.959590 - dit_step30_xt 0.995674 - dit_step35_vt_cond 0.978928 - dit_step35_vt 0.949494 - dit_step35_xt 0.993260 - dit_step40_vt_cond 0.975960 - dit_step40_vt 0.939874 - dit_step40_xt 0.990935 - dit_step45_vt_cond 0.978761 - dit_step45_vt 0.940675 - dit_step45_xt 0.989300 - dit_step49_vt_cond 0.980854 - dit_step49_vt 0.920647 - dit_x0 0.988696 - vae_audio 0.944426 - vae_audio (STFT cosine) 0.974764 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999998 0.038422 0.002097 -0.001714 0.980004 -0.001741 0.980402 - dit_step5_xt 0.999959 0.134478 0.006082 -0.006888 0.888999 -0.007143 0.887999 - dit_step10_xt 0.999876 0.215550 0.009492 -0.012330 0.810305 -0.012603 0.811299 - dit_step15_xt 0.999626 
0.342195 0.014680 -0.017574 0.745063 -0.018114 0.745268 - dit_step20_xt 0.998967 0.566416 0.022205 -0.022917 0.699295 -0.023808 0.699582 - dit_step25_xt 0.997655 0.862320 0.031744 -0.028373 0.677531 -0.029311 0.679278 - dit_step30_xt 0.995674 1.138689 0.043055 -0.033821 0.683290 -0.035027 0.685262 - dit_step35_xt 0.993260 1.656645 0.056128 -0.039223 0.714963 -0.040716 0.717195 - dit_step40_xt 0.990935 2.096484 0.070423 -0.044591 0.768426 -0.046462 0.771853 - dit_step45_xt 0.989300 2.398146 0.084110 -0.050467 0.839484 -0.052475 0.843036 diff --git a/tests/Vulkan-BF16.log b/tests/Vulkan-BF16.log index 2d955d7..c063695 100644 --- a/tests/Vulkan-BF16.log +++ b/tests/Vulkan-BF16.log @@ -1,7 +1,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 [Load] DiT backend: Vulkan0 (CPU threads: 16) -[Load] Backend init: 260.3 ms +[Load] Backend init: 142.5 ms [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 3007.9 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 397.7 ms +[Load] DiT weight load: 338.6 ms [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: Vulkan0 (CPU threads: 16) +[Load] VAE backend: Vulkan0 (shared) [VAE] Backend: Vulkan0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 672.5 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 661.6 ms [Request 
1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.1 ms +[Load] BPE tokenizer: 31.2 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[Load] TextEncoder backend: Vulkan0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 166.9 ms -[Encode] TextEncoder (70 tokens): 30.9 ms +[Load] TextEncoder: 141.4 ms +[Encode] TextEncoder (70 tokens): 1939.4 ms [Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 11.2 ms +[Encode] Lyric vocab lookup (167 tokens): 0.3 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[Load] CondEncoder backend: Vulkan0 (shared) [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 1160.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 163.7 ms +[Load] ConditionEncoder: 130.2 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 
tokens -[Encode] ConditionEncoder: 22.5 ms, enc_S=238 +[Encode] ConditionEncoder: 2492.6 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.758148 -0.049593 -0.132730 0.058488 [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [WeightCtx] Loaded 30 tensors, 200.3 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 28.1 ms +[Load] Detokenizer: 23.1 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 229.8 ms +[Context] Detokenizer: 2525.9 ms [Debug] detok_output: [2170, 64] first4: -0.125193 1.435010 0.308190 -0.624228 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -79,70 +77,67 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Debug] proj_in_input: [192, 2170] first4: -0.125193 1.435010 0.308190 -0.624228 [Debug] enc_after_cond_emb: [2048, 238] first4: -0.168464 0.814954 0.327714 -0.561971 [Debug] layer0_sa_input: [2048, 1085] first4: -0.719110 -0.764019 -0.047328 0.261808 -[Debug] layer0_q_after_rope: [128, 16] first4: -2.424376 -0.094810 -0.411903 1.007324 -[Debug] layer0_k_after_rope: [128, 8] first4: -12.712339 1.106410 1.775920 1.780798 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.501171 0.169176 -0.355798 0.513027 -[Debug] layer0_attn_out: [2048, 1085] first4: -1.540742 -1.044333 0.188720 0.456093 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.540742 -1.044333 0.188720 0.456093 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.598325 -0.820241 -0.296337 0.493580 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.091503 0.566892 52.584164 -0.903901 -[Debug] hidden_after_layer6: [2048, 1085] first4: -21.192070 0.040278 33.599442 -4.442998 -[Debug] hidden_after_layer12: [2048, 1085] first4: -15.068191 -18.118078 71.999359 28.597229 -[Debug] hidden_after_layer18: [2048, 1085] first4: -27.132679 15.867422 60.847614 
20.940519 -[Debug] hidden_after_layer23: [2048, 1085] first4: -12.584854 45.152912 198.753845 145.517029 -[Debug] dit_step0_vt: [2170, 64] first4: 0.014936 1.119046 0.345802 2.379982 -[Debug] dit_step0_xt: [2170, 64] first4: 0.193657 2.105384 -0.187593 0.739475 +[Debug] layer0_q_after_rope: [128, 16] first4: -1.985390 -0.040374 -0.446411 0.887640 +[Debug] layer0_k_after_rope: [128, 8] first4: -12.133966 1.032982 1.765450 1.789189 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.285921 -0.088167 -0.083954 0.187361 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.468877 -0.930195 0.454157 0.450160 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.468877 -0.930195 0.454157 0.450160 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.465657 -0.778736 0.078704 0.498346 +[Debug] hidden_after_layer0: [2048, 1085] first4: -8.621284 0.720027 54.661194 -0.769228 +[Debug] hidden_after_layer6: [2048, 1085] first4: -12.726752 3.144506 -9.323353 -12.165966 +[Debug] hidden_after_layer12: [2048, 1085] first4: -16.201662 -10.547243 4.967308 15.566863 +[Debug] hidden_after_layer18: [2048, 1085] first4: -26.509827 14.787127 -25.476906 8.639433 +[Debug] hidden_after_layer23: [2048, 1085] first4: -16.044237 89.590195 45.410172 78.645676 +[Debug] dit_step0_vt: [2170, 64] first4: 0.347229 0.879013 0.198151 1.945618 +[Debug] dit_step0_xt: [2170, 64] first4: 0.178553 2.116295 -0.180882 0.759219 [DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: 0.086700 0.854980 -0.273651 1.728149 -[Debug] dit_step1_xt: [2170, 64] first4: 0.188928 2.058749 -0.172667 0.645212 +[Debug] dit_step1_vt: [2170, 64] first4: 0.068695 0.847748 -0.298004 1.750702 +[Debug] dit_step1_xt: [2170, 64] first4: 0.174806 2.070054 -0.164627 0.663726 [DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: 0.180420 0.837399 -0.150421 2.056976 -[Debug] dit_step2_xt: [2170, 64] first4: 0.176900 2.002922 -0.162639 0.508081 +[Debug] dit_step2_vt: [2170, 64] first4: 0.151260 0.875549 -0.207390 
2.089754 +[Debug] dit_step2_xt: [2170, 64] first4: 0.164722 2.011684 -0.150801 0.524409 [DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: 0.130821 0.833313 0.053528 2.193359 -[Debug] dit_step3_xt: [2170, 64] first4: 0.165998 1.933480 -0.167099 0.325301 +[Debug] dit_step3_vt: [2170, 64] first4: 0.077034 0.843689 -0.087112 2.299004 +[Debug] dit_step3_xt: [2170, 64] first4: 0.158302 1.941377 -0.143542 0.332826 [DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.273712 0.866425 0.216686 2.274872 -[Debug] dit_step4_xt: [2170, 64] first4: 0.136672 1.840648 -0.190316 0.081565 +[Debug] dit_step4_vt: [2170, 64] first4: 0.173340 0.815531 0.275307 2.367218 +[Debug] dit_step4_xt: [2170, 64] first4: 0.139730 1.853999 -0.173039 0.079195 [DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.347900 0.772171 0.542953 2.248352 -[Debug] dit_step5_xt: [2170, 64] first4: 0.086972 1.730338 -0.267881 -0.239629 +[Debug] dit_step5_vt: [2170, 64] first4: 0.210556 0.765915 0.470947 2.214279 +[Debug] dit_step5_xt: [2170, 64] first4: 0.109651 1.744582 -0.240317 -0.237130 [DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: 0.132820 0.664673 0.218246 2.387787 -[Debug] dit_step6_xt: [2170, 64] first4: 0.060408 1.597404 -0.311530 -0.717186 +[Debug] dit_step6_vt: [2170, 64] first4: -0.038303 0.546310 0.224964 2.284607 +[Debug] dit_step6_xt: [2170, 64] first4: 0.117311 1.635320 -0.285310 -0.694052 [DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: -0.335976 0.323303 0.198029 2.726624 -[Debug] dit_x0: [2170, 64] first4: 0.161200 1.500413 -0.370938 -1.535173 +[Debug] dit_step7_vt: [2170, 64] first4: -0.300537 0.235870 0.263802 2.617432 +[Debug] dit_x0: [2170, 64] first4: 0.207473 1.564559 -0.364450 -1.479281 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 740.5 ms (740.5 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.161200 1.500413 -0.370938 -1.535173 +[DiT] Total generation: 2630.4 ms (2630.4 ms/sample) +[Debug] dit_output: 
[2170, 64] first4: 0.207473 1.564559 -0.364450 -1.479281 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 9812.1 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000591 0.001078 0.000929 0.001296 +[VAE Batch0] Decode: 2992.9 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000614 0.001141 0.000934 0.001396 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:55:13.398 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:55:13.398 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:55:13.399 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:55:13.399 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:55:13.399 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:55:14.155 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:55:15.664 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... 
-2026-03-01 19:55:15.664 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:55:15.669 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:55:15.830 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:46:56.541 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:46:56.622 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:46:57.937 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:46:57.937 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:46:57.939 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:46:57.945 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:46:58.137 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:46:58.139 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:55:15.838 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:55:15.850 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:55:15.851 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:55:15.885 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:55:16.193 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:55:16.193 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:55:16.193 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006814241409301758, 'diffusion_time_cost': 0.30007076263427734, 'diffusion_per_step_time_cost': 0.03750884532928467, 'total_time_cost': 0.3068850040435791, 'offload_time_cost': 0.0} -2026-03-01 19:55:16.208 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:55:16.210 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:55:16.210 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:55:16.210 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:55:16.210 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:55:16.210 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:55:16.210 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:55:16.485 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:55:16.488 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:55:16.491 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:46:58.146 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:46:58.161 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:46:58.161 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:46:58.195 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:46:58.511 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:46:58.512 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:46:58.512 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006871938705444336, 'diffusion_time_cost': 0.30806517601013184, 'diffusion_per_step_time_cost': 0.03850814700126648, 'total_time_cost': 0.31493711471557617, 'offload_time_cost': 0.0} +2026-03-04 21:46:58.526 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:46:58.528 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:46:58.528 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:46:58.528 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:46:58.528 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:46:58.528 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:46:58.528 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:46:58.802 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:46:58.804 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:46:58.806 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf [GGML] Running acestep-v15-turbo-BF16.gguf... 
@@ -224,36 +219,36 @@ Using precomputed LM hints temb_t 0.999999 hidden_after_proj_in 0.999987 enc_after_cond_emb 0.999825 - layer0_sa_output 0.999959 - hidden_after_layer0 0.999982 - hidden_after_layer6 0.999916 - hidden_after_layer12 0.999276 - hidden_after_layer18 0.996645 - hidden_after_layer23 0.993735 - dit_step0_vt 0.975502 - dit_step0_xt 0.999946 - dit_step1_vt 0.898326 - dit_step1_xt 0.999578 - dit_step2_vt 0.893586 - dit_step2_xt 0.998276 - dit_step3_vt 0.881101 - dit_step3_xt 0.994720 - dit_step4_vt 0.869138 - dit_step4_xt 0.986137 - dit_step5_vt 0.854878 - dit_step5_xt 0.965846 - dit_step6_vt 0.840298 - dit_step6_xt 0.925771 - dit_step7_vt 0.818271 - dit_x0 0.867399 - vae_audio 0.680412 - vae_audio (STFT cosine) 0.855380 + layer0_sa_output 0.920858 + hidden_after_layer0 0.996092 + hidden_after_layer6 0.980248 + hidden_after_layer12 0.977161 + hidden_after_layer18 0.973382 + hidden_after_layer23 0.961755 + dit_step0_vt 0.843333 + dit_step0_xt 0.999656 + dit_step1_vt 0.875601 + dit_step1_xt 0.998907 + dit_step2_vt 0.860701 + dit_step2_xt 0.996792 + dit_step3_vt 0.838816 + dit_step3_xt 0.991464 + dit_step4_vt 0.827875 + dit_step4_xt 0.978766 + dit_step5_vt 0.812689 + dit_step5_xt 0.949636 + dit_step6_vt 0.795272 + dit_step6_xt 0.894491 + dit_step7_vt 0.769772 + dit_x0 0.818406 + vae_audio 0.571274 + vae_audio (STFT cosine) 0.788509 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999946 0.135811 0.006633 -0.002316 0.972919 -0.002342 0.972003 - dit_step1_xt 0.999578 0.413265 0.019706 -0.005121 0.942541 -0.005313 0.941730 - dit_step2_xt 0.998276 0.811472 0.038208 -0.008968 0.908957 -0.009311 0.908527 - dit_step3_xt 0.994720 1.481150 0.064047 -0.014385 0.872574 -0.014577 0.873624 - dit_step4_xt 0.986137 1.857148 0.100272 -0.021489 0.837038 -0.021660 0.841995 - dit_step5_xt 0.965846 1.439633 0.154129 -0.031859 0.812819 -0.032109 0.824593 - dit_step6_xt 0.925771 2.125688 0.235367 -0.046759 0.832442 
-0.046482 0.855546 + dit_step0_xt 0.999656 0.367652 0.018858 -0.002243 0.972108 -0.002342 0.972003 + dit_step1_xt 0.998907 0.763455 0.032624 -0.004985 0.941679 -0.005313 0.941730 + dit_step2_xt 0.996792 1.022189 0.053741 -0.008816 0.908019 -0.009311 0.908527 + dit_step3_xt 0.991464 1.657425 0.084380 -0.014275 0.871556 -0.014577 0.873624 + dit_step4_xt 0.978766 2.432666 0.128087 -0.021464 0.836876 -0.021660 0.841995 + dit_step5_xt 0.949636 3.423663 0.193034 -0.032107 0.813619 -0.032109 0.824593 + dit_step6_xt 0.894491 4.744513 0.289706 -0.047388 0.833987 -0.046482 0.855546 diff --git a/tests/Vulkan-Q4_K_M.log b/tests/Vulkan-Q4_K_M.log index 011c0c3..03f9985 100644 --- a/tests/Vulkan-Q4_K_M.log +++ b/tests/Vulkan-Q4_K_M.log @@ -1,7 +1,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 [Load] DiT backend: Vulkan0 (CPU threads: 16) -[Load] Backend init: 115.6 ms +[Load] Backend init: 146.5 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K fused, V separate [DiT] Cross-attn: all separate @@ -9,36 +9,34 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 895.6 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 126.7 ms +[Load] DiT weight load: 110.3 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: Vulkan0 (CPU threads: 16) +[Load] VAE backend: Vulkan0 (shared) [VAE] Backend: Vulkan0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 667.9 ms +[VAE] Loaded: 5 blocks, upsample=1920x, 
F32 activations +[Load] VAE weights: 661.8 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 31.0 ms +[Load] BPE tokenizer: 31.2 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[Load] TextEncoder backend: Vulkan0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 166.1 ms -[Encode] TextEncoder (70 tokens): 18.4 ms +[Load] TextEncoder: 143.1 ms +[Encode] TextEncoder (70 tokens): 18.1 ms [Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 11.3 ms +[Encode] Lyric vocab lookup (167 tokens): 0.3 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[Load] CondEncoder backend: Vulkan0 (shared) [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -48,18 +46,18 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 352.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 43.9 ms +[Load] ConditionEncoder: 40.5 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, 
window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 18.2 ms, enc_S=238 +[Encode] ConditionEncoder: 2552.5 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.760519 -0.046675 -0.129011 0.057651 [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 64.7 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 8.9 ms +[Load] Detokenizer: 8.5 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 152.2 ms +[Context] Detokenizer: 574.4 ms [Debug] detok_output: [2170, 64] first4: -0.107345 1.442038 0.300564 -0.641466 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -93,56 +91,53 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Debug] dit_step0_vt: [2170, 64] first4: 0.669312 0.442215 1.300629 2.101841 [Debug] dit_step0_xt: [2170, 64] first4: 0.163913 2.136149 -0.230995 0.752118 [DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: 1.120422 0.593113 1.031189 1.813599 -[Debug] dit_step1_xt: [2170, 64] first4: 0.102799 2.103798 -0.287241 0.653194 +[Debug] dit_step1_vt: [2170, 64] first4: 1.083954 0.575027 1.011414 1.785126 +[Debug] dit_step1_xt: [2170, 64] first4: 0.104788 2.104784 -0.286163 0.654747 [DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: 1.381363 0.295410 1.456146 1.949341 -[Debug] dit_step2_xt: [2170, 64] first4: 0.010708 2.084104 -0.384318 0.523238 +[Debug] dit_step2_vt: [2170, 64] first4: 1.406609 0.358032 1.442169 1.947861 +[Debug] dit_step2_xt: [2170, 64] first4: 0.011014 2.080915 -0.382307 0.524890 [DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: 1.440727 0.067017 1.481567 2.158554 -[Debug] dit_step3_xt: [2170, 64] first4: -0.109353 2.078519 -0.507782 0.343359 +[Debug] dit_step3_vt: [2170, 64] first4: 1.450653 0.080627 
1.479324 2.174759 +[Debug] dit_step3_xt: [2170, 64] first4: -0.109874 2.074197 -0.505584 0.343660 [DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 1.377216 0.234177 1.413437 2.181564 -[Debug] dit_step4_xt: [2170, 64] first4: -0.256912 2.053428 -0.659221 0.109620 +[Debug] dit_step4_vt: [2170, 64] first4: 1.396931 0.250122 1.401264 2.164902 +[Debug] dit_step4_xt: [2170, 64] first4: -0.259545 2.047398 -0.655720 0.111706 [DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 1.135239 0.376801 1.055233 2.272675 -[Debug] dit_step5_xt: [2170, 64] first4: -0.419089 1.999600 -0.809969 -0.215048 +[Debug] dit_step5_vt: [2170, 64] first4: 1.155813 0.405807 1.027550 2.260437 +[Debug] dit_step5_xt: [2170, 64] first4: -0.424661 1.989425 -0.802512 -0.211213 [DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: 0.948242 0.399368 0.426941 2.645081 -[Debug] dit_step6_xt: [2170, 64] first4: -0.608737 1.919726 -0.895357 -0.744064 +[Debug] dit_step6_vt: [2170, 64] first4: 0.916870 0.396088 0.350647 2.622253 +[Debug] dit_step6_xt: [2170, 64] first4: -0.608035 1.910208 -0.872642 -0.735664 [DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: 0.549133 -0.167076 0.379578 2.984619 -[Debug] dit_x0: [2170, 64] first4: -0.773477 1.969849 -1.009230 -1.639450 +[Debug] dit_step7_vt: [2170, 64] first4: 0.544876 -0.215309 0.434998 3.006592 +[Debug] dit_x0: [2170, 64] first4: -0.771498 1.974800 -1.003141 -1.637641 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 263.6 ms (263.6 ms/sample) -[Debug] dit_output: [2170, 64] first4: -0.773477 1.969849 -1.009230 -1.639450 +[DiT] Total generation: 342.3 ms (342.3 ms/sample) +[Debug] dit_output: [2170, 64] first4: -0.771498 1.974800 -1.003141 -1.637641 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 
+[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 9686.3 ms -[Debug] vae_audio: [2, 4166400] first4: 0.015021 0.018215 0.017495 0.016521 +[VAE Batch0] Decode: 1703.5 ms +[Debug] vae_audio: [2, 4166400] first4: 0.012597 0.015460 0.014870 0.014040 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:56:19.059 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:56:19.060 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:56:19.060 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:56:19.060 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:56:19.060 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:56:19.832 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:56:21.417 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:56:21.417 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:56:21.428 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... 
-2026-03-01 19:56:21.589 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:47:49.166 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:47:49.255 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:47:50.597 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:47:50.597 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:47:50.598 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:47:50.604 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:47:50.793 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:47:50.795 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:56:21.597 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:56:21.610 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:56:21.610 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:56:21.642 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:56:21.955 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:56:21.956 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:56:21.956 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006905794143676758, 'diffusion_time_cost': 0.3056776523590088, 'diffusion_per_step_time_cost': 0.0382097065448761, 'total_time_cost': 0.31258344650268555, 'offload_time_cost': 0.0} -2026-03-01 19:56:21.970 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:56:21.973 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:56:21.973 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:56:21.973 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:56:21.973 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:56:21.973 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:56:21.973 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:56:22.249 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:56:22.252 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:56:22.255 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:47:50.802 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:47:50.816 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:47:50.816 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:47:50.850 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:47:51.166 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:47:51.167 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:47:51.167 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006922483444213867, 'diffusion_time_cost': 0.3079640865325928, 'diffusion_per_step_time_cost': 0.0384955108165741, 'total_time_cost': 0.31488656997680664, 'offload_time_cost': 0.0} +2026-03-04 21:47:51.181 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:47:51.183 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:47:51.183 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.77 GB +2026-03-04 21:47:51.183 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:47:51.183 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.77 GB +2026-03-04 21:47:51.183 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.77 GB +2026-03-04 21:47:51.183 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:47:51.458 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:47:51.460 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:47:51.461 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf [GGML] Running acestep-v15-turbo-Q4_K_M.gguf... 
@@ -232,28 +227,28 @@ Using precomputed LM hints hidden_after_layer23 0.947132 dit_step0_vt 0.790630 dit_step0_xt 0.999550 - dit_step1_vt 0.812267 - dit_step1_xt 0.998316 - dit_step2_vt 0.797855 - dit_step2_xt 0.994982 - dit_step3_vt 0.785550 - dit_step3_xt 0.987155 - dit_step4_vt 0.777677 - dit_step4_xt 0.969894 - dit_step5_vt 0.765554 - dit_step5_xt 0.933268 - dit_step6_vt 0.748164 - dit_step6_xt 0.865654 - dit_step7_vt 0.704997 - dit_x0 0.768990 - vae_audio 0.377954 - vae_audio (STFT cosine) 0.669489 + dit_step1_vt 0.756205 + dit_step1_xt 0.998148 + dit_step2_vt 0.797194 + dit_step2_xt 0.994834 + dit_step3_vt 0.784456 + dit_step3_xt 0.987026 + dit_step4_vt 0.776725 + dit_step4_xt 0.969792 + dit_step5_vt 0.765077 + dit_step5_xt 0.933184 + dit_step6_vt 0.747231 + dit_step6_xt 0.865289 + dit_step7_vt 0.704165 + dit_x0 0.767979 + vae_audio 0.376451 + vae_audio (STFT cosine) 0.668630 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999550 0.201120 0.022082 -0.002496 0.972768 -0.002342 0.972003 - dit_step1_xt 0.998316 0.415084 0.041258 -0.005641 0.942202 -0.005313 0.941730 - dit_step2_xt 0.994982 0.710340 0.068500 -0.010236 0.907728 -0.009311 0.908527 - dit_step3_xt 0.987155 1.070455 0.105302 -0.016404 0.870181 -0.014577 0.873624 - dit_step4_xt 0.969894 1.456633 0.155292 -0.024587 0.833834 -0.021660 0.841995 - dit_step5_xt 0.933268 1.997366 0.225911 -0.035903 0.808944 -0.032109 0.824593 - dit_step6_xt 0.865654 3.020976 0.331484 -0.051668 0.828925 -0.046482 0.855546 + dit_step1_xt 0.998148 0.415598 0.043234 -0.005810 0.944103 -0.005313 0.941730 + dit_step2_xt 0.994834 0.709830 0.069736 -0.010410 0.909328 -0.009311 0.908527 + dit_step3_xt 0.987026 1.071567 0.106058 -0.016584 0.871456 -0.014577 0.873624 + dit_step4_xt 0.969792 1.488428 0.155756 -0.024763 0.834729 -0.021660 0.841995 + dit_step5_xt 0.933184 1.958024 0.226224 -0.036147 0.809005 -0.032109 0.824593 + dit_step6_xt 0.865289 3.030077 0.331834 -0.051892 
0.828296 -0.046482 0.855546 diff --git a/tests/Vulkan-Q5_K_M.log b/tests/Vulkan-Q5_K_M.log index ec38ab3..aa0eb9c 100644 --- a/tests/Vulkan-Q5_K_M.log +++ b/tests/Vulkan-Q5_K_M.log @@ -1,7 +1,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 [Load] DiT backend: Vulkan0 (CPU threads: 16) -[Load] Backend init: 114.1 ms +[Load] Backend init: 114.4 ms [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K fused, V separate [DiT] Cross-attn: all separate @@ -9,36 +9,34 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1061.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 151.9 ms +[Load] DiT weight load: 129.5 ms [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: Vulkan0 (CPU threads: 16) +[Load] VAE backend: Vulkan0 (shared) [VAE] Backend: Vulkan0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 677.1 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 660.3 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.6 ms +[Load] BPE tokenizer: 30.7 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: 
Vulkan0 (CPU threads: 16) +[Load] TextEncoder backend: Vulkan0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 167.6 ms -[Encode] TextEncoder (70 tokens): 18.0 ms +[Load] TextEncoder: 142.0 ms +[Encode] TextEncoder (70 tokens): 17.4 ms [Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 11.1 ms +[Encode] Lyric vocab lookup (167 tokens): 0.3 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[Load] CondEncoder backend: Vulkan0 (shared) [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -48,18 +46,18 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 412.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 55.7 ms +[Load] ConditionEncoder: 50.1 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 17.4 ms, enc_S=238 +[Encode] ConditionEncoder: 3109.7 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.760480 -0.051691 -0.132144 0.058144 [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 73.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 14.2 ms +[Load] Detokenizer: 9.1 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) 
-[Context] Detokenizer: 176.8 ms +[Context] Detokenizer: 674.8 ms [Debug] detok_output: [2170, 64] first4: -0.125636 1.455599 0.291766 -0.651349 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -96,53 +94,50 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Debug] dit_step1_vt: [2170, 64] first4: -0.053368 1.748116 -0.894806 1.618408 [Debug] dit_step1_xt: [2170, 64] first4: 0.197534 2.006799 -0.135800 0.647723 [DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: -0.025024 1.326050 -0.792084 2.043884 -[Debug] dit_step2_xt: [2170, 64] first4: 0.199202 1.918396 -0.082994 0.511464 +[Debug] dit_step2_vt: [2170, 64] first4: -0.031860 1.378967 -0.801270 2.036987 +[Debug] dit_step2_xt: [2170, 64] first4: 0.199658 1.914868 -0.082382 0.511923 [DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: -0.000458 1.126770 -0.795593 2.254120 -[Debug] dit_step3_xt: [2170, 64] first4: 0.199240 1.824498 -0.016695 0.323620 +[Debug] dit_step3_vt: [2170, 64] first4: 0.009003 1.141663 -0.806183 2.229477 +[Debug] dit_step3_xt: [2170, 64] first4: 0.198908 1.819729 -0.015200 0.326134 [DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.174652 1.253662 -1.125977 2.441956 -[Debug] dit_step4_xt: [2170, 64] first4: 0.180528 1.690177 0.103946 0.061982 +[Debug] dit_step4_vt: [2170, 64] first4: 0.174896 1.264160 -1.139648 2.439102 +[Debug] dit_step4_xt: [2170, 64] first4: 0.180169 1.684284 0.106905 0.064801 [DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.205261 1.640076 -1.795410 2.452087 -[Debug] dit_step5_xt: [2170, 64] first4: 0.151205 1.455881 0.360433 -0.288316 +[Debug] dit_step5_vt: [2170, 64] first4: 0.201294 1.641151 -1.784760 2.454834 +[Debug] dit_step5_xt: [2170, 64] first4: 0.151413 1.449833 0.361871 -0.285889 [DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: -0.158905 1.750122 -2.412979 2.419128 -[Debug] 
dit_step6_xt: [2170, 64] first4: 0.182986 1.105856 0.843029 -0.772142 +[Debug] dit_step6_vt: [2170, 64] first4: -0.154907 1.748291 -2.434448 2.425964 +[Debug] dit_step6_xt: [2170, 64] first4: 0.182394 1.100175 0.848760 -0.771082 [DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: -0.636047 1.672760 -3.485062 2.600891 -[Debug] dit_x0: [2170, 64] first4: 0.373800 0.604028 1.888547 -1.552409 +[Debug] dit_step7_vt: [2170, 64] first4: -0.633545 1.687561 -3.500275 2.586243 +[Debug] dit_x0: [2170, 64] first4: 0.372458 0.593907 1.898843 -1.546955 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 269.9 ms (269.9 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.373800 0.604028 1.888547 -1.552409 +[DiT] Total generation: 354.9 ms (354.9 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.372458 0.593907 1.898843 -1.546955 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 9630.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.001367 0.001844 0.001533 0.001892 +[VAE Batch0] Decode: 1718.2 ms +[Debug] vae_audio: [2, 4166400] first4: 0.001432 0.001921 0.001585 0.001927 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:56:02.727 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:56:02.728 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. 
Install with: pip install lycoris-lora -2026-03-01 19:56:02.728 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:56:02.728 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:56:02.728 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:56:03.499 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:56:05.072 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:56:05.072 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:56:05.078 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:56:05.239 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:47:37.062 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:47:37.143 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. 
This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:47:38.480 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:47:38.481 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:47:38.482 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:47:38.488 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:47:38.703 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:47:38.705 | INFO 
| acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:56:05.247 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:56:05.260 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... 
-2026-03-01 19:56:05.260 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:56:05.285 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:56:05.592 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:56:05.593 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:56:05.593 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00687718391418457, 'diffusion_time_cost': 0.3001282215118408, 'diffusion_per_step_time_cost': 0.0375160276889801, 'total_time_cost': 0.3070054054260254, 'offload_time_cost': 0.0} -2026-03-01 19:56:05.607 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:56:05.609 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:56:05.610 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.40 GB -2026-03-01 19:56:05.610 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:56:05.610 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.40 GB -2026-03-01 19:56:05.610 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.40 GB -2026-03-01 19:56:05.610 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:56:05.884 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:56:05.888 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:56:05.891 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:47:38.712 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:47:38.726 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:47:38.726 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:47:38.761 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:47:39.078 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:47:39.079 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:47:39.079 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006884098052978516, 'diffusion_time_cost': 0.3090353012084961, 'diffusion_per_step_time_cost': 0.03862941265106201, 'total_time_cost': 0.3159193992614746, 'offload_time_cost': 0.0} +2026-03-04 21:47:39.092 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:47:39.095 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:47:39.095 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:47:39.095 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:47:39.095 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:47:39.095 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:47:39.095 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:47:39.374 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:47:39.376 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:47:39.378 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf [GGML] Running acestep-v15-turbo-Q5_K_M.gguf... 
@@ -234,26 +229,26 @@ Using precomputed LM hints dit_step0_xt 0.999650 dit_step1_vt 0.854589 dit_step1_xt 0.998725 - dit_step2_vt 0.841602 - dit_step2_xt 0.996217 - dit_step3_vt 0.832748 - dit_step3_xt 0.990342 - dit_step4_vt 0.826828 - dit_step4_xt 0.977304 - dit_step5_vt 0.815977 - dit_step5_xt 0.948497 - dit_step6_vt 0.803425 - dit_step6_xt 0.895308 - dit_step7_vt 0.770195 - dit_x0 0.820447 - vae_audio 0.478241 - vae_audio (STFT cosine) 0.753764 + dit_step2_vt 0.826891 + dit_step2_xt 0.996124 + dit_step3_vt 0.832715 + dit_step3_xt 0.990263 + dit_step4_vt 0.826558 + dit_step4_xt 0.977265 + dit_step5_vt 0.815705 + dit_step5_xt 0.948477 + dit_step6_vt 0.802898 + dit_step6_xt 0.895216 + dit_step7_vt 0.769793 + dit_x0 0.820156 + vae_audio 0.477357 + vae_audio (STFT cosine) 0.753154 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999650 0.235954 0.018872 -0.002255 0.973213 -0.002342 0.972003 dit_step1_xt 0.998725 0.437235 0.034677 -0.005176 0.942982 -0.005313 0.941730 - dit_step2_xt 0.996217 0.735376 0.057569 -0.009210 0.909169 -0.009311 0.908527 - dit_step3_xt 0.990342 1.115564 0.088544 -0.014811 0.872820 -0.014577 0.873624 - dit_step4_xt 0.977304 1.463506 0.131044 -0.022213 0.838526 -0.021660 0.841995 - dit_step5_xt 0.948497 2.208427 0.193557 -0.032833 0.817339 -0.032109 0.824593 - dit_step6_xt 0.895308 3.287671 0.286241 -0.047639 0.842369 -0.046482 0.855546 + dit_step2_xt 0.996124 0.735913 0.058267 -0.009379 0.909744 -0.009311 0.908527 + dit_step3_xt 0.990263 1.130236 0.088998 -0.014995 0.873310 -0.014577 0.873624 + dit_step4_xt 0.977265 1.457183 0.131253 -0.022419 0.838885 -0.021660 0.841995 + dit_step5_xt 0.948477 2.197404 0.193723 -0.033044 0.817537 -0.032109 0.824593 + dit_step6_xt 0.895216 3.271284 0.286472 -0.047848 0.842172 -0.046482 0.855546 diff --git a/tests/Vulkan-Q6_K.log b/tests/Vulkan-Q6_K.log index eff680f..a938da1 100644 --- a/tests/Vulkan-Q6_K.log +++ b/tests/Vulkan-Q6_K.log @@ -1,7 +1,7 @@ 
ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 [Load] DiT backend: Vulkan0 (CPU threads: 16) -[Load] Backend init: 114.2 ms +[Load] Backend init: 144.9 ms [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1237.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 181.3 ms +[Load] DiT weight load: 156.5 ms [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: Vulkan0 (CPU threads: 16) +[Load] VAE backend: Vulkan0 (shared) [VAE] Backend: Vulkan0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 670.0 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 657.4 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.2 ms +[Load] BPE tokenizer: 32.6 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[Load] TextEncoder backend: Vulkan0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: 
Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 165.9 ms -[Encode] TextEncoder (70 tokens): 17.6 ms +[Load] TextEncoder: 142.6 ms +[Encode] TextEncoder (70 tokens): 43.2 ms [Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 11.2 ms +[Encode] Lyric vocab lookup (167 tokens): 0.3 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[Load] CondEncoder backend: Vulkan0 (shared) [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 476.3 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 61.6 ms +[Load] ConditionEncoder: 55.4 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 15.6 ms, enc_S=238 +[Encode] ConditionEncoder: 3621.4 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.761356 -0.050570 -0.133026 0.058500 [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 82.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 10.8 ms +[Load] Detokenizer: 10.5 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 143.8 ms +[Context] Detokenizer: 421.5 ms [Debug] detok_output: [2170, 64] first4: -0.141024 1.454365 0.315089 -0.623565 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] 
first4: 0.194336 2.156250 -0.171875 0.847656 @@ -79,70 +77,67 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Debug] proj_in_input: [192, 2170] first4: -0.141024 1.454365 0.315089 -0.623565 [Debug] enc_after_cond_emb: [2048, 238] first4: -0.170166 0.815842 0.310486 -0.571373 [Debug] layer0_sa_input: [2048, 1085] first4: -0.716080 -0.755969 -0.048350 0.263422 -[Debug] layer0_q_after_rope: [128, 16] first4: -2.400391 -0.081909 -0.397461 1.011719 -[Debug] layer0_k_after_rope: [128, 8] first4: -12.581572 1.117675 1.774897 1.788774 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.503906 0.211304 -0.366943 0.520996 -[Debug] layer0_attn_out: [2048, 1085] first4: -1.540494 -1.050420 0.183235 0.461747 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.540494 -1.050420 0.183235 0.461747 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.586454 -0.808233 -0.324089 0.502214 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.155503 0.531986 51.823910 -0.865276 -[Debug] hidden_after_layer6: [2048, 1085] first4: -20.861578 -0.240065 34.589954 -4.288221 -[Debug] hidden_after_layer12: [2048, 1085] first4: -14.692959 -16.975090 77.250595 30.676491 -[Debug] hidden_after_layer18: [2048, 1085] first4: -28.082283 13.370504 64.661263 19.941170 -[Debug] hidden_after_layer23: [2048, 1085] first4: -16.195175 45.294254 196.766129 138.065048 -[Debug] dit_step0_vt: [2170, 64] first4: 0.098133 1.125458 0.338135 2.349396 -[Debug] dit_step0_xt: [2170, 64] first4: 0.189875 2.105093 -0.187245 0.740865 +[Debug] layer0_q_after_rope: [128, 16] first4: -1.967773 -0.181152 -0.292236 0.785156 +[Debug] layer0_k_after_rope: [128, 8] first4: -12.036863 1.158559 1.733423 1.789948 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.344727 -0.353271 -0.171753 0.330078 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.487266 -0.799756 0.373941 0.458040 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.487266 -0.799756 0.373941 0.458040 +[Debug] 
layer0_after_cross_attn: [2048, 1085] first4: -1.478606 -0.639722 0.069986 0.503358 +[Debug] hidden_after_layer0: [2048, 1085] first4: -6.179441 -0.194424 25.726625 -0.569950 +[Debug] hidden_after_layer6: [2048, 1085] first4: -12.978424 -2.696237 30.199980 -5.338717 +[Debug] hidden_after_layer12: [2048, 1085] first4: -13.710206 -8.286438 60.887405 36.884922 +[Debug] hidden_after_layer18: [2048, 1085] first4: -19.046274 10.102365 41.516960 14.606686 +[Debug] hidden_after_layer23: [2048, 1085] first4: 52.532547 37.219868 135.759094 151.323456 +[Debug] dit_step0_vt: [2170, 64] first4: 0.032410 0.877930 -0.200378 2.148727 +[Debug] dit_step0_xt: [2170, 64] first4: 0.192863 2.116344 -0.162767 0.749987 [DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: -0.018386 1.071533 -0.402077 1.814056 -[Debug] dit_step1_xt: [2170, 64] first4: 0.190878 2.046645 -0.165313 0.641917 +[Debug] dit_step1_vt: [2170, 64] first4: -0.018381 1.082458 -0.369057 1.835251 +[Debug] dit_step1_xt: [2170, 64] first4: 0.193865 2.057301 -0.142637 0.649882 [DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: -0.052032 1.017303 -0.201233 2.115219 -[Debug] dit_step2_xt: [2170, 64] first4: 0.194347 1.978825 -0.151898 0.500902 +[Debug] dit_step2_vt: [2170, 64] first4: -0.045654 1.004852 -0.202515 2.128693 +[Debug] dit_step2_xt: [2170, 64] first4: 0.196909 1.990311 -0.129136 0.507969 [DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: 0.052856 1.105988 0.072205 2.288803 -[Debug] dit_step3_xt: [2170, 64] first4: 0.189942 1.886660 -0.157915 0.310169 +[Debug] dit_step3_vt: [2170, 64] first4: 0.053986 1.098206 0.059753 2.273270 +[Debug] dit_step3_xt: [2170, 64] first4: 0.192410 1.898794 -0.134115 0.318530 [DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.097982 1.134430 0.083038 2.362534 -[Debug] dit_step4_xt: [2170, 64] first4: 0.179444 1.765114 -0.166812 0.057040 +[Debug] dit_step4_vt: [2170, 64] first4: 0.059109 1.133232 0.098053 2.316540 +[Debug] 
dit_step4_xt: [2170, 64] first4: 0.186077 1.777376 -0.144621 0.070330 [DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.122574 1.016464 0.173828 2.333248 -[Debug] dit_step5_xt: [2170, 64] first4: 0.161934 1.619904 -0.191644 -0.276281 +[Debug] dit_step5_vt: [2170, 64] first4: 0.060867 1.087685 0.153732 2.225224 +[Debug] dit_step5_xt: [2170, 64] first4: 0.177382 1.621992 -0.166582 -0.247560 [DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: 0.070358 0.866913 -0.005890 2.297897 -[Debug] dit_step6_xt: [2170, 64] first4: 0.147862 1.446522 -0.190466 -0.735860 +[Debug] dit_step6_vt: [2170, 64] first4: -0.040359 0.926651 0.010437 2.195786 +[Debug] dit_step6_xt: [2170, 64] first4: 0.185454 1.436662 -0.168670 -0.686717 [DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: -0.360962 0.376282 -0.314270 2.626526 -[Debug] dit_x0: [2170, 64] first4: 0.256151 1.333637 -0.096185 -1.523818 +[Debug] dit_step7_vt: [2170, 64] first4: -0.506134 0.486553 -0.233337 2.557739 +[Debug] dit_x0: [2170, 64] first4: 0.337294 1.290696 -0.098669 -1.454038 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 276.6 ms (276.6 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.256151 1.333637 -0.096185 -1.523818 +[DiT] Total generation: 336.6 ms (336.6 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.337294 1.290696 -0.098669 -1.454038 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 9723.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000254 0.000880 0.000782 0.001025 +[VAE Batch0] Decode: 1718.9 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000037 0.000692 0.000656 0.000941 [VAE 
Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:55:46.361 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:55:46.361 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:55:46.361 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:55:46.362 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:55:46.362 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:55:47.150 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:55:48.700 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:55:48.700 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:55:48.705 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:55:48.864 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:47:24.206 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. 
+2026-03-04 21:47:24.287 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:47:25.614 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:47:25.614 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:47:25.616 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:47:25.621 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:47:25.810 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:47:25.812 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:55:48.872 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:55:48.885 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:55:48.885 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:55:48.917 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:55:49.229 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:55:49.230 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:55:49.230 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006822347640991211, 'diffusion_time_cost': 0.3050048351287842, 'diffusion_per_step_time_cost': 0.03812560439109802, 'total_time_cost': 0.3118271827697754, 'offload_time_cost': 0.0} -2026-03-01 19:55:49.244 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:55:49.267 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:55:49.267 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:55:49.267 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:55:49.267 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:55:49.267 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:55:49.267 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:55:49.543 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:55:49.546 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:55:49.549 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:47:25.819 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:47:25.833 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:47:25.833 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:47:25.868 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:47:26.184 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:47:26.184 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:47:26.185 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006891727447509766, 'diffusion_time_cost': 0.3077425956726074, 'diffusion_per_step_time_cost': 0.03846782445907593, 'total_time_cost': 0.3146343231201172, 'offload_time_cost': 0.0} +2026-03-04 21:47:26.198 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:47:26.201 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:47:26.201 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:47:26.201 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:47:26.201 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:47:26.201 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:47:26.201 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:47:26.477 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:47:26.479 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:47:26.481 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf [GGML] Running acestep-v15-turbo-Q6_K.gguf... 
@@ -224,36 +219,36 @@ Using precomputed LM hints temb_t 0.999990 hidden_after_proj_in 0.999982 enc_after_cond_emb 0.999691 - layer0_sa_output 0.999774 - hidden_after_layer0 0.999710 - hidden_after_layer6 0.999855 - hidden_after_layer12 0.998856 - hidden_after_layer18 0.995803 - hidden_after_layer23 0.992072 - dit_step0_vt 0.970064 - dit_step0_xt 0.999934 - dit_step1_vt 0.924403 - dit_step1_xt 0.999650 - dit_step2_vt 0.915580 - dit_step2_xt 0.998651 - dit_step3_vt 0.914431 - dit_step3_xt 0.996098 - dit_step4_vt 0.913750 - dit_step4_xt 0.990344 - dit_step5_vt 0.906205 - dit_step5_xt 0.976856 - dit_step6_vt 0.897054 - dit_step6_xt 0.950943 - dit_step7_vt 0.876737 - dit_x0 0.912738 - vae_audio 0.744947 - vae_audio (STFT cosine) 0.875717 + layer0_sa_output 0.916347 + hidden_after_layer0 0.997124 + hidden_after_layer6 0.993692 + hidden_after_layer12 0.992958 + hidden_after_layer18 0.988620 + hidden_after_layer23 0.980873 + dit_step0_vt 0.928387 + dit_step0_xt 0.999844 + dit_step1_vt 0.919122 + dit_step1_xt 0.999441 + dit_step2_vt 0.904200 + dit_step2_xt 0.998155 + dit_step3_vt 0.897635 + dit_step3_xt 0.994890 + dit_step4_vt 0.891638 + dit_step4_xt 0.987300 + dit_step5_vt 0.886907 + dit_step5_xt 0.970219 + dit_step6_vt 0.876538 + dit_step6_xt 0.938117 + dit_step7_vt 0.853291 + dit_x0 0.891872 + vae_audio 0.694699 + vae_audio (STFT cosine) 0.858167 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999934 0.147239 0.007394 -0.002260 0.973056 -0.002342 0.972003 - dit_step1_xt 0.999650 0.408757 0.017759 -0.005276 0.943557 -0.005313 0.941730 - dit_step2_xt 0.998651 0.803721 0.033644 -0.009510 0.911087 -0.009311 0.908527 - dit_step3_xt 0.996098 1.476888 0.054660 -0.015226 0.876460 -0.014577 0.873624 - dit_step4_xt 0.990344 2.294700 0.082632 -0.022702 0.844225 -0.021660 0.841995 - dit_step5_xt 0.976856 3.284146 0.125042 -0.033545 0.825286 -0.032109 0.824593 - dit_step6_xt 0.950943 4.445529 0.188707 -0.049081 0.851111 
-0.046482 0.855546 + dit_step0_xt 0.999844 0.420509 0.012210 -0.002227 0.973206 -0.002342 0.972003 + dit_step1_xt 0.999441 0.819075 0.022719 -0.005232 0.943799 -0.005313 0.941730 + dit_step2_xt 0.998155 1.085687 0.039812 -0.009404 0.911549 -0.009311 0.908527 + dit_step3_xt 0.994890 1.743559 0.063467 -0.015082 0.877147 -0.014577 0.873624 + dit_step4_xt 0.987300 2.546782 0.096584 -0.022664 0.845277 -0.021660 0.841995 + dit_step5_xt 0.970219 3.539635 0.144911 -0.033717 0.826728 -0.032109 0.824593 + dit_step6_xt 0.938117 4.795851 0.216607 -0.049484 0.852836 -0.046482 0.855546 diff --git a/tests/Vulkan-Q8_0.log b/tests/Vulkan-Q8_0.log index 774bc8a..ef53667 100644 --- a/tests/Vulkan-Q8_0.log +++ b/tests/Vulkan-Q8_0.log @@ -1,7 +1,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 [Load] DiT backend: Vulkan0 (CPU threads: 16) -[Load] Backend init: 113.5 ms +[Load] Backend init: 111.5 ms [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1600.7 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 214.1 ms +[Load] DiT weight load: 194.1 ms [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: Vulkan0 (CPU threads: 16) +[Load] VAE backend: Vulkan0 (shared) [VAE] Backend: Vulkan0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 671.7 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations 
+[Load] VAE weights: 657.9 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 31.9 ms +[Load] BPE tokenizer: 31.4 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[Load] TextEncoder backend: Vulkan0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 176.0 ms +[Load] TextEncoder: 145.4 ms [Encode] TextEncoder (70 tokens): 17.6 ms [Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 11.2 ms +[Encode] Lyric vocab lookup (167 tokens): 0.3 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[Load] CondEncoder backend: Vulkan0 (shared) [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 616.6 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 84.7 ms +[Load] ConditionEncoder: 75.8 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens 
-[Encode] ConditionEncoder: 19.4 ms, enc_S=238 +[Encode] ConditionEncoder: 5074.3 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.759194 -0.049729 -0.133332 0.058435 [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 106.5 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 15.5 ms +[Load] Detokenizer: 13.7 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 85.1 ms +[Context] Detokenizer: 437.6 ms [Debug] detok_output: [2170, 64] first4: -0.121505 1.434749 0.303808 -0.627535 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -79,70 +77,67 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Debug] proj_in_input: [192, 2170] first4: -0.121505 1.434749 0.303808 -0.627535 [Debug] enc_after_cond_emb: [2048, 238] first4: -0.169861 0.817307 0.328308 -0.558397 [Debug] layer0_sa_input: [2048, 1085] first4: -0.718007 -0.757392 -0.047301 0.261071 -[Debug] layer0_q_after_rope: [128, 16] first4: -2.423828 -0.099304 -0.408203 1.004883 -[Debug] layer0_k_after_rope: [128, 8] first4: -12.718538 1.122484 1.774887 1.790079 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.510742 0.165771 -0.347900 0.511230 -[Debug] layer0_attn_out: [2048, 1085] first4: -1.542524 -1.031132 0.196691 0.455273 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.542524 -1.031132 0.196691 0.455273 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.585310 -0.791508 -0.290125 0.495190 -[Debug] hidden_after_layer0: [2048, 1085] first4: -8.926053 0.558007 51.172398 -0.877717 -[Debug] hidden_after_layer6: [2048, 1085] first4: -20.768745 -0.272222 34.170349 -4.416629 -[Debug] hidden_after_layer12: [2048, 1085] first4: -14.358247 -18.625305 73.571915 30.079784 -[Debug] hidden_after_layer18: [2048, 1085] first4: -26.789474 14.346137 62.040115 
19.708126 -[Debug] hidden_after_layer23: [2048, 1085] first4: -2.927731 38.887718 192.805542 144.255524 -[Debug] dit_step0_vt: [2170, 64] first4: 0.027340 1.115875 0.350609 2.345856 -[Debug] dit_step0_xt: [2170, 64] first4: 0.193093 2.105528 -0.187812 0.741026 +[Debug] layer0_q_after_rope: [128, 16] first4: -2.134766 -0.300049 -0.404541 0.904297 +[Debug] layer0_k_after_rope: [128, 8] first4: -12.349133 1.249128 1.744302 1.794822 +[Debug] layer0_sa_output: [2048, 1085] first4: -0.636230 -0.405029 0.096436 0.194946 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.250806 -0.777872 0.630535 0.449394 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.250806 -0.777872 0.630535 0.449394 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.239698 -0.683206 0.416180 0.509788 +[Debug] hidden_after_layer0: [2048, 1085] first4: -4.682029 -0.464333 15.184165 -0.212429 +[Debug] hidden_after_layer6: [2048, 1085] first4: -8.053159 0.591622 20.595821 -6.469027 +[Debug] hidden_after_layer12: [2048, 1085] first4: -11.836857 -8.197025 41.079239 30.392553 +[Debug] hidden_after_layer18: [2048, 1085] first4: -20.004263 1.558971 15.575721 16.331001 +[Debug] hidden_after_layer23: [2048, 1085] first4: 23.482555 18.593208 82.512901 173.016068 +[Debug] dit_step0_vt: [2170, 64] first4: 0.084528 0.834541 -0.408783 2.115417 +[Debug] dit_step0_xt: [2170, 64] first4: 0.190494 2.118316 -0.153294 0.751501 [DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: 0.002377 1.005737 -0.352661 1.768188 -[Debug] dit_step1_xt: [2170, 64] first4: 0.192964 2.050670 -0.168576 0.644580 +[Debug] dit_step1_vt: [2170, 64] first4: -0.071388 1.041626 -0.270477 1.704315 +[Debug] dit_step1_xt: [2170, 64] first4: 0.194388 2.061500 -0.138541 0.658538 [DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: -0.063080 1.061218 -0.344177 1.926041 -[Debug] dit_step2_xt: [2170, 64] first4: 0.197169 1.979922 -0.145631 0.516177 +[Debug] dit_step2_vt: [2170, 64] first4: -0.071960 1.095016 -0.333557 
1.988541 +[Debug] dit_step2_xt: [2170, 64] first4: 0.199185 1.988499 -0.116304 0.525969 [DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: -0.072388 1.144592 -0.184326 2.069214 -[Debug] dit_step3_xt: [2170, 64] first4: 0.203201 1.884539 -0.130270 0.343743 +[Debug] dit_step3_vt: [2170, 64] first4: -0.037468 1.148598 -0.165955 2.091240 +[Debug] dit_step3_xt: [2170, 64] first4: 0.202307 1.892783 -0.102474 0.351699 [DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.004288 1.147110 0.001495 2.068916 -[Debug] dit_step4_xt: [2170, 64] first4: 0.202742 1.761635 -0.130430 0.122073 +[Debug] dit_step4_vt: [2170, 64] first4: 0.014343 1.134537 -0.033691 2.114731 +[Debug] dit_step4_xt: [2170, 64] first4: 0.200771 1.771225 -0.098864 0.125120 [DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.070211 1.173462 0.080673 2.086014 -[Debug] dit_step5_xt: [2170, 64] first4: 0.192712 1.593997 -0.141955 -0.175929 +[Debug] dit_step5_vt: [2170, 64] first4: 0.064150 1.159027 0.062057 2.121386 +[Debug] dit_step5_xt: [2170, 64] first4: 0.191606 1.605650 -0.107730 -0.177935 [DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: -0.010117 1.145203 0.186996 2.198898 -[Debug] dit_step6_xt: [2170, 64] first4: 0.194735 1.364957 -0.179354 -0.615709 +[Debug] dit_step6_vt: [2170, 64] first4: -0.041473 1.200439 0.198494 2.240326 +[Debug] dit_step6_xt: [2170, 64] first4: 0.199901 1.365562 -0.147428 -0.626000 [DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: -0.244629 0.644890 0.358635 2.446594 -[Debug] dit_x0: [2170, 64] first4: 0.268124 1.171490 -0.286945 -1.349687 +[Debug] dit_step7_vt: [2170, 64] first4: -0.309998 0.692413 0.432823 2.469238 +[Debug] dit_x0: [2170, 64] first4: 0.292900 1.157838 -0.277275 -1.366771 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 252.0 ms (252.0 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.268124 1.171490 -0.286945 -1.349687 +[DiT] Total generation: 335.0 ms (335.0 ms/sample) +[Debug] 
dit_output: [2170, 64] first4: 0.292900 1.157838 -0.277275 -1.366771 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 9843.4 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000170 0.000825 0.000784 0.001115 +[VAE Batch0] Decode: 1706.4 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000160 0.000739 0.000691 0.001054 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:55:29.948 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:55:29.948 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:55:29.948 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:55:29.948 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:55:29.948 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:55:30.699 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:55:32.273 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... 
-2026-03-01 19:55:32.274 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:55:32.279 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:55:32.442 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:55:32.443 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:47:11.115 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:47:11.205 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:47:12.506 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:47:12.506 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:47:12.508 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:47:12.513 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:47:12.703 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:55:32.443 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:55:32.443 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:55:32.444 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:55:32.444 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:55:32.444 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:47:12.705 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:55:32.444 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:55:32.450 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:55:32.462 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:55:32.463 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:55:32.484 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:55:32.791 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:55:32.791 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:55:32.791 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006818294525146484, 'diffusion_time_cost': 0.2995321750640869, 'diffusion_per_step_time_cost': 0.037441521883010864, 'total_time_cost': 0.3063504695892334, 'offload_time_cost': 0.0} -2026-03-01 19:55:32.806 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:55:32.808 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:55:32.808 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.40 GB -2026-03-01 19:55:32.808 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:55:32.808 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.40 GB -2026-03-01 19:55:32.808 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.40 GB -2026-03-01 19:55:32.808 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:55:33.083 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:55:33.084 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:55:33.088 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:47:12.713 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:47:12.727 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:47:12.727 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:47:12.758 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:47:13.073 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:47:13.073 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:47:13.073 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006865262985229492, 'diffusion_time_cost': 0.30722999572753906, 'diffusion_per_step_time_cost': 0.03840374946594238, 'total_time_cost': 0.31409525871276855, 'offload_time_cost': 0.0} +2026-03-04 21:47:13.087 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:47:13.096 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:47:13.096 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.82 GB +2026-03-04 21:47:13.096 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:47:13.096 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.82 GB +2026-03-04 21:47:13.096 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.82 GB +2026-03-04 21:47:13.096 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:47:13.370 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:47:13.372 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:47:13.374 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf [GGML] Running acestep-v15-turbo-Q8_0.gguf... 
@@ -224,36 +219,36 @@ Using precomputed LM hints temb_t 0.999998 hidden_after_proj_in 0.999985 enc_after_cond_emb 0.999817 - layer0_sa_output 0.999939 - hidden_after_layer0 0.999858 - hidden_after_layer6 0.999893 - hidden_after_layer12 0.999124 - hidden_after_layer18 0.996403 - hidden_after_layer23 0.993183 - dit_step0_vt 0.973885 - dit_step0_xt 0.999943 - dit_step1_vt 0.915468 - dit_step1_xt 0.999633 - dit_step2_vt 0.912211 - dit_step2_xt 0.998544 - dit_step3_vt 0.912707 - dit_step3_xt 0.995860 - dit_step4_vt 0.906019 - dit_step4_xt 0.989505 - dit_step5_vt 0.896537 - dit_step5_xt 0.974659 - dit_step6_vt 0.886047 - dit_step6_xt 0.945866 - dit_step7_vt 0.869793 - dit_x0 0.905017 - vae_audio 0.746037 - vae_audio (STFT cosine) 0.898352 + layer0_sa_output 0.896665 + hidden_after_layer0 0.996506 + hidden_after_layer6 0.988924 + hidden_after_layer12 0.986595 + hidden_after_layer18 0.980435 + hidden_after_layer23 0.969958 + dit_step0_vt 0.880150 + dit_step0_xt 0.999739 + dit_step1_vt 0.904993 + dit_step1_xt 0.999178 + dit_step2_vt 0.897232 + dit_step2_xt 0.997639 + dit_step3_vt 0.896753 + dit_step3_xt 0.994077 + dit_step4_vt 0.889861 + dit_step4_xt 0.986004 + dit_step5_vt 0.878511 + dit_step5_xt 0.967661 + dit_step6_vt 0.867605 + dit_step6_xt 0.933014 + dit_step7_vt 0.848412 + dit_x0 0.884572 + vae_audio 0.692036 + vae_audio (STFT cosine) 0.882942 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999943 0.140034 0.006943 -0.002318 0.973036 -0.002342 0.972003 - dit_step1_xt 0.999633 0.423125 0.018056 -0.005257 0.943026 -0.005313 0.941730 - dit_step2_xt 0.998544 0.841908 0.034537 -0.009209 0.910286 -0.009311 0.908527 - dit_step3_xt 0.995860 1.521911 0.055719 -0.014626 0.875169 -0.014577 0.873624 - dit_step4_xt 0.989505 2.346452 0.085477 -0.021803 0.842334 -0.021660 0.841995 - dit_step5_xt 0.974659 3.387389 0.130921 -0.032225 0.822365 -0.032109 0.824593 - dit_step6_xt 0.945866 4.812943 0.199910 -0.047290 0.846751 
-0.046482 0.855546 + dit_step0_xt 0.999739 0.400727 0.016274 -0.002102 0.972847 -0.002342 0.972003 + dit_step1_xt 0.999178 0.814308 0.027485 -0.004968 0.942952 -0.005313 0.941730 + dit_step2_xt 0.997639 1.101152 0.044575 -0.008840 0.910138 -0.009311 0.908527 + dit_step3_xt 0.994077 1.762341 0.067497 -0.014170 0.875003 -0.014577 0.873624 + dit_step4_xt 0.986004 2.565164 0.099802 -0.021228 0.842166 -0.021660 0.841995 + dit_step5_xt 0.967661 3.593323 0.149360 -0.031486 0.822043 -0.032109 0.824593 + dit_step6_xt 0.933014 4.978329 0.224230 -0.046337 0.845793 -0.046482 0.855546 diff --git a/tests/Vulkan_BF16.log b/tests/Vulkan_BF16.log deleted file mode 100644 index bd5f26b..0000000 --- a/tests/Vulkan_BF16.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf -[GGML] Running acestep-v15-turbo-BF16.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999812 - lyric_embed 1.000000 - enc_hidden 0.999834 - detok_output 0.999997 - context 0.999998 - noise 1.000000 - temb_t 0.999999 - hidden_after_proj_in 0.999987 - enc_after_cond_emb 0.999825 - layer0_sa_output 0.999959 - hidden_after_layer0 0.999982 - hidden_after_layer6 0.999916 - hidden_after_layer12 0.999276 - hidden_after_layer18 0.996645 - hidden_after_layer23 0.993735 - dit_step0_vt 0.975502 - dit_step0_xt 0.999946 - dit_step1_vt 0.898326 - dit_step1_xt 0.999578 - dit_step2_vt 0.893586 - dit_step2_xt 0.998276 - dit_step3_vt 0.881101 - dit_step3_xt 0.994720 - dit_step4_vt 0.869138 - dit_step4_xt 0.986137 - dit_step5_vt 0.854878 - dit_step5_xt 0.965846 - dit_step6_vt 0.840298 - dit_step6_xt 0.925771 - dit_step7_vt 0.818300 - dit_x0 0.867401 - vae_audio 0.680429 - vae_audio (STFT cosine) 0.855382 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999946 0.135811 0.006633 -0.002316 0.972919 -0.002342 0.972003 - dit_step1_xt 0.999578 0.413265 0.019706 -0.005121 0.942541 -0.005313 0.941730 - dit_step2_xt 0.998276 0.811472 0.038208 -0.008968 0.908957 -0.009311 0.908527 - dit_step3_xt 0.994720 1.481150 0.064047 -0.014385 0.872574 -0.014577 0.873624 - dit_step4_xt 0.986137 1.857148 0.100272 -0.021489 0.837038 -0.021660 0.841995 - dit_step5_xt 0.965846 1.439633 0.154129 -0.031859 0.812819 -0.032109 0.824593 - dit_step6_xt 0.925771 2.125688 0.235367 -0.046759 0.832442 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-BF16.gguf -[GGML] Running acestep-v15-sft-BF16.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999812 - lyric_embed 1.000000 - enc_hidden 0.999834 - detok_output 0.999997 - context 0.999998 - noise 1.000000 - temb_t 0.999997 - hidden_after_proj_in 0.999987 - enc_after_cond_emb 0.999828 - layer0_sa_output 0.999951 - hidden_after_layer0 0.999982 - hidden_after_layer6 0.999849 - hidden_after_layer12 0.999486 - hidden_after_layer18 0.998746 - hidden_after_layer23 0.998992 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998963 - dit_step0_vt_uncond 0.973704 - dit_step0_vt 0.986492 - dit_step0_xt 0.999992 - dit_step5_vt_cond 0.978980 - dit_step5_vt 0.906055 - dit_step5_xt 0.999319 - dit_step10_vt_cond 0.961518 - dit_step10_vt 0.898737 - dit_step10_xt 0.996347 - dit_step15_vt_cond 0.933830 - dit_step15_vt 0.840233 - dit_step15_xt 0.988073 - dit_step20_vt_cond 0.894620 - dit_step20_vt 0.796873 - dit_step20_xt 0.970961 - dit_step25_vt_cond 0.845710 - dit_step25_vt 0.737589 - dit_step25_xt 0.943356 - dit_step30_vt_cond 0.791700 - dit_step30_vt 0.686150 - dit_step30_xt 0.906182 - dit_step35_vt_cond 0.734800 - dit_step35_vt 0.627091 - dit_step35_xt 0.866844 - dit_step40_vt_cond 0.692744 - dit_step40_vt 0.579983 - dit_step40_xt 0.832660 - dit_step45_vt_cond 0.707766 - dit_step45_vt 0.576903 - dit_step45_xt 0.809828 - dit_step49_vt_cond 0.753038 - dit_step49_vt 0.625137 - dit_x0 0.801669 - vae_audio 0.494694 - vae_audio (STFT cosine) 0.706773 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999992 0.064200 0.003294 -0.001888 0.980082 -0.001741 0.980402 - dit_step5_xt 0.999319 0.557092 0.024040 -0.006621 0.887864 -0.007143 0.887999 - dit_step10_xt 0.996347 0.965268 0.050926 -0.011718 0.806420 -0.012603 0.811299 - dit_step15_xt 0.988073 
0.861492 0.085157 -0.016277 0.731584 -0.018114 0.745268 - dit_step20_xt 0.970961 1.278730 0.125264 -0.020700 0.671902 -0.023808 0.699582 - dit_step25_xt 0.943356 1.796219 0.169586 -0.025074 0.633808 -0.029311 0.679278 - dit_step30_xt 0.906182 2.190889 0.219620 -0.029769 0.614453 -0.035027 0.685262 - dit_step35_xt 0.866844 2.605400 0.272383 -0.034410 0.619164 -0.040716 0.717195 - dit_step40_xt 0.832660 3.030330 0.326889 -0.039011 0.646487 -0.046462 0.771853 - dit_step45_xt 0.809828 3.411977 0.379136 -0.043945 0.692545 -0.052475 0.843036 diff --git a/tests/Vulkan_Q4_K_M.log b/tests/Vulkan_Q4_K_M.log deleted file mode 100644 index 2c1b7e2..0000000 --- a/tests/Vulkan_Q4_K_M.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf -[GGML] Running acestep-v15-turbo-Q4_K_M.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999812 - lyric_embed 1.000000 - enc_hidden 0.997128 - detok_output 0.999611 - context 0.999751 - noise 1.000000 - temb_t 0.999906 - hidden_after_proj_in 0.999907 - enc_after_cond_emb 0.997645 - layer0_sa_output 0.998432 - hidden_after_layer0 0.999545 - hidden_after_layer6 0.923275 - hidden_after_layer12 0.969957 - hidden_after_layer18 0.964919 - hidden_after_layer23 0.947132 - dit_step0_vt 0.790633 - dit_step0_xt 0.999549 - dit_step1_vt 0.812278 - dit_step1_xt 0.998317 - dit_step2_vt 0.797899 - dit_step2_xt 0.994987 - dit_step3_vt 0.785709 - dit_step3_xt 0.987168 - dit_step4_vt 0.777756 - dit_step4_xt 0.969910 - dit_step5_vt 0.739552 - dit_step5_xt 0.933874 - dit_step6_vt 0.745520 - dit_step6_xt 0.867311 - dit_step7_vt 0.704124 - dit_x0 0.770712 - vae_audio 
0.383362 - vae_audio (STFT cosine) 0.669931 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999549 0.201087 0.022082 -0.002495 0.972767 -0.002342 0.972003 - dit_step1_xt 0.998317 0.415437 0.041246 -0.005636 0.942205 -0.005313 0.941730 - dit_step2_xt 0.994987 0.709212 0.068458 -0.010217 0.907730 -0.009311 0.908527 - dit_step3_xt 0.987168 1.068925 0.105239 -0.016380 0.870170 -0.014577 0.873624 - dit_step4_xt 0.969910 1.456167 0.155261 -0.024550 0.833831 -0.021660 0.841995 - dit_step5_xt 0.933874 2.028250 0.225222 -0.035727 0.809987 -0.032109 0.824593 - dit_step6_xt 0.867311 3.033199 0.329427 -0.051895 0.826478 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q4_K_M.gguf -[GGML] Running acestep-v15-sft-Q4_K_M.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999812 - lyric_embed 1.000000 - enc_hidden 0.997128 - detok_output 0.999611 - context 0.999751 - noise 1.000000 - temb_t 0.999673 - hidden_after_proj_in 0.999909 - enc_after_cond_emb 0.997634 - layer0_sa_output 0.998553 - hidden_after_layer0 0.999511 - hidden_after_layer6 0.995145 - hidden_after_layer12 0.984092 - hidden_after_layer18 0.981649 - hidden_after_layer23 0.984387 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.976637 - dit_step0_vt_uncond 0.980925 - dit_step0_vt 0.934226 - dit_step0_xt 0.999962 - dit_step5_vt_cond 0.967427 - dit_step5_vt 0.910792 - dit_step5_xt 0.998806 - dit_step10_vt_cond 0.948369 - dit_step10_vt 0.866632 - dit_step10_xt 0.994857 - dit_step15_vt_cond 0.909778 - dit_step15_vt 0.814508 - dit_step15_xt 0.984920 - dit_step20_vt_cond 0.863625 - 
dit_step20_vt 0.764052 - dit_step20_xt 0.965868 - dit_step25_vt_cond 0.811103 - dit_step25_vt 0.700861 - dit_step25_xt 0.937051 - dit_step30_vt_cond 0.753305 - dit_step30_vt 0.655816 - dit_step30_xt 0.899063 - dit_step35_vt_cond 0.699261 - dit_step35_vt 0.599863 - dit_step35_xt 0.859178 - dit_step40_vt_cond 0.670103 - dit_step40_vt 0.573321 - dit_step40_xt 0.825435 - dit_step45_vt_cond 0.701869 - dit_step45_vt 0.600028 - dit_step45_xt 0.803747 - dit_step49_vt_cond 0.749100 - dit_step49_vt 0.652063 - dit_x0 0.796334 - vae_audio 0.454343 - vae_audio (STFT cosine) 0.718386 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999962 0.072923 0.006527 -0.001861 0.980234 -0.001741 0.980402 - dit_step5_xt 0.998806 0.371089 0.032132 -0.007108 0.889710 -0.007143 0.887999 - dit_step10_xt 0.994857 0.721153 0.060355 -0.013425 0.811244 -0.012603 0.811299 - dit_step15_xt 0.984920 1.170655 0.094867 -0.019480 0.745370 -0.018114 0.745268 - dit_step20_xt 0.965868 1.624943 0.135007 -0.025812 0.700521 -0.023808 0.699582 - dit_step25_xt 0.937051 2.025275 0.178318 -0.032528 0.673256 -0.029311 0.679278 - dit_step30_xt 0.899063 2.555359 0.227638 -0.038874 0.670375 -0.035027 0.685262 - dit_step35_xt 0.859178 3.109559 0.281450 -0.045209 0.695123 -0.040716 0.717195 - dit_step40_xt 0.825435 3.695475 0.337125 -0.051359 0.742071 -0.046462 0.771853 - dit_step45_xt 0.803747 4.263174 0.390511 -0.057731 0.807748 -0.052475 0.843036 diff --git a/tests/Vulkan_Q5_K_M.log b/tests/Vulkan_Q5_K_M.log deleted file mode 100644 index e6ff2d6..0000000 --- a/tests/Vulkan_Q5_K_M.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf -[GGML] Running acestep-v15-turbo-Q5_K_M.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999812 - lyric_embed 1.000000 - enc_hidden 0.999132 - detok_output 0.999876 - context 0.999921 - noise 1.000000 - temb_t 0.999972 - hidden_after_proj_in 0.999959 - enc_after_cond_emb 0.999270 - layer0_sa_output 0.999442 - hidden_after_layer0 0.999638 - hidden_after_layer6 0.996691 - hidden_after_layer12 0.982345 - hidden_after_layer18 0.974400 - hidden_after_layer23 0.959734 - dit_step0_vt 0.838690 - dit_step0_xt 0.999650 - dit_step1_vt 0.854798 - dit_step1_xt 0.998726 - dit_step2_vt 0.843823 - dit_step2_xt 0.996265 - dit_step3_vt 0.832135 - dit_step3_xt 0.990412 - dit_step4_vt 0.826630 - dit_step4_xt 0.977378 - dit_step5_vt 0.824313 - dit_step5_xt 0.950549 - dit_step6_vt 0.806361 - dit_step6_xt 0.899178 - dit_step7_vt 0.774146 - dit_x0 0.825965 - vae_audio 0.488652 - vae_audio (STFT cosine) 0.756261 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999650 0.235943 0.018873 -0.002256 0.973219 -0.002342 0.972003 - dit_step1_xt 0.998726 0.436601 0.034659 -0.005174 0.942992 -0.005313 0.941730 - dit_step2_xt 0.996265 0.716827 0.057185 -0.009195 0.909263 -0.009311 0.908527 - dit_step3_xt 0.990412 0.968242 0.088230 -0.014806 0.872959 -0.014577 0.873624 - dit_step4_xt 0.977378 1.455533 0.130847 -0.022234 0.838622 -0.021660 0.841995 - dit_step5_xt 0.950549 2.134846 0.189630 -0.032763 0.816673 -0.032109 0.824593 - dit_step6_xt 0.899178 3.163587 0.280857 -0.047640 0.840933 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q5_K_M.gguf -[GGML] Running acestep-v15-sft-Q5_K_M.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999812 - lyric_embed 1.000000 - enc_hidden 0.999132 - detok_output 0.999876 - context 0.999921 - noise 1.000000 - temb_t 0.999899 - hidden_after_proj_in 0.999959 - enc_after_cond_emb 0.999269 - layer0_sa_output 0.999522 - hidden_after_layer0 0.999793 - hidden_after_layer6 0.995888 - hidden_after_layer12 0.985474 - hidden_after_layer18 0.984020 - hidden_after_layer23 0.986112 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.978964 - dit_step0_vt_uncond 0.973976 - dit_step0_vt 0.937223 - dit_step0_xt 0.999964 - dit_step5_vt_cond 0.967160 - dit_step5_vt 0.909198 - dit_step5_xt 0.998804 - dit_step10_vt_cond 0.950415 - dit_step10_vt 0.867165 - dit_step10_xt 0.994875 - dit_step15_vt_cond 0.914609 - dit_step15_vt 0.816760 - dit_step15_xt 0.985212 - dit_step20_vt_cond 0.868346 - dit_step20_vt 0.771014 - dit_step20_xt 0.966347 - dit_step25_vt_cond 0.813828 - dit_step25_vt 0.714557 - dit_step25_xt 0.936240 - dit_step30_vt_cond 0.758857 - dit_step30_vt 0.662399 - dit_step30_xt 0.898782 - dit_step35_vt_cond 0.707135 - dit_step35_vt 0.617898 - dit_step35_xt 0.859637 - dit_step40_vt_cond 0.679574 - dit_step40_vt 0.584797 - dit_step40_xt 0.827363 - dit_step45_vt_cond 0.709869 - dit_step45_vt 0.613484 - dit_step45_xt 0.805902 - dit_step49_vt_cond 0.756478 - dit_step49_vt 0.658766 - dit_x0 0.797882 - vae_audio 0.472032 - vae_audio (STFT cosine) 0.708586 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999964 0.073235 0.006362 -0.001778 0.980214 -0.001741 0.980402 - dit_step5_xt 0.998804 0.348623 0.032282 -0.006098 0.890574 -0.007143 0.887999 - dit_step10_xt 0.994875 0.617850 0.060577 -0.011335 0.811641 -0.012603 0.811299 - dit_step15_xt 0.985212 
1.165812 0.094804 -0.016284 0.748105 -0.018114 0.745268 - dit_step20_xt 0.966347 1.619635 0.134939 -0.021429 0.702593 -0.023808 0.699582 - dit_step25_xt 0.936240 2.011917 0.181224 -0.026596 0.681069 -0.029311 0.679278 - dit_step30_xt 0.898782 2.443318 0.230607 -0.031965 0.682407 -0.035027 0.685262 - dit_step35_xt 0.859637 2.917810 0.284657 -0.037104 0.710155 -0.040716 0.717195 - dit_step40_xt 0.827363 3.602165 0.340057 -0.042128 0.759737 -0.046462 0.771853 - dit_step45_xt 0.805902 4.251132 0.394434 -0.047162 0.828316 -0.052475 0.843036 diff --git a/tests/Vulkan_Q6_K.log b/tests/Vulkan_Q6_K.log deleted file mode 100644 index 916944c..0000000 --- a/tests/Vulkan_Q6_K.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf -[GGML] Running acestep-v15-turbo-Q6_K.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999812 - lyric_embed 1.000000 - enc_hidden 0.999665 - detok_output 0.999972 - context 0.999982 - noise 1.000000 - temb_t 0.999990 - hidden_after_proj_in 0.999982 - enc_after_cond_emb 0.999691 - layer0_sa_output 0.999774 - hidden_after_layer0 0.999710 - hidden_after_layer6 0.999855 - hidden_after_layer12 0.998856 - hidden_after_layer18 0.995803 - hidden_after_layer23 0.992072 - dit_step0_vt 0.970064 - dit_step0_xt 0.999934 - dit_step1_vt 0.924564 - dit_step1_xt 0.999651 - dit_step2_vt 0.915541 - dit_step2_xt 0.998650 - dit_step3_vt 0.915489 - dit_step3_xt 0.996123 - dit_step4_vt 0.916835 - dit_step4_xt 0.990527 - dit_step5_vt 0.909275 - dit_step5_xt 0.977470 - dit_step6_vt 0.899986 - dit_step6_xt 0.952353 - dit_step7_vt 0.880023 - dit_x0 0.915268 - vae_audio 0.753562 - 
vae_audio (STFT cosine) 0.882452 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999934 0.147239 0.007394 -0.002260 0.973056 -0.002342 0.972003 - dit_step1_xt 0.999651 0.410402 0.017745 -0.005286 0.943565 -0.005313 0.941730 - dit_step2_xt 0.998650 0.806730 0.033672 -0.009524 0.911097 -0.009311 0.908527 - dit_step3_xt 0.996123 1.479887 0.054500 -0.015235 0.876469 -0.014577 0.873624 - dit_step4_xt 0.990527 2.298363 0.081794 -0.022731 0.844225 -0.021660 0.841995 - dit_step5_xt 0.977470 3.296017 0.123177 -0.033626 0.825405 -0.032109 0.824593 - dit_step6_xt 0.952353 4.545029 0.185597 -0.049157 0.851892 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q6_K.gguf -[GGML] Running acestep-v15-sft-Q6_K.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999812 - lyric_embed 1.000000 - enc_hidden 0.999665 - detok_output 0.999972 - context 0.999982 - noise 1.000000 - temb_t 0.999973 - hidden_after_proj_in 0.999981 - enc_after_cond_emb 0.999694 - layer0_sa_output 0.999789 - hidden_after_layer0 0.999784 - hidden_after_layer6 0.999737 - hidden_after_layer12 0.999297 - hidden_after_layer18 0.998478 - hidden_after_layer23 0.998790 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998675 - dit_step0_vt_uncond 0.962163 - dit_step0_vt 0.981229 - dit_step0_xt 0.999989 - dit_step5_vt_cond 0.978548 - dit_step5_vt 0.903995 - dit_step5_xt 0.999251 - dit_step10_vt_cond 0.949676 - dit_step10_vt 0.866414 - dit_step10_xt 0.996103 - dit_step15_vt_cond 0.890112 - dit_step15_vt 0.755968 - dit_step15_xt 0.986117 - dit_step20_vt_cond 0.800524 - dit_step20_vt 
0.668617 - dit_step20_xt 0.965883 - dit_step25_vt_cond 0.715616 - dit_step25_vt 0.707363 - dit_step25_xt 0.936566 - dit_step30_vt_cond 0.651806 - dit_step30_vt 0.573252 - dit_step30_xt 0.901106 - dit_step35_vt_cond 0.613517 - dit_step35_vt 0.548023 - dit_step35_xt 0.866538 - dit_step40_vt_cond 0.617661 - dit_step40_vt 0.531763 - dit_step40_xt 0.837556 - dit_step45_vt_cond 0.690489 - dit_step45_vt 0.608902 - dit_step45_xt 0.819015 - dit_step49_vt_cond 0.760344 - dit_step49_vt 0.689227 - dit_x0 0.812918 - vae_audio 0.596607 - vae_audio (STFT cosine) 0.752876 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999989 0.053618 0.003814 -0.002076 0.980489 -0.001741 0.980402 - dit_step5_xt 0.999251 0.748318 0.025536 -0.008766 0.893415 -0.007143 0.887999 - dit_step10_xt 0.996103 1.428011 0.054273 -0.016368 0.822729 -0.012603 0.811299 - dit_step15_xt 0.986117 2.055885 0.098667 -0.024122 0.777367 -0.018114 0.745268 - dit_step20_xt 0.965883 2.750473 0.153407 -0.031399 0.762304 -0.023808 0.699582 - dit_step25_xt 0.936566 3.458536 0.209270 -0.038856 0.768389 -0.029311 0.679278 - dit_step30_xt 0.901106 4.182745 0.271563 -0.045971 0.805686 -0.035027 0.685262 - dit_step35_xt 0.866538 4.941256 0.336049 -0.053191 0.866756 -0.040716 0.717195 - dit_step40_xt 0.837556 5.867188 0.401823 -0.059864 0.948138 -0.046462 0.771853 - dit_step45_xt 0.819015 6.961776 0.463382 -0.066566 1.043107 -0.052475 0.843036 diff --git a/tests/Vulkan_Q8_0.log b/tests/Vulkan_Q8_0.log deleted file mode 100644 index 9262047..0000000 --- a/tests/Vulkan_Q8_0.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf -[GGML] Running acestep-v15-turbo-Q8_0.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999812 - lyric_embed 1.000000 - enc_hidden 0.999824 - detok_output 0.999983 - context 0.999990 - noise 1.000000 - temb_t 0.999998 - hidden_after_proj_in 0.999985 - enc_after_cond_emb 0.999817 - layer0_sa_output 0.999939 - hidden_after_layer0 0.999858 - hidden_after_layer6 0.999893 - hidden_after_layer12 0.999124 - hidden_after_layer18 0.996403 - hidden_after_layer23 0.993183 - dit_step0_vt 0.973885 - dit_step0_xt 0.999943 - dit_step1_vt 0.915468 - dit_step1_xt 0.999633 - dit_step2_vt 0.912211 - dit_step2_xt 0.998544 - dit_step3_vt 0.912707 - dit_step3_xt 0.995860 - dit_step4_vt 0.906019 - dit_step4_xt 0.989505 - dit_step5_vt 0.896537 - dit_step5_xt 0.974659 - dit_step6_vt 0.886047 - dit_step6_xt 0.945866 - dit_step7_vt 0.869793 - dit_x0 0.905017 - vae_audio 0.746037 - vae_audio (STFT cosine) 0.898352 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999943 0.140034 0.006943 -0.002318 0.973036 -0.002342 0.972003 - dit_step1_xt 0.999633 0.423125 0.018056 -0.005257 0.943026 -0.005313 0.941730 - dit_step2_xt 0.998544 0.841908 0.034537 -0.009209 0.910286 -0.009311 0.908527 - dit_step3_xt 0.995860 1.521911 0.055719 -0.014626 0.875169 -0.014577 0.873624 - dit_step4_xt 0.989505 2.346452 0.085477 -0.021803 0.842334 -0.021660 0.841995 - dit_step5_xt 0.974659 3.387389 0.130921 -0.032225 0.822365 -0.032109 0.824593 - dit_step6_xt 0.945866 4.812943 0.199910 -0.047290 0.846751 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q8_0.gguf -[GGML] Running acestep-v15-sft-Q8_0.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999812 - lyric_embed 1.000000 - enc_hidden 0.999824 - detok_output 0.999983 - context 0.999990 - noise 1.000000 - temb_t 0.999994 - hidden_after_proj_in 0.999985 - enc_after_cond_emb 0.999820 - layer0_sa_output 0.999932 - hidden_after_layer0 0.999867 - hidden_after_layer6 0.999809 - hidden_after_layer12 0.999421 - hidden_after_layer18 0.998648 - hidden_after_layer23 0.998927 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998848 - dit_step0_vt_uncond 0.964971 - dit_step0_vt 0.982622 - dit_step0_xt 0.999990 - dit_step5_vt_cond 0.978187 - dit_step5_vt 0.910806 - dit_step5_xt 0.999338 - dit_step10_vt_cond 0.948119 - dit_step10_vt 0.856732 - dit_step10_xt 0.996258 - dit_step15_vt_cond 0.885149 - dit_step15_vt 0.741011 - dit_step15_xt 0.986353 - dit_step20_vt_cond 0.792343 - dit_step20_vt 0.735701 - dit_step20_xt 0.966995 - dit_step25_vt_cond 0.713669 - dit_step25_vt 0.604646 - dit_step25_xt 0.937523 - dit_step30_vt_cond 0.654759 - dit_step30_vt 0.575313 - dit_step30_xt 0.901384 - dit_step35_vt_cond 0.616330 - dit_step35_vt 0.533322 - dit_step35_xt 0.865098 - dit_step40_vt_cond 0.615497 - dit_step40_vt 0.525598 - dit_step40_xt 0.834978 - dit_step45_vt_cond 0.687607 - dit_step45_vt 0.600947 - dit_step45_xt 0.816193 - dit_step49_vt_cond 0.757023 - dit_step49_vt 0.678778 - dit_x0 0.809822 - vae_audio 0.552742 - vae_audio (STFT cosine) 0.704247 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999990 0.053120 0.003694 -0.002028 0.980340 -0.001741 0.980402 - dit_step5_xt 0.999338 0.528079 0.024091 -0.008167 0.891761 -0.007143 0.887999 - dit_step10_xt 0.996258 1.260570 0.054251 -0.014905 0.821682 -0.012603 0.811299 - dit_step15_xt 0.986353 
1.896362 0.099359 -0.021353 0.777987 -0.018114 0.745268 - dit_step20_xt 0.966995 2.558488 0.150921 -0.027607 0.759790 -0.023808 0.699582 - dit_step25_xt 0.937523 3.268598 0.209264 -0.033645 0.770984 -0.029311 0.679278 - dit_step30_xt 0.901384 3.973653 0.271000 -0.039796 0.805477 -0.035027 0.685262 - dit_step35_xt 0.865098 4.656569 0.335194 -0.045754 0.864460 -0.040716 0.717195 - dit_step40_xt 0.834978 5.519352 0.400309 -0.051630 0.944399 -0.046462 0.771853 - dit_step45_xt 0.816193 6.556623 0.460383 -0.057408 1.036260 -0.052475 0.843036 diff --git a/tests/fixtures/ci-cover.json b/tests/fixtures/ci-cover.json deleted file mode 100644 index 3d21e1f..0000000 --- a/tests/fixtures/ci-cover.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "task_type": "cover", - "caption": "Short CI cover test", - "duration": 5, - "inference_steps": 4, - "guidance_scale": 1, - "shift": 3, - "seed": 42, - "reference_audio": "tests/fixtures/ci-text2music0.wav", - "audio_cover_strength": 0.8, - "audio_codes": "43316,18426,13366,59455,17783,49303,7423,29855,37158,37157,62317,61455,12847,19583,57031,34656,20254,10770,11416,15905,31413,23339,47091,12198,49531,37355,33090,38645,40707,16324,61436,46095,13941,5287,2239,13975,63815,2757,4862,13571,63495,39,29887,49426,12696,50847,40498,61056,25666,12989,23987,54763,25485,31683,28554,25355,16373,28995,2351,1655,7940,55831,34359,15350,15277,11717,20476,52239,5015,19807,24087,3559,20471,34193,32552,60999,29360,25338,38873,16768,17912,27584,24008,1528,449,25563,52684,53223,42183,37215,12343,39431,26055,28148,57286,38382,28863,7191,58397,18991,7695,30716,36784,12687,8707,25649,33718,3202,23035,10747,26354,63965,16260,11223,45679,14343,8679,49351,52927,2535,19207,46447,49615,12694,21110" -} diff --git a/tests/fixtures/ci-dit-only.json b/tests/fixtures/ci-dit-only.json deleted file mode 100644 index 0a83cb8..0000000 --- a/tests/fixtures/ci-dit-only.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "caption": "Short CI clip", - "lyrics": "", - "bpm": 90, - "duration": 5, - 
"keyscale": "C minor", - "timesignature": "4", - "vocal_language": "en", - "inference_steps": 4, - "shift": 3 -} diff --git a/tests/fixtures/ci-full.json b/tests/fixtures/ci-full.json deleted file mode 100644 index 3a37bfc..0000000 --- a/tests/fixtures/ci-full.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "caption": "Short CI house clip", - "lyrics": "[Intro]\n\n[Verse 1]\nTest\n\n[Outro]\nDone", - "bpm": 120, - "duration": 5, - "keyscale": "C major", - "timesignature": "4", - "vocal_language": "fr", - "inference_steps": 4, - "shift": 3 -} diff --git a/tests/fixtures/ci-partial.json b/tests/fixtures/ci-partial.json deleted file mode 100644 index 19ae9db..0000000 --- a/tests/fixtures/ci-partial.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "caption": "Short CI hip hop clip", - "lyrics": "[Intro]\nYeah\n\n[Verse 1]\nOne two\n\n[Chorus]\nTest\n\n[Outro]\nDone", - "duration": 5, - "vocal_language": "fr", - "inference_steps": 4, - "shift": 3 -} diff --git a/tests/fixtures/ci-request-reference.json b/tests/fixtures/ci-request-reference.json deleted file mode 100644 index 141d77a..0000000 --- a/tests/fixtures/ci-request-reference.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "task_type": "text2music", - "caption": "Short CI reference test", - "lyrics": "[Verse]\nTest\n[Chorus]\nRef", - "duration": 5, - "seed": 42, - "inference_steps": 4, - "guidance_scale": 1, - "shift": 3, - "reference_audio": "reference.wav", - "audio_codes": "", - "audio_cover_strength": 1 -} diff --git a/tests/fixtures/ci-text2music.json b/tests/fixtures/ci-text2music.json deleted file mode 100644 index 1613723..0000000 --- a/tests/fixtures/ci-text2music.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "task_type": "text2music", - "caption": "Short CI test clip", - "lyrics": "", - "duration": 5, - "inference_steps": 4, - "guidance_scale": 1, - "shift": 3, - "seed": 42, - "audio_codes": "" -} diff --git a/tests/run-generation-tests.sh b/tests/run-generation-tests.sh deleted file mode 100755 index 666698c..0000000 --- 
a/tests/run-generation-tests.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env bash -# Run the same generation tests as the GitHub Action (test-generation.yml). -# Use this to validate locally before pushing. No assumptions: build and models required. -# -# From repo root: -# ./models.sh # once: download Q8_0 + VAE into models/ -# mkdir -p build && cd build && cmake .. && cmake --build . --config Release -# cd .. && tests/run-generation-tests.sh - -set -e -cd "$(dirname "$0")/.." -REPO_ROOT="$PWD" - -# --- Build --- -if [ ! -f build/dit-vae ] || [ ! -f build/ace-qwen3 ]; then - echo "Missing build/dit-vae or build/ace-qwen3. Build first:" - echo " mkdir -p build && cd build && cmake .. && cmake --build . --config Release" - exit 1 -fi - -# --- Models --- -TEXT_ENC="models/Qwen3-Embedding-0.6B-Q8_0.gguf" -DIT="models/acestep-v15-turbo-Q8_0.gguf" -VAE="models/vae-BF16.gguf" -LM="models/acestep-5Hz-lm-4B-Q8_0.gguf" -for f in "$TEXT_ENC" "$DIT" "$VAE"; do - if [ ! -f "$f" ]; then - echo "Missing $f. Download models once: ./models.sh" - exit 1 - fi -done - -echo "[1/3] Test mode text2music (short)" -./build/dit-vae \ - --request tests/fixtures/ci-text2music.json \ - --text-encoder "$TEXT_ENC" \ - --dit "$DIT" \ - --vae "$VAE" -if [ ! -f tests/fixtures/ci-text2music0.wav ]; then - echo "FAIL: tests/fixtures/ci-text2music0.wav not created" - exit 1 -fi -echo " text2music WAV OK" - -echo "[2/3] Test mode cover with WAV reference (short)" -./build/dit-vae \ - --request tests/fixtures/ci-cover.json \ - --text-encoder "$TEXT_ENC" \ - --dit "$DIT" \ - --vae "$VAE" -if [ ! -f tests/fixtures/ci-cover0.wav ]; then - echo "FAIL: tests/fixtures/ci-cover0.wav not created" - exit 1 -fi -echo " cover WAV OK" - -echo "[3/3] Test full pipeline (LLM + DiT, short)" -if [ ! -f "$LM" ]; then - echo "Missing $LM; skipping full pipeline. Run ./models.sh to include LM." - exit 1 -fi -# ace-qwen3 names output from input path (e.g. 
request.json -> request0.json) -cp tests/fixtures/ci-text2music.json request.json -./build/ace-qwen3 \ - --request request.json \ - --model "$LM" -if [ ! -f request0.json ]; then - echo "FAIL: request0.json not created by ace-qwen3" - exit 1 -fi -./build/dit-vae \ - --request request0.json \ - --text-encoder "$TEXT_ENC" \ - --dit "$DIT" \ - --vae "$VAE" -if [ ! -f request00.wav ]; then - echo "FAIL: request00.wav not created" - exit 1 -fi -echo " full pipeline WAV OK" - -echo "" -echo "All generation tests passed locally. Safe to rely on CI for the same checks." diff --git a/third_party/minimp3.h b/third_party/minimp3.h deleted file mode 100644 index 3220ae1..0000000 --- a/third_party/minimp3.h +++ /dev/null @@ -1,1865 +0,0 @@ -#ifndef MINIMP3_H -#define MINIMP3_H -/* - https://github.com/lieff/minimp3 - To the extent possible under law, the author(s) have dedicated all copyright and related and neighboring rights to this software to the public domain worldwide. - This software is distributed without any warranty. - See . 
-*/ -#include - -#define MINIMP3_MAX_SAMPLES_PER_FRAME (1152*2) - -typedef struct -{ - int frame_bytes, frame_offset, channels, hz, layer, bitrate_kbps; -} mp3dec_frame_info_t; - -typedef struct -{ - float mdct_overlap[2][9*32], qmf_state[15*2*32]; - int reserv, free_format_bytes; - unsigned char header[4], reserv_buf[511]; -} mp3dec_t; - -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - -void mp3dec_init(mp3dec_t *dec); -#ifndef MINIMP3_FLOAT_OUTPUT -typedef int16_t mp3d_sample_t; -#else /* MINIMP3_FLOAT_OUTPUT */ -typedef float mp3d_sample_t; -void mp3dec_f32_to_s16(const float *in, int16_t *out, int num_samples); -#endif /* MINIMP3_FLOAT_OUTPUT */ -int mp3dec_decode_frame(mp3dec_t *dec, const uint8_t *mp3, int mp3_bytes, mp3d_sample_t *pcm, mp3dec_frame_info_t *info); - -#ifdef __cplusplus -} -#endif /* __cplusplus */ - -#endif /* MINIMP3_H */ -#if defined(MINIMP3_IMPLEMENTATION) && !defined(_MINIMP3_IMPLEMENTATION_GUARD) -#define _MINIMP3_IMPLEMENTATION_GUARD - -#include -#include - -#define MAX_FREE_FORMAT_FRAME_SIZE 2304 /* more than ISO spec's */ -#ifndef MAX_FRAME_SYNC_MATCHES -#define MAX_FRAME_SYNC_MATCHES 10 -#endif /* MAX_FRAME_SYNC_MATCHES */ - -#define MAX_L3_FRAME_PAYLOAD_BYTES MAX_FREE_FORMAT_FRAME_SIZE /* MUST be >= 320000/8/32000*1152 = 1440 */ - -#define MAX_BITRESERVOIR_BYTES 511 -#define SHORT_BLOCK_TYPE 2 -#define STOP_BLOCK_TYPE 3 -#define MODE_MONO 3 -#define MODE_JOINT_STEREO 1 -#define HDR_SIZE 4 -#define HDR_IS_MONO(h) (((h[3]) & 0xC0) == 0xC0) -#define HDR_IS_MS_STEREO(h) (((h[3]) & 0xE0) == 0x60) -#define HDR_IS_FREE_FORMAT(h) (((h[2]) & 0xF0) == 0) -#define HDR_IS_CRC(h) (!((h[1]) & 1)) -#define HDR_TEST_PADDING(h) ((h[2]) & 0x2) -#define HDR_TEST_MPEG1(h) ((h[1]) & 0x8) -#define HDR_TEST_NOT_MPEG25(h) ((h[1]) & 0x10) -#define HDR_TEST_I_STEREO(h) ((h[3]) & 0x10) -#define HDR_TEST_MS_STEREO(h) ((h[3]) & 0x20) -#define HDR_GET_STEREO_MODE(h) (((h[3]) >> 6) & 3) -#define HDR_GET_STEREO_MODE_EXT(h) (((h[3]) >> 4) & 3) -#define 
HDR_GET_LAYER(h) (((h[1]) >> 1) & 3) -#define HDR_GET_BITRATE(h) ((h[2]) >> 4) -#define HDR_GET_SAMPLE_RATE(h) (((h[2]) >> 2) & 3) -#define HDR_GET_MY_SAMPLE_RATE(h) (HDR_GET_SAMPLE_RATE(h) + (((h[1] >> 3) & 1) + ((h[1] >> 4) & 1))*3) -#define HDR_IS_FRAME_576(h) ((h[1] & 14) == 2) -#define HDR_IS_LAYER_1(h) ((h[1] & 6) == 6) - -#define BITS_DEQUANTIZER_OUT -1 -#define MAX_SCF (255 + BITS_DEQUANTIZER_OUT*4 - 210) -#define MAX_SCFI ((MAX_SCF + 3) & ~3) - -#define MINIMP3_MIN(a, b) ((a) > (b) ? (b) : (a)) -#define MINIMP3_MAX(a, b) ((a) < (b) ? (b) : (a)) - -#if !defined(MINIMP3_NO_SIMD) - -#if !defined(MINIMP3_ONLY_SIMD) && (defined(_M_X64) || defined(__x86_64__) || defined(__aarch64__) || defined(_M_ARM64)) -/* x64 always have SSE2, arm64 always have neon, no need for generic code */ -#define MINIMP3_ONLY_SIMD -#endif /* SIMD checks... */ - -#if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) || ((defined(__i386__) || defined(__x86_64__)) && defined(__SSE2__)) -#if defined(_MSC_VER) -#include -#endif /* defined(_MSC_VER) */ -#include -#define HAVE_SSE 1 -#define HAVE_SIMD 1 -#define VSTORE _mm_storeu_ps -#define VLD _mm_loadu_ps -#define VSET _mm_set1_ps -#define VADD _mm_add_ps -#define VSUB _mm_sub_ps -#define VMUL _mm_mul_ps -#define VMAC(a, x, y) _mm_add_ps(a, _mm_mul_ps(x, y)) -#define VMSB(a, x, y) _mm_sub_ps(a, _mm_mul_ps(x, y)) -#define VMUL_S(x, s) _mm_mul_ps(x, _mm_set1_ps(s)) -#define VREV(x) _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 1, 2, 3)) -typedef __m128 f4; -#if defined(_MSC_VER) || defined(MINIMP3_ONLY_SIMD) -#define minimp3_cpuid __cpuid -#else /* defined(_MSC_VER) || defined(MINIMP3_ONLY_SIMD) */ -static __inline__ __attribute__((always_inline)) void minimp3_cpuid(int CPUInfo[], const int InfoType) -{ -#if defined(__PIC__) - __asm__ __volatile__( -#if defined(__x86_64__) - "push %%rbx\n" - "cpuid\n" - "xchgl %%ebx, %1\n" - "pop %%rbx\n" -#else /* defined(__x86_64__) */ - "xchgl %%ebx, %1\n" - "cpuid\n" - "xchgl %%ebx, %1\n" -#endif /* 
defined(__x86_64__) */ - : "=a" (CPUInfo[0]), "=r" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3]) - : "a" (InfoType)); -#else /* defined(__PIC__) */ - __asm__ __volatile__( - "cpuid" - : "=a" (CPUInfo[0]), "=b" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3]) - : "a" (InfoType)); -#endif /* defined(__PIC__)*/ -} -#endif /* defined(_MSC_VER) || defined(MINIMP3_ONLY_SIMD) */ -static int have_simd(void) -{ -#ifdef MINIMP3_ONLY_SIMD - return 1; -#else /* MINIMP3_ONLY_SIMD */ - static int g_have_simd; - int CPUInfo[4]; -#ifdef MINIMP3_TEST - static int g_counter; - if (g_counter++ > 100) - return 0; -#endif /* MINIMP3_TEST */ - if (g_have_simd) - goto end; - minimp3_cpuid(CPUInfo, 0); - g_have_simd = 1; - if (CPUInfo[0] > 0) - { - minimp3_cpuid(CPUInfo, 1); - g_have_simd = (CPUInfo[3] & (1 << 26)) + 1; /* SSE2 */ - } -end: - return g_have_simd - 1; -#endif /* MINIMP3_ONLY_SIMD */ -} -#elif defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64) -#include -#define HAVE_SSE 0 -#define HAVE_SIMD 1 -#define VSTORE vst1q_f32 -#define VLD vld1q_f32 -#define VSET vmovq_n_f32 -#define VADD vaddq_f32 -#define VSUB vsubq_f32 -#define VMUL vmulq_f32 -#define VMAC(a, x, y) vmlaq_f32(a, x, y) -#define VMSB(a, x, y) vmlsq_f32(a, x, y) -#define VMUL_S(x, s) vmulq_f32(x, vmovq_n_f32(s)) -#define VREV(x) vcombine_f32(vget_high_f32(vrev64q_f32(x)), vget_low_f32(vrev64q_f32(x))) -typedef float32x4_t f4; -static int have_simd() -{ /* TODO: detect neon for !MINIMP3_ONLY_SIMD */ - return 1; -} -#else /* SIMD checks... */ -#define HAVE_SSE 0 -#define HAVE_SIMD 0 -#ifdef MINIMP3_ONLY_SIMD -#error MINIMP3_ONLY_SIMD used, but SSE/NEON not enabled -#endif /* MINIMP3_ONLY_SIMD */ -#endif /* SIMD checks... 
*/ -#else /* !defined(MINIMP3_NO_SIMD) */ -#define HAVE_SIMD 0 -#endif /* !defined(MINIMP3_NO_SIMD) */ - -#if defined(__ARM_ARCH) && (__ARM_ARCH >= 6) && !defined(__aarch64__) && !defined(_M_ARM64) -#define HAVE_ARMV6 1 -static __inline__ __attribute__((always_inline)) int32_t minimp3_clip_int16_arm(int32_t a) -{ - int32_t x = 0; - __asm__ ("ssat %0, #16, %1" : "=r"(x) : "r"(a)); - return x; -} -#else -#define HAVE_ARMV6 0 -#endif - -typedef struct -{ - const uint8_t *buf; - int pos, limit; -} bs_t; - -typedef struct -{ - float scf[3*64]; - uint8_t total_bands, stereo_bands, bitalloc[64], scfcod[64]; -} L12_scale_info; - -typedef struct -{ - uint8_t tab_offset, code_tab_width, band_count; -} L12_subband_alloc_t; - -typedef struct -{ - const uint8_t *sfbtab; - uint16_t part_23_length, big_values, scalefac_compress; - uint8_t global_gain, block_type, mixed_block_flag, n_long_sfb, n_short_sfb; - uint8_t table_select[3], region_count[3], subblock_gain[3]; - uint8_t preflag, scalefac_scale, count1_table, scfsi; -} L3_gr_info_t; - -typedef struct -{ - bs_t bs; - uint8_t maindata[MAX_BITRESERVOIR_BYTES + MAX_L3_FRAME_PAYLOAD_BYTES]; - L3_gr_info_t gr_info[4]; - float grbuf[2][576], scf[40], syn[18 + 15][2*32]; - uint8_t ist_pos[2][39]; -} mp3dec_scratch_t; - -static void bs_init(bs_t *bs, const uint8_t *data, int bytes) -{ - bs->buf = data; - bs->pos = 0; - bs->limit = bytes*8; -} - -static uint32_t get_bits(bs_t *bs, int n) -{ - uint32_t next, cache = 0, s = bs->pos & 7; - int shl = n + s; - const uint8_t *p = bs->buf + (bs->pos >> 3); - if ((bs->pos += n) > bs->limit) - return 0; - next = *p++ & (255 >> s); - while ((shl -= 8) > 0) - { - cache |= next << shl; - next = *p++; - } - return cache | (next >> -shl); -} - -static int hdr_valid(const uint8_t *h) -{ - return h[0] == 0xff && - ((h[1] & 0xF0) == 0xf0 || (h[1] & 0xFE) == 0xe2) && - (HDR_GET_LAYER(h) != 0) && - (HDR_GET_BITRATE(h) != 15) && - (HDR_GET_SAMPLE_RATE(h) != 3); -} - -static int hdr_compare(const uint8_t 
*h1, const uint8_t *h2) -{ - return hdr_valid(h2) && - ((h1[1] ^ h2[1]) & 0xFE) == 0 && - ((h1[2] ^ h2[2]) & 0x0C) == 0 && - !(HDR_IS_FREE_FORMAT(h1) ^ HDR_IS_FREE_FORMAT(h2)); -} - -static unsigned hdr_bitrate_kbps(const uint8_t *h) -{ - static const uint8_t halfrate[2][3][15] = { - { { 0,4,8,12,16,20,24,28,32,40,48,56,64,72,80 }, { 0,4,8,12,16,20,24,28,32,40,48,56,64,72,80 }, { 0,16,24,28,32,40,48,56,64,72,80,88,96,112,128 } }, - { { 0,16,20,24,28,32,40,48,56,64,80,96,112,128,160 }, { 0,16,24,28,32,40,48,56,64,80,96,112,128,160,192 }, { 0,16,32,48,64,80,96,112,128,144,160,176,192,208,224 } }, - }; - return 2*halfrate[!!HDR_TEST_MPEG1(h)][HDR_GET_LAYER(h) - 1][HDR_GET_BITRATE(h)]; -} - -static unsigned hdr_sample_rate_hz(const uint8_t *h) -{ - static const unsigned g_hz[3] = { 44100, 48000, 32000 }; - return g_hz[HDR_GET_SAMPLE_RATE(h)] >> (int)!HDR_TEST_MPEG1(h) >> (int)!HDR_TEST_NOT_MPEG25(h); -} - -static unsigned hdr_frame_samples(const uint8_t *h) -{ - return HDR_IS_LAYER_1(h) ? 384 : (1152 >> (int)HDR_IS_FRAME_576(h)); -} - -static int hdr_frame_bytes(const uint8_t *h, int free_format_size) -{ - int frame_bytes = hdr_frame_samples(h)*hdr_bitrate_kbps(h)*125/hdr_sample_rate_hz(h); - if (HDR_IS_LAYER_1(h)) - { - frame_bytes &= ~3; /* slot align */ - } - return frame_bytes ? frame_bytes : free_format_size; -} - -static int hdr_padding(const uint8_t *h) -{ - return HDR_TEST_PADDING(h) ? (HDR_IS_LAYER_1(h) ? 4 : 1) : 0; -} - -#ifndef MINIMP3_ONLY_MP3 -static const L12_subband_alloc_t *L12_subband_alloc_table(const uint8_t *hdr, L12_scale_info *sci) -{ - const L12_subband_alloc_t *alloc; - int mode = HDR_GET_STEREO_MODE(hdr); - int nbands, stereo_bands = (mode == MODE_MONO) ? 0 : (mode == MODE_JOINT_STEREO) ? 
(HDR_GET_STEREO_MODE_EXT(hdr) << 2) + 4 : 32; - - if (HDR_IS_LAYER_1(hdr)) - { - static const L12_subband_alloc_t g_alloc_L1[] = { { 76, 4, 32 } }; - alloc = g_alloc_L1; - nbands = 32; - } else if (!HDR_TEST_MPEG1(hdr)) - { - static const L12_subband_alloc_t g_alloc_L2M2[] = { { 60, 4, 4 }, { 44, 3, 7 }, { 44, 2, 19 } }; - alloc = g_alloc_L2M2; - nbands = 30; - } else - { - static const L12_subband_alloc_t g_alloc_L2M1[] = { { 0, 4, 3 }, { 16, 4, 8 }, { 32, 3, 12 }, { 40, 2, 7 } }; - int sample_rate_idx = HDR_GET_SAMPLE_RATE(hdr); - unsigned kbps = hdr_bitrate_kbps(hdr) >> (int)(mode != MODE_MONO); - if (!kbps) /* free-format */ - { - kbps = 192; - } - - alloc = g_alloc_L2M1; - nbands = 27; - if (kbps < 56) - { - static const L12_subband_alloc_t g_alloc_L2M1_lowrate[] = { { 44, 4, 2 }, { 44, 3, 10 } }; - alloc = g_alloc_L2M1_lowrate; - nbands = sample_rate_idx == 2 ? 12 : 8; - } else if (kbps >= 96 && sample_rate_idx != 1) - { - nbands = 30; - } - } - - sci->total_bands = (uint8_t)nbands; - sci->stereo_bands = (uint8_t)MINIMP3_MIN(stereo_bands, nbands); - - return alloc; -} - -static void L12_read_scalefactors(bs_t *bs, uint8_t *pba, uint8_t *scfcod, int bands, float *scf) -{ - static const float g_deq_L12[18*3] = { -#define DQ(x) 9.53674316e-07f/x, 7.56931807e-07f/x, 6.00777173e-07f/x - DQ(3),DQ(7),DQ(15),DQ(31),DQ(63),DQ(127),DQ(255),DQ(511),DQ(1023),DQ(2047),DQ(4095),DQ(8191),DQ(16383),DQ(32767),DQ(65535),DQ(3),DQ(5),DQ(9) - }; - int i, m; - for (i = 0; i < bands; i++) - { - float s = 0; - int ba = *pba++; - int mask = ba ? 
4 + ((19 >> scfcod[i]) & 3) : 0; - for (m = 4; m; m >>= 1) - { - if (mask & m) - { - int b = get_bits(bs, 6); - s = g_deq_L12[ba*3 - 6 + b % 3]*(1 << 21 >> b/3); - } - *scf++ = s; - } - } -} - -static void L12_read_scale_info(const uint8_t *hdr, bs_t *bs, L12_scale_info *sci) -{ - static const uint8_t g_bitalloc_code_tab[] = { - 0,17, 3, 4, 5,6,7, 8,9,10,11,12,13,14,15,16, - 0,17,18, 3,19,4,5, 6,7, 8, 9,10,11,12,13,16, - 0,17,18, 3,19,4,5,16, - 0,17,18,16, - 0,17,18,19, 4,5,6, 7,8, 9,10,11,12,13,14,15, - 0,17,18, 3,19,4,5, 6,7, 8, 9,10,11,12,13,14, - 0, 2, 3, 4, 5,6,7, 8,9,10,11,12,13,14,15,16 - }; - const L12_subband_alloc_t *subband_alloc = L12_subband_alloc_table(hdr, sci); - - int i, k = 0, ba_bits = 0; - const uint8_t *ba_code_tab = g_bitalloc_code_tab; - - for (i = 0; i < sci->total_bands; i++) - { - uint8_t ba; - if (i == k) - { - k += subband_alloc->band_count; - ba_bits = subband_alloc->code_tab_width; - ba_code_tab = g_bitalloc_code_tab + subband_alloc->tab_offset; - subband_alloc++; - } - ba = ba_code_tab[get_bits(bs, ba_bits)]; - sci->bitalloc[2*i] = ba; - if (i < sci->stereo_bands) - { - ba = ba_code_tab[get_bits(bs, ba_bits)]; - } - sci->bitalloc[2*i + 1] = sci->stereo_bands ? ba : 0; - } - - for (i = 0; i < 2*sci->total_bands; i++) - { - sci->scfcod[i] = sci->bitalloc[i] ? HDR_IS_LAYER_1(hdr) ? 
2 : get_bits(bs, 2) : 6; - } - - L12_read_scalefactors(bs, sci->bitalloc, sci->scfcod, sci->total_bands*2, sci->scf); - - for (i = sci->stereo_bands; i < sci->total_bands; i++) - { - sci->bitalloc[2*i + 1] = 0; - } -} - -static int L12_dequantize_granule(float *grbuf, bs_t *bs, L12_scale_info *sci, int group_size) -{ - int i, j, k, choff = 576; - for (j = 0; j < 4; j++) - { - float *dst = grbuf + group_size*j; - for (i = 0; i < 2*sci->total_bands; i++) - { - int ba = sci->bitalloc[i]; - if (ba != 0) - { - if (ba < 17) - { - int half = (1 << (ba - 1)) - 1; - for (k = 0; k < group_size; k++) - { - dst[k] = (float)((int)get_bits(bs, ba) - half); - } - } else - { - unsigned mod = (2 << (ba - 17)) + 1; /* 3, 5, 9 */ - unsigned code = get_bits(bs, mod + 2 - (mod >> 3)); /* 5, 7, 10 */ - for (k = 0; k < group_size; k++, code /= mod) - { - dst[k] = (float)((int)(code % mod - mod/2)); - } - } - } - dst += choff; - choff = 18 - choff; - } - } - return group_size*4; -} - -static void L12_apply_scf_384(L12_scale_info *sci, const float *scf, float *dst) -{ - int i, k; - memcpy(dst + 576 + sci->stereo_bands*18, dst + sci->stereo_bands*18, (sci->total_bands - sci->stereo_bands)*18*sizeof(float)); - for (i = 0; i < sci->total_bands; i++, dst += 18, scf += 6) - { - for (k = 0; k < 12; k++) - { - dst[k + 0] *= scf[0]; - dst[k + 576] *= scf[3]; - } - } -} -#endif /* MINIMP3_ONLY_MP3 */ - -static int L3_read_side_info(bs_t *bs, L3_gr_info_t *gr, const uint8_t *hdr) -{ - static const uint8_t g_scf_long[8][23] = { - { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 }, - { 12,12,12,12,12,12,16,20,24,28,32,40,48,56,64,76,90,2,2,2,2,2,0 }, - { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 }, - { 6,6,6,6,6,6,8,10,12,14,16,18,22,26,32,38,46,54,62,70,76,36,0 }, - { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 }, - { 4,4,4,4,4,4,6,6,8,8,10,12,16,20,24,28,34,42,50,54,76,158,0 }, - { 4,4,4,4,4,4,6,6,6,8,10,12,16,18,22,28,34,40,46,54,54,192,0 }, - { 
4,4,4,4,4,4,6,6,8,10,12,16,20,24,30,38,46,56,68,84,102,26,0 } - }; - static const uint8_t g_scf_short[8][40] = { - { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 }, - { 8,8,8,8,8,8,8,8,8,12,12,12,16,16,16,20,20,20,24,24,24,28,28,28,36,36,36,2,2,2,2,2,2,2,2,2,26,26,26,0 }, - { 4,4,4,4,4,4,4,4,4,6,6,6,6,6,6,8,8,8,10,10,10,14,14,14,18,18,18,26,26,26,32,32,32,42,42,42,18,18,18,0 }, - { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,32,32,32,44,44,44,12,12,12,0 }, - { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 }, - { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,22,22,22,30,30,30,56,56,56,0 }, - { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,6,6,6,10,10,10,12,12,12,14,14,14,16,16,16,20,20,20,26,26,26,66,66,66,0 }, - { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,12,12,12,16,16,16,20,20,20,26,26,26,34,34,34,42,42,42,12,12,12,0 } - }; - static const uint8_t g_scf_mixed[8][40] = { - { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 }, - { 12,12,12,4,4,4,8,8,8,12,12,12,16,16,16,20,20,20,24,24,24,28,28,28,36,36,36,2,2,2,2,2,2,2,2,2,26,26,26,0 }, - { 6,6,6,6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,14,14,14,18,18,18,26,26,26,32,32,32,42,42,42,18,18,18,0 }, - { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,32,32,32,44,44,44,12,12,12,0 }, - { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 }, - { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,22,22,22,30,30,30,56,56,56,0 }, - { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,6,6,6,10,10,10,12,12,12,14,14,14,16,16,16,20,20,20,26,26,26,66,66,66,0 }, - { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,8,8,8,12,12,12,16,16,16,20,20,20,26,26,26,34,34,34,42,42,42,12,12,12,0 } - }; - - unsigned tables, scfsi = 0; - int main_data_begin, part_23_sum = 0; - int sr_idx = 
HDR_GET_MY_SAMPLE_RATE(hdr); sr_idx -= (sr_idx != 0); - int gr_count = HDR_IS_MONO(hdr) ? 1 : 2; - - if (HDR_TEST_MPEG1(hdr)) - { - gr_count *= 2; - main_data_begin = get_bits(bs, 9); - scfsi = get_bits(bs, 7 + gr_count); - } else - { - main_data_begin = get_bits(bs, 8 + gr_count) >> gr_count; - } - - do - { - if (HDR_IS_MONO(hdr)) - { - scfsi <<= 4; - } - gr->part_23_length = (uint16_t)get_bits(bs, 12); - part_23_sum += gr->part_23_length; - gr->big_values = (uint16_t)get_bits(bs, 9); - if (gr->big_values > 288) - { - return -1; - } - gr->global_gain = (uint8_t)get_bits(bs, 8); - gr->scalefac_compress = (uint16_t)get_bits(bs, HDR_TEST_MPEG1(hdr) ? 4 : 9); - gr->sfbtab = g_scf_long[sr_idx]; - gr->n_long_sfb = 22; - gr->n_short_sfb = 0; - if (get_bits(bs, 1)) - { - gr->block_type = (uint8_t)get_bits(bs, 2); - if (!gr->block_type) - { - return -1; - } - gr->mixed_block_flag = (uint8_t)get_bits(bs, 1); - gr->region_count[0] = 7; - gr->region_count[1] = 255; - if (gr->block_type == SHORT_BLOCK_TYPE) - { - scfsi &= 0x0F0F; - if (!gr->mixed_block_flag) - { - gr->region_count[0] = 8; - gr->sfbtab = g_scf_short[sr_idx]; - gr->n_long_sfb = 0; - gr->n_short_sfb = 39; - } else - { - gr->sfbtab = g_scf_mixed[sr_idx]; - gr->n_long_sfb = HDR_TEST_MPEG1(hdr) ? 8 : 6; - gr->n_short_sfb = 30; - } - } - tables = get_bits(bs, 10); - tables <<= 5; - gr->subblock_gain[0] = (uint8_t)get_bits(bs, 3); - gr->subblock_gain[1] = (uint8_t)get_bits(bs, 3); - gr->subblock_gain[2] = (uint8_t)get_bits(bs, 3); - } else - { - gr->block_type = 0; - gr->mixed_block_flag = 0; - tables = get_bits(bs, 15); - gr->region_count[0] = (uint8_t)get_bits(bs, 4); - gr->region_count[1] = (uint8_t)get_bits(bs, 3); - gr->region_count[2] = 255; - } - gr->table_select[0] = (uint8_t)(tables >> 10); - gr->table_select[1] = (uint8_t)((tables >> 5) & 31); - gr->table_select[2] = (uint8_t)((tables) & 31); - gr->preflag = HDR_TEST_MPEG1(hdr) ? 
get_bits(bs, 1) : (gr->scalefac_compress >= 500); - gr->scalefac_scale = (uint8_t)get_bits(bs, 1); - gr->count1_table = (uint8_t)get_bits(bs, 1); - gr->scfsi = (uint8_t)((scfsi >> 12) & 15); - scfsi <<= 4; - gr++; - } while(--gr_count); - - if (part_23_sum + bs->pos > bs->limit + main_data_begin*8) - { - return -1; - } - - return main_data_begin; -} - -static void L3_read_scalefactors(uint8_t *scf, uint8_t *ist_pos, const uint8_t *scf_size, const uint8_t *scf_count, bs_t *bitbuf, int scfsi) -{ - int i, k; - for (i = 0; i < 4 && scf_count[i]; i++, scfsi *= 2) - { - int cnt = scf_count[i]; - if (scfsi & 8) - { - memcpy(scf, ist_pos, cnt); - } else - { - int bits = scf_size[i]; - if (!bits) - { - memset(scf, 0, cnt); - memset(ist_pos, 0, cnt); - } else - { - int max_scf = (scfsi < 0) ? (1 << bits) - 1 : -1; - for (k = 0; k < cnt; k++) - { - int s = get_bits(bitbuf, bits); - ist_pos[k] = (s == max_scf ? -1 : s); - scf[k] = s; - } - } - } - ist_pos += cnt; - scf += cnt; - } - scf[0] = scf[1] = scf[2] = 0; -} - -static float L3_ldexp_q2(float y, int exp_q2) -{ - static const float g_expfrac[4] = { 9.31322575e-10f,7.83145814e-10f,6.58544508e-10f,5.53767716e-10f }; - int e; - do - { - e = MINIMP3_MIN(30*4, exp_q2); - y *= g_expfrac[e & 3]*(1 << 30 >> (e >> 2)); - } while ((exp_q2 -= e) > 0); - return y; -} - -static void L3_decode_scalefactors(const uint8_t *hdr, uint8_t *ist_pos, bs_t *bs, const L3_gr_info_t *gr, float *scf, int ch) -{ - static const uint8_t g_scf_partitions[3][28] = { - { 6,5,5, 5,6,5,5,5,6,5, 7,3,11,10,0,0, 7, 7, 7,0, 6, 6,6,3, 8, 8,5,0 }, - { 8,9,6,12,6,9,9,9,6,9,12,6,15,18,0,0, 6,15,12,0, 6,12,9,6, 6,18,9,0 }, - { 9,9,6,12,9,9,9,9,9,9,12,6,18,18,0,0,12,12,12,0,12, 9,9,6,15,12,9,0 } - }; - const uint8_t *scf_partition = g_scf_partitions[!!gr->n_short_sfb + !gr->n_long_sfb]; - uint8_t scf_size[4], iscf[40]; - int i, scf_shift = gr->scalefac_scale + 1, gain_exp, scfsi = gr->scfsi; - float gain; - - if (HDR_TEST_MPEG1(hdr)) - { - static const uint8_t 
g_scfc_decode[16] = { 0,1,2,3, 12,5,6,7, 9,10,11,13, 14,15,18,19 }; - int part = g_scfc_decode[gr->scalefac_compress]; - scf_size[1] = scf_size[0] = (uint8_t)(part >> 2); - scf_size[3] = scf_size[2] = (uint8_t)(part & 3); - } else - { - static const uint8_t g_mod[6*4] = { 5,5,4,4,5,5,4,1,4,3,1,1,5,6,6,1,4,4,4,1,4,3,1,1 }; - int k, modprod, sfc, ist = HDR_TEST_I_STEREO(hdr) && ch; - sfc = gr->scalefac_compress >> ist; - for (k = ist*3*4; sfc >= 0; sfc -= modprod, k += 4) - { - for (modprod = 1, i = 3; i >= 0; i--) - { - scf_size[i] = (uint8_t)(sfc / modprod % g_mod[k + i]); - modprod *= g_mod[k + i]; - } - } - scf_partition += k; - scfsi = -16; - } - L3_read_scalefactors(iscf, ist_pos, scf_size, scf_partition, bs, scfsi); - - if (gr->n_short_sfb) - { - int sh = 3 - scf_shift; - for (i = 0; i < gr->n_short_sfb; i += 3) - { - iscf[gr->n_long_sfb + i + 0] += gr->subblock_gain[0] << sh; - iscf[gr->n_long_sfb + i + 1] += gr->subblock_gain[1] << sh; - iscf[gr->n_long_sfb + i + 2] += gr->subblock_gain[2] << sh; - } - } else if (gr->preflag) - { - static const uint8_t g_preamp[10] = { 1,1,1,1,2,2,3,3,3,2 }; - for (i = 0; i < 10; i++) - { - iscf[11 + i] += g_preamp[i]; - } - } - - gain_exp = gr->global_gain + BITS_DEQUANTIZER_OUT*4 - 210 - (HDR_IS_MS_STEREO(hdr) ? 
2 : 0); - gain = L3_ldexp_q2(1 << (MAX_SCFI/4), MAX_SCFI - gain_exp); - for (i = 0; i < (int)(gr->n_long_sfb + gr->n_short_sfb); i++) - { - scf[i] = L3_ldexp_q2(gain, iscf[i] << scf_shift); - } -} - -static const float g_pow43[129 + 16] = { - 0,-1,-2.519842f,-4.326749f,-6.349604f,-8.549880f,-10.902724f,-13.390518f,-16.000000f,-18.720754f,-21.544347f,-24.463781f,-27.473142f,-30.567351f,-33.741992f,-36.993181f, - 0,1,2.519842f,4.326749f,6.349604f,8.549880f,10.902724f,13.390518f,16.000000f,18.720754f,21.544347f,24.463781f,27.473142f,30.567351f,33.741992f,36.993181f,40.317474f,43.711787f,47.173345f,50.699631f,54.288352f,57.937408f,61.644865f,65.408941f,69.227979f,73.100443f,77.024898f,81.000000f,85.024491f,89.097188f,93.216975f,97.382800f,101.593667f,105.848633f,110.146801f,114.487321f,118.869381f,123.292209f,127.755065f,132.257246f,136.798076f,141.376907f,145.993119f,150.646117f,155.335327f,160.060199f,164.820202f,169.614826f,174.443577f,179.305980f,184.201575f,189.129918f,194.090580f,199.083145f,204.107210f,209.162385f,214.248292f,219.364564f,224.510845f,229.686789f,234.892058f,240.126328f,245.389280f,250.680604f,256.000000f,261.347174f,266.721841f,272.123723f,277.552547f,283.008049f,288.489971f,293.998060f,299.532071f,305.091761f,310.676898f,316.287249f,321.922592f,327.582707f,333.267377f,338.976394f,344.709550f,350.466646f,356.247482f,362.051866f,367.879608f,373.730522f,379.604427f,385.501143f,391.420496f,397.362314f,403.326427f,409.312672f,415.320884f,421.350905f,427.402579f,433.475750f,439.570269f,445.685987f,451.822757f,457.980436f,464.158883f,470.357960f,476.577530f,482.817459f,489.077615f,495.357868f,501.658090f,507.978156f,514.317941f,520.677324f,527.056184f,533.454404f,539.871867f,546.308458f,552.764065f,559.238575f,565.731879f,572.243870f,578.774440f,585.323483f,591.890898f,598.476581f,605.080431f,611.702349f,618.342238f,625.000000f,631.675540f,638.368763f,645.079578f -}; - -static float L3_pow_43(int x) -{ - float frac; - int sign, mult = 256; - - if (x < 
129) - { - return g_pow43[16 + x]; - } - - if (x < 1024) - { - mult = 16; - x <<= 3; - } - - sign = 2*x & 64; - frac = (float)((x & 63) - sign) / ((x & ~63) + sign); - return g_pow43[16 + ((x + sign) >> 6)]*(1.f + frac*((4.f/3) + frac*(2.f/9)))*mult; -} - -static void L3_huffman(float *dst, bs_t *bs, const L3_gr_info_t *gr_info, const float *scf, int layer3gr_limit) -{ - static const int16_t tabs[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 785,785,785,785,784,784,784,784,513,513,513,513,513,513,513,513,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256, - -255,1313,1298,1282,785,785,785,785,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,290,288, - -255,1313,1298,1282,769,769,769,769,529,529,529,529,529,529,529,529,528,528,528,528,528,528,528,528,512,512,512,512,512,512,512,512,290,288, - -253,-318,-351,-367,785,785,785,785,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,819,818,547,547,275,275,275,275,561,560,515,546,289,274,288,258, - -254,-287,1329,1299,1314,1312,1057,1057,1042,1042,1026,1026,784,784,784,784,529,529,529,529,529,529,529,529,769,769,769,769,768,768,768,768,563,560,306,306,291,259, - -252,-413,-477,-542,1298,-575,1041,1041,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-383,-399,1107,1092,1106,1061,849,849,789,789,1104,1091,773,773,1076,1075,341,340,325,309,834,804,577,577,532,532,516,516,832,818,803,816,561,561,531,531,515,546,289,289,288,258, - -252,-429,-493,-559,1057,1057,1042,1042,529,529,529,529,529,529,529,529,784,784,784,784,769,769,769,769,512,512,512,512,512,512,512,512,-382,1077,-415,1106,1061,1104,849,849,789,789,1091,1076,1029,1075,834,834,597,581,340,340,339,324,804,833,532,532,832,772,818,803,817,787,816,771,290,290,290,290,288,258, - 
-253,-349,-414,-447,-463,1329,1299,-479,1314,1312,1057,1057,1042,1042,1026,1026,785,785,785,785,784,784,784,784,769,769,769,769,768,768,768,768,-319,851,821,-335,836,850,805,849,341,340,325,336,533,533,579,579,564,564,773,832,578,548,563,516,321,276,306,291,304,259, - -251,-572,-733,-830,-863,-879,1041,1041,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-511,-527,-543,1396,1351,1381,1366,1395,1335,1380,-559,1334,1138,1138,1063,1063,1350,1392,1031,1031,1062,1062,1364,1363,1120,1120,1333,1348,881,881,881,881,375,374,359,373,343,358,341,325,791,791,1123,1122,-703,1105,1045,-719,865,865,790,790,774,774,1104,1029,338,293,323,308,-799,-815,833,788,772,818,803,816,322,292,307,320,561,531,515,546,289,274,288,258, - -251,-525,-605,-685,-765,-831,-846,1298,1057,1057,1312,1282,785,785,785,785,784,784,784,784,769,769,769,769,512,512,512,512,512,512,512,512,1399,1398,1383,1367,1382,1396,1351,-511,1381,1366,1139,1139,1079,1079,1124,1124,1364,1349,1363,1333,882,882,882,882,807,807,807,807,1094,1094,1136,1136,373,341,535,535,881,775,867,822,774,-591,324,338,-671,849,550,550,866,864,609,609,293,336,534,534,789,835,773,-751,834,804,308,307,833,788,832,772,562,562,547,547,305,275,560,515,290,290, - -252,-397,-477,-557,-622,-653,-719,-735,-750,1329,1299,1314,1057,1057,1042,1042,1312,1282,1024,1024,785,785,785,785,784,784,784,784,769,769,769,769,-383,1127,1141,1111,1126,1140,1095,1110,869,869,883,883,1079,1109,882,882,375,374,807,868,838,881,791,-463,867,822,368,263,852,837,836,-543,610,610,550,550,352,336,534,534,865,774,851,821,850,805,593,533,579,564,773,832,578,578,548,548,577,577,307,276,306,291,516,560,259,259, - 
-250,-2107,-2507,-2764,-2909,-2974,-3007,-3023,1041,1041,1040,1040,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-767,-1052,-1213,-1277,-1358,-1405,-1469,-1535,-1550,-1582,-1614,-1647,-1662,-1694,-1726,-1759,-1774,-1807,-1822,-1854,-1886,1565,-1919,-1935,-1951,-1967,1731,1730,1580,1717,-1983,1729,1564,-1999,1548,-2015,-2031,1715,1595,-2047,1714,-2063,1610,-2079,1609,-2095,1323,1323,1457,1457,1307,1307,1712,1547,1641,1700,1699,1594,1685,1625,1442,1442,1322,1322,-780,-973,-910,1279,1278,1277,1262,1276,1261,1275,1215,1260,1229,-959,974,974,989,989,-943,735,478,478,495,463,506,414,-1039,1003,958,1017,927,942,987,957,431,476,1272,1167,1228,-1183,1256,-1199,895,895,941,941,1242,1227,1212,1135,1014,1014,490,489,503,487,910,1013,985,925,863,894,970,955,1012,847,-1343,831,755,755,984,909,428,366,754,559,-1391,752,486,457,924,997,698,698,983,893,740,740,908,877,739,739,667,667,953,938,497,287,271,271,683,606,590,712,726,574,302,302,738,736,481,286,526,725,605,711,636,724,696,651,589,681,666,710,364,467,573,695,466,466,301,465,379,379,709,604,665,679,316,316,634,633,436,436,464,269,424,394,452,332,438,363,347,408,393,448,331,422,362,407,392,421,346,406,391,376,375,359,1441,1306,-2367,1290,-2383,1337,-2399,-2415,1426,1321,-2431,1411,1336,-2447,-2463,-2479,1169,1169,1049,1049,1424,1289,1412,1352,1319,-2495,1154,1154,1064,1064,1153,1153,416,390,360,404,403,389,344,374,373,343,358,372,327,357,342,311,356,326,1395,1394,1137,1137,1047,1047,1365,1392,1287,1379,1334,1364,1349,1378,1318,1363,792,792,792,792,1152,1152,1032,1032,1121,1121,1046,1046,1120,1120,1030,1030,-2895,1106,1061,1104,849,849,789,789,1091,1076,1029,1090,1060,1075,833,833,309,324,532,532,832,772,818,803,561,561,531,560,515,546,289,274,288,258, - 
-250,-1179,-1579,-1836,-1996,-2124,-2253,-2333,-2413,-2477,-2542,-2574,-2607,-2622,-2655,1314,1313,1298,1312,1282,785,785,785,785,1040,1040,1025,1025,768,768,768,768,-766,-798,-830,-862,-895,-911,-927,-943,-959,-975,-991,-1007,-1023,-1039,-1055,-1070,1724,1647,-1103,-1119,1631,1767,1662,1738,1708,1723,-1135,1780,1615,1779,1599,1677,1646,1778,1583,-1151,1777,1567,1737,1692,1765,1722,1707,1630,1751,1661,1764,1614,1736,1676,1763,1750,1645,1598,1721,1691,1762,1706,1582,1761,1566,-1167,1749,1629,767,766,751,765,494,494,735,764,719,749,734,763,447,447,748,718,477,506,431,491,446,476,461,505,415,430,475,445,504,399,460,489,414,503,383,474,429,459,502,502,746,752,488,398,501,473,413,472,486,271,480,270,-1439,-1455,1357,-1471,-1487,-1503,1341,1325,-1519,1489,1463,1403,1309,-1535,1372,1448,1418,1476,1356,1462,1387,-1551,1475,1340,1447,1402,1386,-1567,1068,1068,1474,1461,455,380,468,440,395,425,410,454,364,467,466,464,453,269,409,448,268,432,1371,1473,1432,1417,1308,1460,1355,1446,1459,1431,1083,1083,1401,1416,1458,1445,1067,1067,1370,1457,1051,1051,1291,1430,1385,1444,1354,1415,1400,1443,1082,1082,1173,1113,1186,1066,1185,1050,-1967,1158,1128,1172,1097,1171,1081,-1983,1157,1112,416,266,375,400,1170,1142,1127,1065,793,793,1169,1033,1156,1096,1141,1111,1155,1080,1126,1140,898,898,808,808,897,897,792,792,1095,1152,1032,1125,1110,1139,1079,1124,882,807,838,881,853,791,-2319,867,368,263,822,852,837,866,806,865,-2399,851,352,262,534,534,821,836,594,594,549,549,593,593,533,533,848,773,579,579,564,578,548,563,276,276,577,576,306,291,516,560,305,305,275,259, - 
-251,-892,-2058,-2620,-2828,-2957,-3023,-3039,1041,1041,1040,1040,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-511,-527,-543,-559,1530,-575,-591,1528,1527,1407,1526,1391,1023,1023,1023,1023,1525,1375,1268,1268,1103,1103,1087,1087,1039,1039,1523,-604,815,815,815,815,510,495,509,479,508,463,507,447,431,505,415,399,-734,-782,1262,-815,1259,1244,-831,1258,1228,-847,-863,1196,-879,1253,987,987,748,-767,493,493,462,477,414,414,686,669,478,446,461,445,474,429,487,458,412,471,1266,1264,1009,1009,799,799,-1019,-1276,-1452,-1581,-1677,-1757,-1821,-1886,-1933,-1997,1257,1257,1483,1468,1512,1422,1497,1406,1467,1496,1421,1510,1134,1134,1225,1225,1466,1451,1374,1405,1252,1252,1358,1480,1164,1164,1251,1251,1238,1238,1389,1465,-1407,1054,1101,-1423,1207,-1439,830,830,1248,1038,1237,1117,1223,1148,1236,1208,411,426,395,410,379,269,1193,1222,1132,1235,1221,1116,976,976,1192,1162,1177,1220,1131,1191,963,963,-1647,961,780,-1663,558,558,994,993,437,408,393,407,829,978,813,797,947,-1743,721,721,377,392,844,950,828,890,706,706,812,859,796,960,948,843,934,874,571,571,-1919,690,555,689,421,346,539,539,944,779,918,873,932,842,903,888,570,570,931,917,674,674,-2575,1562,-2591,1609,-2607,1654,1322,1322,1441,1441,1696,1546,1683,1593,1669,1624,1426,1426,1321,1321,1639,1680,1425,1425,1305,1305,1545,1668,1608,1623,1667,1592,1638,1666,1320,1320,1652,1607,1409,1409,1304,1304,1288,1288,1664,1637,1395,1395,1335,1335,1622,1636,1394,1394,1319,1319,1606,1621,1392,1392,1137,1137,1137,1137,345,390,360,375,404,373,1047,-2751,-2767,-2783,1062,1121,1046,-2799,1077,-2815,1106,1061,789,789,1105,1104,263,355,310,340,325,354,352,262,339,324,1091,1076,1029,1090,1060,1075,833,833,788,788,1088,1028,818,818,803,803,561,561,531,531,816,771,546,546,289,274,288,258, - 
-253,-317,-381,-446,-478,-509,1279,1279,-811,-1179,-1451,-1756,-1900,-2028,-2189,-2253,-2333,-2414,-2445,-2511,-2526,1313,1298,-2559,1041,1041,1040,1040,1025,1025,1024,1024,1022,1007,1021,991,1020,975,1019,959,687,687,1018,1017,671,671,655,655,1016,1015,639,639,758,758,623,623,757,607,756,591,755,575,754,559,543,543,1009,783,-575,-621,-685,-749,496,-590,750,749,734,748,974,989,1003,958,988,973,1002,942,987,957,972,1001,926,986,941,971,956,1000,910,985,925,999,894,970,-1071,-1087,-1102,1390,-1135,1436,1509,1451,1374,-1151,1405,1358,1480,1420,-1167,1507,1494,1389,1342,1465,1435,1450,1326,1505,1310,1493,1373,1479,1404,1492,1464,1419,428,443,472,397,736,526,464,464,486,457,442,471,484,482,1357,1449,1434,1478,1388,1491,1341,1490,1325,1489,1463,1403,1309,1477,1372,1448,1418,1433,1476,1356,1462,1387,-1439,1475,1340,1447,1402,1474,1324,1461,1371,1473,269,448,1432,1417,1308,1460,-1711,1459,-1727,1441,1099,1099,1446,1386,1431,1401,-1743,1289,1083,1083,1160,1160,1458,1445,1067,1067,1370,1457,1307,1430,1129,1129,1098,1098,268,432,267,416,266,400,-1887,1144,1187,1082,1173,1113,1186,1066,1050,1158,1128,1143,1172,1097,1171,1081,420,391,1157,1112,1170,1142,1127,1065,1169,1049,1156,1096,1141,1111,1155,1080,1126,1154,1064,1153,1140,1095,1048,-2159,1125,1110,1137,-2175,823,823,1139,1138,807,807,384,264,368,263,868,838,853,791,867,822,852,837,866,806,865,790,-2319,851,821,836,352,262,850,805,849,-2399,533,533,835,820,336,261,578,548,563,577,532,532,832,772,562,562,547,547,305,275,560,515,290,290,288,258 }; - static const uint8_t tab32[] = { 130,162,193,209,44,28,76,140,9,9,9,9,9,9,9,9,190,254,222,238,126,94,157,157,109,61,173,205 }; - static const uint8_t tab33[] = { 252,236,220,204,188,172,156,140,124,108,92,76,60,44,28,12 }; - static const int16_t tabindex[2*16] = { 0,32,64,98,0,132,180,218,292,364,426,538,648,746,0,1126,1460,1460,1460,1460,1460,1460,1460,1460,1842,1842,1842,1842,1842,1842,1842,1842 }; - static const uint8_t g_linbits[] = { 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,6,8,10,13,4,5,6,7,8,9,11,13 }; - -#define PEEK_BITS(n) (bs_cache >> (32 - n)) -#define FLUSH_BITS(n) { bs_cache <<= (n); bs_sh += (n); } -#define CHECK_BITS while (bs_sh >= 0) { bs_cache |= (uint32_t)*bs_next_ptr++ << bs_sh; bs_sh -= 8; } -#define BSPOS ((bs_next_ptr - bs->buf)*8 - 24 + bs_sh) - - float one = 0.0f; - int ireg = 0, big_val_cnt = gr_info->big_values; - const uint8_t *sfb = gr_info->sfbtab; - const uint8_t *bs_next_ptr = bs->buf + bs->pos/8; - uint32_t bs_cache = (((bs_next_ptr[0]*256u + bs_next_ptr[1])*256u + bs_next_ptr[2])*256u + bs_next_ptr[3]) << (bs->pos & 7); - int pairs_to_decode, np, bs_sh = (bs->pos & 7) - 8; - bs_next_ptr += 4; - - while (big_val_cnt > 0) - { - int tab_num = gr_info->table_select[ireg]; - int sfb_cnt = gr_info->region_count[ireg++]; - const int16_t *codebook = tabs + tabindex[tab_num]; - int linbits = g_linbits[tab_num]; - if (linbits) - { - do - { - np = *sfb++ / 2; - pairs_to_decode = MINIMP3_MIN(big_val_cnt, np); - one = *scf++; - do - { - int j, w = 5; - int leaf = codebook[PEEK_BITS(w)]; - while (leaf < 0) - { - FLUSH_BITS(w); - w = leaf & 7; - leaf = codebook[PEEK_BITS(w) - (leaf >> 3)]; - } - FLUSH_BITS(leaf >> 8); - - for (j = 0; j < 2; j++, dst++, leaf >>= 4) - { - int lsb = leaf & 0x0F; - if (lsb == 15) - { - lsb += PEEK_BITS(linbits); - FLUSH_BITS(linbits); - CHECK_BITS; - *dst = one*L3_pow_43(lsb)*((int32_t)bs_cache < 0 ? -1: 1); - } else - { - *dst = g_pow43[16 + lsb - 16*(bs_cache >> 31)]*one; - } - FLUSH_BITS(lsb ? 
1 : 0); - } - CHECK_BITS; - } while (--pairs_to_decode); - } while ((big_val_cnt -= np) > 0 && --sfb_cnt >= 0); - } else - { - do - { - np = *sfb++ / 2; - pairs_to_decode = MINIMP3_MIN(big_val_cnt, np); - one = *scf++; - do - { - int j, w = 5; - int leaf = codebook[PEEK_BITS(w)]; - while (leaf < 0) - { - FLUSH_BITS(w); - w = leaf & 7; - leaf = codebook[PEEK_BITS(w) - (leaf >> 3)]; - } - FLUSH_BITS(leaf >> 8); - - for (j = 0; j < 2; j++, dst++, leaf >>= 4) - { - int lsb = leaf & 0x0F; - *dst = g_pow43[16 + lsb - 16*(bs_cache >> 31)]*one; - FLUSH_BITS(lsb ? 1 : 0); - } - CHECK_BITS; - } while (--pairs_to_decode); - } while ((big_val_cnt -= np) > 0 && --sfb_cnt >= 0); - } - } - - for (np = 1 - big_val_cnt;; dst += 4) - { - const uint8_t *codebook_count1 = (gr_info->count1_table) ? tab33 : tab32; - int leaf = codebook_count1[PEEK_BITS(4)]; - if (!(leaf & 8)) - { - leaf = codebook_count1[(leaf >> 3) + (bs_cache << 4 >> (32 - (leaf & 3)))]; - } - FLUSH_BITS(leaf & 7); - if (BSPOS > layer3gr_limit) - { - break; - } -#define RELOAD_SCALEFACTOR if (!--np) { np = *sfb++/2; if (!np) break; one = *scf++; } -#define DEQ_COUNT1(s) if (leaf & (128 >> s)) { dst[s] = ((int32_t)bs_cache < 0) ? -one : one; FLUSH_BITS(1) } - RELOAD_SCALEFACTOR; - DEQ_COUNT1(0); - DEQ_COUNT1(1); - RELOAD_SCALEFACTOR; - DEQ_COUNT1(2); - DEQ_COUNT1(3); - CHECK_BITS; - } - - bs->pos = layer3gr_limit; -} - -static void L3_midside_stereo(float *left, int n) -{ - int i = 0; - float *right = left + 576; -#if HAVE_SIMD - if (have_simd()) - { - for (; i < n - 3; i += 4) - { - f4 vl = VLD(left + i); - f4 vr = VLD(right + i); - VSTORE(left + i, VADD(vl, vr)); - VSTORE(right + i, VSUB(vl, vr)); - } -#ifdef __GNUC__ - /* Workaround for spurious -Waggressive-loop-optimizations warning from gcc. 
- * For more info see: https://github.com/lieff/minimp3/issues/88 - */ - if (__builtin_constant_p(n % 4 == 0) && n % 4 == 0) - return; -#endif - } -#endif /* HAVE_SIMD */ - for (; i < n; i++) - { - float a = left[i]; - float b = right[i]; - left[i] = a + b; - right[i] = a - b; - } -} - -static void L3_intensity_stereo_band(float *left, int n, float kl, float kr) -{ - int i; - for (i = 0; i < n; i++) - { - left[i + 576] = left[i]*kr; - left[i] = left[i]*kl; - } -} - -static void L3_stereo_top_band(const float *right, const uint8_t *sfb, int nbands, int max_band[3]) -{ - int i, k; - - max_band[0] = max_band[1] = max_band[2] = -1; - - for (i = 0; i < nbands; i++) - { - for (k = 0; k < sfb[i]; k += 2) - { - if (right[k] != 0 || right[k + 1] != 0) - { - max_band[i % 3] = i; - break; - } - } - right += sfb[i]; - } -} - -static void L3_stereo_process(float *left, const uint8_t *ist_pos, const uint8_t *sfb, const uint8_t *hdr, int max_band[3], int mpeg2_sh) -{ - static const float g_pan[7*2] = { 0,1,0.21132487f,0.78867513f,0.36602540f,0.63397460f,0.5f,0.5f,0.63397460f,0.36602540f,0.78867513f,0.21132487f,1,0 }; - unsigned i, max_pos = HDR_TEST_MPEG1(hdr) ? 7 : 64; - - for (i = 0; sfb[i]; i++) - { - unsigned ipos = ist_pos[i]; - if ((int)i > max_band[i % 3] && ipos < max_pos) - { - float kl, kr, s = HDR_TEST_MS_STEREO(hdr) ? 1.41421356f : 1; - if (HDR_TEST_MPEG1(hdr)) - { - kl = g_pan[2*ipos]; - kr = g_pan[2*ipos + 1]; - } else - { - kl = 1; - kr = L3_ldexp_q2(1, (ipos + 1) >> 1 << mpeg2_sh); - if (ipos & 1) - { - kl = kr; - kr = 1; - } - } - L3_intensity_stereo_band(left, sfb[i], kl*s, kr*s); - } else if (HDR_TEST_MS_STEREO(hdr)) - { - L3_midside_stereo(left, sfb[i]); - } - left += sfb[i]; - } -} - -static void L3_intensity_stereo(float *left, uint8_t *ist_pos, const L3_gr_info_t *gr, const uint8_t *hdr) -{ - int max_band[3], n_sfb = gr->n_long_sfb + gr->n_short_sfb; - int i, max_blocks = gr->n_short_sfb ? 
3 : 1; - - L3_stereo_top_band(left + 576, gr->sfbtab, n_sfb, max_band); - if (gr->n_long_sfb) - { - max_band[0] = max_band[1] = max_band[2] = MINIMP3_MAX(MINIMP3_MAX(max_band[0], max_band[1]), max_band[2]); - } - for (i = 0; i < max_blocks; i++) - { - int default_pos = HDR_TEST_MPEG1(hdr) ? 3 : 0; - int itop = n_sfb - max_blocks + i; - int prev = itop - max_blocks; - ist_pos[itop] = max_band[i] >= prev ? default_pos : ist_pos[prev]; - } - L3_stereo_process(left, ist_pos, gr->sfbtab, hdr, max_band, gr[1].scalefac_compress & 1); -} - -static void L3_reorder(float *grbuf, float *scratch, const uint8_t *sfb) -{ - int i, len; - float *src = grbuf, *dst = scratch; - - for (;0 != (len = *sfb); sfb += 3, src += 2*len) - { - for (i = 0; i < len; i++, src++) - { - *dst++ = src[0*len]; - *dst++ = src[1*len]; - *dst++ = src[2*len]; - } - } - memcpy(grbuf, scratch, (dst - scratch)*sizeof(float)); -} - -static void L3_antialias(float *grbuf, int nbands) -{ - static const float g_aa[2][8] = { - {0.85749293f,0.88174200f,0.94962865f,0.98331459f,0.99551782f,0.99916056f,0.99989920f,0.99999316f}, - {0.51449576f,0.47173197f,0.31337745f,0.18191320f,0.09457419f,0.04096558f,0.01419856f,0.00369997f} - }; - - for (; nbands > 0; nbands--, grbuf += 18) - { - int i = 0; -#if HAVE_SIMD - if (have_simd()) for (; i < 8; i += 4) - { - f4 vu = VLD(grbuf + 18 + i); - f4 vd = VLD(grbuf + 14 - i); - f4 vc0 = VLD(g_aa[0] + i); - f4 vc1 = VLD(g_aa[1] + i); - vd = VREV(vd); - VSTORE(grbuf + 18 + i, VSUB(VMUL(vu, vc0), VMUL(vd, vc1))); - vd = VADD(VMUL(vu, vc1), VMUL(vd, vc0)); - VSTORE(grbuf + 14 - i, VREV(vd)); - } -#endif /* HAVE_SIMD */ -#ifndef MINIMP3_ONLY_SIMD - for(; i < 8; i++) - { - float u = grbuf[18 + i]; - float d = grbuf[17 - i]; - grbuf[18 + i] = u*g_aa[0][i] - d*g_aa[1][i]; - grbuf[17 - i] = u*g_aa[1][i] + d*g_aa[0][i]; - } -#endif /* MINIMP3_ONLY_SIMD */ - } -} - -static void L3_dct3_9(float *y) -{ - float s0, s1, s2, s3, s4, s5, s6, s7, s8, t0, t2, t4; - - s0 = y[0]; s2 = y[2]; s4 = 
y[4]; s6 = y[6]; s8 = y[8]; - t0 = s0 + s6*0.5f; - s0 -= s6; - t4 = (s4 + s2)*0.93969262f; - t2 = (s8 + s2)*0.76604444f; - s6 = (s4 - s8)*0.17364818f; - s4 += s8 - s2; - - s2 = s0 - s4*0.5f; - y[4] = s4 + s0; - s8 = t0 - t2 + s6; - s0 = t0 - t4 + t2; - s4 = t0 + t4 - s6; - - s1 = y[1]; s3 = y[3]; s5 = y[5]; s7 = y[7]; - - s3 *= 0.86602540f; - t0 = (s5 + s1)*0.98480775f; - t4 = (s5 - s7)*0.34202014f; - t2 = (s1 + s7)*0.64278761f; - s1 = (s1 - s5 - s7)*0.86602540f; - - s5 = t0 - s3 - t2; - s7 = t4 - s3 - t0; - s3 = t4 + s3 - t2; - - y[0] = s4 - s7; - y[1] = s2 + s1; - y[2] = s0 - s3; - y[3] = s8 + s5; - y[5] = s8 - s5; - y[6] = s0 + s3; - y[7] = s2 - s1; - y[8] = s4 + s7; -} - -static void L3_imdct36(float *grbuf, float *overlap, const float *window, int nbands) -{ - int i, j; - static const float g_twid9[18] = { - 0.73727734f,0.79335334f,0.84339145f,0.88701083f,0.92387953f,0.95371695f,0.97629601f,0.99144486f,0.99904822f,0.67559021f,0.60876143f,0.53729961f,0.46174861f,0.38268343f,0.30070580f,0.21643961f,0.13052619f,0.04361938f - }; - - for (j = 0; j < nbands; j++, grbuf += 18, overlap += 9) - { - float co[9], si[9]; - co[0] = -grbuf[0]; - si[0] = grbuf[17]; - for (i = 0; i < 4; i++) - { - si[8 - 2*i] = grbuf[4*i + 1] - grbuf[4*i + 2]; - co[1 + 2*i] = grbuf[4*i + 1] + grbuf[4*i + 2]; - si[7 - 2*i] = grbuf[4*i + 4] - grbuf[4*i + 3]; - co[2 + 2*i] = -(grbuf[4*i + 3] + grbuf[4*i + 4]); - } - L3_dct3_9(co); - L3_dct3_9(si); - - si[1] = -si[1]; - si[3] = -si[3]; - si[5] = -si[5]; - si[7] = -si[7]; - - i = 0; - -#if HAVE_SIMD - if (have_simd()) for (; i < 8; i += 4) - { - f4 vovl = VLD(overlap + i); - f4 vc = VLD(co + i); - f4 vs = VLD(si + i); - f4 vr0 = VLD(g_twid9 + i); - f4 vr1 = VLD(g_twid9 + 9 + i); - f4 vw0 = VLD(window + i); - f4 vw1 = VLD(window + 9 + i); - f4 vsum = VADD(VMUL(vc, vr1), VMUL(vs, vr0)); - VSTORE(overlap + i, VSUB(VMUL(vc, vr0), VMUL(vs, vr1))); - VSTORE(grbuf + i, VSUB(VMUL(vovl, vw0), VMUL(vsum, vw1))); - vsum = VADD(VMUL(vovl, vw1), VMUL(vsum, 
vw0)); - VSTORE(grbuf + 14 - i, VREV(vsum)); - } -#endif /* HAVE_SIMD */ - for (; i < 9; i++) - { - float ovl = overlap[i]; - float sum = co[i]*g_twid9[9 + i] + si[i]*g_twid9[0 + i]; - overlap[i] = co[i]*g_twid9[0 + i] - si[i]*g_twid9[9 + i]; - grbuf[i] = ovl*window[0 + i] - sum*window[9 + i]; - grbuf[17 - i] = ovl*window[9 + i] + sum*window[0 + i]; - } - } -} - -static void L3_idct3(float x0, float x1, float x2, float *dst) -{ - float m1 = x1*0.86602540f; - float a1 = x0 - x2*0.5f; - dst[1] = x0 + x2; - dst[0] = a1 + m1; - dst[2] = a1 - m1; -} - -static void L3_imdct12(float *x, float *dst, float *overlap) -{ - static const float g_twid3[6] = { 0.79335334f,0.92387953f,0.99144486f, 0.60876143f,0.38268343f,0.13052619f }; - float co[3], si[3]; - int i; - - L3_idct3(-x[0], x[6] + x[3], x[12] + x[9], co); - L3_idct3(x[15], x[12] - x[9], x[6] - x[3], si); - si[1] = -si[1]; - - for (i = 0; i < 3; i++) - { - float ovl = overlap[i]; - float sum = co[i]*g_twid3[3 + i] + si[i]*g_twid3[0 + i]; - overlap[i] = co[i]*g_twid3[0 + i] - si[i]*g_twid3[3 + i]; - dst[i] = ovl*g_twid3[2 - i] - sum*g_twid3[5 - i]; - dst[5 - i] = ovl*g_twid3[5 - i] + sum*g_twid3[2 - i]; - } -} - -static void L3_imdct_short(float *grbuf, float *overlap, int nbands) -{ - for (;nbands > 0; nbands--, overlap += 9, grbuf += 18) - { - float tmp[18]; - memcpy(tmp, grbuf, sizeof(tmp)); - memcpy(grbuf, overlap, 6*sizeof(float)); - L3_imdct12(tmp, grbuf + 6, overlap + 6); - L3_imdct12(tmp + 1, grbuf + 12, overlap + 6); - L3_imdct12(tmp + 2, overlap, overlap + 6); - } -} - -static void L3_change_sign(float *grbuf) -{ - int b, i; - for (b = 0, grbuf += 18; b < 32; b += 2, grbuf += 36) - for (i = 1; i < 18; i += 2) - grbuf[i] = -grbuf[i]; -} - -static void L3_imdct_gr(float *grbuf, float *overlap, unsigned block_type, unsigned n_long_bands) -{ - static const float g_mdct_window[2][18] = { - { 
0.99904822f,0.99144486f,0.97629601f,0.95371695f,0.92387953f,0.88701083f,0.84339145f,0.79335334f,0.73727734f,0.04361938f,0.13052619f,0.21643961f,0.30070580f,0.38268343f,0.46174861f,0.53729961f,0.60876143f,0.67559021f }, - { 1,1,1,1,1,1,0.99144486f,0.92387953f,0.79335334f,0,0,0,0,0,0,0.13052619f,0.38268343f,0.60876143f } - }; - if (n_long_bands) - { - L3_imdct36(grbuf, overlap, g_mdct_window[0], n_long_bands); - grbuf += 18*n_long_bands; - overlap += 9*n_long_bands; - } - if (block_type == SHORT_BLOCK_TYPE) - L3_imdct_short(grbuf, overlap, 32 - n_long_bands); - else - L3_imdct36(grbuf, overlap, g_mdct_window[block_type == STOP_BLOCK_TYPE], 32 - n_long_bands); -} - -static void L3_save_reservoir(mp3dec_t *h, mp3dec_scratch_t *s) -{ - int pos = (s->bs.pos + 7)/8u; - int remains = s->bs.limit/8u - pos; - if (remains > MAX_BITRESERVOIR_BYTES) - { - pos += remains - MAX_BITRESERVOIR_BYTES; - remains = MAX_BITRESERVOIR_BYTES; - } - if (remains > 0) - { - memmove(h->reserv_buf, s->maindata + pos, remains); - } - h->reserv = remains; -} - -static int L3_restore_reservoir(mp3dec_t *h, bs_t *bs, mp3dec_scratch_t *s, int main_data_begin) -{ - int frame_bytes = (bs->limit - bs->pos)/8; - int bytes_have = MINIMP3_MIN(h->reserv, main_data_begin); - memcpy(s->maindata, h->reserv_buf + MINIMP3_MAX(0, h->reserv - main_data_begin), MINIMP3_MIN(h->reserv, main_data_begin)); - memcpy(s->maindata + bytes_have, bs->buf + bs->pos/8, frame_bytes); - bs_init(&s->bs, s->maindata, bytes_have + frame_bytes); - return h->reserv >= main_data_begin; -} - -static void L3_decode(mp3dec_t *h, mp3dec_scratch_t *s, L3_gr_info_t *gr_info, int nch) -{ - int ch; - - for (ch = 0; ch < nch; ch++) - { - int layer3gr_limit = s->bs.pos + gr_info[ch].part_23_length; - L3_decode_scalefactors(h->header, s->ist_pos[ch], &s->bs, gr_info + ch, s->scf, ch); - L3_huffman(s->grbuf[ch], &s->bs, gr_info + ch, s->scf, layer3gr_limit); - } - - if (HDR_TEST_I_STEREO(h->header)) - { - L3_intensity_stereo(s->grbuf[0], 
s->ist_pos[1], gr_info, h->header); - } else if (HDR_IS_MS_STEREO(h->header)) - { - L3_midside_stereo(s->grbuf[0], 576); - } - - for (ch = 0; ch < nch; ch++, gr_info++) - { - int aa_bands = 31; - int n_long_bands = (gr_info->mixed_block_flag ? 2 : 0) << (int)(HDR_GET_MY_SAMPLE_RATE(h->header) == 2); - - if (gr_info->n_short_sfb) - { - aa_bands = n_long_bands - 1; - L3_reorder(s->grbuf[ch] + n_long_bands*18, s->syn[0], gr_info->sfbtab + gr_info->n_long_sfb); - } - - L3_antialias(s->grbuf[ch], aa_bands); - L3_imdct_gr(s->grbuf[ch], h->mdct_overlap[ch], gr_info->block_type, n_long_bands); - L3_change_sign(s->grbuf[ch]); - } -} - -static void mp3d_DCT_II(float *grbuf, int n) -{ - static const float g_sec[24] = { - 10.19000816f,0.50060302f,0.50241929f,3.40760851f,0.50547093f,0.52249861f,2.05778098f,0.51544732f,0.56694406f,1.48416460f,0.53104258f,0.64682180f,1.16943991f,0.55310392f,0.78815460f,0.97256821f,0.58293498f,1.06067765f,0.83934963f,0.62250412f,1.72244716f,0.74453628f,0.67480832f,5.10114861f - }; - int i, k = 0; -#if HAVE_SIMD - if (have_simd()) for (; k < n; k += 4) - { - f4 t[4][8], *x; - float *y = grbuf + k; - - for (x = t[0], i = 0; i < 8; i++, x++) - { - f4 x0 = VLD(&y[i*18]); - f4 x1 = VLD(&y[(15 - i)*18]); - f4 x2 = VLD(&y[(16 + i)*18]); - f4 x3 = VLD(&y[(31 - i)*18]); - f4 t0 = VADD(x0, x3); - f4 t1 = VADD(x1, x2); - f4 t2 = VMUL_S(VSUB(x1, x2), g_sec[3*i + 0]); - f4 t3 = VMUL_S(VSUB(x0, x3), g_sec[3*i + 1]); - x[0] = VADD(t0, t1); - x[8] = VMUL_S(VSUB(t0, t1), g_sec[3*i + 2]); - x[16] = VADD(t3, t2); - x[24] = VMUL_S(VSUB(t3, t2), g_sec[3*i + 2]); - } - for (x = t[0], i = 0; i < 4; i++, x += 8) - { - f4 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7], xt; - xt = VSUB(x0, x7); x0 = VADD(x0, x7); - x7 = VSUB(x1, x6); x1 = VADD(x1, x6); - x6 = VSUB(x2, x5); x2 = VADD(x2, x5); - x5 = VSUB(x3, x4); x3 = VADD(x3, x4); - x4 = VSUB(x0, x3); x0 = VADD(x0, x3); - x3 = VSUB(x1, x2); x1 = VADD(x1, x2); - x[0] = VADD(x0, x1); - 
x[4] = VMUL_S(VSUB(x0, x1), 0.70710677f); - x5 = VADD(x5, x6); - x6 = VMUL_S(VADD(x6, x7), 0.70710677f); - x7 = VADD(x7, xt); - x3 = VMUL_S(VADD(x3, x4), 0.70710677f); - x5 = VSUB(x5, VMUL_S(x7, 0.198912367f)); /* rotate by PI/8 */ - x7 = VADD(x7, VMUL_S(x5, 0.382683432f)); - x5 = VSUB(x5, VMUL_S(x7, 0.198912367f)); - x0 = VSUB(xt, x6); xt = VADD(xt, x6); - x[1] = VMUL_S(VADD(xt, x7), 0.50979561f); - x[2] = VMUL_S(VADD(x4, x3), 0.54119611f); - x[3] = VMUL_S(VSUB(x0, x5), 0.60134488f); - x[5] = VMUL_S(VADD(x0, x5), 0.89997619f); - x[6] = VMUL_S(VSUB(x4, x3), 1.30656302f); - x[7] = VMUL_S(VSUB(xt, x7), 2.56291556f); - } - - if (k > n - 3) - { -#if HAVE_SSE -#define VSAVE2(i, v) _mm_storel_pi((__m64 *)(void*)&y[i*18], v) -#else /* HAVE_SSE */ -#define VSAVE2(i, v) vst1_f32((float32_t *)&y[i*18], vget_low_f32(v)) -#endif /* HAVE_SSE */ - for (i = 0; i < 7; i++, y += 4*18) - { - f4 s = VADD(t[3][i], t[3][i + 1]); - VSAVE2(0, t[0][i]); - VSAVE2(1, VADD(t[2][i], s)); - VSAVE2(2, VADD(t[1][i], t[1][i + 1])); - VSAVE2(3, VADD(t[2][1 + i], s)); - } - VSAVE2(0, t[0][7]); - VSAVE2(1, VADD(t[2][7], t[3][7])); - VSAVE2(2, t[1][7]); - VSAVE2(3, t[3][7]); - } else - { -#define VSAVE4(i, v) VSTORE(&y[i*18], v) - for (i = 0; i < 7; i++, y += 4*18) - { - f4 s = VADD(t[3][i], t[3][i + 1]); - VSAVE4(0, t[0][i]); - VSAVE4(1, VADD(t[2][i], s)); - VSAVE4(2, VADD(t[1][i], t[1][i + 1])); - VSAVE4(3, VADD(t[2][1 + i], s)); - } - VSAVE4(0, t[0][7]); - VSAVE4(1, VADD(t[2][7], t[3][7])); - VSAVE4(2, t[1][7]); - VSAVE4(3, t[3][7]); - } - } else -#endif /* HAVE_SIMD */ -#ifdef MINIMP3_ONLY_SIMD - {} /* for HAVE_SIMD=1, MINIMP3_ONLY_SIMD=1 case we do not need non-intrinsic "else" branch */ -#else /* MINIMP3_ONLY_SIMD */ - for (; k < n; k++) - { - float t[4][8], *x, *y = grbuf + k; - - for (x = t[0], i = 0; i < 8; i++, x++) - { - float x0 = y[i*18]; - float x1 = y[(15 - i)*18]; - float x2 = y[(16 + i)*18]; - float x3 = y[(31 - i)*18]; - float t0 = x0 + x3; - float t1 = x1 + x2; - float t2 = (x1 - 
x2)*g_sec[3*i + 0]; - float t3 = (x0 - x3)*g_sec[3*i + 1]; - x[0] = t0 + t1; - x[8] = (t0 - t1)*g_sec[3*i + 2]; - x[16] = t3 + t2; - x[24] = (t3 - t2)*g_sec[3*i + 2]; - } - for (x = t[0], i = 0; i < 4; i++, x += 8) - { - float x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7], xt; - xt = x0 - x7; x0 += x7; - x7 = x1 - x6; x1 += x6; - x6 = x2 - x5; x2 += x5; - x5 = x3 - x4; x3 += x4; - x4 = x0 - x3; x0 += x3; - x3 = x1 - x2; x1 += x2; - x[0] = x0 + x1; - x[4] = (x0 - x1)*0.70710677f; - x5 = x5 + x6; - x6 = (x6 + x7)*0.70710677f; - x7 = x7 + xt; - x3 = (x3 + x4)*0.70710677f; - x5 -= x7*0.198912367f; /* rotate by PI/8 */ - x7 += x5*0.382683432f; - x5 -= x7*0.198912367f; - x0 = xt - x6; xt += x6; - x[1] = (xt + x7)*0.50979561f; - x[2] = (x4 + x3)*0.54119611f; - x[3] = (x0 - x5)*0.60134488f; - x[5] = (x0 + x5)*0.89997619f; - x[6] = (x4 - x3)*1.30656302f; - x[7] = (xt - x7)*2.56291556f; - - } - for (i = 0; i < 7; i++, y += 4*18) - { - y[0*18] = t[0][i]; - y[1*18] = t[2][i] + t[3][i] + t[3][i + 1]; - y[2*18] = t[1][i] + t[1][i + 1]; - y[3*18] = t[2][i + 1] + t[3][i] + t[3][i + 1]; - } - y[0*18] = t[0][7]; - y[1*18] = t[2][7] + t[3][7]; - y[2*18] = t[1][7]; - y[3*18] = t[3][7]; - } -#endif /* MINIMP3_ONLY_SIMD */ -} - -#ifndef MINIMP3_FLOAT_OUTPUT -static int16_t mp3d_scale_pcm(float sample) -{ -#if HAVE_ARMV6 - int32_t s32 = (int32_t)(sample + .5f); - s32 -= (s32 < 0); - int16_t s = (int16_t)minimp3_clip_int16_arm(s32); -#else - if (sample >= 32766.5) return (int16_t) 32767; - if (sample <= -32767.5) return (int16_t)-32768; - int16_t s = (int16_t)(sample + .5f); - s -= (s < 0); /* away from zero, to be compliant */ -#endif - return s; -} -#else /* MINIMP3_FLOAT_OUTPUT */ -static float mp3d_scale_pcm(float sample) -{ - return sample*(1.f/32768.f); -} -#endif /* MINIMP3_FLOAT_OUTPUT */ - -static void mp3d_synth_pair(mp3d_sample_t *pcm, int nch, const float *z) -{ - float a; - a = (z[14*64] - z[ 0]) * 29; - a += (z[ 1*64] + z[13*64]) * 
213; - a += (z[12*64] - z[ 2*64]) * 459; - a += (z[ 3*64] + z[11*64]) * 2037; - a += (z[10*64] - z[ 4*64]) * 5153; - a += (z[ 5*64] + z[ 9*64]) * 6574; - a += (z[ 8*64] - z[ 6*64]) * 37489; - a += z[ 7*64] * 75038; - pcm[0] = mp3d_scale_pcm(a); - - z += 2; - a = z[14*64] * 104; - a += z[12*64] * 1567; - a += z[10*64] * 9727; - a += z[ 8*64] * 64019; - a += z[ 6*64] * -9975; - a += z[ 4*64] * -45; - a += z[ 2*64] * 146; - a += z[ 0*64] * -5; - pcm[16*nch] = mp3d_scale_pcm(a); -} - -static void mp3d_synth(float *xl, mp3d_sample_t *dstl, int nch, float *lins) -{ - int i; - float *xr = xl + 576*(nch - 1); - mp3d_sample_t *dstr = dstl + (nch - 1); - - static const float g_win[] = { - -1,26,-31,208,218,401,-519,2063,2000,4788,-5517,7134,5959,35640,-39336,74992, - -1,24,-35,202,222,347,-581,2080,1952,4425,-5879,7640,5288,33791,-41176,74856, - -1,21,-38,196,225,294,-645,2087,1893,4063,-6237,8092,4561,31947,-43006,74630, - -1,19,-41,190,227,244,-711,2085,1822,3705,-6589,8492,3776,30112,-44821,74313, - -1,17,-45,183,228,197,-779,2075,1739,3351,-6935,8840,2935,28289,-46617,73908, - -1,16,-49,176,228,153,-848,2057,1644,3004,-7271,9139,2037,26482,-48390,73415, - -2,14,-53,169,227,111,-919,2032,1535,2663,-7597,9389,1082,24694,-50137,72835, - -2,13,-58,161,224,72,-991,2001,1414,2330,-7910,9592,70,22929,-51853,72169, - -2,11,-63,154,221,36,-1064,1962,1280,2006,-8209,9750,-998,21189,-53534,71420, - -2,10,-68,147,215,2,-1137,1919,1131,1692,-8491,9863,-2122,19478,-55178,70590, - -3,9,-73,139,208,-29,-1210,1870,970,1388,-8755,9935,-3300,17799,-56778,69679, - -3,8,-79,132,200,-57,-1283,1817,794,1095,-8998,9966,-4533,16155,-58333,68692, - -4,7,-85,125,189,-83,-1356,1759,605,814,-9219,9959,-5818,14548,-59838,67629, - -4,7,-91,117,177,-106,-1428,1698,402,545,-9416,9916,-7154,12980,-61289,66494, - -5,6,-97,111,163,-127,-1498,1634,185,288,-9585,9838,-8540,11455,-62684,65290 - }; - float *zlin = lins + 15*64; - const float *w = g_win; - - zlin[4*15] = xl[18*16]; - zlin[4*15 + 1] = xr[18*16]; 
- zlin[4*15 + 2] = xl[0]; - zlin[4*15 + 3] = xr[0]; - - zlin[4*31] = xl[1 + 18*16]; - zlin[4*31 + 1] = xr[1 + 18*16]; - zlin[4*31 + 2] = xl[1]; - zlin[4*31 + 3] = xr[1]; - - mp3d_synth_pair(dstr, nch, lins + 4*15 + 1); - mp3d_synth_pair(dstr + 32*nch, nch, lins + 4*15 + 64 + 1); - mp3d_synth_pair(dstl, nch, lins + 4*15); - mp3d_synth_pair(dstl + 32*nch, nch, lins + 4*15 + 64); - -#if HAVE_SIMD - if (have_simd()) for (i = 14; i >= 0; i--) - { -#define VLOAD(k) f4 w0 = VSET(*w++); f4 w1 = VSET(*w++); f4 vz = VLD(&zlin[4*i - 64*k]); f4 vy = VLD(&zlin[4*i - 64*(15 - k)]); -#define V0(k) { VLOAD(k) b = VADD(VMUL(vz, w1), VMUL(vy, w0)) ; a = VSUB(VMUL(vz, w0), VMUL(vy, w1)); } -#define V1(k) { VLOAD(k) b = VADD(b, VADD(VMUL(vz, w1), VMUL(vy, w0))); a = VADD(a, VSUB(VMUL(vz, w0), VMUL(vy, w1))); } -#define V2(k) { VLOAD(k) b = VADD(b, VADD(VMUL(vz, w1), VMUL(vy, w0))); a = VADD(a, VSUB(VMUL(vy, w1), VMUL(vz, w0))); } - f4 a, b; - zlin[4*i] = xl[18*(31 - i)]; - zlin[4*i + 1] = xr[18*(31 - i)]; - zlin[4*i + 2] = xl[1 + 18*(31 - i)]; - zlin[4*i + 3] = xr[1 + 18*(31 - i)]; - zlin[4*i + 64] = xl[1 + 18*(1 + i)]; - zlin[4*i + 64 + 1] = xr[1 + 18*(1 + i)]; - zlin[4*i - 64 + 2] = xl[18*(1 + i)]; - zlin[4*i - 64 + 3] = xr[18*(1 + i)]; - - V0(0) V2(1) V1(2) V2(3) V1(4) V2(5) V1(6) V2(7) - - { -#ifndef MINIMP3_FLOAT_OUTPUT -#if HAVE_SSE - static const f4 g_max = { 32767.0f, 32767.0f, 32767.0f, 32767.0f }; - static const f4 g_min = { -32768.0f, -32768.0f, -32768.0f, -32768.0f }; - __m128i pcm8 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(a, g_max), g_min)), - _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(b, g_max), g_min))); - dstr[(15 - i)*nch] = _mm_extract_epi16(pcm8, 1); - dstr[(17 + i)*nch] = _mm_extract_epi16(pcm8, 5); - dstl[(15 - i)*nch] = _mm_extract_epi16(pcm8, 0); - dstl[(17 + i)*nch] = _mm_extract_epi16(pcm8, 4); - dstr[(47 - i)*nch] = _mm_extract_epi16(pcm8, 3); - dstr[(49 + i)*nch] = _mm_extract_epi16(pcm8, 7); - dstl[(47 - i)*nch] = _mm_extract_epi16(pcm8, 2); - 
dstl[(49 + i)*nch] = _mm_extract_epi16(pcm8, 6); -#else /* HAVE_SSE */ - int16x4_t pcma, pcmb; - a = VADD(a, VSET(0.5f)); - b = VADD(b, VSET(0.5f)); - pcma = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(a), vreinterpretq_s32_u32(vcltq_f32(a, VSET(0))))); - pcmb = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(b), vreinterpretq_s32_u32(vcltq_f32(b, VSET(0))))); - vst1_lane_s16(dstr + (15 - i)*nch, pcma, 1); - vst1_lane_s16(dstr + (17 + i)*nch, pcmb, 1); - vst1_lane_s16(dstl + (15 - i)*nch, pcma, 0); - vst1_lane_s16(dstl + (17 + i)*nch, pcmb, 0); - vst1_lane_s16(dstr + (47 - i)*nch, pcma, 3); - vst1_lane_s16(dstr + (49 + i)*nch, pcmb, 3); - vst1_lane_s16(dstl + (47 - i)*nch, pcma, 2); - vst1_lane_s16(dstl + (49 + i)*nch, pcmb, 2); -#endif /* HAVE_SSE */ - -#else /* MINIMP3_FLOAT_OUTPUT */ - - static const f4 g_scale = { 1.0f/32768.0f, 1.0f/32768.0f, 1.0f/32768.0f, 1.0f/32768.0f }; - a = VMUL(a, g_scale); - b = VMUL(b, g_scale); -#if HAVE_SSE - _mm_store_ss(dstr + (15 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); - _mm_store_ss(dstr + (17 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); - _mm_store_ss(dstl + (15 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0))); - _mm_store_ss(dstl + (17 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 0, 0, 0))); - _mm_store_ss(dstr + (47 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3))); - _mm_store_ss(dstr + (49 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 3, 3))); - _mm_store_ss(dstl + (47 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2))); - _mm_store_ss(dstl + (49 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 2, 2))); -#else /* HAVE_SSE */ - vst1q_lane_f32(dstr + (15 - i)*nch, a, 1); - vst1q_lane_f32(dstr + (17 + i)*nch, b, 1); - vst1q_lane_f32(dstl + (15 - i)*nch, a, 0); - vst1q_lane_f32(dstl + (17 + i)*nch, b, 0); - vst1q_lane_f32(dstr + (47 - i)*nch, a, 3); - vst1q_lane_f32(dstr + (49 + i)*nch, b, 3); - vst1q_lane_f32(dstl + (47 - i)*nch, a, 2); - vst1q_lane_f32(dstl + (49 + i)*nch, b, 2); -#endif /* 
HAVE_SSE */ -#endif /* MINIMP3_FLOAT_OUTPUT */ - } - } else -#endif /* HAVE_SIMD */ -#ifdef MINIMP3_ONLY_SIMD - {} /* for HAVE_SIMD=1, MINIMP3_ONLY_SIMD=1 case we do not need non-intrinsic "else" branch */ -#else /* MINIMP3_ONLY_SIMD */ - for (i = 14; i >= 0; i--) - { -#define LOAD(k) float w0 = *w++; float w1 = *w++; float *vz = &zlin[4*i - k*64]; float *vy = &zlin[4*i - (15 - k)*64]; -#define S0(k) { int j; LOAD(k); for (j = 0; j < 4; j++) b[j] = vz[j]*w1 + vy[j]*w0, a[j] = vz[j]*w0 - vy[j]*w1; } -#define S1(k) { int j; LOAD(k); for (j = 0; j < 4; j++) b[j] += vz[j]*w1 + vy[j]*w0, a[j] += vz[j]*w0 - vy[j]*w1; } -#define S2(k) { int j; LOAD(k); for (j = 0; j < 4; j++) b[j] += vz[j]*w1 + vy[j]*w0, a[j] += vy[j]*w1 - vz[j]*w0; } - float a[4], b[4]; - - zlin[4*i] = xl[18*(31 - i)]; - zlin[4*i + 1] = xr[18*(31 - i)]; - zlin[4*i + 2] = xl[1 + 18*(31 - i)]; - zlin[4*i + 3] = xr[1 + 18*(31 - i)]; - zlin[4*(i + 16)] = xl[1 + 18*(1 + i)]; - zlin[4*(i + 16) + 1] = xr[1 + 18*(1 + i)]; - zlin[4*(i - 16) + 2] = xl[18*(1 + i)]; - zlin[4*(i - 16) + 3] = xr[18*(1 + i)]; - - S0(0) S2(1) S1(2) S2(3) S1(4) S2(5) S1(6) S2(7) - - dstr[(15 - i)*nch] = mp3d_scale_pcm(a[1]); - dstr[(17 + i)*nch] = mp3d_scale_pcm(b[1]); - dstl[(15 - i)*nch] = mp3d_scale_pcm(a[0]); - dstl[(17 + i)*nch] = mp3d_scale_pcm(b[0]); - dstr[(47 - i)*nch] = mp3d_scale_pcm(a[3]); - dstr[(49 + i)*nch] = mp3d_scale_pcm(b[3]); - dstl[(47 - i)*nch] = mp3d_scale_pcm(a[2]); - dstl[(49 + i)*nch] = mp3d_scale_pcm(b[2]); - } -#endif /* MINIMP3_ONLY_SIMD */ -} - -static void mp3d_synth_granule(float *qmf_state, float *grbuf, int nbands, int nch, mp3d_sample_t *pcm, float *lins) -{ - int i; - for (i = 0; i < nch; i++) - { - mp3d_DCT_II(grbuf + 576*i, nbands); - } - - memcpy(lins, qmf_state, sizeof(float)*15*64); - - for (i = 0; i < nbands; i += 2) - { - mp3d_synth(grbuf + i, pcm + 32*nch*i, nch, lins + i*64); - } -#ifndef MINIMP3_NONSTANDARD_BUT_LOGICAL - if (nch == 1) - { - for (i = 0; i < 15*64; i += 2) - { - qmf_state[i] = 
lins[nbands*64 + i]; - } - } else -#endif /* MINIMP3_NONSTANDARD_BUT_LOGICAL */ - { - memcpy(qmf_state, lins + nbands*64, sizeof(float)*15*64); - } -} - -static int mp3d_match_frame(const uint8_t *hdr, int mp3_bytes, int frame_bytes) -{ - int i, nmatch; - for (i = 0, nmatch = 0; nmatch < MAX_FRAME_SYNC_MATCHES; nmatch++) - { - i += hdr_frame_bytes(hdr + i, frame_bytes) + hdr_padding(hdr + i); - if (i + HDR_SIZE > mp3_bytes) - return nmatch > 0; - if (!hdr_compare(hdr, hdr + i)) - return 0; - } - return 1; -} - -static int mp3d_find_frame(const uint8_t *mp3, int mp3_bytes, int *free_format_bytes, int *ptr_frame_bytes) -{ - int i, k; - for (i = 0; i < mp3_bytes - HDR_SIZE; i++, mp3++) - { - if (hdr_valid(mp3)) - { - int frame_bytes = hdr_frame_bytes(mp3, *free_format_bytes); - int frame_and_padding = frame_bytes + hdr_padding(mp3); - - for (k = HDR_SIZE; !frame_bytes && k < MAX_FREE_FORMAT_FRAME_SIZE && i + 2*k < mp3_bytes - HDR_SIZE; k++) - { - if (hdr_compare(mp3, mp3 + k)) - { - int fb = k - hdr_padding(mp3); - int nextfb = fb + hdr_padding(mp3 + k); - if (i + k + nextfb + HDR_SIZE > mp3_bytes || !hdr_compare(mp3, mp3 + k + nextfb)) - continue; - frame_and_padding = k; - frame_bytes = fb; - *free_format_bytes = fb; - } - } - if ((frame_bytes && i + frame_and_padding <= mp3_bytes && - mp3d_match_frame(mp3, mp3_bytes - i, frame_bytes)) || - (!i && frame_and_padding == mp3_bytes)) - { - *ptr_frame_bytes = frame_and_padding; - return i; - } - *free_format_bytes = 0; - } - } - *ptr_frame_bytes = 0; - return mp3_bytes; -} - -void mp3dec_init(mp3dec_t *dec) -{ - dec->header[0] = 0; -} - -int mp3dec_decode_frame(mp3dec_t *dec, const uint8_t *mp3, int mp3_bytes, mp3d_sample_t *pcm, mp3dec_frame_info_t *info) -{ - int i = 0, igr, frame_size = 0, success = 1; - const uint8_t *hdr; - bs_t bs_frame[1]; - mp3dec_scratch_t scratch; - - if (mp3_bytes > 4 && dec->header[0] == 0xff && hdr_compare(dec->header, mp3)) - { - frame_size = hdr_frame_bytes(mp3, dec->free_format_bytes) + 
hdr_padding(mp3); - if (frame_size != mp3_bytes && (frame_size + HDR_SIZE > mp3_bytes || !hdr_compare(mp3, mp3 + frame_size))) - { - frame_size = 0; - } - } - if (!frame_size) - { - memset(dec, 0, sizeof(mp3dec_t)); - i = mp3d_find_frame(mp3, mp3_bytes, &dec->free_format_bytes, &frame_size); - if (!frame_size || i + frame_size > mp3_bytes) - { - info->frame_bytes = i; - return 0; - } - } - - hdr = mp3 + i; - memcpy(dec->header, hdr, HDR_SIZE); - info->frame_bytes = i + frame_size; - info->frame_offset = i; - info->channels = HDR_IS_MONO(hdr) ? 1 : 2; - info->hz = hdr_sample_rate_hz(hdr); - info->layer = 4 - HDR_GET_LAYER(hdr); - info->bitrate_kbps = hdr_bitrate_kbps(hdr); - - if (!pcm) - { - return hdr_frame_samples(hdr); - } - - bs_init(bs_frame, hdr + HDR_SIZE, frame_size - HDR_SIZE); - if (HDR_IS_CRC(hdr)) - { - get_bits(bs_frame, 16); - } - - if (info->layer == 3) - { - int main_data_begin = L3_read_side_info(bs_frame, scratch.gr_info, hdr); - if (main_data_begin < 0 || bs_frame->pos > bs_frame->limit) - { - mp3dec_init(dec); - return 0; - } - success = L3_restore_reservoir(dec, bs_frame, &scratch, main_data_begin); - if (success) - { - for (igr = 0; igr < (HDR_TEST_MPEG1(hdr) ? 
2 : 1); igr++, pcm += 576*info->channels) - { - memset(scratch.grbuf[0], 0, 576*2*sizeof(float)); - L3_decode(dec, &scratch, scratch.gr_info + igr*info->channels, info->channels); - mp3d_synth_granule(dec->qmf_state, scratch.grbuf[0], 18, info->channels, pcm, scratch.syn[0]); - } - } - L3_save_reservoir(dec, &scratch); - } else - { -#ifdef MINIMP3_ONLY_MP3 - return 0; -#else /* MINIMP3_ONLY_MP3 */ - L12_scale_info sci[1]; - L12_read_scale_info(hdr, bs_frame, sci); - - memset(scratch.grbuf[0], 0, 576*2*sizeof(float)); - for (i = 0, igr = 0; igr < 3; igr++) - { - if (12 == (i += L12_dequantize_granule(scratch.grbuf[0] + i, bs_frame, sci, info->layer | 1))) - { - i = 0; - L12_apply_scf_384(sci, sci->scf + igr, scratch.grbuf[0]); - mp3d_synth_granule(dec->qmf_state, scratch.grbuf[0], 12, info->channels, pcm, scratch.syn[0]); - memset(scratch.grbuf[0], 0, 576*2*sizeof(float)); - pcm += 384*info->channels; - } - if (bs_frame->pos > bs_frame->limit) - { - mp3dec_init(dec); - return 0; - } - } -#endif /* MINIMP3_ONLY_MP3 */ - } - return success*hdr_frame_samples(dec->header); -} - -#ifdef MINIMP3_FLOAT_OUTPUT -void mp3dec_f32_to_s16(const float *in, int16_t *out, int num_samples) -{ - int i = 0; -#if HAVE_SIMD - int aligned_count = num_samples & ~7; - for(; i < aligned_count; i += 8) - { - static const f4 g_scale = { 32768.0f, 32768.0f, 32768.0f, 32768.0f }; - f4 a = VMUL(VLD(&in[i ]), g_scale); - f4 b = VMUL(VLD(&in[i+4]), g_scale); -#if HAVE_SSE - static const f4 g_max = { 32767.0f, 32767.0f, 32767.0f, 32767.0f }; - static const f4 g_min = { -32768.0f, -32768.0f, -32768.0f, -32768.0f }; - __m128i pcm8 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(a, g_max), g_min)), - _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(b, g_max), g_min))); - out[i ] = _mm_extract_epi16(pcm8, 0); - out[i+1] = _mm_extract_epi16(pcm8, 1); - out[i+2] = _mm_extract_epi16(pcm8, 2); - out[i+3] = _mm_extract_epi16(pcm8, 3); - out[i+4] = _mm_extract_epi16(pcm8, 4); - out[i+5] = 
_mm_extract_epi16(pcm8, 5); - out[i+6] = _mm_extract_epi16(pcm8, 6); - out[i+7] = _mm_extract_epi16(pcm8, 7); -#else /* HAVE_SSE */ - int16x4_t pcma, pcmb; - a = VADD(a, VSET(0.5f)); - b = VADD(b, VSET(0.5f)); - pcma = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(a), vreinterpretq_s32_u32(vcltq_f32(a, VSET(0))))); - pcmb = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(b), vreinterpretq_s32_u32(vcltq_f32(b, VSET(0))))); - vst1_lane_s16(out+i , pcma, 0); - vst1_lane_s16(out+i+1, pcma, 1); - vst1_lane_s16(out+i+2, pcma, 2); - vst1_lane_s16(out+i+3, pcma, 3); - vst1_lane_s16(out+i+4, pcmb, 0); - vst1_lane_s16(out+i+5, pcmb, 1); - vst1_lane_s16(out+i+6, pcmb, 2); - vst1_lane_s16(out+i+7, pcmb, 3); -#endif /* HAVE_SSE */ - } -#endif /* HAVE_SIMD */ - for(; i < num_samples; i++) - { - float sample = in[i] * 32768.0f; - if (sample >= 32766.5) - out[i] = (int16_t) 32767; - else if (sample <= -32767.5) - out[i] = (int16_t)-32768; - else - { - int16_t s = (int16_t)(sample + .5f); - s -= (s < 0); /* away from zero, to be compliant */ - out[i] = s; - } - } -} -#endif /* MINIMP3_FLOAT_OUTPUT */ -#endif /* MINIMP3_IMPLEMENTATION && !_MINIMP3_IMPLEMENTATION_GUARD */ diff --git a/tools/ace-qwen3.cpp b/tools/ace-qwen3.cpp index fbfd049..9605e2a 100644 --- a/tools/ace-qwen3.cpp +++ b/tools/ace-qwen3.cpp @@ -691,8 +691,7 @@ int main(int argc, char ** argv) { "# Instruction\n" "Expand the user's input into a more detailed" " and specific musical description:\n"; - std::string user_msg = ace.caption + "\n\ninstrumental: " - + std::string(req.instrumental ? 
"true" : "false"); + std::string user_msg = ace.caption; prompt = build_custom_prompt(bpe, sys, user_msg.c_str()); } else { prompt = build_lm_prompt(bpe, ace); diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp index 1f4ffac..8893f4b 100644 --- a/tools/dit-vae.cpp +++ b/tools/dit-vae.cpp @@ -6,13 +6,10 @@ #include #include #include -#include #include #include #include "philox.h" -#include "ggml.h" -#include "ggml-backend.h" #include "dit-sampler.h" #include "vae.h" #include "qwen3-enc.h" @@ -22,7 +19,6 @@ #include "debug.h" #include "request.h" #include "timer.h" -#include "audio.h" // Minimal WAV writer (16-bit PCM stereo) static bool write_wav(const char * path, const float * audio, int T_audio, int sr) { @@ -67,9 +63,6 @@ static void print_usage(const char * prog) { " --text-encoder Text encoder GGUF file\n" " --dit DiT GGUF file\n" " --vae VAE GGUF file\n\n" - "LoRA:\n" - " --lora LoRA adapter (adapter_model.safetensors)\n" - " --lora-scale LoRA scale, e.g. alpha/rank (default: 1.0)\n\n" "Batch:\n" " --batch DiT variations per request (default: 1, max 9)\n\n" "Output naming: input.json -> input0.wav, input1.wav, ... 
(last digit = batch index)\n\n" @@ -103,12 +96,10 @@ int main(int argc, char ** argv) { const char * dit_gguf = NULL; const char * vae_gguf = NULL; const char * dump_dir = NULL; - const char * lora_path = NULL; - float lora_scale = 1.0f; bool use_fa = true; - int batch_n = 1; - int vae_chunk = 256; - int vae_overlap = 64; + int batch_n = 1; + int vae_chunk = 256; + int vae_overlap = 64; for (int i = 1; i < argc; i++) { if (strcmp(argv[i], "--request") == 0) { @@ -124,8 +115,6 @@ int main(int argc, char ** argv) { else if (strcmp(argv[i], "--batch") == 0 && i+1 < argc) batch_n = atoi(argv[++i]); else if (strcmp(argv[i], "--vae-chunk") == 0 && i+1 < argc) vae_chunk = atoi(argv[++i]); else if (strcmp(argv[i], "--vae-overlap") == 0 && i+1 < argc) vae_overlap = atoi(argv[++i]); - else if (strcmp(argv[i], "--lora") == 0 && i+1 < argc) lora_path = argv[++i]; - else if (strcmp(argv[i], "--lora-scale") == 0 && i+1 < argc) lora_scale = (float)atof(argv[++i]); else if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) { print_usage(argv[0]); return 0; } else { @@ -172,16 +161,6 @@ int main(int argc, char ** argv) { } fprintf(stderr, "[Load] DiT weight load: %.1f ms\n", timer.ms()); - if (lora_path) { - timer.reset(); - if (!dit_ggml_load_lora(&model, lora_path, lora_scale)) { - fprintf(stderr, "FATAL: failed to load LoRA from %s\n", lora_path); - dit_ggml_free(&model); - return 1; - } - fprintf(stderr, "[Load] LoRA: %.1f ms\n", timer.ms()); - } - // Read DiT GGUF metadata + silence_latent tensor (once) bool is_turbo = false; std::vector silence_full; // [15000, 64] f32 @@ -247,11 +226,8 @@ int main(int argc, char ** argv) { continue; } - // Extract params (append custom_tag to caption for LoRA/condition so trigger is in text) - std::string caption_for_cond = req.caption; - if (!req.custom_tag.empty()) - caption_for_cond += ", " + req.custom_tag; - const char * caption = caption_for_cond.c_str(); + // Extract params + const char * caption = req.caption.c_str(); 
const char * lyrics = req.lyrics.c_str(); char bpm_str[16] = "N/A"; if (req.bpm > 0) snprintf(bpm_str, sizeof(bpm_str), "%d", req.bpm); @@ -262,12 +238,12 @@ int main(int argc, char ** argv) { float duration = req.duration > 0 ? req.duration : 30.0f; long long seed = req.seed; int num_steps = req.inference_steps > 0 ? req.inference_steps : 8; - float guidance_scale = req.guidance_scale > 0 ? req.guidance_scale : 7.0f; + float guidance_scale = req.guidance_scale; float shift = req.shift > 0 ? req.shift : 1.0f; - float cover_strength = req.audio_cover_strength >= 0 && req.audio_cover_strength <= 1 - ? req.audio_cover_strength : 1.0f; - if (is_turbo && guidance_scale > 1.0f) { + if (guidance_scale <= 0.0f) + guidance_scale = is_turbo ? 1.0f : 7.0f; + else if (is_turbo && guidance_scale > 1.0f) { fprintf(stderr, "[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was %.1f)\n", guidance_scale); guidance_scale = 1.0f; @@ -281,39 +257,8 @@ int main(int argc, char ** argv) { fprintf(stderr, "[Pipeline] seed=%lld, steps=%d, guidance=%.1f, shift=%.1f, duration=%.1fs\n", seed, num_steps, guidance_scale, shift, duration); - // Parse audio codes from request (or produce from src_audio WAV/MP3) + // Parse audio codes from request std::vector codes_vec = parse_codes_string(req.audio_codes); - if (codes_vec.empty() && !req.src_audio.empty() && have_vae) { - const std::string & src_path = req.src_audio; - std::vector wav_stereo; - int n_samples = load_audio_48k_stereo(src_path.c_str(), &wav_stereo); - if (n_samples > 0) { - int T_audio = n_samples; - if (T_audio >= 1920) { - VAEEncoderGGML enc = {}; - if (vae_encoder_load(&enc, vae_gguf)) { - size_t max_lat = (size_t)(T_audio / 2048) + 1; - std::vector enc_out(max_lat * 64); - int T_lat = vae_encoder_forward(&enc, wav_stereo.data(), T_audio, enc_out.data()); - vae_encoder_free(&enc); - if (T_lat >= FSQ_FRAMES_PER_CODE) { - DetokGGML detok = {}; - if (detok_ggml_load(&detok, dit_gguf, model.backend, model.cpu_backend)) { - 
std::vector codeword_table((size_t)FSQ_N_CODES * FSQ_FRAMES_PER_CODE * 64); - fprintf(stderr, "[Cover] building FSQ codeword table (8000 codes)...\n"); - detok_ggml_build_codeword_table(&detok, codeword_table.data()); - latent_frames_to_codes(T_lat, enc_out.data(), codeword_table.data(), &codes_vec); - fprintf(stderr, "[Cover] encoded %s -> %zu codes (%.1fs @ 5Hz)\n", - src_path.c_str(), codes_vec.size(), (float)codes_vec.size() / 5.0f); - detok_ggml_free(&detok); - } - } - } - } - } else { - fprintf(stderr, "[Cover] WARNING: cannot load src_audio %s (use .wav or .mp3), skipping cover-from-file\n", src_path.c_str()); - } - } if (!codes_vec.empty()) fprintf(stderr, "[Pipeline] %zu audio codes (%.1fs @ 5Hz)\n", codes_vec.size(), (float)codes_vec.size() / 5.0f); @@ -416,50 +361,16 @@ int main(int argc, char ** argv) { } fprintf(stderr, "[Load] ConditionEncoder: %.1f ms\n", timer.ms()); - // Timbre input: reference_audio (WAV or MP3 via VAE encoder) or silence (first 750 frames = 30s @ 25Hz) + // Silence feats for timbre input: first 750 frames (30s @ 25Hz) const int S_ref = 750; - std::vector timbre_feats(S_ref * 64); - const float * timbre_ptr = silence_full.data(); - int S_ref_actual = S_ref; - if (!req.reference_audio.empty()) { - const std::string & ref_path = req.reference_audio; - std::vector wav_stereo; - int n_samples = load_audio_48k_stereo(ref_path.c_str(), &wav_stereo); - if (n_samples > 0 && have_vae) { - VAEEncoderGGML enc = {}; - if (vae_encoder_load(&enc, vae_gguf)) { - int T_audio = n_samples; - if (T_audio >= 1920) { - // Encoder strides 2,4,4,8,8 -> max latent frames = T_audio/2048 + 1 - size_t max_lat = (size_t)(T_audio / 2048) + 1; - std::vector enc_out(max_lat * 64); - int T_lat = vae_encoder_forward(&enc, wav_stereo.data(), T_audio, enc_out.data()); - if (T_lat > 0) { - size_t copy_frames = (size_t)(T_lat < S_ref ? 
T_lat : S_ref); - memcpy(timbre_feats.data(), enc_out.data(), copy_frames * 64 * sizeof(float)); - if (T_lat < S_ref) - memcpy(timbre_feats.data() + copy_frames * 64, silence_full.data(), - (S_ref - (int)copy_frames) * 64 * sizeof(float)); - S_ref_actual = (int)copy_frames; - if (T_lat > S_ref) S_ref_actual = S_ref; - timbre_ptr = timbre_feats.data(); - fprintf(stderr, "[Timbre] encoded %s -> %d frames (25Hz)\n", ref_path.c_str(), S_ref_actual); - } - } - vae_encoder_free(&enc); - } - } else if (n_samples <= 0) { - fprintf(stderr, "[Timbre] WARNING: cannot load audio %s (use .wav or .mp3), using silence\n", ref_path.c_str()); - } else if (!have_vae) { - fprintf(stderr, "[Timbre] reference_audio requires --vae (with encoder weights); using silence\n"); - } - } + std::vector silence_feats(S_ref * 64); + memcpy(silence_feats.data(), silence_full.data(), S_ref * 64 * sizeof(float)); timer.reset(); std::vector enc_hidden; cond_ggml_forward(&cond, text_hidden.data(), S_text, lyric_embed.data(), S_lyric, - timbre_ptr, S_ref_actual, + silence_feats.data(), S_ref, enc_hidden, &enc_S); fprintf(stderr, "[Encode] ConditionEncoder: %.1f ms, enc_S=%d\n", timer.ms(), enc_S); @@ -503,20 +414,15 @@ int main(int argc, char ** argv) { } // Build single context: [T, ctx_ch] = src_latents[64] + mask_ones[64] - // src_latents = blend(decoded_codes, silence) for t context_single(T * ctx_ch); for (int t = 0; t < T; t++) { - for (int c = 0; c < Oc; c++) { - float v; - if (t < decoded_T) { - float dec = decoded_latents[t * Oc + c]; - float sil = silence_full[c]; // frame 0 of silence - v = (1.0f - cover_strength) * sil + cover_strength * dec; - } else { - v = silence_full[(t - decoded_T) * Oc + c]; - } - context_single[t * ctx_ch + c] = v; - } + const float * src = (t < decoded_T) + ? 
decoded_latents.data() + t * Oc + : silence_full.data() + (t - decoded_T) * Oc; + for (int c = 0; c < Oc; c++) + context_single[t * ctx_ch + c] = src[c]; for (int c = 0; c < Oc; c++) context_single[t * ctx_ch + Oc + c] = 1.0f; } diff --git a/tools/neural-codec.cpp b/tools/neural-codec.cpp new file mode 100644 index 0000000..e75f3c3 --- /dev/null +++ b/tools/neural-codec.cpp @@ -0,0 +1,522 @@ +// neural-codec.cpp: neural audio codec (Oobleck VAE) +// +// 2 modes: +// encode: WAV -> latent file (f32, Q8, or Q4) +// decode: latent file -> WAV (48kHz stereo) +// +// Three latent formats, decode auto-detects: +// +// f32 (default): flat [T, 64] f32, no header. +// T = file_size / 256. 25Hz, ~6.4 KB/s, ~51 kbit/s. +// +// Q8 (--q8): symmetric per-frame int8 quantization. +// header: "NAC8" magic (4B) + uint32 T_latent (4B) +// frame: f16 scale (2B) + int8[64] (64B) = 66B +// 25Hz, ~1.6 KB/s, ~13 kbit/s. +// +// Q4 (--q4): symmetric per-frame 4-bit quantization. +// header: "NAC4" magic (4B) + uint32 T_latent (4B) +// frame: f16 scale (2B) + nibbles[32] (32B) = 34B +// 25Hz, ~850 B/s, ~6.8 kbit/s. +// +// Usage: +// neural-codec --vae model.gguf --encode -i song.wav -o song.latent +// neural-codec --vae model.gguf --encode --q8 -i song.wav -o song.nac8 +// neural-codec --vae model.gguf --encode --q4 -i song.wav -o song.nac4 +// neural-codec --vae model.gguf --decode -i song.nac4 -o song.wav + +#include "vae.h" +#include "vae-enc.h" +#include +#include +#include +#include +#include + +// Minimal WAV reader: 16-bit PCM or 32-bit float, mono/stereo, any sample rate. +// Returns interleaved float [T, 2]. Sets *T_audio, *sr. Caller frees. 
+static float * read_wav(const char * path, int * T_audio, int * sr) { + FILE * f = fopen(path, "rb"); + if (!f) { fprintf(stderr, "[WAV] Cannot open %s\n", path); return NULL; } + + char riff[4]; fread(riff, 1, 4, f); + if (memcmp(riff, "RIFF", 4) != 0) { + fprintf(stderr, "[WAV] Not a RIFF file: %s\n", path); fclose(f); return NULL; + } + fseek(f, 4, SEEK_CUR); + char wave[4]; fread(wave, 1, 4, f); + if (memcmp(wave, "WAVE", 4) != 0) { + fprintf(stderr, "[WAV] Not a WAVE file: %s\n", path); fclose(f); return NULL; + } + + int n_channels = 0, sample_rate = 0, bits_per_sample = 0; + short audio_format = 0; + float * audio = NULL; + int n_samples = 0; + + while (!feof(f)) { + char chunk_id[4]; + int chunk_size; + if (fread(chunk_id, 1, 4, f) != 4) break; + if (fread(&chunk_size, 4, 1, f) != 1) break; + + if (memcmp(chunk_id, "fmt ", 4) == 0) { + fread(&audio_format, 2, 1, f); + short nc; fread(&nc, 2, 1, f); n_channels = nc; + fread(&sample_rate, 4, 1, f); + fseek(f, 4, SEEK_CUR); // byte_rate + fseek(f, 2, SEEK_CUR); // block_align + short bps; fread(&bps, 2, 1, f); bits_per_sample = bps; + int consumed = 16; + if (chunk_size > consumed) fseek(f, chunk_size - consumed, SEEK_CUR); + + } else if (memcmp(chunk_id, "data", 4) == 0 && n_channels > 0) { + if (audio_format == 1 && bits_per_sample == 16) { + n_samples = chunk_size / (n_channels * 2); + audio = (float *)malloc((size_t)n_samples * 2 * sizeof(float)); + std::vector buf((size_t)n_samples * n_channels); + fread(buf.data(), 2, (size_t)n_samples * n_channels, f); + for (int t = 0; t < n_samples; t++) { + if (n_channels == 1) { + float s = (float)buf[t] / 32768.0f; + audio[t * 2 + 0] = s; + audio[t * 2 + 1] = s; + } else { + audio[t * 2 + 0] = (float)buf[t * n_channels + 0] / 32768.0f; + audio[t * 2 + 1] = (float)buf[t * n_channels + 1] / 32768.0f; + } + } + } else if (audio_format == 3 && bits_per_sample == 32) { + n_samples = chunk_size / (n_channels * 4); + audio = (float *)malloc((size_t)n_samples * 2 * 
sizeof(float)); + std::vector buf((size_t)n_samples * n_channels); + fread(buf.data(), 4, (size_t)n_samples * n_channels, f); + for (int t = 0; t < n_samples; t++) { + if (n_channels == 1) { + audio[t * 2 + 0] = buf[t]; + audio[t * 2 + 1] = buf[t]; + } else { + audio[t * 2 + 0] = buf[t * n_channels + 0]; + audio[t * 2 + 1] = buf[t * n_channels + 1]; + } + } + } else { + fprintf(stderr, "[WAV] Unsupported: format=%d bits=%d (need PCM16 or float32)\n", + audio_format, bits_per_sample); + fclose(f); return NULL; + } + break; + } else { + fseek(f, chunk_size, SEEK_CUR); + } + } + fclose(f); + if (!audio) { fprintf(stderr, "[WAV] No audio data in %s\n", path); return NULL; } + + *T_audio = n_samples; + *sr = sample_rate; + fprintf(stderr, "[WAV] Read %s: %d samples, %d Hz, %d ch, %d bit\n", + path, n_samples, sample_rate, n_channels, bits_per_sample); + return audio; +} + +// WAV writer: planar [ch0: T, ch1: T] -> 16-bit PCM stereo +static bool write_wav(const char * path, const float * audio, int T_audio, int sr) { + FILE * f = fopen(path, "wb"); + if (!f) return false; + int n_channels = 2, bits = 16; + int byte_rate = sr * n_channels * (bits / 8); + int block_align = n_channels * (bits / 8); + int data_size = T_audio * n_channels * (bits / 8); + int file_size = 36 + data_size; + fwrite("RIFF", 1, 4, f); + fwrite(&file_size, 4, 1, f); + fwrite("WAVE", 1, 4, f); + fwrite("fmt ", 1, 4, f); + int fmt_size = 16; fwrite(&fmt_size, 4, 1, f); + short audio_fmt = 1; fwrite(&audio_fmt, 2, 1, f); + short nc = (short)n_channels; fwrite(&nc, 2, 1, f); + fwrite(&sr, 4, 1, f); + fwrite(&byte_rate, 4, 1, f); + short ba = (short)block_align; fwrite(&ba, 2, 1, f); + short bp = (short)bits; fwrite(&bp, 2, 1, f); + fwrite("data", 1, 4, f); + fwrite(&data_size, 4, 1, f); + for (int t = 0; t < T_audio; t++) { + for (int c = 0; c < 2; c++) { + float s = audio[c * T_audio + t]; + s = s < -1.0f ? -1.0f : (s > 1.0f ? 
1.0f : s); + short v = (short)(s * 32767.0f); + fwrite(&v, 2, 1, f); + } + } + fclose(f); + return true; +} + +// Q8 format constants +static const char NAC8_MAGIC[4] = {'N', 'A', 'C', '8'}; +static const int NAC8_HEADER = 8; // 4B magic + 4B T_latent +static const int NAC8_FRAME = 66; // 2B f16 scale + 64B int8 + +// Write Q8 quantized latent +static bool write_latent_q8(const char * path, const float * data, int T_latent) { + FILE * f = fopen(path, "wb"); + if (!f) return false; + + fwrite(NAC8_MAGIC, 1, 4, f); + uint32_t t = (uint32_t)T_latent; + fwrite(&t, 4, 1, f); + + for (int i = 0; i < T_latent; i++) { + const float * frame = data + i * 64; + + // find max abs for symmetric quant + float amax = 0.0f; + for (int j = 0; j < 64; j++) { + float a = fabsf(frame[j]); + if (a > amax) amax = a; + } + float scale = amax / 127.0f; + ggml_fp16_t scale_f16 = ggml_fp32_to_fp16(scale); + fwrite(&scale_f16, 2, 1, f); + + // quantize + int8_t q[64]; + float inv = (scale > 0.0f) ? 127.0f / amax : 0.0f; + for (int j = 0; j < 64; j++) { + int v = (int)roundf(frame[j] * inv); + q[j] = (int8_t)(v < -127 ? -127 : (v > 127 ? 
127 : v)); + } + fwrite(q, 1, 64, f); + } + fclose(f); + + size_t bytes = NAC8_HEADER + (size_t)T_latent * NAC8_FRAME; + float duration = (float)T_latent * 1920.0f / 48000.0f; + float kbps = (float)bytes * 8.0f / (duration * 1000.0f); + fprintf(stderr, "[Latent] Wrote %s: Q8, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, T_latent, duration, (float)bytes / 1024.0f, kbps); + return true; +} + +// Q4 format constants +static const char NAC4_MAGIC[4] = {'N', 'A', 'C', '4'}; +static const int NAC4_HEADER = 8; // 4B magic + 4B T_latent +static const int NAC4_FRAME = 34; // 2B f16 scale + 32B packed nibbles + +// Write Q4 quantized latent +// Symmetric 4-bit: range [-7, 7], scale = amax / 7.0 +// Packing: byte = (low & 0x0F) | (high << 4), two signed nibbles per byte +static bool write_latent_q4(const char * path, const float * data, int T_latent) { + FILE * f = fopen(path, "wb"); + if (!f) return false; + + fwrite(NAC4_MAGIC, 1, 4, f); + uint32_t t = (uint32_t)T_latent; + fwrite(&t, 4, 1, f); + + for (int i = 0; i < T_latent; i++) { + const float * frame = data + i * 64; + + // find max abs for symmetric quant + float amax = 0.0f; + for (int j = 0; j < 64; j++) { + float a = fabsf(frame[j]); + if (a > amax) amax = a; + } + float scale = amax / 7.0f; + ggml_fp16_t scale_f16 = ggml_fp32_to_fp16(scale); + fwrite(&scale_f16, 2, 1, f); + + // quantize and pack pairs into bytes + float inv = (scale > 0.0f) ? 7.0f / amax : 0.0f; + uint8_t packed[32]; + for (int j = 0; j < 32; j++) { + int lo = (int)roundf(frame[j * 2 + 0] * inv); + int hi = (int)roundf(frame[j * 2 + 1] * inv); + lo = lo < -7 ? -7 : (lo > 7 ? 7 : lo); + hi = hi < -7 ? -7 : (hi > 7 ? 
7 : hi); + packed[j] = (uint8_t)((lo & 0x0F) | (hi << 4)); + } + fwrite(packed, 1, 32, f); + } + fclose(f); + + size_t bytes = NAC4_HEADER + (size_t)T_latent * NAC4_FRAME; + float duration = (float)T_latent * 1920.0f / 48000.0f; + float kbps = (float)bytes * 8.0f / (duration * 1000.0f); + fprintf(stderr, "[Latent] Wrote %s: Q4, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, T_latent, duration, (float)bytes / 1024.0f, kbps); + return true; +} + +// Write f32 raw latent (no header) +static bool write_latent_f32(const char * path, const float * data, int T_latent) { + FILE * f = fopen(path, "wb"); + if (!f) return false; + size_t bytes = (size_t)T_latent * 64 * sizeof(float); + fwrite(data, 1, bytes, f); + fclose(f); + float duration = (float)T_latent * 1920.0f / 48000.0f; + fprintf(stderr, "[Latent] Wrote %s: f32, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, T_latent, duration, (float)bytes / 1024.0f, + (float)bytes * 8.0f / (duration * 1000.0f)); + return true; +} + +// Read latent, auto-detect format (NAC8 -> Q8, NAC4 -> Q4, else f32). +// Returns [T_latent, 64] f32 (dequantized if quantized). Caller frees. 
+static float * read_latent(const char * path, int * T_latent) { + FILE * f = fopen(path, "rb"); + if (!f) { fprintf(stderr, "[Latent] Cannot open %s\n", path); return NULL; } + fseek(f, 0, SEEK_END); + long fsize = ftell(f); + fseek(f, 0, SEEK_SET); + + // Check magic + char magic[4] = {}; + if (fsize >= 8) fread(magic, 1, 4, f); + + if (memcmp(magic, NAC8_MAGIC, 4) == 0) { + // Q8 format + uint32_t t; + fread(&t, 4, 1, f); + *T_latent = (int)t; + + long expected = NAC8_HEADER + (long)t * NAC8_FRAME; + if (fsize != expected) { + fprintf(stderr, "[Latent] Q8 size mismatch: expected %ld, got %ld\n", expected, fsize); + fclose(f); return NULL; + } + + float * data = (float *)malloc((size_t)t * 64 * sizeof(float)); + for (int i = 0; i < (int)t; i++) { + ggml_fp16_t scale_f16; + fread(&scale_f16, 2, 1, f); + float scale = ggml_fp16_to_fp32(scale_f16); + + int8_t q[64]; + fread(q, 1, 64, f); + + float * frame = data + i * 64; + for (int j = 0; j < 64; j++) + frame[j] = (float)q[j] * scale; + } + fclose(f); + + float duration = (float)(*T_latent) * 1920.0f / 48000.0f; + float kbps = (float)fsize * 8.0f / (duration * 1000.0f); + fprintf(stderr, "[Latent] Read %s: Q8, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, *T_latent, duration, (float)fsize / 1024.0f, kbps); + return data; + } + + if (memcmp(magic, NAC4_MAGIC, 4) == 0) { + // Q4 format + uint32_t t; + fread(&t, 4, 1, f); + *T_latent = (int)t; + + long expected = NAC4_HEADER + (long)t * NAC4_FRAME; + if (fsize != expected) { + fprintf(stderr, "[Latent] Q4 size mismatch: expected %ld, got %ld\n", expected, fsize); + fclose(f); return NULL; + } + + float * data = (float *)malloc((size_t)t * 64 * sizeof(float)); + for (int i = 0; i < (int)t; i++) { + ggml_fp16_t scale_f16; + fread(&scale_f16, 2, 1, f); + float scale = ggml_fp16_to_fp32(scale_f16); + + uint8_t packed[32]; + fread(packed, 1, 32, f); + + // unpack signed nibbles + float * frame = data + i * 64; + for (int j = 0; j < 32; j++) { + int lo = 
(int)(packed[j] & 0x0F); + int hi = (int)(packed[j] >> 4); + if (lo >= 8) lo -= 16; + if (hi >= 8) hi -= 16; + frame[j * 2 + 0] = (float)lo * scale; + frame[j * 2 + 1] = (float)hi * scale; + } + } + fclose(f); + + float duration = (float)(*T_latent) * 1920.0f / 48000.0f; + float kbps = (float)fsize * 8.0f / (duration * 1000.0f); + fprintf(stderr, "[Latent] Read %s: Q4, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, *T_latent, duration, (float)fsize / 1024.0f, kbps); + return data; + } + + // f32 format (no header, rewind) + fseek(f, 0, SEEK_SET); + if (fsize % (64 * (int)sizeof(float)) != 0) { + fprintf(stderr, "[Latent] File size %ld not a multiple of %d (64 * f32)\n", + fsize, (int)(64 * sizeof(float))); + fclose(f); return NULL; + } + + *T_latent = (int)(fsize / (64 * sizeof(float))); + float * data = (float *)malloc(fsize); + fread(data, 1, fsize, f); + fclose(f); + + float duration = (float)(*T_latent) * 1920.0f / 48000.0f; + fprintf(stderr, "[Latent] Read %s: f32, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, *T_latent, duration, (float)fsize / 1024.0f, + (float)fsize * 8.0f / (duration * 1000.0f)); + return data; +} + +static void print_usage(const char * prog) { + fprintf(stderr, + "Usage: %s --vae --encode|--decode -i [-o ] [--q8|--q4]\n\n" + "Required:\n" + " --vae VAE GGUF file\n" + " --encode | --decode Encode WAV to latent, or decode latent to WAV\n" + " -i Input (WAV for encode, latent for decode)\n\n" + "Output:\n" + " -o Output file (auto-named if omitted)\n" + " --q8 Quantize latent to int8 (~13 kbit/s)\n" + " --q4 Quantize latent to int4 (~6.8 kbit/s)\n\n" + "Output naming: song.wav -> song.latent (f32) or song.nac8 (Q8) or song.nac4 (Q4)\n" + " song.latent -> song.wav\n\n" + "VAE tiling (memory control):\n" + " --vae-chunk Latent frames per tile (default: 256)\n" + " --vae-overlap Overlap frames per side (default: 64)\n\n" + "Latent formats (decode auto-detects):\n" + " f32: flat [T, 64] f32, no header. 
~51 kbit/s.\n" + " NAC8: header + per-frame Q8. ~13 kbit/s.\n" + " NAC4: header + per-frame Q4. ~6.8 kbit/s.\n", + prog); +} + +static std::string auto_output(const char * input, const char * ext) { + std::string s = input; + size_t dot = s.rfind('.'); + if (dot != std::string::npos) + return s.substr(0, dot) + ext; + return s + ext; +} + +int main(int argc, char ** argv) { + const char * vae_path = NULL; + const char * input_path = NULL; + const char * output_path = NULL; + int chunk_size = 256; + int overlap = 64; + int mode = -1; // 0 = encode, 1 = decode + int quant = 0; // 0 = f32, 8 = q8, 4 = q4 + + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--vae") == 0 && i + 1 < argc) vae_path = argv[++i]; + else if (strcmp(argv[i], "-i") == 0 && i + 1 < argc) input_path = argv[++i]; + else if (strcmp(argv[i], "--input") == 0 && i + 1 < argc) input_path = argv[++i]; + else if (strcmp(argv[i], "-o") == 0 && i + 1 < argc) output_path = argv[++i]; + else if (strcmp(argv[i], "--output") == 0 && i + 1 < argc) output_path = argv[++i]; + else if (strcmp(argv[i], "--vae-chunk") == 0 && i + 1 < argc) chunk_size = atoi(argv[++i]); + else if (strcmp(argv[i], "--vae-overlap") == 0 && i + 1 < argc) overlap = atoi(argv[++i]); + else if (strcmp(argv[i], "--encode") == 0) mode = 0; + else if (strcmp(argv[i], "--decode") == 0) mode = 1; + else if (strcmp(argv[i], "--q8") == 0) quant = 8; + else if (strcmp(argv[i], "--q4") == 0) quant = 4; + else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) { + print_usage(argv[0]); return 0; + } else { + fprintf(stderr, "Unknown arg: %s\n", argv[i]); + print_usage(argv[0]); return 1; + } + } + + if (!vae_path || !input_path || mode < 0) { + print_usage(argv[0]); return 1; + } + + // Auto output names + std::string out_str; + if (!output_path) { + if (mode == 0) { + const char * ext = ".latent"; + if (quant == 8) ext = ".nac8"; + if (quant == 4) ext = ".nac4"; + out_str = auto_output(input_path, ext); + } else { + out_str = 
auto_output(input_path, ".wav"); + } + output_path = out_str.c_str(); + } + + const char * quant_str = ""; + if (mode == 0 && quant == 8) quant_str = " (Q8)"; + if (mode == 0 && quant == 4) quant_str = " (Q4)"; + fprintf(stderr, "\n[VAE] Mode: %s%s\n", mode == 0 ? "encode" : "decode", quant_str); + fprintf(stderr, "[VAE] Input: %s\n", input_path); + fprintf(stderr, "[VAE] Output: %s\n\n", output_path); + + // ENCODE + if (mode == 0) { + int T_audio = 0, sr = 0; + float * audio = read_wav(input_path, &T_audio, &sr); + if (!audio) return 1; + if (sr != 48000) + fprintf(stderr, "[WARN] Input is %d Hz, VAE expects 48000. Resample with ffmpeg first.\n", sr); + + VAEEncoder enc = {}; + vae_enc_load(&enc, vae_path); + + int max_T = (T_audio / 1920) + 64; + std::vector latent((size_t)max_T * 64); + + fprintf(stderr, "\n[VAE] Encoding %d samples (%.2fs)...\n", + T_audio, (float)T_audio / (float)(sr > 0 ? sr : 48000)); + int T_latent = vae_enc_encode_tiled(&enc, audio, T_audio, + latent.data(), max_T, chunk_size, overlap); + free(audio); + if (T_latent < 0) { vae_enc_free(&enc); return 1; } + + if (quant == 8) + write_latent_q8(output_path, latent.data(), T_latent); + else if (quant == 4) + write_latent_q4(output_path, latent.data(), T_latent); + else + write_latent_f32(output_path, latent.data(), T_latent); + + vae_enc_free(&enc); + fprintf(stderr, "[VAE] Done.\n"); + return 0; + } + + // DECODE (auto-detects f32 vs Q8 vs Q4 from file content) + { + int T_latent = 0; + float * latent = read_latent(input_path, &T_latent); + if (!latent) return 1; + + VAEGGML dec = {}; + vae_ggml_load(&dec, vae_path); + + int max_T = T_latent * 1920 + 4096; + std::vector audio((size_t)2 * max_T, 0.0f); + + fprintf(stderr, "\n[VAE] Decoding %d latent frames...\n", T_latent); + int T_audio = vae_ggml_decode_tiled(&dec, latent, T_latent, + audio.data(), max_T, chunk_size, overlap); + free(latent); + if (T_audio < 0) { vae_ggml_free(&dec); return 1; } + + if (write_wav(output_path, audio.data(), 
T_audio, 48000)) + fprintf(stderr, "\n[VAE] Output: %s (%d samples, %.2fs @ 48kHz)\n", + output_path, T_audio, (float)T_audio / 48000.0f); + else + fprintf(stderr, "[VAE] FATAL: failed to write %s\n", output_path); + + vae_ggml_free(&dec); + fprintf(stderr, "[VAE] Done.\n"); + return 0; + } +} diff --git a/tools/quantize.cpp b/tools/quantize.cpp index c778a47..84a3dd4 100644 --- a/tools/quantize.cpp +++ b/tools/quantize.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #ifdef _WIN32