From c6640c02b1521c25b32e5ddd6d6a78766cce8607 Mon Sep 17 00:00:00 2001 From: qxip Date: Sat, 28 Feb 2026 19:26:07 +0100 Subject: [PATCH 01/17] Reference audio (WAV/MP3), CI build + test-generation - Add unified audio loader (audio.h, audio_loader.cpp, wav.h): WAV and MP3 to stereo 48kHz float; MP3 via header-only minimp3, no temp files - dit-vae: reference_audio path uses load_audio_48k_stereo; VAE encoder for timbre conditioning (requires VAE GGUF with encoder weights) - Request API: task_type, reference_audio, src_audio, audio_cover_strength, repainting_start/end; docs in README and docs/MODES.md - GitHub Actions: build.yml (Ubuntu + macOS on push/PR); test-generation.yml (manual + release only, short text2music/cover/full-pipeline tests) - CI fixtures: tests/fixtures/ci-text2music.json, ci-cover.json (cover uses text2music WAV as reference) Made-with: Cursor --- .github/workflows/build.yml | 44 + .github/workflows/test-generation.yml | 72 + .gitignore | 2 + CMakeLists.txt | 2 +- README.md | 17 +- audio.h | 17 + audio_loader.cpp | 117 ++ dit-vae.cpp | 65 +- docs/MODES.md | 79 ++ examples/request-reference.json | 13 + examples/test-reference.sh | 21 + request.cpp | 39 +- request.h | 19 +- tests/fixtures/ci-cover.json | 12 + tests/fixtures/ci-text2music.json | 11 + third_party/minimp3.h | 1865 +++++++++++++++++++++++++ vae.h | 180 +++ wav.h | 100 ++ 18 files changed, 2653 insertions(+), 22 deletions(-) create mode 100644 .github/workflows/build.yml create mode 100644 .github/workflows/test-generation.yml create mode 100644 audio.h create mode 100644 audio_loader.cpp create mode 100644 docs/MODES.md create mode 100644 examples/request-reference.json create mode 100755 examples/test-reference.sh create mode 100644 tests/fixtures/ci-cover.json create mode 100644 tests/fixtures/ci-text2music.json create mode 100644 third_party/minimp3.h create mode 100644 wav.h diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 
0000000..fe17246 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,44 @@ +# Validate that the project builds on Ubuntu and macOS (no model download). +name: Build + +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Build (Ubuntu) + if: matrix.os == 'ubuntu-latest' + run: | + sudo apt-get update -qq + sudo apt-get install -y -qq cmake build-essential pkg-config libopenblas-dev + mkdir build && cd build + cmake .. -DGGML_BLAS=ON + cmake --build . --config Release -j$(nproc) + + - name: Build (macOS) + if: matrix.os == 'macos-latest' + run: | + mkdir build && cd build + cmake .. + cmake --build . --config Release -j$(sysctl -n hw.ncpu) + + - name: Smoke test + run: | + ./build/ace-qwen3 --help 2>&1 | head -5 + ./build/dit-vae --help 2>&1 | head -5 + ./build/quantize --help 2>&1 | head -3 diff --git a/.github/workflows/test-generation.yml b/.github/workflows/test-generation.yml new file mode 100644 index 0000000..3c9547c --- /dev/null +++ b/.github/workflows/test-generation.yml @@ -0,0 +1,72 @@ +# Build, download models (cached), and run short generation tests for various modes. +# Runs on release (published) or manual trigger only. Uses short duration (5s) and few steps (4). +name: Test generation + +on: + workflow_dispatch: {} + release: + types: [published] + +jobs: + build-and-test: + runs-on: ubuntu-latest + timeout-minutes: 60 + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Build + run: | + sudo apt-get update -qq + sudo apt-get install -y -qq cmake build-essential pkg-config libopenblas-dev + mkdir build && cd build + cmake .. -DGGML_BLAS=ON + cmake --build . 
--config Release -j$(nproc) + + - name: Cache models + id: cache-models + uses: actions/cache@v4 + with: + path: models + key: acestep-models-q8-${{ hashFiles('models.sh') }} + restore-keys: acestep-models-q8- + + - name: Download models + if: steps.cache-models.outputs.cache-hit != 'true' + run: | + pip install -q hf + ./models.sh + + - name: Test mode text2music (short) + run: | + ./build/dit-vae \ + --request tests/fixtures/ci-text2music.json \ + --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit models/acestep-v15-turbo-Q8_0.gguf \ + --vae models/vae-BF16.gguf + test -f tests/fixtures/ci-text2music0.wav && echo "text2music WAV OK" + + - name: Test mode cover with WAV reference (short) + run: | + ./build/dit-vae \ + --request tests/fixtures/ci-cover.json \ + --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit models/acestep-v15-turbo-Q8_0.gguf \ + --vae models/vae-BF16.gguf + test -f tests/fixtures/ci-cover0.wav && echo "cover WAV OK" + + - name: Test full pipeline (LLM + DiT, short) + run: | + ./build/ace-qwen3 \ + --request tests/fixtures/ci-text2music.json \ + --model models/acestep-5Hz-lm-4B-Q8_0.gguf + test -f request0.json + ./build/dit-vae \ + --request request0.json \ + --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit models/acestep-v15-turbo-Q8_0.gguf \ + --vae models/vae-BF16.gguf + test -f request00.wav && echo "full pipeline WAV OK" diff --git a/.gitignore b/.gitignore index 4ccb4f4..0fa15e6 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,8 @@ build/ *.bf16 tests/*/ +!tests/fixtures/ +!tests/fixtures/*.json checkpoints/ models/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 7721447..b9a4830 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,7 +46,7 @@ macro(link_ggml_backends target) endmacro() # dit-vae: full pipeline (text-enc + cond + dit + vae + wav) -add_executable(dit-vae dit-vae.cpp request.cpp) +add_executable(dit-vae dit-vae.cpp request.cpp audio_loader.cpp) link_ggml_backends(dit-vae) # 
ace-qwen3: LLM inference (CoT + audio codes) diff --git a/README.md b/README.md index 2467b53..55ac16d 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,10 @@ cmake --build . --config Release -j$(nproc) Builds two binaries: `ace-qwen3` (LLM) and `dit-vae` (DiT + VAE). +**CI (GitHub Actions)** +- **Build**: on every push/PR, builds on Ubuntu (BLAS) and macOS (Metal); smoke test runs each binary `--help`. +- **Test generation**: on manual trigger or on a published release (see `test-generation.yml`); builds, caches models, then runs short (5 s, 4 steps) generation for text2music, cover, and full pipeline (LLM → DiT → WAV). See `.github/workflows/`. + ## Models Pre-quantized GGUFs on [Hugging Face](https://huggingface.co/Serveurperso/ACE-Step-1.5-GGUF). @@ -139,10 +143,11 @@ cd examples ./partial.sh # caption + lyrics + duration ./full.sh # all metadata provided ./dit-only.sh # skip LLM, DiT from noise +./test-reference.sh # reference_audio (WAV or MP3) + audio_cover_strength ``` Each example has a `-sft` variant (SFT model, 50 steps, CFG 7.0) -alongside the turbo default (8 steps, no CFG). +alongside the turbo default (8 steps, no CFG). For **reference timbre**, set `reference_audio` to a **WAV or MP3** path; dit-vae loads it (MP3 decoded in memory via header-only minimp3, no temp files), encodes with the VAE encoder (requires a full VAE GGUF that includes encoder weights). ## Generation modes @@ -170,10 +175,11 @@ Run `dit-vae` to decode existing codes. See `examples/dit-only.json`. ## Request JSON reference -All fields with defaults. Only `caption` is required. +All fields with defaults. Only `caption` is required. Built-in modes (text2music, cover, repaint) and audio inputs follow the [ACE-Step 1.5 Tutorial](https://github.com/ace-step/ACE-Step-1.5/blob/main/docs/en/Tutorial.md); see [docs/MODES.md](docs/MODES.md) for what is implemented. ```json { + "task_type": "text2music", "caption": "", "lyrics": "", "instrumental": false, @@ -188,7 +194,12 @@ All fields with defaults. 
Only `caption` is required. "lm_top_p": 0.9, "lm_top_k": 0, "lm_negative_prompt": "", + "reference_audio": "", + "src_audio": "", "audio_codes": "", + "audio_cover_strength": 1.0, + "repainting_start": 0.0, + "repainting_end": 0.0, "inference_steps": 8, "guidance_scale": 7.0, "shift": 3.0 @@ -198,7 +209,7 @@ All fields with defaults. Only `caption` is required. Key fields: `seed` -1 means random (resolved once, then +1 per batch element). `audio_codes` is generated by ace-qwen3 and consumed by dit-vae (comma separated FSQ token IDs). When present, the LLM is -skipped entirely. +skipped entirely (cover-style generation). `reference_audio`: path to a **WAV or MP3** file for global timbre/style (MP3 decoded in memory; encoded via built-in VAE encoder; requires VAE GGUF with encoder weights). `src_audio`: not yet implemented (see docs/MODES.md). Turbo preset: `inference_steps=8, shift=3.0` (no guidance_scale, turbo models don't use CFG). SFT preset: `inference_steps=50, guidance_scale=4.0, shift=6.0`. diff --git a/audio.h b/audio.h new file mode 100644 index 0000000..bd90915 --- /dev/null +++ b/audio.h @@ -0,0 +1,17 @@ +// audio.h: unified reference-audio loader (WAV + MP3 → stereo 48kHz float) +// Header-only for WAV; MP3 implementation in audio_loader.cpp (minimp3, no temp files). + +#pragma once + +#include +#include +#include + +// Load WAV or MP3 file into stereo float32 at 48kHz. +// Out: interleaved L,R,L,R,...; length = num_samples (per channel). +// Returns num_samples (per channel), or -1 on error. +// No temp files; MP3 decoded in memory via minimp3 (header-only dep). 
+int load_audio_48k_stereo(const char * path, std::vector * out); + +// MP3 implementation (in audio_loader.cpp; do not call from other TUs without linking it) +int mp3_load_48k_stereo(const char * path, std::vector * out); diff --git a/audio_loader.cpp b/audio_loader.cpp new file mode 100644 index 0000000..a0e71b5 --- /dev/null +++ b/audio_loader.cpp @@ -0,0 +1,117 @@ +// audio_loader.cpp: MP3 decode for reference audio (minimp3, no deps, no temp files) + +#define MINIMP3_IMPLEMENTATION +#include "third_party/minimp3.h" + +#include "wav.h" +#include "audio.h" +#include +#include +#include +#include + +static bool path_ends_with_ci(const char * path, const char * suffix) { + size_t pl = strlen(path), sl = strlen(suffix); + if (pl < sl) return false; + const char * p = path + pl - sl; + for (size_t i = 0; i < sl; i++) { + char a = (char)(p[i] >= 'A' && p[i] <= 'Z' ? p[i] + 32 : p[i]); + char b = (char)(suffix[i] >= 'A' && suffix[i] <= 'Z' ? suffix[i] + 32 : suffix[i]); + if (a != b) return false; + } + return true; +} + +static void pcm_to_float_stereo_48k( + const int16_t * pcm, size_t num_samples, int channels, unsigned int sample_rate, + std::vector * out) +{ + const float scale = 1.0f / 32768.0f; + out->resize(num_samples * 2); + if (channels == 1) { + for (size_t i = 0; i < num_samples; i++) { + float s = (float)pcm[i] * scale; + (*out)[i * 2] = s; + (*out)[i * 2 + 1] = s; + } + } else { + for (size_t i = 0; i < num_samples * 2; i++) + (*out)[i] = (float)pcm[i] * scale; + } + + if (sample_rate != 48000) { + size_t in_len = num_samples; + size_t out_len = (size_t)((double)in_len * 48000.0 / (double)sample_rate); + std::vector resampled(out_len * 2); + for (size_t i = 0; i < out_len; i++) { + double t = (double)i * (double)in_len / (double)out_len; + size_t i0 = (size_t)t; + size_t i1 = std::min(i0 + 1, in_len - 1); + float w = (float)(t - (double)i0); + for (int c = 0; c < 2; c++) + resampled[i * 2 + c] = (*out)[i0 * 2 + c] * (1.0f - w) + (*out)[i1 * 2 + c] * w; 
+ } + *out = std::move(resampled); + } +} + +int mp3_load_48k_stereo(const char * path, std::vector * out) { + FILE * f = fopen(path, "rb"); + if (!f) return -1; + fseek(f, 0, SEEK_END); + long sz = ftell(f); + fseek(f, 0, SEEK_SET); + if (sz <= 0 || sz > 200 * 1024 * 1024) { + fclose(f); + return -1; + } + std::vector buf((size_t)sz); + if (fread(buf.data(), 1, (size_t)sz, f) != (size_t)sz) { + fclose(f); + return -1; + } + fclose(f); + + mp3dec_t dec; + mp3dec_init(&dec); + mp3dec_frame_info_t info; + std::vector pcm; + const uint8_t * read_pos = buf.data(); + int remaining = (int)buf.size(); + int first_hz = 0, first_ch = 0; + const size_t max_samples = (size_t)(60 * 48000 * 2); + + while (remaining > 0) { + size_t old_size = pcm.size(); + if (old_size + (size_t)MINIMP3_MAX_SAMPLES_PER_FRAME > max_samples) break; + pcm.resize(old_size + (size_t)MINIMP3_MAX_SAMPLES_PER_FRAME); + int frame_samples = mp3dec_decode_frame(&dec, read_pos, remaining, pcm.data() + old_size, &info); + if (frame_samples <= 0) { + pcm.resize(old_size); + read_pos++; + remaining--; + continue; + } + if (first_hz == 0) { + first_hz = info.hz; + first_ch = info.channels; + } + pcm.resize(old_size + (size_t)(frame_samples * info.channels)); + read_pos += info.frame_bytes; + remaining -= info.frame_bytes; + } + + if (pcm.empty() || first_hz == 0) return -1; + size_t num_samples = pcm.size() / (size_t)first_ch; + pcm_to_float_stereo_48k(pcm.data(), num_samples, first_ch, (unsigned)first_hz, out); + return (int)(out->size() / 2); +} + +int load_audio_48k_stereo(const char * path, std::vector * out) { + if (!path || !out) return -1; + if (path_ends_with_ci(path, ".mp3")) + return mp3_load_48k_stereo(path, out); + if (path_ends_with_ci(path, ".wav")) + return wav_load_48k_stereo(path, out); + return -1; +} diff --git a/dit-vae.cpp b/dit-vae.cpp index 608f12c..41582c9 100644 --- a/dit-vae.cpp +++ b/dit-vae.cpp @@ -22,6 +22,7 @@ #include "bpe.h" #include "debug.h" #include "request.h" +#include 
"audio.h" struct Timer { std::chrono::steady_clock::time_point t; @@ -249,6 +250,8 @@ int main(int argc, char ** argv) { int num_steps = req.inference_steps > 0 ? req.inference_steps : 8; float guidance_scale = req.guidance_scale > 0 ? req.guidance_scale : 7.0f; float shift = req.shift > 0 ? req.shift : 1.0f; + float cover_strength = req.audio_cover_strength >= 0 && req.audio_cover_strength <= 1 + ? req.audio_cover_strength : 1.0f; if (is_turbo && guidance_scale > 1.0f) { fprintf(stderr, "[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was %.1f)\n", @@ -386,16 +389,51 @@ int main(int argc, char ** argv) { } fprintf(stderr, "[Load] ConditionEncoder: %.1f ms\n", timer.ms()); - // Silence feats for timbre input: first 750 frames (30s @ 25Hz) + // Timbre input: reference_audio (WAV or MP3 via VAE encoder) or silence (first 750 frames = 30s @ 25Hz) const int S_ref = 750; - std::vector silence_feats(S_ref * 64); - memcpy(silence_feats.data(), silence_full.data(), S_ref * 64 * sizeof(float)); + std::vector timbre_feats(S_ref * 64); + const float * timbre_ptr = silence_full.data(); + int S_ref_actual = S_ref; + if (!req.reference_audio.empty()) { + const std::string & ref_path = req.reference_audio; + if (ref_path.size() >= 4 && ref_path.compare(ref_path.size() - 4, 4, ".wav") == 0) { + std::vector wav_stereo; + int n_samples = load_audio_48k_stereo(ref_path.c_str(), &wav_stereo); + if (n_samples > 0 && have_vae) { + VAEEncoderGGML enc = {}; + if (vae_encoder_load(&enc, vae_gguf)) { + int T_audio = n_samples; + if (T_audio >= 1920) { + int T_lat = T_audio / 1920; + std::vector enc_out((size_t)T_lat * 64); + T_lat = vae_encoder_forward(&enc, wav_stereo.data(), T_audio, enc_out.data()); + if (T_lat > 0) { + size_t copy_frames = (size_t)(T_lat < S_ref ? 
T_lat : S_ref); + memcpy(timbre_feats.data(), enc_out.data(), copy_frames * 64 * sizeof(float)); + if (T_lat < S_ref) + memcpy(timbre_feats.data() + copy_frames * 64, silence_full.data(), + (S_ref - (int)copy_frames) * 64 * sizeof(float)); + S_ref_actual = (int)copy_frames; + if (T_lat > S_ref) S_ref_actual = S_ref; + timbre_ptr = timbre_feats.data(); + fprintf(stderr, "[Timbre] encoded %s -> %d frames (25Hz)\n", ref_path.c_str(), S_ref_actual); + } + } + vae_encoder_free(&enc); + } + } else if (n_samples <= 0) { + fprintf(stderr, "[Timbre] WARNING: cannot load WAV %s, using silence\n", ref_path.c_str()); + } else if (!have_vae) { + fprintf(stderr, "[Timbre] WAV requires --vae (with encoder weights); using silence\n"); + } + } + } timer.reset(); std::vector enc_hidden; cond_ggml_forward(&cond, text_hidden.data(), S_text, lyric_embed.data(), S_lyric, - silence_feats.data(), S_ref, + timbre_ptr, S_ref_actual, enc_hidden, &enc_S); fprintf(stderr, "[Encode] ConditionEncoder: %.1f ms, enc_S=%d\n", timer.ms(), enc_S); @@ -438,15 +476,20 @@ int main(int argc, char ** argv) { } // Build single context: [T, ctx_ch] = src_latents[64] + mask_ones[64] - // src_latents = decoded_codes[0:decoded_T] + silence_latent[0:T-decoded_T] - // Padding reads silence from frame 0 (not from decoded_T), matching reference implementation + // src_latents = blend(decoded_codes, silence) for t context_single(T * ctx_ch); for (int t = 0; t < T; t++) { - const float * src = (t < decoded_T) - ? 
decoded_latents.data() + t * Oc - : silence_full.data() + (t - decoded_T) * Oc; - for (int c = 0; c < Oc; c++) - context_single[t * ctx_ch + c] = src[c]; + for (int c = 0; c < Oc; c++) { + float v; + if (t < decoded_T) { + float dec = decoded_latents[t * Oc + c]; + float sil = silence_full[c]; // frame 0 of silence + v = (1.0f - cover_strength) * sil + cover_strength * dec; + } else { + v = silence_full[(t - decoded_T) * Oc + c]; + } + context_single[t * ctx_ch + c] = v; + } for (int c = 0; c < Oc; c++) context_single[t * ctx_ch + Oc + c] = 1.0f; } diff --git a/docs/MODES.md b/docs/MODES.md new file mode 100644 index 0000000..4149ae7 --- /dev/null +++ b/docs/MODES.md @@ -0,0 +1,79 @@ +# ACE-Step 1.5 built-in modes (acestep.cpp) + +This document maps the [ACE-Step 1.5 Tutorial](https://github.com/ace-step/ACE-Step-1.5/blob/main/docs/en/Tutorial.md) built-in modes to the current C++ implementation. + +## Task types (Tutorial: Input Control) + +| `task_type` | Description | Turbo/SFT | Base only | C++ status | +|---------------|-------------|-----------|-----------|------------| +| **text2music** | Generate from caption/lyrics (and optional reference) | ✅ | — | ✅ **Supported** | +| **cover** | Re-synthesize with structure from source; optional timbre from reference | ✅ | — | ⚠️ **Partial** (see below) | +| **repaint** | Local edit in time range using source as context | ✅ | — | ❌ Not implemented | +| **lego** | Add new tracks to existing audio | — | ✅ | ❌ Base model only | +| **extract** | Extract single track from mix | — | ✅ | ❌ Base model only | +| **complete** | Add accompaniment to single track | — | ✅ | ❌ Base model only | + +We only ship Turbo and SFT DiT weights; **lego**, **extract**, **complete** require the Base DiT and are out of scope for now. + +--- + +## What we support today + +### text2music (default) +- **Input**: `caption`, optional `lyrics`, metadata (bpm, duration, keyscale, …). 
+- **Flow**: LM (optional) → CoT + audio codes → DiT (context = silence) → VAE → WAV. +- **Timbre**: Uses the built-in silence latent from the DiT GGUF by default; set `reference_audio` (WAV or MP3) to condition timbre on a VAE-encoded reference instead (see below). + +### cover (when `audio_codes` are provided) +- **Input**: Same as text2music, plus **precomputed** `audio_codes` (e.g. from a previous run or from Python). +- **Flow**: Skip LM; decode `audio_codes` to latents → DiT context = decoded + silence padding → DiT → VAE → WAV. +- **Limitation**: We do **not** convert a WAV file into `audio_codes`. So “cover from a file” is only possible if you already have codes (e.g. from Python or from a prior `ace-qwen3` run). The request field `src_audio` is accepted in JSON but **not yet used** in the pipeline; `reference_audio` **is** used (global timbre, see below). + +--- + +## Feature status (field by field) + +### reference_audio (global timbre/style) +- **Tutorial**: Load WAV → stereo 48 kHz, pad/repeat to ≥30 s → **VAE encode** → latents → feed as timbre condition into DiT. +- **C++**: Implemented. Set `reference_audio` to a **WAV or MP3 file path**. dit-vae loads the file (WAV: any sample rate resampled to 48 kHz; MP3: decoded in memory via header-only minimp3, no temp files, then resampled to 48 kHz if needed), runs the **VAE encoder** (Oobleck, in C++ in `vae.h`), and feeds the 64-d latents to the CondEncoder timbre path. No Python, no external deps. Requires a **full VAE GGUF** that includes `encoder.*` tensors (decoder-only GGUFs will print a clear error). NOTE(review): dit-vae currently gates this path on a `.wav` suffix before calling the unified loader, so `.mp3` references never reach it — confirm and widen the gate to `.mp3` as well. +- **audio_cover_strength** (0.0–1.0): Implemented. When `audio_codes` are present, context latents are blended with silence: `(1 - strength)*silence + strength*decoded`. + +### src_audio (Cover from file) +- **Tutorial**: Source audio is converted to **semantic codes** (melody, rhythm, chords, etc.); then DiT uses those as in cover mode. +- **C++**: That implies **audio → codes**. Likely path: WAV → VAE encode → **FSQ tokenizer** (latents → 5 Hz codes). 
We have the **FSQ detokenizer** (codes → latents); the tokenizer (encode) side would need to be added. Then: `src_audio` path → load WAV → VAE encode → FSQ encode → `audio_codes` → existing cover path. + +### audio_cover_strength +- **Tutorial**: 0.0–1.0, how strongly generation follows reference/codes. +- **C++**: Implemented. When `audio_codes` are present, the DiT context latents are blended with silence: `(1 - strength)*silence + strength*decoded`. + +### repaint +- **Tutorial**: Specify `repainting_start` / `repainting_end` (seconds); model uses source audio as context and only generates in that interval (3–90 s). +- **C++**: Would require **masked diffusion**: context carries “given” frames; ODE only updates the repaint region. DiT’s context has a 64-channel “mask” that we currently set to 1.0; repaint would set mask per frame and the generation loop would only update unmasked frames. Not implemented. + +--- + +## Request fields (aligned with Tutorial) + +All of these are in `AceRequest` and parsed from / written to JSON. Backend behavior is as above. + +| Field | Type | Purpose | +|-------|------|--------| +| `task_type` | string | `"text2music"` \| `"cover"` \| `"repaint"` \| … | +| `reference_audio` | string | Path to WAV or MP3 for timbre (implemented) | +| `src_audio` | string | Path to WAV for cover/repaint source (not used yet) | +| `audio_codes` | string | Comma-separated FSQ codes; non-empty ⇒ cover path | +| `audio_cover_strength` | float | 0.0–1.0 blend of decoded codes vs silence (implemented) | +| `repainting_start` | float | Start time (s) for repaint (not used yet) | +| `repainting_end` | float | End time (s) for repaint (not used yet) | + +See `request.h` and the README “Request JSON reference” for the full list. + +--- + +## Summary + +- **Fully supported**: text2music; cover when you supply **precomputed** `audio_codes`; `reference_audio` timbre conditioning; `audio_cover_strength` blending. +- **Schema only** (no backend): `task_type`, `src_audio`, `repainting_start`/`repainting_end`. 
+- **reference_audio**: implemented — the VAE encoder output feeds the existing CondEncoder timbre path (requires a VAE GGUF with encoder weights). +- **To support cover from file**: add an FSQ tokenizer (audio→codes) on top of the existing VAE encoder, then reuse the existing cover path. +- **To support repaint**: implement masked DiT generation (context mask + ODE only on repaint interval). diff --git a/examples/request-reference.json b/examples/request-reference.json new file mode 100644 index 0000000..55479ee --- /dev/null +++ b/examples/request-reference.json @@ -0,0 +1,13 @@ +{ + "task_type": "text2music", + "caption": "Calm acoustic guitar, soft male vocal, intimate", + "lyrics": "[Verse]\nQuiet strings and a gentle tune\n[Chorus]\nHold the moment in this room", + "duration": 30, + "seed": 42, + "inference_steps": 8, + "guidance_scale": 1.0, + "shift": 3.0, + "reference_audio": "reference.wav", + "audio_codes": "", + "audio_cover_strength": 1.0 +} diff --git a/examples/test-reference.sh b/examples/test-reference.sh new file mode 100755 index 0000000..073a465 --- /dev/null +++ b/examples/test-reference.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Test reference_audio (WAV) and audio_cover_strength. +# Put a WAV file at reference.wav (or set reference_audio path in request-reference.json). +# Requires: built dit-vae, --vae with encoder weights, and models in ../models/. + +set -eu +cd "$(dirname "$0")" + +if [ ! -f "reference.wav" ]; then + echo "No reference.wav found. Copy a WAV file to reference.wav (stereo 48kHz or any rate; will be resampled)." + echo "Then run: $0" + exit 1 +fi + +../build/dit-vae \ + --request request-reference.json \ + --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit ../models/acestep-v15-turbo-Q8_0.gguf \ + --vae ../models/vae-BF16.gguf + +echo "Done. Check request-reference0.wav (and request-reference1.wav if --batch 2)." 
diff --git a/request.cpp b/request.cpp index 9b20423..a24d838 100644 --- a/request.cpp +++ b/request.cpp @@ -10,8 +10,9 @@ #include #include -// Defaults (aligned with Python GenerationParams) +// Defaults (aligned with Python GenerationParams and ACE-Step 1.5 Tutorial) void request_init(AceRequest * r) { + r->task_type = "text2music"; r->caption = ""; r->lyrics = ""; r->instrumental = false; @@ -26,7 +27,12 @@ void request_init(AceRequest * r) { r->lm_top_p = 0.9f; r->lm_top_k = 0; r->lm_negative_prompt = ""; + r->reference_audio = ""; + r->src_audio = ""; r->audio_codes = ""; + r->audio_cover_strength = 1.0f; + r->repainting_start = 0.0f; + r->repainting_end = 0.0f; r->inference_steps = 8; r->guidance_scale = 1.0f; r->shift = 3.0f; @@ -218,11 +224,14 @@ bool request_parse(AceRequest * r, const char * path) { const std::string & v = kv.value; // strings - if (k == "caption") r->caption = v; + if (k == "task_type") r->task_type = v; + else if (k == "caption") r->caption = v; else if (k == "lyrics") r->lyrics = v; else if (k == "keyscale") r->keyscale = v; else if (k == "timesignature") r->timesignature = v; else if (k == "vocal_language") r->vocal_language = v; + else if (k == "reference_audio") r->reference_audio = v; + else if (k == "src_audio") r->src_audio = v; else if (k == "audio_codes") r->audio_codes = v; else if (k == "lm_negative_prompt") r->lm_negative_prompt = v; @@ -236,6 +245,9 @@ bool request_parse(AceRequest * r, const char * path) { else if (k == "lm_cfg_scale") r->lm_cfg_scale = (float)atof(v.c_str()); else if (k == "lm_top_p") r->lm_top_p = (float)atof(v.c_str()); else if (k == "lm_top_k") r->lm_top_k = atoi(v.c_str()); + else if (k == "audio_cover_strength") r->audio_cover_strength = (float)atof(v.c_str()); + else if (k == "repainting_start") r->repainting_start = (float)atof(v.c_str()); + else if (k == "repainting_end") r->repainting_end = (float)atof(v.c_str()); else if (k == "inference_steps") r->inference_steps = atoi(v.c_str()); else if (k 
== "guidance_scale") r->guidance_scale = (float)atof(v.c_str()); else if (k == "shift") r->shift = (float)atof(v.c_str()); @@ -257,6 +269,7 @@ bool request_write(const AceRequest * r, const char * path) { } fprintf(f, "{\n"); + fprintf(f, " \"task_type\": \"%s\",\n", json_escape(r->task_type).c_str()); fprintf(f, " \"caption\": \"%s\",\n", json_escape(r->caption).c_str()); fprintf(f, " \"lyrics\": \"%s\",\n", json_escape(r->lyrics).c_str()); if (r->instrumental) @@ -272,10 +285,18 @@ bool request_write(const AceRequest * r, const char * path) { fprintf(f, " \"lm_top_p\": %.2f,\n", r->lm_top_p); fprintf(f, " \"lm_top_k\": %d,\n", r->lm_top_k); fprintf(f, " \"lm_negative_prompt\": \"%s\",\n", json_escape(r->lm_negative_prompt).c_str()); + if (!r->reference_audio.empty()) + fprintf(f, " \"reference_audio\": \"%s\",\n", json_escape(r->reference_audio).c_str()); + if (!r->src_audio.empty()) + fprintf(f, " \"src_audio\": \"%s\",\n", json_escape(r->src_audio).c_str()); + fprintf(f, " \"audio_cover_strength\": %.2f,\n", r->audio_cover_strength); + if (r->repainting_start != 0.0f || r->repainting_end != 0.0f) { + fprintf(f, " \"repainting_start\": %.1f,\n", r->repainting_start); + fprintf(f, " \"repainting_end\": %.1f,\n", r->repainting_end); + } fprintf(f, " \"inference_steps\": %d,\n", r->inference_steps); fprintf(f, " \"guidance_scale\": %.1f,\n", r->guidance_scale); fprintf(f, " \"shift\": %.1f,\n", r->shift); - // audio_codes last (no trailing comma) fprintf(f, " \"audio_codes\": \"%s\"\n", json_escape(r->audio_codes).c_str()); fprintf(f, "}\n"); @@ -285,7 +306,7 @@ bool request_write(const AceRequest * r, const char * path) { } void request_dump(const AceRequest * r, FILE * f) { - fprintf(f, "[Request] seed=%lld\n", (long long)r->seed); + fprintf(f, "[Request] task=%s seed=%lld\n", r->task_type.c_str(), (long long)r->seed); fprintf(f, " caption: %.60s%s\n", r->caption.c_str(), r->caption.size() > 60 ? "..." 
: ""); fprintf(f, " lyrics: %zu bytes\n", r->lyrics.size()); @@ -296,6 +317,12 @@ void request_dump(const AceRequest * r, FILE * f) { r->lm_temperature, r->lm_cfg_scale, r->lm_top_p, r->lm_top_k); fprintf(f, " dit: steps=%d guidance=%.1f shift=%.1f\n", r->inference_steps, r->guidance_scale, r->shift); - fprintf(f, " audio_codes: %s\n", - r->audio_codes.empty() ? "(none)" : "(present)"); + if (!r->reference_audio.empty()) + fprintf(f, " reference_audio: %s\n", r->reference_audio.c_str()); + if (!r->src_audio.empty()) + fprintf(f, " src_audio: %s\n", r->src_audio.c_str()); + fprintf(f, " audio_codes: %s cover_strength=%.2f\n", + r->audio_codes.empty() ? "(none)" : "(present)", r->audio_cover_strength); + if (r->repainting_start != 0.0f || r->repainting_end != 0.0f) + fprintf(f, " repaint: %.1f–%.1fs\n", r->repainting_start, r->repainting_end); } diff --git a/request.h b/request.h index 1295b83..e9222a0 100644 --- a/request.h +++ b/request.h @@ -3,13 +3,19 @@ // request.h - AceStep generation request (JSON serialization) // // Pure data container + JSON read/write. Zero business logic. -// Aligned with Python GenerationParams (inference.py:39) and API /release_task. 
+// Aligned with Python GenerationParams and ACE-Step 1.5 Tutorial: +// https://github.com/ace-step/ACE-Step-1.5/blob/main/docs/en/Tutorial.md // #include #include struct AceRequest { + // --- Task & audio inputs (Tutorial: Input Control) --- + // task_type: "text2music" | "cover" | "repaint" | "lego" | "extract" | "complete" + // (lego/extract/complete require Base DiT; we only have Turbo/SFT) + std::string task_type; // "text2music" + // text content std::string caption; // "" std::string lyrics; // "" @@ -32,9 +38,20 @@ struct AceRequest { int lm_top_k; // 0 = disabled (matches Python None) std::string lm_negative_prompt; // "" + // Audio control (Tutorial: reference_audio, src_audio, audio_codes) + // reference_audio: path to WAV for global timbre/style (VAE encode → CondEncoder timbre) + std::string reference_audio; // "" + // src_audio: path to WAV for Cover (encode → codes) or Repaint context + std::string src_audio; // "" // codes (Python-compatible string: "3101,11837,27514,...") // empty = text2music (silence context), non-empty = cover mode std::string audio_codes; // "" + // 0.0–1.0: how strongly generation follows reference/codes (Tutorial: audio_cover_strength) + float audio_cover_strength; // 1.0 + + // Repaint interval (seconds). Only used when task_type == "repaint". 
+ float repainting_start; // 0.0 + float repainting_end; // 0.0 // DiT control (Python: inference_steps, guidance_scale, shift) int inference_steps; // 8 diff --git a/tests/fixtures/ci-cover.json b/tests/fixtures/ci-cover.json new file mode 100644 index 0000000..3d21e1f --- /dev/null +++ b/tests/fixtures/ci-cover.json @@ -0,0 +1,12 @@ +{ + "task_type": "cover", + "caption": "Short CI cover test", + "duration": 5, + "inference_steps": 4, + "guidance_scale": 1, + "shift": 3, + "seed": 42, + "reference_audio": "tests/fixtures/ci-text2music0.wav", + "audio_cover_strength": 0.8, + "audio_codes": "43316,18426,13366,59455,17783,49303,7423,29855,37158,37157,62317,61455,12847,19583,57031,34656,20254,10770,11416,15905,31413,23339,47091,12198,49531,37355,33090,38645,40707,16324,61436,46095,13941,5287,2239,13975,63815,2757,4862,13571,63495,39,29887,49426,12696,50847,40498,61056,25666,12989,23987,54763,25485,31683,28554,25355,16373,28995,2351,1655,7940,55831,34359,15350,15277,11717,20476,52239,5015,19807,24087,3559,20471,34193,32552,60999,29360,25338,38873,16768,17912,27584,24008,1528,449,25563,52684,53223,42183,37215,12343,39431,26055,28148,57286,38382,28863,7191,58397,18991,7695,30716,36784,12687,8707,25649,33718,3202,23035,10747,26354,63965,16260,11223,45679,14343,8679,49351,52927,2535,19207,46447,49615,12694,21110" +} diff --git a/tests/fixtures/ci-text2music.json b/tests/fixtures/ci-text2music.json new file mode 100644 index 0000000..1613723 --- /dev/null +++ b/tests/fixtures/ci-text2music.json @@ -0,0 +1,11 @@ +{ + "task_type": "text2music", + "caption": "Short CI test clip", + "lyrics": "", + "duration": 5, + "inference_steps": 4, + "guidance_scale": 1, + "shift": 3, + "seed": 42, + "audio_codes": "" +} diff --git a/third_party/minimp3.h b/third_party/minimp3.h new file mode 100644 index 0000000..3220ae1 --- /dev/null +++ b/third_party/minimp3.h @@ -0,0 +1,1865 @@ +#ifndef MINIMP3_H +#define MINIMP3_H +/* + https://github.com/lieff/minimp3 + To the extent possible under 
law, the author(s) have dedicated all copyright and related and neighboring rights to this software to the public domain worldwide. + This software is distributed without any warranty. + See . +*/ +#include + +#define MINIMP3_MAX_SAMPLES_PER_FRAME (1152*2) + +typedef struct +{ + int frame_bytes, frame_offset, channels, hz, layer, bitrate_kbps; +} mp3dec_frame_info_t; + +typedef struct +{ + float mdct_overlap[2][9*32], qmf_state[15*2*32]; + int reserv, free_format_bytes; + unsigned char header[4], reserv_buf[511]; +} mp3dec_t; + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +void mp3dec_init(mp3dec_t *dec); +#ifndef MINIMP3_FLOAT_OUTPUT +typedef int16_t mp3d_sample_t; +#else /* MINIMP3_FLOAT_OUTPUT */ +typedef float mp3d_sample_t; +void mp3dec_f32_to_s16(const float *in, int16_t *out, int num_samples); +#endif /* MINIMP3_FLOAT_OUTPUT */ +int mp3dec_decode_frame(mp3dec_t *dec, const uint8_t *mp3, int mp3_bytes, mp3d_sample_t *pcm, mp3dec_frame_info_t *info); + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* MINIMP3_H */ +#if defined(MINIMP3_IMPLEMENTATION) && !defined(_MINIMP3_IMPLEMENTATION_GUARD) +#define _MINIMP3_IMPLEMENTATION_GUARD + +#include +#include + +#define MAX_FREE_FORMAT_FRAME_SIZE 2304 /* more than ISO spec's */ +#ifndef MAX_FRAME_SYNC_MATCHES +#define MAX_FRAME_SYNC_MATCHES 10 +#endif /* MAX_FRAME_SYNC_MATCHES */ + +#define MAX_L3_FRAME_PAYLOAD_BYTES MAX_FREE_FORMAT_FRAME_SIZE /* MUST be >= 320000/8/32000*1152 = 1440 */ + +#define MAX_BITRESERVOIR_BYTES 511 +#define SHORT_BLOCK_TYPE 2 +#define STOP_BLOCK_TYPE 3 +#define MODE_MONO 3 +#define MODE_JOINT_STEREO 1 +#define HDR_SIZE 4 +#define HDR_IS_MONO(h) (((h[3]) & 0xC0) == 0xC0) +#define HDR_IS_MS_STEREO(h) (((h[3]) & 0xE0) == 0x60) +#define HDR_IS_FREE_FORMAT(h) (((h[2]) & 0xF0) == 0) +#define HDR_IS_CRC(h) (!((h[1]) & 1)) +#define HDR_TEST_PADDING(h) ((h[2]) & 0x2) +#define HDR_TEST_MPEG1(h) ((h[1]) & 0x8) +#define HDR_TEST_NOT_MPEG25(h) ((h[1]) & 0x10) +#define 
HDR_TEST_I_STEREO(h) ((h[3]) & 0x10) +#define HDR_TEST_MS_STEREO(h) ((h[3]) & 0x20) +#define HDR_GET_STEREO_MODE(h) (((h[3]) >> 6) & 3) +#define HDR_GET_STEREO_MODE_EXT(h) (((h[3]) >> 4) & 3) +#define HDR_GET_LAYER(h) (((h[1]) >> 1) & 3) +#define HDR_GET_BITRATE(h) ((h[2]) >> 4) +#define HDR_GET_SAMPLE_RATE(h) (((h[2]) >> 2) & 3) +#define HDR_GET_MY_SAMPLE_RATE(h) (HDR_GET_SAMPLE_RATE(h) + (((h[1] >> 3) & 1) + ((h[1] >> 4) & 1))*3) +#define HDR_IS_FRAME_576(h) ((h[1] & 14) == 2) +#define HDR_IS_LAYER_1(h) ((h[1] & 6) == 6) + +#define BITS_DEQUANTIZER_OUT -1 +#define MAX_SCF (255 + BITS_DEQUANTIZER_OUT*4 - 210) +#define MAX_SCFI ((MAX_SCF + 3) & ~3) + +#define MINIMP3_MIN(a, b) ((a) > (b) ? (b) : (a)) +#define MINIMP3_MAX(a, b) ((a) < (b) ? (b) : (a)) + +#if !defined(MINIMP3_NO_SIMD) + +#if !defined(MINIMP3_ONLY_SIMD) && (defined(_M_X64) || defined(__x86_64__) || defined(__aarch64__) || defined(_M_ARM64)) +/* x64 always have SSE2, arm64 always have neon, no need for generic code */ +#define MINIMP3_ONLY_SIMD +#endif /* SIMD checks... 
*/ + +#if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) || ((defined(__i386__) || defined(__x86_64__)) && defined(__SSE2__)) +#if defined(_MSC_VER) +#include +#endif /* defined(_MSC_VER) */ +#include +#define HAVE_SSE 1 +#define HAVE_SIMD 1 +#define VSTORE _mm_storeu_ps +#define VLD _mm_loadu_ps +#define VSET _mm_set1_ps +#define VADD _mm_add_ps +#define VSUB _mm_sub_ps +#define VMUL _mm_mul_ps +#define VMAC(a, x, y) _mm_add_ps(a, _mm_mul_ps(x, y)) +#define VMSB(a, x, y) _mm_sub_ps(a, _mm_mul_ps(x, y)) +#define VMUL_S(x, s) _mm_mul_ps(x, _mm_set1_ps(s)) +#define VREV(x) _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 1, 2, 3)) +typedef __m128 f4; +#if defined(_MSC_VER) || defined(MINIMP3_ONLY_SIMD) +#define minimp3_cpuid __cpuid +#else /* defined(_MSC_VER) || defined(MINIMP3_ONLY_SIMD) */ +static __inline__ __attribute__((always_inline)) void minimp3_cpuid(int CPUInfo[], const int InfoType) +{ +#if defined(__PIC__) + __asm__ __volatile__( +#if defined(__x86_64__) + "push %%rbx\n" + "cpuid\n" + "xchgl %%ebx, %1\n" + "pop %%rbx\n" +#else /* defined(__x86_64__) */ + "xchgl %%ebx, %1\n" + "cpuid\n" + "xchgl %%ebx, %1\n" +#endif /* defined(__x86_64__) */ + : "=a" (CPUInfo[0]), "=r" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3]) + : "a" (InfoType)); +#else /* defined(__PIC__) */ + __asm__ __volatile__( + "cpuid" + : "=a" (CPUInfo[0]), "=b" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3]) + : "a" (InfoType)); +#endif /* defined(__PIC__)*/ +} +#endif /* defined(_MSC_VER) || defined(MINIMP3_ONLY_SIMD) */ +static int have_simd(void) +{ +#ifdef MINIMP3_ONLY_SIMD + return 1; +#else /* MINIMP3_ONLY_SIMD */ + static int g_have_simd; + int CPUInfo[4]; +#ifdef MINIMP3_TEST + static int g_counter; + if (g_counter++ > 100) + return 0; +#endif /* MINIMP3_TEST */ + if (g_have_simd) + goto end; + minimp3_cpuid(CPUInfo, 0); + g_have_simd = 1; + if (CPUInfo[0] > 0) + { + minimp3_cpuid(CPUInfo, 1); + g_have_simd = (CPUInfo[3] & (1 << 26)) + 1; /* SSE2 */ + } +end: + return 
g_have_simd - 1; +#endif /* MINIMP3_ONLY_SIMD */ +} +#elif defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64) +#include +#define HAVE_SSE 0 +#define HAVE_SIMD 1 +#define VSTORE vst1q_f32 +#define VLD vld1q_f32 +#define VSET vmovq_n_f32 +#define VADD vaddq_f32 +#define VSUB vsubq_f32 +#define VMUL vmulq_f32 +#define VMAC(a, x, y) vmlaq_f32(a, x, y) +#define VMSB(a, x, y) vmlsq_f32(a, x, y) +#define VMUL_S(x, s) vmulq_f32(x, vmovq_n_f32(s)) +#define VREV(x) vcombine_f32(vget_high_f32(vrev64q_f32(x)), vget_low_f32(vrev64q_f32(x))) +typedef float32x4_t f4; +static int have_simd() +{ /* TODO: detect neon for !MINIMP3_ONLY_SIMD */ + return 1; +} +#else /* SIMD checks... */ +#define HAVE_SSE 0 +#define HAVE_SIMD 0 +#ifdef MINIMP3_ONLY_SIMD +#error MINIMP3_ONLY_SIMD used, but SSE/NEON not enabled +#endif /* MINIMP3_ONLY_SIMD */ +#endif /* SIMD checks... */ +#else /* !defined(MINIMP3_NO_SIMD) */ +#define HAVE_SIMD 0 +#endif /* !defined(MINIMP3_NO_SIMD) */ + +#if defined(__ARM_ARCH) && (__ARM_ARCH >= 6) && !defined(__aarch64__) && !defined(_M_ARM64) +#define HAVE_ARMV6 1 +static __inline__ __attribute__((always_inline)) int32_t minimp3_clip_int16_arm(int32_t a) +{ + int32_t x = 0; + __asm__ ("ssat %0, #16, %1" : "=r"(x) : "r"(a)); + return x; +} +#else +#define HAVE_ARMV6 0 +#endif + +typedef struct +{ + const uint8_t *buf; + int pos, limit; +} bs_t; + +typedef struct +{ + float scf[3*64]; + uint8_t total_bands, stereo_bands, bitalloc[64], scfcod[64]; +} L12_scale_info; + +typedef struct +{ + uint8_t tab_offset, code_tab_width, band_count; +} L12_subband_alloc_t; + +typedef struct +{ + const uint8_t *sfbtab; + uint16_t part_23_length, big_values, scalefac_compress; + uint8_t global_gain, block_type, mixed_block_flag, n_long_sfb, n_short_sfb; + uint8_t table_select[3], region_count[3], subblock_gain[3]; + uint8_t preflag, scalefac_scale, count1_table, scfsi; +} L3_gr_info_t; + +typedef struct +{ + bs_t bs; + uint8_t maindata[MAX_BITRESERVOIR_BYTES + 
MAX_L3_FRAME_PAYLOAD_BYTES]; + L3_gr_info_t gr_info[4]; + float grbuf[2][576], scf[40], syn[18 + 15][2*32]; + uint8_t ist_pos[2][39]; +} mp3dec_scratch_t; + +static void bs_init(bs_t *bs, const uint8_t *data, int bytes) +{ + bs->buf = data; + bs->pos = 0; + bs->limit = bytes*8; +} + +static uint32_t get_bits(bs_t *bs, int n) +{ + uint32_t next, cache = 0, s = bs->pos & 7; + int shl = n + s; + const uint8_t *p = bs->buf + (bs->pos >> 3); + if ((bs->pos += n) > bs->limit) + return 0; + next = *p++ & (255 >> s); + while ((shl -= 8) > 0) + { + cache |= next << shl; + next = *p++; + } + return cache | (next >> -shl); +} + +static int hdr_valid(const uint8_t *h) +{ + return h[0] == 0xff && + ((h[1] & 0xF0) == 0xf0 || (h[1] & 0xFE) == 0xe2) && + (HDR_GET_LAYER(h) != 0) && + (HDR_GET_BITRATE(h) != 15) && + (HDR_GET_SAMPLE_RATE(h) != 3); +} + +static int hdr_compare(const uint8_t *h1, const uint8_t *h2) +{ + return hdr_valid(h2) && + ((h1[1] ^ h2[1]) & 0xFE) == 0 && + ((h1[2] ^ h2[2]) & 0x0C) == 0 && + !(HDR_IS_FREE_FORMAT(h1) ^ HDR_IS_FREE_FORMAT(h2)); +} + +static unsigned hdr_bitrate_kbps(const uint8_t *h) +{ + static const uint8_t halfrate[2][3][15] = { + { { 0,4,8,12,16,20,24,28,32,40,48,56,64,72,80 }, { 0,4,8,12,16,20,24,28,32,40,48,56,64,72,80 }, { 0,16,24,28,32,40,48,56,64,72,80,88,96,112,128 } }, + { { 0,16,20,24,28,32,40,48,56,64,80,96,112,128,160 }, { 0,16,24,28,32,40,48,56,64,80,96,112,128,160,192 }, { 0,16,32,48,64,80,96,112,128,144,160,176,192,208,224 } }, + }; + return 2*halfrate[!!HDR_TEST_MPEG1(h)][HDR_GET_LAYER(h) - 1][HDR_GET_BITRATE(h)]; +} + +static unsigned hdr_sample_rate_hz(const uint8_t *h) +{ + static const unsigned g_hz[3] = { 44100, 48000, 32000 }; + return g_hz[HDR_GET_SAMPLE_RATE(h)] >> (int)!HDR_TEST_MPEG1(h) >> (int)!HDR_TEST_NOT_MPEG25(h); +} + +static unsigned hdr_frame_samples(const uint8_t *h) +{ + return HDR_IS_LAYER_1(h) ? 
384 : (1152 >> (int)HDR_IS_FRAME_576(h)); +} + +static int hdr_frame_bytes(const uint8_t *h, int free_format_size) +{ + int frame_bytes = hdr_frame_samples(h)*hdr_bitrate_kbps(h)*125/hdr_sample_rate_hz(h); + if (HDR_IS_LAYER_1(h)) + { + frame_bytes &= ~3; /* slot align */ + } + return frame_bytes ? frame_bytes : free_format_size; +} + +static int hdr_padding(const uint8_t *h) +{ + return HDR_TEST_PADDING(h) ? (HDR_IS_LAYER_1(h) ? 4 : 1) : 0; +} + +#ifndef MINIMP3_ONLY_MP3 +static const L12_subband_alloc_t *L12_subband_alloc_table(const uint8_t *hdr, L12_scale_info *sci) +{ + const L12_subband_alloc_t *alloc; + int mode = HDR_GET_STEREO_MODE(hdr); + int nbands, stereo_bands = (mode == MODE_MONO) ? 0 : (mode == MODE_JOINT_STEREO) ? (HDR_GET_STEREO_MODE_EXT(hdr) << 2) + 4 : 32; + + if (HDR_IS_LAYER_1(hdr)) + { + static const L12_subband_alloc_t g_alloc_L1[] = { { 76, 4, 32 } }; + alloc = g_alloc_L1; + nbands = 32; + } else if (!HDR_TEST_MPEG1(hdr)) + { + static const L12_subband_alloc_t g_alloc_L2M2[] = { { 60, 4, 4 }, { 44, 3, 7 }, { 44, 2, 19 } }; + alloc = g_alloc_L2M2; + nbands = 30; + } else + { + static const L12_subband_alloc_t g_alloc_L2M1[] = { { 0, 4, 3 }, { 16, 4, 8 }, { 32, 3, 12 }, { 40, 2, 7 } }; + int sample_rate_idx = HDR_GET_SAMPLE_RATE(hdr); + unsigned kbps = hdr_bitrate_kbps(hdr) >> (int)(mode != MODE_MONO); + if (!kbps) /* free-format */ + { + kbps = 192; + } + + alloc = g_alloc_L2M1; + nbands = 27; + if (kbps < 56) + { + static const L12_subband_alloc_t g_alloc_L2M1_lowrate[] = { { 44, 4, 2 }, { 44, 3, 10 } }; + alloc = g_alloc_L2M1_lowrate; + nbands = sample_rate_idx == 2 ? 
12 : 8; + } else if (kbps >= 96 && sample_rate_idx != 1) + { + nbands = 30; + } + } + + sci->total_bands = (uint8_t)nbands; + sci->stereo_bands = (uint8_t)MINIMP3_MIN(stereo_bands, nbands); + + return alloc; +} + +static void L12_read_scalefactors(bs_t *bs, uint8_t *pba, uint8_t *scfcod, int bands, float *scf) +{ + static const float g_deq_L12[18*3] = { +#define DQ(x) 9.53674316e-07f/x, 7.56931807e-07f/x, 6.00777173e-07f/x + DQ(3),DQ(7),DQ(15),DQ(31),DQ(63),DQ(127),DQ(255),DQ(511),DQ(1023),DQ(2047),DQ(4095),DQ(8191),DQ(16383),DQ(32767),DQ(65535),DQ(3),DQ(5),DQ(9) + }; + int i, m; + for (i = 0; i < bands; i++) + { + float s = 0; + int ba = *pba++; + int mask = ba ? 4 + ((19 >> scfcod[i]) & 3) : 0; + for (m = 4; m; m >>= 1) + { + if (mask & m) + { + int b = get_bits(bs, 6); + s = g_deq_L12[ba*3 - 6 + b % 3]*(1 << 21 >> b/3); + } + *scf++ = s; + } + } +} + +static void L12_read_scale_info(const uint8_t *hdr, bs_t *bs, L12_scale_info *sci) +{ + static const uint8_t g_bitalloc_code_tab[] = { + 0,17, 3, 4, 5,6,7, 8,9,10,11,12,13,14,15,16, + 0,17,18, 3,19,4,5, 6,7, 8, 9,10,11,12,13,16, + 0,17,18, 3,19,4,5,16, + 0,17,18,16, + 0,17,18,19, 4,5,6, 7,8, 9,10,11,12,13,14,15, + 0,17,18, 3,19,4,5, 6,7, 8, 9,10,11,12,13,14, + 0, 2, 3, 4, 5,6,7, 8,9,10,11,12,13,14,15,16 + }; + const L12_subband_alloc_t *subband_alloc = L12_subband_alloc_table(hdr, sci); + + int i, k = 0, ba_bits = 0; + const uint8_t *ba_code_tab = g_bitalloc_code_tab; + + for (i = 0; i < sci->total_bands; i++) + { + uint8_t ba; + if (i == k) + { + k += subband_alloc->band_count; + ba_bits = subband_alloc->code_tab_width; + ba_code_tab = g_bitalloc_code_tab + subband_alloc->tab_offset; + subband_alloc++; + } + ba = ba_code_tab[get_bits(bs, ba_bits)]; + sci->bitalloc[2*i] = ba; + if (i < sci->stereo_bands) + { + ba = ba_code_tab[get_bits(bs, ba_bits)]; + } + sci->bitalloc[2*i + 1] = sci->stereo_bands ? ba : 0; + } + + for (i = 0; i < 2*sci->total_bands; i++) + { + sci->scfcod[i] = sci->bitalloc[i] ? 
HDR_IS_LAYER_1(hdr) ? 2 : get_bits(bs, 2) : 6; + } + + L12_read_scalefactors(bs, sci->bitalloc, sci->scfcod, sci->total_bands*2, sci->scf); + + for (i = sci->stereo_bands; i < sci->total_bands; i++) + { + sci->bitalloc[2*i + 1] = 0; + } +} + +static int L12_dequantize_granule(float *grbuf, bs_t *bs, L12_scale_info *sci, int group_size) +{ + int i, j, k, choff = 576; + for (j = 0; j < 4; j++) + { + float *dst = grbuf + group_size*j; + for (i = 0; i < 2*sci->total_bands; i++) + { + int ba = sci->bitalloc[i]; + if (ba != 0) + { + if (ba < 17) + { + int half = (1 << (ba - 1)) - 1; + for (k = 0; k < group_size; k++) + { + dst[k] = (float)((int)get_bits(bs, ba) - half); + } + } else + { + unsigned mod = (2 << (ba - 17)) + 1; /* 3, 5, 9 */ + unsigned code = get_bits(bs, mod + 2 - (mod >> 3)); /* 5, 7, 10 */ + for (k = 0; k < group_size; k++, code /= mod) + { + dst[k] = (float)((int)(code % mod - mod/2)); + } + } + } + dst += choff; + choff = 18 - choff; + } + } + return group_size*4; +} + +static void L12_apply_scf_384(L12_scale_info *sci, const float *scf, float *dst) +{ + int i, k; + memcpy(dst + 576 + sci->stereo_bands*18, dst + sci->stereo_bands*18, (sci->total_bands - sci->stereo_bands)*18*sizeof(float)); + for (i = 0; i < sci->total_bands; i++, dst += 18, scf += 6) + { + for (k = 0; k < 12; k++) + { + dst[k + 0] *= scf[0]; + dst[k + 576] *= scf[3]; + } + } +} +#endif /* MINIMP3_ONLY_MP3 */ + +static int L3_read_side_info(bs_t *bs, L3_gr_info_t *gr, const uint8_t *hdr) +{ + static const uint8_t g_scf_long[8][23] = { + { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 }, + { 12,12,12,12,12,12,16,20,24,28,32,40,48,56,64,76,90,2,2,2,2,2,0 }, + { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 }, + { 6,6,6,6,6,6,8,10,12,14,16,18,22,26,32,38,46,54,62,70,76,36,0 }, + { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 }, + { 4,4,4,4,4,4,6,6,8,8,10,12,16,20,24,28,34,42,50,54,76,158,0 }, + { 
4,4,4,4,4,4,6,6,6,8,10,12,16,18,22,28,34,40,46,54,54,192,0 }, + { 4,4,4,4,4,4,6,6,8,10,12,16,20,24,30,38,46,56,68,84,102,26,0 } + }; + static const uint8_t g_scf_short[8][40] = { + { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 }, + { 8,8,8,8,8,8,8,8,8,12,12,12,16,16,16,20,20,20,24,24,24,28,28,28,36,36,36,2,2,2,2,2,2,2,2,2,26,26,26,0 }, + { 4,4,4,4,4,4,4,4,4,6,6,6,6,6,6,8,8,8,10,10,10,14,14,14,18,18,18,26,26,26,32,32,32,42,42,42,18,18,18,0 }, + { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,32,32,32,44,44,44,12,12,12,0 }, + { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 }, + { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,22,22,22,30,30,30,56,56,56,0 }, + { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,6,6,6,10,10,10,12,12,12,14,14,14,16,16,16,20,20,20,26,26,26,66,66,66,0 }, + { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,12,12,12,16,16,16,20,20,20,26,26,26,34,34,34,42,42,42,12,12,12,0 } + }; + static const uint8_t g_scf_mixed[8][40] = { + { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 }, + { 12,12,12,4,4,4,8,8,8,12,12,12,16,16,16,20,20,20,24,24,24,28,28,28,36,36,36,2,2,2,2,2,2,2,2,2,26,26,26,0 }, + { 6,6,6,6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,14,14,14,18,18,18,26,26,26,32,32,32,42,42,42,18,18,18,0 }, + { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,32,32,32,44,44,44,12,12,12,0 }, + { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 }, + { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,22,22,22,30,30,30,56,56,56,0 }, + { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,6,6,6,10,10,10,12,12,12,14,14,14,16,16,16,20,20,20,26,26,26,66,66,66,0 }, + { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,8,8,8,12,12,12,16,16,16,20,20,20,26,26,26,34,34,34,42,42,42,12,12,12,0 } + }; + + unsigned tables, scfsi = 0; + int 
main_data_begin, part_23_sum = 0; + int sr_idx = HDR_GET_MY_SAMPLE_RATE(hdr); sr_idx -= (sr_idx != 0); + int gr_count = HDR_IS_MONO(hdr) ? 1 : 2; + + if (HDR_TEST_MPEG1(hdr)) + { + gr_count *= 2; + main_data_begin = get_bits(bs, 9); + scfsi = get_bits(bs, 7 + gr_count); + } else + { + main_data_begin = get_bits(bs, 8 + gr_count) >> gr_count; + } + + do + { + if (HDR_IS_MONO(hdr)) + { + scfsi <<= 4; + } + gr->part_23_length = (uint16_t)get_bits(bs, 12); + part_23_sum += gr->part_23_length; + gr->big_values = (uint16_t)get_bits(bs, 9); + if (gr->big_values > 288) + { + return -1; + } + gr->global_gain = (uint8_t)get_bits(bs, 8); + gr->scalefac_compress = (uint16_t)get_bits(bs, HDR_TEST_MPEG1(hdr) ? 4 : 9); + gr->sfbtab = g_scf_long[sr_idx]; + gr->n_long_sfb = 22; + gr->n_short_sfb = 0; + if (get_bits(bs, 1)) + { + gr->block_type = (uint8_t)get_bits(bs, 2); + if (!gr->block_type) + { + return -1; + } + gr->mixed_block_flag = (uint8_t)get_bits(bs, 1); + gr->region_count[0] = 7; + gr->region_count[1] = 255; + if (gr->block_type == SHORT_BLOCK_TYPE) + { + scfsi &= 0x0F0F; + if (!gr->mixed_block_flag) + { + gr->region_count[0] = 8; + gr->sfbtab = g_scf_short[sr_idx]; + gr->n_long_sfb = 0; + gr->n_short_sfb = 39; + } else + { + gr->sfbtab = g_scf_mixed[sr_idx]; + gr->n_long_sfb = HDR_TEST_MPEG1(hdr) ? 8 : 6; + gr->n_short_sfb = 30; + } + } + tables = get_bits(bs, 10); + tables <<= 5; + gr->subblock_gain[0] = (uint8_t)get_bits(bs, 3); + gr->subblock_gain[1] = (uint8_t)get_bits(bs, 3); + gr->subblock_gain[2] = (uint8_t)get_bits(bs, 3); + } else + { + gr->block_type = 0; + gr->mixed_block_flag = 0; + tables = get_bits(bs, 15); + gr->region_count[0] = (uint8_t)get_bits(bs, 4); + gr->region_count[1] = (uint8_t)get_bits(bs, 3); + gr->region_count[2] = 255; + } + gr->table_select[0] = (uint8_t)(tables >> 10); + gr->table_select[1] = (uint8_t)((tables >> 5) & 31); + gr->table_select[2] = (uint8_t)((tables) & 31); + gr->preflag = HDR_TEST_MPEG1(hdr) ? 
get_bits(bs, 1) : (gr->scalefac_compress >= 500); + gr->scalefac_scale = (uint8_t)get_bits(bs, 1); + gr->count1_table = (uint8_t)get_bits(bs, 1); + gr->scfsi = (uint8_t)((scfsi >> 12) & 15); + scfsi <<= 4; + gr++; + } while(--gr_count); + + if (part_23_sum + bs->pos > bs->limit + main_data_begin*8) + { + return -1; + } + + return main_data_begin; +} + +static void L3_read_scalefactors(uint8_t *scf, uint8_t *ist_pos, const uint8_t *scf_size, const uint8_t *scf_count, bs_t *bitbuf, int scfsi) +{ + int i, k; + for (i = 0; i < 4 && scf_count[i]; i++, scfsi *= 2) + { + int cnt = scf_count[i]; + if (scfsi & 8) + { + memcpy(scf, ist_pos, cnt); + } else + { + int bits = scf_size[i]; + if (!bits) + { + memset(scf, 0, cnt); + memset(ist_pos, 0, cnt); + } else + { + int max_scf = (scfsi < 0) ? (1 << bits) - 1 : -1; + for (k = 0; k < cnt; k++) + { + int s = get_bits(bitbuf, bits); + ist_pos[k] = (s == max_scf ? -1 : s); + scf[k] = s; + } + } + } + ist_pos += cnt; + scf += cnt; + } + scf[0] = scf[1] = scf[2] = 0; +} + +static float L3_ldexp_q2(float y, int exp_q2) +{ + static const float g_expfrac[4] = { 9.31322575e-10f,7.83145814e-10f,6.58544508e-10f,5.53767716e-10f }; + int e; + do + { + e = MINIMP3_MIN(30*4, exp_q2); + y *= g_expfrac[e & 3]*(1 << 30 >> (e >> 2)); + } while ((exp_q2 -= e) > 0); + return y; +} + +static void L3_decode_scalefactors(const uint8_t *hdr, uint8_t *ist_pos, bs_t *bs, const L3_gr_info_t *gr, float *scf, int ch) +{ + static const uint8_t g_scf_partitions[3][28] = { + { 6,5,5, 5,6,5,5,5,6,5, 7,3,11,10,0,0, 7, 7, 7,0, 6, 6,6,3, 8, 8,5,0 }, + { 8,9,6,12,6,9,9,9,6,9,12,6,15,18,0,0, 6,15,12,0, 6,12,9,6, 6,18,9,0 }, + { 9,9,6,12,9,9,9,9,9,9,12,6,18,18,0,0,12,12,12,0,12, 9,9,6,15,12,9,0 } + }; + const uint8_t *scf_partition = g_scf_partitions[!!gr->n_short_sfb + !gr->n_long_sfb]; + uint8_t scf_size[4], iscf[40]; + int i, scf_shift = gr->scalefac_scale + 1, gain_exp, scfsi = gr->scfsi; + float gain; + + if (HDR_TEST_MPEG1(hdr)) + { + static const uint8_t 
g_scfc_decode[16] = { 0,1,2,3, 12,5,6,7, 9,10,11,13, 14,15,18,19 }; + int part = g_scfc_decode[gr->scalefac_compress]; + scf_size[1] = scf_size[0] = (uint8_t)(part >> 2); + scf_size[3] = scf_size[2] = (uint8_t)(part & 3); + } else + { + static const uint8_t g_mod[6*4] = { 5,5,4,4,5,5,4,1,4,3,1,1,5,6,6,1,4,4,4,1,4,3,1,1 }; + int k, modprod, sfc, ist = HDR_TEST_I_STEREO(hdr) && ch; + sfc = gr->scalefac_compress >> ist; + for (k = ist*3*4; sfc >= 0; sfc -= modprod, k += 4) + { + for (modprod = 1, i = 3; i >= 0; i--) + { + scf_size[i] = (uint8_t)(sfc / modprod % g_mod[k + i]); + modprod *= g_mod[k + i]; + } + } + scf_partition += k; + scfsi = -16; + } + L3_read_scalefactors(iscf, ist_pos, scf_size, scf_partition, bs, scfsi); + + if (gr->n_short_sfb) + { + int sh = 3 - scf_shift; + for (i = 0; i < gr->n_short_sfb; i += 3) + { + iscf[gr->n_long_sfb + i + 0] += gr->subblock_gain[0] << sh; + iscf[gr->n_long_sfb + i + 1] += gr->subblock_gain[1] << sh; + iscf[gr->n_long_sfb + i + 2] += gr->subblock_gain[2] << sh; + } + } else if (gr->preflag) + { + static const uint8_t g_preamp[10] = { 1,1,1,1,2,2,3,3,3,2 }; + for (i = 0; i < 10; i++) + { + iscf[11 + i] += g_preamp[i]; + } + } + + gain_exp = gr->global_gain + BITS_DEQUANTIZER_OUT*4 - 210 - (HDR_IS_MS_STEREO(hdr) ? 
2 : 0); + gain = L3_ldexp_q2(1 << (MAX_SCFI/4), MAX_SCFI - gain_exp); + for (i = 0; i < (int)(gr->n_long_sfb + gr->n_short_sfb); i++) + { + scf[i] = L3_ldexp_q2(gain, iscf[i] << scf_shift); + } +} + +static const float g_pow43[129 + 16] = { + 0,-1,-2.519842f,-4.326749f,-6.349604f,-8.549880f,-10.902724f,-13.390518f,-16.000000f,-18.720754f,-21.544347f,-24.463781f,-27.473142f,-30.567351f,-33.741992f,-36.993181f, + 0,1,2.519842f,4.326749f,6.349604f,8.549880f,10.902724f,13.390518f,16.000000f,18.720754f,21.544347f,24.463781f,27.473142f,30.567351f,33.741992f,36.993181f,40.317474f,43.711787f,47.173345f,50.699631f,54.288352f,57.937408f,61.644865f,65.408941f,69.227979f,73.100443f,77.024898f,81.000000f,85.024491f,89.097188f,93.216975f,97.382800f,101.593667f,105.848633f,110.146801f,114.487321f,118.869381f,123.292209f,127.755065f,132.257246f,136.798076f,141.376907f,145.993119f,150.646117f,155.335327f,160.060199f,164.820202f,169.614826f,174.443577f,179.305980f,184.201575f,189.129918f,194.090580f,199.083145f,204.107210f,209.162385f,214.248292f,219.364564f,224.510845f,229.686789f,234.892058f,240.126328f,245.389280f,250.680604f,256.000000f,261.347174f,266.721841f,272.123723f,277.552547f,283.008049f,288.489971f,293.998060f,299.532071f,305.091761f,310.676898f,316.287249f,321.922592f,327.582707f,333.267377f,338.976394f,344.709550f,350.466646f,356.247482f,362.051866f,367.879608f,373.730522f,379.604427f,385.501143f,391.420496f,397.362314f,403.326427f,409.312672f,415.320884f,421.350905f,427.402579f,433.475750f,439.570269f,445.685987f,451.822757f,457.980436f,464.158883f,470.357960f,476.577530f,482.817459f,489.077615f,495.357868f,501.658090f,507.978156f,514.317941f,520.677324f,527.056184f,533.454404f,539.871867f,546.308458f,552.764065f,559.238575f,565.731879f,572.243870f,578.774440f,585.323483f,591.890898f,598.476581f,605.080431f,611.702349f,618.342238f,625.000000f,631.675540f,638.368763f,645.079578f +}; + +static float L3_pow_43(int x) +{ + float frac; + int sign, mult = 256; + + if (x < 
129) + { + return g_pow43[16 + x]; + } + + if (x < 1024) + { + mult = 16; + x <<= 3; + } + + sign = 2*x & 64; + frac = (float)((x & 63) - sign) / ((x & ~63) + sign); + return g_pow43[16 + ((x + sign) >> 6)]*(1.f + frac*((4.f/3) + frac*(2.f/9)))*mult; +} + +static void L3_huffman(float *dst, bs_t *bs, const L3_gr_info_t *gr_info, const float *scf, int layer3gr_limit) +{ + static const int16_t tabs[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 785,785,785,785,784,784,784,784,513,513,513,513,513,513,513,513,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256, + -255,1313,1298,1282,785,785,785,785,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,290,288, + -255,1313,1298,1282,769,769,769,769,529,529,529,529,529,529,529,529,528,528,528,528,528,528,528,528,512,512,512,512,512,512,512,512,290,288, + -253,-318,-351,-367,785,785,785,785,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,819,818,547,547,275,275,275,275,561,560,515,546,289,274,288,258, + -254,-287,1329,1299,1314,1312,1057,1057,1042,1042,1026,1026,784,784,784,784,529,529,529,529,529,529,529,529,769,769,769,769,768,768,768,768,563,560,306,306,291,259, + -252,-413,-477,-542,1298,-575,1041,1041,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-383,-399,1107,1092,1106,1061,849,849,789,789,1104,1091,773,773,1076,1075,341,340,325,309,834,804,577,577,532,532,516,516,832,818,803,816,561,561,531,531,515,546,289,289,288,258, + -252,-429,-493,-559,1057,1057,1042,1042,529,529,529,529,529,529,529,529,784,784,784,784,769,769,769,769,512,512,512,512,512,512,512,512,-382,1077,-415,1106,1061,1104,849,849,789,789,1091,1076,1029,1075,834,834,597,581,340,340,339,324,804,833,532,532,832,772,818,803,817,787,816,771,290,290,290,290,288,258, + 
-253,-349,-414,-447,-463,1329,1299,-479,1314,1312,1057,1057,1042,1042,1026,1026,785,785,785,785,784,784,784,784,769,769,769,769,768,768,768,768,-319,851,821,-335,836,850,805,849,341,340,325,336,533,533,579,579,564,564,773,832,578,548,563,516,321,276,306,291,304,259, + -251,-572,-733,-830,-863,-879,1041,1041,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-511,-527,-543,1396,1351,1381,1366,1395,1335,1380,-559,1334,1138,1138,1063,1063,1350,1392,1031,1031,1062,1062,1364,1363,1120,1120,1333,1348,881,881,881,881,375,374,359,373,343,358,341,325,791,791,1123,1122,-703,1105,1045,-719,865,865,790,790,774,774,1104,1029,338,293,323,308,-799,-815,833,788,772,818,803,816,322,292,307,320,561,531,515,546,289,274,288,258, + -251,-525,-605,-685,-765,-831,-846,1298,1057,1057,1312,1282,785,785,785,785,784,784,784,784,769,769,769,769,512,512,512,512,512,512,512,512,1399,1398,1383,1367,1382,1396,1351,-511,1381,1366,1139,1139,1079,1079,1124,1124,1364,1349,1363,1333,882,882,882,882,807,807,807,807,1094,1094,1136,1136,373,341,535,535,881,775,867,822,774,-591,324,338,-671,849,550,550,866,864,609,609,293,336,534,534,789,835,773,-751,834,804,308,307,833,788,832,772,562,562,547,547,305,275,560,515,290,290, + -252,-397,-477,-557,-622,-653,-719,-735,-750,1329,1299,1314,1057,1057,1042,1042,1312,1282,1024,1024,785,785,785,785,784,784,784,784,769,769,769,769,-383,1127,1141,1111,1126,1140,1095,1110,869,869,883,883,1079,1109,882,882,375,374,807,868,838,881,791,-463,867,822,368,263,852,837,836,-543,610,610,550,550,352,336,534,534,865,774,851,821,850,805,593,533,579,564,773,832,578,578,548,548,577,577,307,276,306,291,516,560,259,259, + 
-250,-2107,-2507,-2764,-2909,-2974,-3007,-3023,1041,1041,1040,1040,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-767,-1052,-1213,-1277,-1358,-1405,-1469,-1535,-1550,-1582,-1614,-1647,-1662,-1694,-1726,-1759,-1774,-1807,-1822,-1854,-1886,1565,-1919,-1935,-1951,-1967,1731,1730,1580,1717,-1983,1729,1564,-1999,1548,-2015,-2031,1715,1595,-2047,1714,-2063,1610,-2079,1609,-2095,1323,1323,1457,1457,1307,1307,1712,1547,1641,1700,1699,1594,1685,1625,1442,1442,1322,1322,-780,-973,-910,1279,1278,1277,1262,1276,1261,1275,1215,1260,1229,-959,974,974,989,989,-943,735,478,478,495,463,506,414,-1039,1003,958,1017,927,942,987,957,431,476,1272,1167,1228,-1183,1256,-1199,895,895,941,941,1242,1227,1212,1135,1014,1014,490,489,503,487,910,1013,985,925,863,894,970,955,1012,847,-1343,831,755,755,984,909,428,366,754,559,-1391,752,486,457,924,997,698,698,983,893,740,740,908,877,739,739,667,667,953,938,497,287,271,271,683,606,590,712,726,574,302,302,738,736,481,286,526,725,605,711,636,724,696,651,589,681,666,710,364,467,573,695,466,466,301,465,379,379,709,604,665,679,316,316,634,633,436,436,464,269,424,394,452,332,438,363,347,408,393,448,331,422,362,407,392,421,346,406,391,376,375,359,1441,1306,-2367,1290,-2383,1337,-2399,-2415,1426,1321,-2431,1411,1336,-2447,-2463,-2479,1169,1169,1049,1049,1424,1289,1412,1352,1319,-2495,1154,1154,1064,1064,1153,1153,416,390,360,404,403,389,344,374,373,343,358,372,327,357,342,311,356,326,1395,1394,1137,1137,1047,1047,1365,1392,1287,1379,1334,1364,1349,1378,1318,1363,792,792,792,792,1152,1152,1032,1032,1121,1121,1046,1046,1120,1120,1030,1030,-2895,1106,1061,1104,849,849,789,789,1091,1076,1029,1090,1060,1075,833,833,309,324,532,532,832,772,818,803,561,561,531,560,515,546,289,274,288,258, + 
-250,-1179,-1579,-1836,-1996,-2124,-2253,-2333,-2413,-2477,-2542,-2574,-2607,-2622,-2655,1314,1313,1298,1312,1282,785,785,785,785,1040,1040,1025,1025,768,768,768,768,-766,-798,-830,-862,-895,-911,-927,-943,-959,-975,-991,-1007,-1023,-1039,-1055,-1070,1724,1647,-1103,-1119,1631,1767,1662,1738,1708,1723,-1135,1780,1615,1779,1599,1677,1646,1778,1583,-1151,1777,1567,1737,1692,1765,1722,1707,1630,1751,1661,1764,1614,1736,1676,1763,1750,1645,1598,1721,1691,1762,1706,1582,1761,1566,-1167,1749,1629,767,766,751,765,494,494,735,764,719,749,734,763,447,447,748,718,477,506,431,491,446,476,461,505,415,430,475,445,504,399,460,489,414,503,383,474,429,459,502,502,746,752,488,398,501,473,413,472,486,271,480,270,-1439,-1455,1357,-1471,-1487,-1503,1341,1325,-1519,1489,1463,1403,1309,-1535,1372,1448,1418,1476,1356,1462,1387,-1551,1475,1340,1447,1402,1386,-1567,1068,1068,1474,1461,455,380,468,440,395,425,410,454,364,467,466,464,453,269,409,448,268,432,1371,1473,1432,1417,1308,1460,1355,1446,1459,1431,1083,1083,1401,1416,1458,1445,1067,1067,1370,1457,1051,1051,1291,1430,1385,1444,1354,1415,1400,1443,1082,1082,1173,1113,1186,1066,1185,1050,-1967,1158,1128,1172,1097,1171,1081,-1983,1157,1112,416,266,375,400,1170,1142,1127,1065,793,793,1169,1033,1156,1096,1141,1111,1155,1080,1126,1140,898,898,808,808,897,897,792,792,1095,1152,1032,1125,1110,1139,1079,1124,882,807,838,881,853,791,-2319,867,368,263,822,852,837,866,806,865,-2399,851,352,262,534,534,821,836,594,594,549,549,593,593,533,533,848,773,579,579,564,578,548,563,276,276,577,576,306,291,516,560,305,305,275,259, + 
-251,-892,-2058,-2620,-2828,-2957,-3023,-3039,1041,1041,1040,1040,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-511,-527,-543,-559,1530,-575,-591,1528,1527,1407,1526,1391,1023,1023,1023,1023,1525,1375,1268,1268,1103,1103,1087,1087,1039,1039,1523,-604,815,815,815,815,510,495,509,479,508,463,507,447,431,505,415,399,-734,-782,1262,-815,1259,1244,-831,1258,1228,-847,-863,1196,-879,1253,987,987,748,-767,493,493,462,477,414,414,686,669,478,446,461,445,474,429,487,458,412,471,1266,1264,1009,1009,799,799,-1019,-1276,-1452,-1581,-1677,-1757,-1821,-1886,-1933,-1997,1257,1257,1483,1468,1512,1422,1497,1406,1467,1496,1421,1510,1134,1134,1225,1225,1466,1451,1374,1405,1252,1252,1358,1480,1164,1164,1251,1251,1238,1238,1389,1465,-1407,1054,1101,-1423,1207,-1439,830,830,1248,1038,1237,1117,1223,1148,1236,1208,411,426,395,410,379,269,1193,1222,1132,1235,1221,1116,976,976,1192,1162,1177,1220,1131,1191,963,963,-1647,961,780,-1663,558,558,994,993,437,408,393,407,829,978,813,797,947,-1743,721,721,377,392,844,950,828,890,706,706,812,859,796,960,948,843,934,874,571,571,-1919,690,555,689,421,346,539,539,944,779,918,873,932,842,903,888,570,570,931,917,674,674,-2575,1562,-2591,1609,-2607,1654,1322,1322,1441,1441,1696,1546,1683,1593,1669,1624,1426,1426,1321,1321,1639,1680,1425,1425,1305,1305,1545,1668,1608,1623,1667,1592,1638,1666,1320,1320,1652,1607,1409,1409,1304,1304,1288,1288,1664,1637,1395,1395,1335,1335,1622,1636,1394,1394,1319,1319,1606,1621,1392,1392,1137,1137,1137,1137,345,390,360,375,404,373,1047,-2751,-2767,-2783,1062,1121,1046,-2799,1077,-2815,1106,1061,789,789,1105,1104,263,355,310,340,325,354,352,262,339,324,1091,1076,1029,1090,1060,1075,833,833,788,788,1088,1028,818,818,803,803,561,561,531,531,816,771,546,546,289,274,288,258, + 
-253,-317,-381,-446,-478,-509,1279,1279,-811,-1179,-1451,-1756,-1900,-2028,-2189,-2253,-2333,-2414,-2445,-2511,-2526,1313,1298,-2559,1041,1041,1040,1040,1025,1025,1024,1024,1022,1007,1021,991,1020,975,1019,959,687,687,1018,1017,671,671,655,655,1016,1015,639,639,758,758,623,623,757,607,756,591,755,575,754,559,543,543,1009,783,-575,-621,-685,-749,496,-590,750,749,734,748,974,989,1003,958,988,973,1002,942,987,957,972,1001,926,986,941,971,956,1000,910,985,925,999,894,970,-1071,-1087,-1102,1390,-1135,1436,1509,1451,1374,-1151,1405,1358,1480,1420,-1167,1507,1494,1389,1342,1465,1435,1450,1326,1505,1310,1493,1373,1479,1404,1492,1464,1419,428,443,472,397,736,526,464,464,486,457,442,471,484,482,1357,1449,1434,1478,1388,1491,1341,1490,1325,1489,1463,1403,1309,1477,1372,1448,1418,1433,1476,1356,1462,1387,-1439,1475,1340,1447,1402,1474,1324,1461,1371,1473,269,448,1432,1417,1308,1460,-1711,1459,-1727,1441,1099,1099,1446,1386,1431,1401,-1743,1289,1083,1083,1160,1160,1458,1445,1067,1067,1370,1457,1307,1430,1129,1129,1098,1098,268,432,267,416,266,400,-1887,1144,1187,1082,1173,1113,1186,1066,1050,1158,1128,1143,1172,1097,1171,1081,420,391,1157,1112,1170,1142,1127,1065,1169,1049,1156,1096,1141,1111,1155,1080,1126,1154,1064,1153,1140,1095,1048,-2159,1125,1110,1137,-2175,823,823,1139,1138,807,807,384,264,368,263,868,838,853,791,867,822,852,837,866,806,865,790,-2319,851,821,836,352,262,850,805,849,-2399,533,533,835,820,336,261,578,548,563,577,532,532,832,772,562,562,547,547,305,275,560,515,290,290,288,258 }; + static const uint8_t tab32[] = { 130,162,193,209,44,28,76,140,9,9,9,9,9,9,9,9,190,254,222,238,126,94,157,157,109,61,173,205 }; + static const uint8_t tab33[] = { 252,236,220,204,188,172,156,140,124,108,92,76,60,44,28,12 }; + static const int16_t tabindex[2*16] = { 0,32,64,98,0,132,180,218,292,364,426,538,648,746,0,1126,1460,1460,1460,1460,1460,1460,1460,1460,1842,1842,1842,1842,1842,1842,1842,1842 }; + static const uint8_t g_linbits[] = { 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,6,8,10,13,4,5,6,7,8,9,11,13 }; + +#define PEEK_BITS(n) (bs_cache >> (32 - n)) +#define FLUSH_BITS(n) { bs_cache <<= (n); bs_sh += (n); } +#define CHECK_BITS while (bs_sh >= 0) { bs_cache |= (uint32_t)*bs_next_ptr++ << bs_sh; bs_sh -= 8; } +#define BSPOS ((bs_next_ptr - bs->buf)*8 - 24 + bs_sh) + + float one = 0.0f; + int ireg = 0, big_val_cnt = gr_info->big_values; + const uint8_t *sfb = gr_info->sfbtab; + const uint8_t *bs_next_ptr = bs->buf + bs->pos/8; + uint32_t bs_cache = (((bs_next_ptr[0]*256u + bs_next_ptr[1])*256u + bs_next_ptr[2])*256u + bs_next_ptr[3]) << (bs->pos & 7); + int pairs_to_decode, np, bs_sh = (bs->pos & 7) - 8; + bs_next_ptr += 4; + + while (big_val_cnt > 0) + { + int tab_num = gr_info->table_select[ireg]; + int sfb_cnt = gr_info->region_count[ireg++]; + const int16_t *codebook = tabs + tabindex[tab_num]; + int linbits = g_linbits[tab_num]; + if (linbits) + { + do + { + np = *sfb++ / 2; + pairs_to_decode = MINIMP3_MIN(big_val_cnt, np); + one = *scf++; + do + { + int j, w = 5; + int leaf = codebook[PEEK_BITS(w)]; + while (leaf < 0) + { + FLUSH_BITS(w); + w = leaf & 7; + leaf = codebook[PEEK_BITS(w) - (leaf >> 3)]; + } + FLUSH_BITS(leaf >> 8); + + for (j = 0; j < 2; j++, dst++, leaf >>= 4) + { + int lsb = leaf & 0x0F; + if (lsb == 15) + { + lsb += PEEK_BITS(linbits); + FLUSH_BITS(linbits); + CHECK_BITS; + *dst = one*L3_pow_43(lsb)*((int32_t)bs_cache < 0 ? -1: 1); + } else + { + *dst = g_pow43[16 + lsb - 16*(bs_cache >> 31)]*one; + } + FLUSH_BITS(lsb ? 
1 : 0); + } + CHECK_BITS; + } while (--pairs_to_decode); + } while ((big_val_cnt -= np) > 0 && --sfb_cnt >= 0); + } else + { + do + { + np = *sfb++ / 2; + pairs_to_decode = MINIMP3_MIN(big_val_cnt, np); + one = *scf++; + do + { + int j, w = 5; + int leaf = codebook[PEEK_BITS(w)]; + while (leaf < 0) + { + FLUSH_BITS(w); + w = leaf & 7; + leaf = codebook[PEEK_BITS(w) - (leaf >> 3)]; + } + FLUSH_BITS(leaf >> 8); + + for (j = 0; j < 2; j++, dst++, leaf >>= 4) + { + int lsb = leaf & 0x0F; + *dst = g_pow43[16 + lsb - 16*(bs_cache >> 31)]*one; + FLUSH_BITS(lsb ? 1 : 0); + } + CHECK_BITS; + } while (--pairs_to_decode); + } while ((big_val_cnt -= np) > 0 && --sfb_cnt >= 0); + } + } + + for (np = 1 - big_val_cnt;; dst += 4) + { + const uint8_t *codebook_count1 = (gr_info->count1_table) ? tab33 : tab32; + int leaf = codebook_count1[PEEK_BITS(4)]; + if (!(leaf & 8)) + { + leaf = codebook_count1[(leaf >> 3) + (bs_cache << 4 >> (32 - (leaf & 3)))]; + } + FLUSH_BITS(leaf & 7); + if (BSPOS > layer3gr_limit) + { + break; + } +#define RELOAD_SCALEFACTOR if (!--np) { np = *sfb++/2; if (!np) break; one = *scf++; } +#define DEQ_COUNT1(s) if (leaf & (128 >> s)) { dst[s] = ((int32_t)bs_cache < 0) ? -one : one; FLUSH_BITS(1) } + RELOAD_SCALEFACTOR; + DEQ_COUNT1(0); + DEQ_COUNT1(1); + RELOAD_SCALEFACTOR; + DEQ_COUNT1(2); + DEQ_COUNT1(3); + CHECK_BITS; + } + + bs->pos = layer3gr_limit; +} + +static void L3_midside_stereo(float *left, int n) +{ + int i = 0; + float *right = left + 576; +#if HAVE_SIMD + if (have_simd()) + { + for (; i < n - 3; i += 4) + { + f4 vl = VLD(left + i); + f4 vr = VLD(right + i); + VSTORE(left + i, VADD(vl, vr)); + VSTORE(right + i, VSUB(vl, vr)); + } +#ifdef __GNUC__ + /* Workaround for spurious -Waggressive-loop-optimizations warning from gcc. 
+ * For more info see: https://github.com/lieff/minimp3/issues/88 + */ + if (__builtin_constant_p(n % 4 == 0) && n % 4 == 0) + return; +#endif + } +#endif /* HAVE_SIMD */ + for (; i < n; i++) + { + float a = left[i]; + float b = right[i]; + left[i] = a + b; + right[i] = a - b; + } +} + +static void L3_intensity_stereo_band(float *left, int n, float kl, float kr) +{ + int i; + for (i = 0; i < n; i++) + { + left[i + 576] = left[i]*kr; + left[i] = left[i]*kl; + } +} + +static void L3_stereo_top_band(const float *right, const uint8_t *sfb, int nbands, int max_band[3]) +{ + int i, k; + + max_band[0] = max_band[1] = max_band[2] = -1; + + for (i = 0; i < nbands; i++) + { + for (k = 0; k < sfb[i]; k += 2) + { + if (right[k] != 0 || right[k + 1] != 0) + { + max_band[i % 3] = i; + break; + } + } + right += sfb[i]; + } +} + +static void L3_stereo_process(float *left, const uint8_t *ist_pos, const uint8_t *sfb, const uint8_t *hdr, int max_band[3], int mpeg2_sh) +{ + static const float g_pan[7*2] = { 0,1,0.21132487f,0.78867513f,0.36602540f,0.63397460f,0.5f,0.5f,0.63397460f,0.36602540f,0.78867513f,0.21132487f,1,0 }; + unsigned i, max_pos = HDR_TEST_MPEG1(hdr) ? 7 : 64; + + for (i = 0; sfb[i]; i++) + { + unsigned ipos = ist_pos[i]; + if ((int)i > max_band[i % 3] && ipos < max_pos) + { + float kl, kr, s = HDR_TEST_MS_STEREO(hdr) ? 1.41421356f : 1; + if (HDR_TEST_MPEG1(hdr)) + { + kl = g_pan[2*ipos]; + kr = g_pan[2*ipos + 1]; + } else + { + kl = 1; + kr = L3_ldexp_q2(1, (ipos + 1) >> 1 << mpeg2_sh); + if (ipos & 1) + { + kl = kr; + kr = 1; + } + } + L3_intensity_stereo_band(left, sfb[i], kl*s, kr*s); + } else if (HDR_TEST_MS_STEREO(hdr)) + { + L3_midside_stereo(left, sfb[i]); + } + left += sfb[i]; + } +} + +static void L3_intensity_stereo(float *left, uint8_t *ist_pos, const L3_gr_info_t *gr, const uint8_t *hdr) +{ + int max_band[3], n_sfb = gr->n_long_sfb + gr->n_short_sfb; + int i, max_blocks = gr->n_short_sfb ? 
3 : 1; + + L3_stereo_top_band(left + 576, gr->sfbtab, n_sfb, max_band); + if (gr->n_long_sfb) + { + max_band[0] = max_band[1] = max_band[2] = MINIMP3_MAX(MINIMP3_MAX(max_band[0], max_band[1]), max_band[2]); + } + for (i = 0; i < max_blocks; i++) + { + int default_pos = HDR_TEST_MPEG1(hdr) ? 3 : 0; + int itop = n_sfb - max_blocks + i; + int prev = itop - max_blocks; + ist_pos[itop] = max_band[i] >= prev ? default_pos : ist_pos[prev]; + } + L3_stereo_process(left, ist_pos, gr->sfbtab, hdr, max_band, gr[1].scalefac_compress & 1); +} + +static void L3_reorder(float *grbuf, float *scratch, const uint8_t *sfb) +{ + int i, len; + float *src = grbuf, *dst = scratch; + + for (;0 != (len = *sfb); sfb += 3, src += 2*len) + { + for (i = 0; i < len; i++, src++) + { + *dst++ = src[0*len]; + *dst++ = src[1*len]; + *dst++ = src[2*len]; + } + } + memcpy(grbuf, scratch, (dst - scratch)*sizeof(float)); +} + +static void L3_antialias(float *grbuf, int nbands) +{ + static const float g_aa[2][8] = { + {0.85749293f,0.88174200f,0.94962865f,0.98331459f,0.99551782f,0.99916056f,0.99989920f,0.99999316f}, + {0.51449576f,0.47173197f,0.31337745f,0.18191320f,0.09457419f,0.04096558f,0.01419856f,0.00369997f} + }; + + for (; nbands > 0; nbands--, grbuf += 18) + { + int i = 0; +#if HAVE_SIMD + if (have_simd()) for (; i < 8; i += 4) + { + f4 vu = VLD(grbuf + 18 + i); + f4 vd = VLD(grbuf + 14 - i); + f4 vc0 = VLD(g_aa[0] + i); + f4 vc1 = VLD(g_aa[1] + i); + vd = VREV(vd); + VSTORE(grbuf + 18 + i, VSUB(VMUL(vu, vc0), VMUL(vd, vc1))); + vd = VADD(VMUL(vu, vc1), VMUL(vd, vc0)); + VSTORE(grbuf + 14 - i, VREV(vd)); + } +#endif /* HAVE_SIMD */ +#ifndef MINIMP3_ONLY_SIMD + for(; i < 8; i++) + { + float u = grbuf[18 + i]; + float d = grbuf[17 - i]; + grbuf[18 + i] = u*g_aa[0][i] - d*g_aa[1][i]; + grbuf[17 - i] = u*g_aa[1][i] + d*g_aa[0][i]; + } +#endif /* MINIMP3_ONLY_SIMD */ + } +} + +static void L3_dct3_9(float *y) +{ + float s0, s1, s2, s3, s4, s5, s6, s7, s8, t0, t2, t4; + + s0 = y[0]; s2 = y[2]; s4 = 
y[4]; s6 = y[6]; s8 = y[8]; + t0 = s0 + s6*0.5f; + s0 -= s6; + t4 = (s4 + s2)*0.93969262f; + t2 = (s8 + s2)*0.76604444f; + s6 = (s4 - s8)*0.17364818f; + s4 += s8 - s2; + + s2 = s0 - s4*0.5f; + y[4] = s4 + s0; + s8 = t0 - t2 + s6; + s0 = t0 - t4 + t2; + s4 = t0 + t4 - s6; + + s1 = y[1]; s3 = y[3]; s5 = y[5]; s7 = y[7]; + + s3 *= 0.86602540f; + t0 = (s5 + s1)*0.98480775f; + t4 = (s5 - s7)*0.34202014f; + t2 = (s1 + s7)*0.64278761f; + s1 = (s1 - s5 - s7)*0.86602540f; + + s5 = t0 - s3 - t2; + s7 = t4 - s3 - t0; + s3 = t4 + s3 - t2; + + y[0] = s4 - s7; + y[1] = s2 + s1; + y[2] = s0 - s3; + y[3] = s8 + s5; + y[5] = s8 - s5; + y[6] = s0 + s3; + y[7] = s2 - s1; + y[8] = s4 + s7; +} + +static void L3_imdct36(float *grbuf, float *overlap, const float *window, int nbands) +{ + int i, j; + static const float g_twid9[18] = { + 0.73727734f,0.79335334f,0.84339145f,0.88701083f,0.92387953f,0.95371695f,0.97629601f,0.99144486f,0.99904822f,0.67559021f,0.60876143f,0.53729961f,0.46174861f,0.38268343f,0.30070580f,0.21643961f,0.13052619f,0.04361938f + }; + + for (j = 0; j < nbands; j++, grbuf += 18, overlap += 9) + { + float co[9], si[9]; + co[0] = -grbuf[0]; + si[0] = grbuf[17]; + for (i = 0; i < 4; i++) + { + si[8 - 2*i] = grbuf[4*i + 1] - grbuf[4*i + 2]; + co[1 + 2*i] = grbuf[4*i + 1] + grbuf[4*i + 2]; + si[7 - 2*i] = grbuf[4*i + 4] - grbuf[4*i + 3]; + co[2 + 2*i] = -(grbuf[4*i + 3] + grbuf[4*i + 4]); + } + L3_dct3_9(co); + L3_dct3_9(si); + + si[1] = -si[1]; + si[3] = -si[3]; + si[5] = -si[5]; + si[7] = -si[7]; + + i = 0; + +#if HAVE_SIMD + if (have_simd()) for (; i < 8; i += 4) + { + f4 vovl = VLD(overlap + i); + f4 vc = VLD(co + i); + f4 vs = VLD(si + i); + f4 vr0 = VLD(g_twid9 + i); + f4 vr1 = VLD(g_twid9 + 9 + i); + f4 vw0 = VLD(window + i); + f4 vw1 = VLD(window + 9 + i); + f4 vsum = VADD(VMUL(vc, vr1), VMUL(vs, vr0)); + VSTORE(overlap + i, VSUB(VMUL(vc, vr0), VMUL(vs, vr1))); + VSTORE(grbuf + i, VSUB(VMUL(vovl, vw0), VMUL(vsum, vw1))); + vsum = VADD(VMUL(vovl, vw1), VMUL(vsum, 
vw0)); + VSTORE(grbuf + 14 - i, VREV(vsum)); + } +#endif /* HAVE_SIMD */ + for (; i < 9; i++) + { + float ovl = overlap[i]; + float sum = co[i]*g_twid9[9 + i] + si[i]*g_twid9[0 + i]; + overlap[i] = co[i]*g_twid9[0 + i] - si[i]*g_twid9[9 + i]; + grbuf[i] = ovl*window[0 + i] - sum*window[9 + i]; + grbuf[17 - i] = ovl*window[9 + i] + sum*window[0 + i]; + } + } +} + +static void L3_idct3(float x0, float x1, float x2, float *dst) +{ + float m1 = x1*0.86602540f; + float a1 = x0 - x2*0.5f; + dst[1] = x0 + x2; + dst[0] = a1 + m1; + dst[2] = a1 - m1; +} + +static void L3_imdct12(float *x, float *dst, float *overlap) +{ + static const float g_twid3[6] = { 0.79335334f,0.92387953f,0.99144486f, 0.60876143f,0.38268343f,0.13052619f }; + float co[3], si[3]; + int i; + + L3_idct3(-x[0], x[6] + x[3], x[12] + x[9], co); + L3_idct3(x[15], x[12] - x[9], x[6] - x[3], si); + si[1] = -si[1]; + + for (i = 0; i < 3; i++) + { + float ovl = overlap[i]; + float sum = co[i]*g_twid3[3 + i] + si[i]*g_twid3[0 + i]; + overlap[i] = co[i]*g_twid3[0 + i] - si[i]*g_twid3[3 + i]; + dst[i] = ovl*g_twid3[2 - i] - sum*g_twid3[5 - i]; + dst[5 - i] = ovl*g_twid3[5 - i] + sum*g_twid3[2 - i]; + } +} + +static void L3_imdct_short(float *grbuf, float *overlap, int nbands) +{ + for (;nbands > 0; nbands--, overlap += 9, grbuf += 18) + { + float tmp[18]; + memcpy(tmp, grbuf, sizeof(tmp)); + memcpy(grbuf, overlap, 6*sizeof(float)); + L3_imdct12(tmp, grbuf + 6, overlap + 6); + L3_imdct12(tmp + 1, grbuf + 12, overlap + 6); + L3_imdct12(tmp + 2, overlap, overlap + 6); + } +} + +static void L3_change_sign(float *grbuf) +{ + int b, i; + for (b = 0, grbuf += 18; b < 32; b += 2, grbuf += 36) + for (i = 1; i < 18; i += 2) + grbuf[i] = -grbuf[i]; +} + +static void L3_imdct_gr(float *grbuf, float *overlap, unsigned block_type, unsigned n_long_bands) +{ + static const float g_mdct_window[2][18] = { + { 
0.99904822f,0.99144486f,0.97629601f,0.95371695f,0.92387953f,0.88701083f,0.84339145f,0.79335334f,0.73727734f,0.04361938f,0.13052619f,0.21643961f,0.30070580f,0.38268343f,0.46174861f,0.53729961f,0.60876143f,0.67559021f }, + { 1,1,1,1,1,1,0.99144486f,0.92387953f,0.79335334f,0,0,0,0,0,0,0.13052619f,0.38268343f,0.60876143f } + }; + if (n_long_bands) + { + L3_imdct36(grbuf, overlap, g_mdct_window[0], n_long_bands); + grbuf += 18*n_long_bands; + overlap += 9*n_long_bands; + } + if (block_type == SHORT_BLOCK_TYPE) + L3_imdct_short(grbuf, overlap, 32 - n_long_bands); + else + L3_imdct36(grbuf, overlap, g_mdct_window[block_type == STOP_BLOCK_TYPE], 32 - n_long_bands); +} + +static void L3_save_reservoir(mp3dec_t *h, mp3dec_scratch_t *s) +{ + int pos = (s->bs.pos + 7)/8u; + int remains = s->bs.limit/8u - pos; + if (remains > MAX_BITRESERVOIR_BYTES) + { + pos += remains - MAX_BITRESERVOIR_BYTES; + remains = MAX_BITRESERVOIR_BYTES; + } + if (remains > 0) + { + memmove(h->reserv_buf, s->maindata + pos, remains); + } + h->reserv = remains; +} + +static int L3_restore_reservoir(mp3dec_t *h, bs_t *bs, mp3dec_scratch_t *s, int main_data_begin) +{ + int frame_bytes = (bs->limit - bs->pos)/8; + int bytes_have = MINIMP3_MIN(h->reserv, main_data_begin); + memcpy(s->maindata, h->reserv_buf + MINIMP3_MAX(0, h->reserv - main_data_begin), MINIMP3_MIN(h->reserv, main_data_begin)); + memcpy(s->maindata + bytes_have, bs->buf + bs->pos/8, frame_bytes); + bs_init(&s->bs, s->maindata, bytes_have + frame_bytes); + return h->reserv >= main_data_begin; +} + +static void L3_decode(mp3dec_t *h, mp3dec_scratch_t *s, L3_gr_info_t *gr_info, int nch) +{ + int ch; + + for (ch = 0; ch < nch; ch++) + { + int layer3gr_limit = s->bs.pos + gr_info[ch].part_23_length; + L3_decode_scalefactors(h->header, s->ist_pos[ch], &s->bs, gr_info + ch, s->scf, ch); + L3_huffman(s->grbuf[ch], &s->bs, gr_info + ch, s->scf, layer3gr_limit); + } + + if (HDR_TEST_I_STEREO(h->header)) + { + L3_intensity_stereo(s->grbuf[0], 
s->ist_pos[1], gr_info, h->header); + } else if (HDR_IS_MS_STEREO(h->header)) + { + L3_midside_stereo(s->grbuf[0], 576); + } + + for (ch = 0; ch < nch; ch++, gr_info++) + { + int aa_bands = 31; + int n_long_bands = (gr_info->mixed_block_flag ? 2 : 0) << (int)(HDR_GET_MY_SAMPLE_RATE(h->header) == 2); + + if (gr_info->n_short_sfb) + { + aa_bands = n_long_bands - 1; + L3_reorder(s->grbuf[ch] + n_long_bands*18, s->syn[0], gr_info->sfbtab + gr_info->n_long_sfb); + } + + L3_antialias(s->grbuf[ch], aa_bands); + L3_imdct_gr(s->grbuf[ch], h->mdct_overlap[ch], gr_info->block_type, n_long_bands); + L3_change_sign(s->grbuf[ch]); + } +} + +static void mp3d_DCT_II(float *grbuf, int n) +{ + static const float g_sec[24] = { + 10.19000816f,0.50060302f,0.50241929f,3.40760851f,0.50547093f,0.52249861f,2.05778098f,0.51544732f,0.56694406f,1.48416460f,0.53104258f,0.64682180f,1.16943991f,0.55310392f,0.78815460f,0.97256821f,0.58293498f,1.06067765f,0.83934963f,0.62250412f,1.72244716f,0.74453628f,0.67480832f,5.10114861f + }; + int i, k = 0; +#if HAVE_SIMD + if (have_simd()) for (; k < n; k += 4) + { + f4 t[4][8], *x; + float *y = grbuf + k; + + for (x = t[0], i = 0; i < 8; i++, x++) + { + f4 x0 = VLD(&y[i*18]); + f4 x1 = VLD(&y[(15 - i)*18]); + f4 x2 = VLD(&y[(16 + i)*18]); + f4 x3 = VLD(&y[(31 - i)*18]); + f4 t0 = VADD(x0, x3); + f4 t1 = VADD(x1, x2); + f4 t2 = VMUL_S(VSUB(x1, x2), g_sec[3*i + 0]); + f4 t3 = VMUL_S(VSUB(x0, x3), g_sec[3*i + 1]); + x[0] = VADD(t0, t1); + x[8] = VMUL_S(VSUB(t0, t1), g_sec[3*i + 2]); + x[16] = VADD(t3, t2); + x[24] = VMUL_S(VSUB(t3, t2), g_sec[3*i + 2]); + } + for (x = t[0], i = 0; i < 4; i++, x += 8) + { + f4 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7], xt; + xt = VSUB(x0, x7); x0 = VADD(x0, x7); + x7 = VSUB(x1, x6); x1 = VADD(x1, x6); + x6 = VSUB(x2, x5); x2 = VADD(x2, x5); + x5 = VSUB(x3, x4); x3 = VADD(x3, x4); + x4 = VSUB(x0, x3); x0 = VADD(x0, x3); + x3 = VSUB(x1, x2); x1 = VADD(x1, x2); + x[0] = VADD(x0, x1); + 
x[4] = VMUL_S(VSUB(x0, x1), 0.70710677f); + x5 = VADD(x5, x6); + x6 = VMUL_S(VADD(x6, x7), 0.70710677f); + x7 = VADD(x7, xt); + x3 = VMUL_S(VADD(x3, x4), 0.70710677f); + x5 = VSUB(x5, VMUL_S(x7, 0.198912367f)); /* rotate by PI/8 */ + x7 = VADD(x7, VMUL_S(x5, 0.382683432f)); + x5 = VSUB(x5, VMUL_S(x7, 0.198912367f)); + x0 = VSUB(xt, x6); xt = VADD(xt, x6); + x[1] = VMUL_S(VADD(xt, x7), 0.50979561f); + x[2] = VMUL_S(VADD(x4, x3), 0.54119611f); + x[3] = VMUL_S(VSUB(x0, x5), 0.60134488f); + x[5] = VMUL_S(VADD(x0, x5), 0.89997619f); + x[6] = VMUL_S(VSUB(x4, x3), 1.30656302f); + x[7] = VMUL_S(VSUB(xt, x7), 2.56291556f); + } + + if (k > n - 3) + { +#if HAVE_SSE +#define VSAVE2(i, v) _mm_storel_pi((__m64 *)(void*)&y[i*18], v) +#else /* HAVE_SSE */ +#define VSAVE2(i, v) vst1_f32((float32_t *)&y[i*18], vget_low_f32(v)) +#endif /* HAVE_SSE */ + for (i = 0; i < 7; i++, y += 4*18) + { + f4 s = VADD(t[3][i], t[3][i + 1]); + VSAVE2(0, t[0][i]); + VSAVE2(1, VADD(t[2][i], s)); + VSAVE2(2, VADD(t[1][i], t[1][i + 1])); + VSAVE2(3, VADD(t[2][1 + i], s)); + } + VSAVE2(0, t[0][7]); + VSAVE2(1, VADD(t[2][7], t[3][7])); + VSAVE2(2, t[1][7]); + VSAVE2(3, t[3][7]); + } else + { +#define VSAVE4(i, v) VSTORE(&y[i*18], v) + for (i = 0; i < 7; i++, y += 4*18) + { + f4 s = VADD(t[3][i], t[3][i + 1]); + VSAVE4(0, t[0][i]); + VSAVE4(1, VADD(t[2][i], s)); + VSAVE4(2, VADD(t[1][i], t[1][i + 1])); + VSAVE4(3, VADD(t[2][1 + i], s)); + } + VSAVE4(0, t[0][7]); + VSAVE4(1, VADD(t[2][7], t[3][7])); + VSAVE4(2, t[1][7]); + VSAVE4(3, t[3][7]); + } + } else +#endif /* HAVE_SIMD */ +#ifdef MINIMP3_ONLY_SIMD + {} /* for HAVE_SIMD=1, MINIMP3_ONLY_SIMD=1 case we do not need non-intrinsic "else" branch */ +#else /* MINIMP3_ONLY_SIMD */ + for (; k < n; k++) + { + float t[4][8], *x, *y = grbuf + k; + + for (x = t[0], i = 0; i < 8; i++, x++) + { + float x0 = y[i*18]; + float x1 = y[(15 - i)*18]; + float x2 = y[(16 + i)*18]; + float x3 = y[(31 - i)*18]; + float t0 = x0 + x3; + float t1 = x1 + x2; + float t2 = (x1 - 
x2)*g_sec[3*i + 0]; + float t3 = (x0 - x3)*g_sec[3*i + 1]; + x[0] = t0 + t1; + x[8] = (t0 - t1)*g_sec[3*i + 2]; + x[16] = t3 + t2; + x[24] = (t3 - t2)*g_sec[3*i + 2]; + } + for (x = t[0], i = 0; i < 4; i++, x += 8) + { + float x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7], xt; + xt = x0 - x7; x0 += x7; + x7 = x1 - x6; x1 += x6; + x6 = x2 - x5; x2 += x5; + x5 = x3 - x4; x3 += x4; + x4 = x0 - x3; x0 += x3; + x3 = x1 - x2; x1 += x2; + x[0] = x0 + x1; + x[4] = (x0 - x1)*0.70710677f; + x5 = x5 + x6; + x6 = (x6 + x7)*0.70710677f; + x7 = x7 + xt; + x3 = (x3 + x4)*0.70710677f; + x5 -= x7*0.198912367f; /* rotate by PI/8 */ + x7 += x5*0.382683432f; + x5 -= x7*0.198912367f; + x0 = xt - x6; xt += x6; + x[1] = (xt + x7)*0.50979561f; + x[2] = (x4 + x3)*0.54119611f; + x[3] = (x0 - x5)*0.60134488f; + x[5] = (x0 + x5)*0.89997619f; + x[6] = (x4 - x3)*1.30656302f; + x[7] = (xt - x7)*2.56291556f; + + } + for (i = 0; i < 7; i++, y += 4*18) + { + y[0*18] = t[0][i]; + y[1*18] = t[2][i] + t[3][i] + t[3][i + 1]; + y[2*18] = t[1][i] + t[1][i + 1]; + y[3*18] = t[2][i + 1] + t[3][i] + t[3][i + 1]; + } + y[0*18] = t[0][7]; + y[1*18] = t[2][7] + t[3][7]; + y[2*18] = t[1][7]; + y[3*18] = t[3][7]; + } +#endif /* MINIMP3_ONLY_SIMD */ +} + +#ifndef MINIMP3_FLOAT_OUTPUT +static int16_t mp3d_scale_pcm(float sample) +{ +#if HAVE_ARMV6 + int32_t s32 = (int32_t)(sample + .5f); + s32 -= (s32 < 0); + int16_t s = (int16_t)minimp3_clip_int16_arm(s32); +#else + if (sample >= 32766.5) return (int16_t) 32767; + if (sample <= -32767.5) return (int16_t)-32768; + int16_t s = (int16_t)(sample + .5f); + s -= (s < 0); /* away from zero, to be compliant */ +#endif + return s; +} +#else /* MINIMP3_FLOAT_OUTPUT */ +static float mp3d_scale_pcm(float sample) +{ + return sample*(1.f/32768.f); +} +#endif /* MINIMP3_FLOAT_OUTPUT */ + +static void mp3d_synth_pair(mp3d_sample_t *pcm, int nch, const float *z) +{ + float a; + a = (z[14*64] - z[ 0]) * 29; + a += (z[ 1*64] + z[13*64]) * 
213; + a += (z[12*64] - z[ 2*64]) * 459; + a += (z[ 3*64] + z[11*64]) * 2037; + a += (z[10*64] - z[ 4*64]) * 5153; + a += (z[ 5*64] + z[ 9*64]) * 6574; + a += (z[ 8*64] - z[ 6*64]) * 37489; + a += z[ 7*64] * 75038; + pcm[0] = mp3d_scale_pcm(a); + + z += 2; + a = z[14*64] * 104; + a += z[12*64] * 1567; + a += z[10*64] * 9727; + a += z[ 8*64] * 64019; + a += z[ 6*64] * -9975; + a += z[ 4*64] * -45; + a += z[ 2*64] * 146; + a += z[ 0*64] * -5; + pcm[16*nch] = mp3d_scale_pcm(a); +} + +static void mp3d_synth(float *xl, mp3d_sample_t *dstl, int nch, float *lins) +{ + int i; + float *xr = xl + 576*(nch - 1); + mp3d_sample_t *dstr = dstl + (nch - 1); + + static const float g_win[] = { + -1,26,-31,208,218,401,-519,2063,2000,4788,-5517,7134,5959,35640,-39336,74992, + -1,24,-35,202,222,347,-581,2080,1952,4425,-5879,7640,5288,33791,-41176,74856, + -1,21,-38,196,225,294,-645,2087,1893,4063,-6237,8092,4561,31947,-43006,74630, + -1,19,-41,190,227,244,-711,2085,1822,3705,-6589,8492,3776,30112,-44821,74313, + -1,17,-45,183,228,197,-779,2075,1739,3351,-6935,8840,2935,28289,-46617,73908, + -1,16,-49,176,228,153,-848,2057,1644,3004,-7271,9139,2037,26482,-48390,73415, + -2,14,-53,169,227,111,-919,2032,1535,2663,-7597,9389,1082,24694,-50137,72835, + -2,13,-58,161,224,72,-991,2001,1414,2330,-7910,9592,70,22929,-51853,72169, + -2,11,-63,154,221,36,-1064,1962,1280,2006,-8209,9750,-998,21189,-53534,71420, + -2,10,-68,147,215,2,-1137,1919,1131,1692,-8491,9863,-2122,19478,-55178,70590, + -3,9,-73,139,208,-29,-1210,1870,970,1388,-8755,9935,-3300,17799,-56778,69679, + -3,8,-79,132,200,-57,-1283,1817,794,1095,-8998,9966,-4533,16155,-58333,68692, + -4,7,-85,125,189,-83,-1356,1759,605,814,-9219,9959,-5818,14548,-59838,67629, + -4,7,-91,117,177,-106,-1428,1698,402,545,-9416,9916,-7154,12980,-61289,66494, + -5,6,-97,111,163,-127,-1498,1634,185,288,-9585,9838,-8540,11455,-62684,65290 + }; + float *zlin = lins + 15*64; + const float *w = g_win; + + zlin[4*15] = xl[18*16]; + zlin[4*15 + 1] = xr[18*16]; 
+ zlin[4*15 + 2] = xl[0]; + zlin[4*15 + 3] = xr[0]; + + zlin[4*31] = xl[1 + 18*16]; + zlin[4*31 + 1] = xr[1 + 18*16]; + zlin[4*31 + 2] = xl[1]; + zlin[4*31 + 3] = xr[1]; + + mp3d_synth_pair(dstr, nch, lins + 4*15 + 1); + mp3d_synth_pair(dstr + 32*nch, nch, lins + 4*15 + 64 + 1); + mp3d_synth_pair(dstl, nch, lins + 4*15); + mp3d_synth_pair(dstl + 32*nch, nch, lins + 4*15 + 64); + +#if HAVE_SIMD + if (have_simd()) for (i = 14; i >= 0; i--) + { +#define VLOAD(k) f4 w0 = VSET(*w++); f4 w1 = VSET(*w++); f4 vz = VLD(&zlin[4*i - 64*k]); f4 vy = VLD(&zlin[4*i - 64*(15 - k)]); +#define V0(k) { VLOAD(k) b = VADD(VMUL(vz, w1), VMUL(vy, w0)) ; a = VSUB(VMUL(vz, w0), VMUL(vy, w1)); } +#define V1(k) { VLOAD(k) b = VADD(b, VADD(VMUL(vz, w1), VMUL(vy, w0))); a = VADD(a, VSUB(VMUL(vz, w0), VMUL(vy, w1))); } +#define V2(k) { VLOAD(k) b = VADD(b, VADD(VMUL(vz, w1), VMUL(vy, w0))); a = VADD(a, VSUB(VMUL(vy, w1), VMUL(vz, w0))); } + f4 a, b; + zlin[4*i] = xl[18*(31 - i)]; + zlin[4*i + 1] = xr[18*(31 - i)]; + zlin[4*i + 2] = xl[1 + 18*(31 - i)]; + zlin[4*i + 3] = xr[1 + 18*(31 - i)]; + zlin[4*i + 64] = xl[1 + 18*(1 + i)]; + zlin[4*i + 64 + 1] = xr[1 + 18*(1 + i)]; + zlin[4*i - 64 + 2] = xl[18*(1 + i)]; + zlin[4*i - 64 + 3] = xr[18*(1 + i)]; + + V0(0) V2(1) V1(2) V2(3) V1(4) V2(5) V1(6) V2(7) + + { +#ifndef MINIMP3_FLOAT_OUTPUT +#if HAVE_SSE + static const f4 g_max = { 32767.0f, 32767.0f, 32767.0f, 32767.0f }; + static const f4 g_min = { -32768.0f, -32768.0f, -32768.0f, -32768.0f }; + __m128i pcm8 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(a, g_max), g_min)), + _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(b, g_max), g_min))); + dstr[(15 - i)*nch] = _mm_extract_epi16(pcm8, 1); + dstr[(17 + i)*nch] = _mm_extract_epi16(pcm8, 5); + dstl[(15 - i)*nch] = _mm_extract_epi16(pcm8, 0); + dstl[(17 + i)*nch] = _mm_extract_epi16(pcm8, 4); + dstr[(47 - i)*nch] = _mm_extract_epi16(pcm8, 3); + dstr[(49 + i)*nch] = _mm_extract_epi16(pcm8, 7); + dstl[(47 - i)*nch] = _mm_extract_epi16(pcm8, 2); + 
dstl[(49 + i)*nch] = _mm_extract_epi16(pcm8, 6); +#else /* HAVE_SSE */ + int16x4_t pcma, pcmb; + a = VADD(a, VSET(0.5f)); + b = VADD(b, VSET(0.5f)); + pcma = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(a), vreinterpretq_s32_u32(vcltq_f32(a, VSET(0))))); + pcmb = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(b), vreinterpretq_s32_u32(vcltq_f32(b, VSET(0))))); + vst1_lane_s16(dstr + (15 - i)*nch, pcma, 1); + vst1_lane_s16(dstr + (17 + i)*nch, pcmb, 1); + vst1_lane_s16(dstl + (15 - i)*nch, pcma, 0); + vst1_lane_s16(dstl + (17 + i)*nch, pcmb, 0); + vst1_lane_s16(dstr + (47 - i)*nch, pcma, 3); + vst1_lane_s16(dstr + (49 + i)*nch, pcmb, 3); + vst1_lane_s16(dstl + (47 - i)*nch, pcma, 2); + vst1_lane_s16(dstl + (49 + i)*nch, pcmb, 2); +#endif /* HAVE_SSE */ + +#else /* MINIMP3_FLOAT_OUTPUT */ + + static const f4 g_scale = { 1.0f/32768.0f, 1.0f/32768.0f, 1.0f/32768.0f, 1.0f/32768.0f }; + a = VMUL(a, g_scale); + b = VMUL(b, g_scale); +#if HAVE_SSE + _mm_store_ss(dstr + (15 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); + _mm_store_ss(dstr + (17 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); + _mm_store_ss(dstl + (15 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0))); + _mm_store_ss(dstl + (17 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 0, 0, 0))); + _mm_store_ss(dstr + (47 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3))); + _mm_store_ss(dstr + (49 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 3, 3))); + _mm_store_ss(dstl + (47 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2))); + _mm_store_ss(dstl + (49 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 2, 2))); +#else /* HAVE_SSE */ + vst1q_lane_f32(dstr + (15 - i)*nch, a, 1); + vst1q_lane_f32(dstr + (17 + i)*nch, b, 1); + vst1q_lane_f32(dstl + (15 - i)*nch, a, 0); + vst1q_lane_f32(dstl + (17 + i)*nch, b, 0); + vst1q_lane_f32(dstr + (47 - i)*nch, a, 3); + vst1q_lane_f32(dstr + (49 + i)*nch, b, 3); + vst1q_lane_f32(dstl + (47 - i)*nch, a, 2); + vst1q_lane_f32(dstl + (49 + i)*nch, b, 2); +#endif /* 
HAVE_SSE */ +#endif /* MINIMP3_FLOAT_OUTPUT */ + } + } else +#endif /* HAVE_SIMD */ +#ifdef MINIMP3_ONLY_SIMD + {} /* for HAVE_SIMD=1, MINIMP3_ONLY_SIMD=1 case we do not need non-intrinsic "else" branch */ +#else /* MINIMP3_ONLY_SIMD */ + for (i = 14; i >= 0; i--) + { +#define LOAD(k) float w0 = *w++; float w1 = *w++; float *vz = &zlin[4*i - k*64]; float *vy = &zlin[4*i - (15 - k)*64]; +#define S0(k) { int j; LOAD(k); for (j = 0; j < 4; j++) b[j] = vz[j]*w1 + vy[j]*w0, a[j] = vz[j]*w0 - vy[j]*w1; } +#define S1(k) { int j; LOAD(k); for (j = 0; j < 4; j++) b[j] += vz[j]*w1 + vy[j]*w0, a[j] += vz[j]*w0 - vy[j]*w1; } +#define S2(k) { int j; LOAD(k); for (j = 0; j < 4; j++) b[j] += vz[j]*w1 + vy[j]*w0, a[j] += vy[j]*w1 - vz[j]*w0; } + float a[4], b[4]; + + zlin[4*i] = xl[18*(31 - i)]; + zlin[4*i + 1] = xr[18*(31 - i)]; + zlin[4*i + 2] = xl[1 + 18*(31 - i)]; + zlin[4*i + 3] = xr[1 + 18*(31 - i)]; + zlin[4*(i + 16)] = xl[1 + 18*(1 + i)]; + zlin[4*(i + 16) + 1] = xr[1 + 18*(1 + i)]; + zlin[4*(i - 16) + 2] = xl[18*(1 + i)]; + zlin[4*(i - 16) + 3] = xr[18*(1 + i)]; + + S0(0) S2(1) S1(2) S2(3) S1(4) S2(5) S1(6) S2(7) + + dstr[(15 - i)*nch] = mp3d_scale_pcm(a[1]); + dstr[(17 + i)*nch] = mp3d_scale_pcm(b[1]); + dstl[(15 - i)*nch] = mp3d_scale_pcm(a[0]); + dstl[(17 + i)*nch] = mp3d_scale_pcm(b[0]); + dstr[(47 - i)*nch] = mp3d_scale_pcm(a[3]); + dstr[(49 + i)*nch] = mp3d_scale_pcm(b[3]); + dstl[(47 - i)*nch] = mp3d_scale_pcm(a[2]); + dstl[(49 + i)*nch] = mp3d_scale_pcm(b[2]); + } +#endif /* MINIMP3_ONLY_SIMD */ +} + +static void mp3d_synth_granule(float *qmf_state, float *grbuf, int nbands, int nch, mp3d_sample_t *pcm, float *lins) +{ + int i; + for (i = 0; i < nch; i++) + { + mp3d_DCT_II(grbuf + 576*i, nbands); + } + + memcpy(lins, qmf_state, sizeof(float)*15*64); + + for (i = 0; i < nbands; i += 2) + { + mp3d_synth(grbuf + i, pcm + 32*nch*i, nch, lins + i*64); + } +#ifndef MINIMP3_NONSTANDARD_BUT_LOGICAL + if (nch == 1) + { + for (i = 0; i < 15*64; i += 2) + { + qmf_state[i] = 
lins[nbands*64 + i]; + } + } else +#endif /* MINIMP3_NONSTANDARD_BUT_LOGICAL */ + { + memcpy(qmf_state, lins + nbands*64, sizeof(float)*15*64); + } +} + +static int mp3d_match_frame(const uint8_t *hdr, int mp3_bytes, int frame_bytes) +{ + int i, nmatch; + for (i = 0, nmatch = 0; nmatch < MAX_FRAME_SYNC_MATCHES; nmatch++) + { + i += hdr_frame_bytes(hdr + i, frame_bytes) + hdr_padding(hdr + i); + if (i + HDR_SIZE > mp3_bytes) + return nmatch > 0; + if (!hdr_compare(hdr, hdr + i)) + return 0; + } + return 1; +} + +static int mp3d_find_frame(const uint8_t *mp3, int mp3_bytes, int *free_format_bytes, int *ptr_frame_bytes) +{ + int i, k; + for (i = 0; i < mp3_bytes - HDR_SIZE; i++, mp3++) + { + if (hdr_valid(mp3)) + { + int frame_bytes = hdr_frame_bytes(mp3, *free_format_bytes); + int frame_and_padding = frame_bytes + hdr_padding(mp3); + + for (k = HDR_SIZE; !frame_bytes && k < MAX_FREE_FORMAT_FRAME_SIZE && i + 2*k < mp3_bytes - HDR_SIZE; k++) + { + if (hdr_compare(mp3, mp3 + k)) + { + int fb = k - hdr_padding(mp3); + int nextfb = fb + hdr_padding(mp3 + k); + if (i + k + nextfb + HDR_SIZE > mp3_bytes || !hdr_compare(mp3, mp3 + k + nextfb)) + continue; + frame_and_padding = k; + frame_bytes = fb; + *free_format_bytes = fb; + } + } + if ((frame_bytes && i + frame_and_padding <= mp3_bytes && + mp3d_match_frame(mp3, mp3_bytes - i, frame_bytes)) || + (!i && frame_and_padding == mp3_bytes)) + { + *ptr_frame_bytes = frame_and_padding; + return i; + } + *free_format_bytes = 0; + } + } + *ptr_frame_bytes = 0; + return mp3_bytes; +} + +void mp3dec_init(mp3dec_t *dec) +{ + dec->header[0] = 0; +} + +int mp3dec_decode_frame(mp3dec_t *dec, const uint8_t *mp3, int mp3_bytes, mp3d_sample_t *pcm, mp3dec_frame_info_t *info) +{ + int i = 0, igr, frame_size = 0, success = 1; + const uint8_t *hdr; + bs_t bs_frame[1]; + mp3dec_scratch_t scratch; + + if (mp3_bytes > 4 && dec->header[0] == 0xff && hdr_compare(dec->header, mp3)) + { + frame_size = hdr_frame_bytes(mp3, dec->free_format_bytes) + 
hdr_padding(mp3); + if (frame_size != mp3_bytes && (frame_size + HDR_SIZE > mp3_bytes || !hdr_compare(mp3, mp3 + frame_size))) + { + frame_size = 0; + } + } + if (!frame_size) + { + memset(dec, 0, sizeof(mp3dec_t)); + i = mp3d_find_frame(mp3, mp3_bytes, &dec->free_format_bytes, &frame_size); + if (!frame_size || i + frame_size > mp3_bytes) + { + info->frame_bytes = i; + return 0; + } + } + + hdr = mp3 + i; + memcpy(dec->header, hdr, HDR_SIZE); + info->frame_bytes = i + frame_size; + info->frame_offset = i; + info->channels = HDR_IS_MONO(hdr) ? 1 : 2; + info->hz = hdr_sample_rate_hz(hdr); + info->layer = 4 - HDR_GET_LAYER(hdr); + info->bitrate_kbps = hdr_bitrate_kbps(hdr); + + if (!pcm) + { + return hdr_frame_samples(hdr); + } + + bs_init(bs_frame, hdr + HDR_SIZE, frame_size - HDR_SIZE); + if (HDR_IS_CRC(hdr)) + { + get_bits(bs_frame, 16); + } + + if (info->layer == 3) + { + int main_data_begin = L3_read_side_info(bs_frame, scratch.gr_info, hdr); + if (main_data_begin < 0 || bs_frame->pos > bs_frame->limit) + { + mp3dec_init(dec); + return 0; + } + success = L3_restore_reservoir(dec, bs_frame, &scratch, main_data_begin); + if (success) + { + for (igr = 0; igr < (HDR_TEST_MPEG1(hdr) ? 
2 : 1); igr++, pcm += 576*info->channels) + { + memset(scratch.grbuf[0], 0, 576*2*sizeof(float)); + L3_decode(dec, &scratch, scratch.gr_info + igr*info->channels, info->channels); + mp3d_synth_granule(dec->qmf_state, scratch.grbuf[0], 18, info->channels, pcm, scratch.syn[0]); + } + } + L3_save_reservoir(dec, &scratch); + } else + { +#ifdef MINIMP3_ONLY_MP3 + return 0; +#else /* MINIMP3_ONLY_MP3 */ + L12_scale_info sci[1]; + L12_read_scale_info(hdr, bs_frame, sci); + + memset(scratch.grbuf[0], 0, 576*2*sizeof(float)); + for (i = 0, igr = 0; igr < 3; igr++) + { + if (12 == (i += L12_dequantize_granule(scratch.grbuf[0] + i, bs_frame, sci, info->layer | 1))) + { + i = 0; + L12_apply_scf_384(sci, sci->scf + igr, scratch.grbuf[0]); + mp3d_synth_granule(dec->qmf_state, scratch.grbuf[0], 12, info->channels, pcm, scratch.syn[0]); + memset(scratch.grbuf[0], 0, 576*2*sizeof(float)); + pcm += 384*info->channels; + } + if (bs_frame->pos > bs_frame->limit) + { + mp3dec_init(dec); + return 0; + } + } +#endif /* MINIMP3_ONLY_MP3 */ + } + return success*hdr_frame_samples(dec->header); +} + +#ifdef MINIMP3_FLOAT_OUTPUT +void mp3dec_f32_to_s16(const float *in, int16_t *out, int num_samples) +{ + int i = 0; +#if HAVE_SIMD + int aligned_count = num_samples & ~7; + for(; i < aligned_count; i += 8) + { + static const f4 g_scale = { 32768.0f, 32768.0f, 32768.0f, 32768.0f }; + f4 a = VMUL(VLD(&in[i ]), g_scale); + f4 b = VMUL(VLD(&in[i+4]), g_scale); +#if HAVE_SSE + static const f4 g_max = { 32767.0f, 32767.0f, 32767.0f, 32767.0f }; + static const f4 g_min = { -32768.0f, -32768.0f, -32768.0f, -32768.0f }; + __m128i pcm8 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(a, g_max), g_min)), + _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(b, g_max), g_min))); + out[i ] = _mm_extract_epi16(pcm8, 0); + out[i+1] = _mm_extract_epi16(pcm8, 1); + out[i+2] = _mm_extract_epi16(pcm8, 2); + out[i+3] = _mm_extract_epi16(pcm8, 3); + out[i+4] = _mm_extract_epi16(pcm8, 4); + out[i+5] = 
_mm_extract_epi16(pcm8, 5); + out[i+6] = _mm_extract_epi16(pcm8, 6); + out[i+7] = _mm_extract_epi16(pcm8, 7); +#else /* HAVE_SSE */ + int16x4_t pcma, pcmb; + a = VADD(a, VSET(0.5f)); + b = VADD(b, VSET(0.5f)); + pcma = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(a), vreinterpretq_s32_u32(vcltq_f32(a, VSET(0))))); + pcmb = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(b), vreinterpretq_s32_u32(vcltq_f32(b, VSET(0))))); + vst1_lane_s16(out+i , pcma, 0); + vst1_lane_s16(out+i+1, pcma, 1); + vst1_lane_s16(out+i+2, pcma, 2); + vst1_lane_s16(out+i+3, pcma, 3); + vst1_lane_s16(out+i+4, pcmb, 0); + vst1_lane_s16(out+i+5, pcmb, 1); + vst1_lane_s16(out+i+6, pcmb, 2); + vst1_lane_s16(out+i+7, pcmb, 3); +#endif /* HAVE_SSE */ + } +#endif /* HAVE_SIMD */ + for(; i < num_samples; i++) + { + float sample = in[i] * 32768.0f; + if (sample >= 32766.5) + out[i] = (int16_t) 32767; + else if (sample <= -32767.5) + out[i] = (int16_t)-32768; + else + { + int16_t s = (int16_t)(sample + .5f); + s -= (s < 0); /* away from zero, to be compliant */ + out[i] = s; + } + } +} +#endif /* MINIMP3_FLOAT_OUTPUT */ +#endif /* MINIMP3_IMPLEMENTATION && !_MINIMP3_IMPLEMENTATION_GUARD */ diff --git a/vae.h b/vae.h index 58fd558..4713581 100644 --- a/vae.h +++ b/vae.h @@ -556,3 +556,183 @@ static void vae_ggml_free(VAEGGML * m) { if (m->cpu_backend) ggml_backend_free(m->cpu_backend); *m = {}; } + +// --------------------------------------------------------------------------- +// VAE Encoder (audio -> 64-d latents @ 25Hz for reference timbre) +// Oobleck encoder: conv1(2->128) -> 5 blocks (stride 2,4,4,8,8) -> snake -> conv2(2048->128) +// Output 128 = mean(64) + scale(64); we use mean only. +// Requires encoder.* tensors in the same VAE GGUF (full autoencoder export). 
+// --------------------------------------------------------------------------- +struct VAEEncoderBlock { + VAEResUnit ru[3]; + struct ggml_tensor * sa, * sb; + struct ggml_tensor * c1w, * c1b; + int in_ch, out_ch, stride; +}; + +struct VAEEncoderGGML { + struct ggml_tensor * c1w, * c1b; + VAEEncoderBlock blk[5]; + struct ggml_tensor * sa, * sb; + struct ggml_tensor * c2w, * c2b; + + ggml_backend_t backend; + ggml_backend_t cpu_backend; + ggml_backend_sched_t sched; + ggml_backend_buffer_t buf; + struct ggml_context * weight_ctx; + bool has_encoder; +}; + +static bool vae_encoder_load(VAEEncoderGGML * m, const char * path) { + m->has_encoder = false; + GGUFModel gf = {}; + if (!gf_load(&gf, path)) { + fprintf(stderr, "[VAE Encoder] cannot load %s\n", path); + return false; + } + if (!ggml_get_tensor(gf.meta, "encoder.conv1.weight_v")) { + gf_close(&gf); + fprintf(stderr, "[VAE Encoder] no encoder.* in %s (decoder-only GGUF). Use a full VAE GGUF for reference_audio WAV.\n", path); + return false; + } + + static const int enc_strides[] = {2, 4, 4, 8, 8}; + static const int enc_in_ch[] = {128, 256, 512, 1024, 2048}; + static const int enc_out_ch[] = {256, 512, 1024, 2048, 2048}; + static const int dilations[] = {1, 3, 9}; + + size_t ctx_size = ggml_tensor_overhead() * 200; + struct ggml_init_params p = { ctx_size, NULL, true }; + m->weight_ctx = ggml_init(p); + struct ggml_context * ctx = m->weight_ctx; + + m->c1w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 7, 2, 128); + m->c1b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128); + + for (int i = 0; i < 5; i++) { + VAEEncoderBlock & b = m->blk[i]; + b.in_ch = enc_in_ch[i]; + b.out_ch = enc_out_ch[i]; + b.stride = enc_strides[i]; + int C = b.in_ch; + b.sa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + b.sb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + b.c1w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 2 * b.stride, C, b.out_ch); + b.c1b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, b.out_ch); + for (int r = 0; r < 3; r++) { + 
VAEResUnit & ru = b.ru[r]; + ru.dilation = dilations[r]; + ru.s1a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.s1b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.c1w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 7, C, C); + ru.c1b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, C); + ru.s2a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.s2b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.c2w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 1, C, C); + ru.c2b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, C); + } + } + m->sa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 2048); + m->sb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 2048); + m->c2w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 3, 2048, 128); + m->c2b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128); + + BackendPair bp = backend_init("VAE-Encoder"); + m->backend = bp.backend; + m->cpu_backend = bp.cpu_backend; + m->sched = backend_sched_new(bp, 8192); + m->buf = ggml_backend_alloc_ctx_tensors(ctx, m->backend); + + vae_fuse_wn(m->c1w, gf, "encoder.conv1"); + vae_load_bias(m->c1b, gf, "encoder.conv1.bias"); + + for (int i = 0; i < 5; i++) { + VAEEncoderBlock & b = m->blk[i]; + std::string pfx = "encoder.block." 
+ std::to_string(i); + for (int r = 0; r < 3; r++) { + std::string rp = pfx + ".res_unit" + std::to_string(r + 1); + vae_load_snake(b.ru[r].s1a, gf, rp + ".snake1.alpha"); + vae_load_snake_inv(b.ru[r].s1b, gf, rp + ".snake1.beta"); + vae_fuse_wn(b.ru[r].c1w, gf, rp + ".conv1"); + vae_load_bias(b.ru[r].c1b, gf, rp + ".conv1.bias"); + vae_load_snake(b.ru[r].s2a, gf, rp + ".snake2.alpha"); + vae_load_snake_inv(b.ru[r].s2b, gf, rp + ".snake2.beta"); + vae_fuse_wn(b.ru[r].c2w, gf, rp + ".conv2"); + vae_load_bias(b.ru[r].c2b, gf, rp + ".conv2.bias"); + } + vae_load_snake(b.sa, gf, pfx + ".snake1.alpha"); + vae_load_snake_inv(b.sb, gf, pfx + ".snake1.beta"); + vae_fuse_wn(b.c1w, gf, pfx + ".conv1"); + vae_load_bias(b.c1b, gf, pfx + ".conv1.bias"); + } + vae_load_snake(m->sa, gf, "encoder.snake1.alpha"); + vae_load_snake_inv(m->sb, gf, "encoder.snake1.beta"); + vae_fuse_wn(m->c2w, gf, "encoder.conv2"); + vae_load_bias(m->c2b, gf, "encoder.conv2.bias"); + + gf_close(&gf); + m->has_encoder = true; + fprintf(stderr, "[VAE Encoder] loaded (2->128->...->2048->128, 64-d mean)\n"); + return true; +} + +static int vae_encoder_forward(VAEEncoderGGML * m, const float * audio, int T_audio, + float * latent_out) { + if (!m->has_encoder || T_audio < 1920) return -1; + int T_latent = T_audio / 1920; + + ggml_backend_sched_reset(m->sched); + size_t ctx_size = 4096 * ggml_tensor_overhead() + ggml_graph_overhead(); + struct ggml_init_params gp = { ctx_size, NULL, true }; + struct ggml_context * ctx = ggml_init(gp); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 4096, false); + + struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, T_audio, 2); + ggml_set_name(x, "audio_in"); + ggml_set_input(x); + x = vae_conv1d(ctx, m->c1w, m->c1b, x, 1, 3, 1); + + for (int i = 0; i < 5; i++) { + VAEEncoderBlock & b = m->blk[i]; + for (int r = 0; r < 3; r++) + x = vae_res_unit(ctx, &b.ru[r], x); + x = vae_snake(ctx, x, b.sa, b.sb); + int pad = (int)((float)b.stride / 2.0f + 0.5f); + x = 
vae_conv1d(ctx, b.c1w, b.c1b, x, b.stride, pad, 1); + } + + x = vae_snake(ctx, x, m->sa, m->sb); + x = vae_conv1d(ctx, m->c2w, m->c2b, x, 1, 1, 1); + ggml_set_name(x, "enc_out"); + ggml_set_output(x); + + ggml_build_forward_expand(gf, x); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + ggml_free(ctx); + return -1; + } + ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "audio_in"), audio, 0, (size_t)T_audio * 2 * sizeof(float)); + ggml_backend_sched_graph_compute(m->sched, gf); + ggml_backend_sched_synchronize(m->sched); + + struct ggml_tensor * out = ggml_graph_get_tensor(gf, "enc_out"); + std::vector tmp(128 * T_latent); + ggml_backend_tensor_get(out, tmp.data(), 0, 128 * (size_t)T_latent * sizeof(float)); + for (int t = 0; t < T_latent; t++) + for (int c = 0; c < 64; c++) + latent_out[t * 64 + c] = tmp[t * 128 + c]; + + ggml_backend_sched_reset(m->sched); + ggml_free(ctx); + return T_latent; +} + +static void vae_encoder_free(VAEEncoderGGML * m) { + if (m->sched) ggml_backend_sched_free(m->sched); + if (m->buf) ggml_backend_buffer_free(m->buf); + if (m->weight_ctx) ggml_free(m->weight_ctx); + if (m->backend && m->backend != m->cpu_backend) ggml_backend_free(m->backend); + if (m->cpu_backend) ggml_backend_free(m->cpu_backend); + *m = {}; +} diff --git a/wav.h b/wav.h new file mode 100644 index 0000000..ded473d --- /dev/null +++ b/wav.h @@ -0,0 +1,100 @@ +// wav.h: minimal WAV loader for reference audio (stereo 48kHz float out) +// No Python or external deps. Handles 16-bit PCM, mono/stereo, resamples to 48kHz if needed. + +#pragma once + +#include +#include +#include +#include + +// Load WAV file into stereo float32 at 48kHz. +// Out: interleaved L,R,L,R,... length = num_samples (both channels). +// Returns num_samples (per channel), or -1 on error. 
+static int wav_load_48k_stereo(const char * path, std::vector * out) { + FILE * f = fopen(path, "rb"); + if (!f) return -1; + + char riff[4], fmt[4]; + if (fread(riff, 1, 4, f) != 4 || memcmp(riff, "RIFF", 4) != 0) { + fclose(f); + return -1; + } + uint32_t file_len; + if (fread(&file_len, 4, 1, f) != 1) { fclose(f); return -1; } + if (fread(fmt, 1, 4, f) != 4 || memcmp(fmt, "WAVE", 4) != 0) { + fclose(f); + return -1; + } + + uint16_t channels = 2, bits = 16; + uint32_t sample_rate = 48000; + bool found_fmt = false; + + while (1) { + char chunk_id[4]; + if (fread(chunk_id, 1, 4, f) != 4) break; + uint32_t chunk_size; + if (fread(&chunk_size, 4, 1, f) != 1) break; + long chunk_start = ftell(f); + + if (memcmp(chunk_id, "fmt ", 4) == 0 && chunk_size >= 16) { + uint16_t fmt_tag, block_align; + uint32_t byte_rate; + if (fread(&fmt_tag, 2, 1, f) != 1) break; + if (fread(&channels, 2, 1, f) != 1) break; + if (fread(&sample_rate, 4, 1, f) != 1) break; + if (fread(&byte_rate, 4, 1, f) != 1) break; + if (fread(&block_align, 2, 1, f) != 1) break; + if (fread(&bits, 2, 1, f) != 1) break; + found_fmt = true; + } else if (memcmp(chunk_id, "data", 4) == 0 && found_fmt) { + size_t num_bytes = chunk_size; + size_t num_samples = num_bytes / (channels * (bits / 8)); + if (num_samples == 0) { fclose(f); return -1; } + + std::vector raw(num_samples * channels); + if (fread(raw.data(), 2, raw.size(), f) != raw.size()) { + fclose(f); + return -1; + } + + out->resize(num_samples * 2); + float scale = 1.0f / 32768.0f; + if (channels == 1) { + for (size_t i = 0; i < num_samples; i++) { + float s = (float)raw[i] * scale; + (*out)[i * 2] = s; + (*out)[i * 2 + 1] = s; + } + } else { + for (size_t i = 0; i < num_samples * 2; i++) + (*out)[i] = (float)raw[i] * scale; + } + + fclose(f); + + // Resample to 48kHz if needed (linear interpolation) + if (sample_rate != 48000) { + size_t in_len = num_samples; + size_t out_len = (size_t)((double)in_len * 48000.0 / (double)sample_rate); + std::vector 
resampled(out_len * 2); + for (size_t i = 0; i < out_len; i++) { + double t = (double)i * (double)in_len / (double)out_len; + size_t i0 = (size_t)t; + size_t i1 = std::min(i0 + 1, in_len - 1); + float w = (float)(t - (double)i0); + for (int c = 0; c < 2; c++) + resampled[i * 2 + c] = (*out)[i0 * 2 + c] * (1.0f - w) + (*out)[i1 * 2 + c] * w; + } + *out = std::move(resampled); + return (int)out_len; + } + return (int)num_samples; + } + + fseek(f, chunk_start + (long)chunk_size, SEEK_SET); + } + fclose(f); + return -1; +} From 0d22a861b4c1a0ac1459a5e2d35e323a3db15bd3 Mon Sep 17 00:00:00 2001 From: qxip Date: Sat, 28 Feb 2026 19:33:38 +0100 Subject: [PATCH 02/17] Add cover examples, harden test-generation cache, document examples - examples/cover.json + cover.sh: cover mode (precomputed audio_codes, no LLM) - examples/cover-reference.json + cover-reference.sh: cover + reference_audio (WAV/MP3) - README: list cover and cover-reference in examples - test-generation: cache key with runner.os, restore-keys per GitHub docs Made-with: Cursor --- .github/workflows/test-generation.yml | 7 +++++-- README.md | 2 ++ examples/cover-reference.json | 16 ++++++++++++++++ examples/cover-reference.sh | 20 ++++++++++++++++++++ examples/cover.json | 15 +++++++++++++++ examples/cover.sh | 15 +++++++++++++++ 6 files changed, 73 insertions(+), 2 deletions(-) create mode 100644 examples/cover-reference.json create mode 100755 examples/cover-reference.sh create mode 100644 examples/cover.json create mode 100755 examples/cover.sh diff --git a/.github/workflows/test-generation.yml b/.github/workflows/test-generation.yml index 3c9547c..18afb24 100644 --- a/.github/workflows/test-generation.yml +++ b/.github/workflows/test-generation.yml @@ -26,13 +26,16 @@ jobs: cmake .. -DGGML_BLAS=ON cmake --build . 
--config Release -j$(nproc) + # Restore/save model artifacts (see https://docs.github.com/en/actions/reference/workflows-and-actions/dependency-caching) - name: Cache models id: cache-models uses: actions/cache@v4 with: path: models - key: acestep-models-q8-${{ hashFiles('models.sh') }} - restore-keys: acestep-models-q8- + key: ${{ runner.os }}-acestep-models-q8-${{ hashFiles('models.sh') }} + restore-keys: | + ${{ runner.os }}-acestep-models-q8- + ${{ runner.os }}-acestep-models- - name: Download models if: steps.cache-models.outputs.cache-hit != 'true' diff --git a/README.md b/README.md index 55ac16d..87a0e34 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,8 @@ cd examples ./partial.sh # caption + lyrics + duration ./full.sh # all metadata provided ./dit-only.sh # skip LLM, DiT from noise +./cover.sh # cover mode: decode precomputed audio_codes (no LLM) +./cover-reference.sh # cover + reference_audio for timbre (WAV/MP3; needs reference.wav or .mp3) ./test-reference.sh # reference_audio (WAV or MP3) + audio_cover_strength ``` diff --git a/examples/cover-reference.json b/examples/cover-reference.json new file mode 100644 index 0000000..313d419 --- /dev/null +++ b/examples/cover-reference.json @@ -0,0 +1,16 @@ +{ + "task_type": "cover", + "caption": "Cover with timbre from reference WAV/MP3", + "duration": 10, + "bpm": 83, + "keyscale": "G major", + "timesignature": "4", + "vocal_language": "fr", + "inference_steps": 8, + "guidance_scale": 1, + "shift": 3, + "seed": 42, + "audio_cover_strength": 0.9, + "audio_codes": 
"43316,18426,13366,59455,17783,49303,7423,29855,37158,37157,62317,61455,12847,19583,57031,34656,20254,10770,11416,15905,31413,23339,47091,12198,49531,37355,33090,38645,40707,16324,61436,46095,13941,5287,2239,13975,63815,2757,4862,13571,63495,39,29887,49426,12696,50847,40498,61056,25666,12989,23987,54763,25485,31683,28554,25355,16373,28995,2351,1655,7940,55831,34359,15350,15277,11717,20476,52239,5015,19807,24087,3559,20471,34193,32552,60999,29360,25338,38873,16768,17912,27584,24008,1528,449,25563,52684,53223,42183,37215,12343,39431,26055,28148,57286,38382,28863,7191,58397,18991,7695,30716,36784,12687,8707,25649,33718,3202,23035,10747,26354,63965,16260,11223,45679,14343,8679,49351,52927,2535,19207,46447,49615,12694,21110,46597,60991,27711,49751,54656,30448,33125,13585,29256,5161,42434,11753,39402,60354,21953,39532,14282,52160,34248,16304,4671,14172,5127,25991,15343,8583,61902,16328,31700,48415,28879,11215,52715,25541,11203,7695,63951,33803,40453,17750,28006,8231,40464,3136,51006,23839,18711,18711,18711,18711,3343,3279,2823,16071,3271,2823,2319,55815,40260,16215,12047,16631,26927,21863,20060,10166,51070,39,12099,63440,18418,25271,10792,2128,44166,53750,41263,44247,61287,42303,27614,21997,24879,38799,12648,38341,36833,19408,11769,2979,63979,44239,25559,27591,17626,44087,33796,4901,53176,57399,37180,38024,9216,63485,2005,13656,15914,45576,29194,45624,62332,53237,63988,40332,20486,31367,10951,46207,22231,63479,38877,17262,49335,42045,57388", + "reference_audio": "reference.wav" +} \ No newline at end of file diff --git a/examples/cover-reference.sh b/examples/cover-reference.sh new file mode 100755 index 0000000..2687b09 --- /dev/null +++ b/examples/cover-reference.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Cover mode with reference timbre: audio_codes + reference_audio (WAV or MP3). +# Put a WAV/MP3 at reference.wav (or reference.mp3) or set reference_audio in cover-reference.json. +# Requires VAE GGUF with encoder weights (same as request-reference / test-reference). 
+set -eu +cd "$(dirname "$0")" + +if [ ! -f "reference.wav" ] && [ ! -f "reference.mp3" ]; then + echo "No reference.wav or reference.mp3 found. Copy a file to reference.wav (or .mp3), or set reference_audio in cover-reference.json." + echo "Then run: $0" + exit 1 +fi + +../build/dit-vae \ + --request cover-reference.json \ + --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit ../models/acestep-v15-turbo-Q8_0.gguf \ + --vae ../models/vae-BF16.gguf + +echo "Done. Check cover-reference0.wav" diff --git a/examples/cover.json b/examples/cover.json new file mode 100644 index 0000000..82cbaaa --- /dev/null +++ b/examples/cover.json @@ -0,0 +1,15 @@ +{ + "task_type": "cover", + "caption": "Re-synthesize from precomputed codes (e.g. from a previous ace-qwen3 run)", + "duration": 10, + "bpm": 83, + "keyscale": "G major", + "timesignature": "4", + "vocal_language": "fr", + "inference_steps": 8, + "guidance_scale": 1, + "shift": 3, + "seed": 42, + "audio_cover_strength": 0.9, + "audio_codes": 
"43316,18426,13366,59455,17783,49303,7423,29855,37158,37157,62317,61455,12847,19583,57031,34656,20254,10770,11416,15905,31413,23339,47091,12198,49531,37355,33090,38645,40707,16324,61436,46095,13941,5287,2239,13975,63815,2757,4862,13571,63495,39,29887,49426,12696,50847,40498,61056,25666,12989,23987,54763,25485,31683,28554,25355,16373,28995,2351,1655,7940,55831,34359,15350,15277,11717,20476,52239,5015,19807,24087,3559,20471,34193,32552,60999,29360,25338,38873,16768,17912,27584,24008,1528,449,25563,52684,53223,42183,37215,12343,39431,26055,28148,57286,38382,28863,7191,58397,18991,7695,30716,36784,12687,8707,25649,33718,3202,23035,10747,26354,63965,16260,11223,45679,14343,8679,49351,52927,2535,19207,46447,49615,12694,21110,46597,60991,27711,49751,54656,30448,33125,13585,29256,5161,42434,11753,39402,60354,21953,39532,14282,52160,34248,16304,4671,14172,5127,25991,15343,8583,61902,16328,31700,48415,28879,11215,52715,25541,11203,7695,63951,33803,40453,17750,28006,8231,40464,3136,51006,23839,18711,18711,18711,18711,3343,3279,2823,16071,3271,2823,2319,55815,40260,16215,12047,16631,26927,21863,20060,10166,51070,39,12099,63440,18418,25271,10792,2128,44166,53750,41263,44247,61287,42303,27614,21997,24879,38799,12648,38341,36833,19408,11769,2979,63979,44239,25559,27591,17626,44087,33796,4901,53176,57399,37180,38024,9216,63485,2005,13656,15914,45576,29194,45624,62332,53237,63988,40332,20486,31367,10951,46207,22231,63479,38877,17262,49335,42045,57388" +} \ No newline at end of file diff --git a/examples/cover.sh b/examples/cover.sh new file mode 100755 index 0000000..14d340e --- /dev/null +++ b/examples/cover.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Cover mode: decode precomputed audio_codes to WAV (no LLM). 
+# Use cover.json as-is, or replace audio_codes with output from a previous run: +# ../build/ace-qwen3 --request simple.json --model ../models/acestep-5Hz-lm-4B-Q8_0.gguf +# # then use simple0.json as input, or copy its audio_codes into cover.json +set -eu +cd "$(dirname "$0")" + +../build/dit-vae \ + --request cover.json \ + --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit ../models/acestep-v15-turbo-Q8_0.gguf \ + --vae ../models/vae-BF16.gguf + +echo "Done. Check cover0.wav" From a637e598ae959a8915176113d43c4e84235f5ddf Mon Sep 17 00:00:00 2001 From: qxip Date: Sat, 28 Feb 2026 20:10:34 +0100 Subject: [PATCH 03/17] VAE encoder fix, local test script, full-pipeline output path - vae.h: use encoder output tensor length (out->ne[0]) for read size instead of T_audio/1920 to fix tensor read out of bounds on reference_audio encode - dit-vae: support WAV and MP3 for reference_audio (single load path) - tests/run-generation-tests.sh: local run of same three steps as CI; step 3 copies fixture to request.json so ace-qwen3 writes request0.json - test-generation.yml: same request.json copy for full-pipeline step - README: validate locally first, then CI - .gitignore: request.json, request0.json, tests/fixtures/*0.json Made-with: Cursor --- .github/workflows/test-generation.yml | 6 +- .gitignore | 3 + README.md | 2 +- dit-vae.cpp | 51 ++++++++-------- tests/run-generation-tests.sh | 83 +++++++++++++++++++++++++++ vae.h | 8 ++- 6 files changed, 120 insertions(+), 33 deletions(-) create mode 100755 tests/run-generation-tests.sh diff --git a/.github/workflows/test-generation.yml b/.github/workflows/test-generation.yml index 18afb24..c63b3c0 100644 --- a/.github/workflows/test-generation.yml +++ b/.github/workflows/test-generation.yml @@ -1,5 +1,6 @@ # Build, download models (cached), and run short generation tests for various modes. -# Runs on release (published) or manual trigger only. Uses short duration (5s) and few steps (4). 
+# Validate locally first: from repo root run tests/run-generation-tests.sh (after build + ./models.sh). +# CI runs the same steps. Trigger: release (published) or workflow_dispatch only. name: Test generation on: @@ -63,8 +64,9 @@ jobs: - name: Test full pipeline (LLM + DiT, short) run: | + cp tests/fixtures/ci-text2music.json request.json ./build/ace-qwen3 \ - --request tests/fixtures/ci-text2music.json \ + --request request.json \ --model models/acestep-5Hz-lm-4B-Q8_0.gguf test -f request0.json ./build/dit-vae \ diff --git a/.gitignore b/.gitignore index 0fa15e6..91ddabb 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,9 @@ tests/*/ !tests/fixtures/ !tests/fixtures/*.json +request.json +request0.json +tests/fixtures/*0.json checkpoints/ models/ __pycache__/ diff --git a/README.md b/README.md index 87a0e34..178fe39 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ Builds two binaries: `ace-qwen3` (LLM) and `dit-vae` (DiT + VAE). **CI (GitHub Actions)** - **Build**: on every push/PR, builds on Ubuntu (BLAS) and macOS (Metal); smoke test runs each binary `--help`. -- **Test generation**: on manual trigger or push to `main`; builds, caches models, then runs short (5 s, 4 steps) generation for text2music, cover, and full pipeline (LLM → DiT → WAV). See `.github/workflows/`. +- **Test generation**: on release or manual trigger only; runs the same checks as **local** `tests/run-generation-tests.sh`. Validate locally first (build + `./models.sh`, then `tests/run-generation-tests.sh`), then use CI to confirm. See `.github/workflows/`. 
## Models diff --git a/dit-vae.cpp b/dit-vae.cpp index 41582c9..5f57194 100644 --- a/dit-vae.cpp +++ b/dit-vae.cpp @@ -396,36 +396,33 @@ int main(int argc, char ** argv) { int S_ref_actual = S_ref; if (!req.reference_audio.empty()) { const std::string & ref_path = req.reference_audio; - if (ref_path.size() >= 4 && ref_path.compare(ref_path.size() - 4, 4, ".wav") == 0) { - std::vector wav_stereo; - int n_samples = load_audio_48k_stereo(ref_path.c_str(), &wav_stereo); - if (n_samples > 0 && have_vae) { - VAEEncoderGGML enc = {}; - if (vae_encoder_load(&enc, vae_gguf)) { - int T_audio = n_samples; - if (T_audio >= 1920) { - int T_lat = T_audio / 1920; - std::vector enc_out((size_t)T_lat * 64); - T_lat = vae_encoder_forward(&enc, wav_stereo.data(), T_audio, enc_out.data()); - if (T_lat > 0) { - size_t copy_frames = (size_t)(T_lat < S_ref ? T_lat : S_ref); - memcpy(timbre_feats.data(), enc_out.data(), copy_frames * 64 * sizeof(float)); - if (T_lat < S_ref) - memcpy(timbre_feats.data() + copy_frames * 64, silence_full.data(), - (S_ref - (int)copy_frames) * 64 * sizeof(float)); - S_ref_actual = (int)copy_frames; - if (T_lat > S_ref) S_ref_actual = S_ref; - timbre_ptr = timbre_feats.data(); - fprintf(stderr, "[Timbre] encoded %s -> %d frames (25Hz)\n", ref_path.c_str(), S_ref_actual); - } + std::vector wav_stereo; + int n_samples = load_audio_48k_stereo(ref_path.c_str(), &wav_stereo); + if (n_samples > 0 && have_vae) { + VAEEncoderGGML enc = {}; + if (vae_encoder_load(&enc, vae_gguf)) { + int T_audio = n_samples; + if (T_audio >= 1920) { + std::vector enc_out((size_t)S_ref * 64); + int T_lat = vae_encoder_forward(&enc, wav_stereo.data(), T_audio, enc_out.data()); + if (T_lat > 0) { + size_t copy_frames = (size_t)(T_lat < S_ref ? 
T_lat : S_ref); + memcpy(timbre_feats.data(), enc_out.data(), copy_frames * 64 * sizeof(float)); + if (T_lat < S_ref) + memcpy(timbre_feats.data() + copy_frames * 64, silence_full.data(), + (S_ref - (int)copy_frames) * 64 * sizeof(float)); + S_ref_actual = (int)copy_frames; + if (T_lat > S_ref) S_ref_actual = S_ref; + timbre_ptr = timbre_feats.data(); + fprintf(stderr, "[Timbre] encoded %s -> %d frames (25Hz)\n", ref_path.c_str(), S_ref_actual); } - vae_encoder_free(&enc); } - } else if (n_samples <= 0) { - fprintf(stderr, "[Timbre] WARNING: cannot load WAV %s, using silence\n", ref_path.c_str()); - } else if (!have_vae) { - fprintf(stderr, "[Timbre] WAV requires --vae (with encoder weights); using silence\n"); + vae_encoder_free(&enc); } + } else if (n_samples <= 0) { + fprintf(stderr, "[Timbre] WARNING: cannot load audio %s (use .wav or .mp3), using silence\n", ref_path.c_str()); + } else if (!have_vae) { + fprintf(stderr, "[Timbre] reference_audio requires --vae (with encoder weights); using silence\n"); } } diff --git a/tests/run-generation-tests.sh b/tests/run-generation-tests.sh new file mode 100755 index 0000000..666698c --- /dev/null +++ b/tests/run-generation-tests.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +# Run the same generation tests as the GitHub Action (test-generation.yml). +# Use this to validate locally before pushing. No assumptions: build and models required. +# +# From repo root: +# ./models.sh # once: download Q8_0 + VAE into models/ +# mkdir -p build && cd build && cmake .. && cmake --build . --config Release +# cd .. && tests/run-generation-tests.sh + +set -e +cd "$(dirname "$0")/.." +REPO_ROOT="$PWD" + +# --- Build --- +if [ ! -f build/dit-vae ] || [ ! -f build/ace-qwen3 ]; then + echo "Missing build/dit-vae or build/ace-qwen3. Build first:" + echo " mkdir -p build && cd build && cmake .. && cmake --build . 
--config Release" + exit 1 +fi + +# --- Models --- +TEXT_ENC="models/Qwen3-Embedding-0.6B-Q8_0.gguf" +DIT="models/acestep-v15-turbo-Q8_0.gguf" +VAE="models/vae-BF16.gguf" +LM="models/acestep-5Hz-lm-4B-Q8_0.gguf" +for f in "$TEXT_ENC" "$DIT" "$VAE"; do + if [ ! -f "$f" ]; then + echo "Missing $f. Download models once: ./models.sh" + exit 1 + fi +done + +echo "[1/3] Test mode text2music (short)" +./build/dit-vae \ + --request tests/fixtures/ci-text2music.json \ + --text-encoder "$TEXT_ENC" \ + --dit "$DIT" \ + --vae "$VAE" +if [ ! -f tests/fixtures/ci-text2music0.wav ]; then + echo "FAIL: tests/fixtures/ci-text2music0.wav not created" + exit 1 +fi +echo " text2music WAV OK" + +echo "[2/3] Test mode cover with WAV reference (short)" +./build/dit-vae \ + --request tests/fixtures/ci-cover.json \ + --text-encoder "$TEXT_ENC" \ + --dit "$DIT" \ + --vae "$VAE" +if [ ! -f tests/fixtures/ci-cover0.wav ]; then + echo "FAIL: tests/fixtures/ci-cover0.wav not created" + exit 1 +fi +echo " cover WAV OK" + +echo "[3/3] Test full pipeline (LLM + DiT, short)" +if [ ! -f "$LM" ]; then + echo "Missing $LM; skipping full pipeline. Run ./models.sh to include LM." + exit 1 +fi +# ace-qwen3 names output from input path (e.g. request.json -> request0.json) +cp tests/fixtures/ci-text2music.json request.json +./build/ace-qwen3 \ + --request request.json \ + --model "$LM" +if [ ! -f request0.json ]; then + echo "FAIL: request0.json not created by ace-qwen3" + exit 1 +fi +./build/dit-vae \ + --request request0.json \ + --text-encoder "$TEXT_ENC" \ + --dit "$DIT" \ + --vae "$VAE" +if [ ! -f request00.wav ]; then + echo "FAIL: request00.wav not created" + exit 1 +fi +echo " full pipeline WAV OK" + +echo "" +echo "All generation tests passed locally. Safe to rely on CI for the same checks." 
diff --git a/vae.h b/vae.h index 4713581..92ff718 100644 --- a/vae.h +++ b/vae.h @@ -680,7 +680,6 @@ static bool vae_encoder_load(VAEEncoderGGML * m, const char * path) { static int vae_encoder_forward(VAEEncoderGGML * m, const float * audio, int T_audio, float * latent_out) { if (!m->has_encoder || T_audio < 1920) return -1; - int T_latent = T_audio / 1920; ggml_backend_sched_reset(m->sched); size_t ctx_size = 4096 * ggml_tensor_overhead() + ggml_graph_overhead(); @@ -717,8 +716,11 @@ static int vae_encoder_forward(VAEEncoderGGML * m, const float * audio, int T_au ggml_backend_sched_synchronize(m->sched); struct ggml_tensor * out = ggml_graph_get_tensor(gf, "enc_out"); - std::vector tmp(128 * T_latent); - ggml_backend_tensor_get(out, tmp.data(), 0, 128 * (size_t)T_latent * sizeof(float)); + // Encoder strides 2,4,4,8,8 give T_out != T_audio/1920; use actual output shape to avoid read out of bounds + int T_latent = (int)out->ne[0]; + size_t nbytes = (size_t)T_latent * 128 * sizeof(float); + std::vector tmp((size_t)T_latent * 128); + ggml_backend_tensor_get(out, tmp.data(), 0, nbytes); for (int t = 0; t < T_latent; t++) for (int c = 0; c < 64; c++) latent_out[t * 64 + c] = tmp[t * 128 + c]; From 9f23438ba155bb718f2bc4a38afc880c72b073d4 Mon Sep 17 00:00:00 2001 From: qxip Date: Sat, 28 Feb 2026 20:36:33 +0100 Subject: [PATCH 04/17] CI: run example scripts with short fixtures - Add examples/run-examples-ci.sh to run all 7 example scripts in order - Add short-duration CI fixtures: ci-dit-only, ci-partial, ci-full, ci-request-reference - Replace inline test-generation steps with single step running run-examples-ci.sh Made-with: Cursor --- .github/workflows/test-generation.yml | 39 +++----------------- examples/run-examples-ci.sh | 47 ++++++++++++++++++++++++ tests/fixtures/ci-dit-only.json | 11 ++++++ tests/fixtures/ci-full.json | 11 ++++++ tests/fixtures/ci-partial.json | 8 ++++ tests/fixtures/ci-request-reference.json | 13 +++++++ 6 files changed, 95 insertions(+), 34 
deletions(-) create mode 100755 examples/run-examples-ci.sh create mode 100644 tests/fixtures/ci-dit-only.json create mode 100644 tests/fixtures/ci-full.json create mode 100644 tests/fixtures/ci-partial.json create mode 100644 tests/fixtures/ci-request-reference.json diff --git a/.github/workflows/test-generation.yml b/.github/workflows/test-generation.yml index c63b3c0..36cefd1 100644 --- a/.github/workflows/test-generation.yml +++ b/.github/workflows/test-generation.yml @@ -1,6 +1,6 @@ -# Build, download models (cached), and run short generation tests for various modes. -# Validate locally first: from repo root run tests/run-generation-tests.sh (after build + ./models.sh). -# CI runs the same steps. Trigger: release (published) or workflow_dispatch only. +# Build, download models (cached), and run all example scripts with short CI fixtures. +# Validate locally: from repo root run ./examples/run-examples-ci.sh (after build + ./models.sh). +# Trigger: release (published) or workflow_dispatch only. 
name: Test generation on: @@ -44,34 +44,5 @@ jobs: pip install -q hf ./models.sh - - name: Test mode text2music (short) - run: | - ./build/dit-vae \ - --request tests/fixtures/ci-text2music.json \ - --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \ - --dit models/acestep-v15-turbo-Q8_0.gguf \ - --vae models/vae-BF16.gguf - test -f tests/fixtures/ci-text2music0.wav && echo "text2music WAV OK" - - - name: Test mode cover with WAV reference (short) - run: | - ./build/dit-vae \ - --request tests/fixtures/ci-cover.json \ - --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \ - --dit models/acestep-v15-turbo-Q8_0.gguf \ - --vae models/vae-BF16.gguf - test -f tests/fixtures/ci-cover0.wav && echo "cover WAV OK" - - - name: Test full pipeline (LLM + DiT, short) - run: | - cp tests/fixtures/ci-text2music.json request.json - ./build/ace-qwen3 \ - --request request.json \ - --model models/acestep-5Hz-lm-4B-Q8_0.gguf - test -f request0.json - ./build/dit-vae \ - --request request0.json \ - --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \ - --dit models/acestep-v15-turbo-Q8_0.gguf \ - --vae models/vae-BF16.gguf - test -f request00.wav && echo "full pipeline WAV OK" + - name: Run examples (CI fixtures, short duration) + run: chmod +x examples/run-examples-ci.sh && ./examples/run-examples-ci.sh diff --git a/examples/run-examples-ci.sh b/examples/run-examples-ci.sh new file mode 100755 index 0000000..362f92a --- /dev/null +++ b/examples/run-examples-ci.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Run all example scripts with short-duration CI fixtures (from repo root). +# Prereqs: build/ and models/ present; run after build and ./models.sh. +set -eu +cd "$(dirname "$0")/.." 
+EXAMPLES=examples +cd "$EXAMPLES" + +run() { echo "== $*" && "$@"; } + +# 1) DiT-only (no LLM), 5s +run cp ../tests/fixtures/ci-dit-only.json dit-only.json +run ./dit-only.sh +test -f dit-only0.wav && echo "dit-only OK" + +# 2) Cover from precomputed audio_codes (existing cover.json, 10s) +run ./cover.sh +test -f cover0.wav && echo "cover OK" + +# 3) reference.wav for cover-reference and test-reference +run cp cover0.wav reference.wav + +# 4) Cover + reference timbre +run ./cover-reference.sh +test -f cover-reference0.wav && echo "cover-reference OK" + +# 5) text2music with reference_audio +run cp ../tests/fixtures/ci-request-reference.json request-reference.json +run ./test-reference.sh +test -f request-reference0.wav && echo "test-reference OK" + +# 6) Simple (caption only, LLM fills), 5s +run cp ../tests/fixtures/ci-text2music.json simple.json +run ./simple.sh +test -f simple00.wav && echo "simple OK" + +# 7) Partial (caption + lyrics + duration), 5s +run cp ../tests/fixtures/ci-partial.json partial.json +run ./partial.sh +test -f partial00.wav && echo "partial OK" + +# 8) Full (all metadata), 5s +run cp ../tests/fixtures/ci-full.json full.json +run ./full.sh +test -f full00.wav && echo "full OK" + +echo "All example scripts passed." 
diff --git a/tests/fixtures/ci-dit-only.json b/tests/fixtures/ci-dit-only.json new file mode 100644 index 0000000..0a83cb8 --- /dev/null +++ b/tests/fixtures/ci-dit-only.json @@ -0,0 +1,11 @@ +{ + "caption": "Short CI clip", + "lyrics": "", + "bpm": 90, + "duration": 5, + "keyscale": "C minor", + "timesignature": "4", + "vocal_language": "en", + "inference_steps": 4, + "shift": 3 +} diff --git a/tests/fixtures/ci-full.json b/tests/fixtures/ci-full.json new file mode 100644 index 0000000..3a37bfc --- /dev/null +++ b/tests/fixtures/ci-full.json @@ -0,0 +1,11 @@ +{ + "caption": "Short CI house clip", + "lyrics": "[Intro]\n\n[Verse 1]\nTest\n\n[Outro]\nDone", + "bpm": 120, + "duration": 5, + "keyscale": "C major", + "timesignature": "4", + "vocal_language": "fr", + "inference_steps": 4, + "shift": 3 +} diff --git a/tests/fixtures/ci-partial.json b/tests/fixtures/ci-partial.json new file mode 100644 index 0000000..19ae9db --- /dev/null +++ b/tests/fixtures/ci-partial.json @@ -0,0 +1,8 @@ +{ + "caption": "Short CI hip hop clip", + "lyrics": "[Intro]\nYeah\n\n[Verse 1]\nOne two\n\n[Chorus]\nTest\n\n[Outro]\nDone", + "duration": 5, + "vocal_language": "fr", + "inference_steps": 4, + "shift": 3 +} diff --git a/tests/fixtures/ci-request-reference.json b/tests/fixtures/ci-request-reference.json new file mode 100644 index 0000000..141d77a --- /dev/null +++ b/tests/fixtures/ci-request-reference.json @@ -0,0 +1,13 @@ +{ + "task_type": "text2music", + "caption": "Short CI reference test", + "lyrics": "[Verse]\nTest\n[Chorus]\nRef", + "duration": 5, + "seed": 42, + "inference_steps": 4, + "guidance_scale": 1, + "shift": 3, + "reference_audio": "reference.wav", + "audio_codes": "", + "audio_cover_strength": 1 +} From 14b108c3df1c5ae91463aaf7870bd0d4588d64a3 Mon Sep 17 00:00:00 2001 From: qxip Date: Sat, 28 Feb 2026 20:38:43 +0100 Subject: [PATCH 05/17] CI: upload generated WAVs as artifact for inspection Made-with: Cursor --- .github/workflows/test-generation.yml | 13 +++++++++++++ 
1 file changed, 13 insertions(+) diff --git a/.github/workflows/test-generation.yml b/.github/workflows/test-generation.yml index 36cefd1..2f84eed 100644 --- a/.github/workflows/test-generation.yml +++ b/.github/workflows/test-generation.yml @@ -46,3 +46,16 @@ jobs: - name: Run examples (CI fixtures, short duration) run: chmod +x examples/run-examples-ci.sh && ./examples/run-examples-ci.sh + + - name: Upload generated audio + uses: actions/upload-artifact@v4 + with: + name: generated-audio + path: | + examples/dit-only0.wav + examples/cover0.wav + examples/cover-reference0.wav + examples/request-reference0.wav + examples/simple00.wav + examples/partial00.wav + examples/full00.wav From 6e49bbddde8706312f12f14c0f68481337a2ebf7 Mon Sep 17 00:00:00 2001 From: qxip Date: Sat, 28 Feb 2026 21:21:54 +0100 Subject: [PATCH 06/17] LoRA: adapter loading + example + README - Add safetensors reader and dit_ggml_load_lora (PEFT adapter_model.safetensors) - Apply LoRA at linear layers in DiT (self/cross-attn, MLP) when base weights are separate - CLI: --lora and --lora-scale in dit-vae - Example: examples/lora.sh + lora.json (duckdbot/acestep-lora-cryda) - README: LoRA section, example list, dit-vae options Made-with: Cursor --- CMakeLists.txt | 4 +- README.md | 7 ++ examples/lora.json | 11 +++ examples/lora.sh | 31 +++++++ src/dit-graph.h | 47 +++++++---- src/dit-lora.cpp | 198 +++++++++++++++++++++++++++++++++++++++++++++ src/dit.h | 20 +++++ src/safetensors.h | 107 ++++++++++++++++++++++++ tools/dit-vae.cpp | 23 +++++- 9 files changed, 429 insertions(+), 19 deletions(-) create mode 100644 examples/lora.json create mode 100755 examples/lora.sh create mode 100644 src/dit-lora.cpp create mode 100644 src/safetensors.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 551a072..503145a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,8 +53,8 @@ add_library(acestep-core STATIC link_ggml_backends(acestep-core) target_include_directories(acestep-core PRIVATE 
${CMAKE_CURRENT_SOURCE_DIR}) -# dit-vae: full pipeline (text-enc + cond + dit + vae + wav) -add_executable(dit-vae tools/dit-vae.cpp) +# dit-vae: full pipeline (text-enc + cond + dit + vae + wav) + LoRA support +add_executable(dit-vae tools/dit-vae.cpp src/dit-lora.cpp) target_link_libraries(dit-vae PRIVATE acestep-core) link_ggml_backends(dit-vae) diff --git a/README.md b/README.md index 178fe39..9ad2a2b 100644 --- a/README.md +++ b/README.md @@ -146,11 +146,14 @@ cd examples ./cover.sh # cover mode: decode precomputed audio_codes (no LLM) ./cover-reference.sh # cover + reference_audio for timbre (WAV/MP3; needs reference.wav or .mp3) ./test-reference.sh # reference_audio (WAV or MP3) + audio_cover_strength +./lora.sh # DiT + LoRA adapter (e.g. duckdbot/acestep-lora-cryda; put adapter in examples/lora/) ``` Each example has a `-sft` variant (SFT model, 50 steps, CFG 7.0) alongside the turbo default (8 steps, no CFG). For **reference timbre**, set `reference_audio` to a **WAV or MP3** path; dit-vae loads it (MP3 decoded in memory via header-only minimp3, no temp files), encodes with the VAE encoder (requires a full VAE GGUF that includes encoder weights). +**LoRA adapters**: use `--lora ` and optional `--lora-scale ` with dit-vae to run the DiT with a PEFT-style LoRA (e.g. [duckdbot/acestep-lora-cryda](https://huggingface.co/duckdbot/acestep-lora-cryda)). Adapter must be `adapter_model.safetensors` (safetensors with `lora_A` / `lora_B` keys matching `decoder.layers.*`). Put the file in `examples/lora/` and run `./lora.sh`, or pass the path explicitly. + ## Generation modes The LLM fills what's missing in the JSON and generates audio codes. @@ -254,6 +257,10 @@ Required: --dit DiT GGUF file --vae VAE GGUF file +LoRA: + --lora LoRA adapter (adapter_model.safetensors) + --lora-scale LoRA scale, e.g. 
alpha/rank (default: 1.0) + Batch: --batch DiT variations per request (default: 1, max 9) diff --git a/examples/lora.json b/examples/lora.json new file mode 100644 index 0000000..8317521 --- /dev/null +++ b/examples/lora.json @@ -0,0 +1,11 @@ +{ + "task_type": "text2music", + "caption": "Emotional vocal track with soft synths", + "lyrics": "", + "duration": 10, + "inference_steps": 8, + "guidance_scale": 1, + "shift": 3, + "seed": 42, + "vocal_language": "en" +} diff --git a/examples/lora.sh b/examples/lora.sh new file mode 100755 index 0000000..9c25d33 --- /dev/null +++ b/examples/lora.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# LoRA example: generate with a PEFT LoRA adapter (e.g. duckdbot/acestep-lora-cryda). +# Requires adapter_model.safetensors in lora/ (download once; see below). +set -eu +cd "$(dirname "$0")" + +ADAPTER="lora/adapter_model.safetensors" +if [ ! -f "$ADAPTER" ]; then + echo "LoRA adapter not found at $ADAPTER" + echo "Download once (e.g. from Hugging Face):" + echo " mkdir -p lora" + echo " curl -L -o $ADAPTER 'https://huggingface.co/duckdbot/acestep-lora-cryda/resolve/main/adapter_model.safetensors'" + echo "Or: pip install hf && huggingface-cli download duckdbot/acestep-lora-cryda adapter_model.safetensors --local-dir lora" + exit 1 +fi + +# LLM: fill lyrics + codes +../build/ace-qwen3 \ + --request lora.json \ + --model ../models/acestep-5Hz-lm-4B-Q8_0.gguf + +# DiT+VAE with LoRA (scale = alpha/rank; 1.0 is typical) +../build/dit-vae \ + --request lora0.json \ + --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit ../models/acestep-v15-turbo-Q8_0.gguf \ + --vae ../models/vae-BF16.gguf \ + --lora "$ADAPTER" \ + --lora-scale 1.0 + +echo "Done. 
Check lora00.wav" diff --git a/src/dit-graph.h b/src/dit-graph.h index 2a92324..ab5839e 100644 --- a/src/dit-graph.h +++ b/src/dit-graph.h @@ -44,6 +44,23 @@ static struct ggml_tensor * dit_ggml_linear( return ggml_mul_mat(ctx, weight, input); } +// Linear with optional LoRA: out = W@x + scale * (B@(A@x)). lora_a/lora_b may be NULL. +static struct ggml_tensor * dit_ggml_linear_lora( + struct ggml_context * ctx, + struct ggml_tensor * weight, + struct ggml_tensor * lora_a, // [in, r] + struct ggml_tensor * lora_b, // [r, out] + float lora_scale, + struct ggml_tensor * input) { + struct ggml_tensor * out = ggml_mul_mat(ctx, weight, input); + if (lora_a && lora_b && lora_scale != 0.0f) { + struct ggml_tensor * ax = ggml_mul_mat(ctx, lora_a, input); + struct ggml_tensor * bax = ggml_mul_mat(ctx, lora_b, ax); + out = ggml_add(ctx, out, ggml_scale(ctx, bax, lora_scale)); + } + return out; +} + // Helper: Linear layer with bias static struct ggml_tensor * dit_ggml_linear_bias( struct ggml_context * ctx, @@ -164,6 +181,7 @@ static struct ggml_tensor * dit_ggml_build_self_attn( struct ggml_tensor * q, * k, * v; int q_dim = Nh * D; int kv_dim = Nkv * D; + float lora_scale = m->lora_scale; if (ly->sa_qkv) { struct ggml_tensor * qkv = dit_ggml_linear(ctx, ly->sa_qkv, norm_sa); q = ggml_cont(ctx, ggml_view_3d(ctx, qkv, q_dim, S, N, qkv->nb[1], qkv->nb[2], 0)); @@ -173,11 +191,11 @@ static struct ggml_tensor * dit_ggml_build_self_attn( struct ggml_tensor * qk = dit_ggml_linear(ctx, ly->sa_qk, norm_sa); q = ggml_cont(ctx, ggml_view_3d(ctx, qk, q_dim, S, N, qk->nb[1], qk->nb[2], 0)); k = ggml_cont(ctx, ggml_view_3d(ctx, qk, kv_dim, S, N, qk->nb[1], qk->nb[2], (size_t)q_dim * qk->nb[0])); - v = dit_ggml_linear(ctx, ly->sa_v_proj, norm_sa); + v = dit_ggml_linear_lora(ctx, ly->sa_v_proj, ly->lora_sa_v_a, ly->lora_sa_v_b, lora_scale, norm_sa); } else { - q = dit_ggml_linear(ctx, ly->sa_q_proj, norm_sa); - k = dit_ggml_linear(ctx, ly->sa_k_proj, norm_sa); - v = dit_ggml_linear(ctx, 
ly->sa_v_proj, norm_sa); + q = dit_ggml_linear_lora(ctx, ly->sa_q_proj, ly->lora_sa_q_a, ly->lora_sa_q_b, lora_scale, norm_sa); + k = dit_ggml_linear_lora(ctx, ly->sa_k_proj, ly->lora_sa_k_a, ly->lora_sa_k_b, lora_scale, norm_sa); + v = dit_ggml_linear_lora(ctx, ly->sa_v_proj, ly->lora_sa_v_a, ly->lora_sa_v_b, lora_scale, norm_sa); } // 2) Reshape to heads: [Nh*D, S, N] -> [D, Nh, S, N] @@ -239,7 +257,7 @@ static struct ggml_tensor * dit_ggml_build_self_attn( } // 8) O projection: [Nh*D, S, N] -> [H, S, N] - struct ggml_tensor * out = dit_ggml_linear(ctx, ly->sa_o_proj, attn); + struct ggml_tensor * out = dit_ggml_linear_lora(ctx, ly->sa_o_proj, ly->lora_sa_o_a, ly->lora_sa_o_b, m->lora_scale, attn); return out; } @@ -253,20 +271,21 @@ static struct ggml_tensor * dit_ggml_build_mlp( struct ggml_tensor * norm_ffn, int S) { + float lora_scale = m->lora_scale; struct ggml_tensor * ff; if (ly->gate_up) { // Fused: single matmul [H, 2*I] x [H, S, N] -> [2*I, S, N], then swiglu splits ne[0] struct ggml_tensor * gu = dit_ggml_linear(ctx, ly->gate_up, norm_ffn); ff = ggml_swiglu(ctx, gu); } else { - // Separate: two matmuls + split swiglu - struct ggml_tensor * gate = dit_ggml_linear(ctx, ly->gate_proj, norm_ffn); - struct ggml_tensor * up = dit_ggml_linear(ctx, ly->up_proj, norm_ffn); + // Separate: two matmuls + split swiglu (with optional LoRA) + struct ggml_tensor * gate = dit_ggml_linear_lora(ctx, ly->gate_proj, ly->lora_gate_a, ly->lora_gate_b, lora_scale, norm_ffn); + struct ggml_tensor * up = dit_ggml_linear_lora(ctx, ly->up_proj, ly->lora_up_a, ly->lora_up_b, lora_scale, norm_ffn); ff = ggml_swiglu_split(ctx, gate, up); } // Down projection: [I, S] -> [H, S] - return dit_ggml_linear(ctx, ly->down_proj, ff); + return dit_ggml_linear_lora(ctx, ly->down_proj, ly->lora_down_a, ly->lora_down_b, lora_scale, ff); } // Build cross-attention sub-graph for a single layer. 
@@ -305,14 +324,14 @@ static struct ggml_tensor * dit_ggml_build_cross_attn( v = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], (size_t)kv_dim * kv->nb[0])); } else if (ly->ca_kv) { // Q separate, K+V fused - q = dit_ggml_linear(ctx, ly->ca_q_proj, norm_ca); + q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, m->lora_scale, norm_ca); struct ggml_tensor * kv = ggml_mul_mat(ctx, ly->ca_kv, enc); k = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], 0)); v = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], (size_t)kv_dim * kv->nb[0])); } else { - q = dit_ggml_linear(ctx, ly->ca_q_proj, norm_ca); - k = dit_ggml_linear(ctx, ly->ca_k_proj, enc); - v = dit_ggml_linear(ctx, ly->ca_v_proj, enc); + q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, m->lora_scale, norm_ca); + k = dit_ggml_linear_lora(ctx, ly->ca_k_proj, ly->lora_ca_k_a, ly->lora_ca_k_b, m->lora_scale, enc); + v = dit_ggml_linear_lora(ctx, ly->ca_v_proj, ly->lora_ca_v_a, ly->lora_ca_v_b, m->lora_scale, enc); } // reshape to [D, heads, seq, N] then permute to [D, seq, heads, N] @@ -342,7 +361,7 @@ static struct ggml_tensor * dit_ggml_build_cross_attn( attn = ggml_reshape_3d(ctx, attn, Nh * D, S, N); // O projection - return dit_ggml_linear(ctx, ly->ca_o_proj, attn); + return dit_ggml_linear_lora(ctx, ly->ca_o_proj, ly->lora_ca_o_a, ly->lora_ca_o_b, m->lora_scale, attn); } // Build one full DiT layer (AdaLN + self-attn + cross-attn + FFN + gated residuals) diff --git a/src/dit-lora.cpp b/src/dit-lora.cpp new file mode 100644 index 0000000..881d941 --- /dev/null +++ b/src/dit-lora.cpp @@ -0,0 +1,198 @@ +// dit-lora.cpp: Load LoRA adapters from safetensors into DiT (ACE-Step). +// Compatible with PEFT adapter_model.safetensors (lora_A / lora_B per target layer). 
+ +#include "dit.h" +#include "safetensors.h" +#include +#include +#include +#include + +// Normalize adapter key to base name: decoder.layers.N. +// e.g. "base_model.model.model.decoder.layers.0.self_attn.q_proj.lora_A.default" -> "decoder.layers.0.self_attn.q_proj" +static std::string lora_key_to_base(const std::string & key) { + std::string s = key; + const char * prefixes[] = { "base_model.model.model.", "base_model.model." }; + for (const char * p : prefixes) { + size_t pl = strlen(p); + if (s.size() >= pl && s.compare(0, pl, p) == 0) { + s = s.substr(pl); + break; + } + } + if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_A.default") == 0) + s = s.substr(0, s.size() - 14); + else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_B.default") == 0) + s = s.substr(0, s.size() - 14); + else if (s.size() > 7 && s.compare(s.size() - 7, 7, ".lora_A") == 0) + s = s.substr(0, s.size() - 7); + else if (s.size() > 7 && s.compare(s.size() - 7, 7, ".lora_B") == 0) + s = s.substr(0, s.size() - 7); + return s; +} + +static bool is_lora_a(const std::string & key) { + return key.find("lora_A") != std::string::npos; +} + +// Slot index for layer: 0=sa_q, 1=sa_k, 2=sa_v, 3=sa_o, 4=ca_q, 5=ca_k, 6=ca_v, 7=ca_o, 8=gate, 9=up, 10=down +enum LoraSlot { + SA_Q, SA_K, SA_V, SA_O, CA_Q, CA_K, CA_V, CA_O, GATE, UP, DOWN, N_SLOTS +}; + +static bool parse_base_name(const std::string & base, int * layer_idx, LoraSlot * slot) { + int L = -1; + if (sscanf(base.c_str(), "decoder.layers.%d.self_attn.q_proj", &L) == 1) { *layer_idx = L; *slot = SA_Q; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.self_attn.k_proj", &L) == 1) { *layer_idx = L; *slot = SA_K; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.self_attn.v_proj", &L) == 1) { *layer_idx = L; *slot = SA_V; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.self_attn.o_proj", &L) == 1) { *layer_idx = L; *slot = SA_O; return true; } + if (sscanf(base.c_str(), 
"decoder.layers.%d.cross_attn.q_proj", &L) == 1) { *layer_idx = L; *slot = CA_Q; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.cross_attn.k_proj", &L) == 1) { *layer_idx = L; *slot = CA_K; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.cross_attn.v_proj", &L) == 1) { *layer_idx = L; *slot = CA_V; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.cross_attn.o_proj", &L) == 1) { *layer_idx = L; *slot = CA_O; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.mlp.gate_proj", &L) == 1) { *layer_idx = L; *slot = GATE; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.mlp.up_proj", &L) == 1) { *layer_idx = L; *slot = UP; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.mlp.down_proj", &L) == 1) { *layer_idx = L; *slot = DOWN; return true; } + return false; +} + +static struct ggml_tensor ** slot_to_ptr(DiTGGMLLayer * ly, LoraSlot slot, bool is_b) { + if (is_b) { + switch (slot) { + case SA_Q: return &ly->lora_sa_q_b; case SA_K: return &ly->lora_sa_k_b; case SA_V: return &ly->lora_sa_v_b; case SA_O: return &ly->lora_sa_o_b; + case CA_Q: return &ly->lora_ca_q_b; case CA_K: return &ly->lora_ca_k_b; case CA_V: return &ly->lora_ca_v_b; case CA_O: return &ly->lora_ca_o_b; + case GATE: return &ly->lora_gate_b; case UP: return &ly->lora_up_b; case DOWN: return &ly->lora_down_b; + default: return nullptr; + } + } else { + switch (slot) { + case SA_Q: return &ly->lora_sa_q_a; case SA_K: return &ly->lora_sa_k_a; case SA_V: return &ly->lora_sa_v_a; case SA_O: return &ly->lora_sa_o_a; + case CA_Q: return &ly->lora_ca_q_a; case CA_K: return &ly->lora_ca_k_a; case CA_V: return &ly->lora_ca_v_a; case CA_O: return &ly->lora_ca_o_a; + case GATE: return &ly->lora_gate_a; case UP: return &ly->lora_up_a; case DOWN: return &ly->lora_down_a; + default: return nullptr; + } + } +} + +bool dit_ggml_load_lora(DiTGGML * m, const char * lora_path, float scale) { + FILE * fp = fopen(lora_path, "rb"); + if (!fp) { + 
fprintf(stderr, "[LoRA] cannot open %s\n", lora_path); + return false; + } + uint8_t h8[8]; + if (fread(h8, 1, 8, fp) != 8) { + fclose(fp); + return false; + } + uint64_t header_len = (uint64_t)h8[0] | ((uint64_t)h8[1] << 8) | ((uint64_t)h8[2] << 16) | ((uint64_t)h8[3] << 24) + | ((uint64_t)h8[4] << 32) | ((uint64_t)h8[5] << 40) | ((uint64_t)h8[6] << 48) | ((uint64_t)h8[7] << 56); + uint64_t data_section_start = 8 + header_len; + + std::unordered_map tensors; + int n = safetensors_parse_lora(fp, &tensors); + if (n == 0) { + fclose(fp); + fprintf(stderr, "[LoRA] no LoRA tensors found in %s\n", lora_path); + return false; + } + + // Count pairs we will load: for each lora_A key, find the matching lora_B (same base name) + std::unordered_map> pairs; // base -> (key_a, key_b) + std::unordered_map base_to_b; + for (const auto & kv : tensors) { + std::string base = lora_key_to_base(kv.first); + if (base.empty()) continue; + if (is_lora_a(kv.first)) + base_to_b[base] = ""; // mark base as having A; we'll find B next + } + for (const auto & kv : tensors) { + std::string base = lora_key_to_base(kv.first); + if (base.empty()) continue; + if (base_to_b.count(base) && kv.first.find("lora_B") != std::string::npos) + base_to_b[base] = kv.first; + } + for (const auto & kv : tensors) { + if (!is_lora_a(kv.first)) continue; + std::string base = lora_key_to_base(kv.first); + auto it = base_to_b.find(base); + if (it != base_to_b.end() && !it->second.empty()) + pairs[base] = { kv.first, it->second }; + } + + int n_pairs = (int)pairs.size(); + wctx_init(&m->lora_wctx, n_pairs * 2); // A and B per pair + + fseek(fp, (long)data_section_start, SEEK_SET); + + for (const auto & p : pairs) { + const std::string & base = p.first; + const std::string & key_a = p.second.first; + const std::string & key_b = p.second.second; + int layer_idx = 0; + LoraSlot slot = N_SLOTS; + if (!parse_base_name(base, &layer_idx, &slot) || layer_idx < 0 || layer_idx >= m->cfg.n_layers) continue; + + DiTGGMLLayer * 
ly = &m->layers[layer_idx]; + SafeTensorInfo & info_a = tensors[key_a]; + SafeTensorInfo & info_b = tensors[key_b]; + if (info_a.n_dims != 2 || info_b.n_dims != 2) continue; + // A_pt [r, in], B_pt [out, r]. We need A_ggml [r, in] for mul_mat(A,x)=[r,S], B_ggml [out, r] for mul_mat(B, Ax)=[out,S]. + // ggml layout: ne[0]=cols, ne[1]=rows. So A: [r, in] -> ne[0]=in, ne[1]=r. B: [out, r] -> ne[0]=r, ne[1]=out. + int64_t r = info_a.shape[0], in_dim = info_a.shape[1]; + int64_t out_dim = info_b.shape[0]; + if (info_b.shape[1] != r) continue; + + struct ggml_tensor * ta = ggml_new_tensor_2d(m->lora_wctx.ctx, GGML_TYPE_F32, (int64_t)in_dim, (int64_t)r); + struct ggml_tensor * tb = ggml_new_tensor_2d(m->lora_wctx.ctx, GGML_TYPE_F32, (int64_t)r, (int64_t)out_dim); + ggml_set_name(ta, key_a.c_str()); + ggml_set_name(tb, key_b.c_str()); + + // Copy A: file is row-major [r, in], we need ggml col-major [in, r] (transpose) + size_t na = (size_t)(r * in_dim); + m->lora_wctx.staging.emplace_back(na); + float * buf_a = m->lora_wctx.staging.back().data(); + if (!safetensors_read_tensor_data(fp, data_section_start, info_a.data_start, info_a.data_end, buf_a)) { + fclose(fp); + wctx_free(&m->lora_wctx); + return false; + } + m->lora_wctx.staging.emplace_back(na); + float * transposed_a = m->lora_wctx.staging.back().data(); + for (int64_t i = 0; i < r; i++) + for (int64_t j = 0; j < in_dim; j++) + transposed_a[(size_t)(j * r + i)] = buf_a[(size_t)(i * in_dim + j)]; + m->lora_wctx.pending.push_back({ ta, transposed_a, na * sizeof(float), 0 }); + + size_t nb = (size_t)(out_dim * r); + m->lora_wctx.staging.emplace_back(nb); + float * buf_b = m->lora_wctx.staging.back().data(); + if (!safetensors_read_tensor_data(fp, data_section_start, info_b.data_start, info_b.data_end, buf_b)) { + fclose(fp); + wctx_free(&m->lora_wctx); + return false; + } + m->lora_wctx.staging.emplace_back(nb); + float * transposed_b = m->lora_wctx.staging.back().data(); + for (int64_t i = 0; i < out_dim; i++) + for 
(int64_t j = 0; j < r; j++) + transposed_b[(size_t)(j * out_dim + i)] = buf_b[(size_t)(i * r + j)]; + m->lora_wctx.pending.push_back({ tb, transposed_b, nb * sizeof(float), 0 }); + + struct ggml_tensor ** pa = slot_to_ptr(ly, slot, false); + struct ggml_tensor ** pb = slot_to_ptr(ly, slot, true); + if (pa) *pa = ta; + if (pb) *pb = tb; + } + fclose(fp); + fp = nullptr; + + if (!wctx_alloc(&m->lora_wctx, m->backend)) { + fprintf(stderr, "[LoRA] failed to allocate LoRA tensors on backend\n"); + wctx_free(&m->lora_wctx); + return false; + } + m->lora_scale = scale; + fprintf(stderr, "[LoRA] loaded %d adapter pairs from %s (scale=%.4f)\n", n_pairs, lora_path, scale); + return true; +} diff --git a/src/dit.h b/src/dit.h index 524dd76..9c842b0 100644 --- a/src/dit.h +++ b/src/dit.h @@ -81,6 +81,19 @@ struct DiTGGMLLayer { // AdaLN scale-shift table: [6*hidden] (6 rows of [hidden]) struct ggml_tensor * scale_shift_table; // [hidden, 6] in ggml layout + // Optional LoRA adapters (F32, applied when base projection is separate) + struct ggml_tensor * lora_sa_q_a, * lora_sa_q_b; + struct ggml_tensor * lora_sa_k_a, * lora_sa_k_b; + struct ggml_tensor * lora_sa_v_a, * lora_sa_v_b; + struct ggml_tensor * lora_sa_o_a, * lora_sa_o_b; + struct ggml_tensor * lora_ca_q_a, * lora_ca_q_b; + struct ggml_tensor * lora_ca_k_a, * lora_ca_k_b; + struct ggml_tensor * lora_ca_v_a, * lora_ca_v_b; + struct ggml_tensor * lora_ca_o_a, * lora_ca_o_b; + struct ggml_tensor * lora_gate_a, * lora_gate_b; + struct ggml_tensor * lora_up_a, * lora_up_b; + struct ggml_tensor * lora_down_a, * lora_down_b; + int layer_type; // 0=sliding, 1=full }; @@ -122,6 +135,8 @@ struct DiTGGML { // Weight storage WeightCtx wctx; + WeightCtx lora_wctx; // optional LoRA adapter tensors (when lora_scale > 0) + float lora_scale; // alpha/rank for LoRA (0 = no LoRA) // Pre-allocated constant for AdaLN (1+scale) fusion struct ggml_tensor * scalar_one; // [1] = 1.0f, broadcast in ggml_add @@ -389,10 +404,15 @@ static void 
dit_ggml_init_backend(DiTGGML * m) { m->use_flash_attn = (bp.backend != bp.cpu_backend); } +// Load LoRA adapter from safetensors (e.g. adapter_model.safetensors). +// scale = alpha/rank (typical 1.0). Call after dit_ggml_load. Returns false on error. +bool dit_ggml_load_lora(DiTGGML * m, const char * lora_path, float scale); + static void dit_ggml_free(DiTGGML * m) { if (m->sched) ggml_backend_sched_free(m->sched); if (m->backend && m->backend != m->cpu_backend) ggml_backend_free(m->backend); if (m->cpu_backend) ggml_backend_free(m->cpu_backend); wctx_free(&m->wctx); + if (m->lora_wctx.ctx) wctx_free(&m->lora_wctx); *m = {}; } diff --git a/src/safetensors.h b/src/safetensors.h new file mode 100644 index 0000000..74d5967 --- /dev/null +++ b/src/safetensors.h @@ -0,0 +1,107 @@ +#pragma once +// safetensors.h: minimal reader for LoRA adapter_model.safetensors +// +// Format: 8-byte header length (LE uint64), then JSON header, then raw tensor data. +// We only parse keys that look like "*lora_A*" / "*lora_B*" and extract shape + data_offsets. + +#include +#include +#include +#include +#include +#include +#include + +struct SafeTensorInfo { + std::string dtype; // "F32", "F16", "BF16" + int64_t shape[2]; // [dim0, dim1] from JSON + int n_dims; + uint64_t data_start; // byte offset in file (after header) + uint64_t data_end; +}; + +// Open file, read header, parse tensor metadata for LoRA tensors. +// Returns number of LoRA tensors found; fills *out with tensor name -> info. +// Caller must fclose(fp) and free the map; file position is left at start of data section. 
+static int safetensors_parse_lora(FILE * fp, std::unordered_map * out) { + out->clear(); + uint64_t header_len = 0; + uint8_t h8[8]; + if (fread(h8, 1, 8, fp) != 8) return 0; + header_len = (uint64_t)h8[0] | ((uint64_t)h8[1] << 8) | ((uint64_t)h8[2] << 16) | ((uint64_t)h8[3] << 24) + | ((uint64_t)h8[4] << 32) | ((uint64_t)h8[5] << 40) | ((uint64_t)h8[6] << 48) | ((uint64_t)h8[7] << 56); + if (header_len == 0 || header_len > 10 * 1024 * 1024) return 0; // cap 10MB header + std::vector buf(header_len + 1); + if (fread(buf.data(), 1, header_len, fp) != header_len) return 0; + buf[header_len] = '\0'; + const char * json = buf.data(); + + // Find each key that contains "lora_A" or "lora_B" + const char * p = json; + int count = 0; + while ((p = strstr(p, "\"")) != nullptr) { + const char * key_start = p + 1; + p = strchr(key_start, '"'); + if (!p) break; + std::string key(key_start, (size_t)(p - key_start)); + p++; + if (key.find("lora_A") == std::string::npos && key.find("lora_B") == std::string::npos) { + continue; + } + // Find the value object for this key: skip ": + while (*p && (*p == ' ' || *p == ':')) p++; + if (*p != '{') continue; + const char * obj = p; + SafeTensorInfo info = {}; + info.shape[0] = info.shape[1] = 1; + info.n_dims = 0; + // "shape":[n,m] or [n] + const char * sh = strstr(obj, "\"shape\""); + if (sh) { + const char * br = strchr(sh, '['); + if (br) { + long long a = 0, b = 0; + int n = sscanf(br, "[%lld,%lld]", &a, &b); + if (n >= 1) { info.shape[0] = (int64_t)a; info.n_dims = 1; } + if (n >= 2) { info.shape[1] = (int64_t)b; info.n_dims = 2; } + } + } + const char * dt = strstr(obj, "\"dtype\""); + if (dt) { + const char * q = strchr(dt, '"'); + if (q) q = strchr(q + 1, '"'); + if (q) { + const char * start = q + 1; + const char * end = strchr(start, '"'); + if (end) info.dtype = std::string(start, end - start); + } + } + const char * off = strstr(obj, "\"data_offsets\""); + if (off) { + const char * br = strchr(off, '['); + if (br) { + 
uint64_t s = 0, e = 0; + if (sscanf(br, "[%llu,%llu]", (unsigned long long*)&s, (unsigned long long*)&e) == 2) { + info.data_start = s; + info.data_end = e; + } + } + } + if (info.dtype.empty() || info.n_dims == 0) continue; + (*out)[key] = info; + count++; + } + return count; +} + +// Read raw tensor data from file. File must be positioned at start of data section +// (i.e. after the 8-byte header length + header bytes). +// data_offset in the JSON is relative to the start of the data section. +static bool safetensors_read_tensor_data(FILE * fp, uint64_t data_section_start, + uint64_t tensor_start, uint64_t tensor_end, void * out_buf) { + uint64_t off = data_section_start + tensor_start; + uint64_t nbytes = tensor_end - tensor_start; + if (fseek(fp, (long)off, SEEK_SET) != 0) return false; + if (fread(out_buf, 1, nbytes, fp) != nbytes) return false; + return true; +} diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp index cac80a5..d889da1 100644 --- a/tools/dit-vae.cpp +++ b/tools/dit-vae.cpp @@ -67,6 +67,9 @@ static void print_usage(const char * prog) { " --text-encoder Text encoder GGUF file\n" " --dit DiT GGUF file\n" " --vae VAE GGUF file\n\n" + "LoRA:\n" + " --lora LoRA adapter (adapter_model.safetensors)\n" + " --lora-scale LoRA scale, e.g. alpha/rank (default: 1.0)\n\n" "Batch:\n" " --batch DiT variations per request (default: 1, max 9)\n\n" "Output naming: input.json -> input0.wav, input1.wav, ... 
(last digit = batch index)\n\n" @@ -99,9 +102,11 @@ int main(int argc, char ** argv) { const char * dit_gguf = NULL; const char * vae_gguf = NULL; const char * dump_dir = NULL; - int batch_n = 1; - int vae_chunk = 256; - int vae_overlap = 64; + const char * lora_path = NULL; + float lora_scale = 1.0f; + int batch_n = 1; + int vae_chunk = 256; + int vae_overlap = 64; for (int i = 1; i < argc; i++) { if (strcmp(argv[i], "--request") == 0) { @@ -116,6 +121,8 @@ int main(int argc, char ** argv) { else if (strcmp(argv[i], "--batch") == 0 && i+1 < argc) batch_n = atoi(argv[++i]); else if (strcmp(argv[i], "--vae-chunk") == 0 && i+1 < argc) vae_chunk = atoi(argv[++i]); else if (strcmp(argv[i], "--vae-overlap") == 0 && i+1 < argc) vae_overlap = atoi(argv[++i]); + else if (strcmp(argv[i], "--lora") == 0 && i+1 < argc) lora_path = argv[++i]; + else if (strcmp(argv[i], "--lora-scale") == 0 && i+1 < argc) lora_scale = (float)atof(argv[++i]); else if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) { print_usage(argv[0]); return 0; } else { @@ -161,6 +168,16 @@ int main(int argc, char ** argv) { } fprintf(stderr, "[Load] DiT weight load: %.1f ms\n", timer.ms()); + if (lora_path) { + timer.reset(); + if (!dit_ggml_load_lora(&model, lora_path, lora_scale)) { + fprintf(stderr, "FATAL: failed to load LoRA from %s\n", lora_path); + dit_ggml_free(&model); + return 1; + } + fprintf(stderr, "[Load] LoRA: %.1f ms\n", timer.ms()); + } + // Read DiT GGUF metadata + silence_latent tensor (once) bool is_turbo = false; std::vector silence_full; // [15000, 64] f32 From 658ca3035663c0051252c618a94348ec8729af53 Mon Sep 17 00:00:00 2001 From: qxip Date: Sat, 28 Feb 2026 21:27:23 +0100 Subject: [PATCH 07/17] Fix VAE encoder double free and enc_out buffer size in reference_audio path - vae_encoder_free: reset sched, free in order (sched, weight_ctx, buf, backends), null ptrs - dit-vae: size enc_out to max_lat = (T_audio/2048)+1 to avoid encoder write overflow Made-with: Cursor --- 
src/vae.h | 28 +++++++++++++++++++++++----- tools/dit-vae.cpp | 4 +++- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/vae.h b/src/vae.h index f97da51..b4335e1 100644 --- a/src/vae.h +++ b/src/vae.h @@ -731,10 +731,28 @@ static int vae_encoder_forward(VAEEncoderGGML * m, const float * audio, int T_au } static void vae_encoder_free(VAEEncoderGGML * m) { - if (m->sched) ggml_backend_sched_free(m->sched); - if (m->buf) ggml_backend_buffer_free(m->buf); - if (m->weight_ctx) ggml_free(m->weight_ctx); - if (m->backend && m->backend != m->cpu_backend) ggml_backend_free(m->backend); - if (m->cpu_backend) ggml_backend_free(m->cpu_backend); + // Order: reset sched, free sched (drops refs to graph/alloc), free weight_ctx (tensor metadata), + // then buffer (tensor data), then backends. Avoids double free on some GGML backends. + if (m->sched) { + ggml_backend_sched_reset(m->sched); + ggml_backend_sched_free(m->sched); + m->sched = NULL; + } + if (m->weight_ctx) { + ggml_free(m->weight_ctx); + m->weight_ctx = NULL; + } + if (m->buf) { + ggml_backend_buffer_free(m->buf); + m->buf = NULL; + } + if (m->backend && m->backend != m->cpu_backend) { + ggml_backend_free(m->backend); + m->backend = NULL; + } + if (m->cpu_backend) { + ggml_backend_free(m->cpu_backend); + m->cpu_backend = NULL; + } *m = {}; } diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp index cac80a5..b24c967 100644 --- a/tools/dit-vae.cpp +++ b/tools/dit-vae.cpp @@ -393,7 +393,9 @@ int main(int argc, char ** argv) { if (vae_encoder_load(&enc, vae_gguf)) { int T_audio = n_samples; if (T_audio >= 1920) { - std::vector enc_out((size_t)S_ref * 64); + // Encoder strides 2,4,4,8,8 -> max latent frames = T_audio/2048 + 1 + size_t max_lat = (size_t)(T_audio / 2048) + 1; + std::vector enc_out(max_lat * 64); int T_lat = vae_encoder_forward(&enc, wav_stereo.data(), T_audio, enc_out.data()); if (T_lat > 0) { size_t copy_frames = (size_t)(T_lat < S_ref ? 
T_lat : S_ref); From 907a068a24dba3208671edd45775a7cd1334d9ad Mon Sep 17 00:00:00 2001 From: qxip Date: Sat, 28 Feb 2026 22:57:57 +0100 Subject: [PATCH 08/17] LoRA: apply on fused layers, add custom_tag/genre, update example - dit-graph.h: apply LoRA deltas when base uses fused QKV/gate_up/ca_qkv so self-attn, MLP, and cross-attn all use adapters (fixes no audible effect) - dit-lora.cpp: fix safetensors parse (rewind fp before parse); normalize keys for base_model.model.layers.* and .lora_A.weight/.lora_B.weight - request: add custom_tag (LoRA trigger) and genre; parse language, is_instrumental, formatted_lyrics - dit-vae: append custom_tag to caption for condition encoder when set - examples/lora.json: nu-disco example with custom_tag crydamoure Made-with: Cursor --- examples/lora.json | 16 +++++++++++----- src/dit-graph.h | 48 ++++++++++++++++++++++++++++++++++++++++++++-- src/dit-lora.cpp | 20 +++++++++++-------- src/request.cpp | 13 +++++++++++++ src/request.h | 6 +++++- tools/dit-vae.cpp | 7 +++++-- 6 files changed, 92 insertions(+), 18 deletions(-) diff --git a/examples/lora.json b/examples/lora.json index 8317521..c872efb 100644 --- a/examples/lora.json +++ b/examples/lora.json @@ -1,11 +1,17 @@ { "task_type": "text2music", - "caption": "Emotional vocal track with soft synths", - "lyrics": "", - "duration": 10, + "caption": "An energetic nu-disco track built on a foundation of a tight, funky slap bassline and a crisp, four-on-the-floor drum machine beat. The song opens with a distinctive, filtered wah-wah guitar riff that serves as a recurring motif. The arrangement is layered with shimmering synth pads, punchy synth stabs, and subtle arpeggiated synth textures that add movement. 
The track progresses through dynamic sections, including a brief atmospheric breakdown before rebuilding the main groove.", + "genre": "Nu-disco", + "lyrics": "[Instrumental]", + "bpm": 115, + "keyscale": "C# major", + "timesignature": "4", + "duration": 256, + "language": "unknown", + "instrumental": true, + "custom_tag": "crydamoure", "inference_steps": 8, "guidance_scale": 1, "shift": 3, - "seed": 42, - "vocal_language": "en" + "seed": -1 } diff --git a/src/dit-graph.h b/src/dit-graph.h index ab5839e..1241bc5 100644 --- a/src/dit-graph.h +++ b/src/dit-graph.h @@ -187,10 +187,25 @@ static struct ggml_tensor * dit_ggml_build_self_attn( q = ggml_cont(ctx, ggml_view_3d(ctx, qkv, q_dim, S, N, qkv->nb[1], qkv->nb[2], 0)); k = ggml_cont(ctx, ggml_view_3d(ctx, qkv, kv_dim, S, N, qkv->nb[1], qkv->nb[2], (size_t)q_dim * qkv->nb[0])); v = ggml_cont(ctx, ggml_view_3d(ctx, qkv, kv_dim, S, N, qkv->nb[1], qkv->nb[2], (size_t)(q_dim + kv_dim) * qkv->nb[0])); + // LoRA on fused path: add scale * (B @ (A @ x)) per projection when adapters are loaded + if (lora_scale != 0.0f) { + if (ly->lora_sa_q_a && ly->lora_sa_q_b) + q = ggml_add(ctx, q, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_q_b, ggml_mul_mat(ctx, ly->lora_sa_q_a, norm_sa)), lora_scale)); + if (ly->lora_sa_k_a && ly->lora_sa_k_b) + k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_k_b, ggml_mul_mat(ctx, ly->lora_sa_k_a, norm_sa)), lora_scale)); + if (ly->lora_sa_v_a && ly->lora_sa_v_b) + v = ggml_add(ctx, v, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_v_b, ggml_mul_mat(ctx, ly->lora_sa_v_a, norm_sa)), lora_scale)); + } } else if (ly->sa_qk) { struct ggml_tensor * qk = dit_ggml_linear(ctx, ly->sa_qk, norm_sa); q = ggml_cont(ctx, ggml_view_3d(ctx, qk, q_dim, S, N, qk->nb[1], qk->nb[2], 0)); k = ggml_cont(ctx, ggml_view_3d(ctx, qk, kv_dim, S, N, qk->nb[1], qk->nb[2], (size_t)q_dim * qk->nb[0])); + if (lora_scale != 0.0f) { + if (ly->lora_sa_q_a && ly->lora_sa_q_b) + q = ggml_add(ctx, q, ggml_scale(ctx, 
ggml_mul_mat(ctx, ly->lora_sa_q_b, ggml_mul_mat(ctx, ly->lora_sa_q_a, norm_sa)), lora_scale)); + if (ly->lora_sa_k_a && ly->lora_sa_k_b) + k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_k_b, ggml_mul_mat(ctx, ly->lora_sa_k_a, norm_sa)), lora_scale)); + } v = dit_ggml_linear_lora(ctx, ly->sa_v_proj, ly->lora_sa_v_a, ly->lora_sa_v_b, lora_scale, norm_sa); } else { q = dit_ggml_linear_lora(ctx, ly->sa_q_proj, ly->lora_sa_q_a, ly->lora_sa_q_b, lora_scale, norm_sa); @@ -271,12 +286,25 @@ static struct ggml_tensor * dit_ggml_build_mlp( struct ggml_tensor * norm_ffn, int S) { + DiTGGMLConfig & c = m->cfg; + int I = c.intermediate_size; + int N = (int)norm_ffn->ne[2]; float lora_scale = m->lora_scale; struct ggml_tensor * ff; if (ly->gate_up) { // Fused: single matmul [H, 2*I] x [H, S, N] -> [2*I, S, N], then swiglu splits ne[0] struct ggml_tensor * gu = dit_ggml_linear(ctx, ly->gate_up, norm_ffn); - ff = ggml_swiglu(ctx, gu); + if (lora_scale != 0.0f && ((ly->lora_gate_a && ly->lora_gate_b) || (ly->lora_up_a && ly->lora_up_b))) { + struct ggml_tensor * gate = ggml_cont(ctx, ggml_view_3d(ctx, gu, I, S, N, gu->nb[1], gu->nb[2], 0)); + struct ggml_tensor * up = ggml_cont(ctx, ggml_view_3d(ctx, gu, I, S, N, gu->nb[1], gu->nb[2], (size_t)I * gu->nb[0])); + if (ly->lora_gate_a && ly->lora_gate_b) + gate = ggml_add(ctx, gate, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_gate_b, ggml_mul_mat(ctx, ly->lora_gate_a, norm_ffn)), lora_scale)); + if (ly->lora_up_a && ly->lora_up_b) + up = ggml_add(ctx, up, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_up_b, ggml_mul_mat(ctx, ly->lora_up_a, norm_ffn)), lora_scale)); + ff = ggml_swiglu_split(ctx, gate, up); + } else { + ff = ggml_swiglu(ctx, gu); + } } else { // Separate: two matmuls + split swiglu (with optional LoRA) struct ggml_tensor * gate = dit_ggml_linear_lora(ctx, ly->gate_proj, ly->lora_gate_a, ly->lora_gate_b, lora_scale, norm_ffn); @@ -311,6 +339,7 @@ static struct ggml_tensor * dit_ggml_build_cross_attn( // Q 
from hidden, KV from encoder (full fused, Q+KV partial, separate) int q_dim = Nh * D; int kv_dim = Nkv * D; + float lora_scale = m->lora_scale; struct ggml_tensor * q, * k, * v; if (ly->ca_qkv) { // Full QKV fused: split Q from hidden, KV from enc via weight views @@ -322,12 +351,27 @@ static struct ggml_tensor * dit_ggml_build_cross_attn( struct ggml_tensor * kv = ggml_mul_mat(ctx, w_kv, enc); k = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], 0)); v = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], (size_t)kv_dim * kv->nb[0])); + // LoRA on fused path: add scale * (B @ (A @ x)) for Q (from norm_ca), K/V (from enc) + if (lora_scale != 0.0f) { + if (ly->lora_ca_q_a && ly->lora_ca_q_b) + q = ggml_add(ctx, q, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_q_b, ggml_mul_mat(ctx, ly->lora_ca_q_a, norm_ca)), lora_scale)); + if (ly->lora_ca_k_a && ly->lora_ca_k_b) + k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_k_b, ggml_mul_mat(ctx, ly->lora_ca_k_a, enc)), lora_scale)); + if (ly->lora_ca_v_a && ly->lora_ca_v_b) + v = ggml_add(ctx, v, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_v_b, ggml_mul_mat(ctx, ly->lora_ca_v_a, enc)), lora_scale)); + } } else if (ly->ca_kv) { // Q separate, K+V fused - q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, m->lora_scale, norm_ca); + q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, lora_scale, norm_ca); struct ggml_tensor * kv = ggml_mul_mat(ctx, ly->ca_kv, enc); k = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], 0)); v = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], (size_t)kv_dim * kv->nb[0])); + if (lora_scale != 0.0f) { + if (ly->lora_ca_k_a && ly->lora_ca_k_b) + k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_k_b, ggml_mul_mat(ctx, ly->lora_ca_k_a, enc)), lora_scale)); + if (ly->lora_ca_v_a && ly->lora_ca_v_b) + v = 
ggml_add(ctx, v, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_v_b, ggml_mul_mat(ctx, ly->lora_ca_v_a, enc)), lora_scale)); + } } else { q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, m->lora_scale, norm_ca); k = dit_ggml_linear_lora(ctx, ly->ca_k_proj, ly->lora_ca_k_a, ly->lora_ca_k_b, m->lora_scale, enc); diff --git a/src/dit-lora.cpp b/src/dit-lora.cpp index 881d941..b14e090 100644 --- a/src/dit-lora.cpp +++ b/src/dit-lora.cpp @@ -9,7 +9,7 @@ #include // Normalize adapter key to base name: decoder.layers.N. -// e.g. "base_model.model.model.decoder.layers.0.self_attn.q_proj.lora_A.default" -> "decoder.layers.0.self_attn.q_proj" +// Handles: base_model.model.model., base_model.model.; decoder.layers. or layers.; .lora_A.default/.lora_B.default or .lora_A.weight/.lora_B.weight static std::string lora_key_to_base(const std::string & key) { std::string s = key; const char * prefixes[] = { "base_model.model.model.", "base_model.model." }; @@ -20,14 +20,22 @@ static std::string lora_key_to_base(const std::string & key) { break; } } + // PEFT-style suffix if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_A.default") == 0) s = s.substr(0, s.size() - 14); else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_B.default") == 0) s = s.substr(0, s.size() - 14); + else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_A.weight") == 0) + s = s.substr(0, s.size() - 14); + else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_B.weight") == 0) + s = s.substr(0, s.size() - 14); else if (s.size() > 7 && s.compare(s.size() - 7, 7, ".lora_A") == 0) s = s.substr(0, s.size() - 7); else if (s.size() > 7 && s.compare(s.size() - 7, 7, ".lora_B") == 0) s = s.substr(0, s.size() - 7); + // HuggingFace adapter: layers.N -> decoder.layers.N for our DiT naming + if (s.size() >= 7 && s.compare(0, 7, "layers.") == 0) + s = "decoder." 
+ s; return s; } @@ -80,17 +88,13 @@ bool dit_ggml_load_lora(DiTGGML * m, const char * lora_path, float scale) { fprintf(stderr, "[LoRA] cannot open %s\n", lora_path); return false; } - uint8_t h8[8]; - if (fread(h8, 1, 8, fp) != 8) { + std::unordered_map tensors; + if (fseek(fp, 0, SEEK_SET) != 0) { fclose(fp); return false; } - uint64_t header_len = (uint64_t)h8[0] | ((uint64_t)h8[1] << 8) | ((uint64_t)h8[2] << 16) | ((uint64_t)h8[3] << 24) - | ((uint64_t)h8[4] << 32) | ((uint64_t)h8[5] << 40) | ((uint64_t)h8[6] << 48) | ((uint64_t)h8[7] << 56); - uint64_t data_section_start = 8 + header_len; - - std::unordered_map tensors; int n = safetensors_parse_lora(fp, &tensors); + uint64_t data_section_start = (uint64_t)ftell(fp); if (n == 0) { fclose(fp); fprintf(stderr, "[LoRA] no LoRA tensors found in %s\n", lora_path); diff --git a/src/request.cpp b/src/request.cpp index a24d838..f85873a 100644 --- a/src/request.cpp +++ b/src/request.cpp @@ -16,6 +16,8 @@ void request_init(AceRequest * r) { r->caption = ""; r->lyrics = ""; r->instrumental = false; + r->custom_tag = ""; + r->genre = ""; r->bpm = 0; r->duration = -1.0f; r->keyscale = ""; @@ -227,7 +229,11 @@ bool request_parse(AceRequest * r, const char * path) { if (k == "task_type") r->task_type = v; else if (k == "caption") r->caption = v; else if (k == "lyrics") r->lyrics = v; + else if (k == "custom_tag") r->custom_tag = v; + else if (k == "genre") r->genre = v; else if (k == "keyscale") r->keyscale = v; + else if (k == "formatted_lyrics") r->lyrics = v; // alias for lyrics + else if (k == "language") r->vocal_language = v; // alias for vocal_language else if (k == "timesignature") r->timesignature = v; else if (k == "vocal_language") r->vocal_language = v; else if (k == "reference_audio") r->reference_audio = v; @@ -254,6 +260,7 @@ bool request_parse(AceRequest * r, const char * path) { // bools else if (k == "instrumental") r->instrumental = (v == "true"); + else if (k == "is_instrumental") r->instrumental = (v == 
"true"); // unknown keys: silently ignored (forward compat) } @@ -274,6 +281,10 @@ bool request_write(const AceRequest * r, const char * path) { fprintf(f, " \"lyrics\": \"%s\",\n", json_escape(r->lyrics).c_str()); if (r->instrumental) fprintf(f, " \"instrumental\": true,\n"); + if (!r->custom_tag.empty()) + fprintf(f, " \"custom_tag\": \"%s\",\n", json_escape(r->custom_tag).c_str()); + if (!r->genre.empty()) + fprintf(f, " \"genre\": \"%s\",\n", json_escape(r->genre).c_str()); fprintf(f, " \"bpm\": %d,\n", r->bpm); fprintf(f, " \"duration\": %.1f,\n", r->duration); fprintf(f, " \"keyscale\": \"%s\",\n", json_escape(r->keyscale).c_str()); @@ -310,6 +321,8 @@ void request_dump(const AceRequest * r, FILE * f) { fprintf(f, " caption: %.60s%s\n", r->caption.c_str(), r->caption.size() > 60 ? "..." : ""); fprintf(f, " lyrics: %zu bytes\n", r->lyrics.size()); + if (!r->custom_tag.empty()) + fprintf(f, " custom_tag: %s\n", r->custom_tag.c_str()); fprintf(f, " bpm=%d dur=%.0f key=%s ts=%s lang=%s\n", r->bpm, r->duration, r->keyscale.c_str(), r->timesignature.c_str(), r->vocal_language.c_str()); diff --git a/src/request.h b/src/request.h index e9222a0..ef4b41f 100644 --- a/src/request.h +++ b/src/request.h @@ -19,7 +19,11 @@ struct AceRequest { // text content std::string caption; // "" std::string lyrics; // "" - bool instrumental; // false + bool instrumental; // false + + // LoRA / style trigger (appended to caption for condition encoder when set) + std::string custom_tag; // "" e.g. "crydamoure" + std::string genre; // "" e.g. 
"Nu-disco" // metadata (user-provided or LLM-enriched) int bpm; // 0 = unset diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp index d889da1..fd5fe47 100644 --- a/tools/dit-vae.cpp +++ b/tools/dit-vae.cpp @@ -243,8 +243,11 @@ int main(int argc, char ** argv) { continue; } - // Extract params - const char * caption = req.caption.c_str(); + // Extract params (append custom_tag to caption for LoRA/condition so trigger is in text) + std::string caption_for_cond = req.caption; + if (!req.custom_tag.empty()) + caption_for_cond += ", " + req.custom_tag; + const char * caption = caption_for_cond.c_str(); const char * lyrics = req.lyrics.c_str(); char bpm_str[16] = "N/A"; if (req.bpm > 0) snprintf(bpm_str, sizeof(bpm_str), "%d", req.bpm); From acd84020a3f43a0ebfd0dbec323dca7f2781cafa Mon Sep 17 00:00:00 2001 From: Lorenzo Mangani Date: Sat, 28 Feb 2026 23:23:08 +0100 Subject: [PATCH 09/17] Fix formatting and clarify LoRA adapter instructions --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9ad2a2b..d809146 100644 --- a/README.md +++ b/README.md @@ -146,13 +146,13 @@ cd examples ./cover.sh # cover mode: decode precomputed audio_codes (no LLM) ./cover-reference.sh # cover + reference_audio for timbre (WAV/MP3; needs reference.wav or .mp3) ./test-reference.sh # reference_audio (WAV or MP3) + audio_cover_strength -./lora.sh # DiT + LoRA adapter (e.g. duckdbot/acestep-lora-cryda; put adapter in examples/lora/) +./lora.sh # DiT + LoRA adapter ``` Each example has a `-sft` variant (SFT model, 50 steps, CFG 7.0) alongside the turbo default (8 steps, no CFG). For **reference timbre**, set `reference_audio` to a **WAV or MP3** path; dit-vae loads it (MP3 decoded in memory via header-only minimp3, no temp files), encodes with the VAE encoder (requires a full VAE GGUF that includes encoder weights). -**LoRA adapters**: use `--lora ` and optional `--lora-scale ` with dit-vae to run the DiT with a PEFT-style LoRA (e.g. 
[duckdbot/acestep-lora-cryda](https://huggingface.co/duckdbot/acestep-lora-cryda)). Adapter must be `adapter_model.safetensors` (safetensors with `lora_A` / `lora_B` keys matching `decoder.layers.*`). Put the file in `examples/lora/` and run `./lora.sh`, or pass the path explicitly. +**LoRA adapters**: use `--lora ` and optional `--lora-scale ` with dit-vae to run the DiT with PEFT-style Ace-Step LoRAs. ## Generation modes @@ -214,7 +214,7 @@ All fields with defaults. Only `caption` is required. Built-in modes (text2music Key fields: `seed` -1 means random (resolved once, then +1 per batch element). `audio_codes` is generated by ace-qwen3 and consumed by dit-vae (comma separated FSQ token IDs). When present, the LLM is -skipped entirely (cover-style generation). `reference_audio`: path to a **WAV or MP3** file for global timbre/style (MP3 decoded in memory; encoded via built-in VAE encoder; requires VAE GGUF with encoder weights). `src_audio`: not yet implemented (see docs/MODES.md). +skipped entirely (cover-style generation). `reference_audio`: path to a **WAV or MP3** file for global timbre/style. `src_audio`: not yet implemented (see docs/MODES.md). Turbo preset: `inference_steps=8, shift=3.0` (no guidance_scale, turbo models don't use CFG). SFT preset: `inference_steps=50, guidance_scale=4.0, shift=6.0`. From 9b087b2ee191ecfce4c97bbb09cdae3cb93a7aa7 Mon Sep 17 00:00:00 2001 From: Lorenzo Mangani Date: Sat, 28 Feb 2026 23:24:06 +0100 Subject: [PATCH 10/17] Remove download instructions for LoRA adapter Removed instructions for downloading LoRA adapter from Hugging Face. --- examples/lora.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/examples/lora.sh b/examples/lora.sh index 9c25d33..db7ce2b 100755 --- a/examples/lora.sh +++ b/examples/lora.sh @@ -7,10 +7,6 @@ cd "$(dirname "$0")" ADAPTER="lora/adapter_model.safetensors" if [ ! -f "$ADAPTER" ]; then echo "LoRA adapter not found at $ADAPTER" - echo "Download once (e.g. 
from Hugging Face):" - echo " mkdir -p lora" - echo " curl -L -o $ADAPTER 'https://huggingface.co/duckdbot/acestep-lora-cryda/resolve/main/adapter_model.safetensors'" - echo "Or: pip install hf && huggingface-cli download duckdbot/acestep-lora-cryda adapter_model.safetensors --local-dir lora" exit 1 fi From fc2408ae7cda0e0fdd5b5fb59318d4db88545f69 Mon Sep 17 00:00:00 2001 From: Lorenzo Mangani Date: Sat, 28 Feb 2026 23:24:56 +0100 Subject: [PATCH 11/17] Update comments for custom_tag and genre fields --- src/request.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/request.h b/src/request.h index ef4b41f..ba85821 100644 --- a/src/request.h +++ b/src/request.h @@ -22,8 +22,8 @@ struct AceRequest { bool instrumental; // false // LoRA / style trigger (appended to caption for condition encoder when set) - std::string custom_tag; // "" e.g. "crydamoure" - std::string genre; // "" e.g. "Nu-disco" + std::string custom_tag; // "" LoRA trigger word + std::string genre; // "" LoRA genre // metadata (user-provided or LLM-enriched) int bpm; // 0 = unset From e860c79d45ce7329dd7f6557ae2ea44a337967cf Mon Sep 17 00:00:00 2001 From: qxip Date: Sat, 28 Feb 2026 23:46:24 +0100 Subject: [PATCH 12/17] Cover from file (src_audio), docs, README strength clarification - src_audio: load WAV/MP3, VAE encode, FSQ nearest-codeword encode to codes (fsq-detok.h: codeword table + latent_frames_to_codes; dit-vae: wire path) - reference_audio + cover (audio_codes/src_audio) fully supported without Python - MODES.md: cover and reference_audio marked supported; request table updated - README: clarify audio_cover_strength vs guidance_scale vs reference_audio (audio_cover_strength = cover blend; reference_audio = no strength knob; guidance_scale = DiT CFG, separate) Made-with: Cursor --- README.md | 7 ++++++- docs/MODES.md | 38 +++++++++++--------------------------- src/fsq-detok.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ tools/dit-vae.cpp | 33 
++++++++++++++++++++++++++++++++- 4 files changed, 94 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index d809146..8ad6ebf 100644 --- a/README.md +++ b/README.md @@ -214,7 +214,12 @@ All fields with defaults. Only `caption` is required. Built-in modes (text2music Key fields: `seed` -1 means random (resolved once, then +1 per batch element). `audio_codes` is generated by ace-qwen3 and consumed by dit-vae (comma separated FSQ token IDs). When present, the LLM is -skipped entirely (cover-style generation). `reference_audio`: path to a **WAV or MP3** file for global timbre/style. `src_audio`: not yet implemented (see docs/MODES.md). +skipped entirely (cover-style generation). `reference_audio`: path to a **WAV or MP3** file for global timbre/style (MP3 decoded in memory; encoded via built-in VAE encoder; requires VAE GGUF with encoder weights). `src_audio`: path to a **WAV or MP3** for cover source; dit-vae encodes it (VAE + FSQ nearest-codeword) to codes internally, no Python required (see docs/MODES.md). + +**Reference and cover strength (not the same as guidance_scale):** +- **`audio_cover_strength`** (0.0–1.0): Controls how strongly the **cover/source** (from `audio_codes` or `src_audio`) influences the DiT context. The context is blended with silence: `(1 - audio_cover_strength)*silence + audio_cover_strength*decoded`. Use 1.0 for full cover influence, lower values to soften it. Only applies when cover context is present. +- **`reference_audio`**: Timbre from the reference file is applied at full strength; there is no separate strength parameter for reference timbre. +- **`guidance_scale`**: This is **DiT classifier-free guidance** (conditioned vs unconditioned prediction), not reference or cover strength. Turbo models ignore it (forced to 1.0). Turbo preset: `inference_steps=8, shift=3.0` (no guidance_scale, turbo models don't use CFG). SFT preset: `inference_steps=50, guidance_scale=4.0, shift=6.0`. 
diff --git a/docs/MODES.md b/docs/MODES.md index 4149ae7..ae0b616 100644 --- a/docs/MODES.md +++ b/docs/MODES.md @@ -7,7 +7,7 @@ This document maps the [ACE-Step 1.5 Tutorial](https://github.com/ace-step/ACE-S | `task_type` | Description | Turbo/SFT | Base only | C++ status | |---------------|-------------|-----------|-----------|------------| | **text2music** | Generate from caption/lyrics (and optional reference) | ✅ | — | ✅ **Supported** | -| **cover** | Re-synthesize with structure from source; optional timbre from reference | ✅ | — | ⚠️ **Partial** (see below) | +| **cover** | Re-synthesize with structure from source; optional timbre from reference | ✅ | — | ✅ **Supported** (audio_codes or src_audio WAV/MP3) | | **repaint** | Local edit in time range using source as context | ✅ | — | ❌ Not implemented | | **lego** | Add new tracks to existing audio | — | ✅ | ❌ Base model only | | **extract** | Extract single track from mix | — | ✅ | ❌ Base model only | @@ -22,30 +22,16 @@ We only ship Turbo and SFT DiT weights; **lego**, **extract**, **complete** requ ### text2music (default) - **Input**: `caption`, optional `lyrics`, metadata (bpm, duration, keyscale, …). - **Flow**: LM (optional) → CoT + audio codes → DiT (context = silence) → VAE → WAV. -- **Timbre**: Always uses built-in silence latent from the DiT GGUF (no user reference yet). - -### cover (when `audio_codes` are provided) -- **Input**: Same as text2music, plus **precomputed** `audio_codes` (e.g. from a previous run or from Python). -- **Flow**: Skip LM; decode `audio_codes` to latents → DiT context = decoded + silence padding → DiT → VAE → WAV. -- **Limitation**: We do **not** convert a WAV file into `audio_codes`. So “cover from a file” is only possible if you already have codes (e.g. from Python or from a prior `ace-qwen3` run). The request fields `reference_audio` and `src_audio` are accepted in JSON but **not yet used** in the pipeline. 
+- **Timbre**: Optional **reference_audio** (WAV/MP3) → VAE encode → CondEncoder timbre; else built-in silence. +### cover (when `audio_codes` or `src_audio` are provided) +- **Input**: Same as text2music, plus either **precomputed** `audio_codes` or **`src_audio`** (WAV/MP3 path). Optional **reference_audio** for timbre. +- **Flow**: If `src_audio` set and no `audio_codes`: load WAV/MP3 → VAE encode → FSQ nearest-codeword encode → codes. Then decode codes to latents → DiT context (blend with silence) → DiT → VAE → WAV. No Python. +- **reference_audio** and **audio_cover_strength**: Implemented (timbre; blend). --- ## What’s not implemented yet -### reference_audio (global timbre/style) -- **Tutorial**: Load WAV → stereo 48 kHz, pad/repeat to ≥30 s → **VAE encode** → latents → feed as timbre condition into DiT. -- **C++**: Implemented. Set `reference_audio` to a **WAV or MP3 file path**. dit-vae loads the file (WAV: any sample rate resampled to 48 kHz; MP3: decoded in memory via header-only minimp3, no temp files, then resampled to 48 kHz if needed), runs the **VAE encoder** (Oobleck, in C++ in `vae.h`), and feeds the 64-d latents to the CondEncoder timbre path. No Python, no external deps. Requires a **full VAE GGUF** that includes `encoder.*` tensors (decoder-only GGUFs will print a clear error). -- **audio_cover_strength** (0.0–1.0): Implemented. When `audio_codes` are present, context latents are blended with silence: `(1 - strength)*silence + strength*decoded`. - -### src_audio (Cover from file) -- **Tutorial**: Source audio is converted to **semantic codes** (melody, rhythm, chords, etc.); then DiT uses those as in cover mode. -- **C++**: That implies **audio → codes**. Likely path: WAV → VAE encode → **FSQ tokenizer** (latents → 5 Hz codes). We have the **FSQ detokenizer** (codes → latents); the tokenizer (encode) side would need to be added. Then: `src_audio` path → load WAV → VAE encode → FSQ encode → `audio_codes` → existing cover path. 
- -### audio_cover_strength -- **Tutorial**: 0.0–1.0, how strongly generation follows reference/codes. -- **C++**: Field is in the request and parsed; no blending logic in the DiT/context path yet. - ### repaint - **Tutorial**: Specify `repainting_start` / `repainting_end` (seconds); model uses source audio as context and only generates in that interval (3–90 s). - **C++**: Would require **masked diffusion**: context carries “given” frames; ODE only updates the repaint region. DiT’s context has a 64-channel “mask” that we currently set to 1.0; repaint would set mask per frame and the generation loop would only update unmasked frames. Not implemented. @@ -60,9 +46,9 @@ All of these are in `AceRequest` and parsed from / written to JSON. Backend beha |-------|------|--------| | `task_type` | string | `"text2music"` \| `"cover"` \| `"repaint"` \| … | | `reference_audio` | string | Path to WAV or MP3 for timbre (implemented) | -| `src_audio` | string | Path to WAV for cover/repaint source (not used yet) | -| `audio_codes` | string | Comma-separated FSQ codes; non-empty ⇒ cover path | -| `audio_cover_strength` | float | 0.0–1.0 (parsed, not used yet) | +| `src_audio` | string | Path to WAV or MP3 for cover source; encoded to codes internally (implemented) | +| `audio_codes` | string | Comma-separated FSQ codes; non-empty ⇒ cover path (or from `src_audio`) | +| `audio_cover_strength` | float | 0.0–1.0 blend of decoded context with silence (implemented) | | `repainting_start` | float | Start time (s) for repaint (not used yet) | | `repainting_end` | float | End time (s) for repaint (not used yet) | @@ -72,8 +58,6 @@ See `request.h` and the README “Request JSON reference” for the full list. ## Summary -- **Fully supported**: text2music; cover when you supply **precomputed** `audio_codes`. -- **Schema only** (no backend): `task_type`, `reference_audio`, `src_audio`, `audio_cover_strength`, `repainting_start`/`repainting_end`. 
-- **To support reference_audio**: add VAE encoder, then feed its output into the existing CondEncoder timbre path. -- **To support cover from file**: add VAE encoder + FSQ tokenizer (or equivalent audio→codes), then reuse existing cover path. +- **Fully supported**: text2music (with optional reference_audio for timbre); cover from **precomputed** `audio_codes` or from **WAV/MP3** via `src_audio` (VAE encode + FSQ nearest-codeword encode); reference_audio (timbre); audio_cover_strength (blend). +- **Schema only** (no backend): `repainting_start`/`repainting_end`. - **To support repaint**: implement masked DiT generation (context mask + ODE only on repaint interval). diff --git a/src/fsq-detok.h b/src/fsq-detok.h index 0d4e33c..29eef5f 100644 --- a/src/fsq-detok.h +++ b/src/fsq-detok.h @@ -10,10 +10,13 @@ #pragma once #include "qwen3-enc.h" +#include // FSQ constants static const int FSQ_NDIMS = 6; static const int FSQ_LEVELS[6] = {8, 8, 8, 5, 5, 5}; +static const int FSQ_N_CODES = 8 * 8 * 8 * 5 * 5 * 5; // 8000 +static const int FSQ_FRAMES_PER_CODE = 5; // FSQ decode: integer index -> 6 normalized float values // Each dimension: level_idx / ((L-1)/2) - 1.0 (maps to [-1, 1]) @@ -214,6 +217,48 @@ static int detok_ggml_decode(DetokGGML * m, const int * codes, int T_5Hz, return T_25Hz; } +// Build codeword table for latent->code (cover from file): for each code 0..FSQ_N_CODES-1, +// decode to 5*64 floats. table_out must be at least FSQ_N_CODES * FSQ_FRAMES_PER_CODE * 64 floats. +static void detok_ggml_build_codeword_table(DetokGGML * m, float * table_out) { + const int chunk = FSQ_FRAMES_PER_CODE * 64; + for (int i = 0; i < FSQ_N_CODES; i++) { + int n = detok_ggml_decode(m, &i, 1, table_out + (size_t)i * chunk); + (void)n; + } +} + +// Encode latent frames to 5Hz codes by nearest codeword. T_latent = number of 25Hz frames (64-d each). +// Groups frames in chunks of 5; for each chunk finds the code whose codeword minimizes L2 distance. 
+// codeword_table from detok_ggml_build_codeword_table (FSQ_N_CODES * 5 * 64 floats). +// Pads last chunk with zeros if T_latent not divisible by 5. +static void latent_frames_to_codes(int T_latent, const float * latent_64d, + const float * codeword_table, + std::vector * out_codes) { + out_codes->clear(); + const int chunk_frames = FSQ_FRAMES_PER_CODE; + const int chunk_size = chunk_frames * 64; + int n_chunks = T_latent / chunk_frames; + if (n_chunks <= 0) return; + for (int g = 0; g < n_chunks; g++) { + const float * chunk = latent_64d + (size_t)g * chunk_size; + int best = 0; + float best_d2 = 1e30f; + for (int i = 0; i < FSQ_N_CODES; i++) { + const float * cw = codeword_table + (size_t)i * chunk_size; + float d2 = 0.0f; + for (int j = 0; j < chunk_size; j++) { + float d = chunk[j] - cw[j]; + d2 += d * d; + } + if (d2 < best_d2) { + best_d2 = d2; + best = i; + } + } + out_codes->push_back(best); + } +} + // Free static void detok_ggml_free(DetokGGML * m) { if (m->sched) ggml_backend_sched_free(m->sched); diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp index 7b5f188..bfe274d 100644 --- a/tools/dit-vae.cpp +++ b/tools/dit-vae.cpp @@ -277,8 +277,39 @@ int main(int argc, char ** argv) { fprintf(stderr, "[Pipeline] seed=%lld, steps=%d, guidance=%.1f, shift=%.1f, duration=%.1fs\n", seed, num_steps, guidance_scale, shift, duration); - // Parse audio codes from request + // Parse audio codes from request (or produce from src_audio WAV/MP3) std::vector codes_vec = parse_codes_string(req.audio_codes); + if (codes_vec.empty() && !req.src_audio.empty() && have_vae) { + const std::string & src_path = req.src_audio; + std::vector wav_stereo; + int n_samples = load_audio_48k_stereo(src_path.c_str(), &wav_stereo); + if (n_samples > 0) { + int T_audio = n_samples; + if (T_audio >= 1920) { + VAEEncoderGGML enc = {}; + if (vae_encoder_load(&enc, vae_gguf)) { + size_t max_lat = (size_t)(T_audio / 2048) + 1; + std::vector enc_out(max_lat * 64); + int T_lat = 
vae_encoder_forward(&enc, wav_stereo.data(), T_audio, enc_out.data()); + vae_encoder_free(&enc); + if (T_lat >= FSQ_FRAMES_PER_CODE) { + DetokGGML detok = {}; + if (detok_ggml_load(&detok, dit_gguf, model.backend, model.cpu_backend)) { + std::vector codeword_table((size_t)FSQ_N_CODES * FSQ_FRAMES_PER_CODE * 64); + fprintf(stderr, "[Cover] building FSQ codeword table (8000 codes)...\n"); + detok_ggml_build_codeword_table(&detok, codeword_table.data()); + latent_frames_to_codes(T_lat, enc_out.data(), codeword_table.data(), &codes_vec); + fprintf(stderr, "[Cover] encoded %s -> %zu codes (%.1fs @ 5Hz)\n", + src_path.c_str(), codes_vec.size(), (float)codes_vec.size() / 5.0f); + detok_ggml_free(&detok); + } + } + } + } + } else { + fprintf(stderr, "[Cover] WARNING: cannot load src_audio %s (use .wav or .mp3), skipping cover-from-file\n", src_path.c_str()); + } + } if (!codes_vec.empty()) fprintf(stderr, "[Pipeline] %zu audio codes (%.1fs @ 5Hz)\n", codes_vec.size(), (float)codes_vec.size() / 5.0f); From d4d3e3b6df669a9da06f7f230597979cbd8f5842 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Mar 2026 22:39:35 +0000 Subject: [PATCH 13/17] Initial plan From b237e8e285b2f12929b6ab2c8720ffc6bbb8de49 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Mar 2026 22:58:27 +0000 Subject: [PATCH 14/17] Resolve all 16 merge conflicts: add upstream features, preserve fork additions Co-authored-by: lmangani <1423657+lmangani@users.noreply.github.com> --- _codeql_detected_source_root | 1 + 1 file changed, 1 insertion(+) create mode 120000 _codeql_detected_source_root diff --git a/_codeql_detected_source_root b/_codeql_detected_source_root new file mode 120000 index 0000000..945c9b4 --- /dev/null +++ b/_codeql_detected_source_root @@ -0,0 +1 @@ +. 
\ No newline at end of file From 8ddc5d2ac8975dcc30f5551ca09f7ccaee803703 Mon Sep 17 00:00:00 2001 From: Lorenzo Mangani Date: Mon, 2 Mar 2026 00:04:05 +0100 Subject: [PATCH 15/17] Delete _codeql_detected_source_root --- _codeql_detected_source_root | 1 - 1 file changed, 1 deletion(-) delete mode 120000 _codeql_detected_source_root diff --git a/_codeql_detected_source_root b/_codeql_detected_source_root deleted file mode 120000 index 945c9b4..0000000 --- a/_codeql_detected_source_root +++ /dev/null @@ -1 +0,0 @@ -. \ No newline at end of file From 71695c517324b6f6aa49e89a3e10fef906dfabf3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 7 Mar 2026 10:05:16 +0000 Subject: [PATCH 16/17] Initial plan From bbe369fa15c5743cb259ebd3a00e2f0d218d783e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 7 Mar 2026 10:09:04 +0000 Subject: [PATCH 17/17] Reset master to upstream (ServeurpersoCom/acestep.cpp@aeff0b8) --- .github/workflows/build.yml | 44 - .github/workflows/test-generation.yml | 61 - .gitignore | 5 - CMakeLists.txt | 17 +- README.md | 267 +++- build.sh | 8 - docs/MODES.md | 63 - examples/cover-reference.json | 16 - examples/cover-reference.sh | 20 - examples/cover.json | 15 - examples/cover.sh | 15 - examples/lora.json | 17 - examples/lora.sh | 27 - examples/request-reference.json | 13 - examples/run-examples-ci.sh | 47 - examples/test-reference.sh | 21 - src/audio.h | 17 - src/audio_loader.cpp | 117 -- src/backend.h | 16 +- src/cond-enc.h | 5 +- src/debug.h | 2 - src/dit-graph.h | 96 +- src/dit-lora.cpp | 202 --- src/dit-sampler.h | 4 - src/dit.h | 24 - src/fsq-detok.h | 49 +- src/gguf-weights.h | 1 - src/metadata-fsm.h | 2 - src/prompt.h | 1 - src/qwen3-enc.h | 11 +- src/qwen3-lm.h | 13 +- src/request.cpp | 62 +- src/request.h | 27 +- src/safetensors.h | 107 -- src/vae-enc.h | 391 +++++ src/vae.h | 207 +-- src/wav.h | 101 -- tests/BF16.log | 
130 -- tests/CPU-BF16.log | 113 +- tests/CPU-Q4_K_M.log | 115 +- tests/CPU-Q5_K_M.log | 113 +- tests/CPU-Q6_K.log | 113 +- tests/CPU-Q8_0.log | 113 +- tests/CPU_BF16.log | 130 -- tests/CPU_Q4_K_M.log | 130 -- tests/CPU_Q5_K_M.log | 130 -- tests/CPU_Q6_K.log | 130 -- tests/CPU_Q8_0.log | 130 -- tests/CUDA-BF16.log | 115 +- tests/CUDA-Q4_K_M.log | 115 +- tests/CUDA-Q5_K_M.log | 113 +- tests/CUDA-Q6_K.log | 115 +- tests/CUDA-Q8_0.log | 113 +- tests/Metal_Q4_K_M.log | 835 ---------- tests/Metal_Q5_K_M.log | 835 ---------- tests/Metal_Q6_K.log | 819 ---------- tests/Metal_Q8_0.log | 823 ---------- tests/Q4_K_M.log | 130 -- tests/Q5_K_M.log | 130 -- tests/Q6_K.log | 130 -- tests/Q8_0.log | 130 -- tests/Vulkan-BF16.log | 229 ++- tests/Vulkan-Q4_K_M.log | 185 ++- tests/Vulkan-Q5_K_M.log | 175 +- tests/Vulkan-Q6_K.log | 229 ++- tests/Vulkan-Q8_0.log | 227 ++- tests/Vulkan_BF16.log | 130 -- tests/Vulkan_Q4_K_M.log | 130 -- tests/Vulkan_Q5_K_M.log | 130 -- tests/Vulkan_Q6_K.log | 130 -- tests/Vulkan_Q8_0.log | 130 -- tests/fixtures/ci-cover.json | 12 - tests/fixtures/ci-dit-only.json | 11 - tests/fixtures/ci-full.json | 11 - tests/fixtures/ci-partial.json | 8 - tests/fixtures/ci-request-reference.json | 13 - tests/fixtures/ci-text2music.json | 11 - tests/run-generation-tests.sh | 83 - third_party/minimp3.h | 1865 ---------------------- tools/ace-qwen3.cpp | 3 +- tools/dit-vae.cpp | 136 +- tools/neural-codec.cpp | 522 ++++++ tools/quantize.cpp | 1 - 83 files changed, 2313 insertions(+), 9919 deletions(-) delete mode 100644 .github/workflows/build.yml delete mode 100644 .github/workflows/test-generation.yml delete mode 100755 build.sh delete mode 100644 docs/MODES.md delete mode 100644 examples/cover-reference.json delete mode 100755 examples/cover-reference.sh delete mode 100644 examples/cover.json delete mode 100755 examples/cover.sh delete mode 100644 examples/lora.json delete mode 100755 examples/lora.sh delete mode 100644 examples/request-reference.json delete mode 100755 
examples/run-examples-ci.sh delete mode 100755 examples/test-reference.sh delete mode 100644 src/audio.h delete mode 100644 src/audio_loader.cpp delete mode 100644 src/dit-lora.cpp delete mode 100644 src/safetensors.h create mode 100644 src/vae-enc.h delete mode 100644 src/wav.h delete mode 100644 tests/BF16.log delete mode 100644 tests/CPU_BF16.log delete mode 100644 tests/CPU_Q4_K_M.log delete mode 100644 tests/CPU_Q5_K_M.log delete mode 100644 tests/CPU_Q6_K.log delete mode 100644 tests/CPU_Q8_0.log delete mode 100644 tests/Metal_Q4_K_M.log delete mode 100644 tests/Metal_Q5_K_M.log delete mode 100644 tests/Metal_Q6_K.log delete mode 100644 tests/Metal_Q8_0.log delete mode 100644 tests/Q4_K_M.log delete mode 100644 tests/Q5_K_M.log delete mode 100644 tests/Q6_K.log delete mode 100644 tests/Q8_0.log delete mode 100644 tests/Vulkan_BF16.log delete mode 100644 tests/Vulkan_Q4_K_M.log delete mode 100644 tests/Vulkan_Q5_K_M.log delete mode 100644 tests/Vulkan_Q6_K.log delete mode 100644 tests/Vulkan_Q8_0.log delete mode 100644 tests/fixtures/ci-cover.json delete mode 100644 tests/fixtures/ci-dit-only.json delete mode 100644 tests/fixtures/ci-full.json delete mode 100644 tests/fixtures/ci-partial.json delete mode 100644 tests/fixtures/ci-request-reference.json delete mode 100644 tests/fixtures/ci-text2music.json delete mode 100755 tests/run-generation-tests.sh delete mode 100644 third_party/minimp3.h create mode 100644 tools/neural-codec.cpp diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index fe17246..0000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,44 +0,0 @@ -# Validate that the project builds on Ubuntu and macOS (no model download). 
-name: Build - -on: - push: - branches: [main, master] - pull_request: - branches: [main, master] - -jobs: - build: - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest, macos-latest] - - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Build (Ubuntu) - if: matrix.os == 'ubuntu-latest' - run: | - sudo apt-get update -qq - sudo apt-get install -y -qq cmake build-essential pkg-config libopenblas-dev - mkdir build && cd build - cmake .. -DGGML_BLAS=ON - cmake --build . --config Release -j$(nproc) - - - name: Build (macOS) - if: matrix.os == 'macos-latest' - run: | - mkdir build && cd build - cmake .. - cmake --build . --config Release -j$(sysctl -n hw.ncpu) - - - name: Smoke test - run: | - ./build/ace-qwen3 --help 2>&1 | head -5 - ./build/dit-vae --help 2>&1 | head -5 - ./build/quantize --help 2>&1 | head -3 diff --git a/.github/workflows/test-generation.yml b/.github/workflows/test-generation.yml deleted file mode 100644 index 2f84eed..0000000 --- a/.github/workflows/test-generation.yml +++ /dev/null @@ -1,61 +0,0 @@ -# Build, download models (cached), and run all example scripts with short CI fixtures. -# Validate locally: from repo root run ./examples/run-examples-ci.sh (after build + ./models.sh). -# Trigger: release (published) or workflow_dispatch only. -name: Test generation - -on: - workflow_dispatch: {} - release: - types: [published] - -jobs: - build-and-test: - runs-on: ubuntu-latest - timeout-minutes: 60 - - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Build - run: | - sudo apt-get update -qq - sudo apt-get install -y -qq cmake build-essential pkg-config libopenblas-dev - mkdir build && cd build - cmake .. -DGGML_BLAS=ON - cmake --build . 
--config Release -j$(nproc) - - # Restore/save model artifacts (see https://docs.github.com/en/actions/reference/workflows-and-actions/dependency-caching) - - name: Cache models - id: cache-models - uses: actions/cache@v4 - with: - path: models - key: ${{ runner.os }}-acestep-models-q8-${{ hashFiles('models.sh') }} - restore-keys: | - ${{ runner.os }}-acestep-models-q8- - ${{ runner.os }}-acestep-models- - - - name: Download models - if: steps.cache-models.outputs.cache-hit != 'true' - run: | - pip install -q hf - ./models.sh - - - name: Run examples (CI fixtures, short duration) - run: chmod +x examples/run-examples-ci.sh && ./examples/run-examples-ci.sh - - - name: Upload generated audio - uses: actions/upload-artifact@v4 - with: - name: generated-audio - path: | - examples/dit-only0.wav - examples/cover0.wav - examples/cover-reference0.wav - examples/request-reference0.wav - examples/simple00.wav - examples/partial00.wav - examples/full00.wav diff --git a/.gitignore b/.gitignore index 91ddabb..4ccb4f4 100644 --- a/.gitignore +++ b/.gitignore @@ -3,12 +3,7 @@ build/ *.bf16 tests/*/ -!tests/fixtures/ -!tests/fixtures/*.json -request.json -request0.json -tests/fixtures/*0.json checkpoints/ models/ __pycache__/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 99903d6..d7af387 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,12 @@ add_compile_definitions(GGML_MAX_NAME=128) # CUDA architectures: cover Turing to Blackwell for distributed binaries. # Users can override with -DCMAKE_CUDA_ARCHITECTURES=native for local builds. 
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real;120a-real;121a-real") + find_package(CUDAToolkit QUIET) + if(CUDAToolkit_FOUND AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8") + set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real;120a-real;121a-real") + else() + set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real") + endif() endif() # ggml as subdirectory, inherits GGML_CUDA, GGML_METAL, etc. from cmake flags @@ -54,13 +59,11 @@ endmacro() # Core library (shared between binaries) add_library(acestep-core STATIC src/request.cpp - src/audio_loader.cpp ) link_ggml_backends(acestep-core) -target_include_directories(acestep-core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) -# dit-vae: full pipeline (text-enc + cond + dit + vae + wav) + LoRA support -add_executable(dit-vae tools/dit-vae.cpp src/dit-lora.cpp) +# dit-vae: full pipeline (text-enc + cond + dit + vae + wav) +add_executable(dit-vae tools/dit-vae.cpp) target_link_libraries(dit-vae PRIVATE acestep-core) link_ggml_backends(dit-vae) @@ -72,3 +75,7 @@ link_ggml_backends(ace-qwen3) # quantize: GGUF requantizer (BF16 -> K-quants) add_executable(quantize tools/quantize.cpp) link_ggml_backends(quantize) + +# neural-codec: Oobleck VAE neural audio codec (encode/decode WAV <-> latent) +add_executable(neural-codec tools/neural-codec.cpp) +link_ggml_backends(neural-codec) diff --git a/README.md b/README.md index 25449d0..84f39dd 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # acestep.cpp Portable C++17 implementation of ACE-Step 1.5 music generation using GGML. -Text + lyrics in, stereo 48kHz WAV out. Runs on CPU, CUDA, Metal, Vulkan. +Text + lyrics in, stereo 48kHz WAV out. Runs on CPU, CUDA, ROCm, Metal, Vulkan. ## Build @@ -16,6 +16,9 @@ cmake .. # Linux with NVIDIA GPU cmake .. -DGGML_CUDA=ON +# Linux with AMD GPU (ROCm) +cmake .. -DGGML_HIP=ON + # Linux with Vulkan cmake .. -DGGML_VULKAN=ON @@ -29,7 +32,7 @@ cmake .. 
-DGGML_CUDA=ON -DGGML_BLAS=ON cmake --build . --config Release -j$(nproc) ``` -Builds two binaries: `ace-qwen3` (LLM) and `dit-vae` (DiT + VAE). +Builds three binaries: `ace-qwen3` (LLM), `dit-vae` (DiT + VAE) and `neural-codec` (VAE encode/decode). ## Models @@ -94,13 +97,13 @@ EOF # LLM: request.json -> request0.json (enriched with lyrics + codes) ./build/ace-qwen3 \ --request /tmp/request.json \ - --model models/acestep-5Hz-lm-4B-BF16.gguf + --model models/acestep-5Hz-lm-4B-Q8_0.gguf # DiT+VAE: request0.json -> request00.wav ./build/dit-vae \ --request /tmp/request0.json \ - --text-encoder models/Qwen3-Embedding-0.6B-BF16.gguf \ - --dit models/acestep-v15-turbo-BF16.gguf \ + --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit models/acestep-v15-turbo-Q8_0.gguf \ --vae models/vae-BF16.gguf ``` @@ -111,7 +114,7 @@ Generate multiple songs at once with `--batch`: # -> request0.json, request1.json (different lyrics/codes, seeds auto+0, auto+1) ./build/ace-qwen3 \ --request /tmp/request.json \ - --model models/acestep-5Hz-lm-4B-BF16.gguf \ + --model models/acestep-5Hz-lm-4B-Q8_0.gguf \ --batch 2 # DiT+VAE: (2 DiT variations of LM output 1 and 2) @@ -119,8 +122,8 @@ Generate multiple songs at once with `--batch`: # -> request1.json -> request10.wav, request11.wav ./build/dit-vae \ --request /tmp/request0.json /tmp/request1.json \ - --text-encoder models/Qwen3-Embedding-0.6B-BF16.gguf \ - --dit models/acestep-v15-turbo-BF16.gguf \ + --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit models/acestep-v15-turbo-Q8_0.gguf \ --vae models/vae-BF16.gguf \ --batch 2 ``` @@ -151,34 +154,43 @@ Empty field = "fill it". Filled = "don't touch". All modes always output numbered files (`request0.json` .. `requestN-1.json`). The input JSON is never modified. -**Caption only**: the LLM generates lyrics, metadata (bpm, key, time -signature, duration) and audio codes. 
With `--batch N`, each element -generates its own lyrics and metadata from a different seed, producing -N completely different songs. See `examples/simple.json`. +**Caption only** (`lyrics=""`): two LLM passes. Phase 1 uses the "Expand" +prompt to generate lyrics and metadata (bpm, keyscale, timesignature, +duration) via CoT. Phase 2 reinjects the CoT and generates audio codes using +the "Generate tokens" prompt. CFG is forced to 1.0 in phase 1 (free +sampling); `lm_cfg_scale` only applies in phase 2. With `--batch N`, each +element runs its own phase 1 from a different seed, producing N completely +different songs. See `examples/simple.json`. -**Caption + lyrics (+ optional metadata)**: the LLM fills missing -metadata via CoT, then generates audio codes. User provided fields -are preserved. See `examples/partial.json`. +**Caption + lyrics (+ optional metadata)**: single LLM pass. The "Generate +tokens" prompt is used directly. Missing metadata is filled via CoT, then +audio codes are generated. User-provided fields are never overwritten. +`lm_cfg_scale` applies to both CoT and code generation. See +`examples/partial.json`. **Everything provided** (caption, lyrics, bpm, duration, keyscale, timesignature): the LLM skips CoT and generates audio codes directly. With `--batch N`, all elements share the same prompt (single prefill, KV cache copied). See `examples/full.json`. +**Instrumental** (`lyrics="[Instrumental]"`): treated as "lyrics provided", +so the single-pass "Generate tokens" path is used. No lyrics generation. +The DiT was trained with this exact string as the no-vocal condition. + **Passthrough** (`audio_codes` present): LLM is skipped entirely. Run `dit-vae` to decode existing codes. See `examples/dit-only.json`. ## Request JSON reference -All fields with defaults. Only `caption` is required. +Only `caption` is required. All other fields default to "unset" which means +the LLM fills them, or a sensible runtime default is applied. 
```json { "caption": "", "lyrics": "", - "instrumental": false, "bpm": 0, - "duration": -1, + "duration": 0, "keyscale": "", "timesignature": "", "vocal_language": "unknown", @@ -190,18 +202,98 @@ All fields with defaults. Only `caption` is required. "lm_negative_prompt": "", "audio_codes": "", "inference_steps": 8, - "guidance_scale": 7.0, + "guidance_scale": 0.0, "shift": 3.0 } ``` -Key fields: `seed` -1 means random (resolved once, then +1 per batch -element). `audio_codes` is generated by ace-qwen3 and consumed by -dit-vae (comma separated FSQ token IDs). When present, the LLM is -skipped entirely. +### Text conditioning (ace-qwen3 + dit-vae) + +**`caption`** (string, required) +Natural language description of the music style, mood, instruments, etc. +Fed to both the LLM and the DiT text encoder. + +**`lyrics`** (string, default `""`) +Controls vocal generation. Three valid states: +- `""`: LLM generates lyrics from the caption (phase 1 "Expand" prompt). +- `"[Instrumental]"`: no vocals. Passed directly to the DiT, LLM skips lyrics generation. +- Any other string: user-provided lyrics used as-is, LLM only fills missing metadata. + +There is no `instrumental` flag. This field is the single source of truth for +vocal content. + +### Metadata (LLM-filled if unset) + +**`bpm`** (int, default `0` = unset) +Beats per minute. LLM generates one if 0. + +**`duration`** (float seconds, default `0` = unset) +Target audio duration. `0` means the LLM picks it. Clamped to [1, 600]s after +generation. `1` means 1 second. + +**`keyscale`** (string, default `""` = unset) +Musical key and scale, e.g. `"C major"`, `"F# minor"`. LLM fills if empty. + +**`timesignature`** (string, default `""` = unset) +Time signature numerator as a string, e.g. `"4"` for 4/4, `"3"` for 3/4. +LLM fills if empty. + +**`vocal_language`** (string, default `"unknown"`) +BCP-47 language code for lyrics, e.g. `"en"`, `"fr"`, `"ja"`. 
When set and +lyrics are being generated, the FSM constrains the LLM output to that language. +`"unknown"` lets the LLM decide. + +### Generation control + +**`seed`** (int64, default `-1` = random) +RNG seed. Resolved once at startup to a random value if -1. Batch elements +use `seed+0`, `seed+1`, ... `seed+N-1`. + +**`audio_codes`** (string, default `""`) +Comma-separated FSQ token IDs produced by ace-qwen3. When non-empty, the +entire LLM pass is skipped and dit-vae decodes these codes directly +(passthrough / cover mode). + +### LM sampling (ace-qwen3) + +**`lm_temperature`** (float, default `0.85`) +Sampling temperature for both phase 1 (lyrics/metadata) and phase 2 (audio +codes). Lower = more deterministic. + +**`lm_cfg_scale`** (float, default `2.0`) +Classifier-Free Guidance scale for the LM. Only active in phase 2 (audio +code generation) and in phase 1 when lyrics are already provided. When +`lyrics` is empty, phase 1 always runs with `cfg=1.0` (free sampling). +`1.0` disables CFG. -Turbo preset: `inference_steps=8, shift=3.0` (no guidance_scale, turbo models don't use CFG). -SFT preset: `inference_steps=50, guidance_scale=4.0, shift=6.0`. +**`lm_top_p`** (float, default `0.9`) +Nucleus sampling cutoff. `1.0` disables. When `top_k=0`, an internal +pre-filter of 256 tokens is applied before top_p for performance. + +**`lm_top_k`** (int, default `0` = disabled) +Top-K sampling. `0` disables hard top-K (top_p still applies). + +**`lm_negative_prompt`** (string, default `""`) +Negative caption for CFG in phase 2. Empty string falls back to a +caption-less unconditional prompt. + +### DiT flow matching (dit-vae) + +**`inference_steps`** (int, default `8`) +Number of diffusion denoising steps. Turbo preset: `8`. SFT preset: `50`. + +**`guidance_scale`** (float, default `0.0` = auto) +CFG scale for the DiT. `0.0` is resolved at runtime: +- Turbo models: forced to `1.0` (CFG disabled, turbo was trained without it). +- SFT/base models: `7.0`. 
+Any value > 1.0 on a turbo model is overridden to 1.0 with a warning. + +**`shift`** (float, default `3.0`) +Flow-matching schedule shift. Controls the timestep distribution. +`shift = s*t / (1 + (s-1)*t)`. Turbo preset: `3.0`. SFT preset: `6.0`. + +Turbo preset: `inference_steps=8, shift=3.0` (guidance_scale auto-resolved to 1.0). +SFT preset: `inference_steps=50, guidance_scale=7.0, shift=6.0`. ## ace-qwen3 reference @@ -258,6 +350,71 @@ Debug: Models are loaded once and reused across all requests. +## neural-codec + +GGML-native neural audio codec based on the Oobleck VAE encoder and decoder. +Serves two purposes: validating the precision of the full VAE chain (encode + +decode roundtrip), and compressing music at ~850 B/s with no perceptible +difference from the original. + +``` +Usage: neural-codec --vae --encode|--decode -i [-o ] [--q8|--q4] + +Required: + --vae VAE GGUF file + --encode | --decode Encode WAV to latent, or decode latent to WAV + -i Input (WAV for encode, latent for decode) + +Output: + -o Output file (auto-named if omitted) + --q8 Quantize latent to int8 (~13 kbit/s) + --q4 Quantize latent to int4 (~6.8 kbit/s) + +Output naming: song.wav -> song.latent (f32) or song.nac8 (Q8) or song.nac4 (Q4) + song.latent -> song.wav + +VAE tiling (memory control): + --vae-chunk Latent frames per tile (default: 256) + --vae-overlap Overlap frames per side (default: 64) + +Latent formats (decode auto-detects): + f32: flat [T, 64] f32, no header. ~51 kbit/s. + NAC8: header + per-frame Q8. ~13 kbit/s. + NAC4: header + per-frame Q4. ~6.8 kbit/s. +``` + +The encoder is the symmetric mirror of the decoder: same snake activations, +same residual units, strided conv1d for downsampling instead of transposed +conv1d for upsampling. No new GGML ops. Downsample 2x4x4x6x10 = 1920x. + +48kHz stereo audio is compressed to 64-dimensional latent frames at 25 Hz. 
+Three output formats, decode auto-detects from file content: + +| Format | Frame size | Bitrate | 3 min song | vs f32 (cossim) | +|--------|-----------|---------|------------|-----------------| +| f32 | 256B | 51 kbit/s | 1.1 MB | baseline | +| NAC8 | 66B | 13 kbit/s | 290 KB | 0.9999 | +| NAC4 | 34B | 6.8 kbit/s | 150 KB | 0.989 | + +NAC = Neural Audio Codec. The NAC8 and NAC4 file formats are headerless +except for a 4-byte magic (`NAC8` or `NAC4`) and a uint32 frame count. +Q8 quantization error is 39 dB below the VAE reconstruction error (free). +Q4 quantization error is 16 dB below the VAE reconstruction error (inaudible +on most material). + +```bash +# encode (Q4: 6.8 kbit/s, ~150 KB for 3 minutes) +neural-codec --vae models/vae-BF16.gguf --encode --q4 -i song.wav -o song.nac4 + +# encode (Q8: 13 kbit/s, ~290 KB for 3 minutes) +neural-codec --vae models/vae-BF16.gguf --encode --q8 -i song.wav -o song.nac8 + +# decode (auto-detects format) +neural-codec --vae models/vae-BF16.gguf --decode -i song.nac4 -o song_decoded.wav + +# roundtrip validation: compare song.wav and song_decoded.wav with your ears +``` + ## Architecture ``` @@ -278,6 +435,39 @@ dit-vae WAV stereo 48kHz ``` +## Roadmap + +This project started from a simple idea: a Telegram bot using llama.cpp to +prompt a music generator, and the desire to make GGML sing. No more, no less. +No cloud, no black box, scriptable and nothing between you and the model. + +### LLM modes +- [ ] Remaining modes: Understand, Rewrite (single-pass, no audio codes) +- [ ] Reference audio input: repaint and cover tasks (src_audio + cover_strength) + +### Audio I/O +Current: raw PCM f32 WAV via hand-rolled writer, no external deps. 
+Trade-off to document: +- **Keep as-is**: zero dependencies, clean licensing, works everywhere +- **ffmpeg pipe**: trivial bash wrapper handles any codec/format, no C++ codec hell + - pro: MP3/FLAC/OGG out of the box, input resampling for reference audio + - con: runtime dependency, not embedded +Conclusion pending. Likely ffmpeg as optional external pipe, documented in README. + +### API and interface +- [ ] JSON HTTP server (minimal, well-documented, stable contract) +- [ ] Web interface on top - vibecodeable by anyone, API stays simple +Goal: document the internals and how the model actually works, +not reproduce the Python spaghetti. Expert-first, no commercial fluff. + +### Documentation +Current README is technical study + API reference, intentional. +- [ ] Split when a user-facing interface exists: README (user) + ARCHITECTURE.md (internals) + +### Future models +- [ ] ACE-Step 2.0: evaluate architecture delta, add headers/weights as needed +No commitment, easy to adapt by adding headers or new compilation units as needed. + ## LM specifics ace-qwen3 is not a general-purpose chat engine. It is a two-phase autoregressive @@ -318,7 +508,7 @@ python3 debug-dit-cossim.py # DiT: per-layer cossim GGML vs Python (turbo/ ## Patched GGML fork Uses a patched GGML fork (submodule) with two new ops, a Metal im2col optimization, and -a CUDA bugfix for the Oobleck VAE decoder. All backends: CPU, CUDA, Metal, Vulkan. +a CUDA bugfix for the Oobleck VAE decoder. All backends: CPU, CUDA, ROCm, Metal, Vulkan. F32/F16/BF16 data types. The DiT uses only standard GGML ops and needs no patches. The VAE reconstructs audio from latent space through 5 upsampling blocks (total 1920x), @@ -373,6 +563,19 @@ Upstream `im2col_kernel` uses OW directly as grid dimension Y, which exceeds the times per tile at output widths up to 491520. Fixed with a grid-stride loop on OW and `MIN(OW, MAX_GRIDDIM_Z)` clamping. 
+### Upstream divergence + +The GGML submodule diverges from upstream only by the addition of +`GGML_OP_SNAKE` and `GGML_OP_COL2IM_1D`. No existing upstream kernel is +modified. These ops are required; the VAE does not work without them. + +An earlier approach patched the upstream naive ops instead of adding custom +ones. Those patches were dropped. They are documented here in case someone +wants to study the naive path: + +- `conv_transpose_1d`: bounded loop replacing O(T_in) brute-force, CUDA and Metal +- `im2col`: grid-stride loop on OW to fix gridDim.y overflow for large tensors + ## Acknowledgements Independent implementation based on ACE-Step 1.5 by ACE Studio and StepFun. @@ -387,3 +590,15 @@ All model weights are theirs, this is just a native backend. note={GitHub repository} } ``` + +## Samples + +https://github.com/user-attachments/assets/9a50c1f4-9ec0-474a-bd14-e8c6b00622a1 + +https://github.com/user-attachments/assets/fb606249-0269-4153-b651-bf78e05baf22 + +https://github.com/user-attachments/assets/e0580468-5e33-4a1f-a0f4-b914e4b9a8c2 + +https://github.com/user-attachments/assets/292a31f1-f97e-4060-9207-ed8364d9a794 + +https://github.com/user-attachments/assets/34b1b781-a5bc-46c4-90a6-615a10bc2c6a diff --git a/build.sh b/build.sh deleted file mode 100755 index 67f711f..0000000 --- a/build.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -rm -rf build -mkdir build -cd build - -cmake .. -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -cmake --build . --config Release -j "$(nproc)" diff --git a/docs/MODES.md b/docs/MODES.md deleted file mode 100644 index ae0b616..0000000 --- a/docs/MODES.md +++ /dev/null @@ -1,63 +0,0 @@ -# ACE-Step 1.5 built-in modes (acestep.cpp) - -This document maps the [ACE-Step 1.5 Tutorial](https://github.com/ace-step/ACE-Step-1.5/blob/main/docs/en/Tutorial.md) built-in modes to the current C++ implementation. 
- -## Task types (Tutorial: Input Control) - -| `task_type` | Description | Turbo/SFT | Base only | C++ status | -|---------------|-------------|-----------|-----------|------------| -| **text2music** | Generate from caption/lyrics (and optional reference) | ✅ | — | ✅ **Supported** | -| **cover** | Re-synthesize with structure from source; optional timbre from reference | ✅ | — | ✅ **Supported** (audio_codes or src_audio WAV/MP3) | -| **repaint** | Local edit in time range using source as context | ✅ | — | ❌ Not implemented | -| **lego** | Add new tracks to existing audio | — | ✅ | ❌ Base model only | -| **extract** | Extract single track from mix | — | ✅ | ❌ Base model only | -| **complete** | Add accompaniment to single track | — | ✅ | ❌ Base model only | - -We only ship Turbo and SFT DiT weights; **lego**, **extract**, **complete** require the Base DiT and are out of scope for now. - ---- - -## What we support today - -### text2music (default) -- **Input**: `caption`, optional `lyrics`, metadata (bpm, duration, keyscale, …). -- **Flow**: LM (optional) → CoT + audio codes → DiT (context = silence) → VAE → WAV. -- **Timbre**: Optional **reference_audio** (WAV/MP3) → VAE encode → CondEncoder timbre; else built-in silence. - -### cover (when `audio_codes` or `src_audio` are provided) -- **Input**: Same as text2music, plus either **precomputed** `audio_codes` or **`src_audio`** (WAV/MP3 path). Optional **reference_audio** for timbre. -- **Flow**: If `src_audio` set and no `audio_codes`: load WAV/MP3 → VAE encode → FSQ nearest-codeword encode → codes. Then decode codes to latents → DiT context (blend with silence) → DiT → VAE → WAV. No Python. -- **reference_audio** and **audio_cover_strength**: Implemented (timbre; blend). ---- - -## What’s not implemented yet - -### repaint -- **Tutorial**: Specify `repainting_start` / `repainting_end` (seconds); model uses source audio as context and only generates in that interval (3–90 s). 
-- **C++**: Would require **masked diffusion**: context carries “given” frames; ODE only updates the repaint region. DiT’s context has a 64-channel “mask” that we currently set to 1.0; repaint would set mask per frame and the generation loop would only update unmasked frames. Not implemented. - ---- - -## Request fields (aligned with Tutorial) - -All of these are in `AceRequest` and parsed from / written to JSON. Backend behavior is as above. - -| Field | Type | Purpose | -|-------|------|--------| -| `task_type` | string | `"text2music"` \| `"cover"` \| `"repaint"` \| … | -| `reference_audio` | string | Path to WAV or MP3 for timbre (implemented) | -| `src_audio` | string | Path to WAV or MP3 for cover source; encoded to codes internally (implemented) | -| `audio_codes` | string | Comma-separated FSQ codes; non-empty ⇒ cover path (or from `src_audio`) | -| `audio_cover_strength` | float | 0.0–1.0 blend of decoded context with silence (implemented) | -| `repainting_start` | float | Start time (s) for repaint (not used yet) | -| `repainting_end` | float | End time (s) for repaint (not used yet) | - -See `request.h` and the README “Request JSON reference” for the full list. - ---- - -## Summary - -- **Fully supported**: text2music (with optional reference_audio for timbre); cover from **precomputed** `audio_codes` or from **WAV/MP3** via `src_audio` (VAE encode + FSQ nearest-codeword encode); reference_audio (timbre); audio_cover_strength (blend). -- **Schema only** (no backend): `repainting_start`/`repainting_end`. -- **To support repaint**: implement masked DiT generation (context mask + ODE only on repaint interval). 
diff --git a/examples/cover-reference.json b/examples/cover-reference.json deleted file mode 100644 index 313d419..0000000 --- a/examples/cover-reference.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "task_type": "cover", - "caption": "Cover with timbre from reference WAV/MP3", - "duration": 10, - "bpm": 83, - "keyscale": "G major", - "timesignature": "4", - "vocal_language": "fr", - "inference_steps": 8, - "guidance_scale": 1, - "shift": 3, - "seed": 42, - "audio_cover_strength": 0.9, - "audio_codes": "43316,18426,13366,59455,17783,49303,7423,29855,37158,37157,62317,61455,12847,19583,57031,34656,20254,10770,11416,15905,31413,23339,47091,12198,49531,37355,33090,38645,40707,16324,61436,46095,13941,5287,2239,13975,63815,2757,4862,13571,63495,39,29887,49426,12696,50847,40498,61056,25666,12989,23987,54763,25485,31683,28554,25355,16373,28995,2351,1655,7940,55831,34359,15350,15277,11717,20476,52239,5015,19807,24087,3559,20471,34193,32552,60999,29360,25338,38873,16768,17912,27584,24008,1528,449,25563,52684,53223,42183,37215,12343,39431,26055,28148,57286,38382,28863,7191,58397,18991,7695,30716,36784,12687,8707,25649,33718,3202,23035,10747,26354,63965,16260,11223,45679,14343,8679,49351,52927,2535,19207,46447,49615,12694,21110,46597,60991,27711,49751,54656,30448,33125,13585,29256,5161,42434,11753,39402,60354,21953,39532,14282,52160,34248,16304,4671,14172,5127,25991,15343,8583,61902,16328,31700,48415,28879,11215,52715,25541,11203,7695,63951,33803,40453,17750,28006,8231,40464,3136,51006,23839,18711,18711,18711,18711,3343,3279,2823,16071,3271,2823,2319,55815,40260,16215,12047,16631,26927,21863,20060,10166,51070,39,12099,63440,18418,25271,10792,2128,44166,53750,41263,44247,61287,42303,27614,21997,24879,38799,12648,38341,36833,19408,11769,2979,63979,44239,25559,27591,17626,44087,33796,4901,53176,57399,37180,38024,9216,63485,2005,13656,15914,45576,29194,45624,62332,53237,63988,40332,20486,31367,10951,46207,22231,63479,38877,17262,49335,42045,57388", - "reference_audio": "reference.wav" 
-} \ No newline at end of file diff --git a/examples/cover-reference.sh b/examples/cover-reference.sh deleted file mode 100755 index 2687b09..0000000 --- a/examples/cover-reference.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -# Cover mode with reference timbre: audio_codes + reference_audio (WAV or MP3). -# Put a WAV/MP3 at reference.wav (or reference.mp3) or set reference_audio in cover-reference.json. -# Requires VAE GGUF with encoder weights (same as request-reference / test-reference). -set -eu -cd "$(dirname "$0")" - -if [ ! -f "reference.wav" ] && [ ! -f "reference.mp3" ]; then - echo "No reference.wav or reference.mp3 found. Copy a file to reference.wav (or .mp3), or set reference_audio in cover-reference.json." - echo "Then run: $0" - exit 1 -fi - -../build/dit-vae \ - --request cover-reference.json \ - --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ - --dit ../models/acestep-v15-turbo-Q8_0.gguf \ - --vae ../models/vae-BF16.gguf - -echo "Done. Check cover-reference0.wav" diff --git a/examples/cover.json b/examples/cover.json deleted file mode 100644 index 82cbaaa..0000000 --- a/examples/cover.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "task_type": "cover", - "caption": "Re-synthesize from precomputed codes (e.g. 
from a previous ace-qwen3 run)", - "duration": 10, - "bpm": 83, - "keyscale": "G major", - "timesignature": "4", - "vocal_language": "fr", - "inference_steps": 8, - "guidance_scale": 1, - "shift": 3, - "seed": 42, - "audio_cover_strength": 0.9, - "audio_codes": "43316,18426,13366,59455,17783,49303,7423,29855,37158,37157,62317,61455,12847,19583,57031,34656,20254,10770,11416,15905,31413,23339,47091,12198,49531,37355,33090,38645,40707,16324,61436,46095,13941,5287,2239,13975,63815,2757,4862,13571,63495,39,29887,49426,12696,50847,40498,61056,25666,12989,23987,54763,25485,31683,28554,25355,16373,28995,2351,1655,7940,55831,34359,15350,15277,11717,20476,52239,5015,19807,24087,3559,20471,34193,32552,60999,29360,25338,38873,16768,17912,27584,24008,1528,449,25563,52684,53223,42183,37215,12343,39431,26055,28148,57286,38382,28863,7191,58397,18991,7695,30716,36784,12687,8707,25649,33718,3202,23035,10747,26354,63965,16260,11223,45679,14343,8679,49351,52927,2535,19207,46447,49615,12694,21110,46597,60991,27711,49751,54656,30448,33125,13585,29256,5161,42434,11753,39402,60354,21953,39532,14282,52160,34248,16304,4671,14172,5127,25991,15343,8583,61902,16328,31700,48415,28879,11215,52715,25541,11203,7695,63951,33803,40453,17750,28006,8231,40464,3136,51006,23839,18711,18711,18711,18711,3343,3279,2823,16071,3271,2823,2319,55815,40260,16215,12047,16631,26927,21863,20060,10166,51070,39,12099,63440,18418,25271,10792,2128,44166,53750,41263,44247,61287,42303,27614,21997,24879,38799,12648,38341,36833,19408,11769,2979,63979,44239,25559,27591,17626,44087,33796,4901,53176,57399,37180,38024,9216,63485,2005,13656,15914,45576,29194,45624,62332,53237,63988,40332,20486,31367,10951,46207,22231,63479,38877,17262,49335,42045,57388" -} \ No newline at end of file diff --git a/examples/cover.sh b/examples/cover.sh deleted file mode 100755 index 14d340e..0000000 --- a/examples/cover.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -# Cover mode: decode precomputed audio_codes to WAV (no LLM). 
-# Use cover.json as-is, or replace audio_codes with output from a previous run: -# ../build/ace-qwen3 --request simple.json --model ../models/acestep-5Hz-lm-4B-Q8_0.gguf -# # then use simple0.json as input, or copy its audio_codes into cover.json -set -eu -cd "$(dirname "$0")" - -../build/dit-vae \ - --request cover.json \ - --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ - --dit ../models/acestep-v15-turbo-Q8_0.gguf \ - --vae ../models/vae-BF16.gguf - -echo "Done. Check cover0.wav" diff --git a/examples/lora.json b/examples/lora.json deleted file mode 100644 index c872efb..0000000 --- a/examples/lora.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "task_type": "text2music", - "caption": "An energetic nu-disco track built on a foundation of a tight, funky slap bassline and a crisp, four-on-the-floor drum machine beat. The song opens with a distinctive, filtered wah-wah guitar riff that serves as a recurring motif. The arrangement is layered with shimmering synth pads, punchy synth stabs, and subtle arpeggiated synth textures that add movement. The track progresses through dynamic sections, including a brief atmospheric breakdown before rebuilding the main groove.", - "genre": "Nu-disco", - "lyrics": "[Instrumental]", - "bpm": 115, - "keyscale": "C# major", - "timesignature": "4", - "duration": 256, - "language": "unknown", - "instrumental": true, - "custom_tag": "crydamoure", - "inference_steps": 8, - "guidance_scale": 1, - "shift": 3, - "seed": -1 -} diff --git a/examples/lora.sh b/examples/lora.sh deleted file mode 100755 index db7ce2b..0000000 --- a/examples/lora.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# LoRA example: generate with a PEFT LoRA adapter (e.g. duckdbot/acestep-lora-cryda). -# Requires adapter_model.safetensors in lora/ (download once; see below). -set -eu -cd "$(dirname "$0")" - -ADAPTER="lora/adapter_model.safetensors" -if [ ! 
-f "$ADAPTER" ]; then - echo "LoRA adapter not found at $ADAPTER" - exit 1 -fi - -# LLM: fill lyrics + codes -../build/ace-qwen3 \ - --request lora.json \ - --model ../models/acestep-5Hz-lm-4B-Q8_0.gguf - -# DiT+VAE with LoRA (scale = alpha/rank; 1.0 is typical) -../build/dit-vae \ - --request lora0.json \ - --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ - --dit ../models/acestep-v15-turbo-Q8_0.gguf \ - --vae ../models/vae-BF16.gguf \ - --lora "$ADAPTER" \ - --lora-scale 1.0 - -echo "Done. Check lora00.wav" diff --git a/examples/request-reference.json b/examples/request-reference.json deleted file mode 100644 index 55479ee..0000000 --- a/examples/request-reference.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "task_type": "text2music", - "caption": "Calm acoustic guitar, soft male vocal, intimate", - "lyrics": "[Verse]\nQuiet strings and a gentle tune\n[Chorus]\nHold the moment in this room", - "duration": 30, - "seed": 42, - "inference_steps": 8, - "guidance_scale": 1.0, - "shift": 3.0, - "reference_audio": "reference.wav", - "audio_codes": "", - "audio_cover_strength": 1.0 -} diff --git a/examples/run-examples-ci.sh b/examples/run-examples-ci.sh deleted file mode 100755 index 362f92a..0000000 --- a/examples/run-examples-ci.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -# Run all example scripts with short-duration CI fixtures (from repo root). -# Prereqs: build/ and models/ present; run after build and ./models.sh. -set -eu -cd "$(dirname "$0")/.." 
-EXAMPLES=examples -cd "$EXAMPLES" - -run() { echo "== $*" && "$@"; } - -# 1) DiT-only (no LLM), 5s -run cp ../tests/fixtures/ci-dit-only.json dit-only.json -run ./dit-only.sh -test -f dit-only0.wav && echo "dit-only OK" - -# 2) Cover from precomputed audio_codes (existing cover.json, 10s) -run ./cover.sh -test -f cover0.wav && echo "cover OK" - -# 3) reference.wav for cover-reference and test-reference -run cp cover0.wav reference.wav - -# 4) Cover + reference timbre -run ./cover-reference.sh -test -f cover-reference0.wav && echo "cover-reference OK" - -# 5) text2music with reference_audio -run cp ../tests/fixtures/ci-request-reference.json request-reference.json -run ./test-reference.sh -test -f request-reference0.wav && echo "test-reference OK" - -# 6) Simple (caption only, LLM fills), 5s -run cp ../tests/fixtures/ci-text2music.json simple.json -run ./simple.sh -test -f simple00.wav && echo "simple OK" - -# 7) Partial (caption + lyrics + duration), 5s -run cp ../tests/fixtures/ci-partial.json partial.json -run ./partial.sh -test -f partial00.wav && echo "partial OK" - -# 8) Full (all metadata), 5s -run cp ../tests/fixtures/ci-full.json full.json -run ./full.sh -test -f full00.wav && echo "full OK" - -echo "All example scripts passed." diff --git a/examples/test-reference.sh b/examples/test-reference.sh deleted file mode 100755 index 073a465..0000000 --- a/examples/test-reference.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# Test reference_audio (WAV) and audio_cover_strength. -# Put a WAV file at reference.wav (or set reference_audio path in request-reference.json). -# Requires: built dit-vae, --vae with encoder weights, and models in ../models/. - -set -eu -cd "$(dirname "$0")" - -if [ ! -f "reference.wav" ]; then - echo "No reference.wav found. Copy a WAV file to reference.wav (stereo 48kHz or any rate; will be resampled)." 
- echo "Then run: $0" - exit 1 -fi - -../build/dit-vae \ - --request request-reference.json \ - --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ - --dit ../models/acestep-v15-turbo-Q8_0.gguf \ - --vae ../models/vae-BF16.gguf - -echo "Done. Check request-reference0.wav (and request-reference1.wav if --batch 2)." diff --git a/src/audio.h b/src/audio.h deleted file mode 100644 index bd90915..0000000 --- a/src/audio.h +++ /dev/null @@ -1,17 +0,0 @@ -// audio.h: unified reference-audio loader (WAV + MP3 → stereo 48kHz float) -// Header-only for WAV; MP3 implementation in audio_loader.cpp (minimp3, no temp files). - -#pragma once - -#include -#include -#include - -// Load WAV or MP3 file into stereo float32 at 48kHz. -// Out: interleaved L,R,L,R,...; length = num_samples (per channel). -// Returns num_samples (per channel), or -1 on error. -// No temp files; MP3 decoded in memory via minimp3 (header-only dep). -int load_audio_48k_stereo(const char * path, std::vector * out); - -// MP3 implementation (in audio_loader.cpp; do not call from other TUs without linking it) -int mp3_load_48k_stereo(const char * path, std::vector * out); diff --git a/src/audio_loader.cpp b/src/audio_loader.cpp deleted file mode 100644 index a0e71b5..0000000 --- a/src/audio_loader.cpp +++ /dev/null @@ -1,117 +0,0 @@ -// audio_loader.cpp: MP3 decode for reference audio (minimp3, no deps, no temp files) - -#define MINIMP3_IMPLEMENTATION -#include "third_party/minimp3.h" - -#include "wav.h" -#include "audio.h" -#include -#include -#include -#include - -static bool path_ends_with_ci(const char * path, const char * suffix) { - size_t pl = strlen(path), sl = strlen(suffix); - if (pl < sl) return false; - const char * p = path + pl - sl; - for (size_t i = 0; i < sl; i++) { - char a = (char)(p[i] >= 'A' && p[i] <= 'Z' ? p[i] + 32 : p[i]); - char b = (char)(suffix[i] >= 'A' && suffix[i] <= 'Z' ? 
suffix[i] + 32 : suffix[i]); - if (a != b) return false; - } - return true; -} - -static void pcm_to_float_stereo_48k( - const int16_t * pcm, size_t num_samples, int channels, unsigned int sample_rate, - std::vector * out) -{ - const float scale = 1.0f / 32768.0f; - out->resize(num_samples * 2); - if (channels == 1) { - for (size_t i = 0; i < num_samples; i++) { - float s = (float)pcm[i] * scale; - (*out)[i * 2] = s; - (*out)[i * 2 + 1] = s; - } - } else { - for (size_t i = 0; i < num_samples * 2; i++) - (*out)[i] = (float)pcm[i] * scale; - } - - if (sample_rate != 48000) { - size_t in_len = num_samples; - size_t out_len = (size_t)((double)in_len * 48000.0 / (double)sample_rate); - std::vector resampled(out_len * 2); - for (size_t i = 0; i < out_len; i++) { - double t = (double)i * (double)in_len / (double)out_len; - size_t i0 = (size_t)t; - size_t i1 = std::min(i0 + 1, in_len - 1); - float w = (float)(t - (double)i0); - for (int c = 0; c < 2; c++) - resampled[i * 2 + c] = (*out)[i0 * 2 + c] * (1.0f - w) + (*out)[i1 * 2 + c] * w; - } - *out = std::move(resampled); - } -} - -int mp3_load_48k_stereo(const char * path, std::vector * out) { - FILE * f = fopen(path, "rb"); - if (!f) return -1; - fseek(f, 0, SEEK_END); - long sz = ftell(f); - fseek(f, 0, SEEK_SET); - if (sz <= 0 || sz > 200 * 1024 * 1024) { - fclose(f); - return -1; - } - std::vector buf((size_t)sz); - if (fread(buf.data(), 1, (size_t)sz, f) != (size_t)sz) { - fclose(f); - return -1; - } - fclose(f); - - mp3dec_t dec; - mp3dec_init(&dec); - mp3dec_frame_info_t info; - std::vector pcm; - const uint8_t * read_pos = buf.data(); - int remaining = (int)buf.size(); - int first_hz = 0, first_ch = 0; - const size_t max_samples = (size_t)(60 * 48000 * 2); - - while (remaining > 0) { - size_t old_size = pcm.size(); - if (old_size + (size_t)MINIMP3_MAX_SAMPLES_PER_FRAME > max_samples) break; - pcm.resize(old_size + (size_t)MINIMP3_MAX_SAMPLES_PER_FRAME); - int frame_samples = mp3dec_decode_frame(&dec, read_pos, 
remaining, pcm.data() + old_size, &info); - if (frame_samples <= 0) { - pcm.resize(old_size); - read_pos++; - remaining--; - continue; - } - if (first_hz == 0) { - first_hz = info.hz; - first_ch = info.channels; - } - pcm.resize(old_size + (size_t)(frame_samples * info.channels)); - read_pos += info.frame_bytes; - remaining -= info.frame_bytes; - } - - if (pcm.empty() || first_hz == 0) return -1; - size_t num_samples = pcm.size() / (size_t)first_ch; - pcm_to_float_stereo_48k(pcm.data(), num_samples, first_ch, (unsigned)first_hz, out); - return (int)(out->size() / 2); -} - -int load_audio_48k_stereo(const char * path, std::vector * out) { - if (!path || !out) return -1; - if (path_ends_with_ci(path, ".mp3")) - return mp3_load_48k_stereo(path, out); - if (path_ends_with_ci(path, ".wav")) - return wav_load_48k_stereo(path, out); - return -1; -} diff --git a/src/backend.h b/src/backend.h index 4b8566b..df33975 100644 --- a/src/backend.h +++ b/src/backend.h @@ -13,6 +13,7 @@ extern "C" int cudaDeviceGetAttribute(int *, int, int); #endif #include +#include #include #include @@ -41,6 +42,10 @@ static BackendPair backend_init(const char * label) { ggml_backend_load_all(); BackendPair bp = {}; bp.backend = ggml_backend_init_best(); + if (!bp.backend) { + fprintf(stderr, "[Load] FATAL: no backend available\n"); + exit(1); + } int n_threads = (int)std::thread::hardware_concurrency() / 2; if (n_threads < 1) n_threads = 1; // [GGML] If best backend is already CPU, reuse it (avoid 2 CPU instances @@ -51,6 +56,10 @@ static BackendPair backend_init(const char * label) { ggml_backend_cpu_set_n_threads(bp.backend, n_threads); } else { bp.cpu_backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL); + if (!bp.cpu_backend) { + fprintf(stderr, "[Load] FATAL: failed to init CPU backend\n"); + exit(1); + } ggml_backend_cpu_set_n_threads(bp.cpu_backend, n_threads); } fprintf(stderr, "[Load] %s backend: %s (CPU threads: %d)\n", @@ -87,5 +96,10 @@ static void 
backend_release(ggml_backend_t backend, ggml_backend_t cpu_backend) static ggml_backend_sched_t backend_sched_new(BackendPair bp, int max_nodes) { ggml_backend_t backends[2] = { bp.backend, bp.cpu_backend }; int n = (bp.backend == bp.cpu_backend) ? 1 : 2; - return ggml_backend_sched_new(backends, NULL, n, max_nodes, false, true); + ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, n, max_nodes, false, true); + if (!sched) { + fprintf(stderr, "[Load] FATAL: failed to create scheduler\n"); + exit(1); + } + return sched; } diff --git a/src/cond-enc.h b/src/cond-enc.h index e85b6fd..ba53163 100644 --- a/src/cond-enc.h +++ b/src/cond-enc.h @@ -270,7 +270,10 @@ static void cond_ggml_forward(CondGGML * m, if (timbre_out) ggml_build_forward_expand(gf, timbre_out); // Allocate and set inputs - ggml_backend_sched_alloc_graph(m->sched, gf); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + fprintf(stderr, "[CondEncoder] FATAL: failed to allocate graph\n"); + exit(1); + } ggml_backend_tensor_set(t_lyric_in, lyric_embed, 0, 1024 * S_lyric * sizeof(float)); ggml_backend_tensor_set(t_text_in, text_hidden, 0, 1024 * S_text * sizeof(float)); diff --git a/src/debug.h b/src/debug.h index dc7a626..a32cd11 100644 --- a/src/debug.h +++ b/src/debug.h @@ -6,8 +6,6 @@ #include #include #include -#include -#include #include struct DebugDumper { diff --git a/src/dit-graph.h b/src/dit-graph.h index 1241bc5..abe64d0 100644 --- a/src/dit-graph.h +++ b/src/dit-graph.h @@ -10,10 +10,7 @@ #include "dit.h" -#include -#include #include -#include // Helper: ensure tensor is f32 (cast if bf16/f16) static struct ggml_tensor * dit_ggml_f32( @@ -44,23 +41,6 @@ static struct ggml_tensor * dit_ggml_linear( return ggml_mul_mat(ctx, weight, input); } -// Linear with optional LoRA: out = W@x + scale * (B@(A@x)). lora_a/lora_b may be NULL. 
-static struct ggml_tensor * dit_ggml_linear_lora( - struct ggml_context * ctx, - struct ggml_tensor * weight, - struct ggml_tensor * lora_a, // [in, r] - struct ggml_tensor * lora_b, // [r, out] - float lora_scale, - struct ggml_tensor * input) { - struct ggml_tensor * out = ggml_mul_mat(ctx, weight, input); - if (lora_a && lora_b && lora_scale != 0.0f) { - struct ggml_tensor * ax = ggml_mul_mat(ctx, lora_a, input); - struct ggml_tensor * bax = ggml_mul_mat(ctx, lora_b, ax); - out = ggml_add(ctx, out, ggml_scale(ctx, bax, lora_scale)); - } - return out; -} - // Helper: Linear layer with bias static struct ggml_tensor * dit_ggml_linear_bias( struct ggml_context * ctx, @@ -181,36 +161,20 @@ static struct ggml_tensor * dit_ggml_build_self_attn( struct ggml_tensor * q, * k, * v; int q_dim = Nh * D; int kv_dim = Nkv * D; - float lora_scale = m->lora_scale; if (ly->sa_qkv) { struct ggml_tensor * qkv = dit_ggml_linear(ctx, ly->sa_qkv, norm_sa); q = ggml_cont(ctx, ggml_view_3d(ctx, qkv, q_dim, S, N, qkv->nb[1], qkv->nb[2], 0)); k = ggml_cont(ctx, ggml_view_3d(ctx, qkv, kv_dim, S, N, qkv->nb[1], qkv->nb[2], (size_t)q_dim * qkv->nb[0])); v = ggml_cont(ctx, ggml_view_3d(ctx, qkv, kv_dim, S, N, qkv->nb[1], qkv->nb[2], (size_t)(q_dim + kv_dim) * qkv->nb[0])); - // LoRA on fused path: add scale * (B @ (A @ x)) per projection when adapters are loaded - if (lora_scale != 0.0f) { - if (ly->lora_sa_q_a && ly->lora_sa_q_b) - q = ggml_add(ctx, q, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_q_b, ggml_mul_mat(ctx, ly->lora_sa_q_a, norm_sa)), lora_scale)); - if (ly->lora_sa_k_a && ly->lora_sa_k_b) - k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_k_b, ggml_mul_mat(ctx, ly->lora_sa_k_a, norm_sa)), lora_scale)); - if (ly->lora_sa_v_a && ly->lora_sa_v_b) - v = ggml_add(ctx, v, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_v_b, ggml_mul_mat(ctx, ly->lora_sa_v_a, norm_sa)), lora_scale)); - } } else if (ly->sa_qk) { struct ggml_tensor * qk = dit_ggml_linear(ctx, 
ly->sa_qk, norm_sa); q = ggml_cont(ctx, ggml_view_3d(ctx, qk, q_dim, S, N, qk->nb[1], qk->nb[2], 0)); k = ggml_cont(ctx, ggml_view_3d(ctx, qk, kv_dim, S, N, qk->nb[1], qk->nb[2], (size_t)q_dim * qk->nb[0])); - if (lora_scale != 0.0f) { - if (ly->lora_sa_q_a && ly->lora_sa_q_b) - q = ggml_add(ctx, q, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_q_b, ggml_mul_mat(ctx, ly->lora_sa_q_a, norm_sa)), lora_scale)); - if (ly->lora_sa_k_a && ly->lora_sa_k_b) - k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_k_b, ggml_mul_mat(ctx, ly->lora_sa_k_a, norm_sa)), lora_scale)); - } - v = dit_ggml_linear_lora(ctx, ly->sa_v_proj, ly->lora_sa_v_a, ly->lora_sa_v_b, lora_scale, norm_sa); + v = dit_ggml_linear(ctx, ly->sa_v_proj, norm_sa); } else { - q = dit_ggml_linear_lora(ctx, ly->sa_q_proj, ly->lora_sa_q_a, ly->lora_sa_q_b, lora_scale, norm_sa); - k = dit_ggml_linear_lora(ctx, ly->sa_k_proj, ly->lora_sa_k_a, ly->lora_sa_k_b, lora_scale, norm_sa); - v = dit_ggml_linear_lora(ctx, ly->sa_v_proj, ly->lora_sa_v_a, ly->lora_sa_v_b, lora_scale, norm_sa); + q = dit_ggml_linear(ctx, ly->sa_q_proj, norm_sa); + k = dit_ggml_linear(ctx, ly->sa_k_proj, norm_sa); + v = dit_ggml_linear(ctx, ly->sa_v_proj, norm_sa); } // 2) Reshape to heads: [Nh*D, S, N] -> [D, Nh, S, N] @@ -272,7 +236,7 @@ static struct ggml_tensor * dit_ggml_build_self_attn( } // 8) O projection: [Nh*D, S, N] -> [H, S, N] - struct ggml_tensor * out = dit_ggml_linear_lora(ctx, ly->sa_o_proj, ly->lora_sa_o_a, ly->lora_sa_o_b, m->lora_scale, attn); + struct ggml_tensor * out = dit_ggml_linear(ctx, ly->sa_o_proj, attn); return out; } @@ -286,34 +250,20 @@ static struct ggml_tensor * dit_ggml_build_mlp( struct ggml_tensor * norm_ffn, int S) { - DiTGGMLConfig & c = m->cfg; - int I = c.intermediate_size; - int N = (int)norm_ffn->ne[2]; - float lora_scale = m->lora_scale; struct ggml_tensor * ff; if (ly->gate_up) { // Fused: single matmul [H, 2*I] x [H, S, N] -> [2*I, S, N], then swiglu splits ne[0] struct ggml_tensor 
* gu = dit_ggml_linear(ctx, ly->gate_up, norm_ffn); - if (lora_scale != 0.0f && ((ly->lora_gate_a && ly->lora_gate_b) || (ly->lora_up_a && ly->lora_up_b))) { - struct ggml_tensor * gate = ggml_cont(ctx, ggml_view_3d(ctx, gu, I, S, N, gu->nb[1], gu->nb[2], 0)); - struct ggml_tensor * up = ggml_cont(ctx, ggml_view_3d(ctx, gu, I, S, N, gu->nb[1], gu->nb[2], (size_t)I * gu->nb[0])); - if (ly->lora_gate_a && ly->lora_gate_b) - gate = ggml_add(ctx, gate, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_gate_b, ggml_mul_mat(ctx, ly->lora_gate_a, norm_ffn)), lora_scale)); - if (ly->lora_up_a && ly->lora_up_b) - up = ggml_add(ctx, up, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_up_b, ggml_mul_mat(ctx, ly->lora_up_a, norm_ffn)), lora_scale)); - ff = ggml_swiglu_split(ctx, gate, up); - } else { - ff = ggml_swiglu(ctx, gu); - } + ff = ggml_swiglu(ctx, gu); } else { - // Separate: two matmuls + split swiglu (with optional LoRA) - struct ggml_tensor * gate = dit_ggml_linear_lora(ctx, ly->gate_proj, ly->lora_gate_a, ly->lora_gate_b, lora_scale, norm_ffn); - struct ggml_tensor * up = dit_ggml_linear_lora(ctx, ly->up_proj, ly->lora_up_a, ly->lora_up_b, lora_scale, norm_ffn); + // Separate: two matmuls + split swiglu + struct ggml_tensor * gate = dit_ggml_linear(ctx, ly->gate_proj, norm_ffn); + struct ggml_tensor * up = dit_ggml_linear(ctx, ly->up_proj, norm_ffn); ff = ggml_swiglu_split(ctx, gate, up); } // Down projection: [I, S] -> [H, S] - return dit_ggml_linear_lora(ctx, ly->down_proj, ly->lora_down_a, ly->lora_down_b, lora_scale, ff); + return dit_ggml_linear(ctx, ly->down_proj, ff); } // Build cross-attention sub-graph for a single layer. 
@@ -339,7 +289,6 @@ static struct ggml_tensor * dit_ggml_build_cross_attn( // Q from hidden, KV from encoder (full fused, Q+KV partial, separate) int q_dim = Nh * D; int kv_dim = Nkv * D; - float lora_scale = m->lora_scale; struct ggml_tensor * q, * k, * v; if (ly->ca_qkv) { // Full QKV fused: split Q from hidden, KV from enc via weight views @@ -351,31 +300,16 @@ static struct ggml_tensor * dit_ggml_build_cross_attn( struct ggml_tensor * kv = ggml_mul_mat(ctx, w_kv, enc); k = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], 0)); v = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], (size_t)kv_dim * kv->nb[0])); - // LoRA on fused path: add scale * (B @ (A @ x)) for Q (from norm_ca), K/V (from enc) - if (lora_scale != 0.0f) { - if (ly->lora_ca_q_a && ly->lora_ca_q_b) - q = ggml_add(ctx, q, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_q_b, ggml_mul_mat(ctx, ly->lora_ca_q_a, norm_ca)), lora_scale)); - if (ly->lora_ca_k_a && ly->lora_ca_k_b) - k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_k_b, ggml_mul_mat(ctx, ly->lora_ca_k_a, enc)), lora_scale)); - if (ly->lora_ca_v_a && ly->lora_ca_v_b) - v = ggml_add(ctx, v, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_v_b, ggml_mul_mat(ctx, ly->lora_ca_v_a, enc)), lora_scale)); - } } else if (ly->ca_kv) { // Q separate, K+V fused - q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, lora_scale, norm_ca); + q = dit_ggml_linear(ctx, ly->ca_q_proj, norm_ca); struct ggml_tensor * kv = ggml_mul_mat(ctx, ly->ca_kv, enc); k = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], 0)); v = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], (size_t)kv_dim * kv->nb[0])); - if (lora_scale != 0.0f) { - if (ly->lora_ca_k_a && ly->lora_ca_k_b) - k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_k_b, ggml_mul_mat(ctx, ly->lora_ca_k_a, enc)), lora_scale)); - if (ly->lora_ca_v_a 
&& ly->lora_ca_v_b) - v = ggml_add(ctx, v, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_v_b, ggml_mul_mat(ctx, ly->lora_ca_v_a, enc)), lora_scale)); - } } else { - q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, m->lora_scale, norm_ca); - k = dit_ggml_linear_lora(ctx, ly->ca_k_proj, ly->lora_ca_k_a, ly->lora_ca_k_b, m->lora_scale, enc); - v = dit_ggml_linear_lora(ctx, ly->ca_v_proj, ly->lora_ca_v_a, ly->lora_ca_v_b, m->lora_scale, enc); + q = dit_ggml_linear(ctx, ly->ca_q_proj, norm_ca); + k = dit_ggml_linear(ctx, ly->ca_k_proj, enc); + v = dit_ggml_linear(ctx, ly->ca_v_proj, enc); } // reshape to [D, heads, seq, N] then permute to [D, seq, heads, N] @@ -405,7 +339,7 @@ static struct ggml_tensor * dit_ggml_build_cross_attn( attn = ggml_reshape_3d(ctx, attn, Nh * D, S, N); // O projection - return dit_ggml_linear_lora(ctx, ly->ca_o_proj, ly->lora_ca_o_a, ly->lora_ca_o_b, m->lora_scale, attn); + return dit_ggml_linear(ctx, ly->ca_o_proj, attn); } // Build one full DiT layer (AdaLN + self-attn + cross-attn + FFN + gated residuals) diff --git a/src/dit-lora.cpp b/src/dit-lora.cpp deleted file mode 100644 index b14e090..0000000 --- a/src/dit-lora.cpp +++ /dev/null @@ -1,202 +0,0 @@ -// dit-lora.cpp: Load LoRA adapters from safetensors into DiT (ACE-Step). -// Compatible with PEFT adapter_model.safetensors (lora_A / lora_B per target layer). - -#include "dit.h" -#include "safetensors.h" -#include -#include -#include -#include - -// Normalize adapter key to base name: decoder.layers.N. -// Handles: base_model.model.model., base_model.model.; decoder.layers. or layers.; .lora_A.default/.lora_B.default or .lora_A.weight/.lora_B.weight -static std::string lora_key_to_base(const std::string & key) { - std::string s = key; - const char * prefixes[] = { "base_model.model.model.", "base_model.model." 
}; - for (const char * p : prefixes) { - size_t pl = strlen(p); - if (s.size() >= pl && s.compare(0, pl, p) == 0) { - s = s.substr(pl); - break; - } - } - // PEFT-style suffix - if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_A.default") == 0) - s = s.substr(0, s.size() - 14); - else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_B.default") == 0) - s = s.substr(0, s.size() - 14); - else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_A.weight") == 0) - s = s.substr(0, s.size() - 14); - else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_B.weight") == 0) - s = s.substr(0, s.size() - 14); - else if (s.size() > 7 && s.compare(s.size() - 7, 7, ".lora_A") == 0) - s = s.substr(0, s.size() - 7); - else if (s.size() > 7 && s.compare(s.size() - 7, 7, ".lora_B") == 0) - s = s.substr(0, s.size() - 7); - // HuggingFace adapter: layers.N -> decoder.layers.N for our DiT naming - if (s.size() >= 7 && s.compare(0, 7, "layers.") == 0) - s = "decoder." + s; - return s; -} - -static bool is_lora_a(const std::string & key) { - return key.find("lora_A") != std::string::npos; -} - -// Slot index for layer: 0=sa_q, 1=sa_k, 2=sa_v, 3=sa_o, 4=ca_q, 5=ca_k, 6=ca_v, 7=ca_o, 8=gate, 9=up, 10=down -enum LoraSlot { - SA_Q, SA_K, SA_V, SA_O, CA_Q, CA_K, CA_V, CA_O, GATE, UP, DOWN, N_SLOTS -}; - -static bool parse_base_name(const std::string & base, int * layer_idx, LoraSlot * slot) { - int L = -1; - if (sscanf(base.c_str(), "decoder.layers.%d.self_attn.q_proj", &L) == 1) { *layer_idx = L; *slot = SA_Q; return true; } - if (sscanf(base.c_str(), "decoder.layers.%d.self_attn.k_proj", &L) == 1) { *layer_idx = L; *slot = SA_K; return true; } - if (sscanf(base.c_str(), "decoder.layers.%d.self_attn.v_proj", &L) == 1) { *layer_idx = L; *slot = SA_V; return true; } - if (sscanf(base.c_str(), "decoder.layers.%d.self_attn.o_proj", &L) == 1) { *layer_idx = L; *slot = SA_O; return true; } - if (sscanf(base.c_str(), "decoder.layers.%d.cross_attn.q_proj", &L) == 1) { 
*layer_idx = L; *slot = CA_Q; return true; } - if (sscanf(base.c_str(), "decoder.layers.%d.cross_attn.k_proj", &L) == 1) { *layer_idx = L; *slot = CA_K; return true; } - if (sscanf(base.c_str(), "decoder.layers.%d.cross_attn.v_proj", &L) == 1) { *layer_idx = L; *slot = CA_V; return true; } - if (sscanf(base.c_str(), "decoder.layers.%d.cross_attn.o_proj", &L) == 1) { *layer_idx = L; *slot = CA_O; return true; } - if (sscanf(base.c_str(), "decoder.layers.%d.mlp.gate_proj", &L) == 1) { *layer_idx = L; *slot = GATE; return true; } - if (sscanf(base.c_str(), "decoder.layers.%d.mlp.up_proj", &L) == 1) { *layer_idx = L; *slot = UP; return true; } - if (sscanf(base.c_str(), "decoder.layers.%d.mlp.down_proj", &L) == 1) { *layer_idx = L; *slot = DOWN; return true; } - return false; -} - -static struct ggml_tensor ** slot_to_ptr(DiTGGMLLayer * ly, LoraSlot slot, bool is_b) { - if (is_b) { - switch (slot) { - case SA_Q: return &ly->lora_sa_q_b; case SA_K: return &ly->lora_sa_k_b; case SA_V: return &ly->lora_sa_v_b; case SA_O: return &ly->lora_sa_o_b; - case CA_Q: return &ly->lora_ca_q_b; case CA_K: return &ly->lora_ca_k_b; case CA_V: return &ly->lora_ca_v_b; case CA_O: return &ly->lora_ca_o_b; - case GATE: return &ly->lora_gate_b; case UP: return &ly->lora_up_b; case DOWN: return &ly->lora_down_b; - default: return nullptr; - } - } else { - switch (slot) { - case SA_Q: return &ly->lora_sa_q_a; case SA_K: return &ly->lora_sa_k_a; case SA_V: return &ly->lora_sa_v_a; case SA_O: return &ly->lora_sa_o_a; - case CA_Q: return &ly->lora_ca_q_a; case CA_K: return &ly->lora_ca_k_a; case CA_V: return &ly->lora_ca_v_a; case CA_O: return &ly->lora_ca_o_a; - case GATE: return &ly->lora_gate_a; case UP: return &ly->lora_up_a; case DOWN: return &ly->lora_down_a; - default: return nullptr; - } - } -} - -bool dit_ggml_load_lora(DiTGGML * m, const char * lora_path, float scale) { - FILE * fp = fopen(lora_path, "rb"); - if (!fp) { - fprintf(stderr, "[LoRA] cannot open %s\n", lora_path); - return 
false; - } - std::unordered_map tensors; - if (fseek(fp, 0, SEEK_SET) != 0) { - fclose(fp); - return false; - } - int n = safetensors_parse_lora(fp, &tensors); - uint64_t data_section_start = (uint64_t)ftell(fp); - if (n == 0) { - fclose(fp); - fprintf(stderr, "[LoRA] no LoRA tensors found in %s\n", lora_path); - return false; - } - - // Count pairs we will load: for each lora_A key, find the matching lora_B (same base name) - std::unordered_map> pairs; // base -> (key_a, key_b) - std::unordered_map base_to_b; - for (const auto & kv : tensors) { - std::string base = lora_key_to_base(kv.first); - if (base.empty()) continue; - if (is_lora_a(kv.first)) - base_to_b[base] = ""; // mark base as having A; we'll find B next - } - for (const auto & kv : tensors) { - std::string base = lora_key_to_base(kv.first); - if (base.empty()) continue; - if (base_to_b.count(base) && kv.first.find("lora_B") != std::string::npos) - base_to_b[base] = kv.first; - } - for (const auto & kv : tensors) { - if (!is_lora_a(kv.first)) continue; - std::string base = lora_key_to_base(kv.first); - auto it = base_to_b.find(base); - if (it != base_to_b.end() && !it->second.empty()) - pairs[base] = { kv.first, it->second }; - } - - int n_pairs = (int)pairs.size(); - wctx_init(&m->lora_wctx, n_pairs * 2); // A and B per pair - - fseek(fp, (long)data_section_start, SEEK_SET); - - for (const auto & p : pairs) { - const std::string & base = p.first; - const std::string & key_a = p.second.first; - const std::string & key_b = p.second.second; - int layer_idx = 0; - LoraSlot slot = N_SLOTS; - if (!parse_base_name(base, &layer_idx, &slot) || layer_idx < 0 || layer_idx >= m->cfg.n_layers) continue; - - DiTGGMLLayer * ly = &m->layers[layer_idx]; - SafeTensorInfo & info_a = tensors[key_a]; - SafeTensorInfo & info_b = tensors[key_b]; - if (info_a.n_dims != 2 || info_b.n_dims != 2) continue; - // A_pt [r, in], B_pt [out, r]. We need A_ggml [r, in] for mul_mat(A,x)=[r,S], B_ggml [out, r] for mul_mat(B, Ax)=[out,S]. 
- // ggml layout: ne[0]=cols, ne[1]=rows. So A: [r, in] -> ne[0]=in, ne[1]=r. B: [out, r] -> ne[0]=r, ne[1]=out. - int64_t r = info_a.shape[0], in_dim = info_a.shape[1]; - int64_t out_dim = info_b.shape[0]; - if (info_b.shape[1] != r) continue; - - struct ggml_tensor * ta = ggml_new_tensor_2d(m->lora_wctx.ctx, GGML_TYPE_F32, (int64_t)in_dim, (int64_t)r); - struct ggml_tensor * tb = ggml_new_tensor_2d(m->lora_wctx.ctx, GGML_TYPE_F32, (int64_t)r, (int64_t)out_dim); - ggml_set_name(ta, key_a.c_str()); - ggml_set_name(tb, key_b.c_str()); - - // Copy A: file is row-major [r, in], we need ggml col-major [in, r] (transpose) - size_t na = (size_t)(r * in_dim); - m->lora_wctx.staging.emplace_back(na); - float * buf_a = m->lora_wctx.staging.back().data(); - if (!safetensors_read_tensor_data(fp, data_section_start, info_a.data_start, info_a.data_end, buf_a)) { - fclose(fp); - wctx_free(&m->lora_wctx); - return false; - } - m->lora_wctx.staging.emplace_back(na); - float * transposed_a = m->lora_wctx.staging.back().data(); - for (int64_t i = 0; i < r; i++) - for (int64_t j = 0; j < in_dim; j++) - transposed_a[(size_t)(j * r + i)] = buf_a[(size_t)(i * in_dim + j)]; - m->lora_wctx.pending.push_back({ ta, transposed_a, na * sizeof(float), 0 }); - - size_t nb = (size_t)(out_dim * r); - m->lora_wctx.staging.emplace_back(nb); - float * buf_b = m->lora_wctx.staging.back().data(); - if (!safetensors_read_tensor_data(fp, data_section_start, info_b.data_start, info_b.data_end, buf_b)) { - fclose(fp); - wctx_free(&m->lora_wctx); - return false; - } - m->lora_wctx.staging.emplace_back(nb); - float * transposed_b = m->lora_wctx.staging.back().data(); - for (int64_t i = 0; i < out_dim; i++) - for (int64_t j = 0; j < r; j++) - transposed_b[(size_t)(j * out_dim + i)] = buf_b[(size_t)(i * r + j)]; - m->lora_wctx.pending.push_back({ tb, transposed_b, nb * sizeof(float), 0 }); - - struct ggml_tensor ** pa = slot_to_ptr(ly, slot, false); - struct ggml_tensor ** pb = slot_to_ptr(ly, slot, true); - 
if (pa) *pa = ta; - if (pb) *pb = tb; - } - fclose(fp); - fp = nullptr; - - if (!wctx_alloc(&m->lora_wctx, m->backend)) { - fprintf(stderr, "[LoRA] failed to allocate LoRA tensors on backend\n"); - wctx_free(&m->lora_wctx); - return false; - } - m->lora_scale = scale; - fprintf(stderr, "[LoRA] loaded %d adapter pairs from %s (scale=%.4f)\n", n_pairs, lora_path, scale); - return true; -} diff --git a/src/dit-sampler.h b/src/dit-sampler.h index 92540a8..31d9817 100644 --- a/src/dit-sampler.h +++ b/src/dit-sampler.h @@ -8,12 +8,8 @@ #include "dit-graph.h" #include "debug.h" -#include "ggml.h" -#include "ggml-backend.h" -#include "ggml-alloc.h" #include -#include #include #include #include diff --git a/src/dit.h b/src/dit.h index 872bf06..cd2936e 100644 --- a/src/dit.h +++ b/src/dit.h @@ -10,17 +10,13 @@ #include "ggml.h" #include "ggml-backend.h" -#include "ggml-alloc.h" #include "gguf-weights.h" #include "backend.h" -#include "debug.h" #include #include -#include #include -#include // Config (mirrors dit.cuh DiTConfig) struct DiTGGMLConfig { @@ -81,19 +77,6 @@ struct DiTGGMLLayer { // AdaLN scale-shift table: [6*hidden] (6 rows of [hidden]) struct ggml_tensor * scale_shift_table; // [hidden, 6] in ggml layout - // Optional LoRA adapters (F32, applied when base projection is separate) - struct ggml_tensor * lora_sa_q_a, * lora_sa_q_b; - struct ggml_tensor * lora_sa_k_a, * lora_sa_k_b; - struct ggml_tensor * lora_sa_v_a, * lora_sa_v_b; - struct ggml_tensor * lora_sa_o_a, * lora_sa_o_b; - struct ggml_tensor * lora_ca_q_a, * lora_ca_q_b; - struct ggml_tensor * lora_ca_k_a, * lora_ca_k_b; - struct ggml_tensor * lora_ca_v_a, * lora_ca_v_b; - struct ggml_tensor * lora_ca_o_a, * lora_ca_o_b; - struct ggml_tensor * lora_gate_a, * lora_gate_b; - struct ggml_tensor * lora_up_a, * lora_up_b; - struct ggml_tensor * lora_down_a, * lora_down_b; - int layer_type; // 0=sliding, 1=full }; @@ -135,8 +118,6 @@ struct DiTGGML { // Weight storage WeightCtx wctx; - WeightCtx lora_wctx; // 
optional LoRA adapter tensors (when lora_scale > 0) - float lora_scale; // alpha/rank for LoRA (0 = no LoRA) // Pre-allocated constant for AdaLN (1+scale) fusion struct ggml_tensor * scalar_one; // [1] = 1.0f, broadcast in ggml_add @@ -404,14 +385,9 @@ static void dit_ggml_init_backend(DiTGGML * m) { m->use_flash_attn = (bp.backend != bp.cpu_backend); } -// Load LoRA adapter from safetensors (e.g. adapter_model.safetensors). -// scale = alpha/rank (typical 1.0). Call after dit_ggml_load. Returns false on error. -bool dit_ggml_load_lora(DiTGGML * m, const char * lora_path, float scale); - static void dit_ggml_free(DiTGGML * m) { if (m->sched) ggml_backend_sched_free(m->sched); backend_release(m->backend, m->cpu_backend); wctx_free(&m->wctx); - if (m->lora_wctx.ctx) wctx_free(&m->lora_wctx); *m = {}; } diff --git a/src/fsq-detok.h b/src/fsq-detok.h index 7430db7..5cc3d7c 100644 --- a/src/fsq-detok.h +++ b/src/fsq-detok.h @@ -10,13 +10,10 @@ #pragma once #include "qwen3-enc.h" -#include // FSQ constants static const int FSQ_NDIMS = 6; static const int FSQ_LEVELS[6] = {8, 8, 8, 5, 5, 5}; -static const int FSQ_N_CODES = 8 * 8 * 8 * 5 * 5 * 5; // 8000 -static const int FSQ_FRAMES_PER_CODE = 5; // FSQ decode: integer index -> 6 normalized float values // Each dimension: level_idx / ((L-1)/2) - 1.0 (maps to [-1, 1]) @@ -113,6 +110,10 @@ static bool detok_ggml_load(DetokGGML * m, const char * gguf_path, ggml_backend_t backends[2] = { backend, cpu_backend }; int n = (backend == cpu_backend) ? 
1 : 2; m->sched = ggml_backend_sched_new(backends, NULL, n, 4096, false, true); + if (!m->sched) { + fprintf(stderr, "[FSQ] FATAL: failed to create scheduler\n"); + return false; + } fprintf(stderr, "[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64)\n"); return true; @@ -220,48 +221,6 @@ static int detok_ggml_decode(DetokGGML * m, const int * codes, int T_5Hz, return T_25Hz; } -// Build codeword table for latent->code (cover from file): for each code 0..FSQ_N_CODES-1, -// decode to 5*64 floats. table_out must be at least FSQ_N_CODES * FSQ_FRAMES_PER_CODE * 64 floats. -static void detok_ggml_build_codeword_table(DetokGGML * m, float * table_out) { - const int chunk = FSQ_FRAMES_PER_CODE * 64; - for (int i = 0; i < FSQ_N_CODES; i++) { - int n = detok_ggml_decode(m, &i, 1, table_out + (size_t)i * chunk); - (void)n; - } -} - -// Encode latent frames to 5Hz codes by nearest codeword. T_latent = number of 25Hz frames (64-d each). -// Groups frames in chunks of 5; for each chunk finds the code whose codeword minimizes L2 distance. -// codeword_table from detok_ggml_build_codeword_table (FSQ_N_CODES * 5 * 64 floats). -// Pads last chunk with zeros if T_latent not divisible by 5. 
-static void latent_frames_to_codes(int T_latent, const float * latent_64d, - const float * codeword_table, - std::vector * out_codes) { - out_codes->clear(); - const int chunk_frames = FSQ_FRAMES_PER_CODE; - const int chunk_size = chunk_frames * 64; - int n_chunks = T_latent / chunk_frames; - if (n_chunks <= 0) return; - for (int g = 0; g < n_chunks; g++) { - const float * chunk = latent_64d + (size_t)g * chunk_size; - int best = 0; - float best_d2 = 1e30f; - for (int i = 0; i < FSQ_N_CODES; i++) { - const float * cw = codeword_table + (size_t)i * chunk_size; - float d2 = 0.0f; - for (int j = 0; j < chunk_size; j++) { - float d = chunk[j] - cw[j]; - d2 += d * d; - } - if (d2 < best_d2) { - best_d2 = d2; - best = i; - } - } - out_codes->push_back(best); - } -} - // Free static void detok_ggml_free(DetokGGML * m) { if (m->sched) ggml_backend_sched_free(m->sched); diff --git a/src/gguf-weights.h b/src/gguf-weights.h index ac5f22d..be5bede 100644 --- a/src/gguf-weights.h +++ b/src/gguf-weights.h @@ -18,7 +18,6 @@ #include #include -#include #include #ifdef _WIN32 diff --git a/src/metadata-fsm.h b/src/metadata-fsm.h index becbe1c..69ae125 100644 --- a/src/metadata-fsm.h +++ b/src/metadata-fsm.h @@ -11,10 +11,8 @@ #include #include #include -#include #include #include -#include // Prefix tree for FSM constrained decoding struct PrefixTree { diff --git a/src/prompt.h b/src/prompt.h index 99782d8..c568f5f 100644 --- a/src/prompt.h +++ b/src/prompt.h @@ -8,7 +8,6 @@ #include #include -#include #include #include diff --git a/src/qwen3-enc.h b/src/qwen3-enc.h index b8ea213..ee9b628 100644 --- a/src/qwen3-enc.h +++ b/src/qwen3-enc.h @@ -17,7 +17,6 @@ #include "gguf-weights.h" #include #include -#include #include #include @@ -404,7 +403,10 @@ static void qwen3_forward(Qwen3GGML * m, const int * token_ids, int S, float * o ggml_build_forward_expand(gf, out); // Allocate - ggml_backend_sched_alloc_graph(m->sched, gf); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + 
fprintf(stderr, "[TextEncoder] FATAL: failed to allocate graph (%d tokens)\n", S); + exit(1); + } // Set inputs ggml_backend_tensor_set(t_ids, token_ids, 0, S * sizeof(int)); @@ -455,7 +457,10 @@ static void qwen3_embed_lookup(Qwen3GGML * m, const int * token_ids, int S, floa ggml_set_output(out); ggml_build_forward_expand(gf, out); - ggml_backend_sched_alloc_graph(m->sched, gf); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + fprintf(stderr, "[TextEncoder] FATAL: failed to allocate graph (embed lookup, %d tokens)\n", S); + exit(1); + } ggml_backend_tensor_set(t_ids, token_ids, 0, S * sizeof(int)); ggml_backend_sched_graph_compute(m->sched, gf); ggml_backend_tensor_get(out, output, 0, (size_t)H * S * sizeof(float)); diff --git a/src/qwen3-lm.h b/src/qwen3-lm.h index 3bbd514..5f5e290 100644 --- a/src/qwen3-lm.h +++ b/src/qwen3-lm.h @@ -4,14 +4,11 @@ #pragma once #include "qwen3-enc.h" // Qwen3Layer, Qwen3Config, layer build helpers -#include "ggml-alloc.h" -#include "bpe.h" #include #include #include #include -#include #include // LM config (superset of encoder config) @@ -450,7 +447,10 @@ static void qw3lm_forward(Qwen3LM * m, const int * token_ids, int n_tokens, ggml_build_forward_expand(gf, lgt); // Schedule + allocate - ggml_backend_sched_alloc_graph(m->sched, gf); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + fprintf(stderr, "[LM] FATAL: failed to allocate graph (prefill, %d tokens)\n", n_tokens); + exit(1); + } // Set token IDs ggml_backend_tensor_set(token_ids_t, token_ids, 0, n_tokens * sizeof(int)); @@ -678,7 +678,10 @@ static void qw3lm_forward_batch(Qwen3LM * m, const int * token_ids, ggml_build_forward_expand(gf, lgt); // Allocate - ggml_backend_sched_alloc_graph(m->sched, gf); + if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { + fprintf(stderr, "[LM] FATAL: failed to allocate graph (batch decode, N=%d)\n", N); + exit(1); + } // Set token IDs ggml_backend_tensor_set(token_ids_t, token_ids, 0, N * sizeof(int)); diff --git 
a/src/request.cpp b/src/request.cpp index f85873a..c851eb3 100644 --- a/src/request.cpp +++ b/src/request.cpp @@ -6,20 +6,16 @@ #include "request.h" #include -#include #include #include -// Defaults (aligned with Python GenerationParams and ACE-Step 1.5 Tutorial) +// Defaults (aligned with Python GenerationParams) void request_init(AceRequest * r) { - r->task_type = "text2music"; r->caption = ""; r->lyrics = ""; - r->instrumental = false; - r->custom_tag = ""; - r->genre = ""; + r->bpm = 0; - r->duration = -1.0f; + r->duration = 0.0f; r->keyscale = ""; r->timesignature = ""; r->vocal_language = "unknown"; @@ -29,14 +25,9 @@ void request_init(AceRequest * r) { r->lm_top_p = 0.9f; r->lm_top_k = 0; r->lm_negative_prompt = ""; - r->reference_audio = ""; - r->src_audio = ""; r->audio_codes = ""; - r->audio_cover_strength = 1.0f; - r->repainting_start = 0.0f; - r->repainting_end = 0.0f; r->inference_steps = 8; - r->guidance_scale = 1.0f; + r->guidance_scale = 0.0f; r->shift = 3.0f; } @@ -226,18 +217,11 @@ bool request_parse(AceRequest * r, const char * path) { const std::string & v = kv.value; // strings - if (k == "task_type") r->task_type = v; - else if (k == "caption") r->caption = v; + if (k == "caption") r->caption = v; else if (k == "lyrics") r->lyrics = v; - else if (k == "custom_tag") r->custom_tag = v; - else if (k == "genre") r->genre = v; else if (k == "keyscale") r->keyscale = v; - else if (k == "formatted_lyrics") r->lyrics = v; // alias for lyrics - else if (k == "language") r->vocal_language = v; // alias for vocal_language else if (k == "timesignature") r->timesignature = v; else if (k == "vocal_language") r->vocal_language = v; - else if (k == "reference_audio") r->reference_audio = v; - else if (k == "src_audio") r->src_audio = v; else if (k == "audio_codes") r->audio_codes = v; else if (k == "lm_negative_prompt") r->lm_negative_prompt = v; @@ -251,16 +235,11 @@ bool request_parse(AceRequest * r, const char * path) { else if (k == "lm_cfg_scale") 
r->lm_cfg_scale = (float)atof(v.c_str()); else if (k == "lm_top_p") r->lm_top_p = (float)atof(v.c_str()); else if (k == "lm_top_k") r->lm_top_k = atoi(v.c_str()); - else if (k == "audio_cover_strength") r->audio_cover_strength = (float)atof(v.c_str()); - else if (k == "repainting_start") r->repainting_start = (float)atof(v.c_str()); - else if (k == "repainting_end") r->repainting_end = (float)atof(v.c_str()); else if (k == "inference_steps") r->inference_steps = atoi(v.c_str()); else if (k == "guidance_scale") r->guidance_scale = (float)atof(v.c_str()); else if (k == "shift") r->shift = (float)atof(v.c_str()); // bools - else if (k == "instrumental") r->instrumental = (v == "true"); - else if (k == "is_instrumental") r->instrumental = (v == "true"); // unknown keys: silently ignored (forward compat) } @@ -276,15 +255,8 @@ bool request_write(const AceRequest * r, const char * path) { } fprintf(f, "{\n"); - fprintf(f, " \"task_type\": \"%s\",\n", json_escape(r->task_type).c_str()); fprintf(f, " \"caption\": \"%s\",\n", json_escape(r->caption).c_str()); fprintf(f, " \"lyrics\": \"%s\",\n", json_escape(r->lyrics).c_str()); - if (r->instrumental) - fprintf(f, " \"instrumental\": true,\n"); - if (!r->custom_tag.empty()) - fprintf(f, " \"custom_tag\": \"%s\",\n", json_escape(r->custom_tag).c_str()); - if (!r->genre.empty()) - fprintf(f, " \"genre\": \"%s\",\n", json_escape(r->genre).c_str()); fprintf(f, " \"bpm\": %d,\n", r->bpm); fprintf(f, " \"duration\": %.1f,\n", r->duration); fprintf(f, " \"keyscale\": \"%s\",\n", json_escape(r->keyscale).c_str()); @@ -296,18 +268,10 @@ bool request_write(const AceRequest * r, const char * path) { fprintf(f, " \"lm_top_p\": %.2f,\n", r->lm_top_p); fprintf(f, " \"lm_top_k\": %d,\n", r->lm_top_k); fprintf(f, " \"lm_negative_prompt\": \"%s\",\n", json_escape(r->lm_negative_prompt).c_str()); - if (!r->reference_audio.empty()) - fprintf(f, " \"reference_audio\": \"%s\",\n", json_escape(r->reference_audio).c_str()); - if 
(!r->src_audio.empty()) - fprintf(f, " \"src_audio\": \"%s\",\n", json_escape(r->src_audio).c_str()); - fprintf(f, " \"audio_cover_strength\": %.2f,\n", r->audio_cover_strength); - if (r->repainting_start != 0.0f || r->repainting_end != 0.0f) { - fprintf(f, " \"repainting_start\": %.1f,\n", r->repainting_start); - fprintf(f, " \"repainting_end\": %.1f,\n", r->repainting_end); - } fprintf(f, " \"inference_steps\": %d,\n", r->inference_steps); fprintf(f, " \"guidance_scale\": %.1f,\n", r->guidance_scale); fprintf(f, " \"shift\": %.1f,\n", r->shift); + // audio_codes last (no trailing comma) fprintf(f, " \"audio_codes\": \"%s\"\n", json_escape(r->audio_codes).c_str()); fprintf(f, "}\n"); @@ -317,12 +281,10 @@ bool request_write(const AceRequest * r, const char * path) { } void request_dump(const AceRequest * r, FILE * f) { - fprintf(f, "[Request] task=%s seed=%lld\n", r->task_type.c_str(), (long long)r->seed); + fprintf(f, "[Request] seed=%lld\n", (long long)r->seed); fprintf(f, " caption: %.60s%s\n", r->caption.c_str(), r->caption.size() > 60 ? "..." : ""); fprintf(f, " lyrics: %zu bytes\n", r->lyrics.size()); - if (!r->custom_tag.empty()) - fprintf(f, " custom_tag: %s\n", r->custom_tag.c_str()); fprintf(f, " bpm=%d dur=%.0f key=%s ts=%s lang=%s\n", r->bpm, r->duration, r->keyscale.c_str(), r->timesignature.c_str(), r->vocal_language.c_str()); @@ -330,12 +292,6 @@ void request_dump(const AceRequest * r, FILE * f) { r->lm_temperature, r->lm_cfg_scale, r->lm_top_p, r->lm_top_k); fprintf(f, " dit: steps=%d guidance=%.1f shift=%.1f\n", r->inference_steps, r->guidance_scale, r->shift); - if (!r->reference_audio.empty()) - fprintf(f, " reference_audio: %s\n", r->reference_audio.c_str()); - if (!r->src_audio.empty()) - fprintf(f, " src_audio: %s\n", r->src_audio.c_str()); - fprintf(f, " audio_codes: %s cover_strength=%.2f\n", - r->audio_codes.empty() ? 
"(none)" : "(present)", r->audio_cover_strength); - if (r->repainting_start != 0.0f || r->repainting_end != 0.0f) - fprintf(f, " repaint: %.1f–%.1fs\n", r->repainting_start, r->repainting_end); + fprintf(f, " audio_codes: %s\n", + r->audio_codes.empty() ? "(none)" : "(present)"); } diff --git a/src/request.h b/src/request.h index ba85821..d1748b5 100644 --- a/src/request.h +++ b/src/request.h @@ -3,31 +3,21 @@ // request.h - AceStep generation request (JSON serialization) // // Pure data container + JSON read/write. Zero business logic. -// Aligned with Python GenerationParams and ACE-Step 1.5 Tutorial: -// https://github.com/ace-step/ACE-Step-1.5/blob/main/docs/en/Tutorial.md +// Aligned with Python GenerationParams (inference.py:39) and API /release_task. // +#include #include #include struct AceRequest { - // --- Task & audio inputs (Tutorial: Input Control) --- - // task_type: "text2music" | "cover" | "repaint" | "lego" | "extract" | "complete" - // (lego/extract/complete require Base DiT; we only have Turbo/SFT) - std::string task_type; // "text2music" - // text content std::string caption; // "" std::string lyrics; // "" - bool instrumental; // false - - // LoRA / style trigger (appended to caption for condition encoder when set) - std::string custom_tag; // "" LoRA trigger word - std::string genre; // "" LoRA genre // metadata (user-provided or LLM-enriched) int bpm; // 0 = unset - float duration; // -1 = unset + float duration; // 0 = unset std::string keyscale; // "" = unset std::string timesignature; // "" = unset std::string vocal_language; // "unknown" @@ -42,20 +32,9 @@ struct AceRequest { int lm_top_k; // 0 = disabled (matches Python None) std::string lm_negative_prompt; // "" - // Audio control (Tutorial: reference_audio, src_audio, audio_codes) - // reference_audio: path to WAV for global timbre/style (VAE encode → CondEncoder timbre) - std::string reference_audio; // "" - // src_audio: path to WAV for Cover (encode → codes) or Repaint context - 
std::string src_audio; // "" // codes (Python-compatible string: "3101,11837,27514,...") // empty = text2music (silence context), non-empty = cover mode std::string audio_codes; // "" - // 0.0–1.0: how strongly generation follows reference/codes (Tutorial: audio_cover_strength) - float audio_cover_strength; // 1.0 - - // Repaint interval (seconds). Only used when task_type == "repaint". - float repainting_start; // 0.0 - float repainting_end; // 0.0 // DiT control (Python: inference_steps, guidance_scale, shift) int inference_steps; // 8 diff --git a/src/safetensors.h b/src/safetensors.h deleted file mode 100644 index 74d5967..0000000 --- a/src/safetensors.h +++ /dev/null @@ -1,107 +0,0 @@ -#pragma once -// safetensors.h: minimal reader for LoRA adapter_model.safetensors -// -// Format: 8-byte header length (LE uint64), then JSON header, then raw tensor data. -// We only parse keys that look like "*lora_A*" / "*lora_B*" and extract shape + data_offsets. - -#include -#include -#include -#include -#include -#include -#include - -struct SafeTensorInfo { - std::string dtype; // "F32", "F16", "BF16" - int64_t shape[2]; // [dim0, dim1] from JSON - int n_dims; - uint64_t data_start; // byte offset in file (after header) - uint64_t data_end; -}; - -// Open file, read header, parse tensor metadata for LoRA tensors. -// Returns number of LoRA tensors found; fills *out with tensor name -> info. -// Caller must fclose(fp) and free the map; file position is left at start of data section. 
-static int safetensors_parse_lora(FILE * fp, std::unordered_map * out) { - out->clear(); - uint64_t header_len = 0; - uint8_t h8[8]; - if (fread(h8, 1, 8, fp) != 8) return 0; - header_len = (uint64_t)h8[0] | ((uint64_t)h8[1] << 8) | ((uint64_t)h8[2] << 16) | ((uint64_t)h8[3] << 24) - | ((uint64_t)h8[4] << 32) | ((uint64_t)h8[5] << 40) | ((uint64_t)h8[6] << 48) | ((uint64_t)h8[7] << 56); - if (header_len == 0 || header_len > 10 * 1024 * 1024) return 0; // cap 10MB header - std::vector buf(header_len + 1); - if (fread(buf.data(), 1, header_len, fp) != header_len) return 0; - buf[header_len] = '\0'; - const char * json = buf.data(); - - // Find each key that contains "lora_A" or "lora_B" - const char * p = json; - int count = 0; - while ((p = strstr(p, "\"")) != nullptr) { - const char * key_start = p + 1; - p = strchr(key_start, '"'); - if (!p) break; - std::string key(key_start, (size_t)(p - key_start)); - p++; - if (key.find("lora_A") == std::string::npos && key.find("lora_B") == std::string::npos) { - continue; - } - // Find the value object for this key: skip ": - while (*p && (*p == ' ' || *p == ':')) p++; - if (*p != '{') continue; - const char * obj = p; - SafeTensorInfo info = {}; - info.shape[0] = info.shape[1] = 1; - info.n_dims = 0; - // "shape":[n,m] or [n] - const char * sh = strstr(obj, "\"shape\""); - if (sh) { - const char * br = strchr(sh, '['); - if (br) { - long long a = 0, b = 0; - int n = sscanf(br, "[%lld,%lld]", &a, &b); - if (n >= 1) { info.shape[0] = (int64_t)a; info.n_dims = 1; } - if (n >= 2) { info.shape[1] = (int64_t)b; info.n_dims = 2; } - } - } - const char * dt = strstr(obj, "\"dtype\""); - if (dt) { - const char * q = strchr(dt, '"'); - if (q) q = strchr(q + 1, '"'); - if (q) { - const char * start = q + 1; - const char * end = strchr(start, '"'); - if (end) info.dtype = std::string(start, end - start); - } - } - const char * off = strstr(obj, "\"data_offsets\""); - if (off) { - const char * br = strchr(off, '['); - if (br) { - 
uint64_t s = 0, e = 0; - if (sscanf(br, "[%llu,%llu]", (unsigned long long*)&s, (unsigned long long*)&e) == 2) { - info.data_start = s; - info.data_end = e; - } - } - } - if (info.dtype.empty() || info.n_dims == 0) continue; - (*out)[key] = info; - count++; - } - return count; -} - -// Read raw tensor data from file. File must be positioned at start of data section -// (i.e. after the 8-byte header length + header bytes). -// data_offset in the JSON is relative to the start of the data section. -static bool safetensors_read_tensor_data(FILE * fp, uint64_t data_section_start, - uint64_t tensor_start, uint64_t tensor_end, void * out_buf) { - uint64_t off = data_section_start + tensor_start; - uint64_t nbytes = tensor_end - tensor_start; - if (fseek(fp, (long)off, SEEK_SET) != 0) return false; - if (fread(out_buf, 1, nbytes, fp) != nbytes) return false; - return true; -} diff --git a/src/vae-enc.h b/src/vae-enc.h new file mode 100644 index 0000000..f5c67f2 --- /dev/null +++ b/src/vae-enc.h @@ -0,0 +1,391 @@ +// vae-enc.h: AutoencoderOobleck encoder (audio -> latent) via ggml +// +// Mirror of vae.h decoder. Reuses VAEResUnit, load helpers, graph ops. +// Architecture: conv1(2->128,k=7) -> 5x(3xresunit+snake+strided_conv) -> snake+conv2(2048->128,k=3) +// Output 128ch = mean[64] + scale[64]. Deterministic encode returns mean. 
+// Downsample: 2x4x4x6x10 = 1920x (matches decoder upsample) + +#pragma once +#include "vae.h" + +// Encoder block: 3xResUnit(in_ch) -> snake(in_ch) -> strided Conv1d(in_ch -> out_ch) +// Decoder block is the mirror: snake(in_ch) -> ConvT(in_ch -> out_ch) -> 3xResUnit(out_ch) +struct VAEEncBlock { + VAEResUnit ru[3]; + struct ggml_tensor * sa, * sb; // snake [1, in_ch] + struct ggml_tensor * dw, * db; // strided conv [K, in_ch, out_ch], bias [out_ch] + int in_ch, out_ch, stride, kernel, padding; +}; + +struct VAEEncoder { + struct ggml_tensor * c1w, * c1b; // conv1 [7, 2, 128], bias [128] + VAEEncBlock blk[5]; + struct ggml_tensor * sa, * sb; // final snake [1, 2048] + struct ggml_tensor * c2w, * c2b; // conv2 [3, 2048, 128], bias [128] + + ggml_backend_t backend; + ggml_backend_t cpu_backend; + ggml_backend_sched_t sched; + ggml_backend_buffer_t buf; + struct ggml_context * weight_ctx; + + // graph cache (rebuilt when T_audio changes) + struct ggml_context * graph_ctx; + uint8_t * graph_buf; + struct ggml_cgraph * graph; + struct ggml_tensor * graph_input; // [T_audio, 2] + struct ggml_tensor * graph_output; // [T_latent, 128] + int graph_T; // cached T_audio (0 = no cache) + + std::vector scratch_in; // transposed input [2 * T_audio] +}; + +// Load encoder weights from the same VAE GGUF (encoder.* tensors) +static void vae_enc_load(VAEEncoder * m, const char * path) { + GGUFModel gf = {}; + if (!gf_load(&gf, path)) { + fprintf(stderr, "[VAE-Enc] FATAL: cannot load %s\n", path); + exit(1); + } + + // Encoder channel layout (mirror of decoder, bottom-up): + // conv1: 2 -> 128 + // block: [128->128, 128->256, 256->512, 512->1024, 1024->2048] + // conv2: 2048 -> 128 (split: mean[64] + scale[64]) + // ResUnits run at in_ch (before downsample), unlike decoder (at out_ch, after upsample). 
+ static const int in_ch[] = {128, 128, 256, 512, 1024}; + static const int out_ch[] = {128, 256, 512, 1024, 2048}; + static const int strides[] = { 2, 4, 4, 6, 10}; + static const int dilations[] = {1, 3, 9}; + + // Phase 1: create weight tensors + size_t ctx_size = ggml_tensor_overhead() * 256; + struct ggml_init_params p = { ctx_size, NULL, true }; + m->weight_ctx = ggml_init(p); + struct ggml_context * ctx = m->weight_ctx; + + m->c1w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 7, 2, 128); + m->c1b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128); + + for (int i = 0; i < 5; i++) { + VAEEncBlock & b = m->blk[i]; + b.in_ch = in_ch[i]; + b.out_ch = out_ch[i]; + b.stride = strides[i]; + b.kernel = strides[i] * 2; + b.padding = (strides[i] + 1) / 2; // ceil(stride / 2) + int C = in_ch[i]; // res_units + snake at in_ch + + // 3 res units at in_ch + for (int r = 0; r < 3; r++) { + VAEResUnit & ru = b.ru[r]; + ru.dilation = dilations[r]; + ru.s1a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.s1b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.c1w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 7, C, C); + ru.c1b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, C); + ru.s2a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.s2b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + ru.c2w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 1, C, C); + ru.c2b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, C); + } + + // snake at in_ch (before downsample conv) + b.sa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + b.sb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); + + // strided conv1d: [K, in_ch, out_ch] + b.dw = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, b.kernel, in_ch[i], out_ch[i]); + b.db = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_ch[i]); + } + + m->sa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 2048); + m->sb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 2048); + m->c2w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 3, 2048, 128); + m->c2b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128); + + // Phase 
2: allocate backend buffer + BackendPair bp = backend_init("VAE-Enc"); + m->backend = bp.backend; + m->cpu_backend = bp.cpu_backend; + m->sched = backend_sched_new(bp, 8192); + m->buf = ggml_backend_alloc_ctx_tensors(ctx, m->backend); + if (!m->buf) { + fprintf(stderr, "[VAE-Enc] FATAL: failed to allocate weight buffer\n"); + exit(1); + } + fprintf(stderr, "[VAE-Enc] Backend: %s, Weight buffer: %.1f MB\n", + ggml_backend_name(m->backend), + (float)ggml_backend_buffer_get_size(m->buf) / (1024 * 1024)); + + // Phase 3: load and fuse weights + vae_fuse_wn(m->c1w, gf, "encoder.conv1"); + vae_load_bias(m->c1b, gf, "encoder.conv1.bias"); + + for (int i = 0; i < 5; i++) { + VAEEncBlock & b = m->blk[i]; + std::string blk_pfx = "encoder.block." + std::to_string(i); + + // res_units first (same load pattern as decoder) + for (int r = 0; r < 3; r++) { + VAEResUnit & ru = b.ru[r]; + std::string rp = blk_pfx + ".res_unit" + std::to_string(r + 1); + vae_load_snake(ru.s1a, gf, rp + ".snake1.alpha"); + vae_load_snake_inv(ru.s1b, gf, rp + ".snake1.beta"); + vae_fuse_wn(ru.c1w, gf, rp + ".conv1"); + vae_load_bias(ru.c1b, gf, rp + ".conv1.bias"); + vae_load_snake(ru.s2a, gf, rp + ".snake2.alpha"); + vae_load_snake_inv(ru.s2b, gf, rp + ".snake2.beta"); + vae_fuse_wn(ru.c2w, gf, rp + ".conv2"); + vae_load_bias(ru.c2b, gf, rp + ".conv2.bias"); + } + + // snake + strided downsample conv (regular conv1d, NOT transposed) + vae_load_snake(b.sa, gf, blk_pfx + ".snake1.alpha"); + vae_load_snake_inv(b.sb, gf, blk_pfx + ".snake1.beta"); + vae_fuse_wn(b.dw, gf, blk_pfx + ".conv1"); + vae_load_bias(b.db, gf, blk_pfx + ".conv1.bias"); + } + + vae_load_snake(m->sa, gf, "encoder.snake1.alpha"); + vae_load_snake_inv(m->sb, gf, "encoder.snake1.beta"); + vae_fuse_wn(m->c2w, gf, "encoder.conv2"); + vae_load_bias(m->c2b, gf, "encoder.conv2.bias"); + + fprintf(stderr, "[VAE-Enc] Loaded: 5 blocks, downsample=1920x, F32 activations\n"); + gf_close(&gf); +} + +// Build encoder graph: audio [T_audio, 2] -> 
[T_latent, 128] +static struct ggml_tensor * vae_enc_build_graph( + struct ggml_context * ctx, + VAEEncoder * m, + struct ggml_tensor * audio) { // [T, 2] + + // conv1: [T, 2] -> [T, 128] + struct ggml_tensor * x = vae_conv1d(ctx, m->c1w, m->c1b, audio, 1, 3, 1); + + // 5 encoder blocks: resunits(in_ch) -> snake(in_ch) -> strided conv(in_ch -> out_ch) + for (int i = 0; i < 5; i++) { + VAEEncBlock & b = m->blk[i]; + for (int r = 0; r < 3; r++) + x = vae_res_unit(ctx, &b.ru[r], x); + x = vae_snake(ctx, x, b.sa, b.sb); + x = vae_conv1d(ctx, b.dw, b.db, x, b.stride, b.padding, 1); + } + + // Final: snake(2048) -> conv2(2048 -> 128, k=3, pad=1) + x = vae_snake(ctx, x, m->sa, m->sb); + x = vae_conv1d(ctx, m->c2w, m->c2b, x, 1, 1, 1); + + return x; // [T_latent, 128] +} + +// Core compute: build/cache graph, set input, run. Returns T_latent or -1. +// Output stays in m->graph_output for caller to read. +static int vae_enc_compute( + VAEEncoder * m, + const float * audio, // [T_audio, 2] time-major interleaved stereo + int T_audio) { + + // Rebuild graph when T_audio changes + if (m->graph_T != T_audio) { + if (m->graph_ctx) { + ggml_backend_sched_reset(m->sched); + ggml_free(m->graph_ctx); + free(m->graph_buf); + } + + size_t ctx_size = ggml_tensor_overhead() * 1024 + ggml_graph_overhead_custom(8192, false); + m->graph_buf = (uint8_t *)malloc(ctx_size); + struct ggml_init_params p = { ctx_size, m->graph_buf, true }; + struct ggml_context * ctx = ggml_init(p); + + m->graph_input = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, T_audio, 2); + ggml_set_name(m->graph_input, "enc_input"); + ggml_set_input(m->graph_input); + + m->graph_output = vae_enc_build_graph(ctx, m, m->graph_input); + ggml_set_name(m->graph_output, "enc_output"); + ggml_set_output(m->graph_output); + + m->graph = ggml_new_graph_custom(ctx, 8192, false); + ggml_build_forward_expand(m->graph, m->graph_output); + + if (!ggml_backend_sched_alloc_graph(m->sched, m->graph)) { + fprintf(stderr, "[VAE-Enc] FATAL: graph 
alloc failed for T=%d\n", T_audio); + ggml_free(ctx); + free(m->graph_buf); + m->graph_ctx = NULL; + m->graph_buf = NULL; + m->graph_T = 0; + return -1; + } + + m->graph_ctx = ctx; + m->graph_T = T_audio; + fprintf(stderr, "[VAE-Enc] Graph: %d nodes, T_audio=%d\n", + ggml_graph_n_nodes(m->graph), T_audio); + } + + // Transpose: [T, 2] time-major -> ggml [T, 2] channel-contiguous + // ggml ne[0]=T is the contiguous dim, so we write all T samples per channel + size_t in_size = (size_t)2 * T_audio; + if (m->scratch_in.size() < in_size) + m->scratch_in.resize(in_size); + for (int c = 0; c < 2; c++) + for (int t = 0; t < T_audio; t++) + m->scratch_in[c * T_audio + t] = audio[t * 2 + c]; + ggml_backend_tensor_set(m->graph_input, + m->scratch_in.data(), 0, in_size * sizeof(float)); + + ggml_backend_sched_graph_compute(m->sched, m->graph); + + return (int)m->graph_output->ne[0]; // T_latent +} + +// Encode API: audio [T_audio, 2] -> latent_out [T_latent, 64] (mean only, deterministic) +// Returns T_latent (or -1 on error). +// latent_out must hold at least (T_audio / 1920) * 64 floats. +static int vae_enc_encode( + VAEEncoder * m, + const float * audio, // [T_audio, 2] interleaved stereo + int T_audio, + float * latent_out, // [T_latent, 64] output, time-major + int max_T_latent) { + + int T_latent = vae_enc_compute(m, audio, T_audio); + if (T_latent < 0) return -1; + + if (T_latent > max_T_latent) { + fprintf(stderr, "[VAE-Enc] T_latent %d exceeds max %d\n", T_latent, max_T_latent); + return -1; + } + + // Graph output is [ne0=T_latent, ne1=128] in ggml, channel-contiguous. + // Channels 0..63 = mean, 64..127 = scale. We only read mean. + // ggml layout: data[c * T_latent + t] for channel c, time t. 
+ // We write time-major: latent_out[t * 64 + c] = data[c * T_latent + t] + // + // Read the full 128ch output once, extract mean channels 0..63 + size_t out_bytes = (size_t)128 * T_latent * sizeof(float); + std::vector raw(128 * T_latent); + ggml_backend_tensor_get(m->graph_output, raw.data(), 0, out_bytes); + + for (int t = 0; t < T_latent; t++) + for (int c = 0; c < 64; c++) + latent_out[t * 64 + c] = raw[c * T_latent + t]; + + fprintf(stderr, "[VAE-Enc] Encode: T_audio=%d -> T_latent=%d (%.2fs @ 48kHz)\n", + T_audio, T_latent, (float)T_audio / 48000.0f); + + return T_latent; +} + +// Tiled encode for long audio (same chunking strategy as decoder) +// chunk_size: latent frames per tile, overlap: context frames on each side +static int vae_enc_encode_tiled( + VAEEncoder * m, + const float * audio, // [T_audio, 2] interleaved stereo + int T_audio, + float * latent_out, // [T_latent, 64] output, time-major + int max_T_latent, + int chunk_size = 256, + int overlap = 64) { + + // Work in audio-sample space. Each latent frame = 1920 audio samples. 
+ int audio_chunk = chunk_size * 1920; + int audio_overlap = overlap * 1920; + + // Shrink overlap until stride is positive + while (audio_chunk - 2 * audio_overlap <= 0 && audio_overlap > 0) + audio_overlap /= 2; + + // Short audio: encode directly + if (T_audio <= audio_chunk) + return vae_enc_encode(m, audio, T_audio, latent_out, max_T_latent); + + int audio_stride = audio_chunk - 2 * audio_overlap; + int num_steps = (T_audio + audio_stride - 1) / audio_stride; + + fprintf(stderr, "[VAE-Enc] Tiled encode: %d tiles (chunk=%d, overlap=%d, stride=%d audio samples)\n", + num_steps, audio_chunk, audio_overlap, audio_stride); + + float downsample_factor = 0.0f; + int latent_write_pos = 0; + + for (int i = 0; i < num_steps; i++) { + // Core range in audio samples (the part we keep) + int core_start = i * audio_stride; + int core_end = core_start + audio_stride; + if (core_end > T_audio) core_end = T_audio; + + // Window with overlap context + int win_start = core_start - audio_overlap; + if (win_start < 0) win_start = 0; + int win_end = core_end + audio_overlap; + if (win_end > T_audio) win_end = T_audio; + int win_len = win_end - win_start; + + // Encode this window + int tile_T = vae_enc_compute(m, audio + win_start * 2, win_len); + if (tile_T < 0) { + fprintf(stderr, "[VAE-Enc] FATAL: tile %d encode failed\n", i); + return -1; + } + + // Determine downsample factor from first tile + if (i == 0) { + downsample_factor = (float)tile_T / (float)win_len; + fprintf(stderr, "[VAE-Enc] Downsample factor: %.6f (expected ~1/1920)\n", + downsample_factor); + } + + // Trim in latent frames (mirror of decoder trim logic) + int added_start = core_start - win_start; + int trim_start = (int)roundf((float)added_start * downsample_factor); + int added_end = win_end - core_end; + int trim_end = (int)roundf((float)added_end * downsample_factor); + + int end_idx = (trim_end > 0) ? 
(tile_T - trim_end) : tile_T; + int core_len = end_idx - trim_start; + if (core_len <= 0) continue; + + if (latent_write_pos + core_len > max_T_latent) { + fprintf(stderr, "[VAE-Enc] FATAL: tiled output exceeds max_T_latent\n"); + return -1; + } + + // Read tile output [ne0=tile_T, ne1=128], extract mean (ch 0..63), transpose + // Only read the first 64 channels (mean), skip scale channels 64..127 + size_t out_bytes = (size_t)128 * tile_T * sizeof(float); + std::vector raw(128 * tile_T); + ggml_backend_tensor_get(m->graph_output, raw.data(), 0, out_bytes); + + for (int t = 0; t < core_len; t++) + for (int c = 0; c < 64; c++) + latent_out[(latent_write_pos + t) * 64 + c] = + raw[c * tile_T + (trim_start + t)]; + + latent_write_pos += core_len; + } + + fprintf(stderr, "[VAE-Enc] Tiled encode done: %d tiles -> T_latent=%d (%.2fs @ 48kHz)\n", + num_steps, latent_write_pos, (float)T_audio / 48000.0f); + + return latent_write_pos; +} + +// Free all resources +static void vae_enc_free(VAEEncoder * m) { + if (m->graph_ctx) { + ggml_backend_sched_reset(m->sched); + ggml_free(m->graph_ctx); + free(m->graph_buf); + } + if (m->sched) ggml_backend_sched_free(m->sched); + if (m->buf) ggml_backend_buffer_free(m->buf); + if (m->weight_ctx) ggml_free(m->weight_ctx); + backend_release(m->backend, m->cpu_backend); + *m = {}; +} diff --git a/src/vae.h b/src/vae.h index 20e2b6f..fbf6d5f 100644 --- a/src/vae.h +++ b/src/vae.h @@ -14,7 +14,6 @@ #include "backend.h" #include #include -#include #include #include @@ -210,12 +209,16 @@ static void vae_ggml_load(VAEGGML * m, const char * path) { m->sb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 128); m->c2w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 7, 128, 2); - // Phase 2: allocate backend buffer (im2col grid Y fix enables long-sequence conv1d) + // Phase 2: allocate backend buffer BackendPair bp = backend_init("VAE"); m->backend = bp.backend; m->cpu_backend = bp.cpu_backend; m->sched = backend_sched_new(bp, 8192); m->buf = 
ggml_backend_alloc_ctx_tensors(ctx, m->backend); + if (!m->buf) { + fprintf(stderr, "[VAE] FATAL: failed to allocate weight buffer\n"); + exit(1); + } fprintf(stderr, "[VAE] Backend: %s, Weight buffer: %.1f MB\n", ggml_backend_name(m->backend), (float)ggml_backend_buffer_get_size(m->buf) / (1024 * 1024)); @@ -555,203 +558,3 @@ static void vae_ggml_free(VAEGGML * m) { backend_release(m->backend, m->cpu_backend); *m = {}; } - -// --------------------------------------------------------------------------- -// VAE Encoder (audio -> 64-d latents @ 25Hz for reference timbre) -// Oobleck encoder: conv1(2->128) -> 5 blocks (stride 2,4,4,8,8) -> snake -> conv2(2048->128) -// Output 128 = mean(64) + scale(64); we use mean only. -// Requires encoder.* tensors in the same VAE GGUF (full autoencoder export). -// --------------------------------------------------------------------------- -struct VAEEncoderBlock { - VAEResUnit ru[3]; - struct ggml_tensor * sa, * sb; - struct ggml_tensor * c1w, * c1b; - int in_ch, out_ch, stride; -}; - -struct VAEEncoderGGML { - struct ggml_tensor * c1w, * c1b; - VAEEncoderBlock blk[5]; - struct ggml_tensor * sa, * sb; - struct ggml_tensor * c2w, * c2b; - - ggml_backend_t backend; - ggml_backend_t cpu_backend; - ggml_backend_sched_t sched; - ggml_backend_buffer_t buf; - struct ggml_context * weight_ctx; - bool has_encoder; -}; - -static bool vae_encoder_load(VAEEncoderGGML * m, const char * path) { - m->has_encoder = false; - GGUFModel gf = {}; - if (!gf_load(&gf, path)) { - fprintf(stderr, "[VAE Encoder] cannot load %s\n", path); - return false; - } - if (!ggml_get_tensor(gf.meta, "encoder.conv1.weight_v")) { - gf_close(&gf); - fprintf(stderr, "[VAE Encoder] no encoder.* in %s (decoder-only GGUF). 
Use a full VAE GGUF for reference_audio WAV.\n", path); - return false; - } - - static const int enc_strides[] = {2, 4, 4, 8, 8}; - static const int enc_in_ch[] = {128, 256, 512, 1024, 2048}; - static const int enc_out_ch[] = {256, 512, 1024, 2048, 2048}; - static const int dilations[] = {1, 3, 9}; - - size_t ctx_size = ggml_tensor_overhead() * 200; - struct ggml_init_params p = { ctx_size, NULL, true }; - m->weight_ctx = ggml_init(p); - struct ggml_context * ctx = m->weight_ctx; - - m->c1w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 7, 2, 128); - m->c1b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128); - - for (int i = 0; i < 5; i++) { - VAEEncoderBlock & b = m->blk[i]; - b.in_ch = enc_in_ch[i]; - b.out_ch = enc_out_ch[i]; - b.stride = enc_strides[i]; - int C = b.in_ch; - b.sa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); - b.sb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); - b.c1w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 2 * b.stride, C, b.out_ch); - b.c1b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, b.out_ch); - for (int r = 0; r < 3; r++) { - VAEResUnit & ru = b.ru[r]; - ru.dilation = dilations[r]; - ru.s1a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); - ru.s1b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); - ru.c1w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 7, C, C); - ru.c1b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, C); - ru.s2a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); - ru.s2b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, C); - ru.c2w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 1, C, C); - ru.c2b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, C); - } - } - m->sa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 2048); - m->sb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 2048); - m->c2w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 3, 2048, 128); - m->c2b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128); - - BackendPair bp = backend_init("VAE-Encoder"); - m->backend = bp.backend; - m->cpu_backend = bp.cpu_backend; - m->sched = backend_sched_new(bp, 8192); - m->buf = 
ggml_backend_alloc_ctx_tensors(ctx, m->backend); - - vae_fuse_wn(m->c1w, gf, "encoder.conv1"); - vae_load_bias(m->c1b, gf, "encoder.conv1.bias"); - - for (int i = 0; i < 5; i++) { - VAEEncoderBlock & b = m->blk[i]; - std::string pfx = "encoder.block." + std::to_string(i); - for (int r = 0; r < 3; r++) { - std::string rp = pfx + ".res_unit" + std::to_string(r + 1); - vae_load_snake(b.ru[r].s1a, gf, rp + ".snake1.alpha"); - vae_load_snake_inv(b.ru[r].s1b, gf, rp + ".snake1.beta"); - vae_fuse_wn(b.ru[r].c1w, gf, rp + ".conv1"); - vae_load_bias(b.ru[r].c1b, gf, rp + ".conv1.bias"); - vae_load_snake(b.ru[r].s2a, gf, rp + ".snake2.alpha"); - vae_load_snake_inv(b.ru[r].s2b, gf, rp + ".snake2.beta"); - vae_fuse_wn(b.ru[r].c2w, gf, rp + ".conv2"); - vae_load_bias(b.ru[r].c2b, gf, rp + ".conv2.bias"); - } - vae_load_snake(b.sa, gf, pfx + ".snake1.alpha"); - vae_load_snake_inv(b.sb, gf, pfx + ".snake1.beta"); - vae_fuse_wn(b.c1w, gf, pfx + ".conv1"); - vae_load_bias(b.c1b, gf, pfx + ".conv1.bias"); - } - vae_load_snake(m->sa, gf, "encoder.snake1.alpha"); - vae_load_snake_inv(m->sb, gf, "encoder.snake1.beta"); - vae_fuse_wn(m->c2w, gf, "encoder.conv2"); - vae_load_bias(m->c2b, gf, "encoder.conv2.bias"); - - gf_close(&gf); - m->has_encoder = true; - fprintf(stderr, "[VAE Encoder] loaded (2->128->...->2048->128, 64-d mean)\n"); - return true; -} - -static int vae_encoder_forward(VAEEncoderGGML * m, const float * audio, int T_audio, - float * latent_out) { - if (!m->has_encoder || T_audio < 1920) return -1; - - ggml_backend_sched_reset(m->sched); - size_t ctx_size = 4096 * ggml_tensor_overhead() + ggml_graph_overhead(); - struct ggml_init_params gp = { ctx_size, NULL, true }; - struct ggml_context * ctx = ggml_init(gp); - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 4096, false); - - struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, T_audio, 2); - ggml_set_name(x, "audio_in"); - ggml_set_input(x); - x = vae_conv1d(ctx, m->c1w, m->c1b, x, 1, 3, 1); - - for 
(int i = 0; i < 5; i++) { - VAEEncoderBlock & b = m->blk[i]; - for (int r = 0; r < 3; r++) - x = vae_res_unit(ctx, &b.ru[r], x); - x = vae_snake(ctx, x, b.sa, b.sb); - int pad = (int)((float)b.stride / 2.0f + 0.5f); - x = vae_conv1d(ctx, b.c1w, b.c1b, x, b.stride, pad, 1); - } - - x = vae_snake(ctx, x, m->sa, m->sb); - x = vae_conv1d(ctx, m->c2w, m->c2b, x, 1, 1, 1); - ggml_set_name(x, "enc_out"); - ggml_set_output(x); - - ggml_build_forward_expand(gf, x); - if (!ggml_backend_sched_alloc_graph(m->sched, gf)) { - ggml_free(ctx); - return -1; - } - ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "audio_in"), audio, 0, (size_t)T_audio * 2 * sizeof(float)); - ggml_backend_sched_graph_compute(m->sched, gf); - ggml_backend_sched_synchronize(m->sched); - - struct ggml_tensor * out = ggml_graph_get_tensor(gf, "enc_out"); - // Encoder strides 2,4,4,8,8 give T_out != T_audio/1920; use actual output shape to avoid read out of bounds - int T_latent = (int)out->ne[0]; - size_t nbytes = (size_t)T_latent * 128 * sizeof(float); - std::vector tmp((size_t)T_latent * 128); - ggml_backend_tensor_get(out, tmp.data(), 0, nbytes); - for (int t = 0; t < T_latent; t++) - for (int c = 0; c < 64; c++) - latent_out[t * 64 + c] = tmp[t * 128 + c]; - - ggml_backend_sched_reset(m->sched); - ggml_free(ctx); - return T_latent; -} - -static void vae_encoder_free(VAEEncoderGGML * m) { - // Order: reset sched, free sched (drops refs to graph/alloc), free weight_ctx (tensor metadata), - // then buffer (tensor data), then backends. Avoids double free on some GGML backends. 
- if (m->sched) { - ggml_backend_sched_reset(m->sched); - ggml_backend_sched_free(m->sched); - m->sched = NULL; - } - if (m->weight_ctx) { - ggml_free(m->weight_ctx); - m->weight_ctx = NULL; - } - if (m->buf) { - ggml_backend_buffer_free(m->buf); - m->buf = NULL; - } - if (m->backend && m->backend != m->cpu_backend) { - ggml_backend_free(m->backend); - m->backend = NULL; - } - if (m->cpu_backend) { - ggml_backend_free(m->cpu_backend); - m->cpu_backend = NULL; - } - *m = {}; -} diff --git a/src/wav.h b/src/wav.h deleted file mode 100644 index e7f0f02..0000000 --- a/src/wav.h +++ /dev/null @@ -1,101 +0,0 @@ -// wav.h: minimal WAV loader for reference audio (stereo 48kHz float out) -// No Python or external deps. Handles 16-bit PCM, mono/stereo, resamples to 48kHz if needed. - -#pragma once - -#include -#include -#include -#include -#include - -// Load WAV file into stereo float32 at 48kHz. -// Out: interleaved L,R,L,R,... length = num_samples (both channels). -// Returns num_samples (per channel), or -1 on error. 
-static int wav_load_48k_stereo(const char * path, std::vector * out) { - FILE * f = fopen(path, "rb"); - if (!f) return -1; - - char riff[4], fmt[4]; - if (fread(riff, 1, 4, f) != 4 || memcmp(riff, "RIFF", 4) != 0) { - fclose(f); - return -1; - } - uint32_t file_len; - if (fread(&file_len, 4, 1, f) != 1) { fclose(f); return -1; } - if (fread(fmt, 1, 4, f) != 4 || memcmp(fmt, "WAVE", 4) != 0) { - fclose(f); - return -1; - } - - uint16_t channels = 2, bits = 16; - uint32_t sample_rate = 48000; - bool found_fmt = false; - - while (1) { - char chunk_id[4]; - if (fread(chunk_id, 1, 4, f) != 4) break; - uint32_t chunk_size; - if (fread(&chunk_size, 4, 1, f) != 1) break; - long chunk_start = ftell(f); - - if (memcmp(chunk_id, "fmt ", 4) == 0 && chunk_size >= 16) { - uint16_t fmt_tag, block_align; - uint32_t byte_rate; - if (fread(&fmt_tag, 2, 1, f) != 1) break; - if (fread(&channels, 2, 1, f) != 1) break; - if (fread(&sample_rate, 4, 1, f) != 1) break; - if (fread(&byte_rate, 4, 1, f) != 1) break; - if (fread(&block_align, 2, 1, f) != 1) break; - if (fread(&bits, 2, 1, f) != 1) break; - found_fmt = true; - } else if (memcmp(chunk_id, "data", 4) == 0 && found_fmt) { - size_t num_bytes = chunk_size; - size_t num_samples = num_bytes / (channels * (bits / 8)); - if (num_samples == 0) { fclose(f); return -1; } - - std::vector raw(num_samples * channels); - if (fread(raw.data(), 2, raw.size(), f) != raw.size()) { - fclose(f); - return -1; - } - - out->resize(num_samples * 2); - float scale = 1.0f / 32768.0f; - if (channels == 1) { - for (size_t i = 0; i < num_samples; i++) { - float s = (float)raw[i] * scale; - (*out)[i * 2] = s; - (*out)[i * 2 + 1] = s; - } - } else { - for (size_t i = 0; i < num_samples * 2; i++) - (*out)[i] = (float)raw[i] * scale; - } - - fclose(f); - - // Resample to 48kHz if needed (linear interpolation) - if (sample_rate != 48000) { - size_t in_len = num_samples; - size_t out_len = (size_t)((double)in_len * 48000.0 / (double)sample_rate); - std::vector 
resampled(out_len * 2); - for (size_t i = 0; i < out_len; i++) { - double t = (double)i * (double)in_len / (double)out_len; - size_t i0 = (size_t)t; - size_t i1 = std::min(i0 + 1, in_len - 1); - float w = (float)(t - (double)i0); - for (int c = 0; c < 2; c++) - resampled[i * 2 + c] = (*out)[i0 * 2 + c] * (1.0f - w) + (*out)[i1 * 2 + c] * w; - } - *out = std::move(resampled); - return (int)out_len; - } - return (int)num_samples; - } - - fseek(f, chunk_start + (long)chunk_size, SEEK_SET); - } - fclose(f); - return -1; -} diff --git a/tests/BF16.log b/tests/BF16.log deleted file mode 100644 index 7ea7d57..0000000 --- a/tests/BF16.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf -[GGML] Running acestep-v15-turbo-BF16.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999805 - lyric_embed 1.000000 - enc_hidden 0.999830 - detok_output 0.999996 - context 0.999998 - noise 1.000000 - temb_t 0.999999 - hidden_after_proj_in 0.999988 - enc_after_cond_emb 0.999818 - layer0_sa_output 0.999951 - hidden_after_layer0 0.999978 - hidden_after_layer6 0.999916 - hidden_after_layer12 0.999234 - hidden_after_layer18 0.996570 - hidden_after_layer23 0.993528 - dit_step0_vt 0.974876 - dit_step0_xt 0.999945 - dit_step1_vt 0.980053 - dit_step1_xt 0.999834 - dit_step2_vt 0.981541 - dit_step2_xt 0.999553 - dit_step3_vt 0.982418 - dit_step3_xt 0.998924 - dit_step4_vt 0.980811 - dit_step4_xt 0.997503 - dit_step5_vt 0.977877 - dit_step5_xt 0.994298 - dit_step6_vt 0.974930 - dit_step6_xt 0.988188 - dit_step7_vt 0.969375 - dit_x0 0.979213 - vae_audio 0.901377 - vae_audio (STFT cosine) 0.975525 
-[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999945 0.135628 0.006709 -0.002312 0.972932 -0.002342 0.972003 - dit_step1_xt 0.999834 0.266762 0.011267 -0.005306 0.942657 -0.005313 0.941730 - dit_step2_xt 0.999553 0.453190 0.017486 -0.009350 0.909152 -0.009311 0.908527 - dit_step3_xt 0.998924 0.643865 0.025962 -0.014715 0.873769 -0.014577 0.873624 - dit_step4_xt 0.997503 0.790038 0.037807 -0.021768 0.841938 -0.021660 0.841995 - dit_step5_xt 0.994298 1.239881 0.055598 -0.031834 0.825214 -0.032109 0.824593 - dit_step6_xt 0.988188 2.076383 0.082565 -0.046121 0.856115 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-BF16.gguf -[GGML] Running acestep-v15-sft-BF16.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999805 - lyric_embed 1.000000 - enc_hidden 0.999830 - detok_output 0.999996 - context 0.999998 - noise 1.000000 - temb_t 0.999997 - hidden_after_proj_in 0.999987 - enc_after_cond_emb 0.999820 - layer0_sa_output 0.999942 - hidden_after_layer0 0.999980 - hidden_after_layer6 0.999847 - hidden_after_layer12 0.999483 - hidden_after_layer18 0.998723 - hidden_after_layer23 0.998976 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998938 - dit_step0_vt_uncond 0.998662 - dit_step0_vt 0.995622 - dit_step0_xt 0.999998 - dit_step5_vt_cond 0.999492 - dit_step5_vt 0.993792 - dit_step5_xt 0.999962 - dit_step10_vt_cond 0.998783 - dit_step10_vt 0.993293 - dit_step10_xt 0.999885 - dit_step15_vt_cond 0.997654 - dit_step15_vt 0.987992 - dit_step15_xt 0.999675 - dit_step20_vt_cond 0.995364 - dit_step20_vt 0.980590 - dit_step20_xt 0.999177 
- dit_step25_vt_cond 0.990719 - dit_step25_vt 0.970351 - dit_step25_xt 0.998116 - dit_step30_vt_cond 0.985676 - dit_step30_vt 0.965303 - dit_step30_xt 0.996402 - dit_step35_vt_cond 0.981229 - dit_step35_vt 0.957586 - dit_step35_xt 0.994272 - dit_step40_vt_cond 0.978699 - dit_step40_vt 0.951774 - dit_step40_xt 0.992207 - dit_step45_vt_cond 0.981165 - dit_step45_vt 0.954789 - dit_step45_xt 0.990734 - dit_step49_vt_cond 0.983553 - dit_step49_vt 0.924041 - dit_x0 0.990243 - vae_audio 0.956370 - vae_audio (STFT cosine) 0.981929 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999998 0.038950 0.002063 -0.001725 0.980009 -0.001741 0.980402 - dit_step5_xt 0.999962 0.130437 0.005829 -0.006903 0.888898 -0.007143 0.887999 - dit_step10_xt 0.999885 0.226949 0.009019 -0.012332 0.810283 -0.012603 0.811299 - dit_step15_xt 0.999675 0.364782 0.013694 -0.017622 0.745056 -0.018114 0.745268 - dit_step20_xt 0.999177 0.445386 0.020236 -0.023046 0.699325 -0.023808 0.699582 - dit_step25_xt 0.998116 0.652368 0.029048 -0.028568 0.677830 -0.029311 0.679278 - dit_step30_xt 0.996402 1.067296 0.039895 -0.034151 0.683829 -0.035027 0.685262 - dit_step35_xt 0.994272 1.703333 0.052370 -0.039663 0.716078 -0.040716 0.717195 - dit_step40_xt 0.992207 2.069015 0.065941 -0.045141 0.769969 -0.046462 0.771853 - dit_step45_xt 0.990734 2.329453 0.078903 -0.051095 0.841302 -0.052475 0.843036 diff --git a/tests/CPU-BF16.log b/tests/CPU-BF16.log index b20ebae..74300ed 100644 --- a/tests/CPU-BF16.log +++ b/tests/CPU-BF16.log @@ -7,36 +7,34 @@ [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 3007.9 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 464.0 ms +[Load] DiT weight load: 301.5 ms [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE 
backend: CPU (CPU threads: 16) +[Load] VAE backend: CPU (shared) [VAE] Backend: CPU, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 651.3 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 666.9 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 31.9 ms +[Load] BPE tokenizer: 30.9 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CPU (CPU threads: 16) +[Load] TextEncoder backend: CPU (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 226.8 ms -[Encode] TextEncoder (70 tokens): 59.7 ms +[Load] TextEncoder: 121.5 ms +[Encode] TextEncoder (70 tokens): 58.0 ms [Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.7 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CPU (CPU threads: 16) +[Load] CondEncoder backend: CPU (shared) [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -46,18 +44,18 @@ [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 1160.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 230.8 ms 
+[Load] ConditionEncoder: 111.5 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 274.9 ms, enc_S=238 +[Encode] ConditionEncoder: 268.3 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.758296 -0.049593 -0.132844 0.058496 [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [WeightCtx] Loaded 30 tensors, 200.3 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 34.6 ms +[Load] Detokenizer: 23.6 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 958.8 ms +[Context] Detokenizer: 889.4 ms [Debug] detok_output: [2170, 64] first4: -0.124160 1.435260 0.310138 -0.624584 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -112,35 +110,32 @@ [Debug] dit_step7_vt: [2170, 64] first4: 0.002176 0.183052 -1.467304 3.113325 [Debug] dit_x0: [2170, 64] first4: 0.083178 1.441022 0.423316 -1.927701 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 18721.5 ms (18721.5 ms/sample) +[DiT] Total generation: 17583.4 ms (17583.4 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.083178 1.441022 0.423316 -1.927701 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 51818.0 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000519 0.001024 0.000897 0.001200 +[VAE Batch0] Decode: 46859.3 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000480 0.000983 0.000816 0.001189 [VAE Batch0] Wrote 
ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:57:38.585 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:57:38.585 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:57:38.585 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:57:38.586 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:57:38.586 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:57:39.413 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:57:40.961 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:57:40.961 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:57:40.966 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:57:41.132 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:49:02.827 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. 
+2026-03-04 21:49:02.916 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:49:04.251 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:49:04.252 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:49:04.253 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:49:04.259 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:49:04.454 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -154,8 +149,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:49:04.456 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -182,25 +177,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:57:41.134 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:49:04.456 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:57:41.140 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:57:41.153 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:57:41.153 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:57:41.175 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:57:41.483 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:57:41.483 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:57:41.483 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00688624382019043, 'diffusion_time_cost': 0.30014586448669434, 'diffusion_per_step_time_cost': 0.03751823306083679, 'total_time_cost': 0.30703210830688477, 'offload_time_cost': 0.0} -2026-03-01 19:57:41.498 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:57:41.500 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:57:41.500 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.44 GB -2026-03-01 19:57:41.500 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:57:41.500 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.44 GB -2026-03-01 19:57:41.500 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.44 GB -2026-03-01 19:57:41.500 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:57:41.775 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:57:41.777 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:57:41.780 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:49:04.463 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:49:04.478 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:49:04.478 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:49:04.514 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:49:04.845 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:49:04.846 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:49:04.846 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.007018327713012695, 'diffusion_time_cost': 0.32423973083496094, 'diffusion_per_step_time_cost': 0.04052996635437012, 'total_time_cost': 0.33125805854797363, 'offload_time_cost': 0.0} +2026-03-04 21:49:04.860 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:49:04.862 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:49:04.862 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.77 GB +2026-03-04 21:49:04.862 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:49:04.862 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.77 GB +2026-03-04 21:49:04.862 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.77 GB +2026-03-04 21:49:04.862 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:49:05.138 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:49:05.140 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:49:05.142 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf [GGML] Running acestep-v15-turbo-BF16.gguf... 
@@ -244,8 +239,8 @@ Using precomputed LM hints dit_step6_xt 0.988142 dit_step7_vt 0.969102 dit_x0 0.979106 - vae_audio 0.901370 - vae_audio (STFT cosine) 0.975816 + vae_audio 0.901389 + vae_audio (STFT cosine) 0.975826 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999946 0.136541 0.006626 -0.002312 0.972951 -0.002342 0.972003 diff --git a/tests/CPU-Q4_K_M.log b/tests/CPU-Q4_K_M.log index 508a20c..540c4c4 100644 --- a/tests/CPU-Q4_K_M.log +++ b/tests/CPU-Q4_K_M.log @@ -1,5 +1,5 @@ [Load] DiT backend: CPU (CPU threads: 16) -[Load] Backend init: 6.3 ms +[Load] Backend init: 1.6 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K fused, V separate [DiT] Cross-attn: all separate @@ -7,36 +7,34 @@ [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 895.6 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 118.4 ms +[Load] DiT weight load: 121.8 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CPU (CPU threads: 16) +[Load] VAE backend: CPU (shared) [VAE] Backend: CPU, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 696.2 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 699.6 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 33.0 ms +[Load] BPE tokenizer: 33.3 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] 
TextEncoder backend: CPU (CPU threads: 16) +[Load] TextEncoder backend: CPU (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 148.2 ms -[Encode] TextEncoder (70 tokens): 58.0 ms +[Load] TextEncoder: 122.9 ms +[Encode] TextEncoder (70 tokens): 60.4 ms [Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.6 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CPU (CPU threads: 16) +[Load] CondEncoder backend: CPU (shared) [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -46,18 +44,18 @@ [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 352.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 37.5 ms +[Load] ConditionEncoder: 34.8 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 294.2 ms, enc_S=238 +[Encode] ConditionEncoder: 300.3 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.759313 -0.049345 -0.129442 0.055759 [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 64.7 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 10.1 ms +[Load] Detokenizer: 9.1 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 354.8 ms +[Context] Detokenizer: 361.0 ms [Debug] 
detok_output: [2170, 64] first4: -0.106265 1.448869 0.309591 -0.650098 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -112,35 +110,32 @@ [Debug] dit_step7_vt: [2170, 64] first4: -0.463452 0.896626 -1.673395 3.222673 [Debug] dit_x0: [2170, 64] first4: 0.290887 1.122067 0.588729 -1.917174 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 21769.5 ms (21769.5 ms/sample) +[DiT] Total generation: 21823.6 ms (21823.6 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.290887 1.122067 0.588729 -1.917174 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 52184.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000272 0.000786 0.000556 0.000990 +[VAE Batch0] Decode: 47904.5 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000330 0.000828 0.000665 0.001038 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 20:03:15.903 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 20:03:15.903 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 20:03:15.903 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 20:03:15.903 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. 
-2026-03-01 20:03:15.904 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 20:03:16.714 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 20:03:18.309 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 20:03:18.309 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 20:03:18.315 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 20:03:18.480 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:54:26.607 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:54:26.698 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:54:28.050 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:54:28.050 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... 
+2026-03-04 21:54:28.054 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:54:28.059 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:54:28.263 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -154,8 +149,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 20:03:18.482 | 
INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -182,25 +177,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 20:03:18.482 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:54:28.265 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 20:03:18.488 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 20:03:18.501 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 20:03:18.501 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 20:03:18.540 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... 
-2026-03-01 20:03:18.854 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 20:03:18.855 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 20:03:18.855 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006970643997192383, 'diffusion_time_cost': 0.3072662353515625, 'diffusion_per_step_time_cost': 0.03840827941894531, 'total_time_cost': 0.3142368793487549, 'offload_time_cost': 0.0} -2026-03-01 20:03:18.869 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 20:03:18.872 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 20:03:18.872 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 20:03:18.872 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 20:03:18.872 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 20:03:18.872 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 20:03:18.872 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 20:03:19.148 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 20:03:19.151 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 20:03:19.154 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:54:28.272 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:54:28.288 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:54:28.288 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:54:28.323 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:54:28.640 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:54:28.641 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:54:28.641 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.0070536136627197266, 'diffusion_time_cost': 0.30983686447143555, 'diffusion_per_step_time_cost': 0.03872960805892944, 'total_time_cost': 0.3168904781341553, 'offload_time_cost': 0.0} +2026-03-04 21:54:28.655 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:54:28.666 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:54:28.666 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:54:28.666 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:54:28.666 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:54:28.666 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:54:28.666 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:54:28.949 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:54:28.951 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:54:28.952 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf [GGML] Running acestep-v15-turbo-Q4_K_M.gguf... 
@@ -244,8 +239,8 @@ Using precomputed LM hints dit_step6_xt 0.977196 dit_step7_vt 0.939970 dit_x0 0.959881 - vae_audio 0.834993 - vae_audio (STFT cosine) 0.955098 + vae_audio 0.834992 + vae_audio (STFT cosine) 0.955102 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999883 0.167680 0.010319 -0.002256 0.973185 -0.002342 0.972003 diff --git a/tests/CPU-Q5_K_M.log b/tests/CPU-Q5_K_M.log index e0d9936..6722100 100644 --- a/tests/CPU-Q5_K_M.log +++ b/tests/CPU-Q5_K_M.log @@ -7,36 +7,34 @@ [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1061.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 140.3 ms +[Load] DiT weight load: 110.6 ms [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CPU (CPU threads: 16) +[Load] VAE backend: CPU (shared) [VAE] Backend: CPU, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 699.1 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 698.0 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 33.4 ms +[Load] BPE tokenizer: 33.1 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CPU (CPU threads: 16) +[Load] TextEncoder backend: CPU (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused 
[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 149.7 ms -[Encode] TextEncoder (70 tokens): 57.3 ms +[Load] TextEncoder: 123.1 ms +[Encode] TextEncoder (70 tokens): 57.9 ms [Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.5 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CPU (CPU threads: 16) +[Load] CondEncoder backend: CPU (shared) [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -46,18 +44,18 @@ [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 412.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 45.1 ms +[Load] ConditionEncoder: 41.0 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 387.5 ms, enc_S=238 +[Encode] ConditionEncoder: 388.3 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.760901 -0.053445 -0.132760 0.058505 [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 73.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 11.3 ms +[Load] Detokenizer: 10.3 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 447.0 ms +[Context] Detokenizer: 446.1 ms [Debug] detok_output: [2170, 64] first4: -0.129311 1.458194 0.298132 -0.651512 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -112,35 +110,32 @@ [Debug] dit_step7_vt: [2170, 64] first4: -0.003599 
0.325174 -1.377289 3.053612 [Debug] dit_x0: [2170, 64] first4: 0.058232 1.415164 0.443289 -1.901864 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 27970.1 ms (27970.1 ms/sample) +[DiT] Total generation: 28035.0 ms (28035.0 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.058232 1.415164 0.443289 -1.901864 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 51966.1 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000740 0.001305 0.001083 0.001434 +[VAE Batch0] Decode: 47798.0 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000762 0.001320 0.001139 0.001557 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 20:01:55.226 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 20:01:55.226 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 20:01:55.226 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 20:01:55.226 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 20:01:55.226 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. 
-2026-03-01 20:01:56.032 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 20:01:57.576 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 20:01:57.577 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 20:01:57.581 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 20:01:57.747 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:53:09.193 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:53:09.323 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:53:10.674 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:53:10.674 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:53:10.676 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). 
+2026-03-04 21:53:10.682 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:53:10.881 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -154,8 +149,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 20:01:57.749 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -182,25 +177,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 20:01:57.749 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:53:10.884 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 20:01:57.755 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 20:01:57.768 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 20:01:57.768 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 20:01:57.801 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 20:01:58.109 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 20:01:58.109 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 20:01:58.109 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.007002353668212891, 'diffusion_time_cost': 0.30033254623413086, 'diffusion_per_step_time_cost': 0.03754156827926636, 'total_time_cost': 0.30733489990234375, 'offload_time_cost': 0.0} -2026-03-01 20:01:58.124 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 20:01:58.126 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 20:01:58.126 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 20:01:58.126 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 20:01:58.126 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 20:01:58.126 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 20:01:58.126 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 20:01:58.401 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 20:01:58.403 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 20:01:58.406 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:53:10.890 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:53:10.930 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:53:10.930 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:53:10.966 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:53:11.283 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:53:11.284 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:53:11.284 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006951332092285156, 'diffusion_time_cost': 0.3100306987762451, 'diffusion_per_step_time_cost': 0.03875383734703064, 'total_time_cost': 0.3169820308685303, 'offload_time_cost': 0.0} +2026-03-04 21:53:11.298 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:53:11.300 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:53:11.300 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.77 GB +2026-03-04 21:53:11.300 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:53:11.300 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.77 GB +2026-03-04 21:53:11.300 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.77 GB +2026-03-04 21:53:11.300 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:53:11.575 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:53:11.577 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:53:11.579 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf [GGML] Running acestep-v15-turbo-Q5_K_M.gguf... 
@@ -244,8 +239,8 @@ Using precomputed LM hints dit_step6_xt 0.983513 dit_step7_vt 0.954349 dit_x0 0.970379 - vae_audio 0.874800 - vae_audio (STFT cosine) 0.967703 + vae_audio 0.874850 + vae_audio (STFT cosine) 0.967714 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999926 0.135378 0.008030 -0.002303 0.973012 -0.002342 0.972003 diff --git a/tests/CPU-Q6_K.log b/tests/CPU-Q6_K.log index 7d4c411..75b4fd2 100644 --- a/tests/CPU-Q6_K.log +++ b/tests/CPU-Q6_K.log @@ -7,36 +7,34 @@ [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1237.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 169.4 ms +[Load] DiT weight load: 150.0 ms [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CPU (CPU threads: 16) +[Load] VAE backend: CPU (shared) [VAE] Backend: CPU, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 699.2 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 689.7 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.5 ms +[Load] BPE tokenizer: 33.1 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CPU (CPU threads: 16) +[Load] TextEncoder backend: CPU (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] 
Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 148.3 ms -[Encode] TextEncoder (70 tokens): 57.5 ms +[Load] TextEncoder: 124.2 ms +[Encode] TextEncoder (70 tokens): 58.0 ms [Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.6 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CPU (CPU threads: 16) +[Load] CondEncoder backend: CPU (shared) [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -46,18 +44,18 @@ [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 476.3 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 52.6 ms +[Load] ConditionEncoder: 47.5 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 348.9 ms, enc_S=238 +[Encode] ConditionEncoder: 349.5 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.761694 -0.052035 -0.131773 0.058231 [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 82.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 12.3 ms +[Load] Detokenizer: 11.0 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 414.3 ms +[Context] Detokenizer: 417.1 ms [Debug] detok_output: [2170, 64] first4: -0.151355 1.462444 0.326907 -0.627213 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -112,35 +110,32 @@ [Debug] dit_step7_vt: [2170, 64] first4: 0.118016 0.207620 -1.266971 
2.955565 [Debug] dit_x0: [2170, 64] first4: 0.004752 1.435176 0.398691 -1.887822 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 25398.3 ms (25398.3 ms/sample) +[DiT] Total generation: 25477.6 ms (25477.6 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.004752 1.435176 0.398691 -1.887822 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 52074.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000467 0.001015 0.000873 0.001303 +[VAE Batch0] Decode: 47852.2 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000553 0.001102 0.000938 0.001323 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 20:00:28.298 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 20:00:28.298 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 20:00:28.298 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 20:00:28.298 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 20:00:28.298 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 20:00:29.103 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! 
Use `dtype` instead! -2026-03-01 20:00:30.690 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 20:00:30.690 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 20:00:30.695 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 20:00:30.860 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:51:45.520 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:51:45.634 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:51:46.994 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:51:46.994 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:51:46.995 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:51:47.001 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... 
+2026-03-04 21:51:47.198 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -154,8 +149,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:51:47.201 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -182,25 +177,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 20:00:30.862 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:51:47.201 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 20:00:30.869 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 20:00:30.881 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 20:00:30.882 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 20:00:30.914 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 20:00:31.231 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 20:00:31.232 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 20:00:31.232 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006938934326171875, 'diffusion_time_cost': 0.31071925163269043, 'diffusion_per_step_time_cost': 0.038839906454086304, 'total_time_cost': 0.3176581859588623, 'offload_time_cost': 0.0} -2026-03-01 20:00:31.246 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 20:00:31.249 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 20:00:31.249 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.40 GB -2026-03-01 20:00:31.249 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 20:00:31.249 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.40 GB -2026-03-01 20:00:31.249 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.40 GB -2026-03-01 20:00:31.249 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 20:00:31.524 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 20:00:31.527 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 20:00:31.531 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:51:47.208 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:51:47.224 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:51:47.224 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:51:47.259 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:51:47.579 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:51:47.579 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:51:47.579 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.007021188735961914, 'diffusion_time_cost': 0.31169986724853516, 'diffusion_per_step_time_cost': 0.038962483406066895, 'total_time_cost': 0.31872105598449707, 'offload_time_cost': 0.0} +2026-03-04 21:51:47.593 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:51:47.595 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:51:47.595 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.77 GB +2026-03-04 21:51:47.596 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:51:47.596 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.77 GB +2026-03-04 21:51:47.596 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.77 GB +2026-03-04 21:51:47.596 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:51:47.870 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:51:47.872 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:51:47.874 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf [GGML] Running acestep-v15-turbo-Q6_K.gguf... 
@@ -244,8 +239,8 @@ Using precomputed LM hints dit_step6_xt 0.984569 dit_step7_vt 0.958147 dit_x0 0.972312 - vae_audio 0.891761 - vae_audio (STFT cosine) 0.969080 + vae_audio 0.891790 + vae_audio (STFT cosine) 0.969088 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999936 0.151952 0.007283 -0.002271 0.972870 -0.002342 0.972003 diff --git a/tests/CPU-Q8_0.log b/tests/CPU-Q8_0.log index 76183ea..3eb253c 100644 --- a/tests/CPU-Q8_0.log +++ b/tests/CPU-Q8_0.log @@ -7,36 +7,34 @@ [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1600.7 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 188.0 ms +[Load] DiT weight load: 178.6 ms [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CPU (CPU threads: 16) +[Load] VAE backend: CPU (shared) [VAE] Backend: CPU, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 690.8 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 692.2 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.8 ms +[Load] BPE tokenizer: 32.4 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CPU (CPU threads: 16) +[Load] TextEncoder backend: CPU (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] 
Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 160.0 ms -[Encode] TextEncoder (70 tokens): 57.9 ms +[Load] TextEncoder: 123.5 ms +[Encode] TextEncoder (70 tokens): 58.2 ms [Debug] text_hidden: [70, 1024] first4: 3.704526 2.436253 0.222853 -13.131872 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 13.0 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CPU (CPU threads: 16) +[Load] CondEncoder backend: CPU (shared) [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -46,18 +44,18 @@ [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 616.6 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 126.4 ms +[Load] ConditionEncoder: 65.1 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 390.3 ms, enc_S=238 +[Encode] ConditionEncoder: 373.4 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.758873 -0.049568 -0.132802 0.057792 [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 106.5 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 13.6 ms +[Load] Detokenizer: 14.3 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 447.8 ms +[Context] Detokenizer: 448.5 ms [Debug] detok_output: [2170, 64] first4: -0.126218 1.441045 0.305219 -0.629688 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -112,35 +110,32 @@ [Debug] dit_step7_vt: [2170, 64] first4: -0.037024 0.233524 -1.487499 
3.098410 [Debug] dit_x0: [2170, 64] first4: 0.094459 1.422387 0.433039 -1.914712 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 26043.3 ms (26043.3 ms/sample) +[DiT] Total generation: 26009.5 ms (26009.5 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.094459 1.422387 0.433039 -1.914712 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 52114.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000455 0.000930 0.000816 0.001121 +[VAE Batch0] Decode: 47762.1 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000441 0.000946 0.000788 0.001168 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:59:03.882 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:59:03.882 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:59:03.882 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:59:03.883 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:59:03.883 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:59:04.691 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! 
Use `dtype` instead! -2026-03-01 19:59:06.262 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:59:06.262 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:59:06.268 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:59:06.433 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:50:24.424 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:50:24.514 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:50:25.858 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:50:25.858 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:50:25.860 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:50:25.865 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... 
+2026-03-04 21:50:26.063 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -154,8 +149,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:50:26.065 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -182,25 +177,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:59:06.436 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:50:26.065 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:59:06.443 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:59:06.457 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:59:06.457 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:59:06.478 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:59:06.802 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:59:06.803 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:59:06.803 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006929874420166016, 'diffusion_time_cost': 0.3164329528808594, 'diffusion_per_step_time_cost': 0.03955411911010742, 'total_time_cost': 0.3233628273010254, 'offload_time_cost': 0.0} -2026-03-01 19:59:06.817 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:59:06.819 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:59:06.819 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:59:06.819 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:59:06.819 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:59:06.819 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:59:06.819 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:59:07.095 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:59:07.098 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:59:07.101 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:50:26.073 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:50:26.088 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:50:26.088 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:50:26.120 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:50:26.438 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:50:26.438 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:50:26.438 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.007014036178588867, 'diffusion_time_cost': 0.30962181091308594, 'diffusion_per_step_time_cost': 0.03870272636413574, 'total_time_cost': 0.3166358470916748, 'offload_time_cost': 0.0} +2026-03-04 21:50:26.452 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:50:26.455 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:50:26.455 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.86 GB +2026-03-04 21:50:26.455 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:50:26.455 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.86 GB +2026-03-04 21:50:26.455 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.86 GB +2026-03-04 21:50:26.455 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:50:26.730 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:50:26.732 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:50:26.734 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf [GGML] Running acestep-v15-turbo-Q8_0.gguf... @@ -244,8 +239,8 @@ Using precomputed LM hints dit_step6_xt 0.988647 dit_step7_vt 0.970238 dit_x0 0.980014 - vae_audio 0.903408 - vae_audio (STFT cosine) 0.976427 + vae_audio 0.903437 + vae_audio (STFT cosine) 0.976438 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999946 0.139652 0.006645 -0.002330 0.972930 -0.002342 0.972003 diff --git a/tests/CPU_BF16.log b/tests/CPU_BF16.log deleted file mode 100644 index fcae074..0000000 --- a/tests/CPU_BF16.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf -[GGML] Running acestep-v15-turbo-BF16.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... 
-[Python] Generating (acestep-v15-turbo, 8 steps)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999816 - lyric_embed 1.000000 - enc_hidden 0.999841 - detok_output 0.999995 - context 0.999997 - noise 1.000000 - temb_t 0.999999 - hidden_after_proj_in 0.999988 - enc_after_cond_emb 0.999832 - layer0_sa_output 0.999960 - hidden_after_layer0 0.999982 - hidden_after_layer6 0.999924 - hidden_after_layer12 0.999332 - hidden_after_layer18 0.996692 - hidden_after_layer23 0.993786 - dit_step0_vt 0.975712 - dit_step0_xt 0.999946 - dit_step1_vt 0.979525 - dit_step1_xt 0.999833 - dit_step2_vt 0.981808 - dit_step2_xt 0.999552 - dit_step3_vt 0.982382 - dit_step3_xt 0.998917 - dit_step4_vt 0.980777 - dit_step4_xt 0.997480 - dit_step5_vt 0.978078 - dit_step5_xt 0.994264 - dit_step6_vt 0.974849 - dit_step6_xt 0.988142 - dit_step7_vt 0.969102 - dit_x0 0.979106 - vae_audio 0.901370 - vae_audio (STFT cosine) 0.975816 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999946 0.136541 0.006626 -0.002312 0.972951 -0.002342 0.972003 - dit_step1_xt 0.999833 0.265486 0.011288 -0.005309 0.942692 -0.005313 0.941730 - dit_step2_xt 0.999552 0.451896 0.017477 -0.009347 0.909217 -0.009311 0.908527 - dit_step3_xt 0.998917 0.642624 0.025957 -0.014710 0.873863 -0.014577 0.873624 - dit_step4_xt 0.997480 0.778374 0.037868 -0.021751 0.842047 -0.021660 0.841995 - dit_step5_xt 0.994264 1.244624 0.055630 -0.031814 0.825360 -0.032109 0.824593 - dit_step6_xt 0.988142 2.080976 0.082605 -0.046091 0.856212 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-BF16.gguf -[GGML] Running acestep-v15-sft-BF16.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... 
-[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999816 - lyric_embed 1.000000 - enc_hidden 0.999841 - detok_output 0.999995 - context 0.999997 - noise 1.000000 - temb_t 0.999998 - hidden_after_proj_in 0.999988 - enc_after_cond_emb 0.999834 - layer0_sa_output 0.999959 - hidden_after_layer0 0.999984 - hidden_after_layer6 0.999851 - hidden_after_layer12 0.999471 - hidden_after_layer18 0.998749 - hidden_after_layer23 0.998994 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998963 - dit_step0_vt_uncond 0.998717 - dit_step0_vt 0.995766 - dit_step0_xt 0.999998 - dit_step5_vt_cond 0.999507 - dit_step5_vt 0.993884 - dit_step5_xt 0.999963 - dit_step10_vt_cond 0.998797 - dit_step10_vt 0.993423 - dit_step10_xt 0.999887 - dit_step15_vt_cond 0.997670 - dit_step15_vt 0.988372 - dit_step15_xt 0.999682 - dit_step20_vt_cond 0.995498 - dit_step20_vt 0.982137 - dit_step20_xt 0.999190 - dit_step25_vt_cond 0.991181 - dit_step25_vt 0.972161 - dit_step25_xt 0.998167 - dit_step30_vt_cond 0.986183 - dit_step30_vt 0.967394 - dit_step30_xt 0.996519 - dit_step35_vt_cond 0.981815 - dit_step35_vt 0.959696 - dit_step35_xt 0.994436 - dit_step40_vt_cond 0.979298 - dit_step40_vt 0.954151 - dit_step40_xt 0.992400 - dit_step45_vt_cond 0.981642 - dit_step45_vt 0.955459 - dit_step45_xt 0.990953 - dit_step49_vt_cond 0.982680 - dit_step49_vt 0.941788 - dit_x0 0.990427 - vae_audio 0.960778 - vae_audio (STFT cosine) 0.984703 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999998 0.038465 0.002037 -0.001739 0.980023 -0.001741 0.980402 - dit_step5_xt 0.999963 0.130767 0.005794 -0.006951 0.888986 -0.007143 0.887999 - dit_step10_xt 0.999887 0.230145 0.008907 -0.012421 
0.810420 -0.012603 0.811299 - dit_step15_xt 0.999682 0.369882 0.013468 -0.017757 0.745283 -0.018114 0.745268 - dit_step20_xt 0.999190 0.439784 0.019899 -0.023189 0.699688 -0.023808 0.699582 - dit_step25_xt 0.998167 0.657918 0.028642 -0.028736 0.678283 -0.029311 0.679278 - dit_step30_xt 0.996519 1.070616 0.039415 -0.034342 0.684394 -0.035027 0.685262 - dit_step35_xt 0.994436 1.684599 0.051968 -0.039891 0.716568 -0.040716 0.717195 - dit_step40_xt 0.992400 2.115248 0.065570 -0.045402 0.770424 -0.046462 0.771853 - dit_step45_xt 0.990953 2.369087 0.078496 -0.051406 0.841668 -0.052475 0.843036 diff --git a/tests/CPU_Q4_K_M.log b/tests/CPU_Q4_K_M.log deleted file mode 100644 index 44fd5b2..0000000 --- a/tests/CPU_Q4_K_M.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf -[GGML] Running acestep-v15-turbo-Q4_K_M.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999816 - lyric_embed 1.000000 - enc_hidden 0.997095 - detok_output 0.999577 - context 0.999730 - noise 1.000000 - temb_t 0.999896 - hidden_after_proj_in 0.999903 - enc_after_cond_emb 0.997571 - layer0_sa_output 0.998370 - hidden_after_layer0 0.999619 - hidden_after_layer6 0.999177 - hidden_after_layer12 0.995111 - hidden_after_layer18 0.991459 - hidden_after_layer23 0.985217 - dit_step0_vt 0.946613 - dit_step0_xt 0.999883 - dit_step1_vt 0.947613 - dit_step1_xt 0.999611 - dit_step2_vt 0.958491 - dit_step2_xt 0.999010 - dit_step3_vt 0.962965 - dit_step3_xt 0.997773 - dit_step4_vt 0.960997 - dit_step4_xt 0.994989 - dit_step5_vt 0.957636 - dit_step5_xt 0.988832 - dit_step6_vt 0.952016 - dit_step6_xt 0.977196 - dit_step7_vt 0.939970 - dit_x0 0.959881 - vae_audio 0.834993 - vae_audio (STFT cosine) 0.955098 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999883 0.167680 0.010319 -0.002256 0.973185 -0.002342 0.972003 - dit_step1_xt 0.999611 0.268237 0.018204 -0.005104 0.943179 -0.005313 0.941730 - dit_step2_xt 0.999010 0.434671 0.027774 -0.009029 0.910147 -0.009311 0.908527 - dit_step3_xt 0.997773 0.601206 0.039926 -0.014325 0.875171 -0.014577 0.873624 - dit_step4_xt 0.994989 0.892883 0.057385 -0.021274 0.843615 -0.021660 0.841995 - dit_step5_xt 0.988832 1.381146 0.083605 -0.031218 0.827061 -0.032109 0.824593 - dit_step6_xt 0.977196 2.021005 0.123750 -0.045473 0.858175 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q4_K_M.gguf -[GGML] Running acestep-v15-sft-Q4_K_M.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999816 - lyric_embed 1.000000 - enc_hidden 0.997095 - detok_output 0.999577 - context 0.999730 - noise 1.000000 - temb_t 0.999645 - hidden_after_proj_in 0.999904 - enc_after_cond_emb 0.997560 - layer0_sa_output 0.998513 - hidden_after_layer0 0.999624 - hidden_after_layer6 0.999091 - hidden_after_layer12 0.997675 - hidden_after_layer18 0.996682 - hidden_after_layer23 0.996897 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.996806 - dit_step0_vt_uncond 0.996163 - dit_step0_vt 0.990085 - dit_step0_xt 0.999995 - dit_step5_vt_cond 0.995410 - dit_step5_vt 0.978964 - dit_step5_xt 0.999822 - dit_step10_vt_cond 0.991521 - dit_step10_vt 0.970202 - dit_step10_xt 0.999221 - dit_step15_vt_cond 0.981975 - dit_step15_vt 0.945173 - dit_step15_xt 0.997485 - dit_step20_vt_cond 0.967221 - dit_step20_vt 0.918272 - dit_step20_xt 0.993402 - dit_step25_vt_cond 0.950021 - dit_step25_vt 0.894843 - dit_step25_xt 0.986289 - dit_step30_vt_cond 0.929833 - dit_step30_vt 0.870341 - dit_step30_xt 0.976182 - dit_step35_vt_cond 0.909548 - dit_step35_vt 0.845635 - dit_step35_xt 0.964963 - dit_step40_vt_cond 0.897534 - dit_step40_vt 0.827777 - dit_step40_xt 0.954875 - dit_step45_vt_cond 0.908619 - dit_step45_vt 0.841100 - dit_step45_xt 0.948114 - dit_step49_vt_cond 0.927278 - dit_step49_vt 0.867932 - dit_x0 0.945906 - vae_audio 0.825297 - vae_audio (STFT cosine) 0.924406 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999995 0.035570 0.002883 -0.001844 0.980345 -0.001741 0.980402 - dit_step5_xt 0.999822 0.188835 0.013032 -0.007303 0.890510 -0.007143 0.887999 - dit_step10_xt 0.999221 0.527206 0.024125 -0.012987 0.812393 -0.012603 0.811299 - dit_step15_xt 0.997485 
0.839391 0.039117 -0.018648 0.747696 -0.018114 0.745268 - dit_step20_xt 0.993402 1.146206 0.058860 -0.024311 0.701939 -0.023808 0.699582 - dit_step25_xt 0.986289 1.528936 0.081899 -0.030231 0.679540 -0.029311 0.679278 - dit_step30_xt 0.976182 1.891257 0.108598 -0.036282 0.684111 -0.035027 0.685262 - dit_step35_xt 0.964963 2.208873 0.137902 -0.042366 0.714637 -0.040716 0.717195 - dit_step40_xt 0.954875 2.494038 0.168832 -0.048453 0.767102 -0.046462 0.771853 - dit_step45_xt 0.948114 2.800970 0.198350 -0.054785 0.837697 -0.052475 0.843036 diff --git a/tests/CPU_Q5_K_M.log b/tests/CPU_Q5_K_M.log deleted file mode 100644 index 4732362..0000000 --- a/tests/CPU_Q5_K_M.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf -[GGML] Running acestep-v15-turbo-Q5_K_M.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999816 - lyric_embed 1.000000 - enc_hidden 0.999099 - detok_output 0.999843 - context 0.999900 - noise 1.000000 - temb_t 0.999968 - hidden_after_proj_in 0.999954 - enc_after_cond_emb 0.999196 - layer0_sa_output 0.999388 - hidden_after_layer0 0.999773 - hidden_after_layer6 0.999687 - hidden_after_layer12 0.998560 - hidden_after_layer18 0.995178 - hidden_after_layer23 0.990907 - dit_step0_vt 0.966084 - dit_step0_xt 0.999926 - dit_step1_vt 0.972329 - dit_step1_xt 0.999780 - dit_step2_vt 0.971107 - dit_step2_xt 0.999383 - dit_step3_vt 0.973886 - dit_step3_xt 0.998543 - dit_step4_vt 0.971976 - dit_step4_xt 0.996642 - dit_step5_vt 0.967575 - dit_step5_xt 0.992211 - dit_step6_vt 0.962964 - dit_step6_xt 0.983513 - dit_step7_vt 0.954349 - dit_x0 0.970379 - vae_audio 0.874800 
- vae_audio (STFT cosine) 0.967703 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999926 0.135378 0.008030 -0.002303 0.973012 -0.002342 0.972003 - dit_step1_xt 0.999780 0.276712 0.013491 -0.005310 0.942849 -0.005313 0.941730 - dit_step2_xt 0.999383 0.460420 0.021261 -0.009337 0.909465 -0.009311 0.908527 - dit_step3_xt 0.998543 0.681684 0.031463 -0.014739 0.874175 -0.014577 0.873624 - dit_step4_xt 0.996642 0.853164 0.045737 -0.021967 0.842445 -0.021660 0.841995 - dit_step5_xt 0.992211 1.314129 0.067657 -0.032346 0.825989 -0.032109 0.824593 - dit_step6_xt 0.983513 2.191432 0.101363 -0.046949 0.857195 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q5_K_M.gguf -[GGML] Running acestep-v15-sft-Q5_K_M.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999816 - lyric_embed 1.000000 - enc_hidden 0.999099 - detok_output 0.999843 - context 0.999900 - noise 1.000000 - temb_t 0.999877 - hidden_after_proj_in 0.999954 - enc_after_cond_emb 0.999196 - layer0_sa_output 0.999446 - hidden_after_layer0 0.999823 - hidden_after_layer6 0.999554 - hidden_after_layer12 0.998967 - hidden_after_layer18 0.997974 - hidden_after_layer23 0.998436 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998372 - dit_step0_vt_uncond 0.998354 - dit_step0_vt 0.994379 - dit_step0_xt 0.999997 - dit_step5_vt_cond 0.998658 - dit_step5_vt 0.988358 - dit_step5_xt 0.999933 - dit_step10_vt_cond 0.997095 - dit_step10_vt 0.985993 - dit_step10_xt 0.999758 - dit_step15_vt_cond 0.993108 - dit_step15_vt 0.970538 - dit_step15_xt 0.999209 - dit_step20_vt_cond 0.985753 - 
dit_step20_vt 0.954524 - dit_step20_xt 0.997715 - dit_step25_vt_cond 0.976423 - dit_step25_vt 0.938088 - dit_step25_xt 0.994906 - dit_step30_vt_cond 0.965769 - dit_step30_vt 0.925268 - dit_step30_xt 0.990600 - dit_step35_vt_cond 0.955274 - dit_step35_vt 0.909442 - dit_step35_xt 0.985533 - dit_step40_vt_cond 0.949378 - dit_step40_vt 0.894016 - dit_step40_xt 0.980757 - dit_step45_vt_cond 0.956168 - dit_step45_vt 0.901535 - dit_step45_xt 0.977447 - dit_step49_vt_cond 0.966288 - dit_step49_vt 0.914297 - dit_x0 0.976302 - vae_audio 0.889659 - vae_audio (STFT cosine) 0.945409 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999997 0.037808 0.002296 -0.001776 0.980078 -0.001741 0.980402 - dit_step5_xt 0.999933 0.104447 0.007971 -0.006973 0.889460 -0.007143 0.887999 - dit_step10_xt 0.999758 0.210002 0.013370 -0.012530 0.810881 -0.012603 0.811299 - dit_step15_xt 0.999209 0.418503 0.021538 -0.017971 0.745622 -0.018114 0.745268 - dit_step20_xt 0.997715 0.623172 0.033317 -0.023695 0.699368 -0.023808 0.699582 - dit_step25_xt 0.994906 0.874752 0.047642 -0.029485 0.676770 -0.029311 0.679278 - dit_step30_xt 0.990600 1.161649 0.065018 -0.035311 0.680992 -0.035027 0.685262 - dit_step35_xt 0.985533 1.453686 0.084547 -0.041122 0.711332 -0.040716 0.717195 - dit_step40_xt 0.980757 1.810532 0.105436 -0.046941 0.764001 -0.046462 0.771853 - dit_step45_xt 0.977447 2.167346 0.125231 -0.053123 0.834843 -0.052475 0.843036 diff --git a/tests/CPU_Q6_K.log b/tests/CPU_Q6_K.log deleted file mode 100644 index 93d1e05..0000000 --- a/tests/CPU_Q6_K.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf -[GGML] Running acestep-v15-turbo-Q6_K.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999816 - lyric_embed 1.000000 - enc_hidden 0.999634 - detok_output 0.999927 - context 0.999954 - noise 1.000000 - temb_t 0.999986 - hidden_after_proj_in 0.999975 - enc_after_cond_emb 0.999619 - layer0_sa_output 0.999718 - hidden_after_layer0 0.999827 - hidden_after_layer6 0.999788 - hidden_after_layer12 0.998843 - hidden_after_layer18 0.995848 - hidden_after_layer23 0.992196 - dit_step0_vt 0.971124 - dit_step0_xt 0.999936 - dit_step1_vt 0.975111 - dit_step1_xt 0.999802 - dit_step2_vt 0.978218 - dit_step2_xt 0.999477 - dit_step3_vt 0.977576 - dit_step3_xt 0.998723 - dit_step4_vt 0.973938 - dit_step4_xt 0.996945 - dit_step5_vt 0.969356 - dit_step5_xt 0.992753 - dit_step6_vt 0.965671 - dit_step6_xt 0.984569 - dit_step7_vt 0.958147 - dit_x0 0.972312 - vae_audio 0.891761 - vae_audio (STFT cosine) 0.969080 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999936 0.151952 0.007283 -0.002271 0.972870 -0.002342 0.972003 - dit_step1_xt 0.999802 0.296519 0.012516 -0.005212 0.942575 -0.005313 0.941730 - dit_step2_xt 0.999477 0.478400 0.019283 -0.009184 0.908992 -0.009311 0.908527 - dit_step3_xt 0.998723 0.734609 0.028810 -0.014535 0.873457 -0.014577 0.873624 - dit_step4_xt 0.996945 1.045720 0.042804 -0.021712 0.841447 -0.021660 0.841995 - dit_step5_xt 0.992753 1.512605 0.064324 -0.032020 0.824620 -0.032109 0.824593 - dit_step6_xt 0.984569 2.166596 0.096699 -0.046604 0.855715 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q6_K.gguf -[GGML] Running acestep-v15-sft-Q6_K.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999816 - lyric_embed 1.000000 - enc_hidden 0.999634 - detok_output 0.999927 - context 0.999954 - noise 1.000000 - temb_t 0.999952 - hidden_after_proj_in 0.999974 - enc_after_cond_emb 0.999624 - layer0_sa_output 0.999731 - hidden_after_layer0 0.999858 - hidden_after_layer6 0.999745 - hidden_after_layer12 0.999282 - hidden_after_layer18 0.998391 - hidden_after_layer23 0.998703 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998624 - dit_step0_vt_uncond 0.998134 - dit_step0_vt 0.994531 - dit_step0_xt 0.999997 - dit_step5_vt_cond 0.999105 - dit_step5_vt 0.991049 - dit_step5_xt 0.999950 - dit_step10_vt_cond 0.997890 - dit_step10_vt 0.988681 - dit_step10_xt 0.999825 - dit_step15_vt_cond 0.995763 - dit_step15_vt 0.978576 - dit_step15_xt 0.999458 - dit_step20_vt_cond 0.991824 - dit_step20_vt 0.966730 - dit_step20_xt 0.998566 - dit_step25_vt_cond 0.986001 - dit_step25_vt 0.952775 - dit_step25_xt 0.996897 - dit_step30_vt_cond 0.979821 - dit_step30_vt 0.943526 - dit_step30_xt 0.994344 - dit_step35_vt_cond 0.973662 - dit_step35_vt 0.929345 - dit_step35_xt 0.991309 - dit_step40_vt_cond 0.969585 - dit_step40_vt 0.918968 - dit_step40_xt 0.988416 - dit_step45_vt_cond 0.972816 - dit_step45_vt 0.918164 - dit_step45_xt 0.986334 - dit_step49_vt_cond 0.976204 - dit_step49_vt 0.909094 - dit_x0 0.985561 - vae_audio 0.940827 - vae_audio (STFT cosine) 0.976287 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999997 0.037619 0.002240 -0.001750 0.980170 -0.001741 0.980402 - dit_step5_xt 0.999950 0.129572 0.006928 -0.006971 0.889777 -0.007143 0.887999 - dit_step10_xt 0.999825 0.192490 0.011325 -0.012410 0.811294 -0.012603 0.811299 - dit_step15_xt 0.999458 
0.319211 0.017944 -0.017698 0.745779 -0.018114 0.745268 - dit_step20_xt 0.998566 0.553748 0.026838 -0.023098 0.699443 -0.023808 0.699582 - dit_step25_xt 0.996897 0.760972 0.037747 -0.028532 0.677161 -0.029311 0.679278 - dit_step30_xt 0.994344 1.235259 0.050893 -0.033936 0.681526 -0.035027 0.685262 - dit_step35_xt 0.991309 1.863492 0.065806 -0.039291 0.711899 -0.040716 0.717195 - dit_step40_xt 0.988416 2.112072 0.082079 -0.044606 0.764056 -0.046462 0.771853 - dit_step45_xt 0.986334 2.338981 0.097741 -0.050358 0.834033 -0.052475 0.843036 diff --git a/tests/CPU_Q8_0.log b/tests/CPU_Q8_0.log deleted file mode 100644 index f4a9086..0000000 --- a/tests/CPU_Q8_0.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf -[GGML] Running acestep-v15-turbo-Q8_0.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999816 - lyric_embed 1.000000 - enc_hidden 0.999814 - detok_output 0.999983 - context 0.999990 - noise 1.000000 - temb_t 0.999997 - hidden_after_proj_in 0.999985 - enc_after_cond_emb 0.999791 - layer0_sa_output 0.999925 - hidden_after_layer0 0.999955 - hidden_after_layer6 0.999892 - hidden_after_layer12 0.999219 - hidden_after_layer18 0.996644 - hidden_after_layer23 0.993707 - dit_step0_vt 0.975605 - dit_step0_xt 0.999946 - dit_step1_vt 0.978928 - dit_step1_xt 0.999831 - dit_step2_vt 0.981129 - dit_step2_xt 0.999551 - dit_step3_vt 0.982813 - dit_step3_xt 0.998932 - dit_step4_vt 0.981292 - dit_step4_xt 0.997544 - dit_step5_vt 0.979091 - dit_step5_xt 0.994467 - dit_step6_vt 0.976152 - dit_step6_xt 0.988647 - dit_step7_vt 0.970238 - dit_x0 0.980014 - vae_audio 0.903408 - 
vae_audio (STFT cosine) 0.976427 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999946 0.139652 0.006645 -0.002330 0.972930 -0.002342 0.972003 - dit_step1_xt 0.999831 0.267117 0.011368 -0.005325 0.942659 -0.005313 0.941730 - dit_step2_xt 0.999551 0.452101 0.017578 -0.009369 0.909163 -0.009311 0.908527 - dit_step3_xt 0.998932 0.629880 0.025911 -0.014735 0.873792 -0.014577 0.873624 - dit_step4_xt 0.997544 0.759572 0.037583 -0.021796 0.841987 -0.021660 0.841995 - dit_step5_xt 0.994467 1.235701 0.054893 -0.031886 0.825306 -0.032109 0.824593 - dit_step6_xt 0.988647 2.096131 0.081207 -0.046181 0.856264 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q8_0.gguf -[GGML] Running acestep-v15-sft-Q8_0.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999816 - lyric_embed 1.000000 - enc_hidden 0.999814 - detok_output 0.999983 - context 0.999990 - noise 1.000000 - temb_t 0.999991 - hidden_after_proj_in 0.999986 - enc_after_cond_emb 0.999795 - layer0_sa_output 0.999912 - hidden_after_layer0 0.999958 - hidden_after_layer6 0.999824 - hidden_after_layer12 0.999445 - hidden_after_layer18 0.998719 - hidden_after_layer23 0.998974 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998922 - dit_step0_vt_uncond 0.998427 - dit_step0_vt 0.995455 - dit_step0_xt 0.999998 - dit_step5_vt_cond 0.999446 - dit_step5_vt 0.993188 - dit_step5_xt 0.999961 - dit_step10_vt_cond 0.998529 - dit_step10_vt 0.992281 - dit_step10_xt 0.999875 - dit_step15_vt_cond 0.996311 - dit_step15_vt 0.982856 - dit_step15_xt 0.999609 - dit_step20_vt_cond 0.992095 - dit_step20_vt 
0.974098 - dit_step20_xt 0.998863 - dit_step25_vt_cond 0.986516 - dit_step25_vt 0.962299 - dit_step25_xt 0.997338 - dit_step30_vt_cond 0.980702 - dit_step30_vt 0.955880 - dit_step30_xt 0.995005 - dit_step35_vt_cond 0.975404 - dit_step35_vt 0.945189 - dit_step35_xt 0.992202 - dit_step40_vt_cond 0.972588 - dit_step40_vt 0.935722 - dit_step40_xt 0.989533 - dit_step45_vt_cond 0.975984 - dit_step45_vt 0.937094 - dit_step45_xt 0.987666 - dit_step49_vt_cond 0.978734 - dit_step49_vt 0.917631 - dit_x0 0.986993 - vae_audio 0.937093 - vae_audio (STFT cosine) 0.971416 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999998 0.038134 0.002096 -0.001710 0.980019 -0.001741 0.980402 - dit_step5_xt 0.999961 0.137689 0.005996 -0.006894 0.889095 -0.007143 0.887999 - dit_step10_xt 0.999875 0.219306 0.009469 -0.012337 0.810457 -0.012603 0.811299 - dit_step15_xt 0.999609 0.356501 0.014905 -0.017570 0.745282 -0.018114 0.745268 - dit_step20_xt 0.998863 0.570726 0.023002 -0.022897 0.699575 -0.023808 0.699582 - dit_step25_xt 0.997338 0.870836 0.033418 -0.028306 0.678021 -0.029311 0.679278 - dit_step30_xt 0.995005 1.126647 0.045749 -0.033772 0.683965 -0.035027 0.685262 - dit_step35_xt 0.992202 1.561250 0.059823 -0.039172 0.715848 -0.040716 0.717195 - dit_step40_xt 0.989533 1.985042 0.074909 -0.044584 0.769539 -0.046462 0.771853 - dit_step45_xt 0.987666 2.384698 0.089346 -0.050474 0.840839 -0.052475 0.843036 diff --git a/tests/CUDA-BF16.log b/tests/CUDA-BF16.log index d73a934..5ed30ff 100644 --- a/tests/CUDA-BF16.log +++ b/tests/CUDA-BF16.log @@ -1,7 +1,7 @@ ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes [Load] DiT backend: CUDA0 (CPU threads: 16) -[Load] Backend init: 70.8 ms +[Load] Backend init: 32.6 ms [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 
+9,34 @@ ggml_cuda_init: found 1 CUDA devices: [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 3007.9 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 375.6 ms +[Load] DiT weight load: 310.9 ms [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CUDA0 (CPU threads: 16) +[Load] VAE backend: CUDA0 (shared) [VAE] Backend: CUDA0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 661.0 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 653.9 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.8 ms +[Load] BPE tokenizer: 30.9 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[Load] TextEncoder backend: CUDA0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 128.5 ms -[Encode] TextEncoder (70 tokens): 50.6 ms +[Load] TextEncoder: 102.3 ms +[Encode] TextEncoder (70 tokens): 50.4 ms [Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.5 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 
1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[Load] CondEncoder backend: CUDA0 (shared) [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_cuda_init: found 1 CUDA devices: [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 1160.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 127.1 ms +[Load] ConditionEncoder: 90.6 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 7.9 ms, enc_S=238 +[Encode] ConditionEncoder: 8.2 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.758648 -0.049409 -0.132412 0.058372 [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [WeightCtx] Loaded 30 tensors, 200.3 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 24.2 ms +[Load] Detokenizer: 17.6 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 141.9 ms +[Context] Detokenizer: 140.1 ms [Debug] detok_output: [2170, 64] first4: -0.124204 1.435425 0.309963 -0.624679 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -114,35 +112,32 @@ ggml_cuda_init: found 1 CUDA devices: [Debug] dit_step7_vt: [2170, 64] first4: -0.004009 0.190141 -1.466879 3.103273 [Debug] dit_x0: [2170, 64] first4: 0.085060 1.438241 0.424145 -1.920485 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 248.3 ms (248.3 ms/sample) +[DiT] Total generation: 243.9 ms (243.9 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.085060 1.438241 0.424145 -1.920485 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 
nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 812.8 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000547 0.000898 0.000798 0.001064 +[VAE Batch0] Decode: 615.3 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000498 0.000900 0.000800 0.001124 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:54:08.539 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:54:08.540 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:54:08.540 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:54:08.540 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:54:08.540 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:54:09.277 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:54:10.804 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:54:10.804 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... 
-2026-03-01 19:54:10.810 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:54:10.970 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:45:24.010 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:45:24.091 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:45:25.418 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:45:25.418 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:45:25.421 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:45:25.426 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:45:25.618 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:45:25.621 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:54:10.972 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:45:25.621 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:54:10.978 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:54:10.991 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:54:10.991 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:54:11.023 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:54:11.329 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:54:11.330 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:54:11.330 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.0068187713623046875, 'diffusion_time_cost': 0.2986173629760742, 'diffusion_per_step_time_cost': 0.03732717037200928, 'total_time_cost': 0.3054361343383789, 'offload_time_cost': 0.0} -2026-03-01 19:54:11.344 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:54:11.349 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:54:11.349 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:54:11.349 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:54:11.349 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:54:11.349 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:54:11.349 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:54:11.625 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:54:11.628 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:54:11.632 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:45:25.628 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:45:25.643 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:45:25.643 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:45:25.674 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:45:25.993 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:45:25.994 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:45:25.994 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006845712661743164, 'diffusion_time_cost': 0.3112342357635498, 'diffusion_per_step_time_cost': 0.038904279470443726, 'total_time_cost': 0.31807994842529297, 'offload_time_cost': 0.0} +2026-03-04 21:45:26.008 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:45:26.010 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:45:26.010 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:45:26.010 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:45:26.010 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:45:26.010 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:45:26.010 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:45:26.284 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:45:26.286 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:45:26.288 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf [GGML] Running acestep-v15-turbo-BF16.gguf... 
@@ -246,8 +241,8 @@ Using precomputed LM hints dit_step6_xt 0.988188 dit_step7_vt 0.969375 dit_x0 0.979213 - vae_audio 0.901377 - vae_audio (STFT cosine) 0.975525 + vae_audio 0.901411 + vae_audio (STFT cosine) 0.975533 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999945 0.135628 0.006709 -0.002312 0.972932 -0.002342 0.972003 diff --git a/tests/CUDA-Q4_K_M.log b/tests/CUDA-Q4_K_M.log index 189cb71..403d030 100644 --- a/tests/CUDA-Q4_K_M.log +++ b/tests/CUDA-Q4_K_M.log @@ -1,7 +1,7 @@ ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes [Load] DiT backend: CUDA0 (CPU threads: 16) -[Load] Backend init: 11.2 ms +[Load] Backend init: 9.6 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K fused, V separate [DiT] Cross-attn: all separate @@ -9,36 +9,34 @@ ggml_cuda_init: found 1 CUDA devices: [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 895.6 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 403.0 ms +[Load] DiT weight load: 141.8 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CUDA0 (CPU threads: 16) +[Load] VAE backend: CUDA0 (shared) [VAE] Backend: CUDA0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 655.9 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 652.4 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] 
T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 31.4 ms +[Load] BPE tokenizer: 32.6 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[Load] TextEncoder backend: CUDA0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 126.3 ms -[Encode] TextEncoder (70 tokens): 52.7 ms +[Load] TextEncoder: 103.0 ms +[Encode] TextEncoder (70 tokens): 50.9 ms [Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.1 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[Load] CondEncoder backend: CUDA0 (shared) [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -48,18 +46,18 @@ ggml_cuda_init: found 1 CUDA devices: [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 352.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 118.9 ms +[Load] ConditionEncoder: 29.8 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 12.7 ms, enc_S=238 +[Encode] ConditionEncoder: 13.3 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.759848 -0.046220 -0.129361 0.057668 [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 64.7 MB into backend [Load] Detokenizer: 
FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 22.1 ms +[Load] Detokenizer: 6.3 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 124.0 ms +[Context] Detokenizer: 124.2 ms [Debug] detok_output: [2170, 64] first4: -0.098446 1.438721 0.299255 -0.646500 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -114,35 +112,32 @@ ggml_cuda_init: found 1 CUDA devices: [Debug] dit_step7_vt: [2170, 64] first4: -0.488470 0.849564 -1.659694 3.185843 [Debug] dit_x0: [2170, 64] first4: 0.317955 1.165446 0.587176 -1.877443 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 249.1 ms (249.1 ms/sample) +[DiT] Total generation: 249.0 ms (249.0 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.317955 1.165446 0.587176 -1.877443 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 820.0 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000325 0.000812 0.000671 0.000911 +[VAE Batch0] Decode: 616.0 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000379 0.000847 0.000704 0.001000 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:54:39.264 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:54:39.265 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:54:39.265 | WARNING | acestep.training.data_module::25 - Lightning not installed. 
Training module will not be available. -2026-03-01 19:54:39.265 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:54:39.265 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:54:40.025 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:54:41.587 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:54:41.587 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:54:41.592 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:54:41.751 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:45:55.364 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:45:55.452 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:45:56.779 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... 
+2026-03-04 21:45:56.779 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:45:56.781 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:45:56.786 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:45:56.978 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:45:56.980 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:45:56.980 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:45:56.980 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:45:56.980 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An 
upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:45:56.981 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:45:56.981 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:54:41.753 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:45:56.981 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:54:41.759 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:54:41.771 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:54:41.772 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... 
(DiT backend: PyTorch (cuda)) -2026-03-01 19:54:41.805 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:54:42.113 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:54:42.114 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:54:42.114 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006765604019165039, 'diffusion_time_cost': 0.3010725975036621, 'diffusion_per_step_time_cost': 0.037634074687957764, 'total_time_cost': 0.30783820152282715, 'offload_time_cost': 0.0} -2026-03-01 19:54:42.128 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:54:42.131 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:54:42.131 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:54:42.131 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:54:42.131 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:54:42.131 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:54:42.131 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:54:42.405 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:54:42.408 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:54:42.411 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:45:56.987 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:45:57.002 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:45:57.002 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:45:57.032 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:45:57.348 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:45:57.349 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:45:57.349 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006890535354614258, 'diffusion_time_cost': 0.30885934829711914, 'diffusion_per_step_time_cost': 0.03860741853713989, 'total_time_cost': 0.3157498836517334, 'offload_time_cost': 0.0} +2026-03-04 21:45:57.363 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:45:57.366 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:45:57.366 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.78 GB +2026-03-04 21:45:57.366 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:45:57.366 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.78 GB +2026-03-04 21:45:57.366 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.78 GB +2026-03-04 21:45:57.366 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:45:57.640 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:45:57.642 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:45:57.644 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf [GGML] Running acestep-v15-turbo-Q4_K_M.gguf... 
@@ -246,8 +241,8 @@ Using precomputed LM hints dit_step6_xt 0.976494 dit_step7_vt 0.938658 dit_x0 0.958725 - vae_audio 0.837763 - vae_audio (STFT cosine) 0.954448 + vae_audio 0.837780 + vae_audio (STFT cosine) 0.954457 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999885 0.165835 0.010206 -0.002260 0.973133 -0.002342 0.972003 diff --git a/tests/CUDA-Q5_K_M.log b/tests/CUDA-Q5_K_M.log index 00b9652..4e72f4f 100644 --- a/tests/CUDA-Q5_K_M.log +++ b/tests/CUDA-Q5_K_M.log @@ -1,7 +1,7 @@ ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes [Load] DiT backend: CUDA0 (CPU threads: 16) -[Load] Backend init: 25.7 ms +[Load] Backend init: 9.6 ms [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K fused, V separate [DiT] Cross-attn: all separate @@ -9,36 +9,34 @@ ggml_cuda_init: found 1 CUDA devices: [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1061.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 465.4 ms +[Load] DiT weight load: 152.8 ms [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CUDA0 (CPU threads: 16) +[Load] VAE backend: CUDA0 (shared) [VAE] Backend: CUDA0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 656.4 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 655.0 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] 
T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges [Load] BPE tokenizer: 31.3 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[Load] TextEncoder backend: CUDA0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 127.3 ms -[Encode] TextEncoder (70 tokens): 49.5 ms +[Load] TextEncoder: 102.1 ms +[Encode] TextEncoder (70 tokens): 70.3 ms [Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.4 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[Load] CondEncoder backend: CUDA0 (shared) [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -48,18 +46,18 @@ ggml_cuda_init: found 1 CUDA devices: [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 412.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 138.7 ms +[Load] ConditionEncoder: 34.4 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 13.1 ms, enc_S=238 +[Encode] ConditionEncoder: 13.5 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.760389 -0.050879 -0.130835 0.059141 [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 73.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 
2048->64) -[Load] Detokenizer: 24.2 ms +[Load] Detokenizer: 6.8 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 121.7 ms +[Context] Detokenizer: 124.1 ms [Debug] detok_output: [2170, 64] first4: -0.125017 1.460327 0.292545 -0.654237 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -114,35 +112,32 @@ ggml_cuda_init: found 1 CUDA devices: [Debug] dit_step7_vt: [2170, 64] first4: 0.031181 0.378487 -1.509792 3.095486 [Debug] dit_x0: [2170, 64] first4: 0.032336 1.392616 0.498835 -1.905283 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 251.1 ms (251.1 ms/sample) +[DiT] Total generation: 261.4 ms (261.4 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.032336 1.392616 0.498835 -1.905283 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 804.2 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000692 0.001098 0.000938 0.001230 +[VAE Batch0] Decode: 614.5 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000681 0.001094 0.000878 0.001246 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:54:31.395 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:54:31.395 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:54:31.395 | WARNING | acestep.training.data_module::25 - Lightning not installed. 
Training module will not be available. -2026-03-01 19:54:31.395 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:54:31.395 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:54:32.168 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:54:33.881 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:54:33.882 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:54:33.887 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:54:34.060 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:45:47.565 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:45:47.662 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:45:48.979 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... 
+2026-03-04 21:45:48.979 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:45:48.981 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:45:48.987 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:45:49.182 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An 
upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:54:34.062 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:45:49.184 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:54:34.068 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:54:34.081 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:54:34.081 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... 
(DiT backend: PyTorch (cuda)) -2026-03-01 19:54:34.105 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:54:34.415 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:54:34.416 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:54:34.416 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006921052932739258, 'diffusion_time_cost': 0.3029003143310547, 'diffusion_per_step_time_cost': 0.037862539291381836, 'total_time_cost': 0.30982136726379395, 'offload_time_cost': 0.0} -2026-03-01 19:54:34.431 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:54:34.436 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:54:34.436 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:54:34.436 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:54:34.436 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:54:34.436 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:54:34.436 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:54:34.714 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:54:34.716 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:54:34.720 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:45:49.211 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:45:49.226 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:45:49.226 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:45:49.260 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:45:49.577 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:45:49.577 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:45:49.578 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00789022445678711, 'diffusion_time_cost': 0.30838513374328613, 'diffusion_per_step_time_cost': 0.03854814171791077, 'total_time_cost': 0.31627535820007324, 'offload_time_cost': 0.0} +2026-03-04 21:45:49.591 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:45:49.594 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:45:49.594 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:45:49.594 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:45:49.594 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:45:49.594 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:45:49.594 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:45:49.873 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:45:49.875 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:45:49.877 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf [GGML] Running acestep-v15-turbo-Q5_K_M.gguf... 
@@ -246,8 +241,8 @@ Using precomputed LM hints dit_step6_xt 0.983446 dit_step7_vt 0.953383 dit_x0 0.970119 - vae_audio 0.883226 - vae_audio (STFT cosine) 0.968463 + vae_audio 0.883245 + vae_audio (STFT cosine) 0.968470 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999930 0.139407 0.007818 -0.002306 0.973025 -0.002342 0.972003 diff --git a/tests/CUDA-Q6_K.log b/tests/CUDA-Q6_K.log index 10b9a7a..4950234 100644 --- a/tests/CUDA-Q6_K.log +++ b/tests/CUDA-Q6_K.log @@ -1,7 +1,7 @@ ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes [Load] DiT backend: CUDA0 (CPU threads: 16) -[Load] Backend init: 9.5 ms +[Load] Backend init: 10.2 ms [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_cuda_init: found 1 CUDA devices: [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1237.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 514.8 ms +[Load] DiT weight load: 176.0 ms [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CUDA0 (CPU threads: 16) +[Load] VAE backend: CUDA0 (shared) [VAE] Backend: CUDA0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 657.3 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 655.0 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] 
Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 30.7 ms +[Load] BPE tokenizer: 31.5 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[Load] TextEncoder backend: CUDA0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 125.7 ms -[Encode] TextEncoder (70 tokens): 49.2 ms +[Load] TextEncoder: 102.6 ms +[Encode] TextEncoder (70 tokens): 51.1 ms [Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.3 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[Load] CondEncoder backend: CUDA0 (shared) [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_cuda_init: found 1 CUDA devices: [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 476.3 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 145.8 ms +[Load] ConditionEncoder: 40.6 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 11.0 ms, enc_S=238 +[Encode] ConditionEncoder: 10.8 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.760759 -0.050104 -0.133269 0.058044 [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 82.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 
2048->64) -[Load] Detokenizer: 26.4 ms +[Load] Detokenizer: 7.8 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 123.5 ms +[Context] Detokenizer: 123.6 ms [Debug] detok_output: [2170, 64] first4: -0.140341 1.456987 0.310602 -0.632665 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -114,35 +112,32 @@ ggml_cuda_init: found 1 CUDA devices: [Debug] dit_step7_vt: [2170, 64] first4: 0.081321 0.135461 -1.397063 2.986206 [Debug] dit_x0: [2170, 64] first4: 0.028793 1.462229 0.417478 -1.887184 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 273.2 ms (273.2 ms/sample) +[DiT] Total generation: 270.6 ms (270.6 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.028793 1.462229 0.417478 -1.887184 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 804.3 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000481 0.000872 0.000838 0.001216 +[VAE Batch0] Decode: 616.4 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000531 0.001035 0.000900 0.001303 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:54:23.682 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:54:23.683 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:54:23.683 | WARNING | acestep.training.data_module::25 - Lightning not installed. 
Training module will not be available. -2026-03-01 19:54:23.683 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:54:23.683 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:54:24.419 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:54:25.992 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:54:25.992 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:54:25.998 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:54:26.157 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:45:39.727 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:45:39.815 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:45:41.135 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... 
+2026-03-04 21:45:41.135 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:45:41.137 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:45:41.142 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:45:41.335 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An 
upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:54:26.159 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:45:41.337 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:54:26.166 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:54:26.178 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:54:26.178 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... 
(DiT backend: PyTorch (cuda)) -2026-03-01 19:54:26.214 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:54:26.528 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:54:26.528 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:54:26.528 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00680994987487793, 'diffusion_time_cost': 0.30716919898986816, 'diffusion_per_step_time_cost': 0.03839614987373352, 'total_time_cost': 0.3139791488647461, 'offload_time_cost': 0.0} -2026-03-01 19:54:26.543 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:54:26.545 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:54:26.545 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.44 GB -2026-03-01 19:54:26.545 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:54:26.545 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.44 GB -2026-03-01 19:54:26.545 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.44 GB -2026-03-01 19:54:26.545 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:54:26.821 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:54:26.824 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:54:26.828 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:45:41.345 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:45:41.359 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:45:41.359 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:45:41.390 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:45:41.705 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:45:41.706 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:45:41.706 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006890773773193359, 'diffusion_time_cost': 0.30776047706604004, 'diffusion_per_step_time_cost': 0.038470059633255005, 'total_time_cost': 0.3146512508392334, 'offload_time_cost': 0.0} +2026-03-04 21:45:41.720 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:45:41.722 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:45:41.723 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:45:41.723 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:45:41.723 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:45:41.723 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:45:41.723 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:45:41.997 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:45:41.999 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:45:42.001 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf [GGML] Running acestep-v15-turbo-Q6_K.gguf... 
@@ -246,8 +241,8 @@ Using precomputed LM hints dit_step6_xt 0.985862 dit_step7_vt 0.962454 dit_x0 0.974866 - vae_audio 0.893678 - vae_audio (STFT cosine) 0.969663 + vae_audio 0.893720 + vae_audio (STFT cosine) 0.969672 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999937 0.147590 0.007252 -0.002265 0.972930 -0.002342 0.972003 diff --git a/tests/CUDA-Q8_0.log b/tests/CUDA-Q8_0.log index 3a84ce1..2744819 100644 --- a/tests/CUDA-Q8_0.log +++ b/tests/CUDA-Q8_0.log @@ -1,7 +1,7 @@ ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, compute capability 12.0, VMM: yes [Load] DiT backend: CUDA0 (CPU threads: 16) -[Load] Backend init: 9.5 ms +[Load] Backend init: 9.7 ms [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_cuda_init: found 1 CUDA devices: [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1600.7 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 221.9 ms +[Load] DiT weight load: 201.4 ms [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: CUDA0 (CPU threads: 16) +[Load] VAE backend: CUDA0 (shared) [VAE] Backend: CUDA0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 658.9 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 655.8 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] 
Loaded from GGUF: 151643 vocab, 151387 merges [Load] BPE tokenizer: 31.2 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: CUDA0 (CPU threads: 16) +[Load] TextEncoder backend: CUDA0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 127.0 ms -[Encode] TextEncoder (70 tokens): 68.2 ms +[Load] TextEncoder: 102.2 ms +[Encode] TextEncoder (70 tokens): 57.4 ms [Debug] text_hidden: [70, 1024] first4: 3.652014 1.047935 0.228532 -12.907304 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 12.3 ms +[Encode] Lyric vocab lookup (167 tokens): 0.1 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: CUDA0 (CPU threads: 16) +[Load] CondEncoder backend: CUDA0 (shared) [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_cuda_init: found 1 CUDA devices: [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 616.6 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 65.2 ms +[Load] ConditionEncoder: 52.3 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 8.9 ms, enc_S=238 +[Encode] ConditionEncoder: 9.0 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.759220 -0.049559 -0.133467 0.058389 [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 106.5 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 12.1 ms 
+[Load] Detokenizer: 9.2 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 104.8 ms +[Context] Detokenizer: 103.8 ms [Debug] detok_output: [2170, 64] first4: -0.120490 1.436288 0.301594 -0.632564 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -114,35 +112,32 @@ ggml_cuda_init: found 1 CUDA devices: [Debug] dit_step7_vt: [2170, 64] first4: -0.007394 0.229067 -1.488817 3.083439 [Debug] dit_x0: [2170, 64] first4: 0.087028 1.415554 0.432225 -1.919150 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 242.9 ms (242.9 ms/sample) +[DiT] Total generation: 236.6 ms (236.6 ms/sample) [Debug] dit_output: [2170, 64] first4: 0.087028 1.415554 0.432225 -1.919150 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 822.6 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000524 0.000859 0.000752 0.001056 +[VAE Batch0] Decode: 618.6 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000531 0.000916 0.000781 0.001161 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:54:15.905 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:54:15.906 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:54:15.906 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. 
-2026-03-01 19:54:15.906 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:54:15.906 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:54:16.672 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:54:18.198 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:54:18.198 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:54:18.207 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:54:18.371 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:45:31.851 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:45:31.953 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:45:33.265 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:45:33.265 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... 
+2026-03-04 21:45:33.269 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:45:33.275 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:45:33.468 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:54:18.373 | 
INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:54:18.373 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:45:33.470 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:54:18.380 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:54:18.392 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:54:18.392 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:54:18.418 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... 
-2026-03-01 19:54:18.724 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:54:18.724 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:54:18.724 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006882190704345703, 'diffusion_time_cost': 0.298403263092041, 'diffusion_per_step_time_cost': 0.03730040788650513, 'total_time_cost': 0.3052854537963867, 'offload_time_cost': 0.0} -2026-03-01 19:54:18.739 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:54:18.741 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:54:18.741 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.44 GB -2026-03-01 19:54:18.741 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:54:18.741 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.44 GB -2026-03-01 19:54:18.741 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.44 GB -2026-03-01 19:54:18.741 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:54:19.031 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:54:19.034 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:54:19.037 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:45:33.490 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:45:33.505 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:45:33.505 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:45:33.539 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:45:33.854 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:45:33.855 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:45:33.855 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.0069425106048583984, 'diffusion_time_cost': 0.30779337882995605, 'diffusion_per_step_time_cost': 0.03847417235374451, 'total_time_cost': 0.31473588943481445, 'offload_time_cost': 0.0} +2026-03-04 21:45:33.869 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:45:33.871 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:45:33.871 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.78 GB +2026-03-04 21:45:33.871 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:45:33.871 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.78 GB +2026-03-04 21:45:33.871 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.78 GB +2026-03-04 21:45:33.871 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:45:34.145 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:45:34.147 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:45:34.149 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf [GGML] Running acestep-v15-turbo-Q8_0.gguf... 
@@ -246,8 +241,8 @@ Using precomputed LM hints dit_step6_xt 0.988641 dit_step7_vt 0.970144 dit_x0 0.979969 - vae_audio 0.905525 - vae_audio (STFT cosine) 0.976530 + vae_audio 0.905563 + vae_audio (STFT cosine) 0.976538 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999948 0.134961 0.006551 -0.002307 0.972901 -0.002342 0.972003 diff --git a/tests/Metal_Q4_K_M.log b/tests/Metal_Q4_K_M.log deleted file mode 100644 index e1ad24f..0000000 --- a/tests/Metal_Q4_K_M.log +++ /dev/null @@ -1,835 +0,0 @@ -ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices -ggml_metal_library_init: using embedded metal library -ggml_metal_library_init: loaded in 0.006 sec -ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s) -ggml_metal_device_init: GPU name: MTL0 -ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) -ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) -ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) -ggml_metal_device_init: simdgroup reduction = true -ggml_metal_device_init: simdgroup matrix mul. 
= true -ggml_metal_device_init: has unified memory = true -ggml_metal_device_init: has bfloat = true -ggml_metal_device_init: has tensor = false -ggml_metal_device_init: use residency sets = true -ggml_metal_device_init: use shared buffers = true -ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] DiT backend: MTL0 (CPU threads: 5) -[Load] Backend init: 20.9 ms -[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 -[DiT] Self-attn: Q+K fused, V separate -[DiT] Cross-attn: all separate -[DiT] MLP: gate+up fused -[Load] null_condition_emb found (CFG available) -[WeightCtx] Loaded 478 tensors, 895.6 MB into backend -[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 1421.5 ms -[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 -[Load] silence_latent: [15000, 64] from GGUF -[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] VAE backend: MTL0 (CPU threads: 5) -[VAE] Backend: MTL0, Weight buffer: 255.7 MB -[VAE] Loaded: 5 blocks, upsample=1920x -[Load] VAE weights: 337.8 ms -[Request 1/1] ggml-turbo/request0.json (batch=1) -[Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) -[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s -[Pipeline] 434 audio codes (86.8s @ 5Hz) -[Pipeline] T=2170, S=1085 -[BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE 
tokenizer: 42.3 ms -[Pipeline] caption: 70 tokens, lyrics: 167 tokens -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] TextEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 28L, H=1024, Nh=16/8 -[Load] TextEncoder: 593.9 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' -ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x11de0dee0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x11de0e340 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x11de0ebb0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' -ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x11de0f030 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x11de0f8a0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' 
-ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x11de0fed0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x11de107b0 | th_max = 576 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x11de11170 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' -ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x11de10350 | th_max = 1024 | th_width = 32 -[Encode] TextEncoder (70 tokens): 44.4 ms -[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 33.8 ms -[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] CondEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 -[WeightCtx] Loaded 140 tensors, 352.5 MB into backend -[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 543.9 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 
'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1 0x11de1b4b0 | th_max = 896 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q4_K_f32', name = 'kernel_mul_mm_q4_K_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q4_K_f32_bci=0_bco=1 0x11de1ba60 | th_max = 896 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x11de1bea0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x11de1c500 | th_max = 640 | th_width = 32 -[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 149.3 ms, enc_S=238 -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Debug] enc_hidden: [238, 2048] first4: 1.751263 -0.045978 -0.129705 0.058765 -[GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 -[WeightCtx] Loaded 30 tensors, 64.7 MB into backend -[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 113.4 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x11de10d70 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 
'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x11de0aff0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2 0x11de0b950 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' -ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x11de1c9a0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q4_K_f32_r1_5', name = 'kernel_mul_mv_ext_q4_K_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q4_K_f32_r1_5_nsg=2_nxpsg=8 0x11de1d9f0 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8 0x11de1dfa0 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x11de1e320 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x11de1e580 | th_max = 448 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 
'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x11de1ef20 | th_max = 1024 | th_width = 32 -[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 1044.0 ms -[Debug] detok_output: [2170, 64] first4: -0.105288 1.440285 0.304742 -0.636920 -[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 -[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 -[Debug] context: [2170, 128] first4: -0.105288 1.440285 0.304742 -0.636920 -[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 -[DiT] Batch N=1, T=2170, S=1085, enc_S=238 -[DiT] Graph: 1775 nodes -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x11f008d70 | th_max = 832 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' -ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x11f009830 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' -ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x11f009c40 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q4_K_f32', name = 'kernel_mul_mv_q4_K_f32_nsg=2' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q4_K_f32_nsg=2 0x11f00ac80 | th_max = 768 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x11f00b000 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling 
pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x11f00b6c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x11f00b920 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x11f00bec0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x11f00c450 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x11f00cd60 | th_max = 640 | th_width = 32 -[Debug] tproj: [12288] first4: 0.260912 -0.160417 -0.090199 0.048634 -[Debug] temb: [2048] first4: 0.000215 -0.133911 -0.034469 0.065007 -[Debug] temb_t: [2048] first4: 0.000971 0.025677 -0.052124 0.063327 -[Debug] temb_r: [2048] first4: -0.000756 -0.159588 0.017655 0.001680 -[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 -[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 -[Debug] temb_lin1_t: [2048] first4: -0.049286 -0.053324 -0.012254 -0.047666 -[Debug] temb_lin1_r: [2048] first4: -0.015463 -0.031534 
-0.021259 0.006135 -[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.050396 -0.992003 0.526498 0.458000 -[Debug] proj_in_input: [192, 2170] first4: -0.105288 1.440285 0.304742 -0.636920 -[Debug] enc_after_cond_emb: [2048, 238] first4: -0.174268 0.781178 0.275122 -0.515942 -[Debug] layer0_sa_input: [2048, 1085] first4: -0.726228 -0.772737 -0.041859 0.262417 -[Debug] layer0_q_after_rope: [128, 16] first4: -12.136272 0.820533 1.509364 1.799582 -[Debug] layer0_k_after_rope: [128, 8] first4: -0.174268 0.781178 0.275122 -0.515942 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.599759 0.160940 -0.480259 0.455996 -[Debug] layer0_attn_out: [2048, 1085] first4: -12.315464 1.144032 1.760677 1.796125 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.579560 -1.062863 0.061853 0.466855 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.703488 -0.838320 -0.450424 0.503514 -[Debug] hidden_after_layer0: [2048, 1085] first4: -8.870923 0.423529 48.381233 -0.778579 -[Debug] hidden_after_layer6: [2048, 1085] first4: -21.397562 -1.526012 29.991730 -3.928804 -[Debug] hidden_after_layer12: [2048, 1085] first4: -17.419617 -13.309786 66.317848 28.914410 -[Debug] hidden_after_layer18: [2048, 1085] first4: -16.562674 9.657765 55.222641 17.661957 -[Debug] hidden_after_layer23: [2048, 1085] first4: -19.112629 7.039753 181.464966 133.927719 -[Debug] dit_step0_vt: [2170, 64] first4: -0.112419 1.107940 0.244994 2.200569 -[Debug] dit_step0_xt: [2170, 64] first4: 0.199446 2.105889 -0.183011 0.747630 -[DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: -0.082195 1.204432 -0.273788 1.824850 -[Debug] dit_step1_xt: [2170, 64] first4: 0.203929 2.040193 -0.168077 0.648093 -[DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: -0.043690 1.209513 0.074423 2.191977 -[Debug] dit_step2_xt: [2170, 64] first4: 0.206842 1.959559 -0.173039 0.501961 -[DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: 0.238132 1.171738 0.272480 2.506455 -[Debug] 
dit_step3_xt: [2170, 64] first4: 0.186998 1.861914 -0.195745 0.293090 -[DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.293275 1.147521 0.096848 2.639339 -[Debug] dit_step4_xt: [2170, 64] first4: 0.155575 1.738965 -0.206122 0.010304 -[DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.200179 1.089847 -0.403776 2.739777 -[Debug] dit_step5_xt: [2170, 64] first4: 0.126978 1.583273 -0.148440 -0.381093 -[DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: -0.078240 0.999644 -1.058107 2.768797 -[Debug] dit_step6_xt: [2170, 64] first4: 0.142626 1.383344 0.063182 -0.934852 -[DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: -0.417903 0.862772 -1.662739 3.246292 -[Debug] dit_x0: [2170, 64] first4: 0.267997 1.124512 0.562003 -1.908740 -[DiT] step 8/8 t=0.300 -[DiT] Total generation: 7809.5 ms (7809.5 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.267997 1.124512 0.562003 -1.908740 -[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 474 nodes, T_latent=192 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' -ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x11de1ab80 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x11de1ceb0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x11de1f410 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: 
loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x11de1f670 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x11de1fa20 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x11de20200 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32 0x11de20760 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x11de216c0 | th_max = 1024 | th_width = 32 -[VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 474 nodes, T_latent=256 -[VAE] Graph: 474 nodes, T_latent=186 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x11de21920 | th_max = 896 | th_width = 32 -[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 609663.4 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000307 0.000830 0.000664 0.001050 -[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Request 1/1] Done -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Pipeline] All done -ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices -ggml_metal_library_init: using embedded metal library -ggml_metal_library_init: loaded in 0.006 sec -ggml_metal_rsets_init: 
creating a residency set collection (keep_alive = 180 s) -ggml_metal_device_init: GPU name: MTL0 -ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) -ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) -ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) -ggml_metal_device_init: simdgroup reduction = true -ggml_metal_device_init: simdgroup matrix mul. = true -ggml_metal_device_init: has unified memory = true -ggml_metal_device_init: has bfloat = true -ggml_metal_device_init: has tensor = false -ggml_metal_device_init: use residency sets = true -ggml_metal_device_init: use shared buffers = true -ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] DiT backend: MTL0 (CPU threads: 5) -[Load] Backend init: 18.8 ms -[GGUF] ../models/acestep-v15-sft-Q4_K_M.gguf: 678 tensors, data at offset 56800 -[DiT] Self-attn: Q+K fused, V separate -[DiT] Cross-attn: all separate -[DiT] MLP: gate+up fused -[Load] null_condition_emb found (CFG available) -[WeightCtx] Loaded 478 tensors, 895.6 MB into backend -[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 1269.3 ms -[GGUF] ../models/acestep-v15-sft-Q4_K_M.gguf: 678 tensors, data at offset 56800 -[Load] silence_latent: [15000, 64] from GGUF -[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] VAE backend: MTL0 (CPU threads: 5) -[VAE] Backend: MTL0, Weight buffer: 255.7 MB -[VAE] Loaded: 5 blocks, upsample=1920x -[Load] VAE weights: 272.1 ms 
-[Request 1/1] ggml-sft/request0.json (batch=1) -[Request] parsed ggml-sft/request0.json (18 fields) -[Pipeline] seed=42, steps=50, guidance=7.0, shift=1.0, duration=88.0s -[Pipeline] 434 audio codes (86.8s @ 5Hz) -[Pipeline] T=2170, S=1085 -[BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 41.8 ms -[Pipeline] caption: 70 tokens, lyrics: 167 tokens -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] TextEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 28L, H=1024, Nh=16/8 -[Load] TextEncoder: 231.9 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' -ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x15570a490 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x15570a8f0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x15570b160 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' -ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x15570b5e0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 
'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x15570be50 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x15570c480 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x15570cd60 | th_max = 576 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x15570d170 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' -ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x15570d3d0 | th_max = 1024 | th_width = 32 -[Encode] TextEncoder (70 tokens): 48.9 ms -[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 33.9 ms -[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] CondEncoder backend: MTL0 
(CPU threads: 5) -[GGUF] ../models/acestep-v15-sft-Q4_K_M.gguf: 678 tensors, data at offset 56800 -[WeightCtx] Loaded 140 tensors, 352.5 MB into backend -[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 601.2 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1 0x155717100 | th_max = 896 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q4_K_f32', name = 'kernel_mul_mm_q4_K_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q4_K_f32_bci=0_bco=1 0x1557176b0 | th_max = 896 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x155717a30 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x155718090 | th_max = 640 | th_width = 32 -[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 151.9 ms, enc_S=238 -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Debug] enc_hidden: [238, 2048] first4: 1.751314 -0.046022 -0.129862 0.058756 -[GGUF] ../models/acestep-v15-sft-Q4_K_M.gguf: 678 tensors, data at offset 56800 -[WeightCtx] Loaded 30 tensors, 64.7 MB into backend -[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 101.7 ms -ggml_metal_library_compile_pipeline: compiling pipeline: 
base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x15570ebf0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x155707790 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2 0x155707dc0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' -ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x1557074e0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q4_K_f32_r1_5', name = 'kernel_mul_mv_ext_q4_K_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q4_K_f32_r1_5_nsg=2_nxpsg=8 0x1557192f0 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8 0x1557198a0 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x155719c20 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 
'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x155719e80 | th_max = 448 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x15571a8c0 | th_max = 1024 | th_width = 32 -[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 1040.2 ms -[Debug] detok_output: [2170, 64] first4: -0.105274 1.439665 0.307319 -0.637002 -[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 -[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 -[Debug] context: [2170, 128] first4: -0.105274 1.439665 0.307319 -0.637002 -[DiT] Starting: T=2170, S=1085, enc_S=238, steps=50, batch=1 -[DiT] Batch N=1, T=2170, S=1085, enc_S=238 -[DiT] Graph: 1775 nodes -[Debug] null_condition_emb: [2048] first4: 0.018066 -0.000360 0.005096 -0.000683 -[Debug] null_enc_hidden: [238, 2048] first4: 0.018066 -0.000360 0.005096 -0.000683 -[DiT] CFG enabled: guidance_scale=7.0, 2x forward per step, N=1 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x15560cd80 | th_max = 832 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' -ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x15560d720 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' 
-ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x15560db30 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q4_K_f32', name = 'kernel_mul_mv_q4_K_f32_nsg=2' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q4_K_f32_nsg=2 0x15560eb70 | th_max = 768 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x15560eef0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x15560f5b0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x15560f810 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x15560fdb0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x155610340 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded 
kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x155610d60 | th_max = 640 | th_width = 32 -[Debug] tproj: [12288] first4: 0.154826 -0.114975 -0.093002 0.082122 -[Debug] temb: [2048] first4: -0.003593 -0.176168 0.003892 -0.001352 -[Debug] temb_t: [2048] first4: -0.002002 0.003482 -0.013423 -0.001611 -[Debug] temb_r: [2048] first4: -0.001591 -0.179650 0.017315 0.000259 -[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 -[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 -[Debug] temb_lin1_t: [2048] first4: -0.026166 0.013606 0.032789 -0.028782 -[Debug] temb_lin1_r: [2048] first4: -0.001795 -0.011535 -0.006725 -0.011136 -[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.101326 -0.948224 0.490204 0.449757 -[Debug] proj_in_input: [192, 2170] first4: -0.105274 1.439665 0.307319 -0.637002 -[Debug] enc_after_cond_emb: [2048, 238] first4: -0.189214 0.805478 0.284418 -0.472295 -[Debug] layer0_sa_input: [2048, 1085] first4: -0.923880 -0.725952 -0.044805 0.297821 -[Debug] layer0_q_after_rope: [128, 16] first4: -12.125128 0.516320 1.460617 1.783048 -[Debug] layer0_k_after_rope: [128, 8] first4: -0.189214 0.805478 0.284418 -0.472295 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.642741 0.751692 -0.708185 0.515940 -[Debug] layer0_attn_out: [2048, 1085] first4: -11.610563 1.032188 1.685498 1.814675 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.803507 -1.373816 -0.306776 0.394307 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.998315 -1.012332 -0.558752 0.397301 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.408201 1.261657 57.661659 -1.674409 -[Debug] hidden_after_layer6: [2048, 1085] first4: -13.125732 4.401457 57.923130 1.593087 -[Debug] hidden_after_layer12: [2048, 1085] first4: -12.760151 8.784775 -27.576780 1.266083 -[Debug] hidden_after_layer18: [2048, 1085] first4: -3.029438 18.924910 -37.522003 -17.408060 -[Debug] hidden_after_layer23: 
[2048, 1085] first4: 25.718348 50.253456 58.487469 -24.616550 -[Debug] dit_step0_vt_cond: [2170, 64] first4: -0.549879 2.587143 -0.199758 1.525680 -[Debug] dit_step0_vt_uncond: [2170, 64] first4: -0.221552 2.068977 -0.854510 1.731250 -[Debug] dit_step0_vt: [2170, 64] first4: -0.770128 3.170936 0.103367 1.213956 -[Debug] dit_step0_xt: [2170, 64] first4: 0.209738 2.092831 -0.173942 0.823377 -[DiT] step 1/50 t=1.000 -[Debug] dit_step1_vt_cond: [2170, 64] first4: -0.553963 2.540515 -0.004453 1.412831 -[Debug] dit_step1_vt_uncond: [2170, 64] first4: -0.507386 2.385438 -0.093360 1.515296 -[Debug] dit_step1_vt: [2170, 64] first4: -0.244245 1.996188 -0.241419 1.557151 -[Debug] dit_step1_xt: [2170, 64] first4: 0.214623 2.052907 -0.169114 0.792234 -[DiT] step 2/50 t=0.980 -[Debug] dit_step2_vt_cond: [2170, 64] first4: -0.537810 2.506870 -0.002615 1.406658 -[Debug] dit_step2_vt_uncond: [2170, 64] first4: -0.493937 2.362162 -0.101829 1.455003 -[Debug] dit_step2_vt: [2170, 64] first4: -0.705231 2.991064 0.252674 1.183649 -[Debug] dit_step2_xt: [2170, 64] first4: 0.228728 1.993086 -0.174167 0.768561 -[DiT] step 3/50 t=0.960 -[Debug] dit_step3_vt_cond: [2170, 64] first4: -0.501613 2.438805 -0.019274 1.410215 -[Debug] dit_step3_vt_uncond: [2170, 64] first4: -0.521661 2.364079 -0.095044 1.376828 -[Debug] dit_step3_vt: [2170, 64] first4: -0.201260 2.055526 -0.239553 1.689172 -[Debug] dit_step3_xt: [2170, 64] first4: 0.232753 1.951976 -0.169376 0.734778 -[DiT] step 4/50 t=0.940 -[Debug] dit_step4_vt_cond: [2170, 64] first4: -0.465795 2.359768 -0.032364 1.399407 -[Debug] dit_step4_vt_uncond: [2170, 64] first4: -0.532122 2.334485 -0.099644 1.341739 -[Debug] dit_step4_vt: [2170, 64] first4: -0.511269 2.624130 0.214392 1.268924 -[Debug] dit_step4_xt: [2170, 64] first4: 0.242979 1.899493 -0.173664 0.709399 -[DiT] step 5/50 t=0.920 -[Debug] dit_step5_vt_cond: [2170, 64] first4: -0.416940 2.273875 -0.055556 1.387350 -[Debug] dit_step5_vt_uncond: [2170, 64] first4: -0.572103 2.299005 
-0.092359 1.353066 -[Debug] dit_step5_vt: [2170, 64] first4: 0.057514 1.863401 -0.254107 1.537004 -[Debug] dit_step5_xt: [2170, 64] first4: 0.241828 1.862225 -0.168582 0.678659 -[DiT] step 6/50 t=0.900 -[Debug] dit_step6_vt_cond: [2170, 64] first4: -0.399254 2.210152 -0.071076 1.369134 -[Debug] dit_step6_vt_uncond: [2170, 64] first4: -0.539425 2.227666 -0.114236 1.361075 -[Debug] dit_step6_vt: [2170, 64] first4: -0.380751 2.356979 0.167812 1.213706 -[Debug] dit_step6_xt: [2170, 64] first4: 0.249443 1.815086 -0.171938 0.654385 -[DiT] step 7/50 t=0.880 -[Debug] dit_step7_vt_cond: [2170, 64] first4: -0.339429 2.118412 -0.091855 1.350106 -[Debug] dit_step7_vt_uncond: [2170, 64] first4: -0.475619 2.122700 -0.120878 1.360558 -[Debug] dit_step7_vt: [2170, 64] first4: 0.052995 1.858614 -0.256165 1.379718 -[Debug] dit_step7_xt: [2170, 64] first4: 0.248383 1.777913 -0.166815 0.626791 -[DiT] step 8/50 t=0.860 -[Debug] dit_step8_vt_cond: [2170, 64] first4: -0.274483 2.023758 -0.093816 1.332238 -[Debug] dit_step8_vt_uncond: [2170, 64] first4: -0.393477 2.023790 -0.130114 1.332444 -[Debug] dit_step8_vt: [2170, 64] first4: -0.218486 2.105614 0.134615 1.230365 -[Debug] dit_step8_xt: [2170, 64] first4: 0.252753 1.735801 -0.169507 0.602183 -[DiT] step 9/50 t=0.840 -[Debug] dit_step9_vt_cond: [2170, 64] first4: -0.208702 1.940720 -0.100297 1.317338 -[Debug] dit_step9_vt_uncond: [2170, 64] first4: -0.302713 1.942014 -0.150935 1.306566 -[Debug] dit_step9_vt: [2170, 64] first4: 0.068625 1.756381 -0.163156 1.360642 -[Debug] dit_step9_xt: [2170, 64] first4: 0.251381 1.700673 -0.166244 0.574971 -[DiT] step 10/50 t=0.820 -[Debug] dit_step10_vt_cond: [2170, 64] first4: -0.162154 1.880021 -0.110640 1.303073 -[Debug] dit_step10_vt_uncond: [2170, 64] first4: -0.210358 1.886162 -0.152094 1.303815 -[Debug] dit_step10_vt: [2170, 64] first4: -0.200484 1.879984 0.061434 1.187651 -[Debug] dit_step10_xt: [2170, 64] first4: 0.255390 1.663074 -0.167473 0.551217 -[DiT] step 11/50 t=0.800 -[Debug] 
dit_step11_vt_cond: [2170, 64] first4: -0.132763 1.841353 -0.152935 1.280443 -[Debug] dit_step11_vt_uncond: [2170, 64] first4: -0.156466 1.839952 -0.166283 1.309973 -[Debug] dit_step11_vt: [2170, 64] first4: -0.006319 1.715424 -0.248815 1.180641 -[Debug] dit_step11_xt: [2170, 64] first4: 0.255517 1.628765 -0.162497 0.527605 -[DiT] step 12/50 t=0.780 -[Debug] dit_step12_vt_cond: [2170, 64] first4: -0.108732 1.804132 -0.204569 1.271017 -[Debug] dit_step12_vt_uncond: [2170, 64] first4: -0.137749 1.799717 -0.174060 1.349185 -[Debug] dit_step12_vt: [2170, 64] first4: -0.093850 1.775385 -0.218540 0.972914 -[Debug] dit_step12_xt: [2170, 64] first4: 0.257394 1.593257 -0.158126 0.508146 -[DiT] step 13/50 t=0.760 -[Debug] dit_step13_vt_cond: [2170, 64] first4: -0.084325 1.755919 -0.251734 1.253830 -[Debug] dit_step13_vt_uncond: [2170, 64] first4: -0.116151 1.744928 -0.223829 1.345488 -[Debug] dit_step13_vt: [2170, 64] first4: 0.034148 1.681178 -0.334965 1.042164 -[Debug] dit_step13_xt: [2170, 64] first4: 0.256711 1.559634 -0.151426 0.487303 -[DiT] step 14/50 t=0.740 -[Debug] dit_step14_vt_cond: [2170, 64] first4: -0.062454 1.706585 -0.275264 1.242871 -[Debug] dit_step14_vt_uncond: [2170, 64] first4: -0.092396 1.687153 -0.270903 1.319513 -[Debug] dit_step14_vt: [2170, 64] first4: -0.030339 1.704105 -0.218537 1.004399 -[Debug] dit_step14_xt: [2170, 64] first4: 0.257318 1.525552 -0.147056 0.467215 -[DiT] step 15/50 t=0.720 -[Debug] dit_step15_vt_cond: [2170, 64] first4: -0.039531 1.653934 -0.274129 1.244472 -[Debug] dit_step15_vt_uncond: [2170, 64] first4: -0.065533 1.623524 -0.308950 1.280105 -[Debug] dit_step15_vt: [2170, 64] first4: 0.042593 1.646848 -0.174753 1.192683 -[Debug] dit_step15_xt: [2170, 64] first4: 0.256466 1.492615 -0.143561 0.443362 -[DiT] step 16/50 t=0.700 -[Debug] dit_step16_vt_cond: [2170, 64] first4: -0.024221 1.582624 -0.288380 1.229998 -[Debug] dit_step16_vt_uncond: [2170, 64] first4: -0.041512 1.552975 -0.330420 1.243577 -[Debug] dit_step16_vt: [2170, 
64] first4: -0.014702 1.584471 -0.181940 1.121346 -[Debug] dit_step16_xt: [2170, 64] first4: 0.256760 1.460925 -0.139922 0.420935 -[DiT] step 17/50 t=0.680 -[Debug] dit_step17_vt_cond: [2170, 64] first4: -0.016144 1.507916 -0.306446 1.209517 -[Debug] dit_step17_vt_uncond: [2170, 64] first4: -0.023216 1.483080 -0.342848 1.208134 -[Debug] dit_step17_vt: [2170, 64] first4: 0.010192 1.492126 -0.218166 1.213425 -[Debug] dit_step17_xt: [2170, 64] first4: 0.256556 1.431083 -0.135559 0.396666 -[DiT] step 18/50 t=0.660 -[Debug] dit_step18_vt_cond: [2170, 64] first4: -0.011327 1.429419 -0.322466 1.189975 -[Debug] dit_step18_vt_uncond: [2170, 64] first4: -0.006504 1.414708 -0.351011 1.186830 -[Debug] dit_step18_vt: [2170, 64] first4: -0.055648 1.401301 -0.242752 1.127735 -[Debug] dit_step18_xt: [2170, 64] first4: 0.257669 1.403057 -0.130704 0.374111 -[DiT] step 19/50 t=0.640 -[Debug] dit_step19_vt_cond: [2170, 64] first4: -0.008919 1.352955 -0.336887 1.164963 -[Debug] dit_step19_vt_uncond: [2170, 64] first4: 0.006420 1.358623 -0.354804 1.168313 -[Debug] dit_step19_vt: [2170, 64] first4: -0.054127 1.236317 -0.295143 1.130394 -[Debug] dit_step19_xt: [2170, 64] first4: 0.258751 1.378330 -0.124801 0.351504 -[DiT] step 20/50 t=0.620 -[Debug] dit_step20_vt_cond: [2170, 64] first4: -0.004449 1.272026 -0.345863 1.142193 -[Debug] dit_step20_vt_uncond: [2170, 64] first4: 0.019787 1.305161 -0.354228 1.148333 -[Debug] dit_step20_vt: [2170, 64] first4: -0.100401 1.079987 -0.320124 1.076506 -[Debug] dit_step20_xt: [2170, 64] first4: 0.260759 1.356731 -0.118398 0.329973 -[DiT] step 21/50 t=0.600 -[Debug] dit_step21_vt_cond: [2170, 64] first4: -0.002161 1.194354 -0.356476 1.115376 -[Debug] dit_step21_vt_uncond: [2170, 64] first4: 0.027066 1.258520 -0.355503 1.123235 -[Debug] dit_step21_vt: [2170, 64] first4: -0.089629 0.890893 -0.367078 1.066256 -[Debug] dit_step21_xt: [2170, 64] first4: 0.262552 1.338913 -0.111057 0.308648 -[DiT] step 22/50 t=0.580 -[Debug] dit_step22_vt_cond: [2170, 64] 
first4: 0.001542 1.116787 -0.366798 1.082653 -[Debug] dit_step22_vt_uncond: [2170, 64] first4: 0.034784 1.215104 -0.359348 1.094688 -[Debug] dit_step22_vt: [2170, 64] first4: -0.114017 0.710875 -0.381058 1.001636 -[Debug] dit_step22_xt: [2170, 64] first4: 0.264832 1.324695 -0.103435 0.288616 -[DiT] step 23/50 t=0.560 -[Debug] dit_step23_vt_cond: [2170, 64] first4: 0.004356 1.043939 -0.376088 1.054782 -[Debug] dit_step23_vt_uncond: [2170, 64] first4: 0.040331 1.176215 -0.358597 1.069999 -[Debug] dit_step23_vt: [2170, 64] first4: -0.106657 0.513238 -0.439613 0.976581 -[Debug] dit_step23_xt: [2170, 64] first4: 0.266965 1.314431 -0.094643 0.269084 -[DiT] step 24/50 t=0.540 -[Debug] dit_step24_vt_cond: [2170, 64] first4: 0.004404 0.961254 -0.387939 1.015311 -[Debug] dit_step24_vt_uncond: [2170, 64] first4: 0.043793 1.129819 -0.356263 1.035491 -[Debug] dit_step24_vt: [2170, 64] first4: -0.131273 0.309370 -0.487982 0.900439 -[Debug] dit_step24_xt: [2170, 64] first4: 0.269591 1.308243 -0.084884 0.251075 -[DiT] step 25/50 t=0.520 -[Debug] dit_step25_vt_cond: [2170, 64] first4: -0.001606 0.858703 -0.396162 0.970976 -[Debug] dit_step25_vt_uncond: [2170, 64] first4: 0.045187 1.067146 -0.350258 0.994534 -[Debug] dit_step25_vt: [2170, 64] first4: -0.160841 0.082930 -0.542274 0.862474 -[Debug] dit_step25_xt: [2170, 64] first4: 0.272808 1.306585 -0.074038 0.233826 -[DiT] step 26/50 t=0.500 -[Debug] dit_step26_vt_cond: [2170, 64] first4: -0.011834 0.743138 -0.406478 0.912916 -[Debug] dit_step26_vt_uncond: [2170, 64] first4: 0.044098 0.988983 -0.348666 0.943761 -[Debug] dit_step26_vt: [2170, 64] first4: -0.203731 -0.135469 -0.575882 0.759197 -[Debug] dit_step26_xt: [2170, 64] first4: 0.276882 1.309294 -0.062520 0.218642 -[DiT] step 27/50 t=0.480 -[Debug] dit_step27_vt_cond: [2170, 64] first4: -0.028043 0.640231 -0.413465 0.856122 -[Debug] dit_step27_vt_uncond: [2170, 64] first4: 0.038067 0.910543 -0.350117 0.887872 -[Debug] dit_step27_vt: [2170, 64] first4: -0.249926 -0.275849 
-0.588337 0.733838 -[Debug] dit_step27_xt: [2170, 64] first4: 0.281881 1.314811 -0.050754 0.203965 -[DiT] step 28/50 t=0.460 -[Debug] dit_step28_vt_cond: [2170, 64] first4: -0.048697 0.519480 -0.427048 0.785924 -[Debug] dit_step28_vt_uncond: [2170, 64] first4: 0.029577 0.811304 -0.356754 0.820204 -[Debug] dit_step28_vt: [2170, 64] first4: -0.313111 -0.465662 -0.625360 0.626629 -[Debug] dit_step28_xt: [2170, 64] first4: 0.288143 1.324124 -0.038247 0.191432 -[DiT] step 29/50 t=0.440 -[Debug] dit_step29_vt_cond: [2170, 64] first4: -0.073682 0.390412 -0.435695 0.713586 -[Debug] dit_step29_vt_uncond: [2170, 64] first4: 0.022755 0.688592 -0.366629 0.750458 -[Debug] dit_step29_vt: [2170, 64] first4: -0.404692 -0.558608 -0.601264 0.570632 -[Debug] dit_step29_xt: [2170, 64] first4: 0.296237 1.335296 -0.026221 0.180020 -[DiT] step 30/50 t=0.420 -[Debug] dit_step30_vt_cond: [2170, 64] first4: -0.100612 0.256910 -0.442863 0.643070 -[Debug] dit_step30_vt_uncond: [2170, 64] first4: 0.014270 0.550700 -0.380145 0.680719 -[Debug] dit_step30_vt: [2170, 64] first4: -0.477652 -0.675684 -0.591087 0.486411 -[Debug] dit_step30_xt: [2170, 64] first4: 0.305790 1.348810 -0.014400 0.170292 -[DiT] step 31/50 t=0.400 -[Debug] dit_step31_vt_cond: [2170, 64] first4: -0.127005 0.130974 -0.446946 0.576489 -[Debug] dit_step31_vt_uncond: [2170, 64] first4: 0.003612 0.415976 -0.399074 0.614345 -[Debug] dit_step31_vt: [2170, 64] first4: -0.549710 -0.743030 -0.526327 0.431312 -[Debug] dit_step31_xt: [2170, 64] first4: 0.316784 1.363671 -0.003873 0.161665 -[DiT] step 32/50 t=0.380 -[Debug] dit_step32_vt_cond: [2170, 64] first4: -0.154932 -0.000795 -0.447535 0.511295 -[Debug] dit_step32_vt_uncond: [2170, 64] first4: -0.007317 0.275916 -0.413101 0.549311 -[Debug] dit_step32_vt: [2170, 64] first4: -0.628125 -0.848536 -0.505066 0.360242 -[Debug] dit_step32_xt: [2170, 64] first4: 0.329347 1.380641 0.006228 0.154460 -[DiT] step 33/50 t=0.360 -[Debug] dit_step33_vt_cond: [2170, 64] first4: -0.183072 -0.130801 
-0.438493 0.449678 -[Debug] dit_step33_vt_uncond: [2170, 64] first4: -0.021971 0.136892 -0.420384 0.490091 -[Debug] dit_step33_vt: [2170, 64] first4: -0.685087 -0.931651 -0.428386 0.294226 -[Debug] dit_step33_xt: [2170, 64] first4: 0.343048 1.399274 0.014796 0.148576 -[DiT] step 34/50 t=0.340 -[Debug] dit_step34_vt_cond: [2170, 64] first4: -0.207282 -0.251064 -0.429462 0.399560 -[Debug] dit_step34_vt_uncond: [2170, 64] first4: -0.035614 0.010201 -0.426610 0.442224 -[Debug] dit_step34_vt: [2170, 64] first4: -0.740469 -1.039289 -0.393755 0.238626 -[Debug] dit_step34_xt: [2170, 64] first4: 0.357858 1.420060 0.022671 0.143803 -[DiT] step 35/50 t=0.320 -[Debug] dit_step35_vt_cond: [2170, 64] first4: -0.234011 -0.373429 -0.414613 0.349351 -[Debug] dit_step35_vt_uncond: [2170, 64] first4: -0.051328 -0.116322 -0.423153 0.392585 -[Debug] dit_step35_vt: [2170, 64] first4: -0.800518 -1.139187 -0.342183 0.192528 -[Debug] dit_step35_xt: [2170, 64] first4: 0.373868 1.442844 0.029515 0.139953 -[DiT] step 36/50 t=0.300 -[Debug] dit_step36_vt_cond: [2170, 64] first4: -0.261591 -0.503509 -0.392160 0.303680 -[Debug] dit_step36_vt_uncond: [2170, 64] first4: -0.072050 -0.249828 -0.410849 0.351470 -[Debug] dit_step36_vt: [2170, 64] first4: -0.838416 -1.260836 -0.298992 0.122180 -[Debug] dit_step36_xt: [2170, 64] first4: 0.390637 1.468061 0.035495 0.137509 -[DiT] step 37/50 t=0.280 -[Debug] dit_step37_vt_cond: [2170, 64] first4: -0.290611 -0.615966 -0.361295 0.261135 -[Debug] dit_step37_vt_uncond: [2170, 64] first4: -0.095822 -0.367916 -0.388325 0.310791 -[Debug] dit_step37_vt: [2170, 64] first4: -0.893251 -1.349895 -0.245346 0.089192 -[Debug] dit_step37_xt: [2170, 64] first4: 0.408502 1.495059 0.040402 0.135725 -[DiT] step 38/50 t=0.260 -[Debug] dit_step38_vt_cond: [2170, 64] first4: -0.316862 -0.724614 -0.326989 0.221074 -[Debug] dit_step38_vt_uncond: [2170, 64] first4: -0.120406 -0.482601 -0.361356 0.272140 -[Debug] dit_step38_vt: [2170, 64] first4: -0.917953 -1.452874 -0.195436 
0.033767 -[Debug] dit_step38_xt: [2170, 64] first4: 0.426861 1.524116 0.044310 0.135050 -[DiT] step 39/50 t=0.240 -[Debug] dit_step39_vt_cond: [2170, 64] first4: -0.344701 -0.840724 -0.280406 0.181682 -[Debug] dit_step39_vt_uncond: [2170, 64] first4: -0.151500 -0.605403 -0.318787 0.232017 -[Debug] dit_step39_vt: [2170, 64] first4: -0.945851 -1.537027 -0.144223 0.008566 -[Debug] dit_step39_xt: [2170, 64] first4: 0.445778 1.554857 0.047195 0.134879 -[DiT] step 40/50 t=0.220 -[Debug] dit_step40_vt_cond: [2170, 64] first4: -0.369051 -0.939547 -0.228334 0.139823 -[Debug] dit_step40_vt_uncond: [2170, 64] first4: -0.182335 -0.713639 -0.270236 0.191739 -[Debug] dit_step40_vt: [2170, 64] first4: -0.958883 -1.593756 -0.082150 -0.057526 -[Debug] dit_step40_xt: [2170, 64] first4: 0.464955 1.586732 0.048838 0.136029 -[DiT] step 41/50 t=0.200 -[Debug] dit_step41_vt_cond: [2170, 64] first4: -0.388759 -1.034758 -0.170808 0.098079 -[Debug] dit_step41_vt_uncond: [2170, 64] first4: -0.215027 -0.818665 -0.212783 0.153622 -[Debug] dit_step41_vt: [2170, 64] first4: -0.929079 -1.656826 -0.036355 -0.101313 -[Debug] dit_step41_xt: [2170, 64] first4: 0.483537 1.619868 0.049565 0.138056 -[DiT] step 42/50 t=0.180 -[Debug] dit_step42_vt_cond: [2170, 64] first4: -0.404481 -1.121373 -0.110304 0.048469 -[Debug] dit_step42_vt_uncond: [2170, 64] first4: -0.250394 -0.918649 -0.148512 0.113292 -[Debug] dit_step42_vt: [2170, 64] first4: -0.870129 -1.689520 0.009394 -0.198920 -[Debug] dit_step42_xt: [2170, 64] first4: 0.500939 1.653659 0.049377 0.142034 -[DiT] step 43/50 t=0.160 -[Debug] dit_step43_vt_cond: [2170, 64] first4: -0.416518 -1.199422 -0.047277 -0.004303 -[Debug] dit_step43_vt_uncond: [2170, 64] first4: -0.285961 -1.014739 -0.080642 0.076449 -[Debug] dit_step43_vt: [2170, 64] first4: -0.799869 -1.709703 0.054214 -0.305297 -[Debug] dit_step43_xt: [2170, 64] first4: 0.516937 1.687853 0.048293 0.148140 -[DiT] step 44/50 t=0.140 -[Debug] dit_step44_vt_cond: [2170, 64] first4: -0.422798 -1.270758 
0.022277 -0.058297 -[Debug] dit_step44_vt_uncond: [2170, 64] first4: -0.318056 -1.108378 -0.007512 0.042141 -[Debug] dit_step44_vt: [2170, 64] first4: -0.718613 -1.710690 0.113612 -0.432909 -[Debug] dit_step44_xt: [2170, 64] first4: 0.531309 1.722067 0.046020 0.156798 -[DiT] step 45/50 t=0.120 -[Debug] dit_step45_vt_cond: [2170, 64] first4: -0.430461 -1.334901 0.090295 -0.107751 -[Debug] dit_step45_vt_uncond: [2170, 64] first4: -0.346132 -1.190932 0.060499 0.012419 -[Debug] dit_step45_vt: [2170, 64] first4: -0.676233 -1.740750 0.184198 -0.543741 -[Debug] dit_step45_xt: [2170, 64] first4: 0.544834 1.756882 0.042336 0.167673 -[DiT] step 46/50 t=0.100 -[Debug] dit_step46_vt_cond: [2170, 64] first4: -0.442548 -1.408986 0.177202 -0.124432 -[Debug] dit_step46_vt_uncond: [2170, 64] first4: -0.366053 -1.276834 0.124694 -0.015436 -[Debug] dit_step46_vt: [2170, 64] first4: -0.689058 -1.805405 0.393872 -0.448936 -[Debug] dit_step46_xt: [2170, 64] first4: 0.558615 1.792990 0.034459 0.176652 -[DiT] step 47/50 t=0.080 -[Debug] dit_step47_vt_cond: [2170, 64] first4: -0.439555 -1.466634 0.226367 -0.147289 -[Debug] dit_step47_vt_uncond: [2170, 64] first4: -0.380429 -1.352640 0.169813 -0.038167 -[Debug] dit_step47_vt: [2170, 64] first4: -0.594441 -1.800792 0.366639 -0.559854 -[Debug] dit_step47_xt: [2170, 64] first4: 0.570504 1.829005 0.027126 0.187849 -[DiT] step 48/50 t=0.060 -[Debug] dit_step48_vt_cond: [2170, 64] first4: -0.421519 -1.502992 0.243896 -0.165260 -[Debug] dit_step48_vt_uncond: [2170, 64] first4: -0.386849 -1.417176 0.200885 -0.065191 -[Debug] dit_step48_vt: [2170, 64] first4: -0.516278 -1.762812 0.360980 -0.463950 -[Debug] dit_step48_xt: [2170, 64] first4: 0.580829 1.864262 0.019907 0.197128 -[DiT] step 49/50 t=0.040 -[Debug] dit_step49_vt_cond: [2170, 64] first4: -0.442348 -1.531937 0.237906 -0.192473 -[Debug] dit_step49_vt_uncond: [2170, 64] first4: -0.399571 -1.435245 0.199709 -0.084932 -[Debug] dit_step49_vt: [2170, 64] first4: -0.632891 -1.901084 0.347748 
-0.622644 -[Debug] dit_x0: [2170, 64] first4: 0.593487 1.902283 0.012952 0.209581 -[DiT] step 50/50 t=0.020 -[DiT] Total generation: 97237.2 ms (97237.2 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.593487 1.902283 0.012952 0.209581 -[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 474 nodes, T_latent=192 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' -ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x1556105a0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x1556166d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x155616930 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x155616fc0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x155617400 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x155617a00 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' -ggml_metal_library_compile_pipeline: 
loaded kernel_conv_transpose_1d_f32_f32 0x155617f60 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x155618e40 | th_max = 1024 | th_width = 32 -[VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 474 nodes, T_latent=256 -[VAE] Graph: 474 nodes, T_latent=186 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x10b705130 | th_max = 896 | th_width = 32 -[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 609639.3 ms -[Debug] vae_audio: [2, 4166400] first4: -0.001780 -0.001606 -0.001703 -0.001406 -[VAE Batch0] Wrote ggml-sft/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Request 1/1] Done -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Pipeline] All done -[Request] Loaded request0.json -[Noise] Reusing existing rng_philox_seed42.bf16 -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf -[GGML] Running acestep-v15-turbo-Q4_K_M.gguf... 
-[GGML] Done, 47 dump files -[Turbo] Reusing existing Python dumps: python-turbo -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999813 - lyric_embed 1.000000 - enc_hidden 0.997096 - detok_output 0.999629 - context 0.999763 - noise 1.000000 - temb_t 0.999906 - hidden_after_proj_in 0.999918 - enc_after_cond_emb 0.997606 - layer0_sa_output 0.998452 - hidden_after_layer0 0.999696 - hidden_after_layer6 0.999330 - hidden_after_layer12 0.995408 - hidden_after_layer18 0.991270 - hidden_after_layer23 0.984826 - dit_step0_vt 0.944528 - dit_step0_xt 0.999878 - dit_step1_vt 0.947871 - dit_step1_xt 0.999609 - dit_step2_vt 0.956355 - dit_step2_xt 0.998980 - dit_step3_vt 0.961293 - dit_step3_xt 0.997669 - dit_step4_vt 0.958834 - dit_step4_xt 0.994713 - dit_step5_vt 0.956132 - dit_step5_xt 0.988221 - dit_step6_vt 0.950838 - dit_step6_xt 0.976124 - dit_step7_vt 0.938802 - dit_x0 0.958347 - vae_audio 0.832313 - vae_audio (log spectral) 0.999533 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999877 0.165977 0.010464 -0.002251 0.973155 -0.002342 0.972003 - dit_step1_xt 0.999608 0.266862 0.018170 -0.005108 0.943161 -0.005313 0.941730 - dit_step2_xt 0.998979 0.448963 0.028101 -0.009001 0.910184 -0.009311 0.908527 - dit_step3_xt 0.997667 0.610427 0.040689 -0.014279 0.875248 -0.014577 0.873624 - dit_step4_xt 0.994712 0.903635 0.058677 -0.021196 0.843722 -0.021660 0.841995 - dit_step5_xt 0.988220 1.370464 0.085448 -0.031128 0.827283 -0.032109 0.824593 - dit_step6_xt 0.976123 1.998804 0.126069 -0.045345 0.858424 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q4_K_M.gguf -[GGML] Running acestep-v15-sft-Q4_K_M.gguf... 
-[GGML] Done, 233 dump files -[SFT] Reusing existing Python dumps: python-sft -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999813 - lyric_embed 1.000000 - enc_hidden 0.997097 - detok_output 0.999629 - context 0.999763 - noise 1.000000 - temb_t 0.999673 - hidden_after_proj_in 0.999917 - enc_after_cond_emb 0.997598 - layer0_sa_output 0.998569 - hidden_after_layer0 0.999686 - hidden_after_layer6 0.999172 - hidden_after_layer12 0.997776 - hidden_after_layer18 0.996818 - hidden_after_layer23 0.997039 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.996934 - dit_step0_vt_uncond 0.996212 - dit_step0_vt 0.990566 - dit_step0_xt 0.999995 - dit_step5_vt_cond 0.995434 - dit_step5_vt 0.980046 - dit_step5_xt 0.999823 - dit_step10_vt_cond 0.991133 - dit_step10_vt 0.971906 - dit_step10_xt 0.999207 - dit_step15_vt_cond 0.982704 - dit_step15_vt 0.950629 - dit_step15_xt 0.997454 - dit_step20_vt_cond 0.968600 - dit_step20_vt 0.929360 - dit_step20_xt 0.993412 - dit_step25_vt_cond 0.951686 - dit_step25_vt 0.903442 - dit_step25_xt 0.986280 - dit_step30_vt_cond 0.931805 - dit_step30_vt 0.881992 - dit_step30_xt 0.976117 - dit_step35_vt_cond 0.911309 - dit_step35_vt 0.858516 - dit_step35_xt 0.964745 - dit_step40_vt_cond 0.898448 - dit_step40_vt 0.843064 - dit_step40_xt 0.954421 - dit_step45_vt_cond 0.908747 - dit_step45_vt 0.865504 - dit_step45_xt 0.947533 - dit_step49_vt_cond 0.927312 - dit_step49_vt 0.885368 - dit_x0 0.945292 - vae_audio 0.825801 - vae_audio (log spectral) 0.999459 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999994 0.035677 0.002825 -0.001840 0.980345 -0.001741 0.980402 - dit_step5_xt 0.999822 0.191921 0.012992 -0.007283 0.890515 -0.007143 0.887999 - dit_step10_xt 0.999206 0.526469 0.024282 -0.012946 0.812557 -0.012603 0.811299 - dit_step15_xt 0.997453 0.836399 0.039177 -0.018559 0.748243 -0.018114 0.745269 - dit_step20_xt 0.993411 1.152330 
0.058726 -0.024275 0.703300 -0.023808 0.699582 - dit_step25_xt 0.986279 1.542745 0.081991 -0.030177 0.682229 -0.029311 0.679278 - dit_step30_xt 0.976117 1.915049 0.109049 -0.036245 0.688533 -0.035027 0.685262 - dit_step35_xt 0.964744 2.242426 0.138946 -0.042318 0.720837 -0.040716 0.717196 - dit_step40_xt 0.954421 2.562076 0.170565 -0.048389 0.775001 -0.046462 0.771853 - dit_step45_xt 0.947532 2.889421 0.200672 -0.054787 0.846930 -0.052475 0.843036 diff --git a/tests/Metal_Q5_K_M.log b/tests/Metal_Q5_K_M.log deleted file mode 100644 index a25afc6..0000000 --- a/tests/Metal_Q5_K_M.log +++ /dev/null @@ -1,835 +0,0 @@ -ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices -ggml_metal_library_init: using embedded metal library -ggml_metal_library_init: loaded in 0.007 sec -ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s) -ggml_metal_device_init: GPU name: MTL0 -ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) -ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) -ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) -ggml_metal_device_init: simdgroup reduction = true -ggml_metal_device_init: simdgroup matrix mul. 
= true -ggml_metal_device_init: has unified memory = true -ggml_metal_device_init: has bfloat = true -ggml_metal_device_init: has tensor = false -ggml_metal_device_init: use residency sets = true -ggml_metal_device_init: use shared buffers = true -ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] DiT backend: MTL0 (CPU threads: 5) -[Load] Backend init: 21.7 ms -[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 -[DiT] Self-attn: Q+K fused, V separate -[DiT] Cross-attn: all separate -[DiT] MLP: gate+up fused -[Load] null_condition_emb found (CFG available) -[WeightCtx] Loaded 478 tensors, 1061.2 MB into backend -[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 1538.7 ms -[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 -[Load] silence_latent: [15000, 64] from GGUF -[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] VAE backend: MTL0 (CPU threads: 5) -[VAE] Backend: MTL0, Weight buffer: 255.7 MB -[VAE] Loaded: 5 blocks, upsample=1920x -[Load] VAE weights: 275.7 ms -[Request 1/1] ggml-turbo/request0.json (batch=1) -[Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) -[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s -[Pipeline] 434 audio codes (86.8s @ 5Hz) -[Pipeline] T=2170, S=1085 -[BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE 
tokenizer: 42.1 ms -[Pipeline] caption: 70 tokens, lyrics: 167 tokens -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] TextEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 28L, H=1024, Nh=16/8 -[Load] TextEncoder: 230.3 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' -ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x11cf0b930 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x11cf0bd90 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x11cf0c600 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' -ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x11cf0ca80 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x11cf0d2f0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' 
-ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x11cf0d920 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x11cf0e200 | th_max = 576 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x11cf0e610 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' -ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x11cf0e870 | th_max = 1024 | th_width = 32 -[Encode] TextEncoder (70 tokens): 44.0 ms -[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 33.7 ms -[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] CondEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 -[WeightCtx] Loaded 140 tensors, 412.5 MB into backend -[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 572.5 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 
'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1 0x11ce0c140 | th_max = 896 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q5_K_f32', name = 'kernel_mul_mm_q5_K_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q5_K_f32_bci=0_bco=1 0x11cf17e80 | th_max = 832 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x11cf18860 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x11cf18ec0 | th_max = 640 | th_width = 32 -[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 158.6 ms, enc_S=238 -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Debug] enc_hidden: [238, 2048] first4: 1.751803 -0.051174 -0.133188 0.058982 -[GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 -[WeightCtx] Loaded 30 tensors, 73.2 MB into backend -[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 113.6 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x11ce0cf30 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 
'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x11ce0d840 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2 0x11ce0ddf0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' -ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x11ce0e050 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q5_K_f32_r1_5', name = 'kernel_mul_mv_ext_q5_K_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q5_K_f32_r1_5_nsg=2_nxpsg=8 0x11ce0ea30 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8 0x11ce0efe0 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x11ce0f360 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x11ce0f5c0 | th_max = 448 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 
'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x11ce10000 | th_max = 1024 | th_width = 32 -[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 1065.0 ms -[Debug] detok_output: [2170, 64] first4: -0.124883 1.453879 0.292856 -0.646204 -[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 -[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 -[Debug] context: [2170, 128] first4: -0.124883 1.453879 0.292856 -0.646204 -[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 -[DiT] Batch N=1, T=2170, S=1085, enc_S=238 -[DiT] Graph: 1775 nodes -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x11cf09240 | th_max = 832 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' -ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x11cf19120 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' -ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x11cf19380 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q5_K_f32', name = 'kernel_mul_mv_q5_K_f32_nsg=2' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q5_K_f32_nsg=2 0x11cf1a3c0 | th_max = 576 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x11cf1a740 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling 
pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x11cf1ae00 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x11cf1b060 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x11cf1b600 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x11cf1bb90 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x11cf1c5b0 | th_max = 640 | th_width = 32 -[Debug] tproj: [12288] first4: 0.260409 -0.161609 -0.102203 0.051602 -[Debug] temb: [2048] first4: -0.000151 -0.132293 -0.035516 0.064751 -[Debug] temb_t: [2048] first4: 0.000578 0.026708 -0.052786 0.063514 -[Debug] temb_r: [2048] first4: -0.000729 -0.159001 0.017269 0.001237 -[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 -[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 -[Debug] temb_lin1_t: [2048] first4: -0.051153 -0.053631 -0.012192 -0.039024 -[Debug] temb_lin1_r: [2048] first4: -0.016165 -0.021121 
-0.015801 -0.000525 -[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.044511 -0.951831 0.540187 0.457322 -[Debug] proj_in_input: [192, 2170] first4: -0.124883 1.453879 0.292856 -0.646204 -[Debug] enc_after_cond_emb: [2048, 238] first4: -0.153168 0.787275 0.319340 -0.492001 -[Debug] layer0_sa_input: [2048, 1085] first4: -0.722961 -0.753736 -0.051927 0.265661 -[Debug] layer0_q_after_rope: [128, 16] first4: -12.602057 0.798570 1.518488 1.778495 -[Debug] layer0_k_after_rope: [128, 8] first4: -0.153168 0.787275 0.319340 -0.492001 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.498292 0.150378 -0.398807 0.484326 -[Debug] layer0_attn_out: [2048, 1085] first4: -12.773369 1.105118 1.773309 1.768943 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.542001 -1.018193 0.152304 0.468235 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.605642 -0.786551 -0.346129 0.499558 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.171107 0.593998 51.751106 -0.883031 -[Debug] hidden_after_layer6: [2048, 1085] first4: -20.936150 0.582827 29.989494 -4.872031 -[Debug] hidden_after_layer12: [2048, 1085] first4: -18.277052 -17.088211 71.559052 24.992846 -[Debug] hidden_after_layer18: [2048, 1085] first4: -25.915581 10.692349 65.928192 19.066517 -[Debug] hidden_after_layer23: [2048, 1085] first4: -6.799564 38.425339 203.467468 153.140854 -[Debug] dit_step0_vt: [2170, 64] first4: 0.015160 1.163890 0.353989 2.352075 -[Debug] dit_step0_xt: [2170, 64] first4: 0.193647 2.103346 -0.187965 0.740744 -[DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: -0.238755 1.372093 -0.135596 1.879695 -[Debug] dit_step1_xt: [2170, 64] first4: 0.206670 2.028504 -0.180569 0.638215 -[DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: -0.034453 1.243445 0.102498 2.382742 -[Debug] dit_step2_xt: [2170, 64] first4: 0.208967 1.945608 -0.187402 0.479365 -[DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: 0.286766 1.110088 0.263285 2.616079 -[Debug] 
dit_step3_xt: [2170, 64] first4: 0.185070 1.853101 -0.209343 0.261359 -[DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.359540 0.909711 0.077998 2.701789 -[Debug] dit_step4_xt: [2170, 64] first4: 0.146547 1.755632 -0.217700 -0.028118 -[DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.361413 0.800675 -0.393300 2.731152 -[Debug] dit_step5_xt: [2170, 64] first4: 0.094917 1.641250 -0.161514 -0.418283 -[DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: 0.247442 0.617176 -0.960503 2.789753 -[Debug] dit_step6_xt: [2170, 64] first4: 0.045429 1.517814 0.030587 -0.976234 -[DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: 0.019480 0.316869 -1.427597 3.153955 -[Debug] dit_x0: [2170, 64] first4: 0.039585 1.422753 0.458866 -1.922420 -[DiT] step 8/8 t=0.300 -[DiT] Total generation: 8546.5 ms (8546.5 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.039585 1.422753 0.458866 -1.922420 -[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 474 nodes, T_latent=192 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' -ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x11ce0b610 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x11ce10380 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x11ce10870 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: 
loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x11ce10ad0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x11ce10e80 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x11ce11660 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32 0x11ce11bc0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x11ce12b20 | th_max = 1024 | th_width = 32 -[VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 474 nodes, T_latent=256 -[VAE] Graph: 474 nodes, T_latent=186 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x11ce12d80 | th_max = 896 | th_width = 32 -[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 609611.2 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000665 0.001184 0.001013 0.001406 -[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Request 1/1] Done -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Pipeline] All done -ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices -ggml_metal_library_init: using embedded metal library -ggml_metal_library_init: loaded in 0.006 sec -ggml_metal_rsets_init: 
creating a residency set collection (keep_alive = 180 s) -ggml_metal_device_init: GPU name: MTL0 -ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) -ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) -ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) -ggml_metal_device_init: simdgroup reduction = true -ggml_metal_device_init: simdgroup matrix mul. = true -ggml_metal_device_init: has unified memory = true -ggml_metal_device_init: has bfloat = true -ggml_metal_device_init: has tensor = false -ggml_metal_device_init: use residency sets = true -ggml_metal_device_init: use shared buffers = true -ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] DiT backend: MTL0 (CPU threads: 5) -[Load] Backend init: 21.5 ms -[GGUF] ../models/acestep-v15-sft-Q5_K_M.gguf: 678 tensors, data at offset 56800 -[DiT] Self-attn: Q+K fused, V separate -[DiT] Cross-attn: all separate -[DiT] MLP: gate+up fused -[Load] null_condition_emb found (CFG available) -[WeightCtx] Loaded 478 tensors, 1061.2 MB into backend -[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 1513.9 ms -[GGUF] ../models/acestep-v15-sft-Q5_K_M.gguf: 678 tensors, data at offset 56800 -[Load] silence_latent: [15000, 64] from GGUF -[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] VAE backend: MTL0 (CPU threads: 5) -[VAE] Backend: MTL0, Weight buffer: 255.7 MB -[VAE] Loaded: 5 blocks, upsample=1920x -[Load] VAE weights: 272.4 ms 
-[Request 1/1] ggml-sft/request0.json (batch=1) -[Request] parsed ggml-sft/request0.json (18 fields) -[Pipeline] seed=42, steps=50, guidance=7.0, shift=1.0, duration=88.0s -[Pipeline] 434 audio codes (86.8s @ 5Hz) -[Pipeline] T=2170, S=1085 -[BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 42.2 ms -[Pipeline] caption: 70 tokens, lyrics: 167 tokens -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] TextEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 28L, H=1024, Nh=16/8 -[Load] TextEncoder: 225.9 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' -ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x12de0dc30 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x12de0e090 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x12de0e900 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' -ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x12de0ed80 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 
'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x12de0f5f0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x12de0fc20 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x12de10500 | th_max = 576 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x12de10910 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' -ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x12de10b70 | th_max = 1024 | th_width = 32 -[Encode] TextEncoder (70 tokens): 49.1 ms -[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 42.5 ms -[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] CondEncoder backend: MTL0 
(CPU threads: 5) -[GGUF] ../models/acestep-v15-sft-Q5_K_M.gguf: 678 tensors, data at offset 56800 -[WeightCtx] Loaded 140 tensors, 412.5 MB into backend -[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 760.1 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1 0x103e0be70 | th_max = 896 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q5_K_f32', name = 'kernel_mul_mm_q5_K_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q5_K_f32_bci=0_bco=1 0x103e0c420 | th_max = 832 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x103e0c860 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x103e0cfd0 | th_max = 640 | th_width = 32 -[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 162.8 ms, enc_S=238 -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Debug] enc_hidden: [238, 2048] first4: 1.751803 -0.051174 -0.133188 0.058982 -[GGUF] ../models/acestep-v15-sft-Q5_K_M.gguf: 678 tensors, data at offset 56800 -[WeightCtx] Loaded 30 tensors, 73.2 MB into backend -[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 115.2 ms -ggml_metal_library_compile_pipeline: compiling pipeline: 
base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x103f05ca0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x103f065b0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2 0x103f06b60 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' -ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x103f06dc0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q5_K_f32_r1_5', name = 'kernel_mul_mv_ext_q5_K_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q5_K_f32_r1_5_nsg=2_nxpsg=8 0x103f07ca0 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8 0x103f08250 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x103f085d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 
'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x103f08830 | th_max = 448 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x103f091d0 | th_max = 1024 | th_width = 32 -[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 1055.5 ms -[Debug] detok_output: [2170, 64] first4: -0.124883 1.453879 0.292856 -0.646204 -[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 -[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 -[Debug] context: [2170, 128] first4: -0.124883 1.453879 0.292856 -0.646204 -[DiT] Starting: T=2170, S=1085, enc_S=238, steps=50, batch=1 -[DiT] Batch N=1, T=2170, S=1085, enc_S=238 -[DiT] Graph: 1775 nodes -[Debug] null_condition_emb: [2048] first4: 0.018066 -0.000360 0.005096 -0.000683 -[Debug] null_enc_hidden: [238, 2048] first4: 0.018066 -0.000360 0.005096 -0.000683 -[DiT] CFG enabled: guidance_scale=7.0, 2x forward per step, N=1 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x103e0dfd0 | th_max = 832 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' -ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x103e0e530 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' 
-ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x103e0e940 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q5_K_f32', name = 'kernel_mul_mv_q5_K_f32_nsg=2' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q5_K_f32_nsg=2 0x103e0f980 | th_max = 576 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x103e0fd00 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x103e103c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x103e10620 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x103e10bc0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x103e11150 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded 
kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x103e11b70 | th_max = 640 | th_width = 32 -[Debug] tproj: [12288] first4: 0.155137 -0.118305 -0.084248 0.082623 -[Debug] temb: [2048] first4: -0.002843 -0.176820 0.004745 -0.001924 -[Debug] temb_t: [2048] first4: -0.001351 0.003023 -0.012552 -0.001712 -[Debug] temb_r: [2048] first4: -0.001491 -0.179843 0.017298 -0.000212 -[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 -[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 -[Debug] temb_lin1_t: [2048] first4: -0.034754 0.028817 0.027120 -0.031729 -[Debug] temb_lin1_r: [2048] first4: -0.002680 0.004202 0.000655 -0.002088 -[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.095122 -0.908724 0.502637 0.450925 -[Debug] proj_in_input: [192, 2170] first4: -0.124883 1.453879 0.292856 -0.646204 -[Debug] enc_after_cond_emb: [2048, 238] first4: -0.173051 0.876023 0.351566 -0.532545 -[Debug] layer0_sa_input: [2048, 1085] first4: -0.920384 -0.707757 -0.034391 0.299813 -[Debug] layer0_q_after_rope: [128, 16] first4: -12.596228 0.535827 1.482060 1.773901 -[Debug] layer0_k_after_rope: [128, 8] first4: -0.173051 0.876023 0.351566 -0.532545 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.637092 0.777700 -0.593592 0.534410 -[Debug] layer0_attn_out: [2048, 1085] first4: -12.115236 1.011131 1.711030 1.787191 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.803575 -1.350455 -0.166552 0.391822 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.983645 -0.994904 -0.442195 0.398258 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.639871 1.314413 59.265984 -1.678902 -[Debug] hidden_after_layer6: [2048, 1085] first4: -15.443645 3.665146 59.001129 -0.430717 -[Debug] hidden_after_layer12: [2048, 1085] first4: -13.175318 0.713974 -24.768734 -0.709223 -[Debug] hidden_after_layer18: [2048, 1085] first4: -5.504215 14.850023 -44.686668 -9.688757 -[Debug] hidden_after_layer23: 
[2048, 1085] first4: 32.081551 63.384781 49.094582 -14.152830 -[Debug] dit_step0_vt_cond: [2170, 64] first4: -0.659668 2.541722 -0.175072 1.431705 -[Debug] dit_step0_vt_uncond: [2170, 64] first4: -0.363007 2.087823 -0.714051 1.721254 -[Debug] dit_step0_vt: [2170, 64] first4: -0.859789 3.013237 0.095956 1.039126 -[Debug] dit_step0_xt: [2170, 64] first4: 0.211532 2.095985 -0.173794 0.826874 -[DiT] step 1/50 t=1.000 -[Debug] dit_step1_vt_cond: [2170, 64] first4: -0.721169 2.535385 -0.028817 1.265576 -[Debug] dit_step1_vt_uncond: [2170, 64] first4: -0.587318 2.330142 -0.122353 1.467132 -[Debug] dit_step1_vt: [2170, 64] first4: -0.560584 2.181154 -0.231214 1.319647 -[Debug] dit_step1_xt: [2170, 64] first4: 0.222743 2.052362 -0.169170 0.800481 -[DiT] step 2/50 t=0.980 -[Debug] dit_step2_vt_cond: [2170, 64] first4: -0.714072 2.499362 0.020078 1.278304 -[Debug] dit_step2_vt_uncond: [2170, 64] first4: -0.600958 2.309783 -0.102115 1.400073 -[Debug] dit_step2_vt: [2170, 64] first4: -0.861209 2.897427 0.258299 0.998091 -[Debug] dit_step2_xt: [2170, 64] first4: 0.239968 1.994414 -0.174336 0.780519 -[DiT] step 3/50 t=0.960 -[Debug] dit_step3_vt_cond: [2170, 64] first4: -0.674846 2.427423 0.055491 1.297147 -[Debug] dit_step3_vt_uncond: [2170, 64] first4: -0.635843 2.332050 -0.021028 1.341071 -[Debug] dit_step3_vt: [2170, 64] first4: -0.498860 2.133466 -0.141608 1.438968 -[Debug] dit_step3_xt: [2170, 64] first4: 0.249945 1.951744 -0.171504 0.751740 -[DiT] step 4/50 t=0.940 -[Debug] dit_step4_vt_cond: [2170, 64] first4: -0.605752 2.344337 0.051579 1.306835 -[Debug] dit_step4_vt_uncond: [2170, 64] first4: -0.617271 2.293072 -0.026830 1.327208 -[Debug] dit_step4_vt: [2170, 64] first4: -0.645479 2.581793 0.280999 1.082338 -[Debug] dit_step4_xt: [2170, 64] first4: 0.262854 1.900108 -0.177124 0.730093 -[DiT] step 5/50 t=0.920 -[Debug] dit_step5_vt_cond: [2170, 64] first4: -0.529691 2.265976 0.024912 1.308485 -[Debug] dit_step5_vt_uncond: [2170, 64] first4: -0.613722 2.254884 0.007785 
1.353816 -[Debug] dit_step5_vt: [2170, 64] first4: -0.220550 1.958779 -0.197940 1.314650 -[Debug] dit_step5_xt: [2170, 64] first4: 0.267265 1.860933 -0.173165 0.703800 -[DiT] step 6/50 t=0.900 -[Debug] dit_step6_vt_cond: [2170, 64] first4: -0.492980 2.220784 0.013978 1.314708 -[Debug] dit_step6_vt_uncond: [2170, 64] first4: -0.574326 2.194118 0.017516 1.377784 -[Debug] dit_step6_vt: [2170, 64] first4: -0.484828 2.424557 0.188767 1.065213 -[Debug] dit_step6_xt: [2170, 64] first4: 0.276962 1.812442 -0.176940 0.682496 -[DiT] step 7/50 t=0.880 -[Debug] dit_step7_vt_cond: [2170, 64] first4: -0.442502 2.140999 -0.004548 1.307788 -[Debug] dit_step7_vt_uncond: [2170, 64] first4: -0.527369 2.112375 0.025084 1.375065 -[Debug] dit_step7_vt: [2170, 64] first4: -0.162851 1.943518 -0.225018 1.235321 -[Debug] dit_step7_xt: [2170, 64] first4: 0.280219 1.773571 -0.172440 0.657789 -[DiT] step 8/50 t=0.860 -[Debug] dit_step8_vt_cond: [2170, 64] first4: -0.402943 2.052812 -0.006712 1.290008 -[Debug] dit_step8_vt_uncond: [2170, 64] first4: -0.459439 2.025463 0.030814 1.342298 -[Debug] dit_step8_vt: [2170, 64] first4: -0.414733 2.160834 0.094455 1.088282 -[Debug] dit_step8_xt: [2170, 64] first4: 0.288514 1.730355 -0.174329 0.636023 -[DiT] step 9/50 t=0.840 -[Debug] dit_step9_vt_cond: [2170, 64] first4: -0.369760 1.969441 -0.010690 1.271576 -[Debug] dit_step9_vt_uncond: [2170, 64] first4: -0.396671 1.951135 0.030780 1.310076 -[Debug] dit_step9_vt: [2170, 64] first4: -0.235488 1.803751 -0.198291 1.206838 -[Debug] dit_step9_xt: [2170, 64] first4: 0.293223 1.694280 -0.170363 0.611887 -[DiT] step 10/50 t=0.820 -[Debug] dit_step10_vt_cond: [2170, 64] first4: -0.344175 1.892885 0.003662 1.257559 -[Debug] dit_step10_vt_uncond: [2170, 64] first4: -0.342085 1.891713 0.047752 1.301667 -[Debug] dit_step10_vt: [2170, 64] first4: -0.420278 1.858245 0.037965 1.052360 -[Debug] dit_step10_xt: [2170, 64] first4: 0.301629 1.657115 -0.171122 0.590840 -[DiT] step 11/50 t=0.800 -[Debug] dit_step11_vt_cond: 
[2170, 64] first4: -0.321399 1.822005 0.005853 1.241717 -[Debug] dit_step11_vt_uncond: [2170, 64] first4: -0.313690 1.840958 0.057351 1.308882 -[Debug] dit_step11_vt: [2170, 64] first4: -0.256015 1.588119 -0.173446 1.051672 -[Debug] dit_step11_xt: [2170, 64] first4: 0.306749 1.625353 -0.167653 0.569806 -[DiT] step 12/50 t=0.780 -[Debug] dit_step12_vt_cond: [2170, 64] first4: -0.300376 1.758913 0.017258 1.234417 -[Debug] dit_step12_vt_uncond: [2170, 64] first4: -0.284777 1.790303 0.055927 1.316604 -[Debug] dit_step12_vt: [2170, 64] first4: -0.382294 1.631086 0.039774 0.961382 -[Debug] dit_step12_xt: [2170, 64] first4: 0.314395 1.592731 -0.168449 0.550578 -[DiT] step 13/50 t=0.760 -[Debug] dit_step13_vt_cond: [2170, 64] first4: -0.282512 1.699357 0.023821 1.222209 -[Debug] dit_step13_vt_uncond: [2170, 64] first4: -0.256755 1.739260 0.045941 1.313994 -[Debug] dit_step13_vt: [2170, 64] first4: -0.292858 1.451734 -0.071923 0.964463 -[Debug] dit_step13_xt: [2170, 64] first4: 0.320252 1.563696 -0.167011 0.531289 -[DiT] step 14/50 t=0.740 -[Debug] dit_step14_vt_cond: [2170, 64] first4: -0.268355 1.643249 0.035884 1.219110 -[Debug] dit_step14_vt_uncond: [2170, 64] first4: -0.234205 1.685310 0.038125 1.306284 -[Debug] dit_step14_vt: [2170, 64] first4: -0.380481 1.484198 0.105917 0.956716 -[Debug] dit_step14_xt: [2170, 64] first4: 0.327862 1.534012 -0.169129 0.512155 -[DiT] step 15/50 t=0.720 -[Debug] dit_step15_vt_cond: [2170, 64] first4: -0.257674 1.591244 0.046474 1.215229 -[Debug] dit_step15_vt_uncond: [2170, 64] first4: -0.215398 1.632289 0.020497 1.288409 -[Debug] dit_step15_vt: [2170, 64] first4: -0.335985 1.372340 0.098681 1.003970 -[Debug] dit_step15_xt: [2170, 64] first4: 0.334582 1.506565 -0.171102 0.492076 -[DiT] step 16/50 t=0.700 -[Debug] dit_step16_vt_cond: [2170, 64] first4: -0.250808 1.534114 0.052932 1.211927 -[Debug] dit_step16_vt_uncond: [2170, 64] first4: -0.205743 1.580690 0.005785 1.270423 -[Debug] dit_step16_vt: [2170, 64] first4: -0.382949 1.342146 
0.222679 1.022127 -[Debug] dit_step16_xt: [2170, 64] first4: 0.342241 1.479722 -0.175556 0.471633 -[DiT] step 17/50 t=0.680 -[Debug] dit_step17_vt_cond: [2170, 64] first4: -0.246631 1.471787 0.045976 1.197702 -[Debug] dit_step17_vt_uncond: [2170, 64] first4: -0.202947 1.526995 -0.016021 1.248045 -[Debug] dit_step17_vt: [2170, 64] first4: -0.343461 1.218771 0.212458 1.019693 -[Debug] dit_step17_xt: [2170, 64] first4: 0.349110 1.455347 -0.179805 0.451239 -[DiT] step 18/50 t=0.660 -[Debug] dit_step18_vt_cond: [2170, 64] first4: -0.243157 1.409443 0.036330 1.184456 -[Debug] dit_step18_vt_uncond: [2170, 64] first4: -0.204529 1.477009 -0.037415 1.231383 -[Debug] dit_step18_vt: [2170, 64] first4: -0.354021 1.150632 0.298290 1.001465 -[Debug] dit_step18_xt: [2170, 64] first4: 0.356190 1.432334 -0.185771 0.431210 -[DiT] step 19/50 t=0.640 -[Debug] dit_step19_vt_cond: [2170, 64] first4: -0.237715 1.343371 0.017683 1.161345 -[Debug] dit_step19_vt_uncond: [2170, 64] first4: -0.204017 1.427205 -0.062926 1.210816 -[Debug] dit_step19_vt: [2170, 64] first4: -0.318427 0.994659 0.274882 0.959663 -[Debug] dit_step19_xt: [2170, 64] first4: 0.362559 1.412441 -0.191269 0.412017 -[DiT] step 20/50 t=0.620 -[Debug] dit_step20_vt_cond: [2170, 64] first4: -0.232230 1.263409 -0.001007 1.136674 -[Debug] dit_step20_vt_uncond: [2170, 64] first4: -0.200904 1.370160 -0.090913 1.187652 -[Debug] dit_step20_vt: [2170, 64] first4: -0.327181 0.859238 0.330215 0.939772 -[Debug] dit_step20_xt: [2170, 64] first4: 0.369102 1.395257 -0.197873 0.393221 -[DiT] step 21/50 t=0.600 -[Debug] dit_step21_vt_cond: [2170, 64] first4: -0.230263 1.180950 -0.026976 1.107196 -[Debug] dit_step21_vt_uncond: [2170, 64] first4: -0.199895 1.312066 -0.122788 1.160427 -[Debug] dit_step21_vt: [2170, 64] first4: -0.309986 0.682107 0.288212 0.884258 -[Debug] dit_step21_xt: [2170, 64] first4: 0.375302 1.381614 -0.203637 0.375536 -[DiT] step 22/50 t=0.580 -[Debug] dit_step22_vt_cond: [2170, 64] first4: -0.226975 1.087024 -0.049604 
1.072087 -[Debug] dit_step22_vt_uncond: [2170, 64] first4: -0.197016 1.244136 -0.149074 1.129456 -[Debug] dit_step22_vt: [2170, 64] first4: -0.315666 0.502494 0.302365 0.836432 -[Debug] dit_step22_xt: [2170, 64] first4: 0.381615 1.371564 -0.209685 0.358807 -[DiT] step 23/50 t=0.560 -[Debug] dit_step23_vt_cond: [2170, 64] first4: -0.229945 0.987764 -0.068857 1.041486 -[Debug] dit_step23_vt_uncond: [2170, 64] first4: -0.196528 1.172394 -0.166018 1.101852 -[Debug] dit_step23_vt: [2170, 64] first4: -0.331807 0.301117 0.239278 0.795295 -[Debug] dit_step23_xt: [2170, 64] first4: 0.388252 1.365542 -0.214470 0.342901 -[DiT] step 24/50 t=0.540 -[Debug] dit_step24_vt_cond: [2170, 64] first4: -0.234566 0.878375 -0.086414 0.999502 -[Debug] dit_step24_vt_uncond: [2170, 64] first4: -0.196025 1.088575 -0.178235 1.066777 -[Debug] dit_step24_vt: [2170, 64] first4: -0.353102 0.129063 0.219470 0.719129 -[Debug] dit_step24_xt: [2170, 64] first4: 0.395314 1.362961 -0.218859 0.328519 -[DiT] step 25/50 t=0.520 -[Debug] dit_step25_vt_cond: [2170, 64] first4: -0.244918 0.762549 -0.099009 0.962606 -[Debug] dit_step25_vt_uncond: [2170, 64] first4: -0.200310 0.995220 -0.184174 1.033471 -[Debug] dit_step25_vt: [2170, 64] first4: -0.384523 -0.079339 0.165061 0.686562 -[Debug] dit_step25_xt: [2170, 64] first4: 0.403004 1.364548 -0.222161 0.314788 -[DiT] step 26/50 t=0.500 -[Debug] dit_step26_vt_cond: [2170, 64] first4: -0.256133 0.634451 -0.110018 0.919318 -[Debug] dit_step26_vt_uncond: [2170, 64] first4: -0.204624 0.887617 -0.187237 0.997615 -[Debug] dit_step26_vt: [2170, 64] first4: -0.416561 -0.257153 0.136664 0.594071 -[Debug] dit_step26_xt: [2170, 64] first4: 0.411335 1.369691 -0.224894 0.302906 -[DiT] step 27/50 t=0.480 -[Debug] dit_step27_vt_cond: [2170, 64] first4: -0.269545 0.510911 -0.116178 0.879919 -[Debug] dit_step27_vt_uncond: [2170, 64] first4: -0.210466 0.778243 -0.183774 0.961990 -[Debug] dit_step27_vt: [2170, 64] first4: -0.454081 -0.397413 0.085143 0.568775 -[Debug] 
dit_step27_xt: [2170, 64] first4: 0.420417 1.377639 -0.226597 0.291531 -[DiT] step 28/50 t=0.460 -[Debug] dit_step28_vt_cond: [2170, 64] first4: -0.282172 0.372450 -0.120048 0.831178 -[Debug] dit_step28_vt_uncond: [2170, 64] first4: -0.216011 0.655762 -0.179339 0.918627 -[Debug] dit_step28_vt: [2170, 64] first4: -0.483525 -0.575904 0.063843 0.478002 -[Debug] dit_step28_xt: [2170, 64] first4: 0.430087 1.389157 -0.227874 0.281971 -[DiT] step 29/50 t=0.440 -[Debug] dit_step29_vt_cond: [2170, 64] first4: -0.293168 0.231214 -0.124586 0.781733 -[Debug] dit_step29_vt_uncond: [2170, 64] first4: -0.219691 0.525349 -0.170166 0.873745 -[Debug] dit_step29_vt: [2170, 64] first4: -0.518206 -0.710899 -0.005456 0.435657 -[Debug] dit_step29_xt: [2170, 64] first4: 0.440451 1.403375 -0.227765 0.273257 -[DiT] step 30/50 t=0.420 -[Debug] dit_step30_vt_cond: [2170, 64] first4: -0.302506 0.086124 -0.124549 0.731757 -[Debug] dit_step30_vt_uncond: [2170, 64] first4: -0.221950 0.385721 -0.158065 0.830404 -[Debug] dit_step30_vt: [2170, 64] first4: -0.544895 -0.854978 -0.023964 0.343625 -[Debug] dit_step30_xt: [2170, 64] first4: 0.451349 1.420475 -0.227285 0.266385 -[DiT] step 31/50 t=0.400 -[Debug] dit_step31_vt_cond: [2170, 64] first4: -0.310194 -0.052540 -0.126955 0.680247 -[Debug] dit_step31_vt_uncond: [2170, 64] first4: -0.222645 0.246419 -0.151881 0.786437 -[Debug] dit_step31_vt: [2170, 64] first4: -0.574628 -0.958966 -0.058730 0.282347 -[Debug] dit_step31_xt: [2170, 64] first4: 0.462842 1.439654 -0.226111 0.260738 -[DiT] step 32/50 t=0.380 -[Debug] dit_step32_vt_cond: [2170, 64] first4: -0.314911 -0.194979 -0.126385 0.629081 -[Debug] dit_step32_vt_uncond: [2170, 64] first4: -0.220352 0.102109 -0.144540 0.743076 -[Debug] dit_step32_vt: [2170, 64] first4: -0.598710 -1.085776 -0.069698 0.196720 -[Debug] dit_step32_xt: [2170, 64] first4: 0.474816 1.461369 -0.224717 0.256804 -[DiT] step 33/50 t=0.360 -[Debug] dit_step33_vt_cond: [2170, 64] first4: -0.318386 -0.334863 -0.120002 0.581452 
-[Debug] dit_step33_vt_uncond: [2170, 64] first4: -0.218793 -0.045891 -0.135709 0.701808 -[Debug] dit_step33_vt: [2170, 64] first4: -0.615695 -1.169131 -0.067985 0.140436 -[Debug] dit_step33_xt: [2170, 64] first4: 0.487130 1.484752 -0.223357 0.253995 -[DiT] step 34/50 t=0.340 -[Debug] dit_step34_vt_cond: [2170, 64] first4: -0.319302 -0.465754 -0.110709 0.538110 -[Debug] dit_step34_vt_uncond: [2170, 64] first4: -0.211723 -0.189661 -0.128844 0.664668 -[Debug] dit_step34_vt: [2170, 64] first4: -0.653452 -1.253376 -0.044227 0.068009 -[Debug] dit_step34_xt: [2170, 64] first4: 0.500199 1.509819 -0.222472 0.252635 -[DiT] step 35/50 t=0.320 -[Debug] dit_step35_vt_cond: [2170, 64] first4: -0.319083 -0.598486 -0.097674 0.492459 -[Debug] dit_step35_vt_uncond: [2170, 64] first4: -0.205768 -0.335506 -0.119003 0.623259 -[Debug] dit_step35_vt: [2170, 64] first4: -0.661011 -1.330961 -0.027972 0.020086 -[Debug] dit_step35_xt: [2170, 64] first4: 0.513419 1.536439 -0.221913 0.252233 -[DiT] step 36/50 t=0.300 -[Debug] dit_step36_vt_cond: [2170, 64] first4: -0.318070 -0.730204 -0.080886 0.449659 -[Debug] dit_step36_vt_uncond: [2170, 64] first4: -0.199361 -0.482989 -0.108305 0.581833 -[Debug] dit_step36_vt: [2170, 64] first4: -0.690251 -1.401439 0.013562 -0.032488 -[Debug] dit_step36_xt: [2170, 64] first4: 0.527224 1.564467 -0.222184 0.252883 -[DiT] step 37/50 t=0.280 -[Debug] dit_step37_vt_cond: [2170, 64] first4: -0.316356 -0.845106 -0.054751 0.408449 -[Debug] dit_step37_vt_uncond: [2170, 64] first4: -0.194137 -0.614114 -0.090670 0.540072 -[Debug] dit_step37_vt: [2170, 64] first4: -0.694876 -1.462489 0.059738 -0.070205 -[Debug] dit_step37_xt: [2170, 64] first4: 0.541122 1.593717 -0.223379 0.254287 -[DiT] step 38/50 t=0.260 -[Debug] dit_step38_vt_cond: [2170, 64] first4: -0.317038 -0.954626 -0.027545 0.366891 -[Debug] dit_step38_vt_uncond: [2170, 64] first4: -0.190420 -0.739252 -0.071602 0.495985 -[Debug] dit_step38_vt: [2170, 64] first4: -0.728120 -1.522043 0.107842 -0.107965 -[Debug] 
dit_step38_xt: [2170, 64] first4: 0.555684 1.624158 -0.225536 0.256446 -[DiT] step 39/50 t=0.240 -[Debug] dit_step39_vt_cond: [2170, 64] first4: -0.317547 -1.061431 0.009151 0.320700 -[Debug] dit_step39_vt_uncond: [2170, 64] first4: -0.189703 -0.862508 -0.047778 0.446086 -[Debug] dit_step39_vt: [2170, 64] first4: -0.718521 -1.567608 0.187925 -0.137347 -[Debug] dit_step39_xt: [2170, 64] first4: 0.570055 1.655510 -0.229294 0.259193 -[DiT] step 40/50 t=0.220 -[Debug] dit_step40_vt_cond: [2170, 64] first4: -0.321349 -1.147692 0.054264 0.273095 -[Debug] dit_step40_vt_uncond: [2170, 64] first4: -0.194555 -0.963666 -0.014199 0.392062 -[Debug] dit_step40_vt: [2170, 64] first4: -0.725549 -1.599213 0.254155 -0.152648 -[Debug] dit_step40_xt: [2170, 64] first4: 0.584566 1.687495 -0.234377 0.262246 -[DiT] step 41/50 t=0.200 -[Debug] dit_step41_vt_cond: [2170, 64] first4: -0.323505 -1.229047 0.104885 0.223181 -[Debug] dit_step41_vt_uncond: [2170, 64] first4: -0.200140 -1.061003 0.024296 0.333073 -[Debug] dit_step41_vt: [2170, 64] first4: -0.706612 -1.620246 0.347929 -0.161344 -[Debug] dit_step41_xt: [2170, 64] first4: 0.598698 1.719899 -0.241336 0.265473 -[DiT] step 42/50 t=0.180 -[Debug] dit_step42_vt_cond: [2170, 64] first4: -0.325457 -1.305037 0.162371 0.163510 -[Debug] dit_step42_vt_uncond: [2170, 64] first4: -0.210729 -1.152184 0.073317 0.265414 -[Debug] dit_step42_vt: [2170, 64] first4: -0.675503 -1.653079 0.417194 -0.191713 -[Debug] dit_step42_xt: [2170, 64] first4: 0.612208 1.752961 -0.249680 0.269307 -[DiT] step 43/50 t=0.160 -[Debug] dit_step43_vt_cond: [2170, 64] first4: -0.327040 -1.367895 0.222307 0.103005 -[Debug] dit_step43_vt_uncond: [2170, 64] first4: -0.223734 -1.229896 0.124878 0.195856 -[Debug] dit_step43_vt: [2170, 64] first4: -0.637198 -1.669221 0.519679 -0.207779 -[Debug] dit_step43_xt: [2170, 64] first4: 0.624952 1.786345 -0.260074 0.273463 -[DiT] step 44/50 t=0.140 -[Debug] dit_step44_vt_cond: [2170, 64] first4: -0.324606 -1.422529 0.282540 0.041568 
-[Debug] dit_step44_vt_uncond: [2170, 64] first4: -0.235813 -1.298376 0.179092 0.128338 -[Debug] dit_step44_vt: [2170, 64] first4: -0.582920 -1.697035 0.593491 -0.255212 -[Debug] dit_step44_xt: [2170, 64] first4: 0.636610 1.820286 -0.271943 0.278567 -[DiT] step 45/50 t=0.120 -[Debug] dit_step45_vt_cond: [2170, 64] first4: -0.322754 -1.471227 0.337328 -0.019488 -[Debug] dit_step45_vt_uncond: [2170, 64] first4: -0.253130 -1.357642 0.232497 0.062397 -[Debug] dit_step45_vt: [2170, 64] first4: -0.511094 -1.728875 0.663531 -0.289321 -[Debug] dit_step45_xt: [2170, 64] first4: 0.646832 1.854864 -0.285214 0.284353 -[DiT] step 46/50 t=0.100 -[Debug] dit_step46_vt_cond: [2170, 64] first4: -0.317589 -1.518562 0.387160 -0.074592 -[Debug] dit_step46_vt_uncond: [2170, 64] first4: -0.269982 -1.418198 0.282772 0.001510 -[Debug] dit_step46_vt: [2170, 64] first4: -0.434923 -1.750584 0.706043 -0.325901 -[Debug] dit_step46_xt: [2170, 64] first4: 0.655531 1.889875 -0.299335 0.290871 -[DiT] step 47/50 t=0.080 -[Debug] dit_step47_vt_cond: [2170, 64] first4: -0.316530 -1.561702 0.429371 -0.113134 -[Debug] dit_step47_vt_uncond: [2170, 64] first4: -0.290275 -1.474672 0.330953 -0.045588 -[Debug] dit_step47_vt: [2170, 64] first4: -0.369515 -1.780826 0.721569 -0.327625 -[Debug] dit_step47_xt: [2170, 64] first4: 0.662921 1.925492 -0.313766 0.297424 -[DiT] step 48/50 t=0.060 -[Debug] dit_step48_vt_cond: [2170, 64] first4: -0.304095 -1.593375 0.469693 -0.135493 -[Debug] dit_step48_vt_uncond: [2170, 64] first4: -0.298372 -1.526686 0.379661 -0.093868 -[Debug] dit_step48_vt: [2170, 64] first4: -0.296147 -1.763528 0.744123 -0.229345 -[Debug] dit_step48_xt: [2170, 64] first4: 0.668844 1.960763 -0.328649 0.302011 -[DiT] step 49/50 t=0.040 -[Debug] dit_step49_vt_cond: [2170, 64] first4: -0.330730 -1.622756 0.480628 -0.154374 -[Debug] dit_step49_vt_uncond: [2170, 64] first4: -0.324303 -1.543248 0.396626 -0.105187 -[Debug] dit_step49_vt: [2170, 64] first4: -0.371182 -1.888395 0.739232 -0.345413 -[Debug] 
dit_x0: [2170, 64] first4: 0.676268 1.998530 -0.343433 0.308919 -[DiT] step 50/50 t=0.020 -[DiT] Total generation: 106456.5 ms (106456.5 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.676268 1.998530 -0.343433 0.308919 -[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 474 nodes, T_latent=192 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' -ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x103f04c20 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x103f072d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x103f09950 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x103f05240 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x103f09cd0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x103f0a8c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' -ggml_metal_library_compile_pipeline: loaded 
kernel_conv_transpose_1d_f32_f32 0x103f0ab20 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x103f0beb0 | th_max = 1024 | th_width = 32 -[VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 474 nodes, T_latent=256 -[VAE] Graph: 474 nodes, T_latent=186 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x103f04080 | th_max = 896 | th_width = 32 -[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 609604.9 ms -[Debug] vae_audio: [2, 4166400] first4: -0.002491 -0.002402 -0.002394 -0.002024 -[VAE Batch0] Wrote ggml-sft/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Request 1/1] Done -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Pipeline] All done -[Request] Loaded request0.json -[Noise] Reusing existing rng_philox_seed42.bf16 -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf -[GGML] Running acestep-v15-turbo-Q5_K_M.gguf... 
-[GGML] Done, 47 dump files -[Turbo] Reusing existing Python dumps: python-turbo -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999813 - lyric_embed 1.000000 - enc_hidden 0.999083 - detok_output 0.999884 - context 0.999927 - noise 1.000000 - temb_t 0.999972 - hidden_after_proj_in 0.999966 - enc_after_cond_emb 0.999209 - layer0_sa_output 0.999459 - hidden_after_layer0 0.999838 - hidden_after_layer6 0.999790 - hidden_after_layer12 0.998662 - hidden_after_layer18 0.995505 - hidden_after_layer23 0.991560 - dit_step0_vt 0.968885 - dit_step0_xt 0.999932 - dit_step1_vt 0.972718 - dit_step1_xt 0.999793 - dit_step2_vt 0.970980 - dit_step2_xt 0.999392 - dit_step3_vt 0.974057 - dit_step3_xt 0.998550 - dit_step4_vt 0.972601 - dit_step4_xt 0.996666 - dit_step5_vt 0.967840 - dit_step5_xt 0.992262 - dit_step6_vt 0.963419 - dit_step6_xt 0.983648 - dit_step7_vt 0.954759 - dit_x0 0.970661 - vae_audio 0.881689 - vae_audio (log spectral) 0.999788 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999930 0.140512 0.007718 -0.002317 0.973035 -0.002342 0.972003 - dit_step1_xt 0.999791 0.264415 0.013154 -0.005313 0.942911 -0.005313 0.941730 - dit_step2_xt 0.999391 0.457878 0.021002 -0.009296 0.909537 -0.009311 0.908527 - dit_step3_xt 0.998548 0.672565 0.031169 -0.014659 0.874300 -0.014577 0.873624 - dit_step4_xt 0.996664 0.977397 0.045289 -0.021867 0.842610 -0.021660 0.841995 - dit_step5_xt 0.992261 1.456099 0.067099 -0.032222 0.826249 -0.032109 0.824593 - dit_step6_xt 0.983647 2.128287 0.100579 -0.046802 0.857538 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q5_K_M.gguf -[GGML] Running acestep-v15-sft-Q5_K_M.gguf... 
-[GGML] Done, 233 dump files -[SFT] Reusing existing Python dumps: python-sft -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999813 - lyric_embed 1.000000 - enc_hidden 0.999083 - detok_output 0.999884 - context 0.999927 - noise 1.000000 - temb_t 0.999900 - hidden_after_proj_in 0.999966 - enc_after_cond_emb 0.999209 - layer0_sa_output 0.999536 - hidden_after_layer0 0.999891 - hidden_after_layer6 0.999626 - hidden_after_layer12 0.998995 - hidden_after_layer18 0.998026 - hidden_after_layer23 0.998535 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998436 - dit_step0_vt_uncond 0.998344 - dit_step0_vt 0.994668 - dit_step0_xt 0.999997 - dit_step5_vt_cond 0.998676 - dit_step5_vt 0.989830 - dit_step5_xt 0.999935 - dit_step10_vt_cond 0.996806 - dit_step10_vt 0.987585 - dit_step10_xt 0.999744 - dit_step15_vt_cond 0.992244 - dit_step15_vt 0.973038 - dit_step15_xt 0.999108 - dit_step20_vt_cond 0.984474 - dit_step20_vt 0.958153 - dit_step20_xt 0.997397 - dit_step25_vt_cond 0.974096 - dit_step25_vt 0.945640 - dit_step25_xt 0.994154 - dit_step30_vt_cond 0.962790 - dit_step30_vt 0.934107 - dit_step30_xt 0.989253 - dit_step35_vt_cond 0.951958 - dit_step35_vt 0.920426 - dit_step35_xt 0.983572 - dit_step40_vt_cond 0.945880 - dit_step40_vt 0.910054 - dit_step40_xt 0.978292 - dit_step45_vt_cond 0.952542 - dit_step45_vt 0.924831 - dit_step45_xt 0.974685 - dit_step49_vt_cond 0.963084 - dit_step49_vt 0.916267 - dit_x0 0.973449 - vae_audio 0.878623 - vae_audio (log spectral) 0.999566 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999996 0.038422 0.002252 -0.001777 0.980099 -0.001741 0.980402 - dit_step5_xt 0.999933 0.110159 0.007862 -0.006926 0.889483 -0.007143 0.887999 - dit_step10_xt 0.999743 0.216004 0.013603 -0.012446 0.811152 -0.012603 0.811299 - dit_step15_xt 0.999108 0.433603 0.022505 -0.017845 0.746187 -0.018114 0.745269 - dit_step20_xt 0.997397 0.645668 
0.035020 -0.023481 0.700583 -0.023808 0.699582 - dit_step25_xt 0.994154 0.894286 0.050478 -0.029250 0.679073 -0.029311 0.679278 - dit_step30_xt 0.989253 1.155794 0.069043 -0.035037 0.684973 -0.035027 0.685262 - dit_step35_xt 0.983572 1.518599 0.089822 -0.040808 0.717172 -0.040716 0.717196 - dit_step40_xt 0.978291 1.917882 0.111854 -0.046581 0.771460 -0.046462 0.771853 - dit_step45_xt 0.974684 2.279520 0.132709 -0.052804 0.843506 -0.052475 0.843036 diff --git a/tests/Metal_Q6_K.log b/tests/Metal_Q6_K.log deleted file mode 100644 index 19a2f5c..0000000 --- a/tests/Metal_Q6_K.log +++ /dev/null @@ -1,819 +0,0 @@ -ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices -ggml_metal_library_init: using embedded metal library -ggml_metal_library_init: loaded in 0.006 sec -ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s) -ggml_metal_device_init: GPU name: MTL0 -ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) -ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) -ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) -ggml_metal_device_init: simdgroup reduction = true -ggml_metal_device_init: simdgroup matrix mul. 
= true -ggml_metal_device_init: has unified memory = true -ggml_metal_device_init: has bfloat = true -ggml_metal_device_init: has tensor = false -ggml_metal_device_init: use residency sets = true -ggml_metal_device_init: use shared buffers = true -ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] DiT backend: MTL0 (CPU threads: 5) -[Load] Backend init: 18.8 ms -[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 -[DiT] MLP: gate+up fused -[Load] null_condition_emb found (CFG available) -[WeightCtx] Loaded 478 tensors, 1237.2 MB into backend -[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 1759.5 ms -[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 -[Load] silence_latent: [15000, 64] from GGUF -[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] VAE backend: MTL0 (CPU threads: 5) -[VAE] Backend: MTL0, Weight buffer: 255.7 MB -[VAE] Loaded: 5 blocks, upsample=1920x -[Load] VAE weights: 335.9 ms -[Request 1/1] ggml-turbo/request0.json (batch=1) -[Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) -[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s -[Pipeline] 434 audio codes (86.8s @ 5Hz) -[Pipeline] T=2170, S=1085 -[BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 42.1 ms -[Pipeline] caption: 70 tokens, lyrics: 167 tokens 
-ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] TextEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 28L, H=1024, Nh=16/8 -[Load] TextEncoder: 294.2 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' -ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x13a80b9e0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x13a80be40 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x13a80c6b0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' -ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x13a80cb30 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x13a80d3a0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded 
kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x13a80d9d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x13a80e2b0 | th_max = 576 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x13a80e6c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' -ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x13a80e920 | th_max = 1024 | th_width = 32 -[Encode] TextEncoder (70 tokens): 46.1 ms -[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 33.8 ms -[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] CondEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 -[WeightCtx] Loaded 140 tensors, 476.3 MB into backend -[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 652.0 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q6_K_f32', name = 
'kernel_mul_mm_q6_K_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1 0x13a818c40 | th_max = 896 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x13a819080 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x13a8197f0 | th_max = 640 | th_width = 32 -[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 146.8 ms, enc_S=238 -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Debug] enc_hidden: [238, 2048] first4: 1.752129 -0.050073 -0.134015 0.059631 -[GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 -[WeightCtx] Loaded 30 tensors, 82.2 MB into backend -[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 125.6 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x13a817f60 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x13a818470 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 
'kernel_mul_mv_q6_K_f32_nsg=2' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2 0x13a808aa0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' -ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x13a808d00 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8 0x13a81a190 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x13a81a3f0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x13a81adc0 | th_max = 448 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x13a81b250 | th_max = 1024 | th_width = 32 -[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 1009.6 ms -[Debug] detok_output: [2170, 64] first4: -0.141063 1.454431 0.315142 -0.623566 -[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 -[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 -[Debug] context: [2170, 
128] first4: -0.141063 1.454431 0.315142 -0.623566 -[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 -[DiT] Batch N=1, T=2170, S=1085, enc_S=238 -[DiT] Graph: 1841 nodes -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x13970a020 | th_max = 832 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' -ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x13970a280 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' -ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x13970a4e0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x13970b610 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x13970be80 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x13970c0e0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x13970c7b0 | th_max = 1024 | th_width = 32 
-ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x13970cc30 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x13970d300 | th_max = 640 | th_width = 32 -[Debug] tproj: [12288] first4: 0.261089 -0.161223 -0.098728 0.051901 -[Debug] temb: [2048] first4: 0.000236 -0.132397 -0.035348 0.064653 -[Debug] temb_t: [2048] first4: 0.001398 0.026957 -0.052741 0.063660 -[Debug] temb_r: [2048] first4: -0.001162 -0.159353 0.017394 0.000993 -[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 -[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 -[Debug] temb_lin1_t: [2048] first4: -0.049071 -0.051112 -0.017769 -0.037193 -[Debug] temb_lin1_r: [2048] first4: -0.014408 -0.020609 -0.015729 0.003875 -[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.037692 -0.956719 0.540867 0.451860 -[Debug] proj_in_input: [192, 2170] first4: -0.141063 1.454431 0.315142 -0.623566 -[Debug] enc_after_cond_emb: [2048, 238] first4: -0.167564 0.852700 0.309671 -0.538299 -[Debug] layer0_sa_input: [2048, 1085] first4: -0.716202 -0.756050 -0.048455 0.263529 -[Debug] layer0_q_after_rope: [128, 16] first4: -0.167564 0.852700 0.309671 -0.538299 -[Debug] layer0_k_after_rope: [128, 8] first4: -1.214772 -0.856039 -1.908578 -2.256124 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.502833 0.209946 -0.367812 0.520536 -[Debug] layer0_attn_out: [2048, 1085] first4: -1.134820 -0.084089 -34.867664 -0.724257 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.540231 -1.049932 0.181504 0.461969 
-[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.609974 -0.819551 -0.333653 0.497179 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.196066 0.534182 52.334564 -0.880322 -[Debug] hidden_after_layer6: [2048, 1085] first4: -21.325979 -0.207006 34.129318 -4.337931 -[Debug] hidden_after_layer12: [2048, 1085] first4: -15.411194 -16.311844 76.549057 29.816362 -[Debug] hidden_after_layer18: [2048, 1085] first4: -28.025963 13.209218 65.994347 20.087559 -[Debug] hidden_after_layer23: [2048, 1085] first4: -19.575611 48.863052 201.092041 136.881271 -[Debug] dit_step0_vt: [2170, 64] first4: 0.099154 1.132388 0.349667 2.375307 -[Debug] dit_step0_xt: [2170, 64] first4: 0.189829 2.104778 -0.187769 0.739688 -[DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: -0.140258 1.329038 -0.173978 1.924904 -[Debug] dit_step1_xt: [2170, 64] first4: 0.197479 2.032285 -0.178279 0.634693 -[DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: 0.064148 1.236530 0.118618 2.406788 -[Debug] dit_step2_xt: [2170, 64] first4: 0.193203 1.949849 -0.186187 0.474240 -[DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: 0.305678 1.101620 0.246811 2.656265 -[Debug] dit_step3_xt: [2170, 64] first4: 0.167730 1.858048 -0.206755 0.252885 -[DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.333444 1.032630 0.077940 2.735898 -[Debug] dit_step4_xt: [2170, 64] first4: 0.132004 1.747409 -0.215105 -0.040247 -[DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.310135 0.905818 -0.324717 2.786166 -[Debug] dit_step5_xt: [2170, 64] first4: 0.087699 1.618006 -0.168717 -0.438271 -[DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: 0.184311 0.624224 -0.863634 2.781863 -[Debug] dit_step6_xt: [2170, 64] first4: 0.050836 1.493161 0.004009 -0.994643 -[DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: 0.049488 0.168290 -1.435298 3.015385 -[Debug] dit_x0: [2170, 64] first4: 0.035990 1.442675 0.434599 -1.899259 -[DiT] step 8/8 
t=0.300 -[DiT] Total generation: 8015.7 ms (8015.7 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.035990 1.442675 0.434599 -1.899259 -[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 474 nodes, T_latent=192 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' -ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x13a81b7d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x13a81c1a0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x13a81c400 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x13a81ca90 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x13a81ccf0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x13a81d4d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32 0x13a81da30 | th_max = 1024 | th_width = 32 
-ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x13a81e910 | th_max = 1024 | th_width = 32 -[VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 474 nodes, T_latent=256 -[VAE] Graph: 474 nodes, T_latent=186 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x13a81eb70 | th_max = 896 | th_width = 32 -[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 609630.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000453 0.000980 0.000826 0.001209 -[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Request 1/1] Done -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Pipeline] All done -ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices -ggml_metal_library_init: using embedded metal library -ggml_metal_library_init: loaded in 0.006 sec -ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s) -ggml_metal_device_init: GPU name: MTL0 -ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) -ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) -ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) -ggml_metal_device_init: simdgroup reduction = true -ggml_metal_device_init: simdgroup matrix mul. 
= true -ggml_metal_device_init: has unified memory = true -ggml_metal_device_init: has bfloat = true -ggml_metal_device_init: has tensor = false -ggml_metal_device_init: use residency sets = true -ggml_metal_device_init: use shared buffers = true -ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] DiT backend: MTL0 (CPU threads: 5) -[Load] Backend init: 20.5 ms -[GGUF] ../models/acestep-v15-sft-Q6_K.gguf: 678 tensors, data at offset 56800 -[DiT] MLP: gate+up fused -[Load] null_condition_emb found (CFG available) -[WeightCtx] Loaded 478 tensors, 1237.2 MB into backend -[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 1781.2 ms -[GGUF] ../models/acestep-v15-sft-Q6_K.gguf: 678 tensors, data at offset 56800 -[Load] silence_latent: [15000, 64] from GGUF -[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] VAE backend: MTL0 (CPU threads: 5) -[VAE] Backend: MTL0, Weight buffer: 255.7 MB -[VAE] Loaded: 5 blocks, upsample=1920x -[Load] VAE weights: 275.0 ms -[Request 1/1] ggml-sft/request0.json (batch=1) -[Request] parsed ggml-sft/request0.json (18 fields) -[Pipeline] seed=42, steps=50, guidance=7.0, shift=1.0, duration=88.0s -[Pipeline] 434 audio codes (86.8s @ 5Hz) -[Pipeline] T=2170, S=1085 -[BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 42.4 ms -[Pipeline] caption: 70 tokens, lyrics: 167 tokens -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: 
picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] TextEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 28L, H=1024, Nh=16/8 -[Load] TextEncoder: 361.7 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' -ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x106006bc0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x106007320 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x106008160 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' -ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x1060083c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x106008e40 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x1060093a0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: 
compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x106009600 | th_max = 576 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x10600a090 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' -ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x10600a2f0 | th_max = 1024 | th_width = 32 -[Encode] TextEncoder (70 tokens): 45.5 ms -[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 35.7 ms -[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] CondEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/acestep-v15-sft-Q6_K.gguf: 678 tensors, data at offset 56800 -[WeightCtx] Loaded 140 tensors, 476.3 MB into backend -[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 850.5 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q6_K_f32', name = 'kernel_mul_mm_q6_K_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q6_K_f32_bci=0_bco=1 0x120606720 | th_max = 
896 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x120606ba0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x1206071d0 | th_max = 640 | th_width = 32 -[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 151.8 ms, enc_S=238 -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Debug] enc_hidden: [238, 2048] first4: 1.752129 -0.050073 -0.134015 0.059631 -[GGUF] ../models/acestep-v15-sft-Q6_K.gguf: 678 tensors, data at offset 56800 -[WeightCtx] Loaded 30 tensors, 82.2 MB into backend -[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 134.7 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x106012a70 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x106013260 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q6_K_f32', name = 'kernel_mul_mv_q6_K_f32_nsg=2' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q6_K_f32_nsg=2 0x106013b60 | th_max = 1024 | th_width = 32 
-ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' -ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x1060134c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q6_K_f32_r1_5', name = 'kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q6_K_f32_r1_5_nsg=2_nxpsg=8 0x1206079b0 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x120608040 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x1206082a0 | th_max = 448 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x120608730 | th_max = 1024 | th_width = 32 -[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 1004.2 ms -[Debug] detok_output: [2170, 64] first4: -0.141063 1.454431 0.315142 -0.623566 -[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 -[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 -[Debug] context: [2170, 128] first4: -0.141063 1.454431 0.315142 -0.623566 -[DiT] Starting: T=2170, S=1085, enc_S=238, steps=50, batch=1 -[DiT] Batch N=1, T=2170, S=1085, 
enc_S=238 -[DiT] Graph: 1841 nodes -[Debug] null_condition_emb: [2048] first4: 0.018066 -0.000360 0.005096 -0.000683 -[Debug] null_enc_hidden: [238, 2048] first4: 0.018066 -0.000360 0.005096 -0.000683 -[DiT] CFG enabled: guidance_scale=7.0, 2x forward per step, N=1 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x120608bb0 | th_max = 832 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' -ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x120608e10 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' -ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x120609070 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x120609f80 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x12060a7f0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x12060aa50 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded 
kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x12060b0d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x12060b550 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x12060bc10 | th_max = 640 | th_width = 32 -[Debug] tproj: [12288] first4: 0.153861 -0.117528 -0.090110 0.080834 -[Debug] temb: [2048] first4: -0.002466 -0.176370 0.004369 -0.002069 -[Debug] temb_t: [2048] first4: -0.000999 0.003474 -0.013219 -0.002130 -[Debug] temb_r: [2048] first4: -0.001467 -0.179844 0.017589 0.000062 -[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 -[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 -[Debug] temb_lin1_t: [2048] first4: -0.041140 0.030274 0.027836 -0.025460 -[Debug] temb_lin1_r: [2048] first4: 0.004272 0.006720 0.000208 -0.005103 -[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.088484 -0.913424 0.502796 0.445566 -[Debug] proj_in_input: [192, 2170] first4: -0.141063 1.454431 0.315142 -0.623566 -[Debug] enc_after_cond_emb: [2048, 238] first4: -0.194042 0.920094 0.309464 -0.544236 -[Debug] layer0_sa_input: [2048, 1085] first4: -0.914448 -0.710483 -0.040214 0.295227 -[Debug] layer0_q_after_rope: [128, 16] first4: -0.194042 0.920094 0.309464 -0.544236 -[Debug] layer0_k_after_rope: [128, 8] first4: -1.341203 -0.993715 -1.828661 -2.252987 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.651907 0.800758 -0.600550 0.531539 -[Debug] layer0_attn_out: [2048, 1085] first4: -1.094031 -0.025671 -33.031021 -0.629337 
-[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.804741 -1.365866 -0.176846 0.385942 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.975780 -1.029709 -0.454110 0.391604 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.417660 1.271689 57.716125 -1.718801 -[Debug] hidden_after_layer6: [2048, 1085] first4: -17.205166 2.402088 59.038250 -1.336451 -[Debug] hidden_after_layer12: [2048, 1085] first4: -10.471869 6.708532 -25.396618 -2.966099 -[Debug] hidden_after_layer18: [2048, 1085] first4: -4.594971 20.646416 -42.849018 -14.024486 -[Debug] hidden_after_layer23: [2048, 1085] first4: 34.838955 64.575096 51.865501 -11.288853 -[Debug] dit_step0_vt_cond: [2170, 64] first4: -0.621944 2.533786 -0.220703 1.441472 -[Debug] dit_step0_vt_uncond: [2170, 64] first4: -0.335422 2.090178 -0.712111 1.749312 -[Debug] dit_step0_vt: [2170, 64] first4: -0.817132 3.005553 0.060651 1.037697 -[Debug] dit_step0_xt: [2170, 64] first4: 0.210679 2.096139 -0.173088 0.826902 -[DiT] step 1/50 t=1.000 -[Debug] dit_step1_vt_cond: [2170, 64] first4: -0.663130 2.509250 -0.047312 1.284879 -[Debug] dit_step1_vt_uncond: [2170, 64] first4: -0.535073 2.303810 -0.132373 1.511020 -[Debug] dit_step1_vt: [2170, 64] first4: -0.505551 2.169540 -0.259578 1.319904 -[Debug] dit_step1_xt: [2170, 64] first4: 0.220790 2.052748 -0.167896 0.800504 -[DiT] step 2/50 t=0.980 -[Debug] dit_step2_vt_cond: [2170, 64] first4: -0.658958 2.478008 -0.004274 1.297306 -[Debug] dit_step2_vt_uncond: [2170, 64] first4: -0.555012 2.295219 -0.109707 1.445456 -[Debug] dit_step2_vt: [2170, 64] first4: -0.800255 2.877401 0.236257 0.991010 -[Debug] dit_step2_xt: [2170, 64] first4: 0.236795 1.995200 -0.172622 0.780684 -[DiT] step 3/50 t=0.960 -[Debug] dit_step3_vt_cond: [2170, 64] first4: -0.623685 2.405478 0.016157 1.310913 -[Debug] dit_step3_vt_uncond: [2170, 64] first4: -0.584147 2.331472 -0.052434 1.362097 -[Debug] dit_step3_vt: [2170, 64] first4: -0.460909 2.085223 -0.179917 1.477093 -[Debug] dit_step3_xt: [2170, 64] 
first4: 0.246013 1.953496 -0.169023 0.751142 -[DiT] step 4/50 t=0.940 -[Debug] dit_step4_vt_cond: [2170, 64] first4: -0.573136 2.336644 0.011644 1.310671 -[Debug] dit_step4_vt_uncond: [2170, 64] first4: -0.579254 2.305885 -0.063045 1.333517 -[Debug] dit_step4_vt: [2170, 64] first4: -0.615409 2.553339 0.248993 1.073298 -[Debug] dit_step4_xt: [2170, 64] first4: 0.258321 1.902429 -0.174003 0.729676 -[DiT] step 5/50 t=0.920 -[Debug] dit_step5_vt_cond: [2170, 64] first4: -0.517031 2.271256 -0.017464 1.306595 -[Debug] dit_step5_vt_uncond: [2170, 64] first4: -0.593495 2.274611 -0.028605 1.340369 -[Debug] dit_step5_vt: [2170, 64] first4: -0.226837 1.944980 -0.246283 1.356041 -[Debug] dit_step5_xt: [2170, 64] first4: 0.262858 1.863529 -0.169077 0.702555 -[DiT] step 6/50 t=0.900 -[Debug] dit_step6_vt_cond: [2170, 64] first4: -0.492334 2.237296 -0.030848 1.308453 -[Debug] dit_step6_vt_uncond: [2170, 64] first4: -0.559225 2.211779 -0.028865 1.356664 -[Debug] dit_step6_vt: [2170, 64] first4: -0.506265 2.451765 0.152689 1.063906 -[Debug] dit_step6_xt: [2170, 64] first4: 0.272983 1.814494 -0.172131 0.681277 -[DiT] step 7/50 t=0.880 -[Debug] dit_step7_vt_cond: [2170, 64] first4: -0.448929 2.166704 -0.051454 1.296641 -[Debug] dit_step7_vt_uncond: [2170, 64] first4: -0.516741 2.132911 -0.036027 1.347799 -[Debug] dit_step7_vt: [2170, 64] first4: -0.201648 1.970928 -0.247107 1.267663 -[Debug] dit_step7_xt: [2170, 64] first4: 0.277016 1.775075 -0.167189 0.655924 -[DiT] step 8/50 t=0.860 -[Debug] dit_step8_vt_cond: [2170, 64] first4: -0.412205 2.087660 -0.059856 1.279323 -[Debug] dit_step8_vt_uncond: [2170, 64] first4: -0.450006 2.049930 -0.047531 1.315284 -[Debug] dit_step8_vt: [2170, 64] first4: -0.455255 2.220170 0.079066 1.097043 -[Debug] dit_step8_xt: [2170, 64] first4: 0.286121 1.730672 -0.168770 0.633983 -[DiT] step 9/50 t=0.840 -[Debug] dit_step9_vt_cond: [2170, 64] first4: -0.378611 2.005139 -0.068357 1.261302 -[Debug] dit_step9_vt_uncond: [2170, 64] first4: -0.379138 1.976703 
-0.055227 1.283708 -[Debug] dit_step9_vt: [2170, 64] first4: -0.302897 1.841593 -0.214058 1.239796 -[Debug] dit_step9_xt: [2170, 64] first4: 0.292179 1.693840 -0.164489 0.609187 -[DiT] step 10/50 t=0.820 -[Debug] dit_step10_vt_cond: [2170, 64] first4: -0.355196 1.928296 -0.064251 1.252096 -[Debug] dit_step10_vt_uncond: [2170, 64] first4: -0.318644 1.912086 -0.042776 1.279346 -[Debug] dit_step10_vt: [2170, 64] first4: -0.498521 1.941743 -0.002070 1.076981 -[Debug] dit_step10_xt: [2170, 64] first4: 0.302150 1.655005 -0.164448 0.587648 -[DiT] step 11/50 t=0.800 -[Debug] dit_step11_vt_cond: [2170, 64] first4: -0.335946 1.853130 -0.070414 1.240474 -[Debug] dit_step11_vt_uncond: [2170, 64] first4: -0.288372 1.862547 -0.022430 1.290233 -[Debug] dit_step11_vt: [2170, 64] first4: -0.354338 1.618477 -0.262413 1.094882 -[Debug] dit_step11_xt: [2170, 64] first4: 0.309236 1.622636 -0.159200 0.565750 -[DiT] step 12/50 t=0.780 -[Debug] dit_step12_vt_cond: [2170, 64] first4: -0.320539 1.785762 -0.070432 1.238213 -[Debug] dit_step12_vt_uncond: [2170, 64] first4: -0.262826 1.816614 -0.015609 1.306050 -[Debug] dit_step12_vt: [2170, 64] first4: -0.491342 1.658213 -0.091632 0.993836 -[Debug] dit_step12_xt: [2170, 64] first4: 0.319063 1.589471 -0.157367 0.545873 -[DiT] step 13/50 t=0.760 -[Debug] dit_step13_vt_cond: [2170, 64] first4: -0.304664 1.719325 -0.071341 1.231198 -[Debug] dit_step13_vt_uncond: [2170, 64] first4: -0.239698 1.764744 -0.021893 1.311546 -[Debug] dit_step13_vt: [2170, 64] first4: -0.399484 1.443110 -0.224588 1.008114 -[Debug] dit_step13_xt: [2170, 64] first4: 0.327053 1.560609 -0.152875 0.525711 -[DiT] step 14/50 t=0.740 -[Debug] dit_step14_vt_cond: [2170, 64] first4: -0.287200 1.663083 -0.055876 1.237650 -[Debug] dit_step14_vt_uncond: [2170, 64] first4: -0.221389 1.709672 -0.033314 1.310252 -[Debug] dit_step14_vt: [2170, 64] first4: -0.462595 1.498639 -0.013459 1.015139 -[Debug] dit_step14_xt: [2170, 64] first4: 0.336305 1.530637 -0.152606 0.505408 -[DiT] step 
15/50 t=0.720 -[Debug] dit_step15_vt_cond: [2170, 64] first4: -0.268932 1.604917 -0.048117 1.238302 -[Debug] dit_step15_vt_uncond: [2170, 64] first4: -0.201852 1.656007 -0.050695 1.299129 -[Debug] dit_step15_vt: [2170, 64] first4: -0.405852 1.327548 -0.070650 1.046717 -[Debug] dit_step15_xt: [2170, 64] first4: 0.344422 1.504086 -0.151193 0.484474 -[DiT] step 16/50 t=0.700 -[Debug] dit_step16_vt_cond: [2170, 64] first4: -0.252512 1.546370 -0.039756 1.239714 -[Debug] dit_step16_vt_uncond: [2170, 64] first4: -0.180964 1.599575 -0.068729 1.281288 -[Debug] dit_step16_vt: [2170, 64] first4: -0.465245 1.345990 0.113584 1.099017 -[Debug] dit_step16_xt: [2170, 64] first4: 0.353727 1.477166 -0.153465 0.462493 -[DiT] step 17/50 t=0.680 -[Debug] dit_step17_vt_cond: [2170, 64] first4: -0.235390 1.484958 -0.035930 1.232975 -[Debug] dit_step17_vt_uncond: [2170, 64] first4: -0.159705 1.538287 -0.087686 1.257131 -[Debug] dit_step17_vt: [2170, 64] first4: -0.434245 1.219246 0.101116 1.128435 -[Debug] dit_step17_xt: [2170, 64] first4: 0.362412 1.452781 -0.155487 0.439925 -[DiT] step 18/50 t=0.660 -[Debug] dit_step18_vt_cond: [2170, 64] first4: -0.216501 1.424307 -0.036471 1.226410 -[Debug] dit_step18_vt_uncond: [2170, 64] first4: -0.139621 1.481410 -0.105683 1.236565 -[Debug] dit_step18_vt: [2170, 64] first4: -0.455215 1.183846 0.234644 1.168681 -[Debug] dit_step18_xt: [2170, 64] first4: 0.371516 1.429104 -0.160180 0.416551 -[DiT] step 19/50 t=0.640 -[Debug] dit_step19_vt_cond: [2170, 64] first4: -0.193781 1.365146 -0.052930 1.209294 -[Debug] dit_step19_vt_uncond: [2170, 64] first4: -0.117725 1.431031 -0.131161 1.210289 -[Debug] dit_step19_vt: [2170, 64] first4: -0.413012 1.059516 0.190559 1.170309 -[Debug] dit_step19_xt: [2170, 64] first4: 0.379776 1.407914 -0.163991 0.393145 -[DiT] step 20/50 t=0.620 -[Debug] dit_step20_vt_cond: [2170, 64] first4: -0.171132 1.297945 -0.074904 1.190113 -[Debug] dit_step20_vt_uncond: [2170, 64] first4: -0.094949 1.381890 -0.154719 1.182902 -[Debug] 
dit_step20_vt: [2170, 64] first4: -0.416916 0.961257 0.217113 1.187745 -[Debug] dit_step20_xt: [2170, 64] first4: 0.388114 1.388689 -0.168333 0.369390 -[DiT] step 21/50 t=0.600 -[Debug] dit_step21_vt_cond: [2170, 64] first4: -0.156818 1.217449 -0.102774 1.165961 -[Debug] dit_step21_vt_uncond: [2170, 64] first4: -0.076230 1.330654 -0.181784 1.153171 -[Debug] dit_step21_vt: [2170, 64] first4: -0.413541 0.733536 0.153392 1.169193 -[Debug] dit_step21_xt: [2170, 64] first4: 0.396385 1.374018 -0.171401 0.346006 -[DiT] step 22/50 t=0.580 -[Debug] dit_step22_vt_cond: [2170, 64] first4: -0.149490 1.124467 -0.129160 1.136317 -[Debug] dit_step22_vt_uncond: [2170, 64] first4: -0.066301 1.272416 -0.203485 1.126914 -[Debug] dit_step22_vt: [2170, 64] first4: -0.414485 0.550591 0.128185 1.113165 -[Debug] dit_step22_xt: [2170, 64] first4: 0.404675 1.363006 -0.173965 0.323743 -[DiT] step 23/50 t=0.560 -[Debug] dit_step23_vt_cond: [2170, 64] first4: -0.151829 1.034549 -0.154075 1.109805 -[Debug] dit_step23_vt_uncond: [2170, 64] first4: -0.064656 1.215351 -0.221178 1.104513 -[Debug] dit_step23_vt: [2170, 64] first4: -0.436045 0.335528 0.059138 1.081705 -[Debug] dit_step23_xt: [2170, 64] first4: 0.413396 1.356296 -0.175148 0.302109 -[DiT] step 24/50 t=0.540 -[Debug] dit_step24_vt_cond: [2170, 64] first4: -0.158201 0.930576 -0.174559 1.077020 -[Debug] dit_step24_vt_uncond: [2170, 64] first4: -0.070645 1.149243 -0.231212 1.077814 -[Debug] dit_step24_vt: [2170, 64] first4: -0.428632 0.115268 0.009393 1.014248 -[Debug] dit_step24_xt: [2170, 64] first4: 0.421968 1.353990 -0.175335 0.281824 -[DiT] step 25/50 t=0.520 -[Debug] dit_step25_vt_cond: [2170, 64] first4: -0.172706 0.820171 -0.187534 1.046402 -[Debug] dit_step25_vt_uncond: [2170, 64] first4: -0.084898 1.070370 -0.233334 1.052725 -[Debug] dit_step25_vt: [2170, 64] first4: -0.451610 -0.083828 -0.046615 0.975977 -[Debug] dit_step25_xt: [2170, 64] first4: 0.431001 1.355667 -0.174403 0.262304 -[DiT] step 26/50 t=0.500 -[Debug] 
dit_step26_vt_cond: [2170, 64] first4: -0.189538 0.693448 -0.199819 1.008348 -[Debug] dit_step26_vt_uncond: [2170, 64] first4: -0.101718 0.976297 -0.234447 1.021465 -[Debug] dit_step26_vt: [2170, 64] first4: -0.457462 -0.318421 -0.092668 0.902675 -[Debug] dit_step26_xt: [2170, 64] first4: 0.440150 1.362035 -0.172550 0.244251 -[DiT] step 27/50 t=0.480 -[Debug] dit_step27_vt_cond: [2170, 64] first4: -0.208769 0.570284 -0.208205 0.971380 -[Debug] dit_step27_vt_uncond: [2170, 64] first4: -0.121884 0.876404 -0.234553 0.991057 -[Debug] dit_step27_vt: [2170, 64] first4: -0.474894 -0.475314 -0.124745 0.852438 -[Debug] dit_step27_xt: [2170, 64] first4: 0.449648 1.371541 -0.170055 0.227202 -[DiT] step 28/50 t=0.460 -[Debug] dit_step28_vt_cond: [2170, 64] first4: -0.231451 0.438197 -0.217041 0.925335 -[Debug] dit_step28_vt_uncond: [2170, 64] first4: -0.142976 0.763591 -0.236660 0.951937 -[Debug] dit_step28_vt: [2170, 64] first4: -0.503044 -0.662942 -0.154526 0.775360 -[Debug] dit_step28_xt: [2170, 64] first4: 0.459709 1.384800 -0.166964 0.211695 -[DiT] step 29/50 t=0.440 -[Debug] dit_step29_vt_cond: [2170, 64] first4: -0.255602 0.305935 -0.225167 0.878463 -[Debug] dit_step29_vt_uncond: [2170, 64] first4: -0.161997 0.641782 -0.240440 0.915606 -[Debug] dit_step29_vt: [2170, 64] first4: -0.546771 -0.770888 -0.172051 0.693937 -[Debug] dit_step29_xt: [2170, 64] first4: 0.470644 1.400218 -0.163523 0.197816 -[DiT] step 30/50 t=0.420 -[Debug] dit_step30_vt_cond: [2170, 64] first4: -0.279856 0.166208 -0.230015 0.829064 -[Debug] dit_step30_vt_uncond: [2170, 64] first4: -0.179567 0.505654 -0.246080 0.880209 -[Debug] dit_step30_vt: [2170, 64] first4: -0.586251 -0.913478 -0.165068 0.591419 -[Debug] dit_step30_xt: [2170, 64] first4: 0.482369 1.418488 -0.160222 0.185988 -[DiT] step 31/50 t=0.400 -[Debug] dit_step31_vt_cond: [2170, 64] first4: -0.303020 0.026116 -0.234342 0.778238 -[Debug] dit_step31_vt_uncond: [2170, 64] first4: -0.194565 0.361688 -0.252981 0.842483 -[Debug] dit_step31_vt: 
[2170, 64] first4: -0.634687 -1.004404 -0.163206 0.505303 -[Debug] dit_step31_xt: [2170, 64] first4: 0.495063 1.438576 -0.156958 0.175882 -[DiT] step 32/50 t=0.380 -[Debug] dit_step32_vt_cond: [2170, 64] first4: -0.323948 -0.114179 -0.233989 0.723499 -[Debug] dit_step32_vt_uncond: [2170, 64] first4: -0.205043 0.215050 -0.257018 0.804404 -[Debug] dit_step32_vt: [2170, 64] first4: -0.683278 -1.112943 -0.143677 0.386194 -[Debug] dit_step32_xt: [2170, 64] first4: 0.508728 1.460835 -0.154084 0.168158 -[DiT] step 33/50 t=0.360 -[Debug] dit_step33_vt_cond: [2170, 64] first4: -0.343586 -0.257977 -0.227555 0.671680 -[Debug] dit_step33_vt_uncond: [2170, 64] first4: -0.213907 0.060018 -0.258127 0.766574 -[Debug] dit_step33_vt: [2170, 64] first4: -0.732613 -1.188033 -0.112312 0.307609 -[Debug] dit_step33_xt: [2170, 64] first4: 0.523381 1.484595 -0.151838 0.162006 -[DiT] step 34/50 t=0.340 -[Debug] dit_step34_vt_cond: [2170, 64] first4: -0.359997 -0.390317 -0.220456 0.624948 -[Debug] dit_step34_vt_uncond: [2170, 64] first4: -0.218993 -0.085222 -0.256678 0.732610 -[Debug] dit_step34_vt: [2170, 64] first4: -0.782051 -1.277164 -0.095120 0.209992 -[Debug] dit_step34_xt: [2170, 64] first4: 0.539022 1.510138 -0.149936 0.157806 -[DiT] step 35/50 t=0.320 -[Debug] dit_step35_vt_cond: [2170, 64] first4: -0.376117 -0.519578 -0.205748 0.575179 -[Debug] dit_step35_vt_uncond: [2170, 64] first4: -0.225790 -0.227651 -0.246590 0.695063 -[Debug] dit_step35_vt: [2170, 64] first4: -0.824555 -1.344921 -0.065901 0.128571 -[Debug] dit_step35_xt: [2170, 64] first4: 0.555513 1.537037 -0.148618 0.155234 -[DiT] step 36/50 t=0.300 -[Debug] dit_step36_vt_cond: [2170, 64] first4: -0.388908 -0.645821 -0.189103 0.531176 -[Debug] dit_step36_vt_uncond: [2170, 64] first4: -0.231184 -0.367522 -0.234059 0.657502 -[Debug] dit_step36_vt: [2170, 64] first4: -0.864100 -1.422112 -0.041499 0.066854 -[Debug] dit_step36_xt: [2170, 64] first4: 0.572795 1.565479 -0.147788 0.153897 -[DiT] step 37/50 t=0.280 -[Debug] 
dit_step37_vt_cond: [2170, 64] first4: -0.403025 -0.763416 -0.163423 0.484469 -[Debug] dit_step37_vt_uncond: [2170, 64] first4: -0.241143 -0.498980 -0.214114 0.616995 -[Debug] dit_step37_vt: [2170, 64] first4: -0.892219 -1.488433 0.005198 -0.010545 -[Debug] dit_step37_xt: [2170, 64] first4: 0.590639 1.595248 -0.147892 0.154108 -[DiT] step 38/50 t=0.260 -[Debug] dit_step38_vt_cond: [2170, 64] first4: -0.413133 -0.876420 -0.134134 0.440490 -[Debug] dit_step38_vt_uncond: [2170, 64] first4: -0.251082 -0.625665 -0.187976 0.573381 -[Debug] dit_step38_vt: [2170, 64] first4: -0.908542 -1.561219 0.031824 -0.045067 -[Debug] dit_step38_xt: [2170, 64] first4: 0.608810 1.626472 -0.148528 0.155009 -[DiT] step 39/50 t=0.240 -[Debug] dit_step39_vt_cond: [2170, 64] first4: -0.423588 -0.987294 -0.096889 0.393114 -[Debug] dit_step39_vt_uncond: [2170, 64] first4: -0.265262 -0.752590 -0.154346 0.525557 -[Debug] dit_step39_vt: [2170, 64] first4: -0.912287 -1.609328 0.086172 -0.106501 -[Debug] dit_step39_xt: [2170, 64] first4: 0.627056 1.658659 -0.150252 0.157140 -[DiT] step 40/50 t=0.220 -[Debug] dit_step40_vt_cond: [2170, 64] first4: -0.432563 -1.079476 -0.052369 0.343707 -[Debug] dit_step40_vt_uncond: [2170, 64] first4: -0.280351 -0.859098 -0.115334 0.472040 -[Debug] dit_step40_vt: [2170, 64] first4: -0.898876 -1.653373 0.143320 -0.117817 -[Debug] dit_step40_xt: [2170, 64] first4: 0.645033 1.691726 -0.153118 0.159496 -[DiT] step 41/50 t=0.200 -[Debug] dit_step41_vt_cond: [2170, 64] first4: -0.437202 -1.169207 -0.002996 0.295201 -[Debug] dit_step41_vt_uncond: [2170, 64] first4: -0.297226 -0.965837 -0.069953 0.414322 -[Debug] dit_step41_vt: [2170, 64] first4: -0.848467 -1.668746 0.201099 -0.128392 -[Debug] dit_step41_xt: [2170, 64] first4: 0.662003 1.725101 -0.157140 0.162064 -[DiT] step 42/50 t=0.180 -[Debug] dit_step42_vt_cond: [2170, 64] first4: -0.440767 -1.252195 0.052645 0.236928 -[Debug] dit_step42_vt_uncond: [2170, 64] first4: -0.315933 -1.066464 -0.015775 0.349839 -[Debug] 
dit_step42_vt: [2170, 64] first4: -0.805727 -1.702440 0.256392 -0.165544 -[Debug] dit_step42_xt: [2170, 64] first4: 0.678117 1.759150 -0.162268 0.165375 -[DiT] step 43/50 t=0.160 -[Debug] dit_step43_vt_cond: [2170, 64] first4: -0.441682 -1.322971 0.111570 0.178405 -[Debug] dit_step43_vt_uncond: [2170, 64] first4: -0.332472 -1.154346 0.037873 0.279796 -[Debug] dit_step43_vt: [2170, 64] first4: -0.752842 -1.718463 0.345657 -0.158267 -[Debug] dit_step43_xt: [2170, 64] first4: 0.693174 1.793519 -0.169181 0.168540 -[DiT] step 44/50 t=0.140 -[Debug] dit_step44_vt_cond: [2170, 64] first4: -0.436481 -1.389212 0.170327 0.116380 -[Debug] dit_step44_vt_uncond: [2170, 64] first4: -0.347603 -1.238272 0.093730 0.210065 -[Debug] dit_step44_vt: [2170, 64] first4: -0.673781 -1.742915 0.402893 -0.211575 -[Debug] dit_step44_xt: [2170, 64] first4: 0.706650 1.828377 -0.177239 0.172771 -[DiT] step 45/50 t=0.120 -[Debug] dit_step45_vt_cond: [2170, 64] first4: -0.432089 -1.448218 0.222042 0.052259 -[Debug] dit_step45_vt_uncond: [2170, 64] first4: -0.365667 -1.310638 0.145240 0.138891 -[Debug] dit_step45_vt: [2170, 64] first4: -0.598716 -1.780947 0.463354 -0.230193 -[Debug] dit_step45_xt: [2170, 64] first4: 0.718624 1.863996 -0.186506 0.177375 -[DiT] step 46/50 t=0.100 -[Debug] dit_step46_vt_cond: [2170, 64] first4: -0.419096 -1.509236 0.267908 -0.006720 -[Debug] dit_step46_vt_uncond: [2170, 64] first4: -0.381439 -1.387897 0.192344 0.071553 -[Debug] dit_step46_vt: [2170, 64] first4: -0.477449 -1.801432 0.493477 -0.266383 -[Debug] dit_step46_xt: [2170, 64] first4: 0.728173 1.900025 -0.196375 0.182703 -[DiT] step 47/50 t=0.080 -[Debug] dit_step47_vt_cond: [2170, 64] first4: -0.407120 -1.565253 0.302819 -0.051005 -[Debug] dit_step47_vt_uncond: [2170, 64] first4: -0.401163 -1.460867 0.230993 0.012604 -[Debug] dit_step47_vt: [2170, 64] first4: -0.369522 -1.829498 0.516088 -0.236691 -[Debug] dit_step47_xt: [2170, 64] first4: 0.735563 1.936615 -0.206697 0.187437 -[DiT] step 48/50 t=0.060 -[Debug] 
dit_step48_vt_cond: [2170, 64] first4: -0.382883 -1.607141 0.333374 -0.083609 -[Debug] dit_step48_vt_uncond: [2170, 64] first4: -0.407261 -1.531759 0.269964 -0.041764 -[Debug] dit_step48_vt: [2170, 64] first4: -0.250388 -1.766134 0.508857 -0.194591 -[Debug] dit_step48_xt: [2170, 64] first4: 0.740571 1.971938 -0.216874 0.191329 -[DiT] step 49/50 t=0.040 -[Debug] dit_step49_vt_cond: [2170, 64] first4: -0.416988 -1.643981 0.337042 -0.115695 -[Debug] dit_step49_vt_uncond: [2170, 64] first4: -0.434090 -1.549805 0.279877 -0.060649 -[Debug] dit_step49_vt: [2170, 64] first4: -0.398854 -1.970749 0.508508 -0.360412 -[Debug] dit_x0: [2170, 64] first4: 0.748548 2.011353 -0.227044 0.198537 -[DiT] step 50/50 t=0.020 -[DiT] Total generation: 99823.1 ms (99823.1 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.748548 2.011353 -0.227044 0.198537 -[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 474 nodes, T_latent=192 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' -ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x12060b7b0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x1206126e0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x120612940 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x120612fd0 | th_max = 1024 | th_width = 32 
-ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x120613410 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x120613a10 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32 0x120613f70 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x120614e50 | th_max = 1024 | th_width = 32 -[VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 474 nodes, T_latent=256 -[VAE] Graph: 474 nodes, T_latent=186 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x10600ac10 | th_max = 896 | th_width = 32 -[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 609612.5 ms -[Debug] vae_audio: [2, 4166400] first4: -0.003173 -0.003180 -0.003117 -0.002677 -[VAE Batch0] Wrote ggml-sft/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Request 1/1] Done -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Pipeline] All done -[Request] Loaded request0.json -[Noise] Reusing existing rng_philox_seed42.bf16 -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf -[GGML] Running acestep-v15-turbo-Q6_K.gguf... 
-[GGML] Done, 47 dump files -[Turbo] Reusing existing Python dumps: python-turbo -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999813 - lyric_embed 1.000000 - enc_hidden 0.999631 - detok_output 0.999969 - context 0.999982 - noise 1.000000 - temb_t 0.999991 - hidden_after_proj_in 0.999987 - enc_after_cond_emb 0.999648 - layer0_sa_output 0.999791 - hidden_after_layer0 0.999898 - hidden_after_layer6 0.999877 - hidden_after_layer12 0.998721 - hidden_after_layer18 0.995721 - hidden_after_layer23 0.992012 - dit_step0_vt 0.970006 - dit_step0_xt 0.999934 - dit_step1_vt 0.973568 - dit_step1_xt 0.999795 - dit_step2_vt 0.976942 - dit_step2_xt 0.999458 - dit_step3_vt 0.977714 - dit_step3_xt 0.998700 - dit_step4_vt 0.976433 - dit_step4_xt 0.997003 - dit_step5_vt 0.973498 - dit_step5_xt 0.993187 - dit_step6_vt 0.970259 - dit_step6_xt 0.985910 - dit_step7_vt 0.963169 - dit_x0 0.975098 - vae_audio 0.894235 - vae_audio (log spectral) 0.999805 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999933 0.149343 0.007343 -0.002290 0.972926 -0.002342 0.972003 - dit_step1_xt 0.999794 0.293469 0.012678 -0.005225 0.942670 -0.005313 0.941730 - dit_step2_xt 0.999456 0.477608 0.019544 -0.009177 0.909085 -0.009311 0.908527 - dit_step3_xt 0.998699 0.734476 0.028962 -0.014472 0.873547 -0.014577 0.873624 - dit_step4_xt 0.997001 1.052176 0.042099 -0.021523 0.841546 -0.021660 0.841995 - dit_step5_xt 0.993185 1.523836 0.061805 -0.031603 0.824654 -0.032109 0.824593 - dit_step6_xt 0.985908 2.172513 0.091680 -0.045910 0.855362 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q6_K.gguf -[GGML] Running acestep-v15-sft-Q6_K.gguf... 
-[GGML] Done, 233 dump files -[SFT] Reusing existing Python dumps: python-sft -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999813 - lyric_embed 1.000000 - enc_hidden 0.999631 - detok_output 0.999969 - context 0.999982 - noise 1.000000 - temb_t 0.999973 - hidden_after_proj_in 0.999987 - enc_after_cond_emb 0.999652 - layer0_sa_output 0.999803 - hidden_after_layer0 0.999920 - hidden_after_layer6 0.999785 - hidden_after_layer12 0.999350 - hidden_after_layer18 0.998528 - hidden_after_layer23 0.998828 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998729 - dit_step0_vt_uncond 0.998412 - dit_step0_vt 0.995061 - dit_step0_xt 0.999998 - dit_step5_vt_cond 0.999147 - dit_step5_vt 0.992746 - dit_step5_xt 0.999953 - dit_step10_vt_cond 0.997986 - dit_step10_vt 0.991731 - dit_step10_xt 0.999832 - dit_step15_vt_cond 0.995896 - dit_step15_vt 0.984377 - dit_step15_xt 0.999476 - dit_step20_vt_cond 0.992581 - dit_step20_vt 0.975428 - dit_step20_xt 0.998605 - dit_step25_vt_cond 0.986288 - dit_step25_vt 0.963191 - dit_step25_xt 0.996899 - dit_step30_vt_cond 0.979478 - dit_step30_vt 0.956129 - dit_step30_xt 0.994252 - dit_step35_vt_cond 0.972963 - dit_step35_vt 0.947489 - dit_step35_xt 0.991078 - dit_step40_vt_cond 0.968903 - dit_step40_vt 0.939482 - dit_step40_xt 0.987990 - dit_step45_vt_cond 0.973091 - dit_step45_vt 0.949768 - dit_step45_xt 0.985825 - dit_step49_vt_cond 0.979346 - dit_step49_vt 0.959720 - dit_x0 0.985104 - vae_audio 0.940564 - vae_audio (log spectral) 0.999648 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999996 0.039016 0.002154 -0.001750 0.980178 -0.001741 0.980402 - dit_step5_xt 0.999952 0.136674 0.006709 -0.006940 0.889822 -0.007143 0.887999 - dit_step10_xt 0.999831 0.203842 0.011045 -0.012357 0.811533 -0.012603 0.811299 - dit_step15_xt 0.999475 0.335757 0.017566 -0.017603 0.746439 -0.018114 0.745269 - dit_step20_xt 0.998605 0.555654 
0.026541 -0.022932 0.700822 -0.023808 0.699582 - dit_step25_xt 0.996899 0.830926 0.037973 -0.028358 0.679564 -0.029311 0.679278 - dit_step30_xt 0.994252 1.135793 0.051746 -0.033803 0.685565 -0.035027 0.685262 - dit_step35_xt 0.991078 1.467212 0.067373 -0.039173 0.717556 -0.040716 0.717196 - dit_step40_xt 0.987990 1.880554 0.084328 -0.044527 0.771174 -0.046462 0.771853 - dit_step45_xt 0.985824 2.238589 0.100473 -0.050335 0.842316 -0.052475 0.843036 diff --git a/tests/Metal_Q8_0.log b/tests/Metal_Q8_0.log deleted file mode 100644 index 3ddbb85..0000000 --- a/tests/Metal_Q8_0.log +++ /dev/null @@ -1,823 +0,0 @@ -ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices -ggml_metal_library_init: using embedded metal library -ggml_metal_library_init: loaded in 0.006 sec -ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s) -ggml_metal_device_init: GPU name: MTL0 -ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) -ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) -ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) -ggml_metal_device_init: simdgroup reduction = true -ggml_metal_device_init: simdgroup matrix mul. 
= true -ggml_metal_device_init: has unified memory = true -ggml_metal_device_init: has bfloat = true -ggml_metal_device_init: has tensor = false -ggml_metal_device_init: use residency sets = true -ggml_metal_device_init: use shared buffers = true -ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] DiT backend: MTL0 (CPU threads: 5) -[Load] Backend init: 21.3 ms -[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 -[DiT] MLP: gate+up fused -[Load] null_condition_emb found (CFG available) -[WeightCtx] Loaded 478 tensors, 1600.7 MB into backend -[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 1779.3 ms -[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 -[Load] silence_latent: [15000, 64] from GGUF -[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] VAE backend: MTL0 (CPU threads: 5) -[VAE] Backend: MTL0, Weight buffer: 255.7 MB -[VAE] Loaded: 5 blocks, upsample=1920x -[Load] VAE weights: 272.0 ms -[Request 1/1] ggml-turbo/request0.json (batch=1) -[Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) -[Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s -[Pipeline] 434 audio codes (86.8s @ 5Hz) -[Pipeline] T=2170, S=1085 -[BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 41.5 ms -[Pipeline] caption: 70 tokens, lyrics: 167 tokens 
-ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] TextEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 28L, H=1024, Nh=16/8 -[Load] TextEncoder: 228.7 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' -ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x14170b900 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x14170bd60 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x14170c5d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' -ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x14170ca50 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x14170d2c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded 
kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x14170d8f0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x14170e1d0 | th_max = 576 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x14170e5e0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' -ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x14170e840 | th_max = 1024 | th_width = 32 -[Encode] TextEncoder (70 tokens): 44.9 ms -[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 34.4 ms -[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] CondEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 -[WeightCtx] Loaded 140 tensors, 616.6 MB into backend -[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 787.4 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q8_0_f32', name = 
'kernel_mul_mm_q8_0_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q8_0_f32_bci=0_bco=1 0x141718bd0 | th_max = 896 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x141719010 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x141719670 | th_max = 640 | th_width = 32 -[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 132.0 ms, enc_S=238 -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Debug] enc_hidden: [238, 2048] first4: 1.750375 -0.049236 -0.134516 0.059443 -[GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 -[WeightCtx] Loaded 30 tensors, 106.5 MB into backend -[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 165.9 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x14160b720 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x14160c030 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q8_0_f32', name = 
'kernel_mul_mv_q8_0_f32_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q8_0_f32_nsg=4 0x14160c760 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' -ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x14160c9c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_q8_0_f32', name = 'kernel_cpy_q8_0_f32' -ggml_metal_library_compile_pipeline: loaded kernel_cpy_q8_0_f32 0x14160ce00 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q8_0_f32_r1_5', name = 'kernel_mul_mv_ext_q8_0_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q8_0_f32_r1_5_nsg=2_nxpsg=8 0x14160da60 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x14160dde0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x14160e040 | th_max = 448 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x14160ea80 | th_max = 1024 | th_width = 32 -[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 664.8 ms -[Debug] detok_output: 
[2170, 64] first4: -0.124953 1.437660 0.307949 -0.624704 -[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 -[Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 -[Debug] context: [2170, 128] first4: -0.124953 1.437660 0.307949 -0.624704 -[DiT] Starting: T=2170, S=1085, enc_S=238, steps=8, batch=1 -[DiT] Batch N=1, T=2170, S=1085, enc_S=238 -[DiT] Graph: 1841 nodes -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x14160f030 | th_max = 832 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' -ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x14160aa00 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' -ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x14160f9a0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x1416102f0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x1416109b0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x1416113c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 
'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x1416118b0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x141610670 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x141611dc0 | th_max = 640 | th_width = 32 -[Debug] tproj: [12288] first4: 0.260124 -0.161873 -0.097043 0.052039 -[Debug] temb: [2048] first4: 0.000130 -0.132501 -0.035452 0.064788 -[Debug] temb_t: [2048] first4: 0.001146 0.026826 -0.052770 0.063722 -[Debug] temb_r: [2048] first4: -0.001015 -0.159327 0.017318 0.001066 -[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 -[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 -[Debug] temb_lin1_t: [2048] first4: -0.048950 -0.051683 -0.015299 -0.038721 -[Debug] temb_lin1_r: [2048] first4: -0.013066 -0.018836 -0.015732 0.008463 -[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.039670 -0.968864 0.535370 0.447502 -[Debug] proj_in_input: [192, 2170] first4: -0.124953 1.437660 0.307949 -0.624704 -[Debug] enc_after_cond_emb: [2048, 238] first4: -0.166626 0.855863 0.327675 -0.524847 -[Debug] layer0_sa_input: [2048, 1085] first4: -0.719151 -0.764016 -0.047420 0.261850 -[Debug] layer0_q_after_rope: [128, 16] first4: -0.166626 0.855863 0.327675 -0.524847 -[Debug] layer0_k_after_rope: [128, 8] first4: -1.224136 -0.868663 -1.921617 -2.258156 -[Debug] layer0_sa_output: [2048, 1085] 
first4: -1.509784 0.173032 -0.350482 0.513236 -[Debug] layer0_attn_out: [2048, 1085] first4: -1.198180 -0.062361 -34.349155 -0.672093 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.543300 -1.045637 0.193163 0.457042 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.610051 -0.820831 -0.300355 0.492100 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.086482 0.559607 52.406876 -0.905369 -[Debug] hidden_after_layer6: [2048, 1085] first4: -21.433996 -0.091888 33.781910 -4.433238 -[Debug] hidden_after_layer12: [2048, 1085] first4: -15.201079 -18.070684 72.561172 28.713606 -[Debug] hidden_after_layer18: [2048, 1085] first4: -26.474438 14.961594 62.515419 20.237282 -[Debug] hidden_after_layer23: [2048, 1085] first4: -7.978052 44.256046 198.826355 145.129532 -[Debug] dit_step0_vt: [2170, 64] first4: 0.022187 1.144711 0.357881 2.375370 -[Debug] dit_step0_xt: [2170, 64] first4: 0.193327 2.104218 -0.188142 0.739685 -[DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: -0.218329 1.318116 -0.102946 1.902612 -[Debug] dit_step1_xt: [2170, 64] first4: 0.205236 2.032320 -0.182527 0.635906 -[DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: 0.013556 1.214101 0.135533 2.387155 -[Debug] dit_step2_xt: [2170, 64] first4: 0.204333 1.951380 -0.191563 0.476762 -[DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: 0.268002 1.073703 0.267121 2.643928 -[Debug] dit_step3_xt: [2170, 64] first4: 0.181999 1.861905 -0.213823 0.256435 -[DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.304715 1.021377 0.118111 2.720495 -[Debug] dit_step4_xt: [2170, 64] first4: 0.149351 1.752472 -0.226477 -0.035047 -[DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.279773 0.924189 -0.283976 2.780081 -[Debug] dit_step5_xt: [2170, 64] first4: 0.109384 1.620445 -0.185910 -0.432201 -[DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: 0.163348 0.641980 -0.841978 2.816087 -[Debug] dit_step6_xt: [2170, 64] first4: 
0.076714 1.492049 -0.017514 -0.995418 -[DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: -0.026257 0.197844 -1.519455 3.080479 -[Debug] dit_x0: [2170, 64] first4: 0.084591 1.432696 0.438323 -1.919562 -[DiT] step 8/8 t=0.300 -[DiT] Total generation: 7097.2 ms (7097.2 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.084591 1.432696 0.438323 -1.919562 -[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 474 nodes, T_latent=192 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' -ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x141717870 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x1417100a0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x1417089d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x141708e50 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x141719b10 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x14171a1d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling 
pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32 0x14171a730 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x14171b660 | th_max = 1024 | th_width = 32 -[VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 474 nodes, T_latent=256 -[VAE] Graph: 474 nodes, T_latent=186 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x141618f80 | th_max = 896 | th_width = 32 -[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 609553.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000462 0.000971 0.000803 0.001170 -[VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Request 1/1] Done -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Pipeline] All done -ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices -ggml_metal_library_init: using embedded metal library -ggml_metal_library_init: loaded in 0.006 sec -ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s) -ggml_metal_device_init: GPU name: MTL0 -ggml_metal_device_init: GPU family: MTLGPUFamilyApple8 (1008) -ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003) -ggml_metal_device_init: GPU family: MTLGPUFamilyMetal3 (5001) -ggml_metal_device_init: simdgroup reduction = true -ggml_metal_device_init: simdgroup matrix mul. 
= true -ggml_metal_device_init: has unified memory = true -ggml_metal_device_init: has bfloat = true -ggml_metal_device_init: has tensor = false -ggml_metal_device_init: use residency sets = true -ggml_metal_device_init: use shared buffers = true -ggml_metal_device_init: recommendedMaxWorkingSetSize = 11453.25 MB -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] DiT backend: MTL0 (CPU threads: 5) -[Load] Backend init: 20.2 ms -[GGUF] ../models/acestep-v15-sft-Q8_0.gguf: 678 tensors, data at offset 56800 -[DiT] MLP: gate+up fused -[Load] null_condition_emb found (CFG available) -[WeightCtx] Loaded 478 tensors, 1600.7 MB into backend -[Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 2506.1 ms -[GGUF] ../models/acestep-v15-sft-Q8_0.gguf: 678 tensors, data at offset 56800 -[Load] silence_latent: [15000, 64] from GGUF -[GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] VAE backend: MTL0 (CPU threads: 5) -[VAE] Backend: MTL0, Weight buffer: 255.7 MB -[VAE] Loaded: 5 blocks, upsample=1920x -[Load] VAE weights: 340.1 ms -[Request 1/1] ggml-sft/request0.json (batch=1) -[Request] parsed ggml-sft/request0.json (18 fields) -[Pipeline] seed=42, steps=50, guidance=7.0, shift=1.0, duration=88.0s -[Pipeline] 434 audio codes (86.8s @ 5Hz) -[Pipeline] T=2170, S=1085 -[BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 40.9 ms -[Pipeline] caption: 70 tokens, lyrics: 167 tokens -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: 
picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] TextEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 28L, H=1024, Nh=16/8 -[Load] TextEncoder: 238.6 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_get_rows_bf16', name = 'kernel_get_rows_bf16' -ggml_metal_library_compile_pipeline: loaded kernel_get_rows_bf16 0x13060e0d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rms_norm_mul_f32_4', name = 'kernel_rms_norm_mul_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_rms_norm_mul_f32_4 0x13060e830 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_bf16_f32', name = 'kernel_mul_mm_bf16_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_bf16_f32_bci=0_bco=1 0x13060f670 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_rope_neox_f32', name = 'kernel_rope_neox_f32_imrope=0' -ggml_metal_library_compile_pipeline: loaded kernel_rope_neox_f32_imrope=0 0x13060f8d0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=1_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=1_ncpsg=64 0x130610350 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_blk', name = 'kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_blk_nqptg=8_ncpsg=64 0x1306108b0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: 
compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=1_sinks=0_bias=0_scap=0_kvpad=1_bcm=1_ns10=1024_ns20=1024_nsg=4 0x130610b10 | th_max = 576 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=0 0x1306115a0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_swiglu_f32', name = 'kernel_swiglu_f32' -ggml_metal_library_compile_pipeline: loaded kernel_swiglu_f32 0x130611800 | th_max = 1024 | th_width = 32 -[Encode] TextEncoder (70 tokens): 49.2 ms -[Debug] text_hidden: [70, 1024] first4: 3.703506 2.446818 0.222721 -13.133463 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 34.1 ms -[Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -ggml_metal_init: allocating -ggml_metal_init: found device: Apple M2 Pro -ggml_metal_init: picking default device: Apple M2 Pro -ggml_metal_init: use fusion = true -ggml_metal_init: use concurrency = true -ggml_metal_init: use graph optimize = true -[Load] CondEncoder backend: MTL0 (CPU threads: 5) -[GGUF] ../models/acestep-v15-sft-Q8_0.gguf: 678 tensors, data at offset 56800 -[WeightCtx] Loaded 140 tensors, 616.6 MB into backend -[Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 615.4 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_q8_0_f32', name = 'kernel_mul_mm_q8_0_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_q8_0_f32_bci=0_bco=1 0x130709710 | th_max = 
896 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=64' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=64 0x130709b90 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=1024_ns20=1024_nsg=4 0x13070a1c0 | th_max = 640 | th_width = 32 -[Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 129.6 ms, enc_S=238 -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Debug] enc_hidden: [238, 2048] first4: 1.750375 -0.049236 -0.134516 0.059443 -[GGUF] ../models/acestep-v15-sft-Q8_0.gguf: 678 tensors, data at offset 56800 -[WeightCtx] Loaded 30 tensors, 106.5 MB into backend -[Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 77.5 ms -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_bf16_f32_short', name = 'kernel_mul_mv_bf16_f32_short_nsg=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_bf16_f32_short_nsg=1 0x130708890 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=0_nf=1_rb=1 0x13070ab80 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_q8_0_f32', name = 'kernel_mul_mv_q8_0_f32_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_q8_0_f32_nsg=4 0x13070ade0 | th_max = 1024 | th_width = 32 
-ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_repeat_f32', name = 'kernel_repeat_f32' -ggml_metal_library_compile_pipeline: loaded kernel_repeat_f32 0x13070b260 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_q8_0_f32', name = 'kernel_cpy_q8_0_f32' -ggml_metal_library_compile_pipeline: loaded kernel_cpy_q8_0_f32 0x13070b970 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_ext_q8_0_f32_r1_5', name = 'kernel_mul_mv_ext_q8_0_f32_r1_5_nsg=2_nxpsg=8' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_ext_q8_0_f32_r1_5_nsg=2_nxpsg=8 0x13070c5e0 | th_max = 640 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_pad', name = 'kernel_flash_attn_ext_pad_mask=0_ncpsg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_pad_mask=0_ncpsg=32 0x13070c840 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_f32_dk128_dv128', name = 'kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_f32_dk128_dv128_mask=0_sink=0_bias=0_scap=0_kvpad=1_ns10=1024_ns20=1024_nsg=1_nwg=32 0x13070cc50 | th_max = 448 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_vec_reduce', name = 'kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_vec_reduce_dv=128_nwg=32 0x13070d590 | th_max = 1024 | th_width = 32 -[Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 663.7 ms -[Debug] detok_output: [2170, 64] first4: -0.124953 1.437660 0.307949 -0.624704 -[Context] loaded noise from rng_philox_seed42.bf16: [2170, 64] bf16 -[Debug] noise: [2170, 
64] first4: 0.194336 2.156250 -0.171875 0.847656 -[Debug] context: [2170, 128] first4: -0.124953 1.437660 0.307949 -0.624704 -[DiT] Starting: T=2170, S=1085, enc_S=238, steps=50, batch=1 -[DiT] Batch N=1, T=2170, S=1085, enc_S=238 -[DiT] Graph: 1841 nodes -[Debug] null_condition_emb: [2048] first4: 0.018066 -0.000360 0.005096 -0.000683 -[Debug] null_enc_hidden: [238, 2048] first4: 0.018066 -0.000360 0.005096 -0.000683 -[DiT] CFG enabled: guidance_scale=7.0, 2x forward per step, N=1 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f32_f32', name = 'kernel_mul_mm_f32_f32_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f32_f32_bci=0_bco=1 0x130612120 | th_max = 832 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_scale_f32', name = 'kernel_scale_f32' -ggml_metal_library_compile_pipeline: loaded kernel_scale_f32 0x13061a740 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_timestep_embedding_f32', name = 'kernel_timestep_embedding_f32' -ggml_metal_library_compile_pipeline: loaded kernel_timestep_embedding_f32 0x13061af30 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_silu_f32_4', name = 'kernel_silu_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_silu_f32_4 0x13061b880 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=1_nf=1_rb=1 0x13061bf40 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=1 0x13061c950 | th_max = 1024 | th_width = 32 
-ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32_4', name = 'kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_4_op=2_nf=1_rb=0 0x13061ce40 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_cpy_f32_f32', name = 'kernel_cpy_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_cpy_f32_f32 0x13061bc00 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_flash_attn_ext_f32_dk128_dv128', name = 'kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_flash_attn_ext_f32_dk128_dv128_mask=0_sinks=0_bias=0_scap=0_kvpad=1_bcm=0_ns10=128_ns20=1024_nsg=4 0x13061d350 | th_max = 640 | th_width = 32 -[Debug] tproj: [12288] first4: 0.154891 -0.116111 -0.086457 0.081949 -[Debug] temb: [2048] first4: -0.002756 -0.176432 0.004178 -0.001982 -[Debug] temb_t: [2048] first4: -0.001185 0.003330 -0.013113 -0.002073 -[Debug] temb_r: [2048] first4: -0.001571 -0.179762 0.017291 0.000091 -[Debug] sinusoidal_t: [256] first4: 0.562407 0.789701 0.439822 -0.023583 -[Debug] sinusoidal_r: [256] first4: 1.000000 1.000000 1.000000 1.000000 -[Debug] temb_lin1_t: [2048] first4: -0.038370 0.029875 0.028026 -0.024772 -[Debug] temb_lin1_r: [2048] first4: 0.001863 0.003353 -0.000552 -0.000197 -[Debug] hidden_after_proj_in: [2048, 1085] first4: -1.090297 -0.925707 0.497575 0.441158 -[Debug] proj_in_input: [192, 2170] first4: -0.124953 1.437660 0.307949 -0.624704 -[Debug] enc_after_cond_emb: [2048, 238] first4: -0.185352 0.911478 0.323017 -0.548477 -[Debug] layer0_sa_input: [2048, 1085] first4: -0.915448 -0.716620 -0.037252 0.294421 -[Debug] layer0_q_after_rope: [128, 16] first4: -0.185352 0.911478 0.323017 -0.548477 -[Debug] layer0_k_after_rope: [128, 8] first4: -1.352387 -0.993045 
-1.790654 -2.255961 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.646956 0.807681 -0.548860 0.541048 -[Debug] layer0_attn_out: [2048, 1085] first4: -1.095906 0.020644 -33.592499 -0.642108 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.803416 -1.383010 -0.122776 0.380531 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.967658 -1.044807 -0.403243 0.382801 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.409370 1.156843 57.423218 -1.617135 -[Debug] hidden_after_layer6: [2048, 1085] first4: -17.478519 4.365310 57.869217 -2.590640 -[Debug] hidden_after_layer12: [2048, 1085] first4: -10.105301 4.450487 -23.010748 -1.911694 -[Debug] hidden_after_layer18: [2048, 1085] first4: -3.554647 18.982800 -38.417198 -11.175929 -[Debug] hidden_after_layer23: [2048, 1085] first4: 34.303474 64.211922 58.212040 -9.279413 -[Debug] dit_step0_vt_cond: [2170, 64] first4: -0.583175 2.543502 -0.173930 1.461996 -[Debug] dit_step0_vt_uncond: [2170, 64] first4: -0.311259 2.094594 -0.691959 1.747500 -[Debug] dit_step0_vt: [2170, 64] first4: -0.765032 3.014946 0.101383 1.079777 -[Debug] dit_step0_xt: [2170, 64] first4: 0.209637 2.095951 -0.173903 0.826061 -[DiT] step 1/50 t=1.000 -[Debug] dit_step1_vt_cond: [2170, 64] first4: -0.635693 2.541408 0.005301 1.299802 -[Debug] dit_step1_vt_uncond: [2170, 64] first4: -0.555907 2.342016 -0.102693 1.478369 -[Debug] dit_step1_vt: [2170, 64] first4: -0.416332 2.156285 -0.189682 1.386406 -[Debug] dit_step1_xt: [2170, 64] first4: 0.217963 2.052825 -0.170109 0.798333 -[DiT] step 2/50 t=0.980 -[Debug] dit_step2_vt_cond: [2170, 64] first4: -0.628899 2.502596 0.041825 1.309368 -[Debug] dit_step2_vt_uncond: [2170, 64] first4: -0.545262 2.302532 -0.090023 1.423282 -[Debug] dit_step2_vt: [2170, 64] first4: -0.777333 2.936136 0.278430 1.015660 -[Debug] dit_step2_xt: [2170, 64] first4: 0.233510 1.994103 -0.175678 0.778019 -[DiT] step 3/50 t=0.960 -[Debug] dit_step3_vt_cond: [2170, 64] first4: -0.587814 2.422845 0.053595 1.321287 -[Debug] 
dit_step3_vt_uncond: [2170, 64] first4: -0.577692 2.332772 -0.037127 1.357753 -[Debug] dit_step3_vt: [2170, 64] first4: -0.357759 2.100169 -0.131042 1.500563 -[Debug] dit_step3_xt: [2170, 64] first4: 0.240665 1.952099 -0.173057 0.748008 -[DiT] step 4/50 t=0.940 -[Debug] dit_step4_vt_cond: [2170, 64] first4: -0.531520 2.346682 0.048538 1.319634 -[Debug] dit_step4_vt_uncond: [2170, 64] first4: -0.572059 2.299766 -0.047866 1.336576 -[Debug] dit_step4_vt: [2170, 64] first4: -0.562290 2.601383 0.287872 1.079526 -[Debug] dit_step4_xt: [2170, 64] first4: 0.251911 1.900071 -0.178814 0.726417 -[DiT] step 5/50 t=0.920 -[Debug] dit_step5_vt_cond: [2170, 64] first4: -0.471980 2.275284 0.016048 1.313642 -[Debug] dit_step5_vt_uncond: [2170, 64] first4: -0.581892 2.263602 -0.014694 1.345006 -[Debug] dit_step5_vt: [2170, 64] first4: -0.118539 1.956480 -0.197979 1.364062 -[Debug] dit_step5_xt: [2170, 64] first4: 0.254282 1.860942 -0.174855 0.699136 -[DiT] step 6/50 t=0.900 -[Debug] dit_step6_vt_cond: [2170, 64] first4: -0.438105 2.231154 0.001016 1.310053 -[Debug] dit_step6_vt_uncond: [2170, 64] first4: -0.544084 2.196794 -0.016910 1.357073 -[Debug] dit_step6_vt: [2170, 64] first4: -0.411409 2.467071 0.199489 1.062841 -[Debug] dit_step6_xt: [2170, 64] first4: 0.262510 1.811600 -0.178844 0.677879 -[DiT] step 7/50 t=0.880 -[Debug] dit_step7_vt_cond: [2170, 64] first4: -0.388773 2.150816 -0.023557 1.299900 -[Debug] dit_step7_vt_uncond: [2170, 64] first4: -0.488913 2.112037 -0.027399 1.344294 -[Debug] dit_step7_vt: [2170, 64] first4: -0.088803 1.961096 -0.200603 1.289382 -[Debug] dit_step7_xt: [2170, 64] first4: 0.264286 1.772379 -0.174832 0.652092 -[DiT] step 8/50 t=0.860 -[Debug] dit_step8_vt_cond: [2170, 64] first4: -0.350721 2.065278 -0.035497 1.282128 -[Debug] dit_step8_vt_uncond: [2170, 64] first4: -0.410401 2.018277 -0.041600 1.310604 -[Debug] dit_step8_vt: [2170, 64] first4: -0.374781 2.238836 0.127401 1.108719 -[Debug] dit_step8_xt: [2170, 64] first4: 0.271781 1.727602 
-0.177380 0.629917 -[DiT] step 9/50 t=0.840 -[Debug] dit_step9_vt_cond: [2170, 64] first4: -0.318524 1.978134 -0.051657 1.264737 -[Debug] dit_step9_vt_uncond: [2170, 64] first4: -0.341550 1.941071 -0.051996 1.283970 -[Debug] dit_step9_vt: [2170, 64] first4: -0.191845 1.828466 -0.186632 1.247382 -[Debug] dit_step9_xt: [2170, 64] first4: 0.275618 1.691033 -0.173648 0.604970 -[DiT] step 10/50 t=0.820 -[Debug] dit_step10_vt_cond: [2170, 64] first4: -0.295512 1.899216 -0.053004 1.254855 -[Debug] dit_step10_vt_uncond: [2170, 64] first4: -0.287151 1.875865 -0.046272 1.283265 -[Debug] dit_step10_vt: [2170, 64] first4: -0.389814 1.941252 0.040587 1.075441 -[Debug] dit_step10_xt: [2170, 64] first4: 0.283415 1.652207 -0.174460 0.583461 -[DiT] step 11/50 t=0.800 -[Debug] dit_step11_vt_cond: [2170, 64] first4: -0.275416 1.819571 -0.061343 1.243315 -[Debug] dit_step11_vt_uncond: [2170, 64] first4: -0.256136 1.821522 -0.037205 1.292460 -[Debug] dit_step11_vt: [2170, 64] first4: -0.238914 1.604097 -0.208856 1.109252 -[Debug] dit_step11_xt: [2170, 64] first4: 0.288193 1.620125 -0.170282 0.561276 -[DiT] step 12/50 t=0.780 -[Debug] dit_step12_vt_cond: [2170, 64] first4: -0.256205 1.750170 -0.060178 1.242222 -[Debug] dit_step12_vt_uncond: [2170, 64] first4: -0.227482 1.778021 -0.026911 1.317358 -[Debug] dit_step12_vt: [2170, 64] first4: -0.364171 1.624805 -0.047159 0.969305 -[Debug] dit_step12_xt: [2170, 64] first4: 0.295476 1.587629 -0.169339 0.541890 -[DiT] step 13/50 t=0.760 -[Debug] dit_step13_vt_cond: [2170, 64] first4: -0.235268 1.684496 -0.057297 1.240694 -[Debug] dit_step13_vt_uncond: [2170, 64] first4: -0.197700 1.726529 -0.035090 1.325942 -[Debug] dit_step13_vt: [2170, 64] first4: -0.273492 1.425781 -0.144172 1.021587 -[Debug] dit_step13_xt: [2170, 64] first4: 0.300946 1.559114 -0.166456 0.521458 -[DiT] step 14/50 t=0.740 -[Debug] dit_step14_vt_cond: [2170, 64] first4: -0.215348 1.630674 -0.040594 1.249328 -[Debug] dit_step14_vt_uncond: [2170, 64] first4: -0.174794 1.672403 
-0.046957 1.327406 -[Debug] dit_step14_vt: [2170, 64] first4: -0.334811 1.472660 0.055936 0.996336 -[Debug] dit_step14_xt: [2170, 64] first4: 0.307642 1.529660 -0.167575 0.501531 -[DiT] step 15/50 t=0.720 -[Debug] dit_step15_vt_cond: [2170, 64] first4: -0.197163 1.578918 -0.035665 1.254164 -[Debug] dit_step15_vt_uncond: [2170, 64] first4: -0.153038 1.617873 -0.066125 1.317649 -[Debug] dit_step15_vt: [2170, 64] first4: -0.277163 1.353290 0.013043 1.079428 -[Debug] dit_step15_xt: [2170, 64] first4: 0.313186 1.502595 -0.167835 0.479943 -[DiT] step 16/50 t=0.700 -[Debug] dit_step16_vt_cond: [2170, 64] first4: -0.178978 1.530298 -0.037333 1.260345 -[Debug] dit_step16_vt_uncond: [2170, 64] first4: -0.134279 1.564621 -0.087480 1.303262 -[Debug] dit_step16_vt: [2170, 64] first4: -0.306646 1.373590 0.150592 1.110865 -[Debug] dit_step16_xt: [2170, 64] first4: 0.319319 1.475123 -0.170847 0.457726 -[DiT] step 17/50 t=0.680 -[Debug] dit_step17_vt_cond: [2170, 64] first4: -0.160694 1.478322 -0.055287 1.256185 -[Debug] dit_step17_vt_uncond: [2170, 64] first4: -0.120116 1.510010 -0.115773 1.278657 -[Debug] dit_step17_vt: [2170, 64] first4: -0.246079 1.282344 0.092105 1.178406 -[Debug] dit_step17_xt: [2170, 64] first4: 0.324240 1.449476 -0.172689 0.434157 -[DiT] step 18/50 t=0.660 -[Debug] dit_step18_vt_cond: [2170, 64] first4: -0.143064 1.420482 -0.082808 1.247799 -[Debug] dit_step18_vt_uncond: [2170, 64] first4: -0.106988 1.460190 -0.151634 1.257033 -[Debug] dit_step18_vt: [2170, 64] first4: -0.255780 1.219606 0.175261 1.179049 -[Debug] dit_step18_xt: [2170, 64] first4: 0.329356 1.425084 -0.176195 0.410576 -[DiT] step 19/50 t=0.640 -[Debug] dit_step19_vt_cond: [2170, 64] first4: -0.128746 1.354206 -0.117595 1.227311 -[Debug] dit_step19_vt_uncond: [2170, 64] first4: -0.089307 1.410491 -0.191474 1.230714 -[Debug] dit_step19_vt: [2170, 64] first4: -0.249608 1.072434 0.109807 1.183548 -[Debug] dit_step19_xt: [2170, 64] first4: 0.334348 1.403635 -0.178391 0.386906 -[DiT] step 20/50 
t=0.620 -[Debug] dit_step20_vt_cond: [2170, 64] first4: -0.119003 1.272067 -0.154096 1.204746 -[Debug] dit_step20_vt_uncond: [2170, 64] first4: -0.074174 1.352716 -0.224701 1.203207 -[Debug] dit_step20_vt: [2170, 64] first4: -0.270581 0.922657 0.097471 1.173580 -[Debug] dit_step20_xt: [2170, 64] first4: 0.339760 1.385182 -0.180340 0.363434 -[DiT] step 21/50 t=0.600 -[Debug] dit_step21_vt_cond: [2170, 64] first4: -0.119830 1.180927 -0.187708 1.173581 -[Debug] dit_step21_vt_uncond: [2170, 64] first4: -0.067616 1.288429 -0.252699 1.168760 -[Debug] dit_step21_vt: [2170, 64] first4: -0.293490 0.743300 0.018199 1.149608 -[Debug] dit_step21_xt: [2170, 64] first4: 0.345629 1.370316 -0.180704 0.340442 -[DiT] step 22/50 t=0.580 -[Debug] dit_step22_vt_cond: [2170, 64] first4: -0.122855 1.082593 -0.215883 1.140040 -[Debug] dit_step22_vt_uncond: [2170, 64] first4: -0.066099 1.222055 -0.270495 1.136248 -[Debug] dit_step22_vt: [2170, 64] first4: -0.307841 0.532235 -0.036651 1.101102 -[Debug] dit_step22_xt: [2170, 64] first4: 0.351786 1.359671 -0.179971 0.318420 -[DiT] step 23/50 t=0.560 -[Debug] dit_step23_vt_cond: [2170, 64] first4: -0.132629 0.986949 -0.241285 1.111620 -[Debug] dit_step23_vt_uncond: [2170, 64] first4: -0.070734 1.155219 -0.283975 1.102751 -[Debug] dit_step23_vt: [2170, 64] first4: -0.338930 0.377693 -0.112170 1.114765 -[Debug] dit_step23_xt: [2170, 64] first4: 0.358565 1.352118 -0.177728 0.296124 -[DiT] step 24/50 t=0.540 -[Debug] dit_step24_vt_cond: [2170, 64] first4: -0.144419 0.884916 -0.261786 1.077981 -[Debug] dit_step24_vt_uncond: [2170, 64] first4: -0.082416 1.084641 -0.291092 1.068368 -[Debug] dit_step24_vt: [2170, 64] first4: -0.333728 0.169552 -0.174635 1.048605 -[Debug] dit_step24_xt: [2170, 64] first4: 0.365239 1.348727 -0.174235 0.275152 -[DiT] step 25/50 t=0.520 -[Debug] dit_step25_vt_cond: [2170, 64] first4: -0.162952 0.778074 -0.275456 1.049087 -[Debug] dit_step25_vt_uncond: [2170, 64] first4: -0.097955 1.005611 -0.292343 1.037690 -[Debug] 
dit_step25_vt: [2170, 64] first4: -0.381506 0.021539 -0.229765 1.053622 -[Debug] dit_step25_xt: [2170, 64] first4: 0.372869 1.348296 -0.169640 0.254080 -[DiT] step 26/50 t=0.500 -[Debug] dit_step26_vt_cond: [2170, 64] first4: -0.181652 0.659926 -0.286432 1.013943 -[Debug] dit_step26_vt_uncond: [2170, 64] first4: -0.115043 0.914795 -0.293755 1.005103 -[Debug] dit_step26_vt: [2170, 64] first4: -0.384189 -0.243706 -0.263819 0.974856 -[Debug] dit_step26_xt: [2170, 64] first4: 0.380553 1.353170 -0.164363 0.234583 -[DiT] step 27/50 t=0.480 -[Debug] dit_step27_vt_cond: [2170, 64] first4: -0.201740 0.544023 -0.293109 0.975384 -[Debug] dit_step27_vt_uncond: [2170, 64] first4: -0.133396 0.820908 -0.294792 0.973169 -[Debug] dit_step27_vt: [2170, 64] first4: -0.422354 -0.384602 -0.284394 0.929557 -[Debug] dit_step27_xt: [2170, 64] first4: 0.389000 1.360862 -0.158675 0.215992 -[DiT] step 28/50 t=0.460 -[Debug] dit_step28_vt_cond: [2170, 64] first4: -0.224075 0.417509 -0.297748 0.929072 -[Debug] dit_step28_vt_uncond: [2170, 64] first4: -0.152221 0.713812 -0.298235 0.936752 -[Debug] dit_step28_vt: [2170, 64] first4: -0.444730 -0.577905 -0.283219 0.830288 -[Debug] dit_step28_xt: [2170, 64] first4: 0.397895 1.372420 -0.153011 0.199386 -[DiT] step 29/50 t=0.440 -[Debug] dit_step29_vt_cond: [2170, 64] first4: -0.245692 0.290209 -0.302133 0.880189 -[Debug] dit_step29_vt_uncond: [2170, 64] first4: -0.168230 0.597160 -0.303812 0.900573 -[Debug] dit_step29_vt: [2170, 64] first4: -0.492967 -0.691559 -0.283257 0.750519 -[Debug] dit_step29_xt: [2170, 64] first4: 0.407754 1.386251 -0.147346 0.184375 -[DiT] step 30/50 t=0.420 -[Debug] dit_step30_vt_cond: [2170, 64] first4: -0.267059 0.157186 -0.303854 0.831628 -[Debug] dit_step30_vt_uncond: [2170, 64] first4: -0.182529 0.468759 -0.310389 0.867764 -[Debug] dit_step30_vt: [2170, 64] first4: -0.526538 -0.832075 -0.259881 0.638692 -[Debug] dit_step30_xt: [2170, 64] first4: 0.418285 1.402893 -0.142148 0.171602 -[DiT] step 31/50 t=0.400 -[Debug] 
dit_step31_vt_cond: [2170, 64] first4: -0.287749 0.019218 -0.305370 0.782136 -[Debug] dit_step31_vt_uncond: [2170, 64] first4: -0.194848 0.329909 -0.318059 0.834346 -[Debug] dit_step31_vt: [2170, 64] first4: -0.575041 -0.931991 -0.245392 0.549215 -[Debug] dit_step31_xt: [2170, 64] first4: 0.429786 1.421533 -0.137240 0.160617 -[DiT] step 32/50 t=0.380 -[Debug] dit_step32_vt_cond: [2170, 64] first4: -0.304426 -0.117694 -0.305028 0.730304 -[Debug] dit_step32_vt_uncond: [2170, 64] first4: -0.204242 0.187789 -0.324302 0.800084 -[Debug] dit_step32_vt: [2170, 64] first4: -0.600474 -1.036970 -0.221470 0.433362 -[Debug] dit_step32_xt: [2170, 64] first4: 0.441795 1.442272 -0.132811 0.151950 -[DiT] step 33/50 t=0.360 -[Debug] dit_step33_vt_cond: [2170, 64] first4: -0.319238 -0.254083 -0.299488 0.682753 -[Debug] dit_step33_vt_uncond: [2170, 64] first4: -0.210244 0.042433 -0.326321 0.765284 -[Debug] dit_step33_vt: [2170, 64] first4: -0.649450 -1.121861 -0.194288 0.366186 -[Debug] dit_step33_xt: [2170, 64] first4: 0.454784 1.464709 -0.128925 0.144626 -[DiT] step 34/50 t=0.340 -[Debug] dit_step34_vt_cond: [2170, 64] first4: -0.329936 -0.381831 -0.294650 0.641700 -[Debug] dit_step34_vt_uncond: [2170, 64] first4: -0.211479 -0.098170 -0.329415 0.733958 -[Debug] dit_step34_vt: [2170, 64] first4: -0.683049 -1.194662 -0.165597 0.286680 -[Debug] dit_step34_xt: [2170, 64] first4: 0.468445 1.488602 -0.125613 0.138893 -[DiT] step 35/50 t=0.320 -[Debug] dit_step35_vt_cond: [2170, 64] first4: -0.339163 -0.510193 -0.285358 0.597418 -[Debug] dit_step35_vt_uncond: [2170, 64] first4: -0.213347 -0.237217 -0.326480 0.696784 -[Debug] dit_step35_vt: [2170, 64] first4: -0.716524 -1.285484 -0.142755 0.230145 -[Debug] dit_step35_xt: [2170, 64] first4: 0.482776 1.514312 -0.122758 0.134290 -[DiT] step 36/50 t=0.300 -[Debug] dit_step36_vt_cond: [2170, 64] first4: -0.344282 -0.639077 -0.274660 0.557109 -[Debug] dit_step36_vt_uncond: [2170, 64] first4: -0.213018 -0.378000 -0.321460 0.659633 -[Debug] 
dit_step36_vt: [2170, 64] first4: -0.737407 -1.359316 -0.114364 0.177555 -[Debug] dit_step36_xt: [2170, 64] first4: 0.497524 1.541498 -0.120471 0.130739 -[DiT] step 37/50 t=0.280 -[Debug] dit_step37_vt_cond: [2170, 64] first4: -0.350530 -0.754109 -0.251615 0.515208 -[Debug] dit_step37_vt_uncond: [2170, 64] first4: -0.215775 -0.505277 -0.306489 0.618527 -[Debug] dit_step37_vt: [2170, 64] first4: -0.758482 -1.436458 -0.064580 0.134758 -[Debug] dit_step37_xt: [2170, 64] first4: 0.512694 1.570228 -0.119179 0.128044 -[DiT] step 38/50 t=0.260 -[Debug] dit_step38_vt_cond: [2170, 64] first4: -0.351878 -0.862931 -0.225801 0.473748 -[Debug] dit_step38_vt_uncond: [2170, 64] first4: -0.216566 -0.629070 -0.288643 0.572433 -[Debug] dit_step38_vt: [2170, 64] first4: -0.762284 -1.485620 -0.020081 0.114390 -[Debug] dit_step38_xt: [2170, 64] first4: 0.527939 1.599940 -0.118778 0.125756 -[DiT] step 39/50 t=0.240 -[Debug] dit_step39_vt_cond: [2170, 64] first4: -0.353231 -0.974004 -0.188379 0.427506 -[Debug] dit_step39_vt_uncond: [2170, 64] first4: -0.220991 -0.753843 -0.259805 0.519853 -[Debug] dit_step39_vt: [2170, 64] first4: -0.758813 -1.561977 0.045217 0.080611 -[Debug] dit_step39_xt: [2170, 64] first4: 0.543115 1.631179 -0.119682 0.124144 -[DiT] step 40/50 t=0.220 -[Debug] dit_step40_vt_cond: [2170, 64] first4: -0.356234 -1.066793 -0.144915 0.378638 -[Debug] dit_step40_vt_uncond: [2170, 64] first4: -0.228754 -0.860472 -0.222172 0.464689 -[Debug] dit_step40_vt: [2170, 64] first4: -0.749040 -1.587260 0.097200 0.055798 -[Debug] dit_step40_xt: [2170, 64] first4: 0.558096 1.662925 -0.121626 0.123028 -[DiT] step 41/50 t=0.200 -[Debug] dit_step41_vt_cond: [2170, 64] first4: -0.355270 -1.157881 -0.092032 0.327957 -[Debug] dit_step41_vt_uncond: [2170, 64] first4: -0.236702 -0.967551 -0.178051 0.403448 -[Debug] dit_step41_vt: [2170, 64] first4: -0.709683 -1.623054 0.190026 0.058188 -[Debug] dit_step41_xt: [2170, 64] first4: 0.572290 1.695386 -0.125427 0.121864 -[DiT] step 42/50 t=0.180 
-[Debug] dit_step42_vt_cond: [2170, 64] first4: -0.352479 -1.244785 -0.033309 0.267733 -[Debug] dit_step42_vt_uncond: [2170, 64] first4: -0.244508 -1.070808 -0.121544 0.336131 -[Debug] dit_step42_vt: [2170, 64] first4: -0.676820 -1.651634 0.233357 0.014431 -[Debug] dit_step42_xt: [2170, 64] first4: 0.585826 1.728418 -0.130094 0.121575 -[DiT] step 43/50 t=0.160 -[Debug] dit_step43_vt_cond: [2170, 64] first4: -0.345241 -1.321976 0.026233 0.204406 -[Debug] dit_step43_vt_uncond: [2170, 64] first4: -0.251274 -1.163180 -0.061907 0.264783 -[Debug] dit_step43_vt: [2170, 64] first4: -0.615402 -1.690314 0.311200 0.000887 -[Debug] dit_step43_xt: [2170, 64] first4: 0.598134 1.762225 -0.136318 0.121558 -[DiT] step 44/50 t=0.140 -[Debug] dit_step44_vt_cond: [2170, 64] first4: -0.330899 -1.390417 0.088632 0.138209 -[Debug] dit_step44_vt_uncond: [2170, 64] first4: -0.253711 -1.249830 -0.000380 0.191808 -[Debug] dit_step44_vt: [2170, 64] first4: -0.551020 -1.700038 0.375316 -0.049211 -[Debug] dit_step44_xt: [2170, 64] first4: 0.609155 1.796225 -0.143824 0.122542 -[DiT] step 45/50 t=0.120 -[Debug] dit_step45_vt_cond: [2170, 64] first4: -0.315962 -1.445453 0.152387 0.071900 -[Debug] dit_step45_vt_uncond: [2170, 64] first4: -0.260821 -1.326647 0.053483 0.118309 -[Debug] dit_step45_vt: [2170, 64] first4: -0.444048 -1.697294 0.502791 -0.074117 -[Debug] dit_step45_xt: [2170, 64] first4: 0.618036 1.830171 -0.153880 0.124024 -[DiT] step 46/50 t=0.100 -[Debug] dit_step46_vt_cond: [2170, 64] first4: -0.294778 -1.496297 0.212962 0.010717 -[Debug] dit_step46_vt_uncond: [2170, 64] first4: -0.258755 -1.395077 0.130780 0.026842 -[Debug] dit_step46_vt: [2170, 64] first4: -0.331878 -1.748994 0.398840 0.016598 -[Debug] dit_step46_xt: [2170, 64] first4: 0.624673 1.865151 -0.161857 0.123692 -[DiT] step 47/50 t=0.080 -[Debug] dit_step47_vt_cond: [2170, 64] first4: -0.281226 -1.541478 0.262625 -0.022201 -[Debug] dit_step47_vt_uncond: [2170, 64] first4: -0.279224 -1.453849 0.178946 -0.018997 -[Debug] 
dit_step47_vt: [2170, 64] first4: -0.212368 -1.759161 0.579829 -0.049871 -[Debug] dit_step47_xt: [2170, 64] first4: 0.628921 1.900334 -0.173453 0.124690 -[DiT] step 48/50 t=0.060 -[Debug] dit_step48_vt_cond: [2170, 64] first4: -0.256958 -1.573266 0.310890 -0.048733 -[Debug] dit_step48_vt_uncond: [2170, 64] first4: -0.281065 -1.517397 0.260529 -0.069026 -[Debug] dit_step48_vt: [2170, 64] first4: -0.191601 -1.679712 0.330919 0.028046 -[Debug] dit_step48_xt: [2170, 64] first4: 0.632753 1.933929 -0.180072 0.124129 -[DiT] step 49/50 t=0.040 -[Debug] dit_step49_vt_cond: [2170, 64] first4: -0.282571 -1.604237 0.314485 -0.067221 -[Debug] dit_step49_vt_uncond: [2170, 64] first4: -0.305600 -1.528070 0.251822 -0.083237 -[Debug] dit_step49_vt: [2170, 64] first4: -0.212302 -1.898327 0.637213 -0.078416 -[Debug] dit_x0: [2170, 64] first4: 0.636999 1.971895 -0.192816 0.125697 -[DiT] step 50/50 t=0.020 -[DiT] Total generation: 88329.8 ms (88329.8 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.636999 1.971895 -0.192816 0.125697 -[VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 474 nodes, T_latent=192 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_im2col_f16', name = 'kernel_im2col_f16' -ggml_metal_library_compile_pipeline: loaded kernel_im2col_f16 0x13070d7f0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=0' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=0 0x13070e360 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 'kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=0_nf=1_rb=0 0x13070e5c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_bin_fuse_f32_f32_f32', name = 
'kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0' -ggml_metal_library_compile_pipeline: loaded kernel_bin_fuse_f32_f32_f32_op=2_nf=1_rb=0 0x13070eb20 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sin_f32_4', name = 'kernel_sin_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sin_f32_4 0x13070eea0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_sqr_f32_4', name = 'kernel_sqr_f32_4' -ggml_metal_library_compile_pipeline: loaded kernel_sqr_f32_4 0x13070f4a0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_conv_transpose_1d_f32_f32', name = 'kernel_conv_transpose_1d_f32_f32' -ggml_metal_library_compile_pipeline: loaded kernel_conv_transpose_1d_f32_f32 0x13070f8c0 | th_max = 1024 | th_width = 32 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mv_f16_f16_4', name = 'kernel_mul_mv_f16_f16_4_nsg=4' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mv_f16_f16_4_nsg=4 0x130710c50 | th_max = 1024 | th_width = 32 -[VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 474 nodes, T_latent=256 -[VAE] Graph: 474 nodes, T_latent=186 -ggml_metal_library_compile_pipeline: compiling pipeline: base = 'kernel_mul_mm_f16_f16', name = 'kernel_mul_mm_f16_f16_bci=0_bco=1' -ggml_metal_library_compile_pipeline: loaded kernel_mul_mm_f16_f16_bci=0_bco=1 0x130710eb0 | th_max = 896 | th_width = 32 -[VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 609578.6 ms -[Debug] vae_audio: [2, 4166400] first4: -0.002759 -0.002685 -0.002611 -0.002214 -[VAE Batch0] Wrote ggml-sft/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Request 1/1] Done -ggml_metal_free: deallocating -ggml_metal_free: deallocating -[Pipeline] All done -[Request] Loaded request0.json -[Noise] Reusing existing rng_philox_seed42.bf16 -[Turbo] steps=8, shift=3.0 | 
acestep-v15-turbo-Q8_0.gguf -[GGML] Running acestep-v15-turbo-Q8_0.gguf... -[GGML] Done, 47 dump files -[Turbo] Reusing existing Python dumps: python-turbo -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999813 - lyric_embed 1.000000 - enc_hidden 0.999792 - detok_output 0.999991 - context 0.999996 - noise 1.000000 - temb_t 0.999998 - hidden_after_proj_in 0.999992 - enc_after_cond_emb 0.999776 - layer0_sa_output 0.999956 - hidden_after_layer0 0.999975 - hidden_after_layer6 0.999915 - hidden_after_layer12 0.999185 - hidden_after_layer18 0.996490 - hidden_after_layer23 0.993375 - dit_step0_vt 0.974442 - dit_step0_xt 0.999944 - dit_step1_vt 0.976442 - dit_step1_xt 0.999818 - dit_step2_vt 0.978398 - dit_step2_xt 0.999498 - dit_step3_vt 0.979729 - dit_step3_xt 0.998787 - dit_step4_vt 0.979038 - dit_step4_xt 0.997189 - dit_step5_vt 0.976705 - dit_step5_xt 0.993692 - dit_step6_vt 0.973710 - dit_step6_xt 0.987147 - dit_step7_vt 0.967471 - dit_x0 0.977589 - vae_audio 0.899969 - vae_audio (log spectral) 0.999797 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999943 0.142558 0.006797 -0.002331 0.972917 -0.002342 0.972003 - dit_step1_xt 0.999817 0.272611 0.011800 -0.005326 0.942672 -0.005313 0.941730 - dit_step2_xt 0.999496 0.463653 0.018437 -0.009355 0.909197 -0.009311 0.908527 - dit_step3_xt 0.998785 0.659420 0.027271 -0.014709 0.873849 -0.014577 0.873624 - dit_step4_xt 0.997188 0.977590 0.039587 -0.021771 0.842053 -0.021660 0.841995 - dit_step5_xt 0.993691 1.450203 0.057714 -0.031846 0.825442 -0.032109 0.824593 - dit_step6_xt 0.987145 2.144326 0.085260 -0.046128 0.856513 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q8_0.gguf -[GGML] Running acestep-v15-sft-Q8_0.gguf... 
-[GGML] Done, 233 dump files -[SFT] Reusing existing Python dumps: python-sft -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999813 - lyric_embed 1.000000 - enc_hidden 0.999792 - detok_output 0.999991 - context 0.999996 - noise 1.000000 - temb_t 0.999994 - hidden_after_proj_in 0.999993 - enc_after_cond_emb 0.999779 - layer0_sa_output 0.999948 - hidden_after_layer0 0.999975 - hidden_after_layer6 0.999842 - hidden_after_layer12 0.999467 - hidden_after_layer18 0.998721 - hidden_after_layer23 0.998987 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998936 - dit_step0_vt_uncond 0.998589 - dit_step0_vt 0.995617 - dit_step0_xt 0.999998 - dit_step5_vt_cond 0.999453 - dit_step5_vt 0.993749 - dit_step5_xt 0.999962 - dit_step10_vt_cond 0.998786 - dit_step10_vt 0.993632 - dit_step10_xt 0.999883 - dit_step15_vt_cond 0.996925 - dit_step15_vt 0.985179 - dit_step15_xt 0.999650 - dit_step20_vt_cond 0.993789 - dit_step20_vt 0.978156 - dit_step20_xt 0.998994 - dit_step25_vt_cond 0.988666 - dit_step25_vt 0.968588 - dit_step25_xt 0.997635 - dit_step30_vt_cond 0.983353 - dit_step30_vt 0.963692 - dit_step30_xt 0.995502 - dit_step35_vt_cond 0.978311 - dit_step35_vt 0.954994 - dit_step35_xt 0.992900 - dit_step40_vt_cond 0.975242 - dit_step40_vt 0.949054 - dit_step40_xt 0.990408 - dit_step45_vt_cond 0.977875 - dit_step45_vt 0.949872 - dit_step45_xt 0.988656 - dit_step49_vt_cond 0.980007 - dit_step49_vt 0.943555 - dit_x0 0.988056 - vae_audio 0.945079 - vae_audio (log spectral) 0.999659 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999997 0.038313 0.002069 -0.001710 0.980019 -0.001741 0.980402 - dit_step5_xt 0.999960 0.128136 0.005945 -0.006874 0.889005 -0.007143 0.887999 - dit_step10_xt 0.999882 0.212035 0.009258 -0.012273 0.810355 -0.012603 0.811299 - dit_step15_xt 0.999649 0.310457 0.014288 -0.017479 0.745211 -0.018114 0.745269 - dit_step20_xt 0.998994 0.579346 
0.021839 -0.022740 0.699641 -0.023808 0.699582 - dit_step25_xt 0.997635 0.874619 0.031657 -0.028120 0.678310 -0.029311 0.679278 - dit_step30_xt 0.995501 1.140020 0.043494 -0.033543 0.684534 -0.035027 0.685262 - dit_step35_xt 0.992900 1.761304 0.057050 -0.038898 0.716898 -0.040716 0.717196 - dit_step40_xt 0.990407 2.128224 0.071732 -0.044231 0.770985 -0.046462 0.771853 - dit_step45_xt 0.988655 2.420490 0.085663 -0.050087 0.842526 -0.052475 0.843036 diff --git a/tests/Q4_K_M.log b/tests/Q4_K_M.log deleted file mode 100644 index d76238c..0000000 --- a/tests/Q4_K_M.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf -[GGML] Running acestep-v15-turbo-Q4_K_M.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999805 - lyric_embed 1.000000 - enc_hidden 0.997032 - detok_output 0.999610 - context 0.999750 - noise 1.000000 - temb_t 0.999902 - hidden_after_proj_in 0.999908 - enc_after_cond_emb 0.997517 - layer0_sa_output 0.998371 - hidden_after_layer0 0.999675 - hidden_after_layer6 0.999257 - hidden_after_layer12 0.995500 - hidden_after_layer18 0.991597 - hidden_after_layer23 0.985460 - dit_step0_vt 0.947383 - dit_step0_xt 0.999885 - dit_step1_vt 0.947784 - dit_step1_xt 0.999617 - dit_step2_vt 0.957305 - dit_step2_xt 0.999014 - dit_step3_vt 0.961931 - dit_step3_xt 0.997757 - dit_step4_vt 0.959773 - dit_step4_xt 0.994900 - dit_step5_vt 0.956611 - dit_step5_xt 0.988539 - dit_step6_vt 0.950669 - dit_step6_xt 0.976494 - dit_step7_vt 0.938658 - dit_x0 0.958725 - vae_audio 0.837763 - vae_audio (STFT cosine) 0.954448 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err 
mean_A std_A mean_B std_B - dit_step0_xt 0.999885 0.165835 0.010206 -0.002260 0.973133 -0.002342 0.972003 - dit_step1_xt 0.999617 0.269038 0.018058 -0.005119 0.943095 -0.005313 0.941730 - dit_step2_xt 0.999014 0.433553 0.027847 -0.009033 0.910111 -0.009311 0.908527 - dit_step3_xt 0.997757 0.593449 0.040253 -0.014301 0.875156 -0.014577 0.873624 - dit_step4_xt 0.994900 0.889597 0.058068 -0.021205 0.843622 -0.021660 0.841995 - dit_step5_xt 0.988539 1.371047 0.084767 -0.031100 0.827136 -0.032109 0.824593 - dit_step6_xt 0.976494 1.997185 0.125556 -0.045244 0.858177 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q4_K_M.gguf -[GGML] Running acestep-v15-sft-Q4_K_M.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999805 - lyric_embed 1.000000 - enc_hidden 0.997032 - detok_output 0.999610 - context 0.999750 - noise 1.000000 - temb_t 0.999669 - hidden_after_proj_in 0.999909 - enc_after_cond_emb 0.997507 - layer0_sa_output 0.998509 - hidden_after_layer0 0.999683 - hidden_after_layer6 0.999144 - hidden_after_layer12 0.997681 - hidden_after_layer18 0.996675 - hidden_after_layer23 0.996878 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.996752 - dit_step0_vt_uncond 0.996146 - dit_step0_vt 0.989964 - dit_step0_xt 0.999995 - dit_step5_vt_cond 0.995283 - dit_step5_vt 0.977862 - dit_step5_xt 0.999822 - dit_step10_vt_cond 0.991380 - dit_step10_vt 0.969437 - dit_step10_xt 0.999216 - dit_step15_vt_cond 0.982929 - dit_step15_vt 0.945354 - dit_step15_xt 0.997510 - dit_step20_vt_cond 0.968161 - dit_step20_vt 0.918017 - dit_step20_xt 0.993520 - dit_step25_vt_cond 0.951227 - dit_step25_vt 0.894209 - 
dit_step25_xt 0.986602 - dit_step30_vt_cond 0.931041 - dit_step30_vt 0.870642 - dit_step30_xt 0.976800 - dit_step35_vt_cond 0.910848 - dit_step35_vt 0.844696 - dit_step35_xt 0.965863 - dit_step40_vt_cond 0.899076 - dit_step40_vt 0.824961 - dit_step40_xt 0.956007 - dit_step45_vt_cond 0.909967 - dit_step45_vt 0.832581 - dit_step45_xt 0.949409 - dit_step49_vt_cond 0.928566 - dit_step49_vt 0.867519 - dit_x0 0.947240 - vae_audio 0.830949 - vae_audio (STFT cosine) 0.926924 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999995 0.037971 0.002890 -0.001839 0.980350 -0.001741 0.980402 - dit_step5_xt 0.999822 0.197493 0.013061 -0.007274 0.890479 -0.007143 0.887999 - dit_step10_xt 0.999216 0.534656 0.024200 -0.012937 0.812358 -0.012603 0.811299 - dit_step15_xt 0.997510 0.842267 0.038950 -0.018571 0.747602 -0.018114 0.745268 - dit_step20_xt 0.993520 1.160067 0.058410 -0.024329 0.702011 -0.023808 0.699582 - dit_step25_xt 0.986602 1.554590 0.081033 -0.030223 0.679448 -0.029311 0.679278 - dit_step30_xt 0.976800 1.927341 0.107204 -0.036251 0.683778 -0.035027 0.685262 - dit_step35_xt 0.965863 2.255865 0.136115 -0.042287 0.714074 -0.040716 0.717195 - dit_step40_xt 0.956007 2.590231 0.166595 -0.048296 0.766380 -0.046462 0.771853 - dit_step45_xt 0.949409 2.912931 0.195670 -0.054552 0.836735 -0.052475 0.843036 diff --git a/tests/Q5_K_M.log b/tests/Q5_K_M.log deleted file mode 100644 index 5989b97..0000000 --- a/tests/Q5_K_M.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf -[GGML] Running acestep-v15-turbo-Q5_K_M.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999805 - lyric_embed 1.000000 - enc_hidden 0.999038 - detok_output 0.999875 - context 0.999920 - noise 1.000000 - temb_t 0.999972 - hidden_after_proj_in 0.999960 - enc_after_cond_emb 0.999148 - layer0_sa_output 0.999386 - hidden_after_layer0 0.999829 - hidden_after_layer6 0.999741 - hidden_after_layer12 0.998654 - hidden_after_layer18 0.995432 - hidden_after_layer23 0.991374 - dit_step0_vt 0.968035 - dit_step0_xt 0.999930 - dit_step1_vt 0.971217 - dit_step1_xt 0.999785 - dit_step2_vt 0.970740 - dit_step2_xt 0.999391 - dit_step3_vt 0.973678 - dit_step3_xt 0.998557 - dit_step4_vt 0.972169 - dit_step4_xt 0.996665 - dit_step5_vt 0.967356 - dit_step5_xt 0.992218 - dit_step6_vt 0.962469 - dit_step6_xt 0.983446 - dit_step7_vt 0.953383 - dit_x0 0.970119 - vae_audio 0.883226 - vae_audio (STFT cosine) 0.968463 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999930 0.139407 0.007818 -0.002306 0.973025 -0.002342 0.972003 - dit_step1_xt 0.999785 0.264377 0.013418 -0.005299 0.942885 -0.005313 0.941730 - dit_step2_xt 0.999391 0.455966 0.021259 -0.009285 0.909477 -0.009311 0.908527 - dit_step3_xt 0.998557 0.657160 0.031461 -0.014661 0.874187 -0.014577 0.873624 - dit_step4_xt 0.996665 0.973354 0.045708 -0.021890 0.842366 -0.021660 0.841995 - dit_step5_xt 0.992218 1.446589 0.067697 -0.032248 0.825911 -0.032109 0.824593 - dit_step6_xt 0.983446 2.092730 0.101558 -0.046788 0.857148 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q5_K_M.gguf -[GGML] Running acestep-v15-sft-Q5_K_M.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999805 - lyric_embed 1.000000 - enc_hidden 0.999038 - detok_output 0.999875 - context 0.999920 - noise 1.000000 - temb_t 0.999900 - hidden_after_proj_in 0.999961 - enc_after_cond_emb 0.999149 - layer0_sa_output 0.999452 - hidden_after_layer0 0.999863 - hidden_after_layer6 0.999565 - hidden_after_layer12 0.998948 - hidden_after_layer18 0.997903 - hidden_after_layer23 0.998403 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998327 - dit_step0_vt_uncond 0.998326 - dit_step0_vt 0.994229 - dit_step0_xt 0.999997 - dit_step5_vt_cond 0.998595 - dit_step5_vt 0.987922 - dit_step5_xt 0.999930 - dit_step10_vt_cond 0.997050 - dit_step10_vt 0.985303 - dit_step10_xt 0.999749 - dit_step15_vt_cond 0.992839 - dit_step15_vt 0.969921 - dit_step15_xt 0.999178 - dit_step20_vt_cond 0.985993 - dit_step20_vt 0.954166 - dit_step20_xt 0.997691 - dit_step25_vt_cond 0.977103 - dit_step25_vt 0.938414 - dit_step25_xt 0.994921 - dit_step30_vt_cond 0.966556 - dit_step30_vt 0.922758 - dit_step30_xt 0.990726 - dit_step35_vt_cond 0.956566 - dit_step35_vt 0.906167 - dit_step35_xt 0.985856 - dit_step40_vt_cond 0.951093 - dit_step40_vt 0.892482 - dit_step40_xt 0.981314 - dit_step45_vt_cond 0.957449 - dit_step45_vt 0.895800 - dit_step45_xt 0.978161 - dit_step49_vt_cond 0.967216 - dit_step49_vt 0.914978 - dit_x0 0.977077 - vae_audio 0.891856 - vae_audio (STFT cosine) 0.946058 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999997 0.038463 0.002320 -0.001770 0.980102 -0.001741 0.980402 - dit_step5_xt 0.999930 0.110477 0.008105 -0.006919 0.889608 -0.007143 0.887999 - dit_step10_xt 0.999749 0.218324 0.013563 -0.012429 0.811137 -0.012603 0.811299 - dit_step15_xt 0.999178 
0.406292 0.021833 -0.017883 0.745846 -0.018114 0.745268 - dit_step20_xt 0.997691 0.617228 0.033331 -0.023467 0.699845 -0.023808 0.699582 - dit_step25_xt 0.994921 0.873662 0.047346 -0.029215 0.677264 -0.029311 0.679278 - dit_step30_xt 0.990726 1.146449 0.064421 -0.034956 0.681324 -0.035027 0.685262 - dit_step35_xt 0.985856 1.448653 0.083553 -0.040671 0.711562 -0.040716 0.717195 - dit_step40_xt 0.981314 1.836126 0.103939 -0.046406 0.764127 -0.046462 0.771853 - dit_step45_xt 0.978161 2.180611 0.123396 -0.052503 0.834743 -0.052475 0.843036 diff --git a/tests/Q6_K.log b/tests/Q6_K.log deleted file mode 100644 index 6cd4c1c..0000000 --- a/tests/Q6_K.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf -[GGML] Running acestep-v15-turbo-Q6_K.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999805 - lyric_embed 1.000000 - enc_hidden 0.999638 - detok_output 0.999962 - context 0.999976 - noise 1.000000 - temb_t 0.999990 - hidden_after_proj_in 0.999980 - enc_after_cond_emb 0.999648 - layer0_sa_output 0.999763 - hidden_after_layer0 0.999888 - hidden_after_layer6 0.999853 - hidden_after_layer12 0.998917 - hidden_after_layer18 0.995924 - hidden_after_layer23 0.992281 - dit_step0_vt 0.971207 - dit_step0_xt 0.999937 - dit_step1_vt 0.975354 - dit_step1_xt 0.999803 - dit_step2_vt 0.978312 - dit_step2_xt 0.999479 - dit_step3_vt 0.977879 - dit_step3_xt 0.998730 - dit_step4_vt 0.976291 - dit_step4_xt 0.997040 - dit_step5_vt 0.973193 - dit_step5_xt 0.993208 - dit_step6_vt 0.969738 - dit_step6_xt 0.985862 - dit_step7_vt 0.962454 - dit_x0 0.974866 - vae_audio 0.893678 - vae_audio (STFT 
cosine) 0.969663 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999937 0.147590 0.007252 -0.002265 0.972930 -0.002342 0.972003 - dit_step1_xt 0.999803 0.291665 0.012432 -0.005192 0.942660 -0.005313 0.941730 - dit_step2_xt 0.999479 0.474224 0.019215 -0.009147 0.909068 -0.009311 0.908527 - dit_step3_xt 0.998730 0.730810 0.028734 -0.014438 0.873565 -0.014577 0.873624 - dit_step4_xt 0.997040 1.058607 0.042049 -0.021507 0.841532 -0.021660 0.841995 - dit_step5_xt 0.993208 1.534989 0.062024 -0.031604 0.824595 -0.032109 0.824593 - dit_step6_xt 0.985862 2.188862 0.092252 -0.045920 0.855268 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q6_K.gguf -[GGML] Running acestep-v15-sft-Q6_K.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999805 - lyric_embed 1.000000 - enc_hidden 0.999638 - detok_output 0.999962 - context 0.999976 - noise 1.000000 - temb_t 0.999970 - hidden_after_proj_in 0.999981 - enc_after_cond_emb 0.999651 - layer0_sa_output 0.999771 - hidden_after_layer0 0.999913 - hidden_after_layer6 0.999782 - hidden_after_layer12 0.999350 - hidden_after_layer18 0.998535 - hidden_after_layer23 0.998814 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998712 - dit_step0_vt_uncond 0.998275 - dit_step0_vt 0.994897 - dit_step0_xt 0.999997 - dit_step5_vt_cond 0.999148 - dit_step5_vt 0.992272 - dit_step5_xt 0.999951 - dit_step10_vt_cond 0.997802 - dit_step10_vt 0.990167 - dit_step10_xt 0.999821 - dit_step15_vt_cond 0.995510 - dit_step15_vt 0.980612 - dit_step15_xt 0.999436 - dit_step20_vt_cond 0.991103 - dit_step20_vt 0.969601 - 
dit_step20_xt 0.998471 - dit_step25_vt_cond 0.984595 - dit_step25_vt 0.957457 - dit_step25_xt 0.996593 - dit_step30_vt_cond 0.977649 - dit_step30_vt 0.948797 - dit_step30_xt 0.993770 - dit_step35_vt_cond 0.970853 - dit_step35_vt 0.937303 - dit_step35_xt 0.990429 - dit_step40_vt_cond 0.966727 - dit_step40_vt 0.927488 - dit_step40_xt 0.987201 - dit_step45_vt_cond 0.971343 - dit_step45_vt 0.937992 - dit_step45_xt 0.984913 - dit_step49_vt_cond 0.978000 - dit_step49_vt 0.949509 - dit_x0 0.984147 - vae_audio 0.935392 - vae_audio (STFT cosine) 0.974483 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999997 0.038602 0.002180 -0.001744 0.980167 -0.001741 0.980402 - dit_step5_xt 0.999951 0.145112 0.006817 -0.006930 0.889866 -0.007143 0.887999 - dit_step10_xt 0.999821 0.208421 0.011339 -0.012339 0.811560 -0.012603 0.811299 - dit_step15_xt 0.999436 0.337160 0.018157 -0.017579 0.746441 -0.018114 0.745268 - dit_step20_xt 0.998471 0.561928 0.027460 -0.022914 0.700716 -0.023808 0.699582 - dit_step25_xt 0.996593 0.825034 0.039088 -0.028344 0.679138 -0.029311 0.679278 - dit_step30_xt 0.993770 1.225392 0.052945 -0.033832 0.684642 -0.035027 0.685262 - dit_step35_xt 0.990429 1.650381 0.068602 -0.039215 0.716082 -0.040716 0.717195 - dit_step40_xt 0.987201 2.085848 0.085637 -0.044595 0.769111 -0.046462 0.771853 - dit_step45_xt 0.984913 2.477617 0.101990 -0.050396 0.839899 -0.052475 0.843036 diff --git a/tests/Q8_0.log b/tests/Q8_0.log deleted file mode 100644 index 27ba118..0000000 --- a/tests/Q8_0.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf -[GGML] Running acestep-v15-turbo-Q8_0.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999805 - lyric_embed 1.000000 - enc_hidden 0.999784 - detok_output 0.999983 - context 0.999990 - noise 1.000000 - temb_t 0.999997 - hidden_after_proj_in 0.999986 - enc_after_cond_emb 0.999765 - layer0_sa_output 0.999924 - hidden_after_layer0 0.999957 - hidden_after_layer6 0.999892 - hidden_after_layer12 0.999346 - hidden_after_layer18 0.996758 - hidden_after_layer23 0.993881 - dit_step0_vt 0.976421 - dit_step0_xt 0.999948 - dit_step1_vt 0.979128 - dit_step1_xt 0.999834 - dit_step2_vt 0.982059 - dit_step2_xt 0.999561 - dit_step3_vt 0.983029 - dit_step3_xt 0.998948 - dit_step4_vt 0.981353 - dit_step4_xt 0.997565 - dit_step5_vt 0.978860 - dit_step5_xt 0.994480 - dit_step6_vt 0.976051 - dit_step6_xt 0.988641 - dit_step7_vt 0.970144 - dit_x0 0.979969 - vae_audio 0.905525 - vae_audio (STFT cosine) 0.976530 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999948 0.134961 0.006551 -0.002307 0.972901 -0.002342 0.972003 - dit_step1_xt 0.999834 0.262688 0.011280 -0.005306 0.942604 -0.005313 0.941730 - dit_step2_xt 0.999561 0.448301 0.017428 -0.009351 0.909110 -0.009311 0.908527 - dit_step3_xt 0.998948 0.617858 0.025766 -0.014708 0.873709 -0.014577 0.873624 - dit_step4_xt 0.997565 0.740504 0.037507 -0.021763 0.841873 -0.021660 0.841995 - dit_step5_xt 0.994480 1.211945 0.054863 -0.031844 0.825164 -0.032109 0.824593 - dit_step6_xt 0.988641 2.056566 0.081142 -0.046105 0.856063 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q8_0.gguf -[GGML] Running acestep-v15-sft-Q8_0.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999805 - lyric_embed 1.000000 - enc_hidden 0.999784 - detok_output 0.999983 - context 0.999990 - noise 1.000000 - temb_t 0.999991 - hidden_after_proj_in 0.999986 - enc_after_cond_emb 0.999768 - layer0_sa_output 0.999913 - hidden_after_layer0 0.999961 - hidden_after_layer6 0.999814 - hidden_after_layer12 0.999441 - hidden_after_layer18 0.998694 - hidden_after_layer23 0.998948 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998899 - dit_step0_vt_uncond 0.998530 - dit_step0_vt 0.995437 - dit_step0_xt 0.999998 - dit_step5_vt_cond 0.999435 - dit_step5_vt 0.993135 - dit_step5_xt 0.999959 - dit_step10_vt_cond 0.998667 - dit_step10_vt 0.992381 - dit_step10_xt 0.999876 - dit_step15_vt_cond 0.996784 - dit_step15_vt 0.983109 - dit_step15_xt 0.999626 - dit_step20_vt_cond 0.993660 - dit_step20_vt 0.976141 - dit_step20_xt 0.998967 - dit_step25_vt_cond 0.989047 - dit_step25_vt 0.965619 - dit_step25_xt 0.997655 - dit_step30_vt_cond 0.983970 - dit_step30_vt 0.959590 - dit_step30_xt 0.995674 - dit_step35_vt_cond 0.978928 - dit_step35_vt 0.949494 - dit_step35_xt 0.993260 - dit_step40_vt_cond 0.975960 - dit_step40_vt 0.939874 - dit_step40_xt 0.990935 - dit_step45_vt_cond 0.978761 - dit_step45_vt 0.940675 - dit_step45_xt 0.989300 - dit_step49_vt_cond 0.980854 - dit_step49_vt 0.920647 - dit_x0 0.988696 - vae_audio 0.944426 - vae_audio (STFT cosine) 0.974764 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999998 0.038422 0.002097 -0.001714 0.980004 -0.001741 0.980402 - dit_step5_xt 0.999959 0.134478 0.006082 -0.006888 0.888999 -0.007143 0.887999 - dit_step10_xt 0.999876 0.215550 0.009492 -0.012330 0.810305 -0.012603 0.811299 - dit_step15_xt 0.999626 
0.342195 0.014680 -0.017574 0.745063 -0.018114 0.745268 - dit_step20_xt 0.998967 0.566416 0.022205 -0.022917 0.699295 -0.023808 0.699582 - dit_step25_xt 0.997655 0.862320 0.031744 -0.028373 0.677531 -0.029311 0.679278 - dit_step30_xt 0.995674 1.138689 0.043055 -0.033821 0.683290 -0.035027 0.685262 - dit_step35_xt 0.993260 1.656645 0.056128 -0.039223 0.714963 -0.040716 0.717195 - dit_step40_xt 0.990935 2.096484 0.070423 -0.044591 0.768426 -0.046462 0.771853 - dit_step45_xt 0.989300 2.398146 0.084110 -0.050467 0.839484 -0.052475 0.843036 diff --git a/tests/Vulkan-BF16.log b/tests/Vulkan-BF16.log index 2d955d7..c063695 100644 --- a/tests/Vulkan-BF16.log +++ b/tests/Vulkan-BF16.log @@ -1,7 +1,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 [Load] DiT backend: Vulkan0 (CPU threads: 16) -[Load] Backend init: 260.3 ms +[Load] Backend init: 142.5 ms [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 3007.9 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 397.7 ms +[Load] DiT weight load: 338.6 ms [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: Vulkan0 (CPU threads: 16) +[Load] VAE backend: Vulkan0 (shared) [VAE] Backend: Vulkan0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 672.5 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 661.6 ms [Request 
1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.1 ms +[Load] BPE tokenizer: 31.2 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[Load] TextEncoder backend: Vulkan0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 166.9 ms -[Encode] TextEncoder (70 tokens): 30.9 ms +[Load] TextEncoder: 141.4 ms +[Encode] TextEncoder (70 tokens): 1939.4 ms [Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 11.2 ms +[Encode] Lyric vocab lookup (167 tokens): 0.3 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[Load] CondEncoder backend: Vulkan0 (shared) [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 1160.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 163.7 ms +[Load] ConditionEncoder: 130.2 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 
tokens -[Encode] ConditionEncoder: 22.5 ms, enc_S=238 +[Encode] ConditionEncoder: 2492.6 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.758148 -0.049593 -0.132730 0.058488 [GGUF] ../models/acestep-v15-turbo-BF16.gguf: 678 tensors, data at offset 57024 [WeightCtx] Loaded 30 tensors, 200.3 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 28.1 ms +[Load] Detokenizer: 23.1 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 229.8 ms +[Context] Detokenizer: 2525.9 ms [Debug] detok_output: [2170, 64] first4: -0.125193 1.435010 0.308190 -0.624228 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -79,70 +77,67 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Debug] proj_in_input: [192, 2170] first4: -0.125193 1.435010 0.308190 -0.624228 [Debug] enc_after_cond_emb: [2048, 238] first4: -0.168464 0.814954 0.327714 -0.561971 [Debug] layer0_sa_input: [2048, 1085] first4: -0.719110 -0.764019 -0.047328 0.261808 -[Debug] layer0_q_after_rope: [128, 16] first4: -2.424376 -0.094810 -0.411903 1.007324 -[Debug] layer0_k_after_rope: [128, 8] first4: -12.712339 1.106410 1.775920 1.780798 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.501171 0.169176 -0.355798 0.513027 -[Debug] layer0_attn_out: [2048, 1085] first4: -1.540742 -1.044333 0.188720 0.456093 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.540742 -1.044333 0.188720 0.456093 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.598325 -0.820241 -0.296337 0.493580 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.091503 0.566892 52.584164 -0.903901 -[Debug] hidden_after_layer6: [2048, 1085] first4: -21.192070 0.040278 33.599442 -4.442998 -[Debug] hidden_after_layer12: [2048, 1085] first4: -15.068191 -18.118078 71.999359 28.597229 -[Debug] hidden_after_layer18: [2048, 1085] first4: -27.132679 15.867422 60.847614 
20.940519 -[Debug] hidden_after_layer23: [2048, 1085] first4: -12.584854 45.152912 198.753845 145.517029 -[Debug] dit_step0_vt: [2170, 64] first4: 0.014936 1.119046 0.345802 2.379982 -[Debug] dit_step0_xt: [2170, 64] first4: 0.193657 2.105384 -0.187593 0.739475 +[Debug] layer0_q_after_rope: [128, 16] first4: -1.985390 -0.040374 -0.446411 0.887640 +[Debug] layer0_k_after_rope: [128, 8] first4: -12.133966 1.032982 1.765450 1.789189 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.285921 -0.088167 -0.083954 0.187361 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.468877 -0.930195 0.454157 0.450160 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.468877 -0.930195 0.454157 0.450160 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.465657 -0.778736 0.078704 0.498346 +[Debug] hidden_after_layer0: [2048, 1085] first4: -8.621284 0.720027 54.661194 -0.769228 +[Debug] hidden_after_layer6: [2048, 1085] first4: -12.726752 3.144506 -9.323353 -12.165966 +[Debug] hidden_after_layer12: [2048, 1085] first4: -16.201662 -10.547243 4.967308 15.566863 +[Debug] hidden_after_layer18: [2048, 1085] first4: -26.509827 14.787127 -25.476906 8.639433 +[Debug] hidden_after_layer23: [2048, 1085] first4: -16.044237 89.590195 45.410172 78.645676 +[Debug] dit_step0_vt: [2170, 64] first4: 0.347229 0.879013 0.198151 1.945618 +[Debug] dit_step0_xt: [2170, 64] first4: 0.178553 2.116295 -0.180882 0.759219 [DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: 0.086700 0.854980 -0.273651 1.728149 -[Debug] dit_step1_xt: [2170, 64] first4: 0.188928 2.058749 -0.172667 0.645212 +[Debug] dit_step1_vt: [2170, 64] first4: 0.068695 0.847748 -0.298004 1.750702 +[Debug] dit_step1_xt: [2170, 64] first4: 0.174806 2.070054 -0.164627 0.663726 [DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: 0.180420 0.837399 -0.150421 2.056976 -[Debug] dit_step2_xt: [2170, 64] first4: 0.176900 2.002922 -0.162639 0.508081 +[Debug] dit_step2_vt: [2170, 64] first4: 0.151260 0.875549 -0.207390 
2.089754 +[Debug] dit_step2_xt: [2170, 64] first4: 0.164722 2.011684 -0.150801 0.524409 [DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: 0.130821 0.833313 0.053528 2.193359 -[Debug] dit_step3_xt: [2170, 64] first4: 0.165998 1.933480 -0.167099 0.325301 +[Debug] dit_step3_vt: [2170, 64] first4: 0.077034 0.843689 -0.087112 2.299004 +[Debug] dit_step3_xt: [2170, 64] first4: 0.158302 1.941377 -0.143542 0.332826 [DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.273712 0.866425 0.216686 2.274872 -[Debug] dit_step4_xt: [2170, 64] first4: 0.136672 1.840648 -0.190316 0.081565 +[Debug] dit_step4_vt: [2170, 64] first4: 0.173340 0.815531 0.275307 2.367218 +[Debug] dit_step4_xt: [2170, 64] first4: 0.139730 1.853999 -0.173039 0.079195 [DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.347900 0.772171 0.542953 2.248352 -[Debug] dit_step5_xt: [2170, 64] first4: 0.086972 1.730338 -0.267881 -0.239629 +[Debug] dit_step5_vt: [2170, 64] first4: 0.210556 0.765915 0.470947 2.214279 +[Debug] dit_step5_xt: [2170, 64] first4: 0.109651 1.744582 -0.240317 -0.237130 [DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: 0.132820 0.664673 0.218246 2.387787 -[Debug] dit_step6_xt: [2170, 64] first4: 0.060408 1.597404 -0.311530 -0.717186 +[Debug] dit_step6_vt: [2170, 64] first4: -0.038303 0.546310 0.224964 2.284607 +[Debug] dit_step6_xt: [2170, 64] first4: 0.117311 1.635320 -0.285310 -0.694052 [DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: -0.335976 0.323303 0.198029 2.726624 -[Debug] dit_x0: [2170, 64] first4: 0.161200 1.500413 -0.370938 -1.535173 +[Debug] dit_step7_vt: [2170, 64] first4: -0.300537 0.235870 0.263802 2.617432 +[Debug] dit_x0: [2170, 64] first4: 0.207473 1.564559 -0.364450 -1.479281 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 740.5 ms (740.5 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.161200 1.500413 -0.370938 -1.535173 +[DiT] Total generation: 2630.4 ms (2630.4 ms/sample) +[Debug] dit_output: 
[2170, 64] first4: 0.207473 1.564559 -0.364450 -1.479281 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 9812.1 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000591 0.001078 0.000929 0.001296 +[VAE Batch0] Decode: 2992.9 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000614 0.001141 0.000934 0.001396 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:55:13.398 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:55:13.398 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:55:13.399 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:55:13.399 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:55:13.399 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:55:14.155 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:55:15.664 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... 
-2026-03-01 19:55:15.664 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:55:15.669 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:55:15.830 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:46:56.541 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:46:56.622 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:46:57.937 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:46:57.937 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:46:57.939 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:46:57.945 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:46:58.137 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:46:58.139 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:55:15.831 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:46:58.139 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:55:15.838 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:55:15.850 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:55:15.851 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:55:15.885 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:55:16.193 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:55:16.193 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:55:16.193 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006814241409301758, 'diffusion_time_cost': 0.30007076263427734, 'diffusion_per_step_time_cost': 0.03750884532928467, 'total_time_cost': 0.3068850040435791, 'offload_time_cost': 0.0} -2026-03-01 19:55:16.208 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:55:16.210 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:55:16.210 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:55:16.210 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:55:16.210 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:55:16.210 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:55:16.210 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:55:16.485 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:55:16.488 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:55:16.491 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:46:58.146 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:46:58.161 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:46:58.161 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:46:58.195 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:46:58.511 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:46:58.512 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:46:58.512 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006871938705444336, 'diffusion_time_cost': 0.30806517601013184, 'diffusion_per_step_time_cost': 0.03850814700126648, 'total_time_cost': 0.31493711471557617, 'offload_time_cost': 0.0} +2026-03-04 21:46:58.526 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:46:58.528 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:46:58.528 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:46:58.528 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:46:58.528 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:46:58.528 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:46:58.528 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:46:58.802 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:46:58.804 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:46:58.806 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf [GGML] Running acestep-v15-turbo-BF16.gguf... 
@@ -224,36 +219,36 @@ Using precomputed LM hints temb_t 0.999999 hidden_after_proj_in 0.999987 enc_after_cond_emb 0.999825 - layer0_sa_output 0.999959 - hidden_after_layer0 0.999982 - hidden_after_layer6 0.999916 - hidden_after_layer12 0.999276 - hidden_after_layer18 0.996645 - hidden_after_layer23 0.993735 - dit_step0_vt 0.975502 - dit_step0_xt 0.999946 - dit_step1_vt 0.898326 - dit_step1_xt 0.999578 - dit_step2_vt 0.893586 - dit_step2_xt 0.998276 - dit_step3_vt 0.881101 - dit_step3_xt 0.994720 - dit_step4_vt 0.869138 - dit_step4_xt 0.986137 - dit_step5_vt 0.854878 - dit_step5_xt 0.965846 - dit_step6_vt 0.840298 - dit_step6_xt 0.925771 - dit_step7_vt 0.818271 - dit_x0 0.867399 - vae_audio 0.680412 - vae_audio (STFT cosine) 0.855380 + layer0_sa_output 0.920858 + hidden_after_layer0 0.996092 + hidden_after_layer6 0.980248 + hidden_after_layer12 0.977161 + hidden_after_layer18 0.973382 + hidden_after_layer23 0.961755 + dit_step0_vt 0.843333 + dit_step0_xt 0.999656 + dit_step1_vt 0.875601 + dit_step1_xt 0.998907 + dit_step2_vt 0.860701 + dit_step2_xt 0.996792 + dit_step3_vt 0.838816 + dit_step3_xt 0.991464 + dit_step4_vt 0.827875 + dit_step4_xt 0.978766 + dit_step5_vt 0.812689 + dit_step5_xt 0.949636 + dit_step6_vt 0.795272 + dit_step6_xt 0.894491 + dit_step7_vt 0.769772 + dit_x0 0.818406 + vae_audio 0.571274 + vae_audio (STFT cosine) 0.788509 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999946 0.135811 0.006633 -0.002316 0.972919 -0.002342 0.972003 - dit_step1_xt 0.999578 0.413265 0.019706 -0.005121 0.942541 -0.005313 0.941730 - dit_step2_xt 0.998276 0.811472 0.038208 -0.008968 0.908957 -0.009311 0.908527 - dit_step3_xt 0.994720 1.481150 0.064047 -0.014385 0.872574 -0.014577 0.873624 - dit_step4_xt 0.986137 1.857148 0.100272 -0.021489 0.837038 -0.021660 0.841995 - dit_step5_xt 0.965846 1.439633 0.154129 -0.031859 0.812819 -0.032109 0.824593 - dit_step6_xt 0.925771 2.125688 0.235367 -0.046759 0.832442 
-0.046482 0.855546 + dit_step0_xt 0.999656 0.367652 0.018858 -0.002243 0.972108 -0.002342 0.972003 + dit_step1_xt 0.998907 0.763455 0.032624 -0.004985 0.941679 -0.005313 0.941730 + dit_step2_xt 0.996792 1.022189 0.053741 -0.008816 0.908019 -0.009311 0.908527 + dit_step3_xt 0.991464 1.657425 0.084380 -0.014275 0.871556 -0.014577 0.873624 + dit_step4_xt 0.978766 2.432666 0.128087 -0.021464 0.836876 -0.021660 0.841995 + dit_step5_xt 0.949636 3.423663 0.193034 -0.032107 0.813619 -0.032109 0.824593 + dit_step6_xt 0.894491 4.744513 0.289706 -0.047388 0.833987 -0.046482 0.855546 diff --git a/tests/Vulkan-Q4_K_M.log b/tests/Vulkan-Q4_K_M.log index 011c0c3..03f9985 100644 --- a/tests/Vulkan-Q4_K_M.log +++ b/tests/Vulkan-Q4_K_M.log @@ -1,7 +1,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 [Load] DiT backend: Vulkan0 (CPU threads: 16) -[Load] Backend init: 115.6 ms +[Load] Backend init: 146.5 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K fused, V separate [DiT] Cross-attn: all separate @@ -9,36 +9,34 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 895.6 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 126.7 ms +[Load] DiT weight load: 110.3 ms [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: Vulkan0 (CPU threads: 16) +[Load] VAE backend: Vulkan0 (shared) [VAE] Backend: Vulkan0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 667.9 ms +[VAE] Loaded: 5 blocks, upsample=1920x, 
F32 activations +[Load] VAE weights: 661.8 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 31.0 ms +[Load] BPE tokenizer: 31.2 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[Load] TextEncoder backend: Vulkan0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 166.1 ms -[Encode] TextEncoder (70 tokens): 18.4 ms +[Load] TextEncoder: 143.1 ms +[Encode] TextEncoder (70 tokens): 18.1 ms [Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 11.3 ms +[Encode] Lyric vocab lookup (167 tokens): 0.3 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[Load] CondEncoder backend: Vulkan0 (shared) [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -48,18 +46,18 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 352.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 43.9 ms +[Load] ConditionEncoder: 40.5 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, 
window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 18.2 ms, enc_S=238 +[Encode] ConditionEncoder: 2552.5 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.760519 -0.046675 -0.129011 0.057651 [GGUF] ../models/acestep-v15-turbo-Q4_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 64.7 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 8.9 ms +[Load] Detokenizer: 8.5 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 152.2 ms +[Context] Detokenizer: 574.4 ms [Debug] detok_output: [2170, 64] first4: -0.107345 1.442038 0.300564 -0.641466 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -93,56 +91,53 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Debug] dit_step0_vt: [2170, 64] first4: 0.669312 0.442215 1.300629 2.101841 [Debug] dit_step0_xt: [2170, 64] first4: 0.163913 2.136149 -0.230995 0.752118 [DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: 1.120422 0.593113 1.031189 1.813599 -[Debug] dit_step1_xt: [2170, 64] first4: 0.102799 2.103798 -0.287241 0.653194 +[Debug] dit_step1_vt: [2170, 64] first4: 1.083954 0.575027 1.011414 1.785126 +[Debug] dit_step1_xt: [2170, 64] first4: 0.104788 2.104784 -0.286163 0.654747 [DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: 1.381363 0.295410 1.456146 1.949341 -[Debug] dit_step2_xt: [2170, 64] first4: 0.010708 2.084104 -0.384318 0.523238 +[Debug] dit_step2_vt: [2170, 64] first4: 1.406609 0.358032 1.442169 1.947861 +[Debug] dit_step2_xt: [2170, 64] first4: 0.011014 2.080915 -0.382307 0.524890 [DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: 1.440727 0.067017 1.481567 2.158554 -[Debug] dit_step3_xt: [2170, 64] first4: -0.109353 2.078519 -0.507782 0.343359 +[Debug] dit_step3_vt: [2170, 64] first4: 1.450653 0.080627 
1.479324 2.174759 +[Debug] dit_step3_xt: [2170, 64] first4: -0.109874 2.074197 -0.505584 0.343660 [DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 1.377216 0.234177 1.413437 2.181564 -[Debug] dit_step4_xt: [2170, 64] first4: -0.256912 2.053428 -0.659221 0.109620 +[Debug] dit_step4_vt: [2170, 64] first4: 1.396931 0.250122 1.401264 2.164902 +[Debug] dit_step4_xt: [2170, 64] first4: -0.259545 2.047398 -0.655720 0.111706 [DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 1.135239 0.376801 1.055233 2.272675 -[Debug] dit_step5_xt: [2170, 64] first4: -0.419089 1.999600 -0.809969 -0.215048 +[Debug] dit_step5_vt: [2170, 64] first4: 1.155813 0.405807 1.027550 2.260437 +[Debug] dit_step5_xt: [2170, 64] first4: -0.424661 1.989425 -0.802512 -0.211213 [DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: 0.948242 0.399368 0.426941 2.645081 -[Debug] dit_step6_xt: [2170, 64] first4: -0.608737 1.919726 -0.895357 -0.744064 +[Debug] dit_step6_vt: [2170, 64] first4: 0.916870 0.396088 0.350647 2.622253 +[Debug] dit_step6_xt: [2170, 64] first4: -0.608035 1.910208 -0.872642 -0.735664 [DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: 0.549133 -0.167076 0.379578 2.984619 -[Debug] dit_x0: [2170, 64] first4: -0.773477 1.969849 -1.009230 -1.639450 +[Debug] dit_step7_vt: [2170, 64] first4: 0.544876 -0.215309 0.434998 3.006592 +[Debug] dit_x0: [2170, 64] first4: -0.771498 1.974800 -1.003141 -1.637641 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 263.6 ms (263.6 ms/sample) -[Debug] dit_output: [2170, 64] first4: -0.773477 1.969849 -1.009230 -1.639450 +[DiT] Total generation: 342.3 ms (342.3 ms/sample) +[Debug] dit_output: [2170, 64] first4: -0.771498 1.974800 -1.003141 -1.637641 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 
+[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 9686.3 ms -[Debug] vae_audio: [2, 4166400] first4: 0.015021 0.018215 0.017495 0.016521 +[VAE Batch0] Decode: 1703.5 ms +[Debug] vae_audio: [2, 4166400] first4: 0.012597 0.015460 0.014870 0.014040 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:56:19.059 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:56:19.060 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:56:19.060 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:56:19.060 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:56:19.060 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:56:19.832 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:56:21.417 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:56:21.417 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:56:21.428 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... 
-2026-03-01 19:56:21.589 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:47:49.166 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:47:49.255 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:47:50.597 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:47:50.597 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:47:50.598 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:47:50.604 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:47:50.793 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:47:50.795 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:56:21.591 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:47:50.795 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:56:21.597 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:56:21.610 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:56:21.610 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:56:21.642 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:56:21.955 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:56:21.956 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:56:21.956 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006905794143676758, 'diffusion_time_cost': 0.3056776523590088, 'diffusion_per_step_time_cost': 0.0382097065448761, 'total_time_cost': 0.31258344650268555, 'offload_time_cost': 0.0} -2026-03-01 19:56:21.970 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:56:21.973 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:56:21.973 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:56:21.973 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:56:21.973 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:56:21.973 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:56:21.973 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:56:22.249 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:56:22.252 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:56:22.255 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:47:50.802 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:47:50.816 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:47:50.816 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:47:50.850 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:47:51.166 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:47:51.167 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:47:51.167 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006922483444213867, 'diffusion_time_cost': 0.3079640865325928, 'diffusion_per_step_time_cost': 0.0384955108165741, 'total_time_cost': 0.31488656997680664, 'offload_time_cost': 0.0} +2026-03-04 21:47:51.181 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:47:51.183 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:47:51.183 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.77 GB +2026-03-04 21:47:51.183 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:47:51.183 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.77 GB +2026-03-04 21:47:51.183 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.77 GB +2026-03-04 21:47:51.183 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:47:51.458 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:47:51.460 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:47:51.461 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf [GGML] Running acestep-v15-turbo-Q4_K_M.gguf... 
@@ -232,28 +227,28 @@ Using precomputed LM hints hidden_after_layer23 0.947132 dit_step0_vt 0.790630 dit_step0_xt 0.999550 - dit_step1_vt 0.812267 - dit_step1_xt 0.998316 - dit_step2_vt 0.797855 - dit_step2_xt 0.994982 - dit_step3_vt 0.785550 - dit_step3_xt 0.987155 - dit_step4_vt 0.777677 - dit_step4_xt 0.969894 - dit_step5_vt 0.765554 - dit_step5_xt 0.933268 - dit_step6_vt 0.748164 - dit_step6_xt 0.865654 - dit_step7_vt 0.704997 - dit_x0 0.768990 - vae_audio 0.377954 - vae_audio (STFT cosine) 0.669489 + dit_step1_vt 0.756205 + dit_step1_xt 0.998148 + dit_step2_vt 0.797194 + dit_step2_xt 0.994834 + dit_step3_vt 0.784456 + dit_step3_xt 0.987026 + dit_step4_vt 0.776725 + dit_step4_xt 0.969792 + dit_step5_vt 0.765077 + dit_step5_xt 0.933184 + dit_step6_vt 0.747231 + dit_step6_xt 0.865289 + dit_step7_vt 0.704165 + dit_x0 0.767979 + vae_audio 0.376451 + vae_audio (STFT cosine) 0.668630 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999550 0.201120 0.022082 -0.002496 0.972768 -0.002342 0.972003 - dit_step1_xt 0.998316 0.415084 0.041258 -0.005641 0.942202 -0.005313 0.941730 - dit_step2_xt 0.994982 0.710340 0.068500 -0.010236 0.907728 -0.009311 0.908527 - dit_step3_xt 0.987155 1.070455 0.105302 -0.016404 0.870181 -0.014577 0.873624 - dit_step4_xt 0.969894 1.456633 0.155292 -0.024587 0.833834 -0.021660 0.841995 - dit_step5_xt 0.933268 1.997366 0.225911 -0.035903 0.808944 -0.032109 0.824593 - dit_step6_xt 0.865654 3.020976 0.331484 -0.051668 0.828925 -0.046482 0.855546 + dit_step1_xt 0.998148 0.415598 0.043234 -0.005810 0.944103 -0.005313 0.941730 + dit_step2_xt 0.994834 0.709830 0.069736 -0.010410 0.909328 -0.009311 0.908527 + dit_step3_xt 0.987026 1.071567 0.106058 -0.016584 0.871456 -0.014577 0.873624 + dit_step4_xt 0.969792 1.488428 0.155756 -0.024763 0.834729 -0.021660 0.841995 + dit_step5_xt 0.933184 1.958024 0.226224 -0.036147 0.809005 -0.032109 0.824593 + dit_step6_xt 0.865289 3.030077 0.331834 -0.051892 
0.828296 -0.046482 0.855546 diff --git a/tests/Vulkan-Q5_K_M.log b/tests/Vulkan-Q5_K_M.log index ec38ab3..aa0eb9c 100644 --- a/tests/Vulkan-Q5_K_M.log +++ b/tests/Vulkan-Q5_K_M.log @@ -1,7 +1,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 [Load] DiT backend: Vulkan0 (CPU threads: 16) -[Load] Backend init: 114.1 ms +[Load] Backend init: 114.4 ms [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K fused, V separate [DiT] Cross-attn: all separate @@ -9,36 +9,34 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1061.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 151.9 ms +[Load] DiT weight load: 129.5 ms [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: Vulkan0 (CPU threads: 16) +[Load] VAE backend: Vulkan0 (shared) [VAE] Backend: Vulkan0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 677.1 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 660.3 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.6 ms +[Load] BPE tokenizer: 30.7 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: 
Vulkan0 (CPU threads: 16) +[Load] TextEncoder backend: Vulkan0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 167.6 ms -[Encode] TextEncoder (70 tokens): 18.0 ms +[Load] TextEncoder: 142.0 ms +[Encode] TextEncoder (70 tokens): 17.4 ms [Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 11.1 ms +[Encode] Lyric vocab lookup (167 tokens): 0.3 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[Load] CondEncoder backend: Vulkan0 (shared) [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K fused, V separate @@ -48,18 +46,18 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 412.5 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 55.7 ms +[Load] ConditionEncoder: 50.1 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 17.4 ms, enc_S=238 +[Encode] ConditionEncoder: 3109.7 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.760480 -0.051691 -0.132144 0.058144 [GGUF] ../models/acestep-v15-turbo-Q5_K_M.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 73.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 14.2 ms +[Load] Detokenizer: 9.1 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) 
-[Context] Detokenizer: 176.8 ms +[Context] Detokenizer: 674.8 ms [Debug] detok_output: [2170, 64] first4: -0.125636 1.455599 0.291766 -0.651349 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -96,53 +94,50 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Debug] dit_step1_vt: [2170, 64] first4: -0.053368 1.748116 -0.894806 1.618408 [Debug] dit_step1_xt: [2170, 64] first4: 0.197534 2.006799 -0.135800 0.647723 [DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: -0.025024 1.326050 -0.792084 2.043884 -[Debug] dit_step2_xt: [2170, 64] first4: 0.199202 1.918396 -0.082994 0.511464 +[Debug] dit_step2_vt: [2170, 64] first4: -0.031860 1.378967 -0.801270 2.036987 +[Debug] dit_step2_xt: [2170, 64] first4: 0.199658 1.914868 -0.082382 0.511923 [DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: -0.000458 1.126770 -0.795593 2.254120 -[Debug] dit_step3_xt: [2170, 64] first4: 0.199240 1.824498 -0.016695 0.323620 +[Debug] dit_step3_vt: [2170, 64] first4: 0.009003 1.141663 -0.806183 2.229477 +[Debug] dit_step3_xt: [2170, 64] first4: 0.198908 1.819729 -0.015200 0.326134 [DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.174652 1.253662 -1.125977 2.441956 -[Debug] dit_step4_xt: [2170, 64] first4: 0.180528 1.690177 0.103946 0.061982 +[Debug] dit_step4_vt: [2170, 64] first4: 0.174896 1.264160 -1.139648 2.439102 +[Debug] dit_step4_xt: [2170, 64] first4: 0.180169 1.684284 0.106905 0.064801 [DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.205261 1.640076 -1.795410 2.452087 -[Debug] dit_step5_xt: [2170, 64] first4: 0.151205 1.455881 0.360433 -0.288316 +[Debug] dit_step5_vt: [2170, 64] first4: 0.201294 1.641151 -1.784760 2.454834 +[Debug] dit_step5_xt: [2170, 64] first4: 0.151413 1.449833 0.361871 -0.285889 [DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: -0.158905 1.750122 -2.412979 2.419128 -[Debug] 
dit_step6_xt: [2170, 64] first4: 0.182986 1.105856 0.843029 -0.772142 +[Debug] dit_step6_vt: [2170, 64] first4: -0.154907 1.748291 -2.434448 2.425964 +[Debug] dit_step6_xt: [2170, 64] first4: 0.182394 1.100175 0.848760 -0.771082 [DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: -0.636047 1.672760 -3.485062 2.600891 -[Debug] dit_x0: [2170, 64] first4: 0.373800 0.604028 1.888547 -1.552409 +[Debug] dit_step7_vt: [2170, 64] first4: -0.633545 1.687561 -3.500275 2.586243 +[Debug] dit_x0: [2170, 64] first4: 0.372458 0.593907 1.898843 -1.546955 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 269.9 ms (269.9 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.373800 0.604028 1.888547 -1.552409 +[DiT] Total generation: 354.9 ms (354.9 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.372458 0.593907 1.898843 -1.546955 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 9630.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.001367 0.001844 0.001533 0.001892 +[VAE Batch0] Decode: 1718.2 ms +[Debug] vae_audio: [2, 4166400] first4: 0.001432 0.001921 0.001585 0.001927 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:56:02.727 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:56:02.728 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. 
Install with: pip install lycoris-lora -2026-03-01 19:56:02.728 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:56:02.728 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:56:02.728 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:56:03.499 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:56:05.072 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:56:05.072 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:56:05.078 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:56:05.239 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:47:37.062 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:47:37.143 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. 
This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:47:38.480 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:47:38.481 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:47:38.482 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:47:38.488 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:47:38.703 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:47:38.705 | INFO 
| acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:56:05.241 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:47:38.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:56:05.247 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:56:05.260 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... 
-2026-03-01 19:56:05.260 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:56:05.285 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:56:05.592 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... -2026-03-01 19:56:05.593 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:56:05.593 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.00687718391418457, 'diffusion_time_cost': 0.3001282215118408, 'diffusion_per_step_time_cost': 0.0375160276889801, 'total_time_cost': 0.3070054054260254, 'offload_time_cost': 0.0} -2026-03-01 19:56:05.607 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:56:05.609 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:56:05.610 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.40 GB -2026-03-01 19:56:05.610 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:56:05.610 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.40 GB -2026-03-01 19:56:05.610 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.40 GB -2026-03-01 19:56:05.610 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:56:05.884 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:56:05.888 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:56:05.891 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:47:38.712 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:47:38.726 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:47:38.726 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:47:38.761 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:47:39.078 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:47:39.079 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:47:39.079 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006884098052978516, 'diffusion_time_cost': 0.3090353012084961, 'diffusion_per_step_time_cost': 0.03862941265106201, 'total_time_cost': 0.3159193992614746, 'offload_time_cost': 0.0} +2026-03-04 21:47:39.092 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:47:39.095 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:47:39.095 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:47:39.095 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:47:39.095 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:47:39.095 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:47:39.095 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:47:39.374 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:47:39.376 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:47:39.378 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf [GGML] Running acestep-v15-turbo-Q5_K_M.gguf... 
@@ -234,26 +229,26 @@ Using precomputed LM hints dit_step0_xt 0.999650 dit_step1_vt 0.854589 dit_step1_xt 0.998725 - dit_step2_vt 0.841602 - dit_step2_xt 0.996217 - dit_step3_vt 0.832748 - dit_step3_xt 0.990342 - dit_step4_vt 0.826828 - dit_step4_xt 0.977304 - dit_step5_vt 0.815977 - dit_step5_xt 0.948497 - dit_step6_vt 0.803425 - dit_step6_xt 0.895308 - dit_step7_vt 0.770195 - dit_x0 0.820447 - vae_audio 0.478241 - vae_audio (STFT cosine) 0.753764 + dit_step2_vt 0.826891 + dit_step2_xt 0.996124 + dit_step3_vt 0.832715 + dit_step3_xt 0.990263 + dit_step4_vt 0.826558 + dit_step4_xt 0.977265 + dit_step5_vt 0.815705 + dit_step5_xt 0.948477 + dit_step6_vt 0.802898 + dit_step6_xt 0.895216 + dit_step7_vt 0.769793 + dit_x0 0.820156 + vae_audio 0.477357 + vae_audio (STFT cosine) 0.753154 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B dit_step0_xt 0.999650 0.235954 0.018872 -0.002255 0.973213 -0.002342 0.972003 dit_step1_xt 0.998725 0.437235 0.034677 -0.005176 0.942982 -0.005313 0.941730 - dit_step2_xt 0.996217 0.735376 0.057569 -0.009210 0.909169 -0.009311 0.908527 - dit_step3_xt 0.990342 1.115564 0.088544 -0.014811 0.872820 -0.014577 0.873624 - dit_step4_xt 0.977304 1.463506 0.131044 -0.022213 0.838526 -0.021660 0.841995 - dit_step5_xt 0.948497 2.208427 0.193557 -0.032833 0.817339 -0.032109 0.824593 - dit_step6_xt 0.895308 3.287671 0.286241 -0.047639 0.842369 -0.046482 0.855546 + dit_step2_xt 0.996124 0.735913 0.058267 -0.009379 0.909744 -0.009311 0.908527 + dit_step3_xt 0.990263 1.130236 0.088998 -0.014995 0.873310 -0.014577 0.873624 + dit_step4_xt 0.977265 1.457183 0.131253 -0.022419 0.838885 -0.021660 0.841995 + dit_step5_xt 0.948477 2.197404 0.193723 -0.033044 0.817537 -0.032109 0.824593 + dit_step6_xt 0.895216 3.271284 0.286472 -0.047848 0.842172 -0.046482 0.855546 diff --git a/tests/Vulkan-Q6_K.log b/tests/Vulkan-Q6_K.log index eff680f..a938da1 100644 --- a/tests/Vulkan-Q6_K.log +++ b/tests/Vulkan-Q6_K.log @@ -1,7 +1,7 @@ 
ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 [Load] DiT backend: Vulkan0 (CPU threads: 16) -[Load] Backend init: 114.2 ms +[Load] Backend init: 144.9 ms [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1237.2 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 181.3 ms +[Load] DiT weight load: 156.5 ms [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: Vulkan0 (CPU threads: 16) +[Load] VAE backend: Vulkan0 (shared) [VAE] Backend: Vulkan0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 670.0 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations +[Load] VAE weights: 657.4 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 32.2 ms +[Load] BPE tokenizer: 32.6 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[Load] TextEncoder backend: Vulkan0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: 
Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 165.9 ms -[Encode] TextEncoder (70 tokens): 17.6 ms +[Load] TextEncoder: 142.6 ms +[Encode] TextEncoder (70 tokens): 43.2 ms [Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 11.2 ms +[Encode] Lyric vocab lookup (167 tokens): 0.3 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[Load] CondEncoder backend: Vulkan0 (shared) [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 476.3 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 61.6 ms +[Load] ConditionEncoder: 55.4 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens -[Encode] ConditionEncoder: 15.6 ms, enc_S=238 +[Encode] ConditionEncoder: 3621.4 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.761356 -0.050570 -0.133026 0.058500 [GGUF] ../models/acestep-v15-turbo-Q6_K.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 82.2 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 10.8 ms +[Load] Detokenizer: 10.5 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 143.8 ms +[Context] Detokenizer: 421.5 ms [Debug] detok_output: [2170, 64] first4: -0.141024 1.454365 0.315089 -0.623565 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] 
first4: 0.194336 2.156250 -0.171875 0.847656 @@ -79,70 +77,67 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Debug] proj_in_input: [192, 2170] first4: -0.141024 1.454365 0.315089 -0.623565 [Debug] enc_after_cond_emb: [2048, 238] first4: -0.170166 0.815842 0.310486 -0.571373 [Debug] layer0_sa_input: [2048, 1085] first4: -0.716080 -0.755969 -0.048350 0.263422 -[Debug] layer0_q_after_rope: [128, 16] first4: -2.400391 -0.081909 -0.397461 1.011719 -[Debug] layer0_k_after_rope: [128, 8] first4: -12.581572 1.117675 1.774897 1.788774 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.503906 0.211304 -0.366943 0.520996 -[Debug] layer0_attn_out: [2048, 1085] first4: -1.540494 -1.050420 0.183235 0.461747 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.540494 -1.050420 0.183235 0.461747 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.586454 -0.808233 -0.324089 0.502214 -[Debug] hidden_after_layer0: [2048, 1085] first4: -9.155503 0.531986 51.823910 -0.865276 -[Debug] hidden_after_layer6: [2048, 1085] first4: -20.861578 -0.240065 34.589954 -4.288221 -[Debug] hidden_after_layer12: [2048, 1085] first4: -14.692959 -16.975090 77.250595 30.676491 -[Debug] hidden_after_layer18: [2048, 1085] first4: -28.082283 13.370504 64.661263 19.941170 -[Debug] hidden_after_layer23: [2048, 1085] first4: -16.195175 45.294254 196.766129 138.065048 -[Debug] dit_step0_vt: [2170, 64] first4: 0.098133 1.125458 0.338135 2.349396 -[Debug] dit_step0_xt: [2170, 64] first4: 0.189875 2.105093 -0.187245 0.740865 +[Debug] layer0_q_after_rope: [128, 16] first4: -1.967773 -0.181152 -0.292236 0.785156 +[Debug] layer0_k_after_rope: [128, 8] first4: -12.036863 1.158559 1.733423 1.789948 +[Debug] layer0_sa_output: [2048, 1085] first4: -1.344727 -0.353271 -0.171753 0.330078 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.487266 -0.799756 0.373941 0.458040 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.487266 -0.799756 0.373941 0.458040 +[Debug] 
layer0_after_cross_attn: [2048, 1085] first4: -1.478606 -0.639722 0.069986 0.503358 +[Debug] hidden_after_layer0: [2048, 1085] first4: -6.179441 -0.194424 25.726625 -0.569950 +[Debug] hidden_after_layer6: [2048, 1085] first4: -12.978424 -2.696237 30.199980 -5.338717 +[Debug] hidden_after_layer12: [2048, 1085] first4: -13.710206 -8.286438 60.887405 36.884922 +[Debug] hidden_after_layer18: [2048, 1085] first4: -19.046274 10.102365 41.516960 14.606686 +[Debug] hidden_after_layer23: [2048, 1085] first4: 52.532547 37.219868 135.759094 151.323456 +[Debug] dit_step0_vt: [2170, 64] first4: 0.032410 0.877930 -0.200378 2.148727 +[Debug] dit_step0_xt: [2170, 64] first4: 0.192863 2.116344 -0.162767 0.749987 [DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: -0.018386 1.071533 -0.402077 1.814056 -[Debug] dit_step1_xt: [2170, 64] first4: 0.190878 2.046645 -0.165313 0.641917 +[Debug] dit_step1_vt: [2170, 64] first4: -0.018381 1.082458 -0.369057 1.835251 +[Debug] dit_step1_xt: [2170, 64] first4: 0.193865 2.057301 -0.142637 0.649882 [DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: -0.052032 1.017303 -0.201233 2.115219 -[Debug] dit_step2_xt: [2170, 64] first4: 0.194347 1.978825 -0.151898 0.500902 +[Debug] dit_step2_vt: [2170, 64] first4: -0.045654 1.004852 -0.202515 2.128693 +[Debug] dit_step2_xt: [2170, 64] first4: 0.196909 1.990311 -0.129136 0.507969 [DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: 0.052856 1.105988 0.072205 2.288803 -[Debug] dit_step3_xt: [2170, 64] first4: 0.189942 1.886660 -0.157915 0.310169 +[Debug] dit_step3_vt: [2170, 64] first4: 0.053986 1.098206 0.059753 2.273270 +[Debug] dit_step3_xt: [2170, 64] first4: 0.192410 1.898794 -0.134115 0.318530 [DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.097982 1.134430 0.083038 2.362534 -[Debug] dit_step4_xt: [2170, 64] first4: 0.179444 1.765114 -0.166812 0.057040 +[Debug] dit_step4_vt: [2170, 64] first4: 0.059109 1.133232 0.098053 2.316540 +[Debug] 
dit_step4_xt: [2170, 64] first4: 0.186077 1.777376 -0.144621 0.070330 [DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.122574 1.016464 0.173828 2.333248 -[Debug] dit_step5_xt: [2170, 64] first4: 0.161934 1.619904 -0.191644 -0.276281 +[Debug] dit_step5_vt: [2170, 64] first4: 0.060867 1.087685 0.153732 2.225224 +[Debug] dit_step5_xt: [2170, 64] first4: 0.177382 1.621992 -0.166582 -0.247560 [DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: 0.070358 0.866913 -0.005890 2.297897 -[Debug] dit_step6_xt: [2170, 64] first4: 0.147862 1.446522 -0.190466 -0.735860 +[Debug] dit_step6_vt: [2170, 64] first4: -0.040359 0.926651 0.010437 2.195786 +[Debug] dit_step6_xt: [2170, 64] first4: 0.185454 1.436662 -0.168670 -0.686717 [DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: -0.360962 0.376282 -0.314270 2.626526 -[Debug] dit_x0: [2170, 64] first4: 0.256151 1.333637 -0.096185 -1.523818 +[Debug] dit_step7_vt: [2170, 64] first4: -0.506134 0.486553 -0.233337 2.557739 +[Debug] dit_x0: [2170, 64] first4: 0.337294 1.290696 -0.098669 -1.454038 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 276.6 ms (276.6 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.256151 1.333637 -0.096185 -1.523818 +[DiT] Total generation: 336.6 ms (336.6 ms/sample) +[Debug] dit_output: [2170, 64] first4: 0.337294 1.290696 -0.098669 -1.454038 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 9723.7 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000254 0.000880 0.000782 0.001025 +[VAE Batch0] Decode: 1718.9 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000037 0.000692 0.000656 0.000941 [VAE 
Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:55:46.361 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:55:46.361 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:55:46.361 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:55:46.362 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:55:46.362 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:55:47.150 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:55:48.700 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... -2026-03-01 19:55:48.700 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:55:48.705 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:55:48.864 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:47:24.206 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. 
+2026-03-04 21:47:24.287 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:47:25.614 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:47:25.614 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:47:25.616 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:47:25.621 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:47:25.810 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:47:25.812 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:55:48.866 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:47:25.812 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:55:48.872 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:55:48.885 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:55:48.885 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:55:48.917 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:55:49.229 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:55:49.230 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:55:49.230 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006822347640991211, 'diffusion_time_cost': 0.3050048351287842, 'diffusion_per_step_time_cost': 0.03812560439109802, 'total_time_cost': 0.3118271827697754, 'offload_time_cost': 0.0} -2026-03-01 19:55:49.244 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:55:49.267 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:55:49.267 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.47 GB -2026-03-01 19:55:49.267 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:55:49.267 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.47 GB -2026-03-01 19:55:49.267 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.47 GB -2026-03-01 19:55:49.267 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:55:49.543 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:55:49.546 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:55:49.549 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:47:25.819 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:47:25.833 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:47:25.833 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:47:25.868 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:47:26.184 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:47:26.184 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:47:26.185 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006891727447509766, 'diffusion_time_cost': 0.3077425956726074, 'diffusion_per_step_time_cost': 0.03846782445907593, 'total_time_cost': 0.3146343231201172, 'offload_time_cost': 0.0} +2026-03-04 21:47:26.198 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:47:26.201 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:47:26.201 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.85 GB +2026-03-04 21:47:26.201 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:47:26.201 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.85 GB +2026-03-04 21:47:26.201 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.85 GB +2026-03-04 21:47:26.201 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:47:26.477 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:47:26.479 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:47:26.481 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf [GGML] Running acestep-v15-turbo-Q6_K.gguf... 
@@ -224,36 +219,36 @@ Using precomputed LM hints temb_t 0.999990 hidden_after_proj_in 0.999982 enc_after_cond_emb 0.999691 - layer0_sa_output 0.999774 - hidden_after_layer0 0.999710 - hidden_after_layer6 0.999855 - hidden_after_layer12 0.998856 - hidden_after_layer18 0.995803 - hidden_after_layer23 0.992072 - dit_step0_vt 0.970064 - dit_step0_xt 0.999934 - dit_step1_vt 0.924403 - dit_step1_xt 0.999650 - dit_step2_vt 0.915580 - dit_step2_xt 0.998651 - dit_step3_vt 0.914431 - dit_step3_xt 0.996098 - dit_step4_vt 0.913750 - dit_step4_xt 0.990344 - dit_step5_vt 0.906205 - dit_step5_xt 0.976856 - dit_step6_vt 0.897054 - dit_step6_xt 0.950943 - dit_step7_vt 0.876737 - dit_x0 0.912738 - vae_audio 0.744947 - vae_audio (STFT cosine) 0.875717 + layer0_sa_output 0.916347 + hidden_after_layer0 0.997124 + hidden_after_layer6 0.993692 + hidden_after_layer12 0.992958 + hidden_after_layer18 0.988620 + hidden_after_layer23 0.980873 + dit_step0_vt 0.928387 + dit_step0_xt 0.999844 + dit_step1_vt 0.919122 + dit_step1_xt 0.999441 + dit_step2_vt 0.904200 + dit_step2_xt 0.998155 + dit_step3_vt 0.897635 + dit_step3_xt 0.994890 + dit_step4_vt 0.891638 + dit_step4_xt 0.987300 + dit_step5_vt 0.886907 + dit_step5_xt 0.970219 + dit_step6_vt 0.876538 + dit_step6_xt 0.938117 + dit_step7_vt 0.853291 + dit_x0 0.891872 + vae_audio 0.694699 + vae_audio (STFT cosine) 0.858167 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999934 0.147239 0.007394 -0.002260 0.973056 -0.002342 0.972003 - dit_step1_xt 0.999650 0.408757 0.017759 -0.005276 0.943557 -0.005313 0.941730 - dit_step2_xt 0.998651 0.803721 0.033644 -0.009510 0.911087 -0.009311 0.908527 - dit_step3_xt 0.996098 1.476888 0.054660 -0.015226 0.876460 -0.014577 0.873624 - dit_step4_xt 0.990344 2.294700 0.082632 -0.022702 0.844225 -0.021660 0.841995 - dit_step5_xt 0.976856 3.284146 0.125042 -0.033545 0.825286 -0.032109 0.824593 - dit_step6_xt 0.950943 4.445529 0.188707 -0.049081 0.851111 
-0.046482 0.855546 + dit_step0_xt 0.999844 0.420509 0.012210 -0.002227 0.973206 -0.002342 0.972003 + dit_step1_xt 0.999441 0.819075 0.022719 -0.005232 0.943799 -0.005313 0.941730 + dit_step2_xt 0.998155 1.085687 0.039812 -0.009404 0.911549 -0.009311 0.908527 + dit_step3_xt 0.994890 1.743559 0.063467 -0.015082 0.877147 -0.014577 0.873624 + dit_step4_xt 0.987300 2.546782 0.096584 -0.022664 0.845277 -0.021660 0.841995 + dit_step5_xt 0.970219 3.539635 0.144911 -0.033717 0.826728 -0.032109 0.824593 + dit_step6_xt 0.938117 4.795851 0.216607 -0.049484 0.852836 -0.046482 0.855546 diff --git a/tests/Vulkan-Q8_0.log b/tests/Vulkan-Q8_0.log index 774bc8a..ef53667 100644 --- a/tests/Vulkan-Q8_0.log +++ b/tests/Vulkan-Q8_0.log @@ -1,7 +1,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 1 | matrix cores: NV_coopmat2 [Load] DiT backend: Vulkan0 (CPU threads: 16) -[Load] Backend init: 113.5 ms +[Load] Backend init: 111.5 ms [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [DiT] Self-attn: Q+K+V fused [DiT] Cross-attn: Q+K+V fused @@ -9,36 +9,34 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Load] null_condition_emb found (CFG available) [WeightCtx] Loaded 478 tensors, 1600.7 MB into backend [Load] DiT: 24 layers, H=2048, Nh=16/8, D=128 -[Load] DiT weight load: 214.1 ms +[Load] DiT weight load: 194.1 ms [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] silence_latent: [15000, 64] from GGUF [GGUF] ../models/vae-BF16.gguf: 365 tensors, data at offset 30048 -[Load] VAE backend: Vulkan0 (CPU threads: 16) +[Load] VAE backend: Vulkan0 (shared) [VAE] Backend: Vulkan0, Weight buffer: 161.1 MB -[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations -[Load] VAE weights: 671.7 ms +[VAE] Loaded: 5 blocks, upsample=1920x, F32 activations 
+[Load] VAE weights: 657.9 ms [Request 1/1] ggml-turbo/request0.json (batch=1) [Request] parsed ggml-turbo/request0.json (18 fields) -[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was 7.0) [Pipeline] seed=42, steps=8, guidance=1.0, shift=3.0, duration=88.0s [Pipeline] 434 audio codes (86.8s @ 5Hz) [Pipeline] T=2170, S=1085 [BPE] Loaded from GGUF: 151643 vocab, 151387 merges -[Load] BPE tokenizer: 31.9 ms +[Load] BPE tokenizer: 31.4 ms [Pipeline] caption: 70 tokens, lyrics: 167 tokens -[Load] TextEncoder backend: Vulkan0 (CPU threads: 16) +[Load] TextEncoder backend: Vulkan0 (shared) [GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 [Load] TextEncoder: 28L, H=1024, Nh=16/8 [Qwen3] Attn: Q+K+V fused [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 310 tensors, 1136.5 MB into backend -[Load] TextEncoder: 176.0 ms +[Load] TextEncoder: 145.4 ms [Encode] TextEncoder (70 tokens): 17.6 ms [Debug] text_hidden: [70, 1024] first4: 3.705836 2.395382 0.221845 -13.145830 -[GGUF] ../models/Qwen3-Embedding-0.6B-BF16.gguf: 310 tensors, data at offset 5337568 -[Encode] Lyric vocab lookup (167 tokens): 11.2 ms +[Encode] Lyric vocab lookup (167 tokens): 0.3 ms [Debug] lyric_embed: [167, 1024] first4: 0.029175 0.032227 -0.022339 -0.028809 -[Load] CondEncoder backend: Vulkan0 (CPU threads: 16) +[Load] CondEncoder backend: Vulkan0 (shared) [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [Load] LyricEncoder: 8L [Qwen3] Attn: Q+K+V fused @@ -48,18 +46,18 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Qwen3] MLP: gate+up fused [WeightCtx] Loaded 140 tensors, 616.6 MB into backend [Load] CondEncoder: lyric(8L), timbre(4L), text_proj, null_cond -[Load] ConditionEncoder: 84.7 ms +[Load] ConditionEncoder: 75.8 ms [CondEnc] Lyric sliding mask: 167x167, window=128 [CondEnc] Timbre sliding mask: 750x750, window=128 [Encode] Packed: lyric=167 + timbre=1 + text=70 = 238 tokens 
-[Encode] ConditionEncoder: 19.4 ms, enc_S=238 +[Encode] ConditionEncoder: 5074.3 ms, enc_S=238 [Debug] enc_hidden: [238, 2048] first4: 1.759194 -0.049729 -0.133332 0.058435 [GGUF] ../models/acestep-v15-turbo-Q8_0.gguf: 678 tensors, data at offset 56864 [WeightCtx] Loaded 30 tensors, 106.5 MB into backend [Load] Detokenizer: FSQ(6->2048) + 2L encoder(S=5, 2048->64) -[Load] Detokenizer: 15.5 ms +[Load] Detokenizer: 13.7 ms [Context] Decoded: 434 codes -> 2170 frames (86.8s @ 25Hz) -[Context] Detokenizer: 85.1 ms +[Context] Detokenizer: 437.6 ms [Debug] detok_output: [2170, 64] first4: -0.121505 1.434749 0.303808 -0.627535 [Context Batch0] Philox noise seed=42, [2170, 64] [Debug] noise: [2170, 64] first4: 0.194336 2.156250 -0.171875 0.847656 @@ -79,70 +77,67 @@ ggml_vulkan: 0 = NVIDIA RTX PRO 6000 Blackwell Workstation Edition (NVIDIA) | um [Debug] proj_in_input: [192, 2170] first4: -0.121505 1.434749 0.303808 -0.627535 [Debug] enc_after_cond_emb: [2048, 238] first4: -0.169861 0.817307 0.328308 -0.558397 [Debug] layer0_sa_input: [2048, 1085] first4: -0.718007 -0.757392 -0.047301 0.261071 -[Debug] layer0_q_after_rope: [128, 16] first4: -2.423828 -0.099304 -0.408203 1.004883 -[Debug] layer0_k_after_rope: [128, 8] first4: -12.718538 1.122484 1.774887 1.790079 -[Debug] layer0_sa_output: [2048, 1085] first4: -1.510742 0.165771 -0.347900 0.511230 -[Debug] layer0_attn_out: [2048, 1085] first4: -1.542524 -1.031132 0.196691 0.455273 -[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.542524 -1.031132 0.196691 0.455273 -[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.585310 -0.791508 -0.290125 0.495190 -[Debug] hidden_after_layer0: [2048, 1085] first4: -8.926053 0.558007 51.172398 -0.877717 -[Debug] hidden_after_layer6: [2048, 1085] first4: -20.768745 -0.272222 34.170349 -4.416629 -[Debug] hidden_after_layer12: [2048, 1085] first4: -14.358247 -18.625305 73.571915 30.079784 -[Debug] hidden_after_layer18: [2048, 1085] first4: -26.789474 14.346137 62.040115 
19.708126 -[Debug] hidden_after_layer23: [2048, 1085] first4: -2.927731 38.887718 192.805542 144.255524 -[Debug] dit_step0_vt: [2170, 64] first4: 0.027340 1.115875 0.350609 2.345856 -[Debug] dit_step0_xt: [2170, 64] first4: 0.193093 2.105528 -0.187812 0.741026 +[Debug] layer0_q_after_rope: [128, 16] first4: -2.134766 -0.300049 -0.404541 0.904297 +[Debug] layer0_k_after_rope: [128, 8] first4: -12.349133 1.249128 1.744302 1.794822 +[Debug] layer0_sa_output: [2048, 1085] first4: -0.636230 -0.405029 0.096436 0.194946 +[Debug] layer0_attn_out: [2048, 1085] first4: -1.250806 -0.777872 0.630535 0.449394 +[Debug] layer0_after_self_attn: [2048, 1085] first4: -1.250806 -0.777872 0.630535 0.449394 +[Debug] layer0_after_cross_attn: [2048, 1085] first4: -1.239698 -0.683206 0.416180 0.509788 +[Debug] hidden_after_layer0: [2048, 1085] first4: -4.682029 -0.464333 15.184165 -0.212429 +[Debug] hidden_after_layer6: [2048, 1085] first4: -8.053159 0.591622 20.595821 -6.469027 +[Debug] hidden_after_layer12: [2048, 1085] first4: -11.836857 -8.197025 41.079239 30.392553 +[Debug] hidden_after_layer18: [2048, 1085] first4: -20.004263 1.558971 15.575721 16.331001 +[Debug] hidden_after_layer23: [2048, 1085] first4: 23.482555 18.593208 82.512901 173.016068 +[Debug] dit_step0_vt: [2170, 64] first4: 0.084528 0.834541 -0.408783 2.115417 +[Debug] dit_step0_xt: [2170, 64] first4: 0.190494 2.118316 -0.153294 0.751501 [DiT] step 1/8 t=1.000 -[Debug] dit_step1_vt: [2170, 64] first4: 0.002377 1.005737 -0.352661 1.768188 -[Debug] dit_step1_xt: [2170, 64] first4: 0.192964 2.050670 -0.168576 0.644580 +[Debug] dit_step1_vt: [2170, 64] first4: -0.071388 1.041626 -0.270477 1.704315 +[Debug] dit_step1_xt: [2170, 64] first4: 0.194388 2.061500 -0.138541 0.658538 [DiT] step 2/8 t=0.955 -[Debug] dit_step2_vt: [2170, 64] first4: -0.063080 1.061218 -0.344177 1.926041 -[Debug] dit_step2_xt: [2170, 64] first4: 0.197169 1.979922 -0.145631 0.516177 +[Debug] dit_step2_vt: [2170, 64] first4: -0.071960 1.095016 -0.333557 
1.988541 +[Debug] dit_step2_xt: [2170, 64] first4: 0.199185 1.988499 -0.116304 0.525969 [DiT] step 3/8 t=0.900 -[Debug] dit_step3_vt: [2170, 64] first4: -0.072388 1.144592 -0.184326 2.069214 -[Debug] dit_step3_xt: [2170, 64] first4: 0.203201 1.884539 -0.130270 0.343743 +[Debug] dit_step3_vt: [2170, 64] first4: -0.037468 1.148598 -0.165955 2.091240 +[Debug] dit_step3_xt: [2170, 64] first4: 0.202307 1.892783 -0.102474 0.351699 [DiT] step 4/8 t=0.833 -[Debug] dit_step4_vt: [2170, 64] first4: 0.004288 1.147110 0.001495 2.068916 -[Debug] dit_step4_xt: [2170, 64] first4: 0.202742 1.761635 -0.130430 0.122073 +[Debug] dit_step4_vt: [2170, 64] first4: 0.014343 1.134537 -0.033691 2.114731 +[Debug] dit_step4_xt: [2170, 64] first4: 0.200771 1.771225 -0.098864 0.125120 [DiT] step 5/8 t=0.750 -[Debug] dit_step5_vt: [2170, 64] first4: 0.070211 1.173462 0.080673 2.086014 -[Debug] dit_step5_xt: [2170, 64] first4: 0.192712 1.593997 -0.141955 -0.175929 +[Debug] dit_step5_vt: [2170, 64] first4: 0.064150 1.159027 0.062057 2.121386 +[Debug] dit_step5_xt: [2170, 64] first4: 0.191606 1.605650 -0.107730 -0.177935 [DiT] step 6/8 t=0.643 -[Debug] dit_step6_vt: [2170, 64] first4: -0.010117 1.145203 0.186996 2.198898 -[Debug] dit_step6_xt: [2170, 64] first4: 0.194735 1.364957 -0.179354 -0.615709 +[Debug] dit_step6_vt: [2170, 64] first4: -0.041473 1.200439 0.198494 2.240326 +[Debug] dit_step6_xt: [2170, 64] first4: 0.199901 1.365562 -0.147428 -0.626000 [DiT] step 7/8 t=0.500 -[Debug] dit_step7_vt: [2170, 64] first4: -0.244629 0.644890 0.358635 2.446594 -[Debug] dit_x0: [2170, 64] first4: 0.268124 1.171490 -0.286945 -1.349687 +[Debug] dit_step7_vt: [2170, 64] first4: -0.309998 0.692413 0.432823 2.469238 +[Debug] dit_x0: [2170, 64] first4: 0.292900 1.157838 -0.277275 -1.366771 [DiT] step 8/8 t=0.300 -[DiT] Total generation: 252.0 ms (252.0 ms/sample) -[Debug] dit_output: [2170, 64] first4: 0.268124 1.171490 -0.286945 -1.349687 +[DiT] Total generation: 335.0 ms (335.0 ms/sample) +[Debug] 
dit_output: [2170, 64] first4: 0.292900 1.157838 -0.277275 -1.366771 [VAE] Tiled decode: 17 tiles (chunk=256, overlap=64, stride=128) -[VAE] Graph: 417 nodes, T_latent=192 +[VAE] Graph: 335 nodes, T_latent=192 [VAE] Upsample factor: 1920.00 (expected ~1920) -[VAE] Graph: 417 nodes, T_latent=256 -[VAE] Graph: 417 nodes, T_latent=186 +[VAE] Graph: 335 nodes, T_latent=256 +[VAE] Graph: 335 nodes, T_latent=186 [VAE] Tiled decode done: 17 tiles -> T_audio=4166400 (86.80s @ 48kHz) -[VAE Batch0] Decode: 9843.4 ms -[Debug] vae_audio: [2, 4166400] first4: 0.000170 0.000825 0.000784 0.001115 +[VAE Batch0] Decode: 1706.4 ms +[Debug] vae_audio: [2, 4166400] first4: 0.000160 0.000739 0.000691 0.001054 [VAE Batch0] Wrote ggml-turbo/request00.wav: 4166400 samples (86.80s @ 48kHz stereo) [Request 1/1] Done [Pipeline] All done -2026-03-01 19:55:29.948 | WARNING | acestep.training.lora_utils::29 - PEFT library not installed. LoRA training will not be available. -2026-03-01 19:55:29.948 | WARNING | acestep.training.lokr_utils::24 - LyCORIS library not installed. LoKr training/inference unavailable. Install with: pip install lycoris-lora -2026-03-01 19:55:29.948 | WARNING | acestep.training.data_module::25 - Lightning not installed. Training module will not be available. -2026-03-01 19:55:29.948 | WARNING | acestep.training.trainer::28 - Lightning Fabric not installed. Training will use basic training loop. -2026-03-01 19:55:29.948 | WARNING | acestep.training.trainer::36 - bitsandbytes not installed. Using standard AdamW. -2026-03-01 19:55:30.699 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa -`torch_dtype` is deprecated! Use `dtype` instead! -2026-03-01 19:55:32.273 | INFO | acestep.core.generation.handler.generate_music:generate_music:92 - [generate_music] Starting generation... 
-2026-03-01 19:55:32.274 | INFO | acestep.core.generation.handler.generate_music:generate_music:95 - [generate_music] Preparing inputs... -2026-03-01 19:55:32.279 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... -2026-03-01 19:55:32.442 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... -2026-03-01 19:55:32.443 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - +2026-03-04 21:47:11.115 | WARNING | acestep.training.trainer::40 - bitsandbytes not installed. Using standard AdamW. +2026-03-04 21:47:11.205 | INFO | acestep.core.generation.handler.init_service_loader:_load_main_model_from_checkpoint:55 - [initialize_service] Attempting to load model with attention implementation: sdpa +Unable to import `torchao` Tensor objects. This may affect loading checkpoints serialized with `torchao` +2026-03-04 21:47:12.506 | INFO | acestep.core.generation.handler.generate_music:generate_music:164 - [generate_music] Starting generation... +2026-03-04 21:47:12.506 | INFO | acestep.core.generation.handler.generate_music:generate_music:167 - [generate_music] Preparing inputs... +2026-03-04 21:47:12.508 | INFO | acestep.core.generation.handler.generate_music:_vram_preflight_check:70 - [generate_music] VRAM pre-flight: 86.40 GB free, ~0.94 GB needed (batch=1, duration=88s, mode=turbo). +2026-03-04 21:47:12.513 | INFO | acestep.core.generation.handler.conditioning_target:_prepare_target_latents_and_wavs:41 - [generate_music] Decoding audio codes for item 0... +2026-03-04 21:47:12.703 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_precomputed_lm_hints:31 - [generate_music] Decoding audio codes for LM hints for item 0... 
+2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:85 - ====================================================================== -2026-03-01 19:55:32.443 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) -2026-03-01 19:55:32.443 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== -2026-03-01 19:55:32.444 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: +2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:86 - 🔍 [DEBUG] DiT TEXT ENCODER INPUT (Inference) +2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:87 - ====================================================================== +2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:88 - text_prompt: # Instruction Generate audio semantic tokens based on the given conditions: @@ -156,8 +151,8 @@ An upbeat and anthemic pop-rock track driven by bright, slightly overdriven - duration: 88 seconds <|endoftext|> -2026-03-01 19:55:32.444 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== -2026-03-01 19:55:32.444 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: +2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:89 - ====================================================================== +2026-03-04 21:47:12.705 | INFO | 
acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:90 - lyrics_text: # Languages fr @@ -184,25 +179,25 @@ Dans le monde des tutos virtuels Gândoline, Pumbé à midi Une famille à connecter, c'est vrai D'un enfant qui voit toi fusionner<|endoftext|> -2026-03-01 19:55:32.444 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== +2026-03-04 21:47:12.705 | INFO | acestep.core.generation.handler.conditioning_text:_prepare_text_conditioning_inputs:91 - ====================================================================== -2026-03-01 19:55:32.450 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... -2026-03-01 19:55:32.462 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... -2026-03-01 19:55:32.463 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) -2026-03-01 19:55:32.484 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... -2026-03-01 19:55:32.791 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. Decoding latents... 
-2026-03-01 19:55:32.791 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 -2026-03-01 19:55:32.791 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006818294525146484, 'diffusion_time_cost': 0.2995321750640869, 'diffusion_per_step_time_cost': 0.037441521883010864, 'total_time_cost': 0.3063504695892334, 'offload_time_cost': 0.0} -2026-03-01 19:55:32.806 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... -2026-03-01 19:55:32.808 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB -2026-03-01 19:55:32.808 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 87.40 GB -2026-03-01 19:55:32.808 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
-2026-03-01 19:55:32.808 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 87.40 GB -2026-03-01 19:55:32.808 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 87.40 GB -2026-03-01 19:55:32.808 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) -2026-03-01 19:55:33.083 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB -2026-03-01 19:55:33.084 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... -2026-03-01 19:55:33.088 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. +2026-03-04 21:47:12.713 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:110 - [preprocess_batch] Inferring prompt embeddings... +2026-03-04 21:47:12.727 | INFO | acestep.core.generation.handler.conditioning_embed:preprocess_batch:113 - [preprocess_batch] Inferring lyric embeddings... +2026-03-04 21:47:12.727 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:120 - [service_generate] Generating audio... (DiT backend: PyTorch (cuda)) +2026-03-04 21:47:12.758 | INFO | acestep.core.generation.handler.service_generate_execute:_execute_service_generate_diffusion:200 - [service_generate] DiT diffusion via PyTorch (cuda)... +2026-03-04 21:47:13.073 | INFO | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:41 - [generate_music] Model generation completed. 
Decoding latents... +2026-03-04 21:47:13.073 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:63 - [generate_music] pred_latents: torch.Size([1, 2170, 64]), dtype=torch.bfloat16 +2026-03-04 21:47:13.073 | DEBUG | acestep.core.generation.handler.generate_music_decode:_prepare_generate_music_decode_state:64 - [generate_music] time_costs: {'encoder_time_cost': 0.006865262985229492, 'diffusion_time_cost': 0.30722999572753906, 'diffusion_per_step_time_cost': 0.03840374946594238, 'total_time_cost': 0.31409525871276855, 'offload_time_cost': 0.0} +2026-03-04 21:47:13.087 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:118 - [generate_music] Decoding latents with VAE... +2026-03-04 21:47:13.096 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:127 - [generate_music] Before VAE decode: allocated=5.98GB, max=7.29GB +2026-03-04 21:47:13.096 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:145 - [generate_music] Effective free VRAM before VAE decode: 86.82 GB +2026-03-04 21:47:13.096 | INFO | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:163 - [generate_music] Using tiled VAE decode to reduce VRAM usage... 
+2026-03-04 21:47:13.096 | DEBUG | acestep.core.generation.handler.memory_utils:_get_auto_decode_chunk_size:75 - [_get_auto_decode_chunk_size] Effective free VRAM: 86.82 GB +2026-03-04 21:47:13.096 | DEBUG | acestep.core.generation.handler.memory_utils:_should_offload_wav_to_cpu:98 - [_should_offload_wav_to_cpu] Effective free VRAM: 86.82 GB +2026-03-04 21:47:13.096 | INFO | acestep.core.generation.handler.vae_decode:tiled_decode:56 - [tiled_decode] chunk_size=512, offload_wav_to_cpu=False, latents_shape=torch.Size([1, 64, 2170]) +2026-03-04 21:47:13.370 | DEBUG | acestep.core.generation.handler.generate_music_decode:_decode_generate_music_pred_latents:185 - [generate_music] After VAE decode: allocated=6.15GB, max=7.44GB +2026-03-04 21:47:13.372 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:35 - [generate_music] VAE decode completed. Preparing audio tensors... +2026-03-04 21:47:13.374 | INFO | acestep.core.generation.handler.generate_music_payload:_build_generate_music_success_payload:45 - [generate_music] Done! Generated 1 audio tensors. [Request] Loaded request0.json [Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf [GGML] Running acestep-v15-turbo-Q8_0.gguf... 
@@ -224,36 +219,36 @@ Using precomputed LM hints temb_t 0.999998 hidden_after_proj_in 0.999985 enc_after_cond_emb 0.999817 - layer0_sa_output 0.999939 - hidden_after_layer0 0.999858 - hidden_after_layer6 0.999893 - hidden_after_layer12 0.999124 - hidden_after_layer18 0.996403 - hidden_after_layer23 0.993183 - dit_step0_vt 0.973885 - dit_step0_xt 0.999943 - dit_step1_vt 0.915468 - dit_step1_xt 0.999633 - dit_step2_vt 0.912211 - dit_step2_xt 0.998544 - dit_step3_vt 0.912707 - dit_step3_xt 0.995860 - dit_step4_vt 0.906019 - dit_step4_xt 0.989505 - dit_step5_vt 0.896537 - dit_step5_xt 0.974659 - dit_step6_vt 0.886047 - dit_step6_xt 0.945866 - dit_step7_vt 0.869793 - dit_x0 0.905017 - vae_audio 0.746037 - vae_audio (STFT cosine) 0.898352 + layer0_sa_output 0.896665 + hidden_after_layer0 0.996506 + hidden_after_layer6 0.988924 + hidden_after_layer12 0.986595 + hidden_after_layer18 0.980435 + hidden_after_layer23 0.969958 + dit_step0_vt 0.880150 + dit_step0_xt 0.999739 + dit_step1_vt 0.904993 + dit_step1_xt 0.999178 + dit_step2_vt 0.897232 + dit_step2_xt 0.997639 + dit_step3_vt 0.896753 + dit_step3_xt 0.994077 + dit_step4_vt 0.889861 + dit_step4_xt 0.986004 + dit_step5_vt 0.878511 + dit_step5_xt 0.967661 + dit_step6_vt 0.867605 + dit_step6_xt 0.933014 + dit_step7_vt 0.848412 + dit_x0 0.884572 + vae_audio 0.692036 + vae_audio (STFT cosine) 0.882942 [Turbo] Error growth GGML vs Python stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999943 0.140034 0.006943 -0.002318 0.973036 -0.002342 0.972003 - dit_step1_xt 0.999633 0.423125 0.018056 -0.005257 0.943026 -0.005313 0.941730 - dit_step2_xt 0.998544 0.841908 0.034537 -0.009209 0.910286 -0.009311 0.908527 - dit_step3_xt 0.995860 1.521911 0.055719 -0.014626 0.875169 -0.014577 0.873624 - dit_step4_xt 0.989505 2.346452 0.085477 -0.021803 0.842334 -0.021660 0.841995 - dit_step5_xt 0.974659 3.387389 0.130921 -0.032225 0.822365 -0.032109 0.824593 - dit_step6_xt 0.945866 4.812943 0.199910 -0.047290 0.846751 
-0.046482 0.855546 + dit_step0_xt 0.999739 0.400727 0.016274 -0.002102 0.972847 -0.002342 0.972003 + dit_step1_xt 0.999178 0.814308 0.027485 -0.004968 0.942952 -0.005313 0.941730 + dit_step2_xt 0.997639 1.101152 0.044575 -0.008840 0.910138 -0.009311 0.908527 + dit_step3_xt 0.994077 1.762341 0.067497 -0.014170 0.875003 -0.014577 0.873624 + dit_step4_xt 0.986004 2.565164 0.099802 -0.021228 0.842166 -0.021660 0.841995 + dit_step5_xt 0.967661 3.593323 0.149360 -0.031486 0.822043 -0.032109 0.824593 + dit_step6_xt 0.933014 4.978329 0.224230 -0.046337 0.845793 -0.046482 0.855546 diff --git a/tests/Vulkan_BF16.log b/tests/Vulkan_BF16.log deleted file mode 100644 index bd5f26b..0000000 --- a/tests/Vulkan_BF16.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf -[GGML] Running acestep-v15-turbo-BF16.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999812 - lyric_embed 1.000000 - enc_hidden 0.999834 - detok_output 0.999997 - context 0.999998 - noise 1.000000 - temb_t 0.999999 - hidden_after_proj_in 0.999987 - enc_after_cond_emb 0.999825 - layer0_sa_output 0.999959 - hidden_after_layer0 0.999982 - hidden_after_layer6 0.999916 - hidden_after_layer12 0.999276 - hidden_after_layer18 0.996645 - hidden_after_layer23 0.993735 - dit_step0_vt 0.975502 - dit_step0_xt 0.999946 - dit_step1_vt 0.898326 - dit_step1_xt 0.999578 - dit_step2_vt 0.893586 - dit_step2_xt 0.998276 - dit_step3_vt 0.881101 - dit_step3_xt 0.994720 - dit_step4_vt 0.869138 - dit_step4_xt 0.986137 - dit_step5_vt 0.854878 - dit_step5_xt 0.965846 - dit_step6_vt 0.840298 - dit_step6_xt 0.925771 - dit_step7_vt 0.818300 - dit_x0 0.867401 - vae_audio 0.680429 - vae_audio (STFT cosine) 0.855382 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999946 0.135811 0.006633 -0.002316 0.972919 -0.002342 0.972003 - dit_step1_xt 0.999578 0.413265 0.019706 -0.005121 0.942541 -0.005313 0.941730 - dit_step2_xt 0.998276 0.811472 0.038208 -0.008968 0.908957 -0.009311 0.908527 - dit_step3_xt 0.994720 1.481150 0.064047 -0.014385 0.872574 -0.014577 0.873624 - dit_step4_xt 0.986137 1.857148 0.100272 -0.021489 0.837038 -0.021660 0.841995 - dit_step5_xt 0.965846 1.439633 0.154129 -0.031859 0.812819 -0.032109 0.824593 - dit_step6_xt 0.925771 2.125688 0.235367 -0.046759 0.832442 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-BF16.gguf -[GGML] Running acestep-v15-sft-BF16.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999812 - lyric_embed 1.000000 - enc_hidden 0.999834 - detok_output 0.999997 - context 0.999998 - noise 1.000000 - temb_t 0.999997 - hidden_after_proj_in 0.999987 - enc_after_cond_emb 0.999828 - layer0_sa_output 0.999951 - hidden_after_layer0 0.999982 - hidden_after_layer6 0.999849 - hidden_after_layer12 0.999486 - hidden_after_layer18 0.998746 - hidden_after_layer23 0.998992 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998963 - dit_step0_vt_uncond 0.973704 - dit_step0_vt 0.986492 - dit_step0_xt 0.999992 - dit_step5_vt_cond 0.978980 - dit_step5_vt 0.906055 - dit_step5_xt 0.999319 - dit_step10_vt_cond 0.961518 - dit_step10_vt 0.898737 - dit_step10_xt 0.996347 - dit_step15_vt_cond 0.933830 - dit_step15_vt 0.840233 - dit_step15_xt 0.988073 - dit_step20_vt_cond 0.894620 - dit_step20_vt 0.796873 - dit_step20_xt 0.970961 - dit_step25_vt_cond 0.845710 - dit_step25_vt 0.737589 - dit_step25_xt 0.943356 - dit_step30_vt_cond 0.791700 - dit_step30_vt 0.686150 - dit_step30_xt 0.906182 - dit_step35_vt_cond 0.734800 - dit_step35_vt 0.627091 - dit_step35_xt 0.866844 - dit_step40_vt_cond 0.692744 - dit_step40_vt 0.579983 - dit_step40_xt 0.832660 - dit_step45_vt_cond 0.707766 - dit_step45_vt 0.576903 - dit_step45_xt 0.809828 - dit_step49_vt_cond 0.753038 - dit_step49_vt 0.625137 - dit_x0 0.801669 - vae_audio 0.494694 - vae_audio (STFT cosine) 0.706773 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999992 0.064200 0.003294 -0.001888 0.980082 -0.001741 0.980402 - dit_step5_xt 0.999319 0.557092 0.024040 -0.006621 0.887864 -0.007143 0.887999 - dit_step10_xt 0.996347 0.965268 0.050926 -0.011718 0.806420 -0.012603 0.811299 - dit_step15_xt 0.988073 
0.861492 0.085157 -0.016277 0.731584 -0.018114 0.745268 - dit_step20_xt 0.970961 1.278730 0.125264 -0.020700 0.671902 -0.023808 0.699582 - dit_step25_xt 0.943356 1.796219 0.169586 -0.025074 0.633808 -0.029311 0.679278 - dit_step30_xt 0.906182 2.190889 0.219620 -0.029769 0.614453 -0.035027 0.685262 - dit_step35_xt 0.866844 2.605400 0.272383 -0.034410 0.619164 -0.040716 0.717195 - dit_step40_xt 0.832660 3.030330 0.326889 -0.039011 0.646487 -0.046462 0.771853 - dit_step45_xt 0.809828 3.411977 0.379136 -0.043945 0.692545 -0.052475 0.843036 diff --git a/tests/Vulkan_Q4_K_M.log b/tests/Vulkan_Q4_K_M.log deleted file mode 100644 index 2c1b7e2..0000000 --- a/tests/Vulkan_Q4_K_M.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q4_K_M.gguf -[GGML] Running acestep-v15-turbo-Q4_K_M.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999812 - lyric_embed 1.000000 - enc_hidden 0.997128 - detok_output 0.999611 - context 0.999751 - noise 1.000000 - temb_t 0.999906 - hidden_after_proj_in 0.999907 - enc_after_cond_emb 0.997645 - layer0_sa_output 0.998432 - hidden_after_layer0 0.999545 - hidden_after_layer6 0.923275 - hidden_after_layer12 0.969957 - hidden_after_layer18 0.964919 - hidden_after_layer23 0.947132 - dit_step0_vt 0.790633 - dit_step0_xt 0.999549 - dit_step1_vt 0.812278 - dit_step1_xt 0.998317 - dit_step2_vt 0.797899 - dit_step2_xt 0.994987 - dit_step3_vt 0.785709 - dit_step3_xt 0.987168 - dit_step4_vt 0.777756 - dit_step4_xt 0.969910 - dit_step5_vt 0.739552 - dit_step5_xt 0.933874 - dit_step6_vt 0.745520 - dit_step6_xt 0.867311 - dit_step7_vt 0.704124 - dit_x0 0.770712 - vae_audio 
0.383362 - vae_audio (STFT cosine) 0.669931 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999549 0.201087 0.022082 -0.002495 0.972767 -0.002342 0.972003 - dit_step1_xt 0.998317 0.415437 0.041246 -0.005636 0.942205 -0.005313 0.941730 - dit_step2_xt 0.994987 0.709212 0.068458 -0.010217 0.907730 -0.009311 0.908527 - dit_step3_xt 0.987168 1.068925 0.105239 -0.016380 0.870170 -0.014577 0.873624 - dit_step4_xt 0.969910 1.456167 0.155261 -0.024550 0.833831 -0.021660 0.841995 - dit_step5_xt 0.933874 2.028250 0.225222 -0.035727 0.809987 -0.032109 0.824593 - dit_step6_xt 0.867311 3.033199 0.329427 -0.051895 0.826478 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q4_K_M.gguf -[GGML] Running acestep-v15-sft-Q4_K_M.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999812 - lyric_embed 1.000000 - enc_hidden 0.997128 - detok_output 0.999611 - context 0.999751 - noise 1.000000 - temb_t 0.999673 - hidden_after_proj_in 0.999909 - enc_after_cond_emb 0.997634 - layer0_sa_output 0.998553 - hidden_after_layer0 0.999511 - hidden_after_layer6 0.995145 - hidden_after_layer12 0.984092 - hidden_after_layer18 0.981649 - hidden_after_layer23 0.984387 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.976637 - dit_step0_vt_uncond 0.980925 - dit_step0_vt 0.934226 - dit_step0_xt 0.999962 - dit_step5_vt_cond 0.967427 - dit_step5_vt 0.910792 - dit_step5_xt 0.998806 - dit_step10_vt_cond 0.948369 - dit_step10_vt 0.866632 - dit_step10_xt 0.994857 - dit_step15_vt_cond 0.909778 - dit_step15_vt 0.814508 - dit_step15_xt 0.984920 - dit_step20_vt_cond 0.863625 - 
dit_step20_vt 0.764052 - dit_step20_xt 0.965868 - dit_step25_vt_cond 0.811103 - dit_step25_vt 0.700861 - dit_step25_xt 0.937051 - dit_step30_vt_cond 0.753305 - dit_step30_vt 0.655816 - dit_step30_xt 0.899063 - dit_step35_vt_cond 0.699261 - dit_step35_vt 0.599863 - dit_step35_xt 0.859178 - dit_step40_vt_cond 0.670103 - dit_step40_vt 0.573321 - dit_step40_xt 0.825435 - dit_step45_vt_cond 0.701869 - dit_step45_vt 0.600028 - dit_step45_xt 0.803747 - dit_step49_vt_cond 0.749100 - dit_step49_vt 0.652063 - dit_x0 0.796334 - vae_audio 0.454343 - vae_audio (STFT cosine) 0.718386 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999962 0.072923 0.006527 -0.001861 0.980234 -0.001741 0.980402 - dit_step5_xt 0.998806 0.371089 0.032132 -0.007108 0.889710 -0.007143 0.887999 - dit_step10_xt 0.994857 0.721153 0.060355 -0.013425 0.811244 -0.012603 0.811299 - dit_step15_xt 0.984920 1.170655 0.094867 -0.019480 0.745370 -0.018114 0.745268 - dit_step20_xt 0.965868 1.624943 0.135007 -0.025812 0.700521 -0.023808 0.699582 - dit_step25_xt 0.937051 2.025275 0.178318 -0.032528 0.673256 -0.029311 0.679278 - dit_step30_xt 0.899063 2.555359 0.227638 -0.038874 0.670375 -0.035027 0.685262 - dit_step35_xt 0.859178 3.109559 0.281450 -0.045209 0.695123 -0.040716 0.717195 - dit_step40_xt 0.825435 3.695475 0.337125 -0.051359 0.742071 -0.046462 0.771853 - dit_step45_xt 0.803747 4.263174 0.390511 -0.057731 0.807748 -0.052475 0.843036 diff --git a/tests/Vulkan_Q5_K_M.log b/tests/Vulkan_Q5_K_M.log deleted file mode 100644 index e6ff2d6..0000000 --- a/tests/Vulkan_Q5_K_M.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q5_K_M.gguf -[GGML] Running acestep-v15-turbo-Q5_K_M.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999812 - lyric_embed 1.000000 - enc_hidden 0.999132 - detok_output 0.999876 - context 0.999921 - noise 1.000000 - temb_t 0.999972 - hidden_after_proj_in 0.999959 - enc_after_cond_emb 0.999270 - layer0_sa_output 0.999442 - hidden_after_layer0 0.999638 - hidden_after_layer6 0.996691 - hidden_after_layer12 0.982345 - hidden_after_layer18 0.974400 - hidden_after_layer23 0.959734 - dit_step0_vt 0.838690 - dit_step0_xt 0.999650 - dit_step1_vt 0.854798 - dit_step1_xt 0.998726 - dit_step2_vt 0.843823 - dit_step2_xt 0.996265 - dit_step3_vt 0.832135 - dit_step3_xt 0.990412 - dit_step4_vt 0.826630 - dit_step4_xt 0.977378 - dit_step5_vt 0.824313 - dit_step5_xt 0.950549 - dit_step6_vt 0.806361 - dit_step6_xt 0.899178 - dit_step7_vt 0.774146 - dit_x0 0.825965 - vae_audio 0.488652 - vae_audio (STFT cosine) 0.756261 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999650 0.235943 0.018873 -0.002256 0.973219 -0.002342 0.972003 - dit_step1_xt 0.998726 0.436601 0.034659 -0.005174 0.942992 -0.005313 0.941730 - dit_step2_xt 0.996265 0.716827 0.057185 -0.009195 0.909263 -0.009311 0.908527 - dit_step3_xt 0.990412 0.968242 0.088230 -0.014806 0.872959 -0.014577 0.873624 - dit_step4_xt 0.977378 1.455533 0.130847 -0.022234 0.838622 -0.021660 0.841995 - dit_step5_xt 0.950549 2.134846 0.189630 -0.032763 0.816673 -0.032109 0.824593 - dit_step6_xt 0.899178 3.163587 0.280857 -0.047640 0.840933 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q5_K_M.gguf -[GGML] Running acestep-v15-sft-Q5_K_M.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999812 - lyric_embed 1.000000 - enc_hidden 0.999132 - detok_output 0.999876 - context 0.999921 - noise 1.000000 - temb_t 0.999899 - hidden_after_proj_in 0.999959 - enc_after_cond_emb 0.999269 - layer0_sa_output 0.999522 - hidden_after_layer0 0.999793 - hidden_after_layer6 0.995888 - hidden_after_layer12 0.985474 - hidden_after_layer18 0.984020 - hidden_after_layer23 0.986112 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.978964 - dit_step0_vt_uncond 0.973976 - dit_step0_vt 0.937223 - dit_step0_xt 0.999964 - dit_step5_vt_cond 0.967160 - dit_step5_vt 0.909198 - dit_step5_xt 0.998804 - dit_step10_vt_cond 0.950415 - dit_step10_vt 0.867165 - dit_step10_xt 0.994875 - dit_step15_vt_cond 0.914609 - dit_step15_vt 0.816760 - dit_step15_xt 0.985212 - dit_step20_vt_cond 0.868346 - dit_step20_vt 0.771014 - dit_step20_xt 0.966347 - dit_step25_vt_cond 0.813828 - dit_step25_vt 0.714557 - dit_step25_xt 0.936240 - dit_step30_vt_cond 0.758857 - dit_step30_vt 0.662399 - dit_step30_xt 0.898782 - dit_step35_vt_cond 0.707135 - dit_step35_vt 0.617898 - dit_step35_xt 0.859637 - dit_step40_vt_cond 0.679574 - dit_step40_vt 0.584797 - dit_step40_xt 0.827363 - dit_step45_vt_cond 0.709869 - dit_step45_vt 0.613484 - dit_step45_xt 0.805902 - dit_step49_vt_cond 0.756478 - dit_step49_vt 0.658766 - dit_x0 0.797882 - vae_audio 0.472032 - vae_audio (STFT cosine) 0.708586 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999964 0.073235 0.006362 -0.001778 0.980214 -0.001741 0.980402 - dit_step5_xt 0.998804 0.348623 0.032282 -0.006098 0.890574 -0.007143 0.887999 - dit_step10_xt 0.994875 0.617850 0.060577 -0.011335 0.811641 -0.012603 0.811299 - dit_step15_xt 0.985212 
1.165812 0.094804 -0.016284 0.748105 -0.018114 0.745268 - dit_step20_xt 0.966347 1.619635 0.134939 -0.021429 0.702593 -0.023808 0.699582 - dit_step25_xt 0.936240 2.011917 0.181224 -0.026596 0.681069 -0.029311 0.679278 - dit_step30_xt 0.898782 2.443318 0.230607 -0.031965 0.682407 -0.035027 0.685262 - dit_step35_xt 0.859637 2.917810 0.284657 -0.037104 0.710155 -0.040716 0.717195 - dit_step40_xt 0.827363 3.602165 0.340057 -0.042128 0.759737 -0.046462 0.771853 - dit_step45_xt 0.805902 4.251132 0.394434 -0.047162 0.828316 -0.052475 0.843036 diff --git a/tests/Vulkan_Q6_K.log b/tests/Vulkan_Q6_K.log deleted file mode 100644 index 916944c..0000000 --- a/tests/Vulkan_Q6_K.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q6_K.gguf -[GGML] Running acestep-v15-turbo-Q6_K.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999812 - lyric_embed 1.000000 - enc_hidden 0.999665 - detok_output 0.999972 - context 0.999982 - noise 1.000000 - temb_t 0.999990 - hidden_after_proj_in 0.999982 - enc_after_cond_emb 0.999691 - layer0_sa_output 0.999774 - hidden_after_layer0 0.999710 - hidden_after_layer6 0.999855 - hidden_after_layer12 0.998856 - hidden_after_layer18 0.995803 - hidden_after_layer23 0.992072 - dit_step0_vt 0.970064 - dit_step0_xt 0.999934 - dit_step1_vt 0.924564 - dit_step1_xt 0.999651 - dit_step2_vt 0.915541 - dit_step2_xt 0.998650 - dit_step3_vt 0.915489 - dit_step3_xt 0.996123 - dit_step4_vt 0.916835 - dit_step4_xt 0.990527 - dit_step5_vt 0.909275 - dit_step5_xt 0.977470 - dit_step6_vt 0.899986 - dit_step6_xt 0.952353 - dit_step7_vt 0.880023 - dit_x0 0.915268 - vae_audio 0.753562 - 
vae_audio (STFT cosine) 0.882452 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999934 0.147239 0.007394 -0.002260 0.973056 -0.002342 0.972003 - dit_step1_xt 0.999651 0.410402 0.017745 -0.005286 0.943565 -0.005313 0.941730 - dit_step2_xt 0.998650 0.806730 0.033672 -0.009524 0.911097 -0.009311 0.908527 - dit_step3_xt 0.996123 1.479887 0.054500 -0.015235 0.876469 -0.014577 0.873624 - dit_step4_xt 0.990527 2.298363 0.081794 -0.022731 0.844225 -0.021660 0.841995 - dit_step5_xt 0.977470 3.296017 0.123177 -0.033626 0.825405 -0.032109 0.824593 - dit_step6_xt 0.952353 4.545029 0.185597 -0.049157 0.851892 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q6_K.gguf -[GGML] Running acestep-v15-sft-Q6_K.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... -Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999812 - lyric_embed 1.000000 - enc_hidden 0.999665 - detok_output 0.999972 - context 0.999982 - noise 1.000000 - temb_t 0.999973 - hidden_after_proj_in 0.999981 - enc_after_cond_emb 0.999694 - layer0_sa_output 0.999789 - hidden_after_layer0 0.999784 - hidden_after_layer6 0.999737 - hidden_after_layer12 0.999297 - hidden_after_layer18 0.998478 - hidden_after_layer23 0.998790 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998675 - dit_step0_vt_uncond 0.962163 - dit_step0_vt 0.981229 - dit_step0_xt 0.999989 - dit_step5_vt_cond 0.978548 - dit_step5_vt 0.903995 - dit_step5_xt 0.999251 - dit_step10_vt_cond 0.949676 - dit_step10_vt 0.866414 - dit_step10_xt 0.996103 - dit_step15_vt_cond 0.890112 - dit_step15_vt 0.755968 - dit_step15_xt 0.986117 - dit_step20_vt_cond 0.800524 - dit_step20_vt 
0.668617 - dit_step20_xt 0.965883 - dit_step25_vt_cond 0.715616 - dit_step25_vt 0.707363 - dit_step25_xt 0.936566 - dit_step30_vt_cond 0.651806 - dit_step30_vt 0.573252 - dit_step30_xt 0.901106 - dit_step35_vt_cond 0.613517 - dit_step35_vt 0.548023 - dit_step35_xt 0.866538 - dit_step40_vt_cond 0.617661 - dit_step40_vt 0.531763 - dit_step40_xt 0.837556 - dit_step45_vt_cond 0.690489 - dit_step45_vt 0.608902 - dit_step45_xt 0.819015 - dit_step49_vt_cond 0.760344 - dit_step49_vt 0.689227 - dit_x0 0.812918 - vae_audio 0.596607 - vae_audio (STFT cosine) 0.752876 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999989 0.053618 0.003814 -0.002076 0.980489 -0.001741 0.980402 - dit_step5_xt 0.999251 0.748318 0.025536 -0.008766 0.893415 -0.007143 0.887999 - dit_step10_xt 0.996103 1.428011 0.054273 -0.016368 0.822729 -0.012603 0.811299 - dit_step15_xt 0.986117 2.055885 0.098667 -0.024122 0.777367 -0.018114 0.745268 - dit_step20_xt 0.965883 2.750473 0.153407 -0.031399 0.762304 -0.023808 0.699582 - dit_step25_xt 0.936566 3.458536 0.209270 -0.038856 0.768389 -0.029311 0.679278 - dit_step30_xt 0.901106 4.182745 0.271563 -0.045971 0.805686 -0.035027 0.685262 - dit_step35_xt 0.866538 4.941256 0.336049 -0.053191 0.866756 -0.040716 0.717195 - dit_step40_xt 0.837556 5.867188 0.401823 -0.059864 0.948138 -0.046462 0.771853 - dit_step45_xt 0.819015 6.961776 0.463382 -0.066566 1.043107 -0.052475 0.843036 diff --git a/tests/Vulkan_Q8_0.log b/tests/Vulkan_Q8_0.log deleted file mode 100644 index 9262047..0000000 --- a/tests/Vulkan_Q8_0.log +++ /dev/null @@ -1,130 +0,0 @@ -[Request] Loaded request0.json -[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-Q8_0.gguf -[GGML] Running acestep-v15-turbo-Q8_0.gguf... -[GGML] Done, 47 dump files -[Python] Initializing acestep-v15-turbo... -[Python] Generating (acestep-v15-turbo, 8 steps)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-turbo/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 40 dump files -[Turbo] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999812 - lyric_embed 1.000000 - enc_hidden 0.999824 - detok_output 0.999983 - context 0.999990 - noise 1.000000 - temb_t 0.999998 - hidden_after_proj_in 0.999985 - enc_after_cond_emb 0.999817 - layer0_sa_output 0.999939 - hidden_after_layer0 0.999858 - hidden_after_layer6 0.999893 - hidden_after_layer12 0.999124 - hidden_after_layer18 0.996403 - hidden_after_layer23 0.993183 - dit_step0_vt 0.973885 - dit_step0_xt 0.999943 - dit_step1_vt 0.915468 - dit_step1_xt 0.999633 - dit_step2_vt 0.912211 - dit_step2_xt 0.998544 - dit_step3_vt 0.912707 - dit_step3_xt 0.995860 - dit_step4_vt 0.906019 - dit_step4_xt 0.989505 - dit_step5_vt 0.896537 - dit_step5_xt 0.974659 - dit_step6_vt 0.886047 - dit_step6_xt 0.945866 - dit_step7_vt 0.869793 - dit_x0 0.905017 - vae_audio 0.746037 - vae_audio (STFT cosine) 0.898352 -[Turbo] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999943 0.140034 0.006943 -0.002318 0.973036 -0.002342 0.972003 - dit_step1_xt 0.999633 0.423125 0.018056 -0.005257 0.943026 -0.005313 0.941730 - dit_step2_xt 0.998544 0.841908 0.034537 -0.009209 0.910286 -0.009311 0.908527 - dit_step3_xt 0.995860 1.521911 0.055719 -0.014626 0.875169 -0.014577 0.873624 - dit_step4_xt 0.989505 2.346452 0.085477 -0.021803 0.842334 -0.021660 0.841995 - dit_step5_xt 0.974659 3.387389 0.130921 -0.032225 0.822365 -0.032109 0.824593 - dit_step6_xt 0.945866 4.812943 0.199910 -0.047290 0.846751 -0.046482 0.855546 -[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-Q8_0.gguf -[GGML] Running acestep-v15-sft-Q8_0.gguf... -[GGML] Done, 233 dump files -[Python] Initializing acestep-v15-sft... -[Python] Generating (acestep-v15-sft, 50 steps, CFG 7.0)... 
-Using precomputed LM hints -Using precomputed LM hints -[Python] Wrote python-sft/output.wav: 4166400 samples (86.80s @ 48kHz stereo) -[Python] Done, 218 dump files -[SFT] Cosine similarities GGML vs Python - stage GGML vs Python - text_hidden 0.999812 - lyric_embed 1.000000 - enc_hidden 0.999824 - detok_output 0.999983 - context 0.999990 - noise 1.000000 - temb_t 0.999994 - hidden_after_proj_in 0.999985 - enc_after_cond_emb 0.999820 - layer0_sa_output 0.999932 - hidden_after_layer0 0.999867 - hidden_after_layer6 0.999809 - hidden_after_layer12 0.999421 - hidden_after_layer18 0.998648 - hidden_after_layer23 0.998927 - null_condition_emb 1.000000 - null_enc_hidden 1.000000 - dit_step0_vt_cond 0.998848 - dit_step0_vt_uncond 0.964971 - dit_step0_vt 0.982622 - dit_step0_xt 0.999990 - dit_step5_vt_cond 0.978187 - dit_step5_vt 0.910806 - dit_step5_xt 0.999338 - dit_step10_vt_cond 0.948119 - dit_step10_vt 0.856732 - dit_step10_xt 0.996258 - dit_step15_vt_cond 0.885149 - dit_step15_vt 0.741011 - dit_step15_xt 0.986353 - dit_step20_vt_cond 0.792343 - dit_step20_vt 0.735701 - dit_step20_xt 0.966995 - dit_step25_vt_cond 0.713669 - dit_step25_vt 0.604646 - dit_step25_xt 0.937523 - dit_step30_vt_cond 0.654759 - dit_step30_vt 0.575313 - dit_step30_xt 0.901384 - dit_step35_vt_cond 0.616330 - dit_step35_vt 0.533322 - dit_step35_xt 0.865098 - dit_step40_vt_cond 0.615497 - dit_step40_vt 0.525598 - dit_step40_xt 0.834978 - dit_step45_vt_cond 0.687607 - dit_step45_vt 0.600947 - dit_step45_xt 0.816193 - dit_step49_vt_cond 0.757023 - dit_step49_vt 0.678778 - dit_x0 0.809822 - vae_audio 0.552742 - vae_audio (STFT cosine) 0.704247 -[SFT] Error growth GGML vs Python - stage cos max_err mean_err mean_A std_A mean_B std_B - dit_step0_xt 0.999990 0.053120 0.003694 -0.002028 0.980340 -0.001741 0.980402 - dit_step5_xt 0.999338 0.528079 0.024091 -0.008167 0.891761 -0.007143 0.887999 - dit_step10_xt 0.996258 1.260570 0.054251 -0.014905 0.821682 -0.012603 0.811299 - dit_step15_xt 0.986353 
1.896362 0.099359 -0.021353 0.777987 -0.018114 0.745268 - dit_step20_xt 0.966995 2.558488 0.150921 -0.027607 0.759790 -0.023808 0.699582 - dit_step25_xt 0.937523 3.268598 0.209264 -0.033645 0.770984 -0.029311 0.679278 - dit_step30_xt 0.901384 3.973653 0.271000 -0.039796 0.805477 -0.035027 0.685262 - dit_step35_xt 0.865098 4.656569 0.335194 -0.045754 0.864460 -0.040716 0.717195 - dit_step40_xt 0.834978 5.519352 0.400309 -0.051630 0.944399 -0.046462 0.771853 - dit_step45_xt 0.816193 6.556623 0.460383 -0.057408 1.036260 -0.052475 0.843036 diff --git a/tests/fixtures/ci-cover.json b/tests/fixtures/ci-cover.json deleted file mode 100644 index 3d21e1f..0000000 --- a/tests/fixtures/ci-cover.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "task_type": "cover", - "caption": "Short CI cover test", - "duration": 5, - "inference_steps": 4, - "guidance_scale": 1, - "shift": 3, - "seed": 42, - "reference_audio": "tests/fixtures/ci-text2music0.wav", - "audio_cover_strength": 0.8, - "audio_codes": "43316,18426,13366,59455,17783,49303,7423,29855,37158,37157,62317,61455,12847,19583,57031,34656,20254,10770,11416,15905,31413,23339,47091,12198,49531,37355,33090,38645,40707,16324,61436,46095,13941,5287,2239,13975,63815,2757,4862,13571,63495,39,29887,49426,12696,50847,40498,61056,25666,12989,23987,54763,25485,31683,28554,25355,16373,28995,2351,1655,7940,55831,34359,15350,15277,11717,20476,52239,5015,19807,24087,3559,20471,34193,32552,60999,29360,25338,38873,16768,17912,27584,24008,1528,449,25563,52684,53223,42183,37215,12343,39431,26055,28148,57286,38382,28863,7191,58397,18991,7695,30716,36784,12687,8707,25649,33718,3202,23035,10747,26354,63965,16260,11223,45679,14343,8679,49351,52927,2535,19207,46447,49615,12694,21110" -} diff --git a/tests/fixtures/ci-dit-only.json b/tests/fixtures/ci-dit-only.json deleted file mode 100644 index 0a83cb8..0000000 --- a/tests/fixtures/ci-dit-only.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "caption": "Short CI clip", - "lyrics": "", - "bpm": 90, - "duration": 5, - 
"keyscale": "C minor", - "timesignature": "4", - "vocal_language": "en", - "inference_steps": 4, - "shift": 3 -} diff --git a/tests/fixtures/ci-full.json b/tests/fixtures/ci-full.json deleted file mode 100644 index 3a37bfc..0000000 --- a/tests/fixtures/ci-full.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "caption": "Short CI house clip", - "lyrics": "[Intro]\n\n[Verse 1]\nTest\n\n[Outro]\nDone", - "bpm": 120, - "duration": 5, - "keyscale": "C major", - "timesignature": "4", - "vocal_language": "fr", - "inference_steps": 4, - "shift": 3 -} diff --git a/tests/fixtures/ci-partial.json b/tests/fixtures/ci-partial.json deleted file mode 100644 index 19ae9db..0000000 --- a/tests/fixtures/ci-partial.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "caption": "Short CI hip hop clip", - "lyrics": "[Intro]\nYeah\n\n[Verse 1]\nOne two\n\n[Chorus]\nTest\n\n[Outro]\nDone", - "duration": 5, - "vocal_language": "fr", - "inference_steps": 4, - "shift": 3 -} diff --git a/tests/fixtures/ci-request-reference.json b/tests/fixtures/ci-request-reference.json deleted file mode 100644 index 141d77a..0000000 --- a/tests/fixtures/ci-request-reference.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "task_type": "text2music", - "caption": "Short CI reference test", - "lyrics": "[Verse]\nTest\n[Chorus]\nRef", - "duration": 5, - "seed": 42, - "inference_steps": 4, - "guidance_scale": 1, - "shift": 3, - "reference_audio": "reference.wav", - "audio_codes": "", - "audio_cover_strength": 1 -} diff --git a/tests/fixtures/ci-text2music.json b/tests/fixtures/ci-text2music.json deleted file mode 100644 index 1613723..0000000 --- a/tests/fixtures/ci-text2music.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "task_type": "text2music", - "caption": "Short CI test clip", - "lyrics": "", - "duration": 5, - "inference_steps": 4, - "guidance_scale": 1, - "shift": 3, - "seed": 42, - "audio_codes": "" -} diff --git a/tests/run-generation-tests.sh b/tests/run-generation-tests.sh deleted file mode 100755 index 666698c..0000000 --- 
a/tests/run-generation-tests.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env bash -# Run the same generation tests as the GitHub Action (test-generation.yml). -# Use this to validate locally before pushing. No assumptions: build and models required. -# -# From repo root: -# ./models.sh # once: download Q8_0 + VAE into models/ -# mkdir -p build && cd build && cmake .. && cmake --build . --config Release -# cd .. && tests/run-generation-tests.sh - -set -e -cd "$(dirname "$0")/.." -REPO_ROOT="$PWD" - -# --- Build --- -if [ ! -f build/dit-vae ] || [ ! -f build/ace-qwen3 ]; then - echo "Missing build/dit-vae or build/ace-qwen3. Build first:" - echo " mkdir -p build && cd build && cmake .. && cmake --build . --config Release" - exit 1 -fi - -# --- Models --- -TEXT_ENC="models/Qwen3-Embedding-0.6B-Q8_0.gguf" -DIT="models/acestep-v15-turbo-Q8_0.gguf" -VAE="models/vae-BF16.gguf" -LM="models/acestep-5Hz-lm-4B-Q8_0.gguf" -for f in "$TEXT_ENC" "$DIT" "$VAE"; do - if [ ! -f "$f" ]; then - echo "Missing $f. Download models once: ./models.sh" - exit 1 - fi -done - -echo "[1/3] Test mode text2music (short)" -./build/dit-vae \ - --request tests/fixtures/ci-text2music.json \ - --text-encoder "$TEXT_ENC" \ - --dit "$DIT" \ - --vae "$VAE" -if [ ! -f tests/fixtures/ci-text2music0.wav ]; then - echo "FAIL: tests/fixtures/ci-text2music0.wav not created" - exit 1 -fi -echo " text2music WAV OK" - -echo "[2/3] Test mode cover with WAV reference (short)" -./build/dit-vae \ - --request tests/fixtures/ci-cover.json \ - --text-encoder "$TEXT_ENC" \ - --dit "$DIT" \ - --vae "$VAE" -if [ ! -f tests/fixtures/ci-cover0.wav ]; then - echo "FAIL: tests/fixtures/ci-cover0.wav not created" - exit 1 -fi -echo " cover WAV OK" - -echo "[3/3] Test full pipeline (LLM + DiT, short)" -if [ ! -f "$LM" ]; then - echo "Missing $LM; skipping full pipeline. Run ./models.sh to include LM." - exit 1 -fi -# ace-qwen3 names output from input path (e.g. 
request.json -> request0.json) -cp tests/fixtures/ci-text2music.json request.json -./build/ace-qwen3 \ - --request request.json \ - --model "$LM" -if [ ! -f request0.json ]; then - echo "FAIL: request0.json not created by ace-qwen3" - exit 1 -fi -./build/dit-vae \ - --request request0.json \ - --text-encoder "$TEXT_ENC" \ - --dit "$DIT" \ - --vae "$VAE" -if [ ! -f request00.wav ]; then - echo "FAIL: request00.wav not created" - exit 1 -fi -echo " full pipeline WAV OK" - -echo "" -echo "All generation tests passed locally. Safe to rely on CI for the same checks." diff --git a/third_party/minimp3.h b/third_party/minimp3.h deleted file mode 100644 index 3220ae1..0000000 --- a/third_party/minimp3.h +++ /dev/null @@ -1,1865 +0,0 @@ -#ifndef MINIMP3_H -#define MINIMP3_H -/* - https://github.com/lieff/minimp3 - To the extent possible under law, the author(s) have dedicated all copyright and related and neighboring rights to this software to the public domain worldwide. - This software is distributed without any warranty. - See . 
-*/ -#include - -#define MINIMP3_MAX_SAMPLES_PER_FRAME (1152*2) - -typedef struct -{ - int frame_bytes, frame_offset, channels, hz, layer, bitrate_kbps; -} mp3dec_frame_info_t; - -typedef struct -{ - float mdct_overlap[2][9*32], qmf_state[15*2*32]; - int reserv, free_format_bytes; - unsigned char header[4], reserv_buf[511]; -} mp3dec_t; - -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - -void mp3dec_init(mp3dec_t *dec); -#ifndef MINIMP3_FLOAT_OUTPUT -typedef int16_t mp3d_sample_t; -#else /* MINIMP3_FLOAT_OUTPUT */ -typedef float mp3d_sample_t; -void mp3dec_f32_to_s16(const float *in, int16_t *out, int num_samples); -#endif /* MINIMP3_FLOAT_OUTPUT */ -int mp3dec_decode_frame(mp3dec_t *dec, const uint8_t *mp3, int mp3_bytes, mp3d_sample_t *pcm, mp3dec_frame_info_t *info); - -#ifdef __cplusplus -} -#endif /* __cplusplus */ - -#endif /* MINIMP3_H */ -#if defined(MINIMP3_IMPLEMENTATION) && !defined(_MINIMP3_IMPLEMENTATION_GUARD) -#define _MINIMP3_IMPLEMENTATION_GUARD - -#include -#include - -#define MAX_FREE_FORMAT_FRAME_SIZE 2304 /* more than ISO spec's */ -#ifndef MAX_FRAME_SYNC_MATCHES -#define MAX_FRAME_SYNC_MATCHES 10 -#endif /* MAX_FRAME_SYNC_MATCHES */ - -#define MAX_L3_FRAME_PAYLOAD_BYTES MAX_FREE_FORMAT_FRAME_SIZE /* MUST be >= 320000/8/32000*1152 = 1440 */ - -#define MAX_BITRESERVOIR_BYTES 511 -#define SHORT_BLOCK_TYPE 2 -#define STOP_BLOCK_TYPE 3 -#define MODE_MONO 3 -#define MODE_JOINT_STEREO 1 -#define HDR_SIZE 4 -#define HDR_IS_MONO(h) (((h[3]) & 0xC0) == 0xC0) -#define HDR_IS_MS_STEREO(h) (((h[3]) & 0xE0) == 0x60) -#define HDR_IS_FREE_FORMAT(h) (((h[2]) & 0xF0) == 0) -#define HDR_IS_CRC(h) (!((h[1]) & 1)) -#define HDR_TEST_PADDING(h) ((h[2]) & 0x2) -#define HDR_TEST_MPEG1(h) ((h[1]) & 0x8) -#define HDR_TEST_NOT_MPEG25(h) ((h[1]) & 0x10) -#define HDR_TEST_I_STEREO(h) ((h[3]) & 0x10) -#define HDR_TEST_MS_STEREO(h) ((h[3]) & 0x20) -#define HDR_GET_STEREO_MODE(h) (((h[3]) >> 6) & 3) -#define HDR_GET_STEREO_MODE_EXT(h) (((h[3]) >> 4) & 3) -#define 
HDR_GET_LAYER(h) (((h[1]) >> 1) & 3) -#define HDR_GET_BITRATE(h) ((h[2]) >> 4) -#define HDR_GET_SAMPLE_RATE(h) (((h[2]) >> 2) & 3) -#define HDR_GET_MY_SAMPLE_RATE(h) (HDR_GET_SAMPLE_RATE(h) + (((h[1] >> 3) & 1) + ((h[1] >> 4) & 1))*3) -#define HDR_IS_FRAME_576(h) ((h[1] & 14) == 2) -#define HDR_IS_LAYER_1(h) ((h[1] & 6) == 6) - -#define BITS_DEQUANTIZER_OUT -1 -#define MAX_SCF (255 + BITS_DEQUANTIZER_OUT*4 - 210) -#define MAX_SCFI ((MAX_SCF + 3) & ~3) - -#define MINIMP3_MIN(a, b) ((a) > (b) ? (b) : (a)) -#define MINIMP3_MAX(a, b) ((a) < (b) ? (b) : (a)) - -#if !defined(MINIMP3_NO_SIMD) - -#if !defined(MINIMP3_ONLY_SIMD) && (defined(_M_X64) || defined(__x86_64__) || defined(__aarch64__) || defined(_M_ARM64)) -/* x64 always have SSE2, arm64 always have neon, no need for generic code */ -#define MINIMP3_ONLY_SIMD -#endif /* SIMD checks... */ - -#if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) || ((defined(__i386__) || defined(__x86_64__)) && defined(__SSE2__)) -#if defined(_MSC_VER) -#include -#endif /* defined(_MSC_VER) */ -#include -#define HAVE_SSE 1 -#define HAVE_SIMD 1 -#define VSTORE _mm_storeu_ps -#define VLD _mm_loadu_ps -#define VSET _mm_set1_ps -#define VADD _mm_add_ps -#define VSUB _mm_sub_ps -#define VMUL _mm_mul_ps -#define VMAC(a, x, y) _mm_add_ps(a, _mm_mul_ps(x, y)) -#define VMSB(a, x, y) _mm_sub_ps(a, _mm_mul_ps(x, y)) -#define VMUL_S(x, s) _mm_mul_ps(x, _mm_set1_ps(s)) -#define VREV(x) _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 1, 2, 3)) -typedef __m128 f4; -#if defined(_MSC_VER) || defined(MINIMP3_ONLY_SIMD) -#define minimp3_cpuid __cpuid -#else /* defined(_MSC_VER) || defined(MINIMP3_ONLY_SIMD) */ -static __inline__ __attribute__((always_inline)) void minimp3_cpuid(int CPUInfo[], const int InfoType) -{ -#if defined(__PIC__) - __asm__ __volatile__( -#if defined(__x86_64__) - "push %%rbx\n" - "cpuid\n" - "xchgl %%ebx, %1\n" - "pop %%rbx\n" -#else /* defined(__x86_64__) */ - "xchgl %%ebx, %1\n" - "cpuid\n" - "xchgl %%ebx, %1\n" -#endif /* 
defined(__x86_64__) */ - : "=a" (CPUInfo[0]), "=r" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3]) - : "a" (InfoType)); -#else /* defined(__PIC__) */ - __asm__ __volatile__( - "cpuid" - : "=a" (CPUInfo[0]), "=b" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3]) - : "a" (InfoType)); -#endif /* defined(__PIC__)*/ -} -#endif /* defined(_MSC_VER) || defined(MINIMP3_ONLY_SIMD) */ -static int have_simd(void) -{ -#ifdef MINIMP3_ONLY_SIMD - return 1; -#else /* MINIMP3_ONLY_SIMD */ - static int g_have_simd; - int CPUInfo[4]; -#ifdef MINIMP3_TEST - static int g_counter; - if (g_counter++ > 100) - return 0; -#endif /* MINIMP3_TEST */ - if (g_have_simd) - goto end; - minimp3_cpuid(CPUInfo, 0); - g_have_simd = 1; - if (CPUInfo[0] > 0) - { - minimp3_cpuid(CPUInfo, 1); - g_have_simd = (CPUInfo[3] & (1 << 26)) + 1; /* SSE2 */ - } -end: - return g_have_simd - 1; -#endif /* MINIMP3_ONLY_SIMD */ -} -#elif defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64) -#include -#define HAVE_SSE 0 -#define HAVE_SIMD 1 -#define VSTORE vst1q_f32 -#define VLD vld1q_f32 -#define VSET vmovq_n_f32 -#define VADD vaddq_f32 -#define VSUB vsubq_f32 -#define VMUL vmulq_f32 -#define VMAC(a, x, y) vmlaq_f32(a, x, y) -#define VMSB(a, x, y) vmlsq_f32(a, x, y) -#define VMUL_S(x, s) vmulq_f32(x, vmovq_n_f32(s)) -#define VREV(x) vcombine_f32(vget_high_f32(vrev64q_f32(x)), vget_low_f32(vrev64q_f32(x))) -typedef float32x4_t f4; -static int have_simd() -{ /* TODO: detect neon for !MINIMP3_ONLY_SIMD */ - return 1; -} -#else /* SIMD checks... */ -#define HAVE_SSE 0 -#define HAVE_SIMD 0 -#ifdef MINIMP3_ONLY_SIMD -#error MINIMP3_ONLY_SIMD used, but SSE/NEON not enabled -#endif /* MINIMP3_ONLY_SIMD */ -#endif /* SIMD checks... 
*/ -#else /* !defined(MINIMP3_NO_SIMD) */ -#define HAVE_SIMD 0 -#endif /* !defined(MINIMP3_NO_SIMD) */ - -#if defined(__ARM_ARCH) && (__ARM_ARCH >= 6) && !defined(__aarch64__) && !defined(_M_ARM64) -#define HAVE_ARMV6 1 -static __inline__ __attribute__((always_inline)) int32_t minimp3_clip_int16_arm(int32_t a) -{ - int32_t x = 0; - __asm__ ("ssat %0, #16, %1" : "=r"(x) : "r"(a)); - return x; -} -#else -#define HAVE_ARMV6 0 -#endif - -typedef struct -{ - const uint8_t *buf; - int pos, limit; -} bs_t; - -typedef struct -{ - float scf[3*64]; - uint8_t total_bands, stereo_bands, bitalloc[64], scfcod[64]; -} L12_scale_info; - -typedef struct -{ - uint8_t tab_offset, code_tab_width, band_count; -} L12_subband_alloc_t; - -typedef struct -{ - const uint8_t *sfbtab; - uint16_t part_23_length, big_values, scalefac_compress; - uint8_t global_gain, block_type, mixed_block_flag, n_long_sfb, n_short_sfb; - uint8_t table_select[3], region_count[3], subblock_gain[3]; - uint8_t preflag, scalefac_scale, count1_table, scfsi; -} L3_gr_info_t; - -typedef struct -{ - bs_t bs; - uint8_t maindata[MAX_BITRESERVOIR_BYTES + MAX_L3_FRAME_PAYLOAD_BYTES]; - L3_gr_info_t gr_info[4]; - float grbuf[2][576], scf[40], syn[18 + 15][2*32]; - uint8_t ist_pos[2][39]; -} mp3dec_scratch_t; - -static void bs_init(bs_t *bs, const uint8_t *data, int bytes) -{ - bs->buf = data; - bs->pos = 0; - bs->limit = bytes*8; -} - -static uint32_t get_bits(bs_t *bs, int n) -{ - uint32_t next, cache = 0, s = bs->pos & 7; - int shl = n + s; - const uint8_t *p = bs->buf + (bs->pos >> 3); - if ((bs->pos += n) > bs->limit) - return 0; - next = *p++ & (255 >> s); - while ((shl -= 8) > 0) - { - cache |= next << shl; - next = *p++; - } - return cache | (next >> -shl); -} - -static int hdr_valid(const uint8_t *h) -{ - return h[0] == 0xff && - ((h[1] & 0xF0) == 0xf0 || (h[1] & 0xFE) == 0xe2) && - (HDR_GET_LAYER(h) != 0) && - (HDR_GET_BITRATE(h) != 15) && - (HDR_GET_SAMPLE_RATE(h) != 3); -} - -static int hdr_compare(const uint8_t 
*h1, const uint8_t *h2) -{ - return hdr_valid(h2) && - ((h1[1] ^ h2[1]) & 0xFE) == 0 && - ((h1[2] ^ h2[2]) & 0x0C) == 0 && - !(HDR_IS_FREE_FORMAT(h1) ^ HDR_IS_FREE_FORMAT(h2)); -} - -static unsigned hdr_bitrate_kbps(const uint8_t *h) -{ - static const uint8_t halfrate[2][3][15] = { - { { 0,4,8,12,16,20,24,28,32,40,48,56,64,72,80 }, { 0,4,8,12,16,20,24,28,32,40,48,56,64,72,80 }, { 0,16,24,28,32,40,48,56,64,72,80,88,96,112,128 } }, - { { 0,16,20,24,28,32,40,48,56,64,80,96,112,128,160 }, { 0,16,24,28,32,40,48,56,64,80,96,112,128,160,192 }, { 0,16,32,48,64,80,96,112,128,144,160,176,192,208,224 } }, - }; - return 2*halfrate[!!HDR_TEST_MPEG1(h)][HDR_GET_LAYER(h) - 1][HDR_GET_BITRATE(h)]; -} - -static unsigned hdr_sample_rate_hz(const uint8_t *h) -{ - static const unsigned g_hz[3] = { 44100, 48000, 32000 }; - return g_hz[HDR_GET_SAMPLE_RATE(h)] >> (int)!HDR_TEST_MPEG1(h) >> (int)!HDR_TEST_NOT_MPEG25(h); -} - -static unsigned hdr_frame_samples(const uint8_t *h) -{ - return HDR_IS_LAYER_1(h) ? 384 : (1152 >> (int)HDR_IS_FRAME_576(h)); -} - -static int hdr_frame_bytes(const uint8_t *h, int free_format_size) -{ - int frame_bytes = hdr_frame_samples(h)*hdr_bitrate_kbps(h)*125/hdr_sample_rate_hz(h); - if (HDR_IS_LAYER_1(h)) - { - frame_bytes &= ~3; /* slot align */ - } - return frame_bytes ? frame_bytes : free_format_size; -} - -static int hdr_padding(const uint8_t *h) -{ - return HDR_TEST_PADDING(h) ? (HDR_IS_LAYER_1(h) ? 4 : 1) : 0; -} - -#ifndef MINIMP3_ONLY_MP3 -static const L12_subband_alloc_t *L12_subband_alloc_table(const uint8_t *hdr, L12_scale_info *sci) -{ - const L12_subband_alloc_t *alloc; - int mode = HDR_GET_STEREO_MODE(hdr); - int nbands, stereo_bands = (mode == MODE_MONO) ? 0 : (mode == MODE_JOINT_STEREO) ? 
(HDR_GET_STEREO_MODE_EXT(hdr) << 2) + 4 : 32; - - if (HDR_IS_LAYER_1(hdr)) - { - static const L12_subband_alloc_t g_alloc_L1[] = { { 76, 4, 32 } }; - alloc = g_alloc_L1; - nbands = 32; - } else if (!HDR_TEST_MPEG1(hdr)) - { - static const L12_subband_alloc_t g_alloc_L2M2[] = { { 60, 4, 4 }, { 44, 3, 7 }, { 44, 2, 19 } }; - alloc = g_alloc_L2M2; - nbands = 30; - } else - { - static const L12_subband_alloc_t g_alloc_L2M1[] = { { 0, 4, 3 }, { 16, 4, 8 }, { 32, 3, 12 }, { 40, 2, 7 } }; - int sample_rate_idx = HDR_GET_SAMPLE_RATE(hdr); - unsigned kbps = hdr_bitrate_kbps(hdr) >> (int)(mode != MODE_MONO); - if (!kbps) /* free-format */ - { - kbps = 192; - } - - alloc = g_alloc_L2M1; - nbands = 27; - if (kbps < 56) - { - static const L12_subband_alloc_t g_alloc_L2M1_lowrate[] = { { 44, 4, 2 }, { 44, 3, 10 } }; - alloc = g_alloc_L2M1_lowrate; - nbands = sample_rate_idx == 2 ? 12 : 8; - } else if (kbps >= 96 && sample_rate_idx != 1) - { - nbands = 30; - } - } - - sci->total_bands = (uint8_t)nbands; - sci->stereo_bands = (uint8_t)MINIMP3_MIN(stereo_bands, nbands); - - return alloc; -} - -static void L12_read_scalefactors(bs_t *bs, uint8_t *pba, uint8_t *scfcod, int bands, float *scf) -{ - static const float g_deq_L12[18*3] = { -#define DQ(x) 9.53674316e-07f/x, 7.56931807e-07f/x, 6.00777173e-07f/x - DQ(3),DQ(7),DQ(15),DQ(31),DQ(63),DQ(127),DQ(255),DQ(511),DQ(1023),DQ(2047),DQ(4095),DQ(8191),DQ(16383),DQ(32767),DQ(65535),DQ(3),DQ(5),DQ(9) - }; - int i, m; - for (i = 0; i < bands; i++) - { - float s = 0; - int ba = *pba++; - int mask = ba ? 
4 + ((19 >> scfcod[i]) & 3) : 0; - for (m = 4; m; m >>= 1) - { - if (mask & m) - { - int b = get_bits(bs, 6); - s = g_deq_L12[ba*3 - 6 + b % 3]*(1 << 21 >> b/3); - } - *scf++ = s; - } - } -} - -static void L12_read_scale_info(const uint8_t *hdr, bs_t *bs, L12_scale_info *sci) -{ - static const uint8_t g_bitalloc_code_tab[] = { - 0,17, 3, 4, 5,6,7, 8,9,10,11,12,13,14,15,16, - 0,17,18, 3,19,4,5, 6,7, 8, 9,10,11,12,13,16, - 0,17,18, 3,19,4,5,16, - 0,17,18,16, - 0,17,18,19, 4,5,6, 7,8, 9,10,11,12,13,14,15, - 0,17,18, 3,19,4,5, 6,7, 8, 9,10,11,12,13,14, - 0, 2, 3, 4, 5,6,7, 8,9,10,11,12,13,14,15,16 - }; - const L12_subband_alloc_t *subband_alloc = L12_subband_alloc_table(hdr, sci); - - int i, k = 0, ba_bits = 0; - const uint8_t *ba_code_tab = g_bitalloc_code_tab; - - for (i = 0; i < sci->total_bands; i++) - { - uint8_t ba; - if (i == k) - { - k += subband_alloc->band_count; - ba_bits = subband_alloc->code_tab_width; - ba_code_tab = g_bitalloc_code_tab + subband_alloc->tab_offset; - subband_alloc++; - } - ba = ba_code_tab[get_bits(bs, ba_bits)]; - sci->bitalloc[2*i] = ba; - if (i < sci->stereo_bands) - { - ba = ba_code_tab[get_bits(bs, ba_bits)]; - } - sci->bitalloc[2*i + 1] = sci->stereo_bands ? ba : 0; - } - - for (i = 0; i < 2*sci->total_bands; i++) - { - sci->scfcod[i] = sci->bitalloc[i] ? HDR_IS_LAYER_1(hdr) ? 
2 : get_bits(bs, 2) : 6; - } - - L12_read_scalefactors(bs, sci->bitalloc, sci->scfcod, sci->total_bands*2, sci->scf); - - for (i = sci->stereo_bands; i < sci->total_bands; i++) - { - sci->bitalloc[2*i + 1] = 0; - } -} - -static int L12_dequantize_granule(float *grbuf, bs_t *bs, L12_scale_info *sci, int group_size) -{ - int i, j, k, choff = 576; - for (j = 0; j < 4; j++) - { - float *dst = grbuf + group_size*j; - for (i = 0; i < 2*sci->total_bands; i++) - { - int ba = sci->bitalloc[i]; - if (ba != 0) - { - if (ba < 17) - { - int half = (1 << (ba - 1)) - 1; - for (k = 0; k < group_size; k++) - { - dst[k] = (float)((int)get_bits(bs, ba) - half); - } - } else - { - unsigned mod = (2 << (ba - 17)) + 1; /* 3, 5, 9 */ - unsigned code = get_bits(bs, mod + 2 - (mod >> 3)); /* 5, 7, 10 */ - for (k = 0; k < group_size; k++, code /= mod) - { - dst[k] = (float)((int)(code % mod - mod/2)); - } - } - } - dst += choff; - choff = 18 - choff; - } - } - return group_size*4; -} - -static void L12_apply_scf_384(L12_scale_info *sci, const float *scf, float *dst) -{ - int i, k; - memcpy(dst + 576 + sci->stereo_bands*18, dst + sci->stereo_bands*18, (sci->total_bands - sci->stereo_bands)*18*sizeof(float)); - for (i = 0; i < sci->total_bands; i++, dst += 18, scf += 6) - { - for (k = 0; k < 12; k++) - { - dst[k + 0] *= scf[0]; - dst[k + 576] *= scf[3]; - } - } -} -#endif /* MINIMP3_ONLY_MP3 */ - -static int L3_read_side_info(bs_t *bs, L3_gr_info_t *gr, const uint8_t *hdr) -{ - static const uint8_t g_scf_long[8][23] = { - { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 }, - { 12,12,12,12,12,12,16,20,24,28,32,40,48,56,64,76,90,2,2,2,2,2,0 }, - { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 }, - { 6,6,6,6,6,6,8,10,12,14,16,18,22,26,32,38,46,54,62,70,76,36,0 }, - { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 }, - { 4,4,4,4,4,4,6,6,8,8,10,12,16,20,24,28,34,42,50,54,76,158,0 }, - { 4,4,4,4,4,4,6,6,6,8,10,12,16,18,22,28,34,40,46,54,54,192,0 }, - { 
4,4,4,4,4,4,6,6,8,10,12,16,20,24,30,38,46,56,68,84,102,26,0 } - }; - static const uint8_t g_scf_short[8][40] = { - { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 }, - { 8,8,8,8,8,8,8,8,8,12,12,12,16,16,16,20,20,20,24,24,24,28,28,28,36,36,36,2,2,2,2,2,2,2,2,2,26,26,26,0 }, - { 4,4,4,4,4,4,4,4,4,6,6,6,6,6,6,8,8,8,10,10,10,14,14,14,18,18,18,26,26,26,32,32,32,42,42,42,18,18,18,0 }, - { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,32,32,32,44,44,44,12,12,12,0 }, - { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 }, - { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,22,22,22,30,30,30,56,56,56,0 }, - { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,6,6,6,10,10,10,12,12,12,14,14,14,16,16,16,20,20,20,26,26,26,66,66,66,0 }, - { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,12,12,12,16,16,16,20,20,20,26,26,26,34,34,34,42,42,42,12,12,12,0 } - }; - static const uint8_t g_scf_mixed[8][40] = { - { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 }, - { 12,12,12,4,4,4,8,8,8,12,12,12,16,16,16,20,20,20,24,24,24,28,28,28,36,36,36,2,2,2,2,2,2,2,2,2,26,26,26,0 }, - { 6,6,6,6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,14,14,14,18,18,18,26,26,26,32,32,32,42,42,42,18,18,18,0 }, - { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,32,32,32,44,44,44,12,12,12,0 }, - { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 }, - { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,22,22,22,30,30,30,56,56,56,0 }, - { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,6,6,6,10,10,10,12,12,12,14,14,14,16,16,16,20,20,20,26,26,26,66,66,66,0 }, - { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,8,8,8,12,12,12,16,16,16,20,20,20,26,26,26,34,34,34,42,42,42,12,12,12,0 } - }; - - unsigned tables, scfsi = 0; - int main_data_begin, part_23_sum = 0; - int sr_idx = 
HDR_GET_MY_SAMPLE_RATE(hdr); sr_idx -= (sr_idx != 0); - int gr_count = HDR_IS_MONO(hdr) ? 1 : 2; - - if (HDR_TEST_MPEG1(hdr)) - { - gr_count *= 2; - main_data_begin = get_bits(bs, 9); - scfsi = get_bits(bs, 7 + gr_count); - } else - { - main_data_begin = get_bits(bs, 8 + gr_count) >> gr_count; - } - - do - { - if (HDR_IS_MONO(hdr)) - { - scfsi <<= 4; - } - gr->part_23_length = (uint16_t)get_bits(bs, 12); - part_23_sum += gr->part_23_length; - gr->big_values = (uint16_t)get_bits(bs, 9); - if (gr->big_values > 288) - { - return -1; - } - gr->global_gain = (uint8_t)get_bits(bs, 8); - gr->scalefac_compress = (uint16_t)get_bits(bs, HDR_TEST_MPEG1(hdr) ? 4 : 9); - gr->sfbtab = g_scf_long[sr_idx]; - gr->n_long_sfb = 22; - gr->n_short_sfb = 0; - if (get_bits(bs, 1)) - { - gr->block_type = (uint8_t)get_bits(bs, 2); - if (!gr->block_type) - { - return -1; - } - gr->mixed_block_flag = (uint8_t)get_bits(bs, 1); - gr->region_count[0] = 7; - gr->region_count[1] = 255; - if (gr->block_type == SHORT_BLOCK_TYPE) - { - scfsi &= 0x0F0F; - if (!gr->mixed_block_flag) - { - gr->region_count[0] = 8; - gr->sfbtab = g_scf_short[sr_idx]; - gr->n_long_sfb = 0; - gr->n_short_sfb = 39; - } else - { - gr->sfbtab = g_scf_mixed[sr_idx]; - gr->n_long_sfb = HDR_TEST_MPEG1(hdr) ? 8 : 6; - gr->n_short_sfb = 30; - } - } - tables = get_bits(bs, 10); - tables <<= 5; - gr->subblock_gain[0] = (uint8_t)get_bits(bs, 3); - gr->subblock_gain[1] = (uint8_t)get_bits(bs, 3); - gr->subblock_gain[2] = (uint8_t)get_bits(bs, 3); - } else - { - gr->block_type = 0; - gr->mixed_block_flag = 0; - tables = get_bits(bs, 15); - gr->region_count[0] = (uint8_t)get_bits(bs, 4); - gr->region_count[1] = (uint8_t)get_bits(bs, 3); - gr->region_count[2] = 255; - } - gr->table_select[0] = (uint8_t)(tables >> 10); - gr->table_select[1] = (uint8_t)((tables >> 5) & 31); - gr->table_select[2] = (uint8_t)((tables) & 31); - gr->preflag = HDR_TEST_MPEG1(hdr) ? 
get_bits(bs, 1) : (gr->scalefac_compress >= 500); - gr->scalefac_scale = (uint8_t)get_bits(bs, 1); - gr->count1_table = (uint8_t)get_bits(bs, 1); - gr->scfsi = (uint8_t)((scfsi >> 12) & 15); - scfsi <<= 4; - gr++; - } while(--gr_count); - - if (part_23_sum + bs->pos > bs->limit + main_data_begin*8) - { - return -1; - } - - return main_data_begin; -} - -static void L3_read_scalefactors(uint8_t *scf, uint8_t *ist_pos, const uint8_t *scf_size, const uint8_t *scf_count, bs_t *bitbuf, int scfsi) -{ - int i, k; - for (i = 0; i < 4 && scf_count[i]; i++, scfsi *= 2) - { - int cnt = scf_count[i]; - if (scfsi & 8) - { - memcpy(scf, ist_pos, cnt); - } else - { - int bits = scf_size[i]; - if (!bits) - { - memset(scf, 0, cnt); - memset(ist_pos, 0, cnt); - } else - { - int max_scf = (scfsi < 0) ? (1 << bits) - 1 : -1; - for (k = 0; k < cnt; k++) - { - int s = get_bits(bitbuf, bits); - ist_pos[k] = (s == max_scf ? -1 : s); - scf[k] = s; - } - } - } - ist_pos += cnt; - scf += cnt; - } - scf[0] = scf[1] = scf[2] = 0; -} - -static float L3_ldexp_q2(float y, int exp_q2) -{ - static const float g_expfrac[4] = { 9.31322575e-10f,7.83145814e-10f,6.58544508e-10f,5.53767716e-10f }; - int e; - do - { - e = MINIMP3_MIN(30*4, exp_q2); - y *= g_expfrac[e & 3]*(1 << 30 >> (e >> 2)); - } while ((exp_q2 -= e) > 0); - return y; -} - -static void L3_decode_scalefactors(const uint8_t *hdr, uint8_t *ist_pos, bs_t *bs, const L3_gr_info_t *gr, float *scf, int ch) -{ - static const uint8_t g_scf_partitions[3][28] = { - { 6,5,5, 5,6,5,5,5,6,5, 7,3,11,10,0,0, 7, 7, 7,0, 6, 6,6,3, 8, 8,5,0 }, - { 8,9,6,12,6,9,9,9,6,9,12,6,15,18,0,0, 6,15,12,0, 6,12,9,6, 6,18,9,0 }, - { 9,9,6,12,9,9,9,9,9,9,12,6,18,18,0,0,12,12,12,0,12, 9,9,6,15,12,9,0 } - }; - const uint8_t *scf_partition = g_scf_partitions[!!gr->n_short_sfb + !gr->n_long_sfb]; - uint8_t scf_size[4], iscf[40]; - int i, scf_shift = gr->scalefac_scale + 1, gain_exp, scfsi = gr->scfsi; - float gain; - - if (HDR_TEST_MPEG1(hdr)) - { - static const uint8_t 
g_scfc_decode[16] = { 0,1,2,3, 12,5,6,7, 9,10,11,13, 14,15,18,19 }; - int part = g_scfc_decode[gr->scalefac_compress]; - scf_size[1] = scf_size[0] = (uint8_t)(part >> 2); - scf_size[3] = scf_size[2] = (uint8_t)(part & 3); - } else - { - static const uint8_t g_mod[6*4] = { 5,5,4,4,5,5,4,1,4,3,1,1,5,6,6,1,4,4,4,1,4,3,1,1 }; - int k, modprod, sfc, ist = HDR_TEST_I_STEREO(hdr) && ch; - sfc = gr->scalefac_compress >> ist; - for (k = ist*3*4; sfc >= 0; sfc -= modprod, k += 4) - { - for (modprod = 1, i = 3; i >= 0; i--) - { - scf_size[i] = (uint8_t)(sfc / modprod % g_mod[k + i]); - modprod *= g_mod[k + i]; - } - } - scf_partition += k; - scfsi = -16; - } - L3_read_scalefactors(iscf, ist_pos, scf_size, scf_partition, bs, scfsi); - - if (gr->n_short_sfb) - { - int sh = 3 - scf_shift; - for (i = 0; i < gr->n_short_sfb; i += 3) - { - iscf[gr->n_long_sfb + i + 0] += gr->subblock_gain[0] << sh; - iscf[gr->n_long_sfb + i + 1] += gr->subblock_gain[1] << sh; - iscf[gr->n_long_sfb + i + 2] += gr->subblock_gain[2] << sh; - } - } else if (gr->preflag) - { - static const uint8_t g_preamp[10] = { 1,1,1,1,2,2,3,3,3,2 }; - for (i = 0; i < 10; i++) - { - iscf[11 + i] += g_preamp[i]; - } - } - - gain_exp = gr->global_gain + BITS_DEQUANTIZER_OUT*4 - 210 - (HDR_IS_MS_STEREO(hdr) ? 
2 : 0); - gain = L3_ldexp_q2(1 << (MAX_SCFI/4), MAX_SCFI - gain_exp); - for (i = 0; i < (int)(gr->n_long_sfb + gr->n_short_sfb); i++) - { - scf[i] = L3_ldexp_q2(gain, iscf[i] << scf_shift); - } -} - -static const float g_pow43[129 + 16] = { - 0,-1,-2.519842f,-4.326749f,-6.349604f,-8.549880f,-10.902724f,-13.390518f,-16.000000f,-18.720754f,-21.544347f,-24.463781f,-27.473142f,-30.567351f,-33.741992f,-36.993181f, - 0,1,2.519842f,4.326749f,6.349604f,8.549880f,10.902724f,13.390518f,16.000000f,18.720754f,21.544347f,24.463781f,27.473142f,30.567351f,33.741992f,36.993181f,40.317474f,43.711787f,47.173345f,50.699631f,54.288352f,57.937408f,61.644865f,65.408941f,69.227979f,73.100443f,77.024898f,81.000000f,85.024491f,89.097188f,93.216975f,97.382800f,101.593667f,105.848633f,110.146801f,114.487321f,118.869381f,123.292209f,127.755065f,132.257246f,136.798076f,141.376907f,145.993119f,150.646117f,155.335327f,160.060199f,164.820202f,169.614826f,174.443577f,179.305980f,184.201575f,189.129918f,194.090580f,199.083145f,204.107210f,209.162385f,214.248292f,219.364564f,224.510845f,229.686789f,234.892058f,240.126328f,245.389280f,250.680604f,256.000000f,261.347174f,266.721841f,272.123723f,277.552547f,283.008049f,288.489971f,293.998060f,299.532071f,305.091761f,310.676898f,316.287249f,321.922592f,327.582707f,333.267377f,338.976394f,344.709550f,350.466646f,356.247482f,362.051866f,367.879608f,373.730522f,379.604427f,385.501143f,391.420496f,397.362314f,403.326427f,409.312672f,415.320884f,421.350905f,427.402579f,433.475750f,439.570269f,445.685987f,451.822757f,457.980436f,464.158883f,470.357960f,476.577530f,482.817459f,489.077615f,495.357868f,501.658090f,507.978156f,514.317941f,520.677324f,527.056184f,533.454404f,539.871867f,546.308458f,552.764065f,559.238575f,565.731879f,572.243870f,578.774440f,585.323483f,591.890898f,598.476581f,605.080431f,611.702349f,618.342238f,625.000000f,631.675540f,638.368763f,645.079578f -}; - -static float L3_pow_43(int x) -{ - float frac; - int sign, mult = 256; - - if (x < 
129) - { - return g_pow43[16 + x]; - } - - if (x < 1024) - { - mult = 16; - x <<= 3; - } - - sign = 2*x & 64; - frac = (float)((x & 63) - sign) / ((x & ~63) + sign); - return g_pow43[16 + ((x + sign) >> 6)]*(1.f + frac*((4.f/3) + frac*(2.f/9)))*mult; -} - -static void L3_huffman(float *dst, bs_t *bs, const L3_gr_info_t *gr_info, const float *scf, int layer3gr_limit) -{ - static const int16_t tabs[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 785,785,785,785,784,784,784,784,513,513,513,513,513,513,513,513,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256, - -255,1313,1298,1282,785,785,785,785,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,290,288, - -255,1313,1298,1282,769,769,769,769,529,529,529,529,529,529,529,529,528,528,528,528,528,528,528,528,512,512,512,512,512,512,512,512,290,288, - -253,-318,-351,-367,785,785,785,785,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,819,818,547,547,275,275,275,275,561,560,515,546,289,274,288,258, - -254,-287,1329,1299,1314,1312,1057,1057,1042,1042,1026,1026,784,784,784,784,529,529,529,529,529,529,529,529,769,769,769,769,768,768,768,768,563,560,306,306,291,259, - -252,-413,-477,-542,1298,-575,1041,1041,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-383,-399,1107,1092,1106,1061,849,849,789,789,1104,1091,773,773,1076,1075,341,340,325,309,834,804,577,577,532,532,516,516,832,818,803,816,561,561,531,531,515,546,289,289,288,258, - -252,-429,-493,-559,1057,1057,1042,1042,529,529,529,529,529,529,529,529,784,784,784,784,769,769,769,769,512,512,512,512,512,512,512,512,-382,1077,-415,1106,1061,1104,849,849,789,789,1091,1076,1029,1075,834,834,597,581,340,340,339,324,804,833,532,532,832,772,818,803,817,787,816,771,290,290,290,290,288,258, - 
-253,-349,-414,-447,-463,1329,1299,-479,1314,1312,1057,1057,1042,1042,1026,1026,785,785,785,785,784,784,784,784,769,769,769,769,768,768,768,768,-319,851,821,-335,836,850,805,849,341,340,325,336,533,533,579,579,564,564,773,832,578,548,563,516,321,276,306,291,304,259, - -251,-572,-733,-830,-863,-879,1041,1041,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-511,-527,-543,1396,1351,1381,1366,1395,1335,1380,-559,1334,1138,1138,1063,1063,1350,1392,1031,1031,1062,1062,1364,1363,1120,1120,1333,1348,881,881,881,881,375,374,359,373,343,358,341,325,791,791,1123,1122,-703,1105,1045,-719,865,865,790,790,774,774,1104,1029,338,293,323,308,-799,-815,833,788,772,818,803,816,322,292,307,320,561,531,515,546,289,274,288,258, - -251,-525,-605,-685,-765,-831,-846,1298,1057,1057,1312,1282,785,785,785,785,784,784,784,784,769,769,769,769,512,512,512,512,512,512,512,512,1399,1398,1383,1367,1382,1396,1351,-511,1381,1366,1139,1139,1079,1079,1124,1124,1364,1349,1363,1333,882,882,882,882,807,807,807,807,1094,1094,1136,1136,373,341,535,535,881,775,867,822,774,-591,324,338,-671,849,550,550,866,864,609,609,293,336,534,534,789,835,773,-751,834,804,308,307,833,788,832,772,562,562,547,547,305,275,560,515,290,290, - -252,-397,-477,-557,-622,-653,-719,-735,-750,1329,1299,1314,1057,1057,1042,1042,1312,1282,1024,1024,785,785,785,785,784,784,784,784,769,769,769,769,-383,1127,1141,1111,1126,1140,1095,1110,869,869,883,883,1079,1109,882,882,375,374,807,868,838,881,791,-463,867,822,368,263,852,837,836,-543,610,610,550,550,352,336,534,534,865,774,851,821,850,805,593,533,579,564,773,832,578,578,548,548,577,577,307,276,306,291,516,560,259,259, - 
-250,-2107,-2507,-2764,-2909,-2974,-3007,-3023,1041,1041,1040,1040,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-767,-1052,-1213,-1277,-1358,-1405,-1469,-1535,-1550,-1582,-1614,-1647,-1662,-1694,-1726,-1759,-1774,-1807,-1822,-1854,-1886,1565,-1919,-1935,-1951,-1967,1731,1730,1580,1717,-1983,1729,1564,-1999,1548,-2015,-2031,1715,1595,-2047,1714,-2063,1610,-2079,1609,-2095,1323,1323,1457,1457,1307,1307,1712,1547,1641,1700,1699,1594,1685,1625,1442,1442,1322,1322,-780,-973,-910,1279,1278,1277,1262,1276,1261,1275,1215,1260,1229,-959,974,974,989,989,-943,735,478,478,495,463,506,414,-1039,1003,958,1017,927,942,987,957,431,476,1272,1167,1228,-1183,1256,-1199,895,895,941,941,1242,1227,1212,1135,1014,1014,490,489,503,487,910,1013,985,925,863,894,970,955,1012,847,-1343,831,755,755,984,909,428,366,754,559,-1391,752,486,457,924,997,698,698,983,893,740,740,908,877,739,739,667,667,953,938,497,287,271,271,683,606,590,712,726,574,302,302,738,736,481,286,526,725,605,711,636,724,696,651,589,681,666,710,364,467,573,695,466,466,301,465,379,379,709,604,665,679,316,316,634,633,436,436,464,269,424,394,452,332,438,363,347,408,393,448,331,422,362,407,392,421,346,406,391,376,375,359,1441,1306,-2367,1290,-2383,1337,-2399,-2415,1426,1321,-2431,1411,1336,-2447,-2463,-2479,1169,1169,1049,1049,1424,1289,1412,1352,1319,-2495,1154,1154,1064,1064,1153,1153,416,390,360,404,403,389,344,374,373,343,358,372,327,357,342,311,356,326,1395,1394,1137,1137,1047,1047,1365,1392,1287,1379,1334,1364,1349,1378,1318,1363,792,792,792,792,1152,1152,1032,1032,1121,1121,1046,1046,1120,1120,1030,1030,-2895,1106,1061,1104,849,849,789,789,1091,1076,1029,1090,1060,1075,833,833,309,324,532,532,832,772,818,803,561,561,531,560,515,546,289,274,288,258, - 
-250,-1179,-1579,-1836,-1996,-2124,-2253,-2333,-2413,-2477,-2542,-2574,-2607,-2622,-2655,1314,1313,1298,1312,1282,785,785,785,785,1040,1040,1025,1025,768,768,768,768,-766,-798,-830,-862,-895,-911,-927,-943,-959,-975,-991,-1007,-1023,-1039,-1055,-1070,1724,1647,-1103,-1119,1631,1767,1662,1738,1708,1723,-1135,1780,1615,1779,1599,1677,1646,1778,1583,-1151,1777,1567,1737,1692,1765,1722,1707,1630,1751,1661,1764,1614,1736,1676,1763,1750,1645,1598,1721,1691,1762,1706,1582,1761,1566,-1167,1749,1629,767,766,751,765,494,494,735,764,719,749,734,763,447,447,748,718,477,506,431,491,446,476,461,505,415,430,475,445,504,399,460,489,414,503,383,474,429,459,502,502,746,752,488,398,501,473,413,472,486,271,480,270,-1439,-1455,1357,-1471,-1487,-1503,1341,1325,-1519,1489,1463,1403,1309,-1535,1372,1448,1418,1476,1356,1462,1387,-1551,1475,1340,1447,1402,1386,-1567,1068,1068,1474,1461,455,380,468,440,395,425,410,454,364,467,466,464,453,269,409,448,268,432,1371,1473,1432,1417,1308,1460,1355,1446,1459,1431,1083,1083,1401,1416,1458,1445,1067,1067,1370,1457,1051,1051,1291,1430,1385,1444,1354,1415,1400,1443,1082,1082,1173,1113,1186,1066,1185,1050,-1967,1158,1128,1172,1097,1171,1081,-1983,1157,1112,416,266,375,400,1170,1142,1127,1065,793,793,1169,1033,1156,1096,1141,1111,1155,1080,1126,1140,898,898,808,808,897,897,792,792,1095,1152,1032,1125,1110,1139,1079,1124,882,807,838,881,853,791,-2319,867,368,263,822,852,837,866,806,865,-2399,851,352,262,534,534,821,836,594,594,549,549,593,593,533,533,848,773,579,579,564,578,548,563,276,276,577,576,306,291,516,560,305,305,275,259, - 
-251,-892,-2058,-2620,-2828,-2957,-3023,-3039,1041,1041,1040,1040,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-511,-527,-543,-559,1530,-575,-591,1528,1527,1407,1526,1391,1023,1023,1023,1023,1525,1375,1268,1268,1103,1103,1087,1087,1039,1039,1523,-604,815,815,815,815,510,495,509,479,508,463,507,447,431,505,415,399,-734,-782,1262,-815,1259,1244,-831,1258,1228,-847,-863,1196,-879,1253,987,987,748,-767,493,493,462,477,414,414,686,669,478,446,461,445,474,429,487,458,412,471,1266,1264,1009,1009,799,799,-1019,-1276,-1452,-1581,-1677,-1757,-1821,-1886,-1933,-1997,1257,1257,1483,1468,1512,1422,1497,1406,1467,1496,1421,1510,1134,1134,1225,1225,1466,1451,1374,1405,1252,1252,1358,1480,1164,1164,1251,1251,1238,1238,1389,1465,-1407,1054,1101,-1423,1207,-1439,830,830,1248,1038,1237,1117,1223,1148,1236,1208,411,426,395,410,379,269,1193,1222,1132,1235,1221,1116,976,976,1192,1162,1177,1220,1131,1191,963,963,-1647,961,780,-1663,558,558,994,993,437,408,393,407,829,978,813,797,947,-1743,721,721,377,392,844,950,828,890,706,706,812,859,796,960,948,843,934,874,571,571,-1919,690,555,689,421,346,539,539,944,779,918,873,932,842,903,888,570,570,931,917,674,674,-2575,1562,-2591,1609,-2607,1654,1322,1322,1441,1441,1696,1546,1683,1593,1669,1624,1426,1426,1321,1321,1639,1680,1425,1425,1305,1305,1545,1668,1608,1623,1667,1592,1638,1666,1320,1320,1652,1607,1409,1409,1304,1304,1288,1288,1664,1637,1395,1395,1335,1335,1622,1636,1394,1394,1319,1319,1606,1621,1392,1392,1137,1137,1137,1137,345,390,360,375,404,373,1047,-2751,-2767,-2783,1062,1121,1046,-2799,1077,-2815,1106,1061,789,789,1105,1104,263,355,310,340,325,354,352,262,339,324,1091,1076,1029,1090,1060,1075,833,833,788,788,1088,1028,818,818,803,803,561,561,531,531,816,771,546,546,289,274,288,258, - 
-253,-317,-381,-446,-478,-509,1279,1279,-811,-1179,-1451,-1756,-1900,-2028,-2189,-2253,-2333,-2414,-2445,-2511,-2526,1313,1298,-2559,1041,1041,1040,1040,1025,1025,1024,1024,1022,1007,1021,991,1020,975,1019,959,687,687,1018,1017,671,671,655,655,1016,1015,639,639,758,758,623,623,757,607,756,591,755,575,754,559,543,543,1009,783,-575,-621,-685,-749,496,-590,750,749,734,748,974,989,1003,958,988,973,1002,942,987,957,972,1001,926,986,941,971,956,1000,910,985,925,999,894,970,-1071,-1087,-1102,1390,-1135,1436,1509,1451,1374,-1151,1405,1358,1480,1420,-1167,1507,1494,1389,1342,1465,1435,1450,1326,1505,1310,1493,1373,1479,1404,1492,1464,1419,428,443,472,397,736,526,464,464,486,457,442,471,484,482,1357,1449,1434,1478,1388,1491,1341,1490,1325,1489,1463,1403,1309,1477,1372,1448,1418,1433,1476,1356,1462,1387,-1439,1475,1340,1447,1402,1474,1324,1461,1371,1473,269,448,1432,1417,1308,1460,-1711,1459,-1727,1441,1099,1099,1446,1386,1431,1401,-1743,1289,1083,1083,1160,1160,1458,1445,1067,1067,1370,1457,1307,1430,1129,1129,1098,1098,268,432,267,416,266,400,-1887,1144,1187,1082,1173,1113,1186,1066,1050,1158,1128,1143,1172,1097,1171,1081,420,391,1157,1112,1170,1142,1127,1065,1169,1049,1156,1096,1141,1111,1155,1080,1126,1154,1064,1153,1140,1095,1048,-2159,1125,1110,1137,-2175,823,823,1139,1138,807,807,384,264,368,263,868,838,853,791,867,822,852,837,866,806,865,790,-2319,851,821,836,352,262,850,805,849,-2399,533,533,835,820,336,261,578,548,563,577,532,532,832,772,562,562,547,547,305,275,560,515,290,290,288,258 }; - static const uint8_t tab32[] = { 130,162,193,209,44,28,76,140,9,9,9,9,9,9,9,9,190,254,222,238,126,94,157,157,109,61,173,205 }; - static const uint8_t tab33[] = { 252,236,220,204,188,172,156,140,124,108,92,76,60,44,28,12 }; - static const int16_t tabindex[2*16] = { 0,32,64,98,0,132,180,218,292,364,426,538,648,746,0,1126,1460,1460,1460,1460,1460,1460,1460,1460,1842,1842,1842,1842,1842,1842,1842,1842 }; - static const uint8_t g_linbits[] = { 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,6,8,10,13,4,5,6,7,8,9,11,13 }; - -#define PEEK_BITS(n) (bs_cache >> (32 - n)) -#define FLUSH_BITS(n) { bs_cache <<= (n); bs_sh += (n); } -#define CHECK_BITS while (bs_sh >= 0) { bs_cache |= (uint32_t)*bs_next_ptr++ << bs_sh; bs_sh -= 8; } -#define BSPOS ((bs_next_ptr - bs->buf)*8 - 24 + bs_sh) - - float one = 0.0f; - int ireg = 0, big_val_cnt = gr_info->big_values; - const uint8_t *sfb = gr_info->sfbtab; - const uint8_t *bs_next_ptr = bs->buf + bs->pos/8; - uint32_t bs_cache = (((bs_next_ptr[0]*256u + bs_next_ptr[1])*256u + bs_next_ptr[2])*256u + bs_next_ptr[3]) << (bs->pos & 7); - int pairs_to_decode, np, bs_sh = (bs->pos & 7) - 8; - bs_next_ptr += 4; - - while (big_val_cnt > 0) - { - int tab_num = gr_info->table_select[ireg]; - int sfb_cnt = gr_info->region_count[ireg++]; - const int16_t *codebook = tabs + tabindex[tab_num]; - int linbits = g_linbits[tab_num]; - if (linbits) - { - do - { - np = *sfb++ / 2; - pairs_to_decode = MINIMP3_MIN(big_val_cnt, np); - one = *scf++; - do - { - int j, w = 5; - int leaf = codebook[PEEK_BITS(w)]; - while (leaf < 0) - { - FLUSH_BITS(w); - w = leaf & 7; - leaf = codebook[PEEK_BITS(w) - (leaf >> 3)]; - } - FLUSH_BITS(leaf >> 8); - - for (j = 0; j < 2; j++, dst++, leaf >>= 4) - { - int lsb = leaf & 0x0F; - if (lsb == 15) - { - lsb += PEEK_BITS(linbits); - FLUSH_BITS(linbits); - CHECK_BITS; - *dst = one*L3_pow_43(lsb)*((int32_t)bs_cache < 0 ? -1: 1); - } else - { - *dst = g_pow43[16 + lsb - 16*(bs_cache >> 31)]*one; - } - FLUSH_BITS(lsb ? 
1 : 0); - } - CHECK_BITS; - } while (--pairs_to_decode); - } while ((big_val_cnt -= np) > 0 && --sfb_cnt >= 0); - } else - { - do - { - np = *sfb++ / 2; - pairs_to_decode = MINIMP3_MIN(big_val_cnt, np); - one = *scf++; - do - { - int j, w = 5; - int leaf = codebook[PEEK_BITS(w)]; - while (leaf < 0) - { - FLUSH_BITS(w); - w = leaf & 7; - leaf = codebook[PEEK_BITS(w) - (leaf >> 3)]; - } - FLUSH_BITS(leaf >> 8); - - for (j = 0; j < 2; j++, dst++, leaf >>= 4) - { - int lsb = leaf & 0x0F; - *dst = g_pow43[16 + lsb - 16*(bs_cache >> 31)]*one; - FLUSH_BITS(lsb ? 1 : 0); - } - CHECK_BITS; - } while (--pairs_to_decode); - } while ((big_val_cnt -= np) > 0 && --sfb_cnt >= 0); - } - } - - for (np = 1 - big_val_cnt;; dst += 4) - { - const uint8_t *codebook_count1 = (gr_info->count1_table) ? tab33 : tab32; - int leaf = codebook_count1[PEEK_BITS(4)]; - if (!(leaf & 8)) - { - leaf = codebook_count1[(leaf >> 3) + (bs_cache << 4 >> (32 - (leaf & 3)))]; - } - FLUSH_BITS(leaf & 7); - if (BSPOS > layer3gr_limit) - { - break; - } -#define RELOAD_SCALEFACTOR if (!--np) { np = *sfb++/2; if (!np) break; one = *scf++; } -#define DEQ_COUNT1(s) if (leaf & (128 >> s)) { dst[s] = ((int32_t)bs_cache < 0) ? -one : one; FLUSH_BITS(1) } - RELOAD_SCALEFACTOR; - DEQ_COUNT1(0); - DEQ_COUNT1(1); - RELOAD_SCALEFACTOR; - DEQ_COUNT1(2); - DEQ_COUNT1(3); - CHECK_BITS; - } - - bs->pos = layer3gr_limit; -} - -static void L3_midside_stereo(float *left, int n) -{ - int i = 0; - float *right = left + 576; -#if HAVE_SIMD - if (have_simd()) - { - for (; i < n - 3; i += 4) - { - f4 vl = VLD(left + i); - f4 vr = VLD(right + i); - VSTORE(left + i, VADD(vl, vr)); - VSTORE(right + i, VSUB(vl, vr)); - } -#ifdef __GNUC__ - /* Workaround for spurious -Waggressive-loop-optimizations warning from gcc. 
- * For more info see: https://github.com/lieff/minimp3/issues/88 - */ - if (__builtin_constant_p(n % 4 == 0) && n % 4 == 0) - return; -#endif - } -#endif /* HAVE_SIMD */ - for (; i < n; i++) - { - float a = left[i]; - float b = right[i]; - left[i] = a + b; - right[i] = a - b; - } -} - -static void L3_intensity_stereo_band(float *left, int n, float kl, float kr) -{ - int i; - for (i = 0; i < n; i++) - { - left[i + 576] = left[i]*kr; - left[i] = left[i]*kl; - } -} - -static void L3_stereo_top_band(const float *right, const uint8_t *sfb, int nbands, int max_band[3]) -{ - int i, k; - - max_band[0] = max_band[1] = max_band[2] = -1; - - for (i = 0; i < nbands; i++) - { - for (k = 0; k < sfb[i]; k += 2) - { - if (right[k] != 0 || right[k + 1] != 0) - { - max_band[i % 3] = i; - break; - } - } - right += sfb[i]; - } -} - -static void L3_stereo_process(float *left, const uint8_t *ist_pos, const uint8_t *sfb, const uint8_t *hdr, int max_band[3], int mpeg2_sh) -{ - static const float g_pan[7*2] = { 0,1,0.21132487f,0.78867513f,0.36602540f,0.63397460f,0.5f,0.5f,0.63397460f,0.36602540f,0.78867513f,0.21132487f,1,0 }; - unsigned i, max_pos = HDR_TEST_MPEG1(hdr) ? 7 : 64; - - for (i = 0; sfb[i]; i++) - { - unsigned ipos = ist_pos[i]; - if ((int)i > max_band[i % 3] && ipos < max_pos) - { - float kl, kr, s = HDR_TEST_MS_STEREO(hdr) ? 1.41421356f : 1; - if (HDR_TEST_MPEG1(hdr)) - { - kl = g_pan[2*ipos]; - kr = g_pan[2*ipos + 1]; - } else - { - kl = 1; - kr = L3_ldexp_q2(1, (ipos + 1) >> 1 << mpeg2_sh); - if (ipos & 1) - { - kl = kr; - kr = 1; - } - } - L3_intensity_stereo_band(left, sfb[i], kl*s, kr*s); - } else if (HDR_TEST_MS_STEREO(hdr)) - { - L3_midside_stereo(left, sfb[i]); - } - left += sfb[i]; - } -} - -static void L3_intensity_stereo(float *left, uint8_t *ist_pos, const L3_gr_info_t *gr, const uint8_t *hdr) -{ - int max_band[3], n_sfb = gr->n_long_sfb + gr->n_short_sfb; - int i, max_blocks = gr->n_short_sfb ? 
3 : 1; - - L3_stereo_top_band(left + 576, gr->sfbtab, n_sfb, max_band); - if (gr->n_long_sfb) - { - max_band[0] = max_band[1] = max_band[2] = MINIMP3_MAX(MINIMP3_MAX(max_band[0], max_band[1]), max_band[2]); - } - for (i = 0; i < max_blocks; i++) - { - int default_pos = HDR_TEST_MPEG1(hdr) ? 3 : 0; - int itop = n_sfb - max_blocks + i; - int prev = itop - max_blocks; - ist_pos[itop] = max_band[i] >= prev ? default_pos : ist_pos[prev]; - } - L3_stereo_process(left, ist_pos, gr->sfbtab, hdr, max_band, gr[1].scalefac_compress & 1); -} - -static void L3_reorder(float *grbuf, float *scratch, const uint8_t *sfb) -{ - int i, len; - float *src = grbuf, *dst = scratch; - - for (;0 != (len = *sfb); sfb += 3, src += 2*len) - { - for (i = 0; i < len; i++, src++) - { - *dst++ = src[0*len]; - *dst++ = src[1*len]; - *dst++ = src[2*len]; - } - } - memcpy(grbuf, scratch, (dst - scratch)*sizeof(float)); -} - -static void L3_antialias(float *grbuf, int nbands) -{ - static const float g_aa[2][8] = { - {0.85749293f,0.88174200f,0.94962865f,0.98331459f,0.99551782f,0.99916056f,0.99989920f,0.99999316f}, - {0.51449576f,0.47173197f,0.31337745f,0.18191320f,0.09457419f,0.04096558f,0.01419856f,0.00369997f} - }; - - for (; nbands > 0; nbands--, grbuf += 18) - { - int i = 0; -#if HAVE_SIMD - if (have_simd()) for (; i < 8; i += 4) - { - f4 vu = VLD(grbuf + 18 + i); - f4 vd = VLD(grbuf + 14 - i); - f4 vc0 = VLD(g_aa[0] + i); - f4 vc1 = VLD(g_aa[1] + i); - vd = VREV(vd); - VSTORE(grbuf + 18 + i, VSUB(VMUL(vu, vc0), VMUL(vd, vc1))); - vd = VADD(VMUL(vu, vc1), VMUL(vd, vc0)); - VSTORE(grbuf + 14 - i, VREV(vd)); - } -#endif /* HAVE_SIMD */ -#ifndef MINIMP3_ONLY_SIMD - for(; i < 8; i++) - { - float u = grbuf[18 + i]; - float d = grbuf[17 - i]; - grbuf[18 + i] = u*g_aa[0][i] - d*g_aa[1][i]; - grbuf[17 - i] = u*g_aa[1][i] + d*g_aa[0][i]; - } -#endif /* MINIMP3_ONLY_SIMD */ - } -} - -static void L3_dct3_9(float *y) -{ - float s0, s1, s2, s3, s4, s5, s6, s7, s8, t0, t2, t4; - - s0 = y[0]; s2 = y[2]; s4 = 
y[4]; s6 = y[6]; s8 = y[8]; - t0 = s0 + s6*0.5f; - s0 -= s6; - t4 = (s4 + s2)*0.93969262f; - t2 = (s8 + s2)*0.76604444f; - s6 = (s4 - s8)*0.17364818f; - s4 += s8 - s2; - - s2 = s0 - s4*0.5f; - y[4] = s4 + s0; - s8 = t0 - t2 + s6; - s0 = t0 - t4 + t2; - s4 = t0 + t4 - s6; - - s1 = y[1]; s3 = y[3]; s5 = y[5]; s7 = y[7]; - - s3 *= 0.86602540f; - t0 = (s5 + s1)*0.98480775f; - t4 = (s5 - s7)*0.34202014f; - t2 = (s1 + s7)*0.64278761f; - s1 = (s1 - s5 - s7)*0.86602540f; - - s5 = t0 - s3 - t2; - s7 = t4 - s3 - t0; - s3 = t4 + s3 - t2; - - y[0] = s4 - s7; - y[1] = s2 + s1; - y[2] = s0 - s3; - y[3] = s8 + s5; - y[5] = s8 - s5; - y[6] = s0 + s3; - y[7] = s2 - s1; - y[8] = s4 + s7; -} - -static void L3_imdct36(float *grbuf, float *overlap, const float *window, int nbands) -{ - int i, j; - static const float g_twid9[18] = { - 0.73727734f,0.79335334f,0.84339145f,0.88701083f,0.92387953f,0.95371695f,0.97629601f,0.99144486f,0.99904822f,0.67559021f,0.60876143f,0.53729961f,0.46174861f,0.38268343f,0.30070580f,0.21643961f,0.13052619f,0.04361938f - }; - - for (j = 0; j < nbands; j++, grbuf += 18, overlap += 9) - { - float co[9], si[9]; - co[0] = -grbuf[0]; - si[0] = grbuf[17]; - for (i = 0; i < 4; i++) - { - si[8 - 2*i] = grbuf[4*i + 1] - grbuf[4*i + 2]; - co[1 + 2*i] = grbuf[4*i + 1] + grbuf[4*i + 2]; - si[7 - 2*i] = grbuf[4*i + 4] - grbuf[4*i + 3]; - co[2 + 2*i] = -(grbuf[4*i + 3] + grbuf[4*i + 4]); - } - L3_dct3_9(co); - L3_dct3_9(si); - - si[1] = -si[1]; - si[3] = -si[3]; - si[5] = -si[5]; - si[7] = -si[7]; - - i = 0; - -#if HAVE_SIMD - if (have_simd()) for (; i < 8; i += 4) - { - f4 vovl = VLD(overlap + i); - f4 vc = VLD(co + i); - f4 vs = VLD(si + i); - f4 vr0 = VLD(g_twid9 + i); - f4 vr1 = VLD(g_twid9 + 9 + i); - f4 vw0 = VLD(window + i); - f4 vw1 = VLD(window + 9 + i); - f4 vsum = VADD(VMUL(vc, vr1), VMUL(vs, vr0)); - VSTORE(overlap + i, VSUB(VMUL(vc, vr0), VMUL(vs, vr1))); - VSTORE(grbuf + i, VSUB(VMUL(vovl, vw0), VMUL(vsum, vw1))); - vsum = VADD(VMUL(vovl, vw1), VMUL(vsum, 
vw0)); - VSTORE(grbuf + 14 - i, VREV(vsum)); - } -#endif /* HAVE_SIMD */ - for (; i < 9; i++) - { - float ovl = overlap[i]; - float sum = co[i]*g_twid9[9 + i] + si[i]*g_twid9[0 + i]; - overlap[i] = co[i]*g_twid9[0 + i] - si[i]*g_twid9[9 + i]; - grbuf[i] = ovl*window[0 + i] - sum*window[9 + i]; - grbuf[17 - i] = ovl*window[9 + i] + sum*window[0 + i]; - } - } -} - -static void L3_idct3(float x0, float x1, float x2, float *dst) -{ - float m1 = x1*0.86602540f; - float a1 = x0 - x2*0.5f; - dst[1] = x0 + x2; - dst[0] = a1 + m1; - dst[2] = a1 - m1; -} - -static void L3_imdct12(float *x, float *dst, float *overlap) -{ - static const float g_twid3[6] = { 0.79335334f,0.92387953f,0.99144486f, 0.60876143f,0.38268343f,0.13052619f }; - float co[3], si[3]; - int i; - - L3_idct3(-x[0], x[6] + x[3], x[12] + x[9], co); - L3_idct3(x[15], x[12] - x[9], x[6] - x[3], si); - si[1] = -si[1]; - - for (i = 0; i < 3; i++) - { - float ovl = overlap[i]; - float sum = co[i]*g_twid3[3 + i] + si[i]*g_twid3[0 + i]; - overlap[i] = co[i]*g_twid3[0 + i] - si[i]*g_twid3[3 + i]; - dst[i] = ovl*g_twid3[2 - i] - sum*g_twid3[5 - i]; - dst[5 - i] = ovl*g_twid3[5 - i] + sum*g_twid3[2 - i]; - } -} - -static void L3_imdct_short(float *grbuf, float *overlap, int nbands) -{ - for (;nbands > 0; nbands--, overlap += 9, grbuf += 18) - { - float tmp[18]; - memcpy(tmp, grbuf, sizeof(tmp)); - memcpy(grbuf, overlap, 6*sizeof(float)); - L3_imdct12(tmp, grbuf + 6, overlap + 6); - L3_imdct12(tmp + 1, grbuf + 12, overlap + 6); - L3_imdct12(tmp + 2, overlap, overlap + 6); - } -} - -static void L3_change_sign(float *grbuf) -{ - int b, i; - for (b = 0, grbuf += 18; b < 32; b += 2, grbuf += 36) - for (i = 1; i < 18; i += 2) - grbuf[i] = -grbuf[i]; -} - -static void L3_imdct_gr(float *grbuf, float *overlap, unsigned block_type, unsigned n_long_bands) -{ - static const float g_mdct_window[2][18] = { - { 
0.99904822f,0.99144486f,0.97629601f,0.95371695f,0.92387953f,0.88701083f,0.84339145f,0.79335334f,0.73727734f,0.04361938f,0.13052619f,0.21643961f,0.30070580f,0.38268343f,0.46174861f,0.53729961f,0.60876143f,0.67559021f }, - { 1,1,1,1,1,1,0.99144486f,0.92387953f,0.79335334f,0,0,0,0,0,0,0.13052619f,0.38268343f,0.60876143f } - }; - if (n_long_bands) - { - L3_imdct36(grbuf, overlap, g_mdct_window[0], n_long_bands); - grbuf += 18*n_long_bands; - overlap += 9*n_long_bands; - } - if (block_type == SHORT_BLOCK_TYPE) - L3_imdct_short(grbuf, overlap, 32 - n_long_bands); - else - L3_imdct36(grbuf, overlap, g_mdct_window[block_type == STOP_BLOCK_TYPE], 32 - n_long_bands); -} - -static void L3_save_reservoir(mp3dec_t *h, mp3dec_scratch_t *s) -{ - int pos = (s->bs.pos + 7)/8u; - int remains = s->bs.limit/8u - pos; - if (remains > MAX_BITRESERVOIR_BYTES) - { - pos += remains - MAX_BITRESERVOIR_BYTES; - remains = MAX_BITRESERVOIR_BYTES; - } - if (remains > 0) - { - memmove(h->reserv_buf, s->maindata + pos, remains); - } - h->reserv = remains; -} - -static int L3_restore_reservoir(mp3dec_t *h, bs_t *bs, mp3dec_scratch_t *s, int main_data_begin) -{ - int frame_bytes = (bs->limit - bs->pos)/8; - int bytes_have = MINIMP3_MIN(h->reserv, main_data_begin); - memcpy(s->maindata, h->reserv_buf + MINIMP3_MAX(0, h->reserv - main_data_begin), MINIMP3_MIN(h->reserv, main_data_begin)); - memcpy(s->maindata + bytes_have, bs->buf + bs->pos/8, frame_bytes); - bs_init(&s->bs, s->maindata, bytes_have + frame_bytes); - return h->reserv >= main_data_begin; -} - -static void L3_decode(mp3dec_t *h, mp3dec_scratch_t *s, L3_gr_info_t *gr_info, int nch) -{ - int ch; - - for (ch = 0; ch < nch; ch++) - { - int layer3gr_limit = s->bs.pos + gr_info[ch].part_23_length; - L3_decode_scalefactors(h->header, s->ist_pos[ch], &s->bs, gr_info + ch, s->scf, ch); - L3_huffman(s->grbuf[ch], &s->bs, gr_info + ch, s->scf, layer3gr_limit); - } - - if (HDR_TEST_I_STEREO(h->header)) - { - L3_intensity_stereo(s->grbuf[0], 
s->ist_pos[1], gr_info, h->header); - } else if (HDR_IS_MS_STEREO(h->header)) - { - L3_midside_stereo(s->grbuf[0], 576); - } - - for (ch = 0; ch < nch; ch++, gr_info++) - { - int aa_bands = 31; - int n_long_bands = (gr_info->mixed_block_flag ? 2 : 0) << (int)(HDR_GET_MY_SAMPLE_RATE(h->header) == 2); - - if (gr_info->n_short_sfb) - { - aa_bands = n_long_bands - 1; - L3_reorder(s->grbuf[ch] + n_long_bands*18, s->syn[0], gr_info->sfbtab + gr_info->n_long_sfb); - } - - L3_antialias(s->grbuf[ch], aa_bands); - L3_imdct_gr(s->grbuf[ch], h->mdct_overlap[ch], gr_info->block_type, n_long_bands); - L3_change_sign(s->grbuf[ch]); - } -} - -static void mp3d_DCT_II(float *grbuf, int n) -{ - static const float g_sec[24] = { - 10.19000816f,0.50060302f,0.50241929f,3.40760851f,0.50547093f,0.52249861f,2.05778098f,0.51544732f,0.56694406f,1.48416460f,0.53104258f,0.64682180f,1.16943991f,0.55310392f,0.78815460f,0.97256821f,0.58293498f,1.06067765f,0.83934963f,0.62250412f,1.72244716f,0.74453628f,0.67480832f,5.10114861f - }; - int i, k = 0; -#if HAVE_SIMD - if (have_simd()) for (; k < n; k += 4) - { - f4 t[4][8], *x; - float *y = grbuf + k; - - for (x = t[0], i = 0; i < 8; i++, x++) - { - f4 x0 = VLD(&y[i*18]); - f4 x1 = VLD(&y[(15 - i)*18]); - f4 x2 = VLD(&y[(16 + i)*18]); - f4 x3 = VLD(&y[(31 - i)*18]); - f4 t0 = VADD(x0, x3); - f4 t1 = VADD(x1, x2); - f4 t2 = VMUL_S(VSUB(x1, x2), g_sec[3*i + 0]); - f4 t3 = VMUL_S(VSUB(x0, x3), g_sec[3*i + 1]); - x[0] = VADD(t0, t1); - x[8] = VMUL_S(VSUB(t0, t1), g_sec[3*i + 2]); - x[16] = VADD(t3, t2); - x[24] = VMUL_S(VSUB(t3, t2), g_sec[3*i + 2]); - } - for (x = t[0], i = 0; i < 4; i++, x += 8) - { - f4 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7], xt; - xt = VSUB(x0, x7); x0 = VADD(x0, x7); - x7 = VSUB(x1, x6); x1 = VADD(x1, x6); - x6 = VSUB(x2, x5); x2 = VADD(x2, x5); - x5 = VSUB(x3, x4); x3 = VADD(x3, x4); - x4 = VSUB(x0, x3); x0 = VADD(x0, x3); - x3 = VSUB(x1, x2); x1 = VADD(x1, x2); - x[0] = VADD(x0, x1); - 
x[4] = VMUL_S(VSUB(x0, x1), 0.70710677f); - x5 = VADD(x5, x6); - x6 = VMUL_S(VADD(x6, x7), 0.70710677f); - x7 = VADD(x7, xt); - x3 = VMUL_S(VADD(x3, x4), 0.70710677f); - x5 = VSUB(x5, VMUL_S(x7, 0.198912367f)); /* rotate by PI/8 */ - x7 = VADD(x7, VMUL_S(x5, 0.382683432f)); - x5 = VSUB(x5, VMUL_S(x7, 0.198912367f)); - x0 = VSUB(xt, x6); xt = VADD(xt, x6); - x[1] = VMUL_S(VADD(xt, x7), 0.50979561f); - x[2] = VMUL_S(VADD(x4, x3), 0.54119611f); - x[3] = VMUL_S(VSUB(x0, x5), 0.60134488f); - x[5] = VMUL_S(VADD(x0, x5), 0.89997619f); - x[6] = VMUL_S(VSUB(x4, x3), 1.30656302f); - x[7] = VMUL_S(VSUB(xt, x7), 2.56291556f); - } - - if (k > n - 3) - { -#if HAVE_SSE -#define VSAVE2(i, v) _mm_storel_pi((__m64 *)(void*)&y[i*18], v) -#else /* HAVE_SSE */ -#define VSAVE2(i, v) vst1_f32((float32_t *)&y[i*18], vget_low_f32(v)) -#endif /* HAVE_SSE */ - for (i = 0; i < 7; i++, y += 4*18) - { - f4 s = VADD(t[3][i], t[3][i + 1]); - VSAVE2(0, t[0][i]); - VSAVE2(1, VADD(t[2][i], s)); - VSAVE2(2, VADD(t[1][i], t[1][i + 1])); - VSAVE2(3, VADD(t[2][1 + i], s)); - } - VSAVE2(0, t[0][7]); - VSAVE2(1, VADD(t[2][7], t[3][7])); - VSAVE2(2, t[1][7]); - VSAVE2(3, t[3][7]); - } else - { -#define VSAVE4(i, v) VSTORE(&y[i*18], v) - for (i = 0; i < 7; i++, y += 4*18) - { - f4 s = VADD(t[3][i], t[3][i + 1]); - VSAVE4(0, t[0][i]); - VSAVE4(1, VADD(t[2][i], s)); - VSAVE4(2, VADD(t[1][i], t[1][i + 1])); - VSAVE4(3, VADD(t[2][1 + i], s)); - } - VSAVE4(0, t[0][7]); - VSAVE4(1, VADD(t[2][7], t[3][7])); - VSAVE4(2, t[1][7]); - VSAVE4(3, t[3][7]); - } - } else -#endif /* HAVE_SIMD */ -#ifdef MINIMP3_ONLY_SIMD - {} /* for HAVE_SIMD=1, MINIMP3_ONLY_SIMD=1 case we do not need non-intrinsic "else" branch */ -#else /* MINIMP3_ONLY_SIMD */ - for (; k < n; k++) - { - float t[4][8], *x, *y = grbuf + k; - - for (x = t[0], i = 0; i < 8; i++, x++) - { - float x0 = y[i*18]; - float x1 = y[(15 - i)*18]; - float x2 = y[(16 + i)*18]; - float x3 = y[(31 - i)*18]; - float t0 = x0 + x3; - float t1 = x1 + x2; - float t2 = (x1 - 
x2)*g_sec[3*i + 0]; - float t3 = (x0 - x3)*g_sec[3*i + 1]; - x[0] = t0 + t1; - x[8] = (t0 - t1)*g_sec[3*i + 2]; - x[16] = t3 + t2; - x[24] = (t3 - t2)*g_sec[3*i + 2]; - } - for (x = t[0], i = 0; i < 4; i++, x += 8) - { - float x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7], xt; - xt = x0 - x7; x0 += x7; - x7 = x1 - x6; x1 += x6; - x6 = x2 - x5; x2 += x5; - x5 = x3 - x4; x3 += x4; - x4 = x0 - x3; x0 += x3; - x3 = x1 - x2; x1 += x2; - x[0] = x0 + x1; - x[4] = (x0 - x1)*0.70710677f; - x5 = x5 + x6; - x6 = (x6 + x7)*0.70710677f; - x7 = x7 + xt; - x3 = (x3 + x4)*0.70710677f; - x5 -= x7*0.198912367f; /* rotate by PI/8 */ - x7 += x5*0.382683432f; - x5 -= x7*0.198912367f; - x0 = xt - x6; xt += x6; - x[1] = (xt + x7)*0.50979561f; - x[2] = (x4 + x3)*0.54119611f; - x[3] = (x0 - x5)*0.60134488f; - x[5] = (x0 + x5)*0.89997619f; - x[6] = (x4 - x3)*1.30656302f; - x[7] = (xt - x7)*2.56291556f; - - } - for (i = 0; i < 7; i++, y += 4*18) - { - y[0*18] = t[0][i]; - y[1*18] = t[2][i] + t[3][i] + t[3][i + 1]; - y[2*18] = t[1][i] + t[1][i + 1]; - y[3*18] = t[2][i + 1] + t[3][i] + t[3][i + 1]; - } - y[0*18] = t[0][7]; - y[1*18] = t[2][7] + t[3][7]; - y[2*18] = t[1][7]; - y[3*18] = t[3][7]; - } -#endif /* MINIMP3_ONLY_SIMD */ -} - -#ifndef MINIMP3_FLOAT_OUTPUT -static int16_t mp3d_scale_pcm(float sample) -{ -#if HAVE_ARMV6 - int32_t s32 = (int32_t)(sample + .5f); - s32 -= (s32 < 0); - int16_t s = (int16_t)minimp3_clip_int16_arm(s32); -#else - if (sample >= 32766.5) return (int16_t) 32767; - if (sample <= -32767.5) return (int16_t)-32768; - int16_t s = (int16_t)(sample + .5f); - s -= (s < 0); /* away from zero, to be compliant */ -#endif - return s; -} -#else /* MINIMP3_FLOAT_OUTPUT */ -static float mp3d_scale_pcm(float sample) -{ - return sample*(1.f/32768.f); -} -#endif /* MINIMP3_FLOAT_OUTPUT */ - -static void mp3d_synth_pair(mp3d_sample_t *pcm, int nch, const float *z) -{ - float a; - a = (z[14*64] - z[ 0]) * 29; - a += (z[ 1*64] + z[13*64]) * 
213; - a += (z[12*64] - z[ 2*64]) * 459; - a += (z[ 3*64] + z[11*64]) * 2037; - a += (z[10*64] - z[ 4*64]) * 5153; - a += (z[ 5*64] + z[ 9*64]) * 6574; - a += (z[ 8*64] - z[ 6*64]) * 37489; - a += z[ 7*64] * 75038; - pcm[0] = mp3d_scale_pcm(a); - - z += 2; - a = z[14*64] * 104; - a += z[12*64] * 1567; - a += z[10*64] * 9727; - a += z[ 8*64] * 64019; - a += z[ 6*64] * -9975; - a += z[ 4*64] * -45; - a += z[ 2*64] * 146; - a += z[ 0*64] * -5; - pcm[16*nch] = mp3d_scale_pcm(a); -} - -static void mp3d_synth(float *xl, mp3d_sample_t *dstl, int nch, float *lins) -{ - int i; - float *xr = xl + 576*(nch - 1); - mp3d_sample_t *dstr = dstl + (nch - 1); - - static const float g_win[] = { - -1,26,-31,208,218,401,-519,2063,2000,4788,-5517,7134,5959,35640,-39336,74992, - -1,24,-35,202,222,347,-581,2080,1952,4425,-5879,7640,5288,33791,-41176,74856, - -1,21,-38,196,225,294,-645,2087,1893,4063,-6237,8092,4561,31947,-43006,74630, - -1,19,-41,190,227,244,-711,2085,1822,3705,-6589,8492,3776,30112,-44821,74313, - -1,17,-45,183,228,197,-779,2075,1739,3351,-6935,8840,2935,28289,-46617,73908, - -1,16,-49,176,228,153,-848,2057,1644,3004,-7271,9139,2037,26482,-48390,73415, - -2,14,-53,169,227,111,-919,2032,1535,2663,-7597,9389,1082,24694,-50137,72835, - -2,13,-58,161,224,72,-991,2001,1414,2330,-7910,9592,70,22929,-51853,72169, - -2,11,-63,154,221,36,-1064,1962,1280,2006,-8209,9750,-998,21189,-53534,71420, - -2,10,-68,147,215,2,-1137,1919,1131,1692,-8491,9863,-2122,19478,-55178,70590, - -3,9,-73,139,208,-29,-1210,1870,970,1388,-8755,9935,-3300,17799,-56778,69679, - -3,8,-79,132,200,-57,-1283,1817,794,1095,-8998,9966,-4533,16155,-58333,68692, - -4,7,-85,125,189,-83,-1356,1759,605,814,-9219,9959,-5818,14548,-59838,67629, - -4,7,-91,117,177,-106,-1428,1698,402,545,-9416,9916,-7154,12980,-61289,66494, - -5,6,-97,111,163,-127,-1498,1634,185,288,-9585,9838,-8540,11455,-62684,65290 - }; - float *zlin = lins + 15*64; - const float *w = g_win; - - zlin[4*15] = xl[18*16]; - zlin[4*15 + 1] = xr[18*16]; 
- zlin[4*15 + 2] = xl[0]; - zlin[4*15 + 3] = xr[0]; - - zlin[4*31] = xl[1 + 18*16]; - zlin[4*31 + 1] = xr[1 + 18*16]; - zlin[4*31 + 2] = xl[1]; - zlin[4*31 + 3] = xr[1]; - - mp3d_synth_pair(dstr, nch, lins + 4*15 + 1); - mp3d_synth_pair(dstr + 32*nch, nch, lins + 4*15 + 64 + 1); - mp3d_synth_pair(dstl, nch, lins + 4*15); - mp3d_synth_pair(dstl + 32*nch, nch, lins + 4*15 + 64); - -#if HAVE_SIMD - if (have_simd()) for (i = 14; i >= 0; i--) - { -#define VLOAD(k) f4 w0 = VSET(*w++); f4 w1 = VSET(*w++); f4 vz = VLD(&zlin[4*i - 64*k]); f4 vy = VLD(&zlin[4*i - 64*(15 - k)]); -#define V0(k) { VLOAD(k) b = VADD(VMUL(vz, w1), VMUL(vy, w0)) ; a = VSUB(VMUL(vz, w0), VMUL(vy, w1)); } -#define V1(k) { VLOAD(k) b = VADD(b, VADD(VMUL(vz, w1), VMUL(vy, w0))); a = VADD(a, VSUB(VMUL(vz, w0), VMUL(vy, w1))); } -#define V2(k) { VLOAD(k) b = VADD(b, VADD(VMUL(vz, w1), VMUL(vy, w0))); a = VADD(a, VSUB(VMUL(vy, w1), VMUL(vz, w0))); } - f4 a, b; - zlin[4*i] = xl[18*(31 - i)]; - zlin[4*i + 1] = xr[18*(31 - i)]; - zlin[4*i + 2] = xl[1 + 18*(31 - i)]; - zlin[4*i + 3] = xr[1 + 18*(31 - i)]; - zlin[4*i + 64] = xl[1 + 18*(1 + i)]; - zlin[4*i + 64 + 1] = xr[1 + 18*(1 + i)]; - zlin[4*i - 64 + 2] = xl[18*(1 + i)]; - zlin[4*i - 64 + 3] = xr[18*(1 + i)]; - - V0(0) V2(1) V1(2) V2(3) V1(4) V2(5) V1(6) V2(7) - - { -#ifndef MINIMP3_FLOAT_OUTPUT -#if HAVE_SSE - static const f4 g_max = { 32767.0f, 32767.0f, 32767.0f, 32767.0f }; - static const f4 g_min = { -32768.0f, -32768.0f, -32768.0f, -32768.0f }; - __m128i pcm8 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(a, g_max), g_min)), - _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(b, g_max), g_min))); - dstr[(15 - i)*nch] = _mm_extract_epi16(pcm8, 1); - dstr[(17 + i)*nch] = _mm_extract_epi16(pcm8, 5); - dstl[(15 - i)*nch] = _mm_extract_epi16(pcm8, 0); - dstl[(17 + i)*nch] = _mm_extract_epi16(pcm8, 4); - dstr[(47 - i)*nch] = _mm_extract_epi16(pcm8, 3); - dstr[(49 + i)*nch] = _mm_extract_epi16(pcm8, 7); - dstl[(47 - i)*nch] = _mm_extract_epi16(pcm8, 2); - 
dstl[(49 + i)*nch] = _mm_extract_epi16(pcm8, 6); -#else /* HAVE_SSE */ - int16x4_t pcma, pcmb; - a = VADD(a, VSET(0.5f)); - b = VADD(b, VSET(0.5f)); - pcma = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(a), vreinterpretq_s32_u32(vcltq_f32(a, VSET(0))))); - pcmb = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(b), vreinterpretq_s32_u32(vcltq_f32(b, VSET(0))))); - vst1_lane_s16(dstr + (15 - i)*nch, pcma, 1); - vst1_lane_s16(dstr + (17 + i)*nch, pcmb, 1); - vst1_lane_s16(dstl + (15 - i)*nch, pcma, 0); - vst1_lane_s16(dstl + (17 + i)*nch, pcmb, 0); - vst1_lane_s16(dstr + (47 - i)*nch, pcma, 3); - vst1_lane_s16(dstr + (49 + i)*nch, pcmb, 3); - vst1_lane_s16(dstl + (47 - i)*nch, pcma, 2); - vst1_lane_s16(dstl + (49 + i)*nch, pcmb, 2); -#endif /* HAVE_SSE */ - -#else /* MINIMP3_FLOAT_OUTPUT */ - - static const f4 g_scale = { 1.0f/32768.0f, 1.0f/32768.0f, 1.0f/32768.0f, 1.0f/32768.0f }; - a = VMUL(a, g_scale); - b = VMUL(b, g_scale); -#if HAVE_SSE - _mm_store_ss(dstr + (15 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); - _mm_store_ss(dstr + (17 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); - _mm_store_ss(dstl + (15 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0))); - _mm_store_ss(dstl + (17 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 0, 0, 0))); - _mm_store_ss(dstr + (47 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3))); - _mm_store_ss(dstr + (49 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 3, 3))); - _mm_store_ss(dstl + (47 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2))); - _mm_store_ss(dstl + (49 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 2, 2))); -#else /* HAVE_SSE */ - vst1q_lane_f32(dstr + (15 - i)*nch, a, 1); - vst1q_lane_f32(dstr + (17 + i)*nch, b, 1); - vst1q_lane_f32(dstl + (15 - i)*nch, a, 0); - vst1q_lane_f32(dstl + (17 + i)*nch, b, 0); - vst1q_lane_f32(dstr + (47 - i)*nch, a, 3); - vst1q_lane_f32(dstr + (49 + i)*nch, b, 3); - vst1q_lane_f32(dstl + (47 - i)*nch, a, 2); - vst1q_lane_f32(dstl + (49 + i)*nch, b, 2); -#endif /* 
HAVE_SSE */ -#endif /* MINIMP3_FLOAT_OUTPUT */ - } - } else -#endif /* HAVE_SIMD */ -#ifdef MINIMP3_ONLY_SIMD - {} /* for HAVE_SIMD=1, MINIMP3_ONLY_SIMD=1 case we do not need non-intrinsic "else" branch */ -#else /* MINIMP3_ONLY_SIMD */ - for (i = 14; i >= 0; i--) - { -#define LOAD(k) float w0 = *w++; float w1 = *w++; float *vz = &zlin[4*i - k*64]; float *vy = &zlin[4*i - (15 - k)*64]; -#define S0(k) { int j; LOAD(k); for (j = 0; j < 4; j++) b[j] = vz[j]*w1 + vy[j]*w0, a[j] = vz[j]*w0 - vy[j]*w1; } -#define S1(k) { int j; LOAD(k); for (j = 0; j < 4; j++) b[j] += vz[j]*w1 + vy[j]*w0, a[j] += vz[j]*w0 - vy[j]*w1; } -#define S2(k) { int j; LOAD(k); for (j = 0; j < 4; j++) b[j] += vz[j]*w1 + vy[j]*w0, a[j] += vy[j]*w1 - vz[j]*w0; } - float a[4], b[4]; - - zlin[4*i] = xl[18*(31 - i)]; - zlin[4*i + 1] = xr[18*(31 - i)]; - zlin[4*i + 2] = xl[1 + 18*(31 - i)]; - zlin[4*i + 3] = xr[1 + 18*(31 - i)]; - zlin[4*(i + 16)] = xl[1 + 18*(1 + i)]; - zlin[4*(i + 16) + 1] = xr[1 + 18*(1 + i)]; - zlin[4*(i - 16) + 2] = xl[18*(1 + i)]; - zlin[4*(i - 16) + 3] = xr[18*(1 + i)]; - - S0(0) S2(1) S1(2) S2(3) S1(4) S2(5) S1(6) S2(7) - - dstr[(15 - i)*nch] = mp3d_scale_pcm(a[1]); - dstr[(17 + i)*nch] = mp3d_scale_pcm(b[1]); - dstl[(15 - i)*nch] = mp3d_scale_pcm(a[0]); - dstl[(17 + i)*nch] = mp3d_scale_pcm(b[0]); - dstr[(47 - i)*nch] = mp3d_scale_pcm(a[3]); - dstr[(49 + i)*nch] = mp3d_scale_pcm(b[3]); - dstl[(47 - i)*nch] = mp3d_scale_pcm(a[2]); - dstl[(49 + i)*nch] = mp3d_scale_pcm(b[2]); - } -#endif /* MINIMP3_ONLY_SIMD */ -} - -static void mp3d_synth_granule(float *qmf_state, float *grbuf, int nbands, int nch, mp3d_sample_t *pcm, float *lins) -{ - int i; - for (i = 0; i < nch; i++) - { - mp3d_DCT_II(grbuf + 576*i, nbands); - } - - memcpy(lins, qmf_state, sizeof(float)*15*64); - - for (i = 0; i < nbands; i += 2) - { - mp3d_synth(grbuf + i, pcm + 32*nch*i, nch, lins + i*64); - } -#ifndef MINIMP3_NONSTANDARD_BUT_LOGICAL - if (nch == 1) - { - for (i = 0; i < 15*64; i += 2) - { - qmf_state[i] = 
lins[nbands*64 + i]; - } - } else -#endif /* MINIMP3_NONSTANDARD_BUT_LOGICAL */ - { - memcpy(qmf_state, lins + nbands*64, sizeof(float)*15*64); - } -} - -static int mp3d_match_frame(const uint8_t *hdr, int mp3_bytes, int frame_bytes) -{ - int i, nmatch; - for (i = 0, nmatch = 0; nmatch < MAX_FRAME_SYNC_MATCHES; nmatch++) - { - i += hdr_frame_bytes(hdr + i, frame_bytes) + hdr_padding(hdr + i); - if (i + HDR_SIZE > mp3_bytes) - return nmatch > 0; - if (!hdr_compare(hdr, hdr + i)) - return 0; - } - return 1; -} - -static int mp3d_find_frame(const uint8_t *mp3, int mp3_bytes, int *free_format_bytes, int *ptr_frame_bytes) -{ - int i, k; - for (i = 0; i < mp3_bytes - HDR_SIZE; i++, mp3++) - { - if (hdr_valid(mp3)) - { - int frame_bytes = hdr_frame_bytes(mp3, *free_format_bytes); - int frame_and_padding = frame_bytes + hdr_padding(mp3); - - for (k = HDR_SIZE; !frame_bytes && k < MAX_FREE_FORMAT_FRAME_SIZE && i + 2*k < mp3_bytes - HDR_SIZE; k++) - { - if (hdr_compare(mp3, mp3 + k)) - { - int fb = k - hdr_padding(mp3); - int nextfb = fb + hdr_padding(mp3 + k); - if (i + k + nextfb + HDR_SIZE > mp3_bytes || !hdr_compare(mp3, mp3 + k + nextfb)) - continue; - frame_and_padding = k; - frame_bytes = fb; - *free_format_bytes = fb; - } - } - if ((frame_bytes && i + frame_and_padding <= mp3_bytes && - mp3d_match_frame(mp3, mp3_bytes - i, frame_bytes)) || - (!i && frame_and_padding == mp3_bytes)) - { - *ptr_frame_bytes = frame_and_padding; - return i; - } - *free_format_bytes = 0; - } - } - *ptr_frame_bytes = 0; - return mp3_bytes; -} - -void mp3dec_init(mp3dec_t *dec) -{ - dec->header[0] = 0; -} - -int mp3dec_decode_frame(mp3dec_t *dec, const uint8_t *mp3, int mp3_bytes, mp3d_sample_t *pcm, mp3dec_frame_info_t *info) -{ - int i = 0, igr, frame_size = 0, success = 1; - const uint8_t *hdr; - bs_t bs_frame[1]; - mp3dec_scratch_t scratch; - - if (mp3_bytes > 4 && dec->header[0] == 0xff && hdr_compare(dec->header, mp3)) - { - frame_size = hdr_frame_bytes(mp3, dec->free_format_bytes) + 
hdr_padding(mp3); - if (frame_size != mp3_bytes && (frame_size + HDR_SIZE > mp3_bytes || !hdr_compare(mp3, mp3 + frame_size))) - { - frame_size = 0; - } - } - if (!frame_size) - { - memset(dec, 0, sizeof(mp3dec_t)); - i = mp3d_find_frame(mp3, mp3_bytes, &dec->free_format_bytes, &frame_size); - if (!frame_size || i + frame_size > mp3_bytes) - { - info->frame_bytes = i; - return 0; - } - } - - hdr = mp3 + i; - memcpy(dec->header, hdr, HDR_SIZE); - info->frame_bytes = i + frame_size; - info->frame_offset = i; - info->channels = HDR_IS_MONO(hdr) ? 1 : 2; - info->hz = hdr_sample_rate_hz(hdr); - info->layer = 4 - HDR_GET_LAYER(hdr); - info->bitrate_kbps = hdr_bitrate_kbps(hdr); - - if (!pcm) - { - return hdr_frame_samples(hdr); - } - - bs_init(bs_frame, hdr + HDR_SIZE, frame_size - HDR_SIZE); - if (HDR_IS_CRC(hdr)) - { - get_bits(bs_frame, 16); - } - - if (info->layer == 3) - { - int main_data_begin = L3_read_side_info(bs_frame, scratch.gr_info, hdr); - if (main_data_begin < 0 || bs_frame->pos > bs_frame->limit) - { - mp3dec_init(dec); - return 0; - } - success = L3_restore_reservoir(dec, bs_frame, &scratch, main_data_begin); - if (success) - { - for (igr = 0; igr < (HDR_TEST_MPEG1(hdr) ? 
2 : 1); igr++, pcm += 576*info->channels) - { - memset(scratch.grbuf[0], 0, 576*2*sizeof(float)); - L3_decode(dec, &scratch, scratch.gr_info + igr*info->channels, info->channels); - mp3d_synth_granule(dec->qmf_state, scratch.grbuf[0], 18, info->channels, pcm, scratch.syn[0]); - } - } - L3_save_reservoir(dec, &scratch); - } else - { -#ifdef MINIMP3_ONLY_MP3 - return 0; -#else /* MINIMP3_ONLY_MP3 */ - L12_scale_info sci[1]; - L12_read_scale_info(hdr, bs_frame, sci); - - memset(scratch.grbuf[0], 0, 576*2*sizeof(float)); - for (i = 0, igr = 0; igr < 3; igr++) - { - if (12 == (i += L12_dequantize_granule(scratch.grbuf[0] + i, bs_frame, sci, info->layer | 1))) - { - i = 0; - L12_apply_scf_384(sci, sci->scf + igr, scratch.grbuf[0]); - mp3d_synth_granule(dec->qmf_state, scratch.grbuf[0], 12, info->channels, pcm, scratch.syn[0]); - memset(scratch.grbuf[0], 0, 576*2*sizeof(float)); - pcm += 384*info->channels; - } - if (bs_frame->pos > bs_frame->limit) - { - mp3dec_init(dec); - return 0; - } - } -#endif /* MINIMP3_ONLY_MP3 */ - } - return success*hdr_frame_samples(dec->header); -} - -#ifdef MINIMP3_FLOAT_OUTPUT -void mp3dec_f32_to_s16(const float *in, int16_t *out, int num_samples) -{ - int i = 0; -#if HAVE_SIMD - int aligned_count = num_samples & ~7; - for(; i < aligned_count; i += 8) - { - static const f4 g_scale = { 32768.0f, 32768.0f, 32768.0f, 32768.0f }; - f4 a = VMUL(VLD(&in[i ]), g_scale); - f4 b = VMUL(VLD(&in[i+4]), g_scale); -#if HAVE_SSE - static const f4 g_max = { 32767.0f, 32767.0f, 32767.0f, 32767.0f }; - static const f4 g_min = { -32768.0f, -32768.0f, -32768.0f, -32768.0f }; - __m128i pcm8 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(a, g_max), g_min)), - _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(b, g_max), g_min))); - out[i ] = _mm_extract_epi16(pcm8, 0); - out[i+1] = _mm_extract_epi16(pcm8, 1); - out[i+2] = _mm_extract_epi16(pcm8, 2); - out[i+3] = _mm_extract_epi16(pcm8, 3); - out[i+4] = _mm_extract_epi16(pcm8, 4); - out[i+5] = 
_mm_extract_epi16(pcm8, 5); - out[i+6] = _mm_extract_epi16(pcm8, 6); - out[i+7] = _mm_extract_epi16(pcm8, 7); -#else /* HAVE_SSE */ - int16x4_t pcma, pcmb; - a = VADD(a, VSET(0.5f)); - b = VADD(b, VSET(0.5f)); - pcma = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(a), vreinterpretq_s32_u32(vcltq_f32(a, VSET(0))))); - pcmb = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(b), vreinterpretq_s32_u32(vcltq_f32(b, VSET(0))))); - vst1_lane_s16(out+i , pcma, 0); - vst1_lane_s16(out+i+1, pcma, 1); - vst1_lane_s16(out+i+2, pcma, 2); - vst1_lane_s16(out+i+3, pcma, 3); - vst1_lane_s16(out+i+4, pcmb, 0); - vst1_lane_s16(out+i+5, pcmb, 1); - vst1_lane_s16(out+i+6, pcmb, 2); - vst1_lane_s16(out+i+7, pcmb, 3); -#endif /* HAVE_SSE */ - } -#endif /* HAVE_SIMD */ - for(; i < num_samples; i++) - { - float sample = in[i] * 32768.0f; - if (sample >= 32766.5) - out[i] = (int16_t) 32767; - else if (sample <= -32767.5) - out[i] = (int16_t)-32768; - else - { - int16_t s = (int16_t)(sample + .5f); - s -= (s < 0); /* away from zero, to be compliant */ - out[i] = s; - } - } -} -#endif /* MINIMP3_FLOAT_OUTPUT */ -#endif /* MINIMP3_IMPLEMENTATION && !_MINIMP3_IMPLEMENTATION_GUARD */ diff --git a/tools/ace-qwen3.cpp b/tools/ace-qwen3.cpp index fbfd049..9605e2a 100644 --- a/tools/ace-qwen3.cpp +++ b/tools/ace-qwen3.cpp @@ -691,8 +691,7 @@ int main(int argc, char ** argv) { "# Instruction\n" "Expand the user's input into a more detailed" " and specific musical description:\n"; - std::string user_msg = ace.caption + "\n\ninstrumental: " - + std::string(req.instrumental ? 
"true" : "false"); + std::string user_msg = ace.caption; prompt = build_custom_prompt(bpe, sys, user_msg.c_str()); } else { prompt = build_lm_prompt(bpe, ace); diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp index 1f4ffac..8893f4b 100644 --- a/tools/dit-vae.cpp +++ b/tools/dit-vae.cpp @@ -6,13 +6,10 @@ #include #include #include -#include #include #include #include "philox.h" -#include "ggml.h" -#include "ggml-backend.h" #include "dit-sampler.h" #include "vae.h" #include "qwen3-enc.h" @@ -22,7 +19,6 @@ #include "debug.h" #include "request.h" #include "timer.h" -#include "audio.h" // Minimal WAV writer (16-bit PCM stereo) static bool write_wav(const char * path, const float * audio, int T_audio, int sr) { @@ -67,9 +63,6 @@ static void print_usage(const char * prog) { " --text-encoder Text encoder GGUF file\n" " --dit DiT GGUF file\n" " --vae VAE GGUF file\n\n" - "LoRA:\n" - " --lora LoRA adapter (adapter_model.safetensors)\n" - " --lora-scale LoRA scale, e.g. alpha/rank (default: 1.0)\n\n" "Batch:\n" " --batch DiT variations per request (default: 1, max 9)\n\n" "Output naming: input.json -> input0.wav, input1.wav, ... 
(last digit = batch index)\n\n" @@ -103,12 +96,10 @@ int main(int argc, char ** argv) { const char * dit_gguf = NULL; const char * vae_gguf = NULL; const char * dump_dir = NULL; - const char * lora_path = NULL; - float lora_scale = 1.0f; bool use_fa = true; - int batch_n = 1; - int vae_chunk = 256; - int vae_overlap = 64; + int batch_n = 1; + int vae_chunk = 256; + int vae_overlap = 64; for (int i = 1; i < argc; i++) { if (strcmp(argv[i], "--request") == 0) { @@ -124,8 +115,6 @@ int main(int argc, char ** argv) { else if (strcmp(argv[i], "--batch") == 0 && i+1 < argc) batch_n = atoi(argv[++i]); else if (strcmp(argv[i], "--vae-chunk") == 0 && i+1 < argc) vae_chunk = atoi(argv[++i]); else if (strcmp(argv[i], "--vae-overlap") == 0 && i+1 < argc) vae_overlap = atoi(argv[++i]); - else if (strcmp(argv[i], "--lora") == 0 && i+1 < argc) lora_path = argv[++i]; - else if (strcmp(argv[i], "--lora-scale") == 0 && i+1 < argc) lora_scale = (float)atof(argv[++i]); else if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) { print_usage(argv[0]); return 0; } else { @@ -172,16 +161,6 @@ int main(int argc, char ** argv) { } fprintf(stderr, "[Load] DiT weight load: %.1f ms\n", timer.ms()); - if (lora_path) { - timer.reset(); - if (!dit_ggml_load_lora(&model, lora_path, lora_scale)) { - fprintf(stderr, "FATAL: failed to load LoRA from %s\n", lora_path); - dit_ggml_free(&model); - return 1; - } - fprintf(stderr, "[Load] LoRA: %.1f ms\n", timer.ms()); - } - // Read DiT GGUF metadata + silence_latent tensor (once) bool is_turbo = false; std::vector silence_full; // [15000, 64] f32 @@ -247,11 +226,8 @@ int main(int argc, char ** argv) { continue; } - // Extract params (append custom_tag to caption for LoRA/condition so trigger is in text) - std::string caption_for_cond = req.caption; - if (!req.custom_tag.empty()) - caption_for_cond += ", " + req.custom_tag; - const char * caption = caption_for_cond.c_str(); + // Extract params + const char * caption = req.caption.c_str(); 
const char * lyrics = req.lyrics.c_str(); char bpm_str[16] = "N/A"; if (req.bpm > 0) snprintf(bpm_str, sizeof(bpm_str), "%d", req.bpm); @@ -262,12 +238,12 @@ int main(int argc, char ** argv) { float duration = req.duration > 0 ? req.duration : 30.0f; long long seed = req.seed; int num_steps = req.inference_steps > 0 ? req.inference_steps : 8; - float guidance_scale = req.guidance_scale > 0 ? req.guidance_scale : 7.0f; + float guidance_scale = req.guidance_scale; float shift = req.shift > 0 ? req.shift : 1.0f; - float cover_strength = req.audio_cover_strength >= 0 && req.audio_cover_strength <= 1 - ? req.audio_cover_strength : 1.0f; - if (is_turbo && guidance_scale > 1.0f) { + if (guidance_scale <= 0.0f) + guidance_scale = is_turbo ? 1.0f : 7.0f; + else if (is_turbo && guidance_scale > 1.0f) { fprintf(stderr, "[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was %.1f)\n", guidance_scale); guidance_scale = 1.0f; @@ -281,39 +257,8 @@ int main(int argc, char ** argv) { fprintf(stderr, "[Pipeline] seed=%lld, steps=%d, guidance=%.1f, shift=%.1f, duration=%.1fs\n", seed, num_steps, guidance_scale, shift, duration); - // Parse audio codes from request (or produce from src_audio WAV/MP3) + // Parse audio codes from request std::vector codes_vec = parse_codes_string(req.audio_codes); - if (codes_vec.empty() && !req.src_audio.empty() && have_vae) { - const std::string & src_path = req.src_audio; - std::vector wav_stereo; - int n_samples = load_audio_48k_stereo(src_path.c_str(), &wav_stereo); - if (n_samples > 0) { - int T_audio = n_samples; - if (T_audio >= 1920) { - VAEEncoderGGML enc = {}; - if (vae_encoder_load(&enc, vae_gguf)) { - size_t max_lat = (size_t)(T_audio / 2048) + 1; - std::vector enc_out(max_lat * 64); - int T_lat = vae_encoder_forward(&enc, wav_stereo.data(), T_audio, enc_out.data()); - vae_encoder_free(&enc); - if (T_lat >= FSQ_FRAMES_PER_CODE) { - DetokGGML detok = {}; - if (detok_ggml_load(&detok, dit_gguf, model.backend, model.cpu_backend)) { - 
std::vector codeword_table((size_t)FSQ_N_CODES * FSQ_FRAMES_PER_CODE * 64); - fprintf(stderr, "[Cover] building FSQ codeword table (8000 codes)...\n"); - detok_ggml_build_codeword_table(&detok, codeword_table.data()); - latent_frames_to_codes(T_lat, enc_out.data(), codeword_table.data(), &codes_vec); - fprintf(stderr, "[Cover] encoded %s -> %zu codes (%.1fs @ 5Hz)\n", - src_path.c_str(), codes_vec.size(), (float)codes_vec.size() / 5.0f); - detok_ggml_free(&detok); - } - } - } - } - } else { - fprintf(stderr, "[Cover] WARNING: cannot load src_audio %s (use .wav or .mp3), skipping cover-from-file\n", src_path.c_str()); - } - } if (!codes_vec.empty()) fprintf(stderr, "[Pipeline] %zu audio codes (%.1fs @ 5Hz)\n", codes_vec.size(), (float)codes_vec.size() / 5.0f); @@ -416,50 +361,16 @@ int main(int argc, char ** argv) { } fprintf(stderr, "[Load] ConditionEncoder: %.1f ms\n", timer.ms()); - // Timbre input: reference_audio (WAV or MP3 via VAE encoder) or silence (first 750 frames = 30s @ 25Hz) + // Silence feats for timbre input: first 750 frames (30s @ 25Hz) const int S_ref = 750; - std::vector timbre_feats(S_ref * 64); - const float * timbre_ptr = silence_full.data(); - int S_ref_actual = S_ref; - if (!req.reference_audio.empty()) { - const std::string & ref_path = req.reference_audio; - std::vector wav_stereo; - int n_samples = load_audio_48k_stereo(ref_path.c_str(), &wav_stereo); - if (n_samples > 0 && have_vae) { - VAEEncoderGGML enc = {}; - if (vae_encoder_load(&enc, vae_gguf)) { - int T_audio = n_samples; - if (T_audio >= 1920) { - // Encoder strides 2,4,4,8,8 -> max latent frames = T_audio/2048 + 1 - size_t max_lat = (size_t)(T_audio / 2048) + 1; - std::vector enc_out(max_lat * 64); - int T_lat = vae_encoder_forward(&enc, wav_stereo.data(), T_audio, enc_out.data()); - if (T_lat > 0) { - size_t copy_frames = (size_t)(T_lat < S_ref ? 
T_lat : S_ref); - memcpy(timbre_feats.data(), enc_out.data(), copy_frames * 64 * sizeof(float)); - if (T_lat < S_ref) - memcpy(timbre_feats.data() + copy_frames * 64, silence_full.data(), - (S_ref - (int)copy_frames) * 64 * sizeof(float)); - S_ref_actual = (int)copy_frames; - if (T_lat > S_ref) S_ref_actual = S_ref; - timbre_ptr = timbre_feats.data(); - fprintf(stderr, "[Timbre] encoded %s -> %d frames (25Hz)\n", ref_path.c_str(), S_ref_actual); - } - } - vae_encoder_free(&enc); - } - } else if (n_samples <= 0) { - fprintf(stderr, "[Timbre] WARNING: cannot load audio %s (use .wav or .mp3), using silence\n", ref_path.c_str()); - } else if (!have_vae) { - fprintf(stderr, "[Timbre] reference_audio requires --vae (with encoder weights); using silence\n"); - } - } + std::vector silence_feats(S_ref * 64); + memcpy(silence_feats.data(), silence_full.data(), S_ref * 64 * sizeof(float)); timer.reset(); std::vector enc_hidden; cond_ggml_forward(&cond, text_hidden.data(), S_text, lyric_embed.data(), S_lyric, - timbre_ptr, S_ref_actual, + silence_feats.data(), S_ref, enc_hidden, &enc_S); fprintf(stderr, "[Encode] ConditionEncoder: %.1f ms, enc_S=%d\n", timer.ms(), enc_S); @@ -503,20 +414,15 @@ int main(int argc, char ** argv) { } // Build single context: [T, ctx_ch] = src_latents[64] + mask_ones[64] - // src_latents = blend(decoded_codes, silence) for t context_single(T * ctx_ch); for (int t = 0; t < T; t++) { - for (int c = 0; c < Oc; c++) { - float v; - if (t < decoded_T) { - float dec = decoded_latents[t * Oc + c]; - float sil = silence_full[c]; // frame 0 of silence - v = (1.0f - cover_strength) * sil + cover_strength * dec; - } else { - v = silence_full[(t - decoded_T) * Oc + c]; - } - context_single[t * ctx_ch + c] = v; - } + const float * src = (t < decoded_T) + ? 
decoded_latents.data() + t * Oc + : silence_full.data() + (t - decoded_T) * Oc; + for (int c = 0; c < Oc; c++) + context_single[t * ctx_ch + c] = src[c]; for (int c = 0; c < Oc; c++) context_single[t * ctx_ch + Oc + c] = 1.0f; } diff --git a/tools/neural-codec.cpp b/tools/neural-codec.cpp new file mode 100644 index 0000000..e75f3c3 --- /dev/null +++ b/tools/neural-codec.cpp @@ -0,0 +1,522 @@ +// neural-codec.cpp: neural audio codec (Oobleck VAE) +// +// 2 modes: +// encode: WAV -> latent file (f32, Q8, or Q4) +// decode: latent file -> WAV (48kHz stereo) +// +// Three latent formats, decode auto-detects: +// +// f32 (default): flat [T, 64] f32, no header. +// T = file_size / 256. 25Hz, ~6.4 KB/s, ~51 kbit/s. +// +// Q8 (--q8): symmetric per-frame int8 quantization. +// header: "NAC8" magic (4B) + uint32 T_latent (4B) +// frame: f16 scale (2B) + int8[64] (64B) = 66B +// 25Hz, ~1.6 KB/s, ~13 kbit/s. +// +// Q4 (--q4): symmetric per-frame 4-bit quantization. +// header: "NAC4" magic (4B) + uint32 T_latent (4B) +// frame: f16 scale (2B) + nibbles[32] (32B) = 34B +// 25Hz, ~850 B/s, ~6.8 kbit/s. +// +// Usage: +// neural-codec --vae model.gguf --encode -i song.wav -o song.latent +// neural-codec --vae model.gguf --encode --q8 -i song.wav -o song.nac8 +// neural-codec --vae model.gguf --encode --q4 -i song.wav -o song.nac4 +// neural-codec --vae model.gguf --decode -i song.nac4 -o song.wav + +#include "vae.h" +#include "vae-enc.h" +#include +#include +#include +#include +#include + +// Minimal WAV reader: 16-bit PCM or 32-bit float, mono/stereo, any sample rate. +// Returns interleaved float [T, 2]. Sets *T_audio, *sr. Caller frees. 
+static float * read_wav(const char * path, int * T_audio, int * sr) { + FILE * f = fopen(path, "rb"); + if (!f) { fprintf(stderr, "[WAV] Cannot open %s\n", path); return NULL; } + + char riff[4]; fread(riff, 1, 4, f); + if (memcmp(riff, "RIFF", 4) != 0) { + fprintf(stderr, "[WAV] Not a RIFF file: %s\n", path); fclose(f); return NULL; + } + fseek(f, 4, SEEK_CUR); + char wave[4]; fread(wave, 1, 4, f); + if (memcmp(wave, "WAVE", 4) != 0) { + fprintf(stderr, "[WAV] Not a WAVE file: %s\n", path); fclose(f); return NULL; + } + + int n_channels = 0, sample_rate = 0, bits_per_sample = 0; + short audio_format = 0; + float * audio = NULL; + int n_samples = 0; + + while (!feof(f)) { + char chunk_id[4]; + int chunk_size; + if (fread(chunk_id, 1, 4, f) != 4) break; + if (fread(&chunk_size, 4, 1, f) != 1) break; + + if (memcmp(chunk_id, "fmt ", 4) == 0) { + fread(&audio_format, 2, 1, f); + short nc; fread(&nc, 2, 1, f); n_channels = nc; + fread(&sample_rate, 4, 1, f); + fseek(f, 4, SEEK_CUR); // byte_rate + fseek(f, 2, SEEK_CUR); // block_align + short bps; fread(&bps, 2, 1, f); bits_per_sample = bps; + int consumed = 16; + if (chunk_size > consumed) fseek(f, chunk_size - consumed, SEEK_CUR); + + } else if (memcmp(chunk_id, "data", 4) == 0 && n_channels > 0) { + if (audio_format == 1 && bits_per_sample == 16) { + n_samples = chunk_size / (n_channels * 2); + audio = (float *)malloc((size_t)n_samples * 2 * sizeof(float)); + std::vector buf((size_t)n_samples * n_channels); + fread(buf.data(), 2, (size_t)n_samples * n_channels, f); + for (int t = 0; t < n_samples; t++) { + if (n_channels == 1) { + float s = (float)buf[t] / 32768.0f; + audio[t * 2 + 0] = s; + audio[t * 2 + 1] = s; + } else { + audio[t * 2 + 0] = (float)buf[t * n_channels + 0] / 32768.0f; + audio[t * 2 + 1] = (float)buf[t * n_channels + 1] / 32768.0f; + } + } + } else if (audio_format == 3 && bits_per_sample == 32) { + n_samples = chunk_size / (n_channels * 4); + audio = (float *)malloc((size_t)n_samples * 2 * 
sizeof(float)); + std::vector buf((size_t)n_samples * n_channels); + fread(buf.data(), 4, (size_t)n_samples * n_channels, f); + for (int t = 0; t < n_samples; t++) { + if (n_channels == 1) { + audio[t * 2 + 0] = buf[t]; + audio[t * 2 + 1] = buf[t]; + } else { + audio[t * 2 + 0] = buf[t * n_channels + 0]; + audio[t * 2 + 1] = buf[t * n_channels + 1]; + } + } + } else { + fprintf(stderr, "[WAV] Unsupported: format=%d bits=%d (need PCM16 or float32)\n", + audio_format, bits_per_sample); + fclose(f); return NULL; + } + break; + } else { + fseek(f, chunk_size, SEEK_CUR); + } + } + fclose(f); + if (!audio) { fprintf(stderr, "[WAV] No audio data in %s\n", path); return NULL; } + + *T_audio = n_samples; + *sr = sample_rate; + fprintf(stderr, "[WAV] Read %s: %d samples, %d Hz, %d ch, %d bit\n", + path, n_samples, sample_rate, n_channels, bits_per_sample); + return audio; +} + +// WAV writer: planar [ch0: T, ch1: T] -> 16-bit PCM stereo +static bool write_wav(const char * path, const float * audio, int T_audio, int sr) { + FILE * f = fopen(path, "wb"); + if (!f) return false; + int n_channels = 2, bits = 16; + int byte_rate = sr * n_channels * (bits / 8); + int block_align = n_channels * (bits / 8); + int data_size = T_audio * n_channels * (bits / 8); + int file_size = 36 + data_size; + fwrite("RIFF", 1, 4, f); + fwrite(&file_size, 4, 1, f); + fwrite("WAVE", 1, 4, f); + fwrite("fmt ", 1, 4, f); + int fmt_size = 16; fwrite(&fmt_size, 4, 1, f); + short audio_fmt = 1; fwrite(&audio_fmt, 2, 1, f); + short nc = (short)n_channels; fwrite(&nc, 2, 1, f); + fwrite(&sr, 4, 1, f); + fwrite(&byte_rate, 4, 1, f); + short ba = (short)block_align; fwrite(&ba, 2, 1, f); + short bp = (short)bits; fwrite(&bp, 2, 1, f); + fwrite("data", 1, 4, f); + fwrite(&data_size, 4, 1, f); + for (int t = 0; t < T_audio; t++) { + for (int c = 0; c < 2; c++) { + float s = audio[c * T_audio + t]; + s = s < -1.0f ? -1.0f : (s > 1.0f ? 
1.0f : s); + short v = (short)(s * 32767.0f); + fwrite(&v, 2, 1, f); + } + } + fclose(f); + return true; +} + +// Q8 format constants +static const char NAC8_MAGIC[4] = {'N', 'A', 'C', '8'}; +static const int NAC8_HEADER = 8; // 4B magic + 4B T_latent +static const int NAC8_FRAME = 66; // 2B f16 scale + 64B int8 + +// Write Q8 quantized latent +static bool write_latent_q8(const char * path, const float * data, int T_latent) { + FILE * f = fopen(path, "wb"); + if (!f) return false; + + fwrite(NAC8_MAGIC, 1, 4, f); + uint32_t t = (uint32_t)T_latent; + fwrite(&t, 4, 1, f); + + for (int i = 0; i < T_latent; i++) { + const float * frame = data + i * 64; + + // find max abs for symmetric quant + float amax = 0.0f; + for (int j = 0; j < 64; j++) { + float a = fabsf(frame[j]); + if (a > amax) amax = a; + } + float scale = amax / 127.0f; + ggml_fp16_t scale_f16 = ggml_fp32_to_fp16(scale); + fwrite(&scale_f16, 2, 1, f); + + // quantize + int8_t q[64]; + float inv = (scale > 0.0f) ? 127.0f / amax : 0.0f; + for (int j = 0; j < 64; j++) { + int v = (int)roundf(frame[j] * inv); + q[j] = (int8_t)(v < -127 ? -127 : (v > 127 ? 
127 : v)); + } + fwrite(q, 1, 64, f); + } + fclose(f); + + size_t bytes = NAC8_HEADER + (size_t)T_latent * NAC8_FRAME; + float duration = (float)T_latent * 1920.0f / 48000.0f; + float kbps = (float)bytes * 8.0f / (duration * 1000.0f); + fprintf(stderr, "[Latent] Wrote %s: Q8, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, T_latent, duration, (float)bytes / 1024.0f, kbps); + return true; +} + +// Q4 format constants +static const char NAC4_MAGIC[4] = {'N', 'A', 'C', '4'}; +static const int NAC4_HEADER = 8; // 4B magic + 4B T_latent +static const int NAC4_FRAME = 34; // 2B f16 scale + 32B packed nibbles + +// Write Q4 quantized latent +// Symmetric 4-bit: range [-7, 7], scale = amax / 7.0 +// Packing: byte = (low & 0x0F) | (high << 4), two signed nibbles per byte +static bool write_latent_q4(const char * path, const float * data, int T_latent) { + FILE * f = fopen(path, "wb"); + if (!f) return false; + + fwrite(NAC4_MAGIC, 1, 4, f); + uint32_t t = (uint32_t)T_latent; + fwrite(&t, 4, 1, f); + + for (int i = 0; i < T_latent; i++) { + const float * frame = data + i * 64; + + // find max abs for symmetric quant + float amax = 0.0f; + for (int j = 0; j < 64; j++) { + float a = fabsf(frame[j]); + if (a > amax) amax = a; + } + float scale = amax / 7.0f; + ggml_fp16_t scale_f16 = ggml_fp32_to_fp16(scale); + fwrite(&scale_f16, 2, 1, f); + + // quantize and pack pairs into bytes + float inv = (scale > 0.0f) ? 7.0f / amax : 0.0f; + uint8_t packed[32]; + for (int j = 0; j < 32; j++) { + int lo = (int)roundf(frame[j * 2 + 0] * inv); + int hi = (int)roundf(frame[j * 2 + 1] * inv); + lo = lo < -7 ? -7 : (lo > 7 ? 7 : lo); + hi = hi < -7 ? -7 : (hi > 7 ? 
7 : hi); + packed[j] = (uint8_t)((lo & 0x0F) | (hi << 4)); + } + fwrite(packed, 1, 32, f); + } + fclose(f); + + size_t bytes = NAC4_HEADER + (size_t)T_latent * NAC4_FRAME; + float duration = (float)T_latent * 1920.0f / 48000.0f; + float kbps = (float)bytes * 8.0f / (duration * 1000.0f); + fprintf(stderr, "[Latent] Wrote %s: Q4, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, T_latent, duration, (float)bytes / 1024.0f, kbps); + return true; +} + +// Write f32 raw latent (no header) +static bool write_latent_f32(const char * path, const float * data, int T_latent) { + FILE * f = fopen(path, "wb"); + if (!f) return false; + size_t bytes = (size_t)T_latent * 64 * sizeof(float); + fwrite(data, 1, bytes, f); + fclose(f); + float duration = (float)T_latent * 1920.0f / 48000.0f; + fprintf(stderr, "[Latent] Wrote %s: f32, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, T_latent, duration, (float)bytes / 1024.0f, + (float)bytes * 8.0f / (duration * 1000.0f)); + return true; +} + +// Read latent, auto-detect format (NAC8 -> Q8, NAC4 -> Q4, else f32). +// Returns [T_latent, 64] f32 (dequantized if quantized). Caller frees. 
+static float * read_latent(const char * path, int * T_latent) { + FILE * f = fopen(path, "rb"); + if (!f) { fprintf(stderr, "[Latent] Cannot open %s\n", path); return NULL; } + fseek(f, 0, SEEK_END); + long fsize = ftell(f); + fseek(f, 0, SEEK_SET); + + // Check magic + char magic[4] = {}; + if (fsize >= 8) fread(magic, 1, 4, f); + + if (memcmp(magic, NAC8_MAGIC, 4) == 0) { + // Q8 format + uint32_t t; + fread(&t, 4, 1, f); + *T_latent = (int)t; + + long expected = NAC8_HEADER + (long)t * NAC8_FRAME; + if (fsize != expected) { + fprintf(stderr, "[Latent] Q8 size mismatch: expected %ld, got %ld\n", expected, fsize); + fclose(f); return NULL; + } + + float * data = (float *)malloc((size_t)t * 64 * sizeof(float)); + for (int i = 0; i < (int)t; i++) { + ggml_fp16_t scale_f16; + fread(&scale_f16, 2, 1, f); + float scale = ggml_fp16_to_fp32(scale_f16); + + int8_t q[64]; + fread(q, 1, 64, f); + + float * frame = data + i * 64; + for (int j = 0; j < 64; j++) + frame[j] = (float)q[j] * scale; + } + fclose(f); + + float duration = (float)(*T_latent) * 1920.0f / 48000.0f; + float kbps = (float)fsize * 8.0f / (duration * 1000.0f); + fprintf(stderr, "[Latent] Read %s: Q8, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, *T_latent, duration, (float)fsize / 1024.0f, kbps); + return data; + } + + if (memcmp(magic, NAC4_MAGIC, 4) == 0) { + // Q4 format + uint32_t t; + fread(&t, 4, 1, f); + *T_latent = (int)t; + + long expected = NAC4_HEADER + (long)t * NAC4_FRAME; + if (fsize != expected) { + fprintf(stderr, "[Latent] Q4 size mismatch: expected %ld, got %ld\n", expected, fsize); + fclose(f); return NULL; + } + + float * data = (float *)malloc((size_t)t * 64 * sizeof(float)); + for (int i = 0; i < (int)t; i++) { + ggml_fp16_t scale_f16; + fread(&scale_f16, 2, 1, f); + float scale = ggml_fp16_to_fp32(scale_f16); + + uint8_t packed[32]; + fread(packed, 1, 32, f); + + // unpack signed nibbles + float * frame = data + i * 64; + for (int j = 0; j < 32; j++) { + int lo = 
(int)(packed[j] & 0x0F); + int hi = (int)(packed[j] >> 4); + if (lo >= 8) lo -= 16; + if (hi >= 8) hi -= 16; + frame[j * 2 + 0] = (float)lo * scale; + frame[j * 2 + 1] = (float)hi * scale; + } + } + fclose(f); + + float duration = (float)(*T_latent) * 1920.0f / 48000.0f; + float kbps = (float)fsize * 8.0f / (duration * 1000.0f); + fprintf(stderr, "[Latent] Read %s: Q4, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, *T_latent, duration, (float)fsize / 1024.0f, kbps); + return data; + } + + // f32 format (no header, rewind) + fseek(f, 0, SEEK_SET); + if (fsize % (64 * (int)sizeof(float)) != 0) { + fprintf(stderr, "[Latent] File size %ld not a multiple of %d (64 * f32)\n", + fsize, (int)(64 * sizeof(float))); + fclose(f); return NULL; + } + + *T_latent = (int)(fsize / (64 * sizeof(float))); + float * data = (float *)malloc(fsize); + fread(data, 1, fsize, f); + fclose(f); + + float duration = (float)(*T_latent) * 1920.0f / 48000.0f; + fprintf(stderr, "[Latent] Read %s: f32, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n", + path, *T_latent, duration, (float)fsize / 1024.0f, + (float)fsize * 8.0f / (duration * 1000.0f)); + return data; +} + +static void print_usage(const char * prog) { + fprintf(stderr, + "Usage: %s --vae --encode|--decode -i [-o ] [--q8|--q4]\n\n" + "Required:\n" + " --vae VAE GGUF file\n" + " --encode | --decode Encode WAV to latent, or decode latent to WAV\n" + " -i Input (WAV for encode, latent for decode)\n\n" + "Output:\n" + " -o Output file (auto-named if omitted)\n" + " --q8 Quantize latent to int8 (~13 kbit/s)\n" + " --q4 Quantize latent to int4 (~6.8 kbit/s)\n\n" + "Output naming: song.wav -> song.latent (f32) or song.nac8 (Q8) or song.nac4 (Q4)\n" + " song.latent -> song.wav\n\n" + "VAE tiling (memory control):\n" + " --vae-chunk Latent frames per tile (default: 256)\n" + " --vae-overlap Overlap frames per side (default: 64)\n\n" + "Latent formats (decode auto-detects):\n" + " f32: flat [T, 64] f32, no header. 
~51 kbit/s.\n" + " NAC8: header + per-frame Q8. ~13 kbit/s.\n" + " NAC4: header + per-frame Q4. ~6.8 kbit/s.\n", + prog); +} + +static std::string auto_output(const char * input, const char * ext) { + std::string s = input; + size_t dot = s.rfind('.'); + if (dot != std::string::npos) + return s.substr(0, dot) + ext; + return s + ext; +} + +int main(int argc, char ** argv) { + const char * vae_path = NULL; + const char * input_path = NULL; + const char * output_path = NULL; + int chunk_size = 256; + int overlap = 64; + int mode = -1; // 0 = encode, 1 = decode + int quant = 0; // 0 = f32, 8 = q8, 4 = q4 + + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--vae") == 0 && i + 1 < argc) vae_path = argv[++i]; + else if (strcmp(argv[i], "-i") == 0 && i + 1 < argc) input_path = argv[++i]; + else if (strcmp(argv[i], "--input") == 0 && i + 1 < argc) input_path = argv[++i]; + else if (strcmp(argv[i], "-o") == 0 && i + 1 < argc) output_path = argv[++i]; + else if (strcmp(argv[i], "--output") == 0 && i + 1 < argc) output_path = argv[++i]; + else if (strcmp(argv[i], "--vae-chunk") == 0 && i + 1 < argc) chunk_size = atoi(argv[++i]); + else if (strcmp(argv[i], "--vae-overlap") == 0 && i + 1 < argc) overlap = atoi(argv[++i]); + else if (strcmp(argv[i], "--encode") == 0) mode = 0; + else if (strcmp(argv[i], "--decode") == 0) mode = 1; + else if (strcmp(argv[i], "--q8") == 0) quant = 8; + else if (strcmp(argv[i], "--q4") == 0) quant = 4; + else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) { + print_usage(argv[0]); return 0; + } else { + fprintf(stderr, "Unknown arg: %s\n", argv[i]); + print_usage(argv[0]); return 1; + } + } + + if (!vae_path || !input_path || mode < 0) { + print_usage(argv[0]); return 1; + } + + // Auto output names + std::string out_str; + if (!output_path) { + if (mode == 0) { + const char * ext = ".latent"; + if (quant == 8) ext = ".nac8"; + if (quant == 4) ext = ".nac4"; + out_str = auto_output(input_path, ext); + } else { + out_str = 
auto_output(input_path, ".wav"); + } + output_path = out_str.c_str(); + } + + const char * quant_str = ""; + if (mode == 0 && quant == 8) quant_str = " (Q8)"; + if (mode == 0 && quant == 4) quant_str = " (Q4)"; + fprintf(stderr, "\n[VAE] Mode: %s%s\n", mode == 0 ? "encode" : "decode", quant_str); + fprintf(stderr, "[VAE] Input: %s\n", input_path); + fprintf(stderr, "[VAE] Output: %s\n\n", output_path); + + // ENCODE + if (mode == 0) { + int T_audio = 0, sr = 0; + float * audio = read_wav(input_path, &T_audio, &sr); + if (!audio) return 1; + if (sr != 48000) + fprintf(stderr, "[WARN] Input is %d Hz, VAE expects 48000. Resample with ffmpeg first.\n", sr); + + VAEEncoder enc = {}; + vae_enc_load(&enc, vae_path); + + int max_T = (T_audio / 1920) + 64; + std::vector latent((size_t)max_T * 64); + + fprintf(stderr, "\n[VAE] Encoding %d samples (%.2fs)...\n", + T_audio, (float)T_audio / (float)(sr > 0 ? sr : 48000)); + int T_latent = vae_enc_encode_tiled(&enc, audio, T_audio, + latent.data(), max_T, chunk_size, overlap); + free(audio); + if (T_latent < 0) { vae_enc_free(&enc); return 1; } + + if (quant == 8) + write_latent_q8(output_path, latent.data(), T_latent); + else if (quant == 4) + write_latent_q4(output_path, latent.data(), T_latent); + else + write_latent_f32(output_path, latent.data(), T_latent); + + vae_enc_free(&enc); + fprintf(stderr, "[VAE] Done.\n"); + return 0; + } + + // DECODE (auto-detects f32 vs Q8 vs Q4 from file content) + { + int T_latent = 0; + float * latent = read_latent(input_path, &T_latent); + if (!latent) return 1; + + VAEGGML dec = {}; + vae_ggml_load(&dec, vae_path); + + int max_T = T_latent * 1920 + 4096; + std::vector audio((size_t)2 * max_T, 0.0f); + + fprintf(stderr, "\n[VAE] Decoding %d latent frames...\n", T_latent); + int T_audio = vae_ggml_decode_tiled(&dec, latent, T_latent, + audio.data(), max_T, chunk_size, overlap); + free(latent); + if (T_audio < 0) { vae_ggml_free(&dec); return 1; } + + if (write_wav(output_path, audio.data(), 
T_audio, 48000)) + fprintf(stderr, "\n[VAE] Output: %s (%d samples, %.2fs @ 48kHz)\n", + output_path, T_audio, (float)T_audio / 48000.0f); + else + fprintf(stderr, "[VAE] FATAL: failed to write %s\n", output_path); + + vae_ggml_free(&dec); + fprintf(stderr, "[VAE] Done.\n"); + return 0; + } +} diff --git a/tools/quantize.cpp b/tools/quantize.cpp index c778a47..84a3dd4 100644 --- a/tools/quantize.cpp +++ b/tools/quantize.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #ifdef _WIN32