From 2c8e46098759eb3b3bba190bf9f1945f079c164a Mon Sep 17 00:00:00 2001 From: Zile Wang <116347517+expectqwq@users.noreply.github.com> Date: Mon, 16 Mar 2026 18:49:27 +0800 Subject: [PATCH 1/2] moss-tts: add first-class MOSS-TTS support --- convert_hf_to_gguf.py | 67 + docs/moss-tts-firstclass-e2e.md | 224 +++ docs/moss-tts-firstclass-e2e_zh.md | 225 +++ examples/parallel/parallel.cpp | 2 + ggml/src/ggml-backend.cpp | 2 +- gguf-py/gguf/constants.py | 34 + gguf-py/gguf/tensor_mapping.py | 6 + include/llama.h | 4 + scripts/run-moss-tts-delay-8b-quality.sh | 195 ++ scripts/run-moss-tts-delay-decode-parity.sh | 19 + .../run-moss-tts-delay-firstclass-parity.sh | 203 ++ scripts/run-moss-tts-delay-parity.sh | 47 + src/CMakeLists.txt | 1 + src/llama-arch.cpp | 44 +- src/llama-arch.h | 13 + src/llama-batch.cpp | 38 + src/llama-batch.h | 3 + src/llama-context.cpp | 28 +- src/llama-context.h | 8 +- src/llama-graph.cpp | 4 +- src/llama-graph.h | 5 + src/llama-hparams.h | 12 + src/llama-model.cpp | 90 + src/llama-model.h | 3 + src/models/models.h | 4 + src/models/moss-tts-delay.cpp | 183 ++ tests/CMakeLists.txt | 2 + tests/moss_tts_delay_export_decode_ref.py | 94 + tests/moss_tts_delay_export_generation_ref.py | 115 ++ tests/moss_tts_delay_export_ref.py | 108 + tests/test-llama-archs.cpp | 9 + tests/test-moss-tts-delay-forward.cpp | 205 ++ tests/test-moss-tts-delay-load.cpp | 77 + tools/batched-bench/batched-bench.cpp | 2 + tools/mtmd/mtmd-helper.cpp | 4 + tools/perplexity/perplexity.cpp | 2 + tools/server/server-context.cpp | 2 + tools/tts/CMakeLists.txt | 9 + tools/tts/moss-tts-audio-decode.py | 98 + tools/tts/moss-tts-build-generation-ref.py | 127 ++ tools/tts/moss-tts-firstclass-e2e.py | 218 +++ tools/tts/moss-tts-seed-tts-eval-generate.py | 205 ++ tools/tts/moss-tts.cpp | 1737 +++++++++++++++++ 43 files changed, 4461 insertions(+), 17 deletions(-) create mode 100644 docs/moss-tts-firstclass-e2e.md create mode 100644 docs/moss-tts-firstclass-e2e_zh.md create mode 100755 
scripts/run-moss-tts-delay-8b-quality.sh create mode 100755 scripts/run-moss-tts-delay-decode-parity.sh create mode 100755 scripts/run-moss-tts-delay-firstclass-parity.sh create mode 100755 scripts/run-moss-tts-delay-parity.sh create mode 100644 src/models/moss-tts-delay.cpp create mode 100755 tests/moss_tts_delay_export_decode_ref.py create mode 100755 tests/moss_tts_delay_export_generation_ref.py create mode 100644 tests/moss_tts_delay_export_ref.py create mode 100644 tests/test-moss-tts-delay-forward.cpp create mode 100644 tests/test-moss-tts-delay-load.cpp create mode 100755 tools/tts/moss-tts-audio-decode.py create mode 100755 tools/tts/moss-tts-build-generation-ref.py create mode 100755 tools/tts/moss-tts-firstclass-e2e.py create mode 100644 tools/tts/moss-tts-seed-tts-eval-generate.py create mode 100644 tools/tts/moss-tts.cpp diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index eec0ea14e..aae14fa4e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4628,6 +4628,73 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("MossTTSDelayModel", "MossTTSDelayForCausalLM") +class MossTTSDelayModel(Qwen3Model): + model_arch = gguf.MODEL_ARCH.MOSS_TTS_DELAY + + def __init__(self, *args, **kwargs): + hparams = kwargs.get("hparams") + if hparams is None: + hparams = ModelBase.load_hparams(args[0], self.is_mistral_format) + else: + hparams = dict(hparams) + + language_config = hparams.get("language_config") + if isinstance(language_config, dict): + # Expose the Qwen3 backbone params at the root level so TextModel can + # discover block_count / hidden_size / attention params without + # losing the top-level MOSS architecture identity. 
+ language_hparams = { + key: value + for key, value in language_config.items() + if key not in ("architectures", "model_type") + } + hparams = {**hparams, **language_hparams} + + kwargs["hparams"] = hparams + super().__init__(*args, **kwargs) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + arch = self.gguf_writer.arch + self.gguf_writer.add_uint32(gguf.Keys.LLM.N_VQ.format(arch=arch), self.hparams["n_vq"]) + self.gguf_writer.add_uint32(gguf.Keys.LLM.AUDIO_VOCAB_SIZE.format(arch=arch), self.hparams["audio_vocab_size"]) + self.gguf_writer.add_uint32(gguf.Keys.LLM.AUDIO_PAD_CODE.format(arch=arch), self.hparams["audio_pad_code"]) + self.gguf_writer.add_uint32(gguf.Keys.LLM.AUDIO_START_TOKEN_ID.format(arch=arch), self.hparams["audio_start_token_id"]) + self.gguf_writer.add_uint32(gguf.Keys.LLM.AUDIO_END_TOKEN_ID.format(arch=arch), self.hparams["audio_end_token_id"]) + self.gguf_writer.add_uint32(gguf.Keys.LLM.AUDIO_USER_SLOT_TOKEN_ID.format(arch=arch), self.hparams["audio_user_slot_token_id"]) + self.gguf_writer.add_uint32( + gguf.Keys.LLM.AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID.format(arch=arch), + self.hparams["audio_assistant_gen_slot_token_id"], + ) + self.gguf_writer.add_uint32( + gguf.Keys.LLM.AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID.format(arch=arch), + self.hparams["audio_assistant_delay_slot_token_id"], + ) + if (sampling_rate := self.hparams.get("sampling_rate")) is not None: + self.gguf_writer.add_uint32(gguf.Keys.LLM.SAMPLING_RATE.format(arch=arch), sampling_rate) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("language_model."): + name = name.replace("language_model.", "", 1) + + if (match := re.fullmatch(r"emb_ext\.(\d+)\.weight", name)) is not None: + vq_idx = int(match.group(1)) + yield (f"{gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TOKEN_EMBD_AUDIO]}.{vq_idx}.weight", data_torch) + return + + if (match := re.fullmatch(r"lm_heads\.(\d+)\.weight", name)) is not None: 
+ head_idx = int(match.group(1)) + if head_idx == 0: + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight", data_torch) + else: + yield (f"{gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT_AUDIO]}.{head_idx - 1}.weight", data_torch) + return + + yield from super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("Qwen3MoeForCausalLM") class Qwen3MoeModel(Qwen2MoeModel): model_arch = gguf.MODEL_ARCH.QWEN3MOE diff --git a/docs/moss-tts-firstclass-e2e.md b/docs/moss-tts-firstclass-e2e.md new file mode 100644 index 000000000..5015fd77d --- /dev/null +++ b/docs/moss-tts-firstclass-e2e.md @@ -0,0 +1,224 @@ +# MOSS-TTS First-Class End-to-End Inference Pipeline + +[English](moss-tts-firstclass-e2e.md) | [简体中文](moss-tts-firstclass-e2e_zh.md) + +This document describes the **first-class** MOSS-TTS end-to-end inference pipeline in the current `llama.cpp` repository. + +This pipeline uses: + +- **llama.cpp** and `llama-moss-tts` to run the first-class MOSS-TTS-Delay GGUF model +- **ONNX Runtime** for reference-audio encoding and final waveform decoding +- **Python helper scripts** for prompt construction and end-to-end orchestration +- A local **MOSS-TTS** checkout that provides the prompt builder and ONNX tokenizer Python modules + +Unlike the older `moss_tts_delay/llama_cpp` backend in the `MOSS-TTS` repository, this path moves multi-channel inputs, the transformer backbone, multi-head outputs, and delay-pattern decoding into `llama.cpp`. Python is only responsible for preparing inputs and invoking the ONNX audio tokenizer. + +## Prerequisites + +1. **llama.cpp** built from source with the `llama-moss-tts` target +2. **Python >= 3.10** +3. A local **MOSS-TTS** checkout, provided in any of the following ways: + - available at `../MOSS-TTS` relative to the repository root + - passed through `--moss-tts-dir` + - passed through `MOSS_TTS_DIR` or `MOSS_TTS_ROOT` +4. 
Python packages required by the helper scripts: + - `numpy` + - `soundfile` + - `onnxruntime` + +## Build + +```bash +cd /path/to/llama.cpp + +cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON +cmake --build build --target llama-moss-tts -j +``` + +The resulting binary is: + +- `build/bin/llama-moss-tts` + +If you want to build at runtime, you can also pass `--build` to the e2e script. + +## Weight Preparation + +### Step 1: Prepare the first-class GGUF model + +You need a first-class MOSS-TTS-Delay GGUF model that already contains: + +- text embedding tables +- 32 audio embedding tables +- Qwen3 backbone weights +- a text output head +- 32 audio output heads + +For example: + +- `out/stage1a_moss_delay_firstclass_f16.gguf` + +### Step 2: Prepare the tokenizer directory + +You need a tokenizer directory containing at least: + +- `tokenizer.json` + +For example: + +- `weights/extracted/qwen3_backbone/` + +### Step 3: Prepare the ONNX audio tokenizer + +You need both ONNX files: + +- `encoder.onnx` +- `decoder.onnx` + +For example: + +- `weights/MOSS-Audio-Tokenizer-ONNX/encoder.onnx` +- `weights/MOSS-Audio-Tokenizer-ONNX/decoder.onnx` + +### Step 4: Make the MOSS-TTS repository visible + +The helper scripts import: + +- `moss_tts_delay.llama_cpp.processor` +- `moss_audio_tokenizer.onnx` + +You can provide the repository path like this: + +```bash +export MOSS_TTS_DIR=/path/to/MOSS-TTS +``` + +or: + +```bash +python tools/tts/moss-tts-firstclass-e2e.py --moss-tts-dir /path/to/MOSS-TTS ... 
+``` + +## Usage + +### CLI + +```bash +# Voice cloning: text + reference audio -> wav +python tools/tts/moss-tts-firstclass-e2e.py \ + --model-gguf /path/to/moss_delay_firstclass.gguf \ + --moss-tts-dir /path/to/MOSS-TTS \ + --tokenizer-dir /path/to/tokenizer_dir \ + --onnx-encoder /path/to/encoder.onnx \ + --onnx-decoder /path/to/decoder.onnx \ + --text-file /path/to/text.txt \ + --reference-audio /path/to/reference_24k.wav \ + --output-wav /path/to/output.wav + +# Direct generation without reference audio +python tools/tts/moss-tts-firstclass-e2e.py \ + --model-gguf /path/to/moss_delay_firstclass.gguf \ + --moss-tts-dir /path/to/MOSS-TTS \ + --tokenizer-dir /path/to/tokenizer_dir \ + --onnx-encoder /path/to/encoder.onnx \ + --onnx-decoder /path/to/decoder.onnx \ + --text "Hello, world!" \ + --output-wav /path/to/output.wav + +# Build llama-moss-tts before running +python tools/tts/moss-tts-firstclass-e2e.py \ + --build \ + --model-gguf /path/to/moss_delay_firstclass.gguf \ + --moss-tts-dir /path/to/MOSS-TTS \ + --tokenizer-dir /path/to/tokenizer_dir \ + --onnx-encoder /path/to/encoder.onnx \ + --onnx-decoder /path/to/decoder.onnx \ + --text "Hello!" 
\ + --output-wav /path/to/output.wav +``` + +## Key Options + +| Option | Values | Description | +|------|------|------| +| `--model-gguf` | path | First-class MOSS-TTS GGUF model | +| `--moss-tts-dir` | path | Local `MOSS-TTS` repository root | +| `--tokenizer-dir` | path | Directory containing `tokenizer.json` | +| `--onnx-encoder` | path | Audio tokenizer encoder ONNX | +| `--onnx-decoder` | path | Audio tokenizer decoder ONNX | +| `--text` / `--text-file` | string / path | Input text, choose exactly one | +| `--reference-audio` | path | Optional 24 kHz reference audio | +| `--language` | `zh` / `en` / tag | Language tag passed to the prompt builder | +| `--max-new-tokens` | int | Maximum generation steps | +| `--text-temperature` | float | Text-channel sampling temperature, default `1.5` | +| `--audio-temperature` | float | Audio-channel sampling temperature, default `1.7` | +| `--n-gpu-layers` | `-1` / `0` / `N` | GPU offload layers, default `-1` | +| `--audio-decoder-cpu` | flag | Force ONNX waveform decoding on CPU | +| `--cpu-audio-encode` | flag | Force ONNX reference-audio encoding on CPU | +| `--build` | flag | Build `llama-moss-tts` before running | + +## Architecture + +```text +Input text (+ optional reference wav) + | + v +moss-tts-build-generation-ref.py + | + |- tokenizes text with the Qwen3 tokenizer + |- optionally encodes the reference wav into audio codes with ONNX + |- calls the prompt builder from the local MOSS-TTS repo + v +generation.ref.bin + | + v +llama-moss-tts + | + |- loads the first-class GGUF model + |- performs multi-channel embedding lookup in-graph + |- runs the Qwen3 backbone inside llama.cpp + |- samples multi-head logits + |- performs delay-pattern decoding in C++ + v +raw.codes.bin + | + v +moss-tts-audio-decode.py + | + |- decodes raw audio codes into waveform with ONNX + v +wav +``` + +## Temporary Artifacts + +The e2e script creates a temporary directory and removes it automatically after the run. 
+ +The following intermediate files are not kept: + +- `generation.ref.bin` +- `raw.codes.bin` + +The only visible artifact after the run is the output wav you requested. + +## Output + +At the end of a successful run, the script prints: + +- `wav` — output path +- `wav_info` — sample rate, channel count, frame count, and duration + +## File Structure + +```text +llama.cpp/ +├── docs/ +│ ├── moss-tts-firstclass-e2e.md +│ └── moss-tts-firstclass-e2e_zh.md +├── tools/tts/ +│ ├── moss-tts-firstclass-e2e.py # End-to-end wrapper +│ ├── moss-tts-build-generation-ref.py # Prompt / input builder +│ ├── moss-tts-audio-decode.py # ONNX audio decode helper +│ └── moss-tts.cpp # llama-moss-tts implementation +└── build/bin/ + └── llama-moss-tts +``` diff --git a/docs/moss-tts-firstclass-e2e_zh.md b/docs/moss-tts-firstclass-e2e_zh.md new file mode 100644 index 000000000..345187e3b --- /dev/null +++ b/docs/moss-tts-firstclass-e2e_zh.md @@ -0,0 +1,225 @@ +# MOSS-TTS First-Class 端到端推理流水线 + +[English](moss-tts-firstclass-e2e.md) | [简体中文](moss-tts-firstclass-e2e_zh.md) + +本文档说明当前 `llama.cpp` 仓库中的 **first-class** MOSS-TTS 端到端推理链路。 + +这条链路使用: + +- **llama.cpp** 和 `llama-moss-tts` 运行 first-class MOSS-TTS-Delay GGUF 模型 +- **ONNX Runtime** 完成参考音频编码和最终波形解码 +- **Python helper scripts** 负责 prompt 构建和整条链路编排 +- 本地 **MOSS-TTS** 仓库 checkout 提供 prompt builder 和 ONNX tokenizer Python 模块 + +与 `MOSS-TTS` 仓库中较早的 `moss_tts_delay/llama_cpp` 后端不同,这条链路把多通道输入、transformer backbone、多头输出以及 delay-pattern decode 都放进了 `llama.cpp`。Python 只负责准备输入和调用 ONNX 音频编解码器。 + +## 前置条件 + +1. **llama.cpp** 已从源码编译,并包含 `llama-moss-tts` 目标 +2. **Python >= 3.10** +3. 本地存在一个 **MOSS-TTS** checkout,可以通过以下任一方式提供: + - 位于当前仓库根目录旁边的 `../MOSS-TTS` + - 通过 `--moss-tts-dir` 指定 + - 通过 `MOSS_TTS_DIR` 或 `MOSS_TTS_ROOT` 指定 +4. helper scripts 需要的 Python 包: + - `numpy` + - `soundfile` + - `onnxruntime` + +## 编译 + +```bash +cd /path/to/llama.cpp + +cmake -S . 
-B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON +cmake --build build --target llama-moss-tts -j +``` + +编译产物为: + +- `build/bin/llama-moss-tts` + +如果你希望在运行时自动构建,也可以在 e2e 脚本里传 `--build`。 + +## 权重准备 + +### 第一步:准备 first-class GGUF 模型 + +需要一个已经包含以下内容的 first-class MOSS-TTS-Delay GGUF: + +- 文本 embedding 表 +- 32 个音频 embedding 表 +- Qwen3 backbone 权重 +- 文本输出头 +- 32 个音频输出头 + +例如: + +- `out/stage1a_moss_delay_firstclass_f16.gguf` + +### 第二步:准备 tokenizer 目录 + +需要一个至少包含以下文件的 tokenizer 目录: + +- `tokenizer.json` + +例如: + +- `weights/extracted/qwen3_backbone/` + +### 第三步:准备 ONNX 音频编解码器 + +需要同时提供两个 ONNX 文件: + +- `encoder.onnx` +- `decoder.onnx` + +例如: + +- `weights/MOSS-Audio-Tokenizer-ONNX/encoder.onnx` +- `weights/MOSS-Audio-Tokenizer-ONNX/decoder.onnx` + +### 第四步:让脚本能找到 MOSS-TTS 仓库 + +helper scripts 会导入: + +- `moss_tts_delay.llama_cpp.processor` +- `moss_audio_tokenizer.onnx` + +可以通过以下方式提供 repo 路径: + +```bash +export MOSS_TTS_DIR=/path/to/MOSS-TTS +``` + +或者: + +```bash +python tools/tts/moss-tts-firstclass-e2e.py --moss-tts-dir /path/to/MOSS-TTS ... +``` + +## 使用方式 + +### 命令行 + +```bash +# 音色克隆:text + reference audio -> wav +python tools/tts/moss-tts-firstclass-e2e.py \ + --model-gguf /path/to/moss_delay_firstclass.gguf \ + --moss-tts-dir /path/to/MOSS-TTS \ + --tokenizer-dir /path/to/tokenizer_dir \ + --onnx-encoder /path/to/encoder.onnx \ + --onnx-decoder /path/to/decoder.onnx \ + --text-file /path/to/text.txt \ + --reference-audio /path/to/reference_24k.wav \ + --output-wav /path/to/output.wav + +# 不带参考音频的直接生成 +python tools/tts/moss-tts-firstclass-e2e.py \ + --model-gguf /path/to/moss_delay_firstclass.gguf \ + --moss-tts-dir /path/to/MOSS-TTS \ + --tokenizer-dir /path/to/tokenizer_dir \ + --onnx-encoder /path/to/encoder.onnx \ + --onnx-decoder /path/to/decoder.onnx \ + --text "你好,世界!" 
\ + --output-wav /path/to/output.wav + +# 运行前自动构建 llama-moss-tts +python tools/tts/moss-tts-firstclass-e2e.py \ + --build \ + --model-gguf /path/to/moss_delay_firstclass.gguf \ + --moss-tts-dir /path/to/MOSS-TTS \ + --tokenizer-dir /path/to/tokenizer_dir \ + --onnx-encoder /path/to/encoder.onnx \ + --onnx-decoder /path/to/decoder.onnx \ + --text "你好!" \ + --output-wav /path/to/output.wav +``` + + +## 关键参数 + +| 参数 | 取值 | 说明 | +|------|------|------| +| `--model-gguf` | path | first-class MOSS-TTS GGUF 模型 | +| `--moss-tts-dir` | path | 本地 `MOSS-TTS` 仓库根目录 | +| `--tokenizer-dir` | path | 含 `tokenizer.json` 的目录 | +| `--onnx-encoder` | path | 音频 tokenizer encoder ONNX | +| `--onnx-decoder` | path | 音频 tokenizer decoder ONNX | +| `--text` / `--text-file` | string / path | 输入文本,二选一 | +| `--reference-audio` | path | 可选的 24 kHz 参考音频 | +| `--language` | `zh` / `en` / tag | 传给 prompt builder 的语言标签 | +| `--max-new-tokens` | int | 最大生成步数 | +| `--text-temperature` | float | 文本通道采样温度,默认 `1.5` | +| `--audio-temperature` | float | 音频通道采样温度,默认 `1.7` | +| `--n-gpu-layers` | `-1` / `0` / `N` | GPU offload 层数,默认 `-1` | +| `--audio-decoder-cpu` | flag | 强制 ONNX 波形解码走 CPU | +| `--cpu-audio-encode` | flag | 强制 ONNX 参考音频编码走 CPU | +| `--build` | flag | 运行前构建 `llama-moss-tts` | + +## 架构 + +```text +输入文本(+ 可选 reference wav) + | + v +moss-tts-build-generation-ref.py + | + |- 用 Qwen3 tokenizer 处理文本 + |- 可选:用 ONNX 把 reference wav 编成 audio codes + |- 调用本地 MOSS-TTS repo 的 prompt builder + v +generation.ref.bin + | + v +llama-moss-tts + | + |- 加载 first-class GGUF 模型 + |- 在图内完成多通道 embedding lookup + |- 在 llama.cpp 中执行 Qwen3 backbone + |- 对多头 logits 做采样 + |- 在 C++ 中完成 delay-pattern decode + v +raw.codes.bin + | + v +moss-tts-audio-decode.py + | + |- 用 ONNX 把 raw audio codes 解码成波形 + v +wav +``` + +## 临时产物 + +e2e 脚本会创建临时目录,并在流程结束后自动删除。 + +以下中间文件不会保留: + +- `generation.ref.bin` +- `raw.codes.bin` + +最终对外可见的产物只有你指定的输出 wav。 + +## 输出 + +成功结束时,脚本会打印: + +- `wav` — 输出路径 +- `wav_info` — 采样率、声道数、帧数和时长 + +## 文件结构 
+ +```text +llama.cpp/ +├── docs/ +│ ├── moss-tts-firstclass-e2e.md +│ └── moss-tts-firstclass-e2e_zh.md +├── tools/tts/ +│ ├── moss-tts-firstclass-e2e.py # 端到端 wrapper +│ ├── moss-tts-build-generation-ref.py # prompt / input 构建器 +│ ├── moss-tts-audio-decode.py # ONNX 音频解码 helper +│ └── moss-tts.cpp # llama-moss-tts 实现 +└── build/bin/ + └── llama-moss-tts +``` diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 1700ceefb..970f4657c 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -395,6 +395,8 @@ int main(int argc, char ** argv) { llama_batch batch_view = { n_tokens, batch.token + i, + 0, + nullptr, nullptr, batch.pos + i, batch.n_seq_id + i, diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 22c656996..c6c537ea8 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -665,7 +665,7 @@ static bool ggml_is_view_op(enum ggml_op op) { #endif #ifndef GGML_SCHED_MAX_SPLIT_INPUTS -#define GGML_SCHED_MAX_SPLIT_INPUTS 30 +#define GGML_SCHED_MAX_SPLIT_INPUTS 64 #endif #ifndef GGML_SCHED_MAX_COPIES diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index bf617382d..8ae6dd3b8 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -105,6 +105,15 @@ class LLM: CONTEXT_LENGTH = "{arch}.context_length" EMBEDDING_LENGTH = "{arch}.embedding_length" EMBEDDING_LENGTH_OUT = "{arch}.embedding_length_out" + N_VQ = "{arch}.n_vq" + AUDIO_VOCAB_SIZE = "{arch}.audio_vocab_size" + AUDIO_PAD_CODE = "{arch}.audio_pad_code" + AUDIO_START_TOKEN_ID = "{arch}.audio_start_token_id" + AUDIO_END_TOKEN_ID = "{arch}.audio_end_token_id" + AUDIO_USER_SLOT_TOKEN_ID = "{arch}.audio_user_slot_token_id" + AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID = "{arch}.audio_assistant_gen_slot_token_id" + AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID = "{arch}.audio_assistant_delay_slot_token_id" + SAMPLING_RATE = "{arch}.sampling_rate" FEATURES_LENGTH = "{arch}.features_length" BLOCK_COUNT = 
"{arch}.block_count" LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count" @@ -391,6 +400,7 @@ class MODEL_ARCH(IntEnum): QWEN2MOE = auto() QWEN2VL = auto() QWEN3 = auto() + MOSS_TTS_DELAY = auto() QWEN3MOE = auto() QWEN3NEXT = auto() QWEN3VL = auto() @@ -501,10 +511,12 @@ class VISION_PROJECTOR_TYPE(IntEnum): class MODEL_TENSOR(IntEnum): TOKEN_EMBD = auto() + TOKEN_EMBD_AUDIO = auto() # moss-tts-delay, indexed as token_embd_audio.{id} TOKEN_EMBD_NORM = auto() TOKEN_TYPES = auto() POS_EMBD = auto() OUTPUT = auto() + OUTPUT_AUDIO = auto() # moss-tts-delay, indexed as output_audio.{id} DENSE_2_OUT = auto() # embeddinggemma 2_Dense DENSE_3_OUT = auto() # embeddinggemma 3_Dense OUTPUT_NORM = auto() @@ -836,6 +848,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.QWEN2MOE: "qwen2moe", MODEL_ARCH.QWEN2VL: "qwen2vl", MODEL_ARCH.QWEN3: "qwen3", + MODEL_ARCH.MOSS_TTS_DELAY: "moss-tts-delay", MODEL_ARCH.QWEN3MOE: "qwen3moe", MODEL_ARCH.QWEN3NEXT: "qwen3next", MODEL_ARCH.QWEN3VL: "qwen3vl", @@ -944,11 +957,13 @@ class MODEL_TENSOR(IntEnum): TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.TOKEN_EMBD_AUDIO: "token_embd_audio", MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", MODEL_TENSOR.TOKEN_TYPES: "token_types", MODEL_TENSOR.POS_EMBD: "position_embd", MODEL_TENSOR.OUTPUT_NORM: "output_norm", MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.OUTPUT_AUDIO: "output_audio", MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 2_Dense MODEL_TENSOR.ROPE_FREQS: "rope_freqs", @@ -1791,6 +1806,25 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.MOSS_TTS_DELAY: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_AUDIO, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.OUTPUT_AUDIO, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + 
MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.QWEN3MOE: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 18131e540..253ed5fb6 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -37,6 +37,9 @@ class TensorNameMap: "model.transformer.wte", # llada "embed_tokens", # qwen3-embedding ), + MODEL_TENSOR.TOKEN_EMBD_AUDIO: ( + "token_embd_audio", # moss-tts-delay, indexed tensors emitted manually + ), # Token type embeddings MODEL_TENSOR.TOKEN_TYPES: ( @@ -79,6 +82,9 @@ class TensorNameMap: "model.transformer.ff_out", # llada "head.decoder", # modern-bert ), + MODEL_TENSOR.OUTPUT_AUDIO: ( + "output_audio", # moss-tts-delay, indexed tensors emitted manually + ), MODEL_TENSOR.DENSE_2_OUT: ( "dense_2_out", # embeddinggemma ), diff --git a/include/llama.h b/include/llama.h index c6e102abe..c79adbaf5 100644 --- a/include/llama.h +++ b/include/llama.h @@ -219,6 +219,8 @@ extern "C" { // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens // // - token : the token ids of the input (used when embd is NULL) + // - token_audio: optional auxiliary token channels, flattened as [n_tokens, n_token_audio] + // this is currently used by architectures with summed multi-channel embeddings // - embd : token embeddings (i.e. 
float vector of size n_embd) (used when token is NULL) // - pos : the positions of the respective token in the sequence // (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode) @@ -234,6 +236,8 @@ extern "C" { int32_t n_tokens; llama_token * token; + int32_t n_token_audio; + llama_token * token_audio; float * embd; llama_pos * pos; int32_t * n_seq_id; diff --git a/scripts/run-moss-tts-delay-8b-quality.sh b/scripts/run-moss-tts-delay-8b-quality.sh new file mode 100755 index 000000000..3c802ade6 --- /dev/null +++ b/scripts/run-moss-tts-delay-8b-quality.sh @@ -0,0 +1,195 @@ +#!/usr/bin/env bash +set -euo pipefail + +# End-to-end setup for MOSS-TTS-Delay 8B quality smoke test on a fresh machine. +# +# What it does: +# 1. Clones this llama.cpp fork and the official OpenMOSS/MOSS-TTS repo. +# 2. Creates/uses a conda env and installs the minimal Python stack. +# 3. Downloads the official GGUF backbone + embeddings/lm_heads/tokenizer. +# 4. Downloads the official ONNX audio tokenizer. +# 5. Builds llama-moss-tts and runs the C++ vs Python de-delay/raw-code parity test. +# 6. Runs the official Python llama_cpp backend to synthesize wavs for listening. +# +# Defaults target a CUDA machine. 
For CPU-only ONNX Runtime: +# ORT_PKG=onnxruntime USE_GPU_AUDIO=false bash run-moss-tts-delay-8b-quality.sh + +WORKDIR="${WORKDIR:-$HOME/moss-tts-delay-8b-eval}" +CONDA_ENV="${CONDA_ENV:-moss-tts-delay-8b}" +PYTHON_VERSION="${PYTHON_VERSION:-3.11}" + +LLAMA_CPP_REPO="${LLAMA_CPP_REPO:-https://github.com/expectqwq/llama.cpp.git}" +LLAMA_CPP_REF="${LLAMA_CPP_REF:-master}" +MOSS_TTS_REPO="${MOSS_TTS_REPO:-https://github.com/OpenMOSS/MOSS-TTS.git}" +MOSS_TTS_REF="${MOSS_TTS_REF:-main}" + +ORT_PKG="${ORT_PKG:-onnxruntime-gpu}" +USE_GPU_AUDIO="${USE_GPU_AUDIO:-true}" +N_JOBS="${N_JOBS:-$(nproc)}" + +TEXT_ZH="${TEXT_ZH:-今天天气很好,我们来测试一下 MOSS-TTS Delay 8B 的音质和稳定性。}" +TEXT_EN="${TEXT_EN:-Hello, this is a quality smoke test for the MOSS-TTS Delay 8B pipeline running with llama.cpp and the ONNX audio tokenizer.}" +REFERENCE_AUDIO="${REFERENCE_AUDIO:-}" + +HF_MODEL_REPO="${HF_MODEL_REPO:-OpenMOSS-Team/MOSS-TTS-GGUF}" +HF_AUDIO_REPO="${HF_AUDIO_REPO:-OpenMOSS-Team/MOSS-Audio-Tokenizer-ONNX}" + +LLAMA_CPP_DIR="$WORKDIR/llama.cpp" +MOSS_TTS_DIR="$WORKDIR/MOSS-TTS" +WEIGHTS_DIR="$WORKDIR/weights" +GGUF_DIR="$WEIGHTS_DIR/MOSS-TTS-GGUF" +AUDIO_ORT_DIR="$WEIGHTS_DIR/MOSS-Audio-Tokenizer-ONNX" +OUT_DIR="$WORKDIR/out" +CONFIG_PATH="$WORKDIR/moss_delay_8b_eval.yaml" + +mkdir -p "$WORKDIR" "$WEIGHTS_DIR" "$OUT_DIR" + +need_cmd() { + command -v "$1" >/dev/null 2>&1 || { + echo "error: required command not found: $1" >&2 + exit 1 + } +} + +git_clone_or_update() { + local repo_url="$1" + local repo_dir="$2" + local repo_ref="$3" + + if [[ ! -d "$repo_dir/.git" ]]; then + git clone "$repo_url" "$repo_dir" + fi + + git -C "$repo_dir" fetch --all --tags + git -C "$repo_dir" checkout "$repo_ref" + git -C "$repo_dir" pull --ff-only || true +} + +need_cmd git +need_cmd cmake +need_cmd conda + +source "$(conda info --base)/etc/profile.d/conda.sh" + +if ! 
conda env list | awk '{print $1}' | grep -qx "$CONDA_ENV"; then + conda create -y -n "$CONDA_ENV" "python=$PYTHON_VERSION" +fi +conda activate "$CONDA_ENV" + +python -m pip install --upgrade pip setuptools wheel +python -m pip install --upgrade "huggingface_hub[cli]>=0.30" + +git_clone_or_update "$LLAMA_CPP_REPO" "$LLAMA_CPP_DIR" "$LLAMA_CPP_REF" +git_clone_or_update "$MOSS_TTS_REPO" "$MOSS_TTS_DIR" "$MOSS_TTS_REF" +git -C "$MOSS_TTS_DIR" submodule update --init --recursive + +if [[ "$ORT_PKG" == "onnxruntime-gpu" ]]; then + python -m pip install -e "${MOSS_TTS_DIR}[llama-cpp-onnx]" +else + python -m pip install -e "${MOSS_TTS_DIR}[llama-cpp]" + python -m pip install --upgrade "${ORT_PKG}>=1.19" +fi + +huggingface-cli download "$HF_MODEL_REPO" --local-dir "$GGUF_DIR" +huggingface-cli download "$HF_AUDIO_REPO" --local-dir "$AUDIO_ORT_DIR" + +if [[ -z "$REFERENCE_AUDIO" ]]; then + REFERENCE_AUDIO="$MOSS_TTS_DIR/assets/audio/reference_zh.wav" +fi + +if [[ ! -f "$GGUF_DIR/MOSS_TTS_Q4_K_M.gguf" ]]; then + echo "error: expected backbone file missing: $GGUF_DIR/MOSS_TTS_Q4_K_M.gguf" >&2 + exit 1 +fi + +if [[ ! -f "$AUDIO_ORT_DIR/encoder.onnx" || ! -f "$AUDIO_ORT_DIR/decoder.onnx" ]]; then + echo "error: expected ONNX audio tokenizer files missing in $AUDIO_ORT_DIR" >&2 + exit 1 +fi + +cat > "$CONFIG_PATH" </dev/null 2>&1 || { + echo "error: missing command: $1" >&2 + exit 1 + } +} + +clone_or_update() { + local repo_url="$1" + local repo_dir="$2" + local repo_ref="$3" + if [[ ! -d "$repo_dir/.git" ]]; then + git clone "$repo_url" "$repo_dir" + fi + git -C "$repo_dir" fetch --all --tags + git -C "$repo_dir" checkout "$repo_ref" + git -C "$repo_dir" pull --ff-only || true +} + +need_cmd git +need_cmd cmake +need_cmd conda + +export PS1="${PS1:-}" +source "$(conda info --base)/etc/profile.d/conda.sh" + +if ! 
conda env list | awk '{print $1}' | grep -qx "$CONDA_ENV"; then + conda create -y -n "$CONDA_ENV" "python=$PYTHON_VERSION" +fi +conda activate "$CONDA_ENV" + +python -m pip install --upgrade pip setuptools wheel +python -m pip install --upgrade "huggingface_hub[cli]>=0.30" + +clone_or_update "https://github.com/expectqwq/llama.cpp.git" "$LLAMA_CPP_DIR" master +clone_or_update "https://github.com/OpenMOSS/MOSS-TTS.git" "$MOSS_TTS_DIR" main +git -C "$MOSS_TTS_DIR" submodule update --init --recursive + +if [[ "$ORT_PKG" == "onnxruntime-gpu" ]]; then + python -m pip install -e "${MOSS_TTS_DIR}[llama-cpp-onnx]" +else + python -m pip install -e "${MOSS_TTS_DIR}[llama-cpp]" + python -m pip install --upgrade "${ORT_PKG}>=1.19" +fi + +huggingface-cli download "$HF_MODEL_ID" --local-dir "$HF_MODEL_DIR" +huggingface-cli download "$HF_AUDIO_REPO" --local-dir "$ONNX_DIR" + +cmake -S "$LLAMA_CPP_DIR" -B "$LLAMA_CPP_DIR/build" +cmake --build "$LLAMA_CPP_DIR/build" --target llama-moss-tts llama-quantize -j"$N_JOBS" + +bash "$MOSS_TTS_DIR/moss_tts_delay/llama_cpp/build_bridge.sh" "$LLAMA_CPP_DIR" + +python "$MOSS_TTS_DIR/moss_tts_delay/llama_cpp/conversion/extract_weights.py" \ + --model "$HF_MODEL_DIR" \ + --output "$EXTRACT_DIR" + +python "$LLAMA_CPP_DIR/convert_hf_to_gguf.py" \ + "$EXTRACT_DIR/qwen3_backbone" \ + --outfile "$BACKBONE_GGUF" \ + --outtype f16 + +python "$LLAMA_CPP_DIR/convert_hf_to_gguf.py" \ + "$HF_MODEL_DIR" \ + --outfile "$FIRSTCLASS_GGUF" \ + --outtype f16 + +cat > "$PY_CONFIG" <&2 + exit 1 +fi + +if [[ ! 
-d "${HF_DIR}" ]]; then + echo "error: tiny HF fixture not found: ${HF_DIR}" >&2 + exit 1 +fi + +source "${CONDA_SH}" +conda activate "${CONDA_ENV_NAME}" + +echo "[1/4] building parity test target" +cmake --build "${BUILD_DIR}" --target test-moss-tts-delay-forward -j2 + +echo "[2/4] converting tiny HF fixture to F32 GGUF" +python "${LLAMA_DIR}/convert_hf_to_gguf.py" \ + "${HF_DIR}" \ + --outfile "${GGUF_PATH}" \ + --outtype f32 + +echo "[3/4] exporting PyTorch reference" +python "${LLAMA_DIR}/tests/moss_tts_delay_export_ref.py" \ + "${HF_DIR}" \ + "${REF_PATH}" + +echo "[4/4] running forward parity" +"${TEST_BIN}" "${GGUF_PATH}" "${REF_PATH}" + +echo "PASS: moss-tts-delay forward parity verified" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 283823fa9..06e6e23ed 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -100,6 +100,7 @@ add_library(llama models/minicpm3.cpp models/minimax-m2.cpp models/mistral3.cpp + models/moss-tts-delay.cpp models/modern-bert.cpp models/mpt.cpp models/nemotron-h.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 799d16167..07800c68a 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -35,6 +35,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_QWEN2MOE, "qwen2moe" }, { LLM_ARCH_QWEN2VL, "qwen2vl" }, { LLM_ARCH_QWEN3, "qwen3" }, + { LLM_ARCH_MOSS_TTS_DELAY, "moss-tts-delay" }, { LLM_ARCH_QWEN3MOE, "qwen3moe" }, { LLM_ARCH_QWEN3NEXT, "qwen3next" }, { LLM_ARCH_QWEN3VL, "qwen3vl" }, @@ -278,6 +279,15 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" }, { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" }, + { LLM_KV_N_VQ, "%s.n_vq" }, + { LLM_KV_AUDIO_VOCAB_SIZE, "%s.audio_vocab_size" }, + { LLM_KV_AUDIO_PAD_CODE, "%s.audio_pad_code" }, + { LLM_KV_AUDIO_START_TOKEN_ID, "%s.audio_start_token_id" }, + { LLM_KV_AUDIO_END_TOKEN_ID, "%s.audio_end_token_id" }, + { LLM_KV_AUDIO_USER_SLOT_TOKEN_ID, "%s.audio_user_slot_token_id" }, + { 
LLM_KV_AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID, "%s.audio_assistant_gen_slot_token_id" }, + { LLM_KV_AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID, "%s.audio_assistant_delay_slot_token_id" }, + { LLM_KV_SAMPLING_RATE, "%s.sampling_rate" }, // sentence-transformers dense modules feature dims { LLM_KV_DENSE_2_FEAT_IN, "%s.dense_2_feat_in" }, { LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" }, @@ -335,9 +345,11 @@ static const std::map LLM_KV_NAMES = { static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_AUDIO, "token_embd_audio.%d" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT_NORM_LFM2, "token_embd_norm" }, // fix for wrong tensor name { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_OUTPUT_AUDIO, "output_audio.%d" }, { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, @@ -971,6 +983,25 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_UP, }; + case LLM_ARCH_MOSS_TTS_DELAY: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_TOKEN_EMBD_AUDIO, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_OUTPUT_AUDIO, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; case LLM_ARCH_QWEN3MOE: case LLM_ARCH_QWEN3VLMOE: case LLM_ARCH_OLMOE: @@ -2560,10 +2591,12 @@ static std::set llm_get_tensor_names(llm_arch arch) { // static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_TOKEN_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}}, + {LLM_TENSOR_TOKEN_EMBD_AUDIO, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}}, {LLM_TENSOR_POS_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}}, {LLM_TENSOR_TOKEN_TYPES, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}}, {LLM_TENSOR_TOKEN_EMBD_NORM, {LLM_TENSOR_LAYER_INPUT, 
GGML_OP_MUL}}, {LLM_TENSOR_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_OUTPUT_AUDIO, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CLS_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, @@ -2790,7 +2823,16 @@ std::string LLM_TN_IMPL::str() const { return LLM_TENSOR_NAMES.at(tensor); } - std::string name = ::format(LLM_TENSOR_NAMES.at(tensor), bid, xid); + std::string name; + switch (tensor) { + case LLM_TENSOR_TOKEN_EMBD_AUDIO: + case LLM_TENSOR_OUTPUT_AUDIO: + name = ::format(LLM_TENSOR_NAMES.at(tensor), xid); + break; + default: + name = ::format(LLM_TENSOR_NAMES.at(tensor), bid, xid); + break; + } if (suffix != nullptr) { name += "."; name += suffix; diff --git a/src/llama-arch.h b/src/llama-arch.h index b1b1dcf18..9320b01da 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -39,6 +39,7 @@ enum llm_arch { LLM_ARCH_QWEN2MOE, LLM_ARCH_QWEN2VL, LLM_ARCH_QWEN3, + LLM_ARCH_MOSS_TTS_DELAY, LLM_ARCH_QWEN3MOE, LLM_ARCH_QWEN3NEXT, LLM_ARCH_QWEN3VL, @@ -321,6 +322,16 @@ enum llm_kv { LLM_KV_SHORTCONV_L_CACHE, + LLM_KV_N_VQ, + LLM_KV_AUDIO_VOCAB_SIZE, + LLM_KV_AUDIO_PAD_CODE, + LLM_KV_AUDIO_START_TOKEN_ID, + LLM_KV_AUDIO_END_TOKEN_ID, + LLM_KV_AUDIO_USER_SLOT_TOKEN_ID, + LLM_KV_AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID, + LLM_KV_AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID, + LLM_KV_SAMPLING_RATE, + LLM_KV_XIELU_ALPHA_N, LLM_KV_XIELU_ALPHA_P, LLM_KV_XIELU_BETA, @@ -340,12 +351,14 @@ enum llm_kv { enum llm_tensor { LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_TOKEN_EMBD_AUDIO, LLM_TENSOR_TOKEN_EMBD_NORM, LLM_TENSOR_TOKEN_TYPES, LLM_TENSOR_POS_EMBD, LLM_TENSOR_DENSE_2_OUT, LLM_TENSOR_DENSE_3_OUT, LLM_TENSOR_OUTPUT, + LLM_TENSOR_OUTPUT_AUDIO, LLM_TENSOR_OUTPUT_NORM, LLM_TENSOR_OUTPUT_NORM_LFM2, // fix for wrong tensor name LLM_TENSOR_ROPE_FREQS, diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index 6bf76939c..ecf4f9263 100644 --- 
a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -55,6 +55,21 @@ bool llama_batch_allocr::init( } } + if ((batch.token_audio == nullptr) != (batch.n_token_audio == 0)) { + LLAMA_LOG_ERROR("%s: token_audio and n_token_audio must either both be set or both be empty\n", __func__); + return false; + } + + if (batch.token_audio && !batch.token) { + LLAMA_LOG_ERROR("%s: token_audio currently requires token inputs to also be provided\n", __func__); + return false; + } + + if (batch.token_audio && batch.embd) { + LLAMA_LOG_ERROR("%s: token_audio is not supported together with embd inputs\n", __func__); + return false; + } + if (batch.seq_id) { for (int32_t i = 0; i < batch.n_tokens; ++i) { for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) { @@ -217,6 +232,8 @@ bool llama_batch_allocr::init( /*.n_seqs_unq =*/ (uint32_t) this->seq_id_unq.size(), /*.n_pos =*/ n_pos_per_embd, /*.token =*/ batch.token, + /*.n_token_audio=*/ (uint32_t) batch.n_token_audio, + /*.token_audio =*/ batch.token_audio, /*.embd =*/ batch.embd, /*.pos =*/ batch.pos, /*.n_seq_id =*/ batch.n_seq_id, @@ -399,6 +416,7 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t auto udata = std::make_shared(); udata->token .resize(n_tokens); + udata->token_audio.clear(); udata->embd .clear(); udata->pos .resize(n_pos_all); udata->n_seq_id .resize(n_tokens); @@ -421,6 +439,8 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t /*.n_pos =*/ n_pos_per_embd, /*.token =*/ udata->token.data(), + /*.n_token_audio=*/ 0, + /*.token_audio =*/ nullptr, /*.embd =*/ nullptr, /*.pos =*/ udata->pos.data(), /*.n_seq_id =*/ udata->n_seq_id.data(), @@ -687,8 +707,10 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector & idxs, u const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0; const int64_t n_pos_all = (int64_t) n_tokens*n_pos_per_embd; + const int64_t n_token_audio_all = batch.token_audio ? 
(int64_t) n_tokens*batch.n_token_audio : 0; udata->token .resize(n_tokens); + udata->token_audio.resize(n_token_audio_all); udata->embd .resize(n_embd_all); udata->pos .resize(n_pos_all); udata->n_seq_id .resize(n_tokens); @@ -706,6 +728,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector & idxs, u udata->token[i] = batch.token[idxs[i]]; } + if (batch.token_audio) { + memcpy( + udata->token_audio.data() + i*batch.n_token_audio, + batch.token_audio + (int64_t) idxs[i]*batch.n_token_audio, + batch.n_token_audio*sizeof(llama_token)); + } + if (batch.embd) { memcpy(udata->embd.data() + i*n_embd, batch.embd + (int64_t) idxs[i]*n_embd, n_embd*sizeof(float)); } @@ -756,6 +785,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector & idxs, u /*.n_pos =*/ n_pos_per_embd, /*.token =*/ batch.token ? udata->token.data() : nullptr, + /*.n_token_audio=*/ (uint32_t) batch.n_token_audio, + /*.token_audio =*/ batch.token_audio ? udata->token_audio.data() : nullptr, /*.embd =*/ batch.embd ? 
udata->embd.data() : nullptr, /*.pos =*/ udata->pos.data(), /*.n_seq_id =*/ udata->n_seq_id.data(), @@ -805,6 +836,8 @@ void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) { ss_seq_idx << "]"; LLAMA_LOG_DEBUG("%s: token = %p\n", __func__, (void *) ubatch.token); + LLAMA_LOG_DEBUG("%s: n_token_audio = %u\n", __func__, ubatch.n_token_audio); + LLAMA_LOG_DEBUG("%s: token_audio = %p\n", __func__, (void *) ubatch.token_audio); LLAMA_LOG_DEBUG("%s: embd = %p\n", __func__, (void *) ubatch.embd); LLAMA_LOG_DEBUG("%s: pos = %p\n", __func__, (void *) ubatch.pos); LLAMA_LOG_DEBUG("%s: n_seq_id = %p\n", __func__, (void *) ubatch.n_seq_id); @@ -866,6 +899,8 @@ struct llama_batch llama_batch_get_one( return { /*n_tokens =*/ n_tokens, /*tokens =*/ tokens, + /*n_token_audio =*/ 0, + /*token_audio =*/ nullptr, /*embd =*/ nullptr, /*pos =*/ nullptr, /*n_seq_id =*/ nullptr, @@ -878,6 +913,8 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ llama_batch batch = { /*n_tokens =*/ 0, /*tokens =*/ nullptr, + /*n_token_audio =*/ 0, + /*token_audio =*/ nullptr, /*embd =*/ nullptr, /*pos =*/ nullptr, /*n_seq_id =*/ nullptr, @@ -906,6 +943,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ void llama_batch_free(struct llama_batch batch) { if (batch.token) free(batch.token); + if (batch.token_audio) free(batch.token_audio); if (batch.embd) free(batch.embd); if (batch.pos) free(batch.pos); if (batch.n_seq_id) free(batch.n_seq_id); diff --git a/src/llama-batch.h b/src/llama-batch.h index 8e6fac0ef..7f9205476 100644 --- a/src/llama-batch.h +++ b/src/llama-batch.h @@ -43,6 +43,8 @@ struct llama_ubatch { // // size | idx | val llama_token * token; // [n_tokens] | i | id, token + uint32_t n_token_audio;// architecture-specific auxiliary token channels per token + llama_token * token_audio; // [n_tokens * n_token_audio] | i * n_token_audio + c | id, audio token float * embd; // [n_embd, n_tokens] | i | embd 
llama_pos * pos; // [n_tokens*n_pos] | i | pos int32_t * n_seq_id; // [n_tokens] | i | - @@ -53,6 +55,7 @@ struct llama_ubatch { struct data_t { std::vector token; + std::vector token_audio; std::vector embd; std::vector pos; std::vector n_seq_id; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 1f7a52d78..915380b26 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -815,7 +815,7 @@ float * llama_context::get_logits_ith(int32_t i) { } const int64_t j = output_resolve_row(i); - return logits.data + j*model.vocab.n_tokens(); + return logits.data + j*logits_stride; } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG @@ -1245,8 +1245,8 @@ int llama_context::encode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; - const int64_t n_embd = hparams.n_embd_inp(); - const int64_t n_vocab = model.vocab.n_tokens(); + const int64_t n_embd = hparams.n_embd_inp(); + const int64_t n_logits = model.n_logits(); // note: during encode, we always pass the full sequence starting from pos = 0 if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, cparams.kv_unified ? 
LLAMA_MAX_SEQ : cparams.n_seq_max, true)) { @@ -1316,7 +1316,7 @@ int llama_context::encode(const llama_batch & batch_inp) { GGML_ASSERT(backend_res != nullptr); GGML_ASSERT(logits.data != nullptr); - ggml_backend_tensor_get_async(backend_res, t_logits, logits.data, 0, n_tokens*n_vocab*sizeof(float)); + ggml_backend_tensor_get_async(backend_res, t_logits, logits.data, 0, n_tokens*n_logits*sizeof(float)); } // extract embeddings @@ -1732,12 +1732,13 @@ int llama_context::decode(const llama_batch & batch_inp) { GGML_ASSERT(backend_res != nullptr); GGML_ASSERT(logits.data != nullptr); - float * logits_out = logits.data + n_outputs_prev*n_vocab; + const int64_t n_logits = model.n_logits(); + float * logits_out = logits.data + n_outputs_prev*n_logits; if (n_outputs) { GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); - GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits.size); - ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); + GGML_ASSERT((n_outputs_prev + n_outputs)*n_logits <= (int64_t) logits.size); + ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_logits*sizeof(float)); } } @@ -1881,6 +1882,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { const auto n_batch = cparams.n_batch; const auto n_vocab = vocab.n_tokens(); + const auto n_logits = model.n_logits(); const auto n_embd_out = hparams.n_embd_out(); bool has_logits = true; @@ -1896,7 +1898,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { size_t backend_float_count = 0; size_t backend_token_count = 0; - logits.size = has_logits ? n_vocab*n_outputs_max : 0; + logits_stride = has_logits ? n_logits : 0; + logits.size = has_logits ? n_logits*n_outputs_max : 0; embd.size = has_embd ? n_embd_out*n_outputs_max : 0; // Allocate backend sampling output buffers if there are backend samplers configured. 
@@ -2002,16 +2005,17 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { } void llama_context::output_reorder() { - const uint64_t n_vocab = model.vocab.n_tokens(); - const uint64_t n_embd = model.hparams.n_embd; + const uint64_t n_logits = logits_stride; + const uint64_t n_vocab = model.vocab.n_tokens(); + const uint64_t n_embd = model.hparams.n_embd; for (size_t s = 0; s < output_swaps.size(); ++s) { const uint64_t i0 = output_swaps[s].i0; const uint64_t i1 = output_swaps[s].i1; if (logits.size > 0) { - for (uint64_t k = 0; k < n_vocab; k++) { - std::swap(logits.data[i0*n_vocab + k], logits.data[i1*n_vocab + k]); + for (uint64_t k = 0; k < n_logits; k++) { + std::swap(logits.data[i0*n_logits + k], logits.data[i1*n_logits + k]); } } diff --git a/src/llama-context.h b/src/llama-context.h index e0d0085c1..49c39f023 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -265,8 +265,14 @@ struct llama_context { std::unique_ptr memory; - // decode output (2-dimensional array: [n_outputs][n_vocab]) + // decode output (2-dimensional row-major array: [n_outputs][logits_stride]) + // logits_stride is the number of float entries per output row in `logits`. + // For standard text models, logits_stride == n_vocab. Architectures with + // concatenated multi-head logits (for example MOSS-TTS-Delay) can set + // logits_stride > n_vocab, so callers must always stride with + // llama_get_logits_ith() / logits_stride rather than assuming n_vocab. 
buffer_view logits = {nullptr, 0}; + uint32_t logits_stride = 0; // embeddings output (2-dimensional array: [n_outputs][n_embd]) // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 9a215bb77..77735daad 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1134,8 +1134,8 @@ ggml_tensor * llm_graph_context::build_ffn( if (down) { cur = build_lora_mm(down, cur); - if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) { - // GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators + if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2 || arch == LLM_ARCH_MOSS_TTS_DELAY) { + // GLM4, GLM4_MOE, JAIS2, and MOSS-TTS-Delay FFN down-projections can overflow with half-precision accumulators ggml_mul_mat_set_prec(cur, GGML_PREC_F32); } } diff --git a/src/llama-graph.h b/src/llama-graph.h index 4855685ef..a1362cc5a 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -567,9 +567,14 @@ struct llm_graph_params { ubatch.n_seq_tokens == other.ubatch.n_seq_tokens && ubatch.n_seqs == other.ubatch.n_seqs && ubatch.n_seqs_unq == other.ubatch.n_seqs_unq && + ubatch.n_token_audio == other.ubatch.n_token_audio && ( (!ubatch.token && !other.ubatch.token) || (!ubatch.embd && !other.ubatch.embd) + ) && + ( + (!ubatch.token_audio && !other.ubatch.token_audio) || + (ubatch.token_audio && other.ubatch.token_audio) ); // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 78c0bc27d..0a1c76965 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -68,6 +68,18 @@ struct llama_hparams { uint32_t n_shortconv_l_cache = 0; + // MOSS-TTS-Delay + uint32_t n_vq = 0; + uint32_t audio_vocab_size = 0; + uint32_t audio_pad_code = 0; + uint32_t sampling_rate = 0; + + uint32_t audio_start_token_id = 0; + uint32_t 
audio_end_token_id = 0; + uint32_t audio_user_slot_token_id = 0; + uint32_t audio_assistant_gen_slot_token_id = 0; + uint32_t audio_assistant_delay_slot_token_id = 0; + std::array n_head_arr; std::array n_head_kv_arr; std::array n_ff_arr; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e8e1bbf1c..f7b4bd12f 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -995,6 +995,29 @@ void llama_model::load_hparams(llama_model_loader & ml) { } break; case LLM_ARCH_QWEN3: { + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { + case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break; + case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break; + case 40: type = LLM_TYPE_14B; break; + case 64: type = LLM_TYPE_32B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; + case LLM_ARCH_MOSS_TTS_DELAY: + { + ml.get_key(LLM_KV_N_VQ, hparams.n_vq); + ml.get_key(LLM_KV_AUDIO_VOCAB_SIZE, hparams.audio_vocab_size); + ml.get_key(LLM_KV_AUDIO_PAD_CODE, hparams.audio_pad_code); + + ml.get_key(LLM_KV_AUDIO_START_TOKEN_ID, hparams.audio_start_token_id, false); + ml.get_key(LLM_KV_AUDIO_END_TOKEN_ID, hparams.audio_end_token_id, false); + ml.get_key(LLM_KV_AUDIO_USER_SLOT_TOKEN_ID, hparams.audio_user_slot_token_id, false); + ml.get_key(LLM_KV_AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID, hparams.audio_assistant_gen_slot_token_id, false); + ml.get_key(LLM_KV_AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID, hparams.audio_assistant_delay_slot_token_id, false); + ml.get_key(LLM_KV_SAMPLING_RATE, hparams.sampling_rate, false); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { @@ -3640,6 +3663,53 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output rerank head cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), 
{n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED); + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } + } break; + case LLM_ARCH_MOSS_TTS_DELAY: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + const int64_t n_audio_vocab = + hparams.audio_vocab_size > 0 ? 
std::max(hparams.audio_vocab_size + 1, hparams.audio_pad_code + 1) : 0; + + if (hparams.n_vq == 0) { + throw std::runtime_error("n_vq must be > 0 for MOSS_TTS_DELAY"); + } + if (n_audio_vocab == 0) { + throw std::runtime_error("audio_vocab_size must be > 0 for MOSS_TTS_DELAY"); + } + + tok_embd_audio.resize(hparams.n_vq); + output_audio.resize(hparams.n_vq); + + for (uint32_t i = 0; i < hparams.n_vq; ++i) { + tok_embd_audio[i] = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_AUDIO, "weight", -1, i), {n_embd, n_audio_vocab}, 0); + } + + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + + for (uint32_t i = 0; i < hparams.n_vq; ++i) { + output_audio[i] = create_tensor(tn(LLM_TENSOR_OUTPUT_AUDIO, "weight", -1, i), {n_embd, n_audio_vocab}, 0); + } + for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; @@ -7683,6 +7753,15 @@ size_t llama_model::n_devices() const { return devices.size(); } +uint32_t llama_model::n_logits() const { + if (arch == LLM_ARCH_MOSS_TTS_DELAY) { + // keep in sync with the audio head row count used in load_tensors: each of the + // n_vq concatenated audio heads emits max(audio_vocab_size + 1, audio_pad_code + 1) logits + return vocab.n_tokens() + hparams.n_vq * std::max(hparams.audio_vocab_size + 1, hparams.audio_pad_code + 1); + } + + return vocab.n_tokens(); +} + uint32_t llama_model::n_gpu_layers() const { return params.n_gpu_layers >= 0 ? 
params.n_gpu_layers : hparams.n_layer + 1; } @@ -7869,6 +7947,13 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); } + if (arch == LLM_ARCH_MOSS_TTS_DELAY) { + LLAMA_LOG_INFO("%s: n_vq = %u\n", __func__, hparams.n_vq); + LLAMA_LOG_INFO("%s: audio_vocab_size = %u\n", __func__, hparams.audio_vocab_size); + LLAMA_LOG_INFO("%s: audio_pad_code = %u\n", __func__, hparams.audio_pad_code); + LLAMA_LOG_INFO("%s: sampling_rate = %u\n", __func__, hparams.sampling_rate); + } + if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE || @@ -8277,6 +8362,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_MOSS_TTS_DELAY: + { + llm = std::make_unique(*this, params); + } break; case LLM_ARCH_QWEN3MOE: { llm = std::make_unique(*this, params); @@ -8859,6 +8948,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_DREAM: case LLM_ARCH_QWEN2MOE: case LLM_ARCH_QWEN3: + case LLM_ARCH_MOSS_TTS_DELAY: case LLM_ARCH_QWEN3MOE: case LLM_ARCH_LLADA_MOE: case LLM_ARCH_RND1: diff --git a/src/llama-model.h b/src/llama-model.h index 25bf892e7..1dfbab09c 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -483,6 +483,7 @@ struct llama_model { std::vector classifier_labels; struct ggml_tensor * tok_embd = nullptr; + std::vector tok_embd_audio; struct ggml_tensor * type_embd = nullptr; struct ggml_tensor * pos_embd = nullptr; struct ggml_tensor * tok_norm = nullptr; @@ -491,6 +492,7 @@ struct llama_model { struct ggml_tensor * output_norm = nullptr; struct ggml_tensor * output_norm_b = nullptr; struct ggml_tensor * output = nullptr; + std::vector output_audio; struct ggml_tensor * output_b = nullptr; struct ggml_tensor * output_norm_enc = nullptr; @@ -552,6 +554,7 @@ struct llama_model { size_t size() const; // file size size_t n_tensors() const; size_t n_devices() const; + 
uint32_t n_logits() const; uint32_t n_gpu_layers() const; llama_split_mode split_mode() const; diff --git a/src/models/models.h b/src/models/models.h index a86b2b1eb..3f21b0102 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -421,6 +421,10 @@ struct llm_build_mistral3 : public llm_graph_context { llm_build_mistral3(const llama_model & model, const llm_graph_params & params); }; +struct llm_build_moss_tts_delay : public llm_graph_context { + llm_build_moss_tts_delay(const llama_model & model, const llm_graph_params & params); +}; + struct llm_build_modern_bert : public llm_graph_context { llm_build_modern_bert(const llama_model & model, const llm_graph_params & params); }; diff --git a/src/models/moss-tts-delay.cpp b/src/models/moss-tts-delay.cpp new file mode 100644 index 000000000..8212ba059 --- /dev/null +++ b/src/models/moss-tts-delay.cpp @@ -0,0 +1,184 @@ +#include "models.h" + +namespace { + +class llm_graph_input_moss_audio_channel : public llm_graph_input_i { +public: + llm_graph_input_moss_audio_channel(uint32_t channel, uint32_t n_channels) + : channel(channel), n_channels(n_channels) {} + + void set_input(const llama_ubatch * ubatch) override { + GGML_ASSERT(tokens != nullptr); + + // assign() zero-fills every slot; resize() only initializes newly added elements, + // so a reused graph switching to a ubatch without token_audio would upload stale audio ids + data.assign(ubatch->n_tokens, 0); + if (ubatch->token_audio != nullptr) { + GGML_ASSERT(ubatch->n_token_audio == n_channels); + + for (uint32_t i = 0; i < ubatch->n_tokens; ++i) { + data[i] = ubatch->token_audio[(size_t) i*n_channels + channel]; + } + } + + ggml_backend_tensor_set(tokens, data.data(), 0, data.size()*ggml_element_size(tokens)); + } + + bool can_reuse(const llm_graph_params & params) override { + return + tokens != nullptr && + tokens->ne[0] == params.ubatch.n_tokens && + ( + (params.ubatch.n_token_audio == n_channels && params.ubatch.token_audio != nullptr) || + (params.ubatch.n_token_audio == 0 && params.ubatch.token_audio == nullptr) + ); + } + + ggml_tensor * tokens = nullptr; + +private: + const uint32_t channel; + const uint32_t n_channels; + 
std::vector data; +}; + +} + +llm_build_moss_tts_delay::llm_build_moss_tts_delay(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v(); + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); + GGML_ASSERT(hparams.n_vq == model.tok_embd_audio.size()); + + ggml_tensor * cur; + ggml_tensor * inpL = build_inp_embd(model.tok_embd); + + GGML_ASSERT(ubatch.token != nullptr); + GGML_ASSERT( + (ubatch.token_audio != nullptr && ubatch.n_token_audio == hparams.n_vq) || + (ubatch.token_audio == nullptr && ubatch.n_token_audio == 0)); + + for (uint32_t i = 0; i < hparams.n_vq; ++i) { + auto inp_audio = std::make_unique(i, hparams.n_vq); + inp_audio->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + cb(inp_audio->tokens, "inp_audio_tokens", i); + ggml_set_input(inp_audio->tokens); + + ggml_tensor * audio_embd = ggml_get_rows(ctx0, model.tok_embd_audio[i], inp_audio->tokens); + cb(audio_embd, "audio_embd", i); + + inpL = ggml_add(ctx0, inpL, audio_embd); + cb(inpL, "input_sum", i); + + res->add_input(std::move(inp_audio)); + } + + ggml_tensor * inp_pos = build_inp_pos(); + auto * inp_attn = build_attn_inp_kv(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + cur = build_norm(inpL, + model.layers[il].attn_norm, nullptr, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + { + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur 
= build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); + cb(Kcur, "Kcur_normed", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, nullptr, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + inpL = cur; + } + + cur = build_norm(inpL, + model.output_norm, nullptr, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + GGML_ASSERT(hparams.n_vq == model.output_audio.size()); + + ggml_tensor * logits = build_lora_mm(model.output, cur); + cb(logits, "result_output_text", -1); + + for (uint32_t i = 0; i < hparams.n_vq; ++i) { + ggml_tensor * audio_logits = build_lora_mm(model.output_audio[i], cur); + cb(audio_logits, "result_output_audio", i); + + logits = ggml_concat(ctx0, logits, 
audio_logits, 0); + cb(logits, "result_output_concat", i); + } + + logits = ggml_cont(ctx0, logits); + cb(logits, "result_output_cont", -1); + + res->t_logits = logits; + cb(logits, "result_output", -1); + + ggml_build_forward_expand(gf, logits); +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9582164b5..6160ef8af 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -165,6 +165,8 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS) endif() llama_build(test-gbnf-validator.cpp) + llama_build(test-moss-tts-delay-load.cpp) + llama_build(test-moss-tts-delay-forward.cpp) # build test-tokenizer-1-bpe target once and add many tests llama_build(test-tokenizer-1-bpe.cpp) diff --git a/tests/moss_tts_delay_export_decode_ref.py b/tests/moss_tts_delay_export_decode_ref.py new file mode 100755 index 000000000..1b33f8ff8 --- /dev/null +++ b/tests/moss_tts_delay_export_decode_ref.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import struct +import sys +from pathlib import Path + +import numpy as np + +WORKROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(WORKROOT / "MOSS-TTS")) + +from moss_tts_delay.llama_cpp.delay_state import apply_delay_pattern, extract_audio_segments # noqa: E402 + + +REF_MAGIC = 0x4652444D # "MDRF" +REF_VERSION = 1 + + +def main() -> int: + if len(sys.argv) != 2: + sys.stderr.write(f"usage: {sys.argv[0]} \n") + return 1 + + out_path = Path(sys.argv[1]) + + n_vq = 32 + audio_pad_code = 1024 + prompt_frames = 3 + + ref_prompt = np.full((prompt_frames, n_vq), audio_pad_code, dtype=np.int64) + ref_prompt[1, 0] = 77 + ref_prompt[2, :2] = [88, 66] + + raw_a = np.stack([np.arange(10, 10 + n_vq), np.arange(110, 110 + n_vq)], axis=0).astype(np.int64) + raw_b = np.stack([np.arange(210, 210 + n_vq)], axis=0).astype(np.int64) + + delayed_a = apply_delay_pattern(raw_a, audio_pad_code) + delayed_b = apply_delay_pattern(raw_b, audio_pad_code) + + packed_rows: list[np.ndarray] = [] + for t in 
range(prompt_frames): + row = np.full(1 + n_vq, audio_pad_code, dtype=np.int64) + row[0] = 100 + t + row[1:] = ref_prompt[t] + packed_rows.append(row) + + def append_delayed(text_token: int, delayed: np.ndarray) -> None: + for frame in delayed: + row = np.full(1 + n_vq, audio_pad_code, dtype=np.int64) + row[0] = text_token + row[1:] = frame + packed_rows.append(row) + + append_delayed(200, delayed_a) + + gap = np.full(1 + n_vq, audio_pad_code, dtype=np.int64) + gap[0] = 201 + packed_rows.append(gap) + + append_delayed(202, delayed_b) + + packed = np.stack(packed_rows, axis=0) + generation_audio = packed[prompt_frames:, 1:] + segments = extract_audio_segments(generation_audio) + raw_codes = np.concatenate(segments, axis=0) if segments else np.zeros((0, n_vq), dtype=np.int64) + + header = struct.pack( + " None: + wav = np.asarray(wav, dtype=np.float32).ravel() + pcm = np.clip(np.round(wav * 32767.0), -32768, 32767).astype(np.int16) + with wave.open(str(path), "wb") as f: + f.setnchannels(1) + f.setsampwidth(2) + f.setframerate(sample_rate) + f.writeframes(pcm.tobytes()) + + +def main() -> int: + ap = argparse.ArgumentParser(description="Export Python generation reference for first-class MOSS parity") + ap.add_argument("--config", required=True) + ap.add_argument("--text", required=True) + ap.add_argument("--output-ref", required=True) + ap.add_argument("--output-wav", default="") + ap.add_argument("--reference-audio", default=None) + ap.add_argument("--max-new-tokens", type=int, default=512) + ap.add_argument("--text-temperature", type=float, default=0.0) + ap.add_argument("--text-top-p", type=float, default=1.0) + ap.add_argument("--text-top-k", type=int, default=50) + ap.add_argument("--audio-temperature", type=float, default=0.0) + ap.add_argument("--audio-top-p", type=float, default=1.0) + ap.add_argument("--audio-top-k", type=int, default=25) + ap.add_argument("--audio-repetition-penalty", type=float, default=1.0) + args = ap.parse_args() + + config = 
PipelineConfig.from_yaml(args.config) + config.max_new_tokens = args.max_new_tokens + config.text_temperature = args.text_temperature + config.text_top_p = args.text_top_p + config.text_top_k = args.text_top_k + config.audio_temperature = args.audio_temperature + config.audio_top_p = args.audio_top_p + config.audio_top_k = args.audio_top_k + config.audio_repetition_penalty = args.audio_repetition_penalty + + out_ref = Path(args.output_ref) + out_ref.parent.mkdir(parents=True, exist_ok=True) + + with LlamaCppPipeline(config) as pipeline: + ref_codes = pipeline._prepare_reference(args.reference_audio) + input_ids = build_generation_prompt( + pipeline.tokenizer, + text=args.text, + reference_codes=ref_codes, + ) + prompt_len = input_ids.shape[0] + + backbone = pipeline.backbone + embedder = pipeline.embedder + lm_heads = pipeline.lm_heads + if backbone is None or embedder is None or lm_heads is None: + raise RuntimeError("pipeline low-memory mode is not supported by this export script") + + backbone.clear_kv() + pipeline._prefill(input_ids, backbone, embedder) + generation_ids = pipeline._autoregressive_loop( + input_ids, config.max_new_tokens, backbone, embedder, lm_heads + ) + _text, audio_codes = parse_generation_output(pipeline.tokenizer, generation_ids, prompt_len) + + if args.output_wav: + wav = pipeline.audio_tokenizer.decode(audio_codes) + write_wav16(Path(args.output_wav), wav, 24000) + + hdr = struct.pack( + " np.ndarray: + if vocab_size < 8: + raise ValueError(f"vocab_size must be >= 8, got {vocab_size}") + + # Keep away from the first few special ids and generate a deterministic but + # non-trivial pattern that works for both tiny toy models and full exports. 
+ ids = np.zeros(length, dtype=np.int32) + span = vocab_size - 4 + for i in range(length): + ids[i] = 4 + ((i * 7 + 3) % span) + return ids + + +def build_audio_ids(n_tokens: int, n_vq: int, audio_vocab_size: int) -> np.ndarray: + audio = np.zeros((n_tokens, n_vq), dtype=np.int32) + for t in range(n_tokens): + for q in range(n_vq): + audio[t, q] = (t * 37 + q * 53) % audio_vocab_size + return audio + + +def main() -> int: + if len(sys.argv) != 3: + sys.stderr.write(f"usage: {sys.argv[0]} \n") + return 1 + + model_dir = sys.argv[1] + out_path = sys.argv[2] + + config = MossTTSDelayConfig.from_pretrained(model_dir) + model = MossTTSDelayModel.from_pretrained(model_dir, local_files_only=True).eval() + + n_tokens = 4 + text_ids = build_text_ids(n_tokens, config.language_config.vocab_size) + audio_ids = build_audio_ids(n_tokens, config.n_vq, config.audio_vocab_size) + input_ids = np.concatenate([text_ids[:, None], audio_ids], axis=1)[None, :, :] + + with torch.no_grad(): + outputs = model( + input_ids=torch.from_numpy(input_ids).long(), + use_cache=False, + ) + + ref_embd = outputs.hidden_states[-1][0, -1].float().cpu().numpy().astype(np.float32, copy=False) + ref_logits = np.concatenate( + [head[0, -1].float().cpu().numpy() for head in outputs.logits], + axis=0, + ).astype(np.float32, copy=False) + + Path(out_path).parent.mkdir(parents=True, exist_ok=True) + with open(out_path, "wb") as f: + f.write(struct.pack("<6I", REF_MAGIC, REF_VERSION, n_tokens, config.n_vq, ref_embd.shape[0], ref_logits.shape[0])) + f.write(text_ids.astype(np.int32, copy=False).tobytes()) + f.write(audio_ids.reshape(-1).astype(np.int32, copy=False).tobytes()) + f.write(ref_embd.tobytes()) + f.write(ref_logits.tobytes()) + + sys.stderr.write( + "exported moss-tts-delay reference: " + f"n_tokens={n_tokens} n_vq={config.n_vq} " + f"n_embd={ref_embd.shape[0]} n_logits={ref_logits.shape[0]} -> {out_path}\n" + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git 
a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index 014b3f2b1..50f07af8b 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -78,6 +78,9 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) { gguf_context_ptr ret(gguf_init_empty()); llama_model_saver ms(arch, ret.get()); const uint32_t n_ctx = 128; + const uint32_t n_vq = 32; + const uint32_t audio_vocab_size = 1024; + const uint32_t audio_pad_code = 1024; uint32_t n_vocab = 128; uint32_t n_embd = 256; @@ -181,6 +184,12 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) { // ms.add_kv(LLM_KV_DENSE_2_FEAT_OUT, n_embd); // ms.add_kv(LLM_KV_DENSE_3_FEAT_IN, n_embd); + if (arch == LLM_ARCH_MOSS_TTS_DELAY) { + ms.add_kv(LLM_KV_N_VQ, n_vq); + ms.add_kv(LLM_KV_AUDIO_VOCAB_SIZE, audio_vocab_size); + ms.add_kv(LLM_KV_AUDIO_PAD_CODE, audio_pad_code); + } + if (moe) { ms.add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, n_ff); ms.add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, uint32_t(2)); diff --git a/tests/test-moss-tts-delay-forward.cpp b/tests/test-moss-tts-delay-forward.cpp new file mode 100644 index 000000000..483fe7c38 --- /dev/null +++ b/tests/test-moss-tts-delay-forward.cpp @@ -0,0 +1,205 @@ +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +struct ref_header { + uint32_t magic; + uint32_t version; + uint32_t n_tokens; + uint32_t n_vq; + uint32_t n_embd; + uint32_t n_logits; +}; + +constexpr uint32_t REF_MAGIC = 0x4d545452; // "RTTM" +constexpr uint32_t REF_VERSION = 1; + +template +void read_exact(std::ifstream & in, T * data, size_t count, const char * what) { + in.read(reinterpret_cast(data), sizeof(T) * count); + if (!in) { + throw std::runtime_error(std::string("failed to read ") + what); + } +} + +float max_abs_diff(const float * got, const std::vector & ref) { + float out = 0.0f; + for (size_t i = 0; i < ref.size(); ++i) { + if (!std::isfinite(ref[i])) 
{ + continue; + } + if (!std::isfinite(got[i])) { + return INFINITY; + } + out = std::max(out, std::fabs(got[i] - ref[i])); + } + return out; +} + +float max_abs_diff_span(const float * got, const float * ref, size_t count) { + float out = 0.0f; + for (size_t i = 0; i < count; ++i) { + if (!std::isfinite(ref[i])) { + continue; + } + if (!std::isfinite(got[i])) { + return INFINITY; + } + out = std::max(out, std::fabs(got[i] - ref[i])); + } + return out; +} + +} + +int main(int argc, char ** argv) { + if (argc != 3) { + std::fprintf(stderr, "usage: %s \n", argv[0]); + return EXIT_FAILURE; + } + + std::ifstream in(argv[2], std::ios::binary); + if (!in) { + std::fprintf(stderr, "error: failed to open reference '%s'\n", argv[2]); + return EXIT_FAILURE; + } + + ref_header hdr{}; + read_exact(in, &hdr, 1, "header"); + if (hdr.magic != REF_MAGIC || hdr.version != REF_VERSION) { + std::fprintf(stderr, "error: unexpected reference format\n"); + return EXIT_FAILURE; + } + + std::vector text(hdr.n_tokens); + std::vector audio((size_t) hdr.n_tokens * hdr.n_vq); + std::vector ref_embd(hdr.n_embd); + std::vector ref_logits(hdr.n_logits); + + read_exact(in, text.data(), text.size(), "text tokens"); + read_exact(in, audio.data(), audio.size(), "audio tokens"); + read_exact(in, ref_embd.data(), ref_embd.size(), "reference embeddings"); + read_exact(in, ref_logits.data(), ref_logits.size(), "reference logits"); + + llama_backend_init(); + + llama_model_params mparams = llama_model_default_params(); + mparams.use_mmap = true; + // Keep this parity test deterministic and avoid multi-backend split-input limits. 
+ mparams.n_gpu_layers = 0; + + llama_model * model = llama_model_load_from_file(argv[1], mparams); + if (model == nullptr) { + std::fprintf(stderr, "error: failed to load model '%s'\n", argv[1]); + llama_backend_free(); + return EXIT_FAILURE; + } + + llama_context_params cparams = llama_context_default_params(); + cparams.n_ctx = std::max(hdr.n_tokens + 8, 64); + cparams.n_batch = hdr.n_tokens; + cparams.n_ubatch = hdr.n_tokens; + cparams.n_seq_max = 1; + cparams.embeddings = true; + cparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED; + cparams.type_k = GGML_TYPE_F32; + cparams.type_v = GGML_TYPE_F32; + + llama_context * ctx = llama_init_from_model(model, cparams); + if (ctx == nullptr) { + std::fprintf(stderr, "error: failed to create context\n"); + llama_model_free(model); + llama_backend_free(); + return EXIT_FAILURE; + } + + llama_set_warmup(ctx, false); + llama_set_embeddings(ctx, true); + llama_set_causal_attn(ctx, true); + + llama_batch batch = llama_batch_init(hdr.n_tokens, 0, 1); + batch.n_tokens = hdr.n_tokens; + batch.n_token_audio = hdr.n_vq; + batch.token_audio = (llama_token *) std::malloc(sizeof(llama_token) * audio.size()); + if (batch.token_audio == nullptr) { + std::fprintf(stderr, "error: failed to allocate token_audio\n"); + llama_batch_free(batch); + llama_free(ctx); + llama_model_free(model); + llama_backend_free(); + return EXIT_FAILURE; + } + + for (uint32_t i = 0; i < hdr.n_tokens; ++i) { + batch.token[i] = text[i]; + std::memcpy(batch.token_audio + (size_t) i * hdr.n_vq, audio.data() + (size_t) i * hdr.n_vq, sizeof(llama_token) * hdr.n_vq); + batch.pos[i] = i; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = 0; + batch.logits[i] = i + 1 == hdr.n_tokens; + } + + const int ret = llama_decode(ctx, batch); + if (ret != 0) { + std::fprintf(stderr, "error: llama_decode failed: %d\n", ret); + llama_batch_free(batch); + llama_free(ctx); + llama_model_free(model); + llama_backend_free(); + return EXIT_FAILURE; + } + + const llama_vocab * 
vocab = llama_model_get_vocab(model); + const int32_t n_vocab = llama_vocab_n_tokens(vocab); + const int32_t n_audio_logits = (int32_t) (hdr.n_logits - n_vocab) / (int32_t) hdr.n_vq; + const int32_t out_idx = (int32_t) hdr.n_tokens - 1; + const float * got_embd = llama_get_embeddings_ith(ctx, out_idx); + const float * got_logits = llama_get_logits_ith(ctx, out_idx); + + if (got_embd == nullptr || got_logits == nullptr) { + std::fprintf(stderr, "error: missing outputs from context\n"); + llama_batch_free(batch); + llama_free(ctx); + llama_model_free(model); + llama_backend_free(); + return EXIT_FAILURE; + } + + const float embd_max_abs = max_abs_diff(got_embd, ref_embd); + const float logits_max_abs = max_abs_diff(got_logits, ref_logits); + const float text_logits_max_abs = max_abs_diff_span(got_logits, ref_logits.data(), n_vocab); + const float audio_logits_max_abs = max_abs_diff_span(got_logits + n_vocab, ref_logits.data() + n_vocab, hdr.n_logits - n_vocab); + + std::fprintf(stderr, + "moss-tts-delay forward parity: out_idx=%d embd_max_abs=%g logits_max_abs=%g text_logits_max_abs=%g audio_logits_max_abs=%g n_audio_logits=%d\n", + out_idx, embd_max_abs, logits_max_abs, text_logits_max_abs, audio_logits_max_abs, n_audio_logits); + + const bool ok = embd_max_abs < 1e-4f && logits_max_abs < 1e-4f; + + if (!ok) { + for (uint32_t i = 0; i < hdr.n_tokens; ++i) { + const float * got_embd_i = llama_get_embeddings_ith(ctx, (int32_t) i); + if (got_embd_i != nullptr) { + std::fprintf(stderr, " embd_max_abs[out=%u]=%g\n", i, max_abs_diff(got_embd_i, ref_embd)); + } + } + } + + llama_batch_free(batch); + llama_free(ctx); + llama_model_free(model); + llama_backend_free(); + + return ok ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/tests/test-moss-tts-delay-load.cpp b/tests/test-moss-tts-delay-load.cpp new file mode 100644 index 000000000..fe97c2317 --- /dev/null +++ b/tests/test-moss-tts-delay-load.cpp @@ -0,0 +1,77 @@ +#include "llama.h" +#include "../src/llama-arch.h" +#include "../src/llama-model.h" + +#include +#include +#include +#include +#include + +static void check(bool cond, const std::string & msg) { + if (!cond) { + throw std::runtime_error(msg); + } +} + +static void check_tensor_2d(const ggml_tensor * tensor, const char * name, int64_t ne0, int64_t ne1) { + check(tensor != nullptr, std::string("missing tensor: ") + name); + check(tensor->ne[0] == ne0, std::string(name) + " ne[0] mismatch"); + check(tensor->ne[1] == ne1, std::string(name) + " ne[1] mismatch"); +} + +int main(int argc, char ** argv) { + if (argc != 2) { + std::fprintf(stderr, "usage: %s \n", argv[0]); + return EXIT_FAILURE; + } + + llama_backend_init(); + + llama_model_params params = llama_model_default_params(); + params.use_mmap = false; + + llama_model * model = llama_model_load_from_file(argv[1], params); + if (model == nullptr) { + std::fprintf(stderr, "error: failed to load model '%s'\n", argv[1]); + llama_backend_free(); + return EXIT_FAILURE; + } + + try { + check(model->arch == LLM_ARCH_MOSS_TTS_DELAY, "unexpected architecture"); + check(model->hparams.n_vq > 0, "n_vq must be > 0"); + check(model->hparams.audio_vocab_size > 0, "audio_vocab_size must be > 0"); + + const int64_t n_embd = model->hparams.n_embd; + const int64_t n_vocab = model->vocab.n_tokens(); + const int64_t n_audio_vocab = std::max(model->hparams.audio_vocab_size + 1, model->hparams.audio_pad_code + 1); + + check_tensor_2d(model->tok_embd, "token_embd.weight", n_embd, n_vocab); + check_tensor_2d(model->output, "output.weight", n_embd, n_vocab); + + check(model->tok_embd_audio.size() == model->hparams.n_vq, "token_embd_audio size mismatch"); + check(model->output_audio.size() == 
model->hparams.n_vq, "output_audio size mismatch"); + + for (uint32_t i = 0; i < model->hparams.n_vq; ++i) { + check_tensor_2d(model->tok_embd_audio.at(i), "token_embd_audio", n_embd, n_audio_vocab); + check_tensor_2d(model->output_audio.at(i), "output_audio", n_embd, n_audio_vocab); + } + + std::fprintf(stderr, + "loaded MOSS-TTS-Delay: n_layer=%u n_embd=%u n_vq=%u audio_vocab=%u tensors_ok=1\n", + model->hparams.n_layer, + model->hparams.n_embd, + model->hparams.n_vq, + model->hparams.audio_vocab_size); + } catch (const std::exception & err) { + std::fprintf(stderr, "validation failed: %s\n", err.what()); + llama_model_free(model); + llama_backend_free(); + return EXIT_FAILURE; + } + + llama_model_free(model); + llama_backend_free(); + return EXIT_SUCCESS; +} diff --git a/tools/batched-bench/batched-bench.cpp b/tools/batched-bench/batched-bench.cpp index 224f0e1f1..0a5902a76 100644 --- a/tools/batched-bench/batched-bench.cpp +++ b/tools/batched-bench/batched-bench.cpp @@ -83,6 +83,8 @@ int main(int argc, char ** argv) { llama_batch batch_view = { n_tokens, batch.token + i, + 0, + nullptr, nullptr, batch.pos + i, batch.n_seq_id + i, diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp index 5bcb7ec1b..574422b40 100644 --- a/tools/mtmd/mtmd-helper.cpp +++ b/tools/mtmd/mtmd-helper.cpp @@ -136,6 +136,8 @@ struct decode_embd_batch { batch = { /*n_tokens =*/ n_tokens, /*tokens =*/ nullptr, + /*n_token_audio =*/ 0, + /*token_audio =*/ nullptr, /*embd =*/ embd, /*pos =*/ pos.data(), /*n_seq_id =*/ n_seq_id.data(), @@ -216,6 +218,8 @@ struct decode_embd_batch { return { /*n_tokens =*/ n_tokens, /*tokens =*/ nullptr, + /*n_token_audio =*/ 0, + /*token_audio =*/ nullptr, /*embd =*/ batch.embd + offset * n_mmproj_embd, /*pos =*/ pos_ptr, /*n_seq_id =*/ batch.n_seq_id + offset, diff --git a/tools/perplexity/perplexity.cpp b/tools/perplexity/perplexity.cpp index 0eb062f05..8e64212ef 100644 --- a/tools/perplexity/perplexity.cpp +++ 
b/tools/perplexity/perplexity.cpp @@ -668,6 +668,8 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector< llama_batch batch_view = { n_tokens, batch.token + i, + 0, + nullptr, nullptr, batch.pos + i, batch.n_seq_id + i, diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index c47ad876c..42cc72287 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2689,6 +2689,8 @@ struct server_context_impl { llama_batch batch_view = { n_tokens, batch.token + i, + 0, + nullptr, nullptr, batch.pos + i, batch.n_seq_id + i, diff --git a/tools/tts/CMakeLists.txt b/tools/tts/CMakeLists.txt index 76320d4c2..b91a84759 100644 --- a/tools/tts/CMakeLists.txt +++ b/tools/tts/CMakeLists.txt @@ -6,3 +6,12 @@ target_compile_features(${TARGET} PRIVATE cxx_std_17) if(LLAMA_TOOLS_INSTALL) install(TARGETS ${TARGET} RUNTIME) endif() + +set(TARGET llama-moss-tts) +add_executable(${TARGET} moss-tts.cpp) +target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/tts/moss-tts-audio-decode.py b/tools/tts/moss-tts-audio-decode.py new file mode 100755 index 000000000..160579149 --- /dev/null +++ b/tools/tts/moss-tts-audio-decode.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import argparse +import os +import struct +import sys +import wave +from pathlib import Path + +import numpy as np + + +def resolve_moss_tts_dir() -> Path: + env_dir = os.getenv("MOSS_TTS_DIR") or os.getenv("MOSS_TTS_ROOT") + if env_dir: + path = Path(env_dir).expanduser().resolve() + else: + path = Path(__file__).resolve().parents[3] / "MOSS-TTS" + + if not path.is_dir(): + raise FileNotFoundError( + f"MOSS-TTS repo not found: {path}. Set MOSS_TTS_DIR to the MOSS-TTS checkout root." 
+ ) + return path + + +sys.path.insert(0, str(resolve_moss_tts_dir())) + +from moss_tts_delay.llama_cpp._constants import N_VQ, SAMPLE_RATE # noqa: E402 + + +CODES_MAGIC = 0x53444F43 # "CODS" +CODES_VERSION = 1 + + +def read_codes(path: Path) -> np.ndarray: + with path.open("rb") as f: + hdr = f.read(16) + if len(hdr) != 16: + raise RuntimeError("codes header is truncated") + magic, version, n_frames, n_vq = struct.unpack(" None: + wav = np.asarray(wav, dtype=np.float32).ravel() + pcm = np.clip(np.round(wav * 32767.0), -32768, 32767).astype(np.int16) + + with wave.open(str(path), "wb") as f: + f.setnchannels(1) + f.setsampwidth(2) + f.setframerate(sample_rate) + f.writeframes(pcm.tobytes()) + + +def main() -> int: + ap = argparse.ArgumentParser(description="Decode MOSS raw audio codes to wav via Python audio tokenizer") + ap.add_argument("--codes-bin", required=True) + ap.add_argument("--wav-out", required=True) + ap.add_argument("--encoder-onnx", required=True) + ap.add_argument("--decoder-onnx", required=True) + ap.add_argument("--cpu", action="store_true") + args = ap.parse_args() + + try: + from moss_audio_tokenizer.onnx import OnnxAudioTokenizer + except Exception as exc: + raise RuntimeError( + "moss_audio_tokenizer.onnx is unavailable; initialize the submodule/package and install ONNX deps" + ) from exc + + codes = read_codes(Path(args.codes_bin)) + if codes.ndim != 2 or codes.shape[1] != N_VQ: + raise RuntimeError(f"expected raw codes with shape (T, {N_VQ}), got {codes.shape}") + + tokenizer = OnnxAudioTokenizer( + encoder_path=args.encoder_onnx, + decoder_path=args.decoder_onnx, + use_gpu=not args.cpu, + ) + wav = tokenizer.decode(codes) + write_wav16(Path(args.wav_out), wav, SAMPLE_RATE) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/tts/moss-tts-build-generation-ref.py b/tools/tts/moss-tts-build-generation-ref.py new file mode 100755 index 000000000..48a784673 --- /dev/null +++ 
b/tools/tts/moss-tts-build-generation-ref.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import argparse +import os +import struct +import sys +from pathlib import Path + +import numpy as np + +REF_MAGIC = 0x4652474D # "MGRF" +REF_VERSION = 1 + + +def resolve_moss_tts_dir() -> Path: + env_dir = os.getenv("MOSS_TTS_DIR") or os.getenv("MOSS_TTS_ROOT") + if env_dir: + path = Path(env_dir).expanduser().resolve() + else: + path = Path(__file__).resolve().parents[3] / "MOSS-TTS" + + if not path.is_dir(): + raise FileNotFoundError( + f"MOSS-TTS repo not found: {path}. Set MOSS_TTS_DIR to the MOSS-TTS checkout root." + ) + return path + + +def parse_args() -> argparse.Namespace: + ap = argparse.ArgumentParser( + description="Build first-class MOSS-TTS generation input (.bin) from text (+ optional reference audio)." + ) + ap.add_argument("--tokenizer-dir", required=True, help="Directory containing tokenizer.json") + ap.add_argument("--output-ref", required=True, help="Output .ref.bin path") + ap.add_argument("--language", default="zh", help="Language tag passed to prompt builder") + ap.add_argument("--text", default="", help="Input text (optional when --text-file is used)") + ap.add_argument("--text-file", default="", help="UTF-8 text file path") + ap.add_argument("--reference-audio", default="", help="Optional reference wav path (24kHz preferred)") + ap.add_argument("--encoder-onnx", default="", help="Required when --reference-audio is set") + ap.add_argument("--decoder-onnx", default="", help="Required when --reference-audio is set") + ap.add_argument("--cpu-audio-encode", action="store_true", help="Force CPU for ONNX reference encode") + return ap.parse_args() + + +def _load_text(args: argparse.Namespace) -> str: + if args.text_file: + return Path(args.text_file).read_text(encoding="utf-8") + if args.text: + return args.text + raise ValueError("either --text or --text-file is required") + + +def _read_reference_codes(args: 
argparse.Namespace) -> np.ndarray | None: + if not args.reference_audio: + return None + if not args.encoder_onnx or not args.decoder_onnx: + raise ValueError("--encoder-onnx and --decoder-onnx are required when --reference-audio is set") + + import soundfile as sf + from moss_audio_tokenizer.onnx import OnnxAudioTokenizer + + wav, sr = sf.read(args.reference_audio, dtype="float32") + if wav.ndim > 1: + wav = wav.mean(axis=1) + if sr != 24000: + raise ValueError(f"reference sample rate must be 24000, got {sr}: {args.reference_audio}") + + tokenizer = OnnxAudioTokenizer( + encoder_path=args.encoder_onnx, + decoder_path=args.decoder_onnx, + use_gpu=not args.cpu_audio_encode, + ) + codes = tokenizer.encode(wav) + return np.asarray(codes, dtype=np.int64) + + +def main() -> int: + args = parse_args() + + sys.path.insert(0, str(resolve_moss_tts_dir())) + + from moss_tts_delay.llama_cpp._constants import AUDIO_PAD_CODE + from moss_tts_delay.llama_cpp.processor import Tokenizer, build_generation_prompt + + text = _load_text(args) + reference_codes = _read_reference_codes(args) + + tok = Tokenizer(args.tokenizer_dir) + input_ids = build_generation_prompt( + tokenizer=tok, + text=text, + reference_codes=reference_codes, + language=args.language, + ) + + out_ref = Path(args.output_ref) + out_ref.parent.mkdir(parents=True, exist_ok=True) + + prompt_frames = int(input_ids.shape[0]) + n_vq = int(input_ids.shape[1] - 1) + with out_ref.open("wb") as f: + f.write( + struct.pack( + " subprocess.CompletedProcess: + print("+", shlex.join(cmd), flush=True) + return subprocess.run(cmd, env=env, check=False) + + +def need_file(path: Path, name: str) -> None: + if not path.is_file(): + raise FileNotFoundError(f"missing {name}: {path}") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=( + "End-to-end first-class MOSS-TTS pipeline (llama.cpp backbone + ONNX tokenizer): " + "text(+ref) -> wav. 
Intermediate artifacts are stored in a temporary directory " + "and removed automatically." + ) + ) + + parser.add_argument("--model-gguf", default=os.getenv("MODEL_GGUF", "")) + parser.add_argument("--moss-tts-dir", default=os.getenv("MOSS_TTS_DIR", os.getenv("MOSS_TTS_ROOT", ""))) + parser.add_argument("--tokenizer-dir", default=os.getenv("TOKENIZER_DIR", "")) + parser.add_argument("--onnx-encoder", default=os.getenv("ONNX_ENCODER", "")) + parser.add_argument("--onnx-decoder", default=os.getenv("ONNX_DECODER", "")) + parser.add_argument("--output-wav", required=True) + parser.add_argument("--reference-audio", default="") + parser.add_argument("--language", default="zh") + parser.add_argument("--max-new-tokens", type=int, default=512) + parser.add_argument("--text-temperature", type=float, default=1.5) + parser.add_argument("--audio-temperature", type=float, default=1.7) + parser.add_argument("--n-gpu-layers", type=int, default=-1) + parser.add_argument("--python-bin", default=sys.executable) + parser.add_argument("--llama-bin", default="") + parser.add_argument("--build", action="store_true", help="Build llama-moss-tts before running") + parser.add_argument("--n-jobs", type=int, default=(os.cpu_count() or 1)) + parser.add_argument("--audio-decoder-cpu", action="store_true") + parser.add_argument("--cpu-audio-encode", action="store_true") + + text_group = parser.add_mutually_exclusive_group(required=True) + text_group.add_argument("--text", default="") + text_group.add_argument("--text-file", default="") + + args = parser.parse_args() + + if not args.model_gguf: + parser.error("--model-gguf is required (or set MODEL_GGUF)") + if not args.tokenizer_dir: + parser.error("--tokenizer-dir is required (or set TOKENIZER_DIR)") + if not args.onnx_encoder: + parser.error("--onnx-encoder is required (or set ONNX_ENCODER)") + if not args.onnx_decoder: + parser.error("--onnx-decoder is required (or set ONNX_DECODER)") + + return args + + +def main() -> int: + args = 
parse_args() + + repo_root = Path(__file__).resolve().parents[2] + build_ref_script = repo_root / "tools/tts/moss-tts-build-generation-ref.py" + decode_script = repo_root / "tools/tts/moss-tts-audio-decode.py" + llama_bin = Path(args.llama_bin) if args.llama_bin else repo_root / "build/bin/llama-moss-tts" + + model_gguf = Path(args.model_gguf).expanduser().resolve() + tokenizer_dir = Path(args.tokenizer_dir).expanduser().resolve() + onnx_encoder = Path(args.onnx_encoder).expanduser().resolve() + onnx_decoder = Path(args.onnx_decoder).expanduser().resolve() + python_bin = Path(args.python_bin).expanduser().resolve() + output_wav = Path(args.output_wav).expanduser().resolve() + moss_tts_dir = Path(args.moss_tts_dir).expanduser().resolve() if args.moss_tts_dir else None + + need_file(python_bin, "python binary") + need_file(model_gguf, "first-class model gguf") + need_file(tokenizer_dir / "tokenizer.json", "tokenizer.json") + need_file(onnx_encoder, "ONNX encoder") + need_file(onnx_decoder, "ONNX decoder") + need_file(build_ref_script, "generation-ref builder") + need_file(decode_script, "audio decode helper") + if moss_tts_dir is not None and not moss_tts_dir.is_dir(): + raise FileNotFoundError(f"missing MOSS-TTS repo: {moss_tts_dir}") + if args.text_file: + need_file(Path(args.text_file).expanduser().resolve(), "text file") + if args.reference_audio: + need_file(Path(args.reference_audio).expanduser().resolve(), "reference audio") + + if args.build: + rc = run_cmd(["cmake", "-S", str(repo_root), "-B", str(repo_root / "build")]).returncode + if rc != 0: + raise RuntimeError(f"cmake configure failed with rc={rc}") + rc = run_cmd( + [ + "cmake", + "--build", + str(repo_root / "build"), + "--target", + "llama-moss-tts", + "-j", + str(args.n_jobs), + ] + ).returncode + if rc != 0: + raise RuntimeError(f"cmake build failed with rc={rc}") + + need_file(llama_bin, "llama-moss-tts binary") + output_wav.parent.mkdir(parents=True, exist_ok=True) + shared_env = 
os.environ.copy() + if moss_tts_dir is not None: + shared_env["MOSS_TTS_DIR"] = str(moss_tts_dir) + old_pythonpath = shared_env.get("PYTHONPATH") + shared_env["PYTHONPATH"] = ( + f"{moss_tts_dir}{os.pathsep}{old_pythonpath}" if old_pythonpath else str(moss_tts_dir) + ) + + with tempfile.TemporaryDirectory(prefix="moss-tts-firstclass-") as tmpdir: + tmpdir_path = Path(tmpdir) + generation_ref = tmpdir_path / "generation.ref.bin" + raw_codes = tmpdir_path / "raw.codes.bin" + + build_ref_cmd = [ + str(python_bin), + str(build_ref_script), + "--tokenizer-dir", + str(tokenizer_dir), + "--output-ref", + str(generation_ref), + "--language", + args.language, + ] + if args.text_file: + build_ref_cmd.extend(["--text-file", str(Path(args.text_file).expanduser().resolve())]) + else: + build_ref_cmd.extend(["--text", args.text]) + + if args.reference_audio: + build_ref_cmd.extend( + [ + "--reference-audio", + str(Path(args.reference_audio).expanduser().resolve()), + "--encoder-onnx", + str(onnx_encoder), + "--decoder-onnx", + str(onnx_decoder), + ] + ) + if args.cpu_audio_encode: + build_ref_cmd.append("--cpu-audio-encode") + + rc = run_cmd(build_ref_cmd, env=shared_env).returncode + if rc != 0: + raise RuntimeError(f"generation-ref build failed with rc={rc}") + + run_args = [ + str(llama_bin), + "-m", + str(model_gguf), + "--generation-input", + str(generation_ref), + "--n-gpu-layers", + str(args.n_gpu_layers), + "--max-new-tokens", + str(args.max_new_tokens), + "--text-temperature", + str(args.text_temperature), + "--audio-temperature", + str(args.audio_temperature), + "--dump-raw-codes", + str(raw_codes), + "--audio-decoder-script", + str(decode_script), + "--audio-encoder-onnx", + str(onnx_encoder), + "--audio-decoder-onnx", + str(onnx_decoder), + "--wav-out", + str(output_wav), + "--python-bin", + str(python_bin), + ] + if args.audio_decoder_cpu: + run_args.append("--audio-decoder-cpu") + llama_rc = run_cmd(run_args, env=shared_env).returncode + + if not 
output_wav.is_file(): + raise RuntimeError(f"llama-moss-tts did not produce wav: {output_wav} (rc={llama_rc})") + if llama_rc != 0: + print( + f"warning: llama-moss-tts exited with rc={llama_rc}, but wav was produced.", + file=sys.stderr, + ) + + with wave.open(str(output_wav), "rb") as f: + sr = f.getframerate() + n = f.getnframes() + ch = f.getnchannels() + + print("done") + print(f"wav : {output_wav}") + print(f"wav_info: sr={sr} ch={ch} frames={n} sec={n/max(sr,1):.3f}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/tts/moss-tts-seed-tts-eval-generate.py b/tools/tts/moss-tts-seed-tts-eval-generate.py new file mode 100644 index 000000000..36861a0e6 --- /dev/null +++ b/tools/tts/moss-tts-seed-tts-eval-generate.py @@ -0,0 +1,205 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import argparse +import os +import shlex +import subprocess +import sys +from pathlib import Path + + +def run_cmd(cmd: list[str], env: dict[str, str] | None = None, cwd: Path | None = None) -> int: + print("+", shlex.join(cmd), flush=True) + return subprocess.run(cmd, env=env, cwd=str(cwd) if cwd else None, check=False).returncode + + +def need_file(path: Path, name: str) -> None: + if not path.is_file(): + raise FileNotFoundError(f"missing {name}: {path}") + + +def need_dir(path: Path, name: str) -> None: + if not path.is_dir(): + raise FileNotFoundError(f"missing {name}: {path}") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Run MOSS-TTS first-class generation over a seed-tts-eval meta list." 
+ ) + parser.add_argument("--meta", required=True, help="seed-tts-eval meta.lst path") + parser.add_argument("--output-dir", required=True, help="Directory to write .wav outputs") + + parser.add_argument("--skip-generate", action="store_true") + + parser.add_argument("--model-gguf", default=os.getenv("MODEL_GGUF", "")) + parser.add_argument("--moss-tts-dir", default=os.getenv("MOSS_TTS_DIR", os.getenv("MOSS_TTS_ROOT", ""))) + parser.add_argument("--tokenizer-dir", default=os.getenv("TOKENIZER_DIR", "")) + parser.add_argument("--onnx-encoder", default=os.getenv("ONNX_ENCODER", "")) + parser.add_argument("--onnx-decoder", default=os.getenv("ONNX_DECODER", "")) + parser.add_argument("--language", default="zh") + parser.add_argument("--max-new-tokens", type=int, default=512) + parser.add_argument("--text-temperature", type=float, default=1.5) + parser.add_argument("--audio-temperature", type=float, default=1.7) + parser.add_argument("--n-gpu-layers", type=int, default=-1) + parser.add_argument("--python-bin", default=sys.executable) + parser.add_argument("--llama-bin", default="") + parser.add_argument("--build", action="store_true") + parser.add_argument("--audio-decoder-cpu", action="store_true") + parser.add_argument("--cpu-audio-encode", action="store_true") + parser.add_argument("--overwrite", action="store_true") + parser.add_argument("--limit", type=int, default=0, help="Only synthesize the first N items when > 0") + parser.add_argument("--skip-missing-reference", action="store_true") + parser.add_argument("--e2e-script", default="") + + args = parser.parse_args() + + if not args.skip_generate: + for key in ("model_gguf", "tokenizer_dir", "onnx_encoder", "onnx_decoder"): + if not getattr(args, key): + parser.error(f"--{key.replace('_', '-')} is required unless --skip-generate is set") + + return args + + +def parse_meta_line(line: str) -> tuple[str, str, str | None]: + fields = line.rstrip("\n").split("|") + if len(fields) == 5: + utt, _prompt_text, prompt_wav, 
infer_text, _infer_wav = fields + elif len(fields) == 4: + utt, _prompt_text, prompt_wav, infer_text = fields + elif len(fields) == 3: + utt, infer_text, prompt_wav = fields + elif len(fields) == 2: + utt, infer_text = fields + prompt_wav = None + else: + raise ValueError(f"unsupported meta format: {line.rstrip()}") + + utt = utt[:-4] if utt.endswith(".wav") else utt + return utt, infer_text, prompt_wav + + +def resolve_prompt_wav(meta_path: Path, prompt_wav: str | None) -> Path | None: + if not prompt_wav: + return None + path = Path(prompt_wav).expanduser() + if not path.is_absolute(): + path = (meta_path.parent / path).resolve() + else: + path = path.resolve() + return path + + +def build_generation_env(args: argparse.Namespace) -> dict[str, str]: + env = os.environ.copy() + if args.moss_tts_dir: + moss_tts_dir = Path(args.moss_tts_dir).expanduser().resolve() + need_dir(moss_tts_dir, "MOSS-TTS repo") + env["MOSS_TTS_DIR"] = str(moss_tts_dir) + old_pythonpath = env.get("PYTHONPATH") + env["PYTHONPATH"] = f"{moss_tts_dir}{os.pathsep}{old_pythonpath}" if old_pythonpath else str(moss_tts_dir) + return env + + +def generate_wavs(args: argparse.Namespace, meta_path: Path, output_dir: Path, e2e_script: Path) -> None: + env = build_generation_env(args) + built = False + count = 0 + + for raw_line in meta_path.read_text(encoding="utf-8").splitlines(): + if not raw_line.strip(): + continue + + utt, infer_text, prompt_wav = parse_meta_line(raw_line) + reference_audio = resolve_prompt_wav(meta_path, prompt_wav) + if reference_audio is not None and not reference_audio.is_file(): + if args.skip_missing_reference: + print(f"skip missing reference: {reference_audio}", file=sys.stderr) + continue + raise FileNotFoundError(f"missing reference audio: {reference_audio}") + + output_wav = output_dir / f"{utt}.wav" + if output_wav.exists() and not args.overwrite: + print(f"skip existing: {output_wav}", file=sys.stderr) + count += 1 + if args.limit > 0 and count >= args.limit: + break 
+ continue + + cmd = [ + str(args.python_bin), + str(e2e_script), + "--model-gguf", + args.model_gguf, + "--tokenizer-dir", + args.tokenizer_dir, + "--onnx-encoder", + args.onnx_encoder, + "--onnx-decoder", + args.onnx_decoder, + "--output-wav", + str(output_wav), + "--language", + args.language, + "--max-new-tokens", + str(args.max_new_tokens), + "--text-temperature", + str(args.text_temperature), + "--audio-temperature", + str(args.audio_temperature), + "--n-gpu-layers", + str(args.n_gpu_layers), + "--python-bin", + args.python_bin, + "--text", + infer_text, + ] + if args.moss_tts_dir: + cmd.extend(["--moss-tts-dir", args.moss_tts_dir]) + if args.llama_bin: + cmd.extend(["--llama-bin", args.llama_bin]) + if args.build and not built: + cmd.append("--build") + built = True + if args.audio_decoder_cpu: + cmd.append("--audio-decoder-cpu") + if args.cpu_audio_encode: + cmd.append("--cpu-audio-encode") + if reference_audio is not None: + cmd.extend(["--reference-audio", str(reference_audio)]) + + rc = run_cmd(cmd, env=env) + if rc != 0: + raise RuntimeError(f"failed to synthesize {utt} with rc={rc}") + + count += 1 + if args.limit > 0 and count >= args.limit: + break + + print(f"generation done: {count} items in {output_dir}") + + +def main() -> int: + args = parse_args() + + repo_root = Path(__file__).resolve().parents[2] + e2e_script = Path(args.e2e_script).expanduser().resolve() if args.e2e_script else repo_root / "tools/tts/moss-tts-firstclass-e2e.py" + meta_path = Path(args.meta).expanduser().resolve() + output_dir = Path(args.output_dir).expanduser().resolve() + + need_file(meta_path, "seed-tts-eval meta") + need_file(e2e_script, "moss-tts firstclass e2e script") + output_dir.mkdir(parents=True, exist_ok=True) + + if not args.skip_generate: + generate_wavs(args, meta_path, output_dir, e2e_script) + + print(f"done: meta={meta_path} output_dir={output_dir}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/tts/moss-tts.cpp 
b/tools/tts/moss-tts.cpp new file mode 100644 index 000000000..2e545297c --- /dev/null +++ b/tools/tts/moss-tts.cpp @@ -0,0 +1,1737 @@ +#include "arg.h" +#include "common.h" +#include "log.h" +#include "llama.h" +#include "llama-cpp.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +constexpr uint32_t MOSS_DELAY_DEFAULT_N_VQ = 32; +constexpr llama_token MOSS_DELAY_DEFAULT_PAD_TOKEN_ID = 151643; +constexpr llama_token MOSS_DELAY_DEFAULT_IM_START_TOKEN_ID = 151644; +constexpr llama_token MOSS_DELAY_DEFAULT_IM_END_TOKEN_ID = 151645; +constexpr llama_token MOSS_DELAY_DEFAULT_AUDIO_START_TOKEN_ID = 151652; +constexpr llama_token MOSS_DELAY_DEFAULT_AUDIO_END_TOKEN_ID = 151653; +constexpr llama_token MOSS_DELAY_DEFAULT_AUDIO_USER_SLOT_TOKEN_ID = 151654; +constexpr llama_token MOSS_DELAY_DEFAULT_AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID = 151656; +constexpr llama_token MOSS_DELAY_DEFAULT_AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID = 151662; +constexpr llama_token MOSS_DELAY_DEFAULT_AUDIO_PAD_CODE = 1024; +constexpr uint32_t MOSS_DELAY_DEFAULT_AUDIO_VOCAB_SIZE = 1024; +constexpr int64_t MOSS_DELAY_INT64_MAX = std::numeric_limits::max(); +constexpr float MOSS_NEG_INF = -std::numeric_limits::infinity(); +constexpr uint32_t MOSS_CODES_MAGIC = 0x53444f43; // "CODS" +constexpr uint32_t MOSS_CODES_VERSION = 1; +constexpr uint32_t MOSS_DECODE_REF_MAGIC = 0x4652444d; // "MDRF" +constexpr uint32_t MOSS_DECODE_REF_VERSION = 1; +constexpr uint32_t MOSS_GEN_REF_MAGIC = 0x4652474d; // "MGRF" +constexpr uint32_t MOSS_GEN_REF_VERSION = 1; + +struct moss_sampling_config { + float text_temperature = 1.5f; + float text_top_p = 1.0f; + int32_t text_top_k = 50; + float audio_temperature = 1.7f; + float audio_top_p = 0.8f; + int32_t audio_top_k = 25; + float audio_repetition_penalty = 1.0f; +}; + +struct moss_delay_config { + uint32_t n_vq = MOSS_DELAY_DEFAULT_N_VQ; + llama_token 
pad_token_id = MOSS_DELAY_DEFAULT_PAD_TOKEN_ID; + llama_token im_start_token_id = MOSS_DELAY_DEFAULT_IM_START_TOKEN_ID; + llama_token im_end_token_id = MOSS_DELAY_DEFAULT_IM_END_TOKEN_ID; + llama_token audio_start_token_id = MOSS_DELAY_DEFAULT_AUDIO_START_TOKEN_ID; + llama_token audio_end_token_id = MOSS_DELAY_DEFAULT_AUDIO_END_TOKEN_ID; + llama_token audio_user_slot_token_id = MOSS_DELAY_DEFAULT_AUDIO_USER_SLOT_TOKEN_ID; + llama_token audio_assistant_gen_slot_token_id = MOSS_DELAY_DEFAULT_AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID; + llama_token audio_assistant_delay_slot_token_id = MOSS_DELAY_DEFAULT_AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID; + llama_token audio_pad_code = MOSS_DELAY_DEFAULT_AUDIO_PAD_CODE; + uint32_t audio_vocab_size = MOSS_DELAY_DEFAULT_AUDIO_VOCAB_SIZE; + + size_t packed_stride() const { + return 1u + n_vq; + } +}; + +struct moss_audio_segment { + std::vector codes; + size_t n_frames = 0; +}; + +struct moss_generation_audio { + std::vector delayed_codes; + size_t delayed_frames = 0; + + std::vector segments; + + std::vector raw_codes; + size_t raw_frames = 0; +}; + +struct moss_delay_state { + int32_t audio_length = 0; + int64_t delayed_length = MOSS_DELAY_INT64_MAX; + bool is_audio = false; + bool is_stopping = false; + int32_t time_step = 0; + std::vector text_history; + + uint32_t n_vq = MOSS_DELAY_DEFAULT_N_VQ; + std::vector audio_history; + + size_t audio_frames() const { + return n_vq == 0 ? 
0 : audio_history.size() / n_vq; + } + + bool empty_audio() const { + return audio_history.empty(); + } + + const llama_token * audio_frame_ptr(size_t frame_idx) const { + if (n_vq == 0 || frame_idx >= audio_frames()) { + return nullptr; + } + return audio_history.data() + frame_idx * n_vq; + } + + void reserve_audio_frames(size_t frames) { + audio_history.reserve(frames * n_vq); + } + + void append_audio(const std::vector & frame) { + GGML_ASSERT(frame.size() == n_vq); + audio_history.insert(audio_history.end(), frame.begin(), frame.end()); + } + + void append_audio(const llama_token * frame) { + GGML_ASSERT(frame != nullptr); + audio_history.insert(audio_history.end(), frame, frame + n_vq); + } +}; + +using moss_rng = std::mt19937; + +struct moss_codes_header { + uint32_t magic = MOSS_CODES_MAGIC; + uint32_t version = MOSS_CODES_VERSION; + uint32_t n_frames = 0; + uint32_t n_vq = 0; +}; + +struct moss_decode_ref_header { + uint32_t magic = MOSS_DECODE_REF_MAGIC; + uint32_t version = MOSS_DECODE_REF_VERSION; + uint32_t prompt_frames = 0; + uint32_t n_vq = 0; + uint32_t audio_pad_code = 0; + uint32_t packed_frames = 0; + uint32_t raw_frames = 0; +}; + +struct moss_generation_ref_header { + uint32_t magic = MOSS_GEN_REF_MAGIC; + uint32_t version = MOSS_GEN_REF_VERSION; + uint32_t prompt_frames = 0; + uint32_t n_vq = 0; + uint32_t audio_pad_code = 0; + uint32_t prompt_packed_frames = 0; + uint32_t raw_frames = 0; +}; + +static moss_generation_audio moss_decode_generation_audio( + const moss_delay_state & state, + size_t prompt_frames, + const moss_delay_config & cfg); + +static moss_generation_audio moss_decode_generation_audio( + const std::vector & packed_ids, + size_t prompt_frames, + const moss_delay_config & cfg); + +static void moss_generate_from_ref( + const std::string & model_path, + const std::string & ref_path, + int32_t n_gpu_layers, + int32_t max_new_tokens, + const moss_sampling_config & sampling_cfg, + uint32_t seed, + const std::string & 
dump_raw_codes_path, + const std::string & python_bin, + const std::string & helper_script, + const std::string & encoder_onnx, + const std::string & decoder_onnx, + const std::string & wav_out, + bool use_gpu_audio); + +struct llama_backend_scope { + llama_backend_scope() { + llama_backend_init(); + } + + ~llama_backend_scope() { + llama_backend_free(); + } + + llama_backend_scope(const llama_backend_scope &) = delete; + llama_backend_scope & operator=(const llama_backend_scope &) = delete; +}; + +struct moss_owned_batch { + llama_batch batch = { + /*n_tokens =*/ 0, + /*token =*/ nullptr, + /*n_token_audio =*/ 0, + /*token_audio =*/ nullptr, + /*embd =*/ nullptr, + /*pos =*/ nullptr, + /*n_seq_id =*/ nullptr, + /*seq_id =*/ nullptr, + /*logits =*/ nullptr, + }; + std::vector token_audio; + + moss_owned_batch(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) + : batch(llama_batch_init(n_tokens_alloc, embd, n_seq_max)) { + } + + ~moss_owned_batch() { + release(); + } + + moss_owned_batch(const moss_owned_batch &) = delete; + moss_owned_batch & operator=(const moss_owned_batch &) = delete; + + moss_owned_batch(moss_owned_batch && other) noexcept + : batch(other.batch), + token_audio(std::move(other.token_audio)) { + other.batch = {}; + refresh_token_audio_ptr(); + } + + moss_owned_batch & operator=(moss_owned_batch && other) noexcept { + if (this != &other) { + release(); + batch = other.batch; + token_audio = std::move(other.token_audio); + other.batch = {}; + refresh_token_audio_ptr(); + } + return *this; + } + + void refresh_token_audio_ptr() { + batch.token_audio = token_audio.empty() ? 
nullptr : token_audio.data(); + } + + void release() { + if (!token_audio.empty() && batch.token_audio == token_audio.data()) { + batch.token_audio = nullptr; + batch.n_token_audio = 0; + } + llama_batch_free(batch); + batch = {}; + token_audio.clear(); + } +}; + +static void print_usage(int argc, char ** argv) { + (void) argc; + LOG("\nexample usage:\n"); + LOG(" %s -m model.gguf --print-delay-config\n", argv[0]); + LOG(" %s -m model.gguf --generation-input generation.input.bin -ngl -1\n", argv[0]); + LOG(" %s --decode-parity-ref decode.ref.bin\n", argv[0]); + LOG("\noptions:\n"); + LOG(" -ngl, --gpu-layers, --n-gpu-layers N number of layers to offload to GPU (default: -1)\n"); + LOG("\n"); +} + +template +static void moss_read_exact(std::ifstream & in, T * data, size_t count, const char * what) { + in.read(reinterpret_cast(data), sizeof(T) * count); + if (!in) { + throw std::runtime_error(std::string("failed to read ") + what); + } +} + +template +static void moss_write_exact(std::ofstream & out, const T * data, size_t count, const char * what) { + out.write(reinterpret_cast(data), sizeof(T) * count); + if (!out) { + throw std::runtime_error(std::string("failed to write ") + what); + } +} + +static std::string moss_shell_quote(const std::string & value) { + std::string out = "'"; + for (char c : value) { + if (c == '\'') { + out += "'\\''"; + } else { + out += c; + } + } + out += "'"; + return out; +} + +static bool parse_meta_i64(const llama_model * model, const char * key, int64_t & out) { + char buf[128]; + const int32_t n = llama_model_meta_val_str(model, key, buf, sizeof(buf)); + if (n <= 0) { + return false; + } + + char * end = nullptr; + const long long val = std::strtoll(buf, &end, 10); + if (end == buf || *end != '\0') { + return false; + } + out = val; + return true; +} + +static bool parse_meta_u32(const llama_model * model, const char * key, uint32_t & out) { + int64_t tmp = 0; + if (!parse_meta_i64(model, key, tmp) || tmp < 0 || tmp > 
std::numeric_limits::max()) { + return false; + } + out = static_cast(tmp); + return true; +} + +static bool parse_meta_token(const llama_model * model, const char * key, llama_token & out) { + int64_t tmp = 0; + if (!parse_meta_i64(model, key, tmp) || tmp < std::numeric_limits::min() || tmp > std::numeric_limits::max()) { + return false; + } + out = static_cast(tmp); + return true; +} + +static moss_delay_config moss_delay_config_from_model(const llama_model * model) { + moss_delay_config cfg; + + parse_meta_u32(model, "moss-tts-delay.n_vq", cfg.n_vq); + parse_meta_u32(model, "moss-tts-delay.audio_vocab_size", cfg.audio_vocab_size); + parse_meta_token(model, "moss-tts-delay.audio_pad_code", cfg.audio_pad_code); + parse_meta_token(model, "moss-tts-delay.pad_token_id", cfg.pad_token_id); + parse_meta_token(model, "moss-tts-delay.im_start_token_id", cfg.im_start_token_id); + parse_meta_token(model, "moss-tts-delay.im_end_token_id", cfg.im_end_token_id); + parse_meta_token(model, "moss-tts-delay.audio_start_token_id", cfg.audio_start_token_id); + parse_meta_token(model, "moss-tts-delay.audio_end_token_id", cfg.audio_end_token_id); + parse_meta_token(model, "moss-tts-delay.audio_user_slot_token_id", cfg.audio_user_slot_token_id); + parse_meta_token(model, "moss-tts-delay.audio_gen_slot_token_id", cfg.audio_assistant_gen_slot_token_id); + parse_meta_token(model, "moss-tts-delay.audio_delay_slot_token_id", cfg.audio_assistant_delay_slot_token_id); + + return cfg; +} + +static size_t moss_audio_vocab_with_pad(const moss_delay_config & cfg) { + return std::max(cfg.audio_vocab_size + 1u, (size_t) cfg.audio_pad_code + 1u); +} + +static int64_t moss_find_last_equal(const std::vector & values, llama_token target) { + for (int64_t i = (int64_t) values.size() - 1; i >= 0; --i) { + if (values[(size_t) i] == target) { + return i; + } + } + return -1; +} + +static moss_delay_state moss_init_delay_state( + const std::vector & packed_input_ids, + const moss_delay_config & cfg) { + 
GGML_ASSERT(cfg.n_vq > 0); + GGML_ASSERT(packed_input_ids.size() % cfg.packed_stride() == 0); + + moss_delay_state state; + state.n_vq = cfg.n_vq; + + const size_t seq_len = packed_input_ids.size() / cfg.packed_stride(); + state.text_history.resize(seq_len); + state.reserve_audio_frames(std::max(seq_len + 1024, 256)); + + for (size_t t = 0; t < seq_len; ++t) { + const size_t row = t * cfg.packed_stride(); + state.text_history[t] = packed_input_ids[row]; + state.audio_history.insert( + state.audio_history.end(), + packed_input_ids.begin() + row + 1, + packed_input_ids.begin() + row + 1 + cfg.n_vq); + } + + if (!state.text_history.empty()) { + const llama_token last_text_token = state.text_history.back(); + const bool is_continuation = + last_text_token == cfg.audio_start_token_id || + last_text_token == cfg.audio_assistant_gen_slot_token_id; + if (is_continuation) { + const int64_t audio_start_idx = moss_find_last_equal(state.text_history, cfg.audio_start_token_id); + if (audio_start_idx >= 0) { + state.audio_length = (int32_t) (seq_len - (size_t) audio_start_idx); + state.is_audio = true; + } + } + } + + return state; +} + +static void moss_apply_top_p_inplace(std::vector & logits, size_t n_rows, size_t n_vocab, float top_p) { + if (top_p >= 1.0f) { + return; + } + + for (size_t row = 0; row < n_rows; ++row) { + float max_logit = MOSS_NEG_INF; + for (size_t col = 0; col < n_vocab; ++col) { + max_logit = std::max(max_logit, logits[row * n_vocab + col]); + } + + if (!std::isfinite(max_logit)) { + continue; + } + + std::vector probs(n_vocab, 0.0f); + float sum_exp = 0.0f; + for (size_t col = 0; col < n_vocab; ++col) { + const float logit = logits[row * n_vocab + col]; + if (std::isfinite(logit)) { + probs[col] = std::exp(logit - max_logit); + sum_exp += probs[col]; + } + } + + if (!(sum_exp > 0.0f) || !std::isfinite(sum_exp)) { + continue; + } + + for (float & p : probs) { + p /= sum_exp; + } + + std::vector sorted_idx(n_vocab); + std::iota(sorted_idx.begin(), 
sorted_idx.end(), 0); + std::sort(sorted_idx.begin(), sorted_idx.end(), [&](size_t a, size_t b) { + return probs[a] > probs[b]; + }); + + float cum_probs = 0.0f; + bool prev_remove = false; + for (size_t rank = 0; rank < n_vocab; ++rank) { + const size_t idx = sorted_idx[rank]; + cum_probs += probs[idx]; + + bool remove = cum_probs > top_p; + if (rank > 0) { + remove = prev_remove; + } else { + remove = false; + } + prev_remove = cum_probs > top_p; + + if (remove) { + logits[row * n_vocab + idx] = MOSS_NEG_INF; + } + } + } +} + +static void moss_apply_repetition_penalty_inplace( + std::vector & logits, + size_t n_rows, + size_t n_vocab, + const std::vector * prev_tokens, + float penalty) { + if (penalty == 1.0f || prev_tokens == nullptr || prev_tokens->empty()) { + return; + } + + std::vector seen(n_vocab, 0); + for (llama_token tok : *prev_tokens) { + if (tok >= 0 && (size_t) tok < n_vocab) { + seen[(size_t) tok] = 1; + } + } + + for (size_t col = 0; col < n_vocab; ++col) { + if (!seen[col]) { + continue; + } + for (size_t row = 0; row < n_rows; ++row) { + float & logit = logits[row * n_vocab + col]; + if (logit > 0.0f) { + logit /= penalty; + } else { + logit *= penalty; + } + } + } +} + +static llama_token moss_argmax_row(const std::vector & logits, size_t row, size_t n_vocab) { + size_t best_idx = 0; + float best_val = logits[row * n_vocab + 0]; + for (size_t col = 1; col < n_vocab; ++col) { + const float cur = logits[row * n_vocab + col]; + if (cur > best_val) { + best_val = cur; + best_idx = col; + } + } + return (llama_token) best_idx; +} + +static llama_token moss_multinomial_row( + const std::vector & probs, + size_t row, + size_t n_vocab, + moss_rng & rng) { + const float * row_probs = probs.data() + row * n_vocab; + std::uniform_real_distribution dist(0.0f, 1.0f); + const float r = dist(rng); + + float cum = 0.0f; + size_t idx = 0; + for (; idx < n_vocab; ++idx) { + cum += row_probs[idx]; + if (!(cum < r)) { + break; + } + } + if (idx >= n_vocab) { + idx 
= n_vocab - 1; + } + return (llama_token) idx; +} + +static std::vector moss_softmax(const std::vector & logits, size_t n_rows, size_t n_vocab) { + std::vector probs(n_rows * n_vocab, 0.0f); + + for (size_t row = 0; row < n_rows; ++row) { + float max_logit = MOSS_NEG_INF; + for (size_t col = 0; col < n_vocab; ++col) { + max_logit = std::max(max_logit, logits[row * n_vocab + col]); + } + + if (!std::isfinite(max_logit)) { + probs[row * n_vocab + 0] = 1.0f; + continue; + } + + float sum_exp = 0.0f; + for (size_t col = 0; col < n_vocab; ++col) { + const float logit = logits[row * n_vocab + col]; + if (std::isfinite(logit)) { + probs[row * n_vocab + col] = std::exp(logit - max_logit); + sum_exp += probs[row * n_vocab + col]; + } + } + + if (!(sum_exp > 0.0f) || !std::isfinite(sum_exp)) { + probs[row * n_vocab + 0] = 1.0f; + continue; + } + + for (size_t col = 0; col < n_vocab; ++col) { + probs[row * n_vocab + col] /= sum_exp; + } + } + + return probs; +} + +static std::vector moss_sample_token( + const std::vector & logits_in, + size_t n_rows, + size_t n_vocab, + moss_rng & rng, + const std::vector * prev_tokens = nullptr, + float repetition_penalty = 1.0f, + float top_p = 1.0f, + int32_t top_k = 0, + bool do_sample = true) { + GGML_ASSERT(logits_in.size() == n_rows * n_vocab); + + std::vector logits = logits_in; + moss_apply_repetition_penalty_inplace(logits, n_rows, n_vocab, prev_tokens, repetition_penalty); + + std::vector tokens(n_rows, 0); + if (!do_sample) { + for (size_t row = 0; row < n_rows; ++row) { + tokens[row] = moss_argmax_row(logits, row, n_vocab); + } + return tokens; + } + + if (top_k > 0) { + const size_t k = std::min((size_t) top_k, n_vocab); + for (size_t row = 0; row < n_rows; ++row) { + std::vector top_idx(n_vocab); + std::iota(top_idx.begin(), top_idx.end(), 0); + std::nth_element(top_idx.begin(), top_idx.end() - k, top_idx.end(), [&](size_t a, size_t b) { + return logits[row * n_vocab + a] < logits[row * n_vocab + b]; + }); + 
top_idx.erase(top_idx.begin(), top_idx.end() - k); + + std::vector top_vals(k); + for (size_t i = 0; i < k; ++i) { + top_vals[i] = logits[row * n_vocab + top_idx[i]]; + } + + if (top_p < 1.0f) { + moss_apply_top_p_inplace(top_vals, 1, k, top_p); + } + + const std::vector probs = moss_softmax(top_vals, 1, k); + const llama_token local = moss_multinomial_row(probs, 0, k, rng); + tokens[row] = (llama_token) top_idx[(size_t) local]; + } + return tokens; + } + + if (top_p < 1.0f) { + moss_apply_top_p_inplace(logits, n_rows, n_vocab, top_p); + } + const std::vector probs = moss_softmax(logits, n_rows, n_vocab); + for (size_t row = 0; row < n_rows; ++row) { + tokens[row] = moss_multinomial_row(probs, row, n_vocab, rng); + } + + return tokens; +} + +static std::vector moss_collect_audio_history_channels( + const moss_delay_state & state, + const std::vector & channels) { + if (channels.empty() || state.empty_audio()) { + return {}; + } + + std::vector out; + out.reserve(state.audio_frames() * channels.size()); + for (size_t frame = 0; frame < state.audio_frames(); ++frame) { + const llama_token * audio = state.audio_frame_ptr(frame); + for (size_t channel : channels) { + out.push_back(audio[channel]); + } + } + return out; +} + +static std::vector moss_delay_step( + moss_delay_state & state, + const std::vector & text_logits, + const std::vector & audio_logits, + const moss_sampling_config & sampling_cfg, + const moss_delay_config & cfg, + moss_rng & rng) { + GGML_ASSERT(cfg.n_vq == state.n_vq); + + const size_t n_vq = cfg.n_vq; + const size_t text_vocab = text_logits.size(); + const size_t audio_vocab = moss_audio_vocab_with_pad(cfg); + GGML_ASSERT(audio_logits.size() == n_vq * audio_vocab); + + std::vector result(cfg.packed_stride(), cfg.audio_pad_code); + if (state.is_stopping) { + result[0] = cfg.pad_token_id; + return result; + } + + llama_token next_text = cfg.pad_token_id; + + if (state.delayed_length < (int64_t) n_vq) { + next_text = 
cfg.audio_assistant_delay_slot_token_id; + } else if (state.delayed_length == (int64_t) n_vq) { + next_text = cfg.audio_end_token_id; + state.is_audio = false; + } else { + std::vector scaled = text_logits; + const float text_temp = sampling_cfg.text_temperature > 0.0f ? sampling_cfg.text_temperature : 1.0f; + const bool text_do_sample = sampling_cfg.text_temperature > 0.0f; + for (float & v : scaled) { + v /= text_temp; + } + + if (!state.is_audio) { + const llama_token excluded[] = { + cfg.pad_token_id, + cfg.audio_assistant_gen_slot_token_id, + cfg.audio_assistant_delay_slot_token_id, + cfg.audio_end_token_id, + }; + for (llama_token tok : excluded) { + if (tok >= 0 && (size_t) tok < text_vocab) { + scaled[(size_t) tok] = MOSS_NEG_INF; + } + } + } else { + std::fill(scaled.begin(), scaled.end(), MOSS_NEG_INF); + if ((size_t) cfg.audio_assistant_gen_slot_token_id < text_vocab) { + scaled[(size_t) cfg.audio_assistant_gen_slot_token_id] = + text_logits[(size_t) cfg.audio_assistant_gen_slot_token_id] / text_temp; + } + if ((size_t) cfg.audio_assistant_delay_slot_token_id < text_vocab) { + scaled[(size_t) cfg.audio_assistant_delay_slot_token_id] = + text_logits[(size_t) cfg.audio_assistant_delay_slot_token_id] / text_temp; + } + } + + if (state.time_step == 0 && (size_t) cfg.audio_assistant_delay_slot_token_id < text_vocab) { + scaled[(size_t) cfg.audio_assistant_delay_slot_token_id] = MOSS_NEG_INF; + } + if (state.time_step <= (int32_t) n_vq && (size_t) cfg.im_end_token_id < text_vocab) { + scaled[(size_t) cfg.im_end_token_id] = MOSS_NEG_INF; + } + + next_text = moss_sample_token( + scaled, 1, text_vocab, rng, nullptr, 1.0f, + sampling_cfg.text_top_p, sampling_cfg.text_top_k, text_do_sample)[0]; + } + + if (next_text == cfg.audio_start_token_id) { + state.is_audio = true; + } + if (next_text == cfg.im_end_token_id) { + state.is_stopping = true; + } + + std::vector next_audio(n_vq, cfg.audio_pad_code); + bool any_sampling = false; + for (size_t channel = 0; channel < 
n_vq; ++channel) { + const bool pre_audio = channel < (size_t) std::max(state.audio_length, 0); + const bool post_audio = state.delayed_length == MOSS_DELAY_INT64_MAX || + channel > (size_t) std::max(state.delayed_length - 1, -1); + any_sampling = any_sampling || (pre_audio && post_audio); + } + + if (any_sampling) { + std::vector scaled_audio = audio_logits; + const float audio_temp = sampling_cfg.audio_temperature > 0.0f ? sampling_cfg.audio_temperature : 1.0f; + const bool audio_do_sample = sampling_cfg.audio_temperature > 0.0f; + for (float & v : scaled_audio) { + v /= audio_temp; + } + if ((size_t) cfg.audio_pad_code < audio_vocab) { + for (size_t channel = 0; channel < n_vq; ++channel) { + scaled_audio[channel * audio_vocab + (size_t) cfg.audio_pad_code] = MOSS_NEG_INF; + } + } + + const bool sample_ch0 = + 0 < (size_t) std::max(state.audio_length, 0) && + (state.delayed_length == MOSS_DELAY_INT64_MAX || + 0 > std::max(state.delayed_length - 1, -1)); + if (sample_ch0) { + const std::vector ch0 = {0}; + const std::vector prev = moss_collect_audio_history_channels(state, ch0); + const std::vector ch0_logits(scaled_audio.begin(), scaled_audio.begin() + audio_vocab); + next_audio[0] = moss_sample_token( + ch0_logits, 1, audio_vocab, rng, &prev, + sampling_cfg.audio_repetition_penalty, + sampling_cfg.audio_top_p, + sampling_cfg.audio_top_k, + audio_do_sample)[0]; + } + + std::vector rest_channels; + for (size_t channel = 1; channel < n_vq; ++channel) { + const bool pre_audio = channel < (size_t) std::max(state.audio_length, 0); + const bool post_audio = state.delayed_length == MOSS_DELAY_INT64_MAX || + channel > (size_t) std::max(state.delayed_length - 1, -1); + if (pre_audio && post_audio) { + rest_channels.push_back(channel); + } + } + + if (!rest_channels.empty()) { + std::vector rest_logits(rest_channels.size() * audio_vocab); + for (size_t i = 0; i < rest_channels.size(); ++i) { + const size_t channel = rest_channels[i]; + std::copy_n( + scaled_audio.begin() 
+ channel * audio_vocab, + audio_vocab, + rest_logits.begin() + i * audio_vocab); + } + const std::vector prev = moss_collect_audio_history_channels(state, rest_channels); + const std::vector sampled = moss_sample_token( + rest_logits, rest_channels.size(), audio_vocab, rng, &prev, + sampling_cfg.audio_repetition_penalty, + sampling_cfg.audio_top_p, + sampling_cfg.audio_top_k, + audio_do_sample); + for (size_t i = 0; i < rest_channels.size(); ++i) { + next_audio[rest_channels[i]] = sampled[i]; + } + } + } + + if (next_text == cfg.audio_start_token_id || + next_text == cfg.audio_assistant_gen_slot_token_id || + next_text == cfg.audio_assistant_delay_slot_token_id) { + state.audio_length += 1; + } + if (next_text == cfg.audio_end_token_id) { + state.audio_length = 0; + } + + if (state.delayed_length == MOSS_DELAY_INT64_MAX && next_text == cfg.audio_assistant_delay_slot_token_id) { + state.delayed_length = 0; + } + if (state.delayed_length != MOSS_DELAY_INT64_MAX) { + state.delayed_length += 1; + } + if (state.delayed_length > (int64_t) n_vq) { + state.delayed_length = MOSS_DELAY_INT64_MAX; + } + state.time_step += 1; + state.text_history.push_back(next_text); + state.append_audio(next_audio); + + result[0] = next_text; + std::copy(next_audio.begin(), next_audio.end(), result.begin() + 1); + return result; +} + +static std::vector moss_apply_delay_pattern( + const std::vector & codes, + size_t n_frames, + const moss_delay_config & cfg) { + GGML_ASSERT(cfg.n_vq > 0); + GGML_ASSERT(codes.size() == n_frames * cfg.n_vq); + + const size_t delayed_frames = n_frames + cfg.n_vq - 1; + std::vector delayed(delayed_frames * cfg.n_vq, cfg.audio_pad_code); + + for (size_t channel = 0; channel < cfg.n_vq; ++channel) { + for (size_t t = 0; t < n_frames; ++t) { + delayed[(channel + t) * cfg.n_vq + channel] = codes[t * cfg.n_vq + channel]; + } + } + + return delayed; +} + +static std::vector moss_apply_de_delay_pattern( + const std::vector & delayed_codes, + size_t delayed_frames, + 
const moss_delay_config & cfg, + size_t * out_frames = nullptr) { + GGML_ASSERT(cfg.n_vq > 0); + GGML_ASSERT(delayed_codes.size() == delayed_frames * cfg.n_vq); + + if (delayed_frames + 1 <= cfg.n_vq) { + if (out_frames != nullptr) { + *out_frames = 0; + } + return {}; + } + + const size_t n_frames = delayed_frames - cfg.n_vq + 1; + std::vector codes(n_frames * cfg.n_vq); + for (size_t channel = 0; channel < cfg.n_vq; ++channel) { + for (size_t t = 0; t < n_frames; ++t) { + codes[t * cfg.n_vq + channel] = delayed_codes[(channel + t) * cfg.n_vq + channel]; + } + } + + if (out_frames != nullptr) { + *out_frames = n_frames; + } + + return codes; +} + +static std::vector moss_extract_audio_segments( + const std::vector & generation_audio, + size_t delayed_frames, + const moss_delay_config & cfg) { + size_t n_frames = 0; + const std::vector codes = moss_apply_de_delay_pattern(generation_audio, delayed_frames, cfg, &n_frames); + if (n_frames == 0) { + return {}; + } + + std::vector segments; + size_t cur_start = SIZE_MAX; + + for (size_t t = 0; t < n_frames; ++t) { + bool is_pad = true; + for (size_t channel = 0; channel < cfg.n_vq; ++channel) { + if (codes[t * cfg.n_vq + channel] != cfg.audio_pad_code) { + is_pad = false; + break; + } + } + + if (!is_pad && cur_start == SIZE_MAX) { + cur_start = t; + } + + const bool close_segment = cur_start != SIZE_MAX && (is_pad || t + 1 == n_frames); + if (close_segment) { + const size_t cur_end = is_pad ? 
t : t + 1; + moss_audio_segment seg; + seg.n_frames = cur_end - cur_start; + seg.codes.insert( + seg.codes.end(), + codes.begin() + cur_start * cfg.n_vq, + codes.begin() + cur_end * cfg.n_vq); + segments.push_back(std::move(seg)); + cur_start = SIZE_MAX; + } + } + + return segments; +} + +static std::vector moss_concat_audio_segments( + const std::vector & segments, + size_t n_vq, + size_t * out_frames = nullptr) { + size_t total_frames = 0; + size_t total_tokens = 0; + for (const auto & seg : segments) { + total_frames += seg.n_frames; + total_tokens += seg.codes.size(); + } + + std::vector out; + out.reserve(total_tokens); + for (const auto & seg : segments) { + GGML_ASSERT(seg.codes.size() == seg.n_frames * n_vq); + out.insert(out.end(), seg.codes.begin(), seg.codes.end()); + } + + if (out_frames != nullptr) { + *out_frames = total_frames; + } + return out; +} + +static void moss_write_codes_file( + const std::string & path, + const std::vector & raw_codes, + size_t raw_frames, + const moss_delay_config & cfg) { + GGML_ASSERT(raw_codes.size() == raw_frames * cfg.n_vq); + + std::ofstream out(path, std::ios::binary); + if (!out) { + throw std::runtime_error("failed to open codes file for writing: " + path); + } + + moss_codes_header hdr; + hdr.n_frames = (uint32_t) raw_frames; + hdr.n_vq = cfg.n_vq; + + moss_write_exact(out, &hdr, 1, "codes header"); + moss_write_exact(out, raw_codes.data(), raw_codes.size(), "codes payload"); +} + +static int moss_run_audio_decoder_helper( + const std::string & python_bin, + const std::string & helper_script, + const std::string & codes_path, + const std::string & wav_path, + const std::string & encoder_onnx, + const std::string & decoder_onnx, + bool use_gpu_audio) { + std::ostringstream cmd; + cmd + << moss_shell_quote(python_bin) << " " + << moss_shell_quote(helper_script) + << " --codes-bin " << moss_shell_quote(codes_path) + << " --wav-out " << moss_shell_quote(wav_path) + << " --encoder-onnx " << 
moss_shell_quote(encoder_onnx) + << " --decoder-onnx " << moss_shell_quote(decoder_onnx); + if (!use_gpu_audio) { + cmd << " --cpu"; + } + + LOG("running audio decoder helper: %s\n", cmd.str().c_str()); + return std::system(cmd.str().c_str()); +} + +static bool moss_decode_parity( + const std::string & ref_path, + const std::string & dump_codes_path, + const std::string & python_bin, + const std::string & helper_script, + const std::string & encoder_onnx, + const std::string & decoder_onnx, + const std::string & wav_out, + bool use_gpu_audio) { + std::ifstream in(ref_path, std::ios::binary); + if (!in) { + throw std::runtime_error("failed to open decode parity reference: " + ref_path); + } + + moss_decode_ref_header hdr; + moss_read_exact(in, &hdr, 1, "decode parity header"); + if (hdr.magic != MOSS_DECODE_REF_MAGIC || hdr.version != MOSS_DECODE_REF_VERSION) { + throw std::runtime_error("unexpected decode parity reference format"); + } + + moss_delay_config cfg; + cfg.n_vq = hdr.n_vq; + cfg.audio_pad_code = (llama_token) hdr.audio_pad_code; + + std::vector packed_ids((size_t) hdr.packed_frames * cfg.packed_stride()); + std::vector ref_raw_codes((size_t) hdr.raw_frames * cfg.n_vq); + moss_read_exact(in, packed_ids.data(), packed_ids.size(), "packed ids"); + moss_read_exact(in, ref_raw_codes.data(), ref_raw_codes.size(), "reference raw codes"); + + const moss_generation_audio decoded = moss_decode_generation_audio(packed_ids, hdr.prompt_frames, cfg); + + size_t mismatch_count = 0; + const size_t compare_count = std::min(decoded.raw_codes.size(), ref_raw_codes.size()); + for (size_t i = 0; i < compare_count; ++i) { + if (decoded.raw_codes[i] != ref_raw_codes[i]) { + ++mismatch_count; + } + } + mismatch_count += decoded.raw_codes.size() > ref_raw_codes.size() + ? 
decoded.raw_codes.size() - ref_raw_codes.size() + : ref_raw_codes.size() - decoded.raw_codes.size(); + + LOG("moss-tts delay decode parity: prompt_frames=%u delayed_frames=%zu raw_frames=%zu ref_raw_frames=%u mismatch_count=%zu segments=%zu\n", + hdr.prompt_frames, + decoded.delayed_frames, + decoded.raw_frames, + hdr.raw_frames, + mismatch_count, + decoded.segments.size()); + + if (!dump_codes_path.empty()) { + moss_write_codes_file(dump_codes_path, decoded.raw_codes, decoded.raw_frames, cfg); + } + + if (!helper_script.empty()) { + if (dump_codes_path.empty()) { + throw std::runtime_error("--audio-decoder-script requires --dump-raw-codes"); + } + if (wav_out.empty()) { + throw std::runtime_error("--audio-decoder-script requires --wav-out"); + } + if (encoder_onnx.empty() || decoder_onnx.empty()) { + throw std::runtime_error("--audio-decoder-script requires both --audio-encoder-onnx and --audio-decoder-onnx"); + } + + const int rc = moss_run_audio_decoder_helper( + python_bin, helper_script, dump_codes_path, wav_out, + encoder_onnx, decoder_onnx, use_gpu_audio); + if (rc != 0) { + throw std::runtime_error("audio decoder helper failed with exit code " + std::to_string(rc)); + } + } + + return mismatch_count == 0; +} + +static moss_owned_batch moss_batch_from_packed_rows( + const std::vector & packed_ids, + size_t start_frame, + size_t n_frames, + const moss_delay_config & cfg, + size_t pos_start, + bool output_last) { + GGML_ASSERT(cfg.n_vq > 0); + GGML_ASSERT(packed_ids.size() % cfg.packed_stride() == 0); + GGML_ASSERT(start_frame + n_frames <= packed_ids.size() / cfg.packed_stride()); + + moss_owned_batch owned_batch((int32_t) n_frames, 0, 1); + llama_batch & batch = owned_batch.batch; + batch.n_tokens = (int32_t) n_frames; + batch.n_token_audio = (int32_t) cfg.n_vq; + owned_batch.token_audio.resize(n_frames * cfg.n_vq); + owned_batch.refresh_token_audio_ptr(); + + for (size_t i = 0; i < n_frames; ++i) { + const size_t row = (start_frame + i) * 
cfg.packed_stride(); + batch.token[i] = packed_ids[row + 0]; + std::memcpy( + batch.token_audio + i * cfg.n_vq, + packed_ids.data() + row + 1, + sizeof(llama_token) * cfg.n_vq); + batch.pos[i] = (llama_pos) (pos_start + i); + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = 0; + batch.logits[i] = output_last && (i + 1 == n_frames); + } + + return owned_batch; +} + +static void moss_generate_from_ref( + const std::string & model_path, + const std::string & ref_path, + int32_t n_gpu_layers, + int32_t max_new_tokens, + const moss_sampling_config & sampling_cfg, + uint32_t seed, + const std::string & dump_raw_codes_path, + const std::string & python_bin, + const std::string & helper_script, + const std::string & encoder_onnx, + const std::string & decoder_onnx, + const std::string & wav_out, + bool use_gpu_audio) { + std::ifstream in(ref_path, std::ios::binary); + if (!in) { + throw std::runtime_error("failed to open generation reference: " + ref_path); + } + + moss_generation_ref_header hdr; + moss_read_exact(in, &hdr, 1, "generation reference header"); + if (hdr.magic != MOSS_GEN_REF_MAGIC || hdr.version != MOSS_GEN_REF_VERSION) { + throw std::runtime_error("unexpected generation reference format"); + } + + moss_delay_config cfg; + cfg.n_vq = hdr.n_vq; + cfg.audio_pad_code = (llama_token) hdr.audio_pad_code; + + std::vector prompt_packed((size_t) hdr.prompt_packed_frames * cfg.packed_stride()); + std::vector ignored_ref_raw_codes((size_t) hdr.raw_frames * cfg.n_vq); + moss_read_exact(in, prompt_packed.data(), prompt_packed.size(), "prompt packed ids"); + moss_read_exact(in, ignored_ref_raw_codes.data(), ignored_ref_raw_codes.size(), "reference raw codes"); + + llama_backend_scope backend_scope; + + llama_model_params mparams = llama_model_default_params(); + mparams.use_mmap = true; + mparams.n_gpu_layers = n_gpu_layers; + + llama_model_ptr model(llama_model_load_from_file(model_path.c_str(), mparams)); + if (!model) { + throw std::runtime_error("failed to load model: " 
+ model_path); + } + + const llama_vocab * vocab = llama_model_get_vocab(model.get()); + const int32_t text_vocab = llama_vocab_n_tokens(vocab); + const moss_delay_config model_cfg = moss_delay_config_from_model(model.get()); + + if (model_cfg.n_vq != cfg.n_vq) { + throw std::runtime_error("generation reference n_vq does not match model metadata"); + } + cfg.audio_vocab_size = model_cfg.audio_vocab_size; + + llama_context_params cparams = llama_context_default_params(); + cparams.n_ctx = std::max((uint32_t) hdr.prompt_frames + (uint32_t) max_new_tokens + 8u, 64u); + cparams.n_batch = std::max((uint32_t) hdr.prompt_frames, 1u); + cparams.n_ubatch = cparams.n_batch; + cparams.n_seq_max = 1; + cparams.embeddings = false; + + llama_context_ptr ctx(llama_init_from_model(model.get(), cparams)); + if (!ctx) { + throw std::runtime_error("failed to create context"); + } + + llama_set_warmup(ctx.get(), false); + llama_set_causal_attn(ctx.get(), true); + llama_set_embeddings(ctx.get(), false); + + { + moss_owned_batch batch = moss_batch_from_packed_rows( + prompt_packed, 0, hdr.prompt_frames, cfg, 0, true); + const int ret = llama_decode(ctx.get(), batch.batch); + if (ret != 0) { + throw std::runtime_error("prefill llama_decode failed: " + std::to_string(ret)); + } + } + + moss_delay_state state = moss_init_delay_state(prompt_packed, cfg); + + std::vector generated_packed; + generated_packed.reserve((size_t) max_new_tokens * cfg.packed_stride()); + + const size_t audio_vocab = moss_audio_vocab_with_pad(cfg); + moss_rng rng(seed); + + for (int32_t step = 0; step < max_new_tokens; ++step) { + const float * logits = llama_get_logits_ith(ctx.get(), -1); + if (logits == nullptr) { + throw std::runtime_error("llama_get_logits_ith returned null"); + } + + std::vector text_logits(logits, logits + text_vocab); + std::vector audio_logits( + logits + text_vocab, + logits + text_vocab + cfg.n_vq * audio_vocab); + + const std::vector next = moss_delay_step( + state, text_logits, 
audio_logits, sampling_cfg, cfg, rng); + generated_packed.insert(generated_packed.end(), next.begin(), next.end()); + + moss_owned_batch batch = moss_batch_from_packed_rows( + generated_packed, generated_packed.size() / cfg.packed_stride() - 1, 1, cfg, + hdr.prompt_frames + (size_t) step, true); + const int ret = llama_decode(ctx.get(), batch.batch); + if (ret != 0) { + throw std::runtime_error("generation llama_decode failed: " + std::to_string(ret)); + } + + if (state.is_stopping) { + break; + } + } + + const moss_generation_audio decoded = moss_decode_generation_audio(state, hdr.prompt_frames, cfg); + + LOG("moss-tts first-class generation: prompt_frames=%u generated_frames=%zu raw_frames=%zu input_ref_raw_frames=%u\n", + hdr.prompt_frames, + generated_packed.size() / cfg.packed_stride(), + decoded.raw_frames, + hdr.raw_frames); + + if (!dump_raw_codes_path.empty()) { + moss_write_codes_file(dump_raw_codes_path, decoded.raw_codes, decoded.raw_frames, cfg); + } + + if (!helper_script.empty()) { + if (dump_raw_codes_path.empty()) { + throw std::runtime_error("--audio-decoder-script requires --dump-raw-codes"); + } + if (wav_out.empty()) { + throw std::runtime_error("--audio-decoder-script requires --wav-out"); + } + if (encoder_onnx.empty() || decoder_onnx.empty()) { + throw std::runtime_error("--audio-decoder-script requires both ONNX paths"); + } + + const int rc = moss_run_audio_decoder_helper( + python_bin, helper_script, dump_raw_codes_path, wav_out, + encoder_onnx, decoder_onnx, use_gpu_audio); + if (rc != 0) { + throw std::runtime_error("audio decoder helper failed with exit code " + std::to_string(rc)); + } + } +} + +static std::vector moss_audio_history_slice( + const moss_delay_state & state, + size_t start_frame, + size_t * out_frames = nullptr) { + const size_t total_frames = state.audio_frames(); + if (start_frame >= total_frames) { + if (out_frames != nullptr) { + *out_frames = 0; + } + return {}; + } + + const size_t n_frames = total_frames - 
start_frame; + std::vector out; + out.reserve(n_frames * state.n_vq); + out.insert( + out.end(), + state.audio_history.begin() + start_frame * state.n_vq, + state.audio_history.end()); + + if (out_frames != nullptr) { + *out_frames = n_frames; + } + + return out; +} + +static moss_generation_audio moss_decode_generation_audio( + const moss_delay_state & state, + size_t prompt_frames, + const moss_delay_config & cfg) { + GGML_ASSERT(state.n_vq == cfg.n_vq); + + moss_generation_audio out; + out.delayed_codes = moss_audio_history_slice(state, prompt_frames, &out.delayed_frames); + if (out.delayed_frames == 0) { + return out; + } + + out.segments = moss_extract_audio_segments(out.delayed_codes, out.delayed_frames, cfg); + out.raw_codes = moss_concat_audio_segments(out.segments, cfg.n_vq, &out.raw_frames); + return out; +} + +static moss_generation_audio moss_decode_generation_audio( + const std::vector & packed_ids, + size_t prompt_frames, + const moss_delay_config & cfg) { + GGML_ASSERT(cfg.n_vq > 0); + GGML_ASSERT(packed_ids.size() % cfg.packed_stride() == 0); + + const size_t total_frames = packed_ids.size() / cfg.packed_stride(); + GGML_ASSERT(prompt_frames <= total_frames); + + moss_generation_audio out; + out.delayed_frames = total_frames - prompt_frames; + out.delayed_codes.reserve(out.delayed_frames * cfg.n_vq); + + for (size_t t = prompt_frames; t < total_frames; ++t) { + const size_t row = t * cfg.packed_stride(); + out.delayed_codes.insert( + out.delayed_codes.end(), + packed_ids.begin() + row + 1, + packed_ids.begin() + row + 1 + cfg.n_vq); + } + + if (out.delayed_frames == 0) { + return out; + } + + out.segments = moss_extract_audio_segments(out.delayed_codes, out.delayed_frames, cfg); + out.raw_codes = moss_concat_audio_segments(out.segments, cfg.n_vq, &out.raw_frames); + return out; +} + +static std::string moss_delay_config_to_string(const moss_delay_config & cfg) { + std::ostringstream oss; + oss + << "n_vq=" << cfg.n_vq + << " pad_token_id=" << 
cfg.pad_token_id + << " im_start_token_id=" << cfg.im_start_token_id + << " im_end_token_id=" << cfg.im_end_token_id + << " audio_start_token_id=" << cfg.audio_start_token_id + << " audio_end_token_id=" << cfg.audio_end_token_id + << " audio_user_slot_token_id=" << cfg.audio_user_slot_token_id + << " audio_gen_slot_token_id=" << cfg.audio_assistant_gen_slot_token_id + << " audio_delay_slot_token_id=" << cfg.audio_assistant_delay_slot_token_id + << " audio_pad_code=" << cfg.audio_pad_code + << " audio_vocab_size=" << cfg.audio_vocab_size; + return oss.str(); +} + +static bool moss_delay_self_test() { + moss_delay_config cfg; + + std::vector codes = { + 10, 11, 12, + 20, 21, 22, + 30, 31, 32, + }; + cfg.n_vq = 3; + cfg.audio_pad_code = 99; + + const std::vector delayed = moss_apply_delay_pattern(codes, 3, cfg); + const std::vector expected_delayed = { + 10, 99, 99, + 20, 11, 99, + 30, 21, 12, + 99, 31, 22, + 99, 99, 32, + }; + if (delayed != expected_delayed) { + return false; + } + + size_t dedelayed_frames = 0; + const std::vector restored = moss_apply_de_delay_pattern(delayed, 5, cfg, &dedelayed_frames); + if (dedelayed_frames != 3 || restored != codes) { + return false; + } + + std::vector packed = { + 1, 99, 99, 99, + cfg.audio_start_token_id, 10, 11, 12, + cfg.audio_assistant_gen_slot_token_id, 20, 21, 22, + }; + const moss_delay_state state = moss_init_delay_state(packed, cfg); + if (!(state.text_history.size() == 3 && + state.audio_frames() == 3 && + state.is_audio && + state.audio_length == 2 && + !state.is_stopping && + state.time_step == 0)) { + return false; + } + + { + std::vector logits = { + 3.0f, 2.0f, 1.0f, + 1.0f, 3.0f, 2.0f, + }; + std::vector prev = {1}; + moss_apply_repetition_penalty_inplace(logits, 2, 3, &prev, 2.0f); + if (std::fabs(logits[1] - 1.0f) > 1e-6f || std::fabs(logits[4] - 1.5f) > 1e-6f) { + return false; + } + } + + { + std::vector logits = {5.0f, 4.0f, 1.0f}; + moss_apply_top_p_inplace(logits, 1, 3, 0.7f); + if 
(!std::isfinite(logits[0]) || std::isfinite(logits[1]) || std::isfinite(logits[2])) { + return false; + } + } + + { + moss_rng rng(123); + const std::vector logits = { + 1.0f, 9.0f, 3.0f, + 2.0f, 1.0f, 8.0f, + }; + const std::vector sampled = moss_sample_token(logits, 2, 3, rng, nullptr, 1.0f, 1.0f, 1, true); + if (sampled.size() != 2 || sampled[0] != 1 || sampled[1] != 2) { + return false; + } + } + + { + moss_delay_state step_state; + step_state.n_vq = 3; + step_state.audio_length = 2; + step_state.is_audio = true; + step_state.text_history = {cfg.audio_start_token_id, cfg.audio_assistant_gen_slot_token_id}; + step_state.audio_history = { + 3, 4, cfg.audio_pad_code, + 5, 6, cfg.audio_pad_code, + }; + + const std::vector text_logits = { + 0.0f, 0.0f, 0.0f, 0.0f, 10.0f, 9.0f, 0.0f, 0.0f, + }; + moss_delay_config step_cfg = cfg; + step_cfg.pad_token_id = 0; + step_cfg.im_end_token_id = 1; + step_cfg.audio_start_token_id = 2; + step_cfg.audio_end_token_id = 3; + step_cfg.audio_assistant_gen_slot_token_id = 4; + step_cfg.audio_assistant_delay_slot_token_id = 5; + step_cfg.audio_pad_code = 7; + step_cfg.audio_vocab_size = 7; + + const std::vector audio_logits = { + 1.0f, 8.0f, 2.0f, 1.0f, 1.0f, 1.0f, 1.0f, -100.0f, + 2.0f, 1.0f, 9.0f, 1.0f, 1.0f, 1.0f, 1.0f, -100.0f, + 9.0f, 1.0f, 2.0f, 1.0f, 1.0f, 1.0f, 1.0f, -100.0f, + }; + moss_sampling_config sampling_cfg; + sampling_cfg.text_temperature = 1.0f; + sampling_cfg.text_top_k = 1; + sampling_cfg.audio_temperature = 1.0f; + sampling_cfg.audio_top_k = 1; + + moss_rng rng(7); + const std::vector next = moss_delay_step( + step_state, text_logits, audio_logits, sampling_cfg, step_cfg, rng); + if (next.size() != 4 || next[0] != 4 || next[1] != 1 || next[2] != 2 || next[3] != 7) { + return false; + } + } + + { + moss_delay_config decode_cfg = cfg; + decode_cfg.n_vq = 3; + decode_cfg.audio_pad_code = 99; + + const std::vector prompt_audio = { + 77, 99, 99, + 88, 66, 99, + }; + const std::vector raw_codes = { + 10, 11, 12, + 20, 
21, 22, + 30, 31, 32, + }; + const std::vector delayed = moss_apply_delay_pattern(raw_codes, 3, decode_cfg); + + moss_delay_state decode_state; + decode_state.n_vq = decode_cfg.n_vq; + decode_state.audio_history = prompt_audio; + decode_state.append_audio(delayed.data() + 0 * decode_cfg.n_vq); + decode_state.append_audio(delayed.data() + 1 * decode_cfg.n_vq); + decode_state.append_audio(delayed.data() + 2 * decode_cfg.n_vq); + decode_state.append_audio(delayed.data() + 3 * decode_cfg.n_vq); + decode_state.append_audio(delayed.data() + 4 * decode_cfg.n_vq); + + const moss_generation_audio decoded = moss_decode_generation_audio(decode_state, 2, decode_cfg); + if (decoded.delayed_frames != 5 || decoded.raw_frames != 3 || decoded.raw_codes != raw_codes) { + return false; + } + if (decoded.segments.size() != 1 || decoded.segments[0].n_frames != 3 || decoded.segments[0].codes != raw_codes) { + return false; + } + } + + { + moss_delay_config decode_cfg = cfg; + decode_cfg.n_vq = 3; + decode_cfg.audio_pad_code = 99; + + const std::vector raw_a = { + 10, 11, 12, + 20, 21, 22, + }; + const std::vector raw_b = { + 40, 41, 42, + }; + const std::vector delayed_a = moss_apply_delay_pattern(raw_a, 2, decode_cfg); + const std::vector delayed_b = moss_apply_delay_pattern(raw_b, 1, decode_cfg); + + std::vector packed = { + 100, 99, 99, 99, + 101, 99, 99, 99, + }; + auto append_delayed_rows = [&](llama_token text_token, const std::vector & delayed_rows, size_t n_frames) { + for (size_t t = 0; t < n_frames; ++t) { + packed.push_back(text_token); + packed.insert( + packed.end(), + delayed_rows.begin() + t * decode_cfg.n_vq, + delayed_rows.begin() + (t + 1) * decode_cfg.n_vq); + } + }; + append_delayed_rows(200, delayed_a, 4); + packed.push_back(201); + packed.insert(packed.end(), {99, 99, 99}); + append_delayed_rows(202, delayed_b, 3); + + const moss_generation_audio decoded = moss_decode_generation_audio(packed, 2, decode_cfg); + const std::vector raw_expected = { + 10, 11, 12, + 20, 
21, 22, + 40, 41, 42, + }; + if (decoded.segments.size() != 2 || decoded.raw_frames != 3 || decoded.raw_codes != raw_expected) { + return false; + } + if (decoded.segments[0].codes != raw_a || decoded.segments[1].codes != raw_b) { + return false; + } + } + + return true; +} + +} // namespace + +int main(int argc, char ** argv) { + std::string model_path; + std::string decode_parity_ref_path; + std::string generation_input_path; + std::string dump_raw_codes_path; + std::string audio_decoder_script; + std::string audio_encoder_onnx; + std::string audio_decoder_onnx; + std::string wav_out_path; + std::string python_bin = "python"; + bool print_delay_config = false; + bool self_test = false; + bool use_gpu_audio = true; + int32_t n_gpu_layers = -1; + int32_t max_new_tokens = 2048; + uint32_t seed = 1234; + moss_sampling_config sampling_cfg; + + for (int i = 1; i < argc; ++i) { + const std::string arg = argv[i]; + if ((arg == "-m" || arg == "--model") && i + 1 < argc) { + model_path = argv[++i]; + continue; + } + if (arg == "--generation-input" && i + 1 < argc) { + generation_input_path = argv[++i]; + continue; + } + if (arg == "--generation-ref" && i + 1 < argc) { + generation_input_path = argv[++i]; + LOG("warning: --generation-ref is deprecated; use --generation-input instead.\n"); + continue; + } + if (arg == "--decode-parity-ref" && i + 1 < argc) { + decode_parity_ref_path = argv[++i]; + continue; + } + if ((arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") && i + 1 < argc) { + n_gpu_layers = std::stoi(argv[++i]); + continue; + } + if (arg == "--max-new-tokens" && i + 1 < argc) { + max_new_tokens = std::stoi(argv[++i]); + continue; + } + if (arg == "--seed" && i + 1 < argc) { + seed = (uint32_t) std::stoul(argv[++i]); + continue; + } + if (arg == "--dump-raw-codes" && i + 1 < argc) { + dump_raw_codes_path = argv[++i]; + continue; + } + if (arg == "--audio-decoder-script" && i + 1 < argc) { + audio_decoder_script = argv[++i]; + continue; + } + if 
(arg == "--audio-encoder-onnx" && i + 1 < argc) { + audio_encoder_onnx = argv[++i]; + continue; + } + if (arg == "--audio-decoder-onnx" && i + 1 < argc) { + audio_decoder_onnx = argv[++i]; + continue; + } + if (arg == "--wav-out" && i + 1 < argc) { + wav_out_path = argv[++i]; + continue; + } + if (arg == "--python-bin" && i + 1 < argc) { + python_bin = argv[++i]; + continue; + } + if (arg == "--text-temperature" && i + 1 < argc) { + sampling_cfg.text_temperature = std::stof(argv[++i]); + continue; + } + if (arg == "--text-top-p" && i + 1 < argc) { + sampling_cfg.text_top_p = std::stof(argv[++i]); + continue; + } + if (arg == "--text-top-k" && i + 1 < argc) { + sampling_cfg.text_top_k = std::stoi(argv[++i]); + continue; + } + if (arg == "--audio-temperature" && i + 1 < argc) { + sampling_cfg.audio_temperature = std::stof(argv[++i]); + continue; + } + if (arg == "--audio-top-p" && i + 1 < argc) { + sampling_cfg.audio_top_p = std::stof(argv[++i]); + continue; + } + if (arg == "--audio-top-k" && i + 1 < argc) { + sampling_cfg.audio_top_k = std::stoi(argv[++i]); + continue; + } + if (arg == "--audio-repetition-penalty" && i + 1 < argc) { + sampling_cfg.audio_repetition_penalty = std::stof(argv[++i]); + continue; + } + if (arg == "--audio-decoder-cpu") { + use_gpu_audio = false; + continue; + } + if (arg == "--print-delay-config") { + print_delay_config = true; + continue; + } + if (arg == "--self-test-delay-state") { + self_test = true; + continue; + } + if (arg == "-h" || arg == "--help") { + print_usage(argc, argv); + return EXIT_SUCCESS; + } + + LOG_ERR("unknown argument: %s\n", arg.c_str()); + print_usage(argc, argv); + return EXIT_FAILURE; + } + + if (self_test) { + if (!moss_delay_self_test()) { + LOG_ERR("moss delay state self-test failed\n"); + return EXIT_FAILURE; + } + LOG("moss delay state self-test: ok\n"); + } + + if (!generation_input_path.empty()) { + if (model_path.empty()) { + LOG_ERR("--generation-input requires -m \n"); + return EXIT_FAILURE; + } + 
try { + moss_generate_from_ref( + model_path, + generation_input_path, + n_gpu_layers, + max_new_tokens, + sampling_cfg, + seed, + dump_raw_codes_path, + python_bin, + audio_decoder_script, + audio_encoder_onnx, + audio_decoder_onnx, + wav_out_path, + use_gpu_audio); + return EXIT_SUCCESS; + } catch (const std::exception & err) { + LOG_ERR("generation failed: %s\n", err.what()); + return EXIT_FAILURE; + } + } + + if (!decode_parity_ref_path.empty()) { + try { + const bool ok = moss_decode_parity( + decode_parity_ref_path, + dump_raw_codes_path, + python_bin, + audio_decoder_script, + audio_encoder_onnx, + audio_decoder_onnx, + wav_out_path, + use_gpu_audio); + return ok ? EXIT_SUCCESS : EXIT_FAILURE; + } catch (const std::exception & err) { + LOG_ERR("decode parity failed: %s\n", err.what()); + return EXIT_FAILURE; + } + } + + if (!print_delay_config) { + if (self_test) { + return EXIT_SUCCESS; + } + LOG("moss delay state, multi-head sampler, and raw-code decode are in place; audio decode is available via the external Python/ONNX helper.\n"); + LOG("use --print-delay-config with -m to inspect model metadata.\n"); + LOG("use --decode-parity-ref to verify C++ de-delay/raw-code extraction against Python.\n"); + LOG("use --generation-input -m for first-class generation.\n"); + return EXIT_SUCCESS; + } + + if (model_path.empty()) { + LOG_ERR("--print-delay-config requires -m \n"); + return EXIT_FAILURE; + } + + llama_backend_scope backend_scope; + + llama_model_params mparams = llama_model_default_params(); + mparams.use_mmap = true; + mparams.n_gpu_layers = n_gpu_layers; + + llama_model_ptr model(llama_model_load_from_file(model_path.c_str(), mparams)); + if (!model) { + LOG_ERR("failed to load model: %s\n", model_path.c_str()); + return EXIT_FAILURE; + } + + const moss_delay_config cfg = moss_delay_config_from_model(model.get()); + LOG("%s\n", moss_delay_config_to_string(cfg).c_str()); + + return EXIT_SUCCESS; +} From fbf8865f5bdeab855fdfbe90317ee1bca85422bc Mon Sep 
17 00:00:00 2001 From: Zile Wang <116347517+expectqwq@users.noreply.github.com> Date: Thu, 19 Mar 2026 14:27:35 +0800 Subject: [PATCH 2/2] docs: clarify first-class GGUF preparation --- docs/moss-tts-firstclass-e2e.md | 22 ++++++++++++++++++++-- docs/moss-tts-firstclass-e2e_zh.md | 22 ++++++++++++++++++++-- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/docs/moss-tts-firstclass-e2e.md b/docs/moss-tts-firstclass-e2e.md index 5015fd77d..bdf9efd96 100644 --- a/docs/moss-tts-firstclass-e2e.md +++ b/docs/moss-tts-firstclass-e2e.md @@ -24,6 +24,7 @@ Unlike the older `moss_tts_delay/llama_cpp` backend in the `MOSS-TTS` repository 4. Python packages required by the helper scripts: - `numpy` - `soundfile` + - `tokenizers` - `onnxruntime` ## Build @@ -55,7 +56,24 @@ You need a first-class MOSS-TTS-Delay GGUF model that already contains: For example: -- `out/stage1a_moss_delay_firstclass_f16.gguf` +- `out/moss_delay_firstclass_f16.gguf` + +You can generate it directly from the full Hugging Face MOSS-TTS model directory: + +```bash +huggingface-cli download OpenMOSS-Team/MOSS-TTS --local-dir /path/to/MOSS-TTS-hf + +python convert_hf_to_gguf.py \ + /path/to/MOSS-TTS-hf \ + --outfile /path/to/moss_delay_firstclass_f16.gguf \ + --outtype f16 +``` + +Important: + +- The `--model-gguf` file used by this e2e pipeline is a **special first-class MOSS-TTS-Delay GGUF** generated from the full `OpenMOSS-Team/MOSS-TTS` Hugging Face model directory with the command above. +- It is **not** the same thing as a generic GGUF downloaded from `OpenMOSS/MOSS-TTS-GGUF`. +- Do not point this pipeline at a file from `OpenMOSS/MOSS-TTS-GGUF` unless that file was explicitly produced as a first-class MOSS-TTS-Delay GGUF for this `llama.cpp` implementation. 
### Step 2: Prepare the tokenizer directory @@ -146,7 +164,7 @@ python tools/tts/moss-tts-firstclass-e2e.py \ | `--onnx-encoder` | path | Audio tokenizer encoder ONNX | | `--onnx-decoder` | path | Audio tokenizer decoder ONNX | | `--text` / `--text-file` | string / path | Input text, choose exactly one | -| `--reference-audio` | path | Optional 24 kHz reference audio | +| `--reference-audio` | path | Optional reference audio; if provided, it must be 24 kHz | | `--language` | `zh` / `en` / tag | Language tag passed to the prompt builder | | `--max-new-tokens` | int | Maximum generation steps | | `--text-temperature` | float | Text-channel sampling temperature, default `1.5` | diff --git a/docs/moss-tts-firstclass-e2e_zh.md b/docs/moss-tts-firstclass-e2e_zh.md index 345187e3b..644a4bf4c 100644 --- a/docs/moss-tts-firstclass-e2e_zh.md +++ b/docs/moss-tts-firstclass-e2e_zh.md @@ -24,6 +24,7 @@ 4. helper scripts 需要的 Python 包: - `numpy` - `soundfile` + - `tokenizers` - `onnxruntime` ## 编译 @@ -55,7 +56,24 @@ cmake --build build --target llama-moss-tts -j 例如: -- `out/stage1a_moss_delay_firstclass_f16.gguf` +- `out/moss_delay_firstclass_f16.gguf` + +你可以直接从完整的 Hugging Face MOSS-TTS 模型目录生成它: + +```bash +huggingface-cli download OpenMOSS-Team/MOSS-TTS --local-dir /path/to/MOSS-TTS-hf + +python convert_hf_to_gguf.py \ + /path/to/MOSS-TTS-hf \ + --outfile /path/to/moss_delay_firstclass_f16.gguf \ + --outtype f16 +``` + +重要说明: + +- 这里 `--model-gguf` 使用的是一个**特殊的 first-class MOSS-TTS-Delay GGUF**,它需要像上面这样,从完整的 `OpenMOSS-Team/MOSS-TTS` Hugging Face 模型目录直接转换得到。 +- 它**不是** `OpenMOSS/MOSS-TTS-GGUF` 仓库里的通用 GGUF 文件。 +- 除非某个文件被明确说明为适配这套 `llama.cpp` first-class 实现的 MOSS-TTS-Delay GGUF,否则不要把 `OpenMOSS/MOSS-TTS-GGUF` 里的文件直接拿来给这条 e2e 流水线使用。 ### 第二步:准备 tokenizer 目录 @@ -147,7 +165,7 @@ python tools/tts/moss-tts-firstclass-e2e.py \ | `--onnx-encoder` | path | 音频 tokenizer encoder ONNX | | `--onnx-decoder` | path | 音频 tokenizer decoder ONNX | | `--text` / `--text-file` | string / path | 输入文本,二选一 
| -| `--reference-audio` | path | 可选的 24 kHz 参考音频 | +| `--reference-audio` | path | 可选参考音频;如果提供,必须是 24 kHz | | `--language` | `zh` / `en` / tag | 传给 prompt builder 的语言标签 | | `--max-new-tokens` | int | 最大生成步数 | | `--text-temperature` | float | 文本通道采样温度,默认 `1.5` |