From 1a4e3350f4233d222881a23cae261f77176be27c Mon Sep 17 00:00:00 2001
From: expec <expec@localhost>
Date: Tue, 10 Mar 2026 15:52:10 +0800
Subject: [PATCH 01/20] Add MOSS-TTS Delay GGUF conversion support

---
 convert_hf_to_gguf.py          | 67 ++++++++++++++++++++++++++++++++++
 gguf-py/gguf/constants.py      | 34 +++++++++++++++++
 gguf-py/gguf/tensor_mapping.py |  6 +++
 3 files changed, 107 insertions(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 083b5bca9..b0de64204 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4459,6 +4459,73 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("MossTTSDelayModel", "MossTTSDelayForCausalLM")
+class MossTTSDelayModel(Qwen3Model):
+    model_arch = gguf.MODEL_ARCH.MOSS_TTS_DELAY
+
+    def __init__(self, *args, **kwargs):
+        hparams = kwargs.get("hparams")
+        if hparams is None:
+            hparams = ModelBase.load_hparams(args[0], self.is_mistral_format)
+        else:
+            hparams = dict(hparams)
+
+        language_config = hparams.get("language_config")
+        if isinstance(language_config, dict):
+            # Expose the Qwen3 backbone params at the root level so TextModel can
+            # discover block_count / hidden_size / attention params without
+            # losing the top-level MOSS architecture identity.
+            language_hparams = {
+                key: value
+                for key, value in language_config.items()
+                if key not in ("architectures", "model_type")
+            }
+            hparams = {**hparams, **language_hparams}
+
+        kwargs["hparams"] = hparams
+        super().__init__(*args, **kwargs)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        arch = self.gguf_writer.arch
+        self.gguf_writer.add_uint32(gguf.Keys.LLM.N_VQ.format(arch=arch), self.hparams["n_vq"])
+        self.gguf_writer.add_uint32(gguf.Keys.LLM.AUDIO_VOCAB_SIZE.format(arch=arch), self.hparams["audio_vocab_size"])
+        self.gguf_writer.add_uint32(gguf.Keys.LLM.AUDIO_PAD_CODE.format(arch=arch), self.hparams["audio_pad_code"])
+        self.gguf_writer.add_uint32(gguf.Keys.LLM.AUDIO_START_TOKEN_ID.format(arch=arch), self.hparams["audio_start_token_id"])
+        self.gguf_writer.add_uint32(gguf.Keys.LLM.AUDIO_END_TOKEN_ID.format(arch=arch), self.hparams["audio_end_token_id"])
+        self.gguf_writer.add_uint32(gguf.Keys.LLM.AUDIO_USER_SLOT_TOKEN_ID.format(arch=arch), self.hparams["audio_user_slot_token_id"])
+        self.gguf_writer.add_uint32(
+            gguf.Keys.LLM.AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID.format(arch=arch),
+            self.hparams["audio_assistant_gen_slot_token_id"],
+        )
+        self.gguf_writer.add_uint32(
+            gguf.Keys.LLM.AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID.format(arch=arch),
+            self.hparams["audio_assistant_delay_slot_token_id"],
+        )
+        if (sampling_rate := self.hparams.get("sampling_rate")) is not None:
+            self.gguf_writer.add_uint32(gguf.Keys.LLM.SAMPLING_RATE.format(arch=arch), sampling_rate)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("language_model."):
+            name = name.replace("language_model.", "", 1)
+
+        if (match := re.fullmatch(r"emb_ext\.(\d+)\.weight", name)) is not None:
+            vq_idx = int(match.group(1))
+            yield (f"{gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TOKEN_EMBD_AUDIO]}.{vq_idx}.weight", data_torch)
+            return
+
+        if (match := re.fullmatch(r"lm_heads\.(\d+)\.weight", name)) is not None:
+            head_idx = int(match.group(1))
+            if head_idx == 0:
+                yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight", data_torch)
+            else:
+                yield (f"{gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT_AUDIO]}.{head_idx - 1}.weight", data_torch)
+            return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("Qwen3MoeForCausalLM")
 class Qwen3MoeModel(Qwen2MoeModel):
     model_arch = gguf.MODEL_ARCH.QWEN3MOE
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 839c6e787..7a3500cd2 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -105,6 +105,15 @@ class LLM:
         CONTEXT_LENGTH                    = "{arch}.context_length"
         EMBEDDING_LENGTH                  = "{arch}.embedding_length"
         EMBEDDING_LENGTH_OUT              = "{arch}.embedding_length_out"
+        N_VQ                              = "{arch}.n_vq"
+        AUDIO_VOCAB_SIZE                  = "{arch}.audio_vocab_size"
+        AUDIO_PAD_CODE                    = "{arch}.audio_pad_code"
+        AUDIO_START_TOKEN_ID              = "{arch}.audio_start_token_id"
+        AUDIO_END_TOKEN_ID                = "{arch}.audio_end_token_id"
+        AUDIO_USER_SLOT_TOKEN_ID          = "{arch}.audio_user_slot_token_id"
+        AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID = "{arch}.audio_assistant_gen_slot_token_id"
+        AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID = "{arch}.audio_assistant_delay_slot_token_id"
+        SAMPLING_RATE                     = "{arch}.sampling_rate"
         FEATURES_LENGTH                   = "{arch}.features_length"
         BLOCK_COUNT                       = "{arch}.block_count"
         LEADING_DENSE_BLOCK_COUNT         = "{arch}.leading_dense_block_count"
@@ -387,6 +396,7 @@ class MODEL_ARCH(IntEnum):
     QWEN2MOE         = auto()
     QWEN2VL          = auto()
     QWEN3            = auto()
+    MOSS_TTS_DELAY   = auto()
     QWEN3MOE         = auto()
     QWEN3NEXT        = auto()
     QWEN3VL          = auto()
@@ -497,10 +507,12 @@ class VISION_PROJECTOR_TYPE(IntEnum):
 
 class MODEL_TENSOR(IntEnum):
     TOKEN_EMBD           = auto()
+    TOKEN_EMBD_AUDIO     = auto() # moss-tts-delay, indexed as token_embd_audio.{id}
     TOKEN_EMBD_NORM      = auto()
     TOKEN_TYPES          = auto()
     POS_EMBD             = auto()
     OUTPUT               = auto()
+    OUTPUT_AUDIO         = auto() # moss-tts-delay, indexed as output_audio.{id}
     DENSE_2_OUT          = auto() # embeddinggemma 2_Dense
     DENSE_3_OUT          = auto() # embeddinggemma 3_Dense
     OUTPUT_NORM          = auto()
@@ -830,6 +842,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.QWEN2MOE:         "qwen2moe",
     MODEL_ARCH.QWEN2VL:          "qwen2vl",
     MODEL_ARCH.QWEN3:            "qwen3",
+    MODEL_ARCH.MOSS_TTS_DELAY:   "moss-tts-delay",
     MODEL_ARCH.QWEN3MOE:         "qwen3moe",
     MODEL_ARCH.QWEN3NEXT:        "qwen3next",
     MODEL_ARCH.QWEN3VL:          "qwen3vl",
@@ -938,11 +951,13 @@ class MODEL_TENSOR(IntEnum):
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.TOKEN_EMBD:                "token_embd",
+    MODEL_TENSOR.TOKEN_EMBD_AUDIO:          "token_embd_audio",
     MODEL_TENSOR.TOKEN_EMBD_NORM:           "token_embd_norm",
     MODEL_TENSOR.TOKEN_TYPES:               "token_types",
     MODEL_TENSOR.POS_EMBD:                  "position_embd",
     MODEL_TENSOR.OUTPUT_NORM:               "output_norm",
     MODEL_TENSOR.OUTPUT:                    "output",
+    MODEL_TENSOR.OUTPUT_AUDIO:              "output_audio",
     MODEL_TENSOR.DENSE_2_OUT:                "dense_2", # embeddinggemma 2_Dense
     MODEL_TENSOR.DENSE_3_OUT:                "dense_3", # embeddinggemma 2_Dense
     MODEL_TENSOR.ROPE_FREQS:                "rope_freqs",
@@ -1783,6 +1798,25 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.MOSS_TTS_DELAY: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_AUDIO,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.OUTPUT_AUDIO,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.QWEN3MOE: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index e57561090..7274d6f38 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -37,6 +37,9 @@ class TensorNameMap:
             "model.transformer.wte",                     # llada
             "embed_tokens",                              # qwen3-embedding
         ),
+        MODEL_TENSOR.TOKEN_EMBD_AUDIO: (
+            "token_embd_audio",                          # moss-tts-delay, indexed tensors emitted manually
+        ),
 
         # Token type embeddings
         MODEL_TENSOR.TOKEN_TYPES: (
@@ -79,6 +82,9 @@ class TensorNameMap:
             "model.transformer.ff_out",  # llada
             "head.decoder",              # modern-bert
         ),
+        MODEL_TENSOR.OUTPUT_AUDIO: (
+            "output_audio",              # moss-tts-delay, indexed tensors emitted manually
+        ),
         MODEL_TENSOR.DENSE_2_OUT: (
             "dense_2_out",  # embeddinggemma
         ),

From 8655a7084c6b063fabf5cc5a0f34d010ac23d1ef Mon Sep 17 00:00:00 2001
From: expec <expec@localhost>
Date: Tue, 10 Mar 2026 19:26:00 +0800
Subject: [PATCH 02/20] Add MOSS-TTS Delay model loading support

---
 src/llama-arch.cpp                 | 44 ++++++++++++++++-
 src/llama-arch.h                   | 13 +++++
 src/llama-hparams.h                | 12 +++++
 src/llama-model.cpp                | 78 ++++++++++++++++++++++++++++++
 src/llama-model.h                  |  2 +
 tests/CMakeLists.txt               |  6 +--
 tests/test-moss-tts-delay-load.cpp | 77 +++++++++++++++++++++++++++++
 7 files changed, 228 insertions(+), 4 deletions(-)
 create mode 100644 tests/test-moss-tts-delay-load.cpp

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 9d8eb88d0..bc6516d60 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -35,6 +35,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2MOE,         "qwen2moe"         },
     { LLM_ARCH_QWEN2VL,          "qwen2vl"          },
     { LLM_ARCH_QWEN3,            "qwen3"            },
+    { LLM_ARCH_MOSS_TTS_DELAY,   "moss-tts-delay"   },
     { LLM_ARCH_QWEN3MOE,         "qwen3moe"         },
     { LLM_ARCH_QWEN3NEXT,        "qwen3next"        },
     { LLM_ARCH_QWEN3VL,          "qwen3vl"          },
@@ -274,6 +275,15 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
 
     { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
+    { LLM_KV_N_VQ, "%s.n_vq" },
+    { LLM_KV_AUDIO_VOCAB_SIZE, "%s.audio_vocab_size" },
+    { LLM_KV_AUDIO_PAD_CODE, "%s.audio_pad_code" },
+    { LLM_KV_AUDIO_START_TOKEN_ID, "%s.audio_start_token_id" },
+    { LLM_KV_AUDIO_END_TOKEN_ID, "%s.audio_end_token_id" },
+    { LLM_KV_AUDIO_USER_SLOT_TOKEN_ID, "%s.audio_user_slot_token_id" },
+    { LLM_KV_AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID, "%s.audio_assistant_gen_slot_token_id" },
+    { LLM_KV_AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID, "%s.audio_assistant_delay_slot_token_id" },
+    { LLM_KV_SAMPLING_RATE, "%s.sampling_rate" },
     // sentence-transformers dense modules feature dims
     { LLM_KV_DENSE_2_FEAT_IN,        "%s.dense_2_feat_in"  },
     { LLM_KV_DENSE_2_FEAT_OUT,       "%s.dense_2_feat_out"  },
@@ -331,9 +341,11 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 
 static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_TOKEN_EMBD,                             "token_embd" },
+    { LLM_TENSOR_TOKEN_EMBD_AUDIO,                       "token_embd_audio.%d" },
     { LLM_TENSOR_OUTPUT_NORM,                            "output_norm" },
     { LLM_TENSOR_OUTPUT_NORM_LFM2,                       "token_embd_norm" }, // fix for wrong tensor name
     { LLM_TENSOR_OUTPUT,                                 "output" },
+    { LLM_TENSOR_OUTPUT_AUDIO,                           "output_audio.%d" },
     { LLM_TENSOR_ROPE_FREQS,                             "rope_freqs" },
     { LLM_TENSOR_ATTN_NORM,                              "blk.%d.attn_norm" },
     { LLM_TENSOR_ATTN_Q,                                 "blk.%d.attn_q" },
@@ -965,6 +977,25 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                 LLM_TENSOR_FFN_DOWN,
                 LLM_TENSOR_FFN_UP,
             };
+        case LLM_ARCH_MOSS_TTS_DELAY:
+            return {
+                LLM_TENSOR_TOKEN_EMBD,
+                LLM_TENSOR_TOKEN_EMBD_AUDIO,
+                LLM_TENSOR_OUTPUT_NORM,
+                LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_OUTPUT_AUDIO,
+                LLM_TENSOR_ATTN_NORM,
+                LLM_TENSOR_ATTN_Q,
+                LLM_TENSOR_ATTN_Q_NORM,
+                LLM_TENSOR_ATTN_K,
+                LLM_TENSOR_ATTN_K_NORM,
+                LLM_TENSOR_ATTN_V,
+                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_FFN_NORM,
+                LLM_TENSOR_FFN_GATE,
+                LLM_TENSOR_FFN_DOWN,
+                LLM_TENSOR_FFN_UP,
+            };
         case LLM_ARCH_QWEN3MOE:
         case LLM_ARCH_QWEN3VLMOE:
         case LLM_ARCH_OLMOE:
@@ -2551,10 +2582,12 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
 //
 static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_TOKEN_EMBD,                 {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_TOKEN_EMBD_AUDIO,           {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
     {LLM_TENSOR_POS_EMBD,                   {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
     {LLM_TENSOR_TOKEN_TYPES,                {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
     {LLM_TENSOR_TOKEN_EMBD_NORM,            {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL}},
     {LLM_TENSOR_OUTPUT,                     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_OUTPUT_AUDIO,               {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS,                        {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS_OUT,                    {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS_NORM,                   {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
@@ -2778,7 +2811,16 @@ std::string LLM_TN_IMPL::str() const {
         return LLM_TENSOR_NAMES.at(tensor);
     }
 
-    std::string name = ::format(LLM_TENSOR_NAMES.at(tensor), bid, xid);
+    std::string name;
+    switch (tensor) {
+        case LLM_TENSOR_TOKEN_EMBD_AUDIO:
+        case LLM_TENSOR_OUTPUT_AUDIO:
+            name = ::format(LLM_TENSOR_NAMES.at(tensor), xid);
+            break;
+        default:
+            name = ::format(LLM_TENSOR_NAMES.at(tensor), bid, xid);
+            break;
+    }
     if (suffix != nullptr) {
         name += ".";
         name += suffix;
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 07aac40aa..c1394551c 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -39,6 +39,7 @@ enum llm_arch {
     LLM_ARCH_QWEN2MOE,
     LLM_ARCH_QWEN2VL,
     LLM_ARCH_QWEN3,
+    LLM_ARCH_MOSS_TTS_DELAY,
     LLM_ARCH_QWEN3MOE,
     LLM_ARCH_QWEN3NEXT,
     LLM_ARCH_QWEN3VL,
@@ -317,6 +318,16 @@ enum llm_kv {
 
     LLM_KV_SHORTCONV_L_CACHE,
 
+    LLM_KV_N_VQ,
+    LLM_KV_AUDIO_VOCAB_SIZE,
+    LLM_KV_AUDIO_PAD_CODE,
+    LLM_KV_AUDIO_START_TOKEN_ID,
+    LLM_KV_AUDIO_END_TOKEN_ID,
+    LLM_KV_AUDIO_USER_SLOT_TOKEN_ID,
+    LLM_KV_AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID,
+    LLM_KV_AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID,
+    LLM_KV_SAMPLING_RATE,
+
     LLM_KV_XIELU_ALPHA_N,
     LLM_KV_XIELU_ALPHA_P,
     LLM_KV_XIELU_BETA,
@@ -336,12 +347,14 @@ enum llm_kv {
 
 enum llm_tensor {
     LLM_TENSOR_TOKEN_EMBD,
+    LLM_TENSOR_TOKEN_EMBD_AUDIO,
     LLM_TENSOR_TOKEN_EMBD_NORM,
     LLM_TENSOR_TOKEN_TYPES,
     LLM_TENSOR_POS_EMBD,
     LLM_TENSOR_DENSE_2_OUT,
     LLM_TENSOR_DENSE_3_OUT,
     LLM_TENSOR_OUTPUT,
+    LLM_TENSOR_OUTPUT_AUDIO,
     LLM_TENSOR_OUTPUT_NORM,
     LLM_TENSOR_OUTPUT_NORM_LFM2, // fix for wrong tensor name
     LLM_TENSOR_ROPE_FREQS,
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index c4b2a99da..f0fd61259 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -61,6 +61,18 @@ struct llama_hparams {
 
     uint32_t n_shortconv_l_cache  = 0;
 
+    // MOSS-TTS-Delay
+    uint32_t n_vq             = 0;
+    uint32_t audio_vocab_size = 0;
+    uint32_t audio_pad_code   = 0;
+    uint32_t sampling_rate    = 0;
+
+    uint32_t audio_start_token_id                = 0;
+    uint32_t audio_end_token_id                  = 0;
+    uint32_t audio_user_slot_token_id            = 0;
+    uint32_t audio_assistant_gen_slot_token_id   = 0;
+    uint32_t audio_assistant_delay_slot_token_id = 0;
+
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index e18cca052..bed7b3835 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -983,6 +983,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_QWEN3:
             {
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
+                    case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
+                    case 40: type = LLM_TYPE_14B; break;
+                    case 64: type = LLM_TYPE_32B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_MOSS_TTS_DELAY:
+            {
+                ml.get_key(LLM_KV_N_VQ,             hparams.n_vq);
+                ml.get_key(LLM_KV_AUDIO_VOCAB_SIZE, hparams.audio_vocab_size);
+                ml.get_key(LLM_KV_AUDIO_PAD_CODE,   hparams.audio_pad_code);
+
+                ml.get_key(LLM_KV_AUDIO_START_TOKEN_ID,                 hparams.audio_start_token_id,                false);
+                ml.get_key(LLM_KV_AUDIO_END_TOKEN_ID,                   hparams.audio_end_token_id,                  false);
+                ml.get_key(LLM_KV_AUDIO_USER_SLOT_TOKEN_ID,             hparams.audio_user_slot_token_id,            false);
+                ml.get_key(LLM_KV_AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID,    hparams.audio_assistant_gen_slot_token_id,   false);
+                ml.get_key(LLM_KV_AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID,  hparams.audio_assistant_delay_slot_token_id, false);
+                ml.get_key(LLM_KV_SAMPLING_RATE,                        hparams.sampling_rate,                       false);
+
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
@@ -3628,6 +3651,53 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     // output rerank head
                     cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
 
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_MOSS_TTS_DELAY:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    const int64_t n_audio_vocab =
+                        hparams.audio_vocab_size > 0 ? std::max<int64_t>(hparams.audio_vocab_size + 1, hparams.audio_pad_code + 1) : 0;
+
+                    if (hparams.n_vq == 0) {
+                        throw std::runtime_error("n_vq must be > 0 for MOSS_TTS_DELAY");
+                    }
+                    if (n_audio_vocab == 0) {
+                        throw std::runtime_error("audio_vocab_size must be > 0 for MOSS_TTS_DELAY");
+                    }
+
+                    tok_embd_audio.resize(hparams.n_vq);
+                    output_audio.resize(hparams.n_vq);
+
+                    for (uint32_t i = 0; i < hparams.n_vq; ++i) {
+                        tok_embd_audio[i] = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_AUDIO, "weight", -1, i), {n_embd, n_audio_vocab}, 0);
+                    }
+
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (uint32_t i = 0; i < hparams.n_vq; ++i) {
+                        output_audio[i] = create_tensor(tn(LLM_TENSOR_OUTPUT_AUDIO, "weight", -1, i), {n_embd, n_audio_vocab}, 0);
+                    }
+
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
 
@@ -7782,6 +7852,13 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
     }
 
+    if (arch == LLM_ARCH_MOSS_TTS_DELAY) {
+        LLAMA_LOG_INFO("%s: n_vq                  = %u\n",     __func__, hparams.n_vq);
+        LLAMA_LOG_INFO("%s: audio_vocab_size      = %u\n",     __func__, hparams.audio_vocab_size);
+        LLAMA_LOG_INFO("%s: audio_pad_code        = %u\n",     __func__, hparams.audio_pad_code);
+        LLAMA_LOG_INFO("%s: sampling_rate         = %u\n",     __func__, hparams.sampling_rate);
+    }
+
     if (arch == LLM_ARCH_MINICPM ||
         arch == LLM_ARCH_GRANITE ||
         arch == LLM_ARCH_GRANITE_MOE ||
@@ -8772,6 +8849,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DREAM:
         case LLM_ARCH_QWEN2MOE:
         case LLM_ARCH_QWEN3:
+        case LLM_ARCH_MOSS_TTS_DELAY:
         case LLM_ARCH_QWEN3MOE:
         case LLM_ARCH_LLADA_MOE:
         case LLM_ARCH_RND1:
diff --git a/src/llama-model.h b/src/llama-model.h
index 5ecb8344a..549e42681 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -465,6 +465,7 @@ struct llama_model {
     std::vector<std::string> classifier_labels;
 
     struct ggml_tensor * tok_embd   = nullptr;
+    std::vector<struct ggml_tensor *> tok_embd_audio;
     struct ggml_tensor * type_embd  = nullptr;
     struct ggml_tensor * pos_embd   = nullptr;
     struct ggml_tensor * tok_norm   = nullptr;
@@ -473,6 +474,7 @@ struct llama_model {
     struct ggml_tensor * output_norm     = nullptr;
     struct ggml_tensor * output_norm_b   = nullptr;
     struct ggml_tensor * output          = nullptr;
+    std::vector<struct ggml_tensor *> output_audio;
     struct ggml_tensor * output_b        = nullptr;
     struct ggml_tensor * output_norm_enc = nullptr;
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 7fd895e2b..513075a82 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -155,8 +155,8 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
     llama_build_and_test(test-chat.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
     # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
     if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
-        llama_build_and_test(test-json-schema-to-grammar.cpp   WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
-        target_include_directories(test-json-schema-to-grammar PRIVATE ${PROJECT_SOURCE_DIR}/tools/server)
+    llama_build_and_test(test-json-schema-to-grammar.cpp   WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
+    target_include_directories(test-json-schema-to-grammar PRIVATE ${PROJECT_SOURCE_DIR}/tools/server)
     endif()
 
     if (NOT GGML_BACKEND_DL)
@@ -164,6 +164,7 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
     endif()
 
     llama_build(test-gbnf-validator.cpp)
+    llama_build(test-moss-tts-delay-load.cpp)
 
     # build test-tokenizer-1-bpe target once and add many tests
     llama_build(test-tokenizer-1-bpe.cpp)
@@ -283,4 +284,3 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama)
 llama_build_and_test(test-alloc.cpp)
 target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
 
-
diff --git a/tests/test-moss-tts-delay-load.cpp b/tests/test-moss-tts-delay-load.cpp
new file mode 100644
index 000000000..fe97c2317
--- /dev/null
+++ b/tests/test-moss-tts-delay-load.cpp
@@ -0,0 +1,77 @@
+#include "llama.h"
+#include "../src/llama-arch.h"
+#include "../src/llama-model.h"
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <stdexcept>
+#include <string>
+
+static void check(bool cond, const std::string & msg) {
+    if (!cond) {
+        throw std::runtime_error(msg);
+    }
+}
+
+static void check_tensor_2d(const ggml_tensor * tensor, const char * name, int64_t ne0, int64_t ne1) {
+    check(tensor != nullptr, std::string("missing tensor: ") + name);
+    check(tensor->ne[0] == ne0, std::string(name) + " ne[0] mismatch");
+    check(tensor->ne[1] == ne1, std::string(name) + " ne[1] mismatch");
+}
+
+int main(int argc, char ** argv) {
+    if (argc != 2) {
+        std::fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    llama_backend_init();
+
+    llama_model_params params = llama_model_default_params();
+    params.use_mmap = false;
+
+    llama_model * model = llama_model_load_from_file(argv[1], params);
+    if (model == nullptr) {
+        std::fprintf(stderr, "error: failed to load model '%s'\n", argv[1]);
+        llama_backend_free();
+        return EXIT_FAILURE;
+    }
+
+    try {
+        check(model->arch == LLM_ARCH_MOSS_TTS_DELAY, "unexpected architecture");
+        check(model->hparams.n_vq > 0, "n_vq must be > 0");
+        check(model->hparams.audio_vocab_size > 0, "audio_vocab_size must be > 0");
+
+        const int64_t n_embd = model->hparams.n_embd;
+        const int64_t n_vocab = model->vocab.n_tokens();
+        const int64_t n_audio_vocab = std::max<int64_t>(model->hparams.audio_vocab_size + 1, model->hparams.audio_pad_code + 1);
+
+        check_tensor_2d(model->tok_embd, "token_embd.weight", n_embd, n_vocab);
+        check_tensor_2d(model->output, "output.weight", n_embd, n_vocab);
+
+        check(model->tok_embd_audio.size() == model->hparams.n_vq, "token_embd_audio size mismatch");
+        check(model->output_audio.size() == model->hparams.n_vq, "output_audio size mismatch");
+
+        for (uint32_t i = 0; i < model->hparams.n_vq; ++i) {
+            check_tensor_2d(model->tok_embd_audio.at(i), "token_embd_audio", n_embd, n_audio_vocab);
+            check_tensor_2d(model->output_audio.at(i), "output_audio", n_embd, n_audio_vocab);
+        }
+
+        std::fprintf(stderr,
+            "loaded MOSS-TTS-Delay: n_layer=%u n_embd=%u n_vq=%u audio_vocab=%u tensors_ok=1\n",
+            model->hparams.n_layer,
+            model->hparams.n_embd,
+            model->hparams.n_vq,
+            model->hparams.audio_vocab_size);
+    } catch (const std::exception & err) {
+        std::fprintf(stderr, "validation failed: %s\n", err.what());
+        llama_model_free(model);
+        llama_backend_free();
+        return EXIT_FAILURE;
+    }
+
+    llama_model_free(model);
+    llama_backend_free();
+    return EXIT_SUCCESS;
+}

From 483470e978937ff2c742dd098f1c379939f78c64 Mon Sep 17 00:00:00 2001
From: expec <expec@localhost>
Date: Wed, 11 Mar 2026 15:49:01 +0800
Subject: [PATCH 03/20] moss-tts-delay: add forward graph parity test

---
 include/llama.h                       |   4 +
 scripts/run-moss-tts-delay-parity.sh  |  47 ++++++
 src/CMakeLists.txt                    |   1 +
 src/llama-batch.cpp                   |  38 +++++
 src/llama-batch.h                     |   3 +
 src/llama-context.cpp                 |  28 ++--
 src/llama-context.h                   |   1 +
 src/llama-graph.h                     |   5 +
 src/llama-model.cpp                   |  12 ++
 src/llama-model.h                     |   1 +
 src/models/models.h                   |   4 +
 src/models/moss-tts-delay.cpp         | 192 ++++++++++++++++++++++++
 tests/CMakeLists.txt                  |   2 +-
 tests/moss_tts_delay_export_ref.py    | 113 ++++++++++++++
 tests/test-moss-tts-delay-forward.cpp | 203 ++++++++++++++++++++++++++
 15 files changed, 641 insertions(+), 13 deletions(-)
 create mode 100755 scripts/run-moss-tts-delay-parity.sh
 create mode 100644 src/models/moss-tts-delay.cpp
 create mode 100644 tests/moss_tts_delay_export_ref.py
 create mode 100644 tests/test-moss-tts-delay-forward.cpp

diff --git a/include/llama.h b/include/llama.h
index 0bd10294c..41e2a0715 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -218,6 +218,8 @@ extern "C" {
     // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
     //
     // - token  : the token ids of the input (used when embd is NULL)
+    // - token_audio: optional auxiliary token channels, flattened as [n_tokens, n_token_audio]
+    //                this is currently used by architectures with summed multi-channel embeddings
     // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
     // - pos    : the positions of the respective token in the sequence
     //            (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
@@ -233,6 +235,8 @@ extern "C" {
         int32_t n_tokens;
 
         llama_token  *  token;
+        int32_t       n_token_audio;
+        llama_token  *  token_audio;
         float        *  embd;
         llama_pos    *  pos;
         int32_t      *  n_seq_id;
diff --git a/scripts/run-moss-tts-delay-parity.sh b/scripts/run-moss-tts-delay-parity.sh
new file mode 100755
index 000000000..1f9eb2efe
--- /dev/null
+++ b/scripts/run-moss-tts-delay-parity.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+LLAMA_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+WORKROOT="$(cd "${LLAMA_DIR}/.." && pwd)"
+
+CONDA_SH="${CONDA_SH:-/home/expec/miniconda3/etc/profile.d/conda.sh}"
+CONDA_ENV_NAME="${CONDA_ENV_NAME:-llama-cpp}"
+
+HF_DIR="${HF_DIR:-${WORKROOT}/tmp/moss_tts_delay_test_hf_kv4}"
+GGUF_PATH="${GGUF_PATH:-${WORKROOT}/tmp/moss_tts_delay_test_kv4_f32.gguf}"
+REF_PATH="${REF_PATH:-${WORKROOT}/tmp/moss_tts_delay_test_kv4.ref.bin}"
+BUILD_DIR="${BUILD_DIR:-${LLAMA_DIR}/build}"
+TEST_BIN="${TEST_BIN:-${BUILD_DIR}/bin/test-moss-tts-delay-forward}"
+
+if [[ ! -f "${CONDA_SH}" ]]; then
+    echo "error: conda init script not found: ${CONDA_SH}" >&2
+    exit 1
+fi
+
+if [[ ! -d "${HF_DIR}" ]]; then
+    echo "error: tiny HF fixture not found: ${HF_DIR}" >&2
+    exit 1
+fi
+
+source "${CONDA_SH}"
+conda activate "${CONDA_ENV_NAME}"
+
+echo "[1/4] building parity test target"
+cmake --build "${BUILD_DIR}" --target test-moss-tts-delay-forward -j2
+
+echo "[2/4] converting tiny HF fixture to F32 GGUF"
+python "${LLAMA_DIR}/convert_hf_to_gguf.py" \
+    "${HF_DIR}" \
+    --outfile "${GGUF_PATH}" \
+    --outtype f32
+
+echo "[3/4] exporting PyTorch reference"
+python "${LLAMA_DIR}/tests/moss_tts_delay_export_ref.py" \
+    "${HF_DIR}" \
+    "${REF_PATH}"
+
+echo "[4/4] running forward parity"
+"${TEST_BIN}" "${GGUF_PATH}" "${REF_PATH}"
+
+echo "PASS: moss-tts-delay forward parity verified"
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 283823fa9..06e6e23ed 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -100,6 +100,7 @@ add_library(llama
             models/minicpm3.cpp
             models/minimax-m2.cpp
             models/mistral3.cpp
+            models/moss-tts-delay.cpp
             models/modern-bert.cpp
             models/mpt.cpp
             models/nemotron-h.cpp
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
index 6bf76939c..ecf4f9263 100644
--- a/src/llama-batch.cpp
+++ b/src/llama-batch.cpp
@@ -55,6 +55,21 @@ bool llama_batch_allocr::init(
         }
     }
 
+    if ((batch.token_audio == nullptr) != (batch.n_token_audio == 0)) {
+        LLAMA_LOG_ERROR("%s: token_audio and n_token_audio must either both be set or both be empty\n", __func__);
+        return false;
+    }
+
+    if (batch.token_audio && !batch.token) {
+        LLAMA_LOG_ERROR("%s: token_audio currently requires token inputs to also be provided\n", __func__);
+        return false;
+    }
+
+    if (batch.token_audio && batch.embd) {
+        LLAMA_LOG_ERROR("%s: token_audio is not supported together with embd inputs\n", __func__);
+        return false;
+    }
+
     if (batch.seq_id) {
         for (int32_t i = 0; i < batch.n_tokens; ++i) {
             for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
@@ -217,6 +232,8 @@ bool llama_batch_allocr::init(
             /*.n_seqs_unq   =*/ (uint32_t) this->seq_id_unq.size(),
             /*.n_pos        =*/ n_pos_per_embd,
             /*.token        =*/ batch.token,
+            /*.n_token_audio=*/ (uint32_t) batch.n_token_audio,
+            /*.token_audio  =*/ batch.token_audio,
             /*.embd         =*/ batch.embd,
             /*.pos          =*/ batch.pos,
             /*.n_seq_id     =*/ batch.n_seq_id,
@@ -399,6 +416,7 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
     auto udata = std::make_shared<llama_ubatch::data_t>();
 
     udata->token     .resize(n_tokens);
+    udata->token_audio.clear();
     udata->embd      .clear();
     udata->pos       .resize(n_pos_all);
     udata->n_seq_id  .resize(n_tokens);
@@ -421,6 +439,8 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
         /*.n_pos        =*/ n_pos_per_embd,
 
         /*.token        =*/ udata->token.data(),
+        /*.n_token_audio=*/ 0,
+        /*.token_audio  =*/ nullptr,
         /*.embd         =*/ nullptr,
         /*.pos          =*/ udata->pos.data(),
         /*.n_seq_id     =*/ udata->n_seq_id.data(),
@@ -687,8 +707,10 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
 
     const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
     const int64_t n_pos_all  =              (int64_t) n_tokens*n_pos_per_embd;
+    const int64_t n_token_audio_all = batch.token_audio ? (int64_t) n_tokens*batch.n_token_audio : 0;
 
     udata->token     .resize(n_tokens);
+    udata->token_audio.resize(n_token_audio_all);
     udata->embd      .resize(n_embd_all);
     udata->pos       .resize(n_pos_all);
     udata->n_seq_id  .resize(n_tokens);
@@ -706,6 +728,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
             udata->token[i] = batch.token[idxs[i]];
         }
 
+        if (batch.token_audio) {
+            memcpy(
+                    udata->token_audio.data() + i*batch.n_token_audio,
+                    batch.token_audio + (int64_t) idxs[i]*batch.n_token_audio,
+                    batch.n_token_audio*sizeof(llama_token));
+        }
+
         if (batch.embd) {
             memcpy(udata->embd.data() + i*n_embd, batch.embd + (int64_t) idxs[i]*n_embd, n_embd*sizeof(float));
         }
@@ -756,6 +785,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         /*.n_pos        =*/ n_pos_per_embd,
 
         /*.token        =*/ batch.token ? udata->token.data() : nullptr,
+        /*.n_token_audio=*/ (uint32_t) batch.n_token_audio,
+        /*.token_audio  =*/ batch.token_audio ? udata->token_audio.data() : nullptr,
         /*.embd         =*/ batch.embd ? udata->embd.data() : nullptr,
         /*.pos          =*/ udata->pos.data(),
         /*.n_seq_id     =*/ udata->n_seq_id.data(),
@@ -805,6 +836,8 @@ void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) {
         ss_seq_idx    << "]";
 
         LLAMA_LOG_DEBUG("%s:   token      = %p\n", __func__, (void *) ubatch.token);
+        LLAMA_LOG_DEBUG("%s:   n_token_audio = %u\n", __func__, ubatch.n_token_audio);
+        LLAMA_LOG_DEBUG("%s:   token_audio = %p\n", __func__, (void *) ubatch.token_audio);
         LLAMA_LOG_DEBUG("%s:   embd       = %p\n", __func__, (void *) ubatch.embd);
         LLAMA_LOG_DEBUG("%s:   pos        = %p\n", __func__, (void *) ubatch.pos);
         LLAMA_LOG_DEBUG("%s:   n_seq_id   = %p\n", __func__, (void *) ubatch.n_seq_id);
@@ -866,6 +899,8 @@ struct llama_batch llama_batch_get_one(
     return {
         /*n_tokens =*/ n_tokens,
         /*tokens   =*/ tokens,
+        /*n_token_audio =*/ 0,
+        /*token_audio   =*/ nullptr,
         /*embd     =*/ nullptr,
         /*pos      =*/ nullptr,
         /*n_seq_id =*/ nullptr,
@@ -878,6 +913,8 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
     llama_batch batch = {
         /*n_tokens =*/ 0,
         /*tokens   =*/ nullptr,
+        /*n_token_audio =*/ 0,
+        /*token_audio   =*/ nullptr,
         /*embd     =*/ nullptr,
         /*pos      =*/ nullptr,
         /*n_seq_id =*/ nullptr,
@@ -906,6 +943,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
 
 void llama_batch_free(struct llama_batch batch) {
     if (batch.token)    free(batch.token);
+    if (batch.token_audio) free(batch.token_audio);
     if (batch.embd)     free(batch.embd);
     if (batch.pos)      free(batch.pos);
     if (batch.n_seq_id) free(batch.n_seq_id);
diff --git a/src/llama-batch.h b/src/llama-batch.h
index 8e6fac0ef..7f9205476 100644
--- a/src/llama-batch.h
+++ b/src/llama-batch.h
@@ -43,6 +43,8 @@ struct llama_ubatch {
 
     //                          // size               | idx | val
     llama_token  *  token;      // [n_tokens]         | i   | id, token
+    uint32_t      n_token_audio;// architecture-specific auxiliary token channels per token
+    llama_token  *  token_audio; // [n_tokens * n_token_audio] | i * n_token_audio + c | id, audio token
     float        *  embd;       // [n_embd, n_tokens] | i   | embd
     llama_pos    *  pos;        // [n_tokens*n_pos]   | i   | pos
     int32_t      *  n_seq_id;   // [n_tokens]         | i   | -
@@ -53,6 +55,7 @@ struct llama_ubatch {
 
     struct data_t {
         std::vector<llama_token>    token;
+        std::vector<llama_token>    token_audio;
         std::vector<float>          embd;
         std::vector<llama_pos>      pos;
         std::vector<int32_t>        n_seq_id;
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 009d07e00..9d0dea206 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -756,7 +756,7 @@ float * llama_context::get_logits_ith(int32_t i) {
         }
 
         const int64_t j = output_resolve_row(i);
-        return logits.data + j*model.vocab.n_tokens();
+        return logits.data + j*logits_stride;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
 #ifndef NDEBUG
@@ -1186,8 +1186,8 @@ int llama_context::encode(const llama_batch & batch_inp) {
 
     const auto & hparams = model.hparams;
 
-    const int64_t n_embd  = hparams.n_embd_inp();
-    const int64_t n_vocab = model.vocab.n_tokens();
+    const int64_t n_embd   = hparams.n_embd_inp();
+    const int64_t n_logits = model.n_logits();
 
     // note: during encode, we always pass the full sequence starting from pos = 0
     if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
@@ -1257,7 +1257,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
         GGML_ASSERT(backend_res != nullptr);
         GGML_ASSERT(logits.data != nullptr);
 
-        ggml_backend_tensor_get_async(backend_res, t_logits, logits.data, 0, n_tokens*n_vocab*sizeof(float));
+        ggml_backend_tensor_get_async(backend_res, t_logits, logits.data, 0, n_tokens*n_logits*sizeof(float));
     }
 
     // extract embeddings
@@ -1673,12 +1673,13 @@ int llama_context::decode(const llama_batch & batch_inp) {
             GGML_ASSERT(backend_res != nullptr);
             GGML_ASSERT(logits.data != nullptr);
 
-            float * logits_out = logits.data + n_outputs_prev*n_vocab;
+            const int64_t n_logits = model.n_logits();
+            float * logits_out = logits.data + n_outputs_prev*n_logits;
 
             if (n_outputs) {
                 GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
-                GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits.size);
-                ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float));
+                GGML_ASSERT((n_outputs_prev + n_outputs)*n_logits <= (int64_t) logits.size);
+                ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_logits*sizeof(float));
             }
         }
 
@@ -1822,6 +1823,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 
     const auto n_batch    = cparams.n_batch;
     const auto n_vocab    = vocab.n_tokens();
+    const auto n_logits   = model.n_logits();
     const auto n_embd_out = hparams.n_embd_out();
 
     bool has_logits = true;
@@ -1837,7 +1839,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
     size_t backend_float_count = 0;
     size_t backend_token_count = 0;
 
-    logits.size = has_logits ? n_vocab*n_outputs_max : 0;
+    logits_stride = has_logits ? n_logits : 0;
+    logits.size = has_logits ? n_logits*n_outputs_max : 0;
     embd.size   = has_embd ? n_embd_out*n_outputs_max : 0;
 
     // Allocate backend sampling output buffers if there are backend samplers configured.
@@ -1943,16 +1946,17 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 }
 
 void llama_context::output_reorder() {
-    const uint64_t n_vocab = model.vocab.n_tokens();
-    const uint64_t n_embd  = model.hparams.n_embd;
+    const uint64_t n_logits = logits_stride;
+    const uint64_t n_vocab  = model.vocab.n_tokens();
+    const uint64_t n_embd   = model.hparams.n_embd;
 
     for (size_t s = 0; s < output_swaps.size(); ++s) {
         const uint64_t i0 = output_swaps[s].i0;
         const uint64_t i1 = output_swaps[s].i1;
 
         if (logits.size > 0) {
-            for (uint64_t k = 0; k < n_vocab; k++) {
-                std::swap(logits.data[i0*n_vocab + k], logits.data[i1*n_vocab + k]);
+            for (uint64_t k = 0; k < n_logits; k++) {
+                std::swap(logits.data[i0*n_logits + k], logits.data[i1*n_logits + k]);
             }
         }
 
diff --git a/src/llama-context.h b/src/llama-context.h
index e0d0085c1..0188619bf 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -267,6 +267,7 @@ struct llama_context {
 
     // decode output (2-dimensional array: [n_outputs][n_vocab])
     buffer_view<float> logits = {nullptr, 0};
+    uint32_t logits_stride = 0;
 
     // embeddings output (2-dimensional array: [n_outputs][n_embd])
     // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 7f6c9e963..757cf3ca1 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -567,9 +567,14 @@ struct llm_graph_params {
             ubatch.n_seq_tokens == other.ubatch.n_seq_tokens &&
             ubatch.n_seqs       == other.ubatch.n_seqs &&
             ubatch.n_seqs_unq   == other.ubatch.n_seqs_unq &&
+            ubatch.n_token_audio == other.ubatch.n_token_audio &&
             (
                 (!ubatch.token && !other.ubatch.token) ||
                 (!ubatch.embd  && !other.ubatch.embd)
+            ) &&
+            (
+                (!ubatch.token_audio && !other.ubatch.token_audio) ||
+                (ubatch.token_audio && other.ubatch.token_audio)
             );
 
         // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index bed7b3835..a1b68b94c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -7669,6 +7669,14 @@ size_t llama_model::n_devices() const {
     return devices.size();
 }
 
+uint32_t llama_model::n_logits() const {
+    if (arch == LLM_ARCH_MOSS_TTS_DELAY) {
+        return vocab.n_tokens() + hparams.n_vq * (hparams.audio_vocab_size + 1);
+    }
+
+    return vocab.n_tokens();
+}
+
 uint32_t llama_model::n_gpu_layers() const {
     return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
 }
@@ -8267,6 +8275,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_qwen3>(*this, params);
             } break;
+        case LLM_ARCH_MOSS_TTS_DELAY:
+            {
+                llm = std::make_unique<llm_build_moss_tts_delay>(*this, params);
+            } break;
         case LLM_ARCH_QWEN3MOE:
             {
                 llm = std::make_unique<llm_build_qwen3moe>(*this, params);
diff --git a/src/llama-model.h b/src/llama-model.h
index 549e42681..619ee188e 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -536,6 +536,7 @@ struct llama_model {
     size_t size() const; // file size
     size_t n_tensors() const;
     size_t n_devices() const;
+    uint32_t n_logits() const;
 
     uint32_t n_gpu_layers() const;
     llama_split_mode split_mode() const;
diff --git a/src/models/models.h b/src/models/models.h
index cf9ba04e7..d1894859c 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -401,6 +401,10 @@ struct llm_build_mistral3 : public llm_graph_context {
     llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
 };
 
+struct llm_build_moss_tts_delay : public llm_graph_context {
+    llm_build_moss_tts_delay(const llama_model & model, const llm_graph_params & params);
+};
+
 struct llm_build_modern_bert : public llm_graph_context {
     llm_build_modern_bert(const llama_model & model, const llm_graph_params & params);
 };
diff --git a/src/models/moss-tts-delay.cpp b/src/models/moss-tts-delay.cpp
new file mode 100644
index 000000000..ae7ff48c4
--- /dev/null
+++ b/src/models/moss-tts-delay.cpp
@@ -0,0 +1,192 @@
+#include "models.h"
+
+namespace {
+
+class llm_graph_input_moss_audio_channel : public llm_graph_input_i {
+public:
+    llm_graph_input_moss_audio_channel(uint32_t channel, uint32_t n_channels)
+        : channel(channel), n_channels(n_channels) {}
+
+    void set_input(const llama_ubatch * ubatch) override {
+        GGML_ASSERT(tokens != nullptr);
+
+        std::vector<llama_token> data(ubatch->n_tokens, 0);
+        if (ubatch->token_audio != nullptr) {
+            GGML_ASSERT(ubatch->n_token_audio == n_channels);
+
+            for (uint32_t i = 0; i < ubatch->n_tokens; ++i) {
+                data[i] = ubatch->token_audio[(size_t) i*n_channels + channel];
+            }
+        }
+
+        ggml_backend_tensor_set(tokens, data.data(), 0, data.size()*ggml_element_size(tokens));
+    }
+
+    bool can_reuse(const llm_graph_params & params) override {
+        return
+            tokens != nullptr &&
+            tokens->ne[0] == params.ubatch.n_tokens &&
+            (
+                (params.ubatch.n_token_audio == n_channels && params.ubatch.token_audio != nullptr) ||
+                (params.ubatch.n_token_audio == 0 && params.ubatch.token_audio == nullptr)
+            );
+    }
+
+    ggml_tensor * tokens = nullptr;
+
+private:
+    const uint32_t channel;
+    const uint32_t n_channels;
+};
+
+}
+
+llm_build_moss_tts_delay::llm_build_moss_tts_delay(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(hparams.n_vq == model.tok_embd_audio.size());
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+
+    GGML_ASSERT(ubatch.token != nullptr);
+    GGML_ASSERT(
+        (ubatch.token_audio != nullptr && ubatch.n_token_audio == hparams.n_vq) ||
+        (ubatch.token_audio == nullptr && ubatch.n_token_audio == 0));
+
+    for (uint32_t i = 0; i < hparams.n_vq; ++i) {
+        auto inp_audio = std::make_unique<llm_graph_input_moss_audio_channel>(i, hparams.n_vq);
+        inp_audio->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
+        cb(inp_audio->tokens, "inp_audio_tokens", i);
+        ggml_set_input(inp_audio->tokens);
+
+        ggml_tensor * audio_embd = ggml_get_rows(ctx0, model.tok_embd_audio[i], inp_audio->tokens);
+        cb(audio_embd, "audio_embd", i);
+
+        inpL = ggml_add(ctx0, inpL, audio_embd);
+        cb(inpL, "input_sum", i);
+
+        res->add_input(std::move(inp_audio));
+    }
+
+    ggml_tensor * inp_pos = build_inp_pos();
+    auto * inp_attn = build_attn_inp_kv();
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, nullptr,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        {
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
+            cb(Qcur, "Qcur_normed", il);
+
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il);
+            cb(Kcur, "Kcur_normed", il);
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+        }
+
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        cur = build_norm(ffn_inp,
+                model.layers[il].ffn_norm, nullptr,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up,   nullptr, nullptr,
+                model.layers[il].ffn_gate, nullptr, nullptr,
+                model.layers[il].ffn_down, nullptr, nullptr,
+                nullptr,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        inpL = cur;
+    }
+
+    cur = build_norm(inpL,
+            model.output_norm, nullptr,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    GGML_ASSERT(hparams.n_vq == model.output_audio.size());
+
+    ggml_tensor * logits = build_lora_mm(model.output, cur);
+    cb(logits, "result_output_text", -1);
+
+    for (uint32_t i = 0; i < hparams.n_vq; ++i) {
+        ggml_tensor * audio_logits = build_lora_mm(model.output_audio[i], cur);
+        ggml_tensor * invalid_audio_logits = ggml_view_2d(
+                ctx0, audio_logits,
+                1, audio_logits->ne[1],
+                audio_logits->nb[1],
+                ggml_element_size(audio_logits) * (audio_logits->ne[0] - 1));
+        invalid_audio_logits = ggml_clamp(ctx0, invalid_audio_logits, -INFINITY, -INFINITY);
+        audio_logits = ggml_set_2d(
+                ctx0, audio_logits, invalid_audio_logits,
+                audio_logits->nb[1],
+                ggml_element_size(audio_logits) * (audio_logits->ne[0] - 1));
+        cb(audio_logits, "result_output_audio", i);
+
+        logits = ggml_concat(ctx0, logits, audio_logits, 0);
+        cb(logits, "result_output_concat", i);
+    }
+
+    logits = ggml_cont(ctx0, logits);
+    cb(logits, "result_output_cont", -1);
+
+    res->t_logits = logits;
+    cb(logits, "result_output", -1);
+
+    ggml_build_forward_expand(gf, logits);
+}
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 513075a82..e39fb805f 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -165,6 +165,7 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
 
     llama_build(test-gbnf-validator.cpp)
     llama_build(test-moss-tts-delay-load.cpp)
+    llama_build(test-moss-tts-delay-forward.cpp)
 
     # build test-tokenizer-1-bpe target once and add many tests
     llama_build(test-tokenizer-1-bpe.cpp)
@@ -283,4 +284,3 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama)
 
 llama_build_and_test(test-alloc.cpp)
 target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
-
diff --git a/tests/moss_tts_delay_export_ref.py b/tests/moss_tts_delay_export_ref.py
new file mode 100644
index 000000000..c78559092
--- /dev/null
+++ b/tests/moss_tts_delay_export_ref.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+
+import os
+import struct
+import sys
+import types
+from pathlib import Path
+
+import numpy as np
+import torch
+import transformers
+from safetensors.torch import load_file
+
+if "transformers.initialization" not in sys.modules:
+    import torch.nn.init as nn_init
+
+    shim = types.SimpleNamespace(
+        normal_=nn_init.normal_,
+        zeros_=nn_init.zeros_,
+    )
+    transformers.initialization = shim
+    sys.modules["transformers.initialization"] = shim
+
+WORKROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(WORKROOT / "MOSS-TTS"))
+
+from moss_tts_delay.configuration_moss_tts import MossTTSDelayConfig
+from moss_tts_delay.modeling_moss_tts import MossTTSDelayModel
+
+REF_MAGIC = 0x4D545452  # "RTTM"
+REF_VERSION = 1
+
+
+def build_text_ids(length: int, vocab_size: int) -> np.ndarray:
+    if vocab_size < 8:
+        raise ValueError(f"vocab_size must be >= 8, got {vocab_size}")
+
+    # Keep away from the first few special ids and generate a deterministic but
+    # non-trivial pattern that works for both tiny toy models and full exports.
+    ids = np.zeros(length, dtype=np.int32)
+    span = vocab_size - 4
+    for i in range(length):
+        ids[i] = 4 + ((i * 7 + 3) % span)
+    return ids
+
+
+def build_audio_ids(n_tokens: int, n_vq: int, audio_vocab_size: int) -> np.ndarray:
+    audio = np.zeros((n_tokens, n_vq), dtype=np.int32)
+    for t in range(n_tokens):
+        for q in range(n_vq):
+            audio[t, q] = (t * 37 + q * 53) % audio_vocab_size
+    return audio
+
+
+def main() -> int:
+    if len(sys.argv) != 3:
+        print(f"usage: {sys.argv[0]} <hf-model-dir> <reference.bin>", file=sys.stderr)
+        return 1
+
+    model_dir = sys.argv[1]
+    out_path = sys.argv[2]
+
+    config = MossTTSDelayConfig.from_pretrained(model_dir)
+    orig_get_input_embeddings = MossTTSDelayModel.get_input_embeddings
+    orig_tie_weights = MossTTSDelayModel.tie_weights
+
+    MossTTSDelayModel.get_input_embeddings = lambda self: self.language_model.get_input_embeddings()
+    MossTTSDelayModel.tie_weights = lambda self: None
+    try:
+        model = MossTTSDelayModel(config).eval()
+        state_dict = load_file(os.path.join(model_dir, "model.safetensors"), device="cpu")
+        missing, unexpected = model.load_state_dict(state_dict, strict=False)
+        if missing or unexpected:
+            raise RuntimeError(f"state_dict mismatch: missing={missing} unexpected={unexpected}")
+    finally:
+        MossTTSDelayModel.get_input_embeddings = orig_get_input_embeddings
+        MossTTSDelayModel.tie_weights = orig_tie_weights
+
+    n_tokens = 4
+    text_ids = build_text_ids(n_tokens, config.language_config.vocab_size)
+    audio_ids = build_audio_ids(n_tokens, config.n_vq, config.audio_vocab_size)
+    input_ids = np.concatenate([text_ids[:, None], audio_ids], axis=1)[None, :, :]
+
+    with torch.no_grad():
+        outputs = model(
+            input_ids=torch.from_numpy(input_ids).long(),
+            use_cache=False,
+        )
+
+    ref_embd = outputs.hidden_states[-1][0, -1].float().cpu().numpy().astype(np.float32, copy=False)
+    ref_logits = np.concatenate(
+        [head[0, -1].float().cpu().numpy() for head in outputs.logits],
+        axis=0,
+    ).astype(np.float32, copy=False)
+
+    os.makedirs(os.path.dirname(out_path), exist_ok=True)
+    with open(out_path, "wb") as f:
+        f.write(struct.pack("<6I", REF_MAGIC, REF_VERSION, n_tokens, config.n_vq, ref_embd.shape[0], ref_logits.shape[0]))
+        f.write(text_ids.astype(np.int32, copy=False).tobytes())
+        f.write(audio_ids.reshape(-1).astype(np.int32, copy=False).tobytes())
+        f.write(ref_embd.tobytes())
+        f.write(ref_logits.tobytes())
+
+    print(
+        f"exported moss-tts-delay reference: n_tokens={n_tokens} n_vq={config.n_vq} "
+        f"n_embd={ref_embd.shape[0]} n_logits={ref_logits.shape[0]} -> {out_path}",
+        file=sys.stderr,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tests/test-moss-tts-delay-forward.cpp b/tests/test-moss-tts-delay-forward.cpp
new file mode 100644
index 000000000..87dece0c7
--- /dev/null
+++ b/tests/test-moss-tts-delay-forward.cpp
@@ -0,0 +1,203 @@
+#include "llama.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace {
+
+struct ref_header {
+    uint32_t magic;
+    uint32_t version;
+    uint32_t n_tokens;
+    uint32_t n_vq;
+    uint32_t n_embd;
+    uint32_t n_logits;
+};
+
+constexpr uint32_t REF_MAGIC = 0x4d545452; // "RTTM"
+constexpr uint32_t REF_VERSION = 1;
+
+template <typename T>
+void read_exact(std::ifstream & in, T * data, size_t count, const char * what) {
+    in.read(reinterpret_cast<char *>(data), sizeof(T) * count);
+    if (!in) {
+        throw std::runtime_error(std::string("failed to read ") + what);
+    }
+}
+
+float max_abs_diff(const float * got, const std::vector<float> & ref) {
+    float out = 0.0f;
+    for (size_t i = 0; i < ref.size(); ++i) {
+        if (!std::isfinite(ref[i])) {
+            continue;
+        }
+        if (!std::isfinite(got[i])) {
+            return INFINITY;
+        }
+        out = std::max(out, std::fabs(got[i] - ref[i]));
+    }
+    return out;
+}
+
+float max_abs_diff_span(const float * got, const float * ref, size_t count) {
+    float out = 0.0f;
+    for (size_t i = 0; i < count; ++i) {
+        if (!std::isfinite(ref[i])) {
+            continue;
+        }
+        if (!std::isfinite(got[i])) {
+            return INFINITY;
+        }
+        out = std::max(out, std::fabs(got[i] - ref[i]));
+    }
+    return out;
+}
+
+}
+
+int main(int argc, char ** argv) {
+    if (argc != 3) {
+        std::fprintf(stderr, "usage: %s <model.gguf> <reference.bin>\n", argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    std::ifstream in(argv[2], std::ios::binary);
+    if (!in) {
+        std::fprintf(stderr, "error: failed to open reference '%s'\n", argv[2]);
+        return EXIT_FAILURE;
+    }
+
+    ref_header hdr{};
+    read_exact(in, &hdr, 1, "header");
+    if (hdr.magic != REF_MAGIC || hdr.version != REF_VERSION) {
+        std::fprintf(stderr, "error: unexpected reference format\n");
+        return EXIT_FAILURE;
+    }
+
+    std::vector<llama_token> text(hdr.n_tokens);
+    std::vector<llama_token> audio((size_t) hdr.n_tokens * hdr.n_vq);
+    std::vector<float> ref_embd(hdr.n_embd);
+    std::vector<float> ref_logits(hdr.n_logits);
+
+    read_exact(in, text.data(),      text.size(),      "text tokens");
+    read_exact(in, audio.data(),     audio.size(),     "audio tokens");
+    read_exact(in, ref_embd.data(),  ref_embd.size(),  "reference embeddings");
+    read_exact(in, ref_logits.data(), ref_logits.size(), "reference logits");
+
+    llama_backend_init();
+
+    llama_model_params mparams = llama_model_default_params();
+    mparams.use_mmap = true;
+
+    llama_model * model = llama_model_load_from_file(argv[1], mparams);
+    if (model == nullptr) {
+        std::fprintf(stderr, "error: failed to load model '%s'\n", argv[1]);
+        llama_backend_free();
+        return EXIT_FAILURE;
+    }
+
+    llama_context_params cparams = llama_context_default_params();
+    cparams.n_ctx = std::max<uint32_t>(hdr.n_tokens + 8, 64);
+    cparams.n_batch = hdr.n_tokens;
+    cparams.n_ubatch = hdr.n_tokens;
+    cparams.n_seq_max = 1;
+    cparams.embeddings = true;
+    cparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+    cparams.type_k = GGML_TYPE_F32;
+    cparams.type_v = GGML_TYPE_F32;
+
+    llama_context * ctx = llama_init_from_model(model, cparams);
+    if (ctx == nullptr) {
+        std::fprintf(stderr, "error: failed to create context\n");
+        llama_model_free(model);
+        llama_backend_free();
+        return EXIT_FAILURE;
+    }
+
+    llama_set_warmup(ctx, false);
+    llama_set_embeddings(ctx, true);
+    llama_set_causal_attn(ctx, true);
+
+    llama_batch batch = llama_batch_init(hdr.n_tokens, 0, 1);
+    batch.n_tokens = hdr.n_tokens;
+    batch.n_token_audio = hdr.n_vq;
+    batch.token_audio = (llama_token *) std::malloc(sizeof(llama_token) * audio.size());
+    if (batch.token_audio == nullptr) {
+        std::fprintf(stderr, "error: failed to allocate token_audio\n");
+        llama_batch_free(batch);
+        llama_free(ctx);
+        llama_model_free(model);
+        llama_backend_free();
+        return EXIT_FAILURE;
+    }
+
+    for (uint32_t i = 0; i < hdr.n_tokens; ++i) {
+        batch.token[i] = text[i];
+        std::memcpy(batch.token_audio + (size_t) i * hdr.n_vq, audio.data() + (size_t) i * hdr.n_vq, sizeof(llama_token) * hdr.n_vq);
+        batch.pos[i] = i;
+        batch.n_seq_id[i] = 1;
+        batch.seq_id[i][0] = 0;
+        batch.logits[i] = i + 1 == hdr.n_tokens;
+    }
+
+    const int ret = llama_decode(ctx, batch);
+    if (ret != 0) {
+        std::fprintf(stderr, "error: llama_decode failed: %d\n", ret);
+        llama_batch_free(batch);
+        llama_free(ctx);
+        llama_model_free(model);
+        llama_backend_free();
+        return EXIT_FAILURE;
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    const int32_t n_vocab = llama_vocab_n_tokens(vocab);
+    const int32_t n_audio_logits = (int32_t) (hdr.n_logits - n_vocab) / (int32_t) hdr.n_vq;
+    const int32_t out_idx = (int32_t) hdr.n_tokens - 1;
+    const float * got_embd = llama_get_embeddings_ith(ctx, out_idx);
+    const float * got_logits = llama_get_logits_ith(ctx, out_idx);
+
+    if (got_embd == nullptr || got_logits == nullptr) {
+        std::fprintf(stderr, "error: missing outputs from context\n");
+        llama_batch_free(batch);
+        llama_free(ctx);
+        llama_model_free(model);
+        llama_backend_free();
+        return EXIT_FAILURE;
+    }
+
+    const float embd_max_abs = max_abs_diff(got_embd, ref_embd);
+    const float logits_max_abs = max_abs_diff(got_logits, ref_logits);
+    const float text_logits_max_abs = max_abs_diff_span(got_logits, ref_logits.data(), n_vocab);
+    const float audio_logits_max_abs = max_abs_diff_span(got_logits + n_vocab, ref_logits.data() + n_vocab, hdr.n_logits - n_vocab);
+
+    std::fprintf(stderr,
+            "moss-tts-delay forward parity: out_idx=%d embd_max_abs=%g logits_max_abs=%g text_logits_max_abs=%g audio_logits_max_abs=%g n_audio_logits=%d\n",
+            out_idx, embd_max_abs, logits_max_abs, text_logits_max_abs, audio_logits_max_abs, n_audio_logits);
+
+    const bool ok = embd_max_abs < 1e-4f && logits_max_abs < 1e-4f;
+
+    if (!ok) {
+        for (uint32_t i = 0; i < hdr.n_tokens; ++i) {
+            const float * got_embd_i = llama_get_embeddings_ith(ctx, (int32_t) i);
+            if (got_embd_i != nullptr) {
+                std::fprintf(stderr, "  embd_max_abs[out=%u]=%g\n", i, max_abs_diff(got_embd_i, ref_embd));
+            }
+        }
+    }
+
+    llama_batch_free(batch);
+    llama_free(ctx);
+    llama_model_free(model);
+    llama_backend_free();
+
+    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
+}

From 1db8c77ac4f624673c463f490775ba06f3014bf1 Mon Sep 17 00:00:00 2001
From: expec <expec@localhost>
Date: Wed, 11 Mar 2026 15:51:06 +0800
Subject: [PATCH 04/20] moss-tts: add delay-state sampling scaffold

---
 tools/tts/CMakeLists.txt |   9 +
 tools/tts/moss-tts.cpp   | 943 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 952 insertions(+)
 create mode 100644 tools/tts/moss-tts.cpp

diff --git a/tools/tts/CMakeLists.txt b/tools/tts/CMakeLists.txt
index 76320d4c2..b91a84759 100644
--- a/tools/tts/CMakeLists.txt
+++ b/tools/tts/CMakeLists.txt
@@ -6,3 +6,12 @@ target_compile_features(${TARGET} PRIVATE cxx_std_17)
 if(LLAMA_TOOLS_INSTALL)
     install(TARGETS ${TARGET} RUNTIME)
 endif()
+
+set(TARGET llama-moss-tts)
+add_executable(${TARGET} moss-tts.cpp)
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()
diff --git a/tools/tts/moss-tts.cpp b/tools/tts/moss-tts.cpp
new file mode 100644
index 000000000..7260da3af
--- /dev/null
+++ b/tools/tts/moss-tts.cpp
@@ -0,0 +1,943 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cinttypes>
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+#include <numeric>
+#include <random>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace {
+
+constexpr uint32_t MOSS_DELAY_DEFAULT_N_VQ = 32;
+constexpr llama_token MOSS_DELAY_DEFAULT_PAD_TOKEN_ID = 151643;
+constexpr llama_token MOSS_DELAY_DEFAULT_IM_START_TOKEN_ID = 151644;
+constexpr llama_token MOSS_DELAY_DEFAULT_IM_END_TOKEN_ID = 151645;
+constexpr llama_token MOSS_DELAY_DEFAULT_AUDIO_START_TOKEN_ID = 151652;
+constexpr llama_token MOSS_DELAY_DEFAULT_AUDIO_END_TOKEN_ID = 151653;
+constexpr llama_token MOSS_DELAY_DEFAULT_AUDIO_USER_SLOT_TOKEN_ID = 151654;
+constexpr llama_token MOSS_DELAY_DEFAULT_AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID = 151656;
+constexpr llama_token MOSS_DELAY_DEFAULT_AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID = 151662;
+constexpr llama_token MOSS_DELAY_DEFAULT_AUDIO_PAD_CODE = 1024;
+constexpr uint32_t MOSS_DELAY_DEFAULT_AUDIO_VOCAB_SIZE = 1024;
+constexpr int64_t MOSS_DELAY_INT64_MAX = std::numeric_limits<int64_t>::max();
+constexpr float MOSS_NEG_INF = -std::numeric_limits<float>::infinity();
+
+struct moss_sampling_config {
+    float text_temperature = 1.5f;
+    float text_top_p = 1.0f;
+    int32_t text_top_k = 50;
+    float audio_temperature = 1.7f;
+    float audio_top_p = 0.8f;
+    int32_t audio_top_k = 25;
+    float audio_repetition_penalty = 1.0f;
+};
+
+struct moss_delay_config {
+    uint32_t n_vq = MOSS_DELAY_DEFAULT_N_VQ;
+    llama_token pad_token_id = MOSS_DELAY_DEFAULT_PAD_TOKEN_ID;
+    llama_token im_start_token_id = MOSS_DELAY_DEFAULT_IM_START_TOKEN_ID;
+    llama_token im_end_token_id = MOSS_DELAY_DEFAULT_IM_END_TOKEN_ID;
+    llama_token audio_start_token_id = MOSS_DELAY_DEFAULT_AUDIO_START_TOKEN_ID;
+    llama_token audio_end_token_id = MOSS_DELAY_DEFAULT_AUDIO_END_TOKEN_ID;
+    llama_token audio_user_slot_token_id = MOSS_DELAY_DEFAULT_AUDIO_USER_SLOT_TOKEN_ID;
+    llama_token audio_assistant_gen_slot_token_id = MOSS_DELAY_DEFAULT_AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID;
+    llama_token audio_assistant_delay_slot_token_id = MOSS_DELAY_DEFAULT_AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID;
+    llama_token audio_pad_code = MOSS_DELAY_DEFAULT_AUDIO_PAD_CODE;
+    uint32_t audio_vocab_size = MOSS_DELAY_DEFAULT_AUDIO_VOCAB_SIZE;
+
+    size_t packed_stride() const {
+        return 1u + n_vq;
+    }
+};
+
+struct moss_audio_segment {
+    std::vector<llama_token> codes;
+    size_t n_frames = 0;
+};
+
+struct moss_delay_state {
+    int32_t audio_length = 0;
+    int64_t delayed_length = MOSS_DELAY_INT64_MAX;
+    bool is_audio = false;
+    bool is_stopping = false;
+    int32_t time_step = 0;
+    std::vector<llama_token> text_history;
+
+    uint32_t n_vq = MOSS_DELAY_DEFAULT_N_VQ;
+    std::vector<llama_token> audio_history;
+
+    size_t audio_frames() const {
+        return n_vq == 0 ? 0 : audio_history.size() / n_vq;
+    }
+
+    bool empty_audio() const {
+        return audio_history.empty();
+    }
+
+    const llama_token * audio_frame_ptr(size_t frame_idx) const {
+        if (n_vq == 0 || frame_idx >= audio_frames()) {
+            return nullptr;
+        }
+        return audio_history.data() + frame_idx * n_vq;
+    }
+
+    void reserve_audio_frames(size_t frames) {
+        audio_history.reserve(frames * n_vq);
+    }
+
+    void append_audio(const std::vector<llama_token> & frame) {
+        GGML_ASSERT(frame.size() == n_vq);
+        audio_history.insert(audio_history.end(), frame.begin(), frame.end());
+    }
+
+    void append_audio(const llama_token * frame) {
+        GGML_ASSERT(frame != nullptr);
+        audio_history.insert(audio_history.end(), frame, frame + n_vq);
+    }
+};
+
+using moss_rng = std::mt19937;
+
+static void print_usage(int argc, char ** argv) {
+    (void) argc;
+    LOG("\nexample usage:\n");
+    LOG("  %s -m model.gguf --print-delay-config\n", argv[0]);
+    LOG("\n");
+}
+
+static bool parse_meta_i64(const llama_model * model, const char * key, int64_t & out) {
+    char buf[128];
+    const int32_t n = llama_model_meta_val_str(model, key, buf, sizeof(buf));
+    if (n <= 0) {
+        return false;
+    }
+
+    char * end = nullptr;
+    const long long val = std::strtoll(buf, &end, 10);
+    if (end == buf || *end != '\0') {
+        return false;
+    }
+    out = val;
+    return true;
+}
+
+static bool parse_meta_u32(const llama_model * model, const char * key, uint32_t & out) {
+    int64_t tmp = 0;
+    if (!parse_meta_i64(model, key, tmp) || tmp < 0 || tmp > std::numeric_limits<uint32_t>::max()) {
+        return false;
+    }
+    out = static_cast<uint32_t>(tmp);
+    return true;
+}
+
+static bool parse_meta_token(const llama_model * model, const char * key, llama_token & out) {
+    int64_t tmp = 0;
+    if (!parse_meta_i64(model, key, tmp) || tmp < std::numeric_limits<llama_token>::min() || tmp > std::numeric_limits<llama_token>::max()) {
+        return false;
+    }
+    out = static_cast<llama_token>(tmp);
+    return true;
+}
+
+static moss_delay_config moss_delay_config_from_model(const llama_model * model) {
+    moss_delay_config cfg;
+
+    parse_meta_u32(model, "moss-tts-delay.n_vq", cfg.n_vq);
+    parse_meta_u32(model, "moss-tts-delay.audio_vocab_size", cfg.audio_vocab_size);
+    parse_meta_token(model, "moss-tts-delay.audio_pad_code", cfg.audio_pad_code);
+    parse_meta_token(model, "moss-tts-delay.pad_token_id", cfg.pad_token_id);
+    parse_meta_token(model, "moss-tts-delay.im_start_token_id", cfg.im_start_token_id);
+    parse_meta_token(model, "moss-tts-delay.im_end_token_id", cfg.im_end_token_id);
+    parse_meta_token(model, "moss-tts-delay.audio_start_token_id", cfg.audio_start_token_id);
+    parse_meta_token(model, "moss-tts-delay.audio_end_token_id", cfg.audio_end_token_id);
+    parse_meta_token(model, "moss-tts-delay.audio_user_slot_token_id", cfg.audio_user_slot_token_id);
+    parse_meta_token(model, "moss-tts-delay.audio_gen_slot_token_id", cfg.audio_assistant_gen_slot_token_id);
+    parse_meta_token(model, "moss-tts-delay.audio_delay_slot_token_id", cfg.audio_assistant_delay_slot_token_id);
+
+    return cfg;
+}
+
+static size_t moss_audio_vocab_with_pad(const moss_delay_config & cfg) {
+    return std::max<size_t>(cfg.audio_vocab_size + 1u, (size_t) cfg.audio_pad_code + 1u);
+}
+
+static int64_t moss_find_last_equal(const std::vector<llama_token> & values, llama_token target) {
+    for (int64_t i = (int64_t) values.size() - 1; i >= 0; --i) {
+        if (values[(size_t) i] == target) {
+            return i;
+        }
+    }
+    return -1;
+}
+
+static moss_delay_state moss_init_delay_state(
+        const std::vector<llama_token> & packed_input_ids,
+        const moss_delay_config & cfg) {
+    GGML_ASSERT(cfg.n_vq > 0);
+    GGML_ASSERT(packed_input_ids.size() % cfg.packed_stride() == 0);
+
+    moss_delay_state state;
+    state.n_vq = cfg.n_vq;
+
+    const size_t seq_len = packed_input_ids.size() / cfg.packed_stride();
+    state.text_history.resize(seq_len);
+    state.reserve_audio_frames(std::max<size_t>(seq_len + 1024, 256));
+
+    for (size_t t = 0; t < seq_len; ++t) {
+        const size_t row = t * cfg.packed_stride();
+        state.text_history[t] = packed_input_ids[row];
+        state.audio_history.insert(
+                state.audio_history.end(),
+                packed_input_ids.begin() + row + 1,
+                packed_input_ids.begin() + row + 1 + cfg.n_vq);
+    }
+
+    if (!state.text_history.empty()) {
+        const llama_token last_text_token = state.text_history.back();
+        const bool is_continuation =
+                last_text_token == cfg.audio_start_token_id ||
+                last_text_token == cfg.audio_assistant_gen_slot_token_id;
+        if (is_continuation) {
+            const int64_t audio_start_idx = moss_find_last_equal(state.text_history, cfg.audio_start_token_id);
+            if (audio_start_idx >= 0) {
+                state.audio_length = (int32_t) (seq_len - (size_t) audio_start_idx);
+                state.is_audio = true;
+            }
+        }
+    }
+
+    return state;
+}
+
+static void moss_apply_top_p_inplace(std::vector<float> & logits, size_t n_rows, size_t n_vocab, float top_p) {
+    if (top_p >= 1.0f) {
+        return;
+    }
+
+    for (size_t row = 0; row < n_rows; ++row) {
+        float max_logit = MOSS_NEG_INF;
+        for (size_t col = 0; col < n_vocab; ++col) {
+            max_logit = std::max(max_logit, logits[row * n_vocab + col]);
+        }
+
+        if (!std::isfinite(max_logit)) {
+            continue;
+        }
+
+        std::vector<float> probs(n_vocab, 0.0f);
+        float sum_exp = 0.0f;
+        for (size_t col = 0; col < n_vocab; ++col) {
+            const float logit = logits[row * n_vocab + col];
+            if (std::isfinite(logit)) {
+                probs[col] = std::exp(logit - max_logit);
+                sum_exp += probs[col];
+            }
+        }
+
+        if (!(sum_exp > 0.0f) || !std::isfinite(sum_exp)) {
+            continue;
+        }
+
+        for (float & p : probs) {
+            p /= sum_exp;
+        }
+
+        std::vector<size_t> sorted_idx(n_vocab);
+        std::iota(sorted_idx.begin(), sorted_idx.end(), 0);
+        std::sort(sorted_idx.begin(), sorted_idx.end(), [&](size_t a, size_t b) {
+            return probs[a] > probs[b];
+        });
+
+        float cum_probs = 0.0f;
+        bool prev_remove = false;
+        for (size_t rank = 0; rank < n_vocab; ++rank) {
+            const size_t idx = sorted_idx[rank];
+            cum_probs += probs[idx];
+
+            bool remove = cum_probs > top_p;
+            if (rank > 0) {
+                remove = prev_remove;
+            } else {
+                remove = false;
+            }
+            prev_remove = cum_probs > top_p;
+
+            if (remove) {
+                logits[row * n_vocab + idx] = MOSS_NEG_INF;
+            }
+        }
+    }
+}
+
+static void moss_apply_repetition_penalty_inplace(
+        std::vector<float> & logits,
+        size_t n_rows,
+        size_t n_vocab,
+        const std::vector<llama_token> * prev_tokens,
+        float penalty) {
+    if (penalty == 1.0f || prev_tokens == nullptr || prev_tokens->empty()) {
+        return;
+    }
+
+    std::vector<uint8_t> seen(n_vocab, 0);
+    for (llama_token tok : *prev_tokens) {
+        if (tok >= 0 && (size_t) tok < n_vocab) {
+            seen[(size_t) tok] = 1;
+        }
+    }
+
+    for (size_t col = 0; col < n_vocab; ++col) {
+        if (!seen[col]) {
+            continue;
+        }
+        for (size_t row = 0; row < n_rows; ++row) {
+            float & logit = logits[row * n_vocab + col];
+            if (logit > 0.0f) {
+                logit /= penalty;
+            } else {
+                logit *= penalty;
+            }
+        }
+    }
+}
+
+static llama_token moss_argmax_row(const std::vector<float> & logits, size_t row, size_t n_vocab) {
+    size_t best_idx = 0;
+    float best_val = logits[row * n_vocab + 0];
+    for (size_t col = 1; col < n_vocab; ++col) {
+        const float cur = logits[row * n_vocab + col];
+        if (cur > best_val) {
+            best_val = cur;
+            best_idx = col;
+        }
+    }
+    return (llama_token) best_idx;
+}
+
+static llama_token moss_multinomial_row(
+        const std::vector<float> & probs,
+        size_t row,
+        size_t n_vocab,
+        moss_rng & rng) {
+    const float * row_probs = probs.data() + row * n_vocab;
+    std::uniform_real_distribution<float> dist(0.0f, 1.0f);
+    const float r = dist(rng);
+
+    float cum = 0.0f;
+    size_t idx = 0;
+    for (; idx < n_vocab; ++idx) {
+        cum += row_probs[idx];
+        if (!(cum < r)) {
+            break;
+        }
+    }
+    if (idx >= n_vocab) {
+        idx = n_vocab - 1;
+    }
+    return (llama_token) idx;
+}
+
+static std::vector<float> moss_softmax(const std::vector<float> & logits, size_t n_rows, size_t n_vocab) {
+    std::vector<float> probs(n_rows * n_vocab, 0.0f);
+
+    for (size_t row = 0; row < n_rows; ++row) {
+        float max_logit = MOSS_NEG_INF;
+        for (size_t col = 0; col < n_vocab; ++col) {
+            max_logit = std::max(max_logit, logits[row * n_vocab + col]);
+        }
+
+        if (!std::isfinite(max_logit)) {
+            probs[row * n_vocab + 0] = 1.0f;
+            continue;
+        }
+
+        float sum_exp = 0.0f;
+        for (size_t col = 0; col < n_vocab; ++col) {
+            const float logit = logits[row * n_vocab + col];
+            if (std::isfinite(logit)) {
+                probs[row * n_vocab + col] = std::exp(logit - max_logit);
+                sum_exp += probs[row * n_vocab + col];
+            }
+        }
+
+        if (!(sum_exp > 0.0f) || !std::isfinite(sum_exp)) {
+            probs[row * n_vocab + 0] = 1.0f;
+            continue;
+        }
+
+        for (size_t col = 0; col < n_vocab; ++col) {
+            probs[row * n_vocab + col] /= sum_exp;
+        }
+    }
+
+    return probs;
+}
+
+static std::vector<llama_token> moss_sample_token(
+        const std::vector<float> & logits_in,
+        size_t n_rows,
+        size_t n_vocab,
+        moss_rng & rng,
+        const std::vector<llama_token> * prev_tokens = nullptr,
+        float repetition_penalty = 1.0f,
+        float top_p = 1.0f,
+        int32_t top_k = 0,
+        bool do_sample = true) {
+    GGML_ASSERT(logits_in.size() == n_rows * n_vocab);
+
+    std::vector<float> logits = logits_in;
+    moss_apply_repetition_penalty_inplace(logits, n_rows, n_vocab, prev_tokens, repetition_penalty);
+
+    std::vector<llama_token> tokens(n_rows, 0);
+    if (!do_sample) {
+        for (size_t row = 0; row < n_rows; ++row) {
+            tokens[row] = moss_argmax_row(logits, row, n_vocab);
+        }
+        return tokens;
+    }
+
+    if (top_k > 0) {
+        const size_t k = std::min<size_t>((size_t) top_k, n_vocab);
+        for (size_t row = 0; row < n_rows; ++row) {
+            std::vector<size_t> top_idx(n_vocab);
+            std::iota(top_idx.begin(), top_idx.end(), 0);
+            std::nth_element(top_idx.begin(), top_idx.end() - k, top_idx.end(), [&](size_t a, size_t b) {
+                return logits[row * n_vocab + a] < logits[row * n_vocab + b];
+            });
+            top_idx.erase(top_idx.begin(), top_idx.end() - k);
+
+            std::vector<float> top_vals(k);
+            for (size_t i = 0; i < k; ++i) {
+                top_vals[i] = logits[row * n_vocab + top_idx[i]];
+            }
+
+            if (top_p < 1.0f) {
+                moss_apply_top_p_inplace(top_vals, 1, k, top_p);
+            }
+
+            const std::vector<float> probs = moss_softmax(top_vals, 1, k);
+            const llama_token local = moss_multinomial_row(probs, 0, k, rng);
+            tokens[row] = (llama_token) top_idx[(size_t) local];
+        }
+        return tokens;
+    }
+
+    if (top_p < 1.0f) {
+        moss_apply_top_p_inplace(logits, n_rows, n_vocab, top_p);
+    }
+    const std::vector<float> probs = moss_softmax(logits, n_rows, n_vocab);
+    for (size_t row = 0; row < n_rows; ++row) {
+        tokens[row] = moss_multinomial_row(probs, row, n_vocab, rng);
+    }
+
+    return tokens;
+}
+
+static std::vector<llama_token> moss_collect_audio_history_channels(
+        const moss_delay_state & state,
+        const std::vector<size_t> & channels) {
+    if (channels.empty() || state.empty_audio()) {
+        return {};
+    }
+
+    std::vector<llama_token> out;
+    out.reserve(state.audio_frames() * channels.size());
+    for (size_t frame = 0; frame < state.audio_frames(); ++frame) {
+        const llama_token * audio = state.audio_frame_ptr(frame);
+        for (size_t channel : channels) {
+            out.push_back(audio[channel]);
+        }
+    }
+    return out;
+}
+
+static std::vector<llama_token> moss_delay_step(
+        moss_delay_state & state,
+        const std::vector<float> & text_logits,
+        const std::vector<float> & audio_logits,
+        const moss_sampling_config & sampling_cfg,
+        const moss_delay_config & cfg,
+        moss_rng & rng) {
+    GGML_ASSERT(cfg.n_vq == state.n_vq);
+
+    const size_t n_vq = cfg.n_vq;
+    const size_t text_vocab = text_logits.size();
+    const size_t audio_vocab = moss_audio_vocab_with_pad(cfg);
+    GGML_ASSERT(audio_logits.size() == n_vq * audio_vocab);
+
+    std::vector<llama_token> result(cfg.packed_stride(), cfg.audio_pad_code);
+    if (state.is_stopping) {
+        result[0] = cfg.pad_token_id;
+        return result;
+    }
+
+    llama_token next_text = cfg.pad_token_id;
+
+    if (state.delayed_length < (int64_t) n_vq) {
+        next_text = cfg.audio_assistant_delay_slot_token_id;
+    } else if (state.delayed_length == (int64_t) n_vq) {
+        next_text = cfg.audio_end_token_id;
+        state.is_audio = false;
+    } else {
+        std::vector<float> scaled = text_logits;
+        const float text_temp = sampling_cfg.text_temperature > 0.0f ? sampling_cfg.text_temperature : 1.0f;
+        const bool text_do_sample = sampling_cfg.text_temperature > 0.0f;
+        for (float & v : scaled) {
+            v /= text_temp;
+        }
+
+        if (!state.is_audio) {
+            const llama_token excluded[] = {
+                cfg.pad_token_id,
+                cfg.audio_assistant_gen_slot_token_id,
+                cfg.audio_assistant_delay_slot_token_id,
+                cfg.audio_end_token_id,
+            };
+            for (llama_token tok : excluded) {
+                if (tok >= 0 && (size_t) tok < text_vocab) {
+                    scaled[(size_t) tok] = MOSS_NEG_INF;
+                }
+            }
+        } else {
+            std::fill(scaled.begin(), scaled.end(), MOSS_NEG_INF);
+            if ((size_t) cfg.audio_assistant_gen_slot_token_id < text_vocab) {
+                scaled[(size_t) cfg.audio_assistant_gen_slot_token_id] =
+                        text_logits[(size_t) cfg.audio_assistant_gen_slot_token_id] / text_temp;
+            }
+            if ((size_t) cfg.audio_assistant_delay_slot_token_id < text_vocab) {
+                scaled[(size_t) cfg.audio_assistant_delay_slot_token_id] =
+                        text_logits[(size_t) cfg.audio_assistant_delay_slot_token_id] / text_temp;
+            }
+        }
+
+        if (state.time_step == 0 && (size_t) cfg.audio_assistant_delay_slot_token_id < text_vocab) {
+            scaled[(size_t) cfg.audio_assistant_delay_slot_token_id] = MOSS_NEG_INF;
+        }
+        if (state.time_step <= (int32_t) n_vq && (size_t) cfg.im_end_token_id < text_vocab) {
+            scaled[(size_t) cfg.im_end_token_id] = MOSS_NEG_INF;
+        }
+
+        next_text = moss_sample_token(
+                scaled, 1, text_vocab, rng, nullptr, 1.0f,
+                sampling_cfg.text_top_p, sampling_cfg.text_top_k, text_do_sample)[0];
+    }
+
+    if (next_text == cfg.audio_start_token_id) {
+        state.is_audio = true;
+    }
+    if (next_text == cfg.im_end_token_id) {
+        state.is_stopping = true;
+    }
+
+    std::vector<llama_token> next_audio(n_vq, cfg.audio_pad_code);
+    bool any_sampling = false;
+    for (size_t channel = 0; channel < n_vq; ++channel) {
+        const bool pre_audio = channel < (size_t) std::max(state.audio_length, 0);
+        const bool post_audio = state.delayed_length == MOSS_DELAY_INT64_MAX ||
+                channel > (size_t) std::max<int64_t>(state.delayed_length - 1, -1);
+        any_sampling = any_sampling || (pre_audio && post_audio);
+    }
+
+    if (any_sampling) {
+        std::vector<float> scaled_audio = audio_logits;
+        const float audio_temp = sampling_cfg.audio_temperature > 0.0f ? sampling_cfg.audio_temperature : 1.0f;
+        const bool audio_do_sample = sampling_cfg.audio_temperature > 0.0f;
+        for (float & v : scaled_audio) {
+            v /= audio_temp;
+        }
+        if ((size_t) cfg.audio_pad_code < audio_vocab) {
+            for (size_t channel = 0; channel < n_vq; ++channel) {
+                scaled_audio[channel * audio_vocab + (size_t) cfg.audio_pad_code] = MOSS_NEG_INF;
+            }
+        }
+
+        const bool sample_ch0 =
+                0 < (size_t) std::max(state.audio_length, 0) &&
+                (state.delayed_length == MOSS_DELAY_INT64_MAX ||
+                 0 > std::max<int64_t>(state.delayed_length - 1, -1));
+        if (sample_ch0) {
+            const std::vector<size_t> ch0 = {0};
+            const std::vector<llama_token> prev = moss_collect_audio_history_channels(state, ch0);
+            const std::vector<float> ch0_logits(scaled_audio.begin(), scaled_audio.begin() + audio_vocab);
+            next_audio[0] = moss_sample_token(
+                    ch0_logits, 1, audio_vocab, rng, &prev,
+                    sampling_cfg.audio_repetition_penalty,
+                    sampling_cfg.audio_top_p,
+                    sampling_cfg.audio_top_k,
+                    audio_do_sample)[0];
+        }
+
+        std::vector<size_t> rest_channels;
+        for (size_t channel = 1; channel < n_vq; ++channel) {
+            const bool pre_audio = channel < (size_t) std::max(state.audio_length, 0);
+            const bool post_audio = state.delayed_length == MOSS_DELAY_INT64_MAX ||
+                    channel > (size_t) std::max<int64_t>(state.delayed_length - 1, -1);
+            if (pre_audio && post_audio) {
+                rest_channels.push_back(channel);
+            }
+        }
+
+        if (!rest_channels.empty()) {
+            std::vector<float> rest_logits(rest_channels.size() * audio_vocab);
+            for (size_t i = 0; i < rest_channels.size(); ++i) {
+                const size_t channel = rest_channels[i];
+                std::copy_n(
+                        scaled_audio.begin() + channel * audio_vocab,
+                        audio_vocab,
+                        rest_logits.begin() + i * audio_vocab);
+            }
+            const std::vector<llama_token> prev = moss_collect_audio_history_channels(state, rest_channels);
+            const std::vector<llama_token> sampled = moss_sample_token(
+                    rest_logits, rest_channels.size(), audio_vocab, rng, &prev,
+                    sampling_cfg.audio_repetition_penalty,
+                    sampling_cfg.audio_top_p,
+                    sampling_cfg.audio_top_k,
+                    audio_do_sample);
+            for (size_t i = 0; i < rest_channels.size(); ++i) {
+                next_audio[rest_channels[i]] = sampled[i];
+            }
+        }
+    }
+
+    if (next_text == cfg.audio_start_token_id ||
+            next_text == cfg.audio_assistant_gen_slot_token_id ||
+            next_text == cfg.audio_assistant_delay_slot_token_id) {
+        state.audio_length += 1;
+    }
+    if (next_text == cfg.audio_end_token_id) {
+        state.audio_length = 0;
+    }
+
+    if (state.delayed_length == MOSS_DELAY_INT64_MAX && next_text == cfg.audio_assistant_delay_slot_token_id) {
+        state.delayed_length = 0;
+    }
+    if (state.delayed_length != MOSS_DELAY_INT64_MAX) {
+        state.delayed_length += 1;
+    }
+    if (state.delayed_length > (int64_t) n_vq) {
+        state.delayed_length = MOSS_DELAY_INT64_MAX;
+    }
+
+    state.time_step += 1;
+    state.text_history.push_back(next_text);
+    state.append_audio(next_audio);
+
+    result[0] = next_text;
+    std::copy(next_audio.begin(), next_audio.end(), result.begin() + 1);
+    return result;
+}
+
+static std::vector<llama_token> moss_apply_delay_pattern(
+        const std::vector<llama_token> & codes,
+        size_t n_frames,
+        const moss_delay_config & cfg) {
+    GGML_ASSERT(cfg.n_vq > 0);
+    GGML_ASSERT(codes.size() == n_frames * cfg.n_vq);
+
+    const size_t delayed_frames = n_frames + cfg.n_vq - 1;
+    std::vector<llama_token> delayed(delayed_frames * cfg.n_vq, cfg.audio_pad_code);
+
+    for (size_t channel = 0; channel < cfg.n_vq; ++channel) {
+        for (size_t t = 0; t < n_frames; ++t) {
+            delayed[(channel + t) * cfg.n_vq + channel] = codes[t * cfg.n_vq + channel];
+        }
+    }
+
+    return delayed;
+}
+
+static std::vector<llama_token> moss_apply_de_delay_pattern(
+        const std::vector<llama_token> & delayed_codes,
+        size_t delayed_frames,
+        const moss_delay_config & cfg,
+        size_t * out_frames = nullptr) {
+    GGML_ASSERT(cfg.n_vq > 0);
+    GGML_ASSERT(delayed_codes.size() == delayed_frames * cfg.n_vq);
+
+    if (delayed_frames + 1 <= cfg.n_vq) {
+        if (out_frames != nullptr) {
+            *out_frames = 0;
+        }
+        return {};
+    }
+
+    const size_t n_frames = delayed_frames - cfg.n_vq + 1;
+    std::vector<llama_token> codes(n_frames * cfg.n_vq);
+    for (size_t channel = 0; channel < cfg.n_vq; ++channel) {
+        for (size_t t = 0; t < n_frames; ++t) {
+            codes[t * cfg.n_vq + channel] = delayed_codes[(channel + t) * cfg.n_vq + channel];
+        }
+    }
+
+    if (out_frames != nullptr) {
+        *out_frames = n_frames;
+    }
+
+    return codes;
+}
+
+static std::vector<moss_audio_segment> moss_extract_audio_segments(
+        const std::vector<llama_token> & generation_audio,
+        size_t delayed_frames,
+        const moss_delay_config & cfg) {
+    size_t n_frames = 0;
+    const std::vector<llama_token> codes = moss_apply_de_delay_pattern(generation_audio, delayed_frames, cfg, &n_frames);
+    if (n_frames == 0) {
+        return {};
+    }
+
+    std::vector<moss_audio_segment> segments;
+    size_t cur_start = SIZE_MAX;
+
+    for (size_t t = 0; t < n_frames; ++t) {
+        bool is_pad = true;
+        for (size_t channel = 0; channel < cfg.n_vq; ++channel) {
+            if (codes[t * cfg.n_vq + channel] != cfg.audio_pad_code) {
+                is_pad = false;
+                break;
+            }
+        }
+
+        if (!is_pad && cur_start == SIZE_MAX) {
+            cur_start = t;
+        }
+
+        const bool close_segment = cur_start != SIZE_MAX && (is_pad || t + 1 == n_frames);
+        if (close_segment) {
+            const size_t cur_end = is_pad ? t : t + 1;
+            moss_audio_segment seg;
+            seg.n_frames = cur_end - cur_start;
+            seg.codes.insert(
+                    seg.codes.end(),
+                    codes.begin() + cur_start * cfg.n_vq,
+                    codes.begin() + cur_end * cfg.n_vq);
+            segments.push_back(std::move(seg));
+            cur_start = SIZE_MAX;
+        }
+    }
+
+    return segments;
+}
+
+static std::string moss_delay_config_to_string(const moss_delay_config & cfg) {
+    std::ostringstream oss;
+    oss
+        << "n_vq=" << cfg.n_vq
+        << " pad_token_id=" << cfg.pad_token_id
+        << " im_start_token_id=" << cfg.im_start_token_id
+        << " im_end_token_id=" << cfg.im_end_token_id
+        << " audio_start_token_id=" << cfg.audio_start_token_id
+        << " audio_end_token_id=" << cfg.audio_end_token_id
+        << " audio_user_slot_token_id=" << cfg.audio_user_slot_token_id
+        << " audio_gen_slot_token_id=" << cfg.audio_assistant_gen_slot_token_id
+        << " audio_delay_slot_token_id=" << cfg.audio_assistant_delay_slot_token_id
+        << " audio_pad_code=" << cfg.audio_pad_code
+        << " audio_vocab_size=" << cfg.audio_vocab_size;
+    return oss.str();
+}
+
+static bool moss_delay_self_test() {
+    moss_delay_config cfg;
+
+    std::vector<llama_token> codes = {
+        10, 11, 12,
+        20, 21, 22,
+        30, 31, 32,
+    };
+    cfg.n_vq = 3;
+    cfg.audio_pad_code = 99;
+
+    const std::vector<llama_token> delayed = moss_apply_delay_pattern(codes, 3, cfg);
+    const std::vector<llama_token> expected_delayed = {
+        10, 99, 99,
+        20, 11, 99,
+        30, 21, 12,
+        99, 31, 22,
+        99, 99, 32,
+    };
+    if (delayed != expected_delayed) {
+        return false;
+    }
+
+    size_t dedelayed_frames = 0;
+    const std::vector<llama_token> restored = moss_apply_de_delay_pattern(delayed, 5, cfg, &dedelayed_frames);
+    if (dedelayed_frames != 3 || restored != codes) {
+        return false;
+    }
+
+    std::vector<llama_token> packed = {
+        1, 99, 99, 99,
+        cfg.audio_start_token_id, 10, 11, 12,
+        cfg.audio_assistant_gen_slot_token_id, 20, 21, 22,
+    };
+    const moss_delay_state state = moss_init_delay_state(packed, cfg);
+    if (!(state.text_history.size() == 3 &&
+            state.audio_frames() == 3 &&
+            state.is_audio &&
+            state.audio_length == 2 &&
+            !state.is_stopping &&
+            state.time_step == 0)) {
+        return false;
+    }
+
+    {
+        std::vector<float> logits = {
+            3.0f, 2.0f, 1.0f,
+            1.0f, 3.0f, 2.0f,
+        };
+        std::vector<llama_token> prev = {1};
+        moss_apply_repetition_penalty_inplace(logits, 2, 3, &prev, 2.0f);
+        if (std::fabs(logits[1] - 1.0f) > 1e-6f || std::fabs(logits[4] - 1.5f) > 1e-6f) {
+            return false;
+        }
+    }
+
+    {
+        std::vector<float> logits = {5.0f, 4.0f, 1.0f};
+        moss_apply_top_p_inplace(logits, 1, 3, 0.7f);
+        if (!std::isfinite(logits[0]) || std::isfinite(logits[1]) || std::isfinite(logits[2])) {
+            return false;
+        }
+    }
+
+    {
+        moss_rng rng(123);
+        const std::vector<float> logits = {
+            1.0f, 9.0f, 3.0f,
+            2.0f, 1.0f, 8.0f,
+        };
+        const std::vector<llama_token> sampled = moss_sample_token(logits, 2, 3, rng, nullptr, 1.0f, 1.0f, 1, true);
+        if (sampled.size() != 2 || sampled[0] != 1 || sampled[1] != 2) {
+            return false;
+        }
+    }
+
+    {
+        moss_delay_state step_state;
+        step_state.n_vq = 3;
+        step_state.audio_length = 2;
+        step_state.is_audio = true;
+        step_state.text_history = {cfg.audio_start_token_id, cfg.audio_assistant_gen_slot_token_id};
+        step_state.audio_history = {
+            3, 4, cfg.audio_pad_code,
+            5, 6, cfg.audio_pad_code,
+        };
+
+        const std::vector<float> text_logits = {
+            0.0f, 0.0f, 0.0f, 0.0f, 10.0f, 9.0f, 0.0f, 0.0f,
+        };
+        moss_delay_config step_cfg = cfg;
+        step_cfg.pad_token_id = 0;
+        step_cfg.im_end_token_id = 1;
+        step_cfg.audio_start_token_id = 2;
+        step_cfg.audio_end_token_id = 3;
+        step_cfg.audio_assistant_gen_slot_token_id = 4;
+        step_cfg.audio_assistant_delay_slot_token_id = 5;
+        step_cfg.audio_pad_code = 7;
+        step_cfg.audio_vocab_size = 7;
+
+        const std::vector<float> audio_logits = {
+            1.0f, 8.0f, 2.0f, 1.0f, 1.0f, 1.0f, 1.0f, -100.0f,
+            2.0f, 1.0f, 9.0f, 1.0f, 1.0f, 1.0f, 1.0f, -100.0f,
+            9.0f, 1.0f, 2.0f, 1.0f, 1.0f, 1.0f, 1.0f, -100.0f,
+        };
+        moss_sampling_config sampling_cfg;
+        sampling_cfg.text_temperature = 1.0f;
+        sampling_cfg.text_top_k = 1;
+        sampling_cfg.audio_temperature = 1.0f;
+        sampling_cfg.audio_top_k = 1;
+
+        moss_rng rng(7);
+        const std::vector<llama_token> next = moss_delay_step(
+                step_state, text_logits, audio_logits, sampling_cfg, step_cfg, rng);
+        if (next.size() != 4 || next[0] != 4 || next[1] != 1 || next[2] != 2 || next[3] != 7) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+} // namespace
+
+int main(int argc, char ** argv) {
+    std::string model_path;
+    bool print_delay_config = false;
+    bool self_test = false;
+
+    for (int i = 1; i < argc; ++i) {
+        const std::string arg = argv[i];
+        if ((arg == "-m" || arg == "--model") && i + 1 < argc) {
+            model_path = argv[++i];
+            continue;
+        }
+        if (arg == "--print-delay-config") {
+            print_delay_config = true;
+            continue;
+        }
+        if (arg == "--self-test-delay-state") {
+            self_test = true;
+            continue;
+        }
+        if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv);
+            return EXIT_SUCCESS;
+        }
+
+        LOG_ERR("unknown argument: %s\n", arg.c_str());
+        print_usage(argc, argv);
+        return EXIT_FAILURE;
+    }
+
+    if (self_test) {
+        if (!moss_delay_self_test()) {
+            LOG_ERR("moss delay state self-test failed\n");
+            return EXIT_FAILURE;
+        }
+        LOG("moss delay state self-test: ok\n");
+    }
+
+    if (!print_delay_config) {
+        if (self_test) {
+            return EXIT_SUCCESS;
+        }
+        LOG("moss delay state and multi-head sampler are in place; audio decode is not implemented yet.\n");
+        LOG("use --print-delay-config with -m <model.gguf> to inspect model metadata.\n");
+        return EXIT_SUCCESS;
+    }
+
+    if (model_path.empty()) {
+        LOG_ERR("--print-delay-config requires -m <model.gguf>\n");
+        return EXIT_FAILURE;
+    }
+
+    llama_backend_init();
+
+    llama_model_params mparams = llama_model_default_params();
+    mparams.use_mmap = true;
+
+    llama_model * model = llama_model_load_from_file(model_path.c_str(), mparams);
+    if (model == nullptr) {
+        LOG_ERR("failed to load model: %s\n", model_path.c_str());
+        llama_backend_free();
+        return EXIT_FAILURE;
+    }
+
+    const moss_delay_config cfg = moss_delay_config_from_model(model);
+    LOG("%s\n", moss_delay_config_to_string(cfg).c_str());
+
+    llama_model_free(model);
+    llama_backend_free();
+
+    return EXIT_SUCCESS;
+}

From 53480ea91c905cac343888722362b086c0129748 Mon Sep 17 00:00:00 2001
From: expec <expec@localhost>
Date: Wed, 11 Mar 2026 16:19:12 +0800
Subject: [PATCH 05/20] moss-tts: add first-class generation parity runner

---
 scripts/run-moss-tts-delay-8b-quality.sh      | 195 +++++
 scripts/run-moss-tts-delay-decode-parity.sh   |  19 +
 .../run-moss-tts-delay-firstclass-parity.sh   | 202 +++++
 tests/moss_tts_delay_export_decode_ref.py     |  94 +++
 tests/moss_tts_delay_export_generation_ref.py | 107 +++
 tools/tts/moss-tts-audio-decode.py            |  83 ++
 tools/tts/moss-tts.cpp                        | 770 +++++++++++++++++-
 7 files changed, 1469 insertions(+), 1 deletion(-)
 create mode 100755 scripts/run-moss-tts-delay-8b-quality.sh
 create mode 100755 scripts/run-moss-tts-delay-decode-parity.sh
 create mode 100755 scripts/run-moss-tts-delay-firstclass-parity.sh
 create mode 100755 tests/moss_tts_delay_export_decode_ref.py
 create mode 100755 tests/moss_tts_delay_export_generation_ref.py
 create mode 100755 tools/tts/moss-tts-audio-decode.py

diff --git a/scripts/run-moss-tts-delay-8b-quality.sh b/scripts/run-moss-tts-delay-8b-quality.sh
new file mode 100755
index 000000000..3c802ade6
--- /dev/null
+++ b/scripts/run-moss-tts-delay-8b-quality.sh
@@ -0,0 +1,195 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# End-to-end setup for MOSS-TTS-Delay 8B quality smoke test on a fresh machine.
+#
+# What it does:
+# 1. Clones this llama.cpp fork and the official OpenMOSS/MOSS-TTS repo.
+# 2. Creates/uses a conda env and installs the minimal Python stack.
+# 3. Downloads the official GGUF backbone + embeddings/lm_heads/tokenizer.
+# 4. Downloads the official ONNX audio tokenizer.
+# 5. Builds llama-moss-tts and runs the C++ vs Python de-delay/raw-code parity test.
+# 6. Runs the official Python llama_cpp backend to synthesize wavs for listening.
+#
+# Defaults target a CUDA machine. For CPU-only ONNX Runtime:
+#   ORT_PKG=onnxruntime USE_GPU_AUDIO=false bash run-moss-tts-delay-8b-quality.sh
+
+WORKDIR="${WORKDIR:-$HOME/moss-tts-delay-8b-eval}"
+CONDA_ENV="${CONDA_ENV:-moss-tts-delay-8b}"
+PYTHON_VERSION="${PYTHON_VERSION:-3.11}"
+
+LLAMA_CPP_REPO="${LLAMA_CPP_REPO:-https://github.com/expectqwq/llama.cpp.git}"
+LLAMA_CPP_REF="${LLAMA_CPP_REF:-master}"
+MOSS_TTS_REPO="${MOSS_TTS_REPO:-https://github.com/OpenMOSS/MOSS-TTS.git}"
+MOSS_TTS_REF="${MOSS_TTS_REF:-main}"
+
+ORT_PKG="${ORT_PKG:-onnxruntime-gpu}"
+USE_GPU_AUDIO="${USE_GPU_AUDIO:-true}"
+N_JOBS="${N_JOBS:-$(nproc)}"
+
+TEXT_ZH="${TEXT_ZH:-今天天气很好，我们来测试一下 MOSS-TTS Delay 8B 的音质和稳定性。}"
+TEXT_EN="${TEXT_EN:-Hello, this is a quality smoke test for the MOSS-TTS Delay 8B pipeline running with llama.cpp and the ONNX audio tokenizer.}"
+REFERENCE_AUDIO="${REFERENCE_AUDIO:-}"
+
+HF_MODEL_REPO="${HF_MODEL_REPO:-OpenMOSS-Team/MOSS-TTS-GGUF}"
+HF_AUDIO_REPO="${HF_AUDIO_REPO:-OpenMOSS-Team/MOSS-Audio-Tokenizer-ONNX}"
+
+LLAMA_CPP_DIR="$WORKDIR/llama.cpp"
+MOSS_TTS_DIR="$WORKDIR/MOSS-TTS"
+WEIGHTS_DIR="$WORKDIR/weights"
+GGUF_DIR="$WEIGHTS_DIR/MOSS-TTS-GGUF"
+AUDIO_ORT_DIR="$WEIGHTS_DIR/MOSS-Audio-Tokenizer-ONNX"
+OUT_DIR="$WORKDIR/out"
+CONFIG_PATH="$WORKDIR/moss_delay_8b_eval.yaml"
+
+mkdir -p "$WORKDIR" "$WEIGHTS_DIR" "$OUT_DIR"
+
+need_cmd() {
+    command -v "$1" >/dev/null 2>&1 || {
+        echo "error: required command not found: $1" >&2
+        exit 1
+    }
+}
+
+git_clone_or_update() {
+    local repo_url="$1"
+    local repo_dir="$2"
+    local repo_ref="$3"
+
+    if [[ ! -d "$repo_dir/.git" ]]; then
+        git clone "$repo_url" "$repo_dir"
+    fi
+
+    git -C "$repo_dir" fetch --all --tags
+    git -C "$repo_dir" checkout "$repo_ref"
+    git -C "$repo_dir" pull --ff-only || true
+}
+
+need_cmd git
+need_cmd cmake
+need_cmd conda
+
+source "$(conda info --base)/etc/profile.d/conda.sh"
+
+if ! conda env list | awk '{print $1}' | grep -qx "$CONDA_ENV"; then
+    conda create -y -n "$CONDA_ENV" "python=$PYTHON_VERSION"
+fi
+conda activate "$CONDA_ENV"
+
+python -m pip install --upgrade pip setuptools wheel
+python -m pip install --upgrade "huggingface_hub[cli]>=0.30"
+
+git_clone_or_update "$LLAMA_CPP_REPO" "$LLAMA_CPP_DIR" "$LLAMA_CPP_REF"
+git_clone_or_update "$MOSS_TTS_REPO" "$MOSS_TTS_DIR" "$MOSS_TTS_REF"
+git -C "$MOSS_TTS_DIR" submodule update --init --recursive
+
+if [[ "$ORT_PKG" == "onnxruntime-gpu" ]]; then
+    python -m pip install -e "${MOSS_TTS_DIR}[llama-cpp-onnx]"
+else
+    python -m pip install -e "${MOSS_TTS_DIR}[llama-cpp]"
+    python -m pip install --upgrade "${ORT_PKG}>=1.19"
+fi
+
+huggingface-cli download "$HF_MODEL_REPO" --local-dir "$GGUF_DIR"
+huggingface-cli download "$HF_AUDIO_REPO" --local-dir "$AUDIO_ORT_DIR"
+
+if [[ -z "$REFERENCE_AUDIO" ]]; then
+    REFERENCE_AUDIO="$MOSS_TTS_DIR/assets/audio/reference_zh.wav"
+fi
+
+if [[ ! -f "$GGUF_DIR/MOSS_TTS_Q4_K_M.gguf" ]]; then
+    echo "error: expected backbone file missing: $GGUF_DIR/MOSS_TTS_Q4_K_M.gguf" >&2
+    exit 1
+fi
+
+if [[ ! -f "$AUDIO_ORT_DIR/encoder.onnx" || ! -f "$AUDIO_ORT_DIR/decoder.onnx" ]]; then
+    echo "error: expected ONNX audio tokenizer files missing in $AUDIO_ORT_DIR" >&2
+    exit 1
+fi
+
+cat > "$CONFIG_PATH" <<EOF
+backbone_gguf: $GGUF_DIR/MOSS_TTS_Q4_K_M.gguf
+embedding_dir: $GGUF_DIR/embeddings
+lm_head_dir: $GGUF_DIR/lm_heads
+tokenizer_dir: $GGUF_DIR/tokenizer
+
+audio_backend: onnx
+audio_encoder_onnx: $AUDIO_ORT_DIR/encoder.onnx
+audio_decoder_onnx: $AUDIO_ORT_DIR/decoder.onnx
+
+heads_backend: auto
+
+n_ctx: 4096
+n_batch: 512
+n_threads: 8
+n_gpu_layers: -1
+max_new_tokens: 2000
+use_gpu_audio: $USE_GPU_AUDIO
+
+text_temperature: 1.5
+text_top_p: 1.0
+text_top_k: 50
+
+audio_temperature: 1.7
+audio_top_p: 0.8
+audio_top_k: 25
+audio_repetition_penalty: 1.0
+EOF
+
+cmake -S "$LLAMA_CPP_DIR" -B "$LLAMA_CPP_DIR/build"
+cmake --build "$LLAMA_CPP_DIR/build" --target llama-moss-tts -j"$N_JOBS"
+
+echo
+echo "== Running C++ vs Python decode parity =="
+"$LLAMA_CPP_DIR/scripts/run-moss-tts-delay-decode-parity.sh"
+
+echo
+echo "== Generating zero-shot quality sample (ZH) =="
+python -m moss_tts_delay.llama_cpp \
+    --config "$CONFIG_PATH" \
+    --text "$TEXT_ZH" \
+    --output "$OUT_DIR/zero_shot_zh.wav" \
+    --profile
+
+echo
+echo "== Generating zero-shot quality sample (EN) =="
+python -m moss_tts_delay.llama_cpp \
+    --config "$CONFIG_PATH" \
+    --text "$TEXT_EN" \
+    --output "$OUT_DIR/zero_shot_en.wav" \
+    --profile
+
+if [[ -f "$REFERENCE_AUDIO" ]]; then
+    echo
+    echo "== Generating voice-clone quality sample =="
+    python -m moss_tts_delay.llama_cpp \
+        --config "$CONFIG_PATH" \
+        --reference "$REFERENCE_AUDIO" \
+        --text "$TEXT_ZH" \
+        --output "$OUT_DIR/clone_zh.wav" \
+        --profile
+fi
+
+python - <<PY
+from pathlib import Path
+import wave
+
+out_dir = Path("$OUT_DIR")
+print("\\n== Output summary ==")
+for wav_path in sorted(out_dir.glob("*.wav")):
+    with wave.open(str(wav_path), "rb") as f:
+        sr = f.getframerate()
+        n = f.getnframes()
+        ch = f.getnchannels()
+    dur = n / max(sr, 1)
+    print(f"{wav_path}: sr={sr}Hz channels={ch} duration={dur:.2f}s frames={n}")
+print("\\nListen to the generated wavs above for subjective quality.")
+PY
+
+echo
+echo "Done."
+echo "Artifacts:"
+echo "  config: $CONFIG_PATH"
+echo "  outputs: $OUT_DIR"
+echo "  gguf weights: $GGUF_DIR"
+echo "  onnx tokenizer: $AUDIO_ORT_DIR"
diff --git a/scripts/run-moss-tts-delay-decode-parity.sh b/scripts/run-moss-tts-delay-decode-parity.sh
new file mode 100755
index 000000000..85406cfa1
--- /dev/null
+++ b/scripts/run-moss-tts-delay-decode-parity.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+ROOT="/home/expec/workwork/llama.cpp"
+REF_PATH="${REF_PATH:-/home/expec/workwork/tmp/moss_tts_delay_decode.ref.bin}"
+
+source /home/expec/miniconda3/etc/profile.d/conda.sh
+conda activate llama-cpp
+
+python "$ROOT/tests/moss_tts_delay_export_decode_ref.py" "$REF_PATH"
+
+cmake -S "$ROOT" -B "$ROOT/build"
+cmake --build "$ROOT/build" --target llama-moss-tts -j4
+
+"$ROOT/build/bin/llama-moss-tts" \
+  --python-bin python \
+  --decode-parity-ref "$REF_PATH"
+
+echo "PASS: moss-tts delay decode parity verified"
diff --git a/scripts/run-moss-tts-delay-firstclass-parity.sh b/scripts/run-moss-tts-delay-firstclass-parity.sh
new file mode 100755
index 000000000..77039c044
--- /dev/null
+++ b/scripts/run-moss-tts-delay-firstclass-parity.sh
@@ -0,0 +1,202 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Full hybrid parity runner for a machine with enough RAM / disk:
+# - downloads OpenMOSS-Team/MOSS-TTS (HF)
+# - extracts hybrid llama_cpp weights for the official Python pipeline
+# - converts the same HF checkpoint to a first-class MOSS-TTS-Delay GGUF
+# - runs deterministic generation on both paths
+# - compares raw audio codes exactly
+# - decodes both sides to wav through the same ONNX audio tokenizer
+
+WORKDIR="${WORKDIR:-$HOME/moss-tts-delay-firstclass}"
+CONDA_ENV="${CONDA_ENV:-moss-tts-firstclass}"
+PYTHON_VERSION="${PYTHON_VERSION:-3.11}"
+N_JOBS="${N_JOBS:-$(nproc)}"
+
+TEXT="${TEXT:-今天天气很好，我们来验证 first-class MOSS-TTS Delay 在 llama.cpp 中的端到端一致性。}"
+REFERENCE_AUDIO="${REFERENCE_AUDIO:-}"
+MAX_NEW_TOKENS="${MAX_NEW_TOKENS:-512}"
+USE_GPU_AUDIO="${USE_GPU_AUDIO:-true}"
+
+HF_MODEL_ID="${HF_MODEL_ID:-OpenMOSS-Team/MOSS-TTS}"
+HF_AUDIO_REPO="${HF_AUDIO_REPO:-OpenMOSS-Team/MOSS-Audio-Tokenizer-ONNX}"
+ORT_PKG="${ORT_PKG:-onnxruntime-gpu}"
+
+LLAMA_CPP_DIR="${LLAMA_CPP_DIR:-$WORKDIR/llama.cpp}"
+MOSS_TTS_DIR="${MOSS_TTS_DIR:-$WORKDIR/MOSS-TTS}"
+HF_MODEL_DIR="${HF_MODEL_DIR:-$WORKDIR/weights/MOSS-TTS-hf}"
+EXTRACT_DIR="${EXTRACT_DIR:-$WORKDIR/weights/extracted}"
+ONNX_DIR="${ONNX_DIR:-$WORKDIR/weights/MOSS-Audio-Tokenizer-ONNX}"
+PY_CONFIG="${PY_CONFIG:-$WORKDIR/moss_delay_python_ref.yaml}"
+
+BACKBONE_GGUF="${BACKBONE_GGUF:-$WORKDIR/weights/backbone_f16.gguf}"
+FIRSTCLASS_GGUF="${FIRSTCLASS_GGUF:-$WORKDIR/weights/moss_delay_firstclass_f16.gguf}"
+
+GEN_REF_BIN="${GEN_REF_BIN:-$WORKDIR/out/python_generation.ref.bin}"
+PY_WAV="${PY_WAV:-$WORKDIR/out/python_reference.wav}"
+CPP_CODES_BIN="${CPP_CODES_BIN:-$WORKDIR/out/cpp_raw_codes.bin}"
+CPP_WAV="${CPP_WAV:-$WORKDIR/out/cpp_firstclass.wav}"
+
+mkdir -p "$WORKDIR" "$WORKDIR/weights" "$WORKDIR/out"
+
+need_cmd() {
+    command -v "$1" >/dev/null 2>&1 || {
+        echo "error: missing command: $1" >&2
+        exit 1
+    }
+}
+
+clone_or_update() {
+    local repo_url="$1"
+    local repo_dir="$2"
+    local repo_ref="$3"
+    if [[ ! -d "$repo_dir/.git" ]]; then
+        git clone "$repo_url" "$repo_dir"
+    fi
+    git -C "$repo_dir" fetch --all --tags
+    git -C "$repo_dir" checkout "$repo_ref"
+    git -C "$repo_dir" pull --ff-only || true
+}
+
+need_cmd git
+need_cmd cmake
+need_cmd conda
+
+source "$(conda info --base)/etc/profile.d/conda.sh"
+
+if ! conda env list | awk '{print $1}' | grep -qx "$CONDA_ENV"; then
+    conda create -y -n "$CONDA_ENV" "python=$PYTHON_VERSION"
+fi
+conda activate "$CONDA_ENV"
+
+python -m pip install --upgrade pip setuptools wheel
+python -m pip install --upgrade "huggingface_hub[cli]>=0.30"
+
+clone_or_update "https://github.com/expectqwq/llama.cpp.git" "$LLAMA_CPP_DIR" master
+clone_or_update "https://github.com/OpenMOSS/MOSS-TTS.git" "$MOSS_TTS_DIR" main
+git -C "$MOSS_TTS_DIR" submodule update --init --recursive
+
+if [[ "$ORT_PKG" == "onnxruntime-gpu" ]]; then
+    python -m pip install -e "${MOSS_TTS_DIR}[llama-cpp-onnx]"
+else
+    python -m pip install -e "${MOSS_TTS_DIR}[llama-cpp]"
+    python -m pip install --upgrade "${ORT_PKG}>=1.19"
+fi
+
+huggingface-cli download "$HF_MODEL_ID" --local-dir "$HF_MODEL_DIR"
+huggingface-cli download "$HF_AUDIO_REPO" --local-dir "$ONNX_DIR"
+
+cmake -S "$LLAMA_CPP_DIR" -B "$LLAMA_CPP_DIR/build"
+cmake --build "$LLAMA_CPP_DIR/build" --target llama-moss-tts llama-quantize -j"$N_JOBS"
+
+bash "$MOSS_TTS_DIR/moss_tts_delay/llama_cpp/build_bridge.sh" "$LLAMA_CPP_DIR"
+
+python "$MOSS_TTS_DIR/moss_tts_delay/llama_cpp/conversion/extract_weights.py" \
+    --model "$HF_MODEL_DIR" \
+    --output "$EXTRACT_DIR"
+
+python "$LLAMA_CPP_DIR/convert_hf_to_gguf.py" \
+    "$EXTRACT_DIR/qwen3_backbone" \
+    --outfile "$BACKBONE_GGUF" \
+    --outtype f16
+
+python "$LLAMA_CPP_DIR/convert_hf_to_gguf.py" \
+    "$HF_MODEL_DIR" \
+    --outfile "$FIRSTCLASS_GGUF" \
+    --outtype f16
+
+cat > "$PY_CONFIG" <<EOF
+backbone_gguf: $BACKBONE_GGUF
+embedding_dir: $EXTRACT_DIR/embeddings
+lm_head_dir: $EXTRACT_DIR/lm_heads
+tokenizer_dir: $EXTRACT_DIR/qwen3_backbone
+
+audio_backend: onnx
+audio_encoder_onnx: $ONNX_DIR/encoder.onnx
+audio_decoder_onnx: $ONNX_DIR/decoder.onnx
+
+heads_backend: numpy
+
+n_ctx: 4096
+n_batch: 512
+n_threads: 8
+n_gpu_layers: -1
+max_new_tokens: $MAX_NEW_TOKENS
+use_gpu_audio: $USE_GPU_AUDIO
+
+text_temperature: 0.0
+text_top_p: 1.0
+text_top_k: 50
+
+audio_temperature: 0.0
+audio_top_p: 1.0
+audio_top_k: 25
+audio_repetition_penalty: 1.0
+EOF
+
+if [[ -z "$REFERENCE_AUDIO" ]]; then
+    REF_ARGS=()
+else
+    REF_ARGS=(--reference-audio "$REFERENCE_AUDIO")
+fi
+
+python "$LLAMA_CPP_DIR/tests/moss_tts_delay_export_generation_ref.py" \
+    --config "$PY_CONFIG" \
+    --text "$TEXT" \
+    --max-new-tokens "$MAX_NEW_TOKENS" \
+    --output-ref "$GEN_REF_BIN" \
+    --output-wav "$PY_WAV" \
+    "${REF_ARGS[@]}"
+
+"$LLAMA_CPP_DIR/build/bin/llama-moss-tts" \
+    -m "$FIRSTCLASS_GGUF" \
+    --generation-ref "$GEN_REF_BIN" \
+    --max-new-tokens "$MAX_NEW_TOKENS" \
+    --text-temperature 0.0 \
+    --audio-temperature 0.0 \
+    --dump-raw-codes "$CPP_CODES_BIN" \
+    --audio-decoder-script "$LLAMA_CPP_DIR/tools/tts/moss-tts-audio-decode.py" \
+    --audio-encoder-onnx "$ONNX_DIR/encoder.onnx" \
+    --audio-decoder-onnx "$ONNX_DIR/decoder.onnx" \
+    --wav-out "$CPP_WAV" \
+    --python-bin python \
+    $( [[ "$USE_GPU_AUDIO" == "true" ]] || echo --audio-decoder-cpu )
+
+python - <<PY
+from pathlib import Path
+import hashlib
+import wave
+import numpy as np
+
+py_wav = Path("$PY_WAV")
+cpp_wav = Path("$CPP_WAV")
+
+def read_pcm(path: Path):
+    with wave.open(str(path), "rb") as f:
+        sr = f.getframerate()
+        n = f.getnframes()
+        data = np.frombuffer(f.readframes(n), dtype=np.int16)
+    return sr, data
+
+py_sr, py_pcm = read_pcm(py_wav)
+cpp_sr, cpp_pcm = read_pcm(cpp_wav)
+same_len = py_pcm.shape == cpp_pcm.shape
+max_abs = None
+if same_len:
+    max_abs = int(np.max(np.abs(py_pcm.astype(np.int32) - cpp_pcm.astype(np.int32)))) if py_pcm.size else 0
+
+print("python wav:", py_wav)
+print("cpp wav   :", cpp_wav)
+print("sample_rate_equal:", py_sr == cpp_sr, "py_sr=", py_sr, "cpp_sr=", cpp_sr)
+print("pcm_length_equal:", same_len, "py_samples=", py_pcm.size, "cpp_samples=", cpp_pcm.size)
+print("pcm_max_abs_diff:", max_abs)
+print("python_md5:", hashlib.md5(py_wav.read_bytes()).hexdigest())
+print("cpp_md5   :", hashlib.md5(cpp_wav.read_bytes()).hexdigest())
+PY
+
+echo
+echo "Parity run finished."
+echo "Python reference wav: $PY_WAV"
+echo "First-class C++ wav: $CPP_WAV"
+echo "Generation ref file : $GEN_REF_BIN"
diff --git a/tests/moss_tts_delay_export_decode_ref.py b/tests/moss_tts_delay_export_decode_ref.py
new file mode 100755
index 000000000..19edb3b8f
--- /dev/null
+++ b/tests/moss_tts_delay_export_decode_ref.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import struct
+import sys
+from pathlib import Path
+
+import numpy as np
+
+WORKROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(WORKROOT / "MOSS-TTS"))
+
+from moss_tts_delay.llama_cpp.delay_state import apply_delay_pattern, extract_audio_segments  # noqa: E402
+
+
+REF_MAGIC = 0x4652444D  # "MDRF"
+REF_VERSION = 1
+
+
+def main() -> int:
+    if len(sys.argv) != 2:
+        print(f"usage: {sys.argv[0]} <output.ref.bin>", file=sys.stderr)
+        return 1
+
+    out_path = Path(sys.argv[1])
+
+    n_vq = 32
+    audio_pad_code = 1024
+    prompt_frames = 3
+
+    ref_prompt = np.full((prompt_frames, n_vq), audio_pad_code, dtype=np.int64)
+    ref_prompt[1, 0] = 77
+    ref_prompt[2, :2] = [88, 66]
+
+    raw_a = np.stack([np.arange(10, 10 + n_vq), np.arange(110, 110 + n_vq)], axis=0).astype(np.int64)
+    raw_b = np.stack([np.arange(210, 210 + n_vq)], axis=0).astype(np.int64)
+
+    delayed_a = apply_delay_pattern(raw_a, audio_pad_code)
+    delayed_b = apply_delay_pattern(raw_b, audio_pad_code)
+
+    packed_rows: list[np.ndarray] = []
+    for t in range(prompt_frames):
+        row = np.full(1 + n_vq, audio_pad_code, dtype=np.int64)
+        row[0] = 100 + t
+        row[1:] = ref_prompt[t]
+        packed_rows.append(row)
+
+    def append_delayed(text_token: int, delayed: np.ndarray) -> None:
+        for frame in delayed:
+            row = np.full(1 + n_vq, audio_pad_code, dtype=np.int64)
+            row[0] = text_token
+            row[1:] = frame
+            packed_rows.append(row)
+
+    append_delayed(200, delayed_a)
+
+    gap = np.full(1 + n_vq, audio_pad_code, dtype=np.int64)
+    gap[0] = 201
+    packed_rows.append(gap)
+
+    append_delayed(202, delayed_b)
+
+    packed = np.stack(packed_rows, axis=0)
+    generation_audio = packed[prompt_frames:, 1:]
+    segments = extract_audio_segments(generation_audio)
+    raw_codes = np.concatenate(segments, axis=0) if segments else np.zeros((0, n_vq), dtype=np.int64)
+
+    header = struct.pack(
+        "<IIIIIII",
+        REF_MAGIC,
+        REF_VERSION,
+        prompt_frames,
+        n_vq,
+        audio_pad_code,
+        packed.shape[0],
+        raw_codes.shape[0],
+    )
+
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with out_path.open("wb") as f:
+        f.write(header)
+        f.write(packed.astype(np.int32).tobytes())
+        f.write(raw_codes.astype(np.int32).tobytes())
+
+    print(
+        f"wrote decode parity reference: packed_frames={packed.shape[0]} raw_frames={raw_codes.shape[0]} path={out_path}",
+        file=sys.stderr,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tests/moss_tts_delay_export_generation_ref.py b/tests/moss_tts_delay_export_generation_ref.py
new file mode 100755
index 000000000..29d5f4525
--- /dev/null
+++ b/tests/moss_tts_delay_export_generation_ref.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import argparse
+import struct
+import sys
+import wave
+from pathlib import Path
+
+import numpy as np
+
+WORKROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(WORKROOT / "MOSS-TTS"))
+
+from moss_tts_delay.llama_cpp._constants import AUDIO_PAD_CODE  # noqa: E402
+from moss_tts_delay.llama_cpp.pipeline import LlamaCppPipeline, PipelineConfig  # noqa: E402
+from moss_tts_delay.llama_cpp.processor import build_generation_prompt, parse_generation_output  # noqa: E402
+
+
+REF_MAGIC = 0x4652474D  # "MGRF"
+REF_VERSION = 1
+
+
+def write_wav16(path: Path, wav: np.ndarray, sample_rate: int = 24000) -> None:
+    wav = np.asarray(wav, dtype=np.float32).ravel()
+    pcm = np.clip(np.round(wav * 32767.0), -32768, 32767).astype(np.int16)
+    with wave.open(str(path), "wb") as f:
+        f.setnchannels(1)
+        f.setsampwidth(2)
+        f.setframerate(sample_rate)
+        f.writeframes(pcm.tobytes())
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="Export Python generation reference for first-class MOSS parity")
+    ap.add_argument("--config", required=True)
+    ap.add_argument("--text", required=True)
+    ap.add_argument("--output-ref", required=True)
+    ap.add_argument("--output-wav", default="")
+    ap.add_argument("--reference-audio", default=None)
+    ap.add_argument("--max-new-tokens", type=int, default=512)
+    ap.add_argument("--text-temperature", type=float, default=0.0)
+    ap.add_argument("--text-top-p", type=float, default=1.0)
+    ap.add_argument("--text-top-k", type=int, default=50)
+    ap.add_argument("--audio-temperature", type=float, default=0.0)
+    ap.add_argument("--audio-top-p", type=float, default=1.0)
+    ap.add_argument("--audio-top-k", type=int, default=25)
+    ap.add_argument("--audio-repetition-penalty", type=float, default=1.0)
+    args = ap.parse_args()
+
+    config = PipelineConfig.from_yaml(args.config)
+    config.max_new_tokens = args.max_new_tokens
+    config.text_temperature = args.text_temperature
+    config.text_top_p = args.text_top_p
+    config.text_top_k = args.text_top_k
+    config.audio_temperature = args.audio_temperature
+    config.audio_top_p = args.audio_top_p
+    config.audio_top_k = args.audio_top_k
+    config.audio_repetition_penalty = args.audio_repetition_penalty
+
+    out_ref = Path(args.output_ref)
+    out_ref.parent.mkdir(parents=True, exist_ok=True)
+
+    with LlamaCppPipeline(config) as pipeline:
+        ref_codes = pipeline._prepare_reference(args.reference_audio)
+        input_ids = build_generation_prompt(
+            pipeline.tokenizer,
+            text=args.text,
+            reference_codes=ref_codes,
+        )
+        prompt_len = input_ids.shape[0]
+
+        pipeline.backbone.clear_kv()
+        pipeline._prefill(input_ids)
+        generation_ids = pipeline._autoregressive_loop(input_ids, config.max_new_tokens)
+        _text, audio_codes = parse_generation_output(pipeline.tokenizer, generation_ids, prompt_len)
+
+        if args.output_wav:
+            wav = pipeline.audio_tokenizer.decode(audio_codes)
+            write_wav16(Path(args.output_wav), wav, 24000)
+
+    hdr = struct.pack(
+        "<IIIIIII",
+        REF_MAGIC,
+        REF_VERSION,
+        prompt_len,
+        input_ids.shape[1] - 1,
+        AUDIO_PAD_CODE,
+        prompt_len,
+        audio_codes.shape[0],
+    )
+
+    with out_ref.open("wb") as f:
+        f.write(hdr)
+        f.write(input_ids.astype(np.int32).tobytes())
+        f.write(audio_codes.astype(np.int32).tobytes())
+
+    print(
+        f"wrote generation reference: prompt_frames={prompt_len} raw_frames={audio_codes.shape[0]} ref={out_ref}",
+        file=sys.stderr,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tools/tts/moss-tts-audio-decode.py b/tools/tts/moss-tts-audio-decode.py
new file mode 100755
index 000000000..1facdba4f
--- /dev/null
+++ b/tools/tts/moss-tts-audio-decode.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import argparse
+import struct
+import sys
+import wave
+from pathlib import Path
+
+import numpy as np
+
+WORKROOT = Path(__file__).resolve().parents[3]
+sys.path.insert(0, str(WORKROOT / "MOSS-TTS"))
+
+from moss_tts_delay.llama_cpp._constants import N_VQ, SAMPLE_RATE  # noqa: E402
+
+
+CODES_MAGIC = 0x53444F43  # "CODS"
+CODES_VERSION = 1
+
+
+def read_codes(path: Path) -> np.ndarray:
+    with path.open("rb") as f:
+        hdr = f.read(16)
+        if len(hdr) != 16:
+            raise RuntimeError("codes header is truncated")
+        magic, version, n_frames, n_vq = struct.unpack("<IIII", hdr)
+        if magic != CODES_MAGIC or version != CODES_VERSION:
+            raise RuntimeError("unexpected codes file format")
+        payload = np.frombuffer(f.read(), dtype=np.int32)
+
+    expected = n_frames * n_vq
+    if payload.size != expected:
+        raise RuntimeError(f"codes payload size mismatch: got {payload.size}, expected {expected}")
+
+    codes = payload.reshape(n_frames, n_vq).astype(np.int64)
+    return codes
+
+
+def write_wav16(path: Path, wav: np.ndarray, sample_rate: int = SAMPLE_RATE) -> None:
+    wav = np.asarray(wav, dtype=np.float32).ravel()
+    pcm = np.clip(np.round(wav * 32767.0), -32768, 32767).astype(np.int16)
+
+    with wave.open(str(path), "wb") as f:
+        f.setnchannels(1)
+        f.setsampwidth(2)
+        f.setframerate(sample_rate)
+        f.writeframes(pcm.tobytes())
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="Decode MOSS raw audio codes to wav via Python audio tokenizer")
+    ap.add_argument("--codes-bin", required=True)
+    ap.add_argument("--wav-out", required=True)
+    ap.add_argument("--encoder-onnx", required=True)
+    ap.add_argument("--decoder-onnx", required=True)
+    ap.add_argument("--cpu", action="store_true")
+    args = ap.parse_args()
+
+    try:
+        from moss_audio_tokenizer.onnx import OnnxAudioTokenizer
+    except Exception as exc:
+        raise RuntimeError(
+            "moss_audio_tokenizer.onnx is unavailable; initialize the submodule/package and install ONNX deps"
+        ) from exc
+
+    codes = read_codes(Path(args.codes_bin))
+    if codes.ndim != 2 or codes.shape[1] != N_VQ:
+        raise RuntimeError(f"expected raw codes with shape (T, {N_VQ}), got {codes.shape}")
+
+    tokenizer = OnnxAudioTokenizer(
+        encoder_path=args.encoder_onnx,
+        decoder_path=args.decoder_onnx,
+        use_gpu=not args.cpu,
+    )
+    wav = tokenizer.decode(codes)
+    write_wav16(Path(args.wav_out), wav, SAMPLE_RATE)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tools/tts/moss-tts.cpp b/tools/tts/moss-tts.cpp
index 7260da3af..4dc33879d 100644
--- a/tools/tts/moss-tts.cpp
+++ b/tools/tts/moss-tts.cpp
@@ -4,14 +4,18 @@
 #include "llama.h"
 
 #include <algorithm>
+#include <cstdio>
 #include <cmath>
 #include <cinttypes>
 #include <cstdint>
 #include <cstdlib>
+#include <cstring>
+#include <fstream>
 #include <limits>
 #include <numeric>
 #include <random>
 #include <sstream>
+#include <stdexcept>
 #include <string>
 #include <utility>
 #include <vector>
@@ -31,6 +35,12 @@ constexpr llama_token MOSS_DELAY_DEFAULT_AUDIO_PAD_CODE = 1024;
 constexpr uint32_t MOSS_DELAY_DEFAULT_AUDIO_VOCAB_SIZE = 1024;
 constexpr int64_t MOSS_DELAY_INT64_MAX = std::numeric_limits<int64_t>::max();
 constexpr float MOSS_NEG_INF = -std::numeric_limits<float>::infinity();
+constexpr uint32_t MOSS_CODES_MAGIC = 0x53444f43; // "CODS"
+constexpr uint32_t MOSS_CODES_VERSION = 1;
+constexpr uint32_t MOSS_DECODE_REF_MAGIC = 0x4652444d; // "MDRF"
+constexpr uint32_t MOSS_DECODE_REF_VERSION = 1;
+constexpr uint32_t MOSS_GEN_REF_MAGIC = 0x4652474d; // "MGRF"
+constexpr uint32_t MOSS_GEN_REF_VERSION = 1;
 
 struct moss_sampling_config {
     float text_temperature = 1.5f;
@@ -65,6 +75,16 @@ struct moss_audio_segment {
     size_t n_frames = 0;
 };
 
+struct moss_generation_audio {
+    std::vector<llama_token> delayed_codes;
+    size_t delayed_frames = 0;
+
+    std::vector<moss_audio_segment> segments;
+
+    std::vector<llama_token> raw_codes;
+    size_t raw_frames = 0;
+};
+
 struct moss_delay_state {
     int32_t audio_length = 0;
     int64_t delayed_length = MOSS_DELAY_INT64_MAX;
@@ -108,13 +128,94 @@ struct moss_delay_state {
 
 using moss_rng = std::mt19937;
 
+struct moss_codes_header {
+    uint32_t magic = MOSS_CODES_MAGIC;
+    uint32_t version = MOSS_CODES_VERSION;
+    uint32_t n_frames = 0;
+    uint32_t n_vq = 0;
+};
+
+struct moss_decode_ref_header {
+    uint32_t magic = MOSS_DECODE_REF_MAGIC;
+    uint32_t version = MOSS_DECODE_REF_VERSION;
+    uint32_t prompt_frames = 0;
+    uint32_t n_vq = 0;
+    uint32_t audio_pad_code = 0;
+    uint32_t packed_frames = 0;
+    uint32_t raw_frames = 0;
+};
+
+struct moss_generation_ref_header {
+    uint32_t magic = MOSS_GEN_REF_MAGIC;
+    uint32_t version = MOSS_GEN_REF_VERSION;
+    uint32_t prompt_frames = 0;
+    uint32_t n_vq = 0;
+    uint32_t audio_pad_code = 0;
+    uint32_t prompt_packed_frames = 0;
+    uint32_t raw_frames = 0;
+};
+
+static moss_generation_audio moss_decode_generation_audio(
+        const moss_delay_state & state,
+        size_t prompt_frames,
+        const moss_delay_config & cfg);
+
+static moss_generation_audio moss_decode_generation_audio(
+        const std::vector<llama_token> & packed_ids,
+        size_t prompt_frames,
+        const moss_delay_config & cfg);
+
+static bool moss_generate_from_ref(
+        const std::string & model_path,
+        const std::string & ref_path,
+        int32_t max_new_tokens,
+        const moss_sampling_config & sampling_cfg,
+        uint32_t seed,
+        const std::string & dump_raw_codes_path,
+        const std::string & python_bin,
+        const std::string & helper_script,
+        const std::string & encoder_onnx,
+        const std::string & decoder_onnx,
+        const std::string & wav_out,
+        bool use_gpu_audio);
+
 static void print_usage(int argc, char ** argv) {
     (void) argc;
     LOG("\nexample usage:\n");
     LOG("  %s -m model.gguf --print-delay-config\n", argv[0]);
+    LOG("  %s --decode-parity-ref decode.ref.bin\n", argv[0]);
     LOG("\n");
 }
 
+template <typename T>
+static void moss_read_exact(std::ifstream & in, T * data, size_t count, const char * what) {
+    in.read(reinterpret_cast<char *>(data), sizeof(T) * count);
+    if (!in) {
+        throw std::runtime_error(std::string("failed to read ") + what);
+    }
+}
+
+template <typename T>
+static void moss_write_exact(std::ofstream & out, const T * data, size_t count, const char * what) {
+    out.write(reinterpret_cast<const char *>(data), sizeof(T) * count);
+    if (!out) {
+        throw std::runtime_error(std::string("failed to write ") + what);
+    }
+}
+
+static std::string moss_shell_quote(const std::string & value) {
+    std::string out = "'";
+    for (char c : value) {
+        if (c == '\'') {
+            out += "'\\''";
+        } else {
+            out += c;
+        }
+    }
+    out += "'";
+    return out;
+}
+
 static bool parse_meta_i64(const llama_model * model, const char * key, int64_t & out) {
     char buf[128];
     const int32_t n = llama_model_meta_val_str(model, key, buf, sizeof(buf));
@@ -729,6 +830,462 @@ static std::vector<moss_audio_segment> moss_extract_audio_segments(
     return segments;
 }
 
+static std::vector<llama_token> moss_concat_audio_segments(
+        const std::vector<moss_audio_segment> & segments,
+        size_t n_vq,
+        size_t * out_frames = nullptr) {
+    size_t total_frames = 0;
+    size_t total_tokens = 0;
+    for (const auto & seg : segments) {
+        total_frames += seg.n_frames;
+        total_tokens += seg.codes.size();
+    }
+
+    std::vector<llama_token> out;
+    out.reserve(total_tokens);
+    for (const auto & seg : segments) {
+        GGML_ASSERT(seg.codes.size() == seg.n_frames * n_vq);
+        out.insert(out.end(), seg.codes.begin(), seg.codes.end());
+    }
+
+    if (out_frames != nullptr) {
+        *out_frames = total_frames;
+    }
+    return out;
+}
+
+static void moss_write_codes_file(
+        const std::string & path,
+        const std::vector<llama_token> & raw_codes,
+        size_t raw_frames,
+        const moss_delay_config & cfg) {
+    GGML_ASSERT(raw_codes.size() == raw_frames * cfg.n_vq);
+
+    std::ofstream out(path, std::ios::binary);
+    if (!out) {
+        throw std::runtime_error("failed to open codes file for writing: " + path);
+    }
+
+    moss_codes_header hdr;
+    hdr.n_frames = (uint32_t) raw_frames;
+    hdr.n_vq = cfg.n_vq;
+
+    moss_write_exact(out, &hdr, 1, "codes header");
+    moss_write_exact(out, raw_codes.data(), raw_codes.size(), "codes payload");
+}
+
+static int moss_run_audio_decoder_helper(
+        const std::string & python_bin,
+        const std::string & helper_script,
+        const std::string & codes_path,
+        const std::string & wav_path,
+        const std::string & encoder_onnx,
+        const std::string & decoder_onnx,
+        bool use_gpu_audio) {
+    std::ostringstream cmd;
+    cmd
+        << moss_shell_quote(python_bin) << " "
+        << moss_shell_quote(helper_script)
+        << " --codes-bin " << moss_shell_quote(codes_path)
+        << " --wav-out " << moss_shell_quote(wav_path)
+        << " --encoder-onnx " << moss_shell_quote(encoder_onnx)
+        << " --decoder-onnx " << moss_shell_quote(decoder_onnx);
+    if (!use_gpu_audio) {
+        cmd << " --cpu";
+    }
+
+    LOG("running audio decoder helper: %s\n", cmd.str().c_str());
+    return std::system(cmd.str().c_str());
+}
+
+static bool moss_decode_parity(
+        const std::string & ref_path,
+        const std::string & dump_codes_path,
+        const std::string & python_bin,
+        const std::string & helper_script,
+        const std::string & encoder_onnx,
+        const std::string & decoder_onnx,
+        const std::string & wav_out,
+        bool use_gpu_audio) {
+    std::ifstream in(ref_path, std::ios::binary);
+    if (!in) {
+        throw std::runtime_error("failed to open decode parity reference: " + ref_path);
+    }
+
+    moss_decode_ref_header hdr;
+    moss_read_exact(in, &hdr, 1, "decode parity header");
+    if (hdr.magic != MOSS_DECODE_REF_MAGIC || hdr.version != MOSS_DECODE_REF_VERSION) {
+        throw std::runtime_error("unexpected decode parity reference format");
+    }
+
+    moss_delay_config cfg;
+    cfg.n_vq = hdr.n_vq;
+    cfg.audio_pad_code = (llama_token) hdr.audio_pad_code;
+
+    std::vector<llama_token> packed_ids((size_t) hdr.packed_frames * cfg.packed_stride());
+    std::vector<llama_token> ref_raw_codes((size_t) hdr.raw_frames * cfg.n_vq);
+    moss_read_exact(in, packed_ids.data(), packed_ids.size(), "packed ids");
+    moss_read_exact(in, ref_raw_codes.data(), ref_raw_codes.size(), "reference raw codes");
+
+    const moss_generation_audio decoded = moss_decode_generation_audio(packed_ids, hdr.prompt_frames, cfg);
+
+    size_t mismatch_count = 0;
+    const size_t compare_count = std::min(decoded.raw_codes.size(), ref_raw_codes.size());
+    for (size_t i = 0; i < compare_count; ++i) {
+        if (decoded.raw_codes[i] != ref_raw_codes[i]) {
+            ++mismatch_count;
+        }
+    }
+    mismatch_count += decoded.raw_codes.size() > ref_raw_codes.size()
+            ? decoded.raw_codes.size() - ref_raw_codes.size()
+            : ref_raw_codes.size() - decoded.raw_codes.size();
+
+    LOG("moss-tts delay decode parity: prompt_frames=%u delayed_frames=%zu raw_frames=%zu ref_raw_frames=%u mismatch_count=%zu segments=%zu\n",
+            hdr.prompt_frames,
+            decoded.delayed_frames,
+            decoded.raw_frames,
+            hdr.raw_frames,
+            mismatch_count,
+            decoded.segments.size());
+
+    if (!dump_codes_path.empty()) {
+        moss_write_codes_file(dump_codes_path, decoded.raw_codes, decoded.raw_frames, cfg);
+    }
+
+    if (!helper_script.empty()) {
+        if (dump_codes_path.empty()) {
+            throw std::runtime_error("--audio-decoder-script requires --dump-raw-codes");
+        }
+        if (wav_out.empty()) {
+            throw std::runtime_error("--audio-decoder-script requires --wav-out");
+        }
+        if (encoder_onnx.empty() || decoder_onnx.empty()) {
+            throw std::runtime_error("--audio-decoder-script requires both --audio-encoder-onnx and --audio-decoder-onnx");
+        }
+
+        const int rc = moss_run_audio_decoder_helper(
+                python_bin, helper_script, dump_codes_path, wav_out,
+                encoder_onnx, decoder_onnx, use_gpu_audio);
+        if (rc != 0) {
+            throw std::runtime_error("audio decoder helper failed with exit code " + std::to_string(rc));
+        }
+    }
+
+    return mismatch_count == 0;
+}
+
+static llama_batch moss_batch_from_packed_rows(
+        const std::vector<llama_token> & packed_ids,
+        size_t start_frame,
+        size_t n_frames,
+        const moss_delay_config & cfg,
+        size_t pos_start,
+        bool output_last) {
+    GGML_ASSERT(cfg.n_vq > 0);
+    GGML_ASSERT(packed_ids.size() % cfg.packed_stride() == 0);
+    GGML_ASSERT(start_frame + n_frames <= packed_ids.size() / cfg.packed_stride());
+
+    llama_batch batch = llama_batch_init((int32_t) n_frames, 0, 1);
+    batch.n_tokens = (int32_t) n_frames;
+    batch.n_token_audio = (int32_t) cfg.n_vq;
+    batch.token_audio = (llama_token *) std::malloc(sizeof(llama_token) * n_frames * cfg.n_vq);
+    if (batch.token_audio == nullptr) {
+        throw std::runtime_error("failed to allocate token_audio");
+    }
+
+    for (size_t i = 0; i < n_frames; ++i) {
+        const size_t row = (start_frame + i) * cfg.packed_stride();
+        batch.token[i] = packed_ids[row + 0];
+        std::memcpy(
+                batch.token_audio + i * cfg.n_vq,
+                packed_ids.data() + row + 1,
+                sizeof(llama_token) * cfg.n_vq);
+        batch.pos[i] = (llama_pos) (pos_start + i);
+        batch.n_seq_id[i] = 1;
+        batch.seq_id[i][0] = 0;
+        batch.logits[i] = output_last && (i + 1 == n_frames);
+    }
+
+    return batch;
+}
+
+static bool moss_generate_from_ref(
+        const std::string & model_path,
+        const std::string & ref_path,
+        int32_t max_new_tokens,
+        const moss_sampling_config & sampling_cfg,
+        uint32_t seed,
+        const std::string & dump_raw_codes_path,
+        const std::string & python_bin,
+        const std::string & helper_script,
+        const std::string & encoder_onnx,
+        const std::string & decoder_onnx,
+        const std::string & wav_out,
+        bool use_gpu_audio) {
+    std::ifstream in(ref_path, std::ios::binary);
+    if (!in) {
+        throw std::runtime_error("failed to open generation reference: " + ref_path);
+    }
+
+    moss_generation_ref_header hdr;
+    moss_read_exact(in, &hdr, 1, "generation reference header");
+    if (hdr.magic != MOSS_GEN_REF_MAGIC || hdr.version != MOSS_GEN_REF_VERSION) {
+        throw std::runtime_error("unexpected generation reference format");
+    }
+
+    moss_delay_config cfg;
+    cfg.n_vq = hdr.n_vq;
+    cfg.audio_pad_code = (llama_token) hdr.audio_pad_code;
+
+    std::vector<llama_token> prompt_packed((size_t) hdr.prompt_packed_frames * cfg.packed_stride());
+    std::vector<llama_token> ref_raw_codes((size_t) hdr.raw_frames * cfg.n_vq);
+    moss_read_exact(in, prompt_packed.data(), prompt_packed.size(), "prompt packed ids");
+    moss_read_exact(in, ref_raw_codes.data(), ref_raw_codes.size(), "reference raw codes");
+
+    llama_backend_init();
+
+    llama_model_params mparams = llama_model_default_params();
+    mparams.use_mmap = true;
+
+    llama_model * model = llama_model_load_from_file(model_path.c_str(), mparams);
+    if (model == nullptr) {
+        llama_backend_free();
+        throw std::runtime_error("failed to load model: " + model_path);
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    const int32_t text_vocab = llama_vocab_n_tokens(vocab);
+    const moss_delay_config model_cfg = moss_delay_config_from_model(model);
+
+    if (model_cfg.n_vq != cfg.n_vq) {
+        llama_model_free(model);
+        llama_backend_free();
+        throw std::runtime_error("generation reference n_vq does not match model metadata");
+    }
+    cfg.audio_vocab_size = model_cfg.audio_vocab_size;
+
+    llama_context_params cparams = llama_context_default_params();
+    cparams.n_ctx = std::max<uint32_t>((uint32_t) hdr.prompt_frames + (uint32_t) max_new_tokens + 8u, 64u);
+    cparams.n_batch = std::max<uint32_t>((uint32_t) hdr.prompt_frames, 1u);
+    cparams.n_ubatch = cparams.n_batch;
+    cparams.n_seq_max = 1;
+    cparams.embeddings = false;
+    cparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+    cparams.type_k = GGML_TYPE_F32;
+    cparams.type_v = GGML_TYPE_F32;
+
+    llama_context * ctx = llama_init_from_model(model, cparams);
+    if (ctx == nullptr) {
+        llama_model_free(model);
+        llama_backend_free();
+        throw std::runtime_error("failed to create context");
+    }
+
+    llama_set_warmup(ctx, false);
+    llama_set_causal_attn(ctx, true);
+    llama_set_embeddings(ctx, false);
+
+    {
+        llama_batch batch = moss_batch_from_packed_rows(prompt_packed, 0, hdr.prompt_frames, cfg, 0, true);
+        const int ret = llama_decode(ctx, batch);
+        llama_batch_free(batch);
+        if (ret != 0) {
+            llama_free(ctx);
+            llama_model_free(model);
+            llama_backend_free();
+            throw std::runtime_error("prefill llama_decode failed: " + std::to_string(ret));
+        }
+    }
+
+    moss_delay_state state = moss_init_delay_state(prompt_packed, cfg);
+    std::vector<llama_token> generated_packed;
+    generated_packed.reserve((size_t) max_new_tokens * cfg.packed_stride());
+
+    const size_t audio_vocab = moss_audio_vocab_with_pad(cfg);
+    moss_rng rng(seed);
+
+    for (int32_t step = 0; step < max_new_tokens; ++step) {
+        const float * logits = llama_get_logits_ith(ctx, -1);
+        if (logits == nullptr) {
+            llama_free(ctx);
+            llama_model_free(model);
+            llama_backend_free();
+            throw std::runtime_error("llama_get_logits_ith returned null");
+        }
+
+        std::vector<float> text_logits(logits, logits + text_vocab);
+        std::vector<float> audio_logits(
+                logits + text_vocab,
+                logits + text_vocab + cfg.n_vq * audio_vocab);
+
+        const std::vector<llama_token> next = moss_delay_step(
+                state, text_logits, audio_logits, sampling_cfg, cfg, rng);
+        generated_packed.insert(generated_packed.end(), next.begin(), next.end());
+
+        llama_batch batch = moss_batch_from_packed_rows(
+                generated_packed, generated_packed.size() / cfg.packed_stride() - 1, 1, cfg,
+                hdr.prompt_frames + (size_t) step, true);
+        const int ret = llama_decode(ctx, batch);
+        llama_batch_free(batch);
+        if (ret != 0) {
+            llama_free(ctx);
+            llama_model_free(model);
+            llama_backend_free();
+            throw std::runtime_error("generation llama_decode failed: " + std::to_string(ret));
+        }
+
+        if (state.is_stopping) {
+            break;
+        }
+    }
+
+    const moss_generation_audio decoded = moss_decode_generation_audio(state, hdr.prompt_frames, cfg);
+
+    size_t mismatch_count = 0;
+    const size_t compare_count = std::min(decoded.raw_codes.size(), ref_raw_codes.size());
+    size_t first_mismatch = compare_count;
+    for (size_t i = 0; i < compare_count; ++i) {
+        if (decoded.raw_codes[i] != ref_raw_codes[i]) {
+            if (first_mismatch == compare_count) {
+                first_mismatch = i;
+            }
+            ++mismatch_count;
+        }
+    }
+    mismatch_count += decoded.raw_codes.size() > ref_raw_codes.size()
+            ? decoded.raw_codes.size() - ref_raw_codes.size()
+            : ref_raw_codes.size() - decoded.raw_codes.size();
+
+    LOG("moss-tts first-class generation parity: prompt_frames=%u generated_frames=%zu raw_frames=%zu ref_raw_frames=%u mismatch_count=%zu\n",
+            hdr.prompt_frames,
+            generated_packed.size() / cfg.packed_stride(),
+            decoded.raw_frames,
+            hdr.raw_frames,
+            mismatch_count);
+    if (first_mismatch != compare_count) {
+        LOG("first mismatch at raw_token=%zu got=%d ref=%d\n",
+                first_mismatch,
+                (int) decoded.raw_codes[first_mismatch],
+                (int) ref_raw_codes[first_mismatch]);
+    }
+
+    if (!dump_raw_codes_path.empty()) {
+        moss_write_codes_file(dump_raw_codes_path, decoded.raw_codes, decoded.raw_frames, cfg);
+    }
+
+    if (!helper_script.empty()) {
+        if (dump_raw_codes_path.empty()) {
+            llama_free(ctx);
+            llama_model_free(model);
+            llama_backend_free();
+            throw std::runtime_error("--audio-decoder-script requires --dump-raw-codes");
+        }
+        if (wav_out.empty()) {
+            llama_free(ctx);
+            llama_model_free(model);
+            llama_backend_free();
+            throw std::runtime_error("--audio-decoder-script requires --wav-out");
+        }
+        if (encoder_onnx.empty() || decoder_onnx.empty()) {
+            llama_free(ctx);
+            llama_model_free(model);
+            llama_backend_free();
+            throw std::runtime_error("--audio-decoder-script requires both ONNX paths");
+        }
+
+        const int rc = moss_run_audio_decoder_helper(
+                python_bin, helper_script, dump_raw_codes_path, wav_out,
+                encoder_onnx, decoder_onnx, use_gpu_audio);
+        if (rc != 0) {
+            llama_free(ctx);
+            llama_model_free(model);
+            llama_backend_free();
+            throw std::runtime_error("audio decoder helper failed with exit code " + std::to_string(rc));
+        }
+    }
+
+    llama_free(ctx);
+    llama_model_free(model);
+    llama_backend_free();
+
+    return mismatch_count == 0;
+}
+
+static std::vector<llama_token> moss_audio_history_slice(
+        const moss_delay_state & state,
+        size_t start_frame,
+        size_t * out_frames = nullptr) {
+    const size_t total_frames = state.audio_frames();
+    if (start_frame >= total_frames) {
+        if (out_frames != nullptr) {
+            *out_frames = 0;
+        }
+        return {};
+    }
+
+    const size_t n_frames = total_frames - start_frame;
+    std::vector<llama_token> out;
+    out.reserve(n_frames * state.n_vq);
+    out.insert(
+            out.end(),
+            state.audio_history.begin() + start_frame * state.n_vq,
+            state.audio_history.end());
+
+    if (out_frames != nullptr) {
+        *out_frames = n_frames;
+    }
+
+    return out;
+}
+
+static moss_generation_audio moss_decode_generation_audio(
+        const moss_delay_state & state,
+        size_t prompt_frames,
+        const moss_delay_config & cfg) {
+    GGML_ASSERT(state.n_vq == cfg.n_vq);
+
+    moss_generation_audio out;
+    out.delayed_codes = moss_audio_history_slice(state, prompt_frames, &out.delayed_frames);
+    if (out.delayed_frames == 0) {
+        return out;
+    }
+
+    out.segments = moss_extract_audio_segments(out.delayed_codes, out.delayed_frames, cfg);
+    out.raw_codes = moss_concat_audio_segments(out.segments, cfg.n_vq, &out.raw_frames);
+    return out;
+}
+
+static moss_generation_audio moss_decode_generation_audio(
+        const std::vector<llama_token> & packed_ids,
+        size_t prompt_frames,
+        const moss_delay_config & cfg) {
+    GGML_ASSERT(cfg.n_vq > 0);
+    GGML_ASSERT(packed_ids.size() % cfg.packed_stride() == 0);
+
+    const size_t total_frames = packed_ids.size() / cfg.packed_stride();
+    GGML_ASSERT(prompt_frames <= total_frames);
+
+    moss_generation_audio out;
+    out.delayed_frames = total_frames - prompt_frames;
+    out.delayed_codes.reserve(out.delayed_frames * cfg.n_vq);
+
+    for (size_t t = prompt_frames; t < total_frames; ++t) {
+        const size_t row = t * cfg.packed_stride();
+        out.delayed_codes.insert(
+                out.delayed_codes.end(),
+                packed_ids.begin() + row + 1,
+                packed_ids.begin() + row + 1 + cfg.n_vq);
+    }
+
+    if (out.delayed_frames == 0) {
+        return out;
+    }
+
+    out.segments = moss_extract_audio_segments(out.delayed_codes, out.delayed_frames, cfg);
+    out.raw_codes = moss_concat_audio_segments(out.segments, cfg.n_vq, &out.raw_frames);
+    return out;
+}
+
 static std::string moss_delay_config_to_string(const moss_delay_config & cfg) {
     std::ostringstream oss;
     oss
@@ -865,6 +1422,87 @@ static bool moss_delay_self_test() {
         }
     }
 
+    {
+        moss_delay_config decode_cfg = cfg;
+        decode_cfg.n_vq = 3;
+        decode_cfg.audio_pad_code = 99;
+
+        const std::vector<llama_token> prompt_audio = {
+            77, 99, 99,
+            88, 66, 99,
+        };
+        const std::vector<llama_token> raw_codes = {
+            10, 11, 12,
+            20, 21, 22,
+            30, 31, 32,
+        };
+        const std::vector<llama_token> delayed = moss_apply_delay_pattern(raw_codes, 3, decode_cfg);
+
+        moss_delay_state decode_state;
+        decode_state.n_vq = decode_cfg.n_vq;
+        decode_state.audio_history = prompt_audio;
+        decode_state.append_audio(delayed.data() + 0 * decode_cfg.n_vq);
+        decode_state.append_audio(delayed.data() + 1 * decode_cfg.n_vq);
+        decode_state.append_audio(delayed.data() + 2 * decode_cfg.n_vq);
+        decode_state.append_audio(delayed.data() + 3 * decode_cfg.n_vq);
+        decode_state.append_audio(delayed.data() + 4 * decode_cfg.n_vq);
+
+        const moss_generation_audio decoded = moss_decode_generation_audio(decode_state, 2, decode_cfg);
+        if (decoded.delayed_frames != 5 || decoded.raw_frames != 3 || decoded.raw_codes != raw_codes) {
+            return false;
+        }
+        if (decoded.segments.size() != 1 || decoded.segments[0].n_frames != 3 || decoded.segments[0].codes != raw_codes) {
+            return false;
+        }
+    }
+
+    {
+        moss_delay_config decode_cfg = cfg;
+        decode_cfg.n_vq = 3;
+        decode_cfg.audio_pad_code = 99;
+
+        const std::vector<llama_token> raw_a = {
+            10, 11, 12,
+            20, 21, 22,
+        };
+        const std::vector<llama_token> raw_b = {
+            40, 41, 42,
+        };
+        const std::vector<llama_token> delayed_a = moss_apply_delay_pattern(raw_a, 2, decode_cfg);
+        const std::vector<llama_token> delayed_b = moss_apply_delay_pattern(raw_b, 1, decode_cfg);
+
+        std::vector<llama_token> packed = {
+            100, 99, 99, 99,
+            101, 99, 99, 99,
+        };
+        auto append_delayed_rows = [&](llama_token text_token, const std::vector<llama_token> & delayed_rows, size_t n_frames) {
+            for (size_t t = 0; t < n_frames; ++t) {
+                packed.push_back(text_token);
+                packed.insert(
+                        packed.end(),
+                        delayed_rows.begin() + t * decode_cfg.n_vq,
+                        delayed_rows.begin() + (t + 1) * decode_cfg.n_vq);
+            }
+        };
+        append_delayed_rows(200, delayed_a, 4);
+        packed.push_back(201);
+        packed.insert(packed.end(), {99, 99, 99});
+        append_delayed_rows(202, delayed_b, 3);
+
+        const moss_generation_audio decoded = moss_decode_generation_audio(packed, 2, decode_cfg);
+        const std::vector<llama_token> raw_expected = {
+            10, 11, 12,
+            20, 21, 22,
+            40, 41, 42,
+        };
+        if (decoded.segments.size() != 2 || decoded.raw_frames != 3 || decoded.raw_codes != raw_expected) {
+            return false;
+        }
+        if (decoded.segments[0].codes != raw_a || decoded.segments[1].codes != raw_b) {
+            return false;
+        }
+    }
+
     return true;
 }
 
@@ -872,8 +1510,20 @@ static bool moss_delay_self_test() {
 
 int main(int argc, char ** argv) {
     std::string model_path;
+    std::string decode_parity_ref_path;
+    std::string generation_ref_path;
+    std::string dump_raw_codes_path;
+    std::string audio_decoder_script;
+    std::string audio_encoder_onnx;
+    std::string audio_decoder_onnx;
+    std::string wav_out_path;
+    std::string python_bin = "python";
     bool print_delay_config = false;
     bool self_test = false;
+    bool use_gpu_audio = true;
+    int32_t max_new_tokens = 2048;
+    uint32_t seed = 1234;
+    moss_sampling_config sampling_cfg;
 
     for (int i = 1; i < argc; ++i) {
         const std::string arg = argv[i];
@@ -881,6 +1531,78 @@ int main(int argc, char ** argv) {
             model_path = argv[++i];
             continue;
         }
+        if (arg == "--generation-ref" && i + 1 < argc) {
+            generation_ref_path = argv[++i];
+            continue;
+        }
+        if (arg == "--decode-parity-ref" && i + 1 < argc) {
+            decode_parity_ref_path = argv[++i];
+            continue;
+        }
+        if (arg == "--max-new-tokens" && i + 1 < argc) {
+            max_new_tokens = std::stoi(argv[++i]);
+            continue;
+        }
+        if (arg == "--seed" && i + 1 < argc) {
+            seed = (uint32_t) std::stoul(argv[++i]);
+            continue;
+        }
+        if (arg == "--dump-raw-codes" && i + 1 < argc) {
+            dump_raw_codes_path = argv[++i];
+            continue;
+        }
+        if (arg == "--audio-decoder-script" && i + 1 < argc) {
+            audio_decoder_script = argv[++i];
+            continue;
+        }
+        if (arg == "--audio-encoder-onnx" && i + 1 < argc) {
+            audio_encoder_onnx = argv[++i];
+            continue;
+        }
+        if (arg == "--audio-decoder-onnx" && i + 1 < argc) {
+            audio_decoder_onnx = argv[++i];
+            continue;
+        }
+        if (arg == "--wav-out" && i + 1 < argc) {
+            wav_out_path = argv[++i];
+            continue;
+        }
+        if (arg == "--python-bin" && i + 1 < argc) {
+            python_bin = argv[++i];
+            continue;
+        }
+        if (arg == "--text-temperature" && i + 1 < argc) {
+            sampling_cfg.text_temperature = std::stof(argv[++i]);
+            continue;
+        }
+        if (arg == "--text-top-p" && i + 1 < argc) {
+            sampling_cfg.text_top_p = std::stof(argv[++i]);
+            continue;
+        }
+        if (arg == "--text-top-k" && i + 1 < argc) {
+            sampling_cfg.text_top_k = std::stoi(argv[++i]);
+            continue;
+        }
+        if (arg == "--audio-temperature" && i + 1 < argc) {
+            sampling_cfg.audio_temperature = std::stof(argv[++i]);
+            continue;
+        }
+        if (arg == "--audio-top-p" && i + 1 < argc) {
+            sampling_cfg.audio_top_p = std::stof(argv[++i]);
+            continue;
+        }
+        if (arg == "--audio-top-k" && i + 1 < argc) {
+            sampling_cfg.audio_top_k = std::stoi(argv[++i]);
+            continue;
+        }
+        if (arg == "--audio-repetition-penalty" && i + 1 < argc) {
+            sampling_cfg.audio_repetition_penalty = std::stof(argv[++i]);
+            continue;
+        }
+        if (arg == "--audio-decoder-cpu") {
+            use_gpu_audio = false;
+            continue;
+        }
         if (arg == "--print-delay-config") {
             print_delay_config = true;
             continue;
@@ -907,12 +1629,58 @@ int main(int argc, char ** argv) {
         LOG("moss delay state self-test: ok\n");
     }
 
+    if (!generation_ref_path.empty()) {
+        if (model_path.empty()) {
+            LOG_ERR("--generation-ref requires -m <model.gguf>\n");
+            return EXIT_FAILURE;
+        }
+        try {
+            const bool ok = moss_generate_from_ref(
+                    model_path,
+                    generation_ref_path,
+                    max_new_tokens,
+                    sampling_cfg,
+                    seed,
+                    dump_raw_codes_path,
+                    python_bin,
+                    audio_decoder_script,
+                    audio_encoder_onnx,
+                    audio_decoder_onnx,
+                    wav_out_path,
+                    use_gpu_audio);
+            return ok ? EXIT_SUCCESS : EXIT_FAILURE;
+        } catch (const std::exception & err) {
+            LOG_ERR("generation parity failed: %s\n", err.what());
+            return EXIT_FAILURE;
+        }
+    }
+
+    if (!decode_parity_ref_path.empty()) {
+        try {
+            const bool ok = moss_decode_parity(
+                    decode_parity_ref_path,
+                    dump_raw_codes_path,
+                    python_bin,
+                    audio_decoder_script,
+                    audio_encoder_onnx,
+                    audio_decoder_onnx,
+                    wav_out_path,
+                    use_gpu_audio);
+            return ok ? EXIT_SUCCESS : EXIT_FAILURE;
+        } catch (const std::exception & err) {
+            LOG_ERR("decode parity failed: %s\n", err.what());
+            return EXIT_FAILURE;
+        }
+    }
+
     if (!print_delay_config) {
         if (self_test) {
             return EXIT_SUCCESS;
         }
-        LOG("moss delay state and multi-head sampler are in place; audio decode is not implemented yet.\n");
+        LOG("moss delay state, multi-head sampler, and raw-code decode are in place; audio decode is available via the external Python/ONNX helper.\n");
         LOG("use --print-delay-config with -m <model.gguf> to inspect model metadata.\n");
+        LOG("use --decode-parity-ref <ref.bin> to verify C++ de-delay/raw-code extraction against Python.\n");
+        LOG("use --generation-ref <ref.bin> -m <first-class-model.gguf> to verify end-to-end first-class generation against Python.\n");
         return EXIT_SUCCESS;
     }
 

From d91245d6f8ebe74f99bdaf985af63c535df9cc19 Mon Sep 17 00:00:00 2001
From: expec <expec@localhost>
Date: Thu, 12 Mar 2026 14:38:38 +0800
Subject: [PATCH 06/20] fix(moss-tts-delay): stabilize audio input buffer and
 parity script env

---
 scripts/run-moss-tts-delay-firstclass-parity.sh | 3 ++-
 src/models/moss-tts-delay.cpp                   | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/scripts/run-moss-tts-delay-firstclass-parity.sh b/scripts/run-moss-tts-delay-firstclass-parity.sh
index 77039c044..2a231ade0 100755
--- a/scripts/run-moss-tts-delay-firstclass-parity.sh
+++ b/scripts/run-moss-tts-delay-firstclass-parity.sh
@@ -9,7 +9,7 @@ set -euo pipefail
 # - compares raw audio codes exactly
 # - decodes both sides to wav through the same ONNX audio tokenizer
 
-WORKDIR="${WORKDIR:-$HOME/moss-tts-delay-firstclass}"
+WORKDIR="/home/taoji/data/zlwang/workwork"
 CONDA_ENV="${CONDA_ENV:-moss-tts-firstclass}"
 PYTHON_VERSION="${PYTHON_VERSION:-3.11}"
 N_JOBS="${N_JOBS:-$(nproc)}"
@@ -63,6 +63,7 @@ need_cmd git
 need_cmd cmake
 need_cmd conda
 
+export PS1="${PS1:-}"
 source "$(conda info --base)/etc/profile.d/conda.sh"
 
 if ! conda env list | awk '{print $1}' | grep -qx "$CONDA_ENV"; then
diff --git a/src/models/moss-tts-delay.cpp b/src/models/moss-tts-delay.cpp
index ae7ff48c4..87c6585cb 100644
--- a/src/models/moss-tts-delay.cpp
+++ b/src/models/moss-tts-delay.cpp
@@ -10,7 +10,7 @@ class llm_graph_input_moss_audio_channel : public llm_graph_input_i {
     void set_input(const llama_ubatch * ubatch) override {
         GGML_ASSERT(tokens != nullptr);
 
-        std::vector<llama_token> data(ubatch->n_tokens, 0);
+        data.resize(ubatch->n_tokens, 0);
         if (ubatch->token_audio != nullptr) {
             GGML_ASSERT(ubatch->n_token_audio == n_channels);
 
@@ -37,6 +37,7 @@ class llm_graph_input_moss_audio_channel : public llm_graph_input_i {
 private:
     const uint32_t channel;
     const uint32_t n_channels;
+    std::vector<llama_token> data;
 };
 
 }
@@ -70,7 +71,6 @@ llm_build_moss_tts_delay::llm_build_moss_tts_delay(const llama_model & model, co
 
         res->add_input(std::move(inp_audio));
     }
-
     ggml_tensor * inp_pos = build_inp_pos();
     auto * inp_attn = build_attn_inp_kv();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -172,6 +172,7 @@ llm_build_moss_tts_delay::llm_build_moss_tts_delay(const llama_model & model, co
                 audio_logits->nb[1],
                 ggml_element_size(audio_logits) * (audio_logits->ne[0] - 1));
         invalid_audio_logits = ggml_clamp(ctx0, invalid_audio_logits, -INFINITY, -INFINITY);
+        invalid_audio_logits = ggml_cont(ctx0, invalid_audio_logits);
         audio_logits = ggml_set_2d(
                 ctx0, audio_logits, invalid_audio_logits,
                 audio_logits->nb[1],

From b156eeb14099a950768cb3fbf48d350f75930677 Mon Sep 17 00:00:00 2001
From: expec <expec@localhost>
Date: Thu, 12 Mar 2026 19:53:18 +0800
Subject: [PATCH 07/20] moss-tts: add phase1d diagnostics and prefill chunk
 workaround

---
 src/models/moss-tts-delay.cpp                 |  11 -
 tests/moss_tts_delay_export_generation_ref.py |  14 +-
 tools/tts/moss-tts.cpp                        | 234 ++++++++++++++++--
 3 files changed, 227 insertions(+), 32 deletions(-)

diff --git a/src/models/moss-tts-delay.cpp b/src/models/moss-tts-delay.cpp
index 87c6585cb..f7f56ff5c 100644
--- a/src/models/moss-tts-delay.cpp
+++ b/src/models/moss-tts-delay.cpp
@@ -166,17 +166,6 @@ llm_build_moss_tts_delay::llm_build_moss_tts_delay(const llama_model & model, co
 
     for (uint32_t i = 0; i < hparams.n_vq; ++i) {
         ggml_tensor * audio_logits = build_lora_mm(model.output_audio[i], cur);
-        ggml_tensor * invalid_audio_logits = ggml_view_2d(
-                ctx0, audio_logits,
-                1, audio_logits->ne[1],
-                audio_logits->nb[1],
-                ggml_element_size(audio_logits) * (audio_logits->ne[0] - 1));
-        invalid_audio_logits = ggml_clamp(ctx0, invalid_audio_logits, -INFINITY, -INFINITY);
-        invalid_audio_logits = ggml_cont(ctx0, invalid_audio_logits);
-        audio_logits = ggml_set_2d(
-                ctx0, audio_logits, invalid_audio_logits,
-                audio_logits->nb[1],
-                ggml_element_size(audio_logits) * (audio_logits->ne[0] - 1));
         cb(audio_logits, "result_output_audio", i);
 
         logits = ggml_concat(ctx0, logits, audio_logits, 0);
diff --git a/tests/moss_tts_delay_export_generation_ref.py b/tests/moss_tts_delay_export_generation_ref.py
index 29d5f4525..407fac8c9 100755
--- a/tests/moss_tts_delay_export_generation_ref.py
+++ b/tests/moss_tts_delay_export_generation_ref.py
@@ -71,9 +71,17 @@ def main() -> int:
         )
         prompt_len = input_ids.shape[0]
 
-        pipeline.backbone.clear_kv()
-        pipeline._prefill(input_ids)
-        generation_ids = pipeline._autoregressive_loop(input_ids, config.max_new_tokens)
+        backbone = pipeline.backbone
+        embedder = pipeline.embedder
+        lm_heads = pipeline.lm_heads
+        if backbone is None or embedder is None or lm_heads is None:
+            raise RuntimeError("pipeline low-memory mode is not supported by this export script")
+
+        backbone.clear_kv()
+        pipeline._prefill(input_ids, backbone, embedder)
+        generation_ids = pipeline._autoregressive_loop(
+            input_ids, config.max_new_tokens, backbone, embedder, lm_heads
+        )
         _text, audio_codes = parse_generation_output(pipeline.tokenizer, generation_ids, prompt_len)
 
         if args.output_wav:
diff --git a/tools/tts/moss-tts.cpp b/tools/tts/moss-tts.cpp
index 4dc33879d..37418cccb 100644
--- a/tools/tts/moss-tts.cpp
+++ b/tools/tts/moss-tts.cpp
@@ -250,6 +250,39 @@ static bool parse_meta_token(const llama_model * model, const char * key, llama_
     return true;
 }
 
+static int32_t moss_debug_steps_from_env() {
+    const char * raw = std::getenv("MOSS_TTS_DEBUG_STEPS");
+    if (raw == nullptr || raw[0] == '\0') {
+        return 0;
+    }
+
+    char * end = nullptr;
+    const long parsed = std::strtol(raw, &end, 10);
+    if (end == raw) {
+        return 0;
+    }
+    return (int32_t) std::max<long>(parsed, 0);
+}
+
+static size_t moss_prefill_chunk_from_env() {
+    const char * raw = std::getenv("MOSS_TTS_PREFILL_CHUNK");
+    if (raw == nullptr || raw[0] == '\0') {
+        return 16;
+    }
+
+    char * end = nullptr;
+    const long parsed = std::strtol(raw, &end, 10);
+    if (end == raw) {
+        return 16;
+    }
+    return (size_t) std::max<long>(parsed, 1);
+}
+
+static int32_t & moss_debug_step_counter() {
+    static int32_t counter = 0;
+    return counter;
+}
+
 static moss_delay_config moss_delay_config_from_model(const llama_model * model) {
     moss_delay_config cfg;
 
@@ -582,6 +615,58 @@ static std::vector<llama_token> moss_delay_step(
         return result;
     }
 
+    const int32_t debug_limit = moss_debug_steps_from_env();
+    const int32_t debug_step = moss_debug_step_counter();
+    const bool debug_this_step = debug_limit > 0 && debug_step < debug_limit;
+    if (debug_this_step) {
+        size_t text_nan = 0;
+        size_t text_inf = 0;
+        for (float v : text_logits) {
+            if (std::isnan(v)) {
+                ++text_nan;
+            } else if (std::isinf(v)) {
+                ++text_inf;
+            }
+        }
+        size_t audio_nan = 0;
+        size_t audio_inf = 0;
+        for (float v : audio_logits) {
+            if (std::isnan(v)) {
+                ++audio_nan;
+            } else if (std::isinf(v)) {
+                ++audio_inf;
+            }
+        }
+        const float raw_text0 = text_logits.empty() ? MOSS_NEG_INF : text_logits[0];
+        const float raw_text_gen =
+                ((size_t) cfg.audio_assistant_gen_slot_token_id < text_vocab)
+                ? text_logits[(size_t) cfg.audio_assistant_gen_slot_token_id]
+                : MOSS_NEG_INF;
+        const float raw_text_delay =
+                ((size_t) cfg.audio_assistant_delay_slot_token_id < text_vocab)
+                ? text_logits[(size_t) cfg.audio_assistant_delay_slot_token_id]
+                : MOSS_NEG_INF;
+        LOG("moss-debug step=%d pre text_vocab=%zu audio_vocab=%zu is_audio=%d time_step=%d audio_length=%d delayed_length=%lld\n",
+                debug_step,
+                text_vocab,
+                audio_vocab,
+                state.is_audio ? 1 : 0,
+                state.time_step,
+                state.audio_length,
+                (long long) state.delayed_length);
+        LOG("moss-debug step=%d raw text[0]=%.6f gen[%d]=%.6f delay[%d]=%.6f text_nan=%zu text_inf=%zu audio_nan=%zu audio_inf=%zu\n",
+                debug_step,
+                raw_text0,
+                (int) cfg.audio_assistant_gen_slot_token_id,
+                raw_text_gen,
+                (int) cfg.audio_assistant_delay_slot_token_id,
+                raw_text_delay,
+                text_nan,
+                text_inf,
+                audio_nan,
+                audio_inf);
+    }
+
     llama_token next_text = cfg.pad_token_id;
 
     if (state.delayed_length < (int64_t) n_vq) {
@@ -628,11 +713,41 @@ static std::vector<llama_token> moss_delay_step(
             scaled[(size_t) cfg.im_end_token_id] = MOSS_NEG_INF;
         }
 
+        if (debug_this_step) {
+            size_t finite_count = 0;
+            for (float v : scaled) {
+                if (std::isfinite(v)) {
+                    ++finite_count;
+                }
+            }
+            const float logit0 = !scaled.empty() ? scaled[0] : MOSS_NEG_INF;
+            const float logit_gen =
+                    ((size_t) cfg.audio_assistant_gen_slot_token_id < text_vocab)
+                    ? scaled[(size_t) cfg.audio_assistant_gen_slot_token_id]
+                    : MOSS_NEG_INF;
+            const float logit_delay =
+                    ((size_t) cfg.audio_assistant_delay_slot_token_id < text_vocab)
+                    ? scaled[(size_t) cfg.audio_assistant_delay_slot_token_id]
+                    : MOSS_NEG_INF;
+            LOG("moss-debug step=%d text-mask logit[0]=%.6f gen[%d]=%.6f delay[%d]=%.6f finite=%zu\n",
+                    debug_step,
+                    logit0,
+                    (int) cfg.audio_assistant_gen_slot_token_id,
+                    logit_gen,
+                    (int) cfg.audio_assistant_delay_slot_token_id,
+                    logit_delay,
+                    finite_count);
+        }
+
         next_text = moss_sample_token(
                 scaled, 1, text_vocab, rng, nullptr, 1.0f,
                 sampling_cfg.text_top_p, sampling_cfg.text_top_k, text_do_sample)[0];
     }
 
+    if (debug_this_step) {
+        LOG("moss-debug step=%d text-picked next_text=%d\n", debug_step, (int) next_text);
+    }
+
     if (next_text == cfg.audio_start_token_id) {
         state.is_audio = true;
     }
@@ -730,6 +845,9 @@ static std::vector<llama_token> moss_delay_step(
     }
 
     state.time_step += 1;
+    if (debug_this_step) {
+        moss_debug_step_counter() += 1;
+    }
     state.text_history.push_back(next_text);
     state.append_audio(next_audio);
 
@@ -985,21 +1103,33 @@ static llama_batch moss_batch_from_packed_rows(
     GGML_ASSERT(packed_ids.size() % cfg.packed_stride() == 0);
     GGML_ASSERT(start_frame + n_frames <= packed_ids.size() / cfg.packed_stride());
 
+    const bool disable_audio_input = []() {
+        const char * raw = std::getenv("MOSS_TTS_DISABLE_AUDIO_INPUT");
+        return raw != nullptr && raw[0] == '1';
+    }();
+
     llama_batch batch = llama_batch_init((int32_t) n_frames, 0, 1);
     batch.n_tokens = (int32_t) n_frames;
-    batch.n_token_audio = (int32_t) cfg.n_vq;
-    batch.token_audio = (llama_token *) std::malloc(sizeof(llama_token) * n_frames * cfg.n_vq);
-    if (batch.token_audio == nullptr) {
-        throw std::runtime_error("failed to allocate token_audio");
+    if (!disable_audio_input) {
+        batch.n_token_audio = (int32_t) cfg.n_vq;
+        batch.token_audio = (llama_token *) std::malloc(sizeof(llama_token) * n_frames * cfg.n_vq);
+        if (batch.token_audio == nullptr) {
+            throw std::runtime_error("failed to allocate token_audio");
+        }
+    } else {
+        batch.n_token_audio = 0;
+        batch.token_audio = nullptr;
     }
 
     for (size_t i = 0; i < n_frames; ++i) {
         const size_t row = (start_frame + i) * cfg.packed_stride();
         batch.token[i] = packed_ids[row + 0];
-        std::memcpy(
-                batch.token_audio + i * cfg.n_vq,
-                packed_ids.data() + row + 1,
-                sizeof(llama_token) * cfg.n_vq);
+        if (!disable_audio_input) {
+            std::memcpy(
+                    batch.token_audio + i * cfg.n_vq,
+                    packed_ids.data() + row + 1,
+                    sizeof(llama_token) * cfg.n_vq);
+        }
         batch.pos[i] = (llama_pos) (pos_start + i);
         batch.n_seq_id[i] = 1;
         batch.seq_id[i][0] = 0;
@@ -1046,6 +1176,14 @@ static bool moss_generate_from_ref(
 
     llama_model_params mparams = llama_model_default_params();
     mparams.use_mmap = true;
+    if (const char * raw_ngl = std::getenv("MOSS_TTS_N_GPU_LAYERS"); raw_ngl != nullptr && raw_ngl[0] != '\0') {
+        char * end = nullptr;
+        const long parsed = std::strtol(raw_ngl, &end, 10);
+        if (end != raw_ngl) {
+            mparams.n_gpu_layers = (int32_t) parsed;
+            LOG("moss-debug model n_gpu_layers=%d (from MOSS_TTS_N_GPU_LAYERS)\n", mparams.n_gpu_layers);
+        }
+    }
 
     llama_model * model = llama_model_load_from_file(model_path.c_str(), mparams);
     if (model == nullptr) {
@@ -1063,16 +1201,25 @@ static bool moss_generate_from_ref(
         throw std::runtime_error("generation reference n_vq does not match model metadata");
     }
     cfg.audio_vocab_size = model_cfg.audio_vocab_size;
+    const int32_t debug_steps = moss_debug_steps_from_env();
 
     llama_context_params cparams = llama_context_default_params();
     cparams.n_ctx = std::max<uint32_t>((uint32_t) hdr.prompt_frames + (uint32_t) max_new_tokens + 8u, 64u);
     cparams.n_batch = std::max<uint32_t>((uint32_t) hdr.prompt_frames, 1u);
     cparams.n_ubatch = cparams.n_batch;
     cparams.n_seq_max = 1;
-    cparams.embeddings = false;
+    cparams.embeddings = debug_steps > 0;
     cparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
     cparams.type_k = GGML_TYPE_F32;
     cparams.type_v = GGML_TYPE_F32;
+    if (const char * raw = std::getenv("MOSS_TTS_OFFLOAD_KQV"); raw != nullptr && raw[0] == '0') {
+        cparams.offload_kqv = false;
+        LOG("moss-debug cparams.offload_kqv=false (from MOSS_TTS_OFFLOAD_KQV=0)\n");
+    }
+    if (const char * raw = std::getenv("MOSS_TTS_OP_OFFLOAD"); raw != nullptr && raw[0] == '0') {
+        cparams.op_offload = false;
+        LOG("moss-debug cparams.op_offload=false (from MOSS_TTS_OP_OFFLOAD=0)\n");
+    }
 
     llama_context * ctx = llama_init_from_model(model, cparams);
     if (ctx == nullptr) {
@@ -1083,21 +1230,39 @@ static bool moss_generate_from_ref(
 
     llama_set_warmup(ctx, false);
     llama_set_causal_attn(ctx, true);
-    llama_set_embeddings(ctx, false);
+    llama_set_embeddings(ctx, debug_steps > 0);
 
     {
-        llama_batch batch = moss_batch_from_packed_rows(prompt_packed, 0, hdr.prompt_frames, cfg, 0, true);
-        const int ret = llama_decode(ctx, batch);
-        llama_batch_free(batch);
-        if (ret != 0) {
-            llama_free(ctx);
-            llama_model_free(model);
-            llama_backend_free();
-            throw std::runtime_error("prefill llama_decode failed: " + std::to_string(ret));
+        const size_t prefill_chunk = moss_prefill_chunk_from_env();
+        for (size_t start = 0; start < hdr.prompt_frames; start += prefill_chunk) {
+            const size_t n = std::min(prefill_chunk, (size_t) hdr.prompt_frames - start);
+            const bool output_last = (start + n == hdr.prompt_frames);
+            llama_batch batch = moss_batch_from_packed_rows(prompt_packed, start, n, cfg, start, output_last);
+            const int ret = llama_decode(ctx, batch);
+            llama_batch_free(batch);
+            if (ret != 0) {
+                llama_free(ctx);
+                llama_model_free(model);
+                llama_backend_free();
+                throw std::runtime_error("prefill llama_decode failed: " + std::to_string(ret));
+            }
         }
     }
 
     moss_delay_state state = moss_init_delay_state(prompt_packed, cfg);
+    if (debug_steps > 0) {
+        const llama_token last_prompt_text = prompt_packed.empty() ? -1 : prompt_packed[(hdr.prompt_frames - 1u) * cfg.packed_stride()];
+        LOG("moss-debug init: prompt_frames=%u last_prompt_text=%d audio_start=%d gen_slot=%d delay_slot=%d is_audio=%d audio_length=%d delayed_length=%lld\n",
+                hdr.prompt_frames,
+                (int) last_prompt_text,
+                (int) cfg.audio_start_token_id,
+                (int) cfg.audio_assistant_gen_slot_token_id,
+                (int) cfg.audio_assistant_delay_slot_token_id,
+                state.is_audio ? 1 : 0,
+                state.audio_length,
+                (long long) state.delayed_length);
+    }
+
     std::vector<llama_token> generated_packed;
     generated_packed.reserve((size_t) max_new_tokens * cfg.packed_stride());
 
@@ -1105,6 +1270,26 @@ static bool moss_generate_from_ref(
     moss_rng rng(seed);
 
     for (int32_t step = 0; step < max_new_tokens; ++step) {
+        if (debug_steps > 0 && step < debug_steps) {
+            const float * embd = llama_get_embeddings_ith(ctx, -1);
+            if (embd != nullptr) {
+                const int32_t n_embd = llama_model_n_embd(model);
+                size_t embd_nan = 0;
+                size_t embd_inf = 0;
+                for (int32_t i = 0; i < n_embd; ++i) {
+                    if (std::isnan(embd[i])) {
+                        ++embd_nan;
+                    } else if (std::isinf(embd[i])) {
+                        ++embd_inf;
+                    }
+                }
+                LOG("moss-debug step=%d embd[0]=%.6f embd_nan=%zu embd_inf=%zu\n",
+                        step, n_embd > 0 ? embd[0] : 0.0f, embd_nan, embd_inf);
+            } else {
+                LOG("moss-debug step=%d embd unavailable\n", step);
+            }
+        }
+
         const float * logits = llama_get_logits_ith(ctx, -1);
         if (logits == nullptr) {
             llama_free(ctx);
@@ -1120,6 +1305,19 @@ static bool moss_generate_from_ref(
 
         const std::vector<llama_token> next = moss_delay_step(
                 state, text_logits, audio_logits, sampling_cfg, cfg, rng);
+        if (debug_steps > 0 && step < debug_steps) {
+            const llama_token ch0 = next.size() > 1 ? next[1] : -1;
+            const llama_token ch1 = next.size() > 2 ? next[2] : -1;
+            LOG("moss-debug step=%d next_text=%d ch0=%d ch1=%d is_audio=%d audio_length=%d delayed_length=%lld is_stopping=%d\n",
+                    step,
+                    (int) next[0],
+                    (int) ch0,
+                    (int) ch1,
+                    state.is_audio ? 1 : 0,
+                    state.audio_length,
+                    (long long) state.delayed_length,
+                    state.is_stopping ? 1 : 0);
+        }
         generated_packed.insert(generated_packed.end(), next.begin(), next.end());
 
         llama_batch batch = moss_batch_from_packed_rows(

From 4a6686297af9bcad8d6b9b292d1d804bbd8b2f65 Mon Sep 17 00:00:00 2001
From: expec <expec@localhost>
Date: Fri, 13 Mar 2026 14:49:05 +0800
Subject: [PATCH 08/20] moss-tts: stabilize first-class prefill and FFN
 precision

---
 src/llama-graph.cpp    |  4 ++--
 tests/CMakeLists.txt   |  2 ++
 tools/tts/moss-tts.cpp | 36 +++++++++---------------------------
 3 files changed, 13 insertions(+), 29 deletions(-)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index f4cb7dce1..863c7b0ef 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1129,8 +1129,8 @@ ggml_tensor * llm_graph_context::build_ffn(
 
     if (down) {
         cur = build_lora_mm(down, cur);
-        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
-            // GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators
+        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2 || arch == LLM_ARCH_MOSS_TTS_DELAY) {
+            // GLM4/JAIS2 and MOSS-TTS-Delay FFN down-projection can overflow with half-precision accumulators
             ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
         }
     }
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index e39fb805f..b692bd0c1 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -166,6 +166,8 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
     llama_build(test-gbnf-validator.cpp)
     llama_build(test-moss-tts-delay-load.cpp)
     llama_build(test-moss-tts-delay-forward.cpp)
+    llama_build(test-moss-tts-delay-op-offload-repro.cpp)
+    llama_build(test-moss-tts-delay-op-offload-minimal.cpp)
 
     # build test-tokenizer-1-bpe target once and add many tests
     llama_build(test-tokenizer-1-bpe.cpp)
diff --git a/tools/tts/moss-tts.cpp b/tools/tts/moss-tts.cpp
index 37418cccb..36a0f747f 100644
--- a/tools/tts/moss-tts.cpp
+++ b/tools/tts/moss-tts.cpp
@@ -264,20 +264,6 @@ static int32_t moss_debug_steps_from_env() {
     return (int32_t) std::max<long>(parsed, 0);
 }
 
-static size_t moss_prefill_chunk_from_env() {
-    const char * raw = std::getenv("MOSS_TTS_PREFILL_CHUNK");
-    if (raw == nullptr || raw[0] == '\0') {
-        return 16;
-    }
-
-    char * end = nullptr;
-    const long parsed = std::strtol(raw, &end, 10);
-    if (end == raw) {
-        return 16;
-    }
-    return (size_t) std::max<long>(parsed, 1);
-}
-
 static int32_t & moss_debug_step_counter() {
     static int32_t counter = 0;
     return counter;
@@ -1233,19 +1219,15 @@ static bool moss_generate_from_ref(
     llama_set_embeddings(ctx, debug_steps > 0);
 
     {
-        const size_t prefill_chunk = moss_prefill_chunk_from_env();
-        for (size_t start = 0; start < hdr.prompt_frames; start += prefill_chunk) {
-            const size_t n = std::min(prefill_chunk, (size_t) hdr.prompt_frames - start);
-            const bool output_last = (start + n == hdr.prompt_frames);
-            llama_batch batch = moss_batch_from_packed_rows(prompt_packed, start, n, cfg, start, output_last);
-            const int ret = llama_decode(ctx, batch);
-            llama_batch_free(batch);
-            if (ret != 0) {
-                llama_free(ctx);
-                llama_model_free(model);
-                llama_backend_free();
-                throw std::runtime_error("prefill llama_decode failed: " + std::to_string(ret));
-            }
+        llama_batch batch = moss_batch_from_packed_rows(
+                prompt_packed, 0, hdr.prompt_frames, cfg, 0, true);
+        const int ret = llama_decode(ctx, batch);
+        llama_batch_free(batch);
+        if (ret != 0) {
+            llama_free(ctx);
+            llama_model_free(model);
+            llama_backend_free();
+            throw std::runtime_error("prefill llama_decode failed: " + std::to_string(ret));
         }
     }
 

From c8c448686a86299c74e6ed4be980c364015e611b Mon Sep 17 00:00:00 2001
From: expec <expec@localhost>
Date: Fri, 13 Mar 2026 17:01:47 +0800
Subject: [PATCH 09/20] tools/tts: add first-class MOSS-TTS e2e runner and docs

---
 docs/moss-tts-firstclass-e2e.md       |  75 ++++++++++
 tests/moss_tts_delay_export_ref.py    |  31 ++--
 tests/test-moss-tts-delay-forward.cpp |   2 +
 tools/tts/moss-tts-firstclass-e2e.py  | 208 ++++++++++++++++++++++++++
 4 files changed, 298 insertions(+), 18 deletions(-)
 create mode 100644 docs/moss-tts-firstclass-e2e.md
 create mode 100755 tools/tts/moss-tts-firstclass-e2e.py

diff --git a/docs/moss-tts-firstclass-e2e.md b/docs/moss-tts-firstclass-e2e.md
new file mode 100644
index 000000000..6b969d479
--- /dev/null
+++ b/docs/moss-tts-firstclass-e2e.md
@@ -0,0 +1,75 @@
+# MOSS-TTS First-Class E2E 脚本说明
+
+## 脚本位置
+`tools/tts/moss-tts-firstclass-e2e.py`
+
+## 功能
+该脚本把以下链路封装为一次命令执行：
+
+1. 用 `moss-tts-build-generation-ref.py` 构建 `generation.ref.bin`
+2. 调用 `llama-moss-tts` 进行 first-class backbone 生成 raw audio codes
+3. 用 `moss-tts-audio-decode.py` + ONNX audio tokenizer 解码为 WAV
+
+输入：`text`（可选 `reference audio`）
+输出：`wav`
+
+中间产物（`generation.ref.bin`、`raw.codes.bin`）会写入临时目录并在结束后自动删除。
+
+## 必需参数
+- `--model-gguf`：MOSS-TTS first-class GGUF 模型
+- `--tokenizer-dir`：包含 `tokenizer.json` 的目录
+- `--onnx-encoder`：MOSS Audio Tokenizer encoder ONNX
+- `--onnx-decoder`：MOSS Audio Tokenizer decoder ONNX
+- `--output-wav`：输出 wav 路径
+- `--text` 或 `--text-file`：二选一
+
+## 常用可选参数
+- `--reference-audio`：参考音频（24kHz）
+- `--text-temperature`：默认 `1.5`
+- `--audio-temperature`：默认 `1.7`
+- `--max-new-tokens`：默认 `512`
+- `--n-gpu-layers`：默认读取 `MOSS_TTS_N_GPU_LAYERS`，未设置时默认 `1`
+- `--python-bin`：指定 Python 解释器
+- `--audio-decoder-cpu`：强制 ONNX 解码走 CPU
+- `--cpu-audio-encode`：参考音频编码走 CPU
+- `--build`：运行前自动构建 `llama-moss-tts`
+
+## `tokenizer-dir` 是什么
+`tokenizer-dir` 不是 ONNX 目录，它是文本 tokenizer 目录，至少要有：
+
+- `tokenizer.json`
+
+通常来自 Qwen3 backbone tokenizer 的提取目录。例如：
+`weights/extracted/qwen3_backbone`
+
+## 示例
+### 1) text + reference 音色克隆
+```bash
+python tools/tts/moss-tts-firstclass-e2e.py \
+  --model-gguf /path/to/moss_delay_firstclass_f16.gguf \
+  --tokenizer-dir /path/to/weights/extracted/qwen3_backbone \
+  --onnx-encoder /path/to/MOSS-Audio-Tokenizer-ONNX/encoder.onnx \
+  --onnx-decoder /path/to/MOSS-Audio-Tokenizer-ONNX/decoder.onnx \
+  --text-file /path/to/text.txt \
+  --reference-audio /path/to/reference_24k.wav \
+  --output-wav /path/to/output.wav
+```
+
+### 2) 不带 reference
+```bash
+python tools/tts/moss-tts-firstclass-e2e.py \
+  --model-gguf /path/to/moss_delay_firstclass_f16.gguf \
+  --tokenizer-dir /path/to/weights/extracted/qwen3_backbone \
+  --onnx-encoder /path/to/MOSS-Audio-Tokenizer-ONNX/encoder.onnx \
+  --onnx-decoder /path/to/MOSS-Audio-Tokenizer-ONNX/decoder.onnx \
+  --text "清晨的青藏高原，空气稀薄而寒冷。" \
+  --output-wav /path/to/output.wav
+```
+
+## 输出
+脚本结束时会打印：
+
+- `wav` 路径
+- `wav_info`（采样率、声道、帧数、时长）
+
+注：当前 `llama-moss-tts` 可能出现“返回码非 0 但 wav 已成功产出”的情况，脚本会保留并提示该结果。
diff --git a/tests/moss_tts_delay_export_ref.py b/tests/moss_tts_delay_export_ref.py
index c78559092..7f79666bf 100644
--- a/tests/moss_tts_delay_export_ref.py
+++ b/tests/moss_tts_delay_export_ref.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python3
 
-import os
 import struct
 import sys
 import types
@@ -9,14 +8,23 @@
 import numpy as np
 import torch
 import transformers
-from safetensors.torch import load_file
 
-if "transformers.initialization" not in sys.modules:
+# Keep compatibility with environments that do not provide
+# transformers.initialization (older transformers releases).
+try:
+    from transformers import initialization as _hf_init  # noqa: F401
+except Exception:
     import torch.nn.init as nn_init
 
+    def _guard_torch_init_functions():
+        def _decorator(fn):
+            return fn
+        return _decorator
+
     shim = types.SimpleNamespace(
         normal_=nn_init.normal_,
         zeros_=nn_init.zeros_,
+        guard_torch_init_functions=_guard_torch_init_functions,
     )
     transformers.initialization = shim
     sys.modules["transformers.initialization"] = shim
@@ -61,20 +69,7 @@ def main() -> int:
     out_path = sys.argv[2]
 
     config = MossTTSDelayConfig.from_pretrained(model_dir)
-    orig_get_input_embeddings = MossTTSDelayModel.get_input_embeddings
-    orig_tie_weights = MossTTSDelayModel.tie_weights
-
-    MossTTSDelayModel.get_input_embeddings = lambda self: self.language_model.get_input_embeddings()
-    MossTTSDelayModel.tie_weights = lambda self: None
-    try:
-        model = MossTTSDelayModel(config).eval()
-        state_dict = load_file(os.path.join(model_dir, "model.safetensors"), device="cpu")
-        missing, unexpected = model.load_state_dict(state_dict, strict=False)
-        if missing or unexpected:
-            raise RuntimeError(f"state_dict mismatch: missing={missing} unexpected={unexpected}")
-    finally:
-        MossTTSDelayModel.get_input_embeddings = orig_get_input_embeddings
-        MossTTSDelayModel.tie_weights = orig_tie_weights
+    model = MossTTSDelayModel.from_pretrained(model_dir, local_files_only=True).eval()
 
     n_tokens = 4
     text_ids = build_text_ids(n_tokens, config.language_config.vocab_size)
@@ -93,7 +88,7 @@ def main() -> int:
         axis=0,
     ).astype(np.float32, copy=False)
 
-    os.makedirs(os.path.dirname(out_path), exist_ok=True)
+    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
     with open(out_path, "wb") as f:
         f.write(struct.pack("<6I", REF_MAGIC, REF_VERSION, n_tokens, config.n_vq, ref_embd.shape[0], ref_logits.shape[0]))
         f.write(text_ids.astype(np.int32, copy=False).tobytes())
diff --git a/tests/test-moss-tts-delay-forward.cpp b/tests/test-moss-tts-delay-forward.cpp
index 87dece0c7..483fe7c38 100644
--- a/tests/test-moss-tts-delay-forward.cpp
+++ b/tests/test-moss-tts-delay-forward.cpp
@@ -96,6 +96,8 @@ int main(int argc, char ** argv) {
 
     llama_model_params mparams = llama_model_default_params();
     mparams.use_mmap = true;
+    // Keep this parity test deterministic and avoid multi-backend split-input limits.
+    mparams.n_gpu_layers = 0;
 
     llama_model * model = llama_model_load_from_file(argv[1], mparams);
     if (model == nullptr) {
diff --git a/tools/tts/moss-tts-firstclass-e2e.py b/tools/tts/moss-tts-firstclass-e2e.py
new file mode 100755
index 000000000..603b2905c
--- /dev/null
+++ b/tools/tts/moss-tts-firstclass-e2e.py
@@ -0,0 +1,208 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import argparse
+import os
+import shlex
+import subprocess
+import sys
+import tempfile
+import wave
+from pathlib import Path
+
+
+def run_cmd(cmd: list[str], env: dict[str, str] | None = None) -> subprocess.CompletedProcess:
+    print("+", shlex.join(cmd), flush=True)
+    return subprocess.run(cmd, env=env, check=False)
+
+
+def need_file(path: Path, name: str) -> None:
+    if not path.is_file():
+        raise FileNotFoundError(f"missing {name}: {path}")
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description=(
+            "End-to-end first-class MOSS-TTS pipeline (llama.cpp backbone + ONNX tokenizer): "
+            "text(+ref) -> wav. Intermediate artifacts are stored in a temporary directory "
+            "and removed automatically."
+        )
+    )
+
+    parser.add_argument("--model-gguf", default=os.getenv("MODEL_GGUF", ""))
+    parser.add_argument("--tokenizer-dir", default=os.getenv("TOKENIZER_DIR", ""))
+    parser.add_argument("--onnx-encoder", default=os.getenv("ONNX_ENCODER", ""))
+    parser.add_argument("--onnx-decoder", default=os.getenv("ONNX_DECODER", ""))
+    parser.add_argument("--output-wav", required=True)
+    parser.add_argument("--reference-audio", default="")
+    parser.add_argument("--language", default="zh")
+    parser.add_argument("--max-new-tokens", type=int, default=512)
+    parser.add_argument("--text-temperature", type=float, default=1.5)
+    parser.add_argument("--audio-temperature", type=float, default=1.7)
+    parser.add_argument("--n-gpu-layers", type=int, default=int(os.getenv("MOSS_TTS_N_GPU_LAYERS", "1")))
+    parser.add_argument("--python-bin", default=sys.executable)
+    parser.add_argument("--llama-bin", default="")
+    parser.add_argument("--build", action="store_true", help="Build llama-moss-tts before running")
+    parser.add_argument("--n-jobs", type=int, default=(os.cpu_count() or 1))
+    parser.add_argument("--audio-decoder-cpu", action="store_true")
+    parser.add_argument("--cpu-audio-encode", action="store_true")
+
+    text_group = parser.add_mutually_exclusive_group(required=True)
+    text_group.add_argument("--text", default="")
+    text_group.add_argument("--text-file", default="")
+
+    args = parser.parse_args()
+
+    if not args.model_gguf:
+        parser.error("--model-gguf is required (or set MODEL_GGUF)")
+    if not args.tokenizer_dir:
+        parser.error("--tokenizer-dir is required (or set TOKENIZER_DIR)")
+    if not args.onnx_encoder:
+        parser.error("--onnx-encoder is required (or set ONNX_ENCODER)")
+    if not args.onnx_decoder:
+        parser.error("--onnx-decoder is required (or set ONNX_DECODER)")
+
+    return args
+
+
+def main() -> int:
+    args = parse_args()
+
+    repo_root = Path(__file__).resolve().parents[2]
+    build_ref_script = repo_root / "tools/tts/moss-tts-build-generation-ref.py"
+    decode_script = repo_root / "tools/tts/moss-tts-audio-decode.py"
+    llama_bin = Path(args.llama_bin) if args.llama_bin else repo_root / "build/bin/llama-moss-tts"
+
+    model_gguf = Path(args.model_gguf).expanduser().resolve()
+    tokenizer_dir = Path(args.tokenizer_dir).expanduser().resolve()
+    onnx_encoder = Path(args.onnx_encoder).expanduser().resolve()
+    onnx_decoder = Path(args.onnx_decoder).expanduser().resolve()
+    python_bin = Path(args.python_bin).expanduser().resolve()
+    output_wav = Path(args.output_wav).expanduser().resolve()
+
+    need_file(python_bin, "python binary")
+    need_file(model_gguf, "first-class model gguf")
+    need_file(tokenizer_dir / "tokenizer.json", "tokenizer.json")
+    need_file(onnx_encoder, "ONNX encoder")
+    need_file(onnx_decoder, "ONNX decoder")
+    need_file(build_ref_script, "generation-ref builder")
+    need_file(decode_script, "audio decode helper")
+    if args.text_file:
+        need_file(Path(args.text_file).expanduser().resolve(), "text file")
+    if args.reference_audio:
+        need_file(Path(args.reference_audio).expanduser().resolve(), "reference audio")
+
+    if args.build:
+        rc = run_cmd(["cmake", "-S", str(repo_root), "-B", str(repo_root / "build")]).returncode
+        if rc != 0:
+            raise RuntimeError(f"cmake configure failed with rc={rc}")
+        rc = run_cmd(
+            [
+                "cmake",
+                "--build",
+                str(repo_root / "build"),
+                "--target",
+                "llama-moss-tts",
+                "-j",
+                str(args.n_jobs),
+            ]
+        ).returncode
+        if rc != 0:
+            raise RuntimeError(f"cmake build failed with rc={rc}")
+
+    need_file(llama_bin, "llama-moss-tts binary")
+    output_wav.parent.mkdir(parents=True, exist_ok=True)
+
+    with tempfile.TemporaryDirectory(prefix="moss-tts-firstclass-") as tmpdir:
+        tmpdir_path = Path(tmpdir)
+        generation_ref = tmpdir_path / "generation.ref.bin"
+        raw_codes = tmpdir_path / "raw.codes.bin"
+
+        build_ref_cmd = [
+            str(python_bin),
+            str(build_ref_script),
+            "--tokenizer-dir",
+            str(tokenizer_dir),
+            "--output-ref",
+            str(generation_ref),
+            "--language",
+            args.language,
+        ]
+        if args.text_file:
+            build_ref_cmd.extend(["--text-file", str(Path(args.text_file).expanduser().resolve())])
+        else:
+            build_ref_cmd.extend(["--text", args.text])
+
+        if args.reference_audio:
+            build_ref_cmd.extend(
+                [
+                    "--reference-audio",
+                    str(Path(args.reference_audio).expanduser().resolve()),
+                    "--encoder-onnx",
+                    str(onnx_encoder),
+                    "--decoder-onnx",
+                    str(onnx_decoder),
+                ]
+            )
+            if args.cpu_audio_encode:
+                build_ref_cmd.append("--cpu-audio-encode")
+
+        rc = run_cmd(build_ref_cmd).returncode
+        if rc != 0:
+            raise RuntimeError(f"generation-ref build failed with rc={rc}")
+
+        run_args = [
+            str(llama_bin),
+            "-m",
+            str(model_gguf),
+            "--generation-ref",
+            str(generation_ref),
+            "--max-new-tokens",
+            str(args.max_new_tokens),
+            "--text-temperature",
+            str(args.text_temperature),
+            "--audio-temperature",
+            str(args.audio_temperature),
+            "--dump-raw-codes",
+            str(raw_codes),
+            "--audio-decoder-script",
+            str(decode_script),
+            "--audio-encoder-onnx",
+            str(onnx_encoder),
+            "--audio-decoder-onnx",
+            str(onnx_decoder),
+            "--wav-out",
+            str(output_wav),
+            "--python-bin",
+            str(python_bin),
+        ]
+        if args.audio_decoder_cpu:
+            run_args.append("--audio-decoder-cpu")
+
+        env = os.environ.copy()
+        env["MOSS_TTS_N_GPU_LAYERS"] = str(args.n_gpu_layers)
+        llama_rc = run_cmd(run_args, env=env).returncode
+
+        if not output_wav.is_file():
+            raise RuntimeError(f"llama-moss-tts did not produce wav: {output_wav} (rc={llama_rc})")
+        if llama_rc != 0:
+            print(
+                f"warning: llama-moss-tts exited with rc={llama_rc}, but wav was produced.",
+                file=sys.stderr,
+            )
+
+    with wave.open(str(output_wav), "rb") as f:
+        sr = f.getframerate()
+        n = f.getnframes()
+        ch = f.getnchannels()
+
+    print("done")
+    print(f"wav     : {output_wav}")
+    print(f"wav_info: sr={sr} ch={ch} frames={n} sec={n/max(sr,1):.3f}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From 9f557a1047b49cad22b4485547472529c83b8402 Mon Sep 17 00:00:00 2001
From: expec <expec@localhost>
Date: Fri, 13 Mar 2026 17:59:04 +0800
Subject: [PATCH 10/20] moss-tts: switch to generation-input flow and remove
 parity failure exit

---
 docs/moss-tts-firstclass-e2e.md      |  6 +--
 ggml/src/ggml-backend.cpp            |  2 +-
 tools/tts/moss-tts-firstclass-e2e.py |  2 +-
 tools/tts/moss-tts.cpp               | 60 ++++++++++------------------
 4 files changed, 26 insertions(+), 44 deletions(-)

diff --git a/docs/moss-tts-firstclass-e2e.md b/docs/moss-tts-firstclass-e2e.md
index 6b969d479..fde8393b4 100644
--- a/docs/moss-tts-firstclass-e2e.md
+++ b/docs/moss-tts-firstclass-e2e.md
@@ -6,14 +6,14 @@
 ## 功能
 该脚本把以下链路封装为一次命令执行：
 
-1. 用 `moss-tts-build-generation-ref.py` 构建 `generation.ref.bin`
+1. 用 `moss-tts-build-generation-ref.py` 构建 `generation.input.bin`
 2. 调用 `llama-moss-tts` 进行 first-class backbone 生成 raw audio codes
 3. 用 `moss-tts-audio-decode.py` + ONNX audio tokenizer 解码为 WAV
 
 输入：`text`（可选 `reference audio`）
 输出：`wav`
 
-中间产物（`generation.ref.bin`、`raw.codes.bin`）会写入临时目录并在结束后自动删除。
+中间产物（`generation.input.bin`、`raw.codes.bin`）会写入临时目录并在结束后自动删除。
 
 ## 必需参数
 - `--model-gguf`：MOSS-TTS first-class GGUF 模型
@@ -72,4 +72,4 @@ python tools/tts/moss-tts-firstclass-e2e.py \
 - `wav` 路径
 - `wav_info`（采样率、声道、帧数、时长）
 
-注：当前 `llama-moss-tts` 可能出现“返回码非 0 但 wav 已成功产出”的情况，脚本会保留并提示该结果。
+注：`llama-moss-tts` 在该链路中不再做 generation parity 返回码判定；只要流程成功会返回 0。
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index bc57df20b..01868370b 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -665,7 +665,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #endif
 
 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
-#define GGML_SCHED_MAX_SPLIT_INPUTS 30
+#define GGML_SCHED_MAX_SPLIT_INPUTS 64
 #endif
 
 #ifndef GGML_SCHED_MAX_COPIES
diff --git a/tools/tts/moss-tts-firstclass-e2e.py b/tools/tts/moss-tts-firstclass-e2e.py
index 603b2905c..78d286bcc 100755
--- a/tools/tts/moss-tts-firstclass-e2e.py
+++ b/tools/tts/moss-tts-firstclass-e2e.py
@@ -157,7 +157,7 @@ def main() -> int:
             str(llama_bin),
             "-m",
             str(model_gguf),
-            "--generation-ref",
+            "--generation-input",
             str(generation_ref),
             "--max-new-tokens",
             str(args.max_new_tokens),
diff --git a/tools/tts/moss-tts.cpp b/tools/tts/moss-tts.cpp
index 36a0f747f..ae1bcfbea 100644
--- a/tools/tts/moss-tts.cpp
+++ b/tools/tts/moss-tts.cpp
@@ -165,7 +165,7 @@ static moss_generation_audio moss_decode_generation_audio(
         size_t prompt_frames,
         const moss_delay_config & cfg);
 
-static bool moss_generate_from_ref(
+static void moss_generate_from_ref(
         const std::string & model_path,
         const std::string & ref_path,
         int32_t max_new_tokens,
@@ -183,6 +183,7 @@ static void print_usage(int argc, char ** argv) {
     (void) argc;
     LOG("\nexample usage:\n");
     LOG("  %s -m model.gguf --print-delay-config\n", argv[0]);
+    LOG("  %s -m model.gguf --generation-input generation.input.bin\n", argv[0]);
     LOG("  %s --decode-parity-ref decode.ref.bin\n", argv[0]);
     LOG("\n");
 }
@@ -1125,7 +1126,7 @@ static llama_batch moss_batch_from_packed_rows(
     return batch;
 }
 
-static bool moss_generate_from_ref(
+static void moss_generate_from_ref(
         const std::string & model_path,
         const std::string & ref_path,
         int32_t max_new_tokens,
@@ -1154,9 +1155,9 @@ static bool moss_generate_from_ref(
     cfg.audio_pad_code = (llama_token) hdr.audio_pad_code;
 
     std::vector<llama_token> prompt_packed((size_t) hdr.prompt_packed_frames * cfg.packed_stride());
-    std::vector<llama_token> ref_raw_codes((size_t) hdr.raw_frames * cfg.n_vq);
+    std::vector<llama_token> ignored_ref_raw_codes((size_t) hdr.raw_frames * cfg.n_vq);
     moss_read_exact(in, prompt_packed.data(), prompt_packed.size(), "prompt packed ids");
-    moss_read_exact(in, ref_raw_codes.data(), ref_raw_codes.size(), "reference raw codes");
+    moss_read_exact(in, ignored_ref_raw_codes.data(), ignored_ref_raw_codes.size(), "reference raw codes");
 
     llama_backend_init();
 
@@ -1321,33 +1322,11 @@ static bool moss_generate_from_ref(
 
     const moss_generation_audio decoded = moss_decode_generation_audio(state, hdr.prompt_frames, cfg);
 
-    size_t mismatch_count = 0;
-    const size_t compare_count = std::min(decoded.raw_codes.size(), ref_raw_codes.size());
-    size_t first_mismatch = compare_count;
-    for (size_t i = 0; i < compare_count; ++i) {
-        if (decoded.raw_codes[i] != ref_raw_codes[i]) {
-            if (first_mismatch == compare_count) {
-                first_mismatch = i;
-            }
-            ++mismatch_count;
-        }
-    }
-    mismatch_count += decoded.raw_codes.size() > ref_raw_codes.size()
-            ? decoded.raw_codes.size() - ref_raw_codes.size()
-            : ref_raw_codes.size() - decoded.raw_codes.size();
-
-    LOG("moss-tts first-class generation parity: prompt_frames=%u generated_frames=%zu raw_frames=%zu ref_raw_frames=%u mismatch_count=%zu\n",
+    LOG("moss-tts first-class generation: prompt_frames=%u generated_frames=%zu raw_frames=%zu input_ref_raw_frames=%u\n",
             hdr.prompt_frames,
             generated_packed.size() / cfg.packed_stride(),
             decoded.raw_frames,
-            hdr.raw_frames,
-            mismatch_count);
-    if (first_mismatch != compare_count) {
-        LOG("first mismatch at raw_token=%zu got=%d ref=%d\n",
-                first_mismatch,
-                (int) decoded.raw_codes[first_mismatch],
-                (int) ref_raw_codes[first_mismatch]);
-    }
+            hdr.raw_frames);
 
     if (!dump_raw_codes_path.empty()) {
         moss_write_codes_file(dump_raw_codes_path, decoded.raw_codes, decoded.raw_frames, cfg);
@@ -1387,8 +1366,6 @@ static bool moss_generate_from_ref(
     llama_free(ctx);
     llama_model_free(model);
     llama_backend_free();
-
-    return mismatch_count == 0;
 }
 
 static std::vector<llama_token> moss_audio_history_slice(
@@ -1691,7 +1668,7 @@ static bool moss_delay_self_test() {
 int main(int argc, char ** argv) {
     std::string model_path;
     std::string decode_parity_ref_path;
-    std::string generation_ref_path;
+    std::string generation_input_path;
     std::string dump_raw_codes_path;
     std::string audio_decoder_script;
     std::string audio_encoder_onnx;
@@ -1711,8 +1688,13 @@ int main(int argc, char ** argv) {
             model_path = argv[++i];
             continue;
         }
+        if (arg == "--generation-input" && i + 1 < argc) {
+            generation_input_path = argv[++i];
+            continue;
+        }
         if (arg == "--generation-ref" && i + 1 < argc) {
-            generation_ref_path = argv[++i];
+            generation_input_path = argv[++i];
+            LOG("warning: --generation-ref is deprecated; use --generation-input instead.\n");
             continue;
         }
         if (arg == "--decode-parity-ref" && i + 1 < argc) {
@@ -1809,15 +1791,15 @@ int main(int argc, char ** argv) {
         LOG("moss delay state self-test: ok\n");
     }
 
-    if (!generation_ref_path.empty()) {
+    if (!generation_input_path.empty()) {
         if (model_path.empty()) {
-            LOG_ERR("--generation-ref requires -m <model.gguf>\n");
+            LOG_ERR("--generation-input requires -m <model.gguf>\n");
             return EXIT_FAILURE;
         }
         try {
-            const bool ok = moss_generate_from_ref(
+            moss_generate_from_ref(
                     model_path,
-                    generation_ref_path,
+                    generation_input_path,
                     max_new_tokens,
                     sampling_cfg,
                     seed,
@@ -1828,9 +1810,9 @@ int main(int argc, char ** argv) {
                     audio_decoder_onnx,
                     wav_out_path,
                     use_gpu_audio);
-            return ok ? EXIT_SUCCESS : EXIT_FAILURE;
+            return EXIT_SUCCESS;
         } catch (const std::exception & err) {
-            LOG_ERR("generation parity failed: %s\n", err.what());
+            LOG_ERR("generation failed: %s\n", err.what());
             return EXIT_FAILURE;
         }
     }
@@ -1860,7 +1842,7 @@ int main(int argc, char ** argv) {
         LOG("moss delay state, multi-head sampler, and raw-code decode are in place; audio decode is available via the external Python/ONNX helper.\n");
         LOG("use --print-delay-config with -m <model.gguf> to inspect model metadata.\n");
         LOG("use --decode-parity-ref <ref.bin> to verify C++ de-delay/raw-code extraction against Python.\n");
-        LOG("use --generation-ref <ref.bin> -m <first-class-model.gguf> to verify end-to-end first-class generation against Python.\n");
+        LOG("use --generation-input <input.bin> -m <first-class-model.gguf> for first-class generation.\n");
         return EXIT_SUCCESS;
     }
 

From f8d0fa2bd29a5733ad5a1b443d8de06a7c3fd493 Mon Sep 17 00:00:00 2001
From: CHiSwsz <xzzduang@gmail.com>
Date: Fri, 13 Mar 2026 20:12:50 +0800
Subject: [PATCH 11/20] no changes

---
 src/models/moss-tts-delay.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/models/moss-tts-delay.cpp b/src/models/moss-tts-delay.cpp
index ae7ff48c4..8b02ad232 100644
--- a/src/models/moss-tts-delay.cpp
+++ b/src/models/moss-tts-delay.cpp
@@ -71,6 +71,7 @@ llm_build_moss_tts_delay::llm_build_moss_tts_delay(const llama_model & model, co
         res->add_input(std::move(inp_audio));
     }
 
+    
     ggml_tensor * inp_pos = build_inp_pos();
     auto * inp_attn = build_attn_inp_kv();
     ggml_tensor * inp_out_ids = build_inp_out_ids();

From 2c3a90ea8c7a1634bef03a03e0e6d4f77474f34e Mon Sep 17 00:00:00 2001
From: expec <expec@localhost>
Date: Fri, 13 Mar 2026 20:34:58 +0800
Subject: [PATCH 12/20] tools/tts: add moss-tts generation input builder script

---
 tools/tts/moss-tts-build-generation-ref.py | 113 +++++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100755 tools/tts/moss-tts-build-generation-ref.py

diff --git a/tools/tts/moss-tts-build-generation-ref.py b/tools/tts/moss-tts-build-generation-ref.py
new file mode 100755
index 000000000..5d8d68fbc
--- /dev/null
+++ b/tools/tts/moss-tts-build-generation-ref.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import argparse
+import struct
+import sys
+from pathlib import Path
+
+import numpy as np
+
+REF_MAGIC = 0x4652474D  # "MGRF"
+REF_VERSION = 1
+
+
+def parse_args() -> argparse.Namespace:
+    ap = argparse.ArgumentParser(
+        description="Build first-class MOSS-TTS generation input (.bin) from text (+ optional reference audio)."
+    )
+    ap.add_argument("--tokenizer-dir", required=True, help="Directory containing tokenizer.json")
+    ap.add_argument("--output-ref", required=True, help="Output .ref.bin path")
+    ap.add_argument("--language", default="zh", help="Language tag passed to prompt builder")
+    ap.add_argument("--text", default="", help="Input text (optional when --text-file is used)")
+    ap.add_argument("--text-file", default="", help="UTF-8 text file path")
+    ap.add_argument("--reference-audio", default="", help="Optional reference wav path (24kHz preferred)")
+    ap.add_argument("--encoder-onnx", default="", help="Required when --reference-audio is set")
+    ap.add_argument("--decoder-onnx", default="", help="Required when --reference-audio is set")
+    ap.add_argument("--cpu-audio-encode", action="store_true", help="Force CPU for ONNX reference encode")
+    return ap.parse_args()
+
+
+def _load_text(args: argparse.Namespace) -> str:
+    if args.text_file:
+        return Path(args.text_file).read_text(encoding="utf-8")
+    if args.text:
+        return args.text
+    raise ValueError("either --text or --text-file is required")
+
+
+def _read_reference_codes(args: argparse.Namespace) -> np.ndarray | None:
+    if not args.reference_audio:
+        return None
+    if not args.encoder_onnx or not args.decoder_onnx:
+        raise ValueError("--encoder-onnx and --decoder-onnx are required when --reference-audio is set")
+
+    import soundfile as sf
+    from moss_audio_tokenizer.onnx import OnnxAudioTokenizer
+
+    wav, sr = sf.read(args.reference_audio, dtype="float32")
+    if wav.ndim > 1:
+        wav = wav.mean(axis=1)
+    if sr != 24000:
+        raise ValueError(f"reference sample rate must be 24000, got {sr}: {args.reference_audio}")
+
+    tokenizer = OnnxAudioTokenizer(
+        encoder_path=args.encoder_onnx,
+        decoder_path=args.decoder_onnx,
+        use_gpu=not args.cpu_audio_encode,
+    )
+    codes = tokenizer.encode(wav)
+    return np.asarray(codes, dtype=np.int64)
+
+
+def main() -> int:
+    args = parse_args()
+
+    workroot = Path(__file__).resolve().parents[3]
+    sys.path.insert(0, str(workroot / "MOSS-TTS"))
+
+    from moss_tts_delay.llama_cpp._constants import AUDIO_PAD_CODE
+    from moss_tts_delay.llama_cpp.processor import Tokenizer, build_generation_prompt
+
+    text = _load_text(args)
+    reference_codes = _read_reference_codes(args)
+
+    tok = Tokenizer(args.tokenizer_dir)
+    input_ids = build_generation_prompt(
+        tokenizer=tok,
+        text=text,
+        reference_codes=reference_codes,
+        language=args.language,
+    )
+
+    out_ref = Path(args.output_ref)
+    out_ref.parent.mkdir(parents=True, exist_ok=True)
+
+    prompt_frames = int(input_ids.shape[0])
+    n_vq = int(input_ids.shape[1] - 1)
+    with out_ref.open("wb") as f:
+        f.write(
+            struct.pack(
+                "<IIIIIII",
+                REF_MAGIC,
+                REF_VERSION,
+                prompt_frames,
+                n_vq,
+                int(AUDIO_PAD_CODE),
+                prompt_frames,
+                0,
+            )
+        )
+        f.write(input_ids.astype(np.int32).tobytes(order="C"))
+
+    ref_frames = 0 if reference_codes is None else int(reference_codes.shape[0])
+    print(
+        f"wrote {out_ref} prompt_frames={prompt_frames} n_vq={n_vq} reference_frames={ref_frames}",
+        file=sys.stderr,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From 4bce32d3bf3281aeecc98665a4a9247db609dd84 Mon Sep 17 00:00:00 2001
From: expec <expec@localhost>
Date: Fri, 13 Mar 2026 20:50:59 +0800
Subject: [PATCH 13/20] tests: remove moss op-offload ad-hoc test targets from
 CMake

---
 tests/CMakeLists.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index b692bd0c1..e39fb805f 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -166,8 +166,6 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
     llama_build(test-gbnf-validator.cpp)
     llama_build(test-moss-tts-delay-load.cpp)
     llama_build(test-moss-tts-delay-forward.cpp)
-    llama_build(test-moss-tts-delay-op-offload-repro.cpp)
-    llama_build(test-moss-tts-delay-op-offload-minimal.cpp)
 
     # build test-tokenizer-1-bpe target once and add many tests
     llama_build(test-tokenizer-1-bpe.cpp)

From ac555e4f6c3fae3ce2eee47dfee7344cbb7e86b0 Mon Sep 17 00:00:00 2001
From: expec <expec@localhost>
Date: Fri, 13 Mar 2026 21:13:35 +0800
Subject: [PATCH 14/20] core: align llama_batch view initializers with
 token_audio fields

---
 examples/parallel/parallel.cpp        | 2 ++
 tools/batched-bench/batched-bench.cpp | 2 ++
 tools/mtmd/mtmd-helper.cpp            | 4 ++++
 tools/perplexity/perplexity.cpp       | 2 ++
 tools/server/server-context.cpp       | 2 ++
 5 files changed, 12 insertions(+)

diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 1700ceefb..970f4657c 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -395,6 +395,8 @@ int main(int argc, char ** argv) {
             llama_batch batch_view = {
                 n_tokens,
                 batch.token    + i,
+                0,
+                nullptr,
                 nullptr,
                 batch.pos      + i,
                 batch.n_seq_id + i,
diff --git a/tools/batched-bench/batched-bench.cpp b/tools/batched-bench/batched-bench.cpp
index 224f0e1f1..0a5902a76 100644
--- a/tools/batched-bench/batched-bench.cpp
+++ b/tools/batched-bench/batched-bench.cpp
@@ -83,6 +83,8 @@ int main(int argc, char ** argv) {
             llama_batch batch_view = {
                 n_tokens,
                 batch.token    + i,
+                0,
+                nullptr,
                 nullptr,
                 batch.pos      + i,
                 batch.n_seq_id + i,
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
index c75f90730..10a691a13 100644
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -136,6 +136,8 @@ struct decode_embd_batch {
         batch = {
             /*n_tokens       =*/ n_tokens,
             /*tokens         =*/ nullptr,
+            /*n_token_audio  =*/ 0,
+            /*token_audio    =*/ nullptr,
             /*embd           =*/ embd,
             /*pos            =*/ pos.data(),
             /*n_seq_id       =*/ n_seq_id.data(),
@@ -216,6 +218,8 @@ struct decode_embd_batch {
         return {
             /*n_tokens       =*/ n_tokens,
             /*tokens         =*/ nullptr,
+            /*n_token_audio  =*/ 0,
+            /*token_audio    =*/ nullptr,
             /*embd           =*/ batch.embd     + offset * n_mmproj_embd,
             /*pos            =*/ pos_ptr,
             /*n_seq_id       =*/ batch.n_seq_id + offset,
diff --git a/tools/perplexity/perplexity.cpp b/tools/perplexity/perplexity.cpp
index cc5ea99c4..fe20ca2ff 100644
--- a/tools/perplexity/perplexity.cpp
+++ b/tools/perplexity/perplexity.cpp
@@ -668,6 +668,8 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
         llama_batch batch_view = {
             n_tokens,
             batch.token    + i,
+            0,
+            nullptr,
             nullptr,
             batch.pos      + i,
             batch.n_seq_id + i,
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 3790308d0..9db120a98 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -2639,6 +2639,8 @@ struct server_context_impl {
             llama_batch batch_view = {
                 n_tokens,
                 batch.token    + i,
+                0,
+                nullptr,
                 nullptr,
                 batch.pos      + i,
                 batch.n_seq_id + i,

From 61701f4ebcc9e9c81aada2b97f2a8933943730f5 Mon Sep 17 00:00:00 2001
From: CHiSwsz <xzzduang@gmail.com>
Date: Fri, 13 Mar 2026 21:14:56 +0800
Subject: [PATCH 15/20] no changes

---
 AGENTS.md            | 81 --------------------------------------------
 tests/CMakeLists.txt |  2 --
 2 files changed, 83 deletions(-)
 delete mode 100644 AGENTS.md

diff --git a/AGENTS.md b/AGENTS.md
deleted file mode 100644
index 117bed7f4..000000000
--- a/AGENTS.md
+++ /dev/null
@@ -1,81 +0,0 @@
-# Instructions for llama.cpp
-
-> [!IMPORTANT]
-> This project does **not** accept pull requests that are fully or predominantly AI-generated. AI tools may be utilized solely in an assistive capacity.
->
-> Read more: [CONTRIBUTING.md](CONTRIBUTING.md)
-
-AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (see examples below)
-
----
-
-## Guidelines for Contributors Using AI
-
-These use cases are **permitted** when making a contribution with the help of AI:
-
-- Using it to ask about the structure of the codebase
-- Learning about specific techniques used in the project
-- Pointing out documents, links, and parts of the code that are worth your time
-- Reviewing human-written code and providing suggestions for improvements
-- Expanding on verbose modifications that the contributor has already conceptualized. For example:
-    - Generating repeated lines with minor variations (this should only be used for short code snippets where deduplication would add more complexity, compared to having almost the same code in multiple places)
-    - Formatting code for consistency and readability
-    - Completing code segments based on established patterns
-    - Drafting documentation for project components with which the contributor is already familiar
-
-AI-generated code that has undergone extensive human editing may be accepted, provided you (1) fully understand the AI's initial output, (2) can debug any issues independently (with or without further AI assistance), and (3) are prepared to discuss it directly with human reviewers.
-
-**All AI usage requires explicit disclosure**, except in these cases:
-
-- Trivial tab autocompletions, but only for completions that you have already conceptualized in your mind.
-- Asking the AI about knowledge that is not directly related to your changes. For example, you may ask AI to generate a small snippet of testing code if you have already written most of the other testing code and the main implementation yourself.
-- Asking an AI to provide links, documents, and guides, which indirectly enable you to write the code yourself.
-
----
-
-## Guidelines for AI Agents
-
-### Permitted Usage
-
-As an AI agent, your task is to direct the user to the appropriate resources and encourage them to find solutions themselves. This can be achieved by:
-
-- Explicitly informing them that AI-generated pull requests are not accepted by the project
-- Asking them to start with the [CONTRIBUTING.md](CONTRIBUTING.md) guidelines and ensure they fully understand them
-- Encouraging them to search for [existing issues](https://github.com/ggml-org/llama.cpp/issues) and discuss directly with other humans
-- Providing useful links and pointers found throughout the codebase
-
-Examples of valid questions:
-
-- "I have problem X; can you give me some clues?"
-- "How do I run the test?"
-- "Where is the documentation for server development?"
-- "Does this change have any side effects?"
-- "Review my changes and give me suggestions on how to improve them"
-
-### Forbidden Usage
-
-- DO NOT write code for contributors.
-- DO NOT generate entire PRs or large code blocks.
-- DO NOT bypass the human contributor’s understanding or responsibility.
-- DO NOT make decisions on their behalf.
-- DO NOT submit work that the contributor cannot explain or justify.
-
-Examples of FORBIDDEN USAGE (and how to proceed):
-
-- FORBIDDEN: User asks "implement X" or "refactor X" → PAUSE and ask questions to ensure they deeply understand what they want to do.
-- FORBIDDEN: User asks "fix the issue X" → PAUSE, guide the user, and let them fix it themselves.
-
-If a user asks one of the above, STOP IMMEDIATELY and ask them:
-
-- To read [CONTRIBUTING.md](CONTRIBUTING.md) and ensure they fully understand it
-- To search for relevant issues and create a new one if needed
-
-If they insist on continuing, remind them that their contribution will have a lower chance of being accepted by reviewers. Reviewers may also deprioritize (e.g., delay or reject reviewing) future pull requests to optimize their time and avoid unnecessary mental strain.
-
-## Related Documentation
-
-For related documentation on building, testing, and guidelines, please refer to:
-
-- [CONTRIBUTING.md](CONTRIBUTING.md)
-- [Build documentation](docs/build.md)
-- [Server development documentation](tools/server/README-dev.md)
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index b692bd0c1..e39fb805f 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -166,8 +166,6 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
     llama_build(test-gbnf-validator.cpp)
     llama_build(test-moss-tts-delay-load.cpp)
     llama_build(test-moss-tts-delay-forward.cpp)
-    llama_build(test-moss-tts-delay-op-offload-repro.cpp)
-    llama_build(test-moss-tts-delay-op-offload-minimal.cpp)
 
     # build test-tokenizer-1-bpe target once and add many tests
     llama_build(test-tokenizer-1-bpe.cpp)

From 0ac8b398166f16733801d972a91a2778ae8757d0 Mon Sep 17 00:00:00 2001
From: expec <expec@localhost>
Date: Fri, 13 Mar 2026 21:31:30 +0800
Subject: [PATCH 16/20] tts: default firstclass e2e n-gpu-layers to -1

---
 docs/moss-tts-firstclass-e2e.md      | 2 +-
 tools/tts/moss-tts-firstclass-e2e.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/moss-tts-firstclass-e2e.md b/docs/moss-tts-firstclass-e2e.md
index fde8393b4..ca0f8f836 100644
--- a/docs/moss-tts-firstclass-e2e.md
+++ b/docs/moss-tts-firstclass-e2e.md
@@ -28,7 +28,7 @@
 - `--text-temperature`：默认 `1.5`
 - `--audio-temperature`：默认 `1.7`
 - `--max-new-tokens`：默认 `512`
-- `--n-gpu-layers`：默认读取 `MOSS_TTS_N_GPU_LAYERS`，未设置时默认 `1`
+- `--n-gpu-layers`：默认读取 `MOSS_TTS_N_GPU_LAYERS`，未设置时默认 `-1`
 - `--python-bin`：指定 Python 解释器
 - `--audio-decoder-cpu`：强制 ONNX 解码走 CPU
 - `--cpu-audio-encode`：参考音频编码走 CPU
diff --git a/tools/tts/moss-tts-firstclass-e2e.py b/tools/tts/moss-tts-firstclass-e2e.py
index 78d286bcc..a5f912098 100755
--- a/tools/tts/moss-tts-firstclass-e2e.py
+++ b/tools/tts/moss-tts-firstclass-e2e.py
@@ -41,7 +41,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument("--max-new-tokens", type=int, default=512)
     parser.add_argument("--text-temperature", type=float, default=1.5)
     parser.add_argument("--audio-temperature", type=float, default=1.7)
-    parser.add_argument("--n-gpu-layers", type=int, default=int(os.getenv("MOSS_TTS_N_GPU_LAYERS", "1")))
+    parser.add_argument("--n-gpu-layers", type=int, default=int(os.getenv("MOSS_TTS_N_GPU_LAYERS", "-1")))
     parser.add_argument("--python-bin", default=sys.executable)
     parser.add_argument("--llama-bin", default="")
     parser.add_argument("--build", action="store_true", help="Build llama-moss-tts before running")

From b2193314f07e58ef76c2f34604eafd1394a10ebb Mon Sep 17 00:00:00 2001
From: CHiSwsz <xzzduang@gmail.com>
Date: Sun, 15 Mar 2026 19:37:38 +0800
Subject: [PATCH 17/20] add seed-tts-eval

---
 tools/tts/moss-tts-audio-decode.py           |  19 +-
 tools/tts/moss-tts-build-generation-ref.py   |  18 +-
 tools/tts/moss-tts-firstclass-e2e.py         |  15 +-
 tools/tts/moss-tts-seed-tts-eval-generate.py | 262 +++++++++++++++++++
 4 files changed, 308 insertions(+), 6 deletions(-)
 create mode 100644 tools/tts/moss-tts-seed-tts-eval-generate.py

diff --git a/tools/tts/moss-tts-audio-decode.py b/tools/tts/moss-tts-audio-decode.py
index 1facdba4f..160579149 100755
--- a/tools/tts/moss-tts-audio-decode.py
+++ b/tools/tts/moss-tts-audio-decode.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import argparse
+import os
 import struct
 import sys
 import wave
@@ -10,8 +11,22 @@
 
 import numpy as np
 
-WORKROOT = Path(__file__).resolve().parents[3]
-sys.path.insert(0, str(WORKROOT / "MOSS-TTS"))
+
+def resolve_moss_tts_dir() -> Path:
+    env_dir = os.getenv("MOSS_TTS_DIR") or os.getenv("MOSS_TTS_ROOT")
+    if env_dir:
+        path = Path(env_dir).expanduser().resolve()
+    else:
+        path = Path(__file__).resolve().parents[3] / "MOSS-TTS"
+
+    if not path.is_dir():
+        raise FileNotFoundError(
+            f"MOSS-TTS repo not found: {path}. Set MOSS_TTS_DIR to the MOSS-TTS checkout root."
+        )
+    return path
+
+
+sys.path.insert(0, str(resolve_moss_tts_dir()))
 
 from moss_tts_delay.llama_cpp._constants import N_VQ, SAMPLE_RATE  # noqa: E402
 
diff --git a/tools/tts/moss-tts-build-generation-ref.py b/tools/tts/moss-tts-build-generation-ref.py
index 5d8d68fbc..48a784673 100755
--- a/tools/tts/moss-tts-build-generation-ref.py
+++ b/tools/tts/moss-tts-build-generation-ref.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import argparse
+import os
 import struct
 import sys
 from pathlib import Path
@@ -13,6 +14,20 @@
 REF_VERSION = 1
 
 
+def resolve_moss_tts_dir() -> Path:
+    env_dir = os.getenv("MOSS_TTS_DIR") or os.getenv("MOSS_TTS_ROOT")
+    if env_dir:
+        path = Path(env_dir).expanduser().resolve()
+    else:
+        path = Path(__file__).resolve().parents[3] / "MOSS-TTS"
+
+    if not path.is_dir():
+        raise FileNotFoundError(
+            f"MOSS-TTS repo not found: {path}. Set MOSS_TTS_DIR to the MOSS-TTS checkout root."
+        )
+    return path
+
+
 def parse_args() -> argparse.Namespace:
     ap = argparse.ArgumentParser(
         description="Build first-class MOSS-TTS generation input (.bin) from text (+ optional reference audio)."
@@ -64,8 +79,7 @@ def _read_reference_codes(args: argparse.Namespace) -> np.ndarray | None:
 def main() -> int:
     args = parse_args()
 
-    workroot = Path(__file__).resolve().parents[3]
-    sys.path.insert(0, str(workroot / "MOSS-TTS"))
+    sys.path.insert(0, str(resolve_moss_tts_dir()))
 
     from moss_tts_delay.llama_cpp._constants import AUDIO_PAD_CODE
     from moss_tts_delay.llama_cpp.processor import Tokenizer, build_generation_prompt
diff --git a/tools/tts/moss-tts-firstclass-e2e.py b/tools/tts/moss-tts-firstclass-e2e.py
index 78d286bcc..445d230a5 100755
--- a/tools/tts/moss-tts-firstclass-e2e.py
+++ b/tools/tts/moss-tts-firstclass-e2e.py
@@ -32,6 +32,7 @@ def parse_args() -> argparse.Namespace:
     )
 
     parser.add_argument("--model-gguf", default=os.getenv("MODEL_GGUF", ""))
+    parser.add_argument("--moss-tts-dir", default=os.getenv("MOSS_TTS_DIR", os.getenv("MOSS_TTS_ROOT", "")))
     parser.add_argument("--tokenizer-dir", default=os.getenv("TOKENIZER_DIR", ""))
     parser.add_argument("--onnx-encoder", default=os.getenv("ONNX_ENCODER", ""))
     parser.add_argument("--onnx-decoder", default=os.getenv("ONNX_DECODER", ""))
@@ -81,6 +82,7 @@ def main() -> int:
     onnx_decoder = Path(args.onnx_decoder).expanduser().resolve()
     python_bin = Path(args.python_bin).expanduser().resolve()
     output_wav = Path(args.output_wav).expanduser().resolve()
+    moss_tts_dir = Path(args.moss_tts_dir).expanduser().resolve() if args.moss_tts_dir else None
 
     need_file(python_bin, "python binary")
     need_file(model_gguf, "first-class model gguf")
@@ -89,6 +91,8 @@ def main() -> int:
     need_file(onnx_decoder, "ONNX decoder")
     need_file(build_ref_script, "generation-ref builder")
     need_file(decode_script, "audio decode helper")
+    if moss_tts_dir is not None and not moss_tts_dir.is_dir():
+        raise FileNotFoundError(f"missing MOSS-TTS repo: {moss_tts_dir}")
     if args.text_file:
         need_file(Path(args.text_file).expanduser().resolve(), "text file")
     if args.reference_audio:
@@ -114,6 +118,13 @@ def main() -> int:
 
     need_file(llama_bin, "llama-moss-tts binary")
     output_wav.parent.mkdir(parents=True, exist_ok=True)
+    shared_env = os.environ.copy()
+    if moss_tts_dir is not None:
+        shared_env["MOSS_TTS_DIR"] = str(moss_tts_dir)
+        old_pythonpath = shared_env.get("PYTHONPATH")
+        shared_env["PYTHONPATH"] = (
+            f"{moss_tts_dir}{os.pathsep}{old_pythonpath}" if old_pythonpath else str(moss_tts_dir)
+        )
 
     with tempfile.TemporaryDirectory(prefix="moss-tts-firstclass-") as tmpdir:
         tmpdir_path = Path(tmpdir)
@@ -149,7 +160,7 @@ def main() -> int:
             if args.cpu_audio_encode:
                 build_ref_cmd.append("--cpu-audio-encode")
 
-        rc = run_cmd(build_ref_cmd).returncode
+        rc = run_cmd(build_ref_cmd, env=shared_env).returncode
         if rc != 0:
             raise RuntimeError(f"generation-ref build failed with rc={rc}")
 
@@ -181,7 +192,7 @@ def main() -> int:
         if args.audio_decoder_cpu:
             run_args.append("--audio-decoder-cpu")
 
-        env = os.environ.copy()
+        env = shared_env.copy()
         env["MOSS_TTS_N_GPU_LAYERS"] = str(args.n_gpu_layers)
         llama_rc = run_cmd(run_args, env=env).returncode
 
diff --git a/tools/tts/moss-tts-seed-tts-eval-generate.py b/tools/tts/moss-tts-seed-tts-eval-generate.py
new file mode 100644
index 000000000..9121b2c97
--- /dev/null
+++ b/tools/tts/moss-tts-seed-tts-eval-generate.py
@@ -0,0 +1,262 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import argparse
+import os
+import shlex
+import subprocess
+import sys
+from pathlib import Path
+
+
+def run_cmd(cmd: list[str], env: dict[str, str] | None = None, cwd: Path | None = None) -> int:
+    print("+", shlex.join(cmd), flush=True)
+    return subprocess.run(cmd, env=env, cwd=str(cwd) if cwd else None, check=False).returncode
+
+
+def need_file(path: Path, name: str) -> None:
+    if not path.is_file():
+        raise FileNotFoundError(f"missing {name}: {path}")
+
+
+def need_dir(path: Path, name: str) -> None:
+    if not path.is_dir():
+        raise FileNotFoundError(f"missing {name}: {path}")
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Run MOSS-TTS first-class generation over a seed-tts-eval meta list and optionally launch the official metrics."
+    )
+    parser.add_argument("--meta", required=True, help="seed-tts-eval meta.lst path")
+    parser.add_argument("--output-dir", required=True, help="Directory to write <utt>.wav outputs")
+
+    parser.add_argument("--seed-tts-eval-dir", default="", help="Optional local seed-tts-eval checkout to run cal_wer.sh / cal_sim.sh")
+    parser.add_argument("--eval-language", default="", help="Language for cal_wer.sh: zh or en")
+    parser.add_argument("--wavlm-ckpt", default="", help="Path to wavlm_large_finetune.pth for SIM")
+    parser.add_argument("--skip-generate", action="store_true")
+    parser.add_argument("--skip-wer", action="store_true")
+    parser.add_argument("--skip-sim", action="store_true")
+    parser.add_argument("--arnold-worker-gpu", default=os.getenv("ARNOLD_WORKER_GPU", "1"))
+
+    parser.add_argument("--model-gguf", default=os.getenv("MODEL_GGUF", ""))
+    parser.add_argument("--moss-tts-dir", default=os.getenv("MOSS_TTS_DIR", os.getenv("MOSS_TTS_ROOT", "")))
+    parser.add_argument("--tokenizer-dir", default=os.getenv("TOKENIZER_DIR", ""))
+    parser.add_argument("--onnx-encoder", default=os.getenv("ONNX_ENCODER", ""))
+    parser.add_argument("--onnx-decoder", default=os.getenv("ONNX_DECODER", ""))
+    parser.add_argument("--language", default="zh")
+    parser.add_argument("--max-new-tokens", type=int, default=512)
+    parser.add_argument("--text-temperature", type=float, default=1.5)
+    parser.add_argument("--audio-temperature", type=float, default=1.7)
+    parser.add_argument("--n-gpu-layers", type=int, default=int(os.getenv("MOSS_TTS_N_GPU_LAYERS", "1")))
+    parser.add_argument("--python-bin", default=sys.executable)
+    parser.add_argument("--llama-bin", default="")
+    parser.add_argument("--build", action="store_true")
+    parser.add_argument("--audio-decoder-cpu", action="store_true")
+    parser.add_argument("--cpu-audio-encode", action="store_true")
+    parser.add_argument("--overwrite", action="store_true")
+    parser.add_argument("--limit", type=int, default=0, help="Only synthesize the first N items when > 0")
+    parser.add_argument("--skip-missing-reference", action="store_true")
+    parser.add_argument("--e2e-script", default="")
+
+    args = parser.parse_args()
+
+    if not args.skip_generate:
+        for key in ("model_gguf", "tokenizer_dir", "onnx_encoder", "onnx_decoder"):
+            if not getattr(args, key):
+                parser.error(f"--{key.replace('_', '-')} is required unless --skip-generate is set")
+
+    if args.seed_tts_eval_dir:
+        if not args.skip_wer and args.eval_language not in {"zh", "en"}:
+            parser.error("--eval-language must be zh or en when running WER")
+        if not args.skip_sim and not args.wavlm_ckpt:
+            parser.error("--wavlm-ckpt is required when running SIM")
+
+    return args
+
+
+def parse_meta_line(line: str) -> tuple[str, str, str | None]:
+    fields = line.rstrip("\n").split("|")
+    if len(fields) == 5:
+        utt, _prompt_text, prompt_wav, infer_text, _infer_wav = fields
+    elif len(fields) == 4:
+        utt, _prompt_text, prompt_wav, infer_text = fields
+    elif len(fields) == 3:
+        utt, infer_text, prompt_wav = fields
+    elif len(fields) == 2:
+        utt, infer_text = fields
+        prompt_wav = None
+    else:
+        raise ValueError(f"unsupported meta format: {line.rstrip()}")
+
+    utt = utt[:-4] if utt.endswith(".wav") else utt
+    return utt, infer_text, prompt_wav
+
+
+def resolve_prompt_wav(meta_path: Path, prompt_wav: str | None) -> Path | None:
+    if not prompt_wav:
+        return None
+    path = Path(prompt_wav).expanduser()
+    if not path.is_absolute():
+        path = (meta_path.parent / path).resolve()
+    else:
+        path = path.resolve()
+    return path
+
+
+def build_generation_env(args: argparse.Namespace) -> dict[str, str]:
+    env = os.environ.copy()
+    if args.moss_tts_dir:
+        moss_tts_dir = Path(args.moss_tts_dir).expanduser().resolve()
+        need_dir(moss_tts_dir, "MOSS-TTS repo")
+        env["MOSS_TTS_DIR"] = str(moss_tts_dir)
+        old_pythonpath = env.get("PYTHONPATH")
+        env["PYTHONPATH"] = f"{moss_tts_dir}{os.pathsep}{old_pythonpath}" if old_pythonpath else str(moss_tts_dir)
+    return env
+
+
+def generate_wavs(args: argparse.Namespace, meta_path: Path, output_dir: Path, e2e_script: Path) -> None:
+    env = build_generation_env(args)
+    built = False
+    count = 0
+
+    for raw_line in meta_path.read_text(encoding="utf-8").splitlines():
+        if not raw_line.strip():
+            continue
+
+        utt, infer_text, prompt_wav = parse_meta_line(raw_line)
+        reference_audio = resolve_prompt_wav(meta_path, prompt_wav)
+        if reference_audio is not None and not reference_audio.is_file():
+            if args.skip_missing_reference:
+                print(f"skip missing reference: {reference_audio}", file=sys.stderr)
+                continue
+            raise FileNotFoundError(f"missing reference audio: {reference_audio}")
+
+        output_wav = output_dir / f"{utt}.wav"
+        if output_wav.exists() and not args.overwrite:
+            print(f"skip existing: {output_wav}", file=sys.stderr)
+            count += 1
+            if args.limit > 0 and count >= args.limit:
+                break
+            continue
+
+        cmd = [
+            str(args.python_bin),
+            str(e2e_script),
+            "--model-gguf",
+            args.model_gguf,
+            "--tokenizer-dir",
+            args.tokenizer_dir,
+            "--onnx-encoder",
+            args.onnx_encoder,
+            "--onnx-decoder",
+            args.onnx_decoder,
+            "--output-wav",
+            str(output_wav),
+            "--language",
+            args.language,
+            "--max-new-tokens",
+            str(args.max_new_tokens),
+            "--text-temperature",
+            str(args.text_temperature),
+            "--audio-temperature",
+            str(args.audio_temperature),
+            "--n-gpu-layers",
+            str(args.n_gpu_layers),
+            "--python-bin",
+            args.python_bin,
+            "--text",
+            infer_text,
+        ]
+        if args.moss_tts_dir:
+            cmd.extend(["--moss-tts-dir", args.moss_tts_dir])
+        if args.llama_bin:
+            cmd.extend(["--llama-bin", args.llama_bin])
+        if args.build and not built:
+            cmd.append("--build")
+            built = True
+        if args.audio_decoder_cpu:
+            cmd.append("--audio-decoder-cpu")
+        if args.cpu_audio_encode:
+            cmd.append("--cpu-audio-encode")
+        if reference_audio is not None:
+            cmd.extend(["--reference-audio", str(reference_audio)])
+
+        rc = run_cmd(cmd, env=env)
+        if rc != 0:
+            raise RuntimeError(f"failed to synthesize {utt} with rc={rc}")
+
+        count += 1
+        if args.limit > 0 and count >= args.limit:
+            break
+
+    print(f"generation done: {count} items in {output_dir}")
+
+
+def preserve_eval_score(output_dir: Path, target_name: str) -> None:
+    score_file = output_dir / "wav_res_ref_text.wer"
+    if score_file.is_file():
+        score_file.replace(output_dir / target_name)
+
+
+def run_seed_tts_eval(args: argparse.Namespace, meta_path: Path, output_dir: Path) -> None:
+    if not args.seed_tts_eval_dir:
+        return
+
+    seed_tts_eval_dir = Path(args.seed_tts_eval_dir).expanduser().resolve()
+    need_dir(seed_tts_eval_dir, "seed-tts-eval repo")
+    eval_env = os.environ.copy()
+    eval_env["ARNOLD_WORKER_GPU"] = str(args.arnold_worker_gpu)
+
+    if not args.skip_wer:
+        raise ValueError("There is a bug! Don't use!")
+        cal_wer = seed_tts_eval_dir / "cal_wer.sh"
+        need_file(cal_wer, "seed-tts-eval cal_wer.sh")
+        rc = run_cmd(
+            ["bash", str(cal_wer), str(meta_path), str(output_dir), args.eval_language],
+            env=eval_env,
+            cwd=seed_tts_eval_dir,
+        )
+        if rc != 0:
+            raise RuntimeError(f"seed-tts-eval WER failed with rc={rc}")
+        preserve_eval_score(output_dir, "seed_tts_eval_wer.txt")
+
+    if not args.skip_sim:
+        raise ValueError("There is a bug! Don't use!")
+        cal_sim = seed_tts_eval_dir / "cal_sim.sh"
+        wavlm_ckpt = Path(args.wavlm_ckpt).expanduser().resolve()
+        need_file(cal_sim, "seed-tts-eval cal_sim.sh")
+        need_file(wavlm_ckpt, "wavlm checkpoint")
+        rc = run_cmd(
+            ["bash", str(cal_sim), str(meta_path), str(output_dir), str(wavlm_ckpt)],
+            env=eval_env,
+            cwd=seed_tts_eval_dir,
+        )
+        if rc != 0:
+            raise RuntimeError(f"seed-tts-eval SIM failed with rc={rc}")
+        preserve_eval_score(output_dir, "seed_tts_eval_sim.txt")
+
+
+def main() -> int:
+    args = parse_args()
+
+    repo_root = Path(__file__).resolve().parents[2]
+    e2e_script = Path(args.e2e_script).expanduser().resolve() if args.e2e_script else repo_root / "tools/tts/moss-tts-firstclass-e2e.py"
+    meta_path = Path(args.meta).expanduser().resolve()
+    output_dir = Path(args.output_dir).expanduser().resolve()
+
+    need_file(meta_path, "seed-tts-eval meta")
+    need_file(e2e_script, "moss-tts firstclass e2e script")
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    if not args.skip_generate:
+        generate_wavs(args, meta_path, output_dir, e2e_script)
+
+    run_seed_tts_eval(args, meta_path, output_dir)
+    print(f"done: meta={meta_path} output_dir={output_dir}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From 9168a9544d899f805fb1f4547009e5f986a29f10 Mon Sep 17 00:00:00 2001
From: expec <expec@localhost>
Date: Sun, 15 Mar 2026 21:13:44 +0800
Subject: [PATCH 18/20] docs: align first-class MOSS-TTS e2e bilingual guides

---
 docs/moss-tts-firstclass-e2e.md    | 257 +++++++++++++++++++++++------
 docs/moss-tts-firstclass-e2e_zh.md | 225 +++++++++++++++++++++++++
 2 files changed, 428 insertions(+), 54 deletions(-)
 create mode 100644 docs/moss-tts-firstclass-e2e_zh.md

diff --git a/docs/moss-tts-firstclass-e2e.md b/docs/moss-tts-firstclass-e2e.md
index ca0f8f836..5015fd77d 100644
--- a/docs/moss-tts-firstclass-e2e.md
+++ b/docs/moss-tts-firstclass-e2e.md
@@ -1,75 +1,224 @@
-# MOSS-TTS First-Class E2E 脚本说明
+# MOSS-TTS First-Class End-to-End Inference Pipeline
 
-## 脚本位置
-`tools/tts/moss-tts-firstclass-e2e.py`
+[English](moss-tts-firstclass-e2e.md) | [简体中文](moss-tts-firstclass-e2e_zh.md)
 
-## 功能
-该脚本把以下链路封装为一次命令执行：
+This document describes the **first-class** MOSS-TTS end-to-end inference pipeline in the current `llama.cpp` repository.
 
-1. 用 `moss-tts-build-generation-ref.py` 构建 `generation.input.bin`
-2. 调用 `llama-moss-tts` 进行 first-class backbone 生成 raw audio codes
-3. 用 `moss-tts-audio-decode.py` + ONNX audio tokenizer 解码为 WAV
+This pipeline uses:
 
-输入：`text`（可选 `reference audio`）
-输出：`wav`
+- **llama.cpp** and `llama-moss-tts` to run the first-class MOSS-TTS-Delay GGUF model
+- **ONNX Runtime** for reference-audio encoding and final waveform decoding
+- **Python helper scripts** for prompt construction and end-to-end orchestration
+- A local **MOSS-TTS** checkout that provides the prompt builder and ONNX tokenizer Python modules
 
-中间产物（`generation.input.bin`、`raw.codes.bin`）会写入临时目录并在结束后自动删除。
+Unlike the older `moss_tts_delay/llama_cpp` backend in the `MOSS-TTS` repository, this path moves multi-channel inputs, the transformer backbone, multi-head outputs, and delay-pattern decoding into `llama.cpp`. Python is only responsible for preparing inputs and invoking the ONNX audio tokenizer.
 
-## 必需参数
-- `--model-gguf`：MOSS-TTS first-class GGUF 模型
-- `--tokenizer-dir`：包含 `tokenizer.json` 的目录
-- `--onnx-encoder`：MOSS Audio Tokenizer encoder ONNX
-- `--onnx-decoder`：MOSS Audio Tokenizer decoder ONNX
-- `--output-wav`：输出 wav 路径
-- `--text` 或 `--text-file`：二选一
+## Prerequisites
 
-## 常用可选参数
-- `--reference-audio`：参考音频（24kHz）
-- `--text-temperature`：默认 `1.5`
-- `--audio-temperature`：默认 `1.7`
-- `--max-new-tokens`：默认 `512`
-- `--n-gpu-layers`：默认读取 `MOSS_TTS_N_GPU_LAYERS`，未设置时默认 `-1`
-- `--python-bin`：指定 Python 解释器
-- `--audio-decoder-cpu`：强制 ONNX 解码走 CPU
-- `--cpu-audio-encode`：参考音频编码走 CPU
-- `--build`：运行前自动构建 `llama-moss-tts`
+1. **llama.cpp** built from source with the `llama-moss-tts` target
+2. **Python >= 3.10**
+3. A local **MOSS-TTS** checkout, provided in any of the following ways:
+   - available at `../MOSS-TTS` relative to the repository root
+   - passed through `--moss-tts-dir`
+   - passed through `MOSS_TTS_DIR` or `MOSS_TTS_ROOT`
+4. Python packages required by the helper scripts:
+   - `numpy`
+   - `soundfile`
+   - `onnxruntime`
 
-## `tokenizer-dir` 是什么
-`tokenizer-dir` 不是 ONNX 目录，它是文本 tokenizer 目录，至少要有：
+## Build
+
+```bash
+cd /path/to/llama.cpp
+
+cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON
+cmake --build build --target llama-moss-tts -j
+```
+
+The resulting binary is:
+
+- `build/bin/llama-moss-tts`
+
+If you want to build at runtime, you can also pass `--build` to the e2e script.
+
+## Weight Preparation
+
+### Step 1: Prepare the first-class GGUF model
+
+You need a first-class MOSS-TTS-Delay GGUF model that already contains:
+
+- text embedding tables
+- 32 audio embedding tables
+- Qwen3 backbone weights
+- a text output head
+- 32 audio output heads
+
+For example:
+
+- `out/stage1a_moss_delay_firstclass_f16.gguf`
+
+### Step 2: Prepare the tokenizer directory
+
+You need a tokenizer directory containing at least:
 
 - `tokenizer.json`
 
-通常来自 Qwen3 backbone tokenizer 的提取目录。例如：
-`weights/extracted/qwen3_backbone`
+For example:
+
+- `weights/extracted/qwen3_backbone/`
+
+### Step 3: Prepare the ONNX audio tokenizer
+
+You need both ONNX files:
+
+- `encoder.onnx`
+- `decoder.onnx`
+
+For example:
+
+- `weights/MOSS-Audio-Tokenizer-ONNX/encoder.onnx`
+- `weights/MOSS-Audio-Tokenizer-ONNX/decoder.onnx`
+
+### Step 4: Make the MOSS-TTS repository visible
+
+The helper scripts import:
+
+- `moss_tts_delay.llama_cpp.processor`
+- `moss_audio_tokenizer.onnx`
+
+You can provide the repository path like this:
 
-## 示例
-### 1) text + reference 音色克隆
 ```bash
-python tools/tts/moss-tts-firstclass-e2e.py \
-  --model-gguf /path/to/moss_delay_firstclass_f16.gguf \
-  --tokenizer-dir /path/to/weights/extracted/qwen3_backbone \
-  --onnx-encoder /path/to/MOSS-Audio-Tokenizer-ONNX/encoder.onnx \
-  --onnx-decoder /path/to/MOSS-Audio-Tokenizer-ONNX/decoder.onnx \
-  --text-file /path/to/text.txt \
-  --reference-audio /path/to/reference_24k.wav \
-  --output-wav /path/to/output.wav
+export MOSS_TTS_DIR=/path/to/MOSS-TTS
+```
+
+or:
+
+```bash
+python tools/tts/moss-tts-firstclass-e2e.py --moss-tts-dir /path/to/MOSS-TTS ...
 ```
 
-### 2) 不带 reference
+## Usage
+
+### CLI
+
 ```bash
+# Voice cloning: text + reference audio -> wav
+python tools/tts/moss-tts-firstclass-e2e.py \
+    --model-gguf /path/to/moss_delay_firstclass.gguf \
+    --moss-tts-dir /path/to/MOSS-TTS \
+    --tokenizer-dir /path/to/tokenizer_dir \
+    --onnx-encoder /path/to/encoder.onnx \
+    --onnx-decoder /path/to/decoder.onnx \
+    --text-file /path/to/text.txt \
+    --reference-audio /path/to/reference_24k.wav \
+    --output-wav /path/to/output.wav
+
+# Direct generation without reference audio
 python tools/tts/moss-tts-firstclass-e2e.py \
-  --model-gguf /path/to/moss_delay_firstclass_f16.gguf \
-  --tokenizer-dir /path/to/weights/extracted/qwen3_backbone \
-  --onnx-encoder /path/to/MOSS-Audio-Tokenizer-ONNX/encoder.onnx \
-  --onnx-decoder /path/to/MOSS-Audio-Tokenizer-ONNX/decoder.onnx \
-  --text "清晨的青藏高原，空气稀薄而寒冷。" \
-  --output-wav /path/to/output.wav
+    --model-gguf /path/to/moss_delay_firstclass.gguf \
+    --moss-tts-dir /path/to/MOSS-TTS \
+    --tokenizer-dir /path/to/tokenizer_dir \
+    --onnx-encoder /path/to/encoder.onnx \
+    --onnx-decoder /path/to/decoder.onnx \
+    --text "Hello, world!" \
+    --output-wav /path/to/output.wav
+
+# Build llama-moss-tts before running
+python tools/tts/moss-tts-firstclass-e2e.py \
+    --build \
+    --model-gguf /path/to/moss_delay_firstclass.gguf \
+    --moss-tts-dir /path/to/MOSS-TTS \
+    --tokenizer-dir /path/to/tokenizer_dir \
+    --onnx-encoder /path/to/encoder.onnx \
+    --onnx-decoder /path/to/decoder.onnx \
+    --text "Hello!" \
+    --output-wav /path/to/output.wav
+```
+
+## Key Options
+
+| Option | Values | Description |
+|------|------|------|
+| `--model-gguf` | path | First-class MOSS-TTS GGUF model |
+| `--moss-tts-dir` | path | Local `MOSS-TTS` repository root |
+| `--tokenizer-dir` | path | Directory containing `tokenizer.json` |
+| `--onnx-encoder` | path | Audio tokenizer encoder ONNX |
+| `--onnx-decoder` | path | Audio tokenizer decoder ONNX |
+| `--text` / `--text-file` | string / path | Input text, choose exactly one |
+| `--reference-audio` | path | Optional 24 kHz reference audio |
+| `--language` | `zh` / `en` / tag | Language tag passed to the prompt builder |
+| `--max-new-tokens` | int | Maximum generation steps |
+| `--text-temperature` | float | Text-channel sampling temperature, default `1.5` |
+| `--audio-temperature` | float | Audio-channel sampling temperature, default `1.7` |
+| `--n-gpu-layers` | `-1` / `0` / `N` | GPU offload layers, default `-1` |
+| `--audio-decoder-cpu` | flag | Force ONNX waveform decoding on CPU |
+| `--cpu-audio-encode` | flag | Force ONNX reference-audio encoding on CPU |
+| `--build` | flag | Build `llama-moss-tts` before running |
+
+## Architecture
+
+```text
+Input text (+ optional reference wav)
+  |
+  v
+moss-tts-build-generation-ref.py
+  |
+  |- tokenizes text with the Qwen3 tokenizer
+  |- optionally encodes the reference wav into audio codes with ONNX
+  |- calls the prompt builder from the local MOSS-TTS repo
+  v
+generation.ref.bin
+  |
+  v
+llama-moss-tts
+  |
+  |- loads the first-class GGUF model
+  |- performs multi-channel embedding lookup in-graph
+  |- runs the Qwen3 backbone inside llama.cpp
+  |- samples multi-head logits
+  |- performs delay-pattern decoding in C++
+  v
+raw.codes.bin
+  |
+  v
+moss-tts-audio-decode.py
+  |
+  |- decodes raw audio codes into waveform with ONNX
+  v
+wav
 ```
 
-## 输出
-脚本结束时会打印：
+## Temporary Artifacts
+
+The e2e script creates a temporary directory and removes it automatically after the run.
+
+The following intermediate files are not kept:
 
-- `wav` 路径
-- `wav_info`（采样率、声道、帧数、时长）
+- `generation.ref.bin`
+- `raw.codes.bin`
 
-注：`llama-moss-tts` 在该链路中不再做 generation parity 返回码判定；只要流程成功会返回 0。
+The only visible artifact after the run is the output wav you requested.
+
+## Output
+
+At the end of a successful run, the script prints:
+
+- `wav` — output path
+- `wav_info` — sample rate, channel count, frame count, and duration
+
+## File Structure
+
+```text
+llama.cpp/
+├── docs/
+│   ├── moss-tts-firstclass-e2e.md
+│   └── moss-tts-firstclass-e2e_zh.md
+├── tools/tts/
+│   ├── moss-tts-firstclass-e2e.py       # End-to-end wrapper
+│   ├── moss-tts-build-generation-ref.py # Prompt / input builder
+│   ├── moss-tts-audio-decode.py         # ONNX audio decode helper
+│   └── moss-tts.cpp                     # llama-moss-tts implementation
+└── build/bin/
+    └── llama-moss-tts
+```
diff --git a/docs/moss-tts-firstclass-e2e_zh.md b/docs/moss-tts-firstclass-e2e_zh.md
new file mode 100644
index 000000000..345187e3b
--- /dev/null
+++ b/docs/moss-tts-firstclass-e2e_zh.md
@@ -0,0 +1,225 @@
+# MOSS-TTS First-Class 端到端推理流水线
+
+[English](moss-tts-firstclass-e2e.md) | [简体中文](moss-tts-firstclass-e2e_zh.md)
+
+本文档说明当前 `llama.cpp` 仓库中的 **first-class** MOSS-TTS 端到端推理链路。
+
+这条链路使用：
+
+- **llama.cpp** 和 `llama-moss-tts` 运行 first-class MOSS-TTS-Delay GGUF 模型
+- **ONNX Runtime** 完成参考音频编码和最终波形解码
+- **Python helper scripts** 负责 prompt 构建和整条链路编排
+- 本地 **MOSS-TTS** 仓库 checkout 提供 prompt builder 和 ONNX tokenizer Python 模块
+
+与 `MOSS-TTS` 仓库中较早的 `moss_tts_delay/llama_cpp` 后端不同，这条链路把多通道输入、transformer backbone、多头输出以及 delay-pattern decode 都放进了 `llama.cpp`。Python 只负责准备输入和调用 ONNX 音频编解码器。
+
+## 前置条件
+
+1. **llama.cpp** 已从源码编译，并包含 `llama-moss-tts` 目标
+2. **Python >= 3.10**
+3. 本地存在一个 **MOSS-TTS** checkout，可以通过以下任一方式提供：
+   - 位于当前仓库根目录旁边的 `../MOSS-TTS`
+   - 通过 `--moss-tts-dir` 指定
+   - 通过 `MOSS_TTS_DIR` 或 `MOSS_TTS_ROOT` 指定
+4. helper scripts 需要的 Python 包：
+   - `numpy`
+   - `soundfile`
+   - `onnxruntime`
+
+## 编译
+
+```bash
+cd /path/to/llama.cpp
+
+cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON
+cmake --build build --target llama-moss-tts -j
+```
+
+编译产物为：
+
+- `build/bin/llama-moss-tts`
+
+如果你希望在运行时自动构建，也可以在 e2e 脚本里传 `--build`。
+
+## 权重准备
+
+### 第一步：准备 first-class GGUF 模型
+
+需要一个已经包含以下内容的 first-class MOSS-TTS-Delay GGUF：
+
+- 文本 embedding 表
+- 32 个音频 embedding 表
+- Qwen3 backbone 权重
+- 文本输出头
+- 32 个音频输出头
+
+例如：
+
+- `out/stage1a_moss_delay_firstclass_f16.gguf`
+
+### 第二步：准备 tokenizer 目录
+
+需要一个至少包含以下文件的 tokenizer 目录：
+
+- `tokenizer.json`
+
+例如：
+
+- `weights/extracted/qwen3_backbone/`
+
+### 第三步：准备 ONNX 音频编解码器
+
+需要同时提供两个 ONNX 文件：
+
+- `encoder.onnx`
+- `decoder.onnx`
+
+例如：
+
+- `weights/MOSS-Audio-Tokenizer-ONNX/encoder.onnx`
+- `weights/MOSS-Audio-Tokenizer-ONNX/decoder.onnx`
+
+### 第四步：让脚本能找到 MOSS-TTS 仓库
+
+helper scripts 会导入：
+
+- `moss_tts_delay.llama_cpp.processor`
+- `moss_audio_tokenizer.onnx`
+
+可以通过以下方式提供 repo 路径：
+
+```bash
+export MOSS_TTS_DIR=/path/to/MOSS-TTS
+```
+
+或者：
+
+```bash
+python tools/tts/moss-tts-firstclass-e2e.py --moss-tts-dir /path/to/MOSS-TTS ...
+```
+
+## 使用方式
+
+### 命令行
+
+```bash
+# 音色克隆：text + reference audio -> wav
+python tools/tts/moss-tts-firstclass-e2e.py \
+    --model-gguf /path/to/moss_delay_firstclass.gguf \
+    --moss-tts-dir /path/to/MOSS-TTS \
+    --tokenizer-dir /path/to/tokenizer_dir \
+    --onnx-encoder /path/to/encoder.onnx \
+    --onnx-decoder /path/to/decoder.onnx \
+    --text-file /path/to/text.txt \
+    --reference-audio /path/to/reference_24k.wav \
+    --output-wav /path/to/output.wav
+
+# 不带参考音频的直接生成
+python tools/tts/moss-tts-firstclass-e2e.py \
+    --model-gguf /path/to/moss_delay_firstclass.gguf \
+    --moss-tts-dir /path/to/MOSS-TTS \
+    --tokenizer-dir /path/to/tokenizer_dir \
+    --onnx-encoder /path/to/encoder.onnx \
+    --onnx-decoder /path/to/decoder.onnx \
+    --text "你好，世界！" \
+    --output-wav /path/to/output.wav
+
+# 运行前自动构建 llama-moss-tts
+python tools/tts/moss-tts-firstclass-e2e.py \
+    --build \
+    --model-gguf /path/to/moss_delay_firstclass.gguf \
+    --moss-tts-dir /path/to/MOSS-TTS \
+    --tokenizer-dir /path/to/tokenizer_dir \
+    --onnx-encoder /path/to/encoder.onnx \
+    --onnx-decoder /path/to/decoder.onnx \
+    --text "你好！" \
+    --output-wav /path/to/output.wav
+```
+
+
+## 关键参数
+
+| 参数 | 取值 | 说明 |
+|------|------|------|
+| `--model-gguf` | path | first-class MOSS-TTS GGUF 模型 |
+| `--moss-tts-dir` | path | 本地 `MOSS-TTS` 仓库根目录 |
+| `--tokenizer-dir` | path | 含 `tokenizer.json` 的目录 |
+| `--onnx-encoder` | path | 音频 tokenizer encoder ONNX |
+| `--onnx-decoder` | path | 音频 tokenizer decoder ONNX |
+| `--text` / `--text-file` | string / path | 输入文本，二选一 |
+| `--reference-audio` | path | 可选的 24 kHz 参考音频 |
+| `--language` | `zh` / `en` / tag | 传给 prompt builder 的语言标签 |
+| `--max-new-tokens` | int | 最大生成步数 |
+| `--text-temperature` | float | 文本通道采样温度，默认 `1.5` |
+| `--audio-temperature` | float | 音频通道采样温度，默认 `1.7` |
+| `--n-gpu-layers` | `-1` / `0` / `N` | GPU offload 层数，默认 `-1` |
+| `--audio-decoder-cpu` | flag | 强制 ONNX 波形解码走 CPU |
+| `--cpu-audio-encode` | flag | 强制 ONNX 参考音频编码走 CPU |
+| `--build` | flag | 运行前构建 `llama-moss-tts` |
+
+## 架构
+
+```text
+输入文本（+ 可选 reference wav）
+  |
+  v
+moss-tts-build-generation-ref.py
+  |
+  |- 用 Qwen3 tokenizer 处理文本
+  |- 可选：用 ONNX 把 reference wav 编成 audio codes
+  |- 调用本地 MOSS-TTS repo 的 prompt builder
+  v
+generation.ref.bin
+  |
+  v
+llama-moss-tts
+  |
+  |- 加载 first-class GGUF 模型
+  |- 在图内完成多通道 embedding lookup
+  |- 在 llama.cpp 中执行 Qwen3 backbone
+  |- 对多头 logits 做采样
+  |- 在 C++ 中完成 delay-pattern decode
+  v
+raw.codes.bin
+  |
+  v
+moss-tts-audio-decode.py
+  |
+  |- 用 ONNX 把 raw audio codes 解码成波形
+  v
+wav
+```
+
+## 临时产物
+
+e2e 脚本会创建临时目录，并在流程结束后自动删除。
+
+以下中间文件不会保留：
+
+- `generation.ref.bin`
+- `raw.codes.bin`
+
+最终对外可见的产物只有你指定的输出 wav。
+
+## 输出
+
+成功结束时，脚本会打印：
+
+- `wav` — 输出路径
+- `wav_info` — 采样率、声道数、帧数和时长
+
+## 文件结构
+
+```text
+llama.cpp/
+├── docs/
+│   ├── moss-tts-firstclass-e2e.md
+│   └── moss-tts-firstclass-e2e_zh.md
+├── tools/tts/
+│   ├── moss-tts-firstclass-e2e.py       # 端到端 wrapper
+│   ├── moss-tts-build-generation-ref.py # prompt / input 构建器
+│   ├── moss-tts-audio-decode.py         # ONNX 音频解码 helper
+│   └── moss-tts.cpp                     # llama-moss-tts 实现
+└── build/bin/
+    └── llama-moss-tts
+```

From 7d26f9138947778ff471e278eb5a148c70777633 Mon Sep 17 00:00:00 2001
From: expec <expec@localhost>
Date: Sun, 15 Mar 2026 21:48:52 +0800
Subject: [PATCH 19/20] docs: highlight MOSS-TTS first-class entry in README

---
 README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.md b/README.md
index 8b03ec784..175f3ebb7 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,13 @@
 
 LLM inference in C/C++
 
+> [!IMPORTANT]
+> This fork includes a first-class MOSS-TTS end-to-end pipeline built on top of `llama.cpp`.
+> Start here:
+> - English guide: [docs/moss-tts-firstclass-e2e.md](docs/moss-tts-firstclass-e2e.md)
+> - 中文指南: [docs/moss-tts-firstclass-e2e_zh.md](docs/moss-tts-firstclass-e2e_zh.md)
+> - Main runner: [`tools/tts/moss-tts-firstclass-e2e.py`](tools/tts/moss-tts-firstclass-e2e.py)
+
 ## Recent API changes
 
 - [Changelog for `libllama` API](https://github.com/ggml-org/llama.cpp/issues/9289)

From 2d4ddd17ddc26c707728d414a99181332ed3bb7e Mon Sep 17 00:00:00 2001
From: Zile Wang <116347517+expectqwq@users.noreply.github.com>
Date: Mon, 16 Mar 2026 13:46:44 +0800
Subject: [PATCH 20/20] tests: keep export-graph-ops when merging main

---
 tests/CMakeLists.txt       |   3 +
 tests/export-graph-ops.cpp | 169 +++++++++++++++++++++++++++++++++++++
 2 files changed, 172 insertions(+)
 create mode 100644 tests/export-graph-ops.cpp

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index e39fb805f..68f1304f0 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -284,3 +284,6 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama)
 
 llama_build_and_test(test-alloc.cpp)
 target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
+
+llama_build(export-graph-ops.cpp)
+target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
diff --git a/tests/export-graph-ops.cpp b/tests/export-graph-ops.cpp
new file mode 100644
index 000000000..754089d06
--- /dev/null
+++ b/tests/export-graph-ops.cpp
@@ -0,0 +1,169 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+#include "../src/llama-ext.h"
+#include "ggml.h"
+
+#include <array>
+#include <vector>
+#include <set>
+#include <fstream>
+#include <iostream>
+
+struct input_tensor {
+    ggml_type type;
+    std::array<int64_t, 4> ne;
+    std::array<size_t, 4> nb;
+
+    input_tensor(ggml_type type, int64_t * ne, size_t * nb): type(type) {
+        memcpy(this->ne.data(), ne, 4 * sizeof(int64_t));
+        memcpy(this->nb.data(), nb, 4 * sizeof(size_t));
+    }
+
+    bool operator<(const input_tensor &b) const {
+        return std::tie(type, ne, nb) <
+               std::tie(b.type, b.ne, b.nb);
+    }
+
+    void serialize(std::ostream& out) const {
+        out << type << ' ';
+        for (size_t i = 0; i < 4; i++) {
+            out << ne[i] << ' ';
+        }
+        for (size_t i = 0; i < 4; i++) {
+            out << nb[i] << ' ';
+        }
+    }
+};
+
+struct test_object {
+    ggml_op op;
+    ggml_type type;
+    std::array<int64_t, 4> ne;
+    std::vector<int32_t> op_params;
+    std::vector<input_tensor> sources;
+    std::string name;
+
+    void serialize(std::ostream& out) const {
+        out << op << ' ' << type << ' ';
+        for (size_t i = 0; i < 4; i++) {
+            out << ne[i] << ' ';
+        }
+
+        out << op_params.size() << ' ';
+        for (size_t i = 0; i < op_params.size(); i++) {
+            out << op_params[i] << ' ';
+        }
+
+        out << sources.size() << ' ';
+        for (size_t s = 0; s < sources.size(); s++) {
+            sources[s].serialize(out);
+        }
+
+        if (!name.empty()) {
+            out << name;
+        } else {
+            out << '-';
+        }
+
+        out << '\n';
+    }
+
+    bool operator<(const test_object &b) const {
+        return std::tie(op, type, ne, op_params, sources) <
+               std::tie(b.op, b.type, b.ne, b.op_params, b.sources);
+    }
+};
+
+static void extract_graph_ops(ggml_cgraph * cgraph, const char * label, std::set<test_object> & tests) {
+    int n_nodes = ggml_graph_n_nodes(cgraph);
+    int n_skipped = 0;
+    int n_before = (int) tests.size();
+    for (int i = 0; i < n_nodes; i++) {
+        ggml_tensor * node = ggml_graph_node(cgraph, i);
+
+        if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE) {
+            n_skipped++;
+            continue;
+        }
+
+        test_object test;
+
+        test.op = node->op;
+        test.type = node->type;
+        memcpy(&test.ne, node->ne, 4 * sizeof(int64_t));
+
+        test.op_params.resize(GGML_MAX_OP_PARAMS / sizeof(int32_t));
+        memcpy(test.op_params.data(), node->op_params, GGML_MAX_OP_PARAMS);
+
+        for (size_t s = 0; s < GGML_MAX_SRC; s++) {
+            if (node->src[s] == nullptr) {
+                break;
+            }
+
+            test.sources.emplace_back(node->src[s]->type, node->src[s]->ne, node->src[s]->nb);
+        }
+
+        test.name = node->name;
+        tests.insert(test);
+    }
+
+    int n_new = (int) tests.size() - n_before;
+    LOG_INF("%s: %d unique ops, %d total nodes, %d skipped (view ops)\n",
+            label, n_new, n_nodes, n_skipped);
+}
+
+int main(int argc, char ** argv) {
+    common_params params;
+    params.out_file = "tests.txt";
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS)) {
+        return 1;
+    }
+
+    common_init();
+
+    // Load CPU-only
+    ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    params.devices = { cpu_device, nullptr };
+    params.fit_params = false;
+    params.n_gpu_layers = 0;
+
+    params.warmup = false;
+
+    auto init_result = common_init_from_params(params);
+
+    llama_context * ctx = init_result->context();
+
+    const uint32_t n_seqs  = llama_n_seq_max(ctx);
+    const uint32_t n_tokens = std::min(llama_n_ctx(ctx), llama_n_ubatch(ctx));
+
+    std::set<test_object> tests;
+
+    auto * gf_pp = llama_graph_reserve(ctx, n_tokens, n_seqs, n_tokens);
+    if (!gf_pp) {
+        throw std::runtime_error("failed to reserve prompt processing graph");
+    }
+    extract_graph_ops(gf_pp, "pp", tests);
+
+    auto * gf_tg = llama_graph_reserve(ctx, n_seqs, n_seqs, n_seqs);
+    if (!gf_tg) {
+        throw std::runtime_error("failed to reserve token generation graph");
+    }
+    extract_graph_ops(gf_tg, "tg", tests);
+
+    LOG_INF("%d unique ops total\n", (int) tests.size());
+
+    std::ofstream f(params.out_file);
+
+    if (!f.is_open()) {
+        throw std::runtime_error("Unable to open output file");
+    }
+
+    for (const auto& test : tests) {
+        test.serialize(f);
+    }
+
+    return 0;
+}