Skip to content

Commit d414c63

Browse files
committed
support Qwen3-ForceAligner
1 parent 1c6a067 commit d414c63

4 files changed

Lines changed: 80 additions & 19 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ LittleAcademia[<a href="https://github.com/foldl/little-academia" style="text-
3333

3434
**What's New:**
3535

36+
* 2026-02-01: Qwen3-ForcedAligner
3637
* 2026-01-31: Qwen3-ASR
3738
* 2026-01-21: Step3-VL
3839
* 2026-01-20: GLM-4.7-Flash

docs/models.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -448,6 +448,16 @@ Please use `--format completion` for these models.
448448
* Qwen3-ASR (`Qwen3ASRForConditionalGeneration`)
449449
* [x] [0.6B](https://huggingface.co/Qwen/Qwen3-ASR-0.6B/tree/5eb144179a02acc5e5ba31e748d22b0cf3e303b0), [1.7B](https://huggingface.co/Qwen/Qwen3-ASR-1.7B/tree/7278e1e70fe206f11671096ffdd38061171dd6e5)
450450

451+
Additional options (Use `--set X Y` to change values):
452+
* `language`: default "auto".
453+
454+
* [x] [ForcedAligner-0.6B](https://huggingface.co/Qwen/Qwen3-ForcedAligner-0.6B/tree/c7cbfc2048c462b0d63a45797104fc9db3ad62b7)
455+
456+
Additional options (Use `--set X Y` to change values):
457+
* `language`: default "Chinese". This affects how sentences are split into words: for Chinese, each character is a "word"; for other languages, words are separated by spaces.
458+
* `delimiter`: default "". Timestamps are reported per "sentence", and sentences are separated by this delimiter. For Chinese, when the delimiter is empty, each character is treated as a sentence.
459+
* `format`: default "srt". The output format; "srt" and "json" are supported.
460+
451461
## RAG Models
452462

453463
### Text Embedding

models/qwen.cpp

Lines changed: 49 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3012,7 +3012,9 @@ namespace chatllm::qwen::v3_asr
30123012
class ChatHistoryEncoder : public v1::ChatHistoryEncoder
30133013
{
30143014
public:
3015+
typedef v1::ChatHistoryEncoder Base;
30153016
void append_user(int round_idx, const Content &user, std::vector<int> &ids) const override;
3017+
void append_ai_opening(int round_idx, std::vector<int> &ids) const override;
30163018
protected:
30173019
void load_audio(const Content &user) const;
30183020
public:
@@ -3028,9 +3030,13 @@ namespace chatllm::qwen::v3_asr
30283030
Tokenizer(const BaseConfig &config);
30293031
Tokenizer(const BaseConfig &config, BaseHistoryEncoder *encoder);
30303032
void add_tokens(const std::map<std::string, int> &added_tokens);
3033+
std::string normalize_lang(const std::string &s) const;
30313034
public:
30323035
int asr_text_token_id;
30333036
int timestamp_token_id;
3037+
std::string language = "auto";
3038+
std::string format = "srt";
3039+
std::string delimiter = "";
30343040
};
30353041

30363042
Tokenizer::Tokenizer(const BaseConfig &config):
@@ -3056,7 +3062,7 @@ namespace chatllm::qwen::v3_asr
30563062
im_start_token_id = get_or_def("<|im_start|>");
30573063
im_end_token_id = get_or_def("<|im_end|>");
30583064
asr_text_token_id = get_or_def("<asr_text>");
3059-
timestamp_token_id = get_or_def("<asr_text>");
3065+
timestamp_token_id = get_or_def("<timestamp>");
30603066
tp->OverrideTokenDecoding(asr_text_token_id, "<asr_text>");
30613067
}
30623068

@@ -3068,14 +3074,17 @@ namespace chatllm::qwen::v3_asr
30683074
ModelType type = ModelType::MODEL_TYPE_QWEN3_ASR, bool skip_lm_head = false);
30693075
bool load_more(const json::JSON &config) override;
30703076
void load(ModelLoader &loader) override;
3077+
void set_additional_args(const std::map<std::string, std::string> &args) override;
30713078
void before_generate(const GenerationConfig &gen_config) override;
30723079
void set_tokenizer(BaseTokenizer *tokenizer) override;
3080+
std::string normalize_lang(const std::string &s) const;
30733081
public:
30743082
v3::audio_tower::AudioEmbeddingGeneration audio;
30753083
private:
30763084
const int extended_vocab_size;
30773085
std::map<std::string, int> added_tokens;
30783086
bool aud_loaded = false;
3087+
std::map<std::string, std::string> support_languages;
30793088
};
30803089

30813090
ConditionalGeneration::ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config,
@@ -3100,6 +3109,20 @@ namespace chatllm::qwen::v3_asr
31003109
enc->aud_config = &audio.config;
31013110
}
31023111

3112+
std::string ConditionalGeneration::normalize_lang(const std::string &s) const
3113+
{
3114+
auto l = utils::to_lower(s);
3115+
return support_languages.count(l) > 0 ? support_languages.find(l)->second : "";
3116+
}
3117+
3118+
void ConditionalGeneration::set_additional_args(const std::map<std::string, std::string> &args)
3119+
{
3120+
Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
3121+
tok->language = normalize_lang(utils::get_opt(args, "language", tok->language));
3122+
tok->format = utils::to_lower(utils::get_opt(args, "format", tok->format));
3123+
tok->delimiter = utils::get_opt(args, "delimiter", tok->delimiter);
3124+
}
3125+
31033126
bool ConditionalGeneration::load_more(const json::JSON &config)
31043127
{
31053128
Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
@@ -3113,6 +3136,13 @@ namespace chatllm::qwen::v3_asr
31133136
added_tokens.insert_or_assign(t, (int)std::atoi(kv.first.c_str()));
31143137
}
31153138

3139+
tok_cfg = config["config.json"]["support_languages"];
3140+
if (!tok_cfg.IsArray()) return false;
3141+
for (auto &ele : tok_cfg.ArrayRange())
3142+
{
3143+
support_languages.insert_or_assign(utils::to_lower(ele.ToString()), ele.ToString());
3144+
}
3145+
31163146
bool r = audio.load_more(this->config.dtype, this->config.hidden_size, config);
31173147
if (r)
31183148
{
@@ -3197,6 +3227,17 @@ namespace chatllm::qwen::v3_asr
31973227
ids.push_back(tok->im_end_token_id);
31983228
ids.push_back(tok->nl_token_id);
31993229
}
3230+
3231+
void ChatHistoryEncoder::append_ai_opening(int round_idx, std::vector<int> &ids) const
3232+
{
3233+
Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
3234+
Base::append_ai_opening(round_idx, ids);
3235+
3236+
if (tok->language.size() > 0)
3237+
{
3238+
tok->encode("language " + tok->language, ids);
3239+
}
3240+
}
32003241
}
32013242

32023243
namespace chatllm::qwen::v3_forcedaligner
@@ -3209,6 +3250,10 @@ namespace chatllm::qwen::v3_forcedaligner
32093250
class ChatHistoryEncoder : public v3_asr::ChatHistoryEncoder
32103251
{
32113252
public:
3253+
void append_sys_prompt(std::vector<int> &ids) const override {}
3254+
void append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const override {}
3255+
void append_ai_opening(int round_idx, std::vector<int> &ids) const override {}
3256+
void append_user_opening(int round_idx, std::vector<int> &ids) const override {}
32123257
void append_user(int round_idx, const Content &user, std::vector<int> &ids) const override;
32133258
};
32143259

@@ -3237,9 +3282,6 @@ namespace chatllm::qwen::v3_forcedaligner
32373282
protected:
32383283
std::string fmt_time(double timestamp);
32393284
public:
3240-
std::string language = "chinese";
3241-
std::string format = "srt";
3242-
std::string delimiter = "";
32433285
int pos_first_timestamp_token = 0;
32443286
std::vector<word_seg> cleaned_words;
32453287
std::vector<std::string> sentences; // timestamp is reported for each "sentence".
@@ -3249,6 +3291,7 @@ namespace chatllm::qwen::v3_forcedaligner
32493291
Tokenizer::Tokenizer(const BaseConfig &config):
32503292
v3_asr::Tokenizer(config, &_chat_encoder)
32513293
{
3294+
language = "Chinese";
32523295
}
32533296

32543297
class ConditionalGeneration : public v3_asr::ConditionalGeneration
@@ -3257,7 +3300,6 @@ namespace chatllm::qwen::v3_forcedaligner
32573300
typedef v3_asr::ConditionalGeneration Base;
32583301
ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config,
32593302
ModelType type = (ModelType)MODEL_TYPE_QWEN3_ForcedAligner);
3260-
void set_additional_args(const std::map<std::string, std::string> &args) override;
32613303
bool load_more(const json::JSON &config) override;
32623304
std::vector<int> generate(const std::vector<int> &input_ids, const GenerationConfig &gen_config,
32633305
const bool continuous,
@@ -3277,14 +3319,6 @@ namespace chatllm::qwen::v3_forcedaligner
32773319
transformer->lm_head = create_lm_head(&w_ctx_, config.hidden_size, config.classify_num);
32783320
}
32793321

3280-
void ConditionalGeneration::set_additional_args(const std::map<std::string, std::string> &args)
3281-
{
3282-
Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
3283-
tok->language = utils::to_lower(utils::get_opt(args, "language", tok->language));
3284-
tok->format = utils::to_lower(utils::get_opt(args, "format", tok->format));
3285-
tok->delimiter = utils::get_opt(args, "delimiter", tok->delimiter);
3286-
}
3287-
32883322
bool ConditionalGeneration::load_more(const json::JSON &config)
32893323
{
32903324
auto r = Base::load_more(config);
@@ -3618,7 +3652,7 @@ namespace chatllm::qwen::v3_forcedaligner
36183652
if (cleaned.size() < 1) continue;
36193653

36203654
std::vector<std::vector<uint32_t>> words32;
3621-
if ("chinese" == language)
3655+
if ("Chinese" == language)
36223656
split_cjk(cleaned, words32);
36233657
else
36243658
words32.push_back(cleaned);
@@ -3638,7 +3672,6 @@ namespace chatllm::qwen::v3_forcedaligner
36383672
tok->sentences.clear();
36393673

36403674
load_audio(user);
3641-
tok->encode("user", ids, true, false, true);
36423675
tok->inject_audio_ids(ids, tok->vocab_size, tok->get_image_total_emb_vectors());
36433676

36443677
if (tok->delimiter.size() > 0)
@@ -3647,7 +3680,7 @@ namespace chatllm::qwen::v3_forcedaligner
36473680
}
36483681
else
36493682
{
3650-
if ("chinese" == tok->language)
3683+
if ("Chinese" == tok->language)
36513684
{
36523685
std::vector<std::string> l;
36533686
utils::split(user.extract_text(" "), l);
@@ -3680,9 +3713,6 @@ namespace chatllm::qwen::v3_forcedaligner
36803713

36813714
tok->pos_first_timestamp_token = (int)ids.size();
36823715
tok->inject_words(tok->cleaned_words, ids);
3683-
3684-
ids.push_back(tok->im_end_token_id);
3685-
ids.push_back(tok->nl_token_id);
36863716
}
36873717
}
36883718

scripts/models.json

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4000,5 +4000,25 @@
40004000
}
40014001
}
40024002
}
4003+
},
4004+
"qwen3-forcedaligner": {
4005+
"brief": "Qwen3-ForcedAligner-0.6B supports timestamp prediction for arbitrary units within up to 5 minutes of speech in 11 languages.",
4006+
"default": "0.6b",
4007+
"license": "Apache License 2.0",
4008+
"variants": {
4009+
"0.6b": {
4010+
"default": "q8",
4011+
"quantized": {
4012+
"q8": {
4013+
"size": 984439424,
4014+
"url": "chatllm_quantized_qwen3/qwen3-focedaligner-0.6b.bin"
4015+
},
4016+
"f16": {
4017+
"size": 1840552064,
4018+
"url": "chatllm_quantized_qwen3/qwen3-focedaligner-0.6b-f16.bin"
4019+
}
4020+
}
4021+
}
4022+
}
40034023
}
40044024
}

0 commit comments

Comments
 (0)