@@ -3012,7 +3012,9 @@ namespace chatllm::qwen::v3_asr
30123012 class ChatHistoryEncoder : public v1 ::ChatHistoryEncoder
30133013 {
30143014 public:
3015+ typedef v1::ChatHistoryEncoder Base;
30153016 void append_user (int round_idx, const Content &user, std::vector<int > &ids) const override ;
3017+ void append_ai_opening (int round_idx, std::vector<int > &ids) const override ;
30163018 protected:
30173019 void load_audio (const Content &user) const ;
30183020 public:
@@ -3028,9 +3030,13 @@ namespace chatllm::qwen::v3_asr
30283030 Tokenizer (const BaseConfig &config);
30293031 Tokenizer (const BaseConfig &config, BaseHistoryEncoder *encoder);
30303032 void add_tokens (const std::map<std::string, int > &added_tokens);
3033+ std::string normalize_lang (const std::string &s) const ;
30313034 public:
30323035 int asr_text_token_id;
30333036 int timestamp_token_id;
3037+ std::string language = " auto" ;
3038+ std::string format = " srt" ;
3039+ std::string delimiter = " " ;
30343040 };
30353041
30363042 Tokenizer::Tokenizer (const BaseConfig &config):
@@ -3056,7 +3062,7 @@ namespace chatllm::qwen::v3_asr
30563062 im_start_token_id = get_or_def (" <|im_start|>" );
30573063 im_end_token_id = get_or_def (" <|im_end|>" );
30583064 asr_text_token_id = get_or_def (" <asr_text>" );
3059- timestamp_token_id = get_or_def (" <asr_text >" );
3065+ timestamp_token_id = get_or_def (" <timestamp >" );
30603066 tp->OverrideTokenDecoding (asr_text_token_id, " <asr_text>" );
30613067 }
30623068
@@ -3068,14 +3074,17 @@ namespace chatllm::qwen::v3_asr
30683074 ModelType type = ModelType::MODEL_TYPE_QWEN3_ASR, bool skip_lm_head = false );
30693075 bool load_more (const json::JSON &config) override ;
30703076 void load (ModelLoader &loader) override ;
3077+ void set_additional_args (const std::map<std::string, std::string> &args) override ;
30713078 void before_generate (const GenerationConfig &gen_config) override ;
30723079 void set_tokenizer (BaseTokenizer *tokenizer) override ;
3080+ std::string normalize_lang (const std::string &s) const ;
30733081 public:
30743082 v3::audio_tower::AudioEmbeddingGeneration audio;
30753083 private:
30763084 const int extended_vocab_size;
30773085 std::map<std::string, int > added_tokens;
30783086 bool aud_loaded = false ;
3087+ std::map<std::string, std::string> support_languages;
30793088 };
30803089
30813090 ConditionalGeneration::ConditionalGeneration (const Config &config, const RuntimeConfig &runtime_config,
@@ -3100,6 +3109,20 @@ namespace chatllm::qwen::v3_asr
31003109 enc->aud_config = &audio.config ;
31013110 }
31023111
3112+ std::string ConditionalGeneration::normalize_lang (const std::string &s) const
3113+ {
3114+ auto l = utils::to_lower (s);
3115+ return support_languages.count (l) > 0 ? support_languages.find (l)->second : " " ;
3116+ }
3117+
3118+ void ConditionalGeneration::set_additional_args (const std::map<std::string, std::string> &args)
3119+ {
3120+ Tokenizer *tok = dynamic_cast <Tokenizer *>(tokenizer);
3121+ tok->language = normalize_lang (utils::get_opt (args, " language" , tok->language ));
3122+ tok->format = utils::to_lower (utils::get_opt (args, " format" , tok->format ));
3123+ tok->delimiter = utils::get_opt (args, " delimiter" , tok->delimiter );
3124+ }
3125+
31033126 bool ConditionalGeneration::load_more (const json::JSON &config)
31043127 {
31053128 Tokenizer *tok = dynamic_cast <Tokenizer *>(tokenizer);
@@ -3113,6 +3136,13 @@ namespace chatllm::qwen::v3_asr
31133136 added_tokens.insert_or_assign (t, (int )std::atoi (kv.first .c_str ()));
31143137 }
31153138
3139+ tok_cfg = config[" config.json" ][" support_languages" ];
3140+ if (!tok_cfg.IsArray ()) return false ;
3141+ for (auto &ele : tok_cfg.ArrayRange ())
3142+ {
3143+ support_languages.insert_or_assign (utils::to_lower (ele.ToString ()), ele.ToString ());
3144+ }
3145+
31163146 bool r = audio.load_more (this ->config .dtype , this ->config .hidden_size , config);
31173147 if (r)
31183148 {
@@ -3197,6 +3227,17 @@ namespace chatllm::qwen::v3_asr
31973227 ids.push_back (tok->im_end_token_id );
31983228 ids.push_back (tok->nl_token_id );
31993229 }
3230+
3231+ void ChatHistoryEncoder::append_ai_opening (int round_idx, std::vector<int > &ids) const
3232+ {
3233+ Tokenizer *tok = dynamic_cast <Tokenizer *>(tokenizer);
3234+ Base::append_ai_opening (round_idx, ids);
3235+
3236+ if (tok->language .size () > 0 )
3237+ {
3238+ tok->encode (" language " + tok->language , ids);
3239+ }
3240+ }
32003241}
32013242
32023243namespace chatllm ::qwen::v3_forcedaligner
@@ -3209,6 +3250,10 @@ namespace chatllm::qwen::v3_forcedaligner
32093250 class ChatHistoryEncoder : public v3_asr ::ChatHistoryEncoder
32103251 {
32113252 public:
3253+ void append_sys_prompt (std::vector<int > &ids) const override {}
3254+ void append_ai (int round_idx, const std::string &ai, std::vector<int > &ids) const override {}
3255+ void append_ai_opening (int round_idx, std::vector<int > &ids) const override {}
3256+ void append_user_opening (int round_idx, std::vector<int > &ids) const override {}
32123257 void append_user (int round_idx, const Content &user, std::vector<int > &ids) const override ;
32133258 };
32143259
@@ -3237,9 +3282,6 @@ namespace chatllm::qwen::v3_forcedaligner
32373282 protected:
32383283 std::string fmt_time (double timestamp);
32393284 public:
3240- std::string language = " chinese" ;
3241- std::string format = " srt" ;
3242- std::string delimiter = " " ;
32433285 int pos_first_timestamp_token = 0 ;
32443286 std::vector<word_seg> cleaned_words;
32453287 std::vector<std::string> sentences; // timestamp is reported for each "sentence".
@@ -3249,6 +3291,7 @@ namespace chatllm::qwen::v3_forcedaligner
32493291 Tokenizer::Tokenizer (const BaseConfig &config):
32503292 v3_asr::Tokenizer (config, &_chat_encoder)
32513293 {
3294+ language = " Chinese" ;
32523295 }
32533296
32543297 class ConditionalGeneration : public v3_asr ::ConditionalGeneration
@@ -3257,7 +3300,6 @@ namespace chatllm::qwen::v3_forcedaligner
32573300 typedef v3_asr::ConditionalGeneration Base;
32583301 ConditionalGeneration (const Config &config, const RuntimeConfig &runtime_config,
32593302 ModelType type = (ModelType)MODEL_TYPE_QWEN3_ForcedAligner);
3260- void set_additional_args (const std::map<std::string, std::string> &args) override ;
32613303 bool load_more (const json::JSON &config) override ;
32623304 std::vector<int > generate (const std::vector<int > &input_ids, const GenerationConfig &gen_config,
32633305 const bool continuous,
@@ -3277,14 +3319,6 @@ namespace chatllm::qwen::v3_forcedaligner
32773319 transformer->lm_head = create_lm_head (&w_ctx_, config.hidden_size , config.classify_num );
32783320 }
32793321
3280- void ConditionalGeneration::set_additional_args (const std::map<std::string, std::string> &args)
3281- {
3282- Tokenizer *tok = dynamic_cast <Tokenizer *>(tokenizer);
3283- tok->language = utils::to_lower (utils::get_opt (args, " language" , tok->language ));
3284- tok->format = utils::to_lower (utils::get_opt (args, " format" , tok->format ));
3285- tok->delimiter = utils::get_opt (args, " delimiter" , tok->delimiter );
3286- }
3287-
32883322 bool ConditionalGeneration::load_more (const json::JSON &config)
32893323 {
32903324 auto r = Base::load_more (config);
@@ -3618,7 +3652,7 @@ namespace chatllm::qwen::v3_forcedaligner
36183652 if (cleaned.size () < 1 ) continue ;
36193653
36203654 std::vector<std::vector<uint32_t >> words32;
3621- if (" chinese " == language)
3655+ if (" Chinese " == language)
36223656 split_cjk (cleaned, words32);
36233657 else
36243658 words32.push_back (cleaned);
@@ -3638,7 +3672,6 @@ namespace chatllm::qwen::v3_forcedaligner
36383672 tok->sentences .clear ();
36393673
36403674 load_audio (user);
3641- tok->encode (" user" , ids, true , false , true );
36423675 tok->inject_audio_ids (ids, tok->vocab_size , tok->get_image_total_emb_vectors ());
36433676
36443677 if (tok->delimiter .size () > 0 )
@@ -3647,7 +3680,7 @@ namespace chatllm::qwen::v3_forcedaligner
36473680 }
36483681 else
36493682 {
3650- if (" chinese " == tok->language )
3683+ if (" Chinese " == tok->language )
36513684 {
36523685 std::vector<std::string> l;
36533686 utils::split (user.extract_text (" " ), l);
@@ -3680,9 +3713,6 @@ namespace chatllm::qwen::v3_forcedaligner
36803713
36813714 tok->pos_first_timestamp_token = (int )ids.size ();
36823715 tok->inject_words (tok->cleaned_words , ids);
3683-
3684- ids.push_back (tok->im_end_token_id );
3685- ids.push_back (tok->nl_token_id );
36863716 }
36873717}
36883718
0 commit comments