diff --git a/src/maxtext/checkpoint_conversion/utils/hf_model_configs.py b/src/maxtext/checkpoint_conversion/utils/hf_model_configs.py index 7e827352b4..41ed261626 100644 --- a/src/maxtext/checkpoint_conversion/utils/hf_model_configs.py +++ b/src/maxtext/checkpoint_conversion/utils/hf_model_configs.py @@ -261,6 +261,22 @@ torch_dtype="bfloat16", ) +qwen3_1_7b_config = transformers.Qwen3Config( + vocab_size=151936, + hidden_size=2048, + intermediate_size=6144, + num_hidden_layers=28, + num_attention_heads=16, + num_key_value_heads=8, + head_dim=128, + hidden_act="silu", + max_position_embeddings=40960, + rms_norm_eps=1.0e-6, + rope_theta=1000000.0, + tie_word_embeddings=True, + torch_dtype="bfloat16", +) + qwen3_4b_config = transformers.Qwen3Config( vocab_size=151936, hidden_size=2560, @@ -853,6 +869,7 @@ "qwen2.5-7b": qwen25_7b_config, "qwen2.5-14b": qwen25_14b_config, "qwen3-0.6b": qwen3_0_6b_config, + "qwen3-1.7b": qwen3_1_7b_config, "qwen3-4b": qwen3_4b_config, "qwen3-4b-thinking-2507": qwen3_4b_config, "qwen3-8b": qwen3_8b_config, diff --git a/src/maxtext/checkpoint_conversion/utils/param_mapping.py b/src/maxtext/checkpoint_conversion/utils/param_mapping.py index 43f2662e34..8cd5bc4ca5 100644 --- a/src/maxtext/checkpoint_conversion/utils/param_mapping.py +++ b/src/maxtext/checkpoint_conversion/utils/param_mapping.py @@ -2368,6 +2368,7 @@ def pad_hf_embedding_layer(input_tensor, target_shape): "qwen2.5-7b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING, "qwen2.5-14b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING, "qwen3-0.6b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING, + "qwen3-1.7b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING, "qwen3-4b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING, "qwen3-4b-thinking-2507": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING, "qwen3-8b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING, @@ -2405,6 +2406,7 @@ def pad_hf_embedding_layer(input_tensor, target_shape): "qwen2.5-7b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN, "qwen2.5-14b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN, "qwen3-0.6b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN, + "qwen3-1.7b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN, "qwen3-4b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN, "qwen3-4b-thinking-2507": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN, "qwen3-8b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN, diff --git a/src/maxtext/configs/types.py b/src/maxtext/configs/types.py index 3df51ac106..d74fd197a0 100644 --- a/src/maxtext/configs/types.py +++ b/src/maxtext/configs/types.py @@ -235,6 +235,7 @@ class ProfilerType(str, Enum): "qwen2.5-7b", "qwen2.5-14b", "qwen3-0.6b", + "qwen3-1.7b", "qwen3-4b", "qwen3-4b-thinking-2507", "qwen3-8b", diff --git a/src/maxtext/utils/globals.py b/src/maxtext/utils/globals.py index 203d7a6165..72c1a95ccf 100644 --- a/src/maxtext/utils/globals.py +++ b/src/maxtext/utils/globals.py @@ -53,6 +53,7 @@ "qwen2.5-7b": "Qwen/Qwen2.5-7B-Instruct", "qwen2.5-14b": "Qwen/Qwen2.5-14B-Instruct", "qwen3-0.6b": "Qwen/Qwen3-0.6B", + "qwen3-1.7b": "Qwen/Qwen3-1.7B", "qwen3-4b": "Qwen/Qwen3-4B", "qwen3-4b-thinking-2507": "Qwen/Qwen3-4B-Thinking-2507", "qwen3-8b": "Qwen/Qwen3-8B",