
Commit 36ae2d0 (parent d4a259d)

Adding chat template to vllm decode.

1 file changed: 70 additions & 33 deletions

src/MaxText/vllm_decode.py
@@ -44,6 +44,7 @@
 import jax
 import transformers
 
+from MaxText import max_logging
 from MaxText import model_creation_utils
 from MaxText import pyconfig
 from MaxText.common_types import Config
@@ -70,6 +71,8 @@
 flags.DEFINE_string("model_name", "qwen3-30b-a3b", "Model name for MaxText.")
 flags.DEFINE_string("hf_model_name", "Qwen/Qwen3-30B-A3B", "Path to the Hugging Face model.")
 flags.DEFINE_string("hf_config_path", None, "Path to the local Hugging Face model config.")
+flags.DEFINE_string("hf_access_token", None, "Hugging Face access token for private models.")
+flags.DEFINE_string("tokenizer_path", None, "Path to the tokenizer. If None, use hf_model_name.")
 flags.DEFINE_string("load_parameters_path", None, "Path to load model parameters from.")
 flags.DEFINE_bool("enable_expert_parallel", False, "Whether to enable expert parallelism.")
 
@@ -80,51 +83,63 @@
 
 # Decoding
 flags.DEFINE_bool("use_tunix", False, "Whether to use Tunix for vLLM decoding.")
+flags.DEFINE_bool("use_chat_template", False, "Whether to format the prompt using chat template.")
 flags.DEFINE_string("prompt", "Suggest some famous landmarks in London.", "The prompt to decode.")
-flags.DEFINE_integer("decode_sampling_temperature", 0, "Temperature for sampling.")
-flags.DEFINE_integer("decode_sampling_nucleus_p", 1, "Nucleus sampling probability.")
+flags.DEFINE_float("decode_sampling_temperature", 0, "Temperature for sampling.")
+flags.DEFINE_float("decode_sampling_nucleus_p", 1.0, "Nucleus sampling probability.")
 flags.DEFINE_integer("decode_sampling_top_k", 1, "Top-k sampling probability.")
-
-# Mark required flags
-flags.mark_flag_as_required("hf_config_path")
+flags.DEFINE_integer("seed", 42, "Random seed for sampling.")
 
 
 def decode_with_vllm(
     model_name: str,
     hf_model_name: str,
-    hf_config_path: str,
-    load_parameters_path: str,
     ici_data_parallelism: int,
     ici_tensor_parallelism: int,
     ici_expert_parallelism: int,
-    max_prefill_length: int,
     max_target_length: int,
     gpu_memory_utilization: float,
     enable_expert_parallel: bool,
     prompt: str,
-    decode_sampling_temperature: float,
-    decode_sampling_nucleus_p: float,
-    decode_sampling_top_k: float,
+    use_chat_template: bool = False,
+    decode_sampling_temperature: float = 0.0,
+    decode_sampling_nucleus_p: float = 1.0,
+    decode_sampling_top_k: int = 1,
+    hf_config_path: str | None = None,
+    hf_access_token: str | None = None,
+    tokenizer_path: str | None = None,
+    load_parameters_path: str | None = None,
+    seed: int = 42,
 ) -> None:
   """Decode using vLLM with a MaxText model implementation.
 
   Args:
     model_name: Name of the model for MaxText.
     hf_model_name: Path to the Hugging Face model.
-    hf_config_path: Path to the local Hugging Face model config.
-    load_parameters_path: Path to load model parameters from.
     ici_data_parallelism: Size of the data parallelism dimension.
     ici_tensor_parallelism: Size of the non-expert tensor parallelism dimension.
     ici_expert_parallelism: Size of the MoE expert parallelism dimension.
-    max_prefill_length: Maximum prefill length.
     max_target_length: Maximum total context length (MCL).
     gpu_memory_utilization: Fraction of GPU memory to be used for the model executor.
     enable_expert_parallel: Whether to enable expert parallelism.
     prompt: The prompt to decode.
+    use_chat_template: Whether to format the prompt using chat template.
     decode_sampling_temperature: Temperature for sampling.
     decode_sampling_nucleus_p: Nucleus sampling probability.
     decode_sampling_top_k: Top-k sampling probability.
+    hf_config_path: Path to the local Hugging Face model config.
+    hf_access_token: Hugging Face access token for private models.
+    tokenizer_path: Path to the tokenizer. If None, use hf_model_name.
+    load_parameters_path: Path to load model parameters from.
   """
+  if not model_name:
+    raise ValueError("model_name must be provided")
+
+  if not hf_model_name:
+    raise ValueError("hf_model_name must be provided")
+
+  if not hf_config_path:
+    raise ValueError("hf_config_path must be provided")
 
   # Prepare vLLM Arguments
   vllm_args = {}
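
Editor's note on the flag-type change above: absl's DEFINE_integer rejects fractional values at parse time, so a temperature like 0.7 was previously unrepresentable; DEFINE_float accepts it. A minimal standalone sketch (the flag name matches the diff; the parse call is illustrative):

from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_float("decode_sampling_temperature", 0, "Temperature for sampling.")

# With DEFINE_integer this raised absl.flags.IllegalFlagValueError;
# as a float flag it parses cleanly.
FLAGS(["prog", "--decode_sampling_temperature=0.7"])
print(FLAGS.decode_sampling_temperature)  # 0.7
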
@@ -142,7 +157,6 @@ def decode_with_vllm(
   # Prepare MaxText and sharding configs (Parallelism is dynamic)
   vllm_args["additional_config"]["maxtext_config"] = {
       "model_name": model_name,
-      "max_target_length": max_target_length,
       "weight_dtype": "bfloat16",
       "allow_split_physical_axes": True,
   }
@@ -154,24 +168,14 @@
   vllm_args["additional_config"]["sharding"] = {
       "sharding_strategy": {
           "tensor_parallelism": ici_tensor_parallelism,
-          "expert_parallelism": ici_expert_parallelism,
           "data_parallelism": ici_data_parallelism,
       },
   }
 
   if enable_expert_parallel:
     vllm_args["additional_config"]["sharding"]["sharding_strategy"].update({"expert_parallelism": ici_expert_parallelism})
 
-  # Initialize and Run LLM
-  max_tokens = max_target_length - max_prefill_length
-  sampling_params = SamplingParams(
-      temperature=decode_sampling_temperature,
-      max_tokens=max_tokens,
-      top_k=decode_sampling_top_k,
-      top_p=decode_sampling_nucleus_p,
-  )
-
-  print(
+  max_logging.log(
       f"Initializing LLM with DP={vllm_args['data_parallel_size']}, TP={vllm_args['tensor_parallel_size']} "
       f"and EP={ici_expert_parallelism if enable_expert_parallel else 0}..."
   )
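
Editor's note: the SamplingParams construction removed here reappears in the next hunk, after tokenizer loading, because max_tokens is now derived from the measured prompt length rather than a fixed max_prefill_length budget chosen before the prompt was ever tokenized. A worked example with assumed numbers:

# Assumed values for illustration only.
max_target_length = 2048       # total context budget (prompt + generation)
old_max_prefill_length = 1024

old_budget = max_target_length - old_max_prefill_length  # 1024, regardless of prompt
measured_prompt_length = 25                              # tokens after encoding
new_budget = max_target_length - measured_prompt_length  # 2023
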
@@ -183,14 +187,44 @@ def decode_with_vllm(
   with nn_partitioning.axis_rules(vllm_config.logical_axis_rules):
     llm = LLM(**vllm_args)
 
-  print("Generating output...")
-  outputs = llm.generate([prompt], sampling_params)
+  max_logging.log("Generating output...")
+  tokenizer = transformers.AutoTokenizer.from_pretrained(
+      tokenizer_path if tokenizer_path is not None else hf_model_name,
+      token=hf_access_token,
+  )
+
+  prompts = [prompt]
+  if use_chat_template:
+    # Format the prompt using chat template if specified
+    messages = [
+        {"role": "user", "content": prompt},
+    ]
+    input_with_chat_template = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,  # Set to False to get the string
+        add_generation_prompt=True,
+        add_special_tokens=False,  # Prevent adding special tokens
+    )
+    prompts = [input_with_chat_template]
+
+  max_prompt_length = max(len(tokenizer.encode(p)) for p in prompts)
+  max_tokens_to_generate = max_target_length - max_prompt_length
+
+  sampling_params = SamplingParams(
+      temperature=decode_sampling_temperature,
+      max_tokens=max_tokens_to_generate,
+      top_k=decode_sampling_top_k,
+      top_p=decode_sampling_nucleus_p,
+      seed=seed,
+  )
+
+  outputs = llm.generate(prompts, sampling_params)
 
-  # Print Outputs
+  # max_logging.log Outputs
   for output in outputs:
     prompt = output.prompt
     generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    max_logging.log(f"Prompt: {prompt}, Generated text: {generated_text}")
 
 
 def decode_with_tunix(
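
Editor's note: for reference, a minimal standalone sketch of what the new use_chat_template path produces, assuming the Qwen/Qwen3-30B-A3B tokenizer from the flags above is reachable (the exact output depends on the tokenizer's template):

import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("Qwen/Qwen3-30B-A3B")
messages = [{"role": "user", "content": "Suggest some famous landmarks in London."}]
formatted = tokenizer.apply_chat_template(
    messages,
    tokenize=False,              # return the formatted string rather than token ids
    add_generation_prompt=True,  # end with the assistant turn header
)
print(formatted)
# Qwen3's ChatML-style template yields roughly:
#   <|im_start|>user
#   Suggest some famous landmarks in London.<|im_end|>
#   <|im_start|>assistant
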
@@ -253,8 +287,8 @@ def decode_with_tunix(
 
   # Generate text
   output = vllm_rollout.generate(prompts, rollout_config)
-  print(f"Prompt: {config.prompt}")
-  print(f"Output: {output.text[0]}")
+  max_logging.log(f"Prompt: {config.prompt}")
+  max_logging.log(f"Output: {output.text[0]}")
 
 
 def main(argv: Sequence[str]) -> None:
@@ -274,18 +308,21 @@ def main(argv: Sequence[str]) -> None:
       model_name=FLAGS.model_name,
       hf_model_name=FLAGS.hf_model_name,
       hf_config_path=FLAGS.hf_config_path,
+      hf_access_token=FLAGS.hf_access_token,
+      tokenizer_path=FLAGS.tokenizer_path,
       load_parameters_path=FLAGS.load_parameters_path,
       ici_data_parallelism=FLAGS.ici_data_parallelism,
       ici_tensor_parallelism=FLAGS.ici_tensor_parallelism,
       ici_expert_parallelism=FLAGS.ici_expert_parallelism,
       max_target_length=FLAGS.max_target_length,
-      max_prefill_length=FLAGS.max_prefill_length,
       gpu_memory_utilization=FLAGS.gpu_memory_utilization,
       enable_expert_parallel=FLAGS.enable_expert_parallel,
       prompt=FLAGS.prompt,
+      use_chat_template=FLAGS.use_chat_template,
       decode_sampling_temperature=FLAGS.decode_sampling_temperature,
       decode_sampling_nucleus_p=FLAGS.decode_sampling_nucleus_p,
       decode_sampling_top_k=FLAGS.decode_sampling_top_k,
+      seed=FLAGS.seed,
   )
 
 
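Editor's note: a hypothetical invocation exercising the new flags, assuming the script is launched as a module and using absl's --flag=value syntax (the config path is a placeholder):

python3 -m MaxText.vllm_decode \
  --model_name=qwen3-30b-a3b \
  --hf_model_name=Qwen/Qwen3-30B-A3B \
  --hf_config_path=/path/to/hf_config \
  --use_chat_template=true \
  --decode_sampling_temperature=0.7 \
  --decode_sampling_nucleus_p=0.95 \
  --decode_sampling_top_k=40 \
  --seed=42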