From f9fb25384c780ea29562e72939a2509d0084d046 Mon Sep 17 00:00:00 2001
From: Andy Twigg
Date: Tue, 17 Mar 2026 01:45:22 +0000
Subject: [PATCH 1/2] modify process_data to generate separate user/system
 parts in prompts

---
 .../trainers/post_train/rl/utils_rl.py | 47 ++++++++++---------
 1 file changed, 24 insertions(+), 23 deletions(-)

diff --git a/src/maxtext/trainers/post_train/rl/utils_rl.py b/src/maxtext/trainers/post_train/rl/utils_rl.py
index c37db48c0a..a49895965c 100644
--- a/src/maxtext/trainers/post_train/rl/utils_rl.py
+++ b/src/maxtext/trainers/post_train/rl/utils_rl.py
@@ -527,27 +527,28 @@ def _to_str(val):
   answer = extract_hash_answer(answer)
 
   return {
-      # passed to model forward pass
-      "prompts": model_tokenizer.apply_chat_template(
-          [
-              {
-                  "role": "user",
-                  "content": template_config["TEMPLATE"].format(
-                      system_prompt=template_config["SYSTEM_PROMPT"].format(
-                          reasoning_start_token=tmvp_config.reasoning_start_token,
-                          reasoning_end_token=tmvp_config.reasoning_end_token,
-                          solution_start_token=tmvp_config.solution_start_token,
-                          solution_end_token=tmvp_config.solution_end_token,
-                      ),
-                      question=question,
-                  ),
-              },
-          ],
-          tokenize=False,
-          add_generation_prompt=True,
-      ),
-      # passed to reward functions
-      "question": question,
-      # passed to reward functions
-      "answer": answer,
+    # passed to model forward pass
+    "prompts": model_tokenizer.apply_chat_template(
+      [
+        {
+          "role": "system",
+          "content": template_config["SYSTEM_PROMPT"].format(
+            reasoning_start_token=tmvp_config.reasoning_start_token,
+            reasoning_end_token=tmvp_config.reasoning_end_token,
+            solution_start_token=tmvp_config.solution_start_token,
+            solution_end_token=tmvp_config.solution_end_token,
+          ),
+        },
+        {
+          "role": "user",
+          "content": question,
+        }
+      ],
+      tokenize=False,
+      add_generation_prompt=True,
+    ),
+    # passed to reward functions
+    "question": question,
+    # passed to reward functions
+    "answer": answer,
   }

From e83505a1d0b73ada7cfd9d4e6e285ae677eb2e20 Mon Sep 17 00:00:00 2001
From: Andy Twigg
Date: Wed, 18 Mar 2026 20:32:30 +0000
Subject: [PATCH 2/2] pyink

---
 .../trainers/post_train/rl/utils_rl.py | 48 +++++++++----------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/maxtext/trainers/post_train/rl/utils_rl.py b/src/maxtext/trainers/post_train/rl/utils_rl.py
index a49895965c..fc21cce202 100644
--- a/src/maxtext/trainers/post_train/rl/utils_rl.py
+++ b/src/maxtext/trainers/post_train/rl/utils_rl.py
@@ -527,28 +527,28 @@ def _to_str(val):
   answer = extract_hash_answer(answer)
 
   return {
-    # passed to model forward pass
-    "prompts": model_tokenizer.apply_chat_template(
-      [
-        {
-          "role": "system",
-          "content": template_config["SYSTEM_PROMPT"].format(
-            reasoning_start_token=tmvp_config.reasoning_start_token,
-            reasoning_end_token=tmvp_config.reasoning_end_token,
-            solution_start_token=tmvp_config.solution_start_token,
-            solution_end_token=tmvp_config.solution_end_token,
-          ),
-        },
-        {
-          "role": "user",
-          "content": question,
-        }
-      ],
-      tokenize=False,
-      add_generation_prompt=True,
-    ),
-    # passed to reward functions
-    "question": question,
-    # passed to reward functions
-    "answer": answer,
+      # passed to model forward pass
+      "prompts": model_tokenizer.apply_chat_template(
+          [
+              {
+                  "role": "system",
+                  "content": template_config["SYSTEM_PROMPT"].format(
+                      reasoning_start_token=tmvp_config.reasoning_start_token,
+                      reasoning_end_token=tmvp_config.reasoning_end_token,
+                      solution_start_token=tmvp_config.solution_start_token,
+                      solution_end_token=tmvp_config.solution_end_token,
+                  ),
+              },
+              {
+                  "role": "user",
+                  "content": question,
+              },
+          ],
+          tokenize=False,
+          add_generation_prompt=True,
+      ),
+      # passed to reward functions
+      "question": question,
+      # passed to reward functions
+      "answer": answer,
   }