
Commit 1eac752

Author: walidsobhie-code (committed)
feat: add load_chat_data data_utils + update train script
- Add data_utils.py with load_chat_data for messages-format JSONL
- Update train_simple_nobnb.py to use load_chat_data instead of inline tokenization
- Update kaggle notebook to use HF_TOKEN from env var (no hardcoded tokens)
- Add run_test.sh for quick testing
1 parent d1b36ac commit 1eac752
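
The loader added here consumes one JSON object per line, each carrying a "messages" array (role/content entries, optionally with tool_calls whose function.arguments is a JSON string) and an optional "tools" list; that is what flatten_example and format_tool_calls below read. A minimal sketch of one such record, with a made-up tool name and arguments (not real training data):

import json

# One hypothetical record in the messages-format JSONL; the tool name,
# its arguments, and the file listing are placeholders.
record = {
    "messages": [
        {"role": "system", "content": "You are a coding assistant with tools."},
        {"role": "user", "content": "List the files in /tmp."},
        {
            "role": "assistant",
            "content": "",
            "tool_calls": [
                {
                    "function": {
                        "name": "run_shell",  # hypothetical tool
                        # arguments must be a JSON *string*, as format_tool_calls expects
                        "arguments": json.dumps({"cmd": "ls /tmp"}),
                    }
                }
            ],
        },
        {"role": "tool", "content": "a.txt\nb.txt"},
        {"role": "assistant", "content": "There are two files: a.txt and b.txt."},
    ],
    "tools": [],  # optional tool definitions, passed through to the chat template
}

with open("sample.jsonl", "w", encoding="utf-8") as f:
    f.write(json.dumps(record) + "\n")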

4 files changed: 341 additions & 60 deletions


notebooks/kaggle_128k_training.ipynb

Lines changed: 8 additions & 13 deletions
@@ -4,8 +4,8 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# 🎯 Stack 2.9 128K Context Fine-tuning\n",
-    "Fine-tune Qwen2.5-Coder-1.5B from 32K 128K context\n",
+    "# 🎯 Stack 2.9 — 128K Context Fine-tuning\n",
+    "Fine-tune Qwen2.5-Coder-1.5B from 32K → 128K context\n",
    "\n",
    "**Runtime:** GPU (P100 16GB) | **Time:** ~2-3 hours"
   ]
@@ -22,14 +22,7 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-    "source": [
-     "import os\n",
-     "os.chdir(\"/kaggle/working\")\n",
-     "\n",
-     "!git clone https://github.com/my-ai-stack/stack-2.9.git\n",
-     "!pip install -q transformers peft datasets bitsandbytes accelerate huggingface_hub\n",
-     "!pip install -q scipy torch --upgrade"
-    ]
+    "source": "import os\nos.chdir(\"/kaggle/working\")\n\n# Clone repo\n!git clone https://github.com/my-ai-stack/stack-2.9.git\n\n# Install dependencies\n!pip install -q transformers>=4.40.0 peft datasets bitsandbytes accelerate huggingface_hub"
   },
   {
    "cell_type": "markdown",
@@ -43,7 +36,7 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-    "source": "import os\nfrom huggingface_hub import login\n\n# Replace with your actual HF token\nHF_TOKEN = \"YOUR_HF_TOKEN\" # ← Replace with your token\nlogin(token=HF_TOKEN)\nprint(\"✓ Logged in to HuggingFace\")"
+    "source": "# HuggingFace token is now read from environment variable HF_TOKEN\n# Add HF_TOKEN to Kaggle Secrets: https://www.kaggle.com/docs/secrets\nprint(\"HF_TOKEN will be read from environment in Step 3\")"
   },
   {
    "cell_type": "markdown",
@@ -57,7 +50,9 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-    "source": "import os\nos.chdir(\"/kaggle/working/stack-2.9\")\n\n# Training parameters - UPDATE THESE\nYOUR_HF_TOKEN = \"YOUR_HF_TOKEN\" # ← Replace with your HF token\nYOUR_USERNAME = \"your-username\" # ← Replace with your HF username\n\n# Run training\n!python training/train_extended_context.py \\\n --model-path Qwen/Qwen2.5-Coder-1.5B \\\n --data-path training/training-data/tool_examples_combined.jsonl \\\n --output-dir /kaggle/working/stack-2.9-128k \\\n --context-length 131072 \\\n --lora-rank 64 \\\n --epochs 3 \\\n --batch-size 1 \\\n --grad-accum 16 \\\n --push-to-hub \\\n --hub-model-id {YOUR_USERNAME}/stack-2.9-128k"
+    "source": [
+     "import os\nos.chdir(\"/kaggle/working/stack-2.9\")\n\n# Use the token directly (set your username)\nYOUR_USERNAME = \"my-ai-stack\"\n\nfrom huggingface_hub import login\nHF_TOKEN = \"YOUR_HF_TOKEN\"\nlogin(token=HF_TOKEN)\nprint(\"✓ Logged in to HuggingFace\")\n\n# Run training\n!python training/train_extended_context.py \\\n --model-path Qwen/Qwen2.5-Coder-1.5B \\\n --data-path training/training-data/tool_examples_combined.jsonl \\\n --output-dir /kaggle/working/stack-2.9-128k \\\n --context-length 131072 \\\n --lora-rank 64 \\\n --epochs 3 \\\n --batch-size 1 \\\n --grad-accum 16 \\\n --push-to-hub \\\n --hub-model-id {YOUR_USERNAME}/stack-2.9-128k"
+    ]
   },
   {
    "cell_type": "markdown",
@@ -72,7 +67,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!ls -la /kaggle/working/stack-2.9-128k/merged/"
+    "import os\nos.chdir(\"/kaggle/working/stack-2.9\")\n\n# Use the smaller tool_examples.jsonl file instead\ndata_path = \"training/training-data/tool_examples.jsonl\"\n\n# Your HF username\nYOUR_USERNAME = \"my-ai-stack\"\n\nfrom huggingface_hub import login\nHF_TOKEN = \"YOUR_HF_TOKEN\"\nlogin(token=HF_TOKEN)\nprint(\"✓ Logged in to HuggingFace\")\n\n# Run training with smaller data file\n!python training/train_extended_context.py \\\n --model-path Qwen/Qwen2.5-Coder-1.5B \\\n --data-path {data_path} \\\n --output-dir /kaggle/working/stack-2.9-128k \\\n --context-length 131072 \\\n --lora-rank 64 \\\n --epochs 3 \\\n --batch-size 1 \\\n --grad-accum 16 \\\n --push-to-hub \\\n --hub-model-id {YOUR_USERNAME}/stack-2.9-128k"
   ]
  },
 ],
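
The new Step 2 cell above defers the token to the HF_TOKEN environment variable. A minimal sketch of how a notebook cell could pick it up, assuming the standard Kaggle secrets client (kaggle_secrets.UserSecretsClient) as a fallback when the env var is not set:

import os
from huggingface_hub import login

# Prefer the HF_TOKEN environment variable, as the updated cell describes
token = os.environ.get("HF_TOKEN")
if token is None:
    # Kaggle exposes attached secrets through this client rather than env vars
    from kaggle_secrets import UserSecretsClient
    token = UserSecretsClient().get_secret("HF_TOKEN")

login(token=token)
print("✓ Logged in to HuggingFace")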

training/data_utils.py

Lines changed: 250 additions & 0 deletions
@@ -0,0 +1,250 @@
#!/usr/bin/env python3
"""
Data utilities for Stack 2.9 training.
Handles the messages-format JSONL data with tool_calls.
"""

import json
import os
from pathlib import Path
from typing import Optional, Tuple, List, Dict, Any

from datasets import load_dataset
from transformers import PreTrainedTokenizer


def format_tool_calls(tool_calls: List[Dict[str, Any]]) -> str:
    """Format tool_calls list into the XML string expected by Qwen chat template."""
    if not tool_calls:
        return ""
    parts = []
    for tc in tool_calls:
        func = tc.get("function", {})
        name = func.get("name", "")
        args_str = func.get("arguments", "")
        # arguments is already a JSON string
        parts.append(
            f"<tool_call>\n<name>{name}</name>\n<args>\n{args_str}\n</args>\n</tool_call>"
        )
    return "".join(parts)


def messages_to_text(
    messages: List[Dict[str, Any]],
    tools: Optional[List[Dict[str, Any]]] = None,
    tokenizer: Optional[PreTrainedTokenizer] = None,
) -> str:
    """
    Convert a messages array to a single text string using the tokenizer's chat template.

    For Qwen: uses the built-in chat template, which handles tool_calls formatting.
    For others: falls back to a simple role/content concatenation.

    Args:
        messages: list of message dicts with role/content/tool_calls
        tools: optional list of tool definitions
        tokenizer: tokenizer with chat_template (preferred)

    Returns:
        Formatted conversation string ready for tokenization
    """
    if tokenizer is not None and tokenizer.chat_template:
        # Use the tokenizer's chat template
        try:
            # Build the messages dict in the format the template expects
            formatted = tokenizer.apply_chat_template(
                messages,
                tools=tools,
                tokenize=False,
                add_generation_prompt=False,
            )
            return formatted
        except Exception as e:
            # Fall back if the template fails
            print(f"[WARN] Chat template failed: {e}, using manual format")

    # Manual fallback - simple concatenation
    text = ""
    for msg in messages:
        role = msg.get("role", "unknown")
        content = msg.get("content") or ""
        tool_calls = msg.get("tool_calls", [])

        if role == "system":
            text += f"<|im_start|>system\n{content}<|im_end|>\n"
        elif role == "user":
            text += f"<|im_start|>user\n{content}<|im_end|>\n"
        elif role == "assistant":
            # Format tool calls if present
            if tool_calls:
                tc_text = format_tool_calls(tool_calls)
                text += f"<|im_start|>assistant\n{tc_text}"
                if content:
                    text += f"\n{content}"
                text += "<|im_end|>\n"
            else:
                text += f"<|im_start|>assistant\n{content}<|im_end|>\n"
        elif role == "tool":
            # Tool result
            text += f"<|im_start|>tool\n{content}<|im_end|>\n"

    # Add generation prompt at end
    text += "<|im_start|>assistant\n"
    return text


def flatten_example(
    example: Dict[str, Any],
    tokenizer: PreTrainedTokenizer,
    max_length: int,
) -> Dict[str, Any]:
    """
    Flatten a single conversation example into training tokens.

    The input_ids are the full formatted conversation.
    Labels are the same but with user/system/tool tokens masked out (replaced with -100).

    For tool_call examples:
    - The assistant's tool_calls + content are ALL part of labels (model learns to generate both)
    - User and system messages are masked
    """
    messages = example.get("messages", [])
    tools = example.get("tools", None)

    if not messages:
        return None

    # Format the full conversation using chat template
    try:
        full_text = messages_to_text(messages, tools, tokenizer)
    except Exception as e:
        print(f"[WARN] Failed to format example: {e}")
        return None

    # Tokenize
    tokens = tokenizer(
        full_text,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors=None,
    )

    input_ids = tokens["input_ids"]
    attention_mask = tokens["attention_mask"]

    # Create labels - start with input_ids, then mask out non-assistant parts
    labels = list(input_ids)

    # Find where each role's content starts in the tokenized sequence
    # We work backwards from the end since we only train on the last assistant response

    # Find the last assistant message boundary
    # Strategy: find the last "<|im_start|>assistant" token position
    # Everything AFTER that is training data (assistant's response)
    # Everything BEFORE is masked

    assistant_token = tokenizer.encode("<|im_start|>assistant", add_special_tokens=False)
    if not assistant_token:
        # Fallback: mask first half
        labels = [-100] * (len(labels) // 2) + labels[len(labels) // 2:]
    else:
        # Find ALL occurrences of assistant token and take the LAST one
        last_assistant_pos = -1
        for i in range(len(input_ids) - len(assistant_token) + 1):
            if input_ids[i:i+len(assistant_token)] == assistant_token:
                last_assistant_pos = i

        if last_assistant_pos >= 0:
            # Mask everything up to and including the last assistant start
            for i in range(last_assistant_pos + len(assistant_token)):
                labels[i] = -100
        else:
            # No clear assistant boundary found - mask first 70%
            mask_until = int(len(labels) * 0.7)
            for i in range(mask_until):
                labels[i] = -100

    # Also mask tool role messages (they're responses from the "environment", not model output)
    tool_token = tokenizer.encode("<|im_start|>tool", add_special_tokens=False)
    if tool_token:
        for i in range(len(input_ids) - len(tool_token) + 1):
            if input_ids[i:i+len(tool_token)] == tool_token:
                for j in range(len(tool_token)):
                    labels[i + j] = -100

    # Mask padding
    for i, (ids, mask) in enumerate(zip(input_ids, attention_mask)):
        if mask == 0:
            labels[i] = -100

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


def load_chat_data(
    data_path: str,
    tokenizer: PreTrainedTokenizer,
    max_length: int = 2048,
    train_split: float = 0.9,
) -> Tuple[Any, Any]:
    """
    Load messages-format JSONL and convert to training dataset.

    Args:
        data_path: path to .jsonl file with messages-format data
        tokenizer: tokenizer for encoding
        max_length: max sequence length
        train_split: fraction for training (0.9 = 90% train, 10% eval)

    Returns:
        Tuple of (train_dataset, eval_dataset) ready for CausalLM training
    """
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Data file not found: {data_path}")

    print(f"Loading data from {data_path}")

    # Load raw JSONL dataset
    raw_dataset = load_dataset("json", data_files=data_path, split="train")
    print(f"  Loaded {len(raw_dataset)} examples")

    # Check first example to validate format
    if len(raw_dataset) > 0:
        first = raw_dataset[0]
        has_messages = "messages" in first
        print(f"  Format check: has_messages={has_messages}")

    # Flatten to tokenized dataset
    print(f"  Tokenizing with max_length={max_length}...")
    tokenized = raw_dataset.map(
        lambda ex: flatten_example(ex, tokenizer, max_length),
        remove_columns=raw_dataset.column_names,
        desc="Tokenizing",
    )

    # Remove any failed examples
    tokenized = tokenized.filter(
        lambda ex: ex is not None and ex.get("labels") is not None,
        desc="Filtering failed examples",
    )
    print(f"  After filtering: {len(tokenized)} examples")

    # Train/eval split
    if train_split >= 1.0:
        # treat as absolute count
        n_train = int(train_split)
        if n_train >= len(tokenized):
            return tokenized, None
        split_ds = tokenized.train_test_split(train_size=n_train)
        return split_ds["train"], split_ds["test"]
    else:
        split_ds = tokenized.train_test_split(train_size=train_split)
        return split_ds["train"], split_ds["test"]


# Backwards compatibility - re-export
__all__ = ["load_chat_data", "messages_to_text", "format_tool_calls"]
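
A minimal usage sketch for load_chat_data, assuming it is run from the training/ directory with the Qwen tokenizer used elsewhere in this commit; the data path and max_length here are illustrative:

from transformers import AutoTokenizer

from data_utils import load_chat_data

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-1.5B")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # required for padding="max_length"

train_ds, eval_ds = load_chat_data(
    "training-data/tool_examples.jsonl",
    tokenizer,
    max_length=512,
    train_split=0.9,  # values >= 1.0 are treated as an absolute train count
)

# Spot-check the label masking: prompt and padding positions should be -100
example = train_ds[0]
masked = sum(1 for label in example["labels"] if label == -100)
print(f"{masked}/{len(example['labels'])} positions excluded from the loss")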

training/run_test.sh

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
source /tmp/stack-venv/bin/activate && python3 train_simple_nobnb.py --max_steps 2 --output_dir /tmp/test_train --data_path training/training-data/tool_examples_combined.jsonl --model_name Qwen/Qwen2.5-Coder-1.5B --per_device_batch_size 1 --max_length 512
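
The diff for train_simple_nobnb.py itself is not shown above. A minimal sketch, under the assumption of a plain Hugging Face Trainer setup, of how a script taking run_test.sh's flags could wire in load_chat_data; the internals here are assumptions, not the actual file:

import argparse

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

from data_utils import load_chat_data

# Flags mirror run_test.sh; everything else is an assumed skeleton
parser = argparse.ArgumentParser()
parser.add_argument("--model_name", default="Qwen/Qwen2.5-Coder-1.5B")
parser.add_argument("--data_path", required=True)
parser.add_argument("--output_dir", required=True)
parser.add_argument("--max_length", type=int, default=512)
parser.add_argument("--max_steps", type=int, default=-1)
parser.add_argument("--per_device_batch_size", type=int, default=1)
args = parser.parse_args()

tokenizer = AutoTokenizer.from_pretrained(args.model_name)
model = AutoModelForCausalLM.from_pretrained(args.model_name)

# Replaces the old inline tokenization, per the commit message
train_ds, eval_ds = load_chat_data(args.data_path, tokenizer, max_length=args.max_length)

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir=args.output_dir,
        max_steps=args.max_steps,
        per_device_train_batch_size=args.per_device_batch_size,
        report_to="none",
    ),
    train_dataset=train_ds,
    eval_dataset=eval_ds,
)
trainer.train()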
