
Commit 5b6023c

walidsobhie-code committed
Add Kaggle 7B smart-train notebook (20K examples, QLoRA T4)

1 parent 1eac752 commit 5b6023c

1 file changed

Lines changed: 371 additions & 0 deletions

@@ -0,0 +1,371 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 🚀 Stack 2.9 — 7B QLoRA Fine-tune on Kaggle (Smart 20K)\n",
    "\n",
    "**Base:** Qwen2.5-Coder-7B-Instruct  \n",
    "**Data:** my-ai-stack/stack-2-9-tool-20k-examples (20K smart examples)  \n",
    "**Output:** my-ai-stack/Stack-2.9-7B-finetuned  \n",
    "**Runtime:** GPU T4 16GB | **Time:** ~6-8 hours | **Cost:** FREE"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## ⚠️ Before Starting\n",
    "\n",
    "1. Go to **Add-ons → Secrets** and add:\n",
    "   - `HF_TOKEN` = your HuggingFace write token (starts with `hf_`)\n",
    "2. Set **Accelerator** to **GPU T4**\n",
    "3. Internet must be ON"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## Step 1: Clone & Install"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.chdir(\"/kaggle/working\")\n",
    "\n",
    "# Clone repo\n",
    "!git clone https://github.com/my-ai-stack/stack-2.9.git\n",
    "\n",
    "# Install. The version spec is quoted so the shell doesn't parse >= as a redirect.\n",
    "# flash-attn is dropped: FlashAttention-2 needs Ampere or newer, and the T4 is Turing.\n",
    "!pip install -q \"transformers>=4.40.0\" peft datasets bitsandbytes accelerate huggingface_hub\n",
    "\n",
    "# Verify\n",
    "import torch\n",
    "print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n",
    "print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## Step 2: Login to HuggingFace"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import login\n",
    "\n",
    "# Read the token from the Kaggle secret configured above\n",
    "from kaggle_secrets import UserSecretsClient\n",
    "user_secrets = UserSecretsClient()\n",
    "hf_token = user_secrets.get_secret(\"HF_TOKEN\")\n",
    "login(hf_token)\n",
    "print(\"✅ Logged into HuggingFace\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## Step 3: Download Smart 20K Dataset"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import hf_hub_download\n",
    "import os\n",
    "\n",
    "DATA_DIR = \"/kaggle/working/data\"\n",
    "os.makedirs(DATA_DIR, exist_ok=True)\n",
    "\n",
    "# Download the smart 20K dataset from the org\n",
    "print(\"Downloading smart 20K dataset...\")\n",
    "path = hf_hub_download(\n",
    "    repo_id=\"my-ai-stack/stack-2-9-tool-20k-examples\",\n",
    "    filename=\"tool_examples_smart_20k.jsonl\",\n",
    "    repo_type=\"dataset\",\n",
    "    local_dir=DATA_DIR,\n",
    ")\n",
    "print(f\"Dataset: {path}\")\n",
    "\n",
    "# Count examples (one JSON object per line)\n",
    "with open(path) as f:\n",
    "    n_lines = sum(1 for _ in f)\n",
    "print(f\"Examples: {n_lines:,}\")"
   ]
  },
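  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["*(Optional sanity check, not in the original notebook: peek at one raw example to confirm the `messages` schema, i.e. role, content, and optional `tool_calls`, that Step 5 assumes.)*"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Print the first example; Step 5 expects each JSONL line to carry a \"messages\" list\n",
    "with open(path) as f:\n",
    "    first = json.loads(f.readline())\n",
    "print(json.dumps(first, indent=2)[:1500])"
   ]
  },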
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## Step 4: Setup Data Pipeline"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import torch\n",
    "from transformers import AutoTokenizer\n",
    "from datasets import load_dataset\n",
    "\n",
    "MODEL_NAME = \"Qwen/Qwen2.5-Coder-7B-Instruct\"\n",
    "MAX_LENGTH = 2048  # T4 fits this with QLoRA\n",
    "\n",
    "print(\"Loading tokenizer...\")\n",
    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)\n",
    "tokenizer.padding_side = \"right\"\n",
    "tokenizer.pad_token = tokenizer.eos_token\n",
    "\n",
    "print(\"Loading dataset...\")\n",
    "raw = load_dataset(\"json\", data_files=path, split=\"train\")\n",
    "print(f\"Loaded {len(raw)} examples\")"
   ]
  },
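  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["*(Optional check, added as a sketch: the label masking in Step 5 matches the raw token sequence of the assistant marker, so it is worth confirming the marker round-trips through this tokenizer.)*"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode the assistant-turn marker without special tokens, exactly as Step 5 does,\n",
    "# and make sure it decodes back to the same string\n",
    "ids = tokenizer.encode(\"<|im_start|>assistant\", add_special_tokens=False)\n",
    "print(ids, \"->\", repr(tokenizer.decode(ids)))\n",
    "assert tokenizer.decode(ids) == \"<|im_start|>assistant\""
   ]
  },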
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## Step 5: Tokenize Data (messages format)"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def format_conversation(example):\n",
    "    messages = example[\"messages\"]\n",
    "    text = \"\"\n",
    "    for msg in messages:\n",
    "        role = msg[\"role\"]\n",
    "        content = msg.get(\"content\", \"\") or \"\"\n",
    "        tc = msg.get(\"tool_calls\", [])\n",
    "        if role == \"system\":\n",
    "            text += f\"<|im_start|>system\\n{content}<|im_end|>\\n\"\n",
    "        elif role == \"user\":\n",
    "            text += f\"<|im_start|>user\\n{content}<|im_end|>\\n\"\n",
    "        elif role == \"assistant\":\n",
    "            if tc:\n",
    "                for t in tc:\n",
    "                    fn = t[\"function\"]\n",
    "                    text += f\"<|im_start|>assistant\\n<tool_call>\\n<name>{fn['name']}</name>\\n<args>\\n{fn['arguments']}\\n</args>\\n</tool_call>\\n\"\n",
    "                if content:\n",
    "                    text += f\"{content}<|im_end|>\\n\"\n",
    "                else:\n",
    "                    text += \"<|im_end|>\\n\"\n",
    "            else:\n",
    "                text += f\"<|im_start|>assistant\\n{content}<|im_end|>\\n\"\n",
    "        elif role == \"tool\":\n",
    "            text += f\"<|im_start|>tool\\n{content}<|im_end|>\\n\"\n",
    "    # No trailing \"<|im_start|>assistant\\n\" here: that bare generation prompt belongs\n",
    "    # at inference time. During training it would register as the last assistant turn\n",
    "    # and make the masking below zero out every real token.\n",
    "    return {\"text\": text}\n",
    "\n",
    "print(\"Formatting...\")\n",
    "formatted = raw.map(format_conversation)\n",
    "\n",
    "def tokenize(example):\n",
    "    tokens = tokenizer(example[\"text\"], truncation=True, max_length=MAX_LENGTH, padding=\"max_length\")\n",
    "    tokens[\"labels\"] = tokens[\"input_ids\"].copy()\n",
    "    input_ids = tokens[\"input_ids\"]\n",
    "    labels = tokens[\"labels\"]\n",
    "\n",
    "    # Find the last assistant turn; everything up to and including its marker is\n",
    "    # masked, so loss is computed only on the final assistant response\n",
    "    asr = tokenizer.encode(\"<|im_start|>assistant\", add_special_tokens=False)\n",
    "    found = -1\n",
    "    for i in range(len(input_ids) - len(asr) + 1):\n",
    "        if input_ids[i:i+len(asr)] == asr:\n",
    "            found = i\n",
    "\n",
    "    if found >= 0:\n",
    "        for j in range(found + len(asr)):\n",
    "            labels[j] = -100\n",
    "\n",
    "    # Mask padding\n",
    "    for j, m in enumerate(tokens[\"attention_mask\"]):\n",
    "        if m == 0:\n",
    "            labels[j] = -100\n",
    "\n",
    "    return tokens\n",
    "\n",
    "tokenized = formatted.map(tokenize, remove_columns=formatted.column_names, desc=\"Tokenizing\")\n",
    "# Drop fully-masked examples (e.g. the assistant turn was truncated away)\n",
    "tokenized = tokenized.filter(lambda x: any(l != -100 for l in x[\"labels\"]))\n",
    "\n",
    "# Split with a fixed seed for a reproducible validation set\n",
    "split = tokenized.train_test_split(test_size=0.05, seed=42)\n",
    "train_ds = split[\"train\"]\n",
    "val_ds = split[\"test\"]\n",
    "print(f\"Train: {len(train_ds)}, Val: {len(val_ds)}\")"
   ]
  },
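  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["*(Optional verification, not in the original notebook: decode only the unmasked label positions of one example; if the masking is right, this prints just the final assistant response.)*"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokens with label != -100 are the only ones that contribute to the loss\n",
    "sample = train_ds[0]\n",
    "kept = [t for t, l in zip(sample[\"input_ids\"], sample[\"labels\"]) if l != -100]\n",
    "print(tokenizer.decode(kept))"
   ]
  },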
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## Step 6: Load Model + LoRA"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training\n",
    "from transformers import AutoModelForCausalLM, BitsAndBytesConfig\n",
    "\n",
    "# Quantization config for T4\n",
    "bnb_config = BitsAndBytesConfig(\n",
    "    load_in_4bit=True,\n",
    "    bnb_4bit_compute_dtype=torch.float16,\n",
    "    bnb_4bit_quant_type=\"nf4\",\n",
    "    bnb_4bit_use_double_quant=True,\n",
    ")\n",
    "\n",
    "print(\"Loading model...\")\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    MODEL_NAME,\n",
    "    quantization_config=bnb_config,\n",
    "    device_map=\"auto\",\n",
    "    trust_remote_code=True,\n",
    ")\n",
    "model.config.use_cache = False\n",
    "# Standard QLoRA prep: upcasts norms and enables input grads so gradient\n",
    "# checkpointing works with the frozen 4-bit base\n",
    "model = prepare_model_for_kbit_training(model)\n",
    "\n",
    "# LoRA config\n",
    "lora_cfg = LoraConfig(\n",
    "    r=32,\n",
    "    lora_alpha=64,\n",
    "    lora_dropout=0.05,\n",
    "    bias=\"none\",\n",
    "    task_type=TaskType.CAUSAL_LM,\n",
    "    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
    ")\n",
    "model = get_peft_model(model, lora_cfg)\n",
    "model.print_trainable_parameters()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## Step 7: Train"]
  },
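  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["*(Back-of-envelope arithmetic, added for orientation: with per-device batch 1 and 32 accumulation steps the effective batch is 32, so ~19K train examples give roughly 600 optimizer steps per epoch, ~1,800 across 3 epochs.)*"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "\n",
    "# Effective batch = per-device batch * gradient accumulation (mirrors the Trainer args below)\n",
    "effective_batch = 1 * 32\n",
    "steps_per_epoch = math.ceil(len(train_ds) / effective_batch)\n",
    "print(f\"Effective batch: {effective_batch}\")\n",
    "print(f\"Optimizer steps/epoch: {steps_per_epoch}; total over 3 epochs: {steps_per_epoch * 3}\")"
   ]
  },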
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from transformers import Trainer, TrainingArguments, default_data_collator\n",
    "\n",
    "OUTPUT_DIR = \"/kaggle/working/output\"\n",
    "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
    "\n",
    "# The dataset already carries padded input_ids and masked labels, so use the\n",
    "# default collator; DataCollatorForLanguageModeling would rebuild labels from\n",
    "# input_ids and throw away the assistant-only masking from Step 5\n",
    "collator = default_data_collator\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=TrainingArguments(\n",
    "        output_dir=OUTPUT_DIR,\n",
    "        per_device_train_batch_size=1,\n",
    "        gradient_accumulation_steps=32,  # Effective batch = 32\n",
    "        num_train_epochs=3,\n",
    "        learning_rate=1e-4,\n",
    "        fp16=True,  # T4 has no bf16 support\n",
    "        bf16=False,\n",
    "        warmup_ratio=0.05,\n",
    "        max_grad_norm=0.3,\n",
    "        logging_steps=10,\n",
    "        save_steps=100,\n",
    "        eval_strategy=\"steps\",  # required for eval_steps to take effect\n",
    "        eval_steps=100,\n",
    "        save_total_limit=2,\n",
    "        gradient_checkpointing=True,\n",
    "        gradient_checkpointing_kwargs={\"use_reentrant\": False},\n",
    "        optim=\"paged_adamw_8bit\",\n",
    "        remove_unused_columns=False,\n",
    "        report_to=\"none\",\n",
    "    ),\n",
    "    train_dataset=train_ds,\n",
    "    eval_dataset=val_ds,\n",
    "    data_collator=collator,\n",
    ")\n",
    "\n",
    "print(\"Starting training...\")\n",
    "print(f\"Trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}\")\n",
    "print(f\"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")\n",
    "\n",
    "trainer.train()\n",
    "print(\"✅ Training complete!\")"
   ]
  },
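  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["*(Optional, a small sketch: summarize the loss trajectory from the Trainer's log history once the run above finishes.)*"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# trainer.state.log_history collects the logged train/eval metrics\n",
    "logs = pd.DataFrame(trainer.state.log_history)\n",
    "cols = [c for c in (\"step\", \"loss\", \"eval_loss\") if c in logs.columns]\n",
    "print(logs[cols].tail(10))"
   ]
  },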
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## Step 8: Save & Upload"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gc\n",
    "import torch\n",
    "\n",
    "print(\"Saving adapter...\")\n",
    "adapter_path = \"/kaggle/working/final_adapter\"\n",
    "model.save_pretrained(adapter_path)\n",
    "\n",
    "# Free the quantized training model before loading the fp16 base,\n",
    "# otherwise the T4 runs out of memory during the merge\n",
    "del trainer, model\n",
    "gc.collect()\n",
    "torch.cuda.empty_cache()\n",
    "\n",
    "print(\"Merging with base model...\")\n",
    "from peft import PeftModel\n",
    "from transformers import AutoModelForCausalLM\n",
    "\n",
    "# device_map=\"auto\" offloads what doesn't fit in 16 GB VRAM to CPU RAM\n",
    "base = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map=\"auto\")\n",
    "merged = PeftModel.from_pretrained(base, adapter_path).merge_and_unload()\n",
    "\n",
    "merged_path = \"/kaggle/working/merged_model\"\n",
    "merged.save_pretrained(merged_path)\n",
    "tokenizer.save_pretrained(merged_path)\n",
    "\n",
    "print(f\"Model saved to {merged_path}\")\n",
    "\n",
    "print(\"Uploading to HuggingFace...\")\n",
    "from huggingface_hub import HfApi\n",
    "api = HfApi()\n",
    "api.upload_folder(\n",
    "    folder_path=merged_path,\n",
    "    repo_id=\"my-ai-stack/Stack-2.9-7B-finetuned\",\n",
    "    repo_type=\"model\",\n",
    ")\n",
    "print(\"🎉 Done! Model uploaded to my-ai-stack/Stack-2.9-7B-finetuned\")"
   ]
  },
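  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["*(Optional smoke test, a sketch with a made-up prompt: generate a few tokens from the merged model before trusting the upload.)*"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical prompt in the same ChatML layout used for training\n",
    "merged.config.use_cache = True\n",
    "prompt = \"<|im_start|>user\\nWrite a one-line Python hello world.<|im_end|>\\n<|im_start|>assistant\\n\"\n",
    "inputs = tokenizer(prompt, return_tensors=\"pt\").to(merged.device)\n",
    "out = merged.generate(**inputs, max_new_tokens=64, do_sample=False)\n",
    "print(tokenizer.decode(out[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True))"
   ]
  }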
 ],
 "metadata": {
  "kaggle": {
   "accelerator": "GPU",
   "data_sources": [],
   "docker_image_version": "latest",
   "gpu": "T4",
   "internet": "on",
   "license": ["apache-2.0"]
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
