
Commit 5b6023c

walidsobhie-code committed
Add Kaggle 7B smart-train notebook (20K examples, QLoRA T4)

1 parent 1eac752 commit 5b6023c

1 file changed

Lines changed: 371 additions & 0 deletions

@@ -0,0 +1,371 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 🚀 Stack 2.9 — 7B QLoRA Fine-tune on Kaggle (Smart 20K)\n",
    "\n",
    "**Base:** Qwen2.5-Coder-7B-Instruct  \n",
    "**Data:** my-ai-stack/stack-2-9-tool-20k-examples (20K smart examples)  \n",
    "**Output:** my-ai-stack/Stack-2.9-7B-finetuned  \n",
    "**Runtime:** GPU T4 16GB | **Time:** ~6-8 hours | **Cost:** FREE"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## ⚠️ Before Starting\n",
    "\n",
    "1. Go to **Add-ons → Secrets** and add:\n",
    "   - `HF_TOKEN` = your HuggingFace write token (starts with `hf_`)\n",
    "2. Set **Accelerator** to **GPU T4**\n",
    "3. Internet must be ON"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## Step 1: Clone & Install"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.chdir(\"/kaggle/working\")\n",
    "\n",
    "# Clone repo\n",
    "!git clone https://github.com/my-ai-stack/stack-2.9.git\n",
    "\n",
    "# Install. The version spec is quoted so the shell doesn't parse >= as a redirect.\n",
    "# flash-attn is dropped: FlashAttention-2 needs Ampere or newer, and the T4 is Turing.\n",
    "!pip install -q \"transformers>=4.40.0\" peft datasets bitsandbytes accelerate huggingface_hub\n",
    "\n",
    "# Verify\n",
    "import torch\n",
    "print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n",
    "print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## Step 2: Login to HuggingFace"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import login\n",
    "\n",
    "# Read the token from the Kaggle secret configured above\n",
    "from kaggle_secrets import UserSecretsClient\n",
    "user_secrets = UserSecretsClient()\n",
    "hf_token = user_secrets.get_secret(\"HF_TOKEN\")\n",
    "login(hf_token)\n",
    "print(\"✅ Logged into HuggingFace\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## Step 3: Download Smart 20K Dataset"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import hf_hub_download\n",
    "import os\n",
    "\n",
    "DATA_DIR = \"/kaggle/working/data\"\n",
    "os.makedirs(DATA_DIR, exist_ok=True)\n",
    "\n",
    "# Download the smart 20K dataset from the org\n",
    "print(\"Downloading smart 20K dataset...\")\n",
    "path = hf_hub_download(\n",
    "    repo_id=\"my-ai-stack/stack-2-9-tool-20k-examples\",\n",
    "    filename=\"tool_examples_smart_20k.jsonl\",\n",
    "    repo_type=\"dataset\",\n",
    "    local_dir=DATA_DIR,\n",
    ")\n",
    "print(f\"Dataset: {path}\")\n",
    "\n",
    "# Count examples (one JSON object per line)\n",
    "with open(path) as f:\n",
    "    n_lines = sum(1 for _ in f)\n",
    "print(f\"Examples: {n_lines:,}\")"
   ]
  },
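  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["*(Optional sanity check, not in the original notebook: peek at one raw example to confirm the `messages` schema, i.e. role, content, and optional `tool_calls`, that Step 5 assumes.)*"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Print the first example; Step 5 expects each JSONL line to carry a \"messages\" list\n",
    "with open(path) as f:\n",
    "    first = json.loads(f.readline())\n",
    "print(json.dumps(first, indent=2)[:1500])"
   ]
  },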
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## Step 4: Setup Data Pipeline"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import torch\n",
    "from transformers import AutoTokenizer\n",
    "from datasets import load_dataset\n",
    "\n",
    "MODEL_NAME = \"Qwen/Qwen2.5-Coder-7B-Instruct\"\n",
    "MAX_LENGTH = 2048  # T4 fits this with QLoRA\n",
    "\n",
    "print(\"Loading tokenizer...\")\n",
    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)\n",
    "tokenizer.padding_side = \"right\"\n",
    "tokenizer.pad_token = tokenizer.eos_token\n",
    "\n",
    "print(\"Loading dataset...\")\n",
    "raw = load_dataset(\"json\", data_files=path, split=\"train\")\n",
    "print(f\"Loaded {len(raw)} examples\")"
   ]
  },
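  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["*(Optional check, added as a sketch: the label masking in Step 5 matches the raw token sequence of the assistant marker, so it is worth confirming the marker round-trips through this tokenizer.)*"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode the assistant-turn marker without special tokens, exactly as Step 5 does,\n",
    "# and make sure it decodes back to the same string\n",
    "ids = tokenizer.encode(\"<|im_start|>assistant\", add_special_tokens=False)\n",
    "print(ids, \"->\", repr(tokenizer.decode(ids)))\n",
    "assert tokenizer.decode(ids) == \"<|im_start|>assistant\""
   ]
  },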
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## Step 5: Tokenize Data (messages format)"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def format_conversation(example):\n",
    "    messages = example[\"messages\"]\n",
    "    text = \"\"\n",
    "    for msg in messages:\n",
    "        role = msg[\"role\"]\n",
    "        content = msg.get(\"content\", \"\") or \"\"\n",
    "        tc = msg.get(\"tool_calls\", [])\n",
    "        if role == \"system\":\n",
    "            text += f\"<|im_start|>system\\n{content}<|im_end|>\\n\"\n",
    "        elif role == \"user\":\n",
    "            text += f\"<|im_start|>user\\n{content}<|im_end|>\\n\"\n",
    "        elif role == \"assistant\":\n",
    "            if tc:\n",
    "                for t in tc:\n",
    "                    fn = t[\"function\"]\n",
    "                    text += f\"<|im_start|>assistant\\n<tool_call>\\n<name>{fn['name']}</name>\\n<args>\\n{fn['arguments']}\\n</args>\\n</tool_call>\\n\"\n",
    "                if content:\n",
    "                    text += f\"{content}<|im_end|>\\n\"\n",
    "                else:\n",
    "                    text += \"<|im_end|>\\n\"\n",
    "            else:\n",
    "                text += f\"<|im_start|>assistant\\n{content}<|im_end|>\\n\"\n",
    "        elif role == \"tool\":\n",
    "            text += f\"<|im_start|>tool\\n{content}<|im_end|>\\n\"\n",
    "    # No trailing \"<|im_start|>assistant\\n\" here: that bare generation prompt belongs\n",
    "    # at inference time. During training it would register as the last assistant turn\n",
    "    # and make the masking below zero out every real token.\n",
    "    return {\"text\": text}\n",
    "\n",
    "print(\"Formatting...\")\n",
    "formatted = raw.map(format_conversation)\n",
    "\n",
    "def tokenize(example):\n",
    "    tokens = tokenizer(example[\"text\"], truncation=True, max_length=MAX_LENGTH, padding=\"max_length\")\n",
    "    tokens[\"labels\"] = tokens[\"input_ids\"].copy()\n",
    "    input_ids = tokens[\"input_ids\"]\n",
    "    labels = tokens[\"labels\"]\n",
    "\n",
    "    # Find the last assistant turn; everything up to and including its marker is\n",
    "    # masked, so loss is computed only on the final assistant response\n",
    "    asr = tokenizer.encode(\"<|im_start|>assistant\", add_special_tokens=False)\n",
    "    found = -1\n",
    "    for i in range(len(input_ids) - len(asr) + 1):\n",
    "        if input_ids[i:i+len(asr)] == asr:\n",
    "            found = i\n",
    "\n",
    "    if found >= 0:\n",
    "        for j in range(found + len(asr)):\n",
    "            labels[j] = -100\n",
    "\n",
    "    # Mask padding\n",
    "    for j, m in enumerate(tokens[\"attention_mask\"]):\n",
    "        if m == 0:\n",
    "            labels[j] = -100\n",
    "\n",
    "    return tokens\n",
    "\n",
    "tokenized = formatted.map(tokenize, remove_columns=formatted.column_names, desc=\"Tokenizing\")\n",
    "# Drop fully-masked examples (e.g. the assistant turn was truncated away)\n",
    "tokenized = tokenized.filter(lambda x: any(l != -100 for l in x[\"labels\"]))\n",
    "\n",
    "# Split with a fixed seed for a reproducible validation set\n",
    "split = tokenized.train_test_split(test_size=0.05, seed=42)\n",
    "train_ds = split[\"train\"]\n",
    "val_ds = split[\"test\"]\n",
    "print(f\"Train: {len(train_ds)}, Val: {len(val_ds)}\")"
   ]
  },
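  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["*(Optional verification, not in the original notebook: decode only the unmasked label positions of one example; if the masking is right, this prints just the final assistant response.)*"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokens with label != -100 are the only ones that contribute to the loss\n",
    "sample = train_ds[0]\n",
    "kept = [t for t, l in zip(sample[\"input_ids\"], sample[\"labels\"]) if l != -100]\n",
    "print(tokenizer.decode(kept))"
   ]
  },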
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## Step 6: Load Model + LoRA"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training\n",
    "from transformers import AutoModelForCausalLM, BitsAndBytesConfig\n",
    "\n",
    "# Quantization config for T4\n",
    "bnb_config = BitsAndBytesConfig(\n",
    "    load_in_4bit=True,\n",
    "    bnb_4bit_compute_dtype=torch.float16,\n",
    "    bnb_4bit_quant_type=\"nf4\",\n",
    "    bnb_4bit_use_double_quant=True,\n",
    ")\n",
    "\n",
    "print(\"Loading model...\")\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    MODEL_NAME,\n",
    "    quantization_config=bnb_config,\n",
    "    device_map=\"auto\",\n",
    "    trust_remote_code=True,\n",
    ")\n",
    "model.config.use_cache = False\n",
    "# Standard QLoRA prep: upcasts norms and enables input grads so gradient\n",
    "# checkpointing works with the frozen 4-bit base\n",
    "model = prepare_model_for_kbit_training(model)\n",
    "\n",
    "# LoRA config\n",
    "lora_cfg = LoraConfig(\n",
    "    r=32,\n",
    "    lora_alpha=64,\n",
    "    lora_dropout=0.05,\n",
    "    bias=\"none\",\n",
    "    task_type=TaskType.CAUSAL_LM,\n",
    "    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
    ")\n",
    "model = get_peft_model(model, lora_cfg)\n",
    "model.print_trainable_parameters()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## Step 7: Train"]
  },
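  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["*(Back-of-envelope arithmetic, added for orientation: with per-device batch 1 and 32 accumulation steps the effective batch is 32, so ~19K train examples give roughly 600 optimizer steps per epoch, ~1,800 across 3 epochs.)*"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "\n",
    "# Effective batch = per-device batch * gradient accumulation (mirrors the Trainer args below)\n",
    "effective_batch = 1 * 32\n",
    "steps_per_epoch = math.ceil(len(train_ds) / effective_batch)\n",
    "print(f\"Effective batch: {effective_batch}\")\n",
    "print(f\"Optimizer steps/epoch: {steps_per_epoch}; total over 3 epochs: {steps_per_epoch * 3}\")"
   ]
  },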
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from transformers import Trainer, TrainingArguments, default_data_collator\n",
    "\n",
    "OUTPUT_DIR = \"/kaggle/working/output\"\n",
    "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
    "\n",
    "# The dataset already carries padded input_ids and masked labels, so use the\n",
    "# default collator; DataCollatorForLanguageModeling would rebuild labels from\n",
    "# input_ids and throw away the assistant-only masking from Step 5\n",
    "collator = default_data_collator\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=TrainingArguments(\n",
    "        output_dir=OUTPUT_DIR,\n",
    "        per_device_train_batch_size=1,\n",
    "        gradient_accumulation_steps=32,  # Effective batch = 32\n",
    "        num_train_epochs=3,\n",
    "        learning_rate=1e-4,\n",
    "        fp16=True,  # T4 has no bf16 support\n",
    "        bf16=False,\n",
    "        warmup_ratio=0.05,\n",
    "        max_grad_norm=0.3,\n",
    "        logging_steps=10,\n",
    "        save_steps=100,\n",
    "        eval_strategy=\"steps\",  # required for eval_steps to take effect\n",
    "        eval_steps=100,\n",
    "        save_total_limit=2,\n",
    "        gradient_checkpointing=True,\n",
    "        gradient_checkpointing_kwargs={\"use_reentrant\": False},\n",
    "        optim=\"paged_adamw_8bit\",\n",
    "        remove_unused_columns=False,\n",
    "        report_to=\"none\",\n",
    "    ),\n",
    "    train_dataset=train_ds,\n",
    "    eval_dataset=val_ds,\n",
    "    data_collator=collator,\n",
    ")\n",
    "\n",
    "print(\"Starting training...\")\n",
    "print(f\"Trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}\")\n",
    "print(f\"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")\n",
    "\n",
    "trainer.train()\n",
    "print(\"✅ Training complete!\")"
   ]
  },
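  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["*(Optional, a small sketch: summarize the loss trajectory from the Trainer's log history once the run above finishes.)*"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# trainer.state.log_history collects the logged train/eval metrics\n",
    "logs = pd.DataFrame(trainer.state.log_history)\n",
    "cols = [c for c in (\"step\", \"loss\", \"eval_loss\") if c in logs.columns]\n",
    "print(logs[cols].tail(10))"
   ]
  },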
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## Step 8: Save & Upload"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gc\n",
    "import torch\n",
    "\n",
    "print(\"Saving adapter...\")\n",
    "adapter_path = \"/kaggle/working/final_adapter\"\n",
    "model.save_pretrained(adapter_path)\n",
    "\n",
    "# Free the quantized training model before loading the fp16 base,\n",
    "# otherwise the T4 runs out of memory during the merge\n",
    "del trainer, model\n",
    "gc.collect()\n",
    "torch.cuda.empty_cache()\n",
    "\n",
    "print(\"Merging with base model...\")\n",
    "from peft import PeftModel\n",
    "from transformers import AutoModelForCausalLM\n",
    "\n",
    "# device_map=\"auto\" offloads what doesn't fit in 16 GB VRAM to CPU RAM\n",
    "base = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map=\"auto\")\n",
    "merged = PeftModel.from_pretrained(base, adapter_path).merge_and_unload()\n",
    "\n",
    "merged_path = \"/kaggle/working/merged_model\"\n",
    "merged.save_pretrained(merged_path)\n",
    "tokenizer.save_pretrained(merged_path)\n",
    "\n",
    "print(f\"Model saved to {merged_path}\")\n",
    "\n",
    "print(\"Uploading to HuggingFace...\")\n",
    "from huggingface_hub import HfApi\n",
    "api = HfApi()\n",
    "api.upload_folder(\n",
    "    folder_path=merged_path,\n",
    "    repo_id=\"my-ai-stack/Stack-2.9-7B-finetuned\",\n",
    "    repo_type=\"model\",\n",
    ")\n",
    "print(\"🎉 Done! Model uploaded to my-ai-stack/Stack-2.9-7B-finetuned\")"
   ]
  },
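  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["*(Optional smoke test, a sketch with a made-up prompt: generate a few tokens from the merged model before trusting the upload.)*"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical prompt in the same ChatML layout used for training\n",
    "merged.config.use_cache = True\n",
    "prompt = \"<|im_start|>user\\nWrite a one-line Python hello world.<|im_end|>\\n<|im_start|>assistant\\n\"\n",
    "inputs = tokenizer(prompt, return_tensors=\"pt\").to(merged.device)\n",
    "out = merged.generate(**inputs, max_new_tokens=64, do_sample=False)\n",
    "print(tokenizer.decode(out[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True))"
   ]
  }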
 ],
 "metadata": {
  "kaggle": {
   "accelerator": "GPU",
   "data_sources": [],
   "docker_image_version": "latest",
   "gpu": "T4",
   "internet": "on",
   "license": ["apache-2.0"]
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
