|
5 | 5 | "metadata": {}, |
6 | 6 | "source": [ |
7 | 7 | "# 🎯 Stack 2.9 — 128K Context Fine-tuning\n", |
8 | | - "Fine-tune Qwen2.5-Coder-1.5B from 32K → 128K context\n", |
9 | 8 | "\n", |
10 | | - "**Runtime:** GPU (T4 16GB recommended) | **Time:** ~2-3 hours" |
| 9 | + "Fine-tune **Qwen2.5-Coder-1.5B** with **packed 128K context windows**.\n", |
| 10 | + "\n", |
| 11 | + "**Key innovation:** Instead of training on short ~500-token examples, we **pack 200+ examples** into each 128K window. This multiplies training signal and teaches the model to track tool state across long, multi-turn interactions.\n", |
| 12 | + "\n", |
| 13 | + "**Runtime:** Runtime → Change runtime type → **GPU (T4 16GB recommended)**\n", |
| 14 | + "**Time:** ~6-8 hours on free Colab T4" |
11 | 15 | ] |
12 | 16 | }, |
13 | 17 | { |
14 | 18 | "cell_type": "markdown", |
15 | 19 | "metadata": {}, |
16 | | - "source": [ |
17 | | - "## Step 1: Clone Stack 2.9 & Install Dependencies" |
18 | | - ] |
| 20 | + "source": ["## Step 1: Clone Stack 2.9 & Install Dependencies"] |
19 | 21 | }, |
20 | 22 | { |
21 | 23 | "cell_type": "code", |
22 | 24 | "execution_count": null, |
23 | 25 | "metadata": {}, |
24 | 26 | "outputs": [], |
25 | 27 | "source": [ |
| 28 | + "# Clone the repo (gets the fixed training script)\n", |
26 | 29 | "!git clone https://github.com/my-ai-stack/stack-2.9.git\n", |
27 | 30 | "cd stack-2.9\n", |
28 | | - "!pip install -q transformers peft datasets bitsandbytes accelerate huggingface_hub\n", |
29 | | - "!pip install -q scipy torch --upgrade" |
| 31 | + "\n", |
| 32 | + "# Install all dependencies\n", |
| 33 | + "!pip install -q transformers peft datasets bitsandbytes>=0.46.1 accelerate huggingface_hub scipy\n", |
| 34 | + "!pip install -q torch --upgrade\n", |
| 35 | + "\n", |
| 36 | + "print('✅ Dependencies installed')" |
30 | 37 | ] |
31 | 38 | }, |
32 | 39 | { |
33 | 40 | "cell_type": "markdown", |
34 | 41 | "metadata": {}, |
35 | | - "source": [ |
36 | | - "## Step 2: Login to HuggingFace (push weights later)" |
37 | | - ] |
| 42 | + "source": ["## Step 2: Login to HuggingFace\n\nGet your token at: https://huggingface.co/settings/tokens"] |
38 | 43 | }, |
39 | 44 | { |
40 | 45 | "cell_type": "code", |
|
43 | 48 | "outputs": [], |
44 | 49 | "source": [ |
45 | 50 | "from huggingface_hub import login\n", |
46 | | - "# Get your token at: https://huggingface.co/settings/tokens\n", |
47 | | - "login(token=\"YOUR_HF_TOKEN\") # ← Replace with your token" |
| 51 | + "# 👇 Replace with YOUR HuggingFace token\n", |
| 52 | + "login(token=\"YOUR_HF_TOKEN_HERE\") # ← 🔴 PUT YOUR HF TOKEN HERE\n", |
| 53 | + "print('✅ Logged into HuggingFace')" |
48 | 54 | ] |
49 | 55 | }, |
50 | 56 | { |
51 | 57 | "cell_type": "markdown", |
52 | 58 | "metadata": {}, |
53 | 59 | "source": [ |
54 | | - "## Step 3: Mount Google Drive (optional — for saving checkpoints)" |
| 60 | + "## Step 3: Mount Google Drive\n\nTraining checkpoints and the final adapter will be saved here." |
55 | 61 | ] |
56 | 62 | }, |
57 | 63 | { |
|
62 | 68 | "source": [ |
63 | 69 | "from google.colab import drive\n", |
64 | 70 | "drive.mount('/content/drive')\n", |
65 | | - "OUTPUT_DIR = \"/content/drive/MyDrive/stack-2.9-128k-output\"" |
| 71 | + "OUTPUT_DIR = '/content/drive/MyDrive/stack-2.9-128k-output'\n", |
| 72 | + "import os; os.makedirs(OUTPUT_DIR, exist_ok=True)\n", |
| 73 | + "print(f'📁 Output directory: {OUTPUT_DIR}')" |
66 | 74 | ] |
67 | 75 | }, |
68 | 76 | { |
69 | 77 | "cell_type": "markdown", |
70 | 78 | "metadata": {}, |
71 | 79 | "source": [ |
72 | | - "## Step 4: Run 128K Context Fine-tuning" |
| 80 | + "## Step 4: Download Training Data\n\nWe use the dataset uploaded to HuggingFace Hub — 1500 tool-calling examples, packed into 128K sequences." |
73 | 81 | ] |
74 | 82 | }, |
75 | 83 | { |
|
78 | 86 | "metadata": {}, |
79 | 87 | "outputs": [], |
80 | 88 | "source": [ |
81 | | - "import subprocess\n", |
82 | | - "result = subprocess.run([\n", |
83 | | - " \"python3\", \"training/train_extended_context.py\",\n", |
84 | | - " \"--model-path\", \"my-ai-stack/Stack-2-9-finetuned\",\n", |
85 | | - " \"--data-path\", \"training/training-data/tool_examples_combined.jsonl\",\n", |
86 | | - " \"--output-dir\", OUTPUT_DIR,\n", |
87 | | - " \"--context-length\", \"131072\",\n", |
88 | | - " \"--lora-rank\", \"64\",\n", |
89 | | - " \"--epochs\", \"3\",\n", |
90 | | - " \"--push-to-hub\",\n", |
91 | | - " \"--hub-model-id\", \"YOUR_USERNAME/stack-2.9-128k\"\n", |
92 | | - "], cwd=\"/content/stack-2.9\")\n", |
93 | | - "print(result.stdout)\n", |
94 | | - "print(result.stderr)" |
| 89 | + "import huggingface_hub\n", |
| 90 | + "\n", |
| 91 | + "DATA_FILE = '/content/tool_examples.jsonl'\n", |
| 92 | + "\n", |
| 93 | + "print('Downloading training data from HuggingFace...')\n", |
| 94 | + "hf_id = 'walidsobhie/stack-2-9-tool-examples'\n", |
| 95 | + "path = huggingface_hub.hf_hub_download(\n", |
| 96 | + " repo_id=hf_id,\n", |
| 97 | + " filename='tool_examples_combined.jsonl',\n", |
| 98 | + " repo_type='dataset',\n", |
| 99 | + " local_dir='/content/',\n", |
| 100 | + " local_dir_use_symlinks=False,\n", |
| 101 | + ")\n", |
| 102 | + "import shutil\n", |
| 103 | + "shutil.move(path, DATA_FILE)\n", |
| 104 | + "print(f'✅ Dataset ready: {DATA_FILE}')\n", |
| 105 | + "\n", |
| 106 | + "# Quick sanity check\n", |
| 107 | + "import json\n", |
| 108 | + "with open(DATA_FILE) as f:\n", |
| 109 | + " lines = f.readlines()\n", |
| 110 | + "print(f' Total examples: {len(lines)}')\n", |
| 111 | + "ex = json.loads(lines[0])\n", |
| 112 | + "print(f' Keys: {list(ex.keys())}')" |
95 | 113 | ] |
96 | 114 | }, |
97 | 115 | { |
98 | 116 | "cell_type": "markdown", |
99 | 117 | "metadata": {}, |
100 | 118 | "source": [ |
101 | | - "---\n", |
| 119 | + "## Step 5: Run 128K Packed Context Fine-tuning\n\n**This cell runs the full training. On free Colab T4 it takes ~6-8 hours.**\n", |
102 | 120 | "\n", |
103 | | - "## Alternative: Run on Base Qwen Model (if HF model not loaded)\n", |
104 | | - "\n", |
105 | | - "If the fine-tuned model isn't available, use the base model:" |
| 121 | + "If Colab disconnects, your checkpoints are safe in Google Drive. Reconnect and re-run this cell — it will resume from the last checkpoint." |
106 | 122 | ] |
107 | 123 | }, |
108 | 124 | { |
|
111 | 127 | "metadata": {}, |
112 | 128 | "outputs": [], |
113 | 129 | "source": [ |
114 | | - "# Change --model-path to:\n", |
115 | | - "# \"Qwen/Qwen2.5-Coder-1.5B\"\n", |
116 | | - "# And add --push-to-hub with your own model ID" |
| 130 | + "import subprocess\n", |
| 131 | + "\n", |
| 132 | + "# Run the fixed training script with packing enabled\n", |
| 133 | + "result = subprocess.run([\n", |
| 134 | + " \"python3\", \"training/train_extended_context.py\",\n", |
| 135 | + " \"--model-path\", \"Qwen/Qwen2.5-Coder-1.5B\",\n", |
| 136 | + " \"--data-path\", \"/content/tool_examples.jsonl\",\n", |
| 137 | + " \"--output-dir\", OUTPUT_DIR,\n", |
| 138 | + " \"--context-length\", \"131072\",\n", |
| 139 | + " \"--lora-rank\", \"32\",\n", |
| 140 | + " \"--epochs\", \"3\",\n", |
| 141 | + " \"--batch-size\", \"1\",\n", |
| 142 | + " \"--grad-accum\", \"16\",\n", |
| 143 | + " \"--lr\", \"2e-4\",\n", |
| 144 | + " \"--use-packing\",\n", |
| 145 | + " \"--push-to-hub\",\n", |
| 146 | + " \"--hub-model-id\", \"walidsobhie/stack-2.9-128k-context\"\n", |
| 147 | + "], cwd=\"/content/stack-2.9\")\n", |
| 148 | + "\n", |
| 149 | + "print('STDOUT:', result.stdout)\n", |
| 150 | + "print('STDERR:', result.stderr[-3000:] if result.stderr else '(none)')" |
117 | 151 | ] |
118 | 152 | } |
119 | 153 | ], |
120 | 154 | "metadata": { |
121 | | - "accelerator": "GPU", |
122 | 155 | "colab": { |
123 | 156 | "provenance": [], |
124 | | - "machine_shape": "hm" |
| 157 | + "name": "stack-2.9-128k-packed-training" |
125 | 158 | }, |
126 | 159 | "kernelspec": { |
127 | | - "display_name": "Python 3", |
128 | | - "language": "python", |
129 | | - "name": "python3" |
| 160 | + "name": "python3", |
| 161 | + "display_name": "Python 3" |
130 | 162 | }, |
131 | 163 | "language_info": { |
132 | 164 | "name": "python", |
|
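The "pack 200+ examples into each 128K window" step described in the notebook header can be sketched as a greedy concatenation over tokenized examples. This is a minimal illustration, assuming each example is already a list of token ids; `pack_examples`, `max_len`, and `eos_id` are illustrative names, not the training script's actual API:

```python
# Sketch of greedy sequence packing: concatenate tokenized examples
# (separated by an EOS token) into windows of at most max_len tokens.
# Hypothetical helper for illustration -- not the script's real interface.
def pack_examples(tokenized, max_len, eos_id):
    windows, current = [], []
    for ids in tokenized:
        candidate = ids + [eos_id]  # append EOS as an example separator
        # Start a new window when the next example would overflow this one
        if current and len(current) + len(candidate) > max_len:
            windows.append(current)
            current = []
        current.extend(candidate)
    if current:
        windows.append(current)
    return windows

# Toy token-id lists standing in for tokenized tool-calling examples
examples = [[1, 2, 3], [4, 5], [6, 7, 8, 9], [10]]
packed = pack_examples(examples, max_len=10, eos_id=0)
# → [[1, 2, 3, 0, 4, 5, 0], [6, 7, 8, 9, 0, 10, 0]]
```

In the real run the same idea applies with `max_len=131072` and the tokenizer's actual EOS id, which is how ~500-token examples get multiplied into a few hundred per training window.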