
Commit c79f971

refactored bad training code for teacher model
1 parent 5f3b970 commit c79f971

16 files changed

Lines changed: 345 additions & 51 deletions

README.md

Lines changed: 24 additions & 1 deletion

@@ -31,4 +31,27 @@ More details coming soon!

- `evaluation/`: Evaluation metrics and analysis
- `tests/`: Unit tests
- `notebooks/`: Jupyter notebooks for exploration
- `scripts/`: Utility scripts

## Training

To run on `weftdrive`:

```bash
nohup /srv/gpurun.pl python src/senti_synth/cli/01_train_teacher.py configs/teacher/sst2_hf.yaml > ~/scratch/senti_synth/logs/$(date +%Y%m%d_%H%M).log 2>&1 &
```

### Setting up on weftdrive

1. SSH into weftdrive: `ssh paramkapur@weftdrive.private.reed.edu`
2. Clone the repository: `git clone https://github.com/paramkpr/senti_synth.git`
3. Set up conda: run `/srv/conda/bin/conda init`, then `source ~/.bashrc`
4. Activate the conda environment: `conda activate deep-learning`
   1. Check which packages are installed: `conda list`
   2. Install the project's dependencies: `pip install -r requirements.txt`
   3. Install the project itself: `pip install -e .`
5. Copy `data/clean` to `weftdrive:~/scratch/data/clean`: `scp -r data/clean weftdrive:~/scratch/data/`
   1. Ensure the config file points to the correct path: `dataset_path: "~/scratch/data/clean"`. A quick way to verify the copied data loads is sketched after this list.
6. Set up W&B:
   1. `export WANDB_API_KEY="..."`
   2. `python -m wandb login`
7. Run the training script: `nohup /srv/gpurun.pl python src/senti_synth/cli/01_train_teacher.py configs/teacher/sst2_hf.yaml > ~/scratch/senti_synth/logs/$(date +%Y%m%d_%H%M).log 2>&1 &`
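A quick check that the copied dataset is usable (a sketch, not part of the repo): `src/data.py` requires the `train`/`val`/`sanity`/`test` splits and `text`/`labels` columns, and `load_from_disk` may not expand `~`, hence the explicit `expanduser`.

```python
# Sanity check for step 5.1: confirm the copied dataset loads and has
# the splits and columns that ClassificationDataModule expects.
import os
from datasets import load_from_disk

ds = load_from_disk(os.path.expanduser("~/scratch/data/clean"))
assert all(s in ds for s in ["train", "val", "sanity", "test"]), list(ds.keys())
assert all(c in ds["train"].column_names for c in ["text", "labels"])
print({split: ds[split].num_rows for split in ds})
```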

configs/deberta_large_sst2.yaml

Lines changed: 0 additions & 20 deletions
This file was deleted.
File renamed without changes.

configs/teacher/sst2_hf.yaml

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@

```yaml
model:
  model_name: "microsoft/deberta-v3-base"
  num_labels: 2
  use_fast_tokenizer: true

data:
  dataset_path: "~/scratch/data/clean"  # Local path loaded via datasets.load_from_disk
  max_len: 32
  train_split: "train"
  validation_split: "val"
  test_split: "test"

training:
  output_dir: "runs/teacher/deberta_v3_base"  # Specific output for this run
  overwrite_output_dir: true
  run_name: "teacher_sst2_deberta_v3_base_run"  # Optional W&B/TensorBoard run name

  # Reporting
  report_to: "wandb"
  wandb_project: "senti_synth_teacher"

  # Batching & Epochs
  per_device_train_batch_size: 16
  per_device_eval_batch_size: 32
  gradient_accumulation_steps: 1
  num_train_epochs: 3

  # Optimizer & Scheduler
  learning_rate: 3e-5
  warmup_ratio: 0.1

  # Logging, Saving, Evaluation
  logging_steps: 50
  eval_steps: 200  # Evaluate every N steps
  save_steps: 200  # Save a checkpoint every N steps
  save_total_limit: 2  # Keep only the best and the latest checkpoints
  load_best_model_at_end: true  # Load the best model found during training
  metric_for_best_model: "eval_f1"  # Metric that determines the "best" model
  greater_is_better: true

  # Hardware & Performance
  fp16: true  # Set to false if the GPU doesn't support FP16 or it causes issues

  # Callbacks
  use_early_stopping: true
  early_stopping_patience: 3
  early_stopping_threshold: 0.001  # Minimum improvement needed to reset patience

  # Optional: evaluate on the test set after training
  do_test_eval: true
```
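One pitfall worth noting when hand-editing this file: PyYAML's `safe_load` parses `3e-5` as a *string*, not a float (YAML 1.1 float syntax requires a dot in the mantissa), while `0.1` and `3.0e-5` parse as floats. A minimal sketch, independent of the repo code, showing the behavior and a defensive cast:

```python
# PyYAML scientific-notation gotcha: "3e-5" lacks a dot, so it stays a string.
import yaml

cfg = yaml.safe_load("learning_rate: 3e-5\nwarmup_ratio: 0.1")
print(type(cfg["learning_rate"]))  # <class 'str'>
print(type(cfg["warmup_ratio"]))   # <class 'float'>

lr = float(cfg["learning_rate"])   # cast defensively before use
```

Spelling the value as `3.0e-5` in the YAML avoids the cast entirely.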

notebooks/eda_sst_2.ipynb

Lines changed: 0 additions & 12 deletions
```diff
@@ -475,18 +475,6 @@
   "display_name": "Python (sentisynth)",
   "language": "python",
   "name": "auctionn"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.13.3"
  }
 },
 "nbformat": 4,
```

notebooks/teacher.ipynb

Lines changed: 0 additions & 12 deletions
```diff
@@ -165,18 +165,6 @@
   "display_name": "Python3.11 (sentisynth)",
   "language": "python",
   "name": "auctionn"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.2"
  }
 },
 "nbformat": 4,
```
File renamed without changes.

src/cli/01_train_teacher.py

Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@

```python
import typer
import os
import yaml
import logging
from pathlib import Path

import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import torch
from transformers import DataCollatorWithPadding, IntervalStrategy, TrainingArguments, Trainer

from src.models import build_teacher
from src.data import ClassificationDataModule


logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

app = typer.Typer()


def compute_metrics(p):
    """Computes metrics for HF Trainer."""
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')  # Assuming binary
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


@app.command()
def main(config_path: Path = typer.Argument(..., help="Path to YAML config")):
    cfg = yaml.safe_load(config_path.read_text())

    # --- SETUP W&B ---
    run_name = cfg['training'].get("run_name", f"teacher_train_{Path(cfg['training']['output_dir']).name}")
    report_to = cfg['training'].get("report_to", "none")  # Default to no reporting
    if report_to == "wandb":
        project_name = cfg['training'].get("wandb_project", "senti_synth_teacher")
        os.environ.pop("WANDB_DISABLED", None)  # Ensure it's enabled if requested
        os.environ["WANDB_PROJECT"] = project_name
        logger.info(f"Reporting to W&B project: {project_name}")
    else:
        os.environ["WANDB_DISABLED"] = "true"  # Explicitly disable
        logger.info("W&B reporting disabled.")

    # --- BUILD MODEL ---
    model, tokenizer = build_teacher(cfg['model'])

    # --- SETUP DATA ---
    data_module = ClassificationDataModule(cfg['data'], tokenizer)
    data_module.setup()
    train_dataset = data_module.get_train_dataset()
    eval_dataset = data_module.get_eval_dataset()

    # --- SETUP TRAINER ---
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args_dict = {
        "output_dir": cfg['training']['output_dir'],
        "overwrite_output_dir": cfg['training'].get("overwrite_output_dir", True),
        "do_train": True,
        "do_eval": eval_dataset is not None,  # Only do eval if eval_dataset exists
        "per_device_train_batch_size": cfg['training'].get("per_device_train_batch_size", 8),
        "per_device_eval_batch_size": cfg['training'].get("per_device_eval_batch_size", 16),
        "gradient_accumulation_steps": cfg['training'].get("gradient_accumulation_steps", 1),
        "num_train_epochs": cfg['training'].get("num_train_epochs", 3),
        "learning_rate": cfg['training'].get("learning_rate", 5e-5),
        "warmup_ratio": cfg['training'].get("warmup_ratio", 0.1),
        "fp16": cfg['training'].get("fp16", torch.cuda.is_available()),  # Enable FP16 by default if available
        "logging_dir": cfg['training'].get("logging_dir", f"{cfg['training']['output_dir']}/logs"),
        "logging_steps": cfg['training'].get("logging_steps", 100),
        "eval_strategy": IntervalStrategy.STEPS if eval_dataset is not None else IntervalStrategy.NO,
        "eval_steps": cfg['training'].get("eval_steps", 500),
        "save_strategy": IntervalStrategy.STEPS,
        "save_steps": cfg['training'].get("save_steps", 500),
        "save_total_limit": cfg['training'].get("save_total_limit", 2),
        "load_best_model_at_end": cfg['training'].get("load_best_model_at_end", eval_dataset is not None),  # Only if eval is done
        "metric_for_best_model": cfg['training'].get("metric_for_best_model", "eval_f1" if eval_dataset else None),
        "greater_is_better": cfg['training'].get("greater_is_better", True),
        "report_to": [report_to] if report_to != "none" else [],
        "run_name": run_name,
        "label_names": ["labels"],  # Standard practice
        "remove_unused_columns": False,  # We already removed them in the data module
        "ddp_find_unused_parameters": cfg['training'].get("ddp_find_unused_parameters", False),
    }

    training_args = TrainingArguments(**training_args_dict)
    logger.info(f"Training arguments: {training_args}. FP16 Enabled: {training_args.fp16}")

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics if eval_dataset is not None else None,
    )

    # --- TRAIN ---
    logger.info("Training model...")
    train_result = trainer.train()
    logger.info(f"Training results: {train_result}")

    # Save final model & metrics
    logger.info(f"Saving best model to {training_args.output_dir}")
    trainer.save_model()  # Saves the best model because load_best_model_at_end=True
    trainer.save_state()

    # Log final metrics
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)

    # Evaluate on test set if available
    test_dataset = data_module.get_test_dataset()
    if test_dataset and cfg['training'].get("do_test_eval", True):
        logger.info("Evaluating on test set...")
        test_metrics = trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")
        trainer.log_metrics("test", test_metrics)
        trainer.save_metrics("test", test_metrics)
        logger.info(f"Test set evaluation complete: {test_metrics}")

    logger.info("Script finished successfully.")


if __name__ == "__main__":
    app()
```
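As a quick illustration of what `compute_metrics` produces, here is a toy run (a sketch, not repo code; the namedtuple stands in for `transformers.EvalPrediction`, and `compute_metrics` is assumed to be pasted alongside or otherwise importable):

```python
# Toy check of compute_metrics on three examples.
import numpy as np
from collections import namedtuple

EvalPrediction = namedtuple("EvalPrediction", ["predictions", "label_ids"])

logits = np.array([[0.2, 0.8],    # argmax 1, label 1 -> true positive
                   [1.5, -0.5],   # argmax 0, label 0 -> true negative
                   [0.4, 0.6]])   # argmax 1, label 0 -> false positive
labels = np.array([1, 0, 0])

print(compute_metrics(EvalPrediction(logits, labels)))
# {'accuracy': 0.667, 'f1': 0.667, 'precision': 0.5, 'recall': 1.0}
```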

src/data.py

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@

```python
import logging
from datasets import load_from_disk, DatasetDict
from transformers import AutoTokenizer

logger = logging.getLogger(__name__)


class ClassificationDataModule:
    """
    Data module for classification tasks. Handles standard text classification setup.
    Used by Teacher (training/eval) and Student (eval).
    """
    def __init__(self, cfg: dict, tokenizer: AutoTokenizer):
        self.cfg = cfg
        self.tokenizer = tokenizer
        self.dataset_path = cfg.get("dataset_path", None)
        self.max_len = cfg.get("max_len", 128)

        self.tokenized_datasets = None

        self.required_splits = ["train", "val", "sanity", "test"]
        self.required_columns = ["text", "labels"]

    def _load_clean_dataset(self) -> DatasetDict:
        logger.info(f"Loading dataset from: {self.dataset_path}")
        dataset = load_from_disk(self.dataset_path)

        missing_splits = [s for s in self.required_splits if s not in dataset]
        missing_cols = [c for c in self.required_columns if c not in dataset["train"].column_names]

        if missing_splits:
            raise ValueError(f"Dataset missing splits: {missing_splits}")
        if missing_cols:
            raise ValueError(f"Dataset missing columns: {missing_cols}")

        return dataset

    def _tokenize_function(self, examples):
        """Tokenization function for map."""
        # Ensure the correct text column is used
        return self.tokenizer(
            examples["text"],
            truncation=True,
            padding=False,  # Trainer handles padding with the data collator
            max_length=self.max_len
        )

    def setup(self):
        """Loads and tokenizes the dataset."""
        if self.tokenized_datasets:
            return

        raw_datasets = self._load_clean_dataset()
        self.tokenized_datasets = raw_datasets.map(
            self._tokenize_function,
            batched=True,
            remove_columns=[c for c in raw_datasets["train"].column_names
                            if c not in ["input_ids", "attention_mask", "labels"]]
        )

        logger.info(f"Loaded and tokenized datasets with max length: {self.max_len}")
        logger.info(f"Columns in tokenized datasets: {self.tokenized_datasets['train'].column_names}")

    def get_train_dataset(self):
        if not self.tokenized_datasets:
            self.setup()
        return self.tokenized_datasets["train"]

    def get_eval_dataset(self):
        if not self.tokenized_datasets:
            self.setup()
        return self.tokenized_datasets["val"]

    def get_sanity_dataset(self):
        if not self.tokenized_datasets:
            self.setup()
        return self.tokenized_datasets["sanity"]

    def get_test_dataset(self):
        if not self.tokenized_datasets:
            self.setup()
        return self.tokenized_datasets["test"]
```
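A minimal usage sketch (not repo code), assuming a `DatasetDict` saved with `save_to_disk` that contains the four required splits; `~` is expanded explicitly since `load_from_disk` may not do so:

```python
# Wire the data module to a tokenizer and inspect the tokenized output.
import os
from transformers import AutoTokenizer
from src.data import ClassificationDataModule

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
cfg = {"dataset_path": os.path.expanduser("~/scratch/data/clean"), "max_len": 32}

dm = ClassificationDataModule(cfg, tokenizer)
dm.setup()
train = dm.get_train_dataset()
print(train.column_names)  # input_ids, attention_mask, labels (plus token_type_ids for some tokenizers)
```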

src/models.py

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@

```python
"""
Model factory for the teacher model.
"""
import logging
from transformers import AutoModelForSequenceClassification, AutoTokenizer

logger = logging.getLogger(__name__)


def build_teacher(cfg: dict):
    """
    Builds and returns the teacher model and tokenizer using Hugging Face.

    Args:
        cfg (dict): Configuration dictionary for the model, expecting keys like:
            - model_name (str): Hugging Face model identifier.
            - num_labels (int): Number of classification labels.

    Returns:
        tuple: (model, tokenizer)
    """
    model_name = cfg.get("model_name", "microsoft/deberta-v3-base")
    num_labels = cfg.get("num_labels", 2)
    use_fast_tokenizer = cfg.get("use_fast_tokenizer", True)

    logger.info(f"Loading teacher model: {model_name} with {num_labels} labels.")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    logger.info(f"Loading tokenizer for: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=use_fast_tokenizer)

    return model, tokenizer
```
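A quick way to exercise the factory against the new config (a sketch, not repo code):

```python
# Build the teacher from the config's model section and smoke-test a forward pass.
import yaml
from pathlib import Path
from src.models import build_teacher

cfg = yaml.safe_load(Path("configs/teacher/sst2_hf.yaml").read_text())
model, tokenizer = build_teacher(cfg["model"])

enc = tokenizer("a quick smoke test", return_tensors="pt")
print(model(**enc).logits.shape)  # torch.Size([1, 2]) for num_labels: 2
```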
