From 102f2c54e3eafcd8a15c1747f215072d6047891f Mon Sep 17 00:00:00 2001
From: BuildTools <unconfigured@null.spigotmc.org>
Date: Sat, 14 Mar 2026 16:01:54 -0400
Subject: [PATCH 1/2] quantization files for table results

---
 example/lptm_quantize_table_results.py   | 223 +++++++++++++++++++++++
 example/moment_quantize_table_results.py | 195 ++++++++++++++++++++
 2 files changed, 418 insertions(+)
 create mode 100644 example/lptm_quantize_table_results.py
 create mode 100644 example/moment_quantize_table_results.py

diff --git a/example/lptm_quantize_table_results.py b/example/lptm_quantize_table_results.py
new file mode 100644
index 0000000..3e3a671
--- /dev/null
+++ b/example/lptm_quantize_table_results.py
@@ -0,0 +1,223 @@
+import os
+import sys
+import time
+import copy
+import torch
+import numpy as np
+import torch.nn as nn
+from sklearn.metrics import mean_squared_error
+
+
+src_path = os.path.abspath(os.path.join("..", "src"))
+if src_path not in sys.path:
+    sys.path.insert(0, src_path)
+
+from samay.model import LPTMModel
+from samay.dataset import LPTMDataset
+
+
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print("Using device:", device)
+
+torch.backends.quantized.engine = "qnnpack"
+
+
+# Quantization
+
+def quantize_linear_layers(model, quantization_type="int8"):
+
+    if quantization_type == "int8":
+        return torch.quantization.quantize_dynamic(
+            model,
+            {nn.Linear},
+            dtype=torch.qint8
+        )
+
+    elif quantization_type == "float16":
+        return torch.quantization.quantize_dynamic(
+            model,
+            {nn.Linear},
+            dtype=torch.float16
+        )
+
+    else:
+        raise ValueError("Unsupported quantization type")
+
+
+def quantize(lptm_model, quant_type="int8"):
+    """Dynamically quantize lptm_model.model's nn.Linear layers on CPU; reassigns and returns the model."""
+    lptm_model.model.eval()
+    lptm_model.model = lptm_model.model.to("cpu")
+
+    with torch.no_grad():
+        lptm_model.model = quantize_linear_layers(
+            lptm_model.model,
+            quantization_type=quant_type
+        )
+
+    return lptm_model.model
+
+
+# Load LPTM
+
+config = {
+    "task_name": "forecasting",
+    "forecast_horizon": 192,
+    "head_dropout": 0,
+    "weight_decay": 0,
+    "max_patch": 16,
+    "freeze_encoder": True,
+    "freeze_embedder": True,
+    "freeze_head": False,
+    "freeze_segment": True,
+}
+
+base_model = LPTMModel(config)
+
+
+
+
+train_dataset = LPTMDataset(
+    name="ett",
+    datetime_col="date",
+    path="./data/data/ETTh1.csv",
+    mode="train",
+    horizon=192,
+)
+
+val_dataset = LPTMDataset(
+    name="ett",
+    datetime_col="date",
+    path="./data/data/ETTh1.csv",
+    mode="test",
+    horizon=192,
+)
+
+# Finetune
+
+# print("Finetuning LPTM...")
+# base_model = base_model.finetune(train_dataset)
+
+
+# Create models
+
+fp32_model = copy.deepcopy(base_model)
+
+fp16_model = copy.deepcopy(base_model)
+fp16_model.model = fp16_model.model.half().to(device)
+
+int8_model = copy.deepcopy(base_model)
+quantize(int8_model, "int8")
+
+print("Model setup complete.")
+
+
+# Evaluation Functions
+
+def compute_mse(lptm_model, dataset):
+
+    model = lptm_model.model
+    model.eval()
+
+    run_device = "cpu" if next(model.parameters()).dtype == torch.qint8 else device
+    model.to(run_device)
+
+    preds = []
+    trues = []
+
+    with torch.no_grad():
+
+        for i in range(len(dataset)):
+
+            sample = dataset[i]
+
+            x = sample[0]
+            y_future = sample[2]
+
+            dtype = next(model.parameters()).dtype
+            x = torch.tensor(x, dtype=dtype).unsqueeze(0).to(run_device)
+
+            output = model(x_enc=x)
+            pred = output.forecast.squeeze()
+
+            preds.append(pred.cpu().numpy().reshape(-1))
+            trues.append(np.array(y_future).reshape(-1))
+
+    preds = np.concatenate(preds)
+    trues = np.concatenate(trues)
+
+    print("Prediction shape:", preds.shape)
+    print("Target shape:", trues.shape)
+
+    return mean_squared_error(trues, preds)
+
+
+def model_size(lptm_model):
+    """Return the size in MB (1e6 bytes) of the model's serialized state_dict."""
+    import tempfile
+
+    with tempfile.TemporaryFile() as scratch:  # unique file; removed even if saving fails
+        torch.save(lptm_model.model.state_dict(), scratch)
+        return scratch.seek(0, os.SEEK_END) / 1e6
+
+
+def inference_time(lptm_model, dataset, runs=10):
+    """Return mean seconds per forward pass over `runs` timed calls on dataset[0], after 3 warm-up calls."""
+    model = lptm_model.model
+    model.eval()
+
+    run_device = "cpu" if next(model.parameters()).dtype == torch.qint8 else device
+    model.to(run_device)
+
+    sample = dataset[0]
+    x = sample[0]
+
+    dtype = next(model.parameters()).dtype
+    x = torch.tensor(x, dtype=dtype).unsqueeze(0).to(run_device)
+
+
+    for _ in range(3):
+        with torch.no_grad():
+            model(x_enc=x).forecast
+    if torch.device(run_device).type == "cuda":
+        torch.cuda.synchronize()  # finish warm-up kernels before starting the clock
+    start = time.perf_counter()
+    with torch.no_grad():
+        for _ in range(runs):
+            model(x_enc=x).forecast
+    if torch.device(run_device).type == "cuda":
+        torch.cuda.synchronize()  # CUDA launches are async; wait for the work to finish
+    end = time.perf_counter()
+    return (end - start) / runs
+
+
+# Run Experiments
+
+print("\nEvaluating FP32...")
+mse_fp32 = compute_mse(fp32_model, val_dataset)
+size_fp32 = model_size(fp32_model)
+time_fp32 = inference_time(fp32_model, val_dataset)
+
+print("\nEvaluating FP16...")
+mse_fp16 = compute_mse(fp16_model, val_dataset)
+size_fp16 = model_size(fp16_model)
+time_fp16 = inference_time(fp16_model, val_dataset)
+
+print("\nEvaluating INT8...")
+mse_int8 = compute_mse(int8_model, val_dataset)
+size_int8 = model_size(int8_model)
+time_int8 = inference_time(int8_model, val_dataset)
+
+
+# Results
+
+speedup_fp16 = time_fp32 / time_fp16
+speedup_int8 = time_fp32 / time_int8
+
+print("\nLPTM Results (ETTh1, Horizon=192)")
+print("-------------------------------------")
+
+print(f"Float32  | MSE: {mse_fp32:.5f} | Size: {size_fp32:.2f} MB | Speedup: 1.0x")
+print(f"Float16  | MSE: {mse_fp16:.5f} | Size: {size_fp16:.2f} MB | Speedup: {speedup_fp16:.2f}x")
+print(f"INT8     | MSE: {mse_int8:.5f} | Size: {size_int8:.2f} MB | Speedup: {speedup_int8:.2f}x")
\ No newline at end of file
diff --git a/example/moment_quantize_table_results.py b/example/moment_quantize_table_results.py
new file mode 100644
index 0000000..5e62f91
--- /dev/null
+++ b/example/moment_quantize_table_results.py
@@ -0,0 +1,195 @@
+import os
+import sys
+import time
+import copy
+import torch
+import numpy as np
+from sklearn.metrics import mean_squared_error
+
+
+
+src_path = os.path.abspath(os.path.join("..", "src"))
+if src_path not in sys.path:
+    sys.path.insert(0, src_path)
+
+from samay.model import MomentModel
+from samay.dataset import MomentDataset
+
+import torch
+import torch.nn as nn
+
+torch.backends.quantized.engine = "qnnpack"
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+def quantize_linear_layers(model, quantization_type="int8"):
+
+    if quantization_type == "int8":
+        return torch.quantization.quantize_dynamic(
+            model,
+            {nn.Linear},
+            dtype=torch.qint8
+        )
+
+    elif quantization_type == "float16":
+        return torch.quantization.quantize_dynamic(
+            model,
+            {nn.Linear},
+            dtype=torch.float16
+        )
+
+    else:
+        raise ValueError("Unsupported quantization type")
+    
+def quantize(moment_model, quant_type="int8", device="cpu"):
+    
+    moment_model.model.eval()
+    moment_model.model = moment_model.model.to(device)
+
+    with torch.no_grad():
+        moment_model.model = quantize_linear_layers(
+            moment_model.model,
+            quantization_type=quant_type
+        )
+
+    return moment_model.model
+
+
+repo = "AutonLab/MOMENT-1-large"
+
+config = {
+    "task_name": "forecasting",
+    "forecast_horizon": 192,
+    "head_dropout": 0.1,
+    "weight_decay": 0,
+    "freeze_encoder": True,
+    "freeze_embedder": True,
+    "freeze_head": False,
+}
+
+base_model = MomentModel(config=config, repo=repo)
+
+val_dataset = MomentDataset(
+    name="ett",
+    datetime_col="date",
+    path="./src/samay/models/moment/data/ETTh1.csv",
+    mode="test",
+    horizon_len=192,
+    freq=None,
+)
+
+# Create models
+
+fp32_model = copy.deepcopy(base_model)
+
+fp16_model = copy.deepcopy(base_model)
+fp16_model.model = fp16_model.model.half().to(device)
+
+int8_model = copy.deepcopy(base_model)
+quantize(int8_model, "int8", device="cpu")
+
+print("THIS IS DONE YES!!!!")
+
+
+# Evaluation functions
+
+
+def compute_mse(moment_model, dataset):
+    """Run the model on every dataset sample and return the MSE over the flattened forecasts."""
+    model = moment_model.model
+    model.eval()
+
+    run_device = device
+    model.to(run_device)
+
+    preds = []
+    trues = []
+
+    with torch.no_grad():
+        for i in range(len(dataset)):
+
+            sample = dataset[i]
+
+            x = sample[0]          # (64, 512)
+            y_future = sample[2]   # (64, 192)
+
+            dtype = next(model.parameters()).dtype
+            x = torch.tensor(x, dtype=dtype).unsqueeze(0).to(run_device)
+
+            output = model(x_enc=x)
+            pred = output.forecast.squeeze()  # (64,192)
+
+            preds.append(pred.cpu().numpy().reshape(-1))
+            trues.append(np.array(y_future).reshape(-1))
+
+    preds = np.concatenate(preds)
+    trues = np.concatenate(trues)
+
+    print("Final prediction shape:", preds.shape)
+    print("Final target shape:", trues.shape)
+
+    return mean_squared_error(trues, preds)
+
+def model_size(moment_model):
+
+    torch.save(moment_model.model.state_dict(), "temp.pt")
+    size = os.path.getsize("temp.pt") / 1e6
+    os.remove("temp.pt")
+
+    return size
+
+
+def inference_time(moment_model, dataset, runs=10):
+    """Return mean wall-clock seconds per forward pass over `runs` calls on dataset[0]."""
+    model = moment_model.model
+    model.eval()
+
+    run_device = device
+    model.to(run_device)
+
+    sample = dataset[0]
+    x = sample[0]
+
+    dtype = next(model.parameters()).dtype
+    x = torch.tensor(x, dtype=dtype).unsqueeze(0).to(run_device)
+
+    start = time.time()
+    with torch.no_grad():
+        for _ in range(runs):
+            model(x_enc=x).forecast
+    if torch.device(run_device).type == "cuda":
+        torch.cuda.synchronize()  # CUDA launches are async; wait before stopping the clock
+    end = time.time()
+
+    return (end - start) / runs
+
+
+# Run experiments
+
+print("Evaluating FP32...")
+mse_fp32 = compute_mse(fp32_model, val_dataset)
+size_fp32 = model_size(fp32_model)
+time_fp32 = inference_time(fp32_model, val_dataset)
+
+print("Evaluating FP16...")
+mse_fp16 = compute_mse(fp16_model, val_dataset)
+size_fp16 = model_size(fp16_model)
+time_fp16 = inference_time(fp16_model, val_dataset)
+
+print("Evaluating INT8...")
+mse_int8 = compute_mse(int8_model, val_dataset)
+size_int8 = model_size(int8_model)
+time_int8 = inference_time(int8_model, val_dataset)
+
+
+speedup_fp16 = time_fp32 / time_fp16
+speedup_int8 = time_fp32 / time_int8
+
+# RESULTS
+
+print("\nMOMENT Results (ETTh1, Horizon=192)")
+print("-------------------------------------")
+
+print(f"Float32  | MSE: {mse_fp32:.5f} | Size: {size_fp32:.2f} MB | Speedup: 1.0x")
+print(f"Float16  | MSE: {mse_fp16:.5f} | Size: {size_fp16:.2f} MB | Speedup: {speedup_fp16:.2f}x")
+print(f"INT8     | MSE: {mse_int8:.5f} | Size: {size_int8:.2f} MB | Speedup: {speedup_int8:.2f}x")
\ No newline at end of file

From b43b4f7f544b01dc675c4cd6b29f540d5547dcb6 Mon Sep 17 00:00:00 2001
From: kage08 <harshavardhan864.hk@gmail.com>
Date: Mon, 16 Mar 2026 16:40:05 -0400
Subject: [PATCH 2/2] Working quantization example with CUDA

---
 example/lptm_quantize_table_results.py   | 139 ++++++++++++++++----
 example/moment_quantize_table_results.py | 148 +++++++++++++++++----
 pyproject.toml                           |   1 +
 src/samay/dataset.py                     |   1 -
 src/samay/model.py                       | 160 +++++++++++++++++------
 5 files changed, 355 insertions(+), 94 deletions(-)

diff --git a/example/lptm_quantize_table_results.py b/example/lptm_quantize_table_results.py
index 3e3a671..427e8cd 100644
--- a/example/lptm_quantize_table_results.py
+++ b/example/lptm_quantize_table_results.py
@@ -1,44 +1,45 @@
+import copy
 import os
 import sys
 import time
-import copy
-import torch
+
 import numpy as np
+import torch
 import torch.nn as nn
 from sklearn.metrics import mean_squared_error
 
+try:
+    import bitsandbytes as bnb
+except ImportError:
+    bnb = None
+
 
 src_path = os.path.abspath(os.path.join("..", "src"))
 if src_path not in sys.path:
     sys.path.insert(0, src_path)
 
-from samay.model import LPTMModel
 from samay.dataset import LPTMDataset
-
-
+from samay.model import LPTMModel
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print("Using device:", device)
 
-torch.backends.quantized.engine = "qnnpack"
+# torch.backends.quantized.engine = "qnnpack"
 
 
 # Quantization
 
+
 def quantize_linear_layers(model, quantization_type="int8"):
 
     if quantization_type == "int8":
         return torch.quantization.quantize_dynamic(
-            model,
-            {nn.Linear},
-            dtype=torch.qint8
+            model, {nn.Linear}, dtype=torch.qint8
         )
 
     elif quantization_type == "float16":
         return torch.quantization.quantize_dynamic(
-            model,
-            {nn.Linear},
-            dtype=torch.float16
+            model, {nn.Linear}, dtype=torch.float16
         )
 
     else:
@@ -52,13 +53,86 @@ def quantize(lptm_model, quant_type="int8"):
 
     with torch.no_grad():
         lptm_model.model = quantize_linear_layers(
-            lptm_model.model,
-            quantization_type=quant_type
+            lptm_model.model, quantization_type=quant_type
+        )
+
+    return lptm_model.model
+
+
+def quantize_linear_layers_bnb(module, threshold=6.0, quantization_type="int8"):
+    """Recursively swap nn.Linear children (in_features >= 128) for bitsandbytes int8/nf4 layers; returns module."""
+    for name, child in module.named_children():
+        if isinstance(child, nn.Linear) and child.in_features >= 128:
+            if quantization_type == "int8":
+                quant_layer = bnb.nn.Linear8bitLt(
+                    child.in_features,
+                    child.out_features,
+                    bias=(child.bias is not None),
+                    threshold=threshold,
+                    has_fp16_weights=False,
+                )
+            elif quantization_type == "nf4":
+                quant_layer = bnb.nn.Linear4bit(
+                    child.in_features,
+                    child.out_features,
+                    bias=(child.bias is not None),
+                    quant_type="nf4",
+                    compute_dtype=torch.float16,
+                )
+            else:
+                raise ValueError("Unsupported CUDA quantization type")
+
+            with torch.no_grad():
+                quant_layer.weight.copy_(child.weight)
+                if child.bias is not None:
+                    quant_layer.bias.copy_(child.bias)
+
+            setattr(module, name, quant_layer)
+        else:
+            quantize_linear_layers_bnb(
+                child, threshold=threshold, quantization_type=quantization_type
+            )
+
+    return module
+
+
+def quantize_cuda(lptm_model, quant_type="int8"):
+    """Quantize lptm_model.model on CUDA via bitsandbytes; RuntimeError if CUDA or bitsandbytes is missing."""
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA not available for CUDA quantization")
+    if bnb is None:
+        raise RuntimeError("bitsandbytes is not installed")
+
+    lptm_model.model.eval()
+    lptm_model.model = lptm_model.model.to("cuda")
+
+    with torch.no_grad():
+        lptm_model.model = quantize_linear_layers_bnb(
+            lptm_model.model, quantization_type=quant_type
         )
+        lptm_model.model = lptm_model.model.to("cuda")
 
     return lptm_model.model
 
 
+def _resolve_run_device(lptm_model):
+    """Return the wrapper's `_run_device` attribute, or the module-level `device` if it is unset."""
+    return getattr(lptm_model, "_run_device", device)
+
+
+def _resolve_input_dtype(lptm_model):
+    """Return the wrapper's `_input_dtype` if set, else the first parameter's dtype (float32 if none)."""
+    model = lptm_model.model
+    forced_dtype = getattr(lptm_model, "_input_dtype", None)
+    if forced_dtype is not None:
+        return forced_dtype
+
+    try:
+        return next(model.parameters()).dtype
+    except StopIteration:
+        return torch.float32
+
+
 # Load LPTM
 
 config = {
@@ -76,8 +150,6 @@ def quantize(lptm_model, quant_type="int8"):
 base_model = LPTMModel(config)
 
 
-
-
 train_dataset = LPTMDataset(
     name="ett",
     datetime_col="date",
@@ -103,39 +175,51 @@ def quantize(lptm_model, quant_type="int8"):
 # Create models
 
 fp32_model = copy.deepcopy(base_model)
+fp32_model._run_device = device
+fp32_model._input_dtype = torch.float32
 
 fp16_model = copy.deepcopy(base_model)
 fp16_model.model = fp16_model.model.half().to(device)
+fp16_model._run_device = device
+fp16_model._input_dtype = torch.float16
 
 int8_model = copy.deepcopy(base_model)
-quantize(int8_model, "int8")
+if torch.cuda.is_available() and bnb is not None:
+    quantize_cuda(int8_model, "int8")
+    int8_model._run_device = torch.device("cuda")
+    int8_model._input_dtype = torch.float32
+    print("Using bitsandbytes INT8 on CUDA")
+else:
+    quantize(int8_model, "int8")
+    int8_model._run_device = torch.device("cpu")
+    int8_model._input_dtype = torch.float32
+    print("Falling back to torch dynamic INT8 on CPU")
 
 print("Model setup complete.")
 
 
 # Evaluation Functions
 
+
 def compute_mse(lptm_model, dataset):
 
     model = lptm_model.model
     model.eval()
 
-    run_device = "cpu" if next(model.parameters()).dtype == torch.qint8 else device
+    run_device = _resolve_run_device(lptm_model)
     model.to(run_device)
 
     preds = []
     trues = []
 
     with torch.no_grad():
-
         for i in range(len(dataset)):
-
             sample = dataset[i]
 
             x = sample[0]
             y_future = sample[2]
 
-            dtype = next(model.parameters()).dtype
+            dtype = _resolve_input_dtype(lptm_model)
             x = torch.tensor(x, dtype=dtype).unsqueeze(0).to(run_device)
 
             output = model(x_enc=x)
@@ -167,16 +251,15 @@ def inference_time(lptm_model, dataset, runs=10):
     model = lptm_model.model
     model.eval()
 
-    run_device = "cpu" if next(model.parameters()).dtype == torch.qint8 else device
+    run_device = _resolve_run_device(lptm_model)
     model.to(run_device)
 
     sample = dataset[0]
     x = sample[0]
 
-    dtype = next(model.parameters()).dtype
+    dtype = _resolve_input_dtype(lptm_model)
     x = torch.tensor(x, dtype=dtype).unsqueeze(0).to(run_device)
 
-
     for _ in range(3):
         with torch.no_grad():
             model(x_enc=x).forecast
@@ -219,5 +302,9 @@ def inference_time(lptm_model, dataset, runs=10):
 print("-------------------------------------")
 
 print(f"Float32  | MSE: {mse_fp32:.5f} | Size: {size_fp32:.2f} MB | Speedup: 1.0x")
-print(f"Float16  | MSE: {mse_fp16:.5f} | Size: {size_fp16:.2f} MB | Speedup: {speedup_fp16:.2f}x")
-print(f"INT8     | MSE: {mse_int8:.5f} | Size: {size_int8:.2f} MB | Speedup: {speedup_int8:.2f}x")
\ No newline at end of file
+print(
+    f"Float16  | MSE: {mse_fp16:.5f} | Size: {size_fp16:.2f} MB | Speedup: {speedup_fp16:.2f}x"
+)
+print(
+    f"INT8     | MSE: {mse_int8:.5f} | Size: {size_int8:.2f} MB | Speedup: {speedup_int8:.2f}x"
+)
diff --git a/example/moment_quantize_table_results.py b/example/moment_quantize_table_results.py
index 5e62f91..ea53e27 100644
--- a/example/moment_quantize_table_results.py
+++ b/example/moment_quantize_table_results.py
@@ -1,24 +1,27 @@
+import copy
 import os
 import sys
 import time
-import copy
-import torch
+
 import numpy as np
+import torch
 from sklearn.metrics import mean_squared_error
 
-
-
 src_path = os.path.abspath(os.path.join("..", "src"))
 if src_path not in sys.path:
     sys.path.insert(0, src_path)
 
-from samay.model import MomentModel
-from samay.dataset import MomentDataset
-
-import torch
 import torch.nn as nn
 
-torch.backends.quantized.engine = "qnnpack"
+try:
+    import bitsandbytes as bnb
+except ImportError:
+    bnb = None
+
+from samay.dataset import MomentDataset
+from samay.model import MomentModel
+
+# torch.backends.quantized.engine = "qnnpack"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 
@@ -26,35 +29,105 @@ def quantize_linear_layers(model, quantization_type="int8"):
 
     if quantization_type == "int8":
         return torch.quantization.quantize_dynamic(
-            model,
-            {nn.Linear},
-            dtype=torch.qint8
+            model, {nn.Linear}, dtype=torch.qint8
         )
 
     elif quantization_type == "float16":
         return torch.quantization.quantize_dynamic(
-            model,
-            {nn.Linear},
-            dtype=torch.float16
+            model, {nn.Linear}, dtype=torch.float16
         )
 
     else:
         raise ValueError("Unsupported quantization type")
-    
+
+
 def quantize(moment_model, quant_type="int8", device="cpu"):
-    
+
     moment_model.model.eval()
     moment_model.model = moment_model.model.to(device)
 
     with torch.no_grad():
         moment_model.model = quantize_linear_layers(
-            moment_model.model,
-            quantization_type=quant_type
+            moment_model.model, quantization_type=quant_type
         )
 
     return moment_model.model
 
 
+def quantize_linear_layers_bnb(module, threshold=6.0, quantization_type="int8"):
+    """Recursively swap nn.Linear children (in_features >= 128) for bitsandbytes int8/nf4 layers; returns module."""
+    for name, child in module.named_children():
+        if isinstance(child, nn.Linear) and child.in_features >= 128:
+            if quantization_type == "int8":
+                quant_layer = bnb.nn.Linear8bitLt(
+                    child.in_features,
+                    child.out_features,
+                    bias=(child.bias is not None),
+                    threshold=threshold,
+                    has_fp16_weights=False,
+                )
+            elif quantization_type == "nf4":
+                quant_layer = bnb.nn.Linear4bit(
+                    child.in_features,
+                    child.out_features,
+                    bias=(child.bias is not None),
+                    quant_type="nf4",
+                    compute_dtype=torch.float16,
+                )
+            else:
+                raise ValueError("Unsupported CUDA quantization type")
+
+            with torch.no_grad():
+                quant_layer.weight.copy_(child.weight)
+                if child.bias is not None:
+                    quant_layer.bias.copy_(child.bias)
+
+            setattr(module, name, quant_layer)
+        else:
+            quantize_linear_layers_bnb(
+                child, threshold=threshold, quantization_type=quantization_type
+            )
+
+    return module
+
+
+def quantize_cuda(moment_model, quant_type="int8"):
+    """Quantize moment_model.model on CUDA via bitsandbytes; RuntimeError if CUDA or bitsandbytes is missing."""
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA not available for CUDA quantization")
+    if bnb is None:
+        raise RuntimeError("bitsandbytes is not installed")
+
+    moment_model.model.eval()
+    moment_model.model = moment_model.model.to("cuda")
+
+    with torch.no_grad():
+        moment_model.model = quantize_linear_layers_bnb(
+            moment_model.model, quantization_type=quant_type
+        )
+        moment_model.model = moment_model.model.to("cuda")
+
+    return moment_model.model
+
+
+def _resolve_run_device(moment_model):
+    """Return the wrapper's `_run_device` attribute, or the module-level `device` if it is unset."""
+    return getattr(moment_model, "_run_device", device)
+
+
+def _resolve_input_dtype(moment_model):
+    """Return the wrapper's `_input_dtype` if set, else the first parameter's dtype (float32 if none)."""
+    model = moment_model.model
+    forced_dtype = getattr(moment_model, "_input_dtype", None)
+    if forced_dtype is not None:
+        return forced_dtype
+
+    try:
+        return next(model.parameters()).dtype
+    except StopIteration:
+        return torch.float32
+
+
 repo = "AutonLab/MOMENT-1-large"
 
 config = {
@@ -81,12 +154,25 @@ def quantize(moment_model, quant_type="int8", device="cpu"):
 # Create models
 
 fp32_model = copy.deepcopy(base_model)
+fp32_model._run_device = device
+fp32_model._input_dtype = torch.float32
 
 fp16_model = copy.deepcopy(base_model)
 fp16_model.model = fp16_model.model.half().to(device)
+fp16_model._run_device = device
+fp16_model._input_dtype = torch.float16
 
 int8_model = copy.deepcopy(base_model)
-quantize(int8_model, "int8", device="cpu")
+if torch.cuda.is_available() and bnb is not None:
+    quantize_cuda(int8_model, "int8")
+    int8_model._run_device = torch.device("cuda")
+    int8_model._input_dtype = torch.float32
+    print("Using bitsandbytes INT8 on CUDA")
+else:
+    quantize(int8_model, "int8", device="cpu")
+    int8_model._run_device = torch.device("cpu")
+    int8_model._input_dtype = torch.float32
+    print("Falling back to torch dynamic INT8 on CPU")
 
 print("THIS IS DONE YES!!!!")
 
@@ -99,7 +185,7 @@ def compute_mse(moment_model, dataset):
     model = moment_model.model
     model.eval()
 
-    run_device = device
+    run_device = _resolve_run_device(moment_model)
     model.to(run_device)
 
     preds = []
@@ -107,13 +193,12 @@ def compute_mse(moment_model, dataset):
 
     with torch.no_grad():
         for i in range(len(dataset)):
-
             sample = dataset[i]
 
-            x = sample[0]          # (64, 512)
-            y_future = sample[2]   # (64, 192)
+            x = sample[0]  # (64, 512)
+            y_future = sample[2]  # (64, 192)
 
-            dtype = next(model.parameters()).dtype
+            dtype = _resolve_input_dtype(moment_model)
             x = torch.tensor(x, dtype=dtype).unsqueeze(0).to(run_device)
 
             output = model(x_enc=x)
@@ -130,6 +215,7 @@ def compute_mse(moment_model, dataset):
 
     return mean_squared_error(trues, preds)
 
+
 def model_size(moment_model):
 
     torch.save(moment_model.model.state_dict(), "temp.pt")
@@ -144,13 +230,13 @@ def inference_time(moment_model, dataset, runs=10):
     model = moment_model.model
     model.eval()
 
-    run_device = device
+    run_device = _resolve_run_device(moment_model)
     model.to(run_device)
 
     sample = dataset[0]
     x = sample[0]
 
-    dtype = next(model.parameters()).dtype
+    dtype = _resolve_input_dtype(moment_model)
     x = torch.tensor(x, dtype=dtype).unsqueeze(0).to(run_device)
 
     start = time.time()
@@ -191,5 +277,9 @@ def inference_time(moment_model, dataset, runs=10):
 print("-------------------------------------")
 
 print(f"Float32  | MSE: {mse_fp32:.5f} | Size: {size_fp32:.2f} MB | Speedup: 1.0x")
-print(f"Float16  | MSE: {mse_fp16:.5f} | Size: {size_fp16:.2f} MB | Speedup: {speedup_fp16:.2f}x")
-print(f"INT8     | MSE: {mse_int8:.5f} | Size: {size_int8:.2f} MB | Speedup: {speedup_int8:.2f}x")
\ No newline at end of file
+print(
+    f"Float16  | MSE: {mse_fp16:.5f} | Size: {size_fp16:.2f} MB | Speedup: {speedup_fp16:.2f}x"
+)
+print(
+    f"INT8     | MSE: {mse_int8:.5f} | Size: {size_int8:.2f} MB | Speedup: {speedup_int8:.2f}x"
+)
diff --git a/pyproject.toml b/pyproject.toml
index 3d3bf5e..805e912 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,6 +30,7 @@ dependencies = [
     "torchvision>=0.20.1",
     "lightning>=2.5.1",
     "plotly>=6.3.0",
+    "bitsandbytes>=0.49.2",
 ]
 
 [build-system]
diff --git a/src/samay/dataset.py b/src/samay/dataset.py
index 5dc5514..8dcb965 100644
--- a/src/samay/dataset.py
+++ b/src/samay/dataset.py
@@ -82,7 +82,6 @@ def __init__(
             **kwargs: Extra backend-specific options.
         """
         self.name = name
-        self.freq = freq
         self.datetime_col = datetime_col
         self.batchsize = batchsize
         self.mode = mode
diff --git a/src/samay/model.py b/src/samay/model.py
index 55ce1d8..e88d029 100644
--- a/src/samay/model.py
+++ b/src/samay/model.py
@@ -10,13 +10,14 @@
 import yaml
 from einops import rearrange, repeat
 from jaxtyping import Float
+from sklearn.metrics import mean_squared_error
+from torchvision import transforms
+
 from samay.dataset import *
 
 # from chronos import ChronosPipeline
 from samay.models.chronosforecasting.chronos.chronos import ChronosPipeline
 from samay.moirai_utils import convert_module_kwargs, filter_dict
-from sklearn.metrics import mean_squared_error
-from torchvision import transforms
 from uni2ts.model.moirai import MoiraiForecast, MoiraiModule
 from uni2ts.model.moirai.finetune import MoiraiFinetune
 from uni2ts.model.moirai2 import Moirai2Forecast, Moirai2Module
@@ -53,7 +54,23 @@
     TinyTimeMixerForPrediction,
 )
 from .utils import cleanup_dataloader, get_least_used_gpu, quantile_loss, visualize
-from quantization import quantize_linear_layers
+
+
+def quantize_linear_layers(model, quantization_type="int8"):
+    """Apply torch dynamic quantization to nn.Linear layers ("int8" or "float16"); ValueError otherwise."""
+    if quantization_type == "int8":
+        return torch.quantization.quantize_dynamic(
+            model, {nn.Linear}, dtype=torch.qint8
+        )
+
+    elif quantization_type == "float16":
+        return torch.quantization.quantize_dynamic(
+            model, {nn.Linear}, dtype=torch.float16
+        )
+
+    else:
+        raise ValueError("Unsupported quantization type")
+
 
 class Basemodel:
     def __init__(self, config=None, repo=None):
@@ -460,7 +477,9 @@ def finetune(self, dataset: ChronosDataset, **kwargs):
 
         finetune_model.eval()
 
-    def plot(self, dataset: ChronosDataset, horizon_len: int, quantile_levels: list, **kwargs):
+    def plot(
+        self, dataset: ChronosDataset, horizon_len: int, quantile_levels: list, **kwargs
+    ):
         """Plot forecast results produced by the Chronos pipeline.
 
         Args:
@@ -515,7 +534,12 @@ def plot(self, dataset: ChronosDataset, horizon_len: int, quantile_levels: list,
         )
 
     def evaluate(
-        self, dataset: ChronosDataset, horizon_len: int, quantile_levels: list, metric_only=False, **kwargs
+        self,
+        dataset: ChronosDataset,
+        horizon_len: int,
+        quantile_levels: list,
+        metric_only=False,
+        **kwargs,
     ):
         """Evaluate the Chronos model on a dataset.
 
@@ -684,7 +708,13 @@ def finetune(self, dataset: ChronosBoltDataset, **kwargs):
 
         finetune_model.eval()
 
-    def plot(self, dataset: ChronosBoltDataset, horizon_len: int, quantile_levels: list, **kwargs):
+    def plot(
+        self,
+        dataset: ChronosBoltDataset,
+        horizon_len: int,
+        quantile_levels: list,
+        **kwargs,
+    ):
         """Plot forecast results produced by the ChronosBolt pipeline.
 
         Args:
@@ -737,7 +767,12 @@ def plot(self, dataset: ChronosBoltDataset, horizon_len: int, quantile_levels: l
         )
 
     def evaluate(
-        self, dataset: ChronosBoltDataset, horizon_len: int, quantile_levels: list, metric_only=False, **kwargs
+        self,
+        dataset: ChronosBoltDataset,
+        horizon_len: int,
+        quantile_levels: list,
+        metric_only=False,
+        **kwargs,
     ):
         """Evaluate the ChronosBolt model on a dataset.
 
@@ -1047,14 +1082,16 @@ def evaluate(self, dataset: Chronos_2_Dataset, metric_only=False, **kwargs):
                     output = self.model(
                         context=inputs, num_output_patches=current_horizon_patch
                     )  # b, h, q
-                    quantile_output = output.quantile_preds.transpose(1, 2) # b, h, q
+                    quantile_output = output.quantile_preds.transpose(1, 2)  # b, h, q
                     # take median as the last_ar_output
                     # if the remaining length is smaller than the current horizon patch, we only take the first remaining_length values
                     if remaining_length < current_horizon_patch * self.patch_size:
                         quantile_output = quantile_output[:, :remaining_length, :]
-                    
+
                     # last_ar_output = quantile_output[:, -current_horizon_patch * self.patch_size :, quantile_output.shape[-1] // 2]
-                    last_ar_output = quantile_output[:, :, quantile_output.shape[-1] // 2]
+                    last_ar_output = quantile_output[
+                        :, :, quantile_output.shape[-1] // 2
+                    ]
                     all_quantile_outputs.append(quantile_output)
                     remaining_length -= current_horizon_patch * self.patch_size
                 quantile_prediction = torch.cat(all_quantile_outputs, dim=1)  # b, h, q
@@ -1281,10 +1318,18 @@ def quantize(self, quant_type="int8", device="cuda"):
         self.model.eval()
         self.model = self.model.to(device)
         with torch.no_grad():
-            self.model = quantize_linear_layers(self.model, quantization_type=quant_type)
+            self.model = quantize_linear_layers(
+                self.model, quantization_type=quant_type
+            )
         return self.model
 
-    def evaluate(self, dataset: LPTMDataset, task_name: str = "forecasting", metric_only=False, **kwargs):
+    def evaluate(
+        self,
+        dataset: LPTMDataset,
+        task_name: str = "forecasting",
+        metric_only=False,
+        **kwargs,
+    ):
         """Evaluate the LPTM model on a dataset.
 
         Args:
@@ -1358,7 +1403,7 @@ def evaluate(self, dataset: LPTMDataset, task_name: str = "forecasting", metric_
             mse = MSE(trues, preds)
             mae = MAE(trues, preds)
             mase = MASE(histories, trues, preds)
-            mape = MAPE(trues, preds)   
+            mape = MAPE(trues, preds)
             rmse = RMSE(trues, preds)
             nrmse = NRMSE(trues, preds)
             smape = SMAPE(trues, preds)
@@ -1608,7 +1653,9 @@ def __init__(self, config=None, repo=None):
             self.model = MOMENTPipeline.from_pretrained(repo, model_kwargs=self.config)
         self.model.init()
 
-    def finetune(self, dataset: MomentDataset, task_name: str = "forecasting", **kwargs):
+    def finetune(
+        self, dataset: MomentDataset, task_name: str = "forecasting", **kwargs
+    ):
         """Finetune the model on the given dataset.
 
         Args:
@@ -1730,14 +1777,16 @@ def finetune(self, dataset: MomentDataset, task_name: str = "forecasting", **kwa
             scheduler.step()
 
         return self.model
-    
+
     def quantize(self, quant_type="int8", device="cuda"):
         self.model.eval()
         self.model = self.model.to(device)
         with torch.no_grad():
-            self.model = quantize_linear_layers(self.model, quantization_type=quant_type)
+            self.model = quantize_linear_layers(
+                self.model, quantization_type=quant_type
+            )
         return self.model
-    
+
     def plot(self, dataset: MomentDataset, task_name: str = "forecasting"):
         """Visualize results from the MOMENT model.
 
@@ -1880,7 +1929,13 @@ def plot(self, dataset: MomentDataset, task_name: str = "forecasting"):
         #     labels = np.concatenate(labels)
         #     return accuracy, embeddings, labels
 
-    def evaluate(self, dataset: MomentDataset, task_name: str = "forecasting", metric_only: bool = False, **kwargs):
+    def evaluate(
+        self,
+        dataset: MomentDataset,
+        task_name: str = "forecasting",
+        metric_only: bool = False,
+        **kwargs,
+    ):
         """Evaluate the model.
 
         Args:
@@ -2005,7 +2060,7 @@ def evaluate(self, dataset: MomentDataset, task_name: str = "forecasting", metri
             embeddings = np.concatenate(embeddings)
             labels = np.concatenate(labels)
             return accuracy, embeddings, labels
-        
+
         elif task_name == "detection":
             trues, preds, labels = [], [], []
             with torch.no_grad():
@@ -2849,7 +2904,10 @@ def evaluate(
 
         mse = np.mean(np.array([MSE(t, p) for t, p in zip(trues, preds)]), axis=0)
         mae = np.mean(np.array([MAE(t, p) for t, p in zip(trues, preds)]), axis=0)
-        mase = np.mean(np.array([MASE(h, t, p) for h, t, p in zip(histories, trues, preds)]), axis=0)
+        mase = np.mean(
+            np.array([MASE(h, t, p) for h, t, p in zip(histories, trues, preds)]),
+            axis=0,
+        )
         mape = np.mean(np.array([MAPE(t, p) for t, p in zip(trues, preds)]), axis=0)
         rmse = np.mean(np.array([RMSE(t, p) for t, p in zip(trues, preds)]), axis=0)
         nrmse = np.mean(np.array([NRMSE(t, p) for t, p in zip(trues, preds)]), axis=0)
@@ -3283,15 +3341,21 @@ def plot(self, dataset: TimeMoEDataset, **kwargs):
                 trues.append(true)
                 preds.append(pred)
                 histories.append(history)
-        trues = np.concatenate(trues, axis=0).reshape(
-            dataset.n_channels, -1, dataset.horizon_len
-        ).transpose(1, 0, 2)
-        preds = np.concatenate(preds, axis=0).reshape(
-            dataset.n_channels, -1, dataset.horizon_len
-        ).transpose(1, 0, 2)
-        histories = np.concatenate(histories, axis=0).reshape(
-            dataset.n_channels, -1, dataset.context_len
-        ).transpose(1, 0, 2)
+        trues = (
+            np.concatenate(trues, axis=0)
+            .reshape(dataset.n_channels, -1, dataset.horizon_len)
+            .transpose(1, 0, 2)
+        )
+        preds = (
+            np.concatenate(preds, axis=0)
+            .reshape(dataset.n_channels, -1, dataset.horizon_len)
+            .transpose(1, 0, 2)
+        )
+        histories = (
+            np.concatenate(histories, axis=0)
+            .reshape(dataset.n_channels, -1, dataset.context_len)
+            .transpose(1, 0, 2)
+        )
 
         visualize(
             task_name="forecasting",
@@ -3308,7 +3372,7 @@ def evaluate(self, dataset: TimeMoEDataset, metric_only: bool = False, **kwargs)
             metric_only (bool): If True, return only metrics. Defaults to False.
 
         Returns:
-            (Dict[str, float]): If `metric_only` is True, returns a dictionary containing evaluation metrics, 
+            (Dict[str, float]): If `metric_only` is True, returns a dictionary containing evaluation metrics,
                 including mse, mae, mase, mape, rmse, nrmse, smape, msis, nd.
             (Tuple[Dict[str, float], np.ndarray, np.ndarray, np.ndarray]): If `metric_only` is False, returns a tuple of
                 (metrics, trues, preds, histories).
@@ -3412,7 +3476,9 @@ def __init__(self, config=None, repo=None, **kwargs):
         self.model.compile(self.config)
         self.quantiles = self.model.model.config.quantiles
 
-    def evaluate(self, dataset: TimesFM_2p5_Dataset, metric_only: bool = False, **kwargs):
+    def evaluate(
+        self, dataset: TimesFM_2p5_Dataset, metric_only: bool = False, **kwargs
+    ):
         """Evaluate the model on the given dataset.
 
         Args:
@@ -3420,7 +3486,7 @@ def evaluate(self, dataset: TimesFM_2p5_Dataset, metric_only: bool = False, **kw
             metric_only (bool): If True, return only metrics. Defaults to False.
 
         Returns:
-            (Dict[str, float]): If `metric_only` is True, returns a dictionary containing evaluation metrics, 
+            (Dict[str, float]): If `metric_only` is True, returns a dictionary containing evaluation metrics,
                 including mse, mae, mase, mape, rmse, nrmse, smape, msis, nd, mwsq, crps.
             (Tuple[Dict[str, float], np.ndarray, np.ndarray, np.ndarray]): If `metric_only` is False, returns a tuple of
                 (metrics, trues, preds, histories).
@@ -3451,17 +3517,35 @@ def evaluate(self, dataset: TimesFM_2p5_Dataset, metric_only: bool = False, **kw
                     input_seq,
                     mask_seq,
                 )
-                quantile_forecast = quantile_forecast[..., 1:].transpose(2, 0, 1)   # (q, b, h)
+                quantile_forecast = quantile_forecast[..., 1:].transpose(
+                    2, 0, 1
+                )  # (q, b, h)
 
                 trues.append(target_seq.cpu().numpy())
                 preds.append(point_forecast)
                 q_preds.append(quantile_forecast)
                 histories.append(input_seq.cpu().numpy())
 
-        trues = np.concatenate(trues, axis=0).reshape(dataset.n_channels, -1, dataset.horizon_len).transpose(1, 0, 2)
-        preds = np.concatenate(preds, axis=0).reshape(dataset.n_channels, -1, dataset.horizon_len).transpose(1, 0, 2)
-        q_preds = np.concatenate(q_preds, axis=1).reshape(q_preds[-1].shape[0], dataset.n_channels, -1, dataset.horizon_len).transpose(0, 2, 1, 3)
-        histories = np.concatenate(histories, axis=0).reshape(dataset.n_channels, -1, dataset.context_len).transpose(1, 0, 2)
+        trues = (
+            np.concatenate(trues, axis=0)
+            .reshape(dataset.n_channels, -1, dataset.horizon_len)
+            .transpose(1, 0, 2)
+        )
+        preds = (
+            np.concatenate(preds, axis=0)
+            .reshape(dataset.n_channels, -1, dataset.horizon_len)
+            .transpose(1, 0, 2)
+        )
+        q_preds = (
+            np.concatenate(q_preds, axis=1)
+            .reshape(q_preds[-1].shape[0], dataset.n_channels, -1, dataset.horizon_len)
+            .transpose(0, 2, 1, 3)
+        )
+        histories = (
+            np.concatenate(histories, axis=0)
+            .reshape(dataset.n_channels, -1, dataset.context_len)
+            .transpose(1, 0, 2)
+        )
 
         trues = dataset._denormalize_data(trues)
         preds = dataset._denormalize_data(preds)
@@ -3473,7 +3557,7 @@ def evaluate(self, dataset: TimesFM_2p5_Dataset, metric_only: bool = False, **kw
 
         # Calculate metrics
         mse = MSE(trues, preds)
-        mae = MAE(trues, preds) 
+        mae = MAE(trues, preds)
         mase = MASE(histories, trues, preds)
         mape = MAPE(trues, preds)
         rmse = RMSE(trues, preds)