From 102f2c54e3eafcd8a15c1747f215072d6047891f Mon Sep 17 00:00:00 2001 From: BuildTools Date: Sat, 14 Mar 2026 16:01:54 -0400 Subject: [PATCH 1/2] quantization files for table results --- example/lptm_quantize_table_results.py | 223 +++++++++++++++++++++++ example/moment_quantize_table_results.py | 195 ++++++++++++++++++++ 2 files changed, 418 insertions(+) create mode 100644 example/lptm_quantize_table_results.py create mode 100644 example/moment_quantize_table_results.py diff --git a/example/lptm_quantize_table_results.py b/example/lptm_quantize_table_results.py new file mode 100644 index 0000000..3e3a671 --- /dev/null +++ b/example/lptm_quantize_table_results.py @@ -0,0 +1,223 @@ +import os +import sys +import time +import copy +import torch +import numpy as np +import torch.nn as nn +from sklearn.metrics import mean_squared_error + + +src_path = os.path.abspath(os.path.join("..", "src")) +if src_path not in sys.path: + sys.path.insert(0, src_path) + +from samay.model import LPTMModel +from samay.dataset import LPTMDataset + + + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +print("Using device:", device) + +torch.backends.quantized.engine = "qnnpack" + + +# Quantization + +def quantize_linear_layers(model, quantization_type="int8"): + + if quantization_type == "int8": + return torch.quantization.quantize_dynamic( + model, + {nn.Linear}, + dtype=torch.qint8 + ) + + elif quantization_type == "float16": + return torch.quantization.quantize_dynamic( + model, + {nn.Linear}, + dtype=torch.float16 + ) + + else: + raise ValueError("Unsupported quantization type") + + +def quantize(lptm_model, quant_type="int8"): + + lptm_model.model.eval() + lptm_model.model = lptm_model.model.to("cpu") + + with torch.no_grad(): + lptm_model.model = quantize_linear_layers( + lptm_model.model, + quantization_type=quant_type + ) + + return lptm_model.model + + +# Load LPTM + +config = { + "task_name": "forecasting", + "forecast_horizon": 192, + "head_dropout": 0, + "weight_decay": 0, + "max_patch": 16, + "freeze_encoder": True, + "freeze_embedder": True, + "freeze_head": False, + "freeze_segment": True, +} + +base_model = LPTMModel(config) + + + + +train_dataset = LPTMDataset( + name="ett", + datetime_col="date", + path="./data/data/ETTh1.csv", + mode="train", + horizon=192, +) + +val_dataset = LPTMDataset( + name="ett", + datetime_col="date", + path="./data/data/ETTh1.csv", + mode="test", + horizon=192, +) + +# Finetune + +# print("Finetuning LPTM...") +# base_model = base_model.finetune(train_dataset) + + +# Create models + +fp32_model = copy.deepcopy(base_model) + +fp16_model = copy.deepcopy(base_model) +fp16_model.model = fp16_model.model.half().to(device) + +int8_model = copy.deepcopy(base_model) +quantize(int8_model, "int8") + +print("Model setup complete.") + + +# Evaluation Functions + +def compute_mse(lptm_model, dataset): + + model = lptm_model.model + model.eval() + + run_device = "cpu" if next(model.parameters()).dtype == torch.qint8 else device + model.to(run_device) + + preds = [] + trues = [] + + with torch.no_grad(): + + for i in range(len(dataset)): + + sample = dataset[i] + + x = sample[0] + y_future = sample[2] + + dtype = next(model.parameters()).dtype + x = torch.tensor(x, dtype=dtype).unsqueeze(0).to(run_device) + + output = model(x_enc=x) + pred = output.forecast.squeeze() + + preds.append(pred.cpu().numpy().reshape(-1)) + trues.append(np.array(y_future).reshape(-1)) + + preds = np.concatenate(preds) + trues = np.concatenate(trues) + + print("Prediction shape:", preds.shape) + print("Target shape:", trues.shape) + + return mean_squared_error(trues, preds) + + +def model_size(lptm_model): + + torch.save(lptm_model.model.state_dict(), "temp.pt") + size = os.path.getsize("temp.pt") / 1e6 + os.remove("temp.pt") + + return size + + +def inference_time(lptm_model, dataset, runs=10): + + model = lptm_model.model + model.eval() + + run_device = "cpu" if next(model.parameters()).dtype == torch.qint8 else device + model.to(run_device) + + sample = dataset[0] + x = sample[0] + + dtype = next(model.parameters()).dtype + x = torch.tensor(x, dtype=dtype).unsqueeze(0).to(run_device) + + + for _ in range(3): + with torch.no_grad(): + model(x_enc=x).forecast + + start = time.time() + + for _ in range(runs): + with torch.no_grad(): + model(x_enc=x).forecast + + end = time.time() + + return (end - start) / runs + + +# Run Experiments + +print("\nEvaluating FP32...") +mse_fp32 = compute_mse(fp32_model, val_dataset) +size_fp32 = model_size(fp32_model) +time_fp32 = inference_time(fp32_model, val_dataset) + +print("\nEvaluating FP16...") +mse_fp16 = compute_mse(fp16_model, val_dataset) +size_fp16 = model_size(fp16_model) +time_fp16 = inference_time(fp16_model, val_dataset) + +print("\nEvaluating INT8...") +mse_int8 = compute_mse(int8_model, val_dataset) +size_int8 = model_size(int8_model) +time_int8 = inference_time(int8_model, val_dataset) + + +# Results + +speedup_fp16 = time_fp32 / time_fp16 +speedup_int8 = time_fp32 / time_int8 + +print("\nLPTM Results (ETTh1, Horizon=192)") +print("-------------------------------------") + +print(f"Float32 | MSE: {mse_fp32:.5f} | Size: {size_fp32:.2f} MB | Speedup: 1.0x") +print(f"Float16 | MSE: {mse_fp16:.5f} | Size: {size_fp16:.2f} MB | Speedup: {speedup_fp16:.2f}x") +print(f"INT8 | MSE: {mse_int8:.5f} | Size: {size_int8:.2f} MB | Speedup: {speedup_int8:.2f}x") \ No newline at end of file diff --git a/example/moment_quantize_table_results.py b/example/moment_quantize_table_results.py new file mode 100644 index 0000000..5e62f91 --- /dev/null +++ b/example/moment_quantize_table_results.py @@ -0,0 +1,195 @@ +import os +import sys +import time +import copy +import torch +import numpy as np +from sklearn.metrics import mean_squared_error + + + +src_path = os.path.abspath(os.path.join("..", "src")) +if src_path not in sys.path: + sys.path.insert(0, src_path) + +from samay.model import MomentModel +from samay.dataset import MomentDataset + +import torch +import torch.nn as nn + +torch.backends.quantized.engine = "qnnpack" +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def quantize_linear_layers(model, quantization_type="int8"): + + if quantization_type == "int8": + return torch.quantization.quantize_dynamic( + model, + {nn.Linear}, + dtype=torch.qint8 + ) + + elif quantization_type == "float16": + return torch.quantization.quantize_dynamic( + model, + {nn.Linear}, + dtype=torch.float16 + ) + + else: + raise ValueError("Unsupported quantization type") + +def quantize(moment_model, quant_type="int8", device="cpu"): + + moment_model.model.eval() + moment_model.model = moment_model.model.to(device) + + with torch.no_grad(): + moment_model.model = quantize_linear_layers( + moment_model.model, + quantization_type=quant_type + ) + + return moment_model.model + + +repo = "AutonLab/MOMENT-1-large" + +config = { + "task_name": "forecasting", + "forecast_horizon": 192, + "head_dropout": 0.1, + "weight_decay": 0, + "freeze_encoder": True, + "freeze_embedder": True, + "freeze_head": False, +} + +base_model = MomentModel(config=config, repo=repo) + +val_dataset = MomentDataset( + name="ett", + datetime_col="date", + path="./src/samay/models/moment/data/ETTh1.csv", + mode="test", + horizon_len=192, + freq=None, +) + +# Create models + +fp32_model = copy.deepcopy(base_model) + +fp16_model = copy.deepcopy(base_model) +fp16_model.model = fp16_model.model.half().to(device) + +int8_model = copy.deepcopy(base_model) +quantize(int8_model, "int8", device="cpu") + +print("THIS IS DONE YES!!!!") + + +# Evaluation functions + + +def compute_mse(moment_model, dataset): + + model = moment_model.model + model.eval() + + run_device = device + model.to(run_device) + + preds = [] + trues = [] + + with torch.no_grad(): + for i in range(len(dataset)): + + sample = dataset[i] + + x = sample[0] # (64, 512) + y_future = sample[2] # (64, 192) + + dtype = next(model.parameters()).dtype + x = torch.tensor(x, dtype=dtype).unsqueeze(0).to(run_device) + + output = model(x_enc=x) + pred = output.forecast.squeeze() # (64,192) + + preds.append(pred.cpu().numpy().reshape(-1)) + trues.append(np.array(y_future).reshape(-1)) + + preds = np.concatenate(preds) + trues = np.concatenate(trues) + + print("Final prediction shape:", preds.shape) + print("Final target shape:", trues.shape) + + return mean_squared_error(trues, preds) + +def model_size(moment_model): + + torch.save(moment_model.model.state_dict(), "temp.pt") + size = os.path.getsize("temp.pt") / 1e6 + os.remove("temp.pt") + + return size + + +def inference_time(moment_model, dataset, runs=10): + + model = moment_model.model + model.eval() + + run_device = device + model.to(run_device) + + sample = dataset[0] + x = sample[0] + + dtype = next(model.parameters()).dtype + x = torch.tensor(x, dtype=dtype).unsqueeze(0).to(run_device) + + start = time.time() + + for _ in range(runs): + with torch.no_grad(): + model(x_enc=x).forecast + + end = time.time() + + return (end - start) / runs + + +# Run experiments + +print("Evaluating FP32...") +mse_fp32 = compute_mse(fp32_model, val_dataset) +size_fp32 = model_size(fp32_model) +time_fp32 = inference_time(fp32_model, val_dataset) + +print("Evaluating FP16...") +mse_fp16 = compute_mse(fp16_model, val_dataset) +size_fp16 = model_size(fp16_model) +time_fp16 = inference_time(fp16_model, val_dataset) + +print("Evaluating INT8...") +mse_int8 = compute_mse(int8_model, val_dataset) +size_int8 = model_size(int8_model) +time_int8 = inference_time(int8_model, val_dataset) + + +speedup_fp16 = time_fp32 / time_fp16 +speedup_int8 = time_fp32 / time_int8 + +# RESULTS + +print("\nMOMENT Results (ETTh1, Horizon=192)") +print("-------------------------------------") + +print(f"Float32 | MSE: {mse_fp32:.5f} | Size: {size_fp32:.2f} MB | Speedup: 1.0x") +print(f"Float16 | MSE: {mse_fp16:.5f} | Size: {size_fp16:.2f} MB | Speedup: {speedup_fp16:.2f}x") +print(f"INT8 | MSE: {mse_int8:.5f} | Size: {size_int8:.2f} MB | Speedup: {speedup_int8:.2f}x") \ No newline at end of file From b43b4f7f544b01dc675c4cd6b29f540d5547dcb6 Mon Sep 17 00:00:00 2001 From: kage08 Date: Mon, 16 Mar 2026 16:40:05 -0400 Subject: [PATCH 2/2] Working quantization example with CUDA --- example/lptm_quantize_table_results.py | 139 ++++++++++++++++---- example/moment_quantize_table_results.py | 148 +++++++++++++++++---- pyproject.toml | 1 + src/samay/dataset.py | 1 - src/samay/model.py | 160 +++++++++++++++++------ 5 files changed, 355 insertions(+), 94 deletions(-) diff --git a/example/lptm_quantize_table_results.py b/example/lptm_quantize_table_results.py index 3e3a671..427e8cd 100644 --- a/example/lptm_quantize_table_results.py +++ b/example/lptm_quantize_table_results.py @@ -1,44 +1,45 @@ +import copy import os import sys import time -import copy -import torch + import numpy as np +import torch import torch.nn as nn from sklearn.metrics import mean_squared_error +try: + import bitsandbytes as bnb +except ImportError: + bnb = None + src_path = os.path.abspath(os.path.join("..", "src")) if src_path not in sys.path: sys.path.insert(0, src_path) -from samay.model import LPTMModel from samay.dataset import LPTMDataset - - +from samay.model import LPTMModel device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print("Using device:", device) -torch.backends.quantized.engine = "qnnpack" +# torch.backends.quantized.engine = "qnnpack" # Quantization + def quantize_linear_layers(model, quantization_type="int8"): if quantization_type == "int8": return torch.quantization.quantize_dynamic( - model, - {nn.Linear}, - dtype=torch.qint8 + model, {nn.Linear}, dtype=torch.qint8 ) elif quantization_type == "float16": return torch.quantization.quantize_dynamic( - model, - {nn.Linear}, - dtype=torch.float16 + model, {nn.Linear}, dtype=torch.float16 ) else: @@ -52,13 +53,86 @@ def quantize(lptm_model, quant_type="int8"): with torch.no_grad(): lptm_model.model = quantize_linear_layers( - lptm_model.model, - quantization_type=quant_type + lptm_model.model, quantization_type=quant_type + ) + + return lptm_model.model + + +def quantize_linear_layers_bnb(module, threshold=6.0, quantization_type="int8"): + + for name, child in module.named_children(): + if isinstance(child, nn.Linear) and child.in_features >= 128: + if quantization_type == "int8": + quant_layer = bnb.nn.Linear8bitLt( + child.in_features, + child.out_features, + bias=(child.bias is not None), + threshold=threshold, + has_fp16_weights=False, + ) + elif quantization_type == "nf4": + quant_layer = bnb.nn.Linear4bit( + child.in_features, + child.out_features, + bias=(child.bias is not None), + quant_type="nf4", + compute_dtype=torch.float16, + ) + else: + raise ValueError("Unsupported CUDA quantization type") + + with torch.no_grad(): + quant_layer.weight.copy_(child.weight) + if child.bias is not None: + quant_layer.bias.copy_(child.bias) + + setattr(module, name, quant_layer) + else: + quantize_linear_layers_bnb( + child, threshold=threshold, quantization_type=quantization_type + ) + + return module + + +def quantize_cuda(lptm_model, quant_type="int8"): + + if not torch.cuda.is_available(): + raise RuntimeError("CUDA not available for CUDA quantization") + if bnb is None: + raise RuntimeError("bitsandbytes is not installed") + + lptm_model.model.eval() + lptm_model.model = lptm_model.model.to("cuda") + + with torch.no_grad(): + lptm_model.model = quantize_linear_layers_bnb( + lptm_model.model, quantization_type=quant_type ) + lptm_model.model = lptm_model.model.to("cuda") return lptm_model.model +def _resolve_run_device(lptm_model): + + return getattr(lptm_model, "_run_device", device) + + +def _resolve_input_dtype(lptm_model): + + model = lptm_model.model + forced_dtype = getattr(lptm_model, "_input_dtype", None) + if forced_dtype is not None: + return forced_dtype + + try: + return next(model.parameters()).dtype + except StopIteration: + return torch.float32 + + # Load LPTM config = { @@ -76,8 +150,6 @@ def quantize(lptm_model, quant_type="int8"): base_model = LPTMModel(config) - - train_dataset = LPTMDataset( name="ett", datetime_col="date", @@ -103,39 +175,51 @@ def quantize(lptm_model, quant_type="int8"): # Create models fp32_model = copy.deepcopy(base_model) +fp32_model._run_device = device +fp32_model._input_dtype = torch.float32 fp16_model = copy.deepcopy(base_model) fp16_model.model = fp16_model.model.half().to(device) +fp16_model._run_device = device +fp16_model._input_dtype = torch.float16 int8_model = copy.deepcopy(base_model) -quantize(int8_model, "int8") +if torch.cuda.is_available() and bnb is not None: + quantize_cuda(int8_model, "int8") + int8_model._run_device = torch.device("cuda") + int8_model._input_dtype = torch.float32 + print("Using bitsandbytes INT8 on CUDA") +else: + quantize(int8_model, "int8") + int8_model._run_device = torch.device("cpu") + int8_model._input_dtype = torch.float32 + print("Falling back to torch dynamic INT8 on CPU") print("Model setup complete.") # Evaluation Functions + def compute_mse(lptm_model, dataset): model = lptm_model.model model.eval() - run_device = "cpu" if next(model.parameters()).dtype == torch.qint8 else device + run_device = _resolve_run_device(lptm_model) model.to(run_device) preds = [] trues = [] with torch.no_grad(): - for i in range(len(dataset)): - sample = dataset[i] x = sample[0] y_future = sample[2] - dtype = next(model.parameters()).dtype + dtype = _resolve_input_dtype(lptm_model) x = torch.tensor(x, dtype=dtype).unsqueeze(0).to(run_device) output = model(x_enc=x) @@ -167,16 +251,15 @@ def inference_time(lptm_model, dataset, runs=10): model = lptm_model.model model.eval() - run_device = "cpu" if next(model.parameters()).dtype == torch.qint8 else device + run_device = _resolve_run_device(lptm_model) model.to(run_device) sample = dataset[0] x = sample[0] - dtype = next(model.parameters()).dtype + dtype = _resolve_input_dtype(lptm_model) x = torch.tensor(x, dtype=dtype).unsqueeze(0).to(run_device) - for _ in range(3): with torch.no_grad(): model(x_enc=x).forecast @@ -219,5 +302,9 @@ def inference_time(lptm_model, dataset, runs=10): print("-------------------------------------") print(f"Float32 | MSE: {mse_fp32:.5f} | Size: {size_fp32:.2f} MB | Speedup: 1.0x") -print(f"Float16 | MSE: {mse_fp16:.5f} | Size: {size_fp16:.2f} MB | Speedup: {speedup_fp16:.2f}x") -print(f"INT8 | MSE: {mse_int8:.5f} | Size: {size_int8:.2f} MB | Speedup: {speedup_int8:.2f}x") \ No newline at end of file +print( + f"Float16 | MSE: {mse_fp16:.5f} | Size: {size_fp16:.2f} MB | Speedup: {speedup_fp16:.2f}x" +) +print( + f"INT8 | MSE: {mse_int8:.5f} | Size: {size_int8:.2f} MB | Speedup: {speedup_int8:.2f}x" +) diff --git a/example/moment_quantize_table_results.py b/example/moment_quantize_table_results.py index 5e62f91..ea53e27 100644 --- a/example/moment_quantize_table_results.py +++ b/example/moment_quantize_table_results.py @@ -1,24 +1,27 @@ +import copy import os import sys import time -import copy -import torch + import numpy as np +import torch from sklearn.metrics import mean_squared_error - - src_path = os.path.abspath(os.path.join("..", "src")) if src_path not in sys.path: sys.path.insert(0, src_path) -from samay.model import MomentModel -from samay.dataset import MomentDataset - -import torch import torch.nn as nn -torch.backends.quantized.engine = "qnnpack" +try: + import bitsandbytes as bnb +except ImportError: + bnb = None + +from samay.dataset import MomentDataset +from samay.model import MomentModel + +# torch.backends.quantized.engine = "qnnpack" device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -26,35 +29,105 @@ def quantize_linear_layers(model, quantization_type="int8"): if quantization_type == "int8": return torch.quantization.quantize_dynamic( - model, - {nn.Linear}, - dtype=torch.qint8 + model, {nn.Linear}, dtype=torch.qint8 ) elif quantization_type == "float16": return torch.quantization.quantize_dynamic( - model, - {nn.Linear}, - dtype=torch.float16 + model, {nn.Linear}, dtype=torch.float16 ) else: raise ValueError("Unsupported quantization type") - + + def quantize(moment_model, quant_type="int8", device="cpu"): - + moment_model.model.eval() moment_model.model = moment_model.model.to(device) with torch.no_grad(): moment_model.model = quantize_linear_layers( - moment_model.model, - quantization_type=quant_type + moment_model.model, quantization_type=quant_type ) return moment_model.model +def quantize_linear_layers_bnb(module, threshold=6.0, quantization_type="int8"): + + for name, child in module.named_children(): + if isinstance(child, nn.Linear) and child.in_features >= 128: + if quantization_type == "int8": + quant_layer = bnb.nn.Linear8bitLt( + child.in_features, + child.out_features, + bias=(child.bias is not None), + threshold=threshold, + has_fp16_weights=False, + ) + elif quantization_type == "nf4": + quant_layer = bnb.nn.Linear4bit( + child.in_features, + child.out_features, + bias=(child.bias is not None), + quant_type="nf4", + compute_dtype=torch.float16, + ) + else: + raise ValueError("Unsupported CUDA quantization type") + + with torch.no_grad(): + quant_layer.weight.copy_(child.weight) + if child.bias is not None: + quant_layer.bias.copy_(child.bias) + + setattr(module, name, quant_layer) + else: + quantize_linear_layers_bnb( + child, threshold=threshold, quantization_type=quantization_type + ) + + return module + + +def quantize_cuda(moment_model, quant_type="int8"): + + if not torch.cuda.is_available(): + raise RuntimeError("CUDA not available for CUDA quantization") + if bnb is None: + raise RuntimeError("bitsandbytes is not installed") + + moment_model.model.eval() + moment_model.model = moment_model.model.to("cuda") + + with torch.no_grad(): + moment_model.model = quantize_linear_layers_bnb( + moment_model.model, quantization_type=quant_type + ) + moment_model.model = moment_model.model.to("cuda") + + return moment_model.model + + +def _resolve_run_device(moment_model): + + return getattr(moment_model, "_run_device", device) + + +def _resolve_input_dtype(moment_model): + + model = moment_model.model + forced_dtype = getattr(moment_model, "_input_dtype", None) + if forced_dtype is not None: + return forced_dtype + + try: + return next(model.parameters()).dtype + except StopIteration: + return torch.float32 + + repo = "AutonLab/MOMENT-1-large" config = { @@ -81,12 +154,25 @@ def quantize(moment_model, quant_type="int8", device="cpu"): # Create models fp32_model = copy.deepcopy(base_model) +fp32_model._run_device = device +fp32_model._input_dtype = torch.float32 fp16_model = copy.deepcopy(base_model) fp16_model.model = fp16_model.model.half().to(device) +fp16_model._run_device = device +fp16_model._input_dtype = torch.float16 int8_model = copy.deepcopy(base_model) -quantize(int8_model, "int8", device="cpu") +if torch.cuda.is_available() and bnb is not None: + quantize_cuda(int8_model, "int8") + int8_model._run_device = torch.device("cuda") + int8_model._input_dtype = torch.float32 + print("Using bitsandbytes INT8 on CUDA") +else: + quantize(int8_model, "int8", device="cpu") + int8_model._run_device = torch.device("cpu") + int8_model._input_dtype = torch.float32 + print("Falling back to torch dynamic INT8 on CPU") print("THIS IS DONE YES!!!!") @@ -99,7 +185,7 @@ def compute_mse(moment_model, dataset): model = moment_model.model model.eval() - run_device = device + run_device = _resolve_run_device(moment_model) model.to(run_device) preds = [] @@ -107,13 +193,12 @@ def compute_mse(moment_model, dataset): with torch.no_grad(): for i in range(len(dataset)): - sample = dataset[i] - x = sample[0] # (64, 512) - y_future = sample[2] # (64, 192) + x = sample[0] # (64, 512) + y_future = sample[2] # (64, 192) - dtype = next(model.parameters()).dtype + dtype = _resolve_input_dtype(moment_model) x = torch.tensor(x, dtype=dtype).unsqueeze(0).to(run_device) output = model(x_enc=x) @@ -130,6 +215,7 @@ def compute_mse(moment_model, dataset): return mean_squared_error(trues, preds) + def model_size(moment_model): torch.save(moment_model.model.state_dict(), "temp.pt") @@ -144,13 +230,13 @@ def inference_time(moment_model, dataset, runs=10): model = moment_model.model model.eval() - run_device = device + run_device = _resolve_run_device(moment_model) model.to(run_device) sample = dataset[0] x = sample[0] - dtype = next(model.parameters()).dtype + dtype = _resolve_input_dtype(moment_model) x = torch.tensor(x, dtype=dtype).unsqueeze(0).to(run_device) start = time.time() @@ -191,5 +277,9 @@ def inference_time(moment_model, dataset, runs=10): print("-------------------------------------") print(f"Float32 | MSE: {mse_fp32:.5f} | Size: {size_fp32:.2f} MB | Speedup: 1.0x") -print(f"Float16 | MSE: {mse_fp16:.5f} | Size: {size_fp16:.2f} MB | Speedup: {speedup_fp16:.2f}x") -print(f"INT8 | MSE: {mse_int8:.5f} | Size: {size_int8:.2f} MB | Speedup: {speedup_int8:.2f}x") \ No newline at end of file +print( + f"Float16 | MSE: {mse_fp16:.5f} | Size: {size_fp16:.2f} MB | Speedup: {speedup_fp16:.2f}x" +) +print( + f"INT8 | MSE: {mse_int8:.5f} | Size: {size_int8:.2f} MB | Speedup: {speedup_int8:.2f}x" +) diff --git a/pyproject.toml b/pyproject.toml index 3d3bf5e..805e912 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ dependencies = [ "torchvision>=0.20.1", "lightning>=2.5.1", "plotly>=6.3.0", + "bitsandbytes>=0.49.2", ] [build-system] diff --git a/src/samay/dataset.py b/src/samay/dataset.py index 5dc5514..8dcb965 100644 --- a/src/samay/dataset.py +++ b/src/samay/dataset.py @@ -82,7 +82,6 @@ def __init__( **kwargs: Extra backend-specific options. """ self.name = name - self.freq = freq self.datetime_col = datetime_col self.batchsize = batchsize self.mode = mode diff --git a/src/samay/model.py b/src/samay/model.py index 55ce1d8..e88d029 100644 --- a/src/samay/model.py +++ b/src/samay/model.py @@ -10,13 +10,14 @@ import yaml from einops import rearrange, repeat from jaxtyping import Float +from sklearn.metrics import mean_squared_error +from torchvision import transforms + from samay.dataset import * # from chronos import ChronosPipeline from samay.models.chronosforecasting.chronos.chronos import ChronosPipeline from samay.moirai_utils import convert_module_kwargs, filter_dict -from sklearn.metrics import mean_squared_error -from torchvision import transforms from uni2ts.model.moirai import MoiraiForecast, MoiraiModule from uni2ts.model.moirai.finetune import MoiraiFinetune from uni2ts.model.moirai2 import Moirai2Forecast, Moirai2Module @@ -53,7 +54,23 @@ TinyTimeMixerForPrediction, ) from .utils import cleanup_dataloader, get_least_used_gpu, quantile_loss, visualize -from quantization import quantize_linear_layers + + +def quantize_linear_layers(model, quantization_type="int8"): + + if quantization_type == "int8": + return torch.quantization.quantize_dynamic( + model, {nn.Linear}, dtype=torch.qint8 + ) + + elif quantization_type == "float16": + return torch.quantization.quantize_dynamic( + model, {nn.Linear}, dtype=torch.float16 + ) + + else: + raise ValueError("Unsupported quantization type") + class Basemodel: def __init__(self, config=None, repo=None): @@ -460,7 +477,9 @@ def finetune(self, dataset: ChronosDataset, **kwargs): finetune_model.eval() - def plot(self, dataset: ChronosDataset, horizon_len: int, quantile_levels: list, **kwargs): + def plot( + self, dataset: ChronosDataset, horizon_len: int, quantile_levels: list, **kwargs + ): """Plot forecast results produced by the Chronos pipeline. Args: @@ -515,7 +534,12 @@ def plot(self, dataset: ChronosDataset, horizon_len: int, quantile_levels: list, ) def evaluate( - self, dataset: ChronosDataset, horizon_len: int, quantile_levels: list, metric_only=False, **kwargs + self, + dataset: ChronosDataset, + horizon_len: int, + quantile_levels: list, + metric_only=False, + **kwargs, ): """Evaluate the Chronos model on a dataset. @@ -684,7 +708,13 @@ def finetune(self, dataset: ChronosBoltDataset, **kwargs): finetune_model.eval() - def plot(self, dataset: ChronosBoltDataset, horizon_len: int, quantile_levels: list, **kwargs): + def plot( + self, + dataset: ChronosBoltDataset, + horizon_len: int, + quantile_levels: list, + **kwargs, + ): """Plot forecast results produced by the ChronosBolt pipeline. Args: @@ -737,7 +767,12 @@ def plot(self, dataset: ChronosBoltDataset, horizon_len: int, quantile_levels: l ) def evaluate( - self, dataset: ChronosBoltDataset, horizon_len: int, quantile_levels: list, metric_only=False, **kwargs + self, + dataset: ChronosBoltDataset, + horizon_len: int, + quantile_levels: list, + metric_only=False, + **kwargs, ): """Evaluate the ChronosBolt model on a dataset. @@ -1047,14 +1082,16 @@ def evaluate(self, dataset: Chronos_2_Dataset, metric_only=False, **kwargs): output = self.model( context=inputs, num_output_patches=current_horizon_patch ) # b, h, q - quantile_output = output.quantile_preds.transpose(1, 2) # b, h, q + quantile_output = output.quantile_preds.transpose(1, 2) # b, h, q # take median as the last_ar_output # if the remaining length is smaller than the current horizon patch, we only take the first remaining_length values if remaining_length < current_horizon_patch * self.patch_size: quantile_output = quantile_output[:, :remaining_length, :] - + # last_ar_output = quantile_output[:, -current_horizon_patch * self.patch_size :, quantile_output.shape[-1] // 2] - last_ar_output = quantile_output[:, :, quantile_output.shape[-1] // 2] + last_ar_output = quantile_output[ + :, :, quantile_output.shape[-1] // 2 + ] all_quantile_outputs.append(quantile_output) remaining_length -= current_horizon_patch * self.patch_size quantile_prediction = torch.cat(all_quantile_outputs, dim=1) # b, h, q @@ -1281,10 +1318,18 @@ def quantize(self, quant_type="int8", device="cuda"): self.model.eval() self.model = self.model.to(device) with torch.no_grad(): - self.model = quantize_linear_layers(self.model, quantization_type=quant_type) + self.model = quantize_linear_layers( + self.model, quantization_type=quant_type + ) return self.model - def evaluate(self, dataset: LPTMDataset, task_name: str = "forecasting", metric_only=False, **kwargs): + def evaluate( + self, + dataset: LPTMDataset, + task_name: str = "forecasting", + metric_only=False, + **kwargs, + ): """Evaluate the LPTM model on a dataset. Args: @@ -1358,7 +1403,7 @@ def evaluate(self, dataset: LPTMDataset, task_name: str = "forecasting", metric_ mse = MSE(trues, preds) mae = MAE(trues, preds) mase = MASE(histories, trues, preds) - mape = MAPE(trues, preds) + mape = MAPE(trues, preds) rmse = RMSE(trues, preds) nrmse = NRMSE(trues, preds) smape = SMAPE(trues, preds) @@ -1608,7 +1653,9 @@ def __init__(self, config=None, repo=None): self.model = MOMENTPipeline.from_pretrained(repo, model_kwargs=self.config) self.model.init() - def finetune(self, dataset: MomentDataset, task_name: str = "forecasting", **kwargs): + def finetune( + self, dataset: MomentDataset, task_name: str = "forecasting", **kwargs + ): """Finetune the model on the given dataset. Args: @@ -1730,14 +1777,16 @@ def finetune(self, dataset: MomentDataset, task_name: str = "forecasting", **kwa scheduler.step() return self.model - + def quantize(self, quant_type="int8", device="cuda"): self.model.eval() self.model = self.model.to(device) with torch.no_grad(): - self.model = quantize_linear_layers(self.model, quantization_type=quant_type) + self.model = quantize_linear_layers( + self.model, quantization_type=quant_type + ) return self.model - + def plot(self, dataset: MomentDataset, task_name: str = "forecasting"): """Visualize results from the MOMENT model. @@ -1880,7 +1929,13 @@ def plot(self, dataset: MomentDataset, task_name: str = "forecasting"): # labels = np.concatenate(labels) # return accuracy, embeddings, labels - def evaluate(self, dataset: MomentDataset, task_name: str = "forecasting", metric_only: bool = False, **kwargs): + def evaluate( + self, + dataset: MomentDataset, + task_name: str = "forecasting", + metric_only: bool = False, + **kwargs, + ): """Evaluate the model. Args: @@ -2005,7 +2060,7 @@ def evaluate(self, dataset: MomentDataset, task_name: str = "forecasting", metri embeddings = np.concatenate(embeddings) labels = np.concatenate(labels) return accuracy, embeddings, labels - + elif task_name == "detection": trues, preds, labels = [], [], [] with torch.no_grad(): @@ -2849,7 +2904,10 @@ def evaluate( mse = np.mean(np.array([MSE(t, p) for t, p in zip(trues, preds)]), axis=0) mae = np.mean(np.array([MAE(t, p) for t, p in zip(trues, preds)]), axis=0) - mase = np.mean(np.array([MASE(h, t, p) for h, t, p in zip(histories, trues, preds)]), axis=0) + mase = np.mean( + np.array([MASE(h, t, p) for h, t, p in zip(histories, trues, preds)]), + axis=0, + ) mape = np.mean(np.array([MAPE(t, p) for t, p in zip(trues, preds)]), axis=0) rmse = np.mean(np.array([RMSE(t, p) for t, p in zip(trues, preds)]), axis=0) nrmse = np.mean(np.array([NRMSE(t, p) for t, p in zip(trues, preds)]), axis=0) @@ -3283,15 +3341,21 @@ def plot(self, dataset: TimeMoEDataset, **kwargs): trues.append(true) preds.append(pred) histories.append(history) - trues = np.concatenate(trues, axis=0).reshape( - dataset.n_channels, -1, dataset.horizon_len - ).transpose(1, 0, 2) - preds = np.concatenate(preds, axis=0).reshape( - dataset.n_channels, -1, dataset.horizon_len - ).transpose(1, 0, 2) - histories = np.concatenate(histories, axis=0).reshape( - dataset.n_channels, -1, dataset.context_len - ).transpose(1, 0, 2) + trues = ( + np.concatenate(trues, axis=0) + .reshape(dataset.n_channels, -1, dataset.horizon_len) + .transpose(1, 0, 2) + ) + preds = ( + np.concatenate(preds, axis=0) + .reshape(dataset.n_channels, -1, dataset.horizon_len) + .transpose(1, 0, 2) + ) + histories = ( + np.concatenate(histories, axis=0) + .reshape(dataset.n_channels, -1, dataset.context_len) + .transpose(1, 0, 2) + ) visualize( task_name="forecasting", @@ -3308,7 +3372,7 @@ def evaluate(self, dataset: TimeMoEDataset, metric_only: bool = False, **kwargs) metric_only (bool): If True, return only metrics. Defaults to False. Returns: - (Dict[str, float]): If `metric_only` is True, returns a dictionary containing evaluation metrics, + (Dict[str, float]): If `metric_only` is True, returns a dictionary containing evaluation metrics, including mse, mae, mase, mape, rmse, nrmse, smape, msis, nd. (Tuple[Dict[str, float], np.ndarray, np.ndarray, np.ndarray]): If `metric_only` is False, returns a tuple of (metrics, trues, preds, histories). @@ -3412,7 +3476,9 @@ def __init__(self, config=None, repo=None, **kwargs): self.model.compile(self.config) self.quantiles = self.model.model.config.quantiles - def evaluate(self, dataset: TimesFM_2p5_Dataset, metric_only: bool = False, **kwargs): + def evaluate( + self, dataset: TimesFM_2p5_Dataset, metric_only: bool = False, **kwargs + ): """Evaluate the model on the given dataset. Args: @@ -3420,7 +3486,7 @@ def evaluate(self, dataset: TimesFM_2p5_Dataset, metric_only: bool = False, **kw metric_only (bool): If True, return only metrics. Defaults to False. Returns: - (Dict[str, float]): If `metric_only` is True, returns a dictionary containing evaluation metrics, + (Dict[str, float]): If `metric_only` is True, returns a dictionary containing evaluation metrics, including mse, mae, mase, mape, rmse, nrmse, smape, msis, nd, mwsq, crps. (Tuple[Dict[str, float], np.ndarray, np.ndarray, np.ndarray]): If `metric_only` is False, returns a tuple of (metrics, trues, preds, histories). @@ -3451,17 +3517,35 @@ def evaluate(self, dataset: TimesFM_2p5_Dataset, metric_only: bool = False, **kw input_seq, mask_seq, ) - quantile_forecast = quantile_forecast[..., 1:].transpose(2, 0, 1) # (q, b, h) + quantile_forecast = quantile_forecast[..., 1:].transpose( + 2, 0, 1 + ) # (q, b, h) trues.append(target_seq.cpu().numpy()) preds.append(point_forecast) q_preds.append(quantile_forecast) histories.append(input_seq.cpu().numpy()) - trues = np.concatenate(trues, axis=0).reshape(dataset.n_channels, -1, dataset.horizon_len).transpose(1, 0, 2) - preds = np.concatenate(preds, axis=0).reshape(dataset.n_channels, -1, dataset.horizon_len).transpose(1, 0, 2) - q_preds = np.concatenate(q_preds, axis=1).reshape(q_preds[-1].shape[0], dataset.n_channels, -1, dataset.horizon_len).transpose(0, 2, 1, 3) - histories = np.concatenate(histories, axis=0).reshape(dataset.n_channels, -1, dataset.context_len).transpose(1, 0, 2) + trues = ( + np.concatenate(trues, axis=0) + .reshape(dataset.n_channels, -1, dataset.horizon_len) + .transpose(1, 0, 2) + ) + preds = ( + np.concatenate(preds, axis=0) + .reshape(dataset.n_channels, -1, dataset.horizon_len) + .transpose(1, 0, 2) + ) + q_preds = ( + np.concatenate(q_preds, axis=1) + .reshape(q_preds[-1].shape[0], dataset.n_channels, -1, dataset.horizon_len) + .transpose(0, 2, 1, 3) + ) + histories = ( + np.concatenate(histories, axis=0) + .reshape(dataset.n_channels, -1, dataset.context_len) + .transpose(1, 0, 2) + ) trues = dataset._denormalize_data(trues) preds = dataset._denormalize_data(preds) @@ -3473,7 +3557,7 @@ def evaluate(self, dataset: TimesFM_2p5_Dataset, metric_only: bool = False, **kw # Calculate metrics mse = MSE(trues, preds) - mae = MAE(trues, preds) + mae = MAE(trues, preds) mase = MASE(histories, trues, preds) mape = MAPE(trues, preds) rmse = RMSE(trues, preds)