From fa7c22a8ba756c77e3a5969153fe47820b197091 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Thu, 23 Apr 2026 16:03:07 +0000 Subject: [PATCH 1/3] feat: Adding the scale normalize flag in IAttention layer using a pass to annotate target nodes --- .../dynamo/conversion/_ConversionContext.py | 3 + .../dynamo/conversion/_TRTInterpreter.py | 1 + .../dynamo/conversion/impl/attention.py | 67 +++++- .../lowering/passes/_aten_lowering_pass.py | 2 + .../lowering/passes/annotate_fp8_sdpa.py | 76 +++++++ tests/py/dynamo/models/test_models_export.py | 201 ++++++++++++++++++ uv.lock | 196 +++++++++-------- 7 files changed, 442 insertions(+), 104 deletions(-) create mode 100644 py/torch_tensorrt/dynamo/lowering/passes/annotate_fp8_sdpa.py diff --git a/py/torch_tensorrt/dynamo/conversion/_ConversionContext.py b/py/torch_tensorrt/dynamo/conversion/_ConversionContext.py index f5ffdafda2..4555e925c1 100644 --- a/py/torch_tensorrt/dynamo/conversion/_ConversionContext.py +++ b/py/torch_tensorrt/dynamo/conversion/_ConversionContext.py @@ -1,6 +1,8 @@ from dataclasses import dataclass, field +from typing import Optional import torch +import torch.fx from torch_tensorrt.dynamo._settings import CompilationSettings from torch_tensorrt.dynamo.types import TRTNetwork @@ -25,6 +27,7 @@ class ConversionContext: requires_native_multidevice: bool = False weight_refit_map: dict[str, torch.Tensor] = field(default_factory=dict) cpu_weights_reference_holder: list[torch.Tensor] = field(default_factory=list) + current_node: Optional[torch.fx.Node] = field(default=None) def record_weight(self, name: str, weight: torch.Tensor) -> None: """ diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 1b7982f074..d8cff2e317 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -791,6 +791,7 @@ def call_function(self, target: str, args: Any, kwargs: Any) -> 
Any: self.ctx.requires_native_multidevice = True _LOGGER.debug(f"{target} requires native multi-device support") + self.ctx.current_node = self._cur_node if calling_convention is CallingConvention.LEGACY: return converter(self.ctx.net, target, args, kwargs, self._cur_node_name) else: diff --git a/py/torch_tensorrt/dynamo/conversion/impl/attention.py b/py/torch_tensorrt/dynamo/conversion/impl/attention.py index af9c2c7519..446b7ae99c 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/attention.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/attention.py @@ -1,8 +1,10 @@ import logging +import math from typing import Optional, Tuple, Union import tensorrt as trt from tensorrt import ITensor as TRTTensor +import torch from torch.fx.node import Target from torch_tensorrt._utils import is_tensorrt_version_supported from torch_tensorrt.dynamo._SourceIR import SourceIR @@ -16,6 +18,36 @@ _LOGGER: logging.Logger = logging.getLogger(__name__) +# FP8 E4M3 max representable magnitude. Softmax output is bounded to [0, 1], +# so 1/448 saturates exactly at 1.0 and is data-independent (no calibration needed). +_FP8_E4M3_MAX = 448.0 + + +def _maybe_set_fp8_softmax( + ctx: ConversionContext, + name: str, + attention_layer: trt.IAttention, +) -> bool: + """Set FP8 softmax normalization quantization on the IAttention layer if the current + node was annotated with a softmax FP8 scale by the annotate_fp8_sdpa lowering pass. + + Returns True if FP8 normalization was configured (caller must set decomposable=False). 
+ """ + if ctx.current_node is None: + return False + scale_val = ctx.current_node.meta.get("_fp8_softmax_scale") + if scale_val is None: + return False + scale_tensor = get_trt_tensor( + ctx, + torch.tensor(scale_val, dtype=torch.float32), + name + "_softmax_fp8_scale", + dtype=torch.float32, + ) + attention_layer.normalization_quantize_to_type = trt.DataType.FP8 + attention_layer.normalization_quantize_scale = scale_tensor + return True + def _normalize_attention_mask_rank( ctx: ConversionContext, @@ -178,6 +210,18 @@ def scaled_dot_product_attention( Returns: TRTTensor: Attention output tensor with shape [batch, heads, seq_len, head_dim] """ + # When FP8 softmax normalization is active (modelopt FP8 MHA pattern) TRT's + # FP8 MHA fusion requires the Q/DQ output to feed IAttention via a single + # same-dtype Mul; any HALF<->FLOAT cast inserted by the default dynamic + # 1/sqrt(D) computation breaks the fusion. Use a static same-dtype scalar + # scale computed from the concrete head_dim. 
+ fp8_norm_active = ( + ctx.current_node is not None + and ctx.current_node.meta.get("_fp8_softmax_scale") is not None + ) + if fp8_norm_active and scale is None and isinstance(query.shape[-1], int): + scale = 1.0 / math.sqrt(query.shape[-1]) + if scale is None: # 1 / math.sqrt(query.size(-1)) q_dim = impl.shape.shape(ctx, target, source_ir, f"{name}_shape_q", query, -1) @@ -291,7 +335,8 @@ def scaled_dot_product_attention( if mask_tensor is not None: attention_layer.mask = mask_tensor - attention_layer.decomposable = True + fp8_norm = _maybe_set_fp8_softmax(ctx, name, attention_layer) + attention_layer.decomposable = not fp8_norm attention_output = attention_layer.get_output(0) return attention_output @@ -319,6 +364,13 @@ def scaled_dot_product_flash_attention( Optional[TRTTensor], Optional[TRTTensor], ]: + fp8_norm_active = ( + ctx.current_node is not None + and ctx.current_node.meta.get("_fp8_softmax_scale") is not None + ) + if fp8_norm_active and scale is None and isinstance(query.shape[-1], int): + scale = 1.0 / math.sqrt(query.shape[-1]) + if scale is None: # 1 / math.sqrt(query.size(-1)) q_dim = impl.shape.shape(ctx, target, source_ir, f"{name}_shape_q", query, -1) @@ -367,7 +419,8 @@ def scaled_dot_product_flash_attention( ) assert attention_layer is not None, "attention layer is None" - attention_layer.decomposable = True + fp8_norm = _maybe_set_fp8_softmax(ctx, name, attention_layer) + attention_layer.decomposable = not fp8_norm attention_output = attention_layer.get_output(0) return attention_output, None, None, None, 0.0, 0.0, None, None, None @@ -387,6 +440,13 @@ def scaled_dot_product_efficient_attention( is_causal: bool = False, scale: Optional[float] = None, ) -> Tuple[TRTTensor, Optional[TRTTensor], Optional[TRTTensor], Optional[TRTTensor]]: + fp8_norm_active = ( + ctx.current_node is not None + and ctx.current_node.meta.get("_fp8_softmax_scale") is not None + ) + if fp8_norm_active and scale is None and isinstance(query.shape[-1], int): + scale = 
1.0 / math.sqrt(query.shape[-1]) + if scale is None: # 1 / math.sqrt(query.size(-1)) q_dim = impl.shape.shape(ctx, target, source_ir, f"{name}_shape_q", query, -1) @@ -523,7 +583,8 @@ def scaled_dot_product_efficient_attention( if mask_tensor is not None: attention_layer.mask = mask_tensor - attention_layer.decomposable = True + fp8_norm = _maybe_set_fp8_softmax(ctx, name, attention_layer) + attention_layer.decomposable = not fp8_norm attention_output = attention_layer.get_output(0) return attention_output, None, None, None diff --git a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py index 7b770ab68b..271f7c98b7 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py @@ -10,6 +10,7 @@ trace_intermediate_node_outputs, ) +from .annotate_fp8_sdpa import annotate_fp8_sdpa from .complex_graph_rewrite import complex_graph_detection from .constant_folding import constant_fold from .force_causal_efficient_attention import force_causal_efficient_attention @@ -41,6 +42,7 @@ remove_num_users_is_0_nodes, complex_graph_detection, force_causal_efficient_attention, + annotate_fp8_sdpa, ] if not is_tegra_platform(): diff --git a/py/torch_tensorrt/dynamo/lowering/passes/annotate_fp8_sdpa.py b/py/torch_tensorrt/dynamo/lowering/passes/annotate_fp8_sdpa.py new file mode 100644 index 0000000000..257c47974c --- /dev/null +++ b/py/torch_tensorrt/dynamo/lowering/passes/annotate_fp8_sdpa.py @@ -0,0 +1,76 @@ +import logging + +import torch +from torch_tensorrt.dynamo._settings import CompilationSettings + +logger = logging.getLogger(__name__) + +# FP8 E4M3 max. Softmax output is bounded to [0, 1], so 1/448 saturates at 1.0 exactly +# and is data-independent (no calibration required for the softmax output scale). 
+_FP8_E4M3_SOFTMAX_SCALE = 1.0 / 448.0 + +_SDPA_TARGETS = { + torch.ops.aten.scaled_dot_product_attention.default, + torch.ops.aten._scaled_dot_product_flash_attention.default, + torch.ops.aten._scaled_dot_product_efficient_attention.default, + torch.ops.aten._scaled_dot_product_cudnn_attention.default, +} + + +def _is_fp8_quantize_op(node: torch.fx.Node) -> bool: + """Return True when node is a tensorrt.quantize_op with FP8 dtype (exponent_bits=4).""" + if node.op != "call_function": + return False + try: + if node.target != torch.ops.tensorrt.quantize_op.default: + return False + except AttributeError: + return False + # args: (input, amax, num_bits, exponent_bits, ...) + args = node.args + return len(args) >= 4 and args[2] == 8 and args[3] == 4 + + +def annotate_fp8_sdpa( + gm: torch.fx.GraphModule, settings: CompilationSettings +) -> torch.fx.GraphModule: + """Annotate SDPA nodes whose Q, K, V inputs are all FP8-quantized. + + Detects the pattern emitted by modelopt when an attention module is + registered via ``register_attention_for_kv_quant``, which wraps the + Q, K, V arguments to ``F.scaled_dot_product_attention`` with + ``q_bmm_quantizer``, ``k_bmm_quantizer``, ``v_bmm_quantizer``: + + q_fp8 = quantize_op(q, amax_q, num_bits=8, exponent_bits=4, ...) + k_fp8 = quantize_op(k, amax_k, num_bits=8, exponent_bits=4, ...) + v_fp8 = quantize_op(v, amax_v, num_bits=8, exponent_bits=4, ...) + out = scaled_dot_product_attention(q_fp8, k_fp8, v_fp8, ...) + + When all three inputs match this pattern the pass sets + ``node.meta["_fp8_softmax_scale"] = 1/448`` on the SDPA node so the + attention converter can set ``IAttention.normalization_quantize_to_type + = FP8`` and ``IAttention.normalization_quantize_scale``, which TRT + requires to fuse into the ``_gemm_mha_v2`` FP8 MHA kernel. 
+ """ + changed = False + for node in gm.graph.nodes: + if node.op != "call_function" or node.target not in _SDPA_TARGETS: + continue + if len(node.args) < 3: + continue + q_node, k_node, v_node = node.args[0], node.args[1], node.args[2] + if not all( + isinstance(n, torch.fx.Node) and _is_fp8_quantize_op(n) + for n in (q_node, k_node, v_node) + ): + continue + node.meta["_fp8_softmax_scale"] = _FP8_E4M3_SOFTMAX_SCALE + changed = True + logger.debug( + f"Annotated SDPA node {node.name} with FP8 softmax scale " + f"{_FP8_E4M3_SOFTMAX_SCALE} (Q/K/V inputs are FP8-quantized)" + ) + + if changed: + logger.debug("FP8 SDPA softmax annotation complete") + return gm diff --git a/tests/py/dynamo/models/test_models_export.py b/tests/py/dynamo/models/test_models_export.py index ec625a59f2..2456df74af 100644 --- a/tests/py/dynamo/models/test_models_export.py +++ b/tests/py/dynamo/models/test_models_export.py @@ -636,3 +636,204 @@ def calibrate_loop(model): ) outputs_trt = trt_model(input_tensor) assert torch.allclose(output_pyt, outputs_trt, rtol=5e-2, atol=5e-2) + + +@unittest.skipIf( + not importlib.util.find_spec("modelopt"), + "ModelOpt is required to run this test", +) +@pytest.mark.unit +def test_fp8_mha_softmax_quantizer_annotation(ir): + """Regression test for #4200: annotate_fp8_sdpa must tag an SDPA node whose + Q, K, V inputs are all FP8-quantized via ``tensorrt.quantize_op``. + + This matches the FX pattern emitted by modelopt's + ``register_attention_for_kv_quant`` when ``NVFP4_FP8_MHA_CONFIG`` is applied: + the attention module's ``F.scaled_dot_product_attention`` call has its Q, + K, V arguments wrapped by ``q_bmm_quantizer``, ``k_bmm_quantizer``, + ``v_bmm_quantizer`` (all FP8). + + The annotated ``_fp8_softmax_scale = 1/448`` on the SDPA node lets the + attention converter set ``IAttention.normalization_quantize_to_type = FP8`` + and ``IAttention.normalization_quantize_scale`` so TRT can fuse the full + ``_gemm_mha_v2`` FP8 MHA kernel. 
+ + Also verifies that INT8 Q/K/V (exponent_bits=0) or a partially-FP8 input + (one of Q/K/V not quantized) do NOT trigger the annotation. + """ + import torch.fx as fx + from torch_tensorrt.dynamo._settings import CompilationSettings + from torch_tensorrt.dynamo.lowering.passes.annotate_fp8_sdpa import ( + _SDPA_TARGETS, + annotate_fp8_sdpa, + ) + + def _build_sdpa_input_quant_graph( + exponent_bits: int, quantize_v: bool = True + ) -> fx.GraphModule: + """Build FX graph where Q, K, V flow into SDPA through quantize_op nodes.""" + graph = fx.Graph() + q = graph.placeholder("q") + k = graph.placeholder("k") + v = graph.placeholder("v") + amax = graph.placeholder("amax") + q_q = graph.call_function( + torch.ops.tensorrt.quantize_op.default, + args=(q, amax, 8, exponent_bits, False, False), + ) + k_q = graph.call_function( + torch.ops.tensorrt.quantize_op.default, + args=(k, amax, 8, exponent_bits, False, False), + ) + v_q = ( + graph.call_function( + torch.ops.tensorrt.quantize_op.default, + args=(v, amax, 8, exponent_bits, False, False), + ) + if quantize_v + else v + ) + out = graph.call_function( + torch.ops.aten.scaled_dot_product_attention.default, args=(q_q, k_q, v_q) + ) + graph.output(out) + return fx.GraphModule({}, graph) + + settings = CompilationSettings() + + # FP8 Q/K/V inputs (exponent_bits=4): SDPA node must be annotated with 1/448. + gm_fp8 = _build_sdpa_input_quant_graph(exponent_bits=4) + annotate_fp8_sdpa(gm_fp8, settings) + sdpa_nodes = [n for n in gm_fp8.graph.nodes if n.target in _SDPA_TARGETS] + assert sdpa_nodes, "No SDPA node found in graph" + assert all( + "_fp8_softmax_scale" in n.meta for n in sdpa_nodes + ), "annotate_fp8_sdpa did not annotate SDPA when Q/K/V inputs are FP8" + expected_scale = 1.0 / 448.0 + for n in sdpa_nodes: + assert ( + abs(n.meta["_fp8_softmax_scale"] - expected_scale) < 1e-12 + ), f"Wrong softmax scale: {n.meta['_fp8_softmax_scale']}" + + # INT8 Q/K/V inputs (exponent_bits=0): SDPA node must NOT be annotated. 
+ gm_int8 = _build_sdpa_input_quant_graph(exponent_bits=0) + annotate_fp8_sdpa(gm_int8, settings) + sdpa_int8 = [n for n in gm_int8.graph.nodes if n.target in _SDPA_TARGETS] + assert all( + "_fp8_softmax_scale" not in n.meta for n in sdpa_int8 + ), "annotate_fp8_sdpa incorrectly annotated SDPA when Q/K/V are INT8" + + # Only Q and K are FP8-quantized, V is raw: SDPA must NOT be annotated. + gm_partial = _build_sdpa_input_quant_graph(exponent_bits=4, quantize_v=False) + annotate_fp8_sdpa(gm_partial, settings) + sdpa_partial = [n for n in gm_partial.graph.nodes if n.target in _SDPA_TARGETS] + assert all( + "_fp8_softmax_scale" not in n.meta for n in sdpa_partial + ), "annotate_fp8_sdpa incorrectly annotated SDPA when V input is not FP8" + + +@unittest.skipIf( + torch.cuda.get_device_capability() < (8, 9), + "FP8 quantization requires compute capability 8.9 or later", +) +@pytest.mark.unit +def test_fp8_mha_fused_kernel(ir): + """Regression test for #4200: FP8 MHA with FP8 Q/K/V inputs must produce a + fused ``_gemm_mha_v2`` MHA kernel with normalization_quantize_to_type set. + + Hand-constructs the FX pattern that a future modelopt PyTorch-backend + version will emit for FP8 MHA (mirrors PR NVIDIA/Model-Optimizer#1289): + + quantize_op(Q) ─┐ + quantize_op(K) ─┤─ scaled_dot_product_attention + quantize_op(V) ─┘ + + Built directly via ``torch.ops.tensorrt.quantize_op`` so we do not depend + on modelopt actually supporting this pattern in its PyTorch backend today — + if/when it does, torch-tensorrt will compile that graph to the fused kernel. + + Verifies: + 1. Engine inspector shows a layer name containing ``mha`` (i.e. + ``_gemm_mha_v2``), confirming the FP8 MHA fusion triggered. + 2. Numerics match PyTorch reference SDPA within FP8 tolerance + (cosine_similarity > 0.99). + + D=64 meets TRT's head_dim >= 32 requirement for the + normalization_quantize FP8 kernel. 
+ """ + import json + + import torch_tensorrt + + import tensorrt as trt + + B, H, S, D = 1, 2, 32, 64 + torch.manual_seed(0) + + class FP8MHAModel(torch.nn.Module): + """Mirror of what a modelopt FP8 MHA PyTorch export will look like: + tensorrt.quantize_op on Q, K, V feeding F.scaled_dot_product_attention.""" + + def __init__(self, amax_val: float = 6.0): + super().__init__() + self.register_buffer("amax_q", torch.tensor(amax_val, dtype=torch.float32)) + self.register_buffer("amax_k", torch.tensor(amax_val, dtype=torch.float32)) + self.register_buffer("amax_v", torch.tensor(amax_val, dtype=torch.float32)) + + def forward(self, q, k, v): + q_fp8 = torch.ops.tensorrt.quantize_op(q, self.amax_q, 8, 4, False, False) + k_fp8 = torch.ops.tensorrt.quantize_op(k, self.amax_k, 8, 4, False, False) + v_fp8 = torch.ops.tensorrt.quantize_op(v, self.amax_v, 8, 4, False, False) + return torch.nn.functional.scaled_dot_product_attention(q_fp8, k_fp8, v_fp8) + + q = torch.randn(B, H, S, D, dtype=torch.float16).cuda() + k = torch.randn(B, H, S, D, dtype=torch.float16).cuda() + v = torch.randn(B, H, S, D, dtype=torch.float16).cuda() + + model = FP8MHAModel().eval().cuda() + ref_out = torch.nn.functional.scaled_dot_product_attention(q, k, v) + + exp_program = torch.export.export(model, (q, k, v), strict=False) + serialized_engine = ( + torch_tensorrt.dynamo.convert_exported_program_to_serialized_trt_engine( + exp_program, + inputs=[q, k, v], + use_explicit_typing=True, + min_block_size=1, + ) + ) + + runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING)) + engine = runtime.deserialize_cuda_engine(serialized_engine) + inspector = engine.create_engine_inspector() + engine_json = json.loads( + inspector.get_engine_information(trt.LayerInformationFormat.JSON) + ) + layers = engine_json.get("Layers", []) + layer_names = [ + layer if isinstance(layer, str) else layer.get("Name", "") for layer in layers + ] + assert any("mha" in name.lower() for name in layer_names), ( + f"No fused MHA 
kernel found in compiled engine. Expected a layer " + f"containing 'mha' (e.g. _gemm_mha_v2) — TRT fuses FP8 Q/K/V + " + f"normalization_quantize_to_type into a single MHA kernel. " + f"Layer names present: {layer_names}" + ) + + # Numerical sanity: FP8-quantized MHA should agree with PyTorch SDPA. + compiled = torch_tensorrt.compile( + model, + ir="dynamo", + inputs=[q, k, v], + use_explicit_typing=True, + min_block_size=1, + ) + with torch.no_grad(): + trt_out = compiled(q, k, v) + cos = torch.nn.functional.cosine_similarity( + ref_out.flatten().float().unsqueeze(0), + trt_out.flatten().float().unsqueeze(0), + ).item() + assert ( + cos > 0.99 + ), f"FP8 MHA output deviates from PyTorch reference: cosine_similarity={cos}" diff --git a/uv.lock b/uv.lock index d2f9f309cb..9cf008868c 100644 --- a/uv.lock +++ b/uv.lock @@ -34,7 +34,7 @@ name = "accelerate" version = "1.13.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "huggingface-hub", version = "0.36.2", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "huggingface-hub", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "numpy", version = "2.2.6", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version < '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version < '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, { name = "numpy", 
version = "2.4.4", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version >= '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version >= '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, { name = "packaging", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -728,7 +728,7 @@ dependencies = [ { name = "filelock", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "fsspec", version = "2026.2.0", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, extra = ["http"], marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, { name = "httpx", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "huggingface-hub", version = "0.36.2", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "huggingface-hub", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "multiprocess", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "numpy", version = "2.2.6", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version < '3.11' and sys_platform == 'linux' and extra == 
'group-14-torch-tensorrt-quantization') or (python_full_version < '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, { name = "numpy", version = "2.4.4", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version >= '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version >= '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, @@ -782,7 +782,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "httpx", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "huggingface-hub", version = "0.36.2", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "huggingface-hub", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "importlib-metadata", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "numpy", version = "2.2.6", source = 
{ registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version < '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version < '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, { name = "numpy", version = "2.4.4", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version >= '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version >= '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, @@ -1221,39 +1221,6 @@ wheels = [ { url = "https://download.pytorch.org/whl/nightly/httpx-0.28.1-py3-none-any.whl", upload-time = "2025-09-17T03:11:10Z" }, ] -[[package]] -name = "huggingface-hub" -version = "0.36.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14' and sys_platform == 'linux'", - "python_full_version >= '3.11' and python_full_version < '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 
'linux'", - "python_full_version >= '3.11' and python_full_version < '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "python_full_version >= '3.11' and python_full_version < '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'", - "python_full_version < '3.11' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", - "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "python_full_version < '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'", - "python_full_version >= '3.14' and sys_platform == 'win32'", - "python_full_version >= '3.11' and python_full_version < '3.14' and platform_machine != 'AMD64' and sys_platform == 'win32'", - "python_full_version >= '3.11' and python_full_version < '3.14' and platform_machine == 'AMD64' and sys_platform == 'win32'", - "python_full_version < '3.11' and platform_machine != 'AMD64' and sys_platform == 'win32'", - "python_full_version < '3.11' and platform_machine == 'AMD64' and sys_platform == 'win32'", -] -dependencies = [ - { name = "filelock", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "fsspec", version = "2026.2.0", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "hf-xet", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (platform_machine == 'amd64' and sys_platform == 
'linux' and extra == 'group-14-torch-tensorrt-quantization') or (platform_machine == 'arm64' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (platform_machine == 'aarch64' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (platform_machine == 'amd64' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (platform_machine == 'arm64' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (platform_machine == 'x86_64' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "packaging", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "pyyaml", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "requests", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' 
and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "tqdm", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "typing-extensions", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/7c/b7/8cb61d2eece5fb05a83271da168186721c450eb74e3c31f7ef3169fa475b/huggingface_hub-0.36.2.tar.gz", hash = "sha256:1934304d2fb224f8afa3b87007d58501acfda9215b334eed53072dd5e815ff7a", size = 649782, upload-time = "2026-02-06T09:24:13.098Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a8/af/48ac8483240de756d2438c380746e7130d1c6f75802ef22f3c6d49982787/huggingface_hub-0.36.2-py3-none-any.whl", hash = "sha256:48f0c8eac16145dfce371e9d2d7772854a4f591bcb56c9cf548accf531d54270", size = 566395, upload-time = "2026-02-06T09:24:11.133Z" }, -] - [[package]] name = "huggingface-hub" version = "1.11.0" @@ -1273,15 +1240,16 @@ resolution-markers = [ "python_full_version < '3.11' and platform_machine == 'AMD64' and sys_platform == 'win32'", ] dependencies = [ - { name = "filelock", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "filelock", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "fsspec", version = "2026.2.0", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = 
"(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, { name = "fsspec", version = "2026.3.0", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "hf-xet", marker = "(platform_machine == 'AMD64' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (platform_machine == 'amd64' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (platform_machine == 'arm64' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (platform_machine == 'AMD64' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (platform_machine == 'aarch64' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (platform_machine == 'amd64' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (platform_machine == 'arm64' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (platform_machine == 'x86_64' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 
'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "httpx", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "packaging", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "pyyaml", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "tqdm", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "typer", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "typing-extensions", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "hf-xet", marker = 
"(platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'amd64' and sys_platform == 'linux') or (platform_machine == 'arm64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine == 'AMD64' and sys_platform == 'win32') or (platform_machine == 'aarch64' and sys_platform == 'win32') or (platform_machine == 'amd64' and sys_platform == 'win32') or (platform_machine == 'arm64' and sys_platform == 'win32') or (platform_machine == 'x86_64' and sys_platform == 'win32')" }, + { name = "httpx", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "packaging", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "pyyaml", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "tqdm", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "typer", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "typing-extensions", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/dc/89/e7aa12d8a6b9259bed10671abb25ae6fa437c0f88a86ecbf59617bae7759/huggingface_hub-1.11.0.tar.gz", hash = "sha256:15fb3713c7f9cdff7b808a94fd91664f661ab142796bb48c9cd9493e8d166278", size = 761749, upload-time = "2026-04-16T13:07:39.73Z" } wheels = [ @@ -2528,8 +2496,8 @@ wheels = [ [[package]] name = "nvidia-modelopt" -version = "0.43.0" -source = { registry = "https://pypi.nvidia.com/" } +version = "0.45.0.dev8+gc7966119e" +source = { git = "https://github.com/NVIDIA/Model-Optimizer.git?rev=main#c7966119eb7fdec1bac4ad938413048f145a45fb" } dependencies = [ { name = "ninja", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "numpy", version = "2.2.6", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version < '3.11' 
and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version < '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, @@ -2549,9 +2517,6 @@ dependencies = [ { name = "torch", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "tqdm", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] -wheels = [ - { url = "https://pypi.nvidia.com/nvidia-modelopt/nvidia_modelopt-0.43.0-py3-none-any.whl", hash = "sha256:fe11a49e16230435b3a17153bcdb5717b2859a61544cdbe9dcb3f062ba2c203a" }, -] [package.optional-dependencies] hf = [ @@ -2559,11 +2524,12 @@ hf = [ { name = "datasets", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "deepspeed", marker = "sys_platform == 'linux'" }, { name = "diffusers", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "huggingface-hub", version = "0.36.2", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "huggingface-hub", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "nltk", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "peft", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "sentencepiece", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "transformers", version = "4.57.6", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { 
name = "tiktoken", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "transformers", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "wonderwords", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] @@ -2796,7 +2762,7 @@ version = "0.19.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "accelerate", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "huggingface-hub", version = "0.36.2", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "huggingface-hub", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "numpy", version = "2.2.6", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version < '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version < '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, { name = "numpy", version = "2.4.4", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version >= '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version >= '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or 
(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, { name = "packaging", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -2805,7 +2771,7 @@ dependencies = [ { name = "safetensors", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "torch", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "tqdm", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "transformers", version = "4.57.6", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "transformers", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/86/cf/037f1e3d5186496c05513a6754639e2dab3038a05f384284d49a9bd06a2d/peft-0.19.1.tar.gz", hash = "sha256:0d97542fe96dcdaa20d3b81c06f26f988618f416a73544ab23c3618ccb674a40", size = 763738, upload-time = "2026-04-16T15:46:45.105Z" } wheels = [ @@ -4345,12 +4311,73 @@ wheels = [ { url = "https://pypi.nvidia.com/tensorrt-cu13-libs/tensorrt_cu13_libs-10.16.1.11-py3-none-win_amd64.whl", hash = "sha256:96262c3e8c64a45abd29aa3482d99480fac6845bd420b6de125699bb1ae365ff" }, ] +[[package]] +name = "tiktoken" +version = "0.12.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "regex", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "requests", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7d/ab/4d017d0f76ec3171d469d80fc03dfbb4e48a4bcaddaa831b31d526f05edc/tiktoken-0.12.0.tar.gz", hash = "sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931", size = 37806, upload-time = 
"2025-10-06T20:22:45.419Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/89/b3/2cb7c17b6c4cf8ca983204255d3f1d95eda7213e247e6947a0ee2c747a2c/tiktoken-0.12.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3de02f5a491cfd179aec916eddb70331814bd6bf764075d39e21d5862e533970", size = 1051991, upload-time = "2025-10-06T20:21:34.098Z" }, + { url = "https://files.pythonhosted.org/packages/27/0f/df139f1df5f6167194ee5ab24634582ba9a1b62c6b996472b0277ec80f66/tiktoken-0.12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b6cfb6d9b7b54d20af21a912bfe63a2727d9cfa8fbda642fd8322c70340aad16", size = 995798, upload-time = "2025-10-06T20:21:35.579Z" }, + { url = "https://files.pythonhosted.org/packages/ef/5d/26a691f28ab220d5edc09b9b787399b130f24327ef824de15e5d85ef21aa/tiktoken-0.12.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:cde24cdb1b8a08368f709124f15b36ab5524aac5fa830cc3fdce9c03d4fb8030", size = 1129865, upload-time = "2025-10-06T20:21:36.675Z" }, + { url = "https://files.pythonhosted.org/packages/b2/94/443fab3d4e5ebecac895712abd3849b8da93b7b7dec61c7db5c9c7ebe40c/tiktoken-0.12.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:6de0da39f605992649b9cfa6f84071e3f9ef2cec458d08c5feb1b6f0ff62e134", size = 1152856, upload-time = "2025-10-06T20:21:37.873Z" }, + { url = "https://files.pythonhosted.org/packages/54/35/388f941251b2521c70dd4c5958e598ea6d2c88e28445d2fb8189eecc1dfc/tiktoken-0.12.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:6faa0534e0eefbcafaccb75927a4a380463a2eaa7e26000f0173b920e98b720a", size = 1195308, upload-time = "2025-10-06T20:21:39.577Z" }, + { url = "https://files.pythonhosted.org/packages/f8/00/c6681c7f833dd410576183715a530437a9873fa910265817081f65f9105f/tiktoken-0.12.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:82991e04fc860afb933efb63957affc7ad54f83e2216fe7d319007dab1ba5892", size = 1255697, upload-time = "2025-10-06T20:21:41.154Z" }, + { url = 
"https://files.pythonhosted.org/packages/5f/d2/82e795a6a9bafa034bf26a58e68fe9a89eeaaa610d51dbeb22106ba04f0a/tiktoken-0.12.0-cp310-cp310-win_amd64.whl", hash = "sha256:6fb2995b487c2e31acf0a9e17647e3b242235a20832642bb7a9d1a181c0c1bb1", size = 879375, upload-time = "2025-10-06T20:21:43.201Z" }, + { url = "https://files.pythonhosted.org/packages/de/46/21ea696b21f1d6d1efec8639c204bdf20fde8bafb351e1355c72c5d7de52/tiktoken-0.12.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6e227c7f96925003487c33b1b32265fad2fbcec2b7cf4817afb76d416f40f6bb", size = 1051565, upload-time = "2025-10-06T20:21:44.566Z" }, + { url = "https://files.pythonhosted.org/packages/c9/d9/35c5d2d9e22bb2a5f74ba48266fb56c63d76ae6f66e02feb628671c0283e/tiktoken-0.12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c06cf0fcc24c2cb2adb5e185c7082a82cba29c17575e828518c2f11a01f445aa", size = 995284, upload-time = "2025-10-06T20:21:45.622Z" }, + { url = "https://files.pythonhosted.org/packages/01/84/961106c37b8e49b9fdcf33fe007bb3a8fdcc380c528b20cc7fbba80578b8/tiktoken-0.12.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:f18f249b041851954217e9fd8e5c00b024ab2315ffda5ed77665a05fa91f42dc", size = 1129201, upload-time = "2025-10-06T20:21:47.074Z" }, + { url = "https://files.pythonhosted.org/packages/6a/d0/3d9275198e067f8b65076a68894bb52fd253875f3644f0a321a720277b8a/tiktoken-0.12.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:47a5bc270b8c3db00bb46ece01ef34ad050e364b51d406b6f9730b64ac28eded", size = 1152444, upload-time = "2025-10-06T20:21:48.139Z" }, + { url = "https://files.pythonhosted.org/packages/78/db/a58e09687c1698a7c592e1038e01c206569b86a0377828d51635561f8ebf/tiktoken-0.12.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:508fa71810c0efdcd1b898fda574889ee62852989f7c1667414736bcb2b9a4bd", size = 1195080, upload-time = "2025-10-06T20:21:49.246Z" }, + { url = 
"https://files.pythonhosted.org/packages/9e/1b/a9e4d2bf91d515c0f74afc526fd773a812232dd6cda33ebea7f531202325/tiktoken-0.12.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a1af81a6c44f008cba48494089dd98cccb8b313f55e961a52f5b222d1e507967", size = 1255240, upload-time = "2025-10-06T20:21:50.274Z" }, + { url = "https://files.pythonhosted.org/packages/9d/15/963819345f1b1fb0809070a79e9dd96938d4ca41297367d471733e79c76c/tiktoken-0.12.0-cp311-cp311-win_amd64.whl", hash = "sha256:3e68e3e593637b53e56f7237be560f7a394451cb8c11079755e80ae64b9e6def", size = 879422, upload-time = "2025-10-06T20:21:51.734Z" }, + { url = "https://files.pythonhosted.org/packages/a4/85/be65d39d6b647c79800fd9d29241d081d4eeb06271f383bb87200d74cf76/tiktoken-0.12.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b97f74aca0d78a1ff21b8cd9e9925714c15a9236d6ceacf5c7327c117e6e21e8", size = 1050728, upload-time = "2025-10-06T20:21:52.756Z" }, + { url = "https://files.pythonhosted.org/packages/4a/42/6573e9129bc55c9bf7300b3a35bef2c6b9117018acca0dc760ac2d93dffe/tiktoken-0.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2b90f5ad190a4bb7c3eb30c5fa32e1e182ca1ca79f05e49b448438c3e225a49b", size = 994049, upload-time = "2025-10-06T20:21:53.782Z" }, + { url = "https://files.pythonhosted.org/packages/66/c5/ed88504d2f4a5fd6856990b230b56d85a777feab84e6129af0822f5d0f70/tiktoken-0.12.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:65b26c7a780e2139e73acc193e5c63ac754021f160df919add909c1492c0fb37", size = 1129008, upload-time = "2025-10-06T20:21:54.832Z" }, + { url = "https://files.pythonhosted.org/packages/f4/90/3dae6cc5436137ebd38944d396b5849e167896fc2073da643a49f372dc4f/tiktoken-0.12.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:edde1ec917dfd21c1f2f8046b86348b0f54a2c0547f68149d8600859598769ad", size = 1152665, upload-time = "2025-10-06T20:21:56.129Z" }, + { url = 
"https://files.pythonhosted.org/packages/a3/fe/26df24ce53ffde419a42f5f53d755b995c9318908288c17ec3f3448313a3/tiktoken-0.12.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:35a2f8ddd3824608b3d650a000c1ef71f730d0c56486845705a8248da00f9fe5", size = 1194230, upload-time = "2025-10-06T20:21:57.546Z" }, + { url = "https://files.pythonhosted.org/packages/20/cc/b064cae1a0e9fac84b0d2c46b89f4e57051a5f41324e385d10225a984c24/tiktoken-0.12.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83d16643edb7fa2c99eff2ab7733508aae1eebb03d5dfc46f5565862810f24e3", size = 1254688, upload-time = "2025-10-06T20:21:58.619Z" }, + { url = "https://files.pythonhosted.org/packages/81/10/b8523105c590c5b8349f2587e2fdfe51a69544bd5a76295fc20f2374f470/tiktoken-0.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffc5288f34a8bc02e1ea7047b8d041104791d2ddbf42d1e5fa07822cbffe16bd", size = 878694, upload-time = "2025-10-06T20:21:59.876Z" }, + { url = "https://files.pythonhosted.org/packages/00/61/441588ee21e6b5cdf59d6870f86beb9789e532ee9718c251b391b70c68d6/tiktoken-0.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:775c2c55de2310cc1bc9a3ad8826761cbdc87770e586fd7b6da7d4589e13dab3", size = 1050802, upload-time = "2025-10-06T20:22:00.96Z" }, + { url = "https://files.pythonhosted.org/packages/1f/05/dcf94486d5c5c8d34496abe271ac76c5b785507c8eae71b3708f1ad9b45a/tiktoken-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a01b12f69052fbe4b080a2cfb867c4de12c704b56178edf1d1d7b273561db160", size = 993995, upload-time = "2025-10-06T20:22:02.788Z" }, + { url = "https://files.pythonhosted.org/packages/a0/70/5163fe5359b943f8db9946b62f19be2305de8c3d78a16f629d4165e2f40e/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:01d99484dc93b129cd0964f9d34eee953f2737301f18b3c7257bf368d7615baa", size = 1128948, upload-time = "2025-10-06T20:22:03.814Z" }, + { url = 
"https://files.pythonhosted.org/packages/0c/da/c028aa0babf77315e1cef357d4d768800c5f8a6de04d0eac0f377cb619fa/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:4a1a4fcd021f022bfc81904a911d3df0f6543b9e7627b51411da75ff2fe7a1be", size = 1151986, upload-time = "2025-10-06T20:22:05.173Z" }, + { url = "https://files.pythonhosted.org/packages/a0/5a/886b108b766aa53e295f7216b509be95eb7d60b166049ce2c58416b25f2a/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:981a81e39812d57031efdc9ec59fa32b2a5a5524d20d4776574c4b4bd2e9014a", size = 1194222, upload-time = "2025-10-06T20:22:06.265Z" }, + { url = "https://files.pythonhosted.org/packages/f4/f8/4db272048397636ac7a078d22773dd2795b1becee7bc4922fe6207288d57/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9baf52f84a3f42eef3ff4e754a0db79a13a27921b457ca9832cf944c6be4f8f3", size = 1255097, upload-time = "2025-10-06T20:22:07.403Z" }, + { url = "https://files.pythonhosted.org/packages/8e/32/45d02e2e0ea2be3a9ed22afc47d93741247e75018aac967b713b2941f8ea/tiktoken-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:b8a0cd0c789a61f31bf44851defbd609e8dd1e2c8589c614cc1060940ef1f697", size = 879117, upload-time = "2025-10-06T20:22:08.418Z" }, + { url = "https://files.pythonhosted.org/packages/ce/76/994fc868f88e016e6d05b0da5ac24582a14c47893f4474c3e9744283f1d5/tiktoken-0.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d5f89ea5680066b68bcb797ae85219c72916c922ef0fcdd3480c7d2315ffff16", size = 1050309, upload-time = "2025-10-06T20:22:10.939Z" }, + { url = "https://files.pythonhosted.org/packages/f6/b8/57ef1456504c43a849821920d582a738a461b76a047f352f18c0b26c6516/tiktoken-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b4e7ed1c6a7a8a60a3230965bdedba8cc58f68926b835e519341413370e0399a", size = 993712, upload-time = "2025-10-06T20:22:12.115Z" }, + { url = 
"https://files.pythonhosted.org/packages/72/90/13da56f664286ffbae9dbcfadcc625439142675845baa62715e49b87b68b/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:fc530a28591a2d74bce821d10b418b26a094bf33839e69042a6e86ddb7a7fb27", size = 1128725, upload-time = "2025-10-06T20:22:13.541Z" }, + { url = "https://files.pythonhosted.org/packages/05/df/4f80030d44682235bdaecd7346c90f67ae87ec8f3df4a3442cb53834f7e4/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:06a9f4f49884139013b138920a4c393aa6556b2f8f536345f11819389c703ebb", size = 1151875, upload-time = "2025-10-06T20:22:14.559Z" }, + { url = "https://files.pythonhosted.org/packages/22/1f/ae535223a8c4ef4c0c1192e3f9b82da660be9eb66b9279e95c99288e9dab/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:04f0e6a985d95913cabc96a741c5ffec525a2c72e9df086ff17ebe35985c800e", size = 1194451, upload-time = "2025-10-06T20:22:15.545Z" }, + { url = "https://files.pythonhosted.org/packages/78/a7/f8ead382fce0243cb625c4f266e66c27f65ae65ee9e77f59ea1653b6d730/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0ee8f9ae00c41770b5f9b0bb1235474768884ae157de3beb5439ca0fd70f3e25", size = 1253794, upload-time = "2025-10-06T20:22:16.624Z" }, + { url = "https://files.pythonhosted.org/packages/93/e0/6cc82a562bc6365785a3ff0af27a2a092d57c47d7a81d9e2295d8c36f011/tiktoken-0.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:dc2dd125a62cb2b3d858484d6c614d136b5b848976794edfb63688d539b8b93f", size = 878777, upload-time = "2025-10-06T20:22:18.036Z" }, + { url = "https://files.pythonhosted.org/packages/72/05/3abc1db5d2c9aadc4d2c76fa5640134e475e58d9fbb82b5c535dc0de9b01/tiktoken-0.12.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a90388128df3b3abeb2bfd1895b0681412a8d7dc644142519e6f0a97c2111646", size = 1050188, upload-time = "2025-10-06T20:22:19.563Z" }, + { url = 
"https://files.pythonhosted.org/packages/e3/7b/50c2f060412202d6c95f32b20755c7a6273543b125c0985d6fa9465105af/tiktoken-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:da900aa0ad52247d8794e307d6446bd3cdea8e192769b56276695d34d2c9aa88", size = 993978, upload-time = "2025-10-06T20:22:20.702Z" }, + { url = "https://files.pythonhosted.org/packages/14/27/bf795595a2b897e271771cd31cb847d479073497344c637966bdf2853da1/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:285ba9d73ea0d6171e7f9407039a290ca77efcdb026be7769dccc01d2c8d7fff", size = 1129271, upload-time = "2025-10-06T20:22:22.06Z" }, + { url = "https://files.pythonhosted.org/packages/f5/de/9341a6d7a8f1b448573bbf3425fa57669ac58258a667eb48a25dfe916d70/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:d186a5c60c6a0213f04a7a802264083dea1bbde92a2d4c7069e1a56630aef830", size = 1151216, upload-time = "2025-10-06T20:22:23.085Z" }, + { url = "https://files.pythonhosted.org/packages/75/0d/881866647b8d1be4d67cb24e50d0c26f9f807f994aa1510cb9ba2fe5f612/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:604831189bd05480f2b885ecd2d1986dc7686f609de48208ebbbddeea071fc0b", size = 1194860, upload-time = "2025-10-06T20:22:24.602Z" }, + { url = "https://files.pythonhosted.org/packages/b3/1e/b651ec3059474dab649b8d5b69f5c65cd8fcd8918568c1935bd4136c9392/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8f317e8530bb3a222547b85a58583238c8f74fd7a7408305f9f63246d1a0958b", size = 1254567, upload-time = "2025-10-06T20:22:25.671Z" }, + { url = "https://files.pythonhosted.org/packages/80/57/ce64fd16ac390fafde001268c364d559447ba09b509181b2808622420eec/tiktoken-0.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:399c3dd672a6406719d84442299a490420b458c44d3ae65516302a99675888f3", size = 921067, upload-time = "2025-10-06T20:22:26.753Z" }, + { url = 
"https://files.pythonhosted.org/packages/ac/a4/72eed53e8976a099539cdd5eb36f241987212c29629d0a52c305173e0a68/tiktoken-0.12.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c2c714c72bc00a38ca969dae79e8266ddec999c7ceccd603cc4f0d04ccd76365", size = 1050473, upload-time = "2025-10-06T20:22:27.775Z" }, + { url = "https://files.pythonhosted.org/packages/e6/d7/0110b8f54c008466b19672c615f2168896b83706a6611ba6e47313dbc6e9/tiktoken-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cbb9a3ba275165a2cb0f9a83f5d7025afe6b9d0ab01a22b50f0e74fee2ad253e", size = 993855, upload-time = "2025-10-06T20:22:28.799Z" }, + { url = "https://files.pythonhosted.org/packages/5f/77/4f268c41a3957c418b084dd576ea2fad2e95da0d8e1ab705372892c2ca22/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:dfdfaa5ffff8993a3af94d1125870b1d27aed7cb97aa7eb8c1cefdbc87dbee63", size = 1129022, upload-time = "2025-10-06T20:22:29.981Z" }, + { url = "https://files.pythonhosted.org/packages/4e/2b/fc46c90fe5028bd094cd6ee25a7db321cb91d45dc87531e2bdbb26b4867a/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:584c3ad3d0c74f5269906eb8a659c8bfc6144a52895d9261cdaf90a0ae5f4de0", size = 1150736, upload-time = "2025-10-06T20:22:30.996Z" }, + { url = "https://files.pythonhosted.org/packages/28/c0/3c7a39ff68022ddfd7d93f3337ad90389a342f761c4d71de99a3ccc57857/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:54c891b416a0e36b8e2045b12b33dd66fb34a4fe7965565f1b482da50da3e86a", size = 1194908, upload-time = "2025-10-06T20:22:32.073Z" }, + { url = "https://files.pythonhosted.org/packages/ab/0d/c1ad6f4016a3968c048545f5d9b8ffebf577774b2ede3e2e352553b685fe/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5edb8743b88d5be814b1a8a8854494719080c28faaa1ccbef02e87354fe71ef0", size = 1253706, upload-time = "2025-10-06T20:22:33.385Z" }, + { url = 
"https://files.pythonhosted.org/packages/af/df/c7891ef9d2712ad774777271d39fdef63941ffba0a9d59b7ad1fd2765e57/tiktoken-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71", size = 920667, upload-time = "2025-10-06T20:22:34.444Z" }, +] + [[package]] name = "timm" version = "1.0.26" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "huggingface-hub", version = "1.11.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "huggingface-hub", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "pyyaml", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "safetensors", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "torch", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -4378,8 +4405,7 @@ name = "tokenizers" version = "0.22.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "huggingface-hub", version = "0.36.2", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "huggingface-hub", version = "1.11.0", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "huggingface-hub", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/73/6f/f80cfef4a312e1fb34baf7d85c72d4411afde10978d4657f8cdd811d3ccc/tokenizers-0.22.2.tar.gz", hash = "sha256:473b83b915e547aa366d1eee11806deaf419e17be16310ac0a14077f1e28f917", size = 372115, upload-time = "2026-01-05T10:45:15.988Z" } wheels = [ @@ -4557,6 +4583,7 @@ lint = [ ] quantization = [ { name = "nvidia-modelopt", extra = ["hf"], marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "transformers", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] test = [ { name = "expecttest", marker = "sys_platform == 'linux' or sys_platform == 'win32' or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, @@ -4570,7 +4597,7 @@ test-ext = [ { name = "flashinfer-python", version = "0.6.8.post1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (python_full_version >= '3.13' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform != 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, { name = "timm", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "torchvision", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "transformers", version = "5.5.4", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "transformers", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] [package.metadata] @@ -4620,7 +4647,10 @@ lint = [ { name = "black", specifier = ">=24.0.0" }, { 
name = "clang-format", specifier = "==14.0.6" }, ] -quantization = [{ name = "nvidia-modelopt", extras = ["hf"], specifier = ">=0.43.0" }] +quantization = [ + { name = "nvidia-modelopt", extras = ["hf"], git = "https://github.com/NVIDIA/Model-Optimizer.git?rev=main" }, + { name = "transformers", specifier = ">=5.5.4" }, +] test = [ { name = "expecttest", specifier = "==0.1.6" }, { name = "parameterized", specifier = ">=0.2.0" }, @@ -4707,42 +4737,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" }, ] -[[package]] -name = "transformers" -version = "4.57.6" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14' and sys_platform == 'linux'", - "python_full_version >= '3.11' and python_full_version < '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", - "python_full_version >= '3.11' and python_full_version < '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "python_full_version >= '3.11' and python_full_version < '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'", - "python_full_version < '3.11' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", - "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'", - "python_full_version < '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'", - "python_full_version >= '3.14' and sys_platform == 'win32'", - "python_full_version >= '3.11' and python_full_version < '3.14' and platform_machine != 'AMD64' and sys_platform == 'win32'", - "python_full_version >= '3.11' and python_full_version < '3.14' and platform_machine == 'AMD64' and 
sys_platform == 'win32'", - "python_full_version < '3.11' and platform_machine != 'AMD64' and sys_platform == 'win32'", - "python_full_version < '3.11' and platform_machine == 'AMD64' and sys_platform == 'win32'", -] -dependencies = [ - { name = "filelock", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "huggingface-hub", version = "0.36.2", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version < '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version < '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "numpy", version = "2.4.4", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version >= '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version >= '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 
'group-14-torch-tensorrt-test-ext')" }, - { name = "packaging", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "pyyaml", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "regex", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "requests", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "safetensors", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "tokenizers", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "tqdm", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c4/35/67252acc1b929dc88b6602e8c4a982e64f31e733b804c14bc24b47da35e6/transformers-4.57.6.tar.gz", hash = "sha256:55e44126ece9dc0a291521b7e5492b572e6ef2766338a610b9ab5afbb70689d3", size = 10134912, upload-time = "2026-01-16T10:38:39.284Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/03/b8/e484ef633af3887baeeb4b6ad12743363af7cce68ae51e938e00aaa0529d/transformers-4.57.6-py3-none-any.whl", hash = "sha256:4c9e9de11333ddfe5114bc872c9f370509198acf0b87a832a0ab9458e2bd0550", size = 11993498, upload-time = "2026-01-16T10:38:31.289Z" }, -] - [[package]] name = "transformers" version = "5.5.4" @@ -4762,9 +4756,9 @@ resolution-markers = [ "python_full_version < '3.11' and platform_machine == 'AMD64' and sys_platform == 'win32'", ] dependencies = [ - { name = "huggingface-hub", version = "1.11.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version < '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (python_full_version < '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform 
!= 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "numpy", version = "2.4.4", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version >= '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (python_full_version >= '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "huggingface-hub", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version < '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version < '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (python_full_version < '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version < '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform 
== 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "numpy", version = "2.4.4", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version >= '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version >= '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (python_full_version >= '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version >= '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, { name = "packaging", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "pyyaml", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "regex", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -4804,10 +4798,10 @@ name = "typer" version = "0.24.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "annotated-doc", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "click", marker = "(sys_platform == 'linux' and extra == 
'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "rich", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "shellingham", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "annotated-doc", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "click", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "rich", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "shellingham", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/f5/24/cb09efec5cc954f7f9b930bf8279447d24618bb6758d4f6adf2574c41780/typer-0.24.1.tar.gz", hash = "sha256:e39b4732d65fbdcde189ae76cf7cd48aeae72919dea1fdfc16593be016256b45", size = 118613, upload-time = "2026-02-21T16:54:40.609Z" } wheels = [ From d2f8537c40ceaa02e5545c38d7a2f4b585e89afa Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Thu, 23 Apr 2026 17:16:13 +0000 Subject: [PATCH 2/3] fix add a pass to insert softmax quantization node for decomposed attention --- .../lowering/passes/_aten_lowering_pass.py | 2 + .../lowering/passes/insert_fp8_softmax_qdq.py | 159 ++++++++++++++++++ tests/py/dynamo/models/test_models_export.py | 96 +++++++++++ 3 files changed, 257 insertions(+) create mode 100644 py/torch_tensorrt/dynamo/lowering/passes/insert_fp8_softmax_qdq.py diff --git 
a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py index 271f7c98b7..06ef44248a 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py @@ -15,6 +15,7 @@ from .constant_folding import constant_fold from .force_causal_efficient_attention import force_causal_efficient_attention from .fuse_prims_broadcast import fuse_prims_broadcast +from .insert_fp8_softmax_qdq import insert_fp8_softmax_qdq from .pass_manager import DynamoPassManager from .remove_assert_nodes import remove_assert_nodes from .remove_detach import remove_detach @@ -43,6 +44,7 @@ complex_graph_detection, force_causal_efficient_attention, annotate_fp8_sdpa, + insert_fp8_softmax_qdq, ] if not is_tegra_platform(): diff --git a/py/torch_tensorrt/dynamo/lowering/passes/insert_fp8_softmax_qdq.py b/py/torch_tensorrt/dynamo/lowering/passes/insert_fp8_softmax_qdq.py new file mode 100644 index 0000000000..887bcb5a59 --- /dev/null +++ b/py/torch_tensorrt/dynamo/lowering/passes/insert_fp8_softmax_qdq.py @@ -0,0 +1,159 @@ +import logging +from typing import Optional + +import torch +from torch_tensorrt.dynamo._settings import CompilationSettings + +from .annotate_fp8_sdpa import _is_fp8_quantize_op + +logger = logging.getLogger(__name__) + +_FP8_E4M3_SOFTMAX_AMAX = 1.0 +_SOFTMAX_TARGETS = { + torch.ops.aten._softmax.default, + torch.ops.aten.softmax.int, +} +_MATMUL_TARGETS = { + torch.ops.aten.matmul.default, + torch.ops.aten.bmm.default, +} +# Shape-only ops that may sit between a quantize_op output and a matmul input. 
+_TRANSPARENT_TARGETS = { + torch.ops.aten.permute.default, + torch.ops.aten.transpose.int, + torch.ops.aten.reshape.default, + torch.ops.aten._reshape_copy.default, + torch.ops.aten.view.default, + torch.ops.aten.expand.default, + torch.ops.aten.clone.default, + torch.ops.aten.contiguous.default, +} + + +def _source_is_fp8_quantize(node: Optional[torch.fx.Node]) -> bool: + """Walk through shape-transparent ops to find the producer; True if FP8 quantize_op.""" + seen: set[int] = set() + cur = node + while isinstance(cur, torch.fx.Node) and id(cur) not in seen: + seen.add(id(cur)) + if _is_fp8_quantize_op(cur): + return True + if cur.op == "call_function" and cur.target in _TRANSPARENT_TARGETS: + cur = cur.args[0] if cur.args else None + continue + return False + return False + + +def _single_matmul_user(node: torch.fx.Node) -> Optional[torch.fx.Node]: + """Return the matmul user of ``node`` if it has exactly one and it is a matmul.""" + users = list(node.users) + if len(users) != 1: + return None + user = users[0] + if user.op != "call_function" or user.target not in _MATMUL_TARGETS: + return None + return user + + +def insert_fp8_softmax_qdq( + gm: torch.fx.GraphModule, settings: CompilationSettings +) -> torch.fx.GraphModule: + """Insert an FP8 Q/DQ on softmax output in the decomposed FP8 MHA pattern. + + TRT's Method 2 FP8 MHA fusion requires FP8 Q/DQ on Q, K, V **and** on the + softmax output. modelopt's ``NVFP4_FP8_MHA_CONFIG`` specifies a + ``*softmax_quantizer`` but the HF ``_QuantAttention.softmax_quantizer`` is + only applied in the Triton FA path — not in the standard + ``F.scaled_dot_product_attention`` path used by ``torch.export``. + Consequently the exported FX graph has:: + + matmul(q_fp8, k_fp8.T) → mul(1/sqrt(D)) → softmax → matmul(·, v_fp8) + + with no FP8 Q/DQ between ``softmax`` and the second ``matmul``, so TRT + keeps the two matmuls and the softmax as separate kernels instead of + producing ``_gemm_mha_v2``. 
+ + This pass recovers the fusion by inserting a ``tensorrt.quantize_op`` with + ``num_bits=8, exponent_bits=4, amax=1.0`` (→ scale = 1/448) on the softmax + output when the surrounding matmul inputs are FP8-quantized. 1/448 is + data-independent because softmax output ∈ [0, 1]. + + The pass is conservative: it fires only when *all three* of Q, K, V on the + two matmuls trace back to FP8 ``tensorrt.quantize_op`` nodes. If the + graph is not a quantized MHA, nothing changes. + """ + changed = False + amax_buffer_idx = 0 + for node in list(gm.graph.nodes): + if node.op != "call_function" or node.target not in _SOFTMAX_TARGETS: + continue + # The softmax must feed a single matmul (BMM2 = softmax_out @ V). + bmm2 = _single_matmul_user(node) + if bmm2 is None or len(bmm2.args) < 2: + continue + v_source = bmm2.args[1] + if not _source_is_fp8_quantize(v_source): + continue + + # Trace back from softmax to BMM1 through a possible scale/mul/div. + attn_src = node.args[0] if node.args else None + while ( + isinstance(attn_src, torch.fx.Node) + and attn_src.op == "call_function" + and attn_src.target + in { + torch.ops.aten.mul.Tensor, + torch.ops.aten.div.Tensor, + torch.ops.aten.add.Tensor, + torch.ops.aten.sub.Tensor, + } + ): + attn_src = attn_src.args[0] + if not isinstance(attn_src, torch.fx.Node): + continue + if attn_src.op != "call_function" or attn_src.target not in _MATMUL_TARGETS: + continue + if len(attn_src.args) < 2: + continue + q_source, k_source = attn_src.args[0], attn_src.args[1] + if not ( + _source_is_fp8_quantize(q_source) and _source_is_fp8_quantize(k_source) + ): + continue + + # Register a per-insertion amax buffer (1.0). 
+ amax_name = f"_fp8_softmax_qdq_amax_{amax_buffer_idx}" + amax_buffer_idx += 1 + gm.register_buffer( + amax_name, + torch.tensor(_FP8_E4M3_SOFTMAX_AMAX, dtype=torch.float32), + persistent=False, + ) + + with gm.graph.inserting_after(node): + amax_node = gm.graph.create_node( + "get_attr", amax_name, (), {}, name=amax_name + ) + with gm.graph.inserting_after(amax_node): + q_op = gm.graph.create_node( + "call_function", + torch.ops.tensorrt.quantize_op.default, + (node, amax_node, 8, 4, False, False), + {}, + name=f"fp8_softmax_quantize_{amax_buffer_idx - 1}", + ) + + # Re-route downstream matmul to read from the new quantize_op output. + bmm2.replace_input_with(node, q_op) + changed = True + logger.debug( + f"Inserted FP8 softmax Q/DQ after {node.name} " + f"(scale=1/448, pattern=matmul→...→softmax→matmul with FP8 Q/K/V)" + ) + + if changed: + gm.graph.lint() + gm.recompile() + logger.debug("FP8 decomposed-MHA softmax Q/DQ insertion complete") + return gm diff --git a/tests/py/dynamo/models/test_models_export.py b/tests/py/dynamo/models/test_models_export.py index 2456df74af..c2681ef47f 100644 --- a/tests/py/dynamo/models/test_models_export.py +++ b/tests/py/dynamo/models/test_models_export.py @@ -837,3 +837,99 @@ def forward(self, q, k, v): assert ( cos > 0.99 ), f"FP8 MHA output deviates from PyTorch reference: cosine_similarity={cos}" + + +@unittest.skipIf( + torch.cuda.get_device_capability() < (8, 9), + "FP8 quantization requires compute capability 8.9 or later", +) +@pytest.mark.unit +def test_fp8_mha_fused_kernel_decomposed(ir): + """Regression test for the decomposed FP8 MHA path (TRT Method 2). + + With ``decompose_attention=True`` the SDPA op is expanded into explicit + ``matmul → mul(1/sqrt(D)) → softmax → matmul`` primitives (no + ``IAttention``). TRT fuses this into ``_gemm_mha_v2`` only when FP8 + Q/DQ is present on Q, K, V **and** on the softmax output. 
+ + modelopt's HF ``_QuantAttention.softmax_quantizer`` is only applied in + the Triton FA path, so the standard FX graph lacks the softmax Q/DQ. + The ``insert_fp8_softmax_qdq`` lowering pass adds it back (scale = 1/448). + This test constructs the pattern manually and compiles with + ``decompose_attention=True`` to verify the fusion still triggers. + """ + import json + + import torch_tensorrt + + import tensorrt as trt + + B, H, S, D = 1, 2, 32, 64 + torch.manual_seed(0) + + class FP8MHAModel(torch.nn.Module): + def __init__(self, amax_val: float = 6.0): + super().__init__() + self.register_buffer("amax_q", torch.tensor(amax_val, dtype=torch.float32)) + self.register_buffer("amax_k", torch.tensor(amax_val, dtype=torch.float32)) + self.register_buffer("amax_v", torch.tensor(amax_val, dtype=torch.float32)) + + def forward(self, q, k, v): + q_fp8 = torch.ops.tensorrt.quantize_op(q, self.amax_q, 8, 4, False, False) + k_fp8 = torch.ops.tensorrt.quantize_op(k, self.amax_k, 8, 4, False, False) + v_fp8 = torch.ops.tensorrt.quantize_op(v, self.amax_v, 8, 4, False, False) + return torch.nn.functional.scaled_dot_product_attention(q_fp8, k_fp8, v_fp8) + + q = torch.randn(B, H, S, D, dtype=torch.float16).cuda() + k = torch.randn(B, H, S, D, dtype=torch.float16).cuda() + v = torch.randn(B, H, S, D, dtype=torch.float16).cuda() + + model = FP8MHAModel().eval().cuda() + ref_out = torch.nn.functional.scaled_dot_product_attention(q, k, v) + + exp_program = torch.export.export(model, (q, k, v), strict=False) + serialized_engine = ( + torch_tensorrt.dynamo.convert_exported_program_to_serialized_trt_engine( + exp_program, + inputs=[q, k, v], + use_explicit_typing=True, + min_block_size=1, + decompose_attention=True, + ) + ) + + runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING)) + engine = runtime.deserialize_cuda_engine(serialized_engine) + inspector = engine.create_engine_inspector() + engine_json = json.loads( + inspector.get_engine_information(trt.LayerInformationFormat.JSON) + 
) + layers = engine_json.get("Layers", []) + layer_names = [ + layer if isinstance(layer, str) else layer.get("Name", "") for layer in layers + ] + assert any("mha" in name.lower() for name in layer_names), ( + f"No fused MHA kernel found on decomposed path. Expected a layer " + f"containing 'mha' (e.g. _gemm_mha_v2) — TRT fuses FP8 Q/K/V + " + f"softmax-output Q/DQ into _gemm_mha_v2 on Method 2 path. " + f"Layer names: {layer_names}" + ) + + # Numerical sanity + compiled = torch_tensorrt.compile( + model, + ir="dynamo", + inputs=[q, k, v], + use_explicit_typing=True, + min_block_size=1, + decompose_attention=True, + ) + with torch.no_grad(): + trt_out = compiled(q, k, v) + cos = torch.nn.functional.cosine_similarity( + ref_out.flatten().float().unsqueeze(0), + trt_out.flatten().float().unsqueeze(0), + ).item() + assert ( + cos > 0.99 + ), f"Decomposed FP8 MHA output deviates from PyTorch reference: cos={cos}" From 1ff2a6b5cad513cc2db287e9a6aa6c17aa71d7d6 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Mon, 11 May 2026 11:54:39 -0600 Subject: [PATCH 3/3] chore: address review comments --- .../dynamo/conversion/_ConversionContext.py | 1 + .../dynamo/conversion/impl/attention.py | 10 +- .../lowering/passes/insert_fp8_softmax_qdq.py | 16 +- uv.lock | 196 +++++++++--------- 4 files changed, 122 insertions(+), 101 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/_ConversionContext.py b/py/torch_tensorrt/dynamo/conversion/_ConversionContext.py index 4555e925c1..e22b8798eb 100644 --- a/py/torch_tensorrt/dynamo/conversion/_ConversionContext.py +++ b/py/torch_tensorrt/dynamo/conversion/_ConversionContext.py @@ -17,6 +17,7 @@ class ConversionContext: requires_output_allocator: Boolean flag indicating if the converter creates operators which require an Output Allocator to run (e.g. 
data dependent operators) weight_refit_map: Dictionary mapping weight names to their corresponding np.array cpu_weights_reference_holder: Dictionary mapping weight names to their corresponding torch.Tensor + current_node: The FX node currently being converted, used by converters that need access to graph-level metadata (e.g. annotations set by lowering passes) """ net: TRTNetwork diff --git a/py/torch_tensorrt/dynamo/conversion/impl/attention.py b/py/torch_tensorrt/dynamo/conversion/impl/attention.py index 446b7ae99c..40ef5ff4db 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/attention.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/attention.py @@ -7,6 +7,7 @@ import torch from torch.fx.node import Target from torch_tensorrt._utils import is_tensorrt_version_supported +from torch_tensorrt import _enums from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion import impl from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -38,11 +39,16 @@ def _maybe_set_fp8_softmax( scale_val = ctx.current_node.meta.get("_fp8_softmax_scale") if scale_val is None: return False + # Scale dtype must match the IAttention output (= pre-quant Q/K/V) dtype; + # using float32 unconditionally fails TRT compilation on some platforms. 
+ output_dtype = _enums.dtype._from(attention_layer.get_output(0).dtype).to( + torch.dtype + ) scale_tensor = get_trt_tensor( ctx, - torch.tensor(scale_val, dtype=torch.float32), + torch.tensor(scale_val, dtype=output_dtype), name + "_softmax_fp8_scale", - dtype=torch.float32, + dtype=output_dtype, ) attention_layer.normalization_quantize_to_type = trt.DataType.FP8 attention_layer.normalization_quantize_scale = scale_tensor diff --git a/py/torch_tensorrt/dynamo/lowering/passes/insert_fp8_softmax_qdq.py b/py/torch_tensorrt/dynamo/lowering/passes/insert_fp8_softmax_qdq.py index 887bcb5a59..fd3a24beaf 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/insert_fp8_softmax_qdq.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/insert_fp8_softmax_qdq.py @@ -14,7 +14,11 @@ torch.ops.aten.softmax.int, } _MATMUL_TARGETS = { + torch.ops.aten.matmul, torch.ops.aten.matmul.default, + torch.ops.aten.dot.default, + torch.ops.aten.mm.default, + torch.ops.aten.mv.default, torch.ops.aten.bmm.default, } # Shape-only ops that may sit between a quantize_op output and a matmul input. @@ -63,10 +67,14 @@ def insert_fp8_softmax_qdq( TRT's Method 2 FP8 MHA fusion requires FP8 Q/DQ on Q, K, V **and** on the softmax output. modelopt's ``NVFP4_FP8_MHA_CONFIG`` specifies a - ``*softmax_quantizer`` but the HF ``_QuantAttention.softmax_quantizer`` is - only applied in the Triton FA path — not in the standard - ``F.scaled_dot_product_attention`` path used by ``torch.export``. - Consequently the exported FX graph has:: + ``*softmax_quantizer`` glob, but in practice no SDPA-based modelopt + attention wrapper applies it: the HF ``_QuantAttention`` does not create a + ``softmax_quantizer`` at all (only Q/K/V bmm quantizers), and the diffusers + ``_QuantAttention`` creates one but only invokes it on the ``torch.bmm`` + code path — its ``F.scaled_dot_product_attention`` replacement routes + through a custom ``FP8SDPA`` op that skips softmax quantization. 
+ Consequently, for any model that ends up on the SDPA path used by + ``torch.export``, the exported FX graph has:: matmul(q_fp8, k_fp8.T) → mul(1/sqrt(D)) → softmax → matmul(·, v_fp8) diff --git a/uv.lock b/uv.lock index 9cf008868c..d2f9f309cb 100644 --- a/uv.lock +++ b/uv.lock @@ -34,7 +34,7 @@ name = "accelerate" version = "1.13.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "huggingface-hub", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "huggingface-hub", version = "0.36.2", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "numpy", version = "2.2.6", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version < '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version < '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, { name = "numpy", version = "2.4.4", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version >= '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version >= '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 
'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, { name = "packaging", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -728,7 +728,7 @@ dependencies = [ { name = "filelock", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "fsspec", version = "2026.2.0", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, extra = ["http"], marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, { name = "httpx", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "huggingface-hub", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "huggingface-hub", version = "0.36.2", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "multiprocess", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "numpy", version = "2.2.6", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version < '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version < '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 
'group-14-torch-tensorrt-test-ext')" }, { name = "numpy", version = "2.4.4", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version >= '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version >= '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, @@ -782,7 +782,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "httpx", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "huggingface-hub", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "huggingface-hub", version = "0.36.2", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "importlib-metadata", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "numpy", version = "2.2.6", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version < '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version < '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 
'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, { name = "numpy", version = "2.4.4", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version >= '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version >= '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, @@ -1223,7 +1223,7 @@ wheels = [ [[package]] name = "huggingface-hub" -version = "1.11.0" +version = "0.36.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ "python_full_version >= '3.14' and sys_platform == 'linux'", @@ -1240,16 +1240,48 @@ resolution-markers = [ "python_full_version < '3.11' and platform_machine == 'AMD64' and sys_platform == 'win32'", ] dependencies = [ - { name = "filelock", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "filelock", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, { name = "fsspec", version = "2026.2.0", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 
'win32' and extra == 'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "hf-xet", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (platform_machine == 'amd64' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (platform_machine == 'arm64' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (platform_machine == 'aarch64' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (platform_machine == 'amd64' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (platform_machine == 'arm64' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (platform_machine == 'x86_64' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "packaging", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "pyyaml", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 'win32' and extra == 
'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "requests", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "tqdm", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "typing-extensions", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7c/b7/8cb61d2eece5fb05a83271da168186721c450eb74e3c31f7ef3169fa475b/huggingface_hub-0.36.2.tar.gz", hash = "sha256:1934304d2fb224f8afa3b87007d58501acfda9215b334eed53072dd5e815ff7a", size = 649782, upload-time = "2026-02-06T09:24:13.098Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/af/48ac8483240de756d2438c380746e7130d1c6f75802ef22f3c6d49982787/huggingface_hub-0.36.2-py3-none-any.whl", hash = "sha256:48f0c8eac16145dfce371e9d2d7772854a4f591bcb56c9cf548accf531d54270", size = 566395, upload-time = "2026-02-06T09:24:11.133Z" }, +] + +[[package]] +name = "huggingface-hub" +version = "1.11.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.14' and sys_platform == 'linux'", + "python_full_version >= '3.11' and python_full_version < '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and 
sys_platform == 'linux'", + "python_full_version >= '3.11' and python_full_version < '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version >= '3.11' and python_full_version < '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'", + "python_full_version < '3.11' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", + "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version < '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'", + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and platform_machine != 'AMD64' and sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and platform_machine == 'AMD64' and sys_platform == 'win32'", + "python_full_version < '3.11' and platform_machine != 'AMD64' and sys_platform == 'win32'", + "python_full_version < '3.11' and platform_machine == 'AMD64' and sys_platform == 'win32'", +] +dependencies = [ + { name = "filelock", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, { name = "fsspec", version = "2026.3.0", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "hf-xet", marker = "(platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'amd64' and 
sys_platform == 'linux') or (platform_machine == 'arm64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine == 'AMD64' and sys_platform == 'win32') or (platform_machine == 'aarch64' and sys_platform == 'win32') or (platform_machine == 'amd64' and sys_platform == 'win32') or (platform_machine == 'arm64' and sys_platform == 'win32') or (platform_machine == 'x86_64' and sys_platform == 'win32')" }, - { name = "httpx", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "packaging", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "pyyaml", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "tqdm", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "typer", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "typing-extensions", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "hf-xet", marker = "(platform_machine == 'AMD64' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (platform_machine == 'amd64' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (platform_machine == 'arm64' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (platform_machine == 'AMD64' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (platform_machine == 'aarch64' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (platform_machine == 'amd64' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (platform_machine == 'arm64' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') 
or (platform_machine == 'x86_64' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "httpx", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "packaging", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "pyyaml", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "tqdm", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "typer", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "typing-extensions", 
marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/dc/89/e7aa12d8a6b9259bed10671abb25ae6fa437c0f88a86ecbf59617bae7759/huggingface_hub-1.11.0.tar.gz", hash = "sha256:15fb3713c7f9cdff7b808a94fd91664f661ab142796bb48c9cd9493e8d166278", size = 761749, upload-time = "2026-04-16T13:07:39.73Z" } wheels = [ @@ -2496,8 +2528,8 @@ wheels = [ [[package]] name = "nvidia-modelopt" -version = "0.45.0.dev8+gc7966119e" -source = { git = "https://github.com/NVIDIA/Model-Optimizer.git?rev=main#c7966119eb7fdec1bac4ad938413048f145a45fb" } +version = "0.43.0" +source = { registry = "https://pypi.nvidia.com/" } dependencies = [ { name = "ninja", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "numpy", version = "2.2.6", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version < '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version < '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, @@ -2517,6 +2549,9 @@ dependencies = [ { name = "torch", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "tqdm", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] +wheels = [ + { url = 
"https://pypi.nvidia.com/nvidia-modelopt/nvidia_modelopt-0.43.0-py3-none-any.whl", hash = "sha256:fe11a49e16230435b3a17153bcdb5717b2859a61544cdbe9dcb3f062ba2c203a" }, +] [package.optional-dependencies] hf = [ @@ -2524,12 +2559,11 @@ hf = [ { name = "datasets", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "deepspeed", marker = "sys_platform == 'linux'" }, { name = "diffusers", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "huggingface-hub", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "huggingface-hub", version = "0.36.2", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "nltk", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "peft", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "sentencepiece", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "tiktoken", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "transformers", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "transformers", version = "4.57.6", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "wonderwords", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] @@ -2762,7 +2796,7 @@ version = "0.19.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "accelerate", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "huggingface-hub", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "huggingface-hub", version = "0.36.2", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "numpy", version = "2.2.6", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = 
"(python_full_version < '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version < '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, { name = "numpy", version = "2.4.4", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version >= '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version >= '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, { name = "packaging", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -2771,7 +2805,7 @@ dependencies = [ { name = "safetensors", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "torch", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "tqdm", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "transformers", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "transformers", version = "4.57.6", source = { registry = "https://pypi.org/simple" }, marker = 
"sys_platform == 'linux' or sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/86/cf/037f1e3d5186496c05513a6754639e2dab3038a05f384284d49a9bd06a2d/peft-0.19.1.tar.gz", hash = "sha256:0d97542fe96dcdaa20d3b81c06f26f988618f416a73544ab23c3618ccb674a40", size = 763738, upload-time = "2026-04-16T15:46:45.105Z" } wheels = [ @@ -4311,73 +4345,12 @@ wheels = [ { url = "https://pypi.nvidia.com/tensorrt-cu13-libs/tensorrt_cu13_libs-10.16.1.11-py3-none-win_amd64.whl", hash = "sha256:96262c3e8c64a45abd29aa3482d99480fac6845bd420b6de125699bb1ae365ff" }, ] -[[package]] -name = "tiktoken" -version = "0.12.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "regex", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "requests", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/7d/ab/4d017d0f76ec3171d469d80fc03dfbb4e48a4bcaddaa831b31d526f05edc/tiktoken-0.12.0.tar.gz", hash = "sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931", size = 37806, upload-time = "2025-10-06T20:22:45.419Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/89/b3/2cb7c17b6c4cf8ca983204255d3f1d95eda7213e247e6947a0ee2c747a2c/tiktoken-0.12.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3de02f5a491cfd179aec916eddb70331814bd6bf764075d39e21d5862e533970", size = 1051991, upload-time = "2025-10-06T20:21:34.098Z" }, - { url = "https://files.pythonhosted.org/packages/27/0f/df139f1df5f6167194ee5ab24634582ba9a1b62c6b996472b0277ec80f66/tiktoken-0.12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b6cfb6d9b7b54d20af21a912bfe63a2727d9cfa8fbda642fd8322c70340aad16", size = 995798, upload-time = "2025-10-06T20:21:35.579Z" }, - { url = "https://files.pythonhosted.org/packages/ef/5d/26a691f28ab220d5edc09b9b787399b130f24327ef824de15e5d85ef21aa/tiktoken-0.12.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = 
"sha256:cde24cdb1b8a08368f709124f15b36ab5524aac5fa830cc3fdce9c03d4fb8030", size = 1129865, upload-time = "2025-10-06T20:21:36.675Z" }, - { url = "https://files.pythonhosted.org/packages/b2/94/443fab3d4e5ebecac895712abd3849b8da93b7b7dec61c7db5c9c7ebe40c/tiktoken-0.12.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:6de0da39f605992649b9cfa6f84071e3f9ef2cec458d08c5feb1b6f0ff62e134", size = 1152856, upload-time = "2025-10-06T20:21:37.873Z" }, - { url = "https://files.pythonhosted.org/packages/54/35/388f941251b2521c70dd4c5958e598ea6d2c88e28445d2fb8189eecc1dfc/tiktoken-0.12.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:6faa0534e0eefbcafaccb75927a4a380463a2eaa7e26000f0173b920e98b720a", size = 1195308, upload-time = "2025-10-06T20:21:39.577Z" }, - { url = "https://files.pythonhosted.org/packages/f8/00/c6681c7f833dd410576183715a530437a9873fa910265817081f65f9105f/tiktoken-0.12.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:82991e04fc860afb933efb63957affc7ad54f83e2216fe7d319007dab1ba5892", size = 1255697, upload-time = "2025-10-06T20:21:41.154Z" }, - { url = "https://files.pythonhosted.org/packages/5f/d2/82e795a6a9bafa034bf26a58e68fe9a89eeaaa610d51dbeb22106ba04f0a/tiktoken-0.12.0-cp310-cp310-win_amd64.whl", hash = "sha256:6fb2995b487c2e31acf0a9e17647e3b242235a20832642bb7a9d1a181c0c1bb1", size = 879375, upload-time = "2025-10-06T20:21:43.201Z" }, - { url = "https://files.pythonhosted.org/packages/de/46/21ea696b21f1d6d1efec8639c204bdf20fde8bafb351e1355c72c5d7de52/tiktoken-0.12.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6e227c7f96925003487c33b1b32265fad2fbcec2b7cf4817afb76d416f40f6bb", size = 1051565, upload-time = "2025-10-06T20:21:44.566Z" }, - { url = "https://files.pythonhosted.org/packages/c9/d9/35c5d2d9e22bb2a5f74ba48266fb56c63d76ae6f66e02feb628671c0283e/tiktoken-0.12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c06cf0fcc24c2cb2adb5e185c7082a82cba29c17575e828518c2f11a01f445aa", size = 995284, upload-time = 
"2025-10-06T20:21:45.622Z" }, - { url = "https://files.pythonhosted.org/packages/01/84/961106c37b8e49b9fdcf33fe007bb3a8fdcc380c528b20cc7fbba80578b8/tiktoken-0.12.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:f18f249b041851954217e9fd8e5c00b024ab2315ffda5ed77665a05fa91f42dc", size = 1129201, upload-time = "2025-10-06T20:21:47.074Z" }, - { url = "https://files.pythonhosted.org/packages/6a/d0/3d9275198e067f8b65076a68894bb52fd253875f3644f0a321a720277b8a/tiktoken-0.12.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:47a5bc270b8c3db00bb46ece01ef34ad050e364b51d406b6f9730b64ac28eded", size = 1152444, upload-time = "2025-10-06T20:21:48.139Z" }, - { url = "https://files.pythonhosted.org/packages/78/db/a58e09687c1698a7c592e1038e01c206569b86a0377828d51635561f8ebf/tiktoken-0.12.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:508fa71810c0efdcd1b898fda574889ee62852989f7c1667414736bcb2b9a4bd", size = 1195080, upload-time = "2025-10-06T20:21:49.246Z" }, - { url = "https://files.pythonhosted.org/packages/9e/1b/a9e4d2bf91d515c0f74afc526fd773a812232dd6cda33ebea7f531202325/tiktoken-0.12.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a1af81a6c44f008cba48494089dd98cccb8b313f55e961a52f5b222d1e507967", size = 1255240, upload-time = "2025-10-06T20:21:50.274Z" }, - { url = "https://files.pythonhosted.org/packages/9d/15/963819345f1b1fb0809070a79e9dd96938d4ca41297367d471733e79c76c/tiktoken-0.12.0-cp311-cp311-win_amd64.whl", hash = "sha256:3e68e3e593637b53e56f7237be560f7a394451cb8c11079755e80ae64b9e6def", size = 879422, upload-time = "2025-10-06T20:21:51.734Z" }, - { url = "https://files.pythonhosted.org/packages/a4/85/be65d39d6b647c79800fd9d29241d081d4eeb06271f383bb87200d74cf76/tiktoken-0.12.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b97f74aca0d78a1ff21b8cd9e9925714c15a9236d6ceacf5c7327c117e6e21e8", size = 1050728, upload-time = "2025-10-06T20:21:52.756Z" }, - { url = 
"https://files.pythonhosted.org/packages/4a/42/6573e9129bc55c9bf7300b3a35bef2c6b9117018acca0dc760ac2d93dffe/tiktoken-0.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2b90f5ad190a4bb7c3eb30c5fa32e1e182ca1ca79f05e49b448438c3e225a49b", size = 994049, upload-time = "2025-10-06T20:21:53.782Z" }, - { url = "https://files.pythonhosted.org/packages/66/c5/ed88504d2f4a5fd6856990b230b56d85a777feab84e6129af0822f5d0f70/tiktoken-0.12.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:65b26c7a780e2139e73acc193e5c63ac754021f160df919add909c1492c0fb37", size = 1129008, upload-time = "2025-10-06T20:21:54.832Z" }, - { url = "https://files.pythonhosted.org/packages/f4/90/3dae6cc5436137ebd38944d396b5849e167896fc2073da643a49f372dc4f/tiktoken-0.12.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:edde1ec917dfd21c1f2f8046b86348b0f54a2c0547f68149d8600859598769ad", size = 1152665, upload-time = "2025-10-06T20:21:56.129Z" }, - { url = "https://files.pythonhosted.org/packages/a3/fe/26df24ce53ffde419a42f5f53d755b995c9318908288c17ec3f3448313a3/tiktoken-0.12.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:35a2f8ddd3824608b3d650a000c1ef71f730d0c56486845705a8248da00f9fe5", size = 1194230, upload-time = "2025-10-06T20:21:57.546Z" }, - { url = "https://files.pythonhosted.org/packages/20/cc/b064cae1a0e9fac84b0d2c46b89f4e57051a5f41324e385d10225a984c24/tiktoken-0.12.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83d16643edb7fa2c99eff2ab7733508aae1eebb03d5dfc46f5565862810f24e3", size = 1254688, upload-time = "2025-10-06T20:21:58.619Z" }, - { url = "https://files.pythonhosted.org/packages/81/10/b8523105c590c5b8349f2587e2fdfe51a69544bd5a76295fc20f2374f470/tiktoken-0.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffc5288f34a8bc02e1ea7047b8d041104791d2ddbf42d1e5fa07822cbffe16bd", size = 878694, upload-time = "2025-10-06T20:21:59.876Z" }, - { url = 
"https://files.pythonhosted.org/packages/00/61/441588ee21e6b5cdf59d6870f86beb9789e532ee9718c251b391b70c68d6/tiktoken-0.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:775c2c55de2310cc1bc9a3ad8826761cbdc87770e586fd7b6da7d4589e13dab3", size = 1050802, upload-time = "2025-10-06T20:22:00.96Z" }, - { url = "https://files.pythonhosted.org/packages/1f/05/dcf94486d5c5c8d34496abe271ac76c5b785507c8eae71b3708f1ad9b45a/tiktoken-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a01b12f69052fbe4b080a2cfb867c4de12c704b56178edf1d1d7b273561db160", size = 993995, upload-time = "2025-10-06T20:22:02.788Z" }, - { url = "https://files.pythonhosted.org/packages/a0/70/5163fe5359b943f8db9946b62f19be2305de8c3d78a16f629d4165e2f40e/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:01d99484dc93b129cd0964f9d34eee953f2737301f18b3c7257bf368d7615baa", size = 1128948, upload-time = "2025-10-06T20:22:03.814Z" }, - { url = "https://files.pythonhosted.org/packages/0c/da/c028aa0babf77315e1cef357d4d768800c5f8a6de04d0eac0f377cb619fa/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:4a1a4fcd021f022bfc81904a911d3df0f6543b9e7627b51411da75ff2fe7a1be", size = 1151986, upload-time = "2025-10-06T20:22:05.173Z" }, - { url = "https://files.pythonhosted.org/packages/a0/5a/886b108b766aa53e295f7216b509be95eb7d60b166049ce2c58416b25f2a/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:981a81e39812d57031efdc9ec59fa32b2a5a5524d20d4776574c4b4bd2e9014a", size = 1194222, upload-time = "2025-10-06T20:22:06.265Z" }, - { url = "https://files.pythonhosted.org/packages/f4/f8/4db272048397636ac7a078d22773dd2795b1becee7bc4922fe6207288d57/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9baf52f84a3f42eef3ff4e754a0db79a13a27921b457ca9832cf944c6be4f8f3", size = 1255097, upload-time = "2025-10-06T20:22:07.403Z" }, - { url = 
"https://files.pythonhosted.org/packages/8e/32/45d02e2e0ea2be3a9ed22afc47d93741247e75018aac967b713b2941f8ea/tiktoken-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:b8a0cd0c789a61f31bf44851defbd609e8dd1e2c8589c614cc1060940ef1f697", size = 879117, upload-time = "2025-10-06T20:22:08.418Z" }, - { url = "https://files.pythonhosted.org/packages/ce/76/994fc868f88e016e6d05b0da5ac24582a14c47893f4474c3e9744283f1d5/tiktoken-0.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d5f89ea5680066b68bcb797ae85219c72916c922ef0fcdd3480c7d2315ffff16", size = 1050309, upload-time = "2025-10-06T20:22:10.939Z" }, - { url = "https://files.pythonhosted.org/packages/f6/b8/57ef1456504c43a849821920d582a738a461b76a047f352f18c0b26c6516/tiktoken-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b4e7ed1c6a7a8a60a3230965bdedba8cc58f68926b835e519341413370e0399a", size = 993712, upload-time = "2025-10-06T20:22:12.115Z" }, - { url = "https://files.pythonhosted.org/packages/72/90/13da56f664286ffbae9dbcfadcc625439142675845baa62715e49b87b68b/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:fc530a28591a2d74bce821d10b418b26a094bf33839e69042a6e86ddb7a7fb27", size = 1128725, upload-time = "2025-10-06T20:22:13.541Z" }, - { url = "https://files.pythonhosted.org/packages/05/df/4f80030d44682235bdaecd7346c90f67ae87ec8f3df4a3442cb53834f7e4/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:06a9f4f49884139013b138920a4c393aa6556b2f8f536345f11819389c703ebb", size = 1151875, upload-time = "2025-10-06T20:22:14.559Z" }, - { url = "https://files.pythonhosted.org/packages/22/1f/ae535223a8c4ef4c0c1192e3f9b82da660be9eb66b9279e95c99288e9dab/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:04f0e6a985d95913cabc96a741c5ffec525a2c72e9df086ff17ebe35985c800e", size = 1194451, upload-time = "2025-10-06T20:22:15.545Z" }, - { url = 
"https://files.pythonhosted.org/packages/78/a7/f8ead382fce0243cb625c4f266e66c27f65ae65ee9e77f59ea1653b6d730/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0ee8f9ae00c41770b5f9b0bb1235474768884ae157de3beb5439ca0fd70f3e25", size = 1253794, upload-time = "2025-10-06T20:22:16.624Z" }, - { url = "https://files.pythonhosted.org/packages/93/e0/6cc82a562bc6365785a3ff0af27a2a092d57c47d7a81d9e2295d8c36f011/tiktoken-0.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:dc2dd125a62cb2b3d858484d6c614d136b5b848976794edfb63688d539b8b93f", size = 878777, upload-time = "2025-10-06T20:22:18.036Z" }, - { url = "https://files.pythonhosted.org/packages/72/05/3abc1db5d2c9aadc4d2c76fa5640134e475e58d9fbb82b5c535dc0de9b01/tiktoken-0.12.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a90388128df3b3abeb2bfd1895b0681412a8d7dc644142519e6f0a97c2111646", size = 1050188, upload-time = "2025-10-06T20:22:19.563Z" }, - { url = "https://files.pythonhosted.org/packages/e3/7b/50c2f060412202d6c95f32b20755c7a6273543b125c0985d6fa9465105af/tiktoken-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:da900aa0ad52247d8794e307d6446bd3cdea8e192769b56276695d34d2c9aa88", size = 993978, upload-time = "2025-10-06T20:22:20.702Z" }, - { url = "https://files.pythonhosted.org/packages/14/27/bf795595a2b897e271771cd31cb847d479073497344c637966bdf2853da1/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:285ba9d73ea0d6171e7f9407039a290ca77efcdb026be7769dccc01d2c8d7fff", size = 1129271, upload-time = "2025-10-06T20:22:22.06Z" }, - { url = "https://files.pythonhosted.org/packages/f5/de/9341a6d7a8f1b448573bbf3425fa57669ac58258a667eb48a25dfe916d70/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:d186a5c60c6a0213f04a7a802264083dea1bbde92a2d4c7069e1a56630aef830", size = 1151216, upload-time = "2025-10-06T20:22:23.085Z" }, - { url = 
"https://files.pythonhosted.org/packages/75/0d/881866647b8d1be4d67cb24e50d0c26f9f807f994aa1510cb9ba2fe5f612/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:604831189bd05480f2b885ecd2d1986dc7686f609de48208ebbbddeea071fc0b", size = 1194860, upload-time = "2025-10-06T20:22:24.602Z" }, - { url = "https://files.pythonhosted.org/packages/b3/1e/b651ec3059474dab649b8d5b69f5c65cd8fcd8918568c1935bd4136c9392/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8f317e8530bb3a222547b85a58583238c8f74fd7a7408305f9f63246d1a0958b", size = 1254567, upload-time = "2025-10-06T20:22:25.671Z" }, - { url = "https://files.pythonhosted.org/packages/80/57/ce64fd16ac390fafde001268c364d559447ba09b509181b2808622420eec/tiktoken-0.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:399c3dd672a6406719d84442299a490420b458c44d3ae65516302a99675888f3", size = 921067, upload-time = "2025-10-06T20:22:26.753Z" }, - { url = "https://files.pythonhosted.org/packages/ac/a4/72eed53e8976a099539cdd5eb36f241987212c29629d0a52c305173e0a68/tiktoken-0.12.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c2c714c72bc00a38ca969dae79e8266ddec999c7ceccd603cc4f0d04ccd76365", size = 1050473, upload-time = "2025-10-06T20:22:27.775Z" }, - { url = "https://files.pythonhosted.org/packages/e6/d7/0110b8f54c008466b19672c615f2168896b83706a6611ba6e47313dbc6e9/tiktoken-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cbb9a3ba275165a2cb0f9a83f5d7025afe6b9d0ab01a22b50f0e74fee2ad253e", size = 993855, upload-time = "2025-10-06T20:22:28.799Z" }, - { url = "https://files.pythonhosted.org/packages/5f/77/4f268c41a3957c418b084dd576ea2fad2e95da0d8e1ab705372892c2ca22/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:dfdfaa5ffff8993a3af94d1125870b1d27aed7cb97aa7eb8c1cefdbc87dbee63", size = 1129022, upload-time = "2025-10-06T20:22:29.981Z" }, - { url = 
"https://files.pythonhosted.org/packages/4e/2b/fc46c90fe5028bd094cd6ee25a7db321cb91d45dc87531e2bdbb26b4867a/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:584c3ad3d0c74f5269906eb8a659c8bfc6144a52895d9261cdaf90a0ae5f4de0", size = 1150736, upload-time = "2025-10-06T20:22:30.996Z" }, - { url = "https://files.pythonhosted.org/packages/28/c0/3c7a39ff68022ddfd7d93f3337ad90389a342f761c4d71de99a3ccc57857/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:54c891b416a0e36b8e2045b12b33dd66fb34a4fe7965565f1b482da50da3e86a", size = 1194908, upload-time = "2025-10-06T20:22:32.073Z" }, - { url = "https://files.pythonhosted.org/packages/ab/0d/c1ad6f4016a3968c048545f5d9b8ffebf577774b2ede3e2e352553b685fe/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5edb8743b88d5be814b1a8a8854494719080c28faaa1ccbef02e87354fe71ef0", size = 1253706, upload-time = "2025-10-06T20:22:33.385Z" }, - { url = "https://files.pythonhosted.org/packages/af/df/c7891ef9d2712ad774777271d39fdef63941ffba0a9d59b7ad1fd2765e57/tiktoken-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71", size = 920667, upload-time = "2025-10-06T20:22:34.444Z" }, -] - [[package]] name = "timm" version = "1.0.26" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "huggingface-hub", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "huggingface-hub", version = "1.11.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "pyyaml", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "safetensors", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "torch", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -4405,7 +4378,8 @@ name = "tokenizers" version = "0.22.2" source = { registry = "https://pypi.org/simple" } 
dependencies = [ - { name = "huggingface-hub", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "huggingface-hub", version = "0.36.2", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "huggingface-hub", version = "1.11.0", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/73/6f/f80cfef4a312e1fb34baf7d85c72d4411afde10978d4657f8cdd811d3ccc/tokenizers-0.22.2.tar.gz", hash = "sha256:473b83b915e547aa366d1eee11806deaf419e17be16310ac0a14077f1e28f917", size = 372115, upload-time = "2026-01-05T10:45:15.988Z" } wheels = [ @@ -4583,7 +4557,6 @@ lint = [ ] quantization = [ { name = "nvidia-modelopt", extra = ["hf"], marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "transformers", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] test = [ { name = "expecttest", marker = "sys_platform == 'linux' or sys_platform == 'win32' or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, @@ -4597,7 +4570,7 @@ test-ext = [ { name = "flashinfer-python", version = "0.6.8.post1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' 
and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (python_full_version >= '3.13' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform != 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, { name = "timm", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "torchvision", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "transformers", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "transformers", version = "5.5.4", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] [package.metadata] @@ -4647,10 +4620,7 @@ lint = [ { name = "black", specifier = ">=24.0.0" }, { name = "clang-format", specifier = "==14.0.6" }, ] -quantization = [ - { name = "nvidia-modelopt", extras = ["hf"], git = "https://github.com/NVIDIA/Model-Optimizer.git?rev=main" }, - { name = "transformers", specifier = ">=5.5.4" }, -] +quantization = [{ name = "nvidia-modelopt", extras = ["hf"], specifier = ">=0.43.0" }] test = [ { name = "expecttest", specifier = "==0.1.6" }, { name = "parameterized", specifier = ">=0.2.0" }, @@ -4737,6 +4707,42 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" }, ] +[[package]] +name = "transformers" +version = "4.57.6" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.14' and sys_platform == 'linux'", + "python_full_version >= '3.11' and python_full_version < '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", + 
"python_full_version >= '3.11' and python_full_version < '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version >= '3.11' and python_full_version < '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'", + "python_full_version < '3.11' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", + "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version < '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'", + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and platform_machine != 'AMD64' and sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and platform_machine == 'AMD64' and sys_platform == 'win32'", + "python_full_version < '3.11' and platform_machine != 'AMD64' and sys_platform == 'win32'", + "python_full_version < '3.11' and platform_machine == 'AMD64' and sys_platform == 'win32'", +] +dependencies = [ + { name = "filelock", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "huggingface-hub", version = "0.36.2", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version < '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version < '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or 
(sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "numpy", version = "2.4.4", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version >= '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version >= '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "packaging", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "pyyaml", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "regex", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "requests", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "safetensors", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "tokenizers", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "tqdm", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c4/35/67252acc1b929dc88b6602e8c4a982e64f31e733b804c14bc24b47da35e6/transformers-4.57.6.tar.gz", hash = "sha256:55e44126ece9dc0a291521b7e5492b572e6ef2766338a610b9ab5afbb70689d3", size = 10134912, upload-time = "2026-01-16T10:38:39.284Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/03/b8/e484ef633af3887baeeb4b6ad12743363af7cce68ae51e938e00aaa0529d/transformers-4.57.6-py3-none-any.whl", hash = 
"sha256:4c9e9de11333ddfe5114bc872c9f370509198acf0b87a832a0ab9458e2bd0550", size = 11993498, upload-time = "2026-01-16T10:38:31.289Z" }, +] + [[package]] name = "transformers" version = "5.5.4" @@ -4756,9 +4762,9 @@ resolution-markers = [ "python_full_version < '3.11' and platform_machine == 'AMD64' and sys_platform == 'win32'", ] dependencies = [ - { name = "huggingface-hub", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version < '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version < '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (python_full_version < '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version < '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, - { name = "numpy", version = "2.4.4", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version >= '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version >= '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (python_full_version >= '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization') or (python_full_version >= '3.11' and sys_platform == 'win32' and extra == 
'group-14-torch-tensorrt-test-ext') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "huggingface-hub", version = "1.11.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version < '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (python_full_version < '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "numpy", version = "2.4.4", source = { registry = "https://download.pytorch.org/whl/nightly/cu130" }, marker = "(python_full_version >= '3.11' and sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (python_full_version >= '3.11' and sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform != 'linux' and sys_platform != 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-quantization' and extra == 
'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, { name = "packaging", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "pyyaml", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "regex", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -4798,10 +4804,10 @@ name = "typer" version = "0.24.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "annotated-doc", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "click", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "rich", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "shellingham", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "annotated-doc", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "click", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "rich", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, + { name = "shellingham", marker = "(sys_platform == 'linux' and extra == 'group-14-torch-tensorrt-test-ext') or (sys_platform == 'win32' and extra == 'group-14-torch-tensorrt-test-ext') or (extra == 
'group-14-torch-tensorrt-quantization' and extra == 'group-14-torch-tensorrt-test-ext')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/f5/24/cb09efec5cc954f7f9b930bf8279447d24618bb6758d4f6adf2574c41780/typer-0.24.1.tar.gz", hash = "sha256:e39b4732d65fbdcde189ae76cf7cd48aeae72919dea1fdfc16593be016256b45", size = 118613, upload-time = "2026-02-21T16:54:40.609Z" } wheels = [