From d47ac1f219a83af1aeaffc5a73a4c95f880cbdd7 Mon Sep 17 00:00:00 2001 From: stephantul Date: Thu, 12 Mar 2026 16:23:22 +0100 Subject: [PATCH 1/3] add tests for new util --- model2vec/train/base.py | 9 +++++++-- model2vec/train/utils.py | 21 +++++++++++++++++++++ pyproject.toml | 2 +- tests/test_trainable.py | 27 +++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 3 deletions(-) create mode 100644 model2vec/train/utils.py diff --git a/model2vec/train/base.py b/model2vec/train/base.py index c8200c1..3966c8f 100644 --- a/model2vec/train/base.py +++ b/model2vec/train/base.py @@ -11,6 +11,7 @@ from torch.utils.data import DataLoader, Dataset from model2vec import StaticModel +from model2vec.train.utils import get_probable_pad_token_id logger = logging.getLogger(__name__) @@ -82,7 +83,7 @@ def from_pretrained( @classmethod def from_static_model( - cls: type[ModelType], *, model: StaticModel, out_dim: int = 2, pad_token: str = "[PAD]", **kwargs: Any + cls: type[ModelType], *, model: StaticModel, out_dim: int = 2, pad_token: str | None = None, **kwargs: Any ) -> ModelType: """Load the model from a static model.""" model.embedding = np.nan_to_num(model.embedding) @@ -92,9 +93,13 @@ def from_static_model( token_mapping = model.token_mapping.tolist() else: token_mapping = None + if pad_token is not None: + pad_id = model.tokenizer.get_vocab()[pad_token] + else: + pad_id = get_probable_pad_token_id(model.tokenizer) return cls( vectors=embeddings_converted, - pad_id=model.tokenizer.token_to_id(pad_token), + pad_id=pad_id, out_dim=out_dim, tokenizer=model.tokenizer, token_mapping=token_mapping, diff --git a/model2vec/train/utils.py b/model2vec/train/utils.py new file mode 100644 index 0000000..4d6b95b --- /dev/null +++ b/model2vec/train/utils.py @@ -0,0 +1,21 @@ +import logging + +from tokenizers import Tokenizer + +logger = logging.getLogger(__name__) + +_KNOWN_PAD_TOKENS = ("[PAD]", "") + + +def get_probable_pad_token_id(tokenizer: Tokenizer) -> int: + """Get a 
probable pad token by using the padding module and falling back to guessing.""" + if tokenizer.padding is not None: + return tokenizer.padding["pad_id"] + vocab = tokenizer.get_vocab() + for token in _KNOWN_PAD_TOKENS: + token_id = vocab.get(token) + if token_id is not None: + return token_id + + logger.warning("No known pad token found, using 0 as default") + return 0 diff --git a/pyproject.toml b/pyproject.toml index 52cba10..d0304de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ dev = [ "ruff", ] -distill = ["torch", "transformers", "scikit-learn", "skeletoken>=0.3.1"] +distill = ["torch", "transformers", "scikit-learn", "skeletoken>=0.3.2"] onnx = ["onnx", "torch"] # train also installs inference train = ["torch", "lightning", "scikit-learn", "skops"] diff --git a/tests/test_trainable.py b/tests/test_trainable.py index 627d5e0..169c2f3 100644 --- a/tests/test_trainable.py +++ b/tests/test_trainable.py @@ -3,12 +3,14 @@ import numpy as np import pytest import torch +from skeletoken import TokenizerModel from tokenizers import Tokenizer from transformers import AutoTokenizer from model2vec.model import StaticModel from model2vec.train import StaticModelForClassification from model2vec.train.base import FinetunableStaticModel, TextDataset +from model2vec.train.utils import get_probable_pad_token_id @pytest.mark.parametrize("n_layers", [0, 1, 2, 3]) @@ -231,3 +233,28 @@ def test_evaluate(mock_trained_pipeline: StaticModelForClassification) -> None: else: # Ignore the type error since we don't support int labels in our typing, but the code does mock_trained_pipeline.evaluate(["dog cat", "dog"], [1, 1]) # type: ignore + + +def test_get_probable_pad_token_id(mock_tokenizer: Tokenizer) -> None: + """Test loading from a static model with a pad token.""" + model = TokenizerModel.from_tokenizer(mock_tokenizer) + t = model.to_tokenizer() + token_id = get_probable_pad_token_id(t) + assert token_id == 0 + + # Adds new token + model.pad_token = "haha" + t 
= model.to_tokenizer() + token_id = get_probable_pad_token_id(t) + assert token_id == 5 + + model.pad_token = "word1" + t = model.to_tokenizer() + token_id = get_probable_pad_token_id(t) + assert token_id == 1 + + # Remove padding token + model.pad_token = None + t = model.to_tokenizer() + token_id = get_probable_pad_token_id(t) + assert token_id == model.vocabulary["[PAD]"] From 6e70f51d1362015ec3008fd9bfb347f9465b9855 Mon Sep 17 00:00:00 2001 From: stephantul Date: Thu, 12 Mar 2026 16:37:12 +0100 Subject: [PATCH 2/3] update test for coverage --- tests/test_trainable.py | 10 +++++++++- uv.lock | 8 ++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/tests/test_trainable.py b/tests/test_trainable.py index 169c2f3..c59f69a 100644 --- a/tests/test_trainable.py +++ b/tests/test_trainable.py @@ -1,3 +1,4 @@ +import logging from tempfile import TemporaryDirectory import numpy as np @@ -235,7 +236,7 @@ def test_evaluate(mock_trained_pipeline: StaticModelForClassification) -> None: mock_trained_pipeline.evaluate(["dog cat", "dog"], [1, 1]) # type: ignore -def test_get_probable_pad_token_id(mock_tokenizer: Tokenizer) -> None: +def test_get_probable_pad_token_id(mock_tokenizer: Tokenizer, caplog: pytest.LogCaptureFixture) -> None: """Test loading from a static model with a pad token.""" model = TokenizerModel.from_tokenizer(mock_tokenizer) t = model.to_tokenizer() @@ -258,3 +259,10 @@ def test_get_probable_pad_token_id(mock_tokenizer: Tokenizer) -> None: t = model.to_tokenizer() token_id = get_probable_pad_token_id(t) assert token_id == model.vocabulary["[PAD]"] + + model = model.remove_token_from_vocabulary("[PAD]") + t = model.to_tokenizer() + with caplog.at_level(logging.WARNING, logger="model2vec.train.utils"): + token_id = get_probable_pad_token_id(t) + assert token_id == 0 + assert "No known pad token found, using 0 as default" in caplog.text diff --git a/uv.lock b/uv.lock index 119b247..0a3025f 100644 --- a/uv.lock +++ b/uv.lock @@ -934,7 +934,7 @@ 
requires-dist = [ { name = "scikit-learn", marker = "extra == 'quantization'" }, { name = "scikit-learn", marker = "extra == 'train'" }, { name = "setuptools" }, - { name = "skeletoken", marker = "extra == 'distill'", specifier = ">=0.3.1" }, + { name = "skeletoken", marker = "extra == 'distill'", specifier = ">=0.3.2" }, { name = "skops", marker = "extra == 'inference'" }, { name = "skops", marker = "extra == 'train'" }, { name = "tokenizers", specifier = ">=0.20" }, @@ -2263,7 +2263,7 @@ wheels = [ [[package]] name = "skeletoken" -version = "0.3.1" +version = "0.3.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "protobuf" }, @@ -2272,9 +2272,9 @@ dependencies = [ { name = "tokenizers" }, { name = "transformers" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/49/d3/5c30ca5c615a9a0bbc3a0e045fb0885cf97ddfac152f86eb3e24e4519d92/skeletoken-0.3.1.tar.gz", hash = "sha256:bcd9da7789dc738cbc77f3c94f1b535486f1d455ed2d51e6d92431d7e583b4d9", size = 233048, upload-time = "2026-02-27T13:08:59.88Z" } +sdist = { url = "https://files.pythonhosted.org/packages/1e/a6/71abc874578c9a3290faa04ad8db7eb60e05b3f5052f6dbec525b28bc133/skeletoken-0.3.2.tar.gz", hash = "sha256:24a423e8f789719f62f5e69e040a062f7467f932626119c14ad4b87184457b46", size = 234150, upload-time = "2026-03-12T15:03:02.096Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/31/ab/4078e1a90aafab6f4b642ef2583c693c806f9dd555213b701a22a0a18bbb/skeletoken-0.3.1-py3-none-any.whl", hash = "sha256:804021822f0da0aea272cf5f425f5fc96a2795b29b3f33d821f0e42fd700857b", size = 39430, upload-time = "2026-02-27T13:08:58.312Z" }, + { url = "https://files.pythonhosted.org/packages/b0/77/555503fd8e5cbef7a824481f7154bf90979c4d1b0604ce9de0221d8d38a1/skeletoken-0.3.2-py3-none-any.whl", hash = "sha256:483d6b76bb508b7de7aa2c00b17915804a2bb1b106393efc9ac73fe3de162690", size = 40302, upload-time = "2026-03-12T15:03:00.92Z" }, ] [[package]] From 1fe0eebda4e9acc843771403118eab299197f95d 
Mon Sep 17 00:00:00 2001 From: stephantul Date: Thu, 12 Mar 2026 16:43:05 +0100 Subject: [PATCH 3/3] test pad_token passing --- tests/test_trainable.py | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/tests/test_trainable.py b/tests/test_trainable.py index c59f69a..0b0179d 100644 --- a/tests/test_trainable.py +++ b/tests/test_trainable.py @@ -70,6 +70,21 @@ def test_init_classifier_from_model(mock_vectors: np.ndarray, mock_tokenizer: To assert s.w.shape[0] == mock_vectors.shape[0] + +def test_pad_token(mock_tokenizer: Tokenizer) -> None: + """Test initialization from a static model.""" + tokenizer_model = TokenizerModel.from_tokenizer(mock_tokenizer) + tokenizer_model.pad_token = "[HELLO]" + tokenizer = tokenizer_model.to_tokenizer() + vectors = np.random.RandomState().randn(6, 10) + model = StaticModel(vectors=vectors, tokenizer=tokenizer) + s = StaticModelForClassification.from_static_model(model=model, pad_token="[HELLO]") + assert s.w.shape[0] == vectors.shape[0] + assert s.pad_id == 5 + + with pytest.raises(KeyError): + StaticModelForClassification.from_static_model(model=model, pad_token="[BRR]") + + def test_encode(mock_trained_pipeline: StaticModelForClassification) -> None: """Test the encode function.""" result = mock_trained_pipeline._encode(torch.tensor([[0, 1], [1, 0]]).long()) @@ -238,30 +253,30 @@ def test_evaluate(mock_trained_pipeline: StaticModelForClassification) -> None: def test_get_probable_pad_token_id(mock_tokenizer: Tokenizer, caplog: pytest.LogCaptureFixture) -> None: """Test loading from a static model with a pad token.""" - model = TokenizerModel.from_tokenizer(mock_tokenizer) - t = model.to_tokenizer() + tokenizer_model = TokenizerModel.from_tokenizer(mock_tokenizer) + t = tokenizer_model.to_tokenizer() token_id = get_probable_pad_token_id(t) assert token_id == 0 # Adds new token - model.pad_token = "haha" - t = model.to_tokenizer() + tokenizer_model.pad_token = "haha" + t =
tokenizer_model.to_tokenizer() token_id = get_probable_pad_token_id(t) assert token_id == 5 - model.pad_token = "word1" - t = model.to_tokenizer() + tokenizer_model.pad_token = "word1" + t = tokenizer_model.to_tokenizer() token_id = get_probable_pad_token_id(t) assert token_id == 1 # Remove padding token - model.pad_token = None - t = model.to_tokenizer() + tokenizer_model.pad_token = None + t = tokenizer_model.to_tokenizer() token_id = get_probable_pad_token_id(t) - assert token_id == model.vocabulary["[PAD]"] + assert token_id == tokenizer_model.vocabulary["[PAD]"] - model = model.remove_token_from_vocabulary("[PAD]") - t = model.to_tokenizer() + tokenizer_model = tokenizer_model.remove_token_from_vocabulary("[PAD]") + t = tokenizer_model.to_tokenizer() with caplog.at_level(logging.WARNING, logger="model2vec.train.utils"): token_id = get_probable_pad_token_id(t) assert token_id == 0