Machine-Learning-for-Medical-Language · ianbulovic · Mar 25, 2026 · Mar 25, 2026 · Mar 25, 2026
diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
@@ -23,7 +23,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
         os: [ubuntu-latest, macos-latest, windows-latest]
 
     runs-on: ${{ matrix.os }}
@@ -34,7 +34,7 @@ jobs:
         id: setup-uv
         uses: astral-sh/setup-uv@v6
         with:
-          version: "0.7.11"
+          version: "0.11.1"
           enable-cache: true
       - name: Cache HF models
         uses: actions/cache@v3

diff --git a/.python-version b/.python-version
@@ -1 +1 @@
-3.12
+3.14
diff --git a/pyproject.toml b/pyproject.toml
@@ -11,26 +11,25 @@ authors = [
 description = "Transformers for Clinical NLP"
 
 readme = "README.md"
-requires-python = ">=3.9, <3.13"
+requires-python = ">=3.10, <3.15"
 dependencies = [
-    "anaforatools~=1.1.0",
-    "datasets~=2.21.0",
-    "fastapi~=0.115.2",
+    "datasets>=4.4.0",
+    "fastapi>=0.115.2",
     "httpx>=0.27.2",
     "huggingface-hub>=0.9.0",
-    "nltk~=3.5",
-    "numpy~=2.0.2",
-    "pandas~=2.2.3",
+    "nltk>=3.5",
+    "numpy>=2.0.2",
+    "pandas>=2.2.3",               # legacy
     "polars>=1.30.0",
-    "pydantic~=1.10.8",
-    "requests~=2.32.2",
+    "pydantic>=1.10.8",
+    "requests>=2.32.2",
     "rich>=14.0.0",
-    "scikit-learn~=1.5.2",
-    "seqeval~=1.2.2",
-    "torch>=2.6",
-    "transformers[torch]~=4.51",
-    "typer~=0.16.0",
-    "uvicorn[standard]~=0.32.0",
+    "scikit-learn>=1.5.2",
+    "seqeval>=1.2.2",              # legacy
+    "torch>=2.11.0",
+    "transformers[torch]==4.56.1",
+    "typer>=0.16.0",
+    "uvicorn[standard]>=0.32.0",
 ]
 
 classifiers = [
@@ -89,7 +88,6 @@ select = [
     "I",   # isort
     "UP",  # pyupgrade
     "G",   # logging
-    "FA",  # future annotations
     "PIE", # misc
     "RUF", # misc
 ]
@@ -99,12 +97,5 @@ ignore = [
     "G004", # f-strings in logging statements
 ]
 
-[tool.ruff.lint.pyupgrade]
-# Preserve Union types, despite alternate 'X | Y' syntax being available via __future__ annotations module.
-# This is necessary because fastAPI and pydantic parse type annotations at runtime, and since the new syntax is
-# a python 3.10 feature they don't expect it in python 3.9.
-# This can be removed if/when we stop supporting python 3.9.
-keep-runtime-typing = true
-
 [tool.uv]
 cache-keys = [{ git = { commit = true, tags = true } }]
diff --git a/src/cnlpt/_cli/train.py b/src/cnlpt/_cli/train.py
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Annotated, Any, Final, Union
+from typing import Annotated, Any, Final
 
 import typer
 from click.core import ParameterSource
@@ -40,7 +40,7 @@ def callback(ctx: typer.Context, param: typer.CallbackParam, value: Any):
 def training_arg_option(
     field_name: str,
     *aliases,
-    compatibility: Union[list[ModelType], None] = None,
+    compatibility: list[ModelType] | None = None,
     **kwargs,
 ):
     field = CnlpTrainingArguments.__dataclass_fields__[field_name]
@@ -59,7 +59,7 @@ def training_arg_option(
 
 def model_arg_option(
     *args,
-    compatibility: Union[list[ModelType], None] = None,
+    compatibility: list[ModelType] | None = None,
     **kwargs,
 ):
     if compatibility is not None:
@@ -69,7 +69,7 @@ def model_arg_option(
 
 def data_arg_option(
     *args,
-    compatibility: Union[list[ModelType], None] = None,
+    compatibility: list[ModelType] | None = None,
     **kwargs,
 ):
     if compatibility is not None:
@@ -251,15 +251,15 @@ def transformers_arg_option(field_name: str, *args, **kwargs):
     ),
 ]
 TaskNamesArg = Annotated[
-    Union[list[str], None],
+    list[str] | None,
     data_arg_option(
         "--task",
         "-t",
         help="The name of a task in the dataset to train on. Can be specified multiple times to target more than one task. Defaults to all tasks.",
     ),
 ]
 TokenizerArg = Annotated[
-    Union[str, None],
+    str | None,
     data_arg_option(
         "--tokenizer",
         help=f'Name or path to a model to use for tokenization. For projection and hierarchical models, this will default to the --encoder if left unspecified; otherwise defaults to "{DEFAULT_ENCODER}".',
@@ -288,15 +288,15 @@ def transformers_arg_option(field_name: str, *args, **kwargs):
     ),
 ]
 MaxTrainArg = Annotated[
-    Union[int, None],
+    int | None,
     data_arg_option("--max_train", help="Limit the number of training samples to use."),
 ]
 MaxEvalArg = Annotated[
-    Union[int, None],
+    int | None,
     data_arg_option("--max_eval", help="Limit the number of eval samples to use."),
 ]
 MaxTestArg = Annotated[
-    Union[int, None],
+    int | None,
     data_arg_option("--max_test", help="Limit the number of test samples to use."),
 ]
 AllowDisjointLabelsArg = Annotated[
@@ -314,17 +314,17 @@ def transformers_arg_option(field_name: str, *args, **kwargs):
     ),
 ]
 HierChunkLenArg = Annotated[
-    Union[int, None],
+    int | None,
     data_arg_option("--hier_chunk_len", help="Chunk length for hierarchical models."),
 ]
 HierNumChunksArg = Annotated[
-    Union[int, None],
+    int | None,
     data_arg_option(
         "--hier_num_chunks", help="Number of chunks for hierarchical models."
     ),
 ]
 HierPrependEmptyChunkArg = Annotated[
-    Union[int, None],
+    int | None,
     data_arg_option(
         "--hier_prepend_empty_chunk",
         help="Whether to prepend an empty chunk for hierarchical models.",
@@ -342,23 +342,19 @@ def transformers_arg_option(field_name: str, *args, **kwargs):
     IntervalStrategy, training_arg_option("logging_strategy")
 ]
 LoggingFirstStepArg = Annotated[bool, training_arg_option("logging_first_step")]
-CacheDirArg = Annotated[Union[str, None], training_arg_option("cache_dir")]
+CacheDirArg = Annotated[str | None, training_arg_option("cache_dir")]
 MetricForBestModelArg = Annotated[str, training_arg_option("metric_for_best_model")]
 
 
 ##### COMMON HF TRANSFORMERS ARGS #####
-NumTrainEpochsArg = Annotated[
-    Union[float, None], transformers_arg_option("num_train_epochs")
-]
+NumTrainEpochsArg = Annotated[float | None, transformers_arg_option("num_train_epochs")]
 PerDeviceTrainBatchSizeArg = Annotated[
-    Union[int, None], transformers_arg_option("per_device_train_batch_size")
+    int | None, transformers_arg_option("per_device_train_batch_size")
 ]
 GradientAccumulationStepsArg = Annotated[
-    Union[int, None], transformers_arg_option("gradient_accumulation_steps")
-]
-LearningRateArg = Annotated[
-    Union[float, None], transformers_arg_option("learning_rate")
+    int | None, transformers_arg_option("gradient_accumulation_steps")
 ]
+LearningRateArg = Annotated[float | None, transformers_arg_option("learning_rate")]
 DoTrainArg = Annotated[bool, transformers_arg_option("do_train", "--do_train")]
 DoEvalArg = Annotated[bool, transformers_arg_option("do_eval", "--do_eval")]
 DoPredictArg = Annotated[bool, transformers_arg_option("do_predict", "--do_predict")]
@@ -608,7 +604,7 @@ def train(
     if bias_fit:
         model_init_kwargs["bias_fit"] = True
 
-    model: Union[CnnModel, LstmModel, HierarchicalModel, ProjectionModel] = (
+    model: CnnModel | LstmModel | HierarchicalModel | ProjectionModel = (
         AutoModel.from_config(config, **model_init_kwargs)
     )
     train_system = CnlpTrainSystem(model, dataset, training_args)

diff --git a/src/cnlpt/data/cnlp_dataset.py b/src/cnlpt/data/cnlp_dataset.py
@@ -2,7 +2,7 @@
 from collections import Counter
 from dataclasses import dataclass
 from enum import Enum
-from typing import Literal, Union
+from typing import Literal
 
 import torch
 from datasets import Dataset
@@ -22,7 +22,7 @@ class HierarchicalDataConfig:
 
 def load_tokenizer(
     model_name_or_path: str,
-    hf_cache_dir: Union[str, None] = None,
+    hf_cache_dir: str | None = None,
     truncation_side: Literal["left", "right"] = "right",
     character_level: bool = False,
 ) -> PreTrainedTokenizer:
@@ -50,19 +50,19 @@ class CnlpDataset:
 
     def __init__(
         self,
-        data_dir: Union[str, os.PathLike],
-        tokenizer: Union[str, PreTrainedTokenizer] = "roberta-base",
-        task_names: Union[list[str], None] = None,
-        hier_config: Union[HierarchicalDataConfig, None] = None,
+        data_dir: str | os.PathLike,
+        tokenizer: str | PreTrainedTokenizer = "roberta-base",
+        task_names: list[str] | None = None,
+        hier_config: HierarchicalDataConfig | None = None,
         truncation_side: TruncationSide = TruncationSide.RIGHT,
         max_seq_length: int = 128,
         use_data_cache: bool = True,
-        max_train: Union[int, None] = None,
-        max_eval: Union[int, None] = None,
-        max_test: Union[int, None] = None,
+        max_train: int | None = None,
+        max_eval: int | None = None,
+        max_test: int | None = None,
         allow_disjoint_labels: bool = False,
         character_level: bool = False,
-        hf_cache_dir: Union[str, None] = None,
+        hf_cache_dir: str | None = None,
     ):
         """Create a new `CnlpDataset`.
 

diff --git a/src/cnlpt/data/data_reader.py b/src/cnlpt/data/data_reader.py
@@ -1,7 +1,7 @@
 import json
 import os
 from collections.abc import Iterable
-from typing import Any, Final, Literal, Union, cast
+from typing import Any, Final, Literal, cast
 
 from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
 
@@ -21,7 +21,7 @@
 NONE_VALUE: Final = "__None__"
 
 
-def _infer_split(filepath: Union[str, os.PathLike]) -> DatasetSplit:
+def _infer_split(filepath: str | os.PathLike) -> DatasetSplit:
     _dir, filename = os.path.split(filepath)
     root, _ext = os.path.splitext(filename)
 
@@ -106,7 +106,7 @@ def _get_task_by_name(self, task_name: str):
                 return task
         raise ValueError(f'task with name "{task_name}" not found')
 
-    def get_tasks(self, task_names: Union[Iterable[str], None] = None):
+    def get_tasks(self, task_names: Iterable[str] | None = None):
         """Get all or some subset of the tasks in the data.
 
         The `TaskInfo` objects returned by this method will have their `index` property
@@ -198,8 +198,8 @@ def _extend(self, new_dataset: DatasetDict, tasks: list[TaskInfo]):
 
     def load_json(
         self,
-        json_filepath: Union[str, os.PathLike],
-        split: Union[DatasetSplit, None] = None,
+        json_filepath: str | os.PathLike,
+        split: DatasetSplit | None = None,
     ):
         """Update this reader with new data from a CNLP-formatted json file.
 
@@ -274,8 +274,8 @@ def load_json(
 
     def load_csv(
         self,
-        csv_filepath: Union[str, os.PathLike],
-        split: Union[DatasetSplit, None] = None,
+        csv_filepath: str | os.PathLike,
+        split: DatasetSplit | None = None,
         sep: str = ",",
     ):
         """Update this reader with new data from a CNLP-formatted csv (or tsv) file.
@@ -299,7 +299,7 @@ def load_csv(
         tasks = _infer_tasks(dataset[split])
         self._extend(dataset, tasks)
 
-    def load_dir(self, data_dir: Union[str, os.PathLike]):
+    def load_dir(self, data_dir: str | os.PathLike):
         """Update this reader with new data from a directory containing CNLP-formatted data.
 
         This will search (non-recursively) for files named "train", "test", "validation", "valid", or "dev",

diff --git a/src/cnlpt/data/predictions.py b/src/cnlpt/data/predictions.py
@@ -2,7 +2,7 @@
 import os
 from collections.abc import Iterable
 from dataclasses import asdict, dataclass
-from typing import Any, Union
+from typing import Any
 
 import numpy as np
 import numpy.typing as npt
@@ -19,7 +19,7 @@
 class TaskPredictions:
     task: TaskInfo
     logits: npt.NDArray
-    labels: Union[npt.NDArray, None]
+    labels: npt.NDArray | None
 
     @property
     def probs(self) -> npt.NDArray:
@@ -34,7 +34,7 @@ def predicted_str_labels(self) -> npt.NDArray:
         return np.array(self.task.labels)[self.predicted_int_labels]
 
     @property
-    def target_str_labels(self) -> Union[npt.NDArray, None]:
+    def target_str_labels(self) -> npt.NDArray | None:
         if self.labels is None:
             return None
         masked = self.labels.copy()
@@ -68,7 +68,7 @@ def __init__(
 
         self.task_predictions: dict[str, TaskPredictions] = {}
 
-        task_labels: dict[str, Union[npt.NDArray, None]]
+        task_labels: dict[str, npt.NDArray | None]
 
         if self.raw.label_ids is None:
             task_labels = {t.name: None for t in tasks}
@@ -137,7 +137,7 @@ def arr_to_list(obj):
 
     def save_json(
         self,
-        json_filepath: Union[str, os.PathLike],
+        json_filepath: str | os.PathLike,
         allow_overwrite: bool = False,
     ):
         write_mode = "w" if allow_overwrite else "x"
@@ -169,7 +169,7 @@ def list_to_arr(obj, dtype):
         )
 
     @classmethod
-    def load_json(cls, filepath: Union[str, os.PathLike]):
+    def load_json(cls, filepath: str | os.PathLike):
         with open(filepath) as f:
             return cls.from_dict(json.load(f))