Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/build-and-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
os: [ubuntu-latest, macos-latest, windows-latest]

runs-on: ${{ matrix.os }}
Expand All @@ -34,7 +34,7 @@ jobs:
id: setup-uv
uses: astral-sh/setup-uv@v6
with:
version: "0.7.11"
version: "0.11.1"
enable-cache: true
- name: Cache HF models
uses: actions/cache@v3
Expand Down
2 changes: 1 addition & 1 deletion .python-version
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.12
3.14
37 changes: 14 additions & 23 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,26 +11,25 @@ authors = [
description = "Transformers for Clinical NLP"

readme = "README.md"
requires-python = ">=3.9, <3.13"
requires-python = ">=3.10, <3.15"
dependencies = [
"anaforatools~=1.1.0",
"datasets~=2.21.0",
"fastapi~=0.115.2",
"datasets>=4.4.0",
"fastapi>=0.115.2",
"httpx>=0.27.2",
"huggingface-hub>=0.9.0",
"nltk~=3.5",
"numpy~=2.0.2",
"pandas~=2.2.3",
"nltk>=3.5",
"numpy>=2.0.2",
"pandas>=2.2.3", # legacy
"polars>=1.30.0",
"pydantic~=1.10.8",
"requests~=2.32.2",
"pydantic>=1.10.8",
"requests>=2.32.2",
"rich>=14.0.0",
"scikit-learn~=1.5.2",
"seqeval~=1.2.2",
"torch>=2.6",
"transformers[torch]~=4.51",
"typer~=0.16.0",
"uvicorn[standard]~=0.32.0",
"scikit-learn>=1.5.2",
"seqeval>=1.2.2", # legacy
"torch>=2.11.0",
"transformers[torch]==4.56.1",
"typer>=0.16.0",
"uvicorn[standard]>=0.32.0",
]

classifiers = [
Expand Down Expand Up @@ -89,7 +88,6 @@ select = [
"I", # isort
"UP", # pyupgrade
"G", # logging
"FA", # future annotations
"PIE", # misc
"RUF", # misc
]
Expand All @@ -99,12 +97,5 @@ ignore = [
"G004", # f-strings in logging statements
]

[tool.ruff.lint.pyupgrade]
# Preserve Union types, despite alternate 'X | Y' syntax being available via __future__ annotations module.
# This is necessary because fastAPI and pydantic parse type annotations at runtime, and since the new syntax is
# a python 3.10 feature they don't expect it in python 3.9.
# This can be removed if/when we stop supporting python 3.9.
keep-runtime-typing = true

[tool.uv]
cache-keys = [{ git = { commit = true, tags = true } }]
40 changes: 18 additions & 22 deletions src/cnlpt/_cli/train.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from enum import Enum
from typing import Annotated, Any, Final, Union
from typing import Annotated, Any, Final

import typer
from click.core import ParameterSource
Expand Down Expand Up @@ -40,7 +40,7 @@ def callback(ctx: typer.Context, param: typer.CallbackParam, value: Any):
def training_arg_option(
field_name: str,
*aliases,
compatibility: Union[list[ModelType], None] = None,
compatibility: list[ModelType] | None = None,
**kwargs,
):
field = CnlpTrainingArguments.__dataclass_fields__[field_name]
Expand All @@ -59,7 +59,7 @@ def training_arg_option(

def model_arg_option(
*args,
compatibility: Union[list[ModelType], None] = None,
compatibility: list[ModelType] | None = None,
**kwargs,
):
if compatibility is not None:
Expand All @@ -69,7 +69,7 @@ def model_arg_option(

def data_arg_option(
*args,
compatibility: Union[list[ModelType], None] = None,
compatibility: list[ModelType] | None = None,
**kwargs,
):
if compatibility is not None:
Expand Down Expand Up @@ -251,15 +251,15 @@ def transformers_arg_option(field_name: str, *args, **kwargs):
),
]
TaskNamesArg = Annotated[
Union[list[str], None],
list[str] | None,
data_arg_option(
"--task",
"-t",
help="The name of a task in the dataset to train on. Can be specified multiple times to target more than one task. Defaults to all tasks.",
),
]
TokenizerArg = Annotated[
Union[str, None],
str | None,
data_arg_option(
"--tokenizer",
help=f'Name or path to a model to use for tokenization. For projection and hierarchical models, this will default to the --encoder if left unspecified; otherwise defaults to "{DEFAULT_ENCODER}".',
Expand Down Expand Up @@ -288,15 +288,15 @@ def transformers_arg_option(field_name: str, *args, **kwargs):
),
]
MaxTrainArg = Annotated[
Union[int, None],
int | None,
data_arg_option("--max_train", help="Limit the number of training samples to use."),
]
MaxEvalArg = Annotated[
Union[int, None],
int | None,
data_arg_option("--max_eval", help="Limit the number of eval samples to use."),
]
MaxTestArg = Annotated[
Union[int, None],
int | None,
data_arg_option("--max_test", help="Limit the number of test samples to use."),
]
AllowDisjointLabelsArg = Annotated[
Expand All @@ -314,17 +314,17 @@ def transformers_arg_option(field_name: str, *args, **kwargs):
),
]
HierChunkLenArg = Annotated[
Union[int, None],
int | None,
data_arg_option("--hier_chunk_len", help="Chunk length for hierarchical models."),
]
HierNumChunksArg = Annotated[
Union[int, None],
int | None,
data_arg_option(
"--hier_num_chunks", help="Number of chunks for hierarchical models."
),
]
HierPrependEmptyChunkArg = Annotated[
Union[int, None],
int | None,
data_arg_option(
"--hier_prepend_empty_chunk",
help="Whether to prepend an empty chunk for hierarchical models.",
Expand All @@ -342,23 +342,19 @@ def transformers_arg_option(field_name: str, *args, **kwargs):
IntervalStrategy, training_arg_option("logging_strategy")
]
LoggingFirstStepArg = Annotated[bool, training_arg_option("logging_first_step")]
CacheDirArg = Annotated[Union[str, None], training_arg_option("cache_dir")]
CacheDirArg = Annotated[str | None, training_arg_option("cache_dir")]
MetricForBestModelArg = Annotated[str, training_arg_option("metric_for_best_model")]


##### COMMON HF TRANSFORMERS ARGS #####
NumTrainEpochsArg = Annotated[
Union[float, None], transformers_arg_option("num_train_epochs")
]
NumTrainEpochsArg = Annotated[float | None, transformers_arg_option("num_train_epochs")]
PerDeviceTrainBatchSizeArg = Annotated[
Union[int, None], transformers_arg_option("per_device_train_batch_size")
int | None, transformers_arg_option("per_device_train_batch_size")
]
GradientAccumulationStepsArg = Annotated[
Union[int, None], transformers_arg_option("gradient_accumulation_steps")
]
LearningRateArg = Annotated[
Union[float, None], transformers_arg_option("learning_rate")
int | None, transformers_arg_option("gradient_accumulation_steps")
]
LearningRateArg = Annotated[float | None, transformers_arg_option("learning_rate")]
DoTrainArg = Annotated[bool, transformers_arg_option("do_train", "--do_train")]
DoEvalArg = Annotated[bool, transformers_arg_option("do_eval", "--do_eval")]
DoPredictArg = Annotated[bool, transformers_arg_option("do_predict", "--do_predict")]
Expand Down Expand Up @@ -608,7 +604,7 @@ def train(
if bias_fit:
model_init_kwargs["bias_fit"] = True

model: Union[CnnModel, LstmModel, HierarchicalModel, ProjectionModel] = (
model: CnnModel | LstmModel | HierarchicalModel | ProjectionModel = (
AutoModel.from_config(config, **model_init_kwargs)
)
train_system = CnlpTrainSystem(model, dataset, training_args)
Expand Down
20 changes: 10 additions & 10 deletions src/cnlpt/data/cnlp_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from collections import Counter
from dataclasses import dataclass
from enum import Enum
from typing import Literal, Union
from typing import Literal

import torch
from datasets import Dataset
Expand All @@ -22,7 +22,7 @@ class HierarchicalDataConfig:

def load_tokenizer(
model_name_or_path: str,
hf_cache_dir: Union[str, None] = None,
hf_cache_dir: str | None = None,
truncation_side: Literal["left", "right"] = "right",
character_level: bool = False,
) -> PreTrainedTokenizer:
Expand Down Expand Up @@ -50,19 +50,19 @@ class CnlpDataset:

def __init__(
self,
data_dir: Union[str, os.PathLike],
tokenizer: Union[str, PreTrainedTokenizer] = "roberta-base",
task_names: Union[list[str], None] = None,
hier_config: Union[HierarchicalDataConfig, None] = None,
data_dir: str | os.PathLike,
tokenizer: str | PreTrainedTokenizer = "roberta-base",
task_names: list[str] | None = None,
hier_config: HierarchicalDataConfig | None = None,
truncation_side: TruncationSide = TruncationSide.RIGHT,
max_seq_length: int = 128,
use_data_cache: bool = True,
max_train: Union[int, None] = None,
max_eval: Union[int, None] = None,
max_test: Union[int, None] = None,
max_train: int | None = None,
max_eval: int | None = None,
max_test: int | None = None,
allow_disjoint_labels: bool = False,
character_level: bool = False,
hf_cache_dir: Union[str, None] = None,
hf_cache_dir: str | None = None,
):
"""Create a new `CnlpDataset`.

Expand Down
16 changes: 8 additions & 8 deletions src/cnlpt/data/data_reader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
import os
from collections.abc import Iterable
from typing import Any, Final, Literal, Union, cast
from typing import Any, Final, Literal, cast

from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset

Expand All @@ -21,7 +21,7 @@
NONE_VALUE: Final = "__None__"


def _infer_split(filepath: Union[str, os.PathLike]) -> DatasetSplit:
def _infer_split(filepath: str | os.PathLike) -> DatasetSplit:
_dir, filename = os.path.split(filepath)
root, _ext = os.path.splitext(filename)

Expand Down Expand Up @@ -106,7 +106,7 @@ def _get_task_by_name(self, task_name: str):
return task
raise ValueError(f'task with name "{task_name}" not found')

def get_tasks(self, task_names: Union[Iterable[str], None] = None):
def get_tasks(self, task_names: Iterable[str] | None = None):
"""Get all or some subset of the tasks in the data.

The `TaskInfo` objects returned by this method will have their `index` property
Expand Down Expand Up @@ -198,8 +198,8 @@ def _extend(self, new_dataset: DatasetDict, tasks: list[TaskInfo]):

def load_json(
self,
json_filepath: Union[str, os.PathLike],
split: Union[DatasetSplit, None] = None,
json_filepath: str | os.PathLike,
split: DatasetSplit | None = None,
):
"""Update this reader with new data from a CNLP-formatted json file.

Expand Down Expand Up @@ -274,8 +274,8 @@ def load_json(

def load_csv(
self,
csv_filepath: Union[str, os.PathLike],
split: Union[DatasetSplit, None] = None,
csv_filepath: str | os.PathLike,
split: DatasetSplit | None = None,
sep: str = ",",
):
"""Update this reader with new data from a CNLP-formatted csv (or tsv) file.
Expand All @@ -299,7 +299,7 @@ def load_csv(
tasks = _infer_tasks(dataset[split])
self._extend(dataset, tasks)

def load_dir(self, data_dir: Union[str, os.PathLike]):
def load_dir(self, data_dir: str | os.PathLike):
"""Update this reader with new data from a directory containing CNLP-formatted data.

This will search (non-recursively) for files named "train", "test", "validation", "valid", or "dev",
Expand Down
12 changes: 6 additions & 6 deletions src/cnlpt/data/predictions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
from collections.abc import Iterable
from dataclasses import asdict, dataclass
from typing import Any, Union
from typing import Any

import numpy as np
import numpy.typing as npt
Expand All @@ -19,7 +19,7 @@
class TaskPredictions:
task: TaskInfo
logits: npt.NDArray
labels: Union[npt.NDArray, None]
labels: npt.NDArray | None

@property
def probs(self) -> npt.NDArray:
Expand All @@ -34,7 +34,7 @@ def predicted_str_labels(self) -> npt.NDArray:
return np.array(self.task.labels)[self.predicted_int_labels]

@property
def target_str_labels(self) -> Union[npt.NDArray, None]:
def target_str_labels(self) -> npt.NDArray | None:
if self.labels is None:
return None
masked = self.labels.copy()
Expand Down Expand Up @@ -68,7 +68,7 @@ def __init__(

self.task_predictions: dict[str, TaskPredictions] = {}

task_labels: dict[str, Union[npt.NDArray, None]]
task_labels: dict[str, npt.NDArray | None]

if self.raw.label_ids is None:
task_labels = {t.name: None for t in tasks}
Expand Down Expand Up @@ -137,7 +137,7 @@ def arr_to_list(obj):

def save_json(
self,
json_filepath: Union[str, os.PathLike],
json_filepath: str | os.PathLike,
allow_overwrite: bool = False,
):
write_mode = "w" if allow_overwrite else "x"
Expand Down Expand Up @@ -169,7 +169,7 @@ def list_to_arr(obj, dtype):
)

@classmethod
def load_json(cls, filepath: Union[str, os.PathLike]):
def load_json(cls, filepath: str | os.PathLike):
with open(filepath) as f:
return cls.from_dict(json.load(f))

Expand Down
Loading
Loading