timdex-dataset-api/tests/test_dataset.py at 66bd9efc12a35258790f496a5a122787addfdf9d · MITLibraries/timdex-dataset-api · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
# ruff: noqa: D205, D209, SLF001, PLR2004

import glob
import os
from datetime import date
from pathlib import Path
from unittest.mock import MagicMock, patch

import pyarrow as pa
import pytest
from duckdb import ConversionException, DuckDBPyConnection
from pyarrow import fs

from timdex_dataset_api.dataset import (
    TIMDEXDataset,
    TIMDEXDatasetConfig,
)


def test_dataset_init_success(tmp_path):
    timdex_dataset = TIMDEXDataset(str(tmp_path / "path/to/dataset"))
    assert isinstance(timdex_dataset.dataset.filesystem, fs.LocalFileSystem)


def test_dataset_init_env_vars_set_config(monkeypatch, tmp_path):
    location = str(tmp_path / "timdex_dataset/")
    default_timdex_dataset = TIMDEXDataset(location=location)
    default_read_batch_config = default_timdex_dataset.config.read_batch_size
    assert default_read_batch_config == 1_000

    monkeypatch.setenv("TDA_READ_BATCH_SIZE", "100_000")
    env_var_timdex_dataset = TIMDEXDataset(location=location)
    env_var_read_batch_config = env_var_timdex_dataset.config.read_batch_size
    assert env_var_read_batch_config == 100_000


def test_dataset_init_custom_config_object(monkeypatch, tmp_path):
    location = str(tmp_path / "timdex_dataset/")
    config = TIMDEXDatasetConfig()
    config.max_rows_per_file = 42
    timdex_dataset = TIMDEXDataset(location=location, config=config)
    assert timdex_dataset.config.max_rows_per_file == 42


@patch("timdex_dataset_api.dataset.fs.LocalFileSystem")
@patch("timdex_dataset_api.dataset.ds.dataset")
def test_load_pyarrow_dataset_default_uses_data_records_root(
    mock_pyarrow_ds, mock_local_fs, tmp_path
):
    """Ensure load_pyarrow_dataset() without args calls pyarrow.dataset with the
    dataset's data_records_root path as the source and the proper filesystem."""
    mock_local_fs.return_value = MagicMock()
    mock_pyarrow_ds.return_value = MagicMock()

    location = str(Path(tmp_path) / "local/path/to/default_dataset")

    timdex_dataset = TIMDEXDataset(location=location)
    # call the explicit loader to exercise the code path
    dataset_obj = timdex_dataset.load_pyarrow_dataset()

    mock_pyarrow_ds.assert_called_with(
        f"{location}/data/records",
        schema=timdex_dataset.schema,
        format="parquet",
        partitioning="hive",
        filesystem=mock_local_fs.return_value,
    )
    assert dataset_obj == mock_pyarrow_ds.return_value
    assert timdex_dataset.dataset == mock_pyarrow_ds.return_value


@patch("timdex_dataset_api.dataset.fs.LocalFileSystem")
@patch("timdex_dataset_api.dataset.ds.dataset")
def test_load_pyarrow_dataset_with_parquet_files_list(
    mock_pyarrow_ds, mock_local_fs, tmp_path
):
    """Ensure load_pyarrow_dataset(parquet_files=...) passes the explicit list
    of parquet files as the source to pyarrow.dataset."""
    mock_local_fs.return_value = MagicMock()
    mock_pyarrow_ds.return_value = MagicMock()

    location = str(Path(tmp_path) / "local/path/to/dataset_with_files")

    timdex_dataset = TIMDEXDataset(location=location)

    parquet_files = [
        f"{timdex_dataset.data_records_root}/source=alma/run_date=2024-12-01/part-0.parquet",
        f"{timdex_dataset.data_records_root}/source=alma/run_date=2024-12-01/part-1.parquet",
    ]

    dataset_obj = timdex_dataset.load_pyarrow_dataset(parquet_files=parquet_files)

    mock_pyarrow_ds.assert_called_with(
        parquet_files,
        schema=timdex_dataset.schema,
        format="parquet",
        partitioning="hive",
        filesystem=mock_local_fs.return_value,
    )
    assert dataset_obj == mock_pyarrow_ds.return_value
    assert timdex_dataset.dataset == mock_pyarrow_ds.return_value


@patch("timdex_dataset_api.dataset.fs.LocalFileSystem")
@patch("timdex_dataset_api.dataset.ds.dataset")
def test_dataset_load_local_sets_filesystem_and_dataset_success(
    mock_pyarrow_ds, mock_local_fs, tmp_path
):
    mock_local_fs.return_value = MagicMock()
    mock_pyarrow_ds.return_value = MagicMock()

    location = str(Path(tmp_path) / "local/path/to/dataset")

    timdex_dataset = TIMDEXDataset(location=location)

    mock_pyarrow_ds.assert_called_once_with(
        f"{location}/data/records",
        schema=timdex_dataset.schema,
        format="parquet",
        partitioning="hive",
        filesystem=mock_local_fs.return_value,
    )

    assert timdex_dataset.dataset == mock_pyarrow_ds.return_value


@patch("timdex_dataset_api.dataset.TIMDEXDataset.get_s3_filesystem")
@patch("timdex_dataset_api.dataset.ds.dataset")
def test_dataset_load_s3_sets_filesystem_and_dataset_success(
    mock_pyarrow_ds, mock_get_s3_fs, s3_bucket_mocked
):
    mock_get_s3_fs.return_value = MagicMock()
    mock_pyarrow_ds.return_value = MagicMock()

    timdex_dataset = TIMDEXDataset(location="s3://timdex/path/to/dataset")

    mock_pyarrow_ds.assert_called_with(
        "timdex/path/to/dataset/data/records",
        schema=timdex_dataset.schema,
        format="parquet",
        partitioning="hive",
        filesystem=mock_get_s3_fs.return_value,
    )
    assert timdex_dataset.dataset == mock_pyarrow_ds.return_value


def test_filters_single_nonpartition_success(timdex_dataset_multi_source):
    df = timdex_dataset_multi_source.read_dataframe(run_id="abc123")
    assert df is not None
    assert set(df["run_id"].unique().tolist()) == {"abc123"}


def test_filters_multi_nonpartition_success(timdex_dataset_multi_source):
    df = timdex_dataset_multi_source.read_dataframe(
        timdex_record_id="alma:0",
        source="alma",
        run_type="daily",
        run_id="abc123",
        action="index",
    )
    assert df is not None
    assert len(df) == 1
    assert df.iloc[0]["timdex_record_id"] == "alma:0"


def test_filters_or_nonpartition_success(timdex_dataset_multi_source):
    df = timdex_dataset_multi_source.read_dataframe(timdex_record_id=["alma:0", "alma:1"])
    assert df is not None
    assert set(df["timdex_record_id"].tolist()) == {"alma:0", "alma:1"}


def test_filters_run_date_str_success(timdex_dataset_multi_source):
    df = timdex_dataset_multi_source.read_dataframe(run_date="2024-12-01")
    assert df is not None
    df_empty = timdex_dataset_multi_source.read_dataframe(run_date="2024-12-02")
    assert df_empty is None or len(df_empty) == 0


def test_filters_run_date_obj_success(timdex_dataset_multi_source):
    df = timdex_dataset_multi_source.read_dataframe(run_date=date(2024, 12, 1))
    assert df is not None
    df_empty = timdex_dataset_multi_source.read_dataframe(run_date=date(2024, 12, 2))
    assert df_empty is None or len(df_empty) == 0


def test_filters_ymd_success(timdex_dataset_multi_source):
    # metadata filters do not expose partition y/m/d; use run_date equivalents
    df = timdex_dataset_multi_source.read_dataframe(run_date=date(2024, 12, 1))
    assert df is not None
    df_empty = timdex_dataset_multi_source.read_dataframe(run_date=date(2025, 12, 1))
    assert df_empty is None or len(df_empty) == 0


def test_filters_run_date_invalid_raise_error(timdex_dataset_multi_source):
    with pytest.raises(
        ConversionException, match="Conversion Error: Unimplemented type for cast"
    ):
        timdex_dataset_multi_source.read_dataframe(run_date=999)


def test_dataset_get_s3_filesystem_success(mocker):
    mocked_s3_filesystem = mocker.spy(fs, "S3FileSystem")
    s3_filesystem = TIMDEXDataset.get_s3_filesystem()

    assert mocked_s3_filesystem.call_args[1] == {
        "secret_key": "fake_secret_key",
        "access_key": "fake_access_key",
        "region": "us-east-1",
        "session_token": "fake_session_token",
    }
    assert isinstance(s3_filesystem, pa._s3fs.S3FileSystem)


def test_dataset_timdex_dataset_validate_success(timdex_dataset):
    assert timdex_dataset.dataset.to_table().validate() is None  # where None is valid


def test_dataset_timdex_dataset_row_count_success(timdex_dataset):
    assert timdex_dataset.dataset.count_rows() == timdex_dataset.dataset.count_rows()


def test_dataset_all_records_not_current_and_not_deduped(
    timdex_dataset_with_runs_with_metadata,
):
    all_records_df = timdex_dataset_with_runs_with_metadata.read_dataframe()

    # assert counts reflect all records from dataset, no deduping
    assert all_records_df.source.value_counts().to_dict() == {"alma": 254, "dspace": 194}

    # assert run_date min/max dates align with min/max for all runs
    assert all_records_df.run_date.min() == date(2024, 12, 1)
    assert all_records_df.run_date.max() == date(2025, 2, 5)


def test_dataset_records_data_structure_is_idempotent(timdex_dataset_with_runs):
    assert os.path.exists(timdex_dataset_with_runs.data_records_root)
    start_file_count = glob.glob(f"{timdex_dataset_with_runs.data_records_root}/**/*")

    timdex_dataset_with_runs.create_data_structure()

    assert os.path.exists(timdex_dataset_with_runs.data_records_root)
    end_file_count = glob.glob(f"{timdex_dataset_with_runs.data_records_root}/**/*")
    assert start_file_count == end_file_count


def test_dataset_duckdb_context_created_on_init(timdex_dataset):
    assert isinstance(timdex_dataset.conn, DuckDBPyConnection)


def test_dataset_duckdb_context_creates_data_schema(timdex_dataset):
    assert timdex_dataset.conn.query("""
            select count(*)
            from information_schema.schemata
            where catalog_name = 'memory'
            and schema_name = 'data';
            """).fetchone()[0] == 1


def test_dataset_preload_current_records_default_false(timdex_dataset):
    assert timdex_dataset.preload_current_records is False
    assert timdex_dataset.metadata.preload_current_records is False


def test_dataset_preload_current_records_flag_true(tmp_path):
    td = TIMDEXDataset(str(tmp_path), preload_current_records=True)
    assert td.preload_current_records is True
    assert td.metadata.preload_current_records is True