Skip to content
Merged
77 changes: 67 additions & 10 deletions src/osekit/utils/timestamp_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,75 @@
"%Y": r"([12]\d{3})",
"%y": r"(\d{2})",
"%m": r"(0[1-9]|1[0-2])",
"%-m": r"(1[0-2]|(?:(?<!\d)[1-9](?!\d)))",
"%d": r"([0-2]\d|3[0-1])",
"%-d": r"(3[01]|[12][0-9]|(?:(?<!\d)[1-9](?!\d)))",
"%H": r"([0-1]\d|2[0-4])",
"%-H": r"(2[0-3]|1[0-9]|(?:(?<!\d)[0-9](?!\d)))",
"%I": r"(0[1-9]|1[0-2])",
"%-I": r"(1[0-2]|(?:(?<!\d)[1-9](?!\d)))",
"%p": r"(AM|PM)",
"%M": r"([0-5]\d)",
"%-M": r"([0-5][0-9]|(?:(?<!\d)[0-9](?!\d)))",
"%S": r"([0-5]\d)",
"%-S": r"([0-5][0-9]|(?:(?<!\d)[0-9](?!\d)))",
"%f": r"(\d{1,6})",
"%Z": r"((?:[a-zA-Z]+)(?:[-/]\w+)*(?:[\+-]\d+)?)",
"%z": r"([\+-]\d{2}:?\d{2})",
}


def normalize_datetime(datetime: tuple[str], template: str) -> tuple[str, str]:
"""Convert a datetime and its template with non-zero padded parts.

Parameters
----------
datetime : tuple[str]
A tuple of datetime component strings (e.g., ('2024', '1', '15')).
Comment thread
mathieudpnt marked this conversation as resolved.
Outdated
template : str
A datetime template string with format specifiers (e.g., '%Y_%-m_%d').
Format specifiers starting with '%-' indicate non-zero-padded values
that will be converted to zero-padded format.

Returns
-------
tuple[str, str]
A tuple containing:
- A normalized template string with all format specifiers zero-padded
(e.g., '%Y_%m_%d')
- A normalized datetime string with all values zero-padded
(e.g., '2024_01_15')

Examples
--------
>>> normalize_datetime(('2024', '1', '15'), '%Y_%-m_%d')
('%Y_%m_%d', '2024_01_15')

>>> normalize_datetime(('2024', '3', '5'), '%Y_%-m_%-d')
('%Y_%m_%d', '2024_03_05')
"""
Comment thread
mathieudpnt marked this conversation as resolved.
datetime_parts = list(datetime)
Comment thread
mathieudpnt marked this conversation as resolved.
Outdated
template_parts = re.findall(r"%-?[A-Za-z]", template)
dt_dict = dict(zip(template_parts, datetime_parts, strict=True))

if sum(1 for _ in {k.lstrip("%-") for k in dt_dict.keys()}) < len(dt_dict):
Comment thread
mathieudpnt marked this conversation as resolved.
Outdated
msg = "Format specifiers in template must be unique."
raise ValueError(msg)

clean_dt_dict = {}
for key, value in dt_dict.items():
if "-" in key:
new_key = key.replace("-", "")
new_value = f"{int(value):02}"
else:
new_key = key
new_value = value

clean_dt_dict[new_key] = new_value

return "_".join(clean_dt_dict.keys()), "_".join(clean_dt_dict.values())


def localize_timestamp(
timestamp: Timestamp,
timezone: str | pytz.timezone,
Expand Down Expand Up @@ -168,13 +225,14 @@ def is_datetime_template_valid(datetime_template: str) -> bool:

"""
strftime_identifiers = [key.lstrip("%") for key in _REGEX_BUILDER]
strftime_identifier_lengths = {len(id) for id in strftime_identifiers}
Comment thread
mathieudpnt marked this conversation as resolved.
Outdated
percent_sign_indexes = (
index for index, char in enumerate(datetime_template) if char == "%"
)
for index in percent_sign_indexes:
if index == len(datetime_template) - 1:
return False
if datetime_template[index + 1] not in strftime_identifiers:
if not any(datetime_template[index + 1: index + 1 + id_len] in strftime_identifiers for id_len in strftime_identifier_lengths):
return False
return True

Expand Down Expand Up @@ -210,10 +268,10 @@ def strptime_from_text(text: str, datetime_template: str | list[str]) -> Timesta
Timestamp('2016-06-13 14:12:00+0500', tz='UTC+05:00')

""" # noqa: E501
if type(datetime_template) is str:
if isinstance(datetime_template, str):
datetime_template = [datetime_template]

valid_datetime_template = ""
valid_datetime_template = None
regex_result = []
msg = []

Expand All @@ -232,16 +290,15 @@ def strptime_from_text(text: str, datetime_template: str | list[str]) -> Timesta
valid_datetime_template = template
break

if not valid_datetime_template:
if valid_datetime_template is None:
raise ValueError("\n".join(msg))

date_string = "_".join(regex_result[0])
cleaned_date_template = "_".join(
c + valid_datetime_template[i + 1]
for i, c in enumerate(valid_datetime_template)
if c == "%"
cleaned_date_template, cleaned_date_string = normalize_datetime(
datetime=regex_result[0],
template=valid_datetime_template
)
return pd.to_datetime(date_string, format=cleaned_date_template)

return pd.to_datetime(cleaned_date_string, format=cleaned_date_template)


def last_window_end(
Expand Down
160 changes: 159 additions & 1 deletion tests/test_timestamp_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
localize_timestamp,
reformat_timestamp,
strftime_osmose_format,
strptime_from_text,
strptime_from_text, normalize_datetime,
)


Expand All @@ -30,6 +30,10 @@
pytest.param("%y%m%d%H%M%S%", False, id="trailing_%_is_wrong_strftime_code"),
pytest.param("%y%m%d%H%M%S%z", True, id="utc_offset"),
pytest.param("%y%m%d%H%M%S_%Z", True, id="timezone_name"),
pytest.param("%y%-m%d%H%M%S", True, id="datetime_month_decimal_number"),
pytest.param("%y-%-m-%d", True, id="date_month_decimal_number"),
pytest.param("%y-%-m-%dT%H:%M:%S.%f", True, id="date2_month_decimal_number"),
pytest.param("%d-%-m-%Y", True, id="date3_month_decimal_number"),
],
)
def test_is_datetime_template_valid(datetime_template: str, expected: bool) -> None:
Expand Down Expand Up @@ -82,6 +86,11 @@ def test_is_datetime_template_valid(datetime_template: str, expected: bool) -> N
r"(\d{2})(0[1-9]|1[0-2])([0-2]\d|3[0-1])((?:[a-zA-Z]+)(?:[-/]\w+)*(?:[\+-]\d+)?)",
id="timezone_name",
),
pytest.param(
"%d/%-m%y",
r"([0-2]\d|3[0-1])/(1[0-2]|(?:(?<!\d)[1-9](?!\d)))(\d{2})",
id="decimal_month_date",
),
],
)
def test_build_regex_from_datetime_template(
Expand Down Expand Up @@ -239,6 +248,24 @@ def test_build_regex_from_datetime_template(
Timestamp("2023-04-05 14:49:06.123000", tz="UTC"),
id="invalid_then_unmatching_then_matching",
),
pytest.param(
"10-2-2022",
"%d-%-m-%Y",
Timestamp("2022-02-10"),
id="date_with_decimal_month",
),
pytest.param(
"10-2-2022",
"%-d-%-m-%Y",
Timestamp("2022-02-10"),
id="date_with_decimal_month_and_day",
),
pytest.param(
"1-1-2022T2:2:2.000000+0700",
"%-d-%-m-%YT%-H:%-M:%-S.%f%z",
Timestamp("2022-01-01T02:02:02.000000+0700"),
id="date_full_decimal",
),
],
)
def test_strptime_from_text(
Expand Down Expand Up @@ -480,6 +507,12 @@ def test_strftime_osmose_format(timestamp: Timestamp, expected: str) -> None:
"2024-10-17T10:14:11.000+0000",
id="negative_zero_UTC_offset_timezone",
),
pytest.param(
"2024-1-17",
"%Y-%-m-%d",
"2024-01-17T00:00:00.000+0000",
id="decimal_month_date",
),
],
)
def test_reformat_timestamp(
Expand Down Expand Up @@ -679,3 +712,128 @@ def test_last_window_end(
)
== expected
)



@pytest.mark.parametrize(
    ("datetime", "template", "expected_keys", "expected_values"),
    [
        pytest.param(
            ("5", "3", "2023"),
            "%-m_%-d_%Y",
            "%m_%d_%Y",
            "05_03_2023",
            id="single_digit_month_day",
        ),
        pytest.param(
            ("05", "03", "2023"),
            "%-m_%-d_%Y",
            "%m_%d_%Y",
            "05_03_2023",
            id="already_zero_padded",
        ),
        pytest.param(
            ("05", "30", "2023"),
            "%m_%d_%Y",
            "%m_%d_%Y",
            "05_30_2023",
            id="no_non_zero_padding",
        ),
        pytest.param(
            ("5", "3"),
            "%-m_%-d",
            "%m_%d",
            "05_03",
            id="only_non_zero_padded",
        ),
        pytest.param(
            ("9", "5", "30"),
            "%-H_%-M_%S",
            "%H_%M_%S",
            "09_05_30",
            id="hour_minute_second",
        ),
        pytest.param(
            ("2023", "5", "30"),
            "%Y_%-m_%d",
            "%Y_%m_%d",
            "2023_05_30",
            id="mixed_padded_nonpadded",
        ),
        pytest.param(
            ("1", "2", "3"),
            "%-m_%-d_%-H",
            "%m_%d_%H",
            "01_02_03",
            id="all_single_digits_need_padding",
        ),
        pytest.param(
            ("12", "25", "2023"),
            "%-m_%-d_%Y",
            "%m_%d_%Y",
            "12_25_2023",
            id="double_digits_on_nonpadded",
        ),
        pytest.param(
            ("5", "30", "23"),
            "%-m_%d_%-y",
            "%m_%d_%y",
            "05_30_23",
            id="different_format_specifiers",
        ),
        pytest.param(
            ("1", "1", "1", "1"),
            "%-m_%-d_%-H_%-M",
            "%m_%d_%H_%M",
            "01_01_01_01",
            id="edge_case_all_ones",
        ),
        pytest.param(
            ("05", "03", "2023"),
            "%-m_%-d_%Y",
            "%m_%d_%Y",
            "05_03_2023",
            id="non_zero_format_with_already_padded_values",
        ),
    ],
)
def test_normalize_datetime(datetime, template, expected_keys, expected_values):
    """Check the normalized template and the normalized datetime string.

    Each case feeds datetime components and a template to
    ``normalize_datetime`` and compares the returned pair with the expected
    zero-padded template and the expected zero-padded datetime string.
    """
    normalized = normalize_datetime(datetime, template)
    assert normalized == (expected_keys, expected_values)
Comment thread
mathieudpnt marked this conversation as resolved.


@pytest.mark.parametrize(
    ("datetime", "template", "expected"),
    [
        pytest.param(
            ("5", "3", "1998"),
            "%m_%-m_%Y",
            pytest.raises(ValueError),
            id="duplicate_format_specifiers_padded_and_nonpadded",
        ),
        pytest.param(
            ("5", "3"),
            "%-m_%-d_%Y",
            pytest.raises(ValueError),
            id="mismatched_datetime_template_length_more_specifiers",
        ),
        pytest.param(
            ("5", "3", "2023"),
            "%-m_%-d",
            pytest.raises(ValueError),
            id="mismatched_datetime_template_length_more_values",
        ),
        pytest.param(
            ("abc", "3", "2023"),
            "%-m_%-d_%Y",
            pytest.raises(ValueError),
            id="non_numeric_datetime_value",
        ),
    ],
)
def test_normalize_datetime_errors(datetime, template, expected) -> None:
    """Check that invalid inputs make ``normalize_datetime`` raise.

    ``expected`` is the ``pytest.raises`` context manager of the case: the
    test fails if the call returns without raising the expected exception.
    """
    # No assertion on the return value: when the expected exception is
    # raised the call never returns, so comparing its result to the context
    # value would be dead code.
    with expected:
        normalize_datetime(datetime, template)
Comment thread
mathieudpnt marked this conversation as resolved.
Outdated