initiate fpod_utils tests

Gre51 · Gre51 · commit b0bbee6aeb21 · 2026-01-20T16:06:51.000+01:00
diff --git a/tests/test_fpod_utils.py b/tests/test_fpod_utils.py
@@ -1,11 +1,12 @@
 """FPOD/ CPOD processing functions tests."""
 import io
+import pytz
 from datetime import datetime
 from pathlib import Path
 
 import pytest
 from osekit.utils.timestamp_utils import strptime_from_text
-from pandas import DataFrame, Timestamp, read_csv, concat
+from pandas import DataFrame, Timestamp, Timedelta, read_csv, concat
 from pandas.testing import assert_frame_equal
 
 from post_processing.utils.fpod_utils import (
@@ -164,8 +165,322 @@ def test_csv_folder_single_file(tmp_path) -> None:
     assert all(result["deploy.name"] == "data")
     assert list(result.columns) == ["col1", "col2", "deploy.name"]
 
+
 # pod2aplose
+@pytest.fixture
+def sample_df():
+    """Create a sample POD DataFrame for testing."""
+    return DataFrame({
+        "ChunkEnd": ["15/01/2024 10:30", "15/01/2024 11:00", "15/01/2024 09:45"],
+        "deploy.name": ["deploy1", "deploy2", "deploy1"],
+    })
+
+
+@pytest.fixture
+def timezone():
+    """Return UTC timezone for testing."""
+    return pytz.UTC
+
+
+def test_pod2aplose_basic_structure(sample_df, timezone):
+    """Test that basic structure and required columns are present."""
+    result = pod2aplose(
+        df=sample_df,
+        tz=timezone,
+        dataset_name="test_dataset",
+        annotation="test_annotation",
+        annotator="test_annotator",
+    )
+
+    expected_columns = [
+        "dataset",
+        "filename",
+        "start_time",
+        "end_time",
+        "start_frequency",
+        "end_frequency",
+        "annotation",
+        "annotator",
+        "start_datetime",
+        "end_datetime",
+        "is_box",
+        "deploy.name",
+    ]
 
+    assert isinstance(result, DataFrame)
+    assert list(result.columns) == expected_columns
+    assert len(result) == len(sample_df)
+
+
+def test_pod2aplose_dataset_propagation(sample_df, timezone):
+    """Test that dataset name is propagated to all rows."""
+    result = pod2aplose(
+        df=sample_df,
+        tz=timezone,
+        dataset_name="my_dataset",
+        annotation="click",
+        annotator="john",
+    )
+
+    assert all(result["dataset"] == "my_dataset")
+
+
+def test_pod2aplose_annotation_propagation(sample_df, timezone):
+    """Test that annotation is propagated to all rows."""
+    result = pod2aplose(
+        df=sample_df,
+        tz=timezone,
+        dataset_name="dataset",
+        annotation="porpoise_click",
+        annotator="john",
+    )
+
+    assert all(result["annotation"] == "porpoise_click")
+
+
+def test_pod2aplose_annotator_propagation(sample_df, timezone):
+    """Test that annotator is propagated to all rows."""
+    result = pod2aplose(
+        df=sample_df,
+        tz=timezone,
+        dataset_name="dataset",
+        annotation="click",
+        annotator="alice",
+    )
+
+    assert all(result["annotator"] == "alice")
+
+
+def test_pod2aplose_default_bin_size(sample_df, timezone):
+    """Test default bin_size of 60 seconds."""
+    result = pod2aplose(
+        df=sample_df,
+        tz=timezone,
+        dataset_name="dataset",
+        annotation="click",
+        annotator="john",
+    )
+
+    assert all(result["start_time"] == 0)
+    assert all(result["end_time"] == 60)
+
+
+def test_pod2aplose_custom_bin_size(sample_df, timezone):
+    """Test custom bin_size parameter."""
+    result = pod2aplose(
+        df=sample_df,
+        tz=timezone,
+        dataset_name="dataset",
+        annotation="click",
+        annotator="john",
+        bin_size=120,
+    )
+
+    assert all(result["start_time"] == 0)
+    assert all(result["end_time"] == 120)
+
+
+def test_pod2aplose_frequency_values(sample_df, timezone):
+    """Test that frequency values are set to 0."""
+    result = pod2aplose(
+        df=sample_df,
+        tz=timezone,
+        dataset_name="dataset",
+        annotation="click",
+        annotator="john",
+    )
+
+    assert all(result["start_frequency"] == 0)
+    assert all(result["end_frequency"] == 0)
+
+
+def test_pod2aplose_is_box_values(sample_df, timezone):
+    """Test that is_box values are set to 0."""
+    result = pod2aplose(
+        df=sample_df,
+        tz=timezone,
+        dataset_name="dataset",
+        annotation="click",
+        annotator="john",
+    )
+
+    assert all(result["is_box"] == 0)
+
+
+def test_pod2aplose_deploy_name_preserved(sample_df, timezone):
+    """Test that deploy.name values are preserved from input."""
+    result = pod2aplose(
+        df=sample_df,
+        tz=timezone,
+        dataset_name="dataset",
+        annotation="click",
+        annotator="john",
+    )
+
+    # After sorting, deploy.name should still be present
+    assert "deploy.name" in result.columns
+    assert len(result["deploy.name"]) == len(sample_df)
+    assert set(result["deploy.name"]) == {"deploy1", "deploy2"}
+
+
+def test_pod2aplose_sorting_by_datetime(timezone):
+    """Test that rows are sorted by datetime."""
+    df = DataFrame({
+        "ChunkEnd": ["15/01/2024 12:00", "15/01/2024 10:00", "15/01/2024 11:00"],
+        "deploy.name": ["d1", "d2", "d3"],
+    })
+
+    result = pod2aplose(
+        df=df, tz=timezone, dataset_name="dataset", annotation="click", annotator="john"
+    )
+
+    # Check that deploy.name follows the sorted order (by time)
+    assert result["deploy.name"].tolist() == ["d2", "d3", "d1"]
+
+
+def test_pod2aplose_datetime_formatting():
+    """Test that datetime strings are properly formatted."""
+    df = DataFrame({"ChunkEnd": ["01/02/2024 14:30"], "deploy.name": ["deploy1"]})
+
+    result = pod2aplose(
+        df=df,
+        tz=pytz.UTC,
+        dataset_name="dataset",
+        annotation="click",
+        annotator="john",
+        bin_size=60,
+    )
+
+    # Check that datetime strings are present and not empty
+    assert len(result["start_datetime"].iloc[0]) > 0
+    assert len(result["end_datetime"].iloc[0]) > 0
+    assert len(result["filename"].iloc[0]) > 0
+
+
+def test_pod2aplose_end_datetime_offset(timezone):
+    """Test that end_datetime is offset by bin_size from start_datetime."""
+    df = DataFrame({"ChunkEnd": ["15/01/2024 10:00"], "deploy.name": ["deploy1"]})
+
+    result = pod2aplose(
+        df=df,
+        tz=timezone,
+        dataset_name="dataset",
+        annotation="click",
+        annotator="john",
+        bin_size=120,
+    )
+
+    # Both should be valid datetime strings
+    assert result["start_datetime"].iloc[0] != result["end_datetime"].iloc[0]
+
+
+def test_pod2aplose_different_timezones():
+    """Test with different timezone."""
+    df = DataFrame({"ChunkEnd": ["15/01/2024 10:00"], "deploy.name": ["deploy1"]})
+
+    tz_paris = pytz.timezone("Europe/Paris")
+
+    result = pod2aplose(
+        df=df, tz=tz_paris, dataset_name="dataset", annotation="click", annotator="john"
+    )
+
+    assert len(result) == 1
+    assert result["dataset"].iloc[0] == "dataset"
+
+
+def test_pod2aplose_empty_dataframe(timezone):
+    """Test handling of empty DataFrame."""
+    df = DataFrame({"ChunkEnd": [], "deploy.name": []})
+
+    result = pod2aplose(
+        df=df, tz=timezone, dataset_name="dataset", annotation="click", annotator="john"
+    )
+
+    assert len(result) == 0
+    assert list(result.columns) == [
+        "dataset",
+        "filename",
+        "start_time",
+        "end_time",
+        "start_frequency",
+        "end_frequency",
+        "annotation",
+        "annotator",
+        "start_datetime",
+        "end_datetime",
+        "is_box",
+        "deploy.name",
+    ]
+
+
+def test_pod2aplose_single_row(timezone):
+    """Test with single row DataFrame."""
+    df = DataFrame({"ChunkEnd": ["20/03/2024 15:45"], "deploy.name": ["single_deploy"]})
+
+    result = pod2aplose(
+        df=df,
+        tz=timezone,
+        dataset_name="dataset",
+        annotation="click",
+        annotator="john",
+        bin_size=90,
+    )
+
+    assert len(result) == 1
+    assert result["deploy.name"].iloc[0] == "single_deploy"
+    assert result["end_time"].iloc[0] == 90
+
+
+def test_pod2aplose_does_not_modify_original(sample_df, timezone):
+    """Test that the original DataFrame is not modified."""
+    original_columns = sample_df.columns.tolist()
+    original_len = len(sample_df)
+
+    pod2aplose(
+        df=sample_df,
+        tz=timezone,
+        dataset_name="dataset",
+        annotation="click",
+        annotator="john",
+    )
+
+    # Original DataFrame should be unchanged
+    assert sample_df.columns.tolist() == original_columns
+    assert len(sample_df) == original_len
+    assert "_temp_dt" not in sample_df.columns
+
+
+def test_pod2aplose_large_bin_size(sample_df, timezone):
+    """Test with large bin_size value."""
+    result = pod2aplose(
+        df=sample_df,
+        tz=timezone,
+        dataset_name="dataset",
+        annotation="click",
+        annotator="john",
+        bin_size=3600,  # 1 hour
+    )
+
+    assert all(result["end_time"] == 3600)
+
+
+def test_pod2aplose_index_reset(timezone):
+    """Test that index is properly reset after sorting."""
+    df = DataFrame({
+        "ChunkEnd": ["15/01/2024 12:00", "15/01/2024 10:00"],
+        "deploy.name": ["d1", "d2"]
+    })
+
+    result = pod2aplose(
+        df=df,
+        tz=timezone,
+        dataset_name="dataset",
+        annotation="click",
+        annotator="john"
+    )
+
+    # Index should be 0, 1 after reset
+    assert result.index.tolist() == [0, 1]
 
 # meta_cut_aplose