From 459c402c1ac34ced3fbe0024e24b71476ba724a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ma=C3=ABlle=20TORTEROTOT?= Date: Wed, 7 Jan 2026 17:29:22 +0100 Subject: [PATCH 01/33] add show_recording_off --- src/post_processing/dataclass/data_aplose.py | 4 +- .../dataclass/recording_period.py | 126 ++++++++++++++---- src/post_processing/utils/filtering_utils.py | 4 +- .../PAMGuardMatlab-main/.DS_Store | Bin 0 -> 6148 bytes .../PAMGuardMatlab-main/.MATLABDriveTag | 1 + .../pgmatlab/.MATLABDriveTag | 1 + .../pgmatlab/Array/.MATLABDriveTag | 1 + src/post_processing/utils/plot_utils.py | 77 +++++++++-- 8 files changed, 169 insertions(+), 45 deletions(-) create mode 100644 src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/.DS_Store create mode 100644 src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/.MATLABDriveTag create mode 100644 src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/pgmatlab/.MATLABDriveTag create mode 100644 src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/pgmatlab/Array/.MATLABDriveTag diff --git a/src/post_processing/dataclass/data_aplose.py b/src/post_processing/dataclass/data_aplose.py index 98e6d9c..f604f80 100644 --- a/src/post_processing/dataclass/data_aplose.py +++ b/src/post_processing/dataclass/data_aplose.py @@ -393,7 +393,7 @@ def plot( color = kwargs.get("color") season = kwargs.get("season") effort = kwargs.get("effort") - + show_recording_OFF = kwargs.get("show_recording_OFF") if not bin_size: msg = "'bin_size' missing for histogram plot." raise ValueError(msg) @@ -409,7 +409,7 @@ def plot( color=color, season=season, effort=effort, - coordinates=(self.lat, self.lon), + coordinates=(self.lat, self.lon) ) if mode == "heatmap": diff --git a/src/post_processing/dataclass/recording_period.py b/src/post_processing/dataclass/recording_period.py index 4c09722..15def1e 100644 --- a/src/post_processing/dataclass/recording_period.py +++ b/src/post_processing/dataclass/recording_period.py @@ -16,6 +16,8 @@ cut, read_csv, ) +from pandas.tseries.offsets import BaseOffset +import pandas as pd from post_processing.utils.core_utils import ( get_time_range_and_bin_size, @@ -33,42 +35,108 @@ @dataclass(frozen=True) class RecordingPeriod: - """A class to handle recording periods.""" - counts: Series timebin_origin: Timedelta @classmethod def from_path( cls, - config: DetectionFilter, - date_format: str = TIMESTAMP_FORMATS_EXPORTED_FILES, + config, *, bin_size: Timedelta | BaseOffset, - ) -> RecordingPeriod: - """Return a list of Timestamps corresponding to recording periods.""" + ) -> "RecordingPeriod": + """Vectorized creation of recording coverage from CSV with start/end datetimes. + + CSV must have columns 'start_recording' and 'end_recording'. + bin_size can be a Timedelta (e.g., pd.Timedelta("1H")) or a pandas offset (e.g., "1D"). + """ + # 1. Read CSV and parse datetimes timestamp_file = config.timestamp_file delim = find_delimiter(timestamp_file) - timestamp_df = read_csv(timestamp_file, delimiter=delim) - - if "timestamp" in timestamp_df.columns: - msg = "Parsing 'timestamp' column not implemented yet." 
- raise NotImplementedError(msg) - - if "filename" in timestamp_df.columns: - timestamps = [ - strptime_from_text(ts, date_format) - for ts in timestamp_df["filename"] - ] - timestamps = localize_timestamps(timestamps, config.timezone) - time_vector, bin_size = get_time_range_and_bin_size(timestamps, bin_size) - - binned = cut(timestamps, time_vector) - max_annot = bin_size / config.timebin_origin - - return cls(counts=binned.value_counts().sort_index().clip(upper=max_annot), - timebin_origin=config.timebin_origin, - ) - - msg = "Could not parse timestamps." - raise ValueError(msg) + df = pd.read_csv( + config.timestamp_file, + parse_dates=["start_recording", "end_recording"], + delimiter=delim + ) + + if df.empty: + raise ValueError("CSV is empty.") + + # 2. Normalize timezones if needed + df["start_recording"] = ( + pd.to_datetime(df["start_recording"], utc=True).dt.tz_convert(None) + ) + df["end_recording"] = ( + pd.to_datetime(df["end_recording"], utc=True).dt.tz_convert(None) + ) + + # Build fine-grained timeline (timebin_origin resolution) + origin = config.timebin_origin + time_index = pd.date_range( + start=df["start_recording"].min(), + end=df["end_recording"].max(), + freq=origin, + ) + + # Initialize effort vector + effort = pd.Series(0, index=time_index) + + # Vectorized interval coverage + tvals = time_index.values[:, None] + start_vals = df["start_recording"].values + end_vals = df["end_recording"].values + + covered = (tvals >= start_vals) & (tvals < end_vals) + effort[:] = covered.any(axis=1).astype(int) + + # Aggregate effort into bin_size + counts = effort.resample(bin_size).sum() + counts.index = pd.interval_range( + start=counts.index[0], + periods=len(counts), + freq=bin_size, + closed="left", + ) + return cls(counts=counts, timebin_origin=origin) + +# @dataclass(frozen=True) +# class RecordingPeriod: +# """A class to handle recording periods.""" +# +# counts: Series +# timebin_origin: Timedelta +# +# @classmethod +# def from_path( +# cls, +# config: DetectionFilter, +# date_format: str = TIMESTAMP_FORMATS_EXPORTED_FILES, +# *, +# bin_size: Timedelta | BaseOffset, +# ) -> RecordingPeriod: +# """Return a list of Timestamps corresponding to recording periods.""" +# timestamp_file = config.timestamp_file +# delim = find_delimiter(timestamp_file) +# timestamp_df = read_csv(timestamp_file, delimiter=delim) +# +# if "timestamp" in timestamp_df.columns: +# msg = "Parsing 'timestamp' column not implemented yet." +# raise NotImplementedError(msg) +# +# if "filename" in timestamp_df.columns: +# timestamps = [ +# strptime_from_text(ts, date_format) +# for ts in timestamp_df["filename"] +# ] +# timestamps = localize_timestamps(timestamps, config.timezone) +# time_vector, bin_size = get_time_range_and_bin_size(timestamps, bin_size) +# +# binned = cut(timestamps, time_vector) +# max_annot = bin_size / config.timebin_origin +# +# return cls(counts=binned.value_counts().sort_index().clip(upper=max_annot), +# timebin_origin=config.timebin_origin, +# ) +# +# msg = "Could not parse timestamps." +# raise ValueError(msg) diff --git a/src/post_processing/utils/filtering_utils.py b/src/post_processing/utils/filtering_utils.py index c391ff6..650b5a8 100644 --- a/src/post_processing/utils/filtering_utils.py +++ b/src/post_processing/utils/filtering_utils.py @@ -509,8 +509,8 @@ def reshape_timebin( timebin_new: Timedelta The size of the new time bin. timestamp_audio: list[Timestamp] - A list of Timestamp objects corresponding to the shape - in which the data should be reshaped. 
+ A list of Timestamp objects corresponding to the start of each wav + that corresponds to a detection Returns ------- diff --git a/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/.DS_Store b/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..8cbbca90b6e5166f54b08e62aaab675fe94f04f0 GIT binary patch literal 6148 zcmeHK%}T>S5Z-O8-BN@c6nb3nTCkQQl)(EZ>xnykjo@uf;KKT0MGoe+f+2zj`Rl2EONYMF$o&h?B#G(=p{2Emc4#&vuTKv)3b}~@k=~Q)SDrb1MganEEez%%1&Lc-Xe)rG6i3eSHu$% z1H=F^Kn&~;1NICM&HX8vDkTPpfgdt}`-6mr=vvGT>a7Dlczs5H4G{%&d`looi>}4o zAb3EyNd+{i+&(e5Ne8>MajwPOph;(3&kW<(nTy8@*Rz9N>Tt$ggY+c^h=FYes;2AU z`F{n!Oz9(kJB51005R~-7~r*$HyT1w_H5lL56@Z&?FkwR#^tDhfWCDJfDY~>E$uXZ bi8{o&7IT9*3fgr#AYBABA@m^zeu056q9RP< literal 0 HcmV?d00001 diff --git a/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/.MATLABDriveTag b/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/.MATLABDriveTag new file mode 100644 index 0000000..84059a2 --- /dev/null +++ b/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/.MATLABDriveTag @@ -0,0 +1 @@ +3496f669-9381-4974-bb7c-5cc1ddcb05d4 \ No newline at end of file diff --git a/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/pgmatlab/.MATLABDriveTag b/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/pgmatlab/.MATLABDriveTag new file mode 100644 index 0000000..df9fcd4 --- /dev/null +++ b/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/pgmatlab/.MATLABDriveTag @@ -0,0 +1 @@ +ee53bc03-ef5e-44bc-aea4-8fae1e2a0b9f \ No newline at end of file diff --git a/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/pgmatlab/Array/.MATLABDriveTag b/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/pgmatlab/Array/.MATLABDriveTag new file mode 100644 index 0000000..656b51a --- /dev/null +++ b/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/pgmatlab/Array/.MATLABDriveTag @@ -0,0 +1 @@ +c4dd0a9d-e15d-496f-91ff-d9ff561a4fa0 \ No newline at end of file diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index f21e343..2746a85 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -13,7 +13,7 @@ from matplotlib.dates import num2date from matplotlib.ticker import PercentFormatter from numpy import ceil, histogram, polyfit -from pandas import DataFrame, DatetimeIndex, Index, Timedelta, Timestamp, date_range +from pandas import DataFrame, DatetimeIndex, Index, Timedelta, Timestamp, date_range, Series from pandas.tseries import frequencies from scipy.stats import pearsonr from seaborn import scatterplot @@ -107,8 +107,9 @@ def histo( else: legend_labels = None - if effort: - normalize_counts_by_effort(df, effort, time_bin) + # if effort: + # normalize_counts_by_effort(df, effort, time_bin) + n_groups = len(labels) if legend_labels else 1 bar_width = bin_size / n_groups @@ -128,6 +129,8 @@ def histo( bar_kwargs["label"] = legend_labels[i] ax.bar(bin_starts + offset, df.iloc[:, i], **bar_kwargs) + if kwargs.get("show_recording_OFF"): + ax.set_facecolor("lightgrey") if len(df.columns) > 1 and legend: ax.legend(labels=legend_labels, bbox_to_anchor=(1.01, 1), loc="upper left") @@ -138,7 +141,7 @@ def histo( f" - bin size: {bin_size_str})" ) 
ax.set_ylabel(y_label) - set_y_axis_to_percentage(ax) if effort else set_dynamic_ylim(ax, df) + #set_y_axis_to_percentage(ax) if effort else set_dynamic_ylim(ax, df) set_plot_title(ax, annotators, labels) ax.set_xlim(begin, end) @@ -659,20 +662,70 @@ def shade_no_effort( """ + """Shade areas of the plot where no observation effort was made.""" width_days = bar_width.total_seconds() / 86400 - no_effort_bins = bin_starts[observed.counts.reindex(bin_starts) == 0] - for ts in no_effort_bins: - start = mdates.date2num(ts) - ax.axvspan(start, start + width_days, color="grey", alpha=0.08, zorder=1.5) + # Convert effort IntervalIndex → DatetimeIndex (bin starts) + effort_by_start = Series( + observed.counts.values, + index=[i.left for i in observed.counts.index], + ).tz_localize("UTC") + + # Align effort to plotting bins + effort_aligned = effort_by_start.reindex(bin_starts) + max_effort = bar_width / observed.timebin_origin + effort_fraction = effort_aligned / max_effort + + no_effort = effort_fraction == 0 + partial_effort = (effort_fraction > 0) & (effort_fraction < 1) + # Draw partial effort first (lighter) + for ts in bin_starts[partial_effort]: + start = mdates.date2num(ts - bar_width) + ax.axvspan( + start, + start + width_days, + facecolor="0.65", + alpha=0.08, + linewidth=0, + zorder=0, + ) + + # Draw no effort on top (darker) + for ts in bin_starts[no_effort]: + start = mdates.date2num(ts - bar_width) + ax.axvspan( + start, + start + width_days, + facecolor="0.45", + alpha=0.15, + linewidth=0, + zorder=0, + ) + + # Outside data coverage x_min, x_max = ax.get_xlim() - data_min = mdates.date2num(bin_starts[0]) - data_max = mdates.date2num(bin_starts[-1]) + width_days + data_min = mdates.date2num(bin_starts[0] - bar_width) + data_max = mdates.date2num(bin_starts[-1] + bar_width) if x_min < data_min: - ax.axvspan(x_min, data_min, color="grey", alpha=0.08, zorder=1.5) + ax.axvspan( + x_min, + data_min, + facecolor="0.45", + alpha=0.15, + linewidth=0, + zorder=0, + ) if x_max > data_max: - ax.axvspan(data_max, x_max, color="grey", alpha=0.08, zorder=1.5) + ax.axvspan( + data_max, + x_max, + facecolor="0.45", + alpha=0.15, + linewidth=0, + zorder=0, + ) + ax.set_xlim(x_min, x_max) From b8a38764ce04b065dc67ae30523ae0c1fa6c1865 Mon Sep 17 00:00:00 2001 From: MaelleTtrt <48910568+MaelleTtrt@users.noreply.github.com> Date: Fri, 9 Jan 2026 10:37:01 +0100 Subject: [PATCH 02/33] fix_ruff --- src/post_processing/dataclass/data_aplose.py | 1 - .../dataclass/recording_period.py | 66 ++----------------- 2 files changed, 7 insertions(+), 60 deletions(-) diff --git a/src/post_processing/dataclass/data_aplose.py b/src/post_processing/dataclass/data_aplose.py index f604f80..76b6d98 100644 --- a/src/post_processing/dataclass/data_aplose.py +++ b/src/post_processing/dataclass/data_aplose.py @@ -393,7 +393,6 @@ def plot( color = kwargs.get("color") season = kwargs.get("season") effort = kwargs.get("effort") - show_recording_OFF = kwargs.get("show_recording_OFF") if not bin_size: msg = "'bin_size' missing for histogram plot." 
raise ValueError(msg) diff --git a/src/post_processing/dataclass/recording_period.py b/src/post_processing/dataclass/recording_period.py index 15def1e..901d932 100644 --- a/src/post_processing/dataclass/recording_period.py +++ b/src/post_processing/dataclass/recording_period.py @@ -8,21 +8,12 @@ from dataclasses import dataclass from typing import TYPE_CHECKING -from osekit.config import TIMESTAMP_FORMATS_EXPORTED_FILES -from osekit.utils.timestamp_utils import strptime_from_text +import pandas as pd from pandas import ( Series, Timedelta, - cut, - read_csv, ) -from pandas.tseries.offsets import BaseOffset -import pandas as pd -from post_processing.utils.core_utils import ( - get_time_range_and_bin_size, - localize_timestamps, -) from post_processing.utils.filtering_utils import ( find_delimiter, ) @@ -30,8 +21,6 @@ if TYPE_CHECKING: from pandas.tseries.offsets import BaseOffset - from post_processing.dataclass.detection_filter import DetectionFilter - @dataclass(frozen=True) class RecordingPeriod: @@ -44,19 +33,20 @@ def from_path( config, *, bin_size: Timedelta | BaseOffset, - ) -> "RecordingPeriod": - """Vectorized creation of recording coverage from CSV with start/end datetimes. + ) -> RecordingPeriod: + """Vectorized creation of recording coverage from CSV with start/end datetime. CSV must have columns 'start_recording' and 'end_recording'. - bin_size can be a Timedelta (e.g., pd.Timedelta("1H")) or a pandas offset (e.g., "1D"). + bin_size can be a Timedelta (e.g., pd.Timedelta("1H")) + or a pandas offset (e.g., "1D"). """ - # 1. Read CSV and parse datetimes + # 1. Read CSV and parse datetime timestamp_file = config.timestamp_file delim = find_delimiter(timestamp_file) df = pd.read_csv( config.timestamp_file, parse_dates=["start_recording", "end_recording"], - delimiter=delim + delimiter=delim, ) if df.empty: @@ -98,45 +88,3 @@ def from_path( closed="left", ) return cls(counts=counts, timebin_origin=origin) - -# @dataclass(frozen=True) -# class RecordingPeriod: -# """A class to handle recording periods.""" -# -# counts: Series -# timebin_origin: Timedelta -# -# @classmethod -# def from_path( -# cls, -# config: DetectionFilter, -# date_format: str = TIMESTAMP_FORMATS_EXPORTED_FILES, -# *, -# bin_size: Timedelta | BaseOffset, -# ) -> RecordingPeriod: -# """Return a list of Timestamps corresponding to recording periods.""" -# timestamp_file = config.timestamp_file -# delim = find_delimiter(timestamp_file) -# timestamp_df = read_csv(timestamp_file, delimiter=delim) -# -# if "timestamp" in timestamp_df.columns: -# msg = "Parsing 'timestamp' column not implemented yet." -# raise NotImplementedError(msg) -# -# if "filename" in timestamp_df.columns: -# timestamps = [ -# strptime_from_text(ts, date_format) -# for ts in timestamp_df["filename"] -# ] -# timestamps = localize_timestamps(timestamps, config.timezone) -# time_vector, bin_size = get_time_range_and_bin_size(timestamps, bin_size) -# -# binned = cut(timestamps, time_vector) -# max_annot = bin_size / config.timebin_origin -# -# return cls(counts=binned.value_counts().sort_index().clip(upper=max_annot), -# timebin_origin=config.timebin_origin, -# ) -# -# msg = "Could not parse timestamps." 
-# raise ValueError(msg) From 639c6d6d8f6fc0f738b2a6ce37ee47fbd4407ccc Mon Sep 17 00:00:00 2001 From: MaelleTtrt <48910568+MaelleTtrt@users.noreply.github.com> Date: Fri, 9 Jan 2026 10:38:51 +0100 Subject: [PATCH 03/33] fix_ruff2 --- src/post_processing/dataclass/data_aplose.py | 2 +- src/post_processing/utils/core_utils.py | 10 ++++---- src/post_processing/utils/plot_utils.py | 16 +++++++++---- tests/conftest.py | 2 -- tests/test_DataAplose.py | 9 +++++--- tests/test_core_utils.py | 19 +++++++++------- tests/test_filtering_utils.py | 24 +++++++++++--------- tests/test_glider_utils.py | 2 +- tests/test_metric_utils.py | 3 ++- tests/test_plot_utils.py | 6 ++--- 10 files changed, 52 insertions(+), 41 deletions(-) diff --git a/src/post_processing/dataclass/data_aplose.py b/src/post_processing/dataclass/data_aplose.py index 76b6d98..3c8d1e4 100644 --- a/src/post_processing/dataclass/data_aplose.py +++ b/src/post_processing/dataclass/data_aplose.py @@ -408,7 +408,7 @@ def plot( color=color, season=season, effort=effort, - coordinates=(self.lat, self.lon) + coordinates=(self.lat, self.lon), ) if mode == "heatmap": diff --git a/src/post_processing/utils/core_utils.py b/src/post_processing/utils/core_utils.py index 5a831e1..9457ef2 100644 --- a/src/post_processing/utils/core_utils.py +++ b/src/post_processing/utils/core_utils.py @@ -11,7 +11,7 @@ from astral.sun import sunrise, sunset from matplotlib import pyplot as plt from osekit.config import TIMESTAMP_FORMAT_AUDIO_FILE -from osekit.utils.timestamp_utils import strptime_from_text, strftime_osmose_format +from osekit.utils.timestamp_utils import strftime_osmose_format, strptime_from_text from pandas import ( DataFrame, DatetimeIndex, @@ -255,7 +255,6 @@ def add_weak_detection( new_line.append(np.nan) df.loc[df.index.max() + 1] = new_line - return df.sort_values(by=["start_datetime", "annotator"]).reset_index(drop=True) @@ -509,11 +508,10 @@ def get_time_range_and_bin_size( if isinstance(bin_size, Timedelta): return timestamp_range, bin_size - elif isinstance(bin_size, BaseOffset): + if isinstance(bin_size, BaseOffset): return timestamp_range, timestamp_range[1] - timestamp_range[0] - else: - msg = "bin_size must be a Timedelta or BaseOffset." - raise TypeError(msg) + msg = "bin_size must be a Timedelta or BaseOffset." 
+ raise TypeError(msg) def round_begin_end_timestamps( diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index 2746a85..b6f566e 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -13,7 +13,15 @@ from matplotlib.dates import num2date from matplotlib.ticker import PercentFormatter from numpy import ceil, histogram, polyfit -from pandas import DataFrame, DatetimeIndex, Index, Timedelta, Timestamp, date_range, Series +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + Series, + Timedelta, + Timestamp, + date_range, +) from pandas.tseries import frequencies from scipy.stats import pearsonr from seaborn import scatterplot @@ -28,11 +36,10 @@ timedelta_to_str, ) from post_processing.utils.filtering_utils import ( + filter_by_annotator, get_max_time, get_timezone, - filter_by_annotator, ) -from post_processing.utils.metrics_utils import normalize_counts_by_effort if TYPE_CHECKING: from datetime import tzinfo @@ -110,7 +117,6 @@ def histo( # if effort: # normalize_counts_by_effort(df, effort, time_bin) - n_groups = len(labels) if legend_labels else 1 bar_width = bin_size / n_groups bin_starts = mdates.date2num(df.index) @@ -141,7 +147,7 @@ def histo( f" - bin size: {bin_size_str})" ) ax.set_ylabel(y_label) - #set_y_axis_to_percentage(ax) if effort else set_dynamic_ylim(ax, df) + # set_y_axis_to_percentage(ax) if effort else set_dynamic_ylim(ax, df) set_plot_title(ax, annotators, labels) ax.set_xlim(begin, end) diff --git a/tests/conftest.py b/tests/conftest.py index e03bf43..99ff7ef 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -122,8 +122,6 @@ """ - - STATUS = """dataset,filename,ann1,ann2,ann3,ann4,ann5,ann6 sample_dataset,2025_01_25_06_20_00,FINISHED,FINISHED,FINISHED,FINISHED,FINISHED,FINISHED sample_dataset,2025_01_25_06_20_10,FINISHED,FINISHED,FINISHED,FINISHED,FINISHED,FINISHED diff --git a/tests/test_DataAplose.py b/tests/test_DataAplose.py index 5ad1b04..9b9516c 100644 --- a/tests/test_DataAplose.py +++ b/tests/test_DataAplose.py @@ -19,6 +19,7 @@ def test_data_aplose_init(sample_df: DataFrame) -> None: assert data.begin == sample_df["start_datetime"].min() assert data.end == sample_df["end_datetime"].max() + def test_filter_df_single_pair(sample_df: DataFrame) -> None: data = DataAplose(sample_df) filtered_data = data.filter_df(annotator="ann1", label="lbl1") @@ -30,17 +31,19 @@ def test_filter_df_single_pair(sample_df: DataFrame) -> None: ].reset_index(drop=True) assert filtered_data.equals(expected) + def test_change_tz(sample_df: DataFrame) -> None: data = DataAplose(sample_df) - new_tz = 'Etc/GMT-7' + new_tz = "Etc/GMT-7" data.change_tz(new_tz) - start_dt = data.df['start_datetime'] - end_dt = data.df['end_datetime'] + start_dt = data.df["start_datetime"] + end_dt = data.df["end_datetime"] assert all(ts.tz.zone == new_tz for ts in start_dt), f"The detection start timestamps have to be in {new_tz} timezone" assert all(ts.tz.zone == new_tz for ts in end_dt), f"The detection end timestamps have to be in {new_tz} timezone" assert data.begin.tz.zone == new_tz, f"The begin value of the DataAplose has to be in {new_tz} timezone" assert data.end.tz.zone == new_tz, f"The end value of the DataAplose has to be in {new_tz} timezone" + def test_filter_df_multiple_pairs(sample_df: DataFrame) -> None: data = DataAplose(sample_df) filtered_data = data.filter_df(annotator=["ann1", "ann2"], label=["lbl1", "lbl2"]) diff --git a/tests/test_core_utils.py b/tests/test_core_utils.py index 
a1a3d73..e72e482 100644 --- a/tests/test_core_utils.py +++ b/tests/test_core_utils.py @@ -8,6 +8,8 @@ from post_processing.dataclass.data_aplose import DataAplose from post_processing.utils.core_utils import ( + add_recording_period, + add_season_period, add_weak_detection, get_coordinates, get_count, @@ -15,13 +17,11 @@ get_season, get_sun_times, get_time_range_and_bin_size, + json2df, localize_timestamps, round_begin_end_timestamps, - timedelta_to_str, - add_season_period, - add_recording_period, set_bar_height, - json2df, + timedelta_to_str, ) @@ -409,10 +409,11 @@ def test_add_season_no_data() -> None: # %% add_recording_period + def test_add_recording_period_valid() -> None: fig, ax = plt.subplots() start = Timestamp("2025-01-01T00:00:00+00:00") - stop = Timestamp("2025-01-02T00:00:00+00:00") + stop = Timestamp("2025-01-02T00:00:00+00:00") ts = date_range(start=start, end=stop, freq="H", tz="UTC") values = list(range(len(ts))) @@ -423,7 +424,7 @@ def test_add_recording_period_valid() -> None: [ Timestamp("2025-01-01T00:00:00+00:00"), Timestamp("2025-01-02T00:00:00+00:00"), - ] + ], ], columns=["deployment_date", "recovery_date"], ) @@ -438,6 +439,7 @@ def test_add_recording_period_no_data() -> None: # %% set_bar_height + def test_set_bar_height_valid() -> None: fig, ax = plt.subplots() start = Timestamp("2025-01-01T00:00:00+00:00") @@ -457,6 +459,7 @@ def test_set_bar_height_no_data() -> None: # %% json2df + def test_json2df_valid(tmp_path): fake_json = { "deployment_date": "2025-01-01T00:00:00+00:00", @@ -474,9 +477,9 @@ def test_json2df_valid(tmp_path): [ Timestamp("2025-01-01T00:00:00+00:00"), Timestamp("2025-01-02T00:00:00+00:00"), - ] + ], ], columns=["deployment_date", "recovery_date"], ) - assert df.equals(expected) \ No newline at end of file + assert df.equals(expected) diff --git a/tests/test_filtering_utils.py b/tests/test_filtering_utils.py index 95fd987..3ec3760 100644 --- a/tests/test_filtering_utils.py +++ b/tests/test_filtering_utils.py @@ -77,7 +77,7 @@ def test_find_delimiter_unsupported_delimiter(tmp_path: Path) -> None: with pytest.raises( ValueError, - match=r"unsupported delimiter '&'" + match=r"unsupported delimiter '&'", ): find_delimiter(file) @@ -199,6 +199,7 @@ def test_filter_by_freq_valid(sample_df: DataFrame, f_min, f_max): if f_max is not None: assert (result["end_frequency"] <= f_max).all() + @pytest.mark.parametrize( "f_min, f_max, expected_msg", [ @@ -216,8 +217,6 @@ def test_filter_by_freq_valid(sample_df: DataFrame, f_min, f_max): ), ], ) - - def test_filter_by_freq_out_of_range(sample_df: DataFrame, f_min, f_max, expected_msg): with pytest.raises(ValueError, match=expected_msg): filter_by_freq(sample_df, f_min=f_min, f_max=f_max) @@ -331,7 +330,7 @@ def test_get_timezone_several(sample_df: DataFrame) -> None: } sample_df = concat( [sample_df, DataFrame([new_row])], - ignore_index=False + ignore_index=False, ) tz = get_timezone(sample_df) assert len(tz) == 2 @@ -340,6 +339,7 @@ def test_get_timezone_several(sample_df: DataFrame) -> None: # %% read DataFrame + def test_read_dataframe_comma_delimiter(tmp_path: Path) -> None: csv_file = tmp_path / "test.csv" csv_file.write_text( @@ -417,7 +417,7 @@ def test_no_timebin_several_tz(sample_df: DataFrame) -> None: } sample_df = concat( [sample_df, DataFrame([new_row])], - ignore_index=False + ignore_index=False, ) timestamp_wav = to_datetime(sample_df["filename"], format="%Y_%m_%d_%H_%M_%S").dt.tz_localize(pytz.UTC) @@ -429,7 +429,7 @@ def test_no_timebin_original_timebin(sample_df: DataFrame) -> None: tz = 
get_timezone(sample_df) timestamp_wav = to_datetime( sample_df["filename"], - format="%Y_%m_%d_%H_%M_%S" + format="%Y_%m_%d_%H_%M_%S", ).dt.tz_localize(tz) df_out = reshape_timebin( sample_df, @@ -520,7 +520,7 @@ def test_simple_reshape_hourly(sample_df: DataFrame) -> None: tz = get_timezone(sample_df) timestamp_wav = to_datetime( sample_df["filename"], - format="%Y_%m_%d_%H_%M_%S" + format="%Y_%m_%d_%H_%M_%S", ).dt.tz_localize(tz) df_out = reshape_timebin( sample_df, @@ -538,7 +538,7 @@ def test_reshape_daily_multiple_bins(sample_df: DataFrame) -> None: tz = get_timezone(sample_df) timestamp_wav = to_datetime( sample_df["filename"], - format="%Y_%m_%d_%H_%M_%S" + format="%Y_%m_%d_%H_%M_%S", ).dt.tz_localize(tz) df_out = reshape_timebin(sample_df, timestamp_audio=timestamp_wav, timebin_new=Timedelta(days=1)) assert not df_out.empty @@ -555,7 +555,7 @@ def test_with_manual_timestamps_vector(sample_df: DataFrame) -> None: df_out = reshape_timebin( sample_df, timestamp_audio=timestamp_wav, - timebin_new=Timedelta(hours=1) + timebin_new=Timedelta(hours=1), ) assert not df_out.empty @@ -589,6 +589,7 @@ def test_ensure_no_invalid_with_elements() -> None: assert "bar" in str(exc_info.value) assert "columns" in str(exc_info.value) + def test_ensure_no_invalid_single_element() -> None: invalid_items = ["baz"] with pytest.raises(ValueError) as exc_info: @@ -598,6 +599,7 @@ def test_ensure_no_invalid_single_element() -> None: # %% intersection / union + def test_intersection(sample_df) -> None: df_result = intersection_or_union(sample_df[sample_df["annotator"].isin(["ann1", "ann2"])], user_sel="intersection") @@ -628,7 +630,7 @@ def test_not_enough_annotators_raises() -> None: "annotation": ["cat"], "start_datetime": to_datetime(["2025-01-01 10:00"]), "end_datetime": to_datetime(["2025-01-01 10:01"]), - "annotator": ["A"] + "annotator": ["A"], }) with pytest.raises(ValueError, match="Not enough annotators detected"): - intersection_or_union(df_single_annotator, user_sel="intersection") \ No newline at end of file + intersection_or_union(df_single_annotator, user_sel="intersection") diff --git a/tests/test_glider_utils.py b/tests/test_glider_utils.py index 12d83df..d0247c5 100644 --- a/tests/test_glider_utils.py +++ b/tests/test_glider_utils.py @@ -56,7 +56,7 @@ def test_get_position_from_timestamp(nav_df: DataFrame) -> None: def test_plot_detections_with_nav_data( df_detections: DataFrame, - nav_df: DataFrame + nav_df: DataFrame, ) -> None: plot_detections_with_nav_data( df=df_detections, diff --git a/tests/test_metric_utils.py b/tests/test_metric_utils.py index 34ce769..35717e7 100644 --- a/tests/test_metric_utils.py +++ b/tests/test_metric_utils.py @@ -3,6 +3,7 @@ from post_processing.utils.metrics_utils import detection_perf + def test_detection_perf(sample_df: DataFrame) -> None: try: detection_perf(df=sample_df[sample_df["annotator"].isin(["ann1", "ann4"])], ref=("ann1", "lbl1")) @@ -12,4 +13,4 @@ def test_detection_perf(sample_df: DataFrame) -> None: def test_detection_perf_one_annotator(sample_df: DataFrame) -> None: with pytest.raises(ValueError, match="Two annotators needed"): - detection_perf(df=sample_df[sample_df["annotator"] == "ann1"], ref=("ann1", "lbl1")) \ No newline at end of file + detection_perf(df=sample_df[sample_df["annotator"] == "ann1"], ref=("ann1", "lbl1")) diff --git a/tests/test_plot_utils.py b/tests/test_plot_utils.py index d7392cf..e7a389c 100644 --- a/tests/test_plot_utils.py +++ b/tests/test_plot_utils.py @@ -4,10 +4,10 @@ from numpy import arange, testing from 
post_processing.utils.plot_utils import ( - overview, _wrap_xtick_labels, - set_y_axis_to_percentage, get_legend, + overview, + set_y_axis_to_percentage, ) @@ -103,4 +103,4 @@ def test_lists_and_strings_combined(): labels = ["Label1", "Label2"] result = get_legend(annotators, labels) expected = ["Alice\nLabel1", "Bob\nLabel2"] - assert result == expected \ No newline at end of file + assert result == expected From b2ebfd71485c864ea7f3f5fd5a7f526e1d43d5ce Mon Sep 17 00:00:00 2001 From: MaelleTtrt <48910568+MaelleTtrt@users.noreply.github.com> Date: Fri, 9 Jan 2026 13:50:54 +0100 Subject: [PATCH 04/33] adapt RecordingPeriod class --- .../dataclass/recording_period.py | 98 +++++++++++++++---- 1 file changed, 81 insertions(+), 17 deletions(-) diff --git a/src/post_processing/dataclass/recording_period.py b/src/post_processing/dataclass/recording_period.py index 901d932..11524e6 100644 --- a/src/post_processing/dataclass/recording_period.py +++ b/src/post_processing/dataclass/recording_period.py @@ -24,6 +24,8 @@ @dataclass(frozen=True) class RecordingPeriod: + """Represents recording effort over time, aggregated into bins.""" + counts: Series timebin_origin: Timedelta @@ -34,33 +36,91 @@ def from_path( *, bin_size: Timedelta | BaseOffset, ) -> RecordingPeriod: - """Vectorized creation of recording coverage from CSV with start/end datetime. + """Vectorized creation of recording coverage from CSV with start/end datetimes. + + This method reads a CSV with columns: + - 'start_recording' + - 'end_recording' + - 'start_deployment' + - 'end_deployment' + + It computes the **effective recording interval** as the intersection between + recording and deployment periods, builds a fine-grained timeline at + `timebin_origin` resolution, and aggregates effort into `bin_size` bins. + + Parameters + ---------- + config + Configuration object containing at least: + - `timestamp_file`: path to CSV + - `timebin_origin`: Timedelta resolution of detections + bin_size : Timedelta or BaseOffset + Size of the aggregation bin (e.g., pd.Timedelta("1H") or "1D"). + + Returns + ------- + RecordingPeriod + Object containing `counts` (Series indexed by IntervalIndex) and + `timebin_origin`. - CSV must have columns 'start_recording' and 'end_recording'. - bin_size can be a Timedelta (e.g., pd.Timedelta("1H")) - or a pandas offset (e.g., "1D"). """ - # 1. Read CSV and parse datetime + # 1. Read CSV and parse datetime columns timestamp_file = config.timestamp_file delim = find_delimiter(timestamp_file) df = pd.read_csv( config.timestamp_file, - parse_dates=["start_recording", "end_recording"], + parse_dates=[ + "start_recording", + "end_recording", + "start_deployment", + "end_deployment", + ], delimiter=delim, ) if df.empty: raise ValueError("CSV is empty.") - # 2. Normalize timezones if needed - df["start_recording"] = ( - pd.to_datetime(df["start_recording"], utc=True).dt.tz_convert(None) - ) - df["end_recording"] = ( - pd.to_datetime(df["end_recording"], utc=True).dt.tz_convert(None) - ) + # Ensure all required columns are present + required_columns = { + "start_recording", + "end_recording", + "start_deployment", + "end_deployment", + } + + missing = required_columns - set(df.columns) + + if missing: + raise ValueError( + f"CSV is missing required columns: {', '.join(sorted(missing))}", + ) + + # 2. 
Normalize timezones: convert to UTC, then remove tz info (naive) + for col in [ + "start_recording", + "end_recording", + "start_deployment", + "end_deployment", + ]: + df[col] = pd.to_datetime(df[col], utc=True).dt.tz_convert(None) + + # 3. Compute effective recording intervals (intersection) + df["start_recording"] = df[ + ["start_recording", "start_deployment"] + ].max(axis=1) + + df["end_recording"] = df[ + ["end_recording", "end_deployment"] + ].min(axis=1) + + # Remove rows with no actual recording interval + df = df.loc[df["start_recording"] < df["end_recording"]].copy() - # Build fine-grained timeline (timebin_origin resolution) + if df.empty: + raise ValueError("No valid recording intervals after deployment intersection.") + + # 4. Build fine-grained timeline at `timebin_origin` resolution origin = config.timebin_origin time_index = pd.date_range( start=df["start_recording"].min(), @@ -68,19 +128,23 @@ def from_path( freq=origin, ) - # Initialize effort vector + # Initialize effort vector (0 = no recording, 1 = recording) + # Compare each timestamp to all intervals in a vectorized manner effort = pd.Series(0, index=time_index) - # Vectorized interval coverage + # 5. Vectorized interval coverage tvals = time_index.values[:, None] start_vals = df["start_recording"].values end_vals = df["end_recording"].values + # Boolean matrix: True if timestamp is within any recording interval covered = (tvals >= start_vals) & (tvals < end_vals) effort[:] = covered.any(axis=1).astype(int) - # Aggregate effort into bin_size + # 6. Aggregate effort into user-defined bin_size counts = effort.resample(bin_size).sum() + + # Replace index with IntervalIndex for downstream compatibility counts.index = pd.interval_range( start=counts.index[0], periods=len(counts), From f74587e880facfb01d71bb350d11b24b43bd5703 Mon Sep 17 00:00:00 2001 From: MaelleTtrt <48910568+MaelleTtrt@users.noreply.github.com> Date: Mon, 12 Jan 2026 15:37:47 +0100 Subject: [PATCH 05/33] add test_recording_preiod --- tests/conftest.py | 28 ++++++++++++++- tests/test_recording_period.py | 63 ++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 tests/test_recording_period.py diff --git a/tests/conftest.py b/tests/conftest.py index 99ff7ef..9a26f97 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,7 +7,7 @@ import soundfile as sf import yaml from osekit.utils.timestamp_utils import strftime_osmose_format -from pandas import DataFrame, read_csv +from pandas import DataFrame, Timedelta, read_csv SAMPLE = """dataset,filename,start_time,end_time,start_frequency,end_frequency,annotation,annotator,start_datetime,end_datetime,type,score sample_dataset,2025_01_25_06_20_00,0.0,10.0,0.0,72000.0,lbl2,ann2,2025-01-25T06:20:00.000+00:00,2025-01-25T06:20:10.000+00:00,WEAK,0.11 @@ -132,6 +132,14 @@ sample_dataset,2025_01_26_06_20_20,FINISHED,FINISHED,FINISHED,FINISHED,FINISHED,FINISHED """ +# --------------------------------------------------------------------------- +# Fake recording planning CSV used for tests +# --------------------------------------------------------------------------- +RECORDING_PLANNING_CSV = """start_recording,end_recording,start_deployment,end_deployment +2024-01-01 00:00:00+0000,2024-04-09 02:00:00+0000,2024-01-02 00:00:00+0000,2024-04-30 02:00:00+0000 +2024-04-30 01:00:00+0000,2024-07-03 06:00:00+0000,2024-04-09 04:00:00+0000,2024-07-14 14:00:00+0000 +""" + @pytest.fixture def sample_df() -> DataFrame: @@ -226,3 +234,21 @@ def create_file(path: Path, size: int = 
2048): create_file(nested / "file4.wav") (tmp_path / "ignore.txt").write_text("not audio") return tmp_path + + +@pytest.fixture +def recording_planning_csv(tmp_path) -> Path: + """Create a temporary CSV file simulating a recording planning.""" + path = tmp_path / "recording_planning.csv" + path.write_text(RECORDING_PLANNING_CSV) + return path + + +@pytest.fixture +def recording_planning_config(recording_planning_csv): + """Minimal config object compatible with RecordingPeriod.from_path.""" + class RecordingPlanningConfig: + timestamp_file: Path = recording_planning_csv + timebin_origin = Timedelta("1min") + + return RecordingPlanningConfig() diff --git a/tests/test_recording_period.py b/tests/test_recording_period.py new file mode 100644 index 0000000..12464e7 --- /dev/null +++ b/tests/test_recording_period.py @@ -0,0 +1,63 @@ +import pandas as pd + +from post_processing.dataclass.recording_period import RecordingPeriod + + +def test_recording_period_with_gaps(recording_planning_config): + """RecordingPeriod correctly represents long gaps with no recording effort. + + The planning contains two recording blocks separated by ~3 weeks with no + recording at all. Weekly aggregation must reflect: + - weeks with full effort, + - weeks with partial effort, + - weeks with zero effort. + """ + recording_period = RecordingPeriod.from_path( + config=recording_planning_config, + bin_size="1W", + ) + + counts = recording_period.counts + + # ------------------------------------------------------------------ + # Structural checks + # ------------------------------------------------------------------ + assert not counts.empty + assert counts.index.is_interval() + assert counts.min() >= 0 + + # One week = 7 * 24 hours (origin = 1min) + full_week_minutes = 7 * 24 * 60 + + # ------------------------------------------------------------------ + # Helper: find the bin covering a given timestamp + # ------------------------------------------------------------------ + def bin_covering(ts: pd.Timestamp) -> pd.Interval: + for interval in counts.index: + if interval.left <= ts < interval.right: + return interval + raise AssertionError(f"No bin covers timestamp {ts}") + + # ------------------------------------------------------------------ + # Week fully inside the long gap → zero effort + # ------------------------------------------------------------------ + gap_ts = pd.Timestamp("2024-04-21") + + gap_bin = bin_covering(gap_ts) + assert counts.loc[gap_bin] == 0 + + # ------------------------------------------------------------------ + # Week fully inside recording → full effort + # ------------------------------------------------------------------ + full_effort_ts = pd.Timestamp("2024-02-04") + + full_bin = bin_covering(full_effort_ts) + assert counts.loc[full_bin] == full_week_minutes + + # ------------------------------------------------------------------ + # Week overlapping recording stop → partial effort + # ------------------------------------------------------------------ + partial_ts = pd.Timestamp("2024-04-14") + + partial_bin = bin_covering(partial_ts) + assert counts.loc[partial_bin] == 1560 From c1600bf894333969ca8510417f304d8c9fa49437 Mon Sep 17 00:00:00 2001 From: MaelleTtrt <48910568+MaelleTtrt@users.noreply.github.com> Date: Mon, 12 Jan 2026 17:15:20 +0100 Subject: [PATCH 06/33] add test_shade_no_effort --- tests/conftest.py | 7 +++--- tests/test_plot_utils.py | 40 ++++++++++++++++++++++++++++++++++ tests/test_recording_period.py | 3 ++- 3 files changed, 46 insertions(+), 4 deletions(-) diff --git 
a/tests/conftest.py b/tests/conftest.py index 9a26f97..a6299e3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,7 +7,8 @@ import soundfile as sf import yaml from osekit.utils.timestamp_utils import strftime_osmose_format -from pandas import DataFrame, Timedelta, read_csv +from pandas import DataFrame, read_csv +from pandas.tseries import frequencies SAMPLE = """dataset,filename,start_time,end_time,start_frequency,end_frequency,annotation,annotator,start_datetime,end_datetime,type,score sample_dataset,2025_01_25_06_20_00,0.0,10.0,0.0,72000.0,lbl2,ann2,2025-01-25T06:20:00.000+00:00,2025-01-25T06:20:10.000+00:00,WEAK,0.11 @@ -137,7 +138,7 @@ # --------------------------------------------------------------------------- RECORDING_PLANNING_CSV = """start_recording,end_recording,start_deployment,end_deployment 2024-01-01 00:00:00+0000,2024-04-09 02:00:00+0000,2024-01-02 00:00:00+0000,2024-04-30 02:00:00+0000 -2024-04-30 01:00:00+0000,2024-07-03 06:00:00+0000,2024-04-09 04:00:00+0000,2024-07-14 14:00:00+0000 +2024-04-30 01:00:00+0000,2024-07-14 06:00:00+0000,2024-04-30 02:00:00+0000,2024-07-06 14:00:00+0000 """ @@ -249,6 +250,6 @@ def recording_planning_config(recording_planning_csv): """Minimal config object compatible with RecordingPeriod.from_path.""" class RecordingPlanningConfig: timestamp_file: Path = recording_planning_csv - timebin_origin = Timedelta("1min") + timebin_origin = frequencies.to_offset("1min") return RecordingPlanningConfig() diff --git a/tests/test_plot_utils.py b/tests/test_plot_utils.py index e7a389c..37c2243 100644 --- a/tests/test_plot_utils.py +++ b/tests/test_plot_utils.py @@ -1,13 +1,19 @@ +from unittest.mock import MagicMock + import matplotlib.pyplot as plt import pytest from matplotlib.ticker import PercentFormatter from numpy import arange, testing +from pandas import Series, Timedelta, to_datetime +from pandas.tseries import frequencies +from post_processing.dataclass.recording_period import RecordingPeriod from post_processing.utils.plot_utils import ( _wrap_xtick_labels, get_legend, overview, set_y_axis_to_percentage, + shade_no_effort, ) @@ -104,3 +110,37 @@ def test_lists_and_strings_combined(): result = get_legend(annotators, labels) expected = ["Alice\nLabel1", "Bob\nLabel2"] assert result == expected + + +def test_shade_no_effort_from_recording_planning(recording_planning_config): + """shade_no_effort shades contiguous zero-effort periods.""" + + def count_contiguous_zero_segments(effort: Series) -> int: + """Return number of contiguous zero-effort segments.""" + is_zero = effort == 0 + return ((is_zero != is_zero.shift(fill_value=False)) & is_zero).sum() + + recording_period = RecordingPeriod.from_path( + config=recording_planning_config, + bin_size=frequencies.to_offset("1W"), + ) + + counts = recording_period.counts + + bin_starts = to_datetime( + [interval.left for interval in counts.index], + ) + + fig, ax = plt.subplots() + ax.axvspan = MagicMock() + + shade_no_effort( + ax=ax, + bin_starts=bin_starts, + observed=recording_period, + bar_width=Timedelta("7D"), + ) + + expected_spans = count_contiguous_zero_segments(counts) + + assert ax.axvspan.call_count == expected_spans diff --git a/tests/test_recording_period.py b/tests/test_recording_period.py index 12464e7..41b9e59 100644 --- a/tests/test_recording_period.py +++ b/tests/test_recording_period.py @@ -1,4 +1,5 @@ import pandas as pd +from pandas.tseries import frequencies from post_processing.dataclass.recording_period import RecordingPeriod @@ -14,7 +15,7 @@ def 
test_recording_period_with_gaps(recording_planning_config): """ recording_period = RecordingPeriod.from_path( config=recording_planning_config, - bin_size="1W", + bin_size=frequencies.to_offset("1W"), ) counts = recording_period.counts From 0e22298596aeca66821bc6bbca16df72231f40fa Mon Sep 17 00:00:00 2001 From: MaelleTtrt <48910568+MaelleTtrt@users.noreply.github.com> Date: Tue, 13 Jan 2026 09:47:44 +0100 Subject: [PATCH 07/33] delete useluss in shade_no_effort --- src/post_processing/utils/plot_utils.py | 26 ------------------------- 1 file changed, 26 deletions(-) diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index b6f566e..75c421f 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -708,32 +708,6 @@ def shade_no_effort( zorder=0, ) - # Outside data coverage - x_min, x_max = ax.get_xlim() - data_min = mdates.date2num(bin_starts[0] - bar_width) - data_max = mdates.date2num(bin_starts[-1] + bar_width) - - if x_min < data_min: - ax.axvspan( - x_min, - data_min, - facecolor="0.45", - alpha=0.15, - linewidth=0, - zorder=0, - ) - if x_max > data_max: - ax.axvspan( - data_max, - x_max, - facecolor="0.45", - alpha=0.15, - linewidth=0, - zorder=0, - ) - - ax.set_xlim(x_min, x_max) - def add_sunrise_sunset(ax: Axes, lat: float, lon: float, tz: tzinfo) -> None: """Display sunrise/sunset times on plot.""" From e889c29cc52a864f29363257128b9f9dd55e2a59 Mon Sep 17 00:00:00 2001 From: MaelleTtrt <48910568+MaelleTtrt@users.noreply.github.com> Date: Tue, 13 Jan 2026 10:03:01 +0100 Subject: [PATCH 08/33] delete test shading_no_effort --- tests/test_plot_utils.py | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/tests/test_plot_utils.py b/tests/test_plot_utils.py index 37c2243..49aff37 100644 --- a/tests/test_plot_utils.py +++ b/tests/test_plot_utils.py @@ -112,35 +112,3 @@ def test_lists_and_strings_combined(): assert result == expected -def test_shade_no_effort_from_recording_planning(recording_planning_config): - """shade_no_effort shades contiguous zero-effort periods.""" - - def count_contiguous_zero_segments(effort: Series) -> int: - """Return number of contiguous zero-effort segments.""" - is_zero = effort == 0 - return ((is_zero != is_zero.shift(fill_value=False)) & is_zero).sum() - - recording_period = RecordingPeriod.from_path( - config=recording_planning_config, - bin_size=frequencies.to_offset("1W"), - ) - - counts = recording_period.counts - - bin_starts = to_datetime( - [interval.left for interval in counts.index], - ) - - fig, ax = plt.subplots() - ax.axvspan = MagicMock() - - shade_no_effort( - ax=ax, - bin_starts=bin_starts, - observed=recording_period, - bar_width=Timedelta("7D"), - ) - - expected_spans = count_contiguous_zero_segments(counts) - - assert ax.axvspan.call_count == expected_spans From 2349713661ee44fd2bfa84e2a5a7ac7ff1d051d2 Mon Sep 17 00:00:00 2001 From: MaelleTtrt <48910568+MaelleTtrt@users.noreply.github.com> Date: Tue, 13 Jan 2026 10:04:58 +0100 Subject: [PATCH 09/33] fix ruff --- src/post_processing/dataclass/recording_period.py | 15 ++++++++------- tests/test_plot_utils.py | 7 ------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/src/post_processing/dataclass/recording_period.py b/src/post_processing/dataclass/recording_period.py index 11524e6..32319fd 100644 --- a/src/post_processing/dataclass/recording_period.py +++ b/src/post_processing/dataclass/recording_period.py @@ -106,16 +106,16 @@ def from_path( df[col] = 
pd.to_datetime(df[col], utc=True).dt.tz_convert(None) # 3. Compute effective recording intervals (intersection) - df["start_recording"] = df[ + df["effective_start_recording"] = df[ ["start_recording", "start_deployment"] ].max(axis=1) - df["end_recording"] = df[ + df["effective_end_recording"] = df[ ["end_recording", "end_deployment"] ].min(axis=1) # Remove rows with no actual recording interval - df = df.loc[df["start_recording"] < df["end_recording"]].copy() + df = df.loc[df["effective_start_recording"] < df["effective_end_recording"]].copy() if df.empty: raise ValueError("No valid recording intervals after deployment intersection.") @@ -123,8 +123,8 @@ def from_path( # 4. Build fine-grained timeline at `timebin_origin` resolution origin = config.timebin_origin time_index = pd.date_range( - start=df["start_recording"].min(), - end=df["end_recording"].max(), + start=df["effective_start_recording"].min(), + end=df["effective_end_recording"].max(), freq=origin, ) @@ -134,8 +134,8 @@ def from_path( # 5. Vectorized interval coverage tvals = time_index.values[:, None] - start_vals = df["start_recording"].values - end_vals = df["end_recording"].values + start_vals = df["effective_start_recording"].values + end_vals = df["effective_end_recording"].values # Boolean matrix: True if timestamp is within any recording interval covered = (tvals >= start_vals) & (tvals < end_vals) @@ -151,4 +151,5 @@ def from_path( freq=bin_size, closed="left", ) + return cls(counts=counts, timebin_origin=origin) diff --git a/tests/test_plot_utils.py b/tests/test_plot_utils.py index 49aff37..ffabbbd 100644 --- a/tests/test_plot_utils.py +++ b/tests/test_plot_utils.py @@ -1,19 +1,14 @@ -from unittest.mock import MagicMock import matplotlib.pyplot as plt import pytest from matplotlib.ticker import PercentFormatter from numpy import arange, testing -from pandas import Series, Timedelta, to_datetime -from pandas.tseries import frequencies -from post_processing.dataclass.recording_period import RecordingPeriod from post_processing.utils.plot_utils import ( _wrap_xtick_labels, get_legend, overview, set_y_axis_to_percentage, - shade_no_effort, ) @@ -110,5 +105,3 @@ def test_lists_and_strings_combined(): result = get_legend(annotators, labels) expected = ["Alice\nLabel1", "Bob\nLabel2"] assert result == expected - - From 77f71d1872e056186a071652056d980fcfeb5739 Mon Sep 17 00:00:00 2001 From: MaelleTtrt <48910568+MaelleTtrt@users.noreply.github.com> Date: Tue, 13 Jan 2026 10:34:28 +0100 Subject: [PATCH 10/33] delete matlab boring files --- .../PAMGuardMatlab-main/.DS_Store | Bin 6148 -> 0 bytes .../PAMGuardMatlab-main/.MATLABDriveTag | 1 - .../PAMGuardMatlab-main/pgmatlab/.MATLABDriveTag | 1 - .../pgmatlab/Array/.MATLABDriveTag | 1 - 4 files changed, 3 deletions(-) delete mode 100644 src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/.DS_Store delete mode 100644 src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/.MATLABDriveTag delete mode 100644 src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/pgmatlab/.MATLABDriveTag delete mode 100644 src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/pgmatlab/Array/.MATLABDriveTag diff --git a/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/.DS_Store b/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/.DS_Store deleted file mode 100644 index 
8cbbca90b6e5166f54b08e62aaab675fe94f04f0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}T>S5Z-O8-BN@c6nb3nTCkQQl)(EZ>xnykjo@uf;KKT0MGoe+f+2zj`Rl2EONYMF$o&h?B#G(=p{2Emc4#&vuTKv)3b}~@k=~Q)SDrb1MganEEez%%1&Lc-Xe)rG6i3eSHu$% z1H=F^Kn&~;1NICM&HX8vDkTPpfgdt}`-6mr=vvGT>a7Dlczs5H4G{%&d`looi>}4o zAb3EyNd+{i+&(e5Ne8>MajwPOph;(3&kW<(nTy8@*Rz9N>Tt$ggY+c^h=FYes;2AU z`F{n!Oz9(kJB51005R~-7~r*$HyT1w_H5lL56@Z&?FkwR#^tDhfWCDJfDY~>E$uXZ bi8{o&7IT9*3fgr#AYBABA@m^zeu056q9RP< diff --git a/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/.MATLABDriveTag b/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/.MATLABDriveTag deleted file mode 100644 index 84059a2..0000000 --- a/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/.MATLABDriveTag +++ /dev/null @@ -1 +0,0 @@ -3496f669-9381-4974-bb7c-5cc1ddcb05d4 \ No newline at end of file diff --git a/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/pgmatlab/.MATLABDriveTag b/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/pgmatlab/.MATLABDriveTag deleted file mode 100644 index df9fcd4..0000000 --- a/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/pgmatlab/.MATLABDriveTag +++ /dev/null @@ -1 +0,0 @@ -ee53bc03-ef5e-44bc-aea4-8fae1e2a0b9f \ No newline at end of file diff --git a/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/pgmatlab/Array/.MATLABDriveTag b/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/pgmatlab/Array/.MATLABDriveTag deleted file mode 100644 index 656b51a..0000000 --- a/src/post_processing/utils/matlab_utils/PAMGuardMatlab-main/PAMGuardMatlab-main/pgmatlab/Array/.MATLABDriveTag +++ /dev/null @@ -1 +0,0 @@ -c4dd0a9d-e15d-496f-91ff-d9ff561a4fa0 \ No newline at end of file From e8c73e074eff7de6719cc3150c867865ee2ff775 Mon Sep 17 00:00:00 2001 From: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com> Date: Wed, 14 Jan 2026 16:15:46 +0100 Subject: [PATCH 11/33] few modifs --- .../dataclass/detection_filter.py | 10 ++- .../dataclass/recording_period.py | 62 ++++++++++--------- src/post_processing/utils/filtering_utils.py | 22 ++++--- src/post_processing/utils/plot_utils.py | 2 +- 4 files changed, 55 insertions(+), 41 deletions(-) diff --git a/src/post_processing/dataclass/detection_filter.py b/src/post_processing/dataclass/detection_filter.py index d636c4c..b2d2599 100644 --- a/src/post_processing/dataclass/detection_filter.py +++ b/src/post_processing/dataclass/detection_filter.py @@ -7,9 +7,9 @@ from __future__ import annotations -from dataclasses import dataclass +from dataclasses import dataclass, fields from pathlib import Path -from typing import TYPE_CHECKING, Literal +from typing import TYPE_CHECKING, Literal, Any import yaml from pandas import Timedelta, Timestamp @@ -44,6 +44,12 @@ class DetectionFilter: box: bool = False filename_format: str = None + def __getitem__(self, key: str): + """Return the value of the given key.""" + if key in {f.name for f in fields(self)}: + return getattr(self, key) + raise KeyError(key) + @classmethod def from_yaml( cls, diff --git a/src/post_processing/dataclass/recording_period.py b/src/post_processing/dataclass/recording_period.py index 32319fd..637733e 100644 --- a/src/post_processing/dataclass/recording_period.py +++ b/src/post_processing/dataclass/recording_period.py @@ -8,10 +8,13 @@ from dataclasses import 
dataclass from typing import TYPE_CHECKING -import pandas as pd from pandas import ( Series, Timedelta, + date_range, + interval_range, + read_csv, + to_datetime, ) from post_processing.utils.filtering_utils import ( @@ -21,6 +24,8 @@ if TYPE_CHECKING: from pandas.tseries.offsets import BaseOffset + from post_processing.dataclass.detection_filter import DetectionFilter + @dataclass(frozen=True) class RecordingPeriod: @@ -32,17 +37,17 @@ class RecordingPeriod: @classmethod def from_path( cls, - config, + config: DetectionFilter, *, bin_size: Timedelta | BaseOffset, ) -> RecordingPeriod: """Vectorized creation of recording coverage from CSV with start/end datetimes. This method reads a CSV with columns: - - 'start_recording' - - 'end_recording' - - 'start_deployment' - - 'end_deployment' + - "start_recording" + - "end_recording" + - "start_deployment" + - "end_deployment" It computes the **effective recording interval** as the intersection between recording and deployment periods, builds a fine-grained timeline at @@ -55,7 +60,7 @@ def from_path( - `timestamp_file`: path to CSV - `timebin_origin`: Timedelta resolution of detections bin_size : Timedelta or BaseOffset - Size of the aggregation bin (e.g., pd.Timedelta("1H") or "1D"). + Size of the aggregation bin (e.g., Timedelta("1H") or "1D"). Returns ------- @@ -64,10 +69,10 @@ def from_path( `timebin_origin`. """ - # 1. Read CSV and parse datetime columns + # Read CSV and parse datetime columns timestamp_file = config.timestamp_file delim = find_delimiter(timestamp_file) - df = pd.read_csv( + df = read_csv( config.timestamp_file, parse_dates=[ "start_recording", @@ -79,7 +84,8 @@ def from_path( ) if df.empty: - raise ValueError("CSV is empty.") + msg = "CSV is empty." + raise ValueError(msg) # Ensure all required columns are present required_columns = { @@ -92,20 +98,19 @@ def from_path( missing = required_columns - set(df.columns) if missing: - raise ValueError( - f"CSV is missing required columns: {', '.join(sorted(missing))}", - ) + msg = f"CSV is missing required columns: {', '.join(sorted(missing))}" + raise ValueError(msg) - # 2. Normalize timezones: convert to UTC, then remove tz info (naive) + # Normalize timezones: convert to UTC, then remove tz info (naive) for col in [ "start_recording", "end_recording", "start_deployment", "end_deployment", ]: - df[col] = pd.to_datetime(df[col], utc=True).dt.tz_convert(None) + df[col] = to_datetime(df[col], utc=True).dt.tz_convert(None) - # 3. Compute effective recording intervals (intersection) + # Compute effective recording intervals (intersection) df["effective_start_recording"] = df[ ["start_recording", "start_deployment"] ].max(axis=1) @@ -118,11 +123,12 @@ def from_path( df = df.loc[df["effective_start_recording"] < df["effective_end_recording"]].copy() if df.empty: - raise ValueError("No valid recording intervals after deployment intersection.") + msg = "No valid recording intervals after deployment intersection." + raise ValueError(msg) - # 4. 
Build fine-grained timeline at `timebin_origin` resolution + # Build fine-grained timeline at `timebin_origin` resolution origin = config.timebin_origin - time_index = pd.date_range( + time_index = date_range( start=df["effective_start_recording"].min(), end=df["effective_end_recording"].max(), freq=origin, @@ -130,22 +136,22 @@ def from_path( # Initialize effort vector (0 = no recording, 1 = recording) # Compare each timestamp to all intervals in a vectorized manner - effort = pd.Series(0, index=time_index) + effort = Series(0, index=time_index) - # 5. Vectorized interval coverage - tvals = time_index.values[:, None] - start_vals = df["effective_start_recording"].values - end_vals = df["effective_end_recording"].values + # Vectorized interval coverage + t_vals = time_index.to_numpy()[:, None] + start_vals = df["effective_start_recording"].to_numpy() + end_vals = df["effective_end_recording"].to_numpy() - # Boolean matrix: True if timestamp is within any recording interval - covered = (tvals >= start_vals) & (tvals < end_vals) + # Boolean matrix: True if the timestamp is within any recording interval + covered = (t_vals >= start_vals) & (t_vals < end_vals) effort[:] = covered.any(axis=1).astype(int) - # 6. Aggregate effort into user-defined bin_size + # Aggregate effort into user-defined bin_size counts = effort.resample(bin_size).sum() # Replace index with IntervalIndex for downstream compatibility - counts.index = pd.interval_range( + counts.index = interval_range( start=counts.index[0], periods=len(counts), freq=bin_size, diff --git a/src/post_processing/utils/filtering_utils.py b/src/post_processing/utils/filtering_utils.py index 650b5a8..faf2fd7 100644 --- a/src/post_processing/utils/filtering_utils.py +++ b/src/post_processing/utils/filtering_utils.py @@ -8,6 +8,7 @@ from typing import TYPE_CHECKING import pytz +from osekit.utils.timestamp_utils import strptime_from_text from pandas import ( DataFrame, Timedelta, @@ -570,16 +571,17 @@ def get_filename_timestamps(df: DataFrame, date_parser: str) -> list[Timestamp]: """ tz = get_timezone(df) - try: - return [ - to_datetime( - ts, - format=date_parser, - ).tz_localize(tz) for ts in df["filename"] - ] - except ValueError: - msg = """Could not parse timestamps from `df["filename"]`.""" - raise ValueError(msg) from None + timestamps = [ + strptime_from_text( + ts, + datetime_template=date_parser, + ) for ts in df["filename"] + ] + + if all(t.tz is None for t in timestamps): + timestamps = [t.tz_localize(tz) for t in timestamps] + + return timestamps def ensure_in_list(value: str, candidates: list[str], label: str) -> None: diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index 75c421f..f1cd07f 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -691,7 +691,7 @@ def shade_no_effort( start, start + width_days, facecolor="0.65", - alpha=0.08, + alpha=0.1, linewidth=0, zorder=0, ) From fd913cfa38f284f4be6ebbb0f6265ba91434490d Mon Sep 17 00:00:00 2001 From: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com> Date: Wed, 14 Jan 2026 16:30:07 +0100 Subject: [PATCH 12/33] effort legend --- src/post_processing/utils/plot_utils.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index f1cd07f..f5ad0df 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -11,6 +11,7 @@ import 
numpy as np from matplotlib import dates as mdates from matplotlib.dates import num2date +from matplotlib.patches import Patch from matplotlib.ticker import PercentFormatter from numpy import ceil, histogram, polyfit from pandas import ( @@ -114,9 +115,6 @@ def histo( else: legend_labels = None - # if effort: - # normalize_counts_by_effort(df, effort, time_bin) - n_groups = len(labels) if legend_labels else 1 bar_width = bin_size / n_groups bin_starts = mdates.date2num(df.index) @@ -684,6 +682,7 @@ def shade_no_effort( no_effort = effort_fraction == 0 partial_effort = (effort_fraction > 0) & (effort_fraction < 1) + # Draw partial effort first (lighter) for ts in bin_starts[partial_effort]: start = mdates.date2num(ts - bar_width) @@ -694,6 +693,7 @@ def shade_no_effort( alpha=0.1, linewidth=0, zorder=0, + label="partial data", ) # Draw no effort on top (darker) @@ -706,8 +706,24 @@ def shade_no_effort( alpha=0.15, linewidth=0, zorder=0, + label="no data", + ) + + handles = [] + + if partial_effort.any(): + handles.append( + Patch(facecolor="0.65", alpha=0.1, label="partial data") + ) + + if no_effort.any(): + handles.append( + Patch(facecolor="0.45", alpha=0.15, label="no data") ) + if handles: + ax.legend(handles=handles) + def add_sunrise_sunset(ax: Axes, lat: float, lon: float, tz: tzinfo) -> None: """Display sunrise/sunset times on plot.""" From df40ec9ccf440425c60221f6d9fce35b21f3b74c Mon Sep 17 00:00:00 2001 From: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com> Date: Wed, 14 Jan 2026 16:36:38 +0100 Subject: [PATCH 13/33] ruff fix --- src/post_processing/dataclass/detection_filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/post_processing/dataclass/detection_filter.py b/src/post_processing/dataclass/detection_filter.py index b2d2599..b28c023 100644 --- a/src/post_processing/dataclass/detection_filter.py +++ b/src/post_processing/dataclass/detection_filter.py @@ -9,7 +9,7 @@ from dataclasses import dataclass, fields from pathlib import Path -from typing import TYPE_CHECKING, Literal, Any +from typing import TYPE_CHECKING, Literal import yaml from pandas import Timedelta, Timestamp From 0c74f8c40e373122fd6ed374581951fdbf17771a Mon Sep 17 00:00:00 2001 From: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com> Date: Thu, 15 Jan 2026 15:26:42 +0100 Subject: [PATCH 14/33] better shade handling --- src/post_processing/utils/core_utils.py | 13 ++++--- src/post_processing/utils/plot_utils.py | 50 ++++++++++++++----------- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/src/post_processing/utils/core_utils.py b/src/post_processing/utils/core_utils.py index 9457ef2..32c5943 100644 --- a/src/post_processing/utils/core_utils.py +++ b/src/post_processing/utils/core_utils.py @@ -305,10 +305,10 @@ def add_season_period( ) season_colors = { - "winter": "#2ce5e3", - "spring": "#4fcf50", - "summer": "#ffcf50", - "autumn": "#fb9a67", + "winter": "#84eceb", + "spring": "#91de92", + "summer": "#fce097", + "autumn": "#f9c1a5", } bin_centers = [ @@ -329,8 +329,9 @@ def add_season_period( width=(bins[i + 1] - bins[i]), color=season_colors[season], align="center", - zorder=0, - alpha=0.6, + zorder=6, + alpha=1, + linewidth=0, ) ax.set_ylim(ax.dataLim.ymin, ax.dataLim.ymax) diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index f5ad0df..2a22914 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -149,11 +149,6 @@ def histo( set_plot_title(ax, 
annotators, labels) ax.set_xlim(begin, end) - if season: - if lat is None or lon is None: - get_coordinates() - add_season_period(ax, northern=lat >= 0) - if effort: shade_no_effort( ax=ax, @@ -162,6 +157,11 @@ def histo( bar_width=bin_size, ) + if season: + if lat is None or lon is None: + get_coordinates() + add_season_period(ax, northern=lat >= 0) + def _prepare_timeline_plot( df: DataFrame, @@ -675,54 +675,62 @@ def shade_no_effort( index=[i.left for i in observed.counts.index], ).tz_localize("UTC") + effort_by_end = Series( + observed.counts.values, + index=[i.left for i in observed.counts.index], + ).tz_localize("UTC") + # Align effort to plotting bins - effort_aligned = effort_by_start.reindex(bin_starts) + effort_aligned_start = effort_by_start.reindex(bin_starts).fillna(0) + effort_aligned_end = effort_by_end.reindex(bin_starts + bar_width).fillna(0) + combined_effort = .5 * effort_aligned_start.add(effort_aligned_end, fill_value=0) + max_effort = bar_width / observed.timebin_origin - effort_fraction = effort_aligned / max_effort + effort_fraction = combined_effort / max_effort - no_effort = effort_fraction == 0 - partial_effort = (effort_fraction > 0) & (effort_fraction < 1) + no_effort = effort_fraction[effort_fraction == 0] + partial_effort = effort_fraction[(effort_fraction > 0) & (effort_fraction < 1)] # Draw partial effort first (lighter) - for ts in bin_starts[partial_effort]: + for ts in partial_effort.index: start = mdates.date2num(ts - bar_width) ax.axvspan( start, start + width_days, facecolor="0.65", - alpha=0.1, + alpha=.1, linewidth=0, - zorder=0, + zorder=3, label="partial data", ) # Draw no effort on top (darker) - for ts in bin_starts[no_effort]: + for ts in no_effort.index: start = mdates.date2num(ts - bar_width) ax.axvspan( start, start + width_days, facecolor="0.45", - alpha=0.15, + alpha=.15, linewidth=0, - zorder=0, + zorder=3, label="no data", ) handles = [] - - if partial_effort.any(): + if len(partial_effort) > 0: handles.append( Patch(facecolor="0.65", alpha=0.1, label="partial data") ) - - if no_effort.any(): + if len(no_effort) > 0: handles.append( Patch(facecolor="0.45", alpha=0.15, label="no data") ) - if handles: - ax.legend(handles=handles) + ax.legend( + handles=handles, + loc="best", + ) def add_sunrise_sunset(ax: Axes, lat: float, lon: float, tz: tzinfo) -> None: From ca539cb63478f265142f23a7e517ff9a3baadedf Mon Sep 17 00:00:00 2001 From: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com> Date: Thu, 15 Jan 2026 17:40:32 +0100 Subject: [PATCH 15/33] histo legend fix --- src/post_processing/utils/plot_utils.py | 60 ++++++++++++++----------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index 2a22914..4b268ee 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -133,19 +133,17 @@ def histo( bar_kwargs["label"] = legend_labels[i] ax.bar(bin_starts + offset, df.iloc[:, i], **bar_kwargs) - if kwargs.get("show_recording_OFF"): - ax.set_facecolor("lightgrey") if len(df.columns) > 1 and legend: - ax.legend(labels=legend_labels, bbox_to_anchor=(1.01, 1), loc="upper left") + legend_histo = ax.legend( + labels=legend_labels, + bbox_to_anchor=(1.01, 1), + loc="upper left", + ) + ax.add_artist(legend_histo) - y_label = ( - f"Detections{(' normalized by effort' if effort else '')}" - f"\n(detections: {timedelta_to_str(time_bin)}" - f" - bin size: {bin_size_str})" - ) - ax.set_ylabel(y_label) - # 
set_y_axis_to_percentage(ax) if effort else set_dynamic_ylim(ax, df) + ax.set_ylabel(f"Detections ({timedelta_to_str(time_bin)})") + ax.set_xlabel(f"Bin size ({bin_size_str})") set_plot_title(ax, annotators, labels) ax.set_xlim(begin, end) @@ -613,12 +611,15 @@ def get_bin_size_str(bin_size: Timedelta | BaseOffset) -> str: return str(bin_size.n) + bin_size.freqstr -def set_y_axis_to_percentage( - ax: plt.Axes, -) -> None: +def set_y_axis_to_percentage(ax: plt.Axes, max_val: float) -> None: """Set y-axis to percentage.""" - ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0)) - ax.set_yticks(np.arange(0, 1.02, 0.2)) + ax.yaxis.set_major_formatter( + plt.FuncFormatter(lambda y, _: f"{(y / max_val) * 100:.0f}%") + ) + + current_label = ax.get_ylabel() + if current_label and "%" not in current_label: + ax.set_ylabel(f"{current_label} (%)") def set_dynamic_ylim(ax: plt.Axes, @@ -691,6 +692,9 @@ def shade_no_effort( no_effort = effort_fraction[effort_fraction == 0] partial_effort = effort_fraction[(effort_fraction > 0) & (effort_fraction < 1)] + # Get legend handle + handles1, labels1 = ax.get_legend_handles_labels() + # Draw partial effort first (lighter) for ts in partial_effort.index: start = mdates.date2num(ts - bar_width) @@ -698,7 +702,7 @@ def shade_no_effort( start, start + width_days, facecolor="0.65", - alpha=.1, + alpha=0.1, linewidth=0, zorder=3, label="partial data", @@ -711,25 +715,31 @@ def shade_no_effort( start, start + width_days, facecolor="0.45", - alpha=.15, + alpha=0.15, linewidth=0, zorder=3, label="no data", ) - handles = [] + handles_effort = [] if len(partial_effort) > 0: - handles.append( - Patch(facecolor="0.65", alpha=0.1, label="partial data") - ) + handles_effort.append(Patch(facecolor="0.65", alpha=0.1, label="partial data")) if len(no_effort) > 0: - handles.append( + handles_effort.append( Patch(facecolor="0.45", alpha=0.15, label="no data") ) - if handles: + if handles_effort: + + labels_effort = [h.get_label() for h in handles_effort] + + handles = handles1 + handles_effort + labels = labels1 + labels_effort + ax.legend( - handles=handles, - loc="best", + handles, + labels, + bbox_to_anchor=(1.01, 1), + loc="upper left", ) From 360804d798b71188953a91aa52fdd5368ab04aa7 Mon Sep 17 00:00:00 2001 From: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com> Date: Thu, 15 Jan 2026 17:41:38 +0100 Subject: [PATCH 16/33] histo legend fix --- src/post_processing/utils/plot_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index 4b268ee..eaf3301 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -721,6 +721,7 @@ def shade_no_effort( label="no data", ) + # Add effort legend to current plot legend handles_effort = [] if len(partial_effort) > 0: handles_effort.append(Patch(facecolor="0.65", alpha=0.1, label="partial data")) @@ -729,12 +730,9 @@ def shade_no_effort( Patch(facecolor="0.45", alpha=0.15, label="no data") ) if handles_effort: - labels_effort = [h.get_label() for h in handles_effort] - handles = handles1 + handles_effort labels = labels1 + labels_effort - ax.legend( handles, labels, From ec8deeccc2753c8484435cbdf8f08a688226e647 Mon Sep 17 00:00:00 2001 From: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com> Date: Thu, 15 Jan 2026 17:45:32 +0100 Subject: [PATCH 17/33] ruff fix --- src/post_processing/utils/plot_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index eaf3301..1e83042 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -12,7 +12,6 @@ from matplotlib import dates as mdates from matplotlib.dates import num2date from matplotlib.patches import Patch -from matplotlib.ticker import PercentFormatter from numpy import ceil, histogram, polyfit from pandas import ( DataFrame, From b7beb53f10d5d7a5558d0a76e337724dddad309b Mon Sep 17 00:00:00 2001 From: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com> Date: Fri, 16 Jan 2026 11:59:43 +0100 Subject: [PATCH 18/33] shade effort on scatter plot --- src/post_processing/dataclass/data_aplose.py | 19 +++++++++++-------- src/post_processing/utils/core_utils.py | 15 +++++++++++---- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/src/post_processing/dataclass/data_aplose.py b/src/post_processing/dataclass/data_aplose.py index 3c8d1e4..e1d3cae 100644 --- a/src/post_processing/dataclass/data_aplose.py +++ b/src/post_processing/dataclass/data_aplose.py @@ -416,18 +416,20 @@ def plot( season = kwargs.get("season", False) bin_size = kwargs.get("bin_size") - return heatmap(df=df_filtered, - ax=ax, - bin_size=bin_size, - time_range=time, - show_rise_set=show_rise_set, - season=season, - coordinates=self.coordinates, - ) + return heatmap( + df=df_filtered, + ax=ax, + bin_size=bin_size, + time_range=time, + show_rise_set=show_rise_set, + season=season, + coordinates=self.coordinates, + ) if mode == "scatter": show_rise_set = kwargs.get("show_rise_set", True) season = kwargs.get("season", False) + effort = kwargs.get("effort") return scatter(df=df_filtered, ax=ax, @@ -435,6 +437,7 @@ def plot( show_rise_set=show_rise_set, season=season, coordinates=self.coordinates, + effort=effort, ) if mode == "agreement": diff --git a/src/post_processing/utils/core_utils.py b/src/post_processing/utils/core_utils.py index 32c5943..b1df6bd 100644 --- a/src/post_processing/utils/core_utils.py +++ b/src/post_processing/utils/core_utils.py @@ -303,6 +303,10 @@ def add_season_period( start=Timestamp(ax.get_xlim()[0], unit="D").floor("1D"), end=Timestamp(ax.get_xlim()[1], unit="D").ceil("1D"), ) + bins = date_range( + start=Timestamp(ax.get_xlim()[0], unit="D"), + end=Timestamp(ax.get_xlim()[1], unit="D"), + ) season_colors = { "winter": "#84eceb", @@ -329,7 +333,7 @@ def add_season_period( width=(bins[i + 1] - bins[i]), color=season_colors[season], align="center", - zorder=6, + zorder=2, alpha=1, linewidth=0, ) @@ -520,8 +524,9 @@ def round_begin_end_timestamps( bin_size: Timedelta | BaseOffset, ) -> tuple[Timestamp, Timestamp, Timedelta]: """Return time vector given a bin size.""" - if (not isinstance(timestamp_list, list) or - not all(isinstance(ts, Timestamp) for ts in timestamp_list)): + if not isinstance(timestamp_list, list) or not all( + isinstance(ts, Timestamp) for ts in timestamp_list + ): msg = "timestamp_list must be a list[Timestamp]" raise TypeError(msg) @@ -535,6 +540,7 @@ def round_begin_end_timestamps( return start, end, bin_size if isinstance(bin_size, BaseOffset): + bin_size = offsets.Week(n=bin_size.n, weekday=timestamp_list[0].weekday()) start = bin_size.rollback(min(timestamp_list)) end = bin_size.rollforward(max(timestamp_list)) if not isinstance(bin_size, (offsets.Hour, offsets.Minute, offsets.Second)): @@ -545,7 +551,8 @@ def round_begin_end_timestamps( timestamp_range = date_range(start=start, end=end, freq=bin_size) bin_size = 
timestamp_range[1] - timestamp_range[0] - return start.floor(bin_size), end.ceil(bin_size), bin_size + # return start.floor(bin_size), end.ceil(bin_size), bin_size + return start, end, bin_size msg = "Could not get start/end timestamps." raise ValueError(msg) From 92d0a95767e371e1f730359f647baad71117fc02 Mon Sep 17 00:00:00 2001 From: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com> Date: Fri, 16 Jan 2026 12:00:54 +0100 Subject: [PATCH 19/33] better season mngmt on plots --- src/post_processing/utils/plot_utils.py | 46 +++++++++++-------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index 1e83042..953d7db 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -20,6 +20,7 @@ Series, Timedelta, Timestamp, + concat, date_range, ) from pandas.tseries import frequencies @@ -134,12 +135,11 @@ def histo( ax.bar(bin_starts + offset, df.iloc[:, i], **bar_kwargs) if len(df.columns) > 1 and legend: - legend_histo = ax.legend( + ax.legend( labels=legend_labels, bbox_to_anchor=(1.01, 1), loc="upper left", ) - ax.add_artist(legend_histo) ax.set_ylabel(f"Detections ({timedelta_to_str(time_bin)})") ax.set_xlabel(f"Bin size ({bin_size_str})") @@ -149,9 +149,7 @@ def histo( if effort: shade_no_effort( ax=ax, - bin_starts=df.index, observed=effort, - bar_width=bin_size, ) if season: @@ -200,7 +198,6 @@ def _prepare_timeline_plot( ax.set_ylim(0, 24) ax.set_yticks(range(0, 25, 2)) ax.set_ylabel("Hour") - ax.set_xlabel("Date") ax.grid(color="k", linestyle="-", linewidth=0.2) set_plot_title(ax=ax, annotators=annotators, labels=labels) @@ -217,7 +214,7 @@ def scatter( df: DataFrame, ax: Axes, time_range: DatetimeIndex, - **kwargs: bool | tuple[float, float], + **kwargs: bool | tuple[float, float] | RecordingPeriod, ) -> None: """Scatter-plot of detections for a given annotator and label. @@ -241,6 +238,7 @@ def scatter( show_rise_set = kwargs.get("show_rise_set", False) season = kwargs.get("season", False) coordinates = kwargs.get("coordinates", False) + effort = kwargs.get("effort", False) _prepare_timeline_plot( df=df, @@ -280,6 +278,12 @@ def scatter( framealpha=0.6, ) + if effort: + shade_no_effort( + ax=ax, + observed=effort, + ) + def heatmap(df: DataFrame, ax: Axes, @@ -646,9 +650,7 @@ def set_plot_title(ax: plt.Axes, annotators: list[str], labels: list[str]) -> No def shade_no_effort( ax: plt.Axes, - bin_starts: Index, observed: RecordingPeriod, - bar_width: Timedelta, ) -> None: """Shade areas of the plot where no observation effort was made. @@ -656,38 +658,32 @@ def shade_no_effort( ---------- ax : plt.Axes The axes on which to draw the shaded regions. - bin_starts : Index - A datetime index representing the start times of each bin. observed : RecordingPeriod A Series with observation counts or flags, indexed by datetime. Should be aligned or re-indexable to `bin_starts`. - bar_width : Timedelta - Width of each time bin. Used to compute the span of the shaded areas. 
- """ - """Shade areas of the plot where no observation effort was made.""" - width_days = bar_width.total_seconds() / 86400 - # Convert effort IntervalIndex → DatetimeIndex (bin starts) effort_by_start = Series( observed.counts.values, index=[i.left for i in observed.counts.index], - ).tz_localize("UTC") - + ) effort_by_end = Series( observed.counts.values, index=[i.left for i in observed.counts.index], - ).tz_localize("UTC") + ) + combined_effort = .5 * effort_by_start.add(effort_by_end, fill_value=0) - # Align effort to plotting bins - effort_aligned_start = effort_by_start.reindex(bin_starts).fillna(0) - effort_aligned_end = effort_by_end.reindex(bin_starts + bar_width).fillna(0) - combined_effort = .5 * effort_aligned_start.add(effort_aligned_end, fill_value=0) + bar_width = effort_by_start.index[1] - effort_by_start.index[0] + width_days = bar_width.total_seconds() / 86400 max_effort = bar_width / observed.timebin_origin effort_fraction = combined_effort / max_effort + first_elem = Series([0], index=[effort_fraction.index[0] - bar_width]) + last_elem = Series([0], index=[effort_fraction.index[-1] + bar_width]) + effort_fraction = concat([first_elem, effort_fraction, last_elem]) + no_effort = effort_fraction[effort_fraction == 0] partial_effort = effort_fraction[(effort_fraction > 0) & (effort_fraction < 1)] @@ -703,7 +699,7 @@ def shade_no_effort( facecolor="0.65", alpha=0.1, linewidth=0, - zorder=3, + zorder=1, label="partial data", ) @@ -716,7 +712,7 @@ def shade_no_effort( facecolor="0.45", alpha=0.15, linewidth=0, - zorder=3, + zorder=1, label="no data", ) From 552c7297acbee8701662617922041c9040bdba92 Mon Sep 17 00:00:00 2001 From: mathieudpnt <108517594+mathieudpnt@users.noreply.github.com> Date: Mon, 19 Jan 2026 11:58:29 +0100 Subject: [PATCH 20/33] differenciate LT et ST data plots --- src/post_processing/utils/core_utils.py | 8 ++------ src/post_processing/utils/plot_utils.py | 1 - tests/test_plot_utils.py | 17 ++++++++--------- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/src/post_processing/utils/core_utils.py b/src/post_processing/utils/core_utils.py index b1df6bd..a1ae91f 100644 --- a/src/post_processing/utils/core_utils.py +++ b/src/post_processing/utils/core_utils.py @@ -299,10 +299,6 @@ def add_season_period( msg = "Axes have no data" raise ValueError(msg) - bins = date_range( - start=Timestamp(ax.get_xlim()[0], unit="D").floor("1D"), - end=Timestamp(ax.get_xlim()[1], unit="D").ceil("1D"), - ) bins = date_range( start=Timestamp(ax.get_xlim()[0], unit="D"), end=Timestamp(ax.get_xlim()[1], unit="D"), @@ -540,7 +536,6 @@ def round_begin_end_timestamps( return start, end, bin_size if isinstance(bin_size, BaseOffset): - bin_size = offsets.Week(n=bin_size.n, weekday=timestamp_list[0].weekday()) start = bin_size.rollback(min(timestamp_list)) end = bin_size.rollforward(max(timestamp_list)) if not isinstance(bin_size, (offsets.Hour, offsets.Minute, offsets.Second)): @@ -551,7 +546,8 @@ def round_begin_end_timestamps( timestamp_range = date_range(start=start, end=end, freq=bin_size) bin_size = timestamp_range[1] - timestamp_range[0] - # return start.floor(bin_size), end.ceil(bin_size), bin_size + if bin_size.resolution_string in ["s", "min", "h"]: + return start.floor(bin_size), end.ceil(bin_size), bin_size return start, end, bin_size msg = "Could not get start/end timestamps." 
diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index 953d7db..9c863f4 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -16,7 +16,6 @@ from pandas import ( DataFrame, DatetimeIndex, - Index, Series, Timedelta, Timestamp, diff --git a/tests/test_plot_utils.py b/tests/test_plot_utils.py index ffabbbd..4a7e9cc 100644 --- a/tests/test_plot_utils.py +++ b/tests/test_plot_utils.py @@ -58,16 +58,15 @@ def test_wrap_xtick_labels_no_spaces(): assert wrapped_labels[0] == expected -def test_y_axis_formatter_and_ticks(): +def test_set_y_axis_to_percentage(): fig, ax = plt.subplots() - - set_y_axis_to_percentage(ax) - - assert isinstance(ax.yaxis.get_major_formatter(), PercentFormatter) - assert ax.yaxis.get_major_formatter().xmax == 1.0 - - expected_ticks = arange(0, 1.02, 0.2) - testing.assert_allclose(ax.get_yticks(), expected_ticks) + ax.set_ylabel("Accuracy") + set_y_axis_to_percentage(ax, max_val=200) + formatter = ax.yaxis.get_major_formatter() + assert formatter(100, None) == "50%" + assert formatter(200, None) == "100%" + assert ax.get_ylabel() == "Accuracy (%)" + plt.close(fig) def test_single_annotator_multiple_labels(): From d0cda89ec0f61857fef98fc515780ae3bd758f20 Mon Sep 17 00:00:00 2001 From: mathieudpnt <108517594+mathieudpnt@users.noreply.github.com> Date: Mon, 19 Jan 2026 12:00:21 +0100 Subject: [PATCH 21/33] test plot utils import fix --- tests/test_plot_utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test_plot_utils.py b/tests/test_plot_utils.py index 4a7e9cc..4306c38 100644 --- a/tests/test_plot_utils.py +++ b/tests/test_plot_utils.py @@ -1,8 +1,5 @@ - import matplotlib.pyplot as plt import pytest -from matplotlib.ticker import PercentFormatter -from numpy import arange, testing from post_processing.utils.plot_utils import ( _wrap_xtick_labels, From 4b9a2f1b43bec08e65307fc6e71d0a18522da80f Mon Sep 17 00:00:00 2001 From: mathieudpnt <108517594+mathieudpnt@users.noreply.github.com> Date: Mon, 19 Jan 2026 15:51:42 +0100 Subject: [PATCH 22/33] ruff simplification --- src/post_processing/utils/plot_utils.py | 114 +++++++++++++----------- 1 file changed, 60 insertions(+), 54 deletions(-) diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index 9c863f4..eaee63c 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -366,7 +366,7 @@ def heatmap(df: DataFrame, ) if coordinates and season: - lat, lon = coordinates + lat, _ = coordinates add_season_period(ax, northern=lat >= 0) bin_size_str = get_bin_size_str(bin_size) @@ -494,41 +494,33 @@ def agreement( """ labels, annotators = get_labels_and_annotators(df) - datetimes1 = list( - df[(df["annotator"] == annotators[0]) & (df["annotation"] == labels[0])][ - "start_datetime" - ], - ) - datetimes2 = list( - df[(df["annotator"] == annotators[1]) & (df["annotation"] == labels[1])][ - "start_datetime" - ], - ) + datetimes = [ + list( + df[ + (df["annotator"] == annotators[i]) & (df["annotation"] == labels[i]) + ]["start_datetime"], + ) + for i in range(2) + ] # scatter plot n_annot_max = bin_size.total_seconds() / df["end_time"].iloc[0] - start = df["start_datetime"].min() - stop = df["start_datetime"].max() - freq = ( bin_size if isinstance(bin_size, Timedelta) else str(bin_size.n) + bin_size.name ) bins = date_range( - start=start.floor(bin_size), - end=stop.ceil(bin_size), + start=df["start_datetime"].min().floor(bin_size), + 
end=df["start_datetime"].max().ceil(bin_size), freq=freq, ) - hist1, _ = histogram(datetimes1, bins=bins) - hist2, _ = histogram(datetimes2, bins=bins) - df_hist = ( DataFrame( { - annotators[0]: hist1, - annotators[1]: hist2, + annotators[0]: histogram(datetimes[0], bins=bins)[0], + annotators[1]: histogram(datetimes[1], bins=bins)[0], }, ) / n_annot_max @@ -616,7 +608,7 @@ def get_bin_size_str(bin_size: Timedelta | BaseOffset) -> str: def set_y_axis_to_percentage(ax: plt.Axes, max_val: float) -> None: """Set y-axis to percentage.""" ax.yaxis.set_major_formatter( - plt.FuncFormatter(lambda y, _: f"{(y / max_val) * 100:.0f}%") + plt.FuncFormatter(lambda y, _: f"{(y / max_val) * 100:.0f}%"), ) current_label = ax.get_ylabel() @@ -667,17 +659,12 @@ def shade_no_effort( observed.counts.values, index=[i.left for i in observed.counts.index], ) - effort_by_end = Series( - observed.counts.values, - index=[i.left for i in observed.counts.index], - ) - combined_effort = .5 * effort_by_start.add(effort_by_end, fill_value=0) bar_width = effort_by_start.index[1] - effort_by_start.index[0] width_days = bar_width.total_seconds() / 86400 max_effort = bar_width / observed.timebin_origin - effort_fraction = combined_effort / max_effort + effort_fraction = effort_by_start / max_effort first_elem = Series([0], index=[effort_fraction.index[0] - bar_width]) last_elem = Series([0], index=[effort_fraction.index[-1] + bar_width]) @@ -689,39 +676,35 @@ def shade_no_effort( # Get legend handle handles1, labels1 = ax.get_legend_handles_labels() - # Draw partial effort first (lighter) - for ts in partial_effort.index: - start = mdates.date2num(ts - bar_width) - ax.axvspan( - start, - start + width_days, - facecolor="0.65", - alpha=0.1, - linewidth=0, - zorder=1, - label="partial data", - ) + _draw_effort_spans( + ax=ax, + effort_index=partial_effort.index, + bar_width=bar_width, + width_days=width_days, + facecolor="0.65", + alpha=0.1, + label="partial data", + ) - # Draw no effort on top (darker) - for ts in no_effort.index: - start = mdates.date2num(ts - bar_width) - ax.axvspan( - start, - start + width_days, - facecolor="0.45", - alpha=0.15, - linewidth=0, - zorder=1, - label="no data", - ) + _draw_effort_spans( + ax=ax, + effort_index=no_effort.index, + bar_width=bar_width, + width_days=width_days, + facecolor="0.45", + alpha=0.15, + label="no data", + ) # Add effort legend to current plot legend handles_effort = [] if len(partial_effort) > 0: - handles_effort.append(Patch(facecolor="0.65", alpha=0.1, label="partial data")) + handles_effort.append( + Patch(facecolor="0.65", alpha=0.1, label="partial data"), + ) if len(no_effort) > 0: handles_effort.append( - Patch(facecolor="0.45", alpha=0.15, label="no data") + Patch(facecolor="0.45", alpha=0.15, label="no data"), ) if handles_effort: labels_effort = [h.get_label() for h in handles_effort] @@ -735,6 +718,29 @@ def shade_no_effort( ) +def _draw_effort_spans( + ax: plt.Axes, + effort_index: DatetimeIndex, + bar_width: Timedelta, + width_days: float, + facecolor: str, + alpha: float, + label: str, +) -> None: + """Draw vertical lines for effort plot.""" + for ts in effort_index: + start = mdates.date2num(ts - bar_width) + ax.axvspan( + start, + start + width_days, + facecolor=facecolor, + alpha=alpha, + linewidth=0, + zorder=1, + label=label, + ) + + def add_sunrise_sunset(ax: Axes, lat: float, lon: float, tz: tzinfo) -> None: """Display sunrise/sunset times on plot.""" x_min, x_max = ax.get_xlim() From bc647a664945e1ff403134b90b1a1ad8feb959b5 Mon Sep 17 00:00:00 
2001 From: mathieudpnt <108517594+mathieudpnt@users.noreply.github.com> Date: Mon, 19 Jan 2026 15:56:50 +0100 Subject: [PATCH 23/33] draw effort positional argument --- src/post_processing/utils/plot_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index eaee63c..97733b1 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -723,6 +723,7 @@ def _draw_effort_spans( effort_index: DatetimeIndex, bar_width: Timedelta, width_days: float, + *, facecolor: str, alpha: float, label: str, From cb07568de99b52799987f5a6a7966095a1411dc9 Mon Sep 17 00:00:00 2001 From: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com> Date: Mon, 19 Jan 2026 16:27:11 +0100 Subject: [PATCH 24/33] syntax --- src/post_processing/utils/core_utils.py | 42 ++++++++++++++++--------- src/post_processing/utils/plot_utils.py | 12 +++---- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/src/post_processing/utils/core_utils.py b/src/post_processing/utils/core_utils.py index a1ae91f..7951192 100644 --- a/src/post_processing/utils/core_utils.py +++ b/src/post_processing/utils/core_utils.py @@ -3,7 +3,7 @@ from __future__ import annotations import json -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import astral import easygui @@ -35,7 +35,7 @@ ) if TYPE_CHECKING: - from datetime import tzinfo + from datetime import datetime, timedelta, tzinfo from pathlib import Path import matplotlib.pyplot as plt @@ -44,10 +44,10 @@ def get_season(ts: Timestamp, *, northern: bool = True) -> tuple[str, int]: """Determine the meteorological season from a Timestamp. - In the Northern hemisphere + In the Northern Hemisphere Winter: Dec-Feb, Spring: Mar-May, Summer: Jun-Aug, Autumn: Sep-Nov - In the Southern hemisphere + In the Southern Hemisphere Winter: Jun-Aug, Spring: Sep-Nov, Summer: Dec-Feb, Autumn: Mar-May Parameters @@ -133,8 +133,18 @@ def get_sun_times( dt_sunset = Timestamp(sunset(gps.observer, date=date)).tz_convert(tz) # Convert sunrise and sunset to decimal hours - h_sunrise.append(dt_sunrise.hour + dt_sunrise.minute / 60 + dt_sunrise.second / 3600 + dt_sunrise.microsecond / 3_600_000_000) - h_sunset.append(dt_sunset.hour + dt_sunset.minute / 60 + dt_sunset.second / 3600 + dt_sunset.microsecond / 3_600_000_000) + h_sunrise.append( + dt_sunrise.hour + + dt_sunrise.minute / 60 + + dt_sunrise.second / 3600 + + dt_sunrise.microsecond / 3_600_000_000, + ) + h_sunset.append( + dt_sunset.hour + + dt_sunset.minute / 60 + + dt_sunset.second / 3600 + + dt_sunset.microsecond / 3_600_000_000, + ) return h_sunrise, h_sunset @@ -215,8 +225,12 @@ def add_weak_detection( if not max_time: max_time = Timedelta(get_max_time(df), "s") - df["start_datetime"] = [strftime_osmose_format(start) for start in df["start_datetime"]] - df["end_datetime"] = [strftime_osmose_format(stop) for stop in df["end_datetime"]] + df["start_datetime"] = [ + strftime_osmose_format(start) for start in df["start_datetime"] + ] + df["end_datetime"] = [ + strftime_osmose_format(stop) for stop in df["end_datetime"] + ] for ant in annotators: for lbl in labels: @@ -259,12 +273,12 @@ def add_weak_detection( def json2df(json_path: Path) -> DataFrame: - """Convert a metadatax json file into a DataFrame. + """Convert a metadatax JSON file into a DataFrame. 
Parameters ---------- json_path: Path - Json file path + JSON file path """ with json_path.open(encoding="utf-8") as f: @@ -480,7 +494,7 @@ def get_labels_and_annotators(df: DataFrame) -> tuple[list, list]: def localize_timestamps(timestamps: list[Timestamp], tz: tzinfo) -> list[Timestamp]: - """Localize timestamps if necessary.""" + """Localise timestamps if necessary.""" localized = [] for ts in timestamps: if ts.tzinfo is None or ts.tzinfo.utcoffset(ts) is None: @@ -518,7 +532,7 @@ def get_time_range_and_bin_size( def round_begin_end_timestamps( timestamp_list: list[Timestamp], bin_size: Timedelta | BaseOffset, -) -> tuple[Timestamp, Timestamp, Timedelta]: +) -> tuple[Any, Any, Any]: """Return time vector given a bin size.""" if not isinstance(timestamp_list, list) or not all( isinstance(ts, Timestamp) for ts in timestamp_list @@ -546,7 +560,7 @@ def round_begin_end_timestamps( timestamp_range = date_range(start=start, end=end, freq=bin_size) bin_size = timestamp_range[1] - timestamp_range[0] - if bin_size.resolution_string in ["s", "min", "h"]: + if bin_size.resolution_string in {"s", "min", "h"}: return start.floor(bin_size), end.ceil(bin_size), bin_size return start, end, bin_size @@ -555,7 +569,7 @@ def round_begin_end_timestamps( def timedelta_to_str(td: Timedelta) -> str: - """From a Timedelta to corresponding string.""" + """From a Timedelta to the corresponding string.""" seconds = int(td.total_seconds()) if seconds % 86400 == 0: diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index 97733b1..3c3a5cc 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -75,15 +75,15 @@ def histo( - legend: bool Whether to show the legend. - color: str | list[str] - Color or list of colors for the histogram bars. - If not provided, default colors will be used. + Colour or list of colours for the histogram bars. + If not provided, default colours will be used. - season: bool Whether to show the season. - coordinates: tuple[float, float] The coordinates of the plotted detections. - effort: RecordingPeriod Object corresponding to the observation effort. - If provided, data will be normalized by observation effort. + If provided, data will be normalised by observation effort. """ labels, annotators = zip(*[col.rsplit("-", 1) for col in df.columns], strict=False) @@ -473,7 +473,7 @@ def agreement( bin_size: Timedelta | BaseOffset, ax: plt.Axes, ) -> None: - """Compute and visualize agreement between two annotators. + """Compute and visualise agreement between two annotators. This function compares annotation timestamps from two annotators over a time range. It also fits and plots a linear regression line and displays the coefficient @@ -556,8 +556,8 @@ def timeline( Matplotlib axes object where the scatterplot and regression line will be drawn. **kwargs: Additional keyword arguments depending on the mode. - color: str | list[str] - Color or list of colors for the histogram bars. - If not provided, default colors will be used. + Colour or list of colours for the histogram bars. + If not provided, default colours will be used. 
""" color = kwargs.get("color") From 9d3188a1e0655b9ebb44a8c485df88ffd0b85f7e Mon Sep 17 00:00:00 2001 From: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com> Date: Tue, 20 Jan 2026 14:35:58 +0100 Subject: [PATCH 25/33] hotfix partial/no data bars --- src/post_processing/utils/plot_utils.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index 3c3a5cc..c4f79fb 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -660,11 +660,18 @@ def shade_no_effort( index=[i.left for i in observed.counts.index], ) - bar_width = effort_by_start.index[1] - effort_by_start.index[0] + effort_by_end = Series( + observed.counts.values, + index=[i.right for i in observed.counts.index], + ) + + combined_effort = 0.5 * effort_by_start.add(effort_by_end, fill_value=0) + + bar_width = combined_effort.index[1] - combined_effort.index[0] width_days = bar_width.total_seconds() / 86400 max_effort = bar_width / observed.timebin_origin - effort_fraction = effort_by_start / max_effort + effort_fraction = combined_effort / max_effort first_elem = Series([0], index=[effort_fraction.index[0] - bar_width]) last_elem = Series([0], index=[effort_fraction.index[-1] + bar_width]) @@ -679,7 +686,6 @@ def shade_no_effort( _draw_effort_spans( ax=ax, effort_index=partial_effort.index, - bar_width=bar_width, width_days=width_days, facecolor="0.65", alpha=0.1, @@ -689,7 +695,6 @@ def shade_no_effort( _draw_effort_spans( ax=ax, effort_index=no_effort.index, - bar_width=bar_width, width_days=width_days, facecolor="0.45", alpha=0.15, @@ -721,7 +726,6 @@ def shade_no_effort( def _draw_effort_spans( ax: plt.Axes, effort_index: DatetimeIndex, - bar_width: Timedelta, width_days: float, *, facecolor: str, @@ -730,7 +734,7 @@ def _draw_effort_spans( ) -> None: """Draw vertical lines for effort plot.""" for ts in effort_index: - start = mdates.date2num(ts - bar_width) + start = mdates.date2num(ts) ax.axvspan( start, start + width_days, From 4c2fc0d62f66693ff6247172bb26eda8a57848f7 Mon Sep 17 00:00:00 2001 From: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com> Date: Tue, 20 Jan 2026 14:39:12 +0100 Subject: [PATCH 26/33] ruff import --- src/post_processing/utils/core_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/post_processing/utils/core_utils.py b/src/post_processing/utils/core_utils.py index 7951192..c149155 100644 --- a/src/post_processing/utils/core_utils.py +++ b/src/post_processing/utils/core_utils.py @@ -35,7 +35,7 @@ ) if TYPE_CHECKING: - from datetime import datetime, timedelta, tzinfo + from datetime import tzinfo from pathlib import Path import matplotlib.pyplot as plt From c0e968f178caba05e4b8205c8b82f0b265b727e8 Mon Sep 17 00:00:00 2001 From: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com> Date: Tue, 20 Jan 2026 16:24:19 +0100 Subject: [PATCH 27/33] RecordingPeriod hotfix --- .../dataclass/recording_period.py | 20 +++++++++---------- src/post_processing/utils/plot_utils.py | 11 ++-------- tests/test_recording_period.py | 20 ++++++++++--------- 3 files changed, 23 insertions(+), 28 deletions(-) diff --git a/src/post_processing/dataclass/recording_period.py b/src/post_processing/dataclass/recording_period.py index 637733e..f19fab2 100644 --- a/src/post_processing/dataclass/recording_period.py +++ b/src/post_processing/dataclass/recording_period.py @@ -9,14 +9,15 @@ from typing import 
TYPE_CHECKING from pandas import ( + IntervalIndex, Series, Timedelta, date_range, - interval_range, read_csv, to_datetime, ) +from post_processing.utils.core_utils import round_begin_end_timestamps from post_processing.utils.filtering_utils import ( find_delimiter, ) @@ -101,7 +102,7 @@ def from_path( msg = f"CSV is missing required columns: {', '.join(sorted(missing))}" raise ValueError(msg) - # Normalize timezones: convert to UTC, then remove tz info (naive) + # Normalise timezones: convert to UTC, then remove tz info (naive) for col in [ "start_recording", "end_recording", @@ -134,11 +135,11 @@ def from_path( freq=origin, ) - # Initialize effort vector (0 = no recording, 1 = recording) + # Initialise effort vector (0 = no recording, 1 = recording) # Compare each timestamp to all intervals in a vectorized manner effort = Series(0, index=time_index) - # Vectorized interval coverage + # Vectorised interval coverage t_vals = time_index.to_numpy()[:, None] start_vals = df["effective_start_recording"].to_numpy() end_vals = df["effective_end_recording"].to_numpy() @@ -148,13 +149,12 @@ def from_path( effort[:] = covered.any(axis=1).astype(int) # Aggregate effort into user-defined bin_size - counts = effort.resample(bin_size).sum() + counts = effort.resample(bin_size, closed="left", label="left").sum() - # Replace index with IntervalIndex for downstream compatibility - counts.index = interval_range( - start=counts.index[0], - periods=len(counts), - freq=bin_size, + counts.index = IntervalIndex.from_arrays( + counts.index, + counts.index + + round_begin_end_timestamps(list(counts.index), bin_size)[-1], closed="left", ) diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index c4f79fb..3a7f486 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -660,18 +660,11 @@ def shade_no_effort( index=[i.left for i in observed.counts.index], ) - effort_by_end = Series( - observed.counts.values, - index=[i.right for i in observed.counts.index], - ) - - combined_effort = 0.5 * effort_by_start.add(effort_by_end, fill_value=0) - - bar_width = combined_effort.index[1] - combined_effort.index[0] + bar_width = effort_by_start.index[1] - effort_by_start.index[0] width_days = bar_width.total_seconds() / 86400 max_effort = bar_width / observed.timebin_origin - effort_fraction = combined_effort / max_effort + effort_fraction = effort_by_start / max_effort first_elem = Series([0], index=[effort_fraction.index[0] - bar_width]) last_elem = Series([0], index=[effort_fraction.index[-1] + bar_width]) diff --git a/tests/test_recording_period.py b/tests/test_recording_period.py index 41b9e59..73dd55e 100644 --- a/tests/test_recording_period.py +++ b/tests/test_recording_period.py @@ -1,10 +1,11 @@ -import pandas as pd +from pandas import Interval, Timestamp from pandas.tseries import frequencies +from post_processing.dataclass.detection_filter import DetectionFilter from post_processing.dataclass.recording_period import RecordingPeriod -def test_recording_period_with_gaps(recording_planning_config): +def test_recording_period_with_gaps(recording_planning_config: DetectionFilter) -> None: """RecordingPeriod correctly represents long gaps with no recording effort. 
The planning contains two recording blocks separated by ~3 weeks with no @@ -27,22 +28,23 @@ def test_recording_period_with_gaps(recording_planning_config): assert counts.index.is_interval() assert counts.min() >= 0 - # One week = 7 * 24 hours (origin = 1min) + # One week = 7 * 24 hours (origin = 1 min) full_week_minutes = 7 * 24 * 60 # ------------------------------------------------------------------ # Helper: find the bin covering a given timestamp # ------------------------------------------------------------------ - def bin_covering(ts: pd.Timestamp) -> pd.Interval: + def bin_covering(ts: Timestamp) -> Interval: for interval in counts.index: if interval.left <= ts < interval.right: return interval - raise AssertionError(f"No bin covers timestamp {ts}") + msg = f"No bin covers timestamp {ts}" + raise AssertionError(msg) # ------------------------------------------------------------------ # Week fully inside the long gap → zero effort # ------------------------------------------------------------------ - gap_ts = pd.Timestamp("2024-04-21") + gap_ts = Timestamp("2024-04-21") gap_bin = bin_covering(gap_ts) assert counts.loc[gap_bin] == 0 @@ -50,7 +52,7 @@ def bin_covering(ts: pd.Timestamp) -> pd.Interval: # ------------------------------------------------------------------ # Week fully inside recording → full effort # ------------------------------------------------------------------ - full_effort_ts = pd.Timestamp("2024-02-04") + full_effort_ts = Timestamp("2024-02-04") full_bin = bin_covering(full_effort_ts) assert counts.loc[full_bin] == full_week_minutes @@ -58,7 +60,7 @@ def bin_covering(ts: pd.Timestamp) -> pd.Interval: # ------------------------------------------------------------------ # Week overlapping recording stop → partial effort # ------------------------------------------------------------------ - partial_ts = pd.Timestamp("2024-04-14") + partial_ts = Timestamp("2024-04-14") partial_bin = bin_covering(partial_ts) - assert counts.loc[partial_bin] == 1560 + assert counts.loc[partial_bin] == 0 From dcb70fb141760c4d75a1678c2e3fd8ad7257a972 Mon Sep 17 00:00:00 2001 From: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com> Date: Tue, 20 Jan 2026 16:25:29 +0100 Subject: [PATCH 28/33] syntax --- src/post_processing/dataclass/recording_period.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/post_processing/dataclass/recording_period.py b/src/post_processing/dataclass/recording_period.py index f19fab2..d0d48b7 100644 --- a/src/post_processing/dataclass/recording_period.py +++ b/src/post_processing/dataclass/recording_period.py @@ -42,7 +42,7 @@ def from_path( *, bin_size: Timedelta | BaseOffset, ) -> RecordingPeriod: - """Vectorized creation of recording coverage from CSV with start/end datetimes. + """Vectorised creation of recording coverage from CSV with start/end datetimes. This method reads a CSV with columns: - "start_recording" @@ -61,7 +61,7 @@ def from_path( - `timestamp_file`: path to CSV - `timebin_origin`: Timedelta resolution of detections bin_size : Timedelta or BaseOffset - Size of the aggregation bin (e.g., Timedelta("1H") or "1D"). + Size of the aggregation bin (e.g. Timedelta("1H") or "1D"). 
Returns ------- @@ -121,7 +121,9 @@ def from_path( ].min(axis=1) # Remove rows with no actual recording interval - df = df.loc[df["effective_start_recording"] < df["effective_end_recording"]].copy() + df = df.loc[ + df["effective_start_recording"] < df["effective_end_recording"] + ].copy() if df.empty: msg = "No valid recording intervals after deployment intersection." @@ -136,7 +138,7 @@ def from_path( ) # Initialise effort vector (0 = no recording, 1 = recording) - # Compare each timestamp to all intervals in a vectorized manner + # Compare each timestamp to all intervals in a vectorised manner effort = Series(0, index=time_index) # Vectorised interval coverage From 47b170d89ecc620618bfd0e24533f7a0776d1202 Mon Sep 17 00:00:00 2001 From: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com> Date: Tue, 20 Jan 2026 17:20:43 +0100 Subject: [PATCH 29/33] legend histo shade --- src/post_processing/utils/plot_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index 3a7f486..8d12fa3 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -149,6 +149,7 @@ def histo( shade_no_effort( ax=ax, observed=effort, + legend=legend, ) if season: @@ -642,6 +643,7 @@ def set_plot_title(ax: plt.Axes, annotators: list[str], labels: list[str]) -> No def shade_no_effort( ax: plt.Axes, observed: RecordingPeriod, + legend: bool, ) -> None: """Shade areas of the plot where no observation effort was made. @@ -652,6 +654,8 @@ def shade_no_effort( observed : RecordingPeriod A Series with observation counts or flags, indexed by datetime. Should be aligned or re-indexable to `bin_starts`. + legend : bool + Wether to add the legend entry for the shaded regions. """ # Convert effort IntervalIndex → DatetimeIndex (bin starts) @@ -704,7 +708,7 @@ def shade_no_effort( handles_effort.append( Patch(facecolor="0.45", alpha=0.15, label="no data"), ) - if handles_effort: + if handles_effort and legend: labels_effort = [h.get_label() for h in handles_effort] handles = handles1 + handles_effort labels = labels1 + labels_effort From ef0e2b6474f8044de561f39a70372178892de9e5 Mon Sep 17 00:00:00 2001 From: MaelleTtrt <48910568+MaelleTtrt@users.noreply.github.com> Date: Thu, 22 Jan 2026 11:52:18 +0100 Subject: [PATCH 30/33] improve test_recording_period --- tests/test_recording_period.py | 82 ++++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 34 deletions(-) diff --git a/tests/test_recording_period.py b/tests/test_recording_period.py index 73dd55e..064c6b5 100644 --- a/tests/test_recording_period.py +++ b/tests/test_recording_period.py @@ -1,5 +1,4 @@ -from pandas import Interval, Timestamp -from pandas.tseries import frequencies +from pandas import Timedelta, read_csv, to_datetime from post_processing.dataclass.detection_filter import DetectionFilter from post_processing.dataclass.recording_period import RecordingPeriod @@ -14,53 +13,68 @@ def test_recording_period_with_gaps(recording_planning_config: DetectionFilter) - weeks with partial effort, - weeks with zero effort. 
""" + histo_x_bin_size = Timedelta("7D") recording_period = RecordingPeriod.from_path( config=recording_planning_config, - bin_size=frequencies.to_offset("1W"), + bin_size=histo_x_bin_size, ) counts = recording_period.counts + origin = recording_planning_config.timebin_origin + nb_timebin_origin_per_histo_x_bin_size = int(histo_x_bin_size / origin) + # Computes effective recording intervals from recording planning csv + df_planning = read_csv( + recording_planning_config.timestamp_file, + parse_dates=[ + "start_recording", + "end_recording", + "start_deployment", + "end_deployment", + ], + ) + for col in [ + "start_recording", + "end_recording", + "start_deployment", + "end_deployment", + ]: + df_planning[col] = ( + to_datetime(df_planning[col], utc=True) + .dt.tz_convert(None) + ) + + df_planning["start"] = df_planning[ + ["start_recording", "start_deployment"] + ].max(axis=1) + df_planning["end"] = df_planning[ + ["end_recording", "end_deployment"] + ].min(axis=1) + + planning = df_planning.loc[df_planning["start"] < df_planning["end"]] # ------------------------------------------------------------------ # Structural checks # ------------------------------------------------------------------ assert not counts.empty assert counts.index.is_interval() assert counts.min() >= 0 - - # One week = 7 * 24 hours (origin = 1 min) - full_week_minutes = 7 * 24 * 60 - - # ------------------------------------------------------------------ - # Helper: find the bin covering a given timestamp - # ------------------------------------------------------------------ - def bin_covering(ts: Timestamp) -> Interval: - for interval in counts.index: - if interval.left <= ts < interval.right: - return interval - msg = f"No bin covers timestamp {ts}" - raise AssertionError(msg) + assert counts.max() <= nb_timebin_origin_per_histo_x_bin_size # ------------------------------------------------------------------ - # Week fully inside the long gap → zero effort + # Find overlap (number of timebin_origin) within each effective recording period # ------------------------------------------------------------------ - gap_ts = Timestamp("2024-04-21") + for interval in counts.index: + bin_start = interval.left + bin_end = interval.right - gap_bin = bin_covering(gap_ts) - assert counts.loc[gap_bin] == 0 + # Compute overlap with all recording intervals + overlap_start = planning["start"].clip(lower=bin_start, upper=bin_end) + overlap_end = planning["end"].clip(lower=bin_start, upper=bin_end) - # ------------------------------------------------------------------ - # Week fully inside recording → full effort - # ------------------------------------------------------------------ - full_effort_ts = Timestamp("2024-02-04") - - full_bin = bin_covering(full_effort_ts) - assert counts.loc[full_bin] == full_week_minutes - - # ------------------------------------------------------------------ - # Week overlapping recording stop → partial effort - # ------------------------------------------------------------------ - partial_ts = Timestamp("2024-04-14") + overlap = (overlap_end - overlap_start).clip(lower=Timedelta(0)) + expected_minutes = int(overlap.sum() / recording_planning_config.timebin_origin) - partial_bin = bin_covering(partial_ts) - assert counts.loc[partial_bin] == 0 + assert counts.loc[interval] == expected_minutes, ( + f"Mismatch for bin {interval}: " + f"expected {expected_minutes}, got {counts.loc[interval]}" + ) From 2a5a8c839088880a1f2cc307f07960eff8af925b Mon Sep 17 00:00:00 2001 From: Mathieu Dupont 
<108517594+mathieudpnt@users.noreply.github.com> Date: Mon, 26 Jan 2026 17:03:51 +0100 Subject: [PATCH 31/33] refacto --- src/post_processing/dataclass/data_aplose.py | 23 +- src/post_processing/utils/core_utils.py | 44 +- src/post_processing/utils/filtering_utils.py | 25 +- src/post_processing/utils/fpod_utils.py | 691 ++++--------------- src/post_processing/utils/glider_utils.py | 2 +- src/post_processing/utils/plot_utils.py | 2 +- tests/test_fpod_utils.py | 4 +- user_case/example_FPOD-CPOD_raw.ipynb | 10 +- user_case/resource/CPOD-FPOD_yaml.yml | 6 +- user_case/user_case_CALAIS.ipynb | 17 +- 10 files changed, 207 insertions(+), 617 deletions(-) diff --git a/src/post_processing/dataclass/data_aplose.py b/src/post_processing/dataclass/data_aplose.py index e1d3cae..ce2d9da 100644 --- a/src/post_processing/dataclass/data_aplose.py +++ b/src/post_processing/dataclass/data_aplose.py @@ -431,14 +431,15 @@ def plot( season = kwargs.get("season", False) effort = kwargs.get("effort") - return scatter(df=df_filtered, - ax=ax, - time_range=time, - show_rise_set=show_rise_set, - season=season, - coordinates=self.coordinates, - effort=effort, - ) + return scatter( + df=df_filtered, + ax=ax, + time_range=time, + show_rise_set=show_rise_set, + season=season, + coordinates=self.coordinates, + effort=effort, + ) if mode == "agreement": bin_size = kwargs.get("bin_size") @@ -452,7 +453,11 @@ def plot( label, ) - return timeline(df=df_filtered, ax=ax, color=color) + return timeline( + df=df_filtered, + ax=ax, + color=color, + ) msg = f"Unsupported plot mode: {mode}" raise ValueError(msg) diff --git a/src/post_processing/utils/core_utils.py b/src/post_processing/utils/core_utils.py index c149155..d0125ae 100644 --- a/src/post_processing/utils/core_utils.py +++ b/src/post_processing/utils/core_utils.py @@ -8,7 +8,8 @@ import astral import easygui import numpy as np -from astral.sun import sunrise, sunset +from astral import LocationInfo +from astral.sun import sunrise, sunset, sun from matplotlib import pyplot as plt from osekit.config import TIMESTAMP_FORMAT_AUDIO_FILE from osekit.utils.timestamp_utils import strftime_osmose_format, strptime_from_text @@ -579,3 +580,44 @@ def timedelta_to_str(td: Timedelta) -> str: if seconds % 60 == 0: return f"{seconds // 60}min" return f"{seconds}s" + + +def assign_light_regime( + ts: Timestamp, + lat: float | None = None, + lon: float | None = None, +) -> DataFrame: + """Assign daylight regime to temporal events. + + Parameters + ---------- + ts: Timestamp + Timestamp to assign a light regime to. + lat: float + The latitude of corresponding point. + lon: float + The longitude of corresponding point. + + Returns + ------- + DataFrame + The same dataframe with the column daytime. + + """ + if not all([lat, lon]): + lat, lon = get_coordinates() + + # Get sun times for given location + location = LocationInfo(latitude=lat, longitude=lon) + s = sun(location.observer, date=ts.date()) + + if ts < s['dawn']: + return 'night' + elif ts < s['sunrise']: + return 'dawn' + elif ts < s['sunset']: + return 'day' + elif ts < s['dusk']: + return 'dusk' + else: + return 'night' diff --git a/src/post_processing/utils/filtering_utils.py b/src/post_processing/utils/filtering_utils.py index faf2fd7..661324b 100644 --- a/src/post_processing/utils/filtering_utils.py +++ b/src/post_processing/utils/filtering_utils.py @@ -510,8 +510,8 @@ def reshape_timebin( timebin_new: Timedelta The size of the new time bin. 
timestamp_audio: list[Timestamp] - A list of Timestamp objects corresponding to the start of each wav - that corresponds to a detection + A list of Timestamp objects corresponding to the shape + in which the data should be reshaped. Returns ------- @@ -571,17 +571,16 @@ def get_filename_timestamps(df: DataFrame, date_parser: str) -> list[Timestamp]: """ tz = get_timezone(df) - timestamps = [ - strptime_from_text( - ts, - datetime_template=date_parser, - ) for ts in df["filename"] - ] - - if all(t.tz is None for t in timestamps): - timestamps = [t.tz_localize(tz) for t in timestamps] - - return timestamps + try: + return [ + to_datetime( + ts, + format=date_parser, + ).tz_localize(tz) for ts in df["filename"] + ] + except ValueError: + msg = """Could not parse timestamps from `df["filename"]`.""" + raise ValueError(msg) from None def ensure_in_list(value: str, candidates: list[str], label: str) -> None: diff --git a/src/post_processing/utils/fpod_utils.py b/src/post_processing/utils/fpod_utils.py index d36bb89..12cee28 100644 --- a/src/post_processing/utils/fpod_utils.py +++ b/src/post_processing/utils/fpod_utils.py @@ -15,9 +15,10 @@ from pandas import ( DataFrame, DateOffset, + Series, Timedelta, + Timestamp, concat, - date_range, notna, read_csv, to_datetime, @@ -25,7 +26,7 @@ to_timedelta, ) -from post_processing.utils.core_utils import get_coordinates, get_sun_times +from post_processing.utils.filtering_utils import find_delimiter from user_case.config import season_color, site_colors if TYPE_CHECKING: @@ -40,9 +41,9 @@ def pod2aplose( dataset_name: str, annotation: str, annotator: str, - bin_size: int = 60, + bin_size: Timedelta, ) -> DataFrame: - """Format PODs DataFrame to match APLOSE format. + """Format PODs DataFrame to match an APLOSE format. Parameters ---------- @@ -56,7 +57,7 @@ def pod2aplose( annotation name. annotator: str annotator name. - bin_size: int + bin_size: Timedelta Duration of the detections in seconds. Returns @@ -65,22 +66,19 @@ def pod2aplose( An APLOSE formatted DataFrame. """ - df = df.copy() - df["_temp_dt"] = [ - tz.localize(strptime_from_text(entry, "%d/%m/%Y %H:%M")) - for entry in df["ChunkEnd"] + df["Datetime"] = [ + tz.localize(entry) + for entry in df["Datetime"] ] - df = df.sort_values("_temp_dt").reset_index(drop=True) - - fpod_start_dt = df["_temp_dt"].tolist() - fpod_end_dt = [entry + Timedelta(seconds=bin_size) for entry in fpod_start_dt] + fpod_start_dt = df["Datetime"] + fpod_end_dt = [entry + bin_size for entry in df["Datetime"]] data = { "dataset": [dataset_name] * len(df), "filename": [strftime_osmose_format(entry) for entry in fpod_start_dt], "start_time": [0] * len(df), - "end_time": [bin_size] * len(df), + "end_time": [bin_size.total_seconds()] * len(df), "start_frequency": [0] * len(df), "end_frequency": [0] * len(df), "annotation": [annotation] * len(df), @@ -88,304 +86,121 @@ def pod2aplose( "start_datetime": [strftime_osmose_format(entry) for entry in fpod_start_dt], "end_datetime": [strftime_osmose_format(entry) for entry in fpod_end_dt], "is_box": [0] * len(df), - "deploy.name": df["deploy.name"].tolist(), + "deploy": df["Deploy"].tolist(), } return DataFrame(data) -def csv_folder( - folder_path: Path, - sep: str = ";", - encoding: str = "latin-1", +def load_pod_folder( + folder: Path, + ext: str, ) -> DataFrame: - """Process all CSV files from a folder. + """Read POD's result files from a folder. Parameters ---------- - folder_path: Path + folder: Path Folder's place. - sep: str, default=";" - Column separator. 
- encoding: str, default="latin-1" - File encoding. + ext: str + File extension of result files. Returns ------- DataFrame - Concatenated data with optional filename column. + Concatenated data. Raises ------ ValueError - If no CSV files found. + If no result files are found. """ - all_files = list(folder_path.rglob("*.csv")) - - if not all_files: - msg = f"No .csv files found in {folder_path}" + if ext not in {"csv", "txt"}: + msg = f"Invalid file extension: {ext}" raise ValueError(msg) - all_data = [] - for file in all_files: - df = read_csv(file, sep=sep, encoding=encoding, dtype="O") - df["deploy.name"] = file.stem - all_data.append(df) - - return concat(all_data, ignore_index=True) - - -def txt_folder( - folder_path: Path, - sep: str = "\t", -) -> DataFrame: - r"""Process all TXT files from a folder. - - Parameters - ---------- - folder_path: Path - Folder's place. - sep: str, default="\t" - Column separator. - - Returns - ------- - DataFrame - Concatenated data from all TXT files. - - """ - all_files = list(Path(folder_path).rglob("*.txt")) + all_files = sorted(folder.rglob(f"*.{ext}")) if not all_files: - msg = f"No .txt files found in {folder_path}" + msg = f"No .{ext} files found in {folder}" raise ValueError(msg) all_data = [] for file in all_files: - file_path = folder_path / file - df = read_csv(file_path, sep=sep) - all_data.append(df) - - return concat(all_data, ignore_index=True) - + sep = find_delimiter(file) + df = read_csv( + file, + sep=sep, + ) -def parse_timestamps( - df: DataFrame, - col_timestamp: str, - date_formats: list[str] | None = None, -) -> DataFrame: - """Parse timestamp column with multiple possible formats. + df["Deploy"] = file.stem.strip().lower().replace(" ", "_") + all_data.append(df) - Parameters - ---------- - df: DataFrame - Input dataframe. - col_timestamp: str - Name of the timestamp column to parse. - date_formats: list[str], optional - List of strptime formats to try. If None, uses common formats. + data = concat(all_data, ignore_index=True) - Returns - ------- - DataFrame - Copy of df with parsed timestamps. + if ext == "csv": + if "%TimeLost" in data.columns: + data_filtered = data[data["File"].notna()].copy() + data_filtered = data_filtered[data_filtered["Nall/m"].notna()] + else: + data_filtered = data[data["DPM"] > 0].copy() + data_filtered = data_filtered[data_filtered["Nall"].notna()] - """ - if date_formats is None: - date_formats = [ - "%Y-%m-%dT%H:%M:%S.%f%z", - "%Y-%m-%dT%H:%M:%S:%Z", - "%Y-%m-%dT%H:%M:%S.%f", - "%Y-%m-%dT%H:%M:%S", - "%Y-%m-%d %H:%M:%S.%f", - "%d/%m/%Y %H:%M", + data_filtered["Datetime"] = [ + strptime_from_text(dt, "%d/%m/%Y %H:%M") for dt in data_filtered["ChunkEnd"] ] + return data_filtered.sort_values(by=["Datetime"]).reset_index(drop=True) - df = df.copy() - df[col_timestamp] = df[col_timestamp].apply( - lambda x: strptime_from_text(x, date_formats)) - return df - - -def required_columns( - df: DataFrame, - columns: list[str], -) -> None: - """Validate that required columns exist in dataframe. - - Parameters - ---------- - df: DataFrame - Table to validate. - columns : list[str] - List of required column names. - - Raises - ------ - ValueError - If any required column is missing. 
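# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the patch: one possible way to call the new
# load_pod_folder() introduced above. The folder path is a made-up placeholder;
# only the "csv" and "txt" extensions are accepted by the helper.
from pathlib import Path

from post_processing.utils.fpod_utils import load_pod_folder

# Concatenate every *.csv export found under the folder, tag each row with a
# "Deploy" name derived from the file stem, keep the relevant rows depending on
# the available columns, and parse "ChunkEnd" into a sorted "Datetime" column.
dpm = load_pod_folder(Path("path/to/pod_csv_exports"), "csv")
print(dpm[["Datetime", "Deploy"]].head())
# ---------------------------------------------------------------------------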
+ if ext == "txt": + pod_type = {Path(f).suffix.lower().strip(".p3") for f in data["File"]} - """ - for col in columns: - if col not in df.columns: - msg = f"'{col}' not found in {df}" + if len(pod_type) != 1: + msg = f"Multiple POD types found in {folder}: {pod_type}" raise ValueError(msg) + pod_type = pod_type.pop() -def create_mask( - df: DataFrame, - col_timestamp: str, - col_start: str, - col_end: str, -) -> DataFrame: - """Filter rows to keep only those within deployment period. - - Parameters - ---------- - df : DataFrame - Table with timestamp and deployment period columns. - col_timestamp : str - Name of timestamp column. - col_start : str - Name of deployment start date column. - col_end : str - Name of deployment end date column. - - Returns - ------- - DataFrame - Filtered dataframe with rows in deployment periods. - - """ - mask = ( - (df[col_timestamp] >= df[col_start]) - & (df[col_timestamp] <= df[col_end]) - & df[col_timestamp].notna() - & df[col_start].notna() - & df[col_end].notna() - ) - return df[mask].copy() - - -def meta_cut_aplose( - raw_data: DataFrame, - metadata: DataFrame, -) -> DataFrame: - """Filter data to keep only rows within deployment periods. - - Parameters - ---------- - raw_data : DataFrame - Table containing deployment name and timestamps. - metadata : DataFrame - Metadata with deployment periods (start/end dates). - - Returns - ------- - DataFrame - Filtered data with only rows within deployment periods. - - """ - required_columns( - raw_data, ["deploy.name", "start_datetime"]) - required_columns( - metadata, ["deploy.name", "deployment_date", "recovery_date"]) - - raw = parse_timestamps(raw_data, "start_datetime") - raw = raw.sort_values(["start_datetime"]) - - dfm = raw.merge( - metadata[["deploy.name", "deployment_date", "recovery_date"]], - on="deploy.name", - how="left", - ) - - out = create_mask(dfm, "start_datetime", "deployment_date", "recovery_date") - - columns_to_drop = [ - col for col in ["deployment_date", "recovery_date"] if col not in raw_data. - columns] - if columns_to_drop: - out = out.drop(columns=columns_to_drop) - - return out.sort_values(["start_datetime"]).reset_index(drop=True) - - -def add_utc( - df: DataFrame, - cols: list, - fr: str = "h", -) -> DataFrame: - """Create a DataFrame with one line per hour between start and end dates. - - Keep the number of detections per hour between these dates. - - Parameters - ---------- - df: DataFrame - Metadata dataframe with deployments information (previously exported as json). - cols:list - Timestamp column names. - fr:str - Frequency of the range of detections. - - Returns - ------- - DataFrame - A full period of time with positive and negative hours to detections. - - """ - for col in df[cols]: - df[col] = to_datetime(df[col], utc=True) - df[col] = df[col].dt.floor(fr) - return df - - -def build_range( - df: DataFrame, - fr: str = "h", -) -> DataFrame: - """Create a DataFrame with one line per hour between start and end dates. + data["Datetime"] = data.apply( + lambda row: get_feeding_buzz_datetime(row, pod=f"{pod_type}"), + axis=1, + ) + return data.sort_values(by=["Datetime"]).reset_index(drop=True) - Keep the number of detections per hour between these dates. + msg = f"Could not load {ext} result folder" + raise ValueError(msg) - Parameters - ---------- - df: DataFrame - Metadata dataframe with deployments information (previously exported as json) - fr:str - Frequency of the range of detections. 
- Returns - ------- - DataFrame - A full period of time with positive and negative hours to detections. +def get_feeding_buzz_datetime(row: Series, pod: str) -> Timestamp: + """Convert feeding buzz timestamp into a standard Timestamp. + The conversion method differs based on the POD type. """ - add_utc(df, ["Deb", "Fin"], fr) + if pod not in {"c", "f"}: + msg = f"Invalid POD type: {pod}" + raise ValueError(msg) - all_ranges = [] - for _, row in df.iterrows(): - hours = date_range(row["Deb"], row["Fin"], freq=fr) - tmp = DataFrame( - { - "deploy.name": row["deploy.name"], - "start_datetime": hours, - }, + if pod == "f": + return ( + to_datetime("1900-01-01") + + to_timedelta(row["Minute"], unit="min") + + to_timedelta(row["microsec"] / 1e6, unit="sec") - + to_timedelta(2, unit="D") ) - all_ranges.append(tmp) - return concat(all_ranges, ignore_index=True) + return strptime_from_text( + f"{row["Minute"]}:{int(str(row["microsec"])[0]):02d}.{int(str(row["microsec"])[1:])}", + "%-d/%-m/%Y %H:%M:%S.%f", + ) -def feeding_buzz( +def process_feeding_buzz( df: DataFrame, species: str, ) -> DataFrame: - """Process a CPOD/FPOD feeding buzz detection file. + """Process a POD feeding buzz detection DataFrame. - Gives the feeding buzz duration, depending on the studied species. + Give the feeding buzz duration, depending on the studied species. Parameters ---------- @@ -397,322 +212,69 @@ def feeding_buzz( Returns ------- DataFrame - Containing all ICIs for every positive minutes to clicks + Containing all ICIs for every positive minute to click """ - df["microsec"] /= 1e6 - - try: - df["Minute"].astype(int) - df["datetime"] = ( - to_datetime("1900-01-01") - + to_timedelta(df["Minute"], unit="min") - + to_timedelta(df["microsec"], unit="sec") - - to_timedelta(2, unit="D") - ) - df["start_datetime"] = df["datetime"].dt.floor("min") - except (ValueError, TypeError): - df["datetime"] = ( - to_datetime(df["Minute"], dayfirst=True) - + to_timedelta(df["microsec"], unit="sec") - ) - df["start_datetime"] = to_datetime(df["Minute"], dayfirst=True) - - df["ICI"] = df["datetime"].diff() - - if species == "Dauphin": # Herzing et al., 2014 - df["Buzz"] = df["ICI"].between(Timedelta(0), - Timedelta(seconds=0.02)).astype(int) - elif species == "Marsouin": # Nuuttila et al., 2013 - df["Buzz"] = df["ICI"].between(Timedelta(0), - Timedelta(seconds=0.01)).astype(int) - elif species == "Commerson": # Reyes Reyes et al., 2015 - df["Buzz"] = df["ICI"].between(Timedelta(0), - Timedelta(seconds=0.005)).astype(int) + df["Datetime"] = df["Datetime"].dt.floor("min") + df["ICI"] = df["Datetime"].diff() + + if species.lower() == "delphinid": # Herzing et al., 2014 + df["Buzz"] = df["ICI"].between( + Timedelta(0), + Timedelta(seconds=0.02), + ).astype(int) + elif species.lower() == "porpoise": # Nuuttila et al., 2013 + df["Buzz"] = df["ICI"].between( + Timedelta(0), + Timedelta(seconds=0.01), + ).astype(int) + elif species.lower() == "commerson": # Reyes Reyes et al., 2015 + df["Buzz"] = df["ICI"].between( + Timedelta(0), + Timedelta(seconds=0.005), + ).astype(int) else: msg = "This species is not supported" raise ValueError(msg) - f = df.groupby(["start_datetime"])["Buzz"].sum().reset_index() + df_buzz = df.groupby(["Datetime"])["Buzz"].sum().reset_index() + df_buzz["Foraging"] = to_numeric( + df_buzz["Buzz"] != 0, downcast="integer" + ).astype(int) - f["Foraging"] = to_numeric(f["Buzz"] != 0, downcast="integer").astype(int) + return df_buzz - return f +def process_timelost(df: DataFrame, threshold: int = 0) -> DataFrame: + 
"""Process TimeLost DataFrame. -def assign_daytime( - df: DataFrame, -) -> DataFrame: - """Assign datetime categories to temporal events. - - Categorize daytime of the detection (among 4 categories). + Returns relevant columns and reshape into hourly data. Parameters ---------- df: DataFrame - Contains positive hours to detections. - - Returns - ------- - DataFrame - The same dataframe with the column daytime. - - """ - df["start_datetime"] = to_datetime(df["start_datetime"], utc=True) - start = df["start_datetime"].min() - stop = df["start_datetime"].max() - lat, lon = get_coordinates() - sunrise, sunset = get_sun_times(start, stop, lat, lon) - - sun_times = DataFrame( - {"date": date_range(start, stop, freq="D"), - "sunrise": [Timedelta(h, "hours") for h in sunrise], - "sunset": [Timedelta(h, "hours") for h in sunset], - }) - - sun_times["sunrise"] = sun_times["date"].dt.floor("D") + sun_times["sunrise"] - sun_times["sunset"] = sun_times["date"].dt.floor("D") + sun_times["sunset"] - - for i, row in df.iterrows(): - dpm_i = row["start_datetime"] - if notna(dpm_i): # Check if time is not NaN - jour_i = sun_times[ - (sun_times["sunrise"].dt.year == dpm_i.year) - & (sun_times["sunrise"].dt.month == dpm_i.month) - & (sun_times["sunrise"].dt.day == dpm_i.day) - ] - if not jour_i.empty: # Ensure there's a matching row - jour_i = jour_i.iloc[0] # Extract first match - if (dpm_i <= jour_i["sunrise"]) | (dpm_i > jour_i["sunset"]): - df.loc[i, "REGIME"] = 1 - else: - df.loc[i, "REGIME"] = 2 - - return df - - -def is_dpm_col( - df: DataFrame, -) -> DataFrame: - """Ensure DPM column exists with default value of 1. - - Parameters - ---------- - df: DataFrame - Input dataframe. - - Returns - ------- - DataFrame - Copy of df with DPM column. - - """ - df = df.copy() - if "DPM" not in df.columns: - df["DPM"] = 1 - return df - - -def resample_dpm( - df: DataFrame, - frq: str, - cols: dict[str, str], - group_by: list[str] | None = None, -) -> DataFrame: - """Resample DPM data to specified time frequency. - - Aggregates Detection Positive Minutes (DPM) by time period, - optionally preserving grouping columns like deployment name. - - Parameters - ---------- - df: DataFrame - CPOD result DataFrame with DPM data. - frq: str - Pandas frequency string: "D" (day), "h" (hour), "10min", etc. - cols: dict[str, str] - Dictionary of column names and to process them. - group_by: list[str], optional - Columns to group by (e.g., ["deploy.name", "start_datetime"]). - If None, groups only by start_datetime. - - Returns - ------- - DataFrame - Resampled DataFrame with aggregated DPM values. - - Examples - -------- - >>> # Daily aggregation per deployment - >>> resample_dpm(df, "D", {"Foraging":"sum"}, group_by=["deploy.name"]) - - >>> # Hourly aggregation with site info preserved - >>> resample_dpm(df, "h", cols={"DPM":"sum","deploy.name":"first"}) - - """ - df = is_dpm_col(df) - df = add_utc(df, ["start_datetime"], frq) - - # Determine grouping columns - if group_by is None: - group_by = ["start_datetime"] - - return df.groupby(group_by).agg(cols).reset_index() - - -def deploy_period( - df: DataFrame, - col_timestamp: str = "start_datetime", - col_deployment: str = "deploy.name", -) -> DataFrame: - """Extract start and end timestamps for each deployment. - - Parameters - ---------- - df: DataFrame - Input dataframe with parsed timestamps. - col_timestamp: str, default="start_datetime" - Name of the timestamp column. - col_deployment: str, default="deploy.name" - Name of the deployment identifier column. 
- - Returns - ------- - DataFrame - Table with columns: [col_deployment, 'Deb', 'Fin']. - - """ - return ( - df.groupby([col_deployment]) - .agg(Deb=(col_timestamp, "first"), Fin=(col_timestamp, "last")) - .reset_index() - ) - - -def first_last( - df: DataFrame, - col_timestamp: str = "start_datetime", - col_deployment: str = "deploy.name", - date_formats: list[str] | None = None, -) -> DataFrame: - """Isolate beginning and end of every deployment. - - Parameters - ---------- - df: DataFrame - CPOD result DataFrame. - col_timestamp: str, default="start_datetime" - Name of the timestamps' column. - col_deployment: str, default="deploy.name" - Name of the deployment identifier column. - date_formats: list[str], optional - List of date formats to try for parsing. - - Returns - ------- - DataFrame - Table with deployment periods (Deb, Fin). - - """ - df_parsed = parse_timestamps(df, col_timestamp, date_formats) - return deploy_period(df_parsed, col_timestamp, col_deployment) - - -def actual_data( - df: DataFrame, - meta: DataFrame, -) -> DataFrame: - """Create a table with beginning and end of every deployment using metadata. - - Parameters - ---------- - df: DataFrame - Contains beginning and end for every deployment. - meta: DataFrame - Contains metadata for every deployment. - - Returns - ------- - DataFrame - Table with corrected deployment periods (Deb, Fin). - - """ - required_columns( - df, ["deploy.name", "ChunkEnd"]) - required_columns( - meta, ["deploy.name", "deployment_date", "recovery_date"]) - - beg_end = first_last(df, "ChunkEnd") - - beg_end = add_utc(beg_end, ["Deb", "Fin"]) - - final = beg_end.merge(meta[["deployment_date", "recovery_date", "deploy.name"]], - on="deploy.name", how="left") - final.loc[final["Deb"] < final["deployment_date"], "Deb"] = final["deployment_date"] - final.loc[final["Fin"] > final["recovery_date"], "Fin"] = final["recovery_date"] - final.loc[final["Deb"] > final["Fin"], ["Deb", "Fin"]] = None - final = final.sort_values(by=["Deb"]) - return final.drop(["deployment_date", "recovery_date"], axis=1) - - -def process_tl(tl_files: Path) -> DataFrame: - """Process Environmental data extracted from cpod.exe to get a usable dataframe. - - Parameters - ---------- - tl_files: Path All your Environmental data files. + threshold: float + TimeLost threshold. Returns ------- %TimeLost DataFrame. """ - df = csv_folder(tl_files) - df = df.dropna() - df = parse_timestamps(df, "ChunkEnd") - df = add_utc(df, ["ChunkEnd"], "h") - df["start_datetime"] = df["ChunkEnd"] - - return df.sort_values(["start_datetime"]) - - -def filter_tl(df: DataFrame, tl: int) -> DataFrame: - """Remove lines with a %TimeLost superior to the chosen threshold. - - Parameters - ---------- - df: DataFrame - Table of data and associated TimeLost. - tl: int - TimeLost filter threshold. - - Returns - ------- - Filtered DataFrame with few %TimeLost. - - """ - df["%TimeLost"] = (df["%TimeLost"].fillna(tl)).astype(int) - - return df[df["%TimeLost"] < tl] - - -def preserved_data(filtered_df: DataFrame, whole_df: DataFrame) -> float: - """Calculate the percentage of preserved data. - - Parameters - ---------- - filtered_df: DataFrame - Result of filter_tl. - whole_df: DataFrame - Table before filtering. - - Returns - ------- - Percentage of preserved data. + if threshold not in range(0, 100): + msg = "Threshold must be an integer between 0 and 100." 
+ raise ValueError(msg) - """ - return (len(filtered_df) / len(whole_df)) * 100 + df["Datetime"] = df["Datetime"].dt.floor("h") + cols_to_drop = [ + col for col in df.columns if col not in { + "File", "Datetime", "Temp", "Angle", "%TimeLost", "Deploy" + } + ] + return df[df["%TimeLost"] >= threshold].drop( + columns=cols_to_drop + ).sort_values(["Datetime"]).reset_index(drop=True) def create_matrix( @@ -747,27 +309,6 @@ def create_matrix( return matrix -def extract_site( - df: DataFrame, -) -> DataFrame: - """Create new columns: site.name and campaign.name, in order to match the metadata. - - Parameters - ---------- - df: DataFrame - All values concatenated - - Returns - ------- - DataFrame - The same dataframe with two additional columns. - - """ - required_columns(df, ["deploy.name"]) - df[["site.name", "campaign.name"]] = df["deploy.name"].str.split("_", expand=True) - return df - - def percent_calc( data: DataFrame, time_unit: str | None = None, diff --git a/src/post_processing/utils/glider_utils.py b/src/post_processing/utils/glider_utils.py index 626371c..aaa0224 100644 --- a/src/post_processing/utils/glider_utils.py +++ b/src/post_processing/utils/glider_utils.py @@ -175,7 +175,7 @@ def load_glider_nav(directory: Path) -> DataFrame: msg = f"Directory '{directory}' does not exist." raise FileNotFoundError(msg) - file = [f for f in directory.glob("*.gz") if "gli" in f.name] + file = [f for f in directory.rglob("*.gz") if "gli" in f.name] if not len(file) > 0: msg = f"Directory '{directory}' does not contain '.gz' files." diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index 8d12fa3..79c7673 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -577,7 +577,7 @@ def timeline( ax.grid(color="k", linestyle="-", linewidth=0.2) ax.set_yticks(np.arange(0, len(labels), 1)) - ax.set_yticklabels(labels[::-1]) + ax.set_yticklabels(labels) ax.set_xlabel("Date") ax.set_xlim( df["start_datetime"].min().floor("1d"), diff --git a/tests/test_fpod_utils.py b/tests/test_fpod_utils.py index a0ab178..d1c4ece 100644 --- a/tests/test_fpod_utils.py +++ b/tests/test_fpod_utils.py @@ -4,7 +4,7 @@ from pandas import DataFrame from post_processing.utils.fpod_utils import ( - csv_folder, + load_pod_folder, pod2aplose, ) @@ -142,7 +142,7 @@ def test_csv_folder_single_file(tmp_path) -> None: csv_file = tmp_path / "data.csv" csv_file.write_text("col1;col2\nval1;val2\nval3;val4", encoding="latin-1") - result = csv_folder(tmp_path) + result = load_pod_folder(tmp_path) assert isinstance(result, DataFrame) assert len(result) == 2 diff --git a/user_case/example_FPOD-CPOD_raw.ipynb b/user_case/example_FPOD-CPOD_raw.ipynb index 60e7f67..8412c3b 100644 --- a/user_case/example_FPOD-CPOD_raw.ipynb +++ b/user_case/example_FPOD-CPOD_raw.ipynb @@ -15,7 +15,7 @@ "\n", "import pytz\n", "\n", - "from post_processing.utils.fpod_utils import csv_folder, pod2aplose, actual_data, meta_cut_aplose, feeding_buzz, txt_folder, add_utc\n", + "from post_processing.utils.fpod_utils import load_pod_folder, pod2aplose, actual_data, filter_by_metadatax, process_feeding_buzz, load_pod_txt, add_utc\n", "from post_processing.utils.core_utils import json2df\n", "\n", "import logging\n", @@ -54,7 +54,7 @@ "cell_type": "code", "source": [ "pod_files = Path(r\"U:\\CA4\") #Path to your data folder. 
🐬\n", - "path = csv_folder(pod_files) #Process all your POD.csv files.\n", + "path = load_pod_folder(pod_files) #Process all your POD.csv files.\n", "\n", "fb_files = Path(r\"U:\\fb_CA4\") #Path to your click details folder. 🐬\n", "json = Path(r\"C:\\Users\\fouinel\\Downloads\\deployment_calais.json\") #Path to your metadata file. 🐬\n", @@ -180,7 +180,7 @@ } }, "cell_type": "code", - "source": "cleared = meta_cut_aplose(df_aplose, metadatax)", + "source": "cleared = filter_by_metadatax(df_aplose, metadatax)", "id": "895bd5a116918285", "outputs": [], "execution_count": 5 @@ -211,8 +211,8 @@ "metadata": {}, "cell_type": "code", "source": [ - "fb_all = txt_folder(fb_files) #Read all your FB.txt files.\n", - "fb_all = feeding_buzz(fb_all, \"Marsouin\") #Categorize the minutes (positive or not to FB detection). 🐬\n", + "fb_all = load_pod_txt(fb_files) #Read all your FB.txt files.\n", + "fb_all = process_feeding_buzz(fb_all, \"Marsouin\") #Categorize the minutes (positive or not to FB detection). 🐬\n", "add_utc(fb_all, [\"start_datetime\"], \"min\")" ], "id": "64f824e02131d90a", diff --git a/user_case/resource/CPOD-FPOD_yaml.yml b/user_case/resource/CPOD-FPOD_yaml.yml index 3e259cc..ded85b0 100644 --- a/user_case/resource/CPOD-FPOD_yaml.yml +++ b/user_case/resource/CPOD-FPOD_yaml.yml @@ -6,7 +6,7 @@ # - Several csv files can be loaded at once, to perform this copy the template and paste it at the end of the present file. ######################################### -'U:\APLOSE_A1112.csv': +'C:\Users\dupontma2\Downloads\data_aplose.csv': # timebin: detection time resolution in seconds 'timebin_new': null # begin datetime: '2000-01-01T00:00:00+0000' @@ -16,7 +16,7 @@ # annotator: ['annotator1', 'annotator2'] 'annotator': null # annotation: ['annotation1', 'annotation2'] - 'annotation': "Commerson" + 'annotation': "Marsouin" # box: boolean value, whether to keep strong annotations 'box': False # timestamp file @@ -29,3 +29,5 @@ 'f_max': null # score 'score': null + # filename format + 'filename_format': '%Y-%m-%dT%H:%M:%S.%f%z' \ No newline at end of file diff --git a/user_case/user_case_CALAIS.ipynb b/user_case/user_case_CALAIS.ipynb index a5e49a3..f0f109a 100644 --- a/user_case/user_case_CALAIS.ipynb +++ b/user_case/user_case_CALAIS.ipynb @@ -6,8 +6,8 @@ "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2025-10-23T15:42:49.393544Z", - "start_time": "2025-10-23T15:42:47.415071Z" + "end_time": "2026-01-21T13:29:51.583226Z", + "start_time": "2026-01-21T13:29:43.385674Z" } }, "source": [ @@ -21,8 +21,8 @@ ")\n", "\n", "from post_processing.dataclass.data_aplose import DataAplose\n", - "from post_processing.utils.fpod_utils import csv_folder, pod2aplose, actual_data, meta_cut_aplose, resample_dpm, txt_folder, build_range, extract_site, percent_calc, site_percent, year_percent, ym_percent, month_percent, hour_percent, \\\n", - " feeding_buzz\n", + "from post_processing.utils.fpod_utils import load_pod_folder, pod2aplose, actual_data, filter_by_metadatax, resample_dpm, load_pod_txt, build_range, extract_site, percent_calc, site_percent, year_percent, ym_percent, month_percent, hour_percent, \\\n", + " process_feeding_buzz\n", "from post_processing.utils.core_utils import json2df,get_season" ], "outputs": [], @@ -53,7 +53,7 @@ "cell_type": "code", "source": [ "pod_files = Path(r\"U:\\Walde\")\n", - "path = csv_folder(pod_files)\n", + "path = load_pod_folder(pod_files)\n", "print(path.head())\n", "\n", "df_0 = path.dropna()\n", @@ -253,7 +253,7 @@ "source": [ "\n", "\n", - "cleared = 
meta_cut_aplose(df_aplose, metadatax) #Remove lines captures outside the instrument submersion." + "cleared = filter_by_metadatax(df_aplose, metadatax) #Remove lines captures outside the instrument submersion." ], "id": "ed6a06c522aea169", "outputs": [], @@ -347,8 +347,8 @@ "cell_type": "code", "source": [ "fb_files = Path(r\"U:\\fb_fpod_cetiroise_c\") #Path to your click details folder.\n", - "fb_all = txt_folder(fb_files)\n", - "fb_all = feeding_buzz(fb_all, \"Marsouin\")\n", + "fb_all = load_pod_txt(fb_files)\n", + "fb_all = process_feeding_buzz(fb_all, \"Marsouin\")\n", "\n", "fb_all[\"start_datetime\"] = fb_all[\"start_datetime\"].dt.floor(frq)\n", "fb = fb_all.groupby(\"start_datetime\")[\"Foraging\"].sum().reset_index()\n", @@ -363,6 +363,7 @@ "cell_type": "code", "source": [ "d_tot = resamp.merge(fb, on=\"start_datetime\", how=\"left\")\n", + "\n", "#This function aims to reindent 0 between the positive detections. It will be useful to produce first visualization graphs and use this dataset in R.\n", "d_hour = build_range(d_beg_end, frq)" ], From f0adf5bb77d7d4154741bca36b0f1b89f199ba92 Mon Sep 17 00:00:00 2001 From: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com> Date: Tue, 27 Jan 2026 17:16:31 +0100 Subject: [PATCH 32/33] refacto --- src/post_processing/utils/filtering_utils.py | 22 +- src/post_processing/utils/fpod_utils.py | 124 ++-- src/post_processing/utils/plot_utils.py | 2 + user_case/example_FPOD-CPOD_aplose.ipynb | 287 --------- .../example_FPOD-CPOD_firstresults.ipynb | 251 -------- user_case/example_FPOD-CPOD_raw.ipynb | 420 ------------- user_case/resource/CPOD-FPOD_yaml.yml | 33 - user_case/user_case_CALAIS.ipynb | 581 ------------------ 8 files changed, 82 insertions(+), 1638 deletions(-) delete mode 100644 user_case/example_FPOD-CPOD_aplose.ipynb delete mode 100644 user_case/example_FPOD-CPOD_firstresults.ipynb delete mode 100644 user_case/example_FPOD-CPOD_raw.ipynb delete mode 100644 user_case/resource/CPOD-FPOD_yaml.yml delete mode 100644 user_case/user_case_CALAIS.ipynb diff --git a/src/post_processing/utils/filtering_utils.py b/src/post_processing/utils/filtering_utils.py index 661324b..a9e0619 100644 --- a/src/post_processing/utils/filtering_utils.py +++ b/src/post_processing/utils/filtering_utils.py @@ -485,7 +485,8 @@ def _process_annotator_label_pair( time_vector[i] for i, detected in enumerate(detect_vec) if detected ] file_vector = [ - filename_vector[i] for i, detected in enumerate(detect_vec) if detected + # filename_vector[i] for i, detected in enumerate(detect_vec) if detected + filename_vector[i + 1] for i, detected in enumerate(detect_vec) if detected ] if not start_datetime: @@ -571,16 +572,17 @@ def get_filename_timestamps(df: DataFrame, date_parser: str) -> list[Timestamp]: """ tz = get_timezone(df) - try: - return [ - to_datetime( + timestamps = [ + strptime_from_text( ts, - format=date_parser, - ).tz_localize(tz) for ts in df["filename"] - ] - except ValueError: - msg = """Could not parse timestamps from `df["filename"]`.""" - raise ValueError(msg) from None + datetime_template=date_parser, + ) for ts in df["filename"] + ] + + if all(t.tz is None for t in timestamps): + timestamps = [t.tz_localize(tz) for t in timestamps] + + return timestamps def ensure_in_list(value: str, candidates: list[str], label: str) -> None: diff --git a/src/post_processing/utils/fpod_utils.py b/src/post_processing/utils/fpod_utils.py index 12cee28..cd13e58 100644 --- a/src/post_processing/utils/fpod_utils.py +++ 
b/src/post_processing/utils/fpod_utils.py @@ -3,7 +3,6 @@ from __future__ import annotations import logging -from pathlib import Path from typing import TYPE_CHECKING import matplotlib.dates as mdates @@ -30,6 +29,8 @@ from user_case.config import season_color, site_colors if TYPE_CHECKING: + from pathlib import Path + import pytz logger = logging.getLogger(__name__) @@ -66,26 +67,27 @@ def pod2aplose( An APLOSE formatted DataFrame. """ - df["Datetime"] = [ + fpod_start_dt = [ tz.localize(entry) for entry in df["Datetime"] ] - fpod_start_dt = df["Datetime"] - fpod_end_dt = [entry + bin_size for entry in df["Datetime"]] - data = { "dataset": [dataset_name] * len(df), - "filename": [strftime_osmose_format(entry) for entry in fpod_start_dt], + "filename": list(fpod_start_dt), "start_time": [0] * len(df), "end_time": [bin_size.total_seconds()] * len(df), "start_frequency": [0] * len(df), "end_frequency": [0] * len(df), "annotation": [annotation] * len(df), "annotator": [annotator] * len(df), - "start_datetime": [strftime_osmose_format(entry) for entry in fpod_start_dt], - "end_datetime": [strftime_osmose_format(entry) for entry in fpod_end_dt], - "is_box": [0] * len(df), + "start_datetime": [ + strftime_osmose_format(entry.floor(bin_size)) for entry in fpod_start_dt + ], + "end_datetime": [ + strftime_osmose_format(entry.ceil(bin_size)) for entry in fpod_start_dt + ], + "type": ["WEAK"] * len(df), "deploy": df["Deploy"].tolist(), } @@ -140,58 +142,67 @@ def load_pod_folder( data = concat(all_data, ignore_index=True) if ext == "csv": - if "%TimeLost" in data.columns: - data_filtered = data[data["File"].notna()].copy() - data_filtered = data_filtered[data_filtered["Nall/m"].notna()] - else: - data_filtered = data[data["DPM"] > 0].copy() - data_filtered = data_filtered[data_filtered["Nall"].notna()] + return _process_csv_data(data) + if ext == "txt": + return _process_txt_data(data) - data_filtered["Datetime"] = [ - strptime_from_text(dt, "%d/%m/%Y %H:%M") for dt in data_filtered["ChunkEnd"] - ] - return data_filtered.sort_values(by=["Datetime"]).reset_index(drop=True) + msg = f"Could not load {ext} result folder" + raise ValueError(msg) - if ext == "txt": - pod_type = {Path(f).suffix.lower().strip(".p3") for f in data["File"]} - if len(pod_type) != 1: - msg = f"Multiple POD types found in {folder}: {pod_type}" - raise ValueError(msg) +def _process_csv_data(data: DataFrame) -> DataFrame: + """Process CSV data with filtering and datetime conversion.""" + data_filtered = _filter_csv_data(data) + data_filtered["Datetime"] = [ + strptime_from_text(dt, "%d/%m/%Y %H:%M") + for dt in data_filtered["ChunkEnd"] + ] + return data_filtered.sort_values(by=["Datetime"]).reset_index(drop=True) - pod_type = pod_type.pop() - data["Datetime"] = data.apply( - lambda row: get_feeding_buzz_datetime(row, pod=f"{pod_type}"), - axis=1, - ) - return data.sort_values(by=["Datetime"]).reset_index(drop=True) +def _filter_csv_data(data: DataFrame) -> DataFrame: + """Filter CSV data based on available columns.""" + if "%TimeLost" in data.columns: + data_filtered = data[data["File"].notna()].copy() + data_filtered = data_filtered[data_filtered["Nall/m"].notna()] + else: + data_filtered = data[data["DPM"] > 0].copy() + data_filtered = data_filtered[data_filtered["Nall"].notna()] - msg = f"Could not load {ext} result folder" - raise ValueError(msg) + return data_filtered + + +def _process_txt_data(data: DataFrame) -> DataFrame: + """Process TXT data with datetime conversion.""" + data["Datetime"] = 
data.apply(get_feeding_buzz_datetime, axis=1) + return data.drop_duplicates().sort_values(by=["Datetime"]).reset_index(drop=True) -def get_feeding_buzz_datetime(row: Series, pod: str) -> Timestamp: +def get_feeding_buzz_datetime(row: Series) -> Timestamp: """Convert feeding buzz timestamp into a standard Timestamp. The conversion method differs based on the POD type. """ - if pod not in {"c", "f"}: - msg = f"Invalid POD type: {pod}" - raise ValueError(msg) - - if pod == "f": + try: return ( - to_datetime("1900-01-01") + - to_timedelta(row["Minute"], unit="min") + - to_timedelta(row["microsec"] / 1e6, unit="sec") - - to_timedelta(2, unit="D") + to_datetime("1900-01-01") + + to_timedelta(row["Minute"], unit="min") + + to_timedelta(row["microsec"] / 1e6, unit="sec") + - to_timedelta(2, unit="D") ) + except (KeyError, TypeError, ValueError): + pass - return strptime_from_text( - f"{row["Minute"]}:{int(str(row["microsec"])[0]):02d}.{int(str(row["microsec"])[1:])}", - "%-d/%-m/%Y %H:%M:%S.%f", - ) + try: + return strptime_from_text( + f"{row['Minute']}:{int(str(row['microsec'])[0]):02d}.{int(str(row['microsec'])[1:])}", + "%-d/%-m/%Y %H:%M:%S.%f", + ) + except (KeyError, TypeError, ValueError): + pass + + msg = "Could not convert feeding buzz timestamp." + raise ValueError(msg) def process_feeding_buzz( @@ -200,7 +211,8 @@ def process_feeding_buzz( ) -> DataFrame: """Process a POD feeding buzz detection DataFrame. - Give the feeding buzz duration, depending on the studied species. + Give the feeding buzz duration, depending on the studied species + (`delphinid`, `porpoise` or `commerson`). Parameters ---------- @@ -215,8 +227,8 @@ def process_feeding_buzz( Containing all ICIs for every positive minute to click """ - df["Datetime"] = df["Datetime"].dt.floor("min") df["ICI"] = df["Datetime"].diff() + df["Datetime"] = df["Datetime"].dt.floor("min") if species.lower() == "delphinid": # Herzing et al., 2014 df["Buzz"] = df["ICI"].between( @@ -239,7 +251,7 @@ def process_feeding_buzz( df_buzz = df.groupby(["Datetime"])["Buzz"].sum().reset_index() df_buzz["Foraging"] = to_numeric( - df_buzz["Buzz"] != 0, downcast="integer" + df_buzz["Buzz"] != 0, downcast="integer", ).astype(int) return df_buzz @@ -262,18 +274,18 @@ def process_timelost(df: DataFrame, threshold: int = 0) -> DataFrame: %TimeLost DataFrame. """ - if threshold not in range(0, 100): - msg = "Threshold must be an integer between 0 and 100." + if threshold not in range(101): + msg = "Threshold must integer between 0 and 100." raise ValueError(msg) df["Datetime"] = df["Datetime"].dt.floor("h") cols_to_drop = [ col for col in df.columns if col not in { - "File", "Datetime", "Temp", "Angle", "%TimeLost", "Deploy" + "File", "Datetime", "Temp", "Angle", "%TimeLost", "Deploy", } ] - return df[df["%TimeLost"] >= threshold].drop( - columns=cols_to_drop + return df[df["%TimeLost"] <= threshold].drop( + columns=cols_to_drop, ).sort_values(["Datetime"]).reset_index(drop=True) @@ -313,7 +325,7 @@ def percent_calc( data: DataFrame, time_unit: str | None = None, ) -> DataFrame: - """Calculate percentage of clicks, feeding buzzes and positive hours to detection. + """Calculate the percentage of clicks, feeding buzzes and positive hours to detection. Computed on the entire effort and for every site. @@ -358,7 +370,7 @@ def percent_calc( def site_percent(df: DataFrame, metric: str) -> None: - """Plot a graph with percentage of minutes positive to detection for every site. + """Plot a graph with the percentage of minutes positive to detection for every site. 
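# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the patch: chaining the refactored helpers
# above. The paths, the species name and the %TimeLost threshold are
# assumptions made up for the example.
from pathlib import Path

from post_processing.utils.fpod_utils import (
    load_pod_folder,
    process_feeding_buzz,
    process_timelost,
)

# Click-detail *.txt exports: one row per click with a parsed "Datetime" column.
clicks = load_pod_folder(Path("path/to/click_details"), "txt")
# Flag feeding buzzes from inter-click intervals (porpoise threshold, ICI < 10 ms).
buzzes = process_feeding_buzz(clicks, "porpoise")
# Environmental *.csv exports with a "%TimeLost" column: keep hours whose
# %TimeLost is at most 20, per the <= filter above.
env = process_timelost(load_pod_folder(Path("path/to/pod_csv_exports"), "csv"), threshold=20)
# ---------------------------------------------------------------------------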
Parameters ---------- diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index 79c7673..4cffcd6 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -239,6 +239,7 @@ def scatter( season = kwargs.get("season", False) coordinates = kwargs.get("coordinates", False) effort = kwargs.get("effort", False) + legend = kwargs.get("legend", False) _prepare_timeline_plot( df=df, @@ -282,6 +283,7 @@ def scatter( shade_no_effort( ax=ax, observed=effort, + legend=legend, ) diff --git a/user_case/example_FPOD-CPOD_aplose.ipynb b/user_case/example_FPOD-CPOD_aplose.ipynb deleted file mode 100644 index b0da1c8..0000000 --- a/user_case/example_FPOD-CPOD_aplose.ipynb +++ /dev/null @@ -1,287 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "id": "initial_id", - "metadata": { - "collapsed": true - }, - "source": [ - "from pathlib import Path\n", - "\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from pandas import (\n", - " read_csv,\n", - ")\n", - "\n", - "from post_processing.dataclass.data_aplose import DataAplose\n", - "from post_processing.utils.fpod_utils import resample_dpm, build_range, process_tl, filter_tl, preserved_data" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### Load data\n", - "DPM = Detection Positive Minutes \\\n", - "FB = Feeding buzzes \\\n", - "🐬 = input to modify\n" - ], - "id": "a97e19830123b732" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "yaml_file = Path(r\"resource\\CPOD-FPOD_yaml.yml\") #Change the file path in the yaml sheet.🐬\n", - "data_list = DataAplose.from_yaml(file=yaml_file)\n", - "\n", - "d_beg_end = read_csv(r\"U:\\Deb_Fin_CA4.csv\") #Beginning and end of recording for every phase. 🐬\n", - "\n", - "tl_path = Path(r\"U:\\TimeLost\\tl_ca4\\phases\")\n", - "tl_df = process_tl(tl_path)\n", - "tl_df = tl_df.drop_duplicates(subset=['ChunkEnd'], keep=\"first\")\n", - "tl_df[\"Angle\"] = (tl_df[\"Angle\"].replace(',', '.', regex=True)).astype(float)\n", - "tl_df[\"Temp\"] = (tl_df[\"Temp\"].replace(',', '.', regex=True)).astype(float)" - ], - "id": "7da2feb5958db1a9", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### Resample your data\n", - "Here you need to choose the format in which you want to visualise the positive detections. This aims to replace the functionalities available in Chelonia's softwares." - ], - "id": "3bc57f4f638ad6dc" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "frq = \"h\" #Determine the format in which you want to visualise your data. Use \"D\", \"h\" or \"10min\". 🐬\n", - "tl = 100 #%TimeLost threshold. If you do not want to set a filter, set tl to 100." - ], - "id": "9b0a078a262ac7f2", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": "resamp = resample_dpm(data_list.df, frq=frq, cols={\"DPM\":\"sum\",\"Foraging\":\"sum\",\"deploy.name\":\"first\"}) #Resample your DPMs according to the chosen frq.", - "id": "fa3847d80ccf49c3", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### Add the effort\n", - "To analyze the data, add zeros to view it based on effort." 
- ], - "id": "b92537991aa4ac4b" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "d_0 = build_range(d_beg_end, frq) #Create a dataframe from beginning to end of every phase filled with 0s.\n", - "d_tot = d_0.merge(resamp, on=[\"start_datetime\", \"deploy.name\"], how=\"left\")" - ], - "id": "4d76089ef06c6fdb", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### Add additional metrics columns", - "id": "e6a4623e4baf25b5" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "d_tot[[\"DPM\",\"Foraging\"]] = d_tot[[\"DPM\",\"Foraging\"]].fillna(0)\n", - "d_tot[\"FBR\"] = d_tot[\"Foraging\"] / d_tot[\"DPM\"] #The Feeding Buzz Ratio corresponds to the proportion of FB among the recorded clicks.\n", - "d_tot[\"FBR\"] = d_tot[\"FBR\"].fillna(0)\n", - "d_tot[f\"DP{frq}\"] = (d_tot[\"DPM\"] >0).astype(int) #1 if the hour counts at least 1 DPM, else 0." - ], - "id": "912268e5e997dbc6", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "d_tot[\"DPH_fb\"] = (d_tot[\"Foraging\"] >0).astype(int)\n", - "d_tot[\"FBR_h\"] = d_tot[\"DPH_fb\"] / d_tot[\"DPH\"]\n", - "d_tot[\"FBR_h\"] = d_tot[\"FBR_h\"].fillna(0)" - ], - "id": "23e3e4137d9e2a84", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### Add time columns", - "id": "a775158ba810957a" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "d_tot = d_tot.assign(**{attr: getattr(d_tot[\"start_datetime\"].dt, attr.lower())\n", - " for attr in ['Year', 'Month', 'Day', 'Hour']})\n", - "d_tot['YM'] = d_tot[\"start_datetime\"].dt.to_period('M').dt.to_timestamp()\n", - "d_tot['Date'] = d_tot[\"start_datetime\"].dt.to_period('D').dt.to_timestamp()" - ], - "id": "62ce5a31ed0db25a", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### Filter your data\n", - "Chose a threshold of %TimeLost to remove all data exceeding it." 
- ], - "id": "98d31fb21ffb9165" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "full_df = d_tot.merge(\n", - " tl_df[[\"start_datetime\", \"%TimeLost\", \"Angle\", \"Temp\"]],\n", - " on=\"start_datetime\", how=\"left\")" - ], - "id": "66bf795805047a3d", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "filtered_df = filter_tl(full_df, tl)\n", - "preserved_data(filtered_df, full_df)" - ], - "id": "c61f1c95fc05749e", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### Visualize environmental data", - "id": "e0ea7247e27a37b6" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "resamp_tot = filtered_df.set_index(\"start_datetime\").resample(frq).first().reset_index()\n", - "\n", - "fig, ax = plt.subplots(figsize=(12, 6))\n", - "ax.plot(resamp_tot[\"start_datetime\"], resamp_tot[\"Angle\"])\n", - "plt.title(\"Angle of the instrument over time\")\n", - "plt.show()" - ], - "id": "14ff46dec308dcc1", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "fig2, ax = plt.subplots(figsize=(12, 6))\n", - "ax.plot(resamp_tot[\"start_datetime\"], resamp_tot[\"Temp\"])\n", - "plt.title(\"Temperature over time\")\n", - "plt.show()" - ], - "id": "db07736375a767d9", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "fig3, ax = plt.subplots(figsize=(12, 6))\n", - "ax.plot(resamp_tot[\"start_datetime\"], resamp_tot[\"%TimeLost\"])\n", - "plt.title(\"%TimeLost over time\")\n", - "plt.show()" - ], - "id": "7409831bc24271e3", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### Extract your processed data\n", - "This dataframe is now compatible for analyses on the next notebook and on R." - ], - "id": "c64d09af5a11213d" - }, - { - "metadata": {}, - "cell_type": "code", - "source": "filtered_df.to_csv(r\"U:\\Hours_DPM_FBUZZ_CA4.csv\", index=False) #Name your file. 
🐬", - "id": "f1f9ec385038ba87", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### First visualization\n", - "Precise the coordinates of the location of your listening point" - ], - "id": "a1b31aa6bd8f4d70" - }, - { - "metadata": {}, - "cell_type": "code", - "outputs": [], - "execution_count": null, - "source": "", - "id": "3fda0cc8174fa757" - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/user_case/example_FPOD-CPOD_firstresults.ipynb b/user_case/example_FPOD-CPOD_firstresults.ipynb deleted file mode 100644 index 1c3366b..0000000 --- a/user_case/example_FPOD-CPOD_firstresults.ipynb +++ /dev/null @@ -1,251 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "id": "initial_id", - "metadata": { - "collapsed": true - }, - "source": [ - "from pandas import (\n", - " concat,\n", - " read_csv,\n", - " to_datetime,\n", - ")\n", - "\n", - "from post_processing.utils.fpod_utils import extract_site, percent_calc, year_percent, ym_percent, create_matrix,hist_mean_h, hist_mean_m, hist_mean_s\n", - "from post_processing.utils.core_utils import get_season" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "## Overview\n", - "Import the right dataset." - ], - "id": "36421fdbbca9aed6" - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### *CALAIS*", - "id": "caea0e065ad8068c" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "ca4 = read_csv(r\"L:\\acoustock\\Bioacoustique\\DATASETS\\CPOD_PROJETS\\CALAIS\\DATA\\DATA_FULL_OTHERCET\\Hours_DPM_FBUZZ_CA4_ssTL.csv\")\n", - "walde = read_csv(r\"L:\\acoustock\\Bioacoustique\\DATASETS\\CPOD_PROJETS\\CALAIS\\DATA\\DATA_FULL_OTHERCET\\Hours_DPM_FBUZZ_Walde_ssTL.csv\")\n", - "\n", - "data_c = concat([ca4, walde])\n", - "data_c[\"start_datetime\"] = to_datetime(data_c[\"start_datetime\"])\n", - "data_c[\"start_datetime\"] = data_c[\"start_datetime\"].apply(lambda x : x.tz_convert(\"Europe/Paris\"))\n", - "data_c[\"Hour\"] = data_c[\"start_datetime\"].dt.hour" - ], - "id": "1268d9e6ce5cdf32", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### *CETIROISE*", - "id": "b426e672fdd5c6b8" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "ceti = read_csv(r\"U:\\Hours_DPM_FBUZZ_CETIROISE.csv\")\n", - "\n", - "ceti[\"start_datetime\"] = to_datetime(ceti[\"start_datetime\"])\n", - "ceti[\"start_datetime\"] = ceti[\"start_datetime\"].apply(lambda x : x.tz_convert(\"CET\")) #TimeZone Central European Time\n", - "ceti[\"Hour\"] = ceti[\"start_datetime\"].dt.hour" - ], - "id": "870bc0a014561ba8", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### *KERGUELEN*", - "id": "17a5ce1338f6cd1a" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "a = read_csv(r\"U:\\Hours_DPM_FBUZZ_A.csv\")\n", - "b = read_csv(r\"U:\\Hours_DPM_FBUZZ_B.csv\")\n", - "c = read_csv(r\"U:\\Hours_DPM_FBUZZ_C.csv\")\n", - "d = read_csv(r\"U:\\Hours_DPM_FBUZZ_D.csv\")\n", - "\n", - "data_k = concat([a, b, c, 
d])\n", - "data_k[\"start_datetime\"] = to_datetime(data_k[\"start_datetime\"])\n", - "data_k[\"start_datetime\"] = data_k[\"start_datetime\"].apply(lambda x : x.tz_convert(\"Indian/Kerguelen\"))\n", - "data_k[\"Hour\"] = data_k[\"start_datetime\"].dt.hour" - ], - "id": "d65697a1f1487f4c", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### First results\n", - "Precise your dataset." - ], - "id": "9fc3b5075bf7ff2c" - }, - { - "metadata": {}, - "cell_type": "code", - "source": "data = data_c #🐬", - "id": "add4a626d6cc25a4", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "data = extract_site(data)\n", - "data[\"YMH\"] = data[\"Year\"].astype(str) + '-' + data[\"Month\"].astype(str) + '-' + data[\"Hour\"].astype(str)\n", - "y_per = percent_calc(data, \"Year\")\n", - "#t_per = percent_calc(data, \"TRAVAUX\")" - ], - "id": "37ecc80eda8e57ed", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "ym_per = percent_calc(data, \"YM\")\n", - "ym_per[\"YM\"] = to_datetime(ym_per[\"YM\"])\n", - "ym_per[\"Season\"] = ym_per[\"YM\"].apply(lambda x: get_season(x)[0]) #If in the southern hemisphere, write \"get_season(x, northern = False)\".\n", - "ym_per[\"Month\"] = ym_per[\"YM\"].dt.month\n", - "ym_per[\"Year\"] = ym_per[\"YM\"].dt.year" - ], - "id": "2b988869ed2466e1", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "per_h = percent_calc(data, \"YMH\")\n", - "per_h[[\"Y\",\"M\",\"Hour\"]] = per_h[\"YMH\"].str.split(\"-\", expand=True)\n", - "per_h[\"Hour\"] = per_h[\"Hour\"].astype(int)" - ], - "id": "cf704032c4a59a7b", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "matrice_s = create_matrix(ym_per, [\"site.name\"],[\"%DPH\", \"FBR\"])\n", - "matrice_m = create_matrix(ym_per, [\"site.name\", \"Month\"],[\"%click\", \"FBR\"])\n", - "matrice_h = create_matrix(per_h, [\"site.name\", \"Hour\"],[\"%click\", \"FBR\"])" - ], - "id": "caf3f71c6b6f70ca", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "hist_mean_s(\n", - " matrice_s,\n", - " metric_mean=\"%DPH_mean\",\n", - " metric_std=\"%DPH_std\",\n", - " y_lab=\"Moyenne %DPH\",\n", - " title_suffix=\"%DPH\"\n", - ")" - ], - "id": "2ff751ae02e80285", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": "year_percent(y_per, \"FBR\")", - "id": "29f2703ab28c5b28", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": "ym_percent(ym_per, \"DPM\")", - "id": "a2dacac3caecff5f", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "hist_mean_m(\n", - " matrice_m,\n", - " metric_mean=\"%click_mean\",\n", - " metric_std=\"%click_std\",\n", - " y_lab=\"Moyenne %click\",\n", - " title_suffix=\"%click\"\n", - ")" - ], - "id": "71161e7545bb1414", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "hist_mean_h(\n", - " matrice_h,\n", - " metric_mean=\"FBR_mean\",\n", - " metric_std=\"FBR_std\",\n", - " y_lab=\"Feeding buzz ratio\",\n", - " title_suffix=\"FBR\"\n", - ")" - ], - "id": "5cbea8601bce2172", - "outputs": [], - "execution_count": null - } - ], - "metadata": { - "kernelspec": { - 
"display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/user_case/example_FPOD-CPOD_raw.ipynb b/user_case/example_FPOD-CPOD_raw.ipynb deleted file mode 100644 index 8412c3b..0000000 --- a/user_case/example_FPOD-CPOD_raw.ipynb +++ /dev/null @@ -1,420 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "id": "initial_id", - "metadata": { - "collapsed": true, - "ExecuteTime": { - "end_time": "2026-01-20T10:24:33.069494Z", - "start_time": "2026-01-20T10:24:29.180185Z" - } - }, - "source": [ - "from __future__ import annotations\n", - "\n", - "import pytz\n", - "\n", - "from post_processing.utils.fpod_utils import load_pod_folder, pod2aplose, actual_data, filter_by_metadatax, process_feeding_buzz, load_pod_txt, add_utc\n", - "from post_processing.utils.core_utils import json2df\n", - "\n", - "import logging\n", - "from pathlib import Path\n", - "import matplotlib.pyplot as plt\n", - "import matplotlib.ticker as ticker\n", - "from pandas import Timestamp, to_datetime\n", - "from pandas.tseries import frequencies\n", - "from post_processing.dataclass.data_aplose import DataAplose\n", - "from post_processing.utils.core_utils import get_season, get_count\n", - "from osekit import setup_logging\n", - "from user_case.config import season_color\n", - "setup_logging(Path(r\"C:\\Users\\dupontma2\\Documents\\Git\\OSmOSE\\OSmOSE_post_processing\\src\\post_processing\\logging_config.yaml\"), logging.ERROR)" - ], - "outputs": [], - "execution_count": 1 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### Load data\n", - "🐬 = input to modify \\\n", - "Import your raw FPOD or CPOD data. All files for one site must be stored in the same folder and identified by their respective phases. \\\n", - "You also need to import your metadata file." - ], - "id": "c464f241817a1407" - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-20T10:32:32.641354Z", - "start_time": "2026-01-20T10:32:09.650669Z" - } - }, - "cell_type": "code", - "source": [ - "pod_files = Path(r\"U:\\CA4\") #Path to your data folder. 🐬\n", - "path = load_pod_folder(pod_files) #Process all your POD.csv files.\n", - "\n", - "fb_files = Path(r\"U:\\fb_CA4\") #Path to your click details folder. 🐬\n", - "json = Path(r\"C:\\Users\\fouinel\\Downloads\\deployment_calais.json\") #Path to your metadata file. 
🐬\n", - "\n", - "print(path.head())\n", - "df_0 = path.dropna()\n", - "\n", - "metadatax = json2df(json_path=json)\n", - "metadatax[\"deploy.name\"] = (metadatax[\"site.name\"].astype(str) + \"_\" + metadatax[\"campaign.name\"].astype(str))" - ], - "id": "6cf23db3b4288c29", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " File ChunkEnd DPM Nall MinsOn deploy.name\n", - "0 CA4 POD2397 file01.CP3 14/05/2014 07:07 0 0 1 CA4_Phase1\n", - "1 CA4 POD2397 file01.CP3 14/05/2014 07:08 0 8 1 CA4_Phase1\n", - "2 CA4 POD2397 file01.CP3 14/05/2014 07:09 0 4 1 CA4_Phase1\n", - "3 CA4 POD2397 file01.CP3 14/05/2014 07:10 0 251 1 CA4_Phase1\n", - "4 CA4 POD2397 file01.CP3 14/05/2014 07:11 0 4095 1 CA4_Phase1\n" - ] - } - ], - "execution_count": 2 - }, - { - "metadata": {}, - "cell_type": "code", - "source": "d_beg_end = actual_data(df_0, metadatax) #Extract the beginning and end of recording for every phase.", - "id": "fa52f8971b61aaf6", - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-20T10:32:43.618484Z", - "start_time": "2026-01-20T10:32:43.243995Z" - } - }, - "cell_type": "code", - "source": "df_1 = df_0[df_0[\"DPM\"] !=\"0\" ] #Remove the 0 to lighten the APLOSE file.", - "id": "769e128f2a5293e1", - "outputs": [], - "execution_count": 3 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### APLOSE format\n", - "#### *C-POD*\n", - "Use cpod2aplose if you are managing C-POD data." - ], - "id": "dd03975b7aef7eed" - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-20T10:32:50.772619Z", - "start_time": "2026-01-20T10:32:50.060639Z" - } - }, - "cell_type": "code", - "source": [ - "df_aplose = pod2aplose(df_1, pytz.utc, \"CA4\", \"Marsouin\", \"CPOD\") #Precise site name, species and instrument. 🐬\n", - "print(df_aplose.head())" - ], - "id": "4cc867627d677529", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " dataset filename start_time end_time \\\n", - "0 CA4 2014-05-16T02:30:00.000+0000 0 60 \n", - "1 CA4 2014-05-17T03:52:00.000+0000 0 60 \n", - "2 CA4 2014-05-17T04:47:00.000+0000 0 60 \n", - "3 CA4 2014-05-19T17:06:00.000+0000 0 60 \n", - "4 CA4 2014-05-20T11:07:00.000+0000 0 60 \n", - "\n", - " start_frequency end_frequency annotation annotator \\\n", - "0 0 0 Marsouin CPOD \n", - "1 0 0 Marsouin CPOD \n", - "2 0 0 Marsouin CPOD \n", - "3 0 0 Marsouin CPOD \n", - "4 0 0 Marsouin CPOD \n", - "\n", - " start_datetime end_datetime is_box \\\n", - "0 2014-05-16T02:30:00.000+0000 2014-05-16T02:31:00.000+0000 0 \n", - "1 2014-05-17T03:52:00.000+0000 2014-05-17T03:53:00.000+0000 0 \n", - "2 2014-05-17T04:47:00.000+0000 2014-05-17T04:48:00.000+0000 0 \n", - "3 2014-05-19T17:06:00.000+0000 2014-05-19T17:07:00.000+0000 0 \n", - "4 2014-05-20T11:07:00.000+0000 2014-05-20T11:08:00.000+0000 0 \n", - "\n", - " deploy.name \n", - "0 CA4_Phase1 \n", - "1 CA4_Phase1 \n", - "2 CA4_Phase1 \n", - "3 CA4_Phase1 \n", - "4 CA4_Phase1 \n" - ] - } - ], - "execution_count": 4 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### Clean your dataset\n", - "Remove useless lines, recorder outside the instrument submersion. Export your file to the aplose format. You can change the name of the file to match the project you are working on." 
- ], - "id": "d2c642658dbfe278" - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-20T10:33:14.509811Z", - "start_time": "2026-01-20T10:33:14.070492Z" - } - }, - "cell_type": "code", - "source": "cleared = filter_by_metadatax(df_aplose, metadatax)", - "id": "895bd5a116918285", - "outputs": [], - "execution_count": 5 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-20T10:33:24.735234Z", - "start_time": "2026-01-20T10:33:24.723966Z" - } - }, - "cell_type": "code", - "source": "cleared = cleared.drop_duplicates(subset=['start_datetime'], keep=\"last\")", - "id": "1a31da7341f3d8c9", - "outputs": [], - "execution_count": 7 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### Feeding buzzes processing\n", - "Use \"Dauphin\", Marsouin\" or \"Commerson\" to get different ICI processing." - ], - "id": "4cf0b89a9491884" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "fb_all = load_pod_txt(fb_files) #Read all your FB.txt files.\n", - "fb_all = process_feeding_buzz(fb_all, \"Marsouin\") #Categorize the minutes (positive or not to FB detection). 🐬\n", - "add_utc(fb_all, [\"start_datetime\"], \"min\")" - ], - "id": "64f824e02131d90a", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": "dpm_fb = cleared.merge(fb_all[[\"start_datetime\", \"Foraging\"]], on=[\"start_datetime\"], how=\"left\") #Merge DPM and FB dataframes", - "id": "e90f6d91de3f8ce3", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### Export", - "id": "d114ed7164cfd0da" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "d_beg_end.to_csv(r\"U:\\Deb_Fin_CA4.csv\", index=False) #Export the new file. 🐬\n", - "dpm_fb.to_csv(r\"U:\\APLOSE_CA4_pos.csv\", index=False) #Name your file. 🐬" - ], - "id": "9d34e00f4e8147e8", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### Explore\n", - "First visualization of the data" - ], - "id": "a85ea092d9fc197c" - }, - { - "metadata": {}, - "cell_type": "code", - "source": "data = DataAplose(dpm_fb)", - "id": "639c474690373895", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "bin_size = frequencies.to_offset(\"1d\")\n", - "ticks = frequencies.to_offset(\"6BMS\")\n", - "fmt = \"%b %y\"" - ], - "id": "cb476b5655bdff42", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "data.lat = 50.973333 #CA4: 51.00035 ; Walde: 50.973333 ; A: -49.38765 ; B: -49.424733 ; C: -49.4677 ; D: -49.47175\n", - "data.lon = 1.8117 #CA4: 1.879667 ; Walde: 1.8117 ; A: 69.9449 ; B: 69.932383 ; C: 70.081067 ; D: 69.836617" - ], - "id": "3fc33f2acf84ea34", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "#### Reshape the data\n", - "Set beginning and end of the chosen window." 
- ], - "id": "2857f26f8c168ad3" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "data.df[\"end_datetime\"] = to_datetime(data.df[\"end_datetime\"])\n", - "data2 = data.reshape(begin=Timestamp(\"2013 11 01\"), end=Timestamp(\"2025 08 01\"))\n", - "tz = pytz.timezone(\"UTC\")\n", - "data2.change_tz(tz)" - ], - "id": "75e00c1920b69409", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### Heatmap", - "id": "cdf3a92dfb6514d6" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "fig, ax = plt.subplots(1, 1)\n", - "ax = data2.set_ax(ax=ax, x_ticks_res=ticks, date_format=fmt)\n", - "data2.plot(\n", - " mode=\"heatmap\",\n", - " annotator=data2.annotators[0],\n", - " label=data2.labels[0],\n", - " ax=ax,\n", - " bin_size=bin_size,\n", - " show_rise_set=True,\n", - ")\n", - "plt.tight_layout()\n", - "plt.show()" - ], - "id": "6e1832101aec4156", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### Detections over time", - "id": "b0f3c6b7fc08a2be" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "fig2, ax = plt.subplots(1, 1)\n", - "ax = data2.set_ax(ax=ax, x_ticks_res=ticks, date_format=fmt)\n", - "data2.plot(\n", - " mode=\"scatter\",\n", - " annotator=data2.annotators[0],\n", - " label=data2.labels[0],\n", - " ax=ax,\n", - " show_rise_set=True,\n", - ")\n", - "plt.tight_layout()\n", - "plt.show()" - ], - "id": "66b43e53fb17037", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### DPM per day", - "id": "c7ea248b277edb65" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "df_filtered = data2.filter_df(data2.annotators[0], data2.labels[0])\n", - "df_counts = get_count(df_filtered, bin_size)\n", - "\n", - "df_counts[\"Season\"] = df_counts.index.to_series().apply(lambda x: get_season(x)[0])\n", - "df_counts[\"colors\"] = df_counts[\"Season\"].map(season_color).fillna(\"gray\")\n", - "\n", - "fig3, ax = plt.subplots(1, 1)\n", - "ax = data2.set_ax(ax=ax, x_ticks_res=ticks, date_format=fmt)\n", - "data2.plot(\n", - " mode=\"histogram\",\n", - " annotator=data2.annotators[0],\n", - " label=data2.labels[0],\n", - " color=df_counts[\"colors\"].tolist(),\n", - " ax=ax,\n", - " bin_size=bin_size,\n", - " legend=True,\n", - ")\n", - "ax.set_ylim(0, 200)\n", - "ax.yaxis.set_major_locator(ticker.MaxNLocator(nbins=10))\n", - "plt.tight_layout()\n", - "plt.show()" - ], - "id": "81b0dafa8adc20e9", - "outputs": [], - "execution_count": null - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/user_case/resource/CPOD-FPOD_yaml.yml b/user_case/resource/CPOD-FPOD_yaml.yml deleted file mode 100644 index ded85b0..0000000 --- a/user_case/resource/CPOD-FPOD_yaml.yml +++ /dev/null @@ -1,33 +0,0 @@ -#This file is to be used to load an APLOSE result csv file. -#If a parameter is set here to `null`, it will be loaded with a `None` value in the Python scripts. -#For parameter definition, see `sort_detections` function in `utils\def_func`. 
-# -#Note: -# - Several csv files can be loaded at once, to perform this copy the template and paste it at the end of the present file. - -######################################### -'C:\Users\dupontma2\Downloads\data_aplose.csv': - # timebin: detection time resolution in seconds - 'timebin_new': null - # begin datetime: '2000-01-01T00:00:00+0000' - 'begin': null - # end datetime: '2000-01-01T00:00:00+0000' - 'end': null - # annotator: ['annotator1', 'annotator2'] - 'annotator': null - # annotation: ['annotation1', 'annotation2'] - 'annotation': "Marsouin" - # box: boolean value, whether to keep strong annotations - 'box': False - # timestamp file - 'timestamp_file': null - # user selection - 'user_sel': all - # f_min filter - 'f_min': null - # f_max filter - 'f_max': null - # score - 'score': null - # filename format - 'filename_format': '%Y-%m-%dT%H:%M:%S.%f%z' \ No newline at end of file diff --git a/user_case/user_case_CALAIS.ipynb b/user_case/user_case_CALAIS.ipynb deleted file mode 100644 index f0f109a..0000000 --- a/user_case/user_case_CALAIS.ipynb +++ /dev/null @@ -1,581 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "id": "initial_id", - "metadata": { - "collapsed": true, - "ExecuteTime": { - "end_time": "2026-01-21T13:29:51.583226Z", - "start_time": "2026-01-21T13:29:43.385674Z" - } - }, - "source": [ - "from pathlib import Path\n", - "\n", - "import pytz\n", - "from pandas import (\n", - " concat,\n", - " read_csv,\n", - " to_datetime,\n", - ")\n", - "\n", - "from post_processing.dataclass.data_aplose import DataAplose\n", - "from post_processing.utils.fpod_utils import load_pod_folder, pod2aplose, actual_data, filter_by_metadatax, resample_dpm, load_pod_txt, build_range, extract_site, percent_calc, site_percent, year_percent, ym_percent, month_percent, hour_percent, \\\n", - " process_feeding_buzz\n", - "from post_processing.utils.core_utils import json2df,get_season" - ], - "outputs": [], - "execution_count": 1 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### Pre-processing\n", - "\n" - ], - "id": "e8e8c57c7f4197fe" - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "Import your csv files. 
All files for one site must be stored in the same folder.", - "id": "6f9beab2dcba1a9c" - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-10-17T09:25:01.549663Z", - "start_time": "2025-10-17T09:24:48.208563Z" - } - }, - "cell_type": "code", - "source": [ - "pod_files = Path(r\"U:\\Walde\")\n", - "path = load_pod_folder(pod_files)\n", - "print(path.head())\n", - "\n", - "df_0 = path.dropna()\n", - "\n", - "json = Path(r\"C:\\Users\\fouinel\\Downloads\\deployment_calais.json\") #Path to your metadata file.\n", - "metadatax = json2df(json_path=json)\n", - "\n", - "metadatax[\"deploy.name\"] = (metadatax[\"site.name\"].astype(str) + \"_\" +\n", - " metadatax[\"campaign.name\"].astype(str))" - ], - "id": "8636a8a27fe2af47", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " File ChunkEnd DPM Nall MinsOn deploy.name\n", - "0 POD2399 file01.CP3 14/05/2014 08:03 0.0 0.0 1.0 Walde_Phase1\n", - "1 POD2399 file01.CP3 14/05/2014 08:04 0.0 799.0 1.0 Walde_Phase1\n", - "2 POD2399 file01.CP3 14/05/2014 08:05 0.0 0.0 1.0 Walde_Phase1\n", - "3 POD2399 file01.CP3 14/05/2014 08:06 0.0 3361.0 1.0 Walde_Phase1\n", - "4 POD2399 file01.CP3 14/05/2014 08:07 0.0 421.0 1.0 Walde_Phase1\n" - ] - } - ], - "execution_count": 11 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-10-17T09:34:43.092415Z", - "start_time": "2025-10-17T09:27:26.409365Z" - } - }, - "cell_type": "code", - "source": "d_beg_end = actual_data(df_0, metadatax)", - "id": "4208969d9e509a8", - "outputs": [], - "execution_count": 12 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-10-17T09:35:32.137350Z", - "start_time": "2025-10-17T09:35:32.089860Z" - } - }, - "cell_type": "code", - "source": "d_beg_end.to_csv(r\"U:\\Deb_Fin_Walde.csv\", index=False)", - "id": "6fb6f4fa675d7cab", - "outputs": [], - "execution_count": 13 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-10-17T09:35:33.725437Z", - "start_time": "2025-10-17T09:35:33.670018Z" - } - }, - "cell_type": "code", - "source": "df_1 = df_0[df_0[\"DPM\"] !=0 ]", - "id": "597efd1d90e3d069", - "outputs": [], - "execution_count": 14 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### APLOSE format", - "id": "4f8c83c96f0b6ff4" - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "Chose the right function, depending on the instrument you are working with.", - "id": "9849c47189cf1a85" - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### *CPOD*", - "id": "8ed339c688bdef1" - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-10-17T09:35:38.378283Z", - "start_time": "2025-10-17T09:35:35.755020Z" - } - }, - "cell_type": "code", - "source": [ - "df_aplose = pod2aplose(df_1, pytz.utc, \"Walde\", \"Marsouin\", \"CPOD\")\n", - "print(df_aplose.head())" - ], - "id": "812ed7c0c5e258e7", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " dataset filename start_time end_time start_frequency end_frequency \\\n", - "0 Walde 0 60 0 0 \n", - "1 Walde 0 60 0 0 \n", - "2 Walde 0 60 0 0 \n", - "3 Walde 0 60 0 0 \n", - "4 Walde 0 60 0 0 \n", - "\n", - " annotation annotator start_datetime \\\n", - "0 Marsouin FPOD 2014-05-15T16:56:00.000+0000 \n", - "1 Marsouin FPOD 2014-05-17T22:53:00.000+0000 \n", - "2 Marsouin FPOD 2014-05-17T22:54:00.000+0000 \n", - "3 Marsouin FPOD 2014-05-18T14:05:00.000+0000 \n", - "4 Marsouin FPOD 2014-05-21T14:19:00.000+0000 \n", - "\n", - " end_datetime is_box deploy.name \n", - "0 2014-05-15T16:57:00.000+0000 0 
Walde_Phase1 \n", - "1 2014-05-17T22:54:00.000+0000 0 Walde_Phase1 \n", - "2 2014-05-17T22:55:00.000+0000 0 Walde_Phase1 \n", - "3 2014-05-18T14:06:00.000+0000 0 Walde_Phase1 \n", - "4 2014-05-21T14:20:00.000+0000 0 Walde_Phase1 \n" - ] - } - ], - "execution_count": 15 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### *FPOD*", - "id": "a39bb10d8ac60a27" - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-10-17T09:23:23.552890Z", - "start_time": "2025-10-17T09:23:22.810583Z" - } - }, - "cell_type": "code", - "source": [ - "df_aplose = pod2aplose(df_1, pytz.utc, \"CETIROISE\", \"Marsouin\", \"FPOD\")\n", - "print(df_aplose.head())" - ], - "id": "9b632673397a184", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " dataset filename start_time end_time start_frequency end_frequency \\\n", - "0 CA4 0 60 0 0 \n", - "1 CA4 0 60 0 0 \n", - "2 CA4 0 60 0 0 \n", - "3 CA4 0 60 0 0 \n", - "4 CA4 0 60 0 0 \n", - "\n", - " annotation annotator start_datetime \\\n", - "0 Marsouin FPOD 2014-05-17T03:52:00.000+0000 \n", - "1 Marsouin FPOD 2014-05-17T04:47:00.000+0000 \n", - "2 Marsouin FPOD 2014-05-19T17:06:00.000+0000 \n", - "3 Marsouin FPOD 2014-05-20T11:07:00.000+0000 \n", - "4 Marsouin FPOD 2014-05-20T11:16:00.000+0000 \n", - "\n", - " end_datetime is_box deploy.name \n", - "0 2014-05-17T03:53:00.000+0000 0 CA4_Phase1 \n", - "1 2014-05-17T04:48:00.000+0000 0 CA4_Phase1 \n", - "2 2014-05-19T17:07:00.000+0000 0 CA4_Phase1 \n", - "3 2014-05-20T11:08:00.000+0000 0 CA4_Phase1 \n", - "4 2014-05-20T11:17:00.000+0000 0 CA4_Phase1 \n" - ] - } - ], - "execution_count": 7 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### Remove non usable lines", - "id": "7860838f8514da39" - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "Import the .json file available on metadatax.", - "id": "32f8ff8f9ece35a8" - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-10-17T09:35:54.374270Z", - "start_time": "2025-10-17T09:35:54.303578Z" - } - }, - "cell_type": "code", - "source": [ - "\n", - "\n", - "cleared = filter_by_metadatax(df_aplose, metadatax) #Remove lines captures outside the instrument submersion." - ], - "id": "ed6a06c522aea169", - "outputs": [], - "execution_count": 16 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "Export your file to the aplose format. 
You can change the name of the file to match the project you are working on.", - "id": "8f5fe75cc3463971" - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-10-17T09:36:06.597522Z", - "start_time": "2025-10-17T09:35:56.739495Z" - } - }, - "cell_type": "code", - "source": "cleared.to_csv(r\"U:\\APLOSE_Walde_pos.csv\", index=False) #You can stock all DPM for a site in a DataAplose file.", - "id": "76f70cb6c6658ba6", - "outputs": [], - "execution_count": 17 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### Load data", - "id": "f5d38266dc9d5273" - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "Use the yaml file to import your aplose files one at a time.", - "id": "2ce11c6e57f38690" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "yaml_file = Path(r\"resource\\CPOD-FPOD_yaml.yml\")\n", - "data_list = DataAplose.from_yaml(file=yaml_file)\n", - "print(data_list.df.head())" - ], - "id": "6837593897111b0a", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### Format choice\n", - "id": "9f93eb863e3e3a9e" - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "Here you need to choose the format in which you want to visualise the positive detections. This aims to replace the functionality available in CPOD.exe.", - "id": "925d92d3eec065e6" - }, - { - "metadata": {}, - "cell_type": "code", - "outputs": [], - "execution_count": null, - "source": "frq = \"D\" #This argument will determine the format in which you want to visualise your data. Use \"D\", \"h\" or \"10min\".", - "id": "256b756d05c08294" - }, - { - "metadata": {}, - "cell_type": "code", - "source": "resamp = resample_dpm(data_list.df, frq=frq, extra_columns=[\"deploy.name\"])", - "id": "6cc79b2aeef076ed", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### Add the feeding buzzes", - "id": "8375ddbe07ad0aee" - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "Import your click details files. All files for one site must be stacked in the same folder.", - "id": "9753f4ba20c7267e" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "fb_files = Path(r\"U:\\fb_fpod_cetiroise_c\") #Path to your click details folder.\n", - "fb_all = load_pod_txt(fb_files)\n", - "fb_all = process_feeding_buzz(fb_all, \"Marsouin\")\n", - "\n", - "fb_all[\"start_datetime\"] = fb_all[\"start_datetime\"].dt.floor(frq)\n", - "fb = fb_all.groupby(\"start_datetime\")[\"Foraging\"].sum().reset_index()\n", - "fb[\"start_datetime\"] = to_datetime(fb[\"start_datetime\"], utc=True)" - ], - "id": "2b19f90c99252ff3", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "d_tot = resamp.merge(fb, on=\"start_datetime\", how=\"left\")\n", - "\n", - "#This function aims to reindent 0 between the positive detections. 
It will be useful to produce first visualization graphs and use this dataset in R.\n", - "d_hour = build_range(d_beg_end, frq)" - ], - "id": "b00c8f1e2210ea7", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": "d_fin = d_hour.merge(d_tot, on=[\"start_datetime\", \"deploy.name\"], how=\"left\")", - "id": "601787cc806226b0", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "d_fin[[\"DPM\",\"Foraging\"]] = d_fin[[\"DPM\",\"Foraging\"]].fillna(0)\n", - "print(d_fin.head())" - ], - "id": "f93bf1f3ca4f4112", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "## Add time columns", - "id": "c7b1d32ed1c99fb7" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "d_fin[\"Year\"] = d_fin[\"start_datetime\"].dt.year\n", - "d_fin[\"Month\"] = d_fin[\"start_datetime\"].dt.month\n", - "d_fin['YM'] = d_fin[\"Year\"].astype(str) + '-' + d_fin[\"Month\"].astype(str)\n", - "d_fin['YM'] = to_datetime(d_fin['YM'])\n", - "d_fin[\"Day\"] = d_fin[\"start_datetime\"].dt.day\n", - "d_fin[\"Hour\"] = d_fin[\"start_datetime\"].dt.hour\n", - "\n", - "d_fin[\"FBR\"] = d_fin[\"Foraging\"] / d_fin[\"DPM\"]\n", - "d_fin[\"DPH\"] = (d_fin[\"DPM\"] >0).astype(int)" - ], - "id": "a2261ce5093a3104", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "d_fin[\"FBR\"] = d_fin[\"FBR\"].fillna(0)\n", - "d_fin.to_csv(r\"U:\\Hours_DPM_FBUZZ_CETIROISE.csv\", index=False)" - ], - "id": "d606f4f6904b57c6", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "## Overview", - "id": "4bc0904182a3f845" - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### *Import datasets*", - "id": "e1de414e2eb3fa8f" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "ca4 = read_csv(r\"U:\\Hours_DPM_FBUZZ_SiteCA4.csv\")\n", - "walde = read_csv(r\"U:\\Hours_DPM_FBUZZ_SiteWalde.csv\")\n", - "\n", - "data_c = concat([ca4, walde])\n", - "data_c[\"start_datetime\"] = to_datetime(data_c[\"start_datetime\"])\n", - "data_c[\"start_datetime\"] = data_c[\"start_datetime\"].apply(lambda x : x.tz_convert(\"Europe/Paris\"))\n", - "data_c[\"Hour\"] = data_c[\"start_datetime\"].dt.hour" - ], - "id": "9909fbfdcb8e2e78", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "a = read_csv(r\"U:\\Hours_DPM_FBUZZ_SiteA.csv\")\n", - "b = read_csv(r\"U:\\Hours_DPM_FBUZZ_SiteB.csv\")\n", - "c = read_csv(r\"U:\\Hours_DPM_FBUZZ_SiteC.csv\")\n", - "d = read_csv(r\"U:\\Hours_DPM_FBUZZ_SiteD.csv\")\n", - "\n", - "data_k = concat([a, b, c, d])\n", - "data_k[\"start_datetime\"] = to_datetime(data_k[\"start_datetime\"])\n", - "data_k[\"start_datetime\"] = data_k[\"start_datetime\"].apply(lambda x : x.tz_convert(\"Indian/Kerguelen\"))\n", - "data_k[\"Hour\"] = data_k[\"start_datetime\"].dt.hour" - ], - "id": "87e2d1938787aefc", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "ceti = read_csv(r\"U:\\Hours_DPM_FBUZZ_CETIROISE.csv\")\n", - "\n", - "ceti[\"start_datetime\"] = to_datetime(ceti[\"start_datetime\"])\n", - "ceti[\"start_datetime\"] = ceti[\"start_datetime\"].apply(lambda x : x.tz_convert(\"CET\")) #TimeZone Central European Time\n", - "ceti[\"Hour\"] = ceti[\"start_datetime\"].dt.hour" - ], - "id": "5928770d1c47bcad", - "outputs": [], 
- "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### *Generate graphs*", - "id": "200273fc36fb7d5d" - }, - { - "metadata": {}, - "cell_type": "code", - "source": "data = ceti #Precise which dataset you are working with", - "id": "be10e9d690294cff", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "data = extract_site(data)\n", - "y_per = percent_calc(data, \"Year\")\n", - "ym_per = percent_calc(data, \"YM\")\n", - "ym_per[\"YM\"] = to_datetime(ym_per[\"YM\"])\n", - "ym_per[\"Season\"] = ym_per[\"YM\"].apply(lambda x: get_season(x)[0])\n", - "m_per = percent_calc(data, \"Month\")\n", - "h_per = percent_calc(data, \"Hour\")\n", - "s_per = percent_calc(data)" - ], - "id": "2826b79097a85607", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": "site_percent(s_per, \"%click\")", - "id": "ddd1fac6295136c6", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": "year_percent(y_per, \"%click\")", - "id": "ba7581e97fdbd07c", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": "ym_percent(ym_per, \"%click\")", - "id": "4de618933c154f86", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": "month_percent(m_per, \"%buzzes\")", - "id": "7cf84c8744185424", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": "hour_percent(h_per, \"%click\")", - "id": "12d83e9082d711c0", - "outputs": [], - "execution_count": null - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 67108127f51c196a7603ff022dd584ad8e1d6dc2 Mon Sep 17 00:00:00 2001 From: Mathieu Dupont <108517594+mathieudpnt@users.noreply.github.com> Date: Tue, 27 Jan 2026 17:20:56 +0100 Subject: [PATCH 33/33] refacto --- src/post_processing/utils/filtering_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/post_processing/utils/filtering_utils.py b/src/post_processing/utils/filtering_utils.py index a9e0619..3d4832b 100644 --- a/src/post_processing/utils/filtering_utils.py +++ b/src/post_processing/utils/filtering_utils.py @@ -485,8 +485,8 @@ def _process_annotator_label_pair( time_vector[i] for i, detected in enumerate(detect_vec) if detected ] file_vector = [ - # filename_vector[i] for i, detected in enumerate(detect_vec) if detected - filename_vector[i + 1] for i, detected in enumerate(detect_vec) if detected + filename_vector[i] for i, detected in enumerate(detect_vec) if detected + # filename_vector[i + 1] for i, detected in enumerate(detect_vec) if detected ] if not start_datetime:
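
Note on the final hunk (PATCH 33/33): a minimal sketch of the indexing it reverts to, assuming `detect_vec`, `time_vector`, and `filename_vector` are index-aligned vectors of equal length. That alignment, and the sample values below, are assumptions drawn from this hunk alone, not from the rest of the module.

# Hedged sketch, not the project's implementation: assumes the three vectors
# are index-aligned, so filename_vector[i] is the audio file containing bin i.
time_vector = ["2014-05-14T07:07", "2014-05-14T07:08", "2014-05-14T07:09"]
filename_vector = ["file01.wav", "file02.wav", "file03.wav"]
detect_vec = [False, True, True]

# Keep only the bins flagged as detections, pairing each kept timestamp
# with the file at the same index.
start_datetime = [t for t, detected in zip(time_vector, detect_vec) if detected]
file_vector = [f for f, detected in zip(filename_vector, detect_vec) if detected]

assert start_datetime == ["2014-05-14T07:08", "2014-05-14T07:09"]
assert file_vector == ["file02.wav", "file03.wav"]

# Under this alignment, the previous filename_vector[i + 1] indexing would map
# each detection to the following file and raise IndexError for a detection in
# the last bin, which is consistent with reverting to filename_vector[i].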