diff --git a/src/post_processing/dataclass/data_aplose.py b/src/post_processing/dataclass/data_aplose.py index 98e6d9c..ce2d9da 100644 --- a/src/post_processing/dataclass/data_aplose.py +++ b/src/post_processing/dataclass/data_aplose.py @@ -393,7 +393,6 @@ def plot( color = kwargs.get("color") season = kwargs.get("season") effort = kwargs.get("effort") - if not bin_size: msg = "'bin_size' missing for histogram plot." raise ValueError(msg) @@ -417,26 +416,30 @@ def plot( season = kwargs.get("season", False) bin_size = kwargs.get("bin_size") - return heatmap(df=df_filtered, - ax=ax, - bin_size=bin_size, - time_range=time, - show_rise_set=show_rise_set, - season=season, - coordinates=self.coordinates, - ) + return heatmap( + df=df_filtered, + ax=ax, + bin_size=bin_size, + time_range=time, + show_rise_set=show_rise_set, + season=season, + coordinates=self.coordinates, + ) if mode == "scatter": show_rise_set = kwargs.get("show_rise_set", True) season = kwargs.get("season", False) + effort = kwargs.get("effort") - return scatter(df=df_filtered, - ax=ax, - time_range=time, - show_rise_set=show_rise_set, - season=season, - coordinates=self.coordinates, - ) + return scatter( + df=df_filtered, + ax=ax, + time_range=time, + show_rise_set=show_rise_set, + season=season, + coordinates=self.coordinates, + effort=effort, + ) if mode == "agreement": bin_size = kwargs.get("bin_size") @@ -450,7 +453,11 @@ def plot( label, ) - return timeline(df=df_filtered, ax=ax, color=color) + return timeline( + df=df_filtered, + ax=ax, + color=color, + ) msg = f"Unsupported plot mode: {mode}" raise ValueError(msg) diff --git a/src/post_processing/dataclass/detection_filter.py b/src/post_processing/dataclass/detection_filter.py index a9158c4..09776ae 100644 --- a/src/post_processing/dataclass/detection_filter.py +++ b/src/post_processing/dataclass/detection_filter.py @@ -7,7 +7,7 @@ from __future__ import annotations -from dataclasses import dataclass +from dataclasses import dataclass, fields from pathlib import Path from typing import TYPE_CHECKING, Literal @@ -41,6 +41,12 @@ class DetectionFilter: box: bool = False filename_format: str = None + def __getitem__(self, key: str): + """Return the value of the given key.""" + if key in {f.name for f in fields(self)}: + return getattr(self, key) + raise KeyError(key) + @classmethod def from_yaml( cls, diff --git a/src/post_processing/dataclass/recording_period.py b/src/post_processing/dataclass/recording_period.py index 4c09722..d0d48b7 100644 --- a/src/post_processing/dataclass/recording_period.py +++ b/src/post_processing/dataclass/recording_period.py @@ -8,19 +8,16 @@ from dataclasses import dataclass from typing import TYPE_CHECKING -from osekit.config import TIMESTAMP_FORMATS_EXPORTED_FILES -from osekit.utils.timestamp_utils import strptime_from_text from pandas import ( + IntervalIndex, Series, Timedelta, - cut, + date_range, read_csv, + to_datetime, ) -from post_processing.utils.core_utils import ( - get_time_range_and_bin_size, - localize_timestamps, -) +from post_processing.utils.core_utils import round_begin_end_timestamps from post_processing.utils.filtering_utils import ( find_delimiter, ) @@ -33,7 +30,7 @@ @dataclass(frozen=True) class RecordingPeriod: - """A class to handle recording periods.""" + """Represents recording effort over time, aggregated into bins.""" counts: Series timebin_origin: Timedelta @@ -42,33 +39,125 @@ class RecordingPeriod: def from_path( cls, config: DetectionFilter, - date_format: str = TIMESTAMP_FORMATS_EXPORTED_FILES, *, bin_size: 
Timedelta | BaseOffset, ) -> RecordingPeriod: - """Return a list of Timestamps corresponding to recording periods.""" + """Vectorised creation of recording coverage from CSV with start/end datetimes. + + This method reads a CSV with columns: + - "start_recording" + - "end_recording" + - "start_deployment" + - "end_deployment" + + It computes the **effective recording interval** as the intersection between + recording and deployment periods, builds a fine-grained timeline at + `timebin_origin` resolution, and aggregates effort into `bin_size` bins. + + Parameters + ---------- + config + Configuration object containing at least: + - `timestamp_file`: path to CSV + - `timebin_origin`: Timedelta resolution of detections + bin_size : Timedelta or BaseOffset + Size of the aggregation bin (e.g. Timedelta("1H") or "1D"). + + Returns + ------- + RecordingPeriod + Object containing `counts` (Series indexed by IntervalIndex) and + `timebin_origin`. + + """ + # Read CSV and parse datetime columns timestamp_file = config.timestamp_file delim = find_delimiter(timestamp_file) - timestamp_df = read_csv(timestamp_file, delimiter=delim) - - if "timestamp" in timestamp_df.columns: - msg = "Parsing 'timestamp' column not implemented yet." - raise NotImplementedError(msg) - - if "filename" in timestamp_df.columns: - timestamps = [ - strptime_from_text(ts, date_format) - for ts in timestamp_df["filename"] - ] - timestamps = localize_timestamps(timestamps, config.timezone) - time_vector, bin_size = get_time_range_and_bin_size(timestamps, bin_size) - - binned = cut(timestamps, time_vector) - max_annot = bin_size / config.timebin_origin - - return cls(counts=binned.value_counts().sort_index().clip(upper=max_annot), - timebin_origin=config.timebin_origin, - ) - - msg = "Could not parse timestamps." - raise ValueError(msg) + df = read_csv( + config.timestamp_file, + parse_dates=[ + "start_recording", + "end_recording", + "start_deployment", + "end_deployment", + ], + delimiter=delim, + ) + + if df.empty: + msg = "CSV is empty." + raise ValueError(msg) + + # Ensure all required columns are present + required_columns = { + "start_recording", + "end_recording", + "start_deployment", + "end_deployment", + } + + missing = required_columns - set(df.columns) + + if missing: + msg = f"CSV is missing required columns: {', '.join(sorted(missing))}" + raise ValueError(msg) + + # Normalise timezones: convert to UTC, then remove tz info (naive) + for col in [ + "start_recording", + "end_recording", + "start_deployment", + "end_deployment", + ]: + df[col] = to_datetime(df[col], utc=True).dt.tz_convert(None) + + # Compute effective recording intervals (intersection) + df["effective_start_recording"] = df[ + ["start_recording", "start_deployment"] + ].max(axis=1) + + df["effective_end_recording"] = df[ + ["end_recording", "end_deployment"] + ].min(axis=1) + + # Remove rows with no actual recording interval + df = df.loc[ + df["effective_start_recording"] < df["effective_end_recording"] + ].copy() + + if df.empty: + msg = "No valid recording intervals after deployment intersection." 
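+            # All rows were dropped above: no recording window overlaps its deployment window.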
+ raise ValueError(msg) + + # Build fine-grained timeline at `timebin_origin` resolution + origin = config.timebin_origin + time_index = date_range( + start=df["effective_start_recording"].min(), + end=df["effective_end_recording"].max(), + freq=origin, + ) + + # Initialise effort vector (0 = no recording, 1 = recording) + # Compare each timestamp to all intervals in a vectorised manner + effort = Series(0, index=time_index) + + # Vectorised interval coverage + t_vals = time_index.to_numpy()[:, None] + start_vals = df["effective_start_recording"].to_numpy() + end_vals = df["effective_end_recording"].to_numpy() + + # Boolean matrix: True if the timestamp is within any recording interval + covered = (t_vals >= start_vals) & (t_vals < end_vals) + effort[:] = covered.any(axis=1).astype(int) + + # Aggregate effort into user-defined bin_size + counts = effort.resample(bin_size, closed="left", label="left").sum() + + counts.index = IntervalIndex.from_arrays( + counts.index, + counts.index + + round_begin_end_timestamps(list(counts.index), bin_size)[-1], + closed="left", + ) + + return cls(counts=counts, timebin_origin=origin) diff --git a/src/post_processing/utils/core_utils.py b/src/post_processing/utils/core_utils.py index 5a831e1..d0125ae 100644 --- a/src/post_processing/utils/core_utils.py +++ b/src/post_processing/utils/core_utils.py @@ -3,15 +3,16 @@ from __future__ import annotations import json -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import astral import easygui import numpy as np -from astral.sun import sunrise, sunset +from astral import LocationInfo +from astral.sun import sunrise, sunset, sun from matplotlib import pyplot as plt from osekit.config import TIMESTAMP_FORMAT_AUDIO_FILE -from osekit.utils.timestamp_utils import strptime_from_text, strftime_osmose_format +from osekit.utils.timestamp_utils import strftime_osmose_format, strptime_from_text from pandas import ( DataFrame, DatetimeIndex, @@ -44,10 +45,10 @@ def get_season(ts: Timestamp, *, northern: bool = True) -> tuple[str, int]: """Determine the meteorological season from a Timestamp. 
- In the Northern hemisphere + In the Northern Hemisphere Winter: Dec-Feb, Spring: Mar-May, Summer: Jun-Aug, Autumn: Sep-Nov - In the Southern hemisphere + In the Southern Hemisphere Winter: Jun-Aug, Spring: Sep-Nov, Summer: Dec-Feb, Autumn: Mar-May Parameters @@ -133,8 +134,18 @@ def get_sun_times( dt_sunset = Timestamp(sunset(gps.observer, date=date)).tz_convert(tz) # Convert sunrise and sunset to decimal hours - h_sunrise.append(dt_sunrise.hour + dt_sunrise.minute / 60 + dt_sunrise.second / 3600 + dt_sunrise.microsecond / 3_600_000_000) - h_sunset.append(dt_sunset.hour + dt_sunset.minute / 60 + dt_sunset.second / 3600 + dt_sunset.microsecond / 3_600_000_000) + h_sunrise.append( + dt_sunrise.hour + + dt_sunrise.minute / 60 + + dt_sunrise.second / 3600 + + dt_sunrise.microsecond / 3_600_000_000, + ) + h_sunset.append( + dt_sunset.hour + + dt_sunset.minute / 60 + + dt_sunset.second / 3600 + + dt_sunset.microsecond / 3_600_000_000, + ) return h_sunrise, h_sunset @@ -215,8 +226,12 @@ def add_weak_detection( if not max_time: max_time = Timedelta(get_max_time(df), "s") - df["start_datetime"] = [strftime_osmose_format(start) for start in df["start_datetime"]] - df["end_datetime"] = [strftime_osmose_format(stop) for stop in df["end_datetime"]] + df["start_datetime"] = [ + strftime_osmose_format(start) for start in df["start_datetime"] + ] + df["end_datetime"] = [ + strftime_osmose_format(stop) for stop in df["end_datetime"] + ] for ant in annotators: for lbl in labels: @@ -255,17 +270,16 @@ def add_weak_detection( new_line.append(np.nan) df.loc[df.index.max() + 1] = new_line - return df.sort_values(by=["start_datetime", "annotator"]).reset_index(drop=True) def json2df(json_path: Path) -> DataFrame: - """Convert a metadatax json file into a DataFrame. + """Convert a metadatax JSON file into a DataFrame. Parameters ---------- json_path: Path - Json file path + JSON file path """ with json_path.open(encoding="utf-8") as f: @@ -301,15 +315,15 @@ def add_season_period( raise ValueError(msg) bins = date_range( - start=Timestamp(ax.get_xlim()[0], unit="D").floor("1D"), - end=Timestamp(ax.get_xlim()[1], unit="D").ceil("1D"), + start=Timestamp(ax.get_xlim()[0], unit="D"), + end=Timestamp(ax.get_xlim()[1], unit="D"), ) season_colors = { - "winter": "#2ce5e3", - "spring": "#4fcf50", - "summer": "#ffcf50", - "autumn": "#fb9a67", + "winter": "#84eceb", + "spring": "#91de92", + "summer": "#fce097", + "autumn": "#f9c1a5", } bin_centers = [ @@ -330,8 +344,9 @@ def add_season_period( width=(bins[i + 1] - bins[i]), color=season_colors[season], align="center", - zorder=0, - alpha=0.6, + zorder=2, + alpha=1, + linewidth=0, ) ax.set_ylim(ax.dataLim.ymin, ax.dataLim.ymax) @@ -480,7 +495,7 @@ def get_labels_and_annotators(df: DataFrame) -> tuple[list, list]: def localize_timestamps(timestamps: list[Timestamp], tz: tzinfo) -> list[Timestamp]: - """Localize timestamps if necessary.""" + """Localise timestamps if necessary.""" localized = [] for ts in timestamps: if ts.tzinfo is None or ts.tzinfo.utcoffset(ts) is None: @@ -509,20 +524,20 @@ def get_time_range_and_bin_size( if isinstance(bin_size, Timedelta): return timestamp_range, bin_size - elif isinstance(bin_size, BaseOffset): + if isinstance(bin_size, BaseOffset): return timestamp_range, timestamp_range[1] - timestamp_range[0] - else: - msg = "bin_size must be a Timedelta or BaseOffset." - raise TypeError(msg) + msg = "bin_size must be a Timedelta or BaseOffset." 
+ raise TypeError(msg) def round_begin_end_timestamps( timestamp_list: list[Timestamp], bin_size: Timedelta | BaseOffset, -) -> tuple[Timestamp, Timestamp, Timedelta]: +) -> tuple[Any, Any, Any]: """Return time vector given a bin size.""" - if (not isinstance(timestamp_list, list) or - not all(isinstance(ts, Timestamp) for ts in timestamp_list)): + if not isinstance(timestamp_list, list) or not all( + isinstance(ts, Timestamp) for ts in timestamp_list + ): msg = "timestamp_list must be a list[Timestamp]" raise TypeError(msg) @@ -546,14 +561,16 @@ def round_begin_end_timestamps( timestamp_range = date_range(start=start, end=end, freq=bin_size) bin_size = timestamp_range[1] - timestamp_range[0] - return start.floor(bin_size), end.ceil(bin_size), bin_size + if bin_size.resolution_string in {"s", "min", "h"}: + return start.floor(bin_size), end.ceil(bin_size), bin_size + return start, end, bin_size msg = "Could not get start/end timestamps." raise ValueError(msg) def timedelta_to_str(td: Timedelta) -> str: - """From a Timedelta to corresponding string.""" + """From a Timedelta to the corresponding string.""" seconds = int(td.total_seconds()) if seconds % 86400 == 0: @@ -563,3 +580,44 @@ def timedelta_to_str(td: Timedelta) -> str: if seconds % 60 == 0: return f"{seconds // 60}min" return f"{seconds}s" + + +def assign_light_regime( + ts: Timestamp, + lat: float | None = None, + lon: float | None = None, +) -> DataFrame: + """Assign daylight regime to temporal events. + + Parameters + ---------- + ts: Timestamp + Timestamp to assign a light regime to. + lat: float + The latitude of corresponding point. + lon: float + The longitude of corresponding point. + + Returns + ------- + DataFrame + The same dataframe with the column daytime. + + """ + if not all([lat, lon]): + lat, lon = get_coordinates() + + # Get sun times for given location + location = LocationInfo(latitude=lat, longitude=lon) + s = sun(location.observer, date=ts.date()) + + if ts < s['dawn']: + return 'night' + elif ts < s['sunrise']: + return 'dawn' + elif ts < s['sunset']: + return 'day' + elif ts < s['dusk']: + return 'dusk' + else: + return 'night' diff --git a/src/post_processing/utils/filtering_utils.py b/src/post_processing/utils/filtering_utils.py index c391ff6..3d4832b 100644 --- a/src/post_processing/utils/filtering_utils.py +++ b/src/post_processing/utils/filtering_utils.py @@ -8,6 +8,7 @@ from typing import TYPE_CHECKING import pytz +from osekit.utils.timestamp_utils import strptime_from_text from pandas import ( DataFrame, Timedelta, @@ -485,6 +486,7 @@ def _process_annotator_label_pair( ] file_vector = [ filename_vector[i] for i, detected in enumerate(detect_vec) if detected + # filename_vector[i + 1] for i, detected in enumerate(detect_vec) if detected ] if not start_datetime: @@ -570,16 +572,17 @@ def get_filename_timestamps(df: DataFrame, date_parser: str) -> list[Timestamp]: """ tz = get_timezone(df) - try: - return [ - to_datetime( + timestamps = [ + strptime_from_text( ts, - format=date_parser, - ).tz_localize(tz) for ts in df["filename"] - ] - except ValueError: - msg = """Could not parse timestamps from `df["filename"]`.""" - raise ValueError(msg) from None + datetime_template=date_parser, + ) for ts in df["filename"] + ] + + if all(t.tz is None for t in timestamps): + timestamps = [t.tz_localize(tz) for t in timestamps] + + return timestamps def ensure_in_list(value: str, candidates: list[str], label: str) -> None: diff --git a/src/post_processing/utils/fpod_utils.py b/src/post_processing/utils/fpod_utils.py 
index d36bb89..cd13e58 100644 --- a/src/post_processing/utils/fpod_utils.py +++ b/src/post_processing/utils/fpod_utils.py @@ -3,7 +3,6 @@ from __future__ import annotations import logging -from pathlib import Path from typing import TYPE_CHECKING import matplotlib.dates as mdates @@ -15,9 +14,10 @@ from pandas import ( DataFrame, DateOffset, + Series, Timedelta, + Timestamp, concat, - date_range, notna, read_csv, to_datetime, @@ -25,10 +25,12 @@ to_timedelta, ) -from post_processing.utils.core_utils import get_coordinates, get_sun_times +from post_processing.utils.filtering_utils import find_delimiter from user_case.config import season_color, site_colors if TYPE_CHECKING: + from pathlib import Path + import pytz logger = logging.getLogger(__name__) @@ -40,9 +42,9 @@ def pod2aplose( dataset_name: str, annotation: str, annotator: str, - bin_size: int = 60, + bin_size: Timedelta, ) -> DataFrame: - """Format PODs DataFrame to match APLOSE format. + """Format PODs DataFrame to match an APLOSE format. Parameters ---------- @@ -56,7 +58,7 @@ def pod2aplose( annotation name. annotator: str annotator name. - bin_size: int + bin_size: Timedelta Duration of the detections in seconds. Returns @@ -65,327 +67,152 @@ def pod2aplose( An APLOSE formatted DataFrame. """ - df = df.copy() - df["_temp_dt"] = [ - tz.localize(strptime_from_text(entry, "%d/%m/%Y %H:%M")) - for entry in df["ChunkEnd"] + fpod_start_dt = [ + tz.localize(entry) + for entry in df["Datetime"] ] - df = df.sort_values("_temp_dt").reset_index(drop=True) - - fpod_start_dt = df["_temp_dt"].tolist() - fpod_end_dt = [entry + Timedelta(seconds=bin_size) for entry in fpod_start_dt] - data = { "dataset": [dataset_name] * len(df), - "filename": [strftime_osmose_format(entry) for entry in fpod_start_dt], + "filename": list(fpod_start_dt), "start_time": [0] * len(df), - "end_time": [bin_size] * len(df), + "end_time": [bin_size.total_seconds()] * len(df), "start_frequency": [0] * len(df), "end_frequency": [0] * len(df), "annotation": [annotation] * len(df), "annotator": [annotator] * len(df), - "start_datetime": [strftime_osmose_format(entry) for entry in fpod_start_dt], - "end_datetime": [strftime_osmose_format(entry) for entry in fpod_end_dt], - "is_box": [0] * len(df), - "deploy.name": df["deploy.name"].tolist(), + "start_datetime": [ + strftime_osmose_format(entry.floor(bin_size)) for entry in fpod_start_dt + ], + "end_datetime": [ + strftime_osmose_format(entry.ceil(bin_size)) for entry in fpod_start_dt + ], + "type": ["WEAK"] * len(df), + "deploy": df["Deploy"].tolist(), } return DataFrame(data) -def csv_folder( - folder_path: Path, - sep: str = ";", - encoding: str = "latin-1", +def load_pod_folder( + folder: Path, + ext: str, ) -> DataFrame: - """Process all CSV files from a folder. + """Read POD's result files from a folder. Parameters ---------- - folder_path: Path + folder: Path Folder's place. - sep: str, default=";" - Column separator. - encoding: str, default="latin-1" - File encoding. + ext: str + File extension of result files. Returns ------- DataFrame - Concatenated data with optional filename column. + Concatenated data. Raises ------ ValueError - If no CSV files found. + If no result files are found. 
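+
+    Examples
+    --------
+    Illustrative call on a folder of CPOD ``.csv`` exports (the folder name is
+    a placeholder):
+
+    >>> from pathlib import Path
+    >>> clicks = load_pod_folder(Path("cpod_exports"), ext="csv")  # doctest: +SKIP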
""" - all_files = list(folder_path.rglob("*.csv")) - - if not all_files: - msg = f"No .csv files found in {folder_path}" + if ext not in {"csv", "txt"}: + msg = f"Invalid file extension: {ext}" raise ValueError(msg) - all_data = [] - for file in all_files: - df = read_csv(file, sep=sep, encoding=encoding, dtype="O") - df["deploy.name"] = file.stem - all_data.append(df) - - return concat(all_data, ignore_index=True) - - -def txt_folder( - folder_path: Path, - sep: str = "\t", -) -> DataFrame: - r"""Process all TXT files from a folder. - - Parameters - ---------- - folder_path: Path - Folder's place. - sep: str, default="\t" - Column separator. - - Returns - ------- - DataFrame - Concatenated data from all TXT files. - - """ - all_files = list(Path(folder_path).rglob("*.txt")) + all_files = sorted(folder.rglob(f"*.{ext}")) if not all_files: - msg = f"No .txt files found in {folder_path}" + msg = f"No .{ext} files found in {folder}" raise ValueError(msg) all_data = [] for file in all_files: - file_path = folder_path / file - df = read_csv(file_path, sep=sep) - all_data.append(df) - - return concat(all_data, ignore_index=True) - - -def parse_timestamps( - df: DataFrame, - col_timestamp: str, - date_formats: list[str] | None = None, -) -> DataFrame: - """Parse timestamp column with multiple possible formats. - - Parameters - ---------- - df: DataFrame - Input dataframe. - col_timestamp: str - Name of the timestamp column to parse. - date_formats: list[str], optional - List of strptime formats to try. If None, uses common formats. - - Returns - ------- - DataFrame - Copy of df with parsed timestamps. - - """ - if date_formats is None: - date_formats = [ - "%Y-%m-%dT%H:%M:%S.%f%z", - "%Y-%m-%dT%H:%M:%S:%Z", - "%Y-%m-%dT%H:%M:%S.%f", - "%Y-%m-%dT%H:%M:%S", - "%Y-%m-%d %H:%M:%S.%f", - "%d/%m/%Y %H:%M", - ] - - df = df.copy() - df[col_timestamp] = df[col_timestamp].apply( - lambda x: strptime_from_text(x, date_formats)) - return df - - -def required_columns( - df: DataFrame, - columns: list[str], -) -> None: - """Validate that required columns exist in dataframe. - - Parameters - ---------- - df: DataFrame - Table to validate. - columns : list[str] - List of required column names. - - Raises - ------ - ValueError - If any required column is missing. - - """ - for col in columns: - if col not in df.columns: - msg = f"'{col}' not found in {df}" - raise ValueError(msg) - - -def create_mask( - df: DataFrame, - col_timestamp: str, - col_start: str, - col_end: str, -) -> DataFrame: - """Filter rows to keep only those within deployment period. - - Parameters - ---------- - df : DataFrame - Table with timestamp and deployment period columns. - col_timestamp : str - Name of timestamp column. - col_start : str - Name of deployment start date column. - col_end : str - Name of deployment end date column. - - Returns - ------- - DataFrame - Filtered dataframe with rows in deployment periods. - - """ - mask = ( - (df[col_timestamp] >= df[col_start]) - & (df[col_timestamp] <= df[col_end]) - & df[col_timestamp].notna() - & df[col_start].notna() - & df[col_end].notna() - ) - return df[mask].copy() - - -def meta_cut_aplose( - raw_data: DataFrame, - metadata: DataFrame, -) -> DataFrame: - """Filter data to keep only rows within deployment periods. - - Parameters - ---------- - raw_data : DataFrame - Table containing deployment name and timestamps. - metadata : DataFrame - Metadata with deployment periods (start/end dates). - - Returns - ------- - DataFrame - Filtered data with only rows within deployment periods. 
- - """ - required_columns( - raw_data, ["deploy.name", "start_datetime"]) - required_columns( - metadata, ["deploy.name", "deployment_date", "recovery_date"]) - - raw = parse_timestamps(raw_data, "start_datetime") - raw = raw.sort_values(["start_datetime"]) - - dfm = raw.merge( - metadata[["deploy.name", "deployment_date", "recovery_date"]], - on="deploy.name", - how="left", - ) - - out = create_mask(dfm, "start_datetime", "deployment_date", "recovery_date") + sep = find_delimiter(file) + df = read_csv( + file, + sep=sep, + ) - columns_to_drop = [ - col for col in ["deployment_date", "recovery_date"] if col not in raw_data. - columns] - if columns_to_drop: - out = out.drop(columns=columns_to_drop) + df["Deploy"] = file.stem.strip().lower().replace(" ", "_") + all_data.append(df) - return out.sort_values(["start_datetime"]).reset_index(drop=True) + data = concat(all_data, ignore_index=True) + if ext == "csv": + return _process_csv_data(data) + if ext == "txt": + return _process_txt_data(data) -def add_utc( - df: DataFrame, - cols: list, - fr: str = "h", -) -> DataFrame: - """Create a DataFrame with one line per hour between start and end dates. + msg = f"Could not load {ext} result folder" + raise ValueError(msg) - Keep the number of detections per hour between these dates. - Parameters - ---------- - df: DataFrame - Metadata dataframe with deployments information (previously exported as json). - cols:list - Timestamp column names. - fr:str - Frequency of the range of detections. +def _process_csv_data(data: DataFrame) -> DataFrame: + """Process CSV data with filtering and datetime conversion.""" + data_filtered = _filter_csv_data(data) + data_filtered["Datetime"] = [ + strptime_from_text(dt, "%d/%m/%Y %H:%M") + for dt in data_filtered["ChunkEnd"] + ] + return data_filtered.sort_values(by=["Datetime"]).reset_index(drop=True) - Returns - ------- - DataFrame - A full period of time with positive and negative hours to detections. - """ - for col in df[cols]: - df[col] = to_datetime(df[col], utc=True) - df[col] = df[col].dt.floor(fr) - return df +def _filter_csv_data(data: DataFrame) -> DataFrame: + """Filter CSV data based on available columns.""" + if "%TimeLost" in data.columns: + data_filtered = data[data["File"].notna()].copy() + data_filtered = data_filtered[data_filtered["Nall/m"].notna()] + else: + data_filtered = data[data["DPM"] > 0].copy() + data_filtered = data_filtered[data_filtered["Nall"].notna()] + return data_filtered -def build_range( - df: DataFrame, - fr: str = "h", -) -> DataFrame: - """Create a DataFrame with one line per hour between start and end dates. - Keep the number of detections per hour between these dates. +def _process_txt_data(data: DataFrame) -> DataFrame: + """Process TXT data with datetime conversion.""" + data["Datetime"] = data.apply(get_feeding_buzz_datetime, axis=1) + return data.drop_duplicates().sort_values(by=["Datetime"]).reset_index(drop=True) - Parameters - ---------- - df: DataFrame - Metadata dataframe with deployments information (previously exported as json) - fr:str - Frequency of the range of detections. - Returns - ------- - DataFrame - A full period of time with positive and negative hours to detections. +def get_feeding_buzz_datetime(row: Series) -> Timestamp: + """Convert feeding buzz timestamp into a standard Timestamp. + The conversion method differs based on the POD type. 
""" - add_utc(df, ["Deb", "Fin"], fr) + try: + return ( + to_datetime("1900-01-01") + + to_timedelta(row["Minute"], unit="min") + + to_timedelta(row["microsec"] / 1e6, unit="sec") + - to_timedelta(2, unit="D") + ) + except (KeyError, TypeError, ValueError): + pass - all_ranges = [] - for _, row in df.iterrows(): - hours = date_range(row["Deb"], row["Fin"], freq=fr) - tmp = DataFrame( - { - "deploy.name": row["deploy.name"], - "start_datetime": hours, - }, + try: + return strptime_from_text( + f"{row['Minute']}:{int(str(row['microsec'])[0]):02d}.{int(str(row['microsec'])[1:])}", + "%-d/%-m/%Y %H:%M:%S.%f", ) - all_ranges.append(tmp) + except (KeyError, TypeError, ValueError): + pass - return concat(all_ranges, ignore_index=True) + msg = "Could not convert feeding buzz timestamp." + raise ValueError(msg) -def feeding_buzz( +def process_feeding_buzz( df: DataFrame, species: str, ) -> DataFrame: - """Process a CPOD/FPOD feeding buzz detection file. + """Process a POD feeding buzz detection DataFrame. - Gives the feeding buzz duration, depending on the studied species. + Give the feeding buzz duration, depending on the studied species + (`delphinid`, `porpoise` or `commerson`). Parameters ---------- @@ -397,322 +224,69 @@ def feeding_buzz( Returns ------- DataFrame - Containing all ICIs for every positive minutes to clicks + Containing all ICIs for every positive minute to click """ - df["microsec"] /= 1e6 - - try: - df["Minute"].astype(int) - df["datetime"] = ( - to_datetime("1900-01-01") - + to_timedelta(df["Minute"], unit="min") - + to_timedelta(df["microsec"], unit="sec") - - to_timedelta(2, unit="D") - ) - df["start_datetime"] = df["datetime"].dt.floor("min") - except (ValueError, TypeError): - df["datetime"] = ( - to_datetime(df["Minute"], dayfirst=True) - + to_timedelta(df["microsec"], unit="sec") - ) - df["start_datetime"] = to_datetime(df["Minute"], dayfirst=True) - - df["ICI"] = df["datetime"].diff() - - if species == "Dauphin": # Herzing et al., 2014 - df["Buzz"] = df["ICI"].between(Timedelta(0), - Timedelta(seconds=0.02)).astype(int) - elif species == "Marsouin": # Nuuttila et al., 2013 - df["Buzz"] = df["ICI"].between(Timedelta(0), - Timedelta(seconds=0.01)).astype(int) - elif species == "Commerson": # Reyes Reyes et al., 2015 - df["Buzz"] = df["ICI"].between(Timedelta(0), - Timedelta(seconds=0.005)).astype(int) + df["ICI"] = df["Datetime"].diff() + df["Datetime"] = df["Datetime"].dt.floor("min") + + if species.lower() == "delphinid": # Herzing et al., 2014 + df["Buzz"] = df["ICI"].between( + Timedelta(0), + Timedelta(seconds=0.02), + ).astype(int) + elif species.lower() == "porpoise": # Nuuttila et al., 2013 + df["Buzz"] = df["ICI"].between( + Timedelta(0), + Timedelta(seconds=0.01), + ).astype(int) + elif species.lower() == "commerson": # Reyes Reyes et al., 2015 + df["Buzz"] = df["ICI"].between( + Timedelta(0), + Timedelta(seconds=0.005), + ).astype(int) else: msg = "This species is not supported" raise ValueError(msg) - f = df.groupby(["start_datetime"])["Buzz"].sum().reset_index() + df_buzz = df.groupby(["Datetime"])["Buzz"].sum().reset_index() + df_buzz["Foraging"] = to_numeric( + df_buzz["Buzz"] != 0, downcast="integer", + ).astype(int) - f["Foraging"] = to_numeric(f["Buzz"] != 0, downcast="integer").astype(int) + return df_buzz - return f +def process_timelost(df: DataFrame, threshold: int = 0) -> DataFrame: + """Process TimeLost DataFrame. -def assign_daytime( - df: DataFrame, -) -> DataFrame: - """Assign datetime categories to temporal events. 
- - Categorize daytime of the detection (among 4 categories). + Returns relevant columns and reshape into hourly data. Parameters ---------- df: DataFrame - Contains positive hours to detections. - - Returns - ------- - DataFrame - The same dataframe with the column daytime. - - """ - df["start_datetime"] = to_datetime(df["start_datetime"], utc=True) - start = df["start_datetime"].min() - stop = df["start_datetime"].max() - lat, lon = get_coordinates() - sunrise, sunset = get_sun_times(start, stop, lat, lon) - - sun_times = DataFrame( - {"date": date_range(start, stop, freq="D"), - "sunrise": [Timedelta(h, "hours") for h in sunrise], - "sunset": [Timedelta(h, "hours") for h in sunset], - }) - - sun_times["sunrise"] = sun_times["date"].dt.floor("D") + sun_times["sunrise"] - sun_times["sunset"] = sun_times["date"].dt.floor("D") + sun_times["sunset"] - - for i, row in df.iterrows(): - dpm_i = row["start_datetime"] - if notna(dpm_i): # Check if time is not NaN - jour_i = sun_times[ - (sun_times["sunrise"].dt.year == dpm_i.year) - & (sun_times["sunrise"].dt.month == dpm_i.month) - & (sun_times["sunrise"].dt.day == dpm_i.day) - ] - if not jour_i.empty: # Ensure there's a matching row - jour_i = jour_i.iloc[0] # Extract first match - if (dpm_i <= jour_i["sunrise"]) | (dpm_i > jour_i["sunset"]): - df.loc[i, "REGIME"] = 1 - else: - df.loc[i, "REGIME"] = 2 - - return df - - -def is_dpm_col( - df: DataFrame, -) -> DataFrame: - """Ensure DPM column exists with default value of 1. - - Parameters - ---------- - df: DataFrame - Input dataframe. - - Returns - ------- - DataFrame - Copy of df with DPM column. - - """ - df = df.copy() - if "DPM" not in df.columns: - df["DPM"] = 1 - return df - - -def resample_dpm( - df: DataFrame, - frq: str, - cols: dict[str, str], - group_by: list[str] | None = None, -) -> DataFrame: - """Resample DPM data to specified time frequency. - - Aggregates Detection Positive Minutes (DPM) by time period, - optionally preserving grouping columns like deployment name. - - Parameters - ---------- - df: DataFrame - CPOD result DataFrame with DPM data. - frq: str - Pandas frequency string: "D" (day), "h" (hour), "10min", etc. - cols: dict[str, str] - Dictionary of column names and to process them. - group_by: list[str], optional - Columns to group by (e.g., ["deploy.name", "start_datetime"]). - If None, groups only by start_datetime. - - Returns - ------- - DataFrame - Resampled DataFrame with aggregated DPM values. - - Examples - -------- - >>> # Daily aggregation per deployment - >>> resample_dpm(df, "D", {"Foraging":"sum"}, group_by=["deploy.name"]) - - >>> # Hourly aggregation with site info preserved - >>> resample_dpm(df, "h", cols={"DPM":"sum","deploy.name":"first"}) - - """ - df = is_dpm_col(df) - df = add_utc(df, ["start_datetime"], frq) - - # Determine grouping columns - if group_by is None: - group_by = ["start_datetime"] - - return df.groupby(group_by).agg(cols).reset_index() - - -def deploy_period( - df: DataFrame, - col_timestamp: str = "start_datetime", - col_deployment: str = "deploy.name", -) -> DataFrame: - """Extract start and end timestamps for each deployment. - - Parameters - ---------- - df: DataFrame - Input dataframe with parsed timestamps. - col_timestamp: str, default="start_datetime" - Name of the timestamp column. - col_deployment: str, default="deploy.name" - Name of the deployment identifier column. - - Returns - ------- - DataFrame - Table with columns: [col_deployment, 'Deb', 'Fin']. 
- - """ - return ( - df.groupby([col_deployment]) - .agg(Deb=(col_timestamp, "first"), Fin=(col_timestamp, "last")) - .reset_index() - ) - - -def first_last( - df: DataFrame, - col_timestamp: str = "start_datetime", - col_deployment: str = "deploy.name", - date_formats: list[str] | None = None, -) -> DataFrame: - """Isolate beginning and end of every deployment. - - Parameters - ---------- - df: DataFrame - CPOD result DataFrame. - col_timestamp: str, default="start_datetime" - Name of the timestamps' column. - col_deployment: str, default="deploy.name" - Name of the deployment identifier column. - date_formats: list[str], optional - List of date formats to try for parsing. - - Returns - ------- - DataFrame - Table with deployment periods (Deb, Fin). - - """ - df_parsed = parse_timestamps(df, col_timestamp, date_formats) - return deploy_period(df_parsed, col_timestamp, col_deployment) - - -def actual_data( - df: DataFrame, - meta: DataFrame, -) -> DataFrame: - """Create a table with beginning and end of every deployment using metadata. - - Parameters - ---------- - df: DataFrame - Contains beginning and end for every deployment. - meta: DataFrame - Contains metadata for every deployment. - - Returns - ------- - DataFrame - Table with corrected deployment periods (Deb, Fin). - - """ - required_columns( - df, ["deploy.name", "ChunkEnd"]) - required_columns( - meta, ["deploy.name", "deployment_date", "recovery_date"]) - - beg_end = first_last(df, "ChunkEnd") - - beg_end = add_utc(beg_end, ["Deb", "Fin"]) - - final = beg_end.merge(meta[["deployment_date", "recovery_date", "deploy.name"]], - on="deploy.name", how="left") - final.loc[final["Deb"] < final["deployment_date"], "Deb"] = final["deployment_date"] - final.loc[final["Fin"] > final["recovery_date"], "Fin"] = final["recovery_date"] - final.loc[final["Deb"] > final["Fin"], ["Deb", "Fin"]] = None - final = final.sort_values(by=["Deb"]) - return final.drop(["deployment_date", "recovery_date"], axis=1) - - -def process_tl(tl_files: Path) -> DataFrame: - """Process Environmental data extracted from cpod.exe to get a usable dataframe. - - Parameters - ---------- - tl_files: Path All your Environmental data files. + threshold: float + TimeLost threshold. Returns ------- %TimeLost DataFrame. """ - df = csv_folder(tl_files) - df = df.dropna() - df = parse_timestamps(df, "ChunkEnd") - df = add_utc(df, ["ChunkEnd"], "h") - df["start_datetime"] = df["ChunkEnd"] - - return df.sort_values(["start_datetime"]) - - -def filter_tl(df: DataFrame, tl: int) -> DataFrame: - """Remove lines with a %TimeLost superior to the chosen threshold. - - Parameters - ---------- - df: DataFrame - Table of data and associated TimeLost. - tl: int - TimeLost filter threshold. - - Returns - ------- - Filtered DataFrame with few %TimeLost. - - """ - df["%TimeLost"] = (df["%TimeLost"].fillna(tl)).astype(int) - - return df[df["%TimeLost"] < tl] - - -def preserved_data(filtered_df: DataFrame, whole_df: DataFrame) -> float: - """Calculate the percentage of preserved data. - - Parameters - ---------- - filtered_df: DataFrame - Result of filter_tl. - whole_df: DataFrame - Table before filtering. - - Returns - ------- - Percentage of preserved data. + if threshold not in range(101): + msg = "Threshold must integer between 0 and 100." 
+ raise ValueError(msg) - """ - return (len(filtered_df) / len(whole_df)) * 100 + df["Datetime"] = df["Datetime"].dt.floor("h") + cols_to_drop = [ + col for col in df.columns if col not in { + "File", "Datetime", "Temp", "Angle", "%TimeLost", "Deploy", + } + ] + return df[df["%TimeLost"] <= threshold].drop( + columns=cols_to_drop, + ).sort_values(["Datetime"]).reset_index(drop=True) def create_matrix( @@ -747,32 +321,11 @@ def create_matrix( return matrix -def extract_site( - df: DataFrame, -) -> DataFrame: - """Create new columns: site.name and campaign.name, in order to match the metadata. - - Parameters - ---------- - df: DataFrame - All values concatenated - - Returns - ------- - DataFrame - The same dataframe with two additional columns. - - """ - required_columns(df, ["deploy.name"]) - df[["site.name", "campaign.name"]] = df["deploy.name"].str.split("_", expand=True) - return df - - def percent_calc( data: DataFrame, time_unit: str | None = None, ) -> DataFrame: - """Calculate percentage of clicks, feeding buzzes and positive hours to detection. + """Calculate the percentage of clicks, feeding buzzes and positive hours to detection. Computed on the entire effort and for every site. @@ -817,7 +370,7 @@ def percent_calc( def site_percent(df: DataFrame, metric: str) -> None: - """Plot a graph with percentage of minutes positive to detection for every site. + """Plot a graph with the percentage of minutes positive to detection for every site. Parameters ---------- diff --git a/src/post_processing/utils/glider_utils.py b/src/post_processing/utils/glider_utils.py index 626371c..aaa0224 100644 --- a/src/post_processing/utils/glider_utils.py +++ b/src/post_processing/utils/glider_utils.py @@ -175,7 +175,7 @@ def load_glider_nav(directory: Path) -> DataFrame: msg = f"Directory '{directory}' does not exist." raise FileNotFoundError(msg) - file = [f for f in directory.glob("*.gz") if "gli" in f.name] + file = [f for f in directory.rglob("*.gz") if "gli" in f.name] if not len(file) > 0: msg = f"Directory '{directory}' does not contain '.gz' files." diff --git a/src/post_processing/utils/plot_utils.py b/src/post_processing/utils/plot_utils.py index f21e343..4cffcd6 100644 --- a/src/post_processing/utils/plot_utils.py +++ b/src/post_processing/utils/plot_utils.py @@ -11,9 +11,17 @@ import numpy as np from matplotlib import dates as mdates from matplotlib.dates import num2date -from matplotlib.ticker import PercentFormatter +from matplotlib.patches import Patch from numpy import ceil, histogram, polyfit -from pandas import DataFrame, DatetimeIndex, Index, Timedelta, Timestamp, date_range +from pandas import ( + DataFrame, + DatetimeIndex, + Series, + Timedelta, + Timestamp, + concat, + date_range, +) from pandas.tseries import frequencies from scipy.stats import pearsonr from seaborn import scatterplot @@ -28,11 +36,10 @@ timedelta_to_str, ) from post_processing.utils.filtering_utils import ( + filter_by_annotator, get_max_time, get_timezone, - filter_by_annotator, ) -from post_processing.utils.metrics_utils import normalize_counts_by_effort if TYPE_CHECKING: from datetime import tzinfo @@ -68,15 +75,15 @@ def histo( - legend: bool Whether to show the legend. - color: str | list[str] - Color or list of colors for the histogram bars. - If not provided, default colors will be used. + Colour or list of colours for the histogram bars. + If not provided, default colours will be used. - season: bool Whether to show the season. - coordinates: tuple[float, float] The coordinates of the plotted detections. 
- effort: RecordingPeriod Object corresponding to the observation effort. - If provided, data will be normalized by observation effort. + If provided, data will be normalised by observation effort. """ labels, annotators = zip(*[col.rsplit("-", 1) for col in df.columns], strict=False) @@ -107,9 +114,6 @@ def histo( else: legend_labels = None - if effort: - normalize_counts_by_effort(df, effort, time_bin) - n_groups = len(labels) if legend_labels else 1 bar_width = bin_size / n_groups bin_starts = mdates.date2num(df.index) @@ -130,31 +134,29 @@ def histo( ax.bar(bin_starts + offset, df.iloc[:, i], **bar_kwargs) if len(df.columns) > 1 and legend: - ax.legend(labels=legend_labels, bbox_to_anchor=(1.01, 1), loc="upper left") + ax.legend( + labels=legend_labels, + bbox_to_anchor=(1.01, 1), + loc="upper left", + ) - y_label = ( - f"Detections{(' normalized by effort' if effort else '')}" - f"\n(detections: {timedelta_to_str(time_bin)}" - f" - bin size: {bin_size_str})" - ) - ax.set_ylabel(y_label) - set_y_axis_to_percentage(ax) if effort else set_dynamic_ylim(ax, df) + ax.set_ylabel(f"Detections ({timedelta_to_str(time_bin)})") + ax.set_xlabel(f"Bin size ({bin_size_str})") set_plot_title(ax, annotators, labels) ax.set_xlim(begin, end) - if season: - if lat is None or lon is None: - get_coordinates() - add_season_period(ax, northern=lat >= 0) - if effort: shade_no_effort( ax=ax, - bin_starts=df.index, observed=effort, - bar_width=bin_size, + legend=legend, ) + if season: + if lat is None or lon is None: + get_coordinates() + add_season_period(ax, northern=lat >= 0) + def _prepare_timeline_plot( df: DataFrame, @@ -196,7 +198,6 @@ def _prepare_timeline_plot( ax.set_ylim(0, 24) ax.set_yticks(range(0, 25, 2)) ax.set_ylabel("Hour") - ax.set_xlabel("Date") ax.grid(color="k", linestyle="-", linewidth=0.2) set_plot_title(ax=ax, annotators=annotators, labels=labels) @@ -213,7 +214,7 @@ def scatter( df: DataFrame, ax: Axes, time_range: DatetimeIndex, - **kwargs: bool | tuple[float, float], + **kwargs: bool | tuple[float, float] | RecordingPeriod, ) -> None: """Scatter-plot of detections for a given annotator and label. @@ -237,6 +238,8 @@ def scatter( show_rise_set = kwargs.get("show_rise_set", False) season = kwargs.get("season", False) coordinates = kwargs.get("coordinates", False) + effort = kwargs.get("effort", False) + legend = kwargs.get("legend", False) _prepare_timeline_plot( df=df, @@ -276,6 +279,13 @@ def scatter( framealpha=0.6, ) + if effort: + shade_no_effort( + ax=ax, + observed=effort, + legend=legend, + ) + def heatmap(df: DataFrame, ax: Axes, @@ -359,7 +369,7 @@ def heatmap(df: DataFrame, ) if coordinates and season: - lat, lon = coordinates + lat, _ = coordinates add_season_period(ax, northern=lat >= 0) bin_size_str = get_bin_size_str(bin_size) @@ -466,7 +476,7 @@ def agreement( bin_size: Timedelta | BaseOffset, ax: plt.Axes, ) -> None: - """Compute and visualize agreement between two annotators. + """Compute and visualise agreement between two annotators. This function compares annotation timestamps from two annotators over a time range. 
It also fits and plots a linear regression line and displays the coefficient @@ -487,41 +497,33 @@ def agreement( """ labels, annotators = get_labels_and_annotators(df) - datetimes1 = list( - df[(df["annotator"] == annotators[0]) & (df["annotation"] == labels[0])][ - "start_datetime" - ], - ) - datetimes2 = list( - df[(df["annotator"] == annotators[1]) & (df["annotation"] == labels[1])][ - "start_datetime" - ], - ) + datetimes = [ + list( + df[ + (df["annotator"] == annotators[i]) & (df["annotation"] == labels[i]) + ]["start_datetime"], + ) + for i in range(2) + ] # scatter plot n_annot_max = bin_size.total_seconds() / df["end_time"].iloc[0] - start = df["start_datetime"].min() - stop = df["start_datetime"].max() - freq = ( bin_size if isinstance(bin_size, Timedelta) else str(bin_size.n) + bin_size.name ) bins = date_range( - start=start.floor(bin_size), - end=stop.ceil(bin_size), + start=df["start_datetime"].min().floor(bin_size), + end=df["start_datetime"].max().ceil(bin_size), freq=freq, ) - hist1, _ = histogram(datetimes1, bins=bins) - hist2, _ = histogram(datetimes2, bins=bins) - df_hist = ( DataFrame( { - annotators[0]: hist1, - annotators[1]: hist2, + annotators[0]: histogram(datetimes[0], bins=bins)[0], + annotators[1]: histogram(datetimes[1], bins=bins)[0], }, ) / n_annot_max @@ -557,8 +559,8 @@ def timeline( Matplotlib axes object where the scatterplot and regression line will be drawn. **kwargs: Additional keyword arguments depending on the mode. - color: str | list[str] - Color or list of colors for the histogram bars. - If not provided, default colors will be used. + Colour or list of colours for the histogram bars. + If not provided, default colours will be used. """ color = kwargs.get("color") @@ -577,7 +579,7 @@ def timeline( ax.grid(color="k", linestyle="-", linewidth=0.2) ax.set_yticks(np.arange(0, len(labels), 1)) - ax.set_yticklabels(labels[::-1]) + ax.set_yticklabels(labels) ax.set_xlabel("Date") ax.set_xlim( df["start_datetime"].min().floor("1d"), @@ -606,12 +608,15 @@ def get_bin_size_str(bin_size: Timedelta | BaseOffset) -> str: return str(bin_size.n) + bin_size.freqstr -def set_y_axis_to_percentage( - ax: plt.Axes, -) -> None: +def set_y_axis_to_percentage(ax: plt.Axes, max_val: float) -> None: """Set y-axis to percentage.""" - ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0)) - ax.set_yticks(np.arange(0, 1.02, 0.2)) + ax.yaxis.set_major_formatter( + plt.FuncFormatter(lambda y, _: f"{(y / max_val) * 100:.0f}%"), + ) + + current_label = ax.get_ylabel() + if current_label and "%" not in current_label: + ax.set_ylabel(f"{current_label} (%)") def set_dynamic_ylim(ax: plt.Axes, @@ -639,9 +644,8 @@ def set_plot_title(ax: plt.Axes, annotators: list[str], labels: list[str]) -> No def shade_no_effort( ax: plt.Axes, - bin_starts: Index, observed: RecordingPeriod, - bar_width: Timedelta, + legend: bool, ) -> None: """Shade areas of the plot where no observation effort was made. @@ -649,31 +653,96 @@ def shade_no_effort( ---------- ax : plt.Axes The axes on which to draw the shaded regions. - bin_starts : Index - A datetime index representing the start times of each bin. observed : RecordingPeriod A Series with observation counts or flags, indexed by datetime. Should be aligned or re-indexable to `bin_starts`. - bar_width : Timedelta - Width of each time bin. Used to compute the span of the shaded areas. - + legend : bool + Wether to add the legend entry for the shaded regions. 
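+
+    Notes
+    -----
+    Effort in each bin is compared to the maximum number of `timebin_origin`
+    slots the bin can hold: bins with zero effort are shaded with the darker
+    "no data" grey, bins with an intermediate fraction with the lighter
+    "partial data" grey.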
""" + # Convert effort IntervalIndex → DatetimeIndex (bin starts) + effort_by_start = Series( + observed.counts.values, + index=[i.left for i in observed.counts.index], + ) + + bar_width = effort_by_start.index[1] - effort_by_start.index[0] width_days = bar_width.total_seconds() / 86400 - no_effort_bins = bin_starts[observed.counts.reindex(bin_starts) == 0] - for ts in no_effort_bins: - start = mdates.date2num(ts) - ax.axvspan(start, start + width_days, color="grey", alpha=0.08, zorder=1.5) - x_min, x_max = ax.get_xlim() - data_min = mdates.date2num(bin_starts[0]) - data_max = mdates.date2num(bin_starts[-1]) + width_days - - if x_min < data_min: - ax.axvspan(x_min, data_min, color="grey", alpha=0.08, zorder=1.5) - if x_max > data_max: - ax.axvspan(data_max, x_max, color="grey", alpha=0.08, zorder=1.5) - ax.set_xlim(x_min, x_max) + max_effort = bar_width / observed.timebin_origin + effort_fraction = effort_by_start / max_effort + + first_elem = Series([0], index=[effort_fraction.index[0] - bar_width]) + last_elem = Series([0], index=[effort_fraction.index[-1] + bar_width]) + effort_fraction = concat([first_elem, effort_fraction, last_elem]) + + no_effort = effort_fraction[effort_fraction == 0] + partial_effort = effort_fraction[(effort_fraction > 0) & (effort_fraction < 1)] + + # Get legend handle + handles1, labels1 = ax.get_legend_handles_labels() + + _draw_effort_spans( + ax=ax, + effort_index=partial_effort.index, + width_days=width_days, + facecolor="0.65", + alpha=0.1, + label="partial data", + ) + + _draw_effort_spans( + ax=ax, + effort_index=no_effort.index, + width_days=width_days, + facecolor="0.45", + alpha=0.15, + label="no data", + ) + + # Add effort legend to current plot legend + handles_effort = [] + if len(partial_effort) > 0: + handles_effort.append( + Patch(facecolor="0.65", alpha=0.1, label="partial data"), + ) + if len(no_effort) > 0: + handles_effort.append( + Patch(facecolor="0.45", alpha=0.15, label="no data"), + ) + if handles_effort and legend: + labels_effort = [h.get_label() for h in handles_effort] + handles = handles1 + handles_effort + labels = labels1 + labels_effort + ax.legend( + handles, + labels, + bbox_to_anchor=(1.01, 1), + loc="upper left", + ) + + +def _draw_effort_spans( + ax: plt.Axes, + effort_index: DatetimeIndex, + width_days: float, + *, + facecolor: str, + alpha: float, + label: str, +) -> None: + """Draw vertical lines for effort plot.""" + for ts in effort_index: + start = mdates.date2num(ts) + ax.axvspan( + start, + start + width_days, + facecolor=facecolor, + alpha=alpha, + linewidth=0, + zorder=1, + label=label, + ) def add_sunrise_sunset(ax: Axes, lat: float, lon: float, tz: tzinfo) -> None: diff --git a/tests/conftest.py b/tests/conftest.py index e03bf43..a6299e3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,6 +8,7 @@ import yaml from osekit.utils.timestamp_utils import strftime_osmose_format from pandas import DataFrame, read_csv +from pandas.tseries import frequencies SAMPLE = """dataset,filename,start_time,end_time,start_frequency,end_frequency,annotation,annotator,start_datetime,end_datetime,type,score sample_dataset,2025_01_25_06_20_00,0.0,10.0,0.0,72000.0,lbl2,ann2,2025-01-25T06:20:00.000+00:00,2025-01-25T06:20:10.000+00:00,WEAK,0.11 @@ -122,8 +123,6 @@ """ - - STATUS = """dataset,filename,ann1,ann2,ann3,ann4,ann5,ann6 sample_dataset,2025_01_25_06_20_00,FINISHED,FINISHED,FINISHED,FINISHED,FINISHED,FINISHED sample_dataset,2025_01_25_06_20_10,FINISHED,FINISHED,FINISHED,FINISHED,FINISHED,FINISHED @@ -134,6 +133,14 @@ 
sample_dataset,2025_01_26_06_20_20,FINISHED,FINISHED,FINISHED,FINISHED,FINISHED,FINISHED """ +# --------------------------------------------------------------------------- +# Fake recording planning CSV used for tests +# --------------------------------------------------------------------------- +RECORDING_PLANNING_CSV = """start_recording,end_recording,start_deployment,end_deployment +2024-01-01 00:00:00+0000,2024-04-09 02:00:00+0000,2024-01-02 00:00:00+0000,2024-04-30 02:00:00+0000 +2024-04-30 01:00:00+0000,2024-07-14 06:00:00+0000,2024-04-30 02:00:00+0000,2024-07-06 14:00:00+0000 +""" + @pytest.fixture def sample_df() -> DataFrame: @@ -228,3 +235,21 @@ def create_file(path: Path, size: int = 2048): create_file(nested / "file4.wav") (tmp_path / "ignore.txt").write_text("not audio") return tmp_path + + +@pytest.fixture +def recording_planning_csv(tmp_path) -> Path: + """Create a temporary CSV file simulating a recording planning.""" + path = tmp_path / "recording_planning.csv" + path.write_text(RECORDING_PLANNING_CSV) + return path + + +@pytest.fixture +def recording_planning_config(recording_planning_csv): + """Minimal config object compatible with RecordingPeriod.from_path.""" + class RecordingPlanningConfig: + timestamp_file: Path = recording_planning_csv + timebin_origin = frequencies.to_offset("1min") + + return RecordingPlanningConfig() diff --git a/tests/test_DataAplose.py b/tests/test_DataAplose.py index 5ad1b04..9b9516c 100644 --- a/tests/test_DataAplose.py +++ b/tests/test_DataAplose.py @@ -19,6 +19,7 @@ def test_data_aplose_init(sample_df: DataFrame) -> None: assert data.begin == sample_df["start_datetime"].min() assert data.end == sample_df["end_datetime"].max() + def test_filter_df_single_pair(sample_df: DataFrame) -> None: data = DataAplose(sample_df) filtered_data = data.filter_df(annotator="ann1", label="lbl1") @@ -30,17 +31,19 @@ def test_filter_df_single_pair(sample_df: DataFrame) -> None: ].reset_index(drop=True) assert filtered_data.equals(expected) + def test_change_tz(sample_df: DataFrame) -> None: data = DataAplose(sample_df) - new_tz = 'Etc/GMT-7' + new_tz = "Etc/GMT-7" data.change_tz(new_tz) - start_dt = data.df['start_datetime'] - end_dt = data.df['end_datetime'] + start_dt = data.df["start_datetime"] + end_dt = data.df["end_datetime"] assert all(ts.tz.zone == new_tz for ts in start_dt), f"The detection start timestamps have to be in {new_tz} timezone" assert all(ts.tz.zone == new_tz for ts in end_dt), f"The detection end timestamps have to be in {new_tz} timezone" assert data.begin.tz.zone == new_tz, f"The begin value of the DataAplose has to be in {new_tz} timezone" assert data.end.tz.zone == new_tz, f"The end value of the DataAplose has to be in {new_tz} timezone" + def test_filter_df_multiple_pairs(sample_df: DataFrame) -> None: data = DataAplose(sample_df) filtered_data = data.filter_df(annotator=["ann1", "ann2"], label=["lbl1", "lbl2"]) diff --git a/tests/test_core_utils.py b/tests/test_core_utils.py index a1a3d73..e72e482 100644 --- a/tests/test_core_utils.py +++ b/tests/test_core_utils.py @@ -8,6 +8,8 @@ from post_processing.dataclass.data_aplose import DataAplose from post_processing.utils.core_utils import ( + add_recording_period, + add_season_period, add_weak_detection, get_coordinates, get_count, @@ -15,13 +17,11 @@ get_season, get_sun_times, get_time_range_and_bin_size, + json2df, localize_timestamps, round_begin_end_timestamps, - timedelta_to_str, - add_season_period, - add_recording_period, set_bar_height, - json2df, + timedelta_to_str, ) @@ 
-409,10 +409,11 @@ def test_add_season_no_data() -> None: # %% add_recording_period + def test_add_recording_period_valid() -> None: fig, ax = plt.subplots() start = Timestamp("2025-01-01T00:00:00+00:00") - stop = Timestamp("2025-01-02T00:00:00+00:00") + stop = Timestamp("2025-01-02T00:00:00+00:00") ts = date_range(start=start, end=stop, freq="H", tz="UTC") values = list(range(len(ts))) @@ -423,7 +424,7 @@ def test_add_recording_period_valid() -> None: [ Timestamp("2025-01-01T00:00:00+00:00"), Timestamp("2025-01-02T00:00:00+00:00"), - ] + ], ], columns=["deployment_date", "recovery_date"], ) @@ -438,6 +439,7 @@ def test_add_recording_period_no_data() -> None: # %% set_bar_height + def test_set_bar_height_valid() -> None: fig, ax = plt.subplots() start = Timestamp("2025-01-01T00:00:00+00:00") @@ -457,6 +459,7 @@ def test_set_bar_height_no_data() -> None: # %% json2df + def test_json2df_valid(tmp_path): fake_json = { "deployment_date": "2025-01-01T00:00:00+00:00", @@ -474,9 +477,9 @@ def test_json2df_valid(tmp_path): [ Timestamp("2025-01-01T00:00:00+00:00"), Timestamp("2025-01-02T00:00:00+00:00"), - ] + ], ], columns=["deployment_date", "recovery_date"], ) - assert df.equals(expected) \ No newline at end of file + assert df.equals(expected) diff --git a/tests/test_filtering_utils.py b/tests/test_filtering_utils.py index 95fd987..3ec3760 100644 --- a/tests/test_filtering_utils.py +++ b/tests/test_filtering_utils.py @@ -77,7 +77,7 @@ def test_find_delimiter_unsupported_delimiter(tmp_path: Path) -> None: with pytest.raises( ValueError, - match=r"unsupported delimiter '&'" + match=r"unsupported delimiter '&'", ): find_delimiter(file) @@ -199,6 +199,7 @@ def test_filter_by_freq_valid(sample_df: DataFrame, f_min, f_max): if f_max is not None: assert (result["end_frequency"] <= f_max).all() + @pytest.mark.parametrize( "f_min, f_max, expected_msg", [ @@ -216,8 +217,6 @@ def test_filter_by_freq_valid(sample_df: DataFrame, f_min, f_max): ), ], ) - - def test_filter_by_freq_out_of_range(sample_df: DataFrame, f_min, f_max, expected_msg): with pytest.raises(ValueError, match=expected_msg): filter_by_freq(sample_df, f_min=f_min, f_max=f_max) @@ -331,7 +330,7 @@ def test_get_timezone_several(sample_df: DataFrame) -> None: } sample_df = concat( [sample_df, DataFrame([new_row])], - ignore_index=False + ignore_index=False, ) tz = get_timezone(sample_df) assert len(tz) == 2 @@ -340,6 +339,7 @@ def test_get_timezone_several(sample_df: DataFrame) -> None: # %% read DataFrame + def test_read_dataframe_comma_delimiter(tmp_path: Path) -> None: csv_file = tmp_path / "test.csv" csv_file.write_text( @@ -417,7 +417,7 @@ def test_no_timebin_several_tz(sample_df: DataFrame) -> None: } sample_df = concat( [sample_df, DataFrame([new_row])], - ignore_index=False + ignore_index=False, ) timestamp_wav = to_datetime(sample_df["filename"], format="%Y_%m_%d_%H_%M_%S").dt.tz_localize(pytz.UTC) @@ -429,7 +429,7 @@ def test_no_timebin_original_timebin(sample_df: DataFrame) -> None: tz = get_timezone(sample_df) timestamp_wav = to_datetime( sample_df["filename"], - format="%Y_%m_%d_%H_%M_%S" + format="%Y_%m_%d_%H_%M_%S", ).dt.tz_localize(tz) df_out = reshape_timebin( sample_df, @@ -520,7 +520,7 @@ def test_simple_reshape_hourly(sample_df: DataFrame) -> None: tz = get_timezone(sample_df) timestamp_wav = to_datetime( sample_df["filename"], - format="%Y_%m_%d_%H_%M_%S" + format="%Y_%m_%d_%H_%M_%S", ).dt.tz_localize(tz) df_out = reshape_timebin( sample_df, @@ -538,7 +538,7 @@ def test_reshape_daily_multiple_bins(sample_df: DataFrame) -> 
None: tz = get_timezone(sample_df) timestamp_wav = to_datetime( sample_df["filename"], - format="%Y_%m_%d_%H_%M_%S" + format="%Y_%m_%d_%H_%M_%S", ).dt.tz_localize(tz) df_out = reshape_timebin(sample_df, timestamp_audio=timestamp_wav, timebin_new=Timedelta(days=1)) assert not df_out.empty @@ -555,7 +555,7 @@ def test_with_manual_timestamps_vector(sample_df: DataFrame) -> None: df_out = reshape_timebin( sample_df, timestamp_audio=timestamp_wav, - timebin_new=Timedelta(hours=1) + timebin_new=Timedelta(hours=1), ) assert not df_out.empty @@ -589,6 +589,7 @@ def test_ensure_no_invalid_with_elements() -> None: assert "bar" in str(exc_info.value) assert "columns" in str(exc_info.value) + def test_ensure_no_invalid_single_element() -> None: invalid_items = ["baz"] with pytest.raises(ValueError) as exc_info: @@ -598,6 +599,7 @@ def test_ensure_no_invalid_single_element() -> None: # %% intersection / union + def test_intersection(sample_df) -> None: df_result = intersection_or_union(sample_df[sample_df["annotator"].isin(["ann1", "ann2"])], user_sel="intersection") @@ -628,7 +630,7 @@ def test_not_enough_annotators_raises() -> None: "annotation": ["cat"], "start_datetime": to_datetime(["2025-01-01 10:00"]), "end_datetime": to_datetime(["2025-01-01 10:01"]), - "annotator": ["A"] + "annotator": ["A"], }) with pytest.raises(ValueError, match="Not enough annotators detected"): - intersection_or_union(df_single_annotator, user_sel="intersection") \ No newline at end of file + intersection_or_union(df_single_annotator, user_sel="intersection") diff --git a/tests/test_fpod_utils.py b/tests/test_fpod_utils.py index a0ab178..d1c4ece 100644 --- a/tests/test_fpod_utils.py +++ b/tests/test_fpod_utils.py @@ -4,7 +4,7 @@ from pandas import DataFrame from post_processing.utils.fpod_utils import ( - csv_folder, + load_pod_folder, pod2aplose, ) @@ -142,7 +142,7 @@ def test_csv_folder_single_file(tmp_path) -> None: csv_file = tmp_path / "data.csv" csv_file.write_text("col1;col2\nval1;val2\nval3;val4", encoding="latin-1") - result = csv_folder(tmp_path) + result = load_pod_folder(tmp_path) assert isinstance(result, DataFrame) assert len(result) == 2 diff --git a/tests/test_glider_utils.py b/tests/test_glider_utils.py index 12d83df..d0247c5 100644 --- a/tests/test_glider_utils.py +++ b/tests/test_glider_utils.py @@ -56,7 +56,7 @@ def test_get_position_from_timestamp(nav_df: DataFrame) -> None: def test_plot_detections_with_nav_data( df_detections: DataFrame, - nav_df: DataFrame + nav_df: DataFrame, ) -> None: plot_detections_with_nav_data( df=df_detections, diff --git a/tests/test_metric_utils.py b/tests/test_metric_utils.py index 34ce769..35717e7 100644 --- a/tests/test_metric_utils.py +++ b/tests/test_metric_utils.py @@ -3,6 +3,7 @@ from post_processing.utils.metrics_utils import detection_perf + def test_detection_perf(sample_df: DataFrame) -> None: try: detection_perf(df=sample_df[sample_df["annotator"].isin(["ann1", "ann4"])], ref=("ann1", "lbl1")) @@ -12,4 +13,4 @@ def test_detection_perf(sample_df: DataFrame) -> None: def test_detection_perf_one_annotator(sample_df: DataFrame) -> None: with pytest.raises(ValueError, match="Two annotators needed"): - detection_perf(df=sample_df[sample_df["annotator"] == "ann1"], ref=("ann1", "lbl1")) \ No newline at end of file + detection_perf(df=sample_df[sample_df["annotator"] == "ann1"], ref=("ann1", "lbl1")) diff --git a/tests/test_plot_utils.py b/tests/test_plot_utils.py index d7392cf..4306c38 100644 --- a/tests/test_plot_utils.py +++ b/tests/test_plot_utils.py @@ -1,13 
+1,11 @@ import matplotlib.pyplot as plt import pytest -from matplotlib.ticker import PercentFormatter -from numpy import arange, testing from post_processing.utils.plot_utils import ( - overview, _wrap_xtick_labels, - set_y_axis_to_percentage, get_legend, + overview, + set_y_axis_to_percentage, ) @@ -57,16 +55,15 @@ def test_wrap_xtick_labels_no_spaces(): assert wrapped_labels[0] == expected -def test_y_axis_formatter_and_ticks(): +def test_set_y_axis_to_percentage(): fig, ax = plt.subplots() - - set_y_axis_to_percentage(ax) - - assert isinstance(ax.yaxis.get_major_formatter(), PercentFormatter) - assert ax.yaxis.get_major_formatter().xmax == 1.0 - - expected_ticks = arange(0, 1.02, 0.2) - testing.assert_allclose(ax.get_yticks(), expected_ticks) + ax.set_ylabel("Accuracy") + set_y_axis_to_percentage(ax, max_val=200) + formatter = ax.yaxis.get_major_formatter() + assert formatter(100, None) == "50%" + assert formatter(200, None) == "100%" + assert ax.get_ylabel() == "Accuracy (%)" + plt.close(fig) def test_single_annotator_multiple_labels(): @@ -103,4 +100,4 @@ def test_lists_and_strings_combined(): labels = ["Label1", "Label2"] result = get_legend(annotators, labels) expected = ["Alice\nLabel1", "Bob\nLabel2"] - assert result == expected \ No newline at end of file + assert result == expected diff --git a/tests/test_recording_period.py b/tests/test_recording_period.py new file mode 100644 index 0000000..064c6b5 --- /dev/null +++ b/tests/test_recording_period.py @@ -0,0 +1,80 @@ +from pandas import Timedelta, read_csv, to_datetime + +from post_processing.dataclass.detection_filter import DetectionFilter +from post_processing.dataclass.recording_period import RecordingPeriod + + +def test_recording_period_with_gaps(recording_planning_config: DetectionFilter) -> None: + """RecordingPeriod correctly represents long gaps with no recording effort. + + The planning contains two recording blocks separated by ~3 weeks with no + recording at all. Weekly aggregation must reflect: + - weeks with full effort, + - weeks with partial effort, + - weeks with zero effort. 
+ """ + histo_x_bin_size = Timedelta("7D") + recording_period = RecordingPeriod.from_path( + config=recording_planning_config, + bin_size=histo_x_bin_size, + ) + + counts = recording_period.counts + origin = recording_planning_config.timebin_origin + nb_timebin_origin_per_histo_x_bin_size = int(histo_x_bin_size / origin) + + # Computes effective recording intervals from recording planning csv + df_planning = read_csv( + recording_planning_config.timestamp_file, + parse_dates=[ + "start_recording", + "end_recording", + "start_deployment", + "end_deployment", + ], + ) + for col in [ + "start_recording", + "end_recording", + "start_deployment", + "end_deployment", + ]: + df_planning[col] = ( + to_datetime(df_planning[col], utc=True) + .dt.tz_convert(None) + ) + + df_planning["start"] = df_planning[ + ["start_recording", "start_deployment"] + ].max(axis=1) + df_planning["end"] = df_planning[ + ["end_recording", "end_deployment"] + ].min(axis=1) + + planning = df_planning.loc[df_planning["start"] < df_planning["end"]] + # ------------------------------------------------------------------ + # Structural checks + # ------------------------------------------------------------------ + assert not counts.empty + assert counts.index.is_interval() + assert counts.min() >= 0 + assert counts.max() <= nb_timebin_origin_per_histo_x_bin_size + + # ------------------------------------------------------------------ + # Find overlap (number of timebin_origin) within each effective recording period + # ------------------------------------------------------------------ + for interval in counts.index: + bin_start = interval.left + bin_end = interval.right + + # Compute overlap with all recording intervals + overlap_start = planning["start"].clip(lower=bin_start, upper=bin_end) + overlap_end = planning["end"].clip(lower=bin_start, upper=bin_end) + + overlap = (overlap_end - overlap_start).clip(lower=Timedelta(0)) + expected_minutes = int(overlap.sum() / recording_planning_config.timebin_origin) + + assert counts.loc[interval] == expected_minutes, ( + f"Mismatch for bin {interval}: " + f"expected {expected_minutes}, got {counts.loc[interval]}" + ) diff --git a/user_case/example_FPOD-CPOD_aplose.ipynb b/user_case/example_FPOD-CPOD_aplose.ipynb deleted file mode 100644 index b0da1c8..0000000 --- a/user_case/example_FPOD-CPOD_aplose.ipynb +++ /dev/null @@ -1,287 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "id": "initial_id", - "metadata": { - "collapsed": true - }, - "source": [ - "from pathlib import Path\n", - "\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from pandas import (\n", - " read_csv,\n", - ")\n", - "\n", - "from post_processing.dataclass.data_aplose import DataAplose\n", - "from post_processing.utils.fpod_utils import resample_dpm, build_range, process_tl, filter_tl, preserved_data" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### Load data\n", - "DPM = Detection Positive Minutes \\\n", - "FB = Feeding buzzes \\\n", - "🐬 = input to modify\n" - ], - "id": "a97e19830123b732" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "yaml_file = Path(r\"resource\\CPOD-FPOD_yaml.yml\") #Change the file path in the yaml sheet.🐬\n", - "data_list = DataAplose.from_yaml(file=yaml_file)\n", - "\n", - "d_beg_end = read_csv(r\"U:\\Deb_Fin_CA4.csv\") #Beginning and end of recording for every phase. 
🐬\n", - "\n", - "tl_path = Path(r\"U:\\TimeLost\\tl_ca4\\phases\")\n", - "tl_df = process_tl(tl_path)\n", - "tl_df = tl_df.drop_duplicates(subset=['ChunkEnd'], keep=\"first\")\n", - "tl_df[\"Angle\"] = (tl_df[\"Angle\"].replace(',', '.', regex=True)).astype(float)\n", - "tl_df[\"Temp\"] = (tl_df[\"Temp\"].replace(',', '.', regex=True)).astype(float)" - ], - "id": "7da2feb5958db1a9", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### Resample your data\n", - "Here you need to choose the format in which you want to visualise the positive detections. This aims to replace the functionalities available in Chelonia's softwares." - ], - "id": "3bc57f4f638ad6dc" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "frq = \"h\" #Determine the format in which you want to visualise your data. Use \"D\", \"h\" or \"10min\". 🐬\n", - "tl = 100 #%TimeLost threshold. If you do not want to set a filter, set tl to 100." - ], - "id": "9b0a078a262ac7f2", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": "resamp = resample_dpm(data_list.df, frq=frq, cols={\"DPM\":\"sum\",\"Foraging\":\"sum\",\"deploy.name\":\"first\"}) #Resample your DPMs according to the chosen frq.", - "id": "fa3847d80ccf49c3", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### Add the effort\n", - "To analyze the data, add zeros to view it based on effort." - ], - "id": "b92537991aa4ac4b" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "d_0 = build_range(d_beg_end, frq) #Create a dataframe from beginning to end of every phase filled with 0s.\n", - "d_tot = d_0.merge(resamp, on=[\"start_datetime\", \"deploy.name\"], how=\"left\")" - ], - "id": "4d76089ef06c6fdb", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### Add additional metrics columns", - "id": "e6a4623e4baf25b5" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "d_tot[[\"DPM\",\"Foraging\"]] = d_tot[[\"DPM\",\"Foraging\"]].fillna(0)\n", - "d_tot[\"FBR\"] = d_tot[\"Foraging\"] / d_tot[\"DPM\"] #The Feeding Buzz Ratio corresponds to the proportion of FB among the recorded clicks.\n", - "d_tot[\"FBR\"] = d_tot[\"FBR\"].fillna(0)\n", - "d_tot[f\"DP{frq}\"] = (d_tot[\"DPM\"] >0).astype(int) #1 if the hour counts at least 1 DPM, else 0." 
- ], - "id": "912268e5e997dbc6", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "d_tot[\"DPH_fb\"] = (d_tot[\"Foraging\"] >0).astype(int)\n", - "d_tot[\"FBR_h\"] = d_tot[\"DPH_fb\"] / d_tot[\"DPH\"]\n", - "d_tot[\"FBR_h\"] = d_tot[\"FBR_h\"].fillna(0)" - ], - "id": "23e3e4137d9e2a84", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### Add time columns", - "id": "a775158ba810957a" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "d_tot = d_tot.assign(**{attr: getattr(d_tot[\"start_datetime\"].dt, attr.lower())\n", - " for attr in ['Year', 'Month', 'Day', 'Hour']})\n", - "d_tot['YM'] = d_tot[\"start_datetime\"].dt.to_period('M').dt.to_timestamp()\n", - "d_tot['Date'] = d_tot[\"start_datetime\"].dt.to_period('D').dt.to_timestamp()" - ], - "id": "62ce5a31ed0db25a", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### Filter your data\n", - "Chose a threshold of %TimeLost to remove all data exceeding it." - ], - "id": "98d31fb21ffb9165" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "full_df = d_tot.merge(\n", - " tl_df[[\"start_datetime\", \"%TimeLost\", \"Angle\", \"Temp\"]],\n", - " on=\"start_datetime\", how=\"left\")" - ], - "id": "66bf795805047a3d", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "filtered_df = filter_tl(full_df, tl)\n", - "preserved_data(filtered_df, full_df)" - ], - "id": "c61f1c95fc05749e", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### Visualize environmental data", - "id": "e0ea7247e27a37b6" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "resamp_tot = filtered_df.set_index(\"start_datetime\").resample(frq).first().reset_index()\n", - "\n", - "fig, ax = plt.subplots(figsize=(12, 6))\n", - "ax.plot(resamp_tot[\"start_datetime\"], resamp_tot[\"Angle\"])\n", - "plt.title(\"Angle of the instrument over time\")\n", - "plt.show()" - ], - "id": "14ff46dec308dcc1", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "fig2, ax = plt.subplots(figsize=(12, 6))\n", - "ax.plot(resamp_tot[\"start_datetime\"], resamp_tot[\"Temp\"])\n", - "plt.title(\"Temperature over time\")\n", - "plt.show()" - ], - "id": "db07736375a767d9", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "fig3, ax = plt.subplots(figsize=(12, 6))\n", - "ax.plot(resamp_tot[\"start_datetime\"], resamp_tot[\"%TimeLost\"])\n", - "plt.title(\"%TimeLost over time\")\n", - "plt.show()" - ], - "id": "7409831bc24271e3", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### Extract your processed data\n", - "This dataframe is now compatible for analyses on the next notebook and on R." - ], - "id": "c64d09af5a11213d" - }, - { - "metadata": {}, - "cell_type": "code", - "source": "filtered_df.to_csv(r\"U:\\Hours_DPM_FBUZZ_CA4.csv\", index=False) #Name your file. 
🐬", - "id": "f1f9ec385038ba87", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### First visualization\n", - "Precise the coordinates of the location of your listening point" - ], - "id": "a1b31aa6bd8f4d70" - }, - { - "metadata": {}, - "cell_type": "code", - "outputs": [], - "execution_count": null, - "source": "", - "id": "3fda0cc8174fa757" - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/user_case/example_FPOD-CPOD_firstresults.ipynb b/user_case/example_FPOD-CPOD_firstresults.ipynb deleted file mode 100644 index 1c3366b..0000000 --- a/user_case/example_FPOD-CPOD_firstresults.ipynb +++ /dev/null @@ -1,251 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "id": "initial_id", - "metadata": { - "collapsed": true - }, - "source": [ - "from pandas import (\n", - " concat,\n", - " read_csv,\n", - " to_datetime,\n", - ")\n", - "\n", - "from post_processing.utils.fpod_utils import extract_site, percent_calc, year_percent, ym_percent, create_matrix,hist_mean_h, hist_mean_m, hist_mean_s\n", - "from post_processing.utils.core_utils import get_season" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "## Overview\n", - "Import the right dataset." - ], - "id": "36421fdbbca9aed6" - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### *CALAIS*", - "id": "caea0e065ad8068c" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "ca4 = read_csv(r\"L:\\acoustock\\Bioacoustique\\DATASETS\\CPOD_PROJETS\\CALAIS\\DATA\\DATA_FULL_OTHERCET\\Hours_DPM_FBUZZ_CA4_ssTL.csv\")\n", - "walde = read_csv(r\"L:\\acoustock\\Bioacoustique\\DATASETS\\CPOD_PROJETS\\CALAIS\\DATA\\DATA_FULL_OTHERCET\\Hours_DPM_FBUZZ_Walde_ssTL.csv\")\n", - "\n", - "data_c = concat([ca4, walde])\n", - "data_c[\"start_datetime\"] = to_datetime(data_c[\"start_datetime\"])\n", - "data_c[\"start_datetime\"] = data_c[\"start_datetime\"].apply(lambda x : x.tz_convert(\"Europe/Paris\"))\n", - "data_c[\"Hour\"] = data_c[\"start_datetime\"].dt.hour" - ], - "id": "1268d9e6ce5cdf32", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### *CETIROISE*", - "id": "b426e672fdd5c6b8" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "ceti = read_csv(r\"U:\\Hours_DPM_FBUZZ_CETIROISE.csv\")\n", - "\n", - "ceti[\"start_datetime\"] = to_datetime(ceti[\"start_datetime\"])\n", - "ceti[\"start_datetime\"] = ceti[\"start_datetime\"].apply(lambda x : x.tz_convert(\"CET\")) #TimeZone Central European Time\n", - "ceti[\"Hour\"] = ceti[\"start_datetime\"].dt.hour" - ], - "id": "870bc0a014561ba8", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### *KERGUELEN*", - "id": "17a5ce1338f6cd1a" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "a = read_csv(r\"U:\\Hours_DPM_FBUZZ_A.csv\")\n", - "b = read_csv(r\"U:\\Hours_DPM_FBUZZ_B.csv\")\n", - "c = read_csv(r\"U:\\Hours_DPM_FBUZZ_C.csv\")\n", - "d = read_csv(r\"U:\\Hours_DPM_FBUZZ_D.csv\")\n", - "\n", - "data_k = concat([a, b, c, 
d])\n", - "data_k[\"start_datetime\"] = to_datetime(data_k[\"start_datetime\"])\n", - "data_k[\"start_datetime\"] = data_k[\"start_datetime\"].apply(lambda x : x.tz_convert(\"Indian/Kerguelen\"))\n", - "data_k[\"Hour\"] = data_k[\"start_datetime\"].dt.hour" - ], - "id": "d65697a1f1487f4c", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### First results\n", - "Precise your dataset." - ], - "id": "9fc3b5075bf7ff2c" - }, - { - "metadata": {}, - "cell_type": "code", - "source": "data = data_c #🐬", - "id": "add4a626d6cc25a4", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "data = extract_site(data)\n", - "data[\"YMH\"] = data[\"Year\"].astype(str) + '-' + data[\"Month\"].astype(str) + '-' + data[\"Hour\"].astype(str)\n", - "y_per = percent_calc(data, \"Year\")\n", - "#t_per = percent_calc(data, \"TRAVAUX\")" - ], - "id": "37ecc80eda8e57ed", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "ym_per = percent_calc(data, \"YM\")\n", - "ym_per[\"YM\"] = to_datetime(ym_per[\"YM\"])\n", - "ym_per[\"Season\"] = ym_per[\"YM\"].apply(lambda x: get_season(x)[0]) #If in the southern hemisphere, write \"get_season(x, northern = False)\".\n", - "ym_per[\"Month\"] = ym_per[\"YM\"].dt.month\n", - "ym_per[\"Year\"] = ym_per[\"YM\"].dt.year" - ], - "id": "2b988869ed2466e1", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "per_h = percent_calc(data, \"YMH\")\n", - "per_h[[\"Y\",\"M\",\"Hour\"]] = per_h[\"YMH\"].str.split(\"-\", expand=True)\n", - "per_h[\"Hour\"] = per_h[\"Hour\"].astype(int)" - ], - "id": "cf704032c4a59a7b", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "matrice_s = create_matrix(ym_per, [\"site.name\"],[\"%DPH\", \"FBR\"])\n", - "matrice_m = create_matrix(ym_per, [\"site.name\", \"Month\"],[\"%click\", \"FBR\"])\n", - "matrice_h = create_matrix(per_h, [\"site.name\", \"Hour\"],[\"%click\", \"FBR\"])" - ], - "id": "caf3f71c6b6f70ca", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "hist_mean_s(\n", - " matrice_s,\n", - " metric_mean=\"%DPH_mean\",\n", - " metric_std=\"%DPH_std\",\n", - " y_lab=\"Moyenne %DPH\",\n", - " title_suffix=\"%DPH\"\n", - ")" - ], - "id": "2ff751ae02e80285", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": "year_percent(y_per, \"FBR\")", - "id": "29f2703ab28c5b28", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": "ym_percent(ym_per, \"DPM\")", - "id": "a2dacac3caecff5f", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "hist_mean_m(\n", - " matrice_m,\n", - " metric_mean=\"%click_mean\",\n", - " metric_std=\"%click_std\",\n", - " y_lab=\"Moyenne %click\",\n", - " title_suffix=\"%click\"\n", - ")" - ], - "id": "71161e7545bb1414", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "hist_mean_h(\n", - " matrice_h,\n", - " metric_mean=\"FBR_mean\",\n", - " metric_std=\"FBR_std\",\n", - " y_lab=\"Feeding buzz ratio\",\n", - " title_suffix=\"FBR\"\n", - ")" - ], - "id": "5cbea8601bce2172", - "outputs": [], - "execution_count": null - } - ], - "metadata": { - "kernelspec": { - 
"display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/user_case/example_FPOD-CPOD_raw.ipynb b/user_case/example_FPOD-CPOD_raw.ipynb deleted file mode 100644 index 60e7f67..0000000 --- a/user_case/example_FPOD-CPOD_raw.ipynb +++ /dev/null @@ -1,420 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "id": "initial_id", - "metadata": { - "collapsed": true, - "ExecuteTime": { - "end_time": "2026-01-20T10:24:33.069494Z", - "start_time": "2026-01-20T10:24:29.180185Z" - } - }, - "source": [ - "from __future__ import annotations\n", - "\n", - "import pytz\n", - "\n", - "from post_processing.utils.fpod_utils import csv_folder, pod2aplose, actual_data, meta_cut_aplose, feeding_buzz, txt_folder, add_utc\n", - "from post_processing.utils.core_utils import json2df\n", - "\n", - "import logging\n", - "from pathlib import Path\n", - "import matplotlib.pyplot as plt\n", - "import matplotlib.ticker as ticker\n", - "from pandas import Timestamp, to_datetime\n", - "from pandas.tseries import frequencies\n", - "from post_processing.dataclass.data_aplose import DataAplose\n", - "from post_processing.utils.core_utils import get_season, get_count\n", - "from osekit import setup_logging\n", - "from user_case.config import season_color\n", - "setup_logging(Path(r\"C:\\Users\\dupontma2\\Documents\\Git\\OSmOSE\\OSmOSE_post_processing\\src\\post_processing\\logging_config.yaml\"), logging.ERROR)" - ], - "outputs": [], - "execution_count": 1 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### Load data\n", - "🐬 = input to modify \\\n", - "Import your raw FPOD or CPOD data. All files for one site must be stored in the same folder and identified by their respective phases. \\\n", - "You also need to import your metadata file." - ], - "id": "c464f241817a1407" - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-20T10:32:32.641354Z", - "start_time": "2026-01-20T10:32:09.650669Z" - } - }, - "cell_type": "code", - "source": [ - "pod_files = Path(r\"U:\\CA4\") #Path to your data folder. 🐬\n", - "path = csv_folder(pod_files) #Process all your POD.csv files.\n", - "\n", - "fb_files = Path(r\"U:\\fb_CA4\") #Path to your click details folder. 🐬\n", - "json = Path(r\"C:\\Users\\fouinel\\Downloads\\deployment_calais.json\") #Path to your metadata file. 
🐬\n", - "\n", - "print(path.head())\n", - "df_0 = path.dropna()\n", - "\n", - "metadatax = json2df(json_path=json)\n", - "metadatax[\"deploy.name\"] = (metadatax[\"site.name\"].astype(str) + \"_\" + metadatax[\"campaign.name\"].astype(str))" - ], - "id": "6cf23db3b4288c29", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " File ChunkEnd DPM Nall MinsOn deploy.name\n", - "0 CA4 POD2397 file01.CP3 14/05/2014 07:07 0 0 1 CA4_Phase1\n", - "1 CA4 POD2397 file01.CP3 14/05/2014 07:08 0 8 1 CA4_Phase1\n", - "2 CA4 POD2397 file01.CP3 14/05/2014 07:09 0 4 1 CA4_Phase1\n", - "3 CA4 POD2397 file01.CP3 14/05/2014 07:10 0 251 1 CA4_Phase1\n", - "4 CA4 POD2397 file01.CP3 14/05/2014 07:11 0 4095 1 CA4_Phase1\n" - ] - } - ], - "execution_count": 2 - }, - { - "metadata": {}, - "cell_type": "code", - "source": "d_beg_end = actual_data(df_0, metadatax) #Extract the beginning and end of recording for every phase.", - "id": "fa52f8971b61aaf6", - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-20T10:32:43.618484Z", - "start_time": "2026-01-20T10:32:43.243995Z" - } - }, - "cell_type": "code", - "source": "df_1 = df_0[df_0[\"DPM\"] !=\"0\" ] #Remove the 0 to lighten the APLOSE file.", - "id": "769e128f2a5293e1", - "outputs": [], - "execution_count": 3 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### APLOSE format\n", - "#### *C-POD*\n", - "Use cpod2aplose if you are managing C-POD data." - ], - "id": "dd03975b7aef7eed" - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-20T10:32:50.772619Z", - "start_time": "2026-01-20T10:32:50.060639Z" - } - }, - "cell_type": "code", - "source": [ - "df_aplose = pod2aplose(df_1, pytz.utc, \"CA4\", \"Marsouin\", \"CPOD\") #Precise site name, species and instrument. 🐬\n", - "print(df_aplose.head())" - ], - "id": "4cc867627d677529", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " dataset filename start_time end_time \\\n", - "0 CA4 2014-05-16T02:30:00.000+0000 0 60 \n", - "1 CA4 2014-05-17T03:52:00.000+0000 0 60 \n", - "2 CA4 2014-05-17T04:47:00.000+0000 0 60 \n", - "3 CA4 2014-05-19T17:06:00.000+0000 0 60 \n", - "4 CA4 2014-05-20T11:07:00.000+0000 0 60 \n", - "\n", - " start_frequency end_frequency annotation annotator \\\n", - "0 0 0 Marsouin CPOD \n", - "1 0 0 Marsouin CPOD \n", - "2 0 0 Marsouin CPOD \n", - "3 0 0 Marsouin CPOD \n", - "4 0 0 Marsouin CPOD \n", - "\n", - " start_datetime end_datetime is_box \\\n", - "0 2014-05-16T02:30:00.000+0000 2014-05-16T02:31:00.000+0000 0 \n", - "1 2014-05-17T03:52:00.000+0000 2014-05-17T03:53:00.000+0000 0 \n", - "2 2014-05-17T04:47:00.000+0000 2014-05-17T04:48:00.000+0000 0 \n", - "3 2014-05-19T17:06:00.000+0000 2014-05-19T17:07:00.000+0000 0 \n", - "4 2014-05-20T11:07:00.000+0000 2014-05-20T11:08:00.000+0000 0 \n", - "\n", - " deploy.name \n", - "0 CA4_Phase1 \n", - "1 CA4_Phase1 \n", - "2 CA4_Phase1 \n", - "3 CA4_Phase1 \n", - "4 CA4_Phase1 \n" - ] - } - ], - "execution_count": 4 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### Clean your dataset\n", - "Remove useless lines, recorder outside the instrument submersion. Export your file to the aplose format. You can change the name of the file to match the project you are working on." 
- ], - "id": "d2c642658dbfe278" - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-20T10:33:14.509811Z", - "start_time": "2026-01-20T10:33:14.070492Z" - } - }, - "cell_type": "code", - "source": "cleared = meta_cut_aplose(df_aplose, metadatax)", - "id": "895bd5a116918285", - "outputs": [], - "execution_count": 5 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-20T10:33:24.735234Z", - "start_time": "2026-01-20T10:33:24.723966Z" - } - }, - "cell_type": "code", - "source": "cleared = cleared.drop_duplicates(subset=['start_datetime'], keep=\"last\")", - "id": "1a31da7341f3d8c9", - "outputs": [], - "execution_count": 7 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### Feeding buzzes processing\n", - "Use \"Dauphin\", Marsouin\" or \"Commerson\" to get different ICI processing." - ], - "id": "4cf0b89a9491884" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "fb_all = txt_folder(fb_files) #Read all your FB.txt files.\n", - "fb_all = feeding_buzz(fb_all, \"Marsouin\") #Categorize the minutes (positive or not to FB detection). 🐬\n", - "add_utc(fb_all, [\"start_datetime\"], \"min\")" - ], - "id": "64f824e02131d90a", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": "dpm_fb = cleared.merge(fb_all[[\"start_datetime\", \"Foraging\"]], on=[\"start_datetime\"], how=\"left\") #Merge DPM and FB dataframes", - "id": "e90f6d91de3f8ce3", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### Export", - "id": "d114ed7164cfd0da" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "d_beg_end.to_csv(r\"U:\\Deb_Fin_CA4.csv\", index=False) #Export the new file. 🐬\n", - "dpm_fb.to_csv(r\"U:\\APLOSE_CA4_pos.csv\", index=False) #Name your file. 🐬" - ], - "id": "9d34e00f4e8147e8", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### Explore\n", - "First visualization of the data" - ], - "id": "a85ea092d9fc197c" - }, - { - "metadata": {}, - "cell_type": "code", - "source": "data = DataAplose(dpm_fb)", - "id": "639c474690373895", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "bin_size = frequencies.to_offset(\"1d\")\n", - "ticks = frequencies.to_offset(\"6BMS\")\n", - "fmt = \"%b %y\"" - ], - "id": "cb476b5655bdff42", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "data.lat = 50.973333 #CA4: 51.00035 ; Walde: 50.973333 ; A: -49.38765 ; B: -49.424733 ; C: -49.4677 ; D: -49.47175\n", - "data.lon = 1.8117 #CA4: 1.879667 ; Walde: 1.8117 ; A: 69.9449 ; B: 69.932383 ; C: 70.081067 ; D: 69.836617" - ], - "id": "3fc33f2acf84ea34", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "#### Reshape the data\n", - "Set beginning and end of the chosen window." 
- ], - "id": "2857f26f8c168ad3" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "data.df[\"end_datetime\"] = to_datetime(data.df[\"end_datetime\"])\n", - "data2 = data.reshape(begin=Timestamp(\"2013 11 01\"), end=Timestamp(\"2025 08 01\"))\n", - "tz = pytz.timezone(\"UTC\")\n", - "data2.change_tz(tz)" - ], - "id": "75e00c1920b69409", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### Heatmap", - "id": "cdf3a92dfb6514d6" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "fig, ax = plt.subplots(1, 1)\n", - "ax = data2.set_ax(ax=ax, x_ticks_res=ticks, date_format=fmt)\n", - "data2.plot(\n", - " mode=\"heatmap\",\n", - " annotator=data2.annotators[0],\n", - " label=data2.labels[0],\n", - " ax=ax,\n", - " bin_size=bin_size,\n", - " show_rise_set=True,\n", - ")\n", - "plt.tight_layout()\n", - "plt.show()" - ], - "id": "6e1832101aec4156", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### Detections over time", - "id": "b0f3c6b7fc08a2be" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "fig2, ax = plt.subplots(1, 1)\n", - "ax = data2.set_ax(ax=ax, x_ticks_res=ticks, date_format=fmt)\n", - "data2.plot(\n", - " mode=\"scatter\",\n", - " annotator=data2.annotators[0],\n", - " label=data2.labels[0],\n", - " ax=ax,\n", - " show_rise_set=True,\n", - ")\n", - "plt.tight_layout()\n", - "plt.show()" - ], - "id": "66b43e53fb17037", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### DPM per day", - "id": "c7ea248b277edb65" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "df_filtered = data2.filter_df(data2.annotators[0], data2.labels[0])\n", - "df_counts = get_count(df_filtered, bin_size)\n", - "\n", - "df_counts[\"Season\"] = df_counts.index.to_series().apply(lambda x: get_season(x)[0])\n", - "df_counts[\"colors\"] = df_counts[\"Season\"].map(season_color).fillna(\"gray\")\n", - "\n", - "fig3, ax = plt.subplots(1, 1)\n", - "ax = data2.set_ax(ax=ax, x_ticks_res=ticks, date_format=fmt)\n", - "data2.plot(\n", - " mode=\"histogram\",\n", - " annotator=data2.annotators[0],\n", - " label=data2.labels[0],\n", - " color=df_counts[\"colors\"].tolist(),\n", - " ax=ax,\n", - " bin_size=bin_size,\n", - " legend=True,\n", - ")\n", - "ax.set_ylim(0, 200)\n", - "ax.yaxis.set_major_locator(ticker.MaxNLocator(nbins=10))\n", - "plt.tight_layout()\n", - "plt.show()" - ], - "id": "81b0dafa8adc20e9", - "outputs": [], - "execution_count": null - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/user_case/resource/CPOD-FPOD_yaml.yml b/user_case/resource/CPOD-FPOD_yaml.yml deleted file mode 100644 index 3e259cc..0000000 --- a/user_case/resource/CPOD-FPOD_yaml.yml +++ /dev/null @@ -1,31 +0,0 @@ -#This file is to be used to load an APLOSE result csv file. -#If a parameter is set here to `null`, it will be loaded with a `None` value in the Python scripts. -#For parameter definition, see `sort_detections` function in `utils\def_func`. 
-# -#Note: -# - Several csv files can be loaded at once, to perform this copy the template and paste it at the end of the present file. - -######################################### -'U:\APLOSE_A1112.csv': - # timebin: detection time resolution in seconds - 'timebin_new': null - # begin datetime: '2000-01-01T00:00:00+0000' - 'begin': null - # end datetime: '2000-01-01T00:00:00+0000' - 'end': null - # annotator: ['annotator1', 'annotator2'] - 'annotator': null - # annotation: ['annotation1', 'annotation2'] - 'annotation': "Commerson" - # box: boolean value, whether to keep strong annotations - 'box': False - # timestamp file - 'timestamp_file': null - # user selection - 'user_sel': all - # f_min filter - 'f_min': null - # f_max filter - 'f_max': null - # score - 'score': null diff --git a/user_case/user_case_CALAIS.ipynb b/user_case/user_case_CALAIS.ipynb deleted file mode 100644 index a5e49a3..0000000 --- a/user_case/user_case_CALAIS.ipynb +++ /dev/null @@ -1,580 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "id": "initial_id", - "metadata": { - "collapsed": true, - "ExecuteTime": { - "end_time": "2025-10-23T15:42:49.393544Z", - "start_time": "2025-10-23T15:42:47.415071Z" - } - }, - "source": [ - "from pathlib import Path\n", - "\n", - "import pytz\n", - "from pandas import (\n", - " concat,\n", - " read_csv,\n", - " to_datetime,\n", - ")\n", - "\n", - "from post_processing.dataclass.data_aplose import DataAplose\n", - "from post_processing.utils.fpod_utils import csv_folder, pod2aplose, actual_data, meta_cut_aplose, resample_dpm, txt_folder, build_range, extract_site, percent_calc, site_percent, year_percent, ym_percent, month_percent, hour_percent, \\\n", - " feeding_buzz\n", - "from post_processing.utils.core_utils import json2df,get_season" - ], - "outputs": [], - "execution_count": 1 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "### Pre-processing\n", - "\n" - ], - "id": "e8e8c57c7f4197fe" - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "Import your csv files. 
All files for one site must be stored in the same folder.", - "id": "6f9beab2dcba1a9c" - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-10-17T09:25:01.549663Z", - "start_time": "2025-10-17T09:24:48.208563Z" - } - }, - "cell_type": "code", - "source": [ - "pod_files = Path(r\"U:\\Walde\")\n", - "path = csv_folder(pod_files)\n", - "print(path.head())\n", - "\n", - "df_0 = path.dropna()\n", - "\n", - "json = Path(r\"C:\\Users\\fouinel\\Downloads\\deployment_calais.json\") #Path to your metadata file.\n", - "metadatax = json2df(json_path=json)\n", - "\n", - "metadatax[\"deploy.name\"] = (metadatax[\"site.name\"].astype(str) + \"_\" +\n", - " metadatax[\"campaign.name\"].astype(str))" - ], - "id": "8636a8a27fe2af47", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " File ChunkEnd DPM Nall MinsOn deploy.name\n", - "0 POD2399 file01.CP3 14/05/2014 08:03 0.0 0.0 1.0 Walde_Phase1\n", - "1 POD2399 file01.CP3 14/05/2014 08:04 0.0 799.0 1.0 Walde_Phase1\n", - "2 POD2399 file01.CP3 14/05/2014 08:05 0.0 0.0 1.0 Walde_Phase1\n", - "3 POD2399 file01.CP3 14/05/2014 08:06 0.0 3361.0 1.0 Walde_Phase1\n", - "4 POD2399 file01.CP3 14/05/2014 08:07 0.0 421.0 1.0 Walde_Phase1\n" - ] - } - ], - "execution_count": 11 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-10-17T09:34:43.092415Z", - "start_time": "2025-10-17T09:27:26.409365Z" - } - }, - "cell_type": "code", - "source": "d_beg_end = actual_data(df_0, metadatax)", - "id": "4208969d9e509a8", - "outputs": [], - "execution_count": 12 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-10-17T09:35:32.137350Z", - "start_time": "2025-10-17T09:35:32.089860Z" - } - }, - "cell_type": "code", - "source": "d_beg_end.to_csv(r\"U:\\Deb_Fin_Walde.csv\", index=False)", - "id": "6fb6f4fa675d7cab", - "outputs": [], - "execution_count": 13 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-10-17T09:35:33.725437Z", - "start_time": "2025-10-17T09:35:33.670018Z" - } - }, - "cell_type": "code", - "source": "df_1 = df_0[df_0[\"DPM\"] !=0 ]", - "id": "597efd1d90e3d069", - "outputs": [], - "execution_count": 14 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### APLOSE format", - "id": "4f8c83c96f0b6ff4" - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "Chose the right function, depending on the instrument you are working with.", - "id": "9849c47189cf1a85" - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### *CPOD*", - "id": "8ed339c688bdef1" - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-10-17T09:35:38.378283Z", - "start_time": "2025-10-17T09:35:35.755020Z" - } - }, - "cell_type": "code", - "source": [ - "df_aplose = pod2aplose(df_1, pytz.utc, \"Walde\", \"Marsouin\", \"CPOD\")\n", - "print(df_aplose.head())" - ], - "id": "812ed7c0c5e258e7", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " dataset filename start_time end_time start_frequency end_frequency \\\n", - "0 Walde 0 60 0 0 \n", - "1 Walde 0 60 0 0 \n", - "2 Walde 0 60 0 0 \n", - "3 Walde 0 60 0 0 \n", - "4 Walde 0 60 0 0 \n", - "\n", - " annotation annotator start_datetime \\\n", - "0 Marsouin FPOD 2014-05-15T16:56:00.000+0000 \n", - "1 Marsouin FPOD 2014-05-17T22:53:00.000+0000 \n", - "2 Marsouin FPOD 2014-05-17T22:54:00.000+0000 \n", - "3 Marsouin FPOD 2014-05-18T14:05:00.000+0000 \n", - "4 Marsouin FPOD 2014-05-21T14:19:00.000+0000 \n", - "\n", - " end_datetime is_box deploy.name \n", - "0 2014-05-15T16:57:00.000+0000 0 
Walde_Phase1 \n", - "1 2014-05-17T22:54:00.000+0000 0 Walde_Phase1 \n", - "2 2014-05-17T22:55:00.000+0000 0 Walde_Phase1 \n", - "3 2014-05-18T14:06:00.000+0000 0 Walde_Phase1 \n", - "4 2014-05-21T14:20:00.000+0000 0 Walde_Phase1 \n" - ] - } - ], - "execution_count": 15 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### *FPOD*", - "id": "a39bb10d8ac60a27" - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-10-17T09:23:23.552890Z", - "start_time": "2025-10-17T09:23:22.810583Z" - } - }, - "cell_type": "code", - "source": [ - "df_aplose = pod2aplose(df_1, pytz.utc, \"CETIROISE\", \"Marsouin\", \"FPOD\")\n", - "print(df_aplose.head())" - ], - "id": "9b632673397a184", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " dataset filename start_time end_time start_frequency end_frequency \\\n", - "0 CA4 0 60 0 0 \n", - "1 CA4 0 60 0 0 \n", - "2 CA4 0 60 0 0 \n", - "3 CA4 0 60 0 0 \n", - "4 CA4 0 60 0 0 \n", - "\n", - " annotation annotator start_datetime \\\n", - "0 Marsouin FPOD 2014-05-17T03:52:00.000+0000 \n", - "1 Marsouin FPOD 2014-05-17T04:47:00.000+0000 \n", - "2 Marsouin FPOD 2014-05-19T17:06:00.000+0000 \n", - "3 Marsouin FPOD 2014-05-20T11:07:00.000+0000 \n", - "4 Marsouin FPOD 2014-05-20T11:16:00.000+0000 \n", - "\n", - " end_datetime is_box deploy.name \n", - "0 2014-05-17T03:53:00.000+0000 0 CA4_Phase1 \n", - "1 2014-05-17T04:48:00.000+0000 0 CA4_Phase1 \n", - "2 2014-05-19T17:07:00.000+0000 0 CA4_Phase1 \n", - "3 2014-05-20T11:08:00.000+0000 0 CA4_Phase1 \n", - "4 2014-05-20T11:17:00.000+0000 0 CA4_Phase1 \n" - ] - } - ], - "execution_count": 7 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### Remove non usable lines", - "id": "7860838f8514da39" - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "Import the .json file available on metadatax.", - "id": "32f8ff8f9ece35a8" - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-10-17T09:35:54.374270Z", - "start_time": "2025-10-17T09:35:54.303578Z" - } - }, - "cell_type": "code", - "source": [ - "\n", - "\n", - "cleared = meta_cut_aplose(df_aplose, metadatax) #Remove lines captures outside the instrument submersion." - ], - "id": "ed6a06c522aea169", - "outputs": [], - "execution_count": 16 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "Export your file to the aplose format. 
You can change the name of the file to match the project you are working on.", - "id": "8f5fe75cc3463971" - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-10-17T09:36:06.597522Z", - "start_time": "2025-10-17T09:35:56.739495Z" - } - }, - "cell_type": "code", - "source": "cleared.to_csv(r\"U:\\APLOSE_Walde_pos.csv\", index=False) #You can stock all DPM for a site in a DataAplose file.", - "id": "76f70cb6c6658ba6", - "outputs": [], - "execution_count": 17 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### Load data", - "id": "f5d38266dc9d5273" - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "Use the yaml file to import your aplose files one at a time.", - "id": "2ce11c6e57f38690" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "yaml_file = Path(r\"resource\\CPOD-FPOD_yaml.yml\")\n", - "data_list = DataAplose.from_yaml(file=yaml_file)\n", - "print(data_list.df.head())" - ], - "id": "6837593897111b0a", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### Format choice\n", - "id": "9f93eb863e3e3a9e" - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "Here you need to choose the format in which you want to visualise the positive detections. This aims to replace the functionality available in CPOD.exe.", - "id": "925d92d3eec065e6" - }, - { - "metadata": {}, - "cell_type": "code", - "outputs": [], - "execution_count": null, - "source": "frq = \"D\" #This argument will determine the format in which you want to visualise your data. Use \"D\", \"h\" or \"10min\".", - "id": "256b756d05c08294" - }, - { - "metadata": {}, - "cell_type": "code", - "source": "resamp = resample_dpm(data_list.df, frq=frq, extra_columns=[\"deploy.name\"])", - "id": "6cc79b2aeef076ed", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### Add the feeding buzzes", - "id": "8375ddbe07ad0aee" - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "Import your click details files. All files for one site must be stacked in the same folder.", - "id": "9753f4ba20c7267e" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "fb_files = Path(r\"U:\\fb_fpod_cetiroise_c\") #Path to your click details folder.\n", - "fb_all = txt_folder(fb_files)\n", - "fb_all = feeding_buzz(fb_all, \"Marsouin\")\n", - "\n", - "fb_all[\"start_datetime\"] = fb_all[\"start_datetime\"].dt.floor(frq)\n", - "fb = fb_all.groupby(\"start_datetime\")[\"Foraging\"].sum().reset_index()\n", - "fb[\"start_datetime\"] = to_datetime(fb[\"start_datetime\"], utc=True)" - ], - "id": "2b19f90c99252ff3", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "d_tot = resamp.merge(fb, on=\"start_datetime\", how=\"left\")\n", - "#This function aims to reindent 0 between the positive detections. 
It will be useful to produce first visualization graphs and use this dataset in R.\n", - "d_hour = build_range(d_beg_end, frq)" - ], - "id": "b00c8f1e2210ea7", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": "d_fin = d_hour.merge(d_tot, on=[\"start_datetime\", \"deploy.name\"], how=\"left\")", - "id": "601787cc806226b0", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "d_fin[[\"DPM\",\"Foraging\"]] = d_fin[[\"DPM\",\"Foraging\"]].fillna(0)\n", - "print(d_fin.head())" - ], - "id": "f93bf1f3ca4f4112", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "## Add time columns", - "id": "c7b1d32ed1c99fb7" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "d_fin[\"Year\"] = d_fin[\"start_datetime\"].dt.year\n", - "d_fin[\"Month\"] = d_fin[\"start_datetime\"].dt.month\n", - "d_fin['YM'] = d_fin[\"Year\"].astype(str) + '-' + d_fin[\"Month\"].astype(str)\n", - "d_fin['YM'] = to_datetime(d_fin['YM'])\n", - "d_fin[\"Day\"] = d_fin[\"start_datetime\"].dt.day\n", - "d_fin[\"Hour\"] = d_fin[\"start_datetime\"].dt.hour\n", - "\n", - "d_fin[\"FBR\"] = d_fin[\"Foraging\"] / d_fin[\"DPM\"]\n", - "d_fin[\"DPH\"] = (d_fin[\"DPM\"] >0).astype(int)" - ], - "id": "a2261ce5093a3104", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "d_fin[\"FBR\"] = d_fin[\"FBR\"].fillna(0)\n", - "d_fin.to_csv(r\"U:\\Hours_DPM_FBUZZ_CETIROISE.csv\", index=False)" - ], - "id": "d606f4f6904b57c6", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "## Overview", - "id": "4bc0904182a3f845" - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### *Import datasets*", - "id": "e1de414e2eb3fa8f" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "ca4 = read_csv(r\"U:\\Hours_DPM_FBUZZ_SiteCA4.csv\")\n", - "walde = read_csv(r\"U:\\Hours_DPM_FBUZZ_SiteWalde.csv\")\n", - "\n", - "data_c = concat([ca4, walde])\n", - "data_c[\"start_datetime\"] = to_datetime(data_c[\"start_datetime\"])\n", - "data_c[\"start_datetime\"] = data_c[\"start_datetime\"].apply(lambda x : x.tz_convert(\"Europe/Paris\"))\n", - "data_c[\"Hour\"] = data_c[\"start_datetime\"].dt.hour" - ], - "id": "9909fbfdcb8e2e78", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "a = read_csv(r\"U:\\Hours_DPM_FBUZZ_SiteA.csv\")\n", - "b = read_csv(r\"U:\\Hours_DPM_FBUZZ_SiteB.csv\")\n", - "c = read_csv(r\"U:\\Hours_DPM_FBUZZ_SiteC.csv\")\n", - "d = read_csv(r\"U:\\Hours_DPM_FBUZZ_SiteD.csv\")\n", - "\n", - "data_k = concat([a, b, c, d])\n", - "data_k[\"start_datetime\"] = to_datetime(data_k[\"start_datetime\"])\n", - "data_k[\"start_datetime\"] = data_k[\"start_datetime\"].apply(lambda x : x.tz_convert(\"Indian/Kerguelen\"))\n", - "data_k[\"Hour\"] = data_k[\"start_datetime\"].dt.hour" - ], - "id": "87e2d1938787aefc", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "ceti = read_csv(r\"U:\\Hours_DPM_FBUZZ_CETIROISE.csv\")\n", - "\n", - "ceti[\"start_datetime\"] = to_datetime(ceti[\"start_datetime\"])\n", - "ceti[\"start_datetime\"] = ceti[\"start_datetime\"].apply(lambda x : x.tz_convert(\"CET\")) #TimeZone Central European Time\n", - "ceti[\"Hour\"] = ceti[\"start_datetime\"].dt.hour" - ], - "id": "5928770d1c47bcad", - "outputs": [], 
- "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "#### *Generate graphs*", - "id": "200273fc36fb7d5d" - }, - { - "metadata": {}, - "cell_type": "code", - "source": "data = ceti #Precise which dataset you are working with", - "id": "be10e9d690294cff", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "data = extract_site(data)\n", - "y_per = percent_calc(data, \"Year\")\n", - "ym_per = percent_calc(data, \"YM\")\n", - "ym_per[\"YM\"] = to_datetime(ym_per[\"YM\"])\n", - "ym_per[\"Season\"] = ym_per[\"YM\"].apply(lambda x: get_season(x)[0])\n", - "m_per = percent_calc(data, \"Month\")\n", - "h_per = percent_calc(data, \"Hour\")\n", - "s_per = percent_calc(data)" - ], - "id": "2826b79097a85607", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": "site_percent(s_per, \"%click\")", - "id": "ddd1fac6295136c6", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": "year_percent(y_per, \"%click\")", - "id": "ba7581e97fdbd07c", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": "ym_percent(ym_per, \"%click\")", - "id": "4de618933c154f86", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": "month_percent(m_per, \"%buzzes\")", - "id": "7cf84c8744185424", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": "hour_percent(h_per, \"%click\")", - "id": "12d83e9082d711c0", - "outputs": [], - "execution_count": null - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}