OSmOSE_post_processing/src/post_processing/utils/pamguard_utils.py at 58a32a42c964b085727789fe4ec51ffc838f1c41 · mathieudpnt/OSmOSE_post_processing · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from pathlib import Path

from osekit.core_api.audio_data import AudioData
from osekit.utils.timestamp_utils import strftime_osmose_format
from pandas import DataFrame, Timedelta, Timestamp
from pypamguard import load_pamguard_binary_folder
from pypamguard.core.filters import DateFilter, Filters
from pypamguard.logger import Verbosity, logger
from tqdm import tqdm

# Module-level side effect: set the verbosity of pypamguard's logger to ERROR
# as soon as this module is imported.
# NOTE(review): presumably this hides lower-severity messages emitted while
# binary files are loaded — confirm against pypamguard's logger documentation.
logger.set_verbosity(Verbosity.ERROR)


def process_binary(audio: AudioData,
                   binary: Path,
                   dataset: str,
                   annotation: str,
                   ) -> DataFrame:
    r"""Process PAMGuard binary files into APLOSE DataFrame.

    The binary folder is searched recursively for ``*.pgdf`` files, keeping only
    the detections dated between ``audio.begin`` and ``audio.end``. Each
    detection is then attached to the first audio file whose
    ``[begin, end]`` interval contains the detection start time.

    Parameters
    ----------
    audio : AudioData
        Osekit AudioData object built from audio files
    binary : Path
        Path to the PAMGuard binary files
    dataset : str
        Dataset name
    annotation : str
        Annotation label

    Returns
    -------
    DataFrame
        APLOSE-formatted DataFrame containing PAMGuard detections, one row per
        detection, sorted by the formatted ``start_datetime`` string.
        ``start_time`` and ``end_time`` are offsets in seconds from the
        beginning of the matching audio file (``end_time`` is rounded to the
        millisecond), ``annotator`` is the class name of the detection object
        and ``is_box`` is always ``True``.

    Warns
    -----
    UserWarning
        If one or more detections start at a time that is not covered by any
        file of ``audio``. Such detections are skipped because neither their
        duration (which needs the file sample rate) nor their filename can be
        determined.

    Examples
    --------
    >>> from pathlib import Path
    >>> from pandas import Timestamp
    >>> from osekit.core_api.audio_file import AudioFile
    >>> from osekit.core_api.audio_data import AudioData

    >>> audio_path = Path(r"path/to/audio/folder")
    >>> binary_path = Path(r"path/to/binary/folder")

    >>> dataset = "dataset_name"
    >>> annotation = "label_name"
    >>> datetime_format = "%Y-%m-%dT%H:%M:%S"

    >>> begin = Timestamp("2025-05-29T00:00:00+0000")
    >>> end = Timestamp("2025-05-30T00:00:00+0000")

    >>> audio_files = [
    ...     AudioFile(path=f,
    ...               strptime_format=datetime_format,
    ...               timezone=begin.tz)
    ...     for f in audio_path.rglob("*/*.wav")
    ... ]

    >>> ad = AudioData.from_files(files=audio_files, begin=begin, end=end)

    >>> df = process_binary(ad, binary_path, dataset, annotation)

    """
    # Local import: only needed to report skipped detections.
    import warnings

    # Restrict the loaded detections to the time span covered by the audio data.
    filter_obj = Filters(
        {
        "daterange": DateFilter(start_date=audio.begin, end_date=audio.end, ordered=True),
        },
    )

    data, _, _ = load_pamguard_binary_folder(binary, r"**/*.pgdf", filters=filter_obj)

    # Read the file collection once instead of on every detection.
    audio_files = list(audio.files)

    (
        start_datetimes,
        start_times,
        end_datetimes,
        end_times,
        freq_min,
        freq_max,
        filenames,
        annotator,
    ) = ([], [], [], [], [], [], [], [])

    unmatched = 0

    for d in tqdm(data, desc="Creating DataFrame"):
        begin = Timestamp(d.date)

        # First audio file whose time span contains the detection start.
        matching_file = next(
            (f for f in audio_files if f.begin <= begin <= f.end),
            None,
        )

        # Resolve the file BEFORE appending anything, so that a skipped
        # detection never leaves the column lists with different lengths.
        if matching_file is None:
            unmatched += 1
            continue

        # Detection length is stored in samples; convert it with the sample
        # rate of the file it belongs to.
        duration = Timedelta(d.sample_duration / matching_file.sample_rate, "s")

        start_time = (begin - matching_file.begin).total_seconds()
        end_time = round(start_time + duration.total_seconds(), 3)

        start_datetimes.append(begin)
        end_datetimes.append(begin + duration)
        start_times.append(start_time)
        end_times.append(end_time)
        freq_min.append(d.freq_limits[0])
        freq_max.append(d.freq_limits[1])
        filenames.append(matching_file.path.name)
        annotator.append(type(d).__name__)

    if unmatched:
        warnings.warn(
            f"{unmatched} PAMGuard detection(s) skipped: "
            "no audio file covers their start time.",
            stacklevel=2,
        )

    return DataFrame({
        "dataset": dataset,
        "filename": filenames,
        "start_time": start_times,
        "end_time": end_times,
        "start_frequency": freq_min,
        "end_frequency": freq_max,
        "annotation": annotation,
        "annotator": annotator,
        "start_datetime": [strftime_osmose_format(beg) for beg in start_datetimes],
        "end_datetime": [strftime_osmose_format(end) for end in end_datetimes],
        "is_box": True,
    }).sort_values("start_datetime")