active_learning_annotation_tool/ActiveLearning.py at main · marcel8168/active_learning_annotation_tool · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
from itertools import chain
import logging
import os
import re
import time
from typing import Iterable, Union
from modAL import uncertainty
import sklearn
import numpy as np
from datasets import load_dataset
from sklearn.base import BaseEstimator
from Annotation import Annotation
from AnnotationFile import AnnotationFile
from Dataset import Dataset
from Gui import GUI
from System import System
from TextFile import TextFile
from constants import CERTAINTY_THRESHOLD, DATA_PATH, EXTERNAL_TEST_DATASET_FILE_NAME, FOLDER_NAME, PATH_TO_BRAT, COLLECTION_NAME, SUGGESTION_ANNOTATION_TYPE, TRAINING_DATASET_FILE_NAME, TARGET_CLASS


class ActiveLearning:
    """
    The ActiveLearning object provides functionalities of Active Learning.
    """

    def __init__(self):
        pass

    def iteration(self, classifier: sklearn.base.BaseEstimator,
                  unlabeled_data: Union[list, np.ndarray],
                  num_to_annotate: int = 1):
        gui = GUI()
        system = System()
        classifier.performance_report(path_to_test_set=DATA_PATH + EXTERNAL_TEST_DATASET_FILE_NAME)

        sample_lists = []
        n = num_to_annotate
        while num_to_annotate > 0:
            for data in unlabeled_data:
                samples = self.certainty_sampling_by_target_class(classifier=classifier,
                                                        X=[data],
                                                        n_instances=n)
                sample_lists.append(samples)

            samples = (np.array(list(chain(*[sublist[0] for sublist in sample_lists]))).astype(int), list(chain(*[sublist[1] for sublist in sample_lists])))
            samples_indices_sorted = np.argsort(samples[1])[::-1]
            indices_all = samples[0][samples_indices_sorted]
            while num_to_annotate > 0 and indices_all.size > 0:
                indices = indices_all[:num_to_annotate]
                indices_all = indices_all[num_to_annotate:]
                predictions = classifier.predictions.flatten()
                uncertain_samples = list(filter(lambda x: x['index'] in indices, predictions))
                logging.info(f"Suggested samples to be annotated: {uncertain_samples}")
                suggested_samples = list({sample["word"] for sample in uncertain_samples})
                num_to_annotate -= self.add_samples_to_annotation_files(samples=suggested_samples, type=SUGGESTION_ANNOTATION_TYPE)

            n += 1

        most_certain_predictions = self.get_most_certain_predictions(classifier=classifier, X=unlabeled_data)
        if most_certain_predictions:
            self.add_samples_to_annotation_files(samples=most_certain_predictions, type=TARGET_CLASS)

        # User information
        title = "Suggestions loaded"
        message = "Suggestions has been loaded. You can now start annotating.\n\nInfo:\nAnnotation suggestions by the Active Learning process will be marked red. Whenever a change was done, BRAT will reload to apply changes on identical cases."
        gui.show_custom_popup(title, message)
        system.reload()

        path_to_collection, file_names = system.get_file_names_from_path(path_to_brat=PATH_TO_BRAT, folder_name=FOLDER_NAME, collection_name=COLLECTION_NAME)
        annotation_files = [file_name for file_name in file_names if ".ann" in file_name]
        while self.suggestions_left_in_files(path=path_to_collection, file_names=annotation_files):
            self.check_file_change(path=path_to_collection, file_names=annotation_files)

        # User information
        logging.info("Annotation by domain expert finished. No suggestions left.")
        title = "Annotation finished"
        message = "You finished the current annotation step.\n\nNow the next training iteration began. Please do not change any file until the next call."
        gui.show_custom_popup(title, message)

        for annotation_file in annotation_files:
            self.apply_annotation(path=path_to_collection, file_names=file_names, changed_file=annotation_file)

        dataset = Dataset(path_to_collection=path_to_collection)
        dataset.to_json(DATA_PATH, TRAINING_DATASET_FILE_NAME)
        logging.info(f"Updated dataset with new annotations is generated and saved under {DATA_PATH + TRAINING_DATASET_FILE_NAME}")

        if dataset.dataset.shape[0] > 3:
            dataset = load_dataset("json", data_files=DATA_PATH + TRAINING_DATASET_FILE_NAME)
            split_dataset = dataset["train"].train_test_split()
            labeled_dataset = split_dataset.map(classifier.generate_row_labels)

            logging.info("Training with updated and labeled dataset started..")
            classifier.fit(labeled_dataset)
            logging.info("Training finished!")

            classifier.save()
            logging.info("Pretrained model saved.")

    def add_samples_to_annotation_files(self, samples: Iterable[str], type: str) -> int:
        """
        Adds samples to annotation files.

        Arguments
        ---------
            samples (Iterable[str]): Samples to add to annotation files

        Returns
        -------
            int: Number of added annotations
        """
        system = System()
        path_to_collection, file_names = system.get_file_names_from_path(path_to_brat=PATH_TO_BRAT, folder_name=FOLDER_NAME, collection_name=COLLECTION_NAME)

        text_files = [
            TextFile(file_name=file_name, path=path_to_collection)
            for file_name in file_names
            if ".txt" in file_name
        ]
        logging.info(f"Text files found: {str([file.file_name for file in text_files])}")

        added_annotations = set()

        for file in text_files:
            containing_words = file.contains(excerpts=samples)
            if type == SUGGESTION_ANNOTATION_TYPE:
                containing_words = [item for item in containing_words if item[0].lower() not in added_annotations]
            annotations = []
            annotation_file_name = file.file_name[:file.file_name.find(".")] + ".ann"
            for word_info in containing_words:
                if not word_info[0].lower() in {ann.excerpt.lower() for ann in annotations}:
                    annotations.append(Annotation(file_name=annotation_file_name,
                                                type=type,
                                                begin=word_info[1],
                                                end=word_info[2],
                                                excerpt=word_info[0]))
            annotation_file = AnnotationFile(file_name=annotation_file_name,
                                             path=path_to_collection)
            added = annotation_file.add_annotations(annotations=annotations)
            added_annotations = added_annotations.union(added)

        return len(added_annotations)

    def check_file_change(self, path: str, file_names: list) -> str:
        """
        Checks if a file has been changed by comparing its modification time.

        Arguments
        ---------
            filename (str): The path to the file to monitor.
        """
        initial_state = {}

        for file_name in file_names:
            file_path = os.path.join(path, file_name)
            with open(file_path, 'r') as file:
                initial_state[file_name] = file.read()

        while True:
            time.sleep(1)
            for file_name in file_names:
                file_path = os.path.join(path, file_name)
                with open(file_path, 'r') as file:
                    if file.read() != initial_state[file_name]:
                        return file_name

    def suggestions_left_in_files(self, path: str, file_names: list) -> bool:
        """
        Checks if there are any suggestions left in the given files.

        Arguments
        ---------
            path (str): The path to the directory containing the files.
            file_names (List[str]): A list of file names to check for suggestions.

        Returns
        -------
            bool: True if there are suggestions left in any of the files, False otherwise.
        """

        for file_name in file_names:
            if ".ann" in file_name:
                if AnnotationFile(file_name=file_name, path=path).suggestions_left():
                    return True

        return False

    def apply_annotation(self, path: str, file_names: str, changed_file: str) -> None:
        """
        Applies annotations from an old version of the text file to all identical words
        in the new version of the text file, including the label, begin index, and end index.

        Arguments
        ---------
            path (str): Path to the collection.
            file_names (str): Files in the collection.
            changed_file (str): Annotation file that was changed.

        Raises
        ------
            ValueError: If changed file is not of type '.ann'
        """
        if ".ann" not in changed_file:
            raise ValueError("The changed file must be of type '.ann'")

        distinct_file_names = {file_name[:-4] for file_name in file_names if ".txt" in file_name}
        annotation_file = AnnotationFile(file_name=changed_file, path=path)
        annotations = [ann for ann in annotation_file.read() if ann.type != SUGGESTION_ANNOTATION_TYPE]

        for annotation in annotations:
            excerpt = annotation.excerpt

            for text_file in distinct_file_names:
                sentences = TextFile(file_name=text_file + ".txt", path=path).get_sentence_info()
                new_annotations= []

                for idx, sentence in enumerate(sentences["sentence"]):
                    pattern = r'\b{}\b'.format(excerpt)
                    matches = re.finditer(pattern, sentence, re.IGNORECASE)
                    for match in matches:
                        new_annotations.append(
                            Annotation(file_name=text_file + ".ann",
                                        type=annotation.type,
                                        begin=sentences["start"][idx] + match.start(),
                                        end=sentences["start"][idx] + match.end(),
                                        excerpt=match.group(0)
                                    )
                            )
                annotation_file_to_change = AnnotationFile(file_name=text_file + ".ann", path=path)
                annotation_file_to_change.add_annotations(annotations=new_annotations, overwrite_existing=True)

    def get_most_certain_predictions(self, classifier: sklearn.base.BaseEstimator, X: Iterable):
        """
        Get the most certain predictions from the model's predictions.

        Arguments
        ---------
            classifier (sklearn.base.BaseEstimator): Classifier to compute predictions.
            X (Iterable): Input data to make predictions.

        Returns
        -------
            numpy.ndarray: Array containing the most certain predictions.

        """
        probabilities = classifier.predict(X=X).flatten()
        probabilities = list(chain(*probabilities))
        most_certain_predictions = {x['word'] for x in probabilities if x['score'] > CERTAINTY_THRESHOLD and x['entity'] != 'LABEL_0'}

        return most_certain_predictions

    def uncertainty_sampling_by_target_class(self, classifier: BaseEstimator, X, n_instances: int = 1):
        """
        Uncertainty sampling query strategy. Selects the least sure instances for labelling w.r.t. the target class.

        Arguments
        ---------
            classifier: The classifier for which the labels are to be queried.
            X: The pool of samples to query from.
            n_instances: Number of samples to be queried.

        Returns
        -------
            The indices of the instances from X chosen to be labelled.
            The uncertainty metric of the chosen instances.
        """
        probabilities = classifier.predict_proba(X)
        target_class_prob = probabilities.T[1]

        absolute_diff = np.abs(target_class_prob - 0.5)
        sorted_indices = np.argsort(absolute_diff)
        nearest_indices = sorted_indices[:n_instances]
        nearest_values = target_class_prob[nearest_indices]

        return (nearest_indices, nearest_values)

    def certainty_sampling_by_target_class(self, classifier: BaseEstimator, X, n_instances: int = 1):
        """
        Certainty sampling query strategy. Selects the surest instances for labelling w.r.t. the target class.

        Arguments
        ---------
            classifier: The classifier for which the labels are to be queried.
            X: The pool of samples to query from.
            n_instances: Number of samples to be queried.

        Returns
        -------
            The indices of the instances from X chosen to be labelled.
            The certainty metric of the chosen instances.
        """
        probabilities = classifier.predict_proba(X)
        target_class_prob = probabilities.T[1]

        sorted_indices = np.argsort(target_class_prob)
        nearest_indices = sorted_indices[-n_instances:]
        nearest_values = target_class_prob[nearest_indices]

        return (nearest_indices, nearest_values)

    def random_sampling(self, classifier,  X, n_instances: int = 1):
        """
        Random sampling query strategy. Selects instances randomly.

        Arguments
        ---------
            X: The pool of samples to query from.
            n_instances: Number of samples to be queried.

        Returns
        -------
            Randomly chosen indices of the instances from X to be labelled.
            A placeholder for certainty metric (since this is random sampling).
        """
        probabilities = classifier.predict_proba(X)
        target_class_prob = probabilities.T[1]

        n_samples = target_class_prob.shape[0]
        random_indices = np.random.choice(n_samples, size=n_instances, replace=False)

        return (random_indices, np.zeros(n_instances,))