pprl_toolkit/src/pprl/embedder/embedder.py at c13d2e9b47f7083fc2c84ddfa06b7de3bdb6be4a · datasciencecampus/pprl_toolkit · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
"""Classes and functions for handling embedding objects."""

import hashlib
import itertools as it
from typing import Iterable

import dill
import numpy as np
import numpy.ma as ma
import pandas as pd
from scipy.optimize import linear_sum_assignment

from pprl.embedder.bloom_filters import BloomFilterEncoder


class EmbeddedDataFrame(pd.DataFrame):
    """A data frame with a reference to an `Embedder` object.

    An `EmbeddedDataFrame` (EDF) instance wraps together a
    `pandas.DataFrame` with a reference to a `pprl.embedder.Embedder`
    object. An EDF also has a mandatory `bf_indices` column, describing
    the Bloom filter indices used for linkage.

    The EDF instance can also calculate `bf_norms` and `thresholds`
    columns which are used in the `Embedder.compare()` method to
    compute `pprl.embedder.SimilarityArray` instances.

    Parameters
    ----------
    data: numpy.ndarray, Iterable, dict, or pandas.DataFrame
        Data to which to attach the embedder. Must include a
        `bf_indices` column with `list` data type.
    embedder: pprl.embedder.Embedder
        A compatible embedder object for the Bloom filter columns in
        `data`.
    update_norms: bool
        Whether to update the Bloom filter norms on creation. Defaults
        to `False`.
    update_thresholds: bool
        Whether to update the similarity thresholds on creation.
        Defaults to `True`.
    *args: Iterable
        Additional positional arguments to pass to `pandas.DataFrame`
        along with `data`.
    **kwargs: dict
        Additional keyword arguments to pass to `pandas.DataFrame` along
        with `data`.

    Attributes
    ----------
    embedder_checksum: str
        Hexadecimal string digest from `self.embedder`.

    Notes
    -----
    An EDF instance is usually created from an existing `Embedder`
    object by calling the `embedder.embed()` method. It can also be
    initialised using an embedder and a `pandas.DataFrame` that already
    has a `bf_indices` column via `EmbeddedDataFrame(df, embedder)`.

    If using the second method it is up to the user to ensure that the
    `Embedder` instance is compatible with the `bf_indices` column
    (as well as `bf_norms` and `thresholds`, if present) in the data
    frame. If in doubt, call `edf.update_norms()` and
    `edf.update_thresholds()` to refresh them.
    """

    def __init__(
        self,
        data: np.ndarray | Iterable | dict | pd.DataFrame,
        embedder: "Embedder",
        update_norms: bool = True,
        update_thresholds: bool = False,
        *args,
        **kwargs,
    ) -> None:
        super().__init__(data, *args, **kwargs)
        self.embedder = embedder
        self.embedder_checksum = embedder.checksum

        # Check key columns exist
        assert isinstance(self.embedder, Embedder), "Valid Embedder must be provided"
        assert "bf_indices" in self.columns, "bf_indices column missing"

        # Initialise bf_norms
        if update_norms:
            self.update_norms()
        if update_thresholds:
            self.update_thresholds()

    def to_bloom_matrix(self) -> np.ndarray:
        """Convert Bloom filter indices into a binary matrix.

        The matrix has a row for each row in the EDF. The number of
        columns is equal to `self.embedder.bf_size + self.embedder.offset`.
        Each row in the matrix is a Bloom filter expressed as a binary vector, with
        the ones corresponding to hashed features.
        This representation is used in the `Embedder.compare()` method.

        Returns
        -------
        X: np.ndarray
            Binary array of size `(len(self), self.embedder.bf_size + self.embedder.offset)`.
        """
        assert self.embedder_checksum == self.embedder.checksum, "Checksum mismatch"

        bf_length = self.embedder.bf_size + self.embedder.offset
        N = len(self)
        X = np.zeros((N, bf_length))
        for i in range(N):
            X[i, self["bf_indices"].iloc[i]] = 1.0

        return X

    def update_thresholds(self) -> "EmbeddedDataFrame":
        """Generate matching thresholds for each row of the data.

        The threshold is the minimum similarity score that will be
        matched. It is found by getting the pairwise similarities
        between each row and the other rows in the same EDF, and taking
        the maximum of these.

        Attributes
        ----------
        data.thresholds: numpy.ndarray
            Column for maximum similarity of each row within the EDF.
        """
        assert self.embedder_checksum == self.embedder.checksum, "Checksum mismatch"

        similarities = self.embedder.compare(self, self, require_thresholds=False)
        np.fill_diagonal(similarities, -np.inf)

        self["thresholds"] = np.max(similarities, 1)

        return self

    def _calculate_norm(self, bf_indices: list[int]) -> float:
        """Given a list of bf_indices, calculate the vector norm wrt scm_matrix."""
        return np.sqrt(np.sum(self.embedder.scm_matrix[np.ix_(bf_indices, bf_indices)]))

    def update_norms(self) -> "EmbeddedDataFrame":
        """Generate vector norms for each row.

        Create or update the `bf_norms` column in the EDF. This method calculates,
        for each Bloom filter, its Euclidean norm when the filter is expressed as a
        binary vector, and saves it to the EDF. The norm is used to scale the
        (Soft) Cosine similarity scores.

        Attributes
        ----------
        data.bf_norms: list
            Column of vector norms for each row in the EDF.
        """
        assert self.embedder_checksum == self.embedder.checksum, "Checksum mismatch"

        self["bf_norms"] = list(map(self._calculate_norm, self["bf_indices"]))

        return self

    def anonymise(self, keep: None | list = None) -> "EmbeddedDataFrame":
        """Remove raw data from embedded dataframe.

        Remove all columns from the embedded dataframe expect columns listed
        in keep and `bf_indices`, `bf_norms` and `thresholds`.

        Returns
        -------
        keep: list[str]
            Columns to be returned as they appear in the data in addition to
            `bf_indices`, `bf_norms` and `thresholds` if they are present in
            the data.
        """

        if keep is None:
            keep = []

        output_columns = keep + ["bf_indices", "bf_norms", "thresholds"]
        output_columns = [column for column in self.columns if column in output_columns]
        # remove duplicate column names
        output_columns = list(dict.fromkeys(output_columns))
        return self[output_columns]


class SimilarityArray(np.ndarray):
    """Augmented NumPy array of similarity scores with extra attributes.

    Parameters
    ----------
    input_array: numpy.ndarray, Iterable
        Original array of similarity score data.
    thresholds: tuple, optional
        2-tuple of similarity score thresholds for each axis. These
        thresholds are used when generating a matching.
    embedder_checksum: str, optional
        Hexadecimal string digest of a `pprl.embedder.Embedder` object.

    Notes
    -----
    `SimilarityArray` objects are usually initialised from an instance
    of `pprl.embedder.Embedder` via the `embedder.compare()` method.
    """

    def __new__(
        cls,
        input_array: Iterable | np.ndarray,
        thresholds: None | tuple = None,
        embedder_checksum: None | str = None,
    ) -> "SimilarityArray":
        """Create the array, adding on the thresholds and hex digest."""
        obj = np.asarray(input_array).view(cls)
        obj.thresholds = thresholds
        obj.embedder_checksum = embedder_checksum

        return obj

    def __array_finalize__(self, obj) -> None:
        if obj is not None:
            self.thresholds = getattr(obj, "thresholds", None)
            self.embedder_checksum = getattr(obj, "embedder_checksum", None)

    def match(
        self,
        abs_cutoff: int | float = 0,
        rel_cutoff: int | float = 0,
        hungarian: bool = True,
        require_thresholds: bool = True,
    ) -> tuple[list[int], list[int]]:
        """Compute a matching.

        Given an array of similarity scores, compute a matching of its
        elements, using the Hungarian algorithm by default. If the
        `SimilarityArray` has thresholds, masking is used to ensure that prospective
        matches whose similarity score is below the thresholds are not returned.
        An `abs_cutoff` (global minimum similarity score) can also be supplied.

        Parameters
        ----------
        abs_cutoff : int or float, optional
            A lower cutoff for the similarity score. No pairs with
            similarity below the absolute cutoff will be matched. By
            default, this is 0.
        rel_cutoff : int or float, optional
            A margin above the row/column-specific threshold. Raises all
            thresholds by a constant. By default, this is 0.
        hungarian: bool, optional
            Whether to compute the unique matching using the Hungarian
            algorithm, filtered using `thresholds` and `abs_cutoff`.
            Default is `True`. If `False`, just return all pairs above
            the threshold.
        require_thresholds: bool, optional
            If `True` (default), the matching will fail if `thresholds`
            is not present and valid. Must be explicitly set to `False`
            to allow matching without similarity thresholds.

        Returns
        -------
        match: tuple[list[int], list[int]]
            2-tuple of indexes containing row and column indices of
            matched pairs eg. `([0, 1, ...], [0, 1, ...])`.

        Notes
        -----
        If `hungarian=False`, the matching returns all pairs with
        similarity score above the `abs_cutoff`, respecting `thresholds`
        if present. This method does not guarantee no duplicates.
        """
        S = ma.array(self.copy())

        if require_thresholds:
            if isinstance(self.thresholds, tuple):
                S[S < S.thresholds[0][:, None] + rel_cutoff] = ma.masked
                S[S < S.thresholds[1] + rel_cutoff] = ma.masked
            else:
                raise ValueError("Thresholds are required for matching")

        S[S < abs_cutoff] = ma.masked

        match = ma.where(S >= abs_cutoff)  # ma.where(S) would also work

        if hungarian:
            # Compute linear assignment (Hungarian match)
            hungarian_match = linear_sum_assignment(S, maximize=True)
            hungarian_mask = ~S.mask[hungarian_match]
            match = tuple([x[hungarian_mask] for x in hungarian_match])

        return match


class Embedder:
    """Class for embedding a dataset.

    Each instance of the `Embedder` class represents an embedding space
    on personal data features. An `Embedder` instance is defined by
    three things:

    1. A set of Bloom filter parameters
    2. A set of feature factory functions
    3. An embedding matrix that corresponds to the above

    Parameters
    ----------
    feature_factory: dict[str, func]
        Mapping from dataset columns to feature generation functions.
    ff_args: dict[dict], optional
        Mapping from dataset columns to keyword arguments for their
        respective feature generation functions.
    bf_size: int
        Size of the Bloom filter. Default is 1024.
    num_hashes: int
        Number of hashes to perform. Default is two.
    offset: int
        Offset for Bloom filter to enable masking. Default is zero.
    salt: str, optional
        Cryptographic salt added to tokens from the data before hashing.
        Stored as an empty string when not given.

    Attributes
    ----------
    scm_matrix: np.ndarray
        Soft Cosine Measure matrix. Initialised as an identity matrix of
        size `bf_size + offset`.
    freq_matr_matched: np.ndarray
        Matched frequency matrix for computing `scm_matrix`. Initialised
        as an identity matrix of size `bf_size + offset`.
    freq_matr_unmatched: np.ndarray
        Unmatched frequency matrix for computing `scm_matrix`.
        Initialised as an identity matrix of size `bf_size + offset`.
    checksum: str
        Hexadecimal string digest of the feature factory, SCM matrix,
        and the `bf_size`, `num_hashes` and `offset` parameters. Used to
        check an embedder is compatible with an `EmbeddedDataFrame`.
        `ff_args` and `salt` do not contribute to the digest.

    Notes
    -----
    When an instance is initialised in code, the embedding matrix is
    initialised as an identity matrix; the matrix can then be trained
    using a pair of datasets with known match status and the trained
    `Embedder` instance pickled to file. The pre-trained `Embedder`
    instance can then be reinitialised from the pickle file.

    Both the untrained and trained instances provide `embed()` and
    `compare()` methods. Comparing datasets using an untrained
    `Embedder` instance is equivalent to calculating Cosine similarities
    on ordinary Bloom filters. Comparing datasets using a pre-trained
    `Embedder` calculates the Soft Cosine Measure between Bloom filters.
    The Soft Cosine Measure embedding matrix is trained using an
    experimental method.
    """

    def __init__(
        self,
        feature_factory: dict,
        ff_args: dict[str, dict] | None = None,
        bf_size: int = 1024,
        num_hashes: int = 2,
        offset: int = 0,
        salt: str | None = None,
    ) -> None:
        # Get embedding from model
        # Get other attributes from model
        self.feature_factory = feature_factory
        if ff_args is not None:
            self.ff_args = ff_args
        else:
            self.ff_args = {}
        self.num_hashes = num_hashes
        self.bf_size = bf_size
        self.offset = offset
        self.salt = salt or ""

        # Initialise Soft Cosine Measure matrices
        # These are large-ish (for bf_size=1024, each will take up 4MB)
        self.scm_matrix = self._initmatrix()
        self.freq_matr_matched = self._initmatrix()
        self.freq_matr_unmatched = self._initmatrix()

        self.checksum = self._compute_checksum()

    def _initmatrix(self) -> np.ndarray:
        """Initialise matrices as identity matrices of dimension `bf_size` + `offset`."""
        return np.eye((self.bf_size + self.offset), dtype=np.float32)

    def _compute_checksum(self) -> str:
        """Compute a checksum on important attributes of the Embedder instance.

        To check for functional equality of two instances. The digest
        covers, in order: each feature factory key and the `dill`
        serialisation of its function, the string form of `scm_matrix`,
        and the list `[bf_size, num_hashes, offset]`.

        Returns
        -------
        str
            Hexadecimal MD5 digest.
        """
        res = hashlib.md5()

        # bytes from feature_factory
        for k, v in self.feature_factory.items():
            res.update(k.encode("utf-8"))
            res.update(dill.dumps(v))

        # bytes from SCM matrix
        # NOTE(review): `str()` of a large NumPy array is abbreviated
        # under the default print options, so presumably only part of
        # the matrix contributes here. Left unchanged so that checksums
        # stored in existing pickles stay comparable - confirm.
        res.update(str(self.scm_matrix).encode("utf-8"))

        # bytes from params
        params_bytes = str([self.bf_size, self.num_hashes, self.offset]).encode("utf-8")
        res.update(params_bytes)

        return res.hexdigest()

    def embed(
        self,
        df: pd.DataFrame,
        colspec: dict,
        update_norms: bool = True,
        update_thresholds: bool = False,
    ) -> EmbeddedDataFrame:
        """Encode data columns into features from Bloom embedding.

        Given a pandas DataFrame and a column specification, convert columns into
        string features, and then embed the features into Bloom filters. The method
        returns an instance of `EmbeddedDataFrame`, which is an augmented pandas DataFrame.

        Parameters
        ----------
        df : pd.DataFrame
            Data frame to be embedded.
        colspec : dict
            Dictionary mapping columns in `df` to keys of
            `self.feature_factory`.
        update_norms : bool, optional
            Whether to calculate vector norms for SCM and add to EDF.
            `True` by default.
        update_thresholds : bool, optional
            Whether to calculate similarity thresholds and add to EDF.
            Used as an outside option in matching. `False` by default.

        Returns
        -------
        EmbeddedDataFrame
            An embedded data frame with its embedder. It holds the
            columns of `df`, one `<column>_features` column per entry
            of `colspec`, `all_features` and `bf_indices`.
        """
        df_features = df[list(colspec)].copy()

        # create features from each column
        for column, column_type in colspec.items():
            factory = self.feature_factory[column_type]
            # Extra keyword arguments are optional and looked up per type
            factory_kwargs = self.ff_args.get(column_type, {})
            df_features[column] = factory(df_features[column], **factory_kwargs)

        # concat the features to a single column
        df_features.columns = [i + "_features" for i in df_features.columns]
        df_features["all_features"] = df_features.values.tolist()
        # flatten the per-column feature collections and drop duplicates
        df_features["all_features"] = df_features["all_features"].apply(
            lambda x: list(set(it.chain.from_iterable(x)))
        )
        df = pd.concat([df, df_features], axis=1)

        # create bloom filter indices
        bfencoder = BloomFilterEncoder(self.bf_size, self.num_hashes, self.offset, self.salt)

        df["bf_indices"] = df_features["all_features"].apply(
            lambda x: bfencoder.bloom_filter_vector(x)
        )

        return EmbeddedDataFrame(
            df, embedder=self, update_norms=update_norms, update_thresholds=update_thresholds
        )

    def compare(
        self,
        edf1: EmbeddedDataFrame,
        edf2: EmbeddedDataFrame,
        require_thresholds: bool = True,
    ) -> SimilarityArray:
        """Calculate a `SimilarityArray` on two EDFs.

        Given two EDFs, calculate all pairwise Soft Cosine Similarities
        between rows. Missing `bf_norms` columns are computed in place
        on the EDFs before the comparison.

        Parameters
        ----------
        edf1 : EmbeddedDataFrame
            An EDF instance with N rows. Must have `thresholds` column
            unless `require_thresholds=False`.
        edf2 : EmbeddedDataFrame
            An EDF instance with M rows. Must have `thresholds` column
            unless `require_thresholds=False`.
        require_thresholds: bool, optional
            If `True` (default), the comparison will fail if thresholds
            are not present. Must be explicitly set to `False` to allow
            comparison without thresholds.

        Returns
        -------
        SimilarityArray
            An N by M array containing the similarity matrix of pairwise
            Soft Cosine similarities between rows of `edf1` and `edf2`.

        Raises
        ------
        AssertionError
            If either EDF carries a checksum different from this
            embedder's.
        ValueError
            If `require_thresholds` is `True` and both EDFs don't have a
            `thresholds` column.
        """
        assert (
            edf1.embedder_checksum == self.checksum and edf2.embedder_checksum == self.checksum
        ), "Both EmbeddedDFs must refer to the same Embedder instance"

        if "bf_norms" not in edf1.columns:
            edf1.update_norms()
        if "bf_norms" not in edf2.columns:
            edf2.update_norms()

        X1 = edf1.to_bloom_matrix()
        X2 = edf2.to_bloom_matrix()
        A = edf1.embedder.scm_matrix
        inv_norm1 = 1 / np.asarray(edf1["bf_norms"], dtype=float)
        inv_norm2 = 1 / np.asarray(edf2["bf_norms"], dtype=float)

        # Scale rows by 1/norm1 and columns by 1/norm2 through
        # broadcasting, in the same order as multiplying by diagonal
        # matrices but without materialising N x N and M x M arrays.
        res = ((inv_norm1[:, None] * X1) @ A @ X2.T) * inv_norm2[None, :]

        if "thresholds" in edf1.columns and "thresholds" in edf2.columns:
            thresholds = (edf1["thresholds"].to_numpy(), edf2["thresholds"].to_numpy())
        elif require_thresholds:
            raise ValueError("Thresholds required for comparison")
        else:
            thresholds = None

        return SimilarityArray(res, thresholds=thresholds, embedder_checksum=self.checksum)

    def _joint_freq_matrix(
        self,
        bf_indices1: list[list] | pd.Series,
        bf_indices2: list[list] | pd.Series,
        prob: bool = False,
    ) -> np.ndarray:
        """Calculate the symmetrised joint frequency matrix on the Bloom filters.

        Given two EDFs' bf_indices, returns a square matrix of size
        `self.bf_size + self.offset` where each entry (i,j) is the frequency of
        observing a feature hashed into slot `i` in one dataset, and a feature
        hashed into slot `j` in the other dataset, at the same row number. The
        frequency matrix is then symmetrised because the order of the two
        datasets doesn't matter. `prob`, if True, converts frequencies to
        probabilities by dividing by N, not usually needed because we're using a
        logged ratio of two matrices so the division by N cancels out.
        """
        assert len(bf_indices1) == len(bf_indices2), "x and y lengths must match"
        N = len(bf_indices1)
        bfsize = self.bf_size + self.offset

        rows, cols = [], []
        # Loop through the cross-product of every index in bf_indices1[n]
        # and every index in bf_indices2[n]
        # for n in 1:len(bf_indices1)
        for i, j in it.chain.from_iterable(map(it.product, bf_indices1, bf_indices2)):
            rows.append(i)
            cols.append(j)

        S = np.zeros((bfsize, bfsize), np.float32)
        # np.add.at accumulates repeated (i, j) pairs, unlike S[i, j] += 1.
        # Explicit integer arrays keep the indexing valid when no pairs
        # were produced at all.
        coordinates = (np.asarray(rows, dtype=np.intp), np.asarray(cols, dtype=np.intp))
        np.add.at(S, coordinates, 1.0)

        # Make it symmetric
        S = (S + S.T) / 2
        if prob:
            S = S / N

        return S

    def train(
        self,
        edf1: EmbeddedDataFrame,
        edf2: EmbeddedDataFrame,
        update: bool = True,
        learning_rate: float = 1.0,
        eps: float = 0.01,
        random_state: None | np.random.RandomState = None,
    ) -> None:
        """Fit Soft Cosine Measure matrix to two matched datasets.

        This function updates the `scm_matrix` attribute in-place along
        with its constituent matrices, `freq_matr_matched` and
        `freq_matr_unmatched`, and then refreshes `checksum`.

        Provide two datasets of pre-matched data, with matching records aligned.
        If `update=True`, the training is cumulative, so that `train()` can be called more
        than once, updating the same matrices each time by adding new
        frequency tables. Otherwise, all three matrices are
        reinitialised prior to training.

        Parameters
        ----------
        edf1: EmbeddedDataFrame
            An embedded dataset.
        edf2: EmbeddedDataFrame
            An Embedded dataset of known matches in the same order as
            `edf1`.
        update: bool
            Whether to update the existing SCM matrix, or overwrite it.
            Defaults to `True`.
        learning_rate: float
            Scaling factor to dampen matrix updates. Must be in the
            interval `(0, 1]`. Default is 1.0.
        eps: float
            Small non-negative constant to avoid `-Inf` in log of
            frequencies. Default is 0.01.
        random_state: RandomState, optional
            Random state to pass to dataset jumbler. Defaults to `None`.

        Attributes
        ----------
        scm_matrix: np.ndarray
            Soft Cosine Measure matrix that is fitted cumulatively or
            afresh.
        checksum: str
            Recomputed from the fitted matrix.

        Notes
        -----
        Because `checksum` is refreshed, `EmbeddedDataFrame` instances
        created before training still hold the previous checksum and
        will fail their checksum assertions; wrap the data in a new
        `EmbeddedDataFrame` with this embedder after training.
        """
        # Check the dimensions are the same
        x = edf1.bf_indices
        y = edf2.bf_indices
        assert len(x) == len(
            y
        ), "Must have same length (this will be relaxed in future iterations)"
        assert eps >= 0.0, "Negative eps not allowed"
        assert learning_rate > 0.0 and learning_rate <= 1.0

        # Shuffling one side breaks the alignment, giving random non-matches
        y_jumbled = pd.Series(y).sample(frac=1, random_state=random_state).to_list()

        # Calculate joint probability matrix for matches
        freq_matr_matched = self._joint_freq_matrix(x, y)

        # Calculate joint probability matrix for random non-matches
        freq_matr_unmatched = self._joint_freq_matrix(x, y_jumbled)

        if update:
            self.freq_matr_matched += learning_rate * freq_matr_matched
            self.freq_matr_unmatched += learning_rate * freq_matr_unmatched
        else:
            self.freq_matr_matched = self._initmatrix() + learning_rate * freq_matr_matched
            self.freq_matr_unmatched = self._initmatrix() + learning_rate * freq_matr_unmatched

        # Log the ratio
        scm_matrix = np.log(self.freq_matr_matched + eps) - np.log(self.freq_matr_unmatched + eps)

        # Ensure matrix is positive definite for positive norm
        # This also ensures that the diagonal is non-negative
        scm_matrix = nearest_pos_semi_definite(scm_matrix, eps=1e-6)
        self.scm_matrix = scm_matrix

        # The digest depends on scm_matrix, so it has to be stored again;
        # otherwise a trained embedder fails the check in from_pickle().
        self.checksum = self._compute_checksum()

    def to_pickle(self, path: None | str = None) -> None | bytes:
        """Save Embedder instance to pickle file.

        Parameters
        ----------
        path : str, optional
            File path at which to save the pickled embedder. If not
            specified, the pickled bytes string is returned.

        Returns
        -------
        pickled : bytes or None
            If `path` is not specified, the pickled string comes back.
            Otherwise, nothing is returned.
        """

        if path is None:
            return dill.dumps(self)

        with open(path, "wb") as f:
            dill.dump(self, f)

    @classmethod
    def from_pickle(
        cls, path: None | str = None, pickled: None | str | bytes = None
    ) -> "Embedder":
        """Initialise Embedder instance from pickle file.

        Parameters
        ----------
        path : str, optional
            File path from which to load the pickled embedder. Anything
            accepted by `open()` works.
        pickled : bytes, optional
            Byte-string containing the pickled embedder.

        Raises
        ------
        ValueError
            If not exactly one of `path` and `pickled` are specified.
        AssertionError
            If the checksum stored on the loaded instance differs from
            a freshly computed one.

        Returns
        -------
        embedder : Embedder
            The reformed instance of the `Embedder` class.

        Notes
        -----
        Unpickling executes code embedded in the payload. Only load
        pickles from sources you trust.
        """

        neither = path is None and pickled is None
        both = path is not None and pickled is not None
        if neither or both:
            raise ValueError("Exactly one of `path` and `pickled` must be specified.")

        # Exactly one source is set at this point. Branching on `None`
        # rather than on the concrete type guarantees `embedder` is
        # always bound (e.g. when `path` is a `pathlib.Path`).
        if path is not None:
            with open(path, "rb") as f:
                embedder = dill.load(f)
        else:
            embedder = dill.loads(pickled)

        assert (
            embedder.checksum == embedder._compute_checksum()
        ), "Checksum on loaded Embedder instance doesn't match saved checksum."

        return embedder


def nearest_pos_semi_definite(X: np.ndarray, eps: float = 0.0) -> np.ndarray:
    """Calculate nearest positive semi-definite version of a matrix.

    The input is first symmetrised as `(X + X.T) / 2`. All negative
    eigenvalues of that symmetric matrix are then set to zero, or to a
    small positive value to give a positive definite matrix, and the
    matrix is rebuilt from the adjusted eigenvalues.

    Graciously taken from this StackOverflow
    [post](https://stackoverflow.com/questions/43238173/python-convert-matrix-to-positive-semi-definite)

    Parameters
    ----------
    X: np.ndarray
        Square matrix-like array.
    eps: float
        Value that replaces every negative eigenvalue. Use a small
        positive constant to give a positive definite matrix. Default
        is 0 to give a positive semi-definite matrix.

    Returns
    -------
    np.ndarray
        A positive (semi-)definite matrix.
    """
    C = (X + X.T) / 2

    # C is symmetric by construction, so use the symmetric solver: it
    # returns real eigenvalues and orthonormal eigenvectors, which is
    # what makes the `V diag(w) V.T` reconstruction below valid. The
    # general solver gives no such guarantee for repeated eigenvalues.
    eigval, eigvec = np.linalg.eigh(C)
    eigval[eigval < 0] = eps

    # Scaling the eigenvector columns equals `eigvec @ diag(eigval)`
    return np.real((eigvec * eigval) @ eigvec.T)