Skip to content
Closed
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions data
1 change: 1 addition & 0 deletions inputs
131 changes: 65 additions & 66 deletions methods/matching/find_pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
import logging
from functools import partial
from multiprocessing import Pool, cpu_count, set_start_method
from numba import jit # type: ignore

import numpy as np
import pandas as pd
from numba import jit

from methods.common.luc import luc_matching_columns
from methods.utils.kd_tree import make_kdrangetree, make_rumba_tree

REPEAT_MATCH_FINDING = 100
DEFAULT_DISTANCE = 10000000.0
Expand Down Expand Up @@ -35,12 +37,7 @@ def find_match_iteration(

logging.info("Loading K from %s", k_parquet_filename)

# Methodology 6.5.7: For a 10% sample of K
k_set = pd.read_parquet(k_parquet_filename)
k_subset = k_set.sample(
frac=0.1,
random_state=rng
).reset_index()

logging.info("Loading M from %s", m_parquet_filename)
m_set = pd.read_parquet(m_parquet_filename)
Expand All @@ -61,11 +58,21 @@ def find_match_iteration(
logging.info("Preparing s_set...")

m_dist_thresholded_df = m_set[DISTANCE_COLUMNS] / thresholds_for_columns
k_subset_dist_thresholded_df = k_subset[DISTANCE_COLUMNS] / thresholds_for_columns
k_set_dist_thresholded_df = k_set[DISTANCE_COLUMNS] / thresholds_for_columns

# Rearrange columns by variance so we throw out the least likely to match first
# except the bottom three which are deforestation CPCs and have more cross-variance between K and M
variances = np.std(m_dist_thresholded_df, axis=0)
cols = DISTANCE_COLUMNS
order = np.argsort(-variances.to_numpy())
order = np.roll(order, 3)
new_cols = [cols[o] for o in order]
m_dist_thresholded_df = m_dist_thresholded_df[new_cols]
k_set_dist_thresholded_df = k_set_dist_thresholded_df[new_cols]

# convert to float32 numpy arrays and make them contiguous for numba to vectorise
m_dist_thresholded = np.ascontiguousarray(m_dist_thresholded_df, dtype=np.float32)
k_subset_dist_thresholded = np.ascontiguousarray(k_subset_dist_thresholded_df, dtype=np.float32)
k_set_dist_thresholded = np.ascontiguousarray(k_set_dist_thresholded_df, dtype=np.float32)

# LUC columns are all named with the year in, so calculate the column names
# for the years we are intested in
Expand All @@ -76,32 +83,50 @@ def find_match_iteration(
hard_match_columns = ['country', 'ecoregion', luc10, luc5, luc0]
assert len(hard_match_columns) == HARD_COLUMN_COUNT

# similar to the above, make the hard match columns contiguous float32 numpy arrays
m_dist_hard = np.ascontiguousarray(m_set[hard_match_columns].to_numpy()).astype(np.int32)
k_subset_dist_hard = np.ascontiguousarray(k_subset[hard_match_columns].to_numpy()).astype(np.int32)
# Find categories in K
hard_match_categories = [k[hard_match_columns].to_numpy() for _, k in k_set.iterrows()]
hard_match_categories = {k.tobytes(): k for k in hard_match_categories}
no_potentials = []

# Methodology 6.5.5: S should be 10 times the size of K, in order to achieve this for every
# pixel in the subsample (which is 10% the size of K) we select 100 pixels.
required = 100
# Methodology 6.5.5: S should be 10 times the size of K
required = 10

logging.info("Running make_s_set_mask... required: %d", required)
starting_positions = rng.integers(0, int(m_dist_thresholded.shape[0]), int(k_subset_dist_thresholded.shape[0]))
s_set_mask_true, no_potentials = make_s_set_mask(
m_dist_thresholded,
k_subset_dist_thresholded,
m_dist_hard,
k_subset_dist_hard,
starting_positions,
required
)

s_set_mask_true = np.zeros(m_set.shape[0], dtype=np.bool_)
no_potentials = np.zeros(k_set.shape[0], dtype=np.bool_)

# Split K and M into those categories and create masks
for values in hard_match_categories.values():
k_selector = np.all(k_set[hard_match_columns] == values, axis=1)
m_selector = np.all(m_set[hard_match_columns] == values, axis=1)
logging.info(" category: %a |K|: %d |M|: %d", values, k_selector.sum(), m_selector.sum())
# Make masks for each of those pairs
key_s_set_mask_true, key_no_potentials = make_s_set_mask(
m_dist_thresholded[m_selector],
k_set_dist_thresholded[k_selector],
required,
rng
)
# Merge into one s_set_mask_true
s_set_mask_true[m_selector] = key_s_set_mask_true
# Merge into no_potentials
no_potentials[k_selector] = key_no_potentials

logging.info("Done make_s_set_mask. s_set_mask.shape: %a", {s_set_mask_true.shape})

s_set = m_set[s_set_mask_true]
logging.info("Finished preparing s_set. shape: %a", {s_set.shape})
potentials = np.invert(no_potentials)

k_subset = k_subset[potentials]
logging.info("Finished preparing s_set. shape: %a", {s_set.shape})
# Methodology 6.5.7: For a 10% sample of K
k_subset = k_set.sample(
frac=0.1,
random_state=rng
)
k_subset = k_subset.apply(lambda row: potentials[row.index])
k_subset.reset_index()
logging.info("Finished preparing k_subset. shape: %a", {k_subset.shape})

# Notes:
# 1. Not all pixels may have matches
Expand Down Expand Up @@ -173,56 +198,30 @@ def find_match_iteration(

logging.info("Finished find match iteration")

@jit(nopython=True, fastmath=True, error_model="numpy")
def make_s_set_mask(
m_dist_thresholded: np.ndarray,
k_subset_dist_thresholded: np.ndarray,
m_dist_hard: np.ndarray,
k_subset_dist_hard: np.ndarray,
starting_positions: np.ndarray,
required: int
k_set_dist_thresholded: np.ndarray,
required: int,
rng: np.random.Generator
):
m_tree = make_kdrangetree(m_dist_thresholded, np.ones(m_dist_thresholded.shape[1]))
rumba_tree = make_rumba_tree(m_tree, m_dist_thresholded)

k_size = k_set_dist_thresholded.shape[0]
m_size = m_dist_thresholded.shape[0]
k_size = k_subset_dist_thresholded.shape[0]

s_include = np.zeros(m_size, dtype=np.bool_)
k_miss = np.zeros(k_size, dtype=np.bool_)

for k in range(k_size):
matches = 0
k_row = k_subset_dist_thresholded[k, :]
k_hard = k_subset_dist_hard[k]

for index in range(m_size):
m_index = (index + starting_positions[k]) % m_size

m_row = m_dist_thresholded[m_index, :]
m_hard = m_dist_hard[m_index]

should_include = True

# check that every element of m_hard matches k_hard
hard_equals = True
for j in range(m_hard.shape[0]):
if m_hard[j] != k_hard[j]:
hard_equals = False

if not hard_equals:
should_include = False
else:
for j in range(m_row.shape[0]):
if abs(m_row[j] - k_row[j]) > 1.0:
should_include = False

if should_include:
s_include[m_index] = True
matches += 1

# Don't find any more M's
if matches == required:
break

k_miss[k] = matches == 0
k_row = k_set_dist_thresholded[k]
possible_s = rumba_tree.members(k_row)
if len(possible_s) == 0:
k_miss[k] = True
else:
samples = min(len(possible_s), required)
chosen_s = rng.choice(possible_s, samples, replace=False)
s_include[chosen_s] = True

return s_include, k_miss

Expand Down
Loading