-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtest_correctness_pairwise_compare_manager.py
More file actions
145 lines (117 loc) · 4.23 KB
/
test_correctness_pairwise_compare_manager.py
File metadata and controls
145 lines (117 loc) · 4.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import pathlib
import sys
import pandas as pd
import pytest
from comparators.PearsonsCorrelation import PearsonsCorrelation
from comparison_tools.PairwiseCompareManager import PairwiseCompareManager
# Directory holding the original nuclear speckle data. Strict resolution
# raises right away if the directory does not exist.
data_dir = pathlib.Path("data").resolve(strict=True)

# Load the plate 4 parquet table and replace missing siRNA labels with an
# explicit "No siRNA" value.
plate4df = pd.read_parquet(data_dir / "Plate_4_bulk_feature_selected.parquet")
plate4df["Metadata_siRNA"] = plate4df["Metadata_siRNA"].fillna("No siRNA")

# Feature columns are all columns whose name does not contain "Metadata".
feat_cols = [column for column in plate4df.columns if "Metadata" not in column]
# Compute comparison outputs with the PairwiseCompareManager
@pytest.fixture
def test_data(request):
    """Run the PairwiseCompareManager for one parametrized test case.

    Args:
        request: pytest fixture request. ``request.param`` is a dict with the
            ``_comparator``, ``_same_columns``, ``_different_columns`` and
            ``_drop_cols`` settings of the case.

    Returns:
        A new dict containing the case settings plus a ``_comparer`` entry
        that holds whatever calling the configured manager returned.
    """
    case = request.param
    comparer = PairwiseCompareManager(
        # The manager gets its own copy so the module-level frame stays intact.
        _df=plate4df.copy(),
        _comparator=case["_comparator"],
        _same_columns=case["_same_columns"],
        _different_columns=case["_different_columns"],
        _feat_cols=feat_cols,
        _drop_cols=case["_drop_cols"],
    )

    # Return a fresh dict instead of writing into request.param: the parameter
    # dict belongs to the parametrize mark and should not be mutated by the
    # fixture.
    return {**case, "_comparer": comparer()}
# Specify test cases
@pytest.mark.parametrize(
    "test_data",
    [
        {
            "_comparator": PearsonsCorrelation(),
            "_same_columns": ["Metadata_Concentration"],
            "_different_columns": ["Metadata_siRNA", "Metadata_Well"],
            "_drop_cols": ["Metadata_Concentration", "Metadata_Well"],
        },
        {
            "_comparator": PearsonsCorrelation(),
            "_same_columns": None,
            "_different_columns": [
                "Metadata_siRNA",
                "Metadata_Well",
                "Metadata_genotype",
            ],
            "_drop_cols": None,
        },
        {
            "_comparator": PearsonsCorrelation(),
            "_same_columns": ["Metadata_Concentration"],
            "_different_columns": ["Metadata_siRNA"],
            "_drop_cols": ["Metadata_Concentration"],
        },
        {
            "_comparator": PearsonsCorrelation(),
            "_same_columns": ["Metadata_Concentration"],
            "_different_columns": ["Metadata_siRNA"],
            "_drop_cols": None,
        },
    ],
    indirect=True,
)
def test_dataframe_shape_correct(test_data: dict):
    """Tests if the output dataframe contains the correct number of rows and columns.

    The expected row count is recomputed by brute force from ``plate4df``: a
    pair of rows counts as one comparison when the two rows differ in every
    ``_different_columns`` column and, if ``_same_columns`` is given, agree on
    all of those columns. The expected column count is two columns per
    retained metadata column plus a single comparison column.

    Args:
        test_data: Case settings plus the manager output under ``_comparer``,
            as produced by the ``test_data`` fixture.
    """
    different_columns = test_data["_different_columns"]
    # None and an empty list both mean "no such columns".
    same_columns = test_data["_same_columns"] or []
    drop_columns = test_data["_drop_cols"] or []

    def count_valid_pairs(_df: pd.DataFrame) -> int:
        """Count the row pairs of ``_df`` that differ in every different column."""
        # Select the relevant columns once instead of once per pair.
        different_values = _df[different_columns]
        sample_count = different_values.shape[0]
        pair_count = 0

        for first_row in range(sample_count - 1):
            for second_row in range(first_row + 1, sample_count):
                pair = different_values.iloc[[first_row, second_row]]
                # In a two-row frame, duplicated(keep=False) flags a column
                # exactly when both rows hold the same value in it.
                shares_value = (
                    pair.apply(lambda col: col.duplicated(keep=False)).any().any()
                )
                if not shares_value:
                    pair_count += 1

        return pair_count

    # Calculate the expected number of comparisons, grouping by _same_columns
    # when they are specified.
    if same_columns:
        expected_rows = 0
        unique_groups = plate4df.drop_duplicates(subset=same_columns)
        for _, unique_row in unique_groups.iterrows():
            in_group = (plate4df[same_columns] == unique_row[same_columns]).all(
                axis=1
            )
            expected_rows += count_valid_pairs(plate4df[in_group])
    else:
        expected_rows = count_valid_pairs(plate4df)

    output_shape = test_data["_comparer"].shape

    assert output_shape[0] == expected_rows, (
        f"expected {expected_rows} comparison rows, got {output_shape[0]}"
    )

    # There are 2 of each column, but only one comparison column
    expected_metadata_columns = 2 * (
        len(same_columns) + len(different_columns) - len(drop_columns)
    )
    assert expected_metadata_columns == output_shape[1] - 1, (
        f"expected {expected_metadata_columns} metadata columns plus one "
        f"comparison column, got {output_shape[1]} columns in total"
    )