-
Notifications
You must be signed in to change notification settings - Fork 14
Expand file tree
/
Copy pathfilter.py
More file actions
122 lines (106 loc) · 4.11 KB
/
filter.py
File metadata and controls
122 lines (106 loc) · 4.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import csv
import pandas as pd
import numpy as np
import argparse
def get_args():
parser = argparse.ArgumentParser(
description="Filter probes based on standard deviation."
)
parser.add_argument(
"--num-probes",
type=int,
default=10000,
help="Number of probes to retain after filtering.",
)
parser.add_argument("--output-name", type=str, help="Name for output file.")
parser.add_argument(
"--filtered-probes", type=str, help="Name for filtered probes file."
)
parser.add_argument(
"--pval-threshold",
type=float,
default=0.01,
help="P-value threshold for filtering.",
)
parser.add_argument(
"--pval-sample-fraction",
type=float,
default=0.5,
help="Fraction of samples that must exceed the p-value threshold.",
)
parser.add_argument("--pval", type=str, help="P-values CSV file.")
parser.add_argument(
"--exclude", type=str, action="append", help="Files with probes to exclude."
)
parser.add_argument("beta", type=str, help="Beta values CSV file.")
args = parser.parse_args()
return args
if __name__ == "__main__":
args = get_args()
# Read probes to exclude
probes_to_exclude = []
if args.exclude is not None:
for exclude_file in args.exclude:
with open(exclude_file, "r") as ef:
for line in ef:
probes_to_exclude.append(line.strip())
print("Number of probes to exclude:", len(probes_to_exclude))
# Read p-values and find probes with high p-value in too many samples
high_pval_probes = []
if args.pval is not None:
with open(args.pval, "r") as f:
reader = csv.reader(f)
header = next(reader)
allowable_sample_count = args.pval_sample_fraction * (len(header) - 1)
for i, line in enumerate(reader):
if i % 10000 == 0:
print(f"Processing p-value for probe {i}")
probe = line[0]
# Get count of samples where the p-value exceeds the threshold
count_high_pval = sum(
1 for x in line[1:] if float(x) > args.pval_threshold
)
# Check if the count exceeds the allowable fraction
if count_high_pval > allowable_sample_count:
high_pval_probes.append(probe)
print(
"Number of probes with high p-value in too many samples:",
len(high_pval_probes),
)
pd.Series(high_pval_probes).to_csv(
"high_pval_probes.csv", index=False, header=False
)
# Combine with probes to exclude
exclude_probe_list = set(high_pval_probes).union(set(probes_to_exclude))
print("Total number of probes to exclude:", len(exclude_probe_list))
# Read beta values and compute standard deviation
data = []
with open(args.beta, "r") as f:
reader = csv.reader(f)
header = next(reader)
for i, line in enumerate(reader):
if i % 10000 == 0:
print(f"Processing probe {i}")
probe = line[0]
sd = np.std([float(x) for x in line[1:]])
if probe not in exclude_probe_list:
data.append([probe, sd])
sd_df = pd.DataFrame(data, columns=["probe", "sd"]).set_index("probe")
# Filter probes based on standard deviation
filtered_probes = (
sd_df.sort_values("sd", ascending=False).head(args.num_probes).index
)
pd.Series(filtered_probes, index=filtered_probes).to_csv(
args.filtered_probes, index=False
)
# Filter beta values
# NOTE: we are iterating over the file a second time to avoid loading the entire file into memory
print("Reading beta for filtering")
with open(args.beta, "r") as f:
reader = csv.reader(f)
header = next(reader)
header[0] = "probe"
beta_data = [line for line in reader if line[0] in filtered_probes]
print("Writing filtered beta values")
bd = pd.DataFrame(beta_data, columns=header).set_index("probe")
bd.to_csv(args.output_name)