-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathboldigger_add_info.py
More file actions
executable file
·75 lines (60 loc) · 2.75 KB
/
boldigger_add_info.py
File metadata and controls
executable file
·75 lines (60 loc) · 2.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# Requires BOLDigger results file with sheet containing filtered results
import pandas as pd
import numpy as np
# Path of the BOLDigger results workbook that this script reads and annotates.
BOLDigger_file = "/Users/christopherhempel/Desktop/RSDE COI water project/apscale/rsde-coi-water-otu_97_apscale/boldigger_ESVs/BOLDResults_rsde-coi-water-otu_98_apscale_ESVs_filtered_part_1.xlsx"
# Taxonomic rank columns, listed from the broadest rank down to the most
# specific one; the helper functions walk this list in reverse.
ranks = [
    "Phylum",
    "Class",
    "Order",
    "Family",
    "Genus",
    "Species",
]
# Find the lowest-rank (most specific) non-NaN taxon in each row
def find_lowest_taxon(row):
    """Return the most specific taxon name available in a result row.

    The rank columns named in the module-level ``ranks`` list are inspected
    from the last entry backwards, and the first value that is not NaN is
    returned.

    Returns None when any value in the row equals "No Match", or when every
    rank column is NaN.
    """
    # A "No Match" anywhere in the row means there is no taxonomy to report.
    if "No Match" in row.values:
        return None
    for col in reversed(ranks):
        if not pd.isna(row[col]):
            return row[col]
    return None
# Find the name of the lowest-rank (most specific) non-NaN taxonomic column in each row
def find_lowest_rank(row):
    """Return the name of the most specific rank column that is filled in a row.

    The rank columns named in the module-level ``ranks`` list are inspected
    from the last entry backwards, and the name of the first column whose
    value is not NaN is returned.

    Returns None when any value in the row equals "No Match", or when every
    rank column is NaN.
    """
    # A "No Match" anywhere in the row means there is no rank to report.
    if "No Match" in row.values:
        return None
    for col in reversed(ranks):
        if not pd.isna(row[col]):
            return col
    return None
# Read in data: the first sheet is loaded as the table of individual hits and
# the second sheet as the filtered results (presumably one row per query;
# verify against the BOLDigger output).
df = pd.read_excel(BOLDigger_file, sheet_name=0)
df_filtered = pd.read_excel(BOLDigger_file, sheet_name=1)
# Fill in empty ESVs: forward-fill the query name so that every hit row
# carries the name of the query it belongs to.
df["You searched for"] = df["You searched for"].ffill()
# Delete species entries that contain the placeholder "sp.".
# regex=False makes this a literal substring match. With the default
# regex=True the dot matches any character, so valid epithets such as
# "hispida" or "crispus" (which contain "sp" followed by another letter)
# were blanked as well.
unresolved_species = df["Species"].str.contains(
    "sp.", case=False, na=False, regex=False
)
df.loc[unresolved_species, "Species"] = np.nan
# Prefix the genus to build the full species name; rows whose genus or
# species is NaN stay NaN.
df["Species"] = df["Genus"] + " " + df["Species"]
# Add info for closest species
def _closest_species(hits):
    """Return the closest species name(s) among the hits of a single ESV.

    ``hits`` holds the "Species" and "Similarity" columns of the rows that
    belong to one query. Returns "No species identified" when no row carries
    a species name; otherwise the distinct species names that share the
    reference similarity, joined with ", ".
    """
    # The first row that still carries a species name is the reference hit
    # (presumably rows are ordered by decreasing similarity; verify against
    # the BOLDigger output).
    anchor_idx = hits["Species"].first_valid_index()
    if anchor_idx is None:
        return "No species identified"
    anchor_species = hits["Species"][anchor_idx]
    # Highest similarity reported for that reference species
    same_species = hits["Species"] == anchor_species
    best_similarity = hits.loc[same_species, "Similarity"].max()
    # Keep every complete row with that similarity, since several different
    # species can tie at the same value
    tied_hits = hits.loc[hits["Similarity"] == best_similarity].dropna()
    # If multiple species, concatenate them into one string
    return ", ".join(tied_hits["Species"].drop_duplicates())


# One entry per distinct query, in order of first appearance
esv_lowest_tax = [
    _closest_species(df.loc[df["You searched for"] == esv, ["Species", "Similarity"]])
    for esv in df["You searched for"].drop_duplicates()
]
# Add info. The list is assigned positionally, so it needs one entry per row
# of df_filtered, in the same order (assumes the filtered sheet lists the
# queries in the same order as the hits sheet; TODO confirm).
df_filtered["closest_species"] = esv_lowest_tax
# Polish df: build the full species name from genus + species and collapse
# the doubled "No Match" label that this concatenation produces
genus_species = df_filtered["Genus"] + " " + df_filtered["Species"]
df_filtered["Species"] = genus_species.replace("No Match No Match", "No Match")
# Add info for lowest taxon (row-wise)
df_filtered["lowest_taxon"] = df_filtered.apply(find_lowest_taxon, axis=1)
# Add info for lowest rank (row-wise)
df_filtered["lowest_rank"] = df_filtered.apply(find_lowest_rank, axis=1)
# Save under the input file name with an added suffix, without the index column
outfile = BOLDigger_file.replace(".xlsx", "_with_additional_info.xlsx")
df_filtered.to_excel(outfile, index=False)