-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathesv_table_taxa_abundance_analysis.py
More file actions
executable file
·75 lines (64 loc) · 2.34 KB
/
esv_table_taxa_abundance_analysis.py
File metadata and controls
executable file
·75 lines (64 loc) · 2.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
"""
Generates read number bar graphs for taxa in ESV tables.
By Chris Hempel (christopher.hempel@kaust.edu.sa) on Jun 27 2023
"""
import pandas as pd
import plotly.express as px
# Options
file = "/Users/christopherhempel/Desktop/RSDE COI water project/second_run_with_dnoise_swarm/rsde-coi-water_final-OTU-table_coil-filtered.xlsx"
# Is the file from Boldigger?
boldigger = True
# Gibve tha dataset a name for plotting title
dataset = "RSDE-COI-water"
# Specify string shared by all sample names to extract read abudnances
sample_abbrev = "RSDE"
graph_width = 1200
# Max number of taxa in plot so that names are readable
num_taxa_barplots = 50
# Import ESV table
df = pd.read_excel(file) if ".xlsx" in file else pd.read_csv(file)
# Select sample columns
sample_cols = [col for col in df.columns if sample_abbrev in col]
# Calculate number of reads across samples
readsums = df[sample_cols].sum(axis=1)
# Plot read abundances of taxa across ranks
if boldigger:
ranks = ["Phylum", "Class", "Order", "Family", "Genus", "Species"]
else:
ranks = ["superkingdom", "phylum", "class", "order", "family", "genus", "species"]
## Loop over ranks
for rank in ranks:
## Group taxa and sum up number of reads
rank_df = pd.DataFrame({rank: df[rank], "readsum": readsums})
grouped_df = (
rank_df.groupby([rank])
.sum()
.sort_values(["readsum"], ascending=False)[:num_taxa_barplots]
)
# Turn to proportions
grouped_df = grouped_df / grouped_df.sum()
## Define title
if len(grouped_df) >= num_taxa_barplots:
barplot_title = f"{dataset}: {rank} ({num_taxa_barplots} most abundant taxa)"
else:
barplot_title = f"{dataset}: {rank}"
## Plot
fig = px.bar(
grouped_df,
labels={"value": "Number of reads [%]", rank: "Taxa"},
title=barplot_title,
width=graph_width,
)
fig.update_layout(showlegend=False)
fig.show()
# Plot resolution as proportion of ranks among all OTUs/ESVs (requires column lowest_rank in df)
resolution_df = df["lowest_rank"].fillna("No match").value_counts() / len(df) * 100
resolution_df = resolution_df.reindex(ranks + ["No match"]).round(1)
fig = px.bar(
resolution_df,
labels={"value": "Proportion [%]", "lowest_rank": "Rank"},
title="Taxonomic resolution",
)
fig.update_yaxes(range=[0, 100])
fig.update_layout(showlegend=False)
fig.show()