-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathplot_parsers.R
More file actions
130 lines (121 loc) · 4.85 KB
/
plot_parsers.R
File metadata and controls
130 lines (121 loc) · 4.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#' Parse a list of xyline plot datasets
#'
#' Each dataset's points are stacked into a two-column (`x`, `y`) tibble, which
#' is then nested so that the whole line occupies a single cell.
#' @param datasets A list of datasets originating from a MultiQC plot. Each
#'   element is read through its `name` and `data` entries; every element of
#'   `data` is a point whose first item is the x value and second the y value.
#' @param plot_name Name of the plot. The nested column is called
#'   `plot.<plot_name>`.
#' @return The result of `kv_map()` over `datasets`, keyed by each dataset's
#'   `name` (presumably a list named by sample; confirm against `kv_map()`).
#'   Each value is a data frame with one column.
#' @keywords internal
map_xy_line_datasets <- function(datasets, plot_name) {
  # Builds the key/value entry for a single line of the plot
  to_entry <- function(dataset) {
    points <- purrr::map(dataset$data, function(point) {
      tibble::tibble_row(x = point[[1]], y = point[[2]])
    })
    coords <- purrr::list_rbind(points)
    # Collapse the multi-row data frame into one row holding a nested tibble
    nested <- tidyr::nest(
      coords,
      .key = stringr::str_c("plot", plot_name, sep = ".")
    )
    list(key = dataset$name, value = nested)
  }

  kv_map(datasets, to_entry)
}
#' Determine if a plot is Plotly
#'
#' A plot is treated as Plotly when its top level data has a `layout` entry.
#' Plotly plots are generated by MultiQC 1.2 and above, HighCharts plots by
#' earlier versions. This matters because the plot data is structured
#' differently in the two formats.
#' @keywords internal
#' @param plot_data A list containing the top level data for a single plot.
#' @return A logical scalar. TRUE if the plot is a plotly plot, or FALSE if it's a HighCharts one.
is_plotly <- function(plot_data) {
  # NOTE(review): the "1.2" above may have been meant as MultiQC 1.20 - confirm.
  # `[[` matches the name exactly. `$` partially matches list names, so a key
  # such as `layout_config` would otherwise be mistaken for `layout`.
  !is.null(plot_data[["layout"]])
}
#' Parse the JSON dictionary for an xyline plot
#'
#' Converts the data of a single xyline plot into a list of data frames, one
#' per line in the plot.
#' @keywords internal
#' @import rlang
#' @keywords plot_parser
#' @param plot_data A list containing the top level data for a single plot.
#'   Its `plot_type` must be `"xy_line"`.
#' @param name Name of the plot, used to build the `plot.<name>` column name.
#' @return A flattened list of data frames, one for each sample.
#' Each data frame has a single column, `plot.<name>`, whose cell holds a
#' nested data frame with two columns: x, and y.
#' These correspond to the x and y coordinates in the plot.
#' For example, for histogram data, the x values are values of the random
#' variable, and the y values are the number of counts for that value.
parse_xyline_plot <- function(plot_data, name) {
  # This only works on xyline plots
  assertthat::assert_that(plot_data$plot_type == "xy_line")

  line_groups <- if (is_plotly(plot_data)) {
    # MultiQC >=1.2 plotly format: the lines live under each dataset's `lines`
    purrr::map(plot_data$datasets, function(dataset) dataset$lines)
  } else {
    # MultiQC <=1.1 highcharts format: each dataset is itself a list of lines
    plot_data$datasets
  }

  # Either way there are two levels of nesting: parse every group, then
  # flatten the per-group results into a single list
  parsed_groups <- purrr::map(
    line_groups,
    map_xy_line_datasets,
    plot_name = name
  )
  purrr::list_flatten(parsed_groups)
}
#' Parse the JSON dictionary for a bar graph
#'
#' Converts the data of a single bar graph into a named list of data frames,
#' one per sample. Only bar graphs with exactly one dataset are supported.
#' @keywords internal
#' @import rlang
#' @keywords plot_parser
#' @param plot_data A list containing the top level data for a single plot.
#'   Its `plot_type` must be `"bar_graph"`.
#' @param name Name of the plot. After sanitising, it is used to build the
#'   `plot.<name>` column name.
#' @return A list of data frames, one for each sample.
#' Each data frame has a single column, `plot.<name>`, whose cell holds a
#' nested one-row data frame with one column per category in the bar chart.
#' For example, for the plot "SnpEff: Counts by Genomic Region", we will have
#' one column for the number of intron variants, one column for the number of exon variants, etc.
#' This means that the number of nested columns will be fairly variable for different plots.
parse_bar_graph <- function(plot_data, name) {
  assertthat::assert_that(plot_data$plot_type == "bar_graph")
  assertthat::assert_that(
    length(plot_data$datasets) == 1,
    msg = "Only bar graphs with 1 dataset are understood by this parser!"
  )
  nested_col <- stringr::str_c("plot", sanitise_column_name(name), sep = ".")

  if (is_plotly(plot_data)) {
    # MultiQC 1.2+
    bars <- plot_data$datasets[[1]]
    sample_names <- purrr::flatten_chr(bars$samples)

    # Build a data frame whose rows are samples and whose columns are
    # categories. Ideally this would be the final output, but currently the
    # other code expects a list of samples.
    category_cols <- purrr::map(bars$cats, function(category) {
      tibble::as_tibble_col(
        purrr::flatten_dbl(category$data),
        column_name = sanitise_column_name(category$name)
      )
    })
    wide <- purrr::list_cbind(category_cols)

    # For compatibility with the old format
    if ("unknown" %in% colnames(wide)) {
      wide <- dplyr::rename(wide, none = unknown)
    }

    # Slice out each row so that every sample becomes its own list element
    per_sample <- purrr::map(seq_along(sample_names), function(row_idx) {
      tidyr::nest(wide[row_idx, ], .key = nested_col)
    })
    per_sample <- purrr::set_names(per_sample, sample_names)
    # Return the samples ordered by name
    per_sample[sort(sample_names)]
  } else {
    # Make a list of samples
    sample_names <- purrr::flatten_chr(plot_data$samples[[1]])

    # First, build up a dictionary of samples -> dictionary of quality
    # metrics, one such dictionary per segment of the bar chart
    per_segment <- purrr::map(plot_data$datasets[[1]], function(segment) {
      segment_name <- segment$name
      # For this segment, each sample has a value
      kv_map(segment$data, function(value, idx) {
        list(
          key = sample_names[[idx]],
          value = purrr::set_names(
            list(value),
            sanitise_column_name(segment_name)
          )
        )
      }, map_keys = TRUE)
    })
    merged <- purrr::reduce(per_segment, utils::modifyList)

    # Then, convert each inner dictionary to a tibble row
    sample_rows <- purrr::map(merged, tibble::as_tibble_row)
    # And nest each df so that we only have 1 cell of output per sample
    purrr::map(sample_rows, function(sample_row) {
      tidyr::nest(sample_row, .key = nested_col)
    })
  }
}