TidyMultiqc/R/plot_parsers.R at de0078e8ceeda3e75b27a2edc93cc880c57cf8de · multimeric/TidyMultiqc · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#' Convert a list of xyline series into nested per-sample data frames
#'
#' @param datasets A list of series. Each element is read through its `name`
#'  and `data` entries, which originate from a MultiQC plot. Every entry of
#'  `data` is read as a pair: first element x, second element y.
#' @param plot_name A string joined onto "plot" (separated by ".") to name
#'  the nested column.
#' @return A list whose names are sample names. Each value is a data frame
#'  with one column, in which the x/y table for that sample is nested.
#' @keywords internal
map_xy_line_datasets <- function(datasets, plot_name) {
  kv_map(datasets, function(series) {
    # One tibble row per (x, y) point, stacked into a single table
    point_rows <- purrr::map(series$data, function(point) {
      tibble::tibble_row(x = point[[1]], y = point[[2]])
    })
    xy_table <- purrr::list_rbind(point_rows)

    # Collapse the multi-row table into a single cell so that each sample
    # contributes exactly one row
    nested <- tidyr::nest(
      xy_table,
      .key = stringr::str_c("plot", plot_name, sep = ".")
    )

    list(key = series$name, value = nested)
  })
}

#' Determine if a plot is Plotly
#'
#' MultiQC reports store plots in one of two formats, Plotly or HighCharts,
#' and the plot data is laid out differently in each. A plot is treated as
#' Plotly when its top level data has a `layout` entry.
#' @keywords internal
#' @param plot_data A list containing the top level data for a single plot.
#' @return A logical scalar. TRUE if the plot is a plotly plot, or FALSE if
#'  it's a HighCharts one.
is_plotly <- function(plot_data) {
  # NOTE(review): earlier comments in this file give the Plotly cut-over as
  # MultiQC "1.2"; this is probably release 1.20 -- confirm against the
  # MultiQC changelog.
  layout <- plot_data$layout
  !is.null(layout)
}

#' Parse the JSON dictionary of an xyline plot into per-sample data frames
#'
#' Handles both plot formats: for Plotly plots the series are read from the
#' `lines` entry of each dataset, for HighCharts plots each dataset is itself
#' the list of series.
#' @param plot_data A list containing the top level data for a single plot.
#'  Its `plot_type` must be "xy_line".
#' @param name The plot name, used to build the name of the nested column.
#' @keywords internal plot_parser
#' @import rlang
#' @return A list of data frames, one for each sample.
#' Each data frame will have two columns: x, and y.
#' These correspond to the x and y coordinates in the plot.
#' For example, for histogram data, the x values are values of the random
#' variable, and the y values are the number of counts for that value.
parse_xyline_plot <- function(plot_data, name) {
  # Refuse anything that is not an xyline plot
  assertthat::assert_that(plot_data$plot_type == "xy_line")

  if (is_plotly(plot_data)) {
    # Plotly format: the series sit under `lines` in each dataset
    per_dataset <- purrr::map(plot_data$datasets, function(dataset) {
      map_xy_line_datasets(dataset$lines, plot_name = name)
    })
  } else {
    # HighCharts format: each dataset is directly a list of series
    per_dataset <- purrr::map(plot_data$datasets, function(series_list) {
      map_xy_line_datasets(series_list, plot_name = name)
    })
  }

  # Either way there are two levels of nesting; remove the outer one
  purrr::list_flatten(per_dataset)
}

#' Parse the JSON dictionary of a bar graph into per-sample data frames
#'
#' Only bar graphs with exactly one dataset are supported. Both plot formats
#' (Plotly and HighCharts) are handled.
#' @param plot_data A list containing the top level data for a single plot.
#'  Its `plot_type` must be "bar_graph".
#' @param name The plot name. It is sanitised and joined onto "plot"
#'  (separated by ".") to name the nested column.
#' @keywords internal plot_parser
#' @import rlang
#' @return A list of data frames, one for each sample.
#' Each data frame is a single nested cell whose inner table has one column
#' corresponding to each category in the bar chart.
#' For example, for the plot "SnpEff: Counts by Genomic Region", we will have
#' one column for the number of intron variants, one column for the number of
#' exon variants, etc.
#' This means that the number of columns will be fairly variable for
#' different plots.
parse_bar_graph <- function(plot_data, name) {
  assertthat::assert_that(plot_data$plot_type == "bar_graph")
  assertthat::assert_that(
    length(plot_data$datasets) == 1,
    msg = "Only bar graphs with 1 dataset are understood by this parser!"
  )

  nested_key <- stringr::str_c("plot", sanitise_column_name(name), sep = ".")

  if (is_plotly(plot_data)) {
    # Plotly format
    only_dataset <- plot_data$datasets[[1]]
    sample_names <- purrr::flatten_chr(only_dataset$samples)

    # Build a table with one row per sample and one column per category.
    # Ideally this would be the final output, but the rest of the code
    # expects a list keyed by sample.
    category_cols <- purrr::map(only_dataset$cats, function(category) {
      tibble::as_tibble_col(
        purrr::flatten_dbl(category$data),
        column_name = sanitise_column_name(category$name)
      )
    })
    wide <- purrr::list_cbind(category_cols)

    # The old format calls this category "none"; keep that name
    if ("unknown" %in% colnames(wide)) {
      wide <- dplyr::rename(wide, none = unknown)
    }

    # Cut the table into one nested cell per sample, then return the
    # samples in sorted name order
    per_sample <- purrr::map(seq_along(sample_names), function(row_idx) {
      tidyr::nest(wide[row_idx, ], .key = nested_key)
    })
    per_sample <- purrr::set_names(per_sample, sample_names)
    per_sample[sort(sample_names)]
  } else {
    # HighCharts format
    sample_names <- purrr::flatten_chr(plot_data$samples[[1]])

    # For every segment, produce a sample -> (segment = value) dictionary
    per_segment <- purrr::map(plot_data$datasets[[1]], function(segment) {
      segment_name <- segment$name
      kv_map(segment$data, function(value, idx) {
        # Each position in the segment's data belongs to one sample
        list(
          key = sample_names[[idx]],
          value = purrr::set_names(
            list(value),
            sanitise_column_name(segment_name)
          )
        )
      }, map_keys = TRUE)
    })

    # Merge the per-segment dictionaries into one dictionary per sample
    per_sample <- purrr::reduce(per_segment, utils::modifyList)

    # Turn every sample's dictionary into a tibble row ...
    sample_rows <- purrr::map(per_sample, tibble::as_tibble_row)

    # ... and nest it so each sample ends up as a single cell
    purrr::map(sample_rows, function(sample_row) {
      tidyr::nest(sample_row, .key = nested_key)
    })
  }
}