ftmsRanalysis/R/as.ftmsData.R at ed732fea6f7a7cb7d6eae64c5bfc624aa3b39892 · EMSL-Computing/ftmsRanalysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
#' Convert Data to peakData Class
#'
#' Converts a list object or several data.frames of FT-MS data to an object of the class 'peakData'. Objects of the class 'peakData' are lists with three obligatory components \code{e_data}, \code{f_data}, and \code{e_meta}.
#'
#' @param e_data a \eqn{p \times n + 1} data.frame of expression data, where \eqn{p} is the number of observed peaks and \eqn{n} is the number of samples. Each row corresponds to data for each peak. One column specifying a unique identifier for each peak/mass (row) must be present.
#' @param f_data a data.frame with \eqn{n} rows. Each row corresponds to a sample with one column giving the unique sample identifiers found in e_data column names and other columns providing qualitative and/or quantitative traits of each sample.
#' @param e_meta a data.frame with \eqn{p} rows. Each row corresponds to a peak/mass with one column giving a unique peak/identifier (must be named the same as the column in \code{e_data}) and other columns giving meta information. At a minimum a column giving the mass of each peak and a column giving molecular formulae or columns giving elemental counts must be present.
#' @param edata_cname character string specifying the name of the column containing a unique identifier for each peak/mass in \code{e_data} and \code{e_meta}.
#' @param fdata_cname character string specifying the name of the column containing the sample identifiers in \code{f_data}.
#' @param mass_cname character string specifying the name of the column containing the peak/mass identifiers in \code{e_meta}. Note: this is often the same as \code{edata_cname} for cases where mass is used as a unique identifier.
#' @param ... further arguments
#'
#' @details Objects of class 'peakData' contain some attributes that are referenced by downstream functions. These attributes must be specified (or added using available functions) to reference downstream functions for: Kendrick plots, Van Krevelen plots, and functions involving databases.
#'
#' If your data contains information about isotopic peaks (e.g. C13), you should specify the attribute \code{isotopic_cname} which gives the column in \code{e_meta} that contains an indicator of yes/no for each peak. Additionally, you must specify the attribute \code{isotopic_notation} which is a character string indicating the value in column \code{isotopic_cname} which indicates that a peak is isotopic.
#' Currently, any peaks that are isotopic are removed from the dataset, as available methods (e.g. Van Krevelen plot) are not applicable to these peaks.
#'
#' Attributes giving general information about the data object:
#' \tabular{ll}{
#' data_scale \tab character string giving the scale that the data is on. Valid options include 'log2', 'log10', 'log' (for natural log), 'pres' (for 0/1 presence/absence data), and 'abundance'. Default value is 'abundance'. \cr
#' \tab \cr
#' instrument_type \tab character string giving the type of FT-MS instrument data was generated by. Valid options are: "12T" and "21T". Defaults to "12T". This information is used to determine appropriate plotting functions for Van Krevelen, Kendrick, etc. plots. \cr
#' }
#' Attributes giving extra information in \code{f_data}:
#' \tabular{ll}{
#' extraction_cname \tab character string specifying the name of the column, in \code{f_data}, containing information as to what extraction method was used for a sample. Only necessary if \code{e_data} contains samples from multiple extraction methods. \cr
#' }
#' Attributes giving extra information in \code{e_meta}:
#' \tabular{ll}{
#' mass_cname \tab character string specifying the name of the column, in \code{e_meta}, containing the mass information for each peak. \cr
#' \tab \cr
#' mf_cname \tab character string specifying the name of the column, in \code{e_meta}, containing the mass (empirical) formula for a peak/mass. \cr
#' \tab \cr
#' c_cname \tab character string specifying the name of the column, in \code{e_meta}, containing the Carbon count for each peak/mass. \cr
#' \tab \cr
#' h_cname \tab character string specifying the name of the column, in \code{e_meta}, containing the Hydrogen count for each peak/mass. \cr
#' \tab \cr
#' o_cname \tab character string specifying the name of the column, in \code{e_meta}, containing the Oxygen count for each peak/mass. \cr
#' \tab \cr
#' n_cname \tab character string specifying the name of the column, in \code{e_meta}, containing the Nitrogen count for each peak/mass. \cr
#' \tab \cr
#' s_cname \tab character string specifying the name of the column, in \code{e_meta}, containing the Sulfur count for each peak/mass. \cr
#' \tab \cr
#' p_cname \tab character string specifying the name of the column, in \code{e_meta}, containing the Phosphorus count for each peak/mass. \cr
#' \tab \cr
#' isotopic_cname \tab character string specifying the name of the column, in \code{e_meta}, containing information about whether each peak is isotopic or not. \cr
#' \tab \cr
#' isotopic_notation \tab character string specifying the value used in column \code{isotopic_cname} which indicates that a peak is isotopic. \cr
#' \tab \cr
#' o2c_cname \tab character string specifying the name of the column, in \code{e_meta}, containing the Oxygen to Carbon ratio for each peak/mass. \cr
#' \tab \cr
#' h2c_cname \tab character string specifying the name of the column, in \code{e_meta}, containing the Hydrogen to Carbon ratio for each peak/mass. \cr
#' \tab \cr
#' kmass_cname \tab a possibly named character vector specifying the name of the columns, in \code{e_meta}, containing the Kendrick Mass for each peak/mass.  Names should be any of 'CH2', 'CO2', 'H2', 'H2O', 'CHO' and correspond to the base compounds used to calculate each of the Kendrick Masses \cr
#' \tab \cr
#' kdefect_cname \tab a possibly named character vector specifying the name of the column, in \code{e_meta}, containing the Kendrick Defect for each peak/mass. Names should be any of 'CH2', 'CO2', 'H2', 'H2O', 'CHO' and correspond to the base compounds used to calculate each of the Kendrick Masses\cr
#' \tab \cr
#' nosc_cname \tab character string specifying the name of the column, in \code{e_meta}, containing the NOSC value for each peak/mass \cr
#' \tab \cr
#' gfe_cname \tab character string specifying the name of the column, in \code{e_meta}, containing the Gibbs Free Energy value for each peak/mass \cr
#' \tab \cr
#' mfname_cname \tab character string specifying the name of the column, in \code{e_meta}, containing the name/description for each peak/mass \cr
#' \tab \cr
#' aroma_cname \tab character string specifying the name of the column, in \code{e_meta}, containing the aromaticity value for each peak/mass \cr
#' \tab \cr
#' modaroma_cname \tab character string specifying the name of the column, in \code{e_meta}, containing the modified aromaticity value for each peak/mass \cr
#' \tab \cr
#' dbe_cname \tab character string specifying the name of the column, in \code{e_meta}, containing the double-bond equivalent values for each peak/mass \cr
#' \tab \cr
#' dbeo_cname \tab character string specifying the name of the column, in \code{e_meta}, containing the double-bond equivalent minus oxygen value for each peak/mass \cr
#' \tab \cr
#' dbeai_cname \tab character string specifying the name of the column, in \code{e_meta}, containing the double-bond equivalent aromaticity index value for each peak/mass \cr
#' \tab \cr
#' elcomp_cname \tab character string specifying the name of the column, in \code{e_meta}, containing the general elemental composition of each peak/mass \cr
#' \tab \cr
#' check_rows \tab logical indicating whether to remove peaks with no nonzero entries.  Defaults to FALSE \cr
#' }
#'
#'@author Lisa Bramer
#'
#'@export

as.peakData <- function(e_data, f_data, e_meta, edata_cname, fdata_cname, mass_cname, ...){
  # Public entry point: hand the six required arguments (matched by name)
  # and any optional attributes supplied through `...` to the internal
  # worker, then return whatever it builds.
  peak_obj <- .as.peakData(e_data = e_data, f_data = f_data, e_meta = e_meta,
                           edata_cname = edata_cname, fdata_cname = fdata_cname,
                           mass_cname = mass_cname, ...)
  return(peak_obj)
}

.as.peakData <- function(e_data, f_data, e_meta, edata_cname, fdata_cname, mass_cname,
                        extraction_cname = NULL, mf_cname = NULL, c_cname = NULL, h_cname = NULL,
                        o_cname = NULL, n_cname = NULL, s_cname = NULL, p_cname = NULL,
                        isotopic_cname = NULL, isotopic_notation = NULL, o2c_cname = NULL, h2c_cname = NULL, kmass_cname = NULL,
                        kdefect_cname = NULL, nosc_cname = NULL, gfe_cname = NULL, mfname_cname = NULL,
                        aroma_cname = NULL, modaroma_cname = NULL, dbe_cname = NULL, dbeo_cname = NULL, dbeai_cname = NULL,
                        elcomp_cname = NULL, instrument_type = "12T", data_scale = "abundance", check_rows = FALSE){

  # Internal worker behind as.peakData().
  #
  # Validates the three input tables and every supplied column-name argument,
  # drops f_data rows / e_meta rows that have no counterpart in e_data, removes
  # isotopic peaks (when isotopic_cname is given), and returns a list with
  # components e_data, f_data and e_meta of class c("peakData", "ftmsData")
  # carrying the attributes "cnames" and "data_info".
  #
  # Stops with an error when a required column is missing, an identifier column
  # is non-unique, instrument_type is invalid, or samples/peaks in e_data are
  # not described in f_data/e_meta. Warns when extra rows are removed from
  # f_data or e_meta.

  # make sure e_data, f_data, and e_meta are data.frames #
  e_data <- as.data.frame(e_data)
  f_data <- as.data.frame(f_data)
  e_meta <- as.data.frame(e_meta)

  # helper: stop if any of the (possibly several) column names in `cname` is
  # absent from `data`; NULL means "not supplied" and is skipped #
  check_column <- function(cname, label, data, data_name) {
    if (!is.null(cname) && !all(cname %in% names(data))) {
      absent <- paste(setdiff(cname, names(data)), collapse = ", ")
      stop(paste0(label, " column ", absent, " not found in ", data_name,
                  ". See details of as.peakData for specifying column names."))
    }
  }

  # helper: validate the optional names of a Kendrick cname vector #
  check_kendrick_names <- function(cname, arg_name) {
    valid_bases <- c('CH2', 'CO2', 'H2', 'H2O', 'CHO', '')
    if (!is.null(cname) && !is.null(names(cname))) {
      if (!all(names(cname) %in% valid_bases)) {
        stop(paste0("Names (not the values) of ", arg_name, " must be in 'CH2', 'CO2', 'H2', 'H2O', 'CHO'"))
      }
    }
  }

  # check that the peak column exists in e_data and e_meta #
  if (!(edata_cname %in% names(e_data))) stop(paste0("Peak/Mass column ", edata_cname, " not found in e_data. See details of as.peakData for specifying column names."))
  if (!(edata_cname %in% names(e_meta))) stop(paste0("Peak/Mass column ", edata_cname, " not found in e_meta. Column names for peak/mass identifiers must match for e_data and e_meta. See details of as.peakData for specifying column names."))

  # check that the Sample column name is in f_data column names; this must
  # happen before the column is read for the uniqueness check below #
  if (!(fdata_cname %in% names(f_data))) stop(paste0("Sample column ", fdata_cname, " not found in f_data. See details of as.peakData for specifying column names."))

  # check that e_data has unique rows #
  if (anyDuplicated(e_data[[edata_cname]]) > 0) stop("The 'edata_cname' identifier is non-unique.")

  # check that f_data has unique rows #
  if (anyDuplicated(f_data[[fdata_cname]]) > 0) stop("The 'fdata_cname' identifier is non-unique.")

  # check that instrument_type is a valid string #
  if (!(instrument_type %in% c("21T", "12T"))) stop("Instrument type is not valid. See details of as.peakData for valid options.")

  # check that mass_cname is not null #
  if (is.null(mass_cname)) stop("'mass_cname' must be specified")

  # check that either mf_cname or elemental cnames are not null #
  if (is.null(mf_cname) && (is.null(c_cname) || is.null(h_cname))) {
    stop("Either 'mf_cname' or both 'c_cname' and 'h_cname' must be specified")
  }

  # check that cname arguments are found #
  check_column(mass_cname, "Mass", e_meta, "e_meta")
  check_column(c_cname, "Carbon", e_meta, "e_meta")
  check_column(h_cname, "Hydrogen", e_meta, "e_meta")
  check_column(o_cname, "Oxygen", e_meta, "e_meta")
  check_column(n_cname, "Nitrogen", e_meta, "e_meta")
  check_column(s_cname, "Sulfur", e_meta, "e_meta")
  check_column(p_cname, "Phosphorus", e_meta, "e_meta")
  check_column(isotopic_cname, "Isotopic", e_meta, "e_meta")
  check_column(o2c_cname, "Oxygen:Carbon Ratio", e_meta, "e_meta")
  check_column(h2c_cname, "Hydrogen:Carbon Ratio", e_meta, "e_meta")
  check_kendrick_names(kmass_cname, "kmass_cname")
  check_column(kmass_cname, "Kendrick Mass", e_meta, "e_meta")
  check_kendrick_names(kdefect_cname, "kdefect_cname")
  check_column(kdefect_cname, "Kendrick Defect", e_meta, "e_meta")
  check_column(nosc_cname, "NOSC", e_meta, "e_meta")
  check_column(gfe_cname, "Gibbs Free Energy", e_meta, "e_meta")
  check_column(mfname_cname, "Molecular formula name/description", e_meta, "e_meta")
  check_column(aroma_cname, "Aromaticity", e_meta, "e_meta")
  check_column(modaroma_cname, "Modified aromaticity", e_meta, "e_meta")
  check_column(mf_cname, "Molecular formula", e_meta, "e_meta")
  check_column(dbe_cname, "Double-bond equivalent", e_meta, "e_meta")
  check_column(dbeo_cname, "Double-bond equivalent (minus Oxygen)", e_meta, "e_meta")
  check_column(dbeai_cname, "Double-bond equivalent (aromaticity index)", e_meta, "e_meta")
  check_column(elcomp_cname, "Elemental composition", e_meta, "e_meta")
  check_column(extraction_cname, "Extraction Method", f_data, "f_data")

  # check that all samples in e_data are present in f_data; work on the name
  # vector directly so a single sample column is not lost to drop = TRUE #
  sample_names <- names(e_data)[names(e_data) != edata_cname]
  samps_miss <- sum(!(sample_names %in% f_data[[fdata_cname]]))
  if (samps_miss > 0) stop(paste0(samps_miss, " samples from e_data not found in f_data"))

  # check for any extra samples in f_data not in e_data - necessary to remove before group_designation function #
  fdata_keep <- f_data[[fdata_cname]] %in% names(e_data)
  if (!all(fdata_keep)) {
    f_data <- f_data[fdata_keep, , drop = FALSE]
    warning("Extra samples were found in f_data that were not in e_data. These have been removed from f_data.")
  }

  # check that f_data has at least 2 columns #
  if (ncol(f_data) < 2) stop("f_data must contain at least 2 columns")

  # check that all peaks in e_data occur in e_meta #
  if (!all(e_data[[edata_cname]] %in% e_meta[[edata_cname]])) stop("Not all peaks in e_data are present in e_meta")

  # remove any extra features that were provided in e_meta #
  emeta_keep <- e_meta[[edata_cname]] %in% e_data[[edata_cname]]
  if (!all(emeta_keep)) {
    e_meta <- e_meta[emeta_keep, , drop = FALSE]
    warning("Extra peaks were found in e_meta that were not in e_data. These have been removed from e_meta.")
  }

  # convert the unique identifier column to character, to avoid factors #
  e_data[[edata_cname]] <- as.character(e_data[[edata_cname]])
  e_meta[[edata_cname]] <- as.character(e_meta[[edata_cname]])

  # if isotopic_cname is not NULL #
  if (!is.null(isotopic_cname)) {
    # isotopic_notation is required to know which value flags an isotopic peak #
    if (is.null(isotopic_notation)) stop("You must specify 'isotopic_notation', if 'isotopic_cname' is non-NULL")

    # filter out peaks flagged as isotopic; which() discards NA comparisons #
    iso_rows <- which(as.character(e_meta[[isotopic_cname]]) == isotopic_notation)
    ids_rmv <- e_meta[[edata_cname]][iso_rows]
    if (length(ids_rmv) > 0) {
      # logical indexing: unlike x[-which(cond), ], it is safe when nothing matches #
      e_meta <- e_meta[!(e_meta[[edata_cname]] %in% ids_rmv), , drop = FALSE]
      e_data <- e_data[!(e_data[[edata_cname]] %in% ids_rmv), , drop = FALSE]
    }
  }

  # helper: a column name based on `base` that does not clash with the
  # current columns of e_meta #
  fresh_colname <- function(base) {
    candidates <- make.unique(c(colnames(e_meta), base))
    candidates[length(candidates)]
  }

  # if mf_cname is NULL and any of o_cname, n_cname, s_cname or p_cname are NULL, create columns of zeroes for them #
  if (is.null(mf_cname) && is.null(o_cname)) {
    o_cname <- fresh_colname("O")
    e_meta[[o_cname]] <- rep(0, nrow(e_meta))
  }
  if (is.null(mf_cname) && is.null(n_cname)) {
    n_cname <- fresh_colname("N")
    e_meta[[n_cname]] <- rep(0, nrow(e_meta))
  }
  if (is.null(mf_cname) && is.null(s_cname)) {
    s_cname <- fresh_colname("S")
    e_meta[[s_cname]] <- rep(0, nrow(e_meta))
  }
  if (is.null(mf_cname) && is.null(p_cname)) {
    p_cname <- fresh_colname("P")
    e_meta[[p_cname]] <- rep(0, nrow(e_meta))
  }

  # store results #
  res <- list(e_data = e_data, f_data = f_data, e_meta = e_meta)

  # set column name attributes #
  attr(res, "cnames") <- list(edata_cname = edata_cname, fdata_cname = fdata_cname, mass_cname = mass_cname,
                              extraction_cname = extraction_cname, mf_cname = mf_cname,
                              c_cname = c_cname, h_cname = h_cname, o_cname = o_cname, n_cname = n_cname,
                              s_cname = s_cname, p_cname = p_cname, isotopic_cname = isotopic_cname,
                              o2c_cname = o2c_cname,
                              h2c_cname = h2c_cname, kmass_cname = kmass_cname, kdefect_cname = kdefect_cname,
                              nosc_cname = nosc_cname, gfe_cname = gfe_cname, mfname_cname = mfname_cname,
                              aroma_cname = aroma_cname, modaroma_cname = modaroma_cname, dbe_cname = dbe_cname,
                              dbeo_cname = dbeo_cname, dbeai_cname = dbeai_cname, elcomp_cname = elcomp_cname
                              )

  attr(res, "data_info") <- list(data_scale = data_scale, instrument_type = instrument_type)

  # set group dataframe attribute to NULL, will be filled in after running group_designation function #
  attr(res, "group_DF") <- NULL

  # set class of list #
  class(res) <- c("peakData", "ftmsData")

  # check for empty rows and remove them with molfilt
  if (check_rows) {
    molfilt <- molecule_filter(res)
    if (any(molfilt$Num_Observations == 0)) res <- applyFilt(molfilt, res, min_num = 1)
  }

  # set filters attributes (clears anything recorded by the row check above) #
  attr(res, "filters") <- NULL

  elemental_cnames <- list(c_cname, h_cname, o_cname, n_cname, s_cname, p_cname)
  elemental_is_null <- vapply(elemental_cnames, is.null, logical(1))

  # if mf_cname is NULL and elemental columns are non-NULL, construct formulae #
  if (is.null(mf_cname) && !any(elemental_is_null)) {
    res <- assign_mf(res)
  }

  # if mf_cname is non-NULL and elemental columns are NULL, parse formulae #
  if (!is.null(mf_cname) && all(elemental_is_null)) {
    res <- parse_mf(res)
  }

  return(res)

}


#' Convert Data to compoundData Class
#'
#' Converts a list object or several data.frames of FT-MS data to an object of the class 'compoundData'. Objects of the class 'compoundData' are lists with three obligatory components \code{e_data}, \code{f_data}, and \code{e_meta}.
#'
#' @param e_data a \eqn{p \times n + 1} data.frame of expression data, where \eqn{p} is the number of observed compounds and \eqn{n} is the number of samples. Each row corresponds to data for each peak. One column specifying a unique identifier for each peak/mass (row) must be present.
#' @param f_data a data.frame with \eqn{n} rows. Each row corresponds to a sample with one column giving the unique sample identifiers found in e_data column names and other columns providing qualitative and/or quantitative traits of each sample.
#' @param e_meta a data.frame with \eqn{p} rows. Each row corresponds to a compound with one column giving a unique peak/identifier (must be named the same as the column in \code{e_data}) and other columns giving meta information. At a minimum a column giving the mass of each peak and a column giving molecular formulae or columns giving elemental counts must be present.
#' @param edata_cname character string specifying the name of the column containing a unique identifier for each peak/mass in \code{e_data} and \code{e_meta}.
#' @param fdata_cname character string specifying the name of the column containing the sample identifiers in \code{f_data}.
#' @param mass_cname character string specifying the name of the column containing the peak/mass identifiers in \code{e_meta}. Note: this is often the same as \code{edata_cname} for cases where mass is used as a unique identifier.
#' @param compound_cname character string specifying the name of the column containing the compound identifier in \code{e_meta}. This is a compound identifier related to a database (e.g. MetaCyc)
#' @param ... further arguments (see \code{\link{as.peakData}})
#'
#' @details \code{as.compoundData} constructs a compoundData object which is an ftmsData object where the rows of \code{e_data} correspond to compounds.
#' @rdname as.compoundData

as.compoundData <- function(e_data, f_data, e_meta, edata_cname, fdata_cname, mass_cname, compound_cname, ...){
  # Build the object with the peakData constructor (which performs all table
  # and column validation), then re-class it as compoundData.
  res <- as.peakData(e_data, f_data, e_meta, edata_cname, fdata_cname, mass_cname, ...)
  class(res) <- c("compoundData", "ftmsData")

  # if e_meta is provided, check that compound_cname is in it
  if (!is.null(e_meta)) {
    if (!(compound_cname %in% names(e_meta))) {
      stop(paste0("Compound column ", compound_cname, " not found in e_meta."))
    }
  }

  # record the compound identifier column on the object
  res <- setCompoundColName(res, compound_cname)
  return(res)
}

#' Convert Data to reactionData Class
#'
#' Converts a list object or several data.frames of FT-MS data to an object of the class 'reactionData'. Objects of the class 'reactionData' are lists with two obligatory components \code{e_data} and \code{f_data} and one optional component, \code{e_meta}.
#'
#' @param e_data a \eqn{p \times n + 1} data.frame of expression data, where \eqn{p} is the number of observed reactions and \eqn{n} is the number of samples. Each row corresponds to data for each peak. One column specifying a unique identifier for each peak/mass (row) must be present.
#' @param f_data a data.frame with \eqn{n} rows. Each row corresponds to a sample with one column giving the unique sample identifiers found in e_data column names and other columns providing qualitative and/or quantitative traits of each sample.
#' @param e_meta a data.frame with \eqn{p} rows. Each row corresponds to a reaction with one column giving a unique reaction identifier (must be named the same as the column in \code{e_data}) and other columns giving meta information. At a minimum a column giving the mass of each peak and a column giving molecular formulae or columns giving elemental counts must be present.
#' @param edata_cname character string specifying the name of the column containing a unique identifier for each reaction in \code{e_data} and \code{e_meta}.
#' @param fdata_cname character string specifying the name of the column containing the sample identifiers in \code{f_data}.
#' @param reaction_cname character string specifying the name of the column containing the reaction identifiers in \code{e_meta}.
#' @param instrument_type character string giving the type of FT-MS instrument data was generated by. Valid options are: "12T" and "21T". Defaults to "12T". This information is used to determine appropriate plotting functions for Van Krevelen, Kendrick, etc. plots.
#' @param db character string specifying the database from which reaction information is drawn
#' @param ... further arguments
#'
#' @details \code{as.reactionData} constructs a reactionData object which is an ftmsData object where the rows of \code{e_data} correspond to reactions.
#' @rdname as.reactionData
as.reactionData <- function(e_data, f_data, e_meta = NULL, edata_cname, fdata_cname, reaction_cname, instrument_type = "12T", db=NA, ...){

  # Validates the input tables and returns a list with components e_data,
  # f_data and e_meta of class c("reactionData", "ftmsData"), carrying the
  # attributes "cnames", "data_info" and "DB".
  # e_meta is optional: every e_meta check below is skipped when it is NULL.

  has_emeta <- !is.null(e_meta)

  # check that e_data, f_data, and e_meta (if provided) are data.frames #
  if (!inherits(e_data, "data.frame")) stop("e_data must be of the class 'data.frame'")
  if (!inherits(f_data, "data.frame")) stop("f_data must be of the class 'data.frame'")
  if (has_emeta && !inherits(e_meta, "data.frame")) stop("e_meta must be of the class 'data.frame'")

  # check that the identifier column exists in e_data and e_meta (if applicable) #
  if (!(edata_cname %in% names(e_data))) stop(paste0("Column ", edata_cname, " not found in e_data. See details of as.reactionData for specifying column names."))
  if (has_emeta && !(edata_cname %in% names(e_meta))) stop(paste0("Column ", edata_cname, " not found in e_meta. Column names for identifiers must match for e_data and e_meta. See details of as.reactionData for specifying column names."))

  # check that the Sample column name is in f_data column names; this must
  # happen before the column is read for the uniqueness check below #
  if (!(fdata_cname %in% names(f_data))) stop(paste0("Sample column ", fdata_cname, " not found in f_data. See details of as.reactionData for specifying column names."))

  # check that e_data has unique rows #
  if (anyDuplicated(e_data[[edata_cname]]) > 0) stop("The 'edata_cname' identifier is non-unique.")

  # check that f_data has unique rows #
  if (anyDuplicated(f_data[[fdata_cname]]) > 0) stop("The 'fdata_cname' identifier is non-unique.")

  # check that all samples in e_data are present in f_data; work on the name
  # vector directly so a single sample column is not lost to drop = TRUE #
  sample_names <- names(e_data)[names(e_data) != edata_cname]
  samps_miss <- sum(!(sample_names %in% f_data[[fdata_cname]]))
  if (samps_miss > 0) stop(paste0(samps_miss, " samples from e_data not found in f_data"))

  # check for any extra samples in f_data not in e_data - necessary to remove before group_designation function #
  fdata_keep <- f_data[[fdata_cname]] %in% names(e_data)
  if (!all(fdata_keep)) {
    f_data <- f_data[fdata_keep, , drop = FALSE]
    warning("Extra samples were found in f_data that were not in e_data. These have been removed from f_data.")
  }

  # check that f_data has at least 2 columns #
  if (ncol(f_data) < 2) stop("f_data must contain at least 2 columns")

  if (has_emeta) {
    # check that all identifiers in e_data occur in e_meta #
    if (!all(e_data[[edata_cname]] %in% e_meta[[edata_cname]])) stop("Not all identifiers in e_data are present in e_meta")

    # remove any extra features that were provided #
    emeta_keep <- e_meta[[edata_cname]] %in% e_data[[edata_cname]]
    if (!all(emeta_keep)) {
      e_meta <- e_meta[emeta_keep, , drop = FALSE]
      warning("Extra rows were found in e_meta that were not in e_data. These have been removed from e_meta.")
    }

    # check that reaction_cname is in e_meta #
    if (!(reaction_cname %in% names(e_meta))) stop(paste0("Reaction column ", reaction_cname, " not found in e_meta."))

    # convert the unique identifier column to character, to avoid factors #
    e_meta[[edata_cname]] <- as.character(e_meta[[edata_cname]])
  }

  # convert the unique identifier column to character, to avoid factors #
  e_data[[edata_cname]] <- as.character(e_data[[edata_cname]])

  # store results (list() keeps the e_meta slot even when it is NULL) #
  res <- list(e_data = e_data, f_data = f_data, e_meta = e_meta)

  # set column name attributes #
  attr(res, "cnames") <- list(edata_cname = edata_cname, fdata_cname = fdata_cname, reaction_cname = reaction_cname)

  # set group dataframe attribute to NULL, will be filled in after running group_designation function #
  attr(res, "group_DF") <- NULL

  # set filters attributes #
  attr(res, "filters") <- NULL

  attr(res, "data_info") <- list(data_scale = NA, instrument_type = instrument_type)

  attr(res, "DB") <- db

  class(res) <- c("reactionData", "ftmsData")
  return(res)
}


#' Convert Data to moduleData Class
#'
#' Converts several data.frames of FT-MS data to an object of the class 'moduleData'. Objects of the class 'moduleData' are lists with three components: \code{e_data}, \code{f_data} and \code{e_meta}.
#'
#' @param e_data a \eqn{p \times (n + 1)} data.frame of expression data, where \eqn{p} is the number of observed module nodes and \eqn{n} is the number of samples. Each row corresponds to data for one module node. One column specifying a unique identifier for each module node (row) must be present.
#' @param f_data a data.frame with \eqn{n} rows and at least 2 columns. Each row corresponds to a sample with one column giving the unique sample identifiers found in e_data column names and other columns providing qualitative and/or quantitative traits of each sample. Rows for samples that do not appear in \code{e_data} are removed with a warning.
#' @param e_meta a data.frame with one row per module node. It must contain the identifier column (named the same as the column in \code{e_data}) as well as the columns named by \code{module_cname} and \code{module_node_cname}. Although the argument defaults to \code{NULL}, a data.frame is required; anything else raises an error. Rows whose identifier does not appear in \code{e_data} are removed with a warning.
#' @param edata_cname character string specifying the name of the column containing a unique identifier for each module node in \code{e_data} and \code{e_meta}.
#' @param fdata_cname character string specifying the name of the column containing the sample identifiers in \code{f_data}.
#' @param module_cname character string specifying the name of the column containing the module identifiers in \code{e_meta}.
#' @param module_node_cname character string specifying the name of the column containing the module node identifiers in \code{e_meta}
#' @param instrument_type character string giving the type of FT-MS instrument data was generated by. Valid options are: "12T" and "21T". Defaults to "12T". The value is stored in the \code{data_info} attribute as given; it is not validated by this function.
#' @param db character string specifying the database from which reaction information is drawn; stored in the \code{DB} attribute.
#' @param ... further arguments (currently unused)
#'
#' @details \code{as.moduleData} constructs a moduleData object which is an ftmsData object where the rows of \code{e_data} correspond to unique module nodes. The identifier column of \code{e_data} and \code{e_meta} is converted to character.
#' @return a list with elements \code{e_data}, \code{f_data} and \code{e_meta}, of class \code{c("moduleData", "ftmsData")}, carrying the attributes \code{cnames}, \code{data_info} and \code{DB}.
#' @rdname as.moduleData
# @param node_label_cname character string specifying the name of the column containing the display name for each module node, in \code{e_meta}

as.moduleData <- function(e_data, f_data, e_meta = NULL, edata_cname, fdata_cname, module_cname, module_node_cname,
                             instrument_type = "12T", db=NA, ...){

  # initial checks #

  # check that e_data, f_data, and e_meta are data.frames #
  if (!inherits(e_data, "data.frame")) stop("e_data must be of the class 'data.frame'")
  if (!inherits(f_data, "data.frame")) stop("f_data must be of the class 'data.frame'")
  # e_meta holds the module columns, so it is required despite its NULL default #
  if (!inherits(e_meta, "data.frame")) stop("e_meta must be of the class 'data.frame'")

  # check that the identifier column exists in e_data and e_meta #
  if (!(edata_cname %in% names(e_data))) {
    stop(paste0("Column ", edata_cname,
                " not found in e_data. See details of as.moduleData for specifying column names."))
  }
  if (!(edata_cname %in% names(e_meta))) {
    stop(paste0("Column ", edata_cname,
                " not found in e_meta. Column names for identifiers must match for e_data and e_meta. See details of as.moduleData for specifying column names."))
  }

  # check that the sample column name is in f_data column names; this must happen #
  # before the column is read so a wrong name gives the message below #
  if (!(fdata_cname %in% names(f_data))) {
    stop(paste0("Sample column ", fdata_cname,
                " not found in f_data. See details of as.moduleData for specifying column names."))
  }

  edata_ids <- e_data[[edata_cname]]
  fdata_samples <- f_data[[fdata_cname]]

  # check that e_data has unique rows #
  if (nrow(e_data) != length(unique(edata_ids))) stop("The 'edata_cname' identifier is non-unique.")

  # check that f_data has unique rows #
  if (nrow(f_data) != length(unique(fdata_samples))) stop("The 'fdata_cname' identifier is non-unique.")

  # check that all samples in e_data are present in f_data #
  # (work on the column names directly: subsetting the data.frame first would drop #
  # to an unnamed vector when there is a single sample column) #
  sample_names <- names(e_data)[names(e_data) != edata_cname]
  samps.miss <- sum(!(sample_names %in% fdata_samples))
  if (samps.miss > 0) stop(paste0(samps.miss, " samples from e_data not found in f_data"))

  # check for any extra samples in f_data not in e_data - necessary to remove before group_designation function #
  extra_samples <- !(fdata_samples %in% sample_names)
  if (any(extra_samples)) {
    f_data <- f_data[!extra_samples, , drop = FALSE]
    warning("Extra samples were found in f_data that were not in e_data. These have been removed from f_data.")
  }

  # check that f_data has at least 2 columns #
  if (ncol(f_data) < 2) stop("f_data must contain at least 2 columns")

  # check that all identifiers in e_data occur in e_meta #
  emeta_ids <- e_meta[[edata_cname]]
  if (!all(edata_ids %in% emeta_ids)) stop("Not all identifiers in e_data are present in e_meta")

  # remove any extra features that were provided in e_meta #
  extra_rows <- !(emeta_ids %in% edata_ids)
  if (any(extra_rows)) {
    e_meta <- e_meta[!extra_rows, , drop = FALSE]
    warning("Extra rows were found in e_meta that were not in e_data. These have been removed from e_meta.")
  }

  # check that module_cname and module_node_cname are in e_meta #
  if (!(module_cname %in% names(e_meta))) {
    stop(paste0("Module column ", module_cname, " not found in e_meta."))
  }
  if (!(module_node_cname %in% names(e_meta))) {
    stop(paste0("Module node column ", module_node_cname, " not found in e_meta."))
  }
  # if(!is.null(node_label_cname))
  #    if (!(node_label_cname %in% names(e_meta))) stop(paste("Node label column", node_label_cname, " not found in e_meta.", sep = ""))

  # convert the unique identifier column to character, to avoid factors #
  e_data[[edata_cname]] <- as.character(e_data[[edata_cname]])
  e_meta[[edata_cname]] <- as.character(e_meta[[edata_cname]])

  # store results #
  res <- list(e_data = e_data, f_data = f_data, e_meta = e_meta)

  # set column name attributes #
  attr(res, "cnames") <- list(edata_cname = edata_cname, fdata_cname = fdata_cname,
                              module_cname = module_cname, module_node_cname = module_node_cname)

  # set group dataframe attribute to NULL, will be filled in after running group_designation function #
  attr(res, "group_DF") <- NULL

  # set filters attributes #
  attr(res, "filters") <- NULL

  attr(res, "data_info") <- list(data_scale = NA, instrument_type = instrument_type)

  attr(res, "DB") <- db

  class(res) <- c("moduleData", "ftmsData")
  return(res)
}