-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathdiffdf.R
More file actions
427 lines (353 loc) · 12.7 KB
/
diffdf.R
File metadata and controls
427 lines (353 loc) · 12.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
#' diffdf
#' @description
#' Compares 2 dataframes and outputs any differences.
#' @param base input dataframe
#' @param compare comparison dataframe
#' @param keys vector of variables (as strings) that defines a unique row in
#' the base and compare dataframes
#' @param strict_numeric Flag for strict numeric to numeric comparisons
#' (default = TRUE). If False diffdf will cast integer to double where
#' required for comparisons. Note that variables specified in the keys
#' will never be casted.
#' @param strict_factor Flag for strict factor to character comparisons
#' (default = TRUE). If False diffdf will cast factors to characters where
#' required for comparisons. Note that variables specified in the keys will
#' never be casted.
#' @param suppress_warnings Do you want to suppress warnings? (logical)
#' @param file Location and name of a text file to output the results to.
#' Setting to NULL will cause no file to be produced.
#' @param tolerance Set tolerance for numeric comparisons. Note that
#' comparisons fail if (x-y)/scale > tolerance.
#' @param scale Set scale for numeric comparisons. Note that comparisons fail
#' if (x-y)/scale > tolerance. Setting as NULL is a slightly more efficient
#' version of scale = 1.
#' @param check_column_order Should the column ordering be checked? (logical)
#' @param check_df_class Do you want to check for differences in the class
#' between `base` and `compare`? (logical)
#' @examples
#' x <- subset(iris, -Species)
#' x[1, 2] <- 5
#' COMPARE <- diffdf(iris, x)
#' print(COMPARE)
#'
#' #### Sample data frames
#'
#' DF1 <- data.frame(
#' id = c(1, 2, 3, 4, 5, 6),
#' v1 = letters[1:6],
#' v2 = c(NA, NA, 1, 2, 3, NA)
#' )
#'
#' DF2 <- data.frame(
#' id = c(1, 2, 3, 4, 5, 7),
#' v1 = letters[1:6],
#' v2 = c(NA, NA, 1, 2, NA, NA),
#' v3 = c(NA, NA, 1, 2, NA, 4)
#' )
#'
#' diffdf(DF1, DF1, keys = "id")
#'
#' # We can control matching with scale/location for example:
#'
#' DF1 <- data.frame(
#' id = c(1, 2, 3, 4, 5, 6),
#' v1 = letters[1:6],
#' v2 = c(1, 2, 3, 4, 5, 6)
#' )
#' DF2 <- data.frame(
#' id = c(1, 2, 3, 4, 5, 6),
#' v1 = letters[1:6],
#' v2 = c(1.1, 2, 3, 4, 5, 6)
#' )
#'
#' diffdf(DF1, DF2, keys = "id")
#' diffdf(DF1, DF2, keys = "id", tolerance = 0.2)
#' diffdf(DF1, DF2, keys = "id", scale = 10, tolerance = 0.2)
#'
#' # We can use strict_factor to compare factors with characters for example:
#'
#' DF1 <- data.frame(
#' id = c(1, 2, 3, 4, 5, 6),
#' v1 = letters[1:6],
#' v2 = c(NA, NA, 1, 2, 3, NA),
#' stringsAsFactors = FALSE
#' )
#'
#' DF2 <- data.frame(
#' id = c(1, 2, 3, 4, 5, 6),
#' v1 = letters[1:6],
#' v2 = c(NA, NA, 1, 2, 3, NA)
#' )
#'
#' diffdf(DF1, DF2, keys = "id", strict_factor = TRUE)
#' diffdf(DF1, DF2, keys = "id", strict_factor = FALSE)
#'
#' @export
diffdf <- function(
base,
compare,
keys = NULL,
suppress_warnings = FALSE,
strict_numeric = TRUE,
strict_factor = TRUE,
file = NULL,
tolerance = sqrt(.Machine$double.eps),
scale = NULL,
check_column_order = FALSE,
check_df_class = FALSE
) {
assertthat::assert_that(
assertthat::is.flag(check_df_class),
!is.na(check_df_class),
msg = "`check_df_class` must be a length 1 logical"
)
BASE <- base
COMP <- compare
KEYS <- keys
SUPWARN <- suppress_warnings
### Initatiate output object
COMPARE <- list()
class(COMPARE) <- c("diffdf", "list")
BASE_NAME <- deparse(substitute(base))
COMP_NAME <- deparse(substitute(compare))
COMPARE[["DataSummary"]] <- construct_issue(
value = describe_dataframe(BASE, COMP, BASE_NAME, COMP_NAME),
message = "Summary of BASE and COMPARE"
)
is_derived <- FALSE
### If no key is suplied match values based upon row number
if (is.null(KEYS)) {
is_derived <- TRUE
keyname <- generate_keyname(BASE, COMP)
BASE[[keyname]] <- seq_len(nrow(BASE))
COMP[[keyname]] <- seq_len(nrow(COMP))
KEYS <- keyname
}
attr(COMPARE, "keys") <- list(value = KEYS, is_derived = is_derived)
assertthat::assert_that(
is.numeric(tolerance),
is.numeric(scale) || is.null(scale)
)
missing_keys_base <- KEYS[!KEYS %in% names(BASE)]
assertthat::assert_that(
length(missing_keys_base) == 0,
msg = sprintf(
"The following KEYS are not available in BASE:\n %s",
paste(missing_keys_base, collapse = "\n ")
)
)
missing_keys_comp <- KEYS[!KEYS %in% names(COMP)]
assertthat::assert_that(
length(missing_keys_comp) == 0,
msg = sprintf(
"The following KEYS are not available in COMPARE:\n %s",
paste(missing_keys_comp, collapse = "\n ")
)
)
assertthat::assert_that(
has_unique_rows(BASE, KEYS),
msg = "BY variables in BASE do not result in unique observations"
)
assertthat::assert_that(
has_unique_rows(COMP, KEYS),
msg = "BY variables in COMPARE do not result in unique observations"
)
#### Check essential variable properties (class & mode)
COMPARE[["UnsupportedColsBase"]] <- construct_issue(
value = identify_unsupported_cols(BASE),
message = "There are columns in BASE with unsupported modes !!"
)
COMPARE[["UnsupportedColsComp"]] <- construct_issue(
value = identify_unsupported_cols(COMP),
message = "There are columns in COMPARE with unsupported modes !!"
)
# cast variables if strict is off
if (!strict_factor || !strict_numeric) {
casted_df <- cast_variables(
BASE = BASE,
COMPARE = COMP,
ignore_vars = KEYS,
cast_integers = !strict_numeric,
cast_factors = !strict_factor
)
BASE <- casted_df$BASE
COMP <- casted_df$COMP
}
COMPARE[["VarModeDiffs"]] <- construct_issue(
value = identify_mode_differences(BASE, COMP),
message = "There are columns in BASE and COMPARE with different modes !!"
)
COMPARE[["VarClassDiffs"]] <- construct_issue(
value = identify_class_differences(BASE, COMP),
message = "There are columns in BASE and COMPARE with different classes !!"
)
##### Check Validity of Keys
BASE_keys <- names(BASE)[names(BASE) %in% KEYS]
COMP_keys <- names(COMP)[names(COMP) %in% KEYS]
assertthat::assert_that(
length(BASE_keys) == length(KEYS),
msg = "BASE is missing variables specified in KEYS"
)
assertthat::assert_that(
length(COMP_keys) == length(KEYS),
msg = "COMP is missing variables specified in KEYS"
)
assert_valid_keys(
COMPARE, KEYS, "UnsupportedColsBase",
"The following KEYS in BASE have an unsupported mode (see `?mode()`)"
)
assert_valid_keys(
COMPARE, KEYS, "UnsupportedColsComp",
"The following KEYS in COMPARE have an unsupported mode (see `?mode()`)"
)
assert_valid_keys(
COMPARE, KEYS, "VarModeDiffs",
"The following KEYS have different modes between BASE and COMPARE"
)
assert_valid_keys(
COMPARE, KEYS, "VarClassDiffs",
"The following KEYS have different classes between BASE and COMPARE"
)
exclude_cols <- c(
COMPARE[["UnsupportedColsBase"]]$VARIABLE,
COMPARE[["UnsupportedColsComp"]]$VARIABLE,
COMPARE[["VarClassDiffs"]]$VARIABLE,
COMPARE[["VarModeDiffs"]]$VARIABLE
)
if (check_column_order) {
if (attr(COMPARE, "keys")$is_derived) {
keep_vars_base <- !(names(BASE) %in% attr(COMPARE, "keys")$value)
keep_vars_comp <- !(names(COMP) %in% attr(COMPARE, "keys")$value)
} else {
keep_vars_base <- TRUE
keep_vars_comp <- TRUE
}
COMPARE[["ColumnOrder"]] <- construct_issue(
value = identify_column_order_differences(
BASE[, keep_vars_base, drop = FALSE],
COMP[, keep_vars_comp, drop = FALSE]
),
message = "There are differences in the column ordering between BASE and COMPARE !!"
)
}
##### Check Attributes
COMPARE[["AttribDiffs"]] <- construct_issue(
value = identify_att_differences(BASE, COMP, exclude_cols),
message = "There are columns in BASE and COMPARE with differing attributes !!"
)
##### Check data
BASE <- factor_to_character(BASE, KEYS)
COMP <- factor_to_character(COMP, KEYS)
COMPARE[["ExtRowsBase"]] <- construct_issue(
value = identify_extra_rows(BASE, COMP, KEYS),
message = "There are rows in BASE that are not in COMPARE !!"
)
COMPARE[["ExtRowsComp"]] <- construct_issue(
value = identify_extra_rows(COMP, BASE, KEYS),
message = "There are rows in COMPARE that are not in BASE !!"
)
COMPARE[["ExtColsBase"]] <- construct_issue(
value = identify_extra_cols(BASE, COMP),
message = "There are columns in BASE that are not in COMPARE !!"
)
COMPARE[["ExtColsComp"]] <- construct_issue(
value = identify_extra_cols(COMP, BASE),
message = "There are columns in COMPARE that are not in BASE !!"
)
VALUE_DIFFERENCES <- identify_differences(
BASE, COMP, KEYS, exclude_cols,
tolerance = tolerance,
scale = scale
)
## Summarise the number of mismatching rows per variable
if (length(VALUE_DIFFERENCES)) {
NDIFF <- sapply(VALUE_DIFFERENCES, nrow)
COMPARE[["NumDiff"]] <- construct_issue(
value = convert_to_issue(NDIFF),
message = "Not all Values Compared Equal"
)
}
for (i in names(VALUE_DIFFERENCES)) {
COMPARE[[paste0("VarDiff_", i)]] <- construct_issue(
value = VALUE_DIFFERENCES[[i]],
message = NULL
)
}
# suppress warning message of data summary if user didn't request to check it
# we leave the issue in the main compare object though for printing purposes
COMPARE_WARNINGS <- COMPARE
attr(COMPARE_WARNINGS[["DataSummary"]], "message") <- c(
"There are differences between the class of BASE and COMPARE"
)
if (!check_df_class || identical(class(base), class(compare))) {
COMPARE_WARNINGS["DataSummary"] <- NULL
}
# Get all issue messages, remove blank message, and collapse into single string
ISSUE_MSGS <- sapply(COMPARE_WARNINGS, function(x) get_issue_message(x))
ISSUE_MSGS <- Filter(function(x) !is.null(x), ISSUE_MSGS)
ISSUE_MSGS <- Filter(function(x) x != "", ISSUE_MSGS)
if (length(ISSUE_MSGS) != 0) {
if (!SUPWARN) {
ISSUE_MSGS <- paste(ISSUE_MSGS, collapse = "\n")
warning(c("\n", ISSUE_MSGS))
}
}
# If the classes are the same and it is the only entry in the compare
# object then remove it in order to trigger "no issues found"
if (identical(class(base), class(compare)) && length(COMPARE) == 1) {
COMPARE["DataSummary"] <- NULL
}
# If the summary is the only item and the user didn't want to check classes
# then remove the object to trigger the "no issues found"
if (!check_df_class && length(COMPARE) == 1) {
COMPARE["DataSummary"] <- NULL
}
if (!is.null(file)) {
print(COMPARE, file = file)
}
return(COMPARE)
}
#' diffdf_has_issues
#'
#' Utility function which returns TRUE if an diffdf
#' object has issues or FALSE if an diffdf object does not have issues
#' @param x diffdf object
#' @examples
#'
#' # Example with no issues
#' x <- diffdf(iris, iris)
#' diffdf_has_issues(x)
#'
#' # Example with issues
#' iris2 <- iris
#' iris2[2, 2] <- NA
#' x <- diffdf(iris, iris2, suppress_warnings = TRUE)
#' diffdf_has_issues(x)
#' @export
diffdf_has_issues <- function(x) {
if (class(x)[[1]] != "diffdf") stop("x is not an diffdf object")
length(x) != 0
}
#' Assert that keys are valid
#'
#' Utility function to check that user provided "keys" aren't listed as a problem
#' variable of the current list of issues.
#' @param COMPARE (`list`)\cr A named list of which each element is a `data.frame` with the
#' column `VARIABLE`
#' @param KEYS (`character`)\cr name of key variables to check to make sure they don't contain
#' any issues
#' @param component (`character`)\cr name of the component within `COMPARE` to check against
#' @param msg (`character`)\cr error message to print if any of `KEYS` are found within
#' `COMPARE[component]$VARIABLE`
#' @keywords internal
assert_valid_keys <- function(COMPARE, KEYS, component, msg) {
keys_reduced <- KEYS[KEYS %in% COMPARE[[component]]$VARIABLE]
assertthat::assert_that(
length(keys_reduced) == 0,
msg = sprintf(
"%s:\n%s",
msg,
paste0("`", paste0(keys_reduced, collapse = "`, `"), "`")
)
)
}