diff --git a/R/helpers_regression.R b/R/helpers_regression.R index 52a09c8..365881f 100644 --- a/R/helpers_regression.R +++ b/R/helpers_regression.R @@ -2,7 +2,8 @@ #' #' @description This function determines the order of sibling pairs based on an outcome variable. #' The function checks which of the two kinship pairs has more of a specified outcome variable. -#' It adds a new column named `order` to the dataset, indicating which sibling (identified as "s1" or "s2") has more of the outcome. +#' It adds a new column named `order` to the dataset, indicating which sibling +#' (identified as "s1" or "s2") has more of the outcome. #' If the two siblings have the same amount of the outcome, it randomly assigns one as having more. # #' @inheritParams discord_data @@ -16,7 +17,7 @@ check_sibling_order <- function(..., fast = FALSE) { if (fast == TRUE) { check_sibling_order_fast(...) } else { - check_sibling_order_ram_optimized(...) + check_sibling_order_ram(...) } } @@ -24,7 +25,8 @@ check_sibling_order <- function(..., fast = FALSE) { #' #' @description This function determines the order of sibling pairs based on an outcome variable. #' The function checks which of the two kinship pairs has more of a specified outcome variable. -#' It adds a new column named `order` to the dataset, indicating which sibling (identified as "s1" or "s2") has more of the outcome. +#' It adds a new column named `order` to the dataset, indicating which sibling +#' (identified as "s1" or "s2") has more of the outcome. #' If the two siblings have the same amount of the outcome, it randomly assigns one as having more. #' #' @inheritParams discord_data @@ -36,7 +38,7 @@ check_sibling_order <- function(..., fast = FALSE) { #' neither) has more of the outcome. #' @keywords internal -check_sibling_order_ram_optimized <- function(data, outcome, pair_identifiers, row) { +check_sibling_order_ram <- function(data, outcome, pair_identifiers, row) { # Select the row of interest from the data frame data <- data[row, ] @@ -46,7 +48,11 @@ check_sibling_order_ram_optimized <- function(data, outcome, pair_identifiers, r # Check if either sibling has missing (NA) outcome data if (is.na(outcome1) || is.na(outcome2)) { - stop(paste0("There are missing data, encoded as `NA`, for at least one kinship pair in the '", outcome, "' variable and data cannot be prepped properly.\n Please remove or impute missing data.")) + stop(paste0( + "There are missing data, encoded as `NA`, for at least one kinship pair in the '", + outcome, "' variable and data cannot be prepped properly.\n", + " Please remove or impute missing data." + )) } # Determine sibling order if (outcome1 > outcome2) { @@ -63,7 +69,7 @@ check_sibling_order_ram_optimized <- function(data, outcome, pair_identifiers, r } } - return(data) + data } check_sibling_order_fast <- function(data, outcome, pair_identifiers) { @@ -75,7 +81,11 @@ check_sibling_order_fast <- function(data, outcome, pair_identifiers) { # Check for missing outcome data if (any(is.na(outcome1) | is.na(outcome2))) { - stop(paste0("There are missing data, encoded as `NA`, for at least one kinship pair in the '", outcome, "' variable and data cannot be prepped properly.\n Please remove or impute missing data.")) + stop(paste0( + "There are missing data, encoded as `NA`, for at least one kinship pair in the '", + outcome, "' variable and data cannot be prepped properly.\n", + " Please remove or impute missing data." + )) } order <- ifelse(outcome1 > outcome2, "s1", @@ -90,14 +100,17 @@ check_sibling_order_fast <- function(data, outcome, pair_identifiers) { } data$order <- order - return(data) + data } #' @title Make Mean Differences #' -#' @description This function calculates differences and means of a given variable for each kinship pair. The order of subtraction and the variables' names in the output dataframe depend on the order column set by check_sibling_order(). -#' If the demographics parameter is set to "race", "sex", or "both", it also prepares demographic information accordingly, +#' @description This function calculates differences and means of a given variable for each +#' kinship pair. The order of subtraction and the variables' names in the output dataframe +#' depend on the order column set by check_sibling_order(). +#' If the demographics parameter is set to "race", "sex", or "both", it also prepares +#' demographic information accordingly, #' swapping the order of demographics as per the order column. #' @inheritParams discord_data #' @inheritParams check_sibling_order @@ -177,7 +190,7 @@ make_mean_diffs_ram_optimized <- function(data, id, sex, race, demographics, ) - return(output) + output } @@ -270,20 +283,26 @@ recode_demographics <- function(demographics, data, raceS1, raceS2, if (demographics == "both" || demographics == "race") { race_1_name <- paste0(race, "_1") race_2_name <- paste0(race, "_2") - output_demographics[[paste0(race, "_binarymatch")]] <- ifelse(output_demographics[[race_1_name]] == output_demographics[[race_2_name]], + output_demographics[[paste0(race, "_binarymatch")]] <- ifelse( + output_demographics[[race_1_name]] == output_demographics[[race_2_name]], 1, 0 ) - output_demographics[[paste0(race, "_multimatch")]] <- ifelse(output_demographics[[race_1_name]] == output_demographics[[race_2_name]], + output_demographics[[paste0(race, "_multimatch")]] <- ifelse( + output_demographics[[race_1_name]] == output_demographics[[race_2_name]], as.character(output_demographics[[race_2_name]]), "mixed" ) } if (demographics == "both" || demographics == "sex") { sex_1_name <- paste0(sex, "_1") sex_2_name <- paste0(sex, "_2") - output_demographics[[paste0(sex, "_binarymatch")]] <- ifelse(output_demographics[[sex_1_name]] == output_demographics[[sex_2_name]], + output_demographics[[paste0(sex, "_binarymatch")]] <- ifelse( + output_demographics[[sex_1_name]] == output_demographics[[sex_2_name]], 1, 0 ) - output_demographics[[paste0(sex, "_multimatch")]] <- ifelse(output_demographics[[sex_1_name]] == output_demographics[[sex_2_name]], as.character(output_demographics[[sex_2_name]]), "mixed") + output_demographics[[paste0(sex, "_multimatch")]] <- ifelse( + output_demographics[[sex_1_name]] == output_demographics[[sex_2_name]], + as.character(output_demographics[[sex_2_name]]), "mixed" + ) } } @@ -291,7 +310,7 @@ recode_demographics <- function(demographics, data, raceS1, raceS2, output <- base::cbind(output, output_demographics) } - return(output) + output } @@ -346,12 +365,13 @@ make_mean_diffs_fast <- function(data, id, sex, race, demographics, ) diff_list[[var]] <- tmp } - return(diff_list) + diff_list } #' @title Check Discord Errors #' -#' @description This function checks for common errors in the provided data, including the correct specification of identifiers (ID, sex, race) and their existence in the data. +#' @description This function checks for common errors in the provided data, including +#' the correct specification of identifiers (ID, sex, race) and their existence in the data. #' # #' @param data The data to perform a discord regression on. @@ -370,15 +390,26 @@ check_discord_errors <- function(data, id, sex, race, pair_identifiers) { } if (!base::is.null(sex) && base::sum(base::grepl(sex, base::names(data))) == 0) { - stop(paste0("The kinship pair sex identifier \"", sex, "\" is not appropriately defined. Please check that you have the correct column name.")) + stop(paste0( + "The kinship pair sex identifier \"", sex, + "\" is not appropriately defined. Please check that you have the correct column name." + )) } if (!base::is.null(race) && base::sum(base::grepl(race, base::names(data))) == 0) { - stop(paste0("The kinship pair race identifier \"", race, "\" is not appropriately defined. Please check that you have the correct column name.")) + stop(paste0( + "The kinship pair race identifier \"", race, + "\" is not appropriately defined. Please check that you have the correct column name." + )) } - if (base::sum(base::grepl(pair_identifiers[1], base::names(data))) == 0 | base::sum(base::grepl(pair_identifiers[2], base::names(data))) == 0) { - stop(paste0("Please check that the kinship pair identifiers \"", pair_identifiers[1], "\" and \"", pair_identifiers[2], "\" are valid, i.e. ensure that you have the correct labels for each kin.")) + if (base::sum(base::grepl(pair_identifiers[1], base::names(data))) == 0 || + base::sum(base::grepl(pair_identifiers[2], base::names(data))) == 0) { + stop(paste0( + "Please check that the kinship pair identifiers \"", pair_identifiers[1], + "\" and \"", pair_identifiers[2], + "\" are valid, i.e. ensure that you have the correct labels for each kin." + )) } - if (!base::is.null(sex) & !base::is.null(race) && sex == race) { + if (!base::is.null(sex) && !base::is.null(race) && sex == race) { stop("Please check that your sex and race variables are not equal.") } } @@ -401,11 +432,11 @@ valid_ids <- function(data, id) { dwarn("Specified id column does not contain unique values for each kin-pair. Adding row-wise ID for restructuring data into paired format for analysis. For more details, see .") - return(FALSE) + FALSE } else if (id_length == nrow(data)) { - return(TRUE) + TRUE } } else if (is.null(id)) { - return(FALSE) + FALSE } } diff --git a/man/check_discord_errors.Rd b/man/check_discord_errors.Rd index 626b7c3..8ea12ec 100644 --- a/man/check_discord_errors.Rd +++ b/man/check_discord_errors.Rd @@ -21,5 +21,6 @@ check_discord_errors(data, id, sex, race, pair_identifiers) An error message if one of the conditions are met. } \description{ -This function checks for common errors in the provided data, including the correct specification of identifiers (ID, sex, race) and their existence in the data. +This function checks for common errors in the provided data, including +the correct specification of identifiers (ID, sex, race) and their existence in the data. } diff --git a/man/check_sibling_order.Rd b/man/check_sibling_order.Rd index fe1688a..94f8246 100644 --- a/man/check_sibling_order.Rd +++ b/man/check_sibling_order.Rd @@ -18,6 +18,7 @@ A one-row data frame with a new column order indicating which familial member (1 \description{ This function determines the order of sibling pairs based on an outcome variable. The function checks which of the two kinship pairs has more of a specified outcome variable. -It adds a new column named `order` to the dataset, indicating which sibling (identified as "s1" or "s2") has more of the outcome. +It adds a new column named `order` to the dataset, indicating which sibling +(identified as "s1" or "s2") has more of the outcome. If the two siblings have the same amount of the outcome, it randomly assigns one as having more. } diff --git a/man/check_sibling_order_ram_optimized.Rd b/man/check_sibling_order_ram.Rd similarity index 81% rename from man/check_sibling_order_ram_optimized.Rd rename to man/check_sibling_order_ram.Rd index 68268d9..3a6058a 100644 --- a/man/check_sibling_order_ram_optimized.Rd +++ b/man/check_sibling_order_ram.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/helpers_regression.R -\name{check_sibling_order_ram_optimized} -\alias{check_sibling_order_ram_optimized} +\name{check_sibling_order_ram} +\alias{check_sibling_order_ram} \title{Check Sibling Order RAM Optimized} \usage{ -check_sibling_order_ram_optimized(data, outcome, pair_identifiers, row) +check_sibling_order_ram(data, outcome, pair_identifiers, row) } \arguments{ \item{data}{The data set with kinship pairs} @@ -24,7 +24,8 @@ A one-row data frame with a new column order indicating which familial member (1 \description{ This function determines the order of sibling pairs based on an outcome variable. The function checks which of the two kinship pairs has more of a specified outcome variable. -It adds a new column named `order` to the dataset, indicating which sibling (identified as "s1" or "s2") has more of the outcome. +It adds a new column named `order` to the dataset, indicating which sibling +(identified as "s1" or "s2") has more of the outcome. If the two siblings have the same amount of the outcome, it randomly assigns one as having more. } \keyword{internal} diff --git a/man/make_mean_diffs.Rd b/man/make_mean_diffs.Rd index a6ac8cf..bb8b034 100644 --- a/man/make_mean_diffs.Rd +++ b/man/make_mean_diffs.Rd @@ -12,7 +12,10 @@ make_mean_diffs(..., fast = FALSE) \item{fast}{Logical. If TRUE, uses a faster method for data processing.} } \description{ -This function calculates differences and means of a given variable for each kinship pair. The order of subtraction and the variables' names in the output dataframe depend on the order column set by check_sibling_order(). -If the demographics parameter is set to "race", "sex", or "both", it also prepares demographic information accordingly, +This function calculates differences and means of a given variable for each +kinship pair. The order of subtraction and the variables' names in the output dataframe +depend on the order column set by check_sibling_order(). +If the demographics parameter is set to "race", "sex", or "both", it also prepares +demographic information accordingly, swapping the order of demographics as per the order column. } diff --git a/tests/testthat/test-helpers_regression.R b/tests/testthat/test-helpers_regression.R new file mode 100644 index 0000000..984b696 --- /dev/null +++ b/tests/testthat/test-helpers_regression.R @@ -0,0 +1,169 @@ +# Tests for internal helper functions in helpers_regression.R +# These functions are exercised via the public discord_data() and +# discord_regression() APIs. + +# Minimal pair data used across several tests below +pair_data <- data.frame( + id = 1:3, + y_1 = c(5, 3, 4), + y_2 = c(3, 5, 6) +) + +# --- check_discord_errors (via discord_regression) --- + +test_that("check_discord_errors stops on invalid id column", { + expect_error( + discord_regression( + data = pair_data, + outcome = "y", + predictors = NULL, + id = "bad_id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2") + ), + regexp = "bad_id" + ) +}) + +test_that("check_discord_errors stops on invalid sex column", { + expect_error( + discord_regression( + data = pair_data, + outcome = "y", + predictors = NULL, + sex = "bad_sex", + race = NULL, + pair_identifiers = c("_1", "_2") + ), + regexp = "bad_sex" + ) +}) + +test_that("check_discord_errors stops on invalid race column", { + expect_error( + discord_regression( + data = pair_data, + outcome = "y", + predictors = NULL, + sex = NULL, + race = "bad_race", + pair_identifiers = c("_1", "_2") + ), + regexp = "bad_race" + ) +}) + +test_that("check_discord_errors stops when first pair identifier is missing", { + expect_error( + discord_regression( + data = pair_data, + outcome = "y", + predictors = NULL, + sex = NULL, + race = NULL, + pair_identifiers = c("_bad1", "_2") + ), + regexp = "_bad1" + ) +}) + +test_that("check_discord_errors stops when second pair identifier is missing", { + expect_error( + discord_regression( + data = pair_data, + outcome = "y", + predictors = NULL, + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_bad2") + ), + regexp = "_bad2" + ) +}) + +test_that("check_discord_errors stops when sex and race columns are equal", { + # sex_1 / sex_2 columns exist so the earlier existence checks pass + sex_data <- data.frame( + sex_1 = c(1, 0, 1), + sex_2 = c(0, 1, 1), + y_1 = c(5, 3, 4), + y_2 = c(3, 5, 6) + ) + expect_error( + discord_regression( + data = sex_data, + outcome = "y", + predictors = NULL, + sex = "sex", + race = "sex", + pair_identifiers = c("_1", "_2") + ), + regexp = "sex and race" + ) +}) + +# --- valid_ids (via discord_data) --- + +test_that("valid_ids warns and proceeds when id column has duplicate values", { + dup_data <- data.frame( + id = c(1, 1, 2), + y_1 = c(5, 3, 4), + y_2 = c(3, 5, 6) + ) + expect_warning( + discord_data( + data = dup_data, + outcome = "y", + predictors = NULL, + id = "id", + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + demographics = "none" + ), + regexp = "unique" + ) +}) + +# --- missing outcome data (check_sibling_order_ram and check_sibling_order_fast) --- + +test_that("check_sibling_order_ram stops when outcome has NA (fast = FALSE)", { + na_data <- data.frame( + y_1 = c(NA, 3, 4), + y_2 = c(3, 5, 6) + ) + expect_error( + discord_data( + data = na_data, + outcome = "y", + predictors = NULL, + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + demographics = "none", + fast = FALSE + ), + regexp = "missing data" + ) +}) + +test_that("check_sibling_order_fast stops when outcome has NA (fast = TRUE)", { + na_data <- data.frame( + y_1 = c(NA, 3, 4), + y_2 = c(3, 5, 6) + ) + expect_error( + discord_data( + data = na_data, + outcome = "y", + predictors = NULL, + sex = NULL, + race = NULL, + pair_identifiers = c("_1", "_2"), + demographics = "none", + fast = TRUE + ), + regexp = "missing data" + ) +})