Skip to content

Commit 4ddea7d

Browse files
Merge pull request #65 from FinOMOP/copilot/add-validation-for-source-name
Add validation for sourceCode maximum length of 50 characters
2 parents 59e3c26 + 0f5ed94 commit 4ddea7d

4 files changed

Lines changed: 16 additions & 5 deletions

File tree

R/databasesFromAndToCSV.R

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ omopVocabularyCSVsToDuckDB <- function(
2424
"CONCEPT_RELATIONSHIP",
2525
"CONCEPT_SYNONYM",
2626
"DOMAIN",
27-
"DRUG_STRENGTH",
27+
"DRUG_STRENGTH",
2828
"RELATIONSHIP",
2929
"VOCABULARY"
3030
)
@@ -55,7 +55,7 @@ omopVocabularyCSVsToDuckDB <- function(
5555
sql = sql,
5656
targetDialect = "duckdb"
5757
)
58-
58+
5959
# Fix DuckDB data type issues: replace NUMERIC with DOUBLE for float columns
6060
# This prevents precision errors when importing large numeric values
6161
sql <- gsub("NUMERIC NULL", "DOUBLE NULL", sql)
@@ -130,14 +130,14 @@ duckdbToOMOPVocabularyCSVs <- function(
130130
for (table_name in OMOPVocabularyTableNames) {
131131
message("Exporting table: ", table_name)
132132
out_path <- file.path(pathToOMOPVocabularyCSVsFolder, paste0(table_name, ".csv"))
133-
133+
134134
col_info <- DBI::dbGetQuery(
135135
connection,
136136
paste0("PRAGMA table_info(", table_name, ");")
137137
)
138138
cols <- col_info$name
139139
date_cols <- col_info$name[grepl("^date$", tolower(col_info$type))]
140-
140+
141141
select_cols <- sapply(cols, function(col) {
142142
if (col %in% date_cols) {
143143
paste0("STRFTIME('%Y%m%d', ", col, ") AS ", col)
@@ -147,7 +147,7 @@ duckdbToOMOPVocabularyCSVs <- function(
147147
})
148148

149149
select_sql <- paste(select_cols, collapse = ", ")
150-
sql <- paste0("COPY (SELECT ", select_sql, " FROM ", table_name, ") TO '", out_path, "' (HEADER, DELIM '\t');")
150+
sql <- paste0("COPY (SELECT ", select_sql, " FROM ", table_name, ") TO '", out_path, "' (HEADER, DELIM '\t', QUOTE '');")
151151
DatabaseConnector::dbExecute(connection, sql)
152152
}
153153

R/validateUsagiFile.R

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#' - Check if all default Usagi columns are present:
66
#' - Check if sourceCode and conceptId are unique
77
#' - Check if sourceCode is not empty
8+
#' - Check if sourceCode is at most 50 characters
89
#' - Check if sourceName is not empty
910
#' - Check if sourceName is at most 255 characters
1011
#' If usagi file has C&CR columns:
@@ -96,6 +97,7 @@ validateUsagiFile <- function(
9697
validationRules <- validate::validator(
9798
SourceCode.is.empty = is_complete(sourceCode),
9899
SourceCode.and.conceptId.are.not.unique = is_unique(sourceCode, conceptId),
100+
SourceCode.is.more.than.50.characters = field_length(sourceCode, min = 0, max = 50),
99101
SourceName.is.empty = is_complete(sourceName),
100102
SourceName.is.more.than.255.characters = field_length(sourceName, min = 0, max = 255),
101103
SourceFrequency.is.not.empty = is_complete(sourceFrequency),

inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ A01.0+G01,[SourceCode and conceptId are not unique]Meningitis (in) typhoid fever
44
A01.0+G01,[SourceCode and conceptId are not unique]Meningitis (in) typhoid fever,-1,,2000500101,Lavantautiin liittyvä aivokalvotulehdus,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A01|A01.0|G01,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666794379045,4100102,Meningitis due to typhoid fever,Condition,MAPS_TO,,TAYS,1623974400000,,,
55
A01.0+J17.0,,-1,,2000500103,Lavantautiin liittyvä keuhkokuume,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A01|A01.0|J17.0,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666794388143,4166072,Pneumonia in typhoid fever,Condition,MAPS_TO,,TAYS,1623974400000,,,
66
A01.4+M01.3,[SourceName is more than 255 characters]Arthritis in typhoid or paratyphoid fever ad [SourceName is more than 255 characters]Arthritis in typhoid or paratyphoid fever ad [SourceName is more than 255 characters]Arthritis in typhoid or paratyphoid fever ad ddd,-1,,2000500104,Lavantautiin tai pikkulavantautiin liittyvä nivelinfektio,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A01|A01.4|M01.3,ICD10|ICD10|ICD10,0.78,APPROVED,EQUIVALENT,PKo,1666806100347,80316,Salmonella arthritis,Condition,MAPS_TO,,PKo,1666806094598,,,
7+
A01234567890123456789012345678901234567890123456789X,[SourceCode is more than 50 characters]Test entry for sourceCode length validation,-1,,2000500999,Test entry,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A01,ICD10,0,APPROVED,EQUAL,PKo,1666805697461,4100102,Meningitis due to typhoid fever,Condition,MAPS_TO,,TAYS,1623974400000,,,
78
A02.2+G01,[APPROVED mappingStatus conceptId is 0]Salmonella meningitis,-1,,2000500105,Salmonellan aiheuttama aivokalvotulehdus,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A02|A02.2|G01,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666794600409,0,Salmonella meningitis,Condition,MAPS_TO,,TAYS,1623974400000,,,
89
A17.0+G01,[APPROVED mappingStatus with concepts outdated]Tuberculous meningitis,-1,,2000500115,Tuberkuloottinen meningiitti,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A17|A17.0|G01,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1669304049249,1234,Tuberculosis of meninges,Condition,MAPS_TO,,PKo,1666804429398,,,
910
A17.1+G07,[APPROVED mappingStatus with concepts outdated]Meningeal tuberculoma,-1,,2000500116,Aivokalvojen tuberkulooma,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A17|A17.1|G07,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1669304054597,1234,Tuberculoma of meninges,Condition,MAPS_TO,,TAYS,1623974400000,,,

tests/testthat/test-validateUsagiFile.R

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,14 @@ test_that("test validateUsagiFile returns errors with the errored usagi file", {
8888
validatedUsagiFile |> dplyr::filter(is.na(sourceName)) |> dplyr::pull(mappingStatus) |>
8989
expect_equal("FLAGGED")
9090

91+
# SourceCode is more than 50 characters
92+
validationsSummary |> dplyr::filter(step == "SourceCode is more than 50 characters") |> nrow() |> expect_equal(1)
93+
validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceCode is more than 50 characters")) |> nrow() |> expect_equal(1)
94+
validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceCode is more than 50 characters")) |> dplyr::pull(`ADD_INFO:validationMessages`) |>
95+
expect_equal("ERROR: SourceCode is more than 50 characters")
96+
validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceCode is more than 50 characters")) |> dplyr::pull(mappingStatus) |>
97+
expect_equal("FLAGGED")
98+
9199
# SourceName is more than 255 characters
92100
validationsSummary |> dplyr::filter(step == "SourceName is more than 255 characters") |> nrow() |> expect_equal(1)
93101
validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceName is more than 255 characters")) |> nrow() |> expect_equal(1)

0 commit comments

Comments
 (0)