Skip to content

Commit adf9436

Browse files
authored
Refactor kinship to allow other code to initiate import directly (#730)
* Refactor genetics pipeline to allow standalone import of TSVs that were calculated separately
1 parent d0a9e46 commit adf9436

11 files changed

Lines changed: 187 additions & 149 deletions

ehr/api-src/org/labkey/api/ehr/EHRService.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
*/
1616
package org.labkey.api.ehr;
1717

18+
import org.apache.logging.log4j.Logger;
1819
import org.jetbrains.annotations.NotNull;
1920
import org.jetbrains.annotations.Nullable;
2021
import org.labkey.api.data.AbstractTableInfo;
@@ -31,6 +32,7 @@
3132
import org.labkey.api.ehr.history.*;
3233
import org.labkey.api.ldk.table.ButtonConfigFactory;
3334
import org.labkey.api.module.Module;
35+
import org.labkey.api.pipeline.PipelineJobException;
3436
import org.labkey.api.query.BatchValidationException;
3537
import org.labkey.api.query.DetailsURL;
3638
import org.labkey.api.query.ExprColumn;
@@ -43,6 +45,7 @@
4345
import org.labkey.api.view.ActionURL;
4446
import org.labkey.api.view.template.ClientDependency;
4547

48+
import java.io.File;
4649
import java.io.IOException;
4750
import java.util.Collection;
4851
import java.util.Date;
@@ -326,4 +329,14 @@ public EHRQCState getQCState(@NotNull Container c)
326329
abstract public List<String> ensureStudyQCStates(Container c, final User u, final boolean commitChanges);
327330

328331
abstract public void registerLabWorkOverrides(Module module, String fromType, LabworkType toType);
332+
333+
/**
334+
* The EHR has a built-in GeneticsCalculations pipeline job that computes inbreeding and kinship based on the pedigree.
335+
* These are normally calculated in R, saved as TSVs, and imported using java code. This method is a separate entrypoint
336+
* that allows other code to perform the calculations, save the results as TSVs, and then trigger import here.
337+
*
338+
* A use case is a separate pipeline server that performs the R computation on a cluster, and then triggers the main webserver to import
339+
* those results.
340+
*/
341+
abstract public void standaloneProcessKinshipAndInbreeding(Container c, User u, File pipelineDir, Logger log) throws PipelineJobException;
329342
}

ehr/resources/pipeline/kinship/populateInbreeding.r

Lines changed: 23 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -7,47 +7,43 @@
77
# This R script will calculate and store inbreeding coefficients for all animals in the colony. This data will be compared against
88
# the information currently stored in the DB and the minimal number of inserts/updates/deletes are then performed. This script is designed
99
# to run as a daily cron job.
10-
11-
12-
options(error = dump.frames);
13-
library(pedigree);
14-
library(getopt);
15-
library(Matrix);
10+
library(pedigree)
11+
library(getopt)
12+
library(Matrix)
1613
library(dplyr)
1714

1815
spec <- matrix(c(
19-
'inputFile', '-f', 1, "character"
20-
), ncol=4, byrow=TRUE);
21-
opts = getopt(spec, commandArgs(trailingOnly = TRUE));
16+
'inputFile', '-f', 1, 'character'
17+
), ncol=4, byrow=TRUE)
18+
opts <- getopt(spec, commandArgs(trailingOnly = TRUE))
2219

23-
allPed <- read.table(opts$inputFile);
20+
allPed <- read.table(opts$inputFile)
2421
colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender')
2522

26-
is.na(allPed$Id)<-which(allPed$Id=="")
27-
is.na(allPed$Dam)<-which(allPed$Dam=="")
28-
is.na(allPed$Sire)<-which(allPed$Sire=="")
29-
is.na(allPed$Gender)<-which(allPed$Gender=="")
30-
31-
df <- data.frame(id=as.character(allPed$Id), 'id parent1'=allPed$Dam, 'id parent2'=allPed$Sire, stringsAsFactors=FALSE);
32-
colnames(df)<-c("id", "id parent1", "id parent2")
23+
allPed$Id[allPed$Id == ""] <- NA
24+
allPed$Dam[allPed$Dam == ""] <- NA
25+
allPed$Sire[allPed$Sire == ""] <- NA
26+
allPed$Gender[allPed$Gender == ""] <- NA
3327

34-
originalIds <-as.data.frame(df[,1,drop=FALSE])
28+
df <- data.frame(id=as.character(allPed$Id), 'id parent1'=allPed$Dam, 'id parent2'=allPed$Sire, stringsAsFactors=FALSE)
29+
originalIds <- df$id
30+
print(paste0('Input IDs: ', nrow(df)))
3531

3632
#this is a function in the pedigree package designed to add missing parents to the dataframe
3733
#see pedigree package documentation for more detail
38-
df <- add.Inds(df);
34+
df <- add.Inds(df)
3935
ord <- orderPed(df)
4036
df <- df[order(ord),]
4137

4238
#use an existing package to calculate inbreeding
43-
ib = calcInbreeding(df);
44-
45-
newRecords <- data.frame(Id=as.character(df$id), coefficient=ib, stringsAsFactors=FALSE);
39+
ib <- calcInbreeding(df)
4640

4741
#only calculate inbreeding for Ids at the center
48-
newRecords <- dplyr::filter(newRecords, Id %in% originalIds$id)
42+
newRecords <- data.frame(Id=as.character(df$id), coefficient=ib, stringsAsFactors=FALSE) %>%
43+
dplyr::filter(Id %in% originalIds)
44+
45+
if (nrow(newRecords) != length(originalIds)) {
46+
stop(paste0('Output dataframe and input IDs not the same length! Expected: ', length(originalIds), ', was: ', nrow(newRecords)))
47+
}
4948

50-
# write TSV to disk
51-
print("Output table:");
52-
print(str(newRecords));
53-
write.table(newRecords, file = "inbreeding.txt", append = FALSE,row.names=F,quote=F,sep="\t");
49+
write.table(newRecords, file = "inbreeding.txt", append = FALSE, row.names=F, quote=F, sep="\t")

ehr/resources/pipeline/kinship/populateKinship.r

Lines changed: 31 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -7,67 +7,64 @@
77
# This R script will calculate and store kinship coefficients (a.k.a. relatedness) for all animals in the colony. This is a large, sparse matrix.
88
# The matrix is converted into a very long 3-column dataframe (animal1, animal2, coefficient). This dataframe is output to a TSV file,
99
# which is normally imported into ehr.kinship by java code in GeneticCalculationsImportTask
10-
11-
12-
#options(echo=TRUE);
13-
options(error = dump.frames);
14-
library(methods);
15-
library(kinship2);
16-
library(getopt);
17-
library(Matrix);
18-
library(dplyr);
10+
library(kinship2)
11+
library(getopt)
12+
library(Matrix)
13+
library(dplyr)
1914

2015
spec <- matrix(c(
21-
#'containerPath', '-c', 1, "character",
22-
#'baseUrl', '-b', 1, "character"
23-
'inputFile', '-f', 1, "character"
24-
), ncol=4, byrow=TRUE);
25-
opts = getopt(spec, commandArgs(trailingOnly = TRUE));
16+
'inputFile', '-f', 1, 'character'
17+
), ncol=4, byrow=TRUE)
18+
opts <- getopt(spec, commandArgs(trailingOnly = TRUE))
2619

27-
allPed <- read.table(opts$inputFile, quote="\"");
28-
colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender', 'Species');
20+
allPed <- read.table(opts$inputFile, quote="\"")
21+
colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender', 'Species')
2922

30-
is.na(allPed$Id)<-which(allPed$Id=="")
31-
is.na(allPed$Dam)<-which(allPed$Dam=="")
32-
is.na(allPed$Sire)<-which(allPed$Sire=="")
33-
is.na(allPed$Gender)<-which(allPed$Gender=="")
23+
allPed$Id[allPed$Id == ""] <- NA
24+
allPed$Dam[allPed$Dam == ""] <- NA
25+
allPed$Sire[allPed$Sire == ""] <- NA
26+
allPed$Gender[allPed$Gender == "" | is.na(allPed$Gender)] <- 3 # 3 = unknown
3427

3528
allPed$Species <- as.character(allPed$Species)
3629
allPed$Species[is.na(allPed$Species)] <- c('Unknown')
3730
allPed$Species <- as.factor(allPed$Species)
3831

39-
# In order to reduce the max matrix size, calculate famids using makefamid, then analyze each group separately
40-
# It resizes the biggest matrix from 12000^2 to 8200^2 thus reduces the memory used by half
41-
newRecords=NULL
32+
if (any(allPed$Species == 'Unknown')) {
33+
print(paste0('There are ', sum(allPed$Species == 'Unknown'), ' Ids with species = Unknown'))
34+
}
35+
36+
newRecords <- NULL
4237
for (species in unique(allPed$Species)){
43-
print(paste0('processing species: ', species))
44-
allRecordsForSpecies <- allPed[allPed$Species == species,]
38+
allRecordsForSpecies <- allPed[allPed$Species %in% species,]
39+
print(paste0('Processing species: ', species, ', with ', nrow(allRecordsForSpecies), ' IDs'))
40+
if (nrow(allRecordsForSpecies) == 1) {
41+
print('single record, skipping')
42+
next
43+
}
4544

4645
# Add missing parents for accurate kinship calculations
4746
fixedRecords <- with(allRecordsForSpecies, fixParents(id = Id, dadid = Sire, momid = Dam, sex = Gender))
4847

4948
# Kinship is expecting records to be sorted in accordance with its own pedigree function
50-
recordsForSpecies <- with(fixedRecords, pedigree(id=id,dadid=dadid,momid=momid,sex=sex,missid=0))
49+
recordsForSpecies <- with(fixedRecords, pedigree(id = id, dadid = dadid, momid = momid, sex = sex, missid = 0))
5150

52-
temp.kin=kinship(recordsForSpecies)
51+
temp.kin <- kinship(recordsForSpecies)
5352

5453
# Add rownames to make matrix symmetric, which is required downstream
5554
rownames(temp.kin) <- colnames(temp.kin)
5655

5756
# Convert kinship matrix to a triplet list of two ids and their coefficient
58-
summaryDf = as.data.frame(summary(as(temp.kin, "dgCMatrix")))
57+
summaryDf <- as.data.frame(summary(as(temp.kin, "dgCMatrix")))
5958
idList <- rownames(temp.kin)
60-
temp.tri = data.frame(Id=idList[summaryDf$i], Id2=idList[summaryDf$j], coefficient=summaryDf$x)
59+
temp.tri <- data.frame(Id=idList[summaryDf$i], Id2=idList[summaryDf$j], coefficient=summaryDf$x)
6160

6261
# Now filter out parents added for kinship calculation
6362
temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id, perl = TRUE))
6463
temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id2, perl = TRUE))
64+
temp.tri <- merge(temp.tri, allRecordsForSpecies[c('Id', 'Species')], by = 'Id', all.x = TRUE)
6565

66-
newRecords=rbind(newRecords,temp.tri)
67-
print(paste0('total subjects: ', nrow(allRecordsForSpecies)))
66+
newRecords <- dplyr::bind_rows(newRecords,temp.tri)
6867
}
6968

7069
# write TSV to disk
71-
print("Output table:");
72-
print(str(newRecords));
73-
write.table(newRecords, file = "kinship.txt", append = FALSE,row.names=F,quote=F,sep="\t");
70+
write.table(newRecords, file = "kinship.txt", append = FALSE, row.names = FALSE, quote = FALSE, sep = '\t')

ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,15 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', {
2424
xtype: 'checkbox',
2525
fieldLabel: 'Is Enabled?',
2626
itemId: 'enabled'
27+
},{
28+
xtype: 'checkbox',
29+
fieldLabel: 'Allow Import During Business Hours?',
30+
itemId: 'allowImportDuringBusinessHours'
2731
},{
2832
xtype: 'checkbox',
2933
fieldLabel: 'Kinship validation?',
3034
itemId: 'kinshipValidation',
31-
listeners : {
35+
listeners: {
3236
render: function(c) {
3337
Ext4.create('Ext.tip.ToolTip', {
3438
target: c.getEl(),
@@ -94,6 +98,7 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', {
9498
this.down('#hourOfDay').setValue(results.hourOfDay);
9599
this.down('#containerPath').setValue(results.containerPath);
96100
this.down('#kinshipValidation').setValue(results.kinshipValidation);
101+
this.down('#allowImportDuringBusinessHours').setValue(results.allowImportDuringBusinessHours)
97102
},
98103

99104
saveData: function(){
@@ -104,7 +109,8 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', {
104109
containerPath: this.down('#containerPath').getValue(),
105110
enabled: this.down('#enabled').getValue(),
106111
hourOfDay: this.down('#hourOfDay').getValue(),
107-
kinshipValidation: this.down('#kinshipValidation').getValue()
112+
kinshipValidation: this.down('#kinshipValidation').getValue(),
113+
allowImportDuringBusinessHours: this.down('#allowImportDuringBusinessHours').getValue()
108114
},
109115
method : 'POST',
110116
scope: this,

ehr/src/org/labkey/ehr/EHRController.java

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@
8383
import org.labkey.api.settings.AppProps;
8484
import org.labkey.api.study.DatasetTable;
8585
import org.labkey.api.util.ExceptionUtil;
86+
import org.labkey.api.util.HtmlString;
8687
import org.labkey.api.util.HtmlStringBuilder;
8788
import org.labkey.api.util.PageFlowUtil;
8889
import org.labkey.api.util.Path;
@@ -639,7 +640,7 @@ public ApiResponse execute(ScheduleGeneticCalculationForm form, BindException er
639640
errors.reject(ERROR_MSG, "Unable to find container for path: " + form.getContainerPath());
640641
return null;
641642
}
642-
GeneticCalculationsJob.setProperties(form.isEnabled(), c, form.getHourOfDay(), form.isKinshipValidation());
643+
GeneticCalculationsJob.setProperties(form.isEnabled(), c, form.getHourOfDay(), form.isKinshipValidation(), form.isAllowImportDuringBusinessHours());
643644

644645
return new ApiSimpleResponse("success", true);
645646
}
@@ -759,6 +760,7 @@ public static class ScheduleGeneticCalculationForm
759760
private int hourOfDay;
760761

761762
private boolean _kinshipValidation;
763+
private boolean _allowImportDuringBusinessHours;
762764

763765
public boolean isEnabled()
764766
{
@@ -799,6 +801,16 @@ public void setKinshipValidation(boolean kinshipValidation)
799801
{
800802
_kinshipValidation = kinshipValidation;
801803
}
804+
805+
public boolean isAllowImportDuringBusinessHours()
806+
{
807+
return _allowImportDuringBusinessHours;
808+
}
809+
810+
public void setAllowImportDuringBusinessHours(boolean allowImportDuringBusinessHours)
811+
{
812+
_allowImportDuringBusinessHours = allowImportDuringBusinessHours;
813+
}
802814
}
803815

804816
@RequiresPermission(AdminPermission.class)
@@ -817,6 +829,7 @@ public ApiResponse execute(ScheduleGeneticCalculationForm form, BindException er
817829
ret.put("enabled", GeneticCalculationsJob.isEnabled());
818830
ret.put("hourOfDay", GeneticCalculationsJob.getHourOfDay());
819831
ret.put("kinshipValidation", GeneticCalculationsJob.isKinshipValidation());
832+
ret.put("allowImportDuringBusinessHours", GeneticCalculationsJob.isAllowImportDuringBusinessHours());
820833

821834
return new ApiSimpleResponse(ret);
822835
}
@@ -1250,7 +1263,7 @@ public void validateCommand(Object form, Errors errors)
12501263
@Override
12511264
public ModelAndView getConfirmView(Object form, BindException errors)
12521265
{
1253-
return new HtmlView("This will cause the system to recalculate kinship and inbreeding coefficients on the colony. Do you want to continue?");
1266+
return new HtmlView(HtmlString.of("This will cause the system to recalculate kinship and inbreeding coefficients on the colony. Do you want to continue?"));
12541267
}
12551268

12561269
@Override

ehr/src/org/labkey/ehr/EHRServiceImpl.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
import org.labkey.api.module.ModuleLoader;
5151
import org.labkey.api.module.ModuleProperty;
5252
import org.labkey.api.pipeline.PipeRoot;
53+
import org.labkey.api.pipeline.PipelineJobException;
5354
import org.labkey.api.pipeline.PipelineService;
5455
import org.labkey.api.query.BatchValidationException;
5556
import org.labkey.api.query.DetailsURL;
@@ -78,10 +79,12 @@
7879
import org.labkey.ehr.history.DefaultObservationsDataSource;
7980
import org.labkey.ehr.history.DefaultPregnanciesDataSource;
8081
import org.labkey.ehr.history.LabworkManager;
82+
import org.labkey.ehr.pipeline.GeneticCalculationsImportTask;
8183
import org.labkey.ehr.security.EHRSecurityManager;
8284
import org.labkey.ehr.table.DefaultEHRCustomizer;
8385
import org.labkey.ehr.table.SNOMEDCodesDisplayColumn;
8486

87+
import java.io.File;
8588
import java.io.FileNotFoundException;
8689
import java.io.IOException;
8790
import java.io.InputStream;
@@ -1061,4 +1064,10 @@ public Map<String, Pair<Module, LabworkType>> getLabWorkOverrides()
10611064
{
10621065
return _labWorkOverrides;
10631066
}
1067+
1068+
@Override
1069+
public void standaloneProcessKinshipAndInbreeding(Container c, User u, File pipelineDir, Logger log) throws PipelineJobException
1070+
{
1071+
GeneticCalculationsImportTask.standaloneProcessKinshipAndInbreeding(c, u, pipelineDir, log);
1072+
}
10641073
}

0 commit comments

Comments
 (0)