77# This R script will calculate and store kinship coefficients (aka. relatedness) for all animals in the colony. This is a large, sparse matrix.
88# The matrix is converted into a very long 3-column dataframe (animal1, animal2, coefficient). This dataframe is output to a TSV file,
99# which is normally imported into ehr.kinship by java code in GeneticCalculationsImportTask
10-
11-
12- # options(echo=TRUE);
13- options(error = dump.frames );
14- library(methods );
15- library(kinship2 );
16- library(getopt );
17- library(Matrix );
18- library(dplyr );
10+ library(kinship2 )
11+ library(getopt )
12+ library(Matrix )
13+ library(dplyr )
1914
2015spec <- matrix (c(
21- # 'containerPath', '-c', 1, "character",
22- # 'baseUrl', '-b', 1, "character"
23- ' inputFile' , ' -f' , 1 , " character"
24- ), ncol = 4 , byrow = TRUE );
25- opts = getopt(spec , commandArgs(trailingOnly = TRUE ));
16+ ' inputFile' , ' -f' , 1 , ' character'
17+ ), ncol = 4 , byrow = TRUE )
18+ opts <- getopt(spec , commandArgs(trailingOnly = TRUE ))
2619
27- allPed <- read.table(opts $ inputFile , quote = " \" " );
28- colnames(allPed )<- c(' Id' , ' Dam' , ' Sire' , ' Gender' , ' Species' );
20+ allPed <- read.table(opts $ inputFile , quote = " \" " )
21+ colnames(allPed )<- c(' Id' , ' Dam' , ' Sire' , ' Gender' , ' Species' )
2922
30- is.na( allPed $ Id ) <- which( allPed $ Id == " " )
31- is.na( allPed $ Dam ) <- which( allPed $ Dam == " " )
32- is.na( allPed $ Sire ) <- which( allPed $ Sire == " " )
33- is.na( allPed $ Gender ) <- which( allPed $ Gender == " " )
23+ allPed $ Id [ allPed $ Id == " " ] <- NA
24+ allPed $ Dam [ allPed $ Dam == " " ] <- NA
25+ allPed $ Sire [ allPed $ Sire == " " ] <- NA
26+ allPed $ Gender [ allPed $ Gender == " " | is.na( allPed $ Gender )] <- 3 # 3 = unknown
3427
3528allPed $ Species <- as.character(allPed $ Species )
3629allPed $ Species [is.na(allPed $ Species )] <- c(' Unknown' )
3730allPed $ Species <- as.factor(allPed $ Species )
3831
39- # In order to reduce the max matrix size, calculate famids using makefamid, then analyze each group separately
40- # It resizes the biggest matrix from 12000^2 to 8200^2 thus reduces the memory used by half
41- newRecords = NULL
32+ if (any(allPed $ Species == ' Unknown' )) {
33+ print(paste0(' There are ' , sum(allPed $ Species == ' Unknown' ), ' Ids with species = Unknown' ))
34+ }
35+
36+ newRecords <- NULL
4237for (species in unique(allPed $ Species )){
43- print(paste0(' processing species: ' , species ))
44- allRecordsForSpecies <- allPed [allPed $ Species == species ,]
38+ allRecordsForSpecies <- allPed [allPed $ Species %in% species ,]
39+ print(paste0(' Processing species: ' , species , ' , with ' , nrow(allRecordsForSpecies ), ' IDs' ))
40+ if (nrow(allRecordsForSpecies ) == 1 ) {
41+ print(' single record, skipping' )
42+ next
43+ }
4544
4645 # Add missing parents for accurate kinship calculations
4746 fixedRecords <- with(allRecordsForSpecies , fixParents(id = Id , dadid = Sire , momid = Dam , sex = Gender ))
4847
4948 # Kinship is expecting records to be sorted IAW it's own pedigree function
50- recordsForSpecies <- with(fixedRecords , pedigree(id = id ,dadid = dadid ,momid = momid ,sex = sex ,missid = 0 ))
49+ recordsForSpecies <- with(fixedRecords , pedigree(id = id , dadid = dadid , momid = momid , sex = sex , missid = 0 ))
5150
52- temp.kin = kinship(recordsForSpecies )
51+ temp.kin <- kinship(recordsForSpecies )
5352
5453 # Add rownames to make matrix symmetric, which is required downstream
5554 rownames(temp.kin ) <- colnames(temp.kin )
5655
5756 # Convert kinship matrix to a triplet list of two ids and their coefficient
58- summaryDf = as.data.frame(summary(as(temp.kin , " dgCMatrix" )))
57+ summaryDf <- as.data.frame(summary(as(temp.kin , " dgCMatrix" )))
5958 idList <- rownames(temp.kin )
60- temp.tri = data.frame (Id = idList [summaryDf $ i ], Id2 = idList [summaryDf $ j ], coefficient = summaryDf $ x )
59+ temp.tri <- data.frame (Id = idList [summaryDf $ i ], Id2 = idList [summaryDf $ j ], coefficient = summaryDf $ x )
6160
6261 # Now filter out parents added for kinship calculation
6362 temp.tri <- dplyr :: filter(temp.tri , grepl(" ^(?!addin).*$" , Id , perl = TRUE ))
6463 temp.tri <- dplyr :: filter(temp.tri , grepl(" ^(?!addin).*$" , Id2 , perl = TRUE ))
64+ temp.tri <- merge(temp.tri , allRecordsForSpecies [c(' Id' , ' Species' )], by = ' Id' , all.x = TRUE )
6565
66- newRecords = rbind(newRecords ,temp.tri )
67- print(paste0(' total subjects: ' , nrow(allRecordsForSpecies )))
66+ newRecords <- dplyr :: bind_rows(newRecords ,temp.tri )
6867}
6968
7069# write TSV to disk
71- print(" Output table:" );
72- print(str(newRecords ));
73- write.table(newRecords , file = " kinship.txt" , append = FALSE ,row.names = F ,quote = F ,sep = " \t " );
70+ write.table(newRecords , file = " kinship.txt" , append = FALSE , row.names = FALSE , quote = FALSE , sep = ' \t ' )
0 commit comments