vagrantDnaSim/07transformData.R at main · SBCSnicholsLab/vagrantDnaSim · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
######################################
# Prepare data for general method ####
######################################

# set constants
# Positional command-line arguments (those after the script name):
#   1: nSamp - kept as the character string supplied; presumably the number of
#              samples (only referenced in commented-out code below)
#   2: tDir  - prefix pasted verbatim in front of every input/output file
#              name, so a directory must end in a path separator
cliArgs <- commandArgs(trailingOnly = TRUE)
if (length(cliArgs) < 2) {
  # Fail here with a clear message instead of later with an obscure
  # "cannot open file 'NAbpMapped'" from read.table().
  stop("Usage: Rscript 07transformData.R <nSamp> <tDir>", call. = FALSE)
}
nSamp <- cliArgs[1]
tDir <- cliArgs[2]
extraNucLen <- 16000


# a utility function to rename the mapping depth vals
#
# Args:
#   x:         table whose first column holds "/"-separated paths and whose
#              third column holds the values to keep.
#   component: index of the "/"-separated path component that carries the
#              sample label (default 4, as before).
#   first, last: character positions of the label inside that component,
#              passed to substr() (default 2 to 4, as before).
#
# Returns:
#   The third column of x, named by the labels extracted from the first column.
getNums <- function(x, component = 4L, first = 2L, last = 4L) {
  paths <- as.character(x[, 1])
  # vapply() guarantees a character vector whatever the input length.
  labels <- vapply(
    strsplit(paths, "/", fixed = TRUE),
    function(parts) {
      if (length(parts) < component) {
        stop("path has fewer than ", component,
             " '/'-separated components: ", paste(parts, collapse = "/"),
             call. = FALSE)
      }
      substr(parts[[component]], first, last)
    },
    character(1),
    USE.NAMES = FALSE
  )
  a <- x[, 3]
  names(a) <- labels
  a
}
# Load the tab-separated "bpMapped" and "bpTotal" tables found under tDir and
# collapse each one straight into a named vector with getNums().
nMapped <- getNums(
  read.table(
    file = paste0(tDir, "bpMapped"),
    sep = "\t",
    stringsAsFactors = FALSE
  )
)
#head(nMapped)

nTot <- getNums(
  read.table(
    file = paste0(tDir, "bpTotal"),
    sep = "\t",
    stringsAsFactors = FALSE
  )
)
#head(nTot)

#mappingProp <- nMapped / nTot
##hist(mappingProp, breaks=20)


#################
# Genotype calls

gts <- read.table(
  file = paste0(tDir, "genotypes.csv"),
  header = TRUE,
  check.names = FALSE # keep the column names exactly as written in the file
)
#head(gts)
# counts of each allele, some are missing (-1)
#table(as.vector(unlist(gts[,1:nSamp])))

# Low-coverage samples could be dropped here; currently every column is kept.
gtsHC <- gts

# visualise missingness
#image(as.matrix(gtsHC[,1:nSamp]) == -1)

# remove individuals with missing data
#indMissing <- apply(gtsHC, 2, function(x) -1 %in% x)
#gtsHC <- gtsHC[,!indMissing]
#head(gtsHC)

# # remove any site with missing data
# posMissing <- gtsHC$POS[apply(gtsHC, 1, function(x) -1 %in% x)]
# gtsHC <- gtsHC[,!gtsHC$POS %in% posMissing]

# All columns but one count as individuals: the code further down reads the
# first nindFilt column names as samples and takes positions from gtsHC$POS.
nindFilt <- dim(gtsHC)[2] - 1
###############
# Allele counts

allCounts <- read.table(file = paste0(tDir, "alleleCounts.csv"))
#head(allCounts)
# No individuals are dropped at present; filter here if that is ever wanted.
allCountsHC <- allCounts

# (disabled) drop the individuals flagged in indMissing above
#allCountsHC <- allCountsHC[,!c(indMissing, F)]
#head(allCountsHC)
# One data frame per allele code (1 to 4, in that order), each holding the
# rows of allCountsHC whose `allele` column equals that code.
gtList <- lapply(seq_len(4), function(alleleCode) {
  isThisAllele <- allCountsHC$allele == alleleCode
  allCountsHC[isThisAllele, ]
})

#lapply(gtList, dim)
# 12442*4 = 49768

# flatten into four vectors
# Each per-allele data frame is concatenated column by column (sample after
# sample) into one long vector.
#rm(c) # In case somebody named a variable c, which would shadow the c function.
gtVectors <- lapply(gtList, function(x){
  # seq_len() avoids the 1:0 trap when there are no sample columns, and
  # drop = FALSE keeps a data frame (a list, as do.call needs) even when there
  # is only a single sample column.
  do.call(c, x[, seq_len(nindFilt), drop = FALSE])
})
#index of samples
# Each sample name is repeated once per row of the per-allele table, matching
# the column-by-column order of gtVectors.
sampleIndex <- rep(colnames(gtsHC)[seq_len(nindFilt)],
                   each = nrow(gtList[[1]]))
#str(gtVectors)

# Long-format allele counts: one row per sample/position pair.
#   ref = counts from the first per-allele vector
#   alt = sum of the counts from the other three per-allele vectors
# (earlier variant that encoded positions as zero-padded "S" labels:)
#allDF <- data.frame(sample=sampleIndex, pos = paste0("S",rep(sprintf("%05d",gtsHC$POS), nindFilt)),
#                    ref=gtVectors[[1]], alt=gtVectors[[2]] + gtVectors[[3]] + gtVectors[[4]])
allDF <- data.frame(
  sample = sampleIndex,
  pos = rep(gtsHC$POS, times = nindFilt),
  ref = gtVectors[[1]],
  alt = gtVectors[[2]] + gtVectors[[3]] + gtVectors[[4]]
)
#head(allDF)
#str(allDF)

# Total depth at each sample/position (ref + alt counts) and the share of
# that depth contributed by the alt alleles.
siteDeps <- rowSums(allDF[, c("ref", "alt")])
altRatio <- allDF[["alt"]] / siteDeps


# Data frame holding everything the analysis needs:
# sample, position, alt-allele proportion and depth.
mainDF <- data.frame(
  Sample = as.character(allDF[["sample"]]),
  Position = allDF[["pos"]],
  AltProp = altRatio,
  DP = allDF[["ref"]] + allDF[["alt"]],
  stringsAsFactors = FALSE,
  row.names = NULL
)
#head(mainDF)
# Add mapping rate, make a DF so we can use "merge"
#mappingProp

# Per-sample lookup table (sample label plus its nMapped and nTot values),
# built as a data frame so that merge() can attach it to every row of mainDF.
# NOTE(review): nTot is paired with nMapped by position, not by name; this
# assumes both input tables list the samples in the same order - confirm.
propDF <- data.frame(
  Sample = names(nMapped),
  #mappingrate=mappingProp,
  nMapped = nMapped,
  nTot = nTot,
  stringsAsFactors = FALSE
)
#head(propDF)
mainDF <- merge(x = mainDF, y = propDF, by = "Sample")

#head(mainDF)
# Log-transformed alt proportion; infinite values (log of 0) become NA,
# which makes plotting easier.
mainDF$ylog <- log(mainDF$AltProp)
mainDF[is.infinite(mainDF$ylog), "ylog"] <- NA
#mainDF$xnqlogis <- -qlogis(mainDF$mappingrate)
#mainDF$xlog <- log(mainDF$mappingrate)

#head(mainDF)

# Keep only the rows where both the proportion and its log are available.
mainDF <- mainDF[!(is.na(mainDF$AltProp) | is.na(mainDF$ylog)), ]
#mainDF <- mainDF[!is.na(mainDF$xnqlogis),]
#mainDF <- mainDF[!is.na(mainDF$xnqlogis),]

# Save the transformed table next to the inputs.
# NOTE(review): write.table() runs with its defaults here, so fields are
# separated by spaces even though the file name ends in ".csv" - confirm this
# is what the downstream readers expect.
write.table(
  x = mainDF,
  file = paste0(tDir, "transformedData.csv")
)