-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathBuild_counts_table_without_metadata.R
More file actions
74 lines (63 loc) · 4.63 KB
/
Build_counts_table_without_metadata.R
File metadata and controls
74 lines (63 loc) · 4.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
library(data.table)
library(ggplot2)
library(gridExtra)
library(pheatmap)
args <- commandArgs(trailingOnly = TRUE)
directory = args[1] # example: "/oak/stanford/groups/horence/Roozbeh/NOMAD_10x/runs/CCLE_all/"
#run_name = args[2] # the output files and plots will be generated in ${directory}/satc_with_metadata
datatype = args[2] # could be either "10x" or "non10x"
output_counts_folder_name = "satc_Peter_cdna_anchors"
############## reading in metadata and sample_conversion files #########################
sample_name_id_conversion = fread(paste(directory,"/sample_name_to_id.mapping.txt",sep=""),header = FALSE)
###############################################################
system(paste("mkdir ", directory,"/",output_counts_folder_name,sep="")) # this is the directory for the supervised analysis files
###### if you want to use a custom list of anchors
###### you need to comment out the "select the anchors when no specific list of anchors is provided" code block
###### which is selecting anchors based on reading in one
###### of the SPLASH output files and instead uncomment the following command:
###### selected_anchors = fread("/oak/stanford/groups/horence/dcotter1/sponge_compactors/anchors_from_PeterCDNAs_11-20_k-27_s-10.txt",header=F)
#############################################################################################################
##### select the anchors when no specific list of anchors is provided #######################################
#############################################################################################################
selected_anchors = fread(paste(directory,"result.after_correction.scores.top_target_entropy.tsv",sep=""),select = c("anchor", "effect_size_bin", "M", "number_nonzero_samples", "target_entropy"))
selected_anchors = selected_anchors[number_nonzero_samples>5]
selected_anchors = selected_anchors[target_entropy > 1]
selected_anchors = selected_anchors[M > 50]
print("finished reading in anchors file")
if (directory %like%"Sponge"){ # I added this to make sure that for sponge granny anchor is always included
selected_anchors1 = fread(paste(directory,"result.after_correction.all_anchors.tsv",sep=""),select = c("anchor", "effect_size_bin", "M", "number_nonzero_samples", "target_entropy"))
selected_anchors1 = selected_anchors1[anchor=="GCCATCAGAACCCCAGGAACCATCTAA" | anchor == "TCTAACCAGCCATCAGAACCCCAGGAA"]
if (nrow(selected_anchors1)>0){
selected_anchors = rbind(selected_anchors, selected_anchors1)
}
}
#############################################################################################################
#############################################################################################################
#############################################################################################################
##################################################################################
####### satc_dump for getting counts for the selected anchors ####################
write.table(selected_anchors$V1,paste(directory, "/",output_counts_folder_name,"/selected_anchors.txt",sep=""),sep="\t",row.names=FALSE,quote=FALSE,col.names = FALSE)
for (counter in 0:127){
system(paste("/oak/stanford/groups/horence/Roozbeh/software/R-NOMAD/bin/satc_dump --anchor_list ", paste(directory, "/",output_counts_folder_name,"/", "selected_anchors.txt ", sep = ""), directory, "result_satc/bin", counter, ".satc ", directory, "/",output_counts_folder_name,"/", "satcOut",counter,".tsv ", sep = ""))
}
# now below I read in satcout files generated by satc_dump
satcOut_files = list.files(path = paste(directory,"/",output_counts_folder_name,"/",sep=""), pattern = "satcOut")
counts = data.table()
for (counter in 1:length(satcOut_files)) {
counts = rbind(counts, fread(paste(directory,"/",output_counts_folder_name,"/",satcOut_files[counter], sep = "")))
print(counter)
}
if (datatype == "non10x"){
setnames(counts,c("V1","V2","V3","V4"),c("sample_id","anchor","target","count"))
} else if (datatype == "10x"){
setnames(counts,c("V1","V2","V3","V4","V5"),c("sample_id","cell_barcode","anchor","target","count"))
}
###################################################################################
###################################################################################
if (datatype=="10x"){
counts$sample_id = paste(counts$cell_barcode,counts$sample_id,sep="_")
counts[,cell_barcode:=NULL]
}
counts = merge(counts,sample_name_id_conversion,all.x=T,all.y=F,by.x="sample_id",by.y="V2")
setnames(counts,"V1","sample_name")
write.table(counts, paste(directory,"/",output_counts_folder_name,"/",output_counts_folder_name,"_counts.tsv", sep = ""), sep = "\t", row.names = FALSE, quote = FALSE)