-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathBaseSpaceDownload.sh
More file actions
122 lines (107 loc) · 6.08 KB
/
BaseSpaceDownload.sh
File metadata and controls
122 lines (107 loc) · 6.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#Download FASTQ files from BaseSpace using API
#All fastq.gz files from a project are downloaded
#Optionally, some run statistics are reported
#Access token must be in the text file (token_header.txt) containing one line of text:
#header = "x-access-token: <your-token-here>"
#projectID (and runID) must be specified within the script (see below)
#REQUIRES: GNU parallel, curl
#
#----------------------------------------------------------------------------
#Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2020
#tomas.fer@natur.cuni.cz
#----------------------------------------------------------------------------
#ID of the BaseSpace project to download - fill it in before running the script
projectID=""
#Test if prerequisites are installed
#Every external program used below must be resolvable by the shell. If one is
#missing, the message goes to stderr and the script stops with a non-zero
#status, so that a calling script or an '&&' chain can tell nothing was done.
for i in curl parallel; do
  if ! command -v "$i" >/dev/null 2>&1; then
    echo "Please install '$i' first" >&2
    exit 1
  fi
done
#Test if result files exist
#checkResultFiles: refuse to continue while output of an earlier run is still
#present in the current directory, so that it is neither overwritten nor
#appended to.
#Outputs: a message naming the first offending file (stderr)
#Exits:   with status 1 as soon as one of the files is found
checkResultFiles() {
  local resultFile
  for resultFile in sampleTable.txt fileTable.txt filesList.txt samplesList.txt JSONproject.txt JSONsamples.txt; do
    if [[ -f $resultFile ]]; then
      #double quotes on purpose: the file name has to be expanded inside the message
      echo "The file '$resultFile' already exists. Delete it or rename before running this script again..." >&2
      exit 1
    fi
  done
}
checkResultFiles
#Samples from projects
#Get info about samples
echo -e "\nGetting info about samples in the project ${projectID}..."
#The URL is quoted because it contains '?', which the shell treats as a glob character.
#stderr of curl (progress meter) is discarded; a failed transfer is detected
#through the exit status instead.
if ! curl -L -J --config ./token_header.txt "https://api.basespace.illumina.com/v1pre3/projects/${projectID}/samples?Limit=1000" 2>/dev/null > JSONproject.txt; then
  #remove the useless answer, otherwise the next run stops at the "already exists" test
  rm -f JSONproject.txt
  echo -e "\nCould not download the list of samples of the project '$projectID' (check token_header.txt and the network connection)\n" >&2
  exit 1
fi
#Check whether the project exists
#An answer that contains the word 'Error' is treated as a refused request
if grep -q Error "JSONproject.txt"; then
  echo -e "\nYou are not permitted to access the project '$projectID'\n" >&2
  exit 1
fi
#Get sample numbers
#All lists written here hold one entry per line, extracted from JSONproject.txt
#BaseSpace number of a sample: third '/'-separated part of every "Href" value containing /samples
grep -Po '"Href":.*?[^\\]",' JSONproject.txt \
  | grep "/samples" \
  | awk -F'"' '{print $4}' \
  | awk -F/ '{print $3}' > samplesList.txt
#Sample IDs and library names: 4th '"'-separated column of the match
grep -Po '"SampleId":.*?[^\\]",' JSONproject.txt | awk -F'"' '{print $4}' > sampleID.txt
grep -Po '"LibraryName":.*?[^\\]",' JSONproject.txt | awk -F'"' '{print $4}' > libName.txt
#Read counts: 3rd column of the match with ':' and ',' removed
grep -Po '"TotalReadsPF":.*?[^\\]",' JSONproject.txt | awk -F'"' '{print $3}' | sed 's/[:,]//g' > readsPF.txt
#Only the first experiment name found is kept
expName=$(grep -Po '"ExperimentName":.*?[^\\]"' JSONproject.txt | awk -F'"' '{print $4}' | head -n1)
echo "There are" $(wc -l < samplesList.txt) "samples in the project '$expName'"
#Make samples table
#Header line first, then the four lists side by side (tab-separated)
{
  printf 'SampleID\tName\tReadsPF\tBaseSpaceID\n'
  paste sampleID.txt libName.txt readsPF.txt samplesList.txt
} > sampleTable.txt
#The BaseSpace numbers (samplesList.txt) stay on disk, the other lists are temporary
rm sampleID.txt libName.txt readsPF.txt
#Get file numbers
#listProjectFiles: ask BaseSpace for the gz files of every sample listed in
#samplesList.txt and collect their IDs, sizes and names.
#Reads:  samplesList.txt (one BaseSpace sample number per line), token_header.txt
#Writes: filesList.txt   - file IDs, one per line
#        fileTable.txt   - header plus tab-separated name, size and ID of every file
#        JSONsamples.txt - answer for the sample processed last
listProjectFiles() {
  local sampleNumber
  echo -e "\nGetting info about files in the project ${projectID}..."
  #The loop below only appends. Starting from empty lists keeps leftovers of an
  #interrupted run out of the table and makes a project without samples work.
  : > filesList.txt
  : > filesSize.txt
  : > filesNames.txt
  while IFS= read -r sampleNumber; do
    [[ -n $sampleNumber ]] || continue
    #download information about files for particular sample
    #(URL quoted: '?' is a glob character for the shell)
    curl -L -J --config ./token_header.txt "https://api.basespace.illumina.com/v1pre3/samples/${sampleNumber}/files?Extensions=gz" 2>/dev/null > JSONsamples.txt
    #get 'Id', display only them and add it to the IDs list
    grep -Po '"Id":.*?[^\\]",' JSONsamples.txt | awk -F'"' '{print $4}' >> filesList.txt
    #get sizes
    grep -Po '"Size":.*?[^\\]"' JSONsamples.txt | awk -F'"' '{print $3}' | sed 's/[:,]//g' >> filesSize.txt
    #get file names
    grep -Po '"Path":.*?[^\\]"' JSONsamples.txt | awk -F'"' '{print $4}' >> filesNames.txt
  done < samplesList.txt
  echo "There are" $(wc -l < filesList.txt) "files"
  #Make table
  echo -e "FileName\tSize\tBaseSpaceID" > fileTable.txt
  paste filesNames.txt filesSize.txt filesList.txt >> fileTable.txt
  #names and sizes are kept in fileTable.txt only
  rm filesNames.txt filesSize.txt
}
listProjectFiles
#Download individual files using parallel
#parallel reads the file IDs from filesList.txt and starts one curl per ID,
#putting the ID in place of '{}' in the URL
printf '\nDownloading fastq.gz files...\n'
parallel 'curl -L -J --config token_header.txt https://api.basespace.illumina.com/v1pre3/files/{}/content -O' < filesList.txt
#Check file sizes of downloaded files (if they match sizes stated by BaseSpace)
#Download incorrectly downloaded files

#sizeMatches FILE SIZE: succeed when FILE exists and is exactly SIZE bytes long
sizeMatches() {
  local actualSize
  actualSize=$(stat -c %s -- "$1" 2>/dev/null) || return 1
  #an empty SIZE would be evaluated as 0, hence the explicit test
  [[ -n $2 && $actualSize -eq $2 ]]
}

#verifyDownloads: compare every file listed in fileTable.txt with the size
#stated by BaseSpace and download again whatever does not match.
#Reads:   fileTable.txt (header line, then tab-separated name, size, BaseSpace ID),
#         token_header.txt
#Outputs: one status line per file (stdout); a message on stderr for each file
#         that is still wrong after maxAttempts downloads
#Returns: 0 when all sizes match in the end, 1 otherwise
verifyDownloads() {
  local maxAttempts=5 #upper limit, so that a permanent failure cannot loop forever
  local fileName fileSizeBS fileBS attempt failed=0
  echo -e "\nChecking whether file sizes are correct..."
  #fields are split on tabs only, so file names containing spaces stay intact
  while IFS=$'\t' read -r fileName fileSizeBS fileBS; do
    [[ -n $fileName ]] || continue
    if sizeMatches "$fileName" "$fileSizeBS"; then
      echo "${fileName} size OK"
      continue
    fi
    echo -e "${fileName} size incorrect. Downloading again...\n"
    mv -- "$fileName" "${fileName}.bak" 2>/dev/null #make a backup of the wrongly downloaded file
    attempt=0
    until sizeMatches "$fileName" "$fileSizeBS"; do
      if (( attempt >= maxAttempts )); then
        echo "${fileName} still has a wrong size after ${maxAttempts} attempts, giving up" >&2
        failed=1
        break
      fi
      attempt=$((attempt + 1))
      #curl may refuse to overwrite an existing file when -J is in effect,
      #so an incomplete result of the previous attempt has to go first
      rm -f -- "$fileName"
      curl -L -J --config token_header.txt "https://api.basespace.illumina.com/v1pre3/files/${fileBS}/content" -O
      echo
    done
  done < <(sed '1d' fileTable.txt)
  return "$failed"
}
verifyDownloads
#Closing message: number of lines in filesList.txt, experiment name and project ID
echo -e "\nFinished downloading" $(cat filesList.txt | wc -l) "files from the project '$expName' (ID: $projectID)"
#The script ends here; everything below runs only after this line is removed
exit #Remove if you want to continue with getting info about a run
#Download information about specific run
#Specify the run ID here
runID=

#runText KEY: print the first quoted value stored under "KEY" in JSONrun.txt
#(':' and ',' are removed from the text)
runText() {
  grep -Po '"'"$1"'":.*?[^\\]"' JSONrun.txt | head -n1 | awk -F'"' '{print $4}' | sed 's/[:,]//g'
}

#runNumber KEY: print the first unquoted value stored under "KEY" in JSONrun.txt
#(the ':', ',' and '}' surrounding the value in the JSON text are removed)
runNumber() {
  grep -Po '"'"$1"'":.*?[^\\]"' JSONrun.txt | head -n1 | awk -F'"' '{print $3}' | sed 's/[:,}]//g'
}

#writeRunTable: write runTable.txt, one tab-separated "label value" row per statistic.
#Each row looks up its own field, so a label always stands next to the value
#it names and a field missing from JSONrun.txt leaves only its own value empty.
#Reads:  JSONrun.txt
#Writes: runTable.txt
writeRunTable() {
  {
    printf 'ExperimentName\t%s\n' "$(runText ExperimentName)"
    printf 'PlatformName\t%s\n' "$(runText PlatformName)"
    printf 'TotalYield[Gbp]\t%s\n' "$(runNumber YieldTotal)"
    printf 'Clusters\t%s\n' "$(runNumber Clusters)"
    printf 'ClustersPF\t%s\n' "$(runNumber ClustersPf)"
    printf 'PercentPF\t%s\n' "$(runNumber PercentPf)"
    printf 'PercentGtQ30\t%s\n' "$(runNumber PercentGtQ30)"
    printf 'PercentGtQ30R1\t%s\n' "$(runNumber PercentGtQ30R1)"
    printf 'PercentGtQ30R2\t%s\n' "$(runNumber PercentGtQ30R2)"
  } > runTable.txt
}

#Run details (total yield, number of clusters, total reads, total reads PF...)
echo "Getting info about run ${runID}"
#Get info about the run
curl -L -J --config ./token_header.txt "https://api.basespace.illumina.com/v1pre3/runs/${runID}" 2>/dev/null > JSONrun.txt
#make a table
writeRunTable
echo -e "Summary of the run $runID is in 'runTable.txt'"