spatial-transcriptomics-wf/workflows/spatial_visium/main.wdl at main · DNAstack/spatial-transcriptomics-wf · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
version 1.0

# Harmonized human PMDBS and non-human spatial transcriptomics workflow entrypoint for 10x Visium data

import "structs.wdl"
import "../../wf-common/wdl/tasks/get_workflow_metadata.wdl" as GetWorkflowMetadata
import "preprocess/preprocess.wdl" as Preprocess
import "cohort_analysis/cohort_analysis.wdl" as CohortAnalysis

workflow spatial_visium_analysis {
	input {
		Array[Project] projects

		File spaceranger_reference_data
		File? visium_probe_set_csv

		# Processing parameters
		Int filter_cells_min_counts = 5000
		Int filter_cells_min_genes = 3000
		Int filter_genes_min_cells = 10
		Float filter_mt_max_percent = 0.2
		Float normalize_target_sum = 10000
		Int n_top_genes = 3000
		Int n_comps = 30
		String batch_key = "batch_id"
		Float leiden_resolution = 0.4

		String container_registry
		String zones = "us-central1-c us-central1-f"
	}

	String workflow_execution_path = "workflow_execution"
	String workflow_name = "spatial_visium"
	String workflow_version = "v1.0.1"
	String workflow_release = "https://github.com/ASAP-CRN/spatial-transcriptomics-wf/releases/tag/spatial_visium_analysis-~{workflow_version}"

	call GetWorkflowMetadata.get_workflow_metadata {
		input:
			zones = zones
	}

	scatter (project in projects) {
		String project_raw_data_path_prefix = "~{project.raw_data_bucket}/~{workflow_execution_path}/~{workflow_name}"

		call Preprocess.preprocess {
			input:
				team_id = project.team_id,
				dataset_id = project.dataset_id,
				dataset_doi_url = project.dataset_doi_url,
				samples = project.samples,
				spaceranger_reference_data = spaceranger_reference_data,
				visium_probe_set_csv = visium_probe_set_csv,
				workflow_name = workflow_name,
				workflow_version = workflow_version,
				workflow_release = workflow_release,
				run_timestamp = get_workflow_metadata.timestamp,
				raw_data_path_prefix = project_raw_data_path_prefix,
				billing_project = get_workflow_metadata.billing_project,
				container_registry = container_registry,
				zones = zones
		}

		Array[String] preprocessing_output_file_paths = flatten([
			preprocess.raw_counts,
			preprocess.filtered_counts,
			preprocess.molecule_info,
			preprocess.metrics_summary_csv,
			preprocess.spatial_outputs_tar_gz,
			flatten(preprocess.spatial_images),
			preprocess.scalefactors_json,
			preprocess.tissue_positions_csv,
			preprocess.spatial_enrichment_csv,
			preprocess.initial_adata_object,
			preprocess.qc_adata_object
		]) #!StringCoercion

		if (project.run_project_cohort_analysis) {
			call CohortAnalysis.cohort_analysis as project_cohort_analysis {
				input:
					cohort_id = project.team_id,
					project_sample_ids = preprocess.project_sample_ids,
					preprocessed_adata_objects = preprocess.qc_adata_object,
					preprocessing_output_file_paths = preprocessing_output_file_paths,
					filter_cells_min_counts = filter_cells_min_counts,
					filter_cells_min_genes = filter_cells_min_genes,
					filter_genes_min_cells = filter_genes_min_cells,
					filter_mt_max_percent = filter_mt_max_percent,
					normalize_target_sum = normalize_target_sum,
					n_top_genes = n_top_genes,
					n_comps = n_comps,
					batch_key = batch_key,
					leiden_resolution = leiden_resolution,
					workflow_name = workflow_name,
					workflow_version = workflow_version,
					workflow_release = workflow_release,
					run_timestamp = get_workflow_metadata.timestamp,
					raw_data_path_prefix = project_raw_data_path_prefix,
					staging_data_buckets = project.staging_data_buckets,
					billing_project = get_workflow_metadata.billing_project,
					container_registry = container_registry,
					zones = zones
			}
		}
	}

	output {
		# Sample-level outputs
		## Sample list
		Array[Array[Array[String]]] project_sample_ids = preprocess.project_sample_ids

		## Preprocess
		Array[Array[File]] raw_counts = preprocess.raw_counts
		Array[Array[File]] filtered_counts = preprocess.filtered_counts
		Array[Array[File]] molecule_info = preprocess.molecule_info
		Array[Array[File]] metrics_summary_csv = preprocess.metrics_summary_csv
		Array[Array[File]] spatial_outputs_tar_gz = preprocess.spatial_outputs_tar_gz
		Array[Array[Array[File]]] spatial_images = preprocess.spatial_images
		Array[Array[File]] scalefactors_json = preprocess.scalefactors_json
		Array[Array[File]] tissue_positions_csv = preprocess.tissue_positions_csv
		Array[Array[File]] spatial_enrichment_csv = preprocess.spatial_enrichment_csv
		Array[Array[File]] initial_adata_object = preprocess.initial_adata_object
		Array[Array[File]] qc_adata_object = preprocess.qc_adata_object

		# Project cohort analysis outputs
		## List of samples included in the cohort
		Array[File?] project_cohort_sample_list = project_cohort_analysis.cohort_sample_list

		# Merged, processed (filtered, normalized, dimensionality reduced), integrated, and clustered adata objects, and plots
		Array[File?] project_merged_adata_object = project_cohort_analysis.merged_adata_object
		Array[File?] project_merged_metadata_csv = project_cohort_analysis.merged_metadata_csv
		Array[File?] project_all_genes_csv = project_cohort_analysis.all_genes_csv
		Array[File?] project_hvg_genes_csv = project_cohort_analysis.hvg_genes_csv
		Array[Array[File]?] project_qc_plots_png = project_cohort_analysis.qc_plots_png
		Array[File?] project_processed_adata_object = project_cohort_analysis.processed_adata_object
		Array[File?] project_hvg_plot_png = project_cohort_analysis.hvg_plot_png
		Array[File?] project_integrated_adata_object = project_cohort_analysis.integrated_adata_object
		Array[File?] project_clustered_adata_object = project_cohort_analysis.clustered_adata_object
		Array[File?] project_umap_cluster_plots_png = project_cohort_analysis.umap_cluster_plots_png

		# Image features outputs
		Array[File?] project_spatial_scatter_plot_png = project_cohort_analysis.spatial_scatter_plot_png

		# Spatial statistics outputs
		Array[File?] project_final_adata_object = project_cohort_analysis.final_adata_object
		Array[File?] project_final_metadata_csv = project_cohort_analysis.final_metadata_csv
		Array[File?] project_moran_top_10_variable_genes_csv = project_cohort_analysis.moran_top_10_variable_genes_csv
		Array[File?] project_moran_top_4_variable_genes_spatial_scatter_plot_png = project_cohort_analysis.moran_top_4_variable_genes_spatial_scatter_plot_png

		Array[Array[File]?] preprocess_manifests = project_cohort_analysis.preprocess_manifest_tsvs
		Array[Array[File]?] project_manifests = project_cohort_analysis.cohort_analysis_manifest_tsvs
	}

	meta {
		description: "Harmonized human postmortem-derived brain sequencing (PMDBS) and non-human spatial transcriptomics workflow for 10x Visium data"
	}

	parameter_meta {
		projects: {help: "The project ID, set of samples and their associated reads and metadata, output bucket locations, and whether or not to run project-level downstream analysis."}
		spaceranger_reference_data: {help: "Space Ranger transcriptome reference data; see https://www.10xgenomics.com/support/software/space-ranger/downloads."}
		visium_probe_set_csv: {help: "Visium probe-based assays target genes in Space Ranger transcriptome; see https://www.10xgenomics.com/support/software/space-ranger/downloads."}
		filter_cells_min_counts: {help: "Minimum number of counts required for a cell to pass filtering. [5000]"}
		filter_cells_min_genes: {help: "Minimum number of genes required for a cell to pass filtering. [3000]"}
		filter_genes_min_cells: {help: "Minimum number of cells expressed required for a gene to pass filtering. [10]"}
		filter_mt_max_percent: {help: "Maximum percentage of mitochondrial read counts for a cell to pass filtering. [0.2]"}
		normalize_target_sum: {help: "The total count to which each cell's gene expression values will be normalized. [10000]"}
		n_top_genes: {help: "Number of highly-variable genes to keep. [3000]"}
		n_comps: {help: "Number of principal components to compute. [30]"}
		batch_key: {help: "Key in AnnData object for batch information. ['batch_id']"}
		leiden_resolution: {help: "Value controlling the coarseness of the Leiden clustering. [0.4]"}
		container_registry: {help: "Container registry where workflow Docker images are hosted."}
		zones: {help: "Space-delimited set of GCP zones where compute will take place. ['us-central1-c us-central1-f']"}
	}
}