-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdefault-config.yaml
More file actions
216 lines (188 loc) · 8.9 KB
/
default-config.yaml
File metadata and controls
216 lines (188 loc) · 8.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
###
# Config file for the STARCall processing pipeline.
# Make a copy of this file and change any relevant parameters.
###
---
# Scale of the objective used to image the phenotyping and genotyping images.
# The absolute value of each is not important, but the relative ratio
# is used to scale images up and down. Both should be integers.
phenotype_scale: 20
bases_scale: 10

# The channels imaged for sequencing and phenotyping images.
# Channels for each base should be labeled as such so bases
# are called properly.
sequencing_channels: ['DAPI', 'GFP', 'G', 'T', 'A', 'C']
phenotyping_channels: ['DAPI', 'GFP', 'Ph+WGA', 'Mito']

# Grid size for different processing steps. Input images are split
# into an X by X grid, the processing is run, then the resulting images
# are merged back together.
# This can be disabled by setting the size to 1 or by not setting the variable.
segmentation_grid_size: 4
sequencing_grid_size: 4
phenotyping_grid_size: 12

# Segmentation grid sizes can be further specified; this is really only useful
# if you are running segmentation on the GPU and need very fine-grained control.
#segmentation_cells_grid_size: 6
#segmentation_nuclei_grid_size: 3
stitching:
  # The channel used to align images; can be an index or the string
  # name of one of the channels above. If a string, it should be
  # present in both sequencing and phenotyping channels, to align between them.
  #channel: 0
  channel: 'GFP'

  # The amount of overlap in pixels between generated tiles. This should be
  # larger than a cell, so that all cells are fully contained in at least
  # one tile.
  overlap: 100

  # The degree to which the stitching algorithm will align to sub-pixel
  # precision, so 16 means alignment is done to 1/16 of a pixel. This requires
  # minimal extra processing and improves alignment up to 16, past which
  # further improvements are small.
  # This can be disabled by not setting this or by setting it to 1.
  subpixel_alignment: 16

  # Solving method for alignment. 'mae' (mean absolute error minimizing solver)
  # tends to work best in most situations. Other options are 'mse' (mean
  # squared error minimizing solver) and 'spantree' (spanning tree).
  solver: 'mae'

  # The limit on the number of pairwise alignments between cycles. E.g., if
  # the limit is 5, cycle 0 would be aligned to cycle 4 but not cycle 5.
  # This reduces the processing greatly when the number of cycles is large.
  max_cycle_pairs: 16

  # The method used to combine overlapping tiles. 'mean' takes the mean;
  # 'nearest' crops each tile so tiles do not overlap.
  merger: 'nearest'

  # This section holds all of the parameters for ASHLAR stitching. ASHLAR may
  # provide better stitching than ConStitch, but note that ASHLAR cannot be
  # used when the phenotype cycles are imaged at a different scale.
  ashlar:
    flip_x: false
    flip_y: false
    transpose: false
    interp: false
    filter_sigma: null
    maximum_shift: 1000
    ashlar_executable: 'ashlar'
    ashlar_overlap: 0.15
segmentation:
  # Diameter passed to cellpose, in pixels of the phenotype images.
  diameter: 100

  # Method to segment nuclei. Can be 'otsu_threshold' or 'stardist'.
  nuclei_method: 'stardist'

  # Method to segment cells. Can be 'cellpose' or 'stardist'.
  cells_method: 'cellpose'

  # Channels used to run cellpose; can be indices or strings from the
  # list of phenotype channels above. The first channel should be the
  # nuclear channel and the second the cytoplasm channel.
  channels: ['DAPI', 'Ph+WGA']

  # Whether the cell and nuclei masks should be linked together.
  # Only necessary if you are going to use the nuclei masks, for example
  # in phenotyping.
  match_masks: true

  # Whether cell segmentation is run on background-corrected images.
  # If both this and phenotyping.use_corrected are false, background
  # correction will not be run.
  use_corrected: false
dotdetection:
  # Parameters for the LoG (Laplacian of Gaussian) blob detection algorithm
  # that detects dots. These values should work well for most images, but if
  # dot detection is not working well, they can be adjusted.
  min_sigma: 1
  max_sigma: 3
  num_sigma: 7
read_clustering:
  # Parameters for clustering nearby reads into single reads of the same
  # sequence. Distances between reads are calculated from three values:
  # positional distance, cosine distance between their read values extracted
  # from the sequencing images, and edit distance between their sequences.
  # The current parameter values (weights of 99999, 0, 1; 'max' linkage and
  # a threshold of 0.5) simply combine all reads that have the same sequence.
  # This system is still being tested, and experimentation with the parameters
  # is encouraged.
  normalization: 'none'
  positional_weight: 99999
  value_weight: 0
  sequence_weight: 1

  # These distances are then clustered with agglomerative clustering, specified
  # with linkage (min, mean, max) and threshold. More on agglomerative
  # clustering can be found here:
  # https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering
  linkage: 'max'
  threshold: 0.5
sequencing:
  # The max number of reads kept for each cell. A good number for this is
  # usually 4-8, but it depends on how prevalent errors are and on the size
  # of the barcode library.
  max_reads: 4
phenotyping:
  # Whether phenotyping is run on background-corrected images.
  # If both this and segmentation.use_corrected are false, background
  # correction will not be run.
  # NOTE(review): the original comment said "cell segmentation" and referenced
  # sequencing.use_corrected, but the only other use_corrected key in this file
  # is segmentation.use_corrected — verify against the pipeline code.
  use_corrected: false

  # The executable to run to invoke CellProfiler. If CellProfiler is installed
  # in a different conda environment, this should reflect that.
  cellprofiler_executable: 'cellprofiler'
  #cellprofiler_executable: '~/miniconda3/envs/cp4/bin/cellprofiler'
# Directory that contains the raw .nd2 microscope files. It should
# contain one subdirectory for each cycle, each with one nd2 file for each well.
# If your files are not in nd2 format, they can be converted to tif files
# and placed in the input directory, described below.
# For example:
# rawinput/
# ├── 20231215_180258_893
# │   ├── Well1_ChannelDAPI,GFP,G,T,A,C_Seq0000.nd2
# │   │   ...
# │   └── Well6_ChannelDAPI,GFP,G,T,A,C_Seq0003.nd2
# │   ...
# ├── 20231219_001028_540
# │   ├── Well1_ChannelDAPI,GFP,G,T,A,C_Seq0000.nd2
# │   │   ...
# │   └── Well6_ChannelDAPI,GFP,G,T,A,C_Seq0003.nd2
# └── phenotype
#     ├── Well1_Channel408 nm,473 nm,545 nm,635 nm_Seq0000.nd2
#     │   ...
#     └── Well6_Channel408 nm,473 nm,545 nm,635 nm_Seq0003.nd2
rawinput_dir: 'rawinput/'
# Directory that contains the input in .tif file form. If the data is
# present in the rawinput dir, this will be automatically filled.
# The directory should contain one subdirectory for each well, each
# with a subdirectory for each cycle, containing 'raw.tif' with the unstitched
# tile images and 'positions.csv' with the positions of each tile.
# An additional subdirectory 'auxdata' can be provided containing a barcode
# library.
# For example:
# input/
# ├── auxdata
# │   └── barcodes.csv
# └── well1
#     ├── cycle00
#     │   ├── positions.csv
#     │   └── raw.tif
#     │   ...
#     ├── cycle11
#     │   ├── positions.csv
#     │   └── raw.tif
#     └── cyclePT
#         ├── positions.csv
#         └── raw.tif
# ...
input_dir: 'input/'
# Directory that contains all files generated when stitching unstitched
# tiled image sets. If your input images are already stitched, you can
# place them in this directory, following the pattern 'well{well}/raw.tif',
# where the tiff file is of shape (num_cycles, num_channels, width, height).
stitching_dir: 'stitching/'
# Directory containing all files generated when performing cell segmentation.
segmentation_dir: 'segmentation/'
# Directory that contains all files generated when calling the in situ
# sequencing reads.
sequencing_dir: 'sequencing/'
# Directory that contains all files generated when calculating features and
# phenotyping cells.
phenotyping_dir: 'phenotyping/'
# Directory for the final output.
output_dir: 'output/'
# If necessary, the individual wells and cycles can be specified here; otherwise
# they will be determined automatically from the folders in rawinput/ or input/.
#wells: ['B2', 'C2']
#cycles: ['00', '01', '02', '03', '04', '05', '06', '07']
# The prefix for phenotyping cycle directories in rawinput/: all directories
# that begin with this string, such as phenotype/, phenotype2/, or
# phenotype_20250225/, are considered phenotyping cycles.
phenotype_date: 'phenotype'
# The cycle name for each phenotyping cycle, to be stored in input/ and such.
#phenotype_cycles: ['PT']