This repository was archived by the owner on Dec 10, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdataLoader.py
More file actions
214 lines (193 loc) · 10 KB
/
dataLoader.py
File metadata and controls
214 lines (193 loc) · 10 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
""""
Script containing dataloader class
"""
## DataLoader credit to https://github.com/nejyeah/DeepPicker-python
import os
import mrcfile
import numpy as np
import scipy.ndimage as ndimage
from sklearn import preprocessing
from starReader import starRead
class DataLoader(object):
    """Static helpers that turn picked micrographs into training patches.

    Every method is a ``@staticmethod``; the class only serves as a namespace.
    Micrographs are read with ``mrcfile`` and particle coordinates come from a
    star file parsed by ``starRead``.
    """

    # Upper bound on random draws for one negative patch, so that a micrograph
    # with no legal background position raises instead of looping forever.
    _MAX_NEGATIVE_ATTEMPTS = 10000

    @staticmethod
    def _standardize(array):
        """Return ``array`` shifted to zero mean and scaled to unit std.

        A constant input has a standard deviation of zero; in that case the
        centred (all-zero) data is returned instead of dividing by zero and
        producing NaN/inf values.
        """
        centered = array - array.mean()
        std_value = array.std()
        if std_value == 0:
            return centered
        return centered / std_value

    @staticmethod
    def bin_2d(body_2d, bin_size):
        """Downsample a 2D array by averaging ``bin_size`` x ``bin_size`` tiles.

        Args:
            body_2d: 2D numpy array.
            bin_size: positive integer tile edge length.

        Returns:
            2D array of shape ``(rows // bin_size, cols // bin_size)``.
            Trailing rows/columns that do not fill a whole tile are dropped.
        """
        # Floor division: rounding up would request more rows/columns than
        # the array holds and make the reshape below fail.
        n_tiles_0 = body_2d.shape[0] // bin_size
        n_tiles_1 = body_2d.shape[1] // bin_size
        cropped = body_2d[:n_tiles_0 * bin_size, :n_tiles_1 * bin_size]
        tiles = cropped.reshape(n_tiles_0, bin_size, n_tiles_1, bin_size)
        return tiles.mean(axis=3).mean(axis=1)

    @staticmethod
    def preprocess_particle(particle, model_input_size):
        """Normalize one particle patch to zero mean and unit std.

        Args:
            particle: numpy array holding the patch.
            model_input_size: accepted for interface compatibility; it is
                currently unused, i.e. the patch is NOT resized here.

        Returns:
            The normalized patch (all zeros when the patch is constant).
        """
        return DataLoader._standardize(particle)

    @staticmethod
    def preprocess_micrograph_local(micrograph, bin_size):
        """Low-pass filter, bin and rescale a micrograph.

        Steps: Gaussian filter (sigma 0.5), ``bin_2d``, then
        ``sklearn.preprocessing.scale`` followed by ``MaxAbsScaler``.
        NOTE(review): with their default arguments both sklearn steps
        operate per column of the 2D array -- confirm this is intended.
        """
        # ``scipy.ndimage.filters`` is a deprecated namespace; the function
        # is available directly on ``scipy.ndimage``.
        micrograph = ndimage.gaussian_filter(micrograph, 0.5)
        micrograph = DataLoader.bin_2d(micrograph, bin_size)
        micrograph = preprocessing.scale(micrograph)
        return preprocessing.MaxAbsScaler().fit_transform(micrograph)

    @staticmethod
    def preprocess_micrograph_picker(micrograph, bin_size):
        """Low-pass filter, bin and globally normalize a micrograph.

        Steps: Gaussian filter (sigma 0.1), ``bin_2d``, then subtraction of
        the global mean and division by the global standard deviation.
        """
        micrograph = ndimage.gaussian_filter(micrograph, 0.1)
        micrograph = DataLoader.bin_2d(micrograph, bin_size)
        return DataLoader._standardize(micrograph)

    @staticmethod
    def load_trainData_From_RelionStarFile(starFileName, particle_size,
                                           model_input_size, validation_ratio,
                                           train_number, bin_size):
        """Build shuffled train/validation sets from a star file.

        Positive and negative patches are loaded with
        ``load_Particle_From_starFile``, truncated to the first
        ``train_number`` entries when more are available, shuffled
        independently, and split by ``validation_ratio``.

        Returns:
            ``(train_data, train_labels, validation_data, validation_labels)``
            where each data array holds the positives followed by the
            negatives and the labels are int64 ones followed by zeros.
        """
        positives, negatives = DataLoader.load_Particle_From_starFile(
            starFileName, particle_size, model_input_size, bin_size)
        if train_number < len(positives):
            positives = positives[:train_number, ...]
            negatives = negatives[:train_number, ...]
        np.random.shuffle(positives)
        np.random.shuffle(negatives)

        validation_size = int(validation_ratio * positives.shape[0])
        train_size = positives.shape[0] - validation_size

        validation_data = np.concatenate(
            (positives[:validation_size, ...],
             negatives[:validation_size, ...]))
        validation_labels = np.concatenate(
            (np.ones(validation_size, dtype=np.int64),
             np.zeros(validation_size, dtype=np.int64)))
        train_data = np.concatenate(
            (positives[validation_size:, ...],
             negatives[validation_size:, ...]))
        train_labels = np.concatenate(
            (np.ones(train_size, dtype=np.int64),
             np.zeros(train_size, dtype=np.int64)))

        print(train_data.shape, train_data.dtype)
        print(train_labels.shape, train_labels.dtype)
        print(validation_data.shape, validation_data.dtype)
        print(validation_labels.shape, validation_labels.dtype)
        return train_data, train_labels, validation_data, validation_labels

    @staticmethod
    def _pick_negative_center(xs, ys, radius, n_row, n_col, min_distance,
                              max_attempts=None):
        """Draw a random patch centre that is far from every particle.

        Args:
            xs, ys: particle centre coordinates (column and row indices).
            radius: half patch size; the centre is drawn so that the whole
                patch lies inside an ``n_row`` x ``n_col`` array.
            n_row, n_col: number of rows / columns of the micrograph array.
            min_distance: minimum Euclidean distance to every particle.
            max_attempts: number of draws before giving up; defaults to
                ``DataLoader._MAX_NEGATIVE_ATTEMPTS``.

        Returns:
            ``(x, y)`` integer centre coordinates.

        Raises:
            RuntimeError: if no legal centre is found in ``max_attempts``
                draws.
        """
        if max_attempts is None:
            max_attempts = DataLoader._MAX_NEGATIVE_ATTEMPTS
        xs = np.asarray(xs, dtype=np.float64)
        ys = np.asarray(ys, dtype=np.float64)
        for _ in range(max_attempts):
            # x indexes columns and y indexes rows of the image array.
            coor_x = np.random.randint(radius, n_col - radius)
            coor_y = np.random.randint(radius, n_row - radius)
            # The candidate must clear ALL particles, not just one of them.
            if xs.size == 0 or np.hypot(xs - coor_x,
                                        ys - coor_y).min() >= min_distance:
                return coor_x, coor_y
        raise RuntimeError(
            'could not find a negative patch position after %d attempts'
            % max_attempts)

    @staticmethod
    def load_Particle_From_starFile(starFileName,
                                    particle_size,
                                    model_input_size,
                                    bin_size,
                                    produce_negative=True,
                                    negative_distance_ratio=0.4,
                                    negative_number_ratio=1):
        """Extract particle (and background) patches listed in a star file.

        The star file's ``_rlnMicrographName``, ``_rlnCoordinateX`` and
        ``_rlnCoordinateY`` columns are grouped per micrograph; micrograph
        paths are resolved relative to the star file's directory. Each
        micrograph is preprocessed with ``preprocess_micrograph_picker``,
        coordinates are divided by ``bin_size``, particles whose patch would
        cross the image border are dropped, and a square patch of side
        ``2 * int(int(particle_size / bin_size) / 2)`` is cut per particle.

        Args:
            starFileName: path of the star file.
            particle_size: particle diameter in unbinned pixels.
            model_input_size: sequence whose items ``[1]`` and ``[2]`` give
                the patch height/width used in the final reshape; the
                extracted patches must already have that many elements.
            bin_size: integer binning factor.
            produce_negative: also sample one background patch per particle.
            negative_distance_ratio: a background centre must be at least
                this fraction of the binned particle size away from every
                particle centre.
            negative_number_ratio: currently unused.

        Returns:
            ``(positives, negatives)`` when ``produce_negative`` is true,
            otherwise only ``positives``; each array has shape
            ``(n, model_input_size[1], model_input_size[2], 1)``.

        Raises:
            RuntimeError: if a background patch position cannot be found.
        """
        particle_star = starRead(starFileName)
        table_star = particle_star.getByName('data_')
        mrcfilename_list = table_star.getByName('_rlnMicrographName')
        coordinateX_list = table_star.getByName('_rlnCoordinateX')
        coordinateY_list = table_star.getByName('_rlnCoordinateY')

        # Map each micrograph path to its ([x, ...], [y, ...]) coordinates.
        star_dir = os.path.split(starFileName)[0]
        coordinate = {}
        for i in range(len(mrcfilename_list)):
            fileName = os.path.join(star_dir, mrcfilename_list[i])
            xs, ys = coordinate.setdefault(fileName, [[], []])
            xs.append(int(float(coordinateX_list[i])))
            ys.append(int(float(coordinateY_list[i])))

        # Particle geometry after binning is the same for every micrograph.
        particle_size_bin = int(particle_size / bin_size)
        radius = int(particle_size_bin / 2)
        min_distance = negative_distance_ratio * particle_size_bin

        particle_array_positive = []
        particle_array_negative = []
        number_total_particle = 0
        for key, (raw_xs, raw_ys) in coordinate.items():
            print(key)
            with mrcfile.open(key, mode='r', permissive=True) as mrc:
                # Preprocess while the file is open; the result is a new
                # array that stays valid after the file is closed.
                body_2d = DataLoader.preprocess_micrograph_picker(
                    mrc.data, bin_size)
            # Use the binned array's own shape: rows bound y, columns bound
            # x, matching the body_2d[y, x] slicing used below.
            n_row, n_col = body_2d.shape

            # Scale coordinates to the binned grid and drop particles whose
            # patch would extend past the image border.
            centers = []
            for raw_x, raw_y in zip(raw_xs, raw_ys):
                x = int(raw_x / bin_size)
                y = int(raw_y / bin_size)
                if (radius <= x <= n_col - radius
                        and radius <= y <= n_row - radius):
                    centers.append((x, y))

            number_particle = len(centers)
            number_total_particle = number_total_particle + number_particle
            print('number of particles:', number_particle, 'accumulated: ', number_total_particle)

            # Positive patches: one per retained particle.
            for x, y in centers:
                patch = np.copy(body_2d[(y - radius):(y + radius),
                                        (x - radius):(x + radius)])
                particle_array_positive.append(
                    DataLoader.preprocess_particle(patch, model_input_size))

            # Negative patches: as many as positives, away from all of them.
            if produce_negative:
                center_xs = [c[0] for c in centers]
                center_ys = [c[1] for c in centers]
                for _ in range(number_particle):
                    x, y = DataLoader._pick_negative_center(
                        center_xs, center_ys, radius, n_row, n_col,
                        min_distance)
                    patch = np.copy(body_2d[(y - radius):(y + radius),
                                            (x - radius):(x + radius)])
                    particle_array_negative.append(
                        DataLoader.preprocess_particle(
                            patch, model_input_size))

        out_shape = (number_total_particle, model_input_size[1],
                     model_input_size[2], 1)
        particle_array_positive = np.array(
            particle_array_positive).reshape(out_shape)
        if not produce_negative:
            return particle_array_positive
        particle_array_negative = np.array(
            particle_array_negative).reshape(out_shape)
        return particle_array_positive, particle_array_negative