Skip to content

Commit ec47a63

Browse files
author
Jordan Stomps
committed
changes in light of PR cnerg#41 comments
1 parent 1a85591 commit ec47a63

3 files changed

Lines changed: 43 additions & 53 deletions

File tree

models/LogReg.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,14 @@ class LogReg:
1717
Add multinomial functions and unit tests.
1818
Add functionality for regression(?)
1919
Inputs:
20-
params: dictionary of logistic regression input functions.
21-
keys max_iter, tol, and C supported.
20+
kwargs: logistic regression input functions.
21+
keys random_state, max_iter, tol, and C supported.
2222
random_state: int/float for reproducible initialization.
2323
'''
2424

2525
# only binary so far
2626
def __init__(self, **kwargs):
27-
# supported keys = ['max_iter', 'tol', 'C']
27+
# supported keys = ['max_iter', 'tol', 'C', 'random_state']
2828
# defaults to a fixed value for reproducibility
2929
self.random_state = kwargs.pop('random_state', 0)
3030
# parameters for logistic regression model:

models/SSML/CoTraining.py

Lines changed: 27 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -19,37 +19,35 @@ class CoTraining:
1919
Add multinomial functions and unit tests.
2020
Add functionality for regression(?)
2121
Inputs:
22-
params: dictionary of logistic regression input functions.
23-
keys max_iter, tol, and C supported.
22+
kwargs: logistic regression input functions.
23+
keys random_state, max_iter, tol, and C supported.
2424
random_state: int/float for reproducible initialization.
2525
'''
2626

2727
# only binary so far
28-
def __init__(self, params=None, random_state=0):
28+
def __init__(self, **kwargs):
29+
# supported keys = ['max_iter', 'tol', 'C', 'random_state']
2930
# defaults to a fixed value for reproducibility
30-
self.random_state = random_state
31-
# dictionary of parameters for logistic regression model
32-
self.params = params
33-
if self.params is None:
34-
self.model1 = linear_model.LogisticRegression(
35-
random_state=self.random_state)
36-
self.model2 = linear_model.LogisticRegression(
37-
random_state=self.random_state)
38-
# default needed for training
39-
self.params = {'n_samples': 1}
40-
else:
41-
self.model1 = linear_model.LogisticRegression(
42-
random_state=self.random_state,
43-
max_iter=params['max_iter'],
44-
tol=params['tol'],
45-
C=params['C']
46-
)
47-
self.model2 = linear_model.LogisticRegression(
48-
random_state=self.random_state,
49-
max_iter=params['max_iter'],
50-
tol=params['tol'],
51-
C=params['C']
52-
)
31+
self.random_state = kwargs.pop('random_state', 0)
32+
self.seed = kwargs.pop('seed', 0)
33+
# parameters for cotraining logistic regression models:
34+
# defaults to sklearn.linear_model.LogisticRegression default vals
35+
self.max_iter = kwargs.pop('max_iter', 100)
36+
self.tol = kwargs.pop('tol', 0.0001)
37+
self.C = kwargs.pop('C', 1.0)
38+
self.n_samples = kwargs.pop('n_samples', 1)
39+
self.model1 = linear_model.LogisticRegression(
40+
random_state=self.random_state,
41+
max_iter=self.max_iter,
42+
tol=self.tol,
43+
C=self.C
44+
)
45+
self.model2 = linear_model.LogisticRegression(
46+
random_state=self.random_state,
47+
max_iter=self.max_iter,
48+
tol=self.tol,
49+
C=self.C
50+
)
5351

5452
def training_loop(self, slr1, slr2, L_lr1, L_lr2,
5553
Ly_lr1, Ly_lr2, U_lr, n_samples,
@@ -155,7 +153,7 @@ def fresh_start(self, params, data_dict):
155153
# unlabeled co-training data
156154
Ux = data_dict['Ux']
157155

158-
clf = CoTraining(params=params, random_state=self.random_state)
156+
clf = CoTraining(**params, random_state=self.random_state)
159157
# training and testing
160158
model1_accs, model2_accs = clf.train(trainx, trainy, Ux, testx, testy)
161159
# uses balanced_accuracy accounts for class imbalanced data
@@ -239,10 +237,7 @@ def train(self, trainx, trainy, Ux,
239237
U_lr = Ux.copy()
240238

241239
# set the random seed of training splits for reproducibility
242-
# This can be ignored by excluding params['seed']
243-
# in the hyperopt space dictionary
244-
if 'seed' in self.params.keys():
245-
np.random.seed(self.params['seed'])
240+
np.random.seed(self.seed)
246241

247242
# TODO: allow a user to specify uneven splits between the two models
248243
split_frac = 0.5
@@ -262,7 +257,7 @@ def train(self, trainx, trainy, Ux,
262257
self.model1, self.model2,
263258
L_lr1, L_lr2,
264259
Ly_lr1, Ly_lr2,
265-
U_lr, self.params['n_samples'],
260+
U_lr, self.n_samples,
266261
testx, testy,
267262
)
268263

tests/test_models.py

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -67,16 +67,6 @@ def test_cross_validation():
6767
# therefore its accuracy should be less than all other folds
6868
assert (accs[-1] < accs[:-1]).all()
6969

70-
# test cross validation for supervised data and StratifiedKFold with LogReg
71-
# params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
72-
# model = LogReg(params=params)
73-
# max_acc_model = utils.cross_validation(model=model,
74-
# X=X,
75-
# y=y,
76-
# params=params,
77-
# stratified=True)
78-
# assert max_acc_model['accuracy'] >= 0.5
79-
8070
# test cross validation for SSML with LabelProp
8171
# params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5}
8272
# model = LabelProp(params=params)
@@ -106,9 +96,10 @@ def test_pca():
10696
utils.plot_pca(pcs, y_train, np.full_like(Uy, -1), filename, 2)
10797
os.remove(filename+'.png')
10898

109-
# filename = 'test_multiD_pca'
110-
# utils.multiD_pca(X_train, y_train, Ux, np.full_like(Uy, -1), filename, n=5)
111-
# os.remove(filename+'.png')
99+
filename = 'test_multiD_pca'
100+
pcs = utils.pca(X_train, Ux, 5)
101+
utils.plot_pca(pcs, y_train, np.full_like(Uy, -1), filename, 5)
102+
os.remove(filename+'.png')
112103

113104
# normalization
114105
normalizer = StandardScaler()
@@ -197,7 +188,9 @@ def test_LogReg():
197188
def test_CoTraining():
198189
# test saving model input parameters
199190
params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
200-
model = CoTraining(params=params)
191+
model = CoTraining(max_iter=params['max_iter'],
192+
tol=params['tol'],
193+
C=params['C'])
201194

202195
assert model.model1.max_iter == params['max_iter']
203196
assert model.model1.tol == params['tol']
@@ -207,8 +200,8 @@ def test_CoTraining():
207200
assert model.model2.tol == params['tol']
208201
assert model.model2.C == params['C']
209202

210-
X, Ux, y, Uy = train_test_split(spectra,
211-
labels,
203+
X, Ux, y, Uy = train_test_split(pytest.spectra,
204+
pytest.labels,
212205
test_size=0.5,
213206
random_state=0)
214207
X_train, X_test, y_train, y_test = train_test_split(X,
@@ -231,8 +224,10 @@ def test_CoTraining():
231224
# testing train and predict methods
232225
pred, acc, *_ = model.predict(X_test, y_test)
233226

234-
assert acc > 0.7
235-
np.testing.assert_equal(pred, y_test)
227+
# since the test data used here is synthetic/toy data (i.e. uninteresting),
228+
# the trained model should be at least better than a 50-50 guess
229+
# if it was worse, something would be wrong with the ML class
230+
assert acc > 0.5
236231

237232
# testing hyperopt optimize methods
238233
space = {'max_iter': scope.int(hp.quniform('max_iter',

0 commit comments

Comments
 (0)