Skip to content

Commit ec47a63

Browse files
author
Jordan Stomps
committed
changes in light of PR cnerg#41 comments
1 parent 1a85591 commit ec47a63

3 files changed

Lines changed: 43 additions & 53 deletions

File tree

models/LogReg.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,14 @@ class LogReg:
1717
Add multinomial functions and unit tests.
1818
Add functionality for regression(?)
1919
Inputs:
20-
params: dictionary of logistic regression input functions.
21-
keys max_iter, tol, and C supported.
20+
kwargs: logistic regression input functions.
21+
keys random_state, max_iter, tol, and C supported.
2222
random_state: int/float for reproducible initialization.
2323
'''
2424

2525
# only binary so far
2626
def __init__(self, **kwargs):
27-
# supported keys = ['max_iter', 'tol', 'C']
27+
# supported keys = ['max_iter', 'tol', 'C', 'random_state']
2828
# defaults to a fixed value for reproducibility
2929
self.random_state = kwargs.pop('random_state', 0)
3030
# parameters for logistic regression model:

models/SSML/CoTraining.py

Lines changed: 27 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -19,37 +19,35 @@ class CoTraining:
1919
Add multinomial functions and unit tests.
2020
Add functionality for regression(?)
2121
Inputs:
22-
params: dictionary of logistic regression input functions.
23-
keys max_iter, tol, and C supported.
22+
kwargs: logistic regression input functions.
23+
keys random_state, max_iter, tol, and C supported.
2424
random_state: int/float for reproducible initialization.
2525
'''
2626

2727
# only binary so far
28-
def __init__(self, params=None, random_state=0):
28+
def __init__(self, **kwargs):
29+
# supported keys = ['max_iter', 'tol', 'C', 'random_state']
2930
# defaults to a fixed value for reproducibility
30-
self.random_state = random_state
31-
# dictionary of parameters for logistic regression model
32-
self.params = params
33-
if self.params is None:
34-
self.model1 = linear_model.LogisticRegression(
35-
random_state=self.random_state)
36-
self.model2 = linear_model.LogisticRegression(
37-
random_state=self.random_state)
38-
# default needed for training
39-
self.params = {'n_samples': 1}
40-
else:
41-
self.model1 = linear_model.LogisticRegression(
42-
random_state=self.random_state,
43-
max_iter=params['max_iter'],
44-
tol=params['tol'],
45-
C=params['C']
46-
)
47-
self.model2 = linear_model.LogisticRegression(
48-
random_state=self.random_state,
49-
max_iter=params['max_iter'],
50-
tol=params['tol'],
51-
C=params['C']
52-
)
31+
self.random_state = kwargs.pop('random_state', 0)
32+
self.seed = kwargs.pop('seed', 0)
33+
# parameters for cotraining logistic regression models:
34+
# defaults to sklearn.linear_model.LogisticRegression default vals
35+
self.max_iter = kwargs.pop('max_iter', 100)
36+
self.tol = kwargs.pop('tol', 0.0001)
37+
self.C = kwargs.pop('C', 1.0)
38+
self.n_samples = kwargs.pop('n_samples', 1)
39+
self.model1 = linear_model.LogisticRegression(
40+
random_state=self.random_state,
41+
max_iter=self.max_iter,
42+
tol=self.tol,
43+
C=self.C
44+
)
45+
self.model2 = linear_model.LogisticRegression(
46+
random_state=self.random_state,
47+
max_iter=self.max_iter,
48+
tol=self.tol,
49+
C=self.C
50+
)
5351

5452
def training_loop(self, slr1, slr2, L_lr1, L_lr2,
5553
Ly_lr1, Ly_lr2, U_lr, n_samples,
@@ -155,7 +153,7 @@ def fresh_start(self, params, data_dict):
155153
# unlabeled co-training data
156154
Ux = data_dict['Ux']
157155

158-
clf = CoTraining(params=params, random_state=self.random_state)
156+
clf = CoTraining(**params, random_state=self.random_state)
159157
# training and testing
160158
model1_accs, model2_accs = clf.train(trainx, trainy, Ux, testx, testy)
161159
# uses balanced_accuracy accounts for class imbalanced data
@@ -239,10 +237,7 @@ def train(self, trainx, trainy, Ux,
239237
U_lr = Ux.copy()
240238

241239
# set the random seed of training splits for reproducibility
242-
# This can be ignored by excluding params['seed']
243-
# in the hyperopt space dictionary
244-
if 'seed' in self.params.keys():
245-
np.random.seed(self.params['seed'])
240+
np.random.seed(self.seed)
246241

247242
# TODO: allow a user to specify uneven splits between the two models
248243
split_frac = 0.5
@@ -262,7 +257,7 @@ def train(self, trainx, trainy, Ux,
262257
self.model1, self.model2,
263258
L_lr1, L_lr2,
264259
Ly_lr1, Ly_lr2,
265-
U_lr, self.params['n_samples'],
260+
U_lr, self.n_samples,
266261
testx, testy,
267262
)
268263

tests/test_models.py

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -67,16 +67,6 @@ def test_cross_validation():
6767
# therefore its accuracy should be less than all other folds
6868
assert (accs[-1] < accs[:-1]).all()
6969

70-
# test cross validation for supervised data and StratifiedKFold with LogReg
71-
# params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
72-
# model = LogReg(params=params)
73-
# max_acc_model = utils.cross_validation(model=model,
74-
# X=X,
75-
# y=y,
76-
# params=params,
77-
# stratified=True)
78-
# assert max_acc_model['accuracy'] >= 0.5
79-
8070
# test cross validation for SSML with LabelProp
8171
# params = {'gamma': 10, 'n_neighbors': 15, 'max_iter': 2022, 'tol': 0.5}
8272
# model = LabelProp(params=params)
@@ -106,9 +96,10 @@ def test_pca():
10696
utils.plot_pca(pcs, y_train, np.full_like(Uy, -1), filename, 2)
10797
os.remove(filename+'.png')
10898

109-
# filename = 'test_multiD_pca'
110-
# utils.multiD_pca(X_train, y_train, Ux, np.full_like(Uy, -1), filename, n=5)
111-
# os.remove(filename+'.png')
99+
filename = 'test_multiD_pca'
100+
pcs = utils.pca(X_train, Ux, 5)
101+
utils.plot_pca(pcs, y_train, np.full_like(Uy, -1), filename, 5)
102+
os.remove(filename+'.png')
112103

113104
# normalization
114105
normalizer = StandardScaler()
@@ -197,7 +188,9 @@ def test_LogReg():
197188
def test_CoTraining():
198189
# test saving model input parameters
199190
params = {'max_iter': 2022, 'tol': 0.5, 'C': 5.0}
200-
model = CoTraining(params=params)
191+
model = CoTraining(max_iter=params['max_iter'],
192+
tol=params['tol'],
193+
C=params['C'])
201194

202195
assert model.model1.max_iter == params['max_iter']
203196
assert model.model1.tol == params['tol']
@@ -207,8 +200,8 @@ def test_CoTraining():
207200
assert model.model2.tol == params['tol']
208201
assert model.model2.C == params['C']
209202

210-
X, Ux, y, Uy = train_test_split(spectra,
211-
labels,
203+
X, Ux, y, Uy = train_test_split(pytest.spectra,
204+
pytest.labels,
212205
test_size=0.5,
213206
random_state=0)
214207
X_train, X_test, y_train, y_test = train_test_split(X,
@@ -231,8 +224,10 @@ def test_CoTraining():
231224
# testing train and predict methods
232225
pred, acc, *_ = model.predict(X_test, y_test)
233226

234-
assert acc > 0.7
235-
np.testing.assert_equal(pred, y_test)
227+
# since the test data used here is synthetic/toy data (i.e. uninteresting),
228+
# the trained model should be at least better than a 50-50 guess
229+
# if it was worse, something would be wrong with the ML class
230+
assert acc > 0.5
236231

237232
# testing hyperopt optimize methods
238233
space = {'max_iter': scope.int(hp.quniform('max_iter',

0 commit comments

Comments
 (0)