Apply black formatting and ruff safe fixes (line wrapping, trailing whitespace, unused imports).

SamoraHunter · SamoraHunter · commit 13d822374035 · 2026-01-13T08:17:04.000Z
diff --git a/ml_grid/model_classes/H2OBaseClassifier.py b/ml_grid/model_classes/H2OBaseClassifier.py
@@ -417,7 +417,9 @@ def _sanitize_model_params(self):
         """
         if self.model_ and hasattr(self.model_, "_parms"):
             if "HGLM" in self.model_._parms:
-                self.logger.debug("Removing 'HGLM' parameter from H2O model to prevent backend error.")
+                self.logger.debug(
+                    "Removing 'HGLM' parameter from H2O model to prevent backend error."
+                )
                 del self.model_._parms["HGLM"]
 
     def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> "H2OBaseClassifier":
@@ -548,7 +550,9 @@ def predict(self, X: pd.DataFrame) -> np.ndarray:
             # Predict the first class as a fallback. This will result in a poor score for this fold,
             # which is the correct outcome for a degenerate test set.
             dummy_prediction = (
-                self.classes_[0] if self.classes_ is not None and len(self.classes_) > 0 else 0
+                self.classes_[0]
+                if self.classes_ is not None and len(self.classes_) > 0
+                else 0
             )
             return np.full(len(X), dummy_prediction)
 
@@ -569,9 +573,13 @@ def predict(self, X: pd.DataFrame) -> np.ndarray:
             # We filter feature_types_ to ensure only present columns are passed.
             col_types = None
             if self.feature_types_:
-                col_types = {k: v for k, v in self.feature_types_.items() if k in X.columns}
-            
-            tmp_frame = h2o.H2OFrame(X, column_names=self.feature_names_, column_types=col_types)
+                col_types = {
+                    k: v for k, v in self.feature_types_.items() if k in X.columns
+                }
+
+            tmp_frame = h2o.H2OFrame(
+                X, column_names=self.feature_names_, column_types=col_types
+            )
 
             # Optimization: Use the temporary frame directly.
             # Explicitly assigning a key (h2o.assign) triggers expensive GC checks.
@@ -590,7 +598,11 @@ def predict(self, X: pd.DataFrame) -> np.ndarray:
                     f"H2O backend crashed with NPE during predict(). Returning dummy predictions. Details: {e}"
                 )
                 # Fallback: predict the first class (usually 0)
-                dummy_val = self.classes_[0] if self.classes_ is not None and len(self.classes_) > 0 else 0
+                dummy_val = (
+                    self.classes_[0]
+                    if self.classes_ is not None and len(self.classes_) > 0
+                    else 0
+                )
                 return np.full(len(X), dummy_val)
 
             raise RuntimeError(f"H2O prediction failed: {e}")
@@ -643,7 +655,9 @@ def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
             )
             # Return a uniform probability distribution.
             n_classes = (
-                len(self.classes_) if self.classes_ is not None and len(self.classes_) > 0 else 2
+                len(self.classes_)
+                if self.classes_ is not None and len(self.classes_) > 0
+                else 2
             )
             dummy_probas = np.full((len(X), n_classes), 1 / n_classes)
             return dummy_probas
@@ -659,9 +673,13 @@ def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
             # Optimization: Pass column_types directly to constructor
             col_types = None
             if self.feature_types_:
-                col_types = {k: v for k, v in self.feature_types_.items() if k in X.columns}
-            
-            test_h2o = h2o.H2OFrame(X, column_names=self.feature_names_, column_types=col_types)
+                col_types = {
+                    k: v for k, v in self.feature_types_.items() if k in X.columns
+                }
+
+            test_h2o = h2o.H2OFrame(
+                X, column_names=self.feature_names_, column_types=col_types
+            )
         except Exception as e:
             raise RuntimeError(f"Failed to create H2O frame for prediction: {e}")
 
@@ -675,7 +693,11 @@ def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
                     f"H2O backend crashed with NPE during predict_proba(). Returning dummy probabilities. Details: {e}"
                 )
                 # Fallback: uniform probabilities
-                n_classes = len(self.classes_) if self.classes_ is not None and len(self.classes_) > 0 else 2
+                n_classes = (
+                    len(self.classes_)
+                    if self.classes_ is not None and len(self.classes_) > 0
+                    else 2
+                )
                 return np.full((len(X), n_classes), 1.0 / n_classes)
 
             raise RuntimeError(f"H2O prediction failed: {e}")
diff --git a/ml_grid/pipeline/data_train_test_split.py b/ml_grid/pipeline/data_train_test_split.py
@@ -156,7 +156,9 @@ def get_data_split(
                 X_test = X_test.drop(idx_to_move)
                 y_test = y_test.drop(idx_to_move)
 
-                logger.info(f"Moved sample {idx_to_move} (class {missing_cls}) from test to train.")
+                logger.info(
+                    f"Moved sample {idx_to_move} (class {missing_cls}) from test to train."
+                )
                 break  # Only need one sample to satisfy "at least 2 classes"
 
     return X_train, X_test, y_train, y_test, X_test_orig, y_test_orig
diff --git a/ml_grid/pipeline/grid_search_cross_validate.py b/ml_grid/pipeline/grid_search_cross_validate.py
@@ -172,22 +172,33 @@ def __init__(
         )  # hard limit on param space exploration
 
         # Allow local override for max_param_space_iter_value
-        if self.ml_grid_object_iter.local_param_dict.get("max_param_space_iter_value") is not None:
-            max_param_space_iter_value = self.ml_grid_object_iter.local_param_dict.get("max_param_space_iter_value")
+        if (
+            self.ml_grid_object_iter.local_param_dict.get("max_param_space_iter_value")
+            is not None
+        ):
+            max_param_space_iter_value = self.ml_grid_object_iter.local_param_dict.get(
+                "max_param_space_iter_value"
+            )
 
         if "svc" in method_name.lower():
-            self.logger.info("Applying StandardScaler for SVC to prevent convergence issues.")
+            self.logger.info(
+                "Applying StandardScaler for SVC to prevent convergence issues."
+            )
             scaler = StandardScaler()
             self.X_train = pd.DataFrame(
                 scaler.fit_transform(self.X_train),
                 columns=self.X_train.columns,
                 index=self.X_train.index,
             )
             self.X_test = pd.DataFrame(
-                scaler.transform(self.X_test), columns=self.X_test.columns, index=self.X_test.index
+                scaler.transform(self.X_test),
+                columns=self.X_test.columns,
+                index=self.X_test.index,
             )
             self.X_test_orig = pd.DataFrame(
-                scaler.transform(self.X_test_orig), columns=self.X_test_orig.columns, index=self.X_test_orig.index
+                scaler.transform(self.X_test_orig),
+                columns=self.X_test_orig.columns,
+                index=self.X_test_orig.index,
             )
 
         # --- PERFORMANCE FIX for testing ---
@@ -303,17 +314,23 @@ def __init__(
                 n_iter_v = 2
             n_iter_v = int(n_iter_v)
         except (ValueError, TypeError):
-            self.logger.warning("Invalid or missing n_iter in global_params. Defaulting to 2.")
+            self.logger.warning(
+                "Invalid or missing n_iter in global_params. Defaulting to 2."
+            )
             n_iter_v = 2
 
         # Allow local override from run_params/local_param_dict
         local_n_iter = self.ml_grid_object_iter.local_param_dict.get("n_iter")
         if local_n_iter is not None:
             try:
                 n_iter_v = int(local_n_iter)
-                self.logger.info(f"Overriding global n_iter with local value: {n_iter_v}")
+                self.logger.info(
+                    f"Overriding global n_iter with local value: {n_iter_v}"
+                )
             except (ValueError, TypeError):
-                self.logger.warning(f"Invalid local n_iter value: {local_n_iter}. Ignoring override.")
+                self.logger.warning(
+                    f"Invalid local n_iter value: {local_n_iter}. Ignoring override."
+                )
 
         if max_param_space_iter_value is not None:
             if n_iter_v > max_param_space_iter_value:
@@ -420,7 +437,9 @@ def __init__(
 
         except Exception as e:
             if "dual coefficients or intercepts are not finite" in str(e):
-                self.logger.warning(f"SVC failed to fit due to data issues: {e}. Returning default score.")
+                self.logger.warning(
+                    f"SVC failed to fit due to data issues: {e}. Returning default score."
+                )
                 self.grid_search_cross_validate_score_result = 0.5
                 return
 
@@ -476,9 +495,15 @@ def __init__(
         # Define default scores (e.g., mean score of 0.5 for binary classification)
         # Default scores if cross-validation fails
         default_scores = {
-            "test_accuracy": np.array([0.5]),  # Default to random classifier performance
-            "test_f1": np.array([0.5]),  # Default F1 score (again, 0.5 for random classification)
-            "test_auc": np.array([0.5]),  # Default ROC AUC score (0.5 for random classifier)
+            "test_accuracy": np.array(
+                [0.5]
+            ),  # Default to random classifier performance
+            "test_f1": np.array(
+                [0.5]
+            ),  # Default F1 score (again, 0.5 for random classification)
+            "test_auc": np.array(
+                [0.5]
+            ),  # Default ROC AUC score (0.5 for random classifier)
             "fit_time": np.array([0]),  # No fitting time if the model fails
             "score_time": np.array([0]),  # No scoring time if the model fails
             "train_score": np.array([0.5]),  # Default train score
@@ -535,11 +560,15 @@ def __init__(
             )
 
             if force_second_cv:
-                self.logger.info("force_second_cv is True. Skipping cached result extraction to run fresh cross-validation.")
+                self.logger.info(
+                    "force_second_cv is True. Skipping cached result extraction to run fresh cross-validation."
+                )
 
             # Check if we can reuse results from HyperparameterSearch
-            if not force_second_cv and hasattr(current_algorithm, "cv_results_") and hasattr(
-                current_algorithm, "best_index_"
+            if (
+                not force_second_cv
+                and hasattr(current_algorithm, "cv_results_")
+                and hasattr(current_algorithm, "best_index_")
             ):
                 try:
                     self.logger.info(
@@ -553,22 +582,33 @@ def __init__(
                     # Extract fit and score times
                     if "split0_fit_time" in results:
                         temp_scores["fit_time"] = np.array(
-                            [results[f"split{k}_fit_time"][index] for k in range(n_splits)]
+                            [
+                                results[f"split{k}_fit_time"][index]
+                                for k in range(n_splits)
+                            ]
                         )
                     else:
                         # Fallback: Use mean time repeated if split times are missing (e.g. BayesSearchCV)
-                        temp_scores["fit_time"] = np.full(n_splits, results["mean_fit_time"][index])
+                        temp_scores["fit_time"] = np.full(
+                            n_splits, results["mean_fit_time"][index]
+                        )
 
                     if "split0_score_time" in results:
                         temp_scores["score_time"] = np.array(
-                            [results[f"split{k}_score_time"][index] for k in range(n_splits)]
+                            [
+                                results[f"split{k}_score_time"][index]
+                                for k in range(n_splits)
+                            ]
                         )
                     else:
                         # Fallback: Use mean score time.
                         # We use .get() with a default that is safe to index into if the key is missing.
                         # If 'mean_score_time' is missing, we default to a list of 0s large enough to cover 'index'.
                         default_times = np.zeros(index + 1)
-                        temp_scores["score_time"] = np.full(n_splits, results.get("mean_score_time", default_times)[index])
+                        temp_scores["score_time"] = np.full(
+                            n_splits,
+                            results.get("mean_score_time", default_times)[index],
+                        )
 
                     # Extract metric scores
                     for metric in self.metric_list:
@@ -582,7 +622,9 @@ def __init__(
                         )
                         # Train scores (if available)
                         train_key = f"train_{metric}"
-                        train_col = f"split0_train_{metric}"  # Check existence on first split
+                        train_col = (
+                            f"split0_train_{metric}"  # Check existence on first split
+                        )
                         if train_col in results:
                             temp_scores[train_key] = np.array(
                                 [
@@ -592,7 +634,9 @@ def __init__(
                             )
                     scores = temp_scores
                 except Exception as e:
-                    self.logger.warning(f"Could not extract cached CV results: {e}. Falling back to standard CV.")
+                    self.logger.warning(
+                        f"Could not extract cached CV results: {e}. Falling back to standard CV."
+                    )
                     scores = None
 
             if scores is None:
@@ -628,7 +672,11 @@ def __init__(
                     # This is done AFTER fitting and before cross-validation.
                     if isinstance(
                         current_algorithm,
-                        (KerasClassifier, KerasClassifierClass, NeuralNetworkClassifier),
+                        (
+                            KerasClassifier,
+                            KerasClassifierClass,
+                            NeuralNetworkClassifier,
+                        ),
                     ):
                         try:
                             self.logger.debug(
@@ -637,7 +685,9 @@ def __init__(
                             n_features = self.X_train.shape[1]
                             # Define an input signature that allows for variable batch size.
                             input_signature = [
-                                tf.TensorSpec(shape=(None, n_features), dtype=tf.float32)
+                                tf.TensorSpec(
+                                    shape=(None, n_features), dtype=tf.float32
+                                )
                             ]
                             # Access the underlying Keras model via .model_
                             current_algorithm.model_.predict.get_concrete_function(
diff --git a/ml_grid/pipeline/main.py b/ml_grid/pipeline/main.py
@@ -1,8 +1,5 @@
 import logging
 import traceback
-import glob
-import os
-import yaml
 from typing import Any, Dict, List, Tuple
 
 import numpy as np
diff --git a/ml_grid/util/project_score_save.py b/ml_grid/util/project_score_save.py
@@ -274,7 +274,9 @@ def update_score_log(
                     logger.error(f"Error processing scores for BayesSearch: {e}")
                     logger.debug(f"Scores dictionary: {scores}")
             else:
-                line["fit_time_m"] = np.array(scores["fit_time"]).mean()  # deprecated for bayes
+                line["fit_time_m"] = np.array(
+                    scores["fit_time"]
+                ).mean()  # deprecated for bayes
                 line["fit_time_std"] = np.array(scores["fit_time"]).std()
                 line["score_time_m"] = np.array(scores["score_time"]).mean()
                 line["score_time_std"] = np.array(scores["score_time"]).std()
diff --git a/tests/test_h2o_base_classifier.py b/tests/test_h2o_base_classifier.py
@@ -1,7 +1,7 @@
 import pytest
 import pandas as pd
 import numpy as np
-from unittest.mock import patch, MagicMock, ANY
+from unittest.mock import patch, MagicMock
 import os
 import shutil
 from sklearn.base import clone
@@ -215,11 +215,11 @@ def test_predict_successful(
     mock_h2o_frame.assert_called_once_with(
         X, column_names=list(X.columns), column_types=classifier_instance.feature_types_
     )
-    
+
     # Optimization: h2o.assign and h2o.get_frame should NO LONGER be called
     mock_h2o_assign.assert_not_called()
     mock_h2o_get_frame.assert_not_called()
-    
+
     # Verify the model's predict method was called with the temporary frame directly
     mock_model.predict.assert_called_once_with(mock_tmp_frame)