Skip to content

Commit 13d8223

Browse files
author
SamoraHunter
committed
black formatting, ruff safe fixes.
1 parent 74ab16c commit 13d8223

6 files changed

Lines changed: 115 additions & 42 deletions

File tree

ml_grid/model_classes/H2OBaseClassifier.py

Lines changed: 33 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -417,7 +417,9 @@ def _sanitize_model_params(self):
417417
"""
418418
if self.model_ and hasattr(self.model_, "_parms"):
419419
if "HGLM" in self.model_._parms:
420-
self.logger.debug("Removing 'HGLM' parameter from H2O model to prevent backend error.")
420+
self.logger.debug(
421+
"Removing 'HGLM' parameter from H2O model to prevent backend error."
422+
)
421423
del self.model_._parms["HGLM"]
422424

423425
def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> "H2OBaseClassifier":
@@ -548,7 +550,9 @@ def predict(self, X: pd.DataFrame) -> np.ndarray:
548550
# Predict the first class as a fallback. This will result in a poor score for this fold,
549551
# which is the correct outcome for a degenerate test set.
550552
dummy_prediction = (
551-
self.classes_[0] if self.classes_ is not None and len(self.classes_) > 0 else 0
553+
self.classes_[0]
554+
if self.classes_ is not None and len(self.classes_) > 0
555+
else 0
552556
)
553557
return np.full(len(X), dummy_prediction)
554558

@@ -569,9 +573,13 @@ def predict(self, X: pd.DataFrame) -> np.ndarray:
569573
# We filter feature_types_ to ensure only present columns are passed.
570574
col_types = None
571575
if self.feature_types_:
572-
col_types = {k: v for k, v in self.feature_types_.items() if k in X.columns}
573-
574-
tmp_frame = h2o.H2OFrame(X, column_names=self.feature_names_, column_types=col_types)
576+
col_types = {
577+
k: v for k, v in self.feature_types_.items() if k in X.columns
578+
}
579+
580+
tmp_frame = h2o.H2OFrame(
581+
X, column_names=self.feature_names_, column_types=col_types
582+
)
575583

576584
# Optimization: Use the temporary frame directly.
577585
# Explicitly assigning a key (h2o.assign) triggers expensive GC checks.
@@ -590,7 +598,11 @@ def predict(self, X: pd.DataFrame) -> np.ndarray:
590598
f"H2O backend crashed with NPE during predict(). Returning dummy predictions. Details: {e}"
591599
)
592600
# Fallback: predict the first class (usually 0)
593-
dummy_val = self.classes_[0] if self.classes_ is not None and len(self.classes_) > 0 else 0
601+
dummy_val = (
602+
self.classes_[0]
603+
if self.classes_ is not None and len(self.classes_) > 0
604+
else 0
605+
)
594606
return np.full(len(X), dummy_val)
595607

596608
raise RuntimeError(f"H2O prediction failed: {e}")
@@ -643,7 +655,9 @@ def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
643655
)
644656
# Return a uniform probability distribution.
645657
n_classes = (
646-
len(self.classes_) if self.classes_ is not None and len(self.classes_) > 0 else 2
658+
len(self.classes_)
659+
if self.classes_ is not None and len(self.classes_) > 0
660+
else 2
647661
)
648662
dummy_probas = np.full((len(X), n_classes), 1 / n_classes)
649663
return dummy_probas
@@ -659,9 +673,13 @@ def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
659673
# Optimization: Pass column_types directly to constructor
660674
col_types = None
661675
if self.feature_types_:
662-
col_types = {k: v for k, v in self.feature_types_.items() if k in X.columns}
663-
664-
test_h2o = h2o.H2OFrame(X, column_names=self.feature_names_, column_types=col_types)
676+
col_types = {
677+
k: v for k, v in self.feature_types_.items() if k in X.columns
678+
}
679+
680+
test_h2o = h2o.H2OFrame(
681+
X, column_names=self.feature_names_, column_types=col_types
682+
)
665683
except Exception as e:
666684
raise RuntimeError(f"Failed to create H2O frame for prediction: {e}")
667685

@@ -675,7 +693,11 @@ def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
675693
f"H2O backend crashed with NPE during predict_proba(). Returning dummy probabilities. Details: {e}"
676694
)
677695
# Fallback: uniform probabilities
678-
n_classes = len(self.classes_) if self.classes_ is not None and len(self.classes_) > 0 else 2
696+
n_classes = (
697+
len(self.classes_)
698+
if self.classes_ is not None and len(self.classes_) > 0
699+
else 2
700+
)
679701
return np.full((len(X), n_classes), 1.0 / n_classes)
680702

681703
raise RuntimeError(f"H2O prediction failed: {e}")

ml_grid/pipeline/data_train_test_split.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -156,7 +156,9 @@ def get_data_split(
156156
X_test = X_test.drop(idx_to_move)
157157
y_test = y_test.drop(idx_to_move)
158158

159-
logger.info(f"Moved sample {idx_to_move} (class {missing_cls}) from test to train.")
159+
logger.info(
160+
f"Moved sample {idx_to_move} (class {missing_cls}) from test to train."
161+
)
160162
break # Only need one sample to satisfy "at least 2 classes"
161163

162164
return X_train, X_test, y_train, y_test, X_test_orig, y_test_orig

ml_grid/pipeline/grid_search_cross_validate.py

Lines changed: 73 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -172,22 +172,33 @@ def __init__(
172172
) # hard limit on param space exploration
173173

174174
# Allow local override for max_param_space_iter_value
175-
if self.ml_grid_object_iter.local_param_dict.get("max_param_space_iter_value") is not None:
176-
max_param_space_iter_value = self.ml_grid_object_iter.local_param_dict.get("max_param_space_iter_value")
175+
if (
176+
self.ml_grid_object_iter.local_param_dict.get("max_param_space_iter_value")
177+
is not None
178+
):
179+
max_param_space_iter_value = self.ml_grid_object_iter.local_param_dict.get(
180+
"max_param_space_iter_value"
181+
)
177182

178183
if "svc" in method_name.lower():
179-
self.logger.info("Applying StandardScaler for SVC to prevent convergence issues.")
184+
self.logger.info(
185+
"Applying StandardScaler for SVC to prevent convergence issues."
186+
)
180187
scaler = StandardScaler()
181188
self.X_train = pd.DataFrame(
182189
scaler.fit_transform(self.X_train),
183190
columns=self.X_train.columns,
184191
index=self.X_train.index,
185192
)
186193
self.X_test = pd.DataFrame(
187-
scaler.transform(self.X_test), columns=self.X_test.columns, index=self.X_test.index
194+
scaler.transform(self.X_test),
195+
columns=self.X_test.columns,
196+
index=self.X_test.index,
188197
)
189198
self.X_test_orig = pd.DataFrame(
190-
scaler.transform(self.X_test_orig), columns=self.X_test_orig.columns, index=self.X_test_orig.index
199+
scaler.transform(self.X_test_orig),
200+
columns=self.X_test_orig.columns,
201+
index=self.X_test_orig.index,
191202
)
192203

193204
# --- PERFORMANCE FIX for testing ---
@@ -303,17 +314,23 @@ def __init__(
303314
n_iter_v = 2
304315
n_iter_v = int(n_iter_v)
305316
except (ValueError, TypeError):
306-
self.logger.warning("Invalid or missing n_iter in global_params. Defaulting to 2.")
317+
self.logger.warning(
318+
"Invalid or missing n_iter in global_params. Defaulting to 2."
319+
)
307320
n_iter_v = 2
308321

309322
# Allow local override from run_params/local_param_dict
310323
local_n_iter = self.ml_grid_object_iter.local_param_dict.get("n_iter")
311324
if local_n_iter is not None:
312325
try:
313326
n_iter_v = int(local_n_iter)
314-
self.logger.info(f"Overriding global n_iter with local value: {n_iter_v}")
327+
self.logger.info(
328+
f"Overriding global n_iter with local value: {n_iter_v}"
329+
)
315330
except (ValueError, TypeError):
316-
self.logger.warning(f"Invalid local n_iter value: {local_n_iter}. Ignoring override.")
331+
self.logger.warning(
332+
f"Invalid local n_iter value: {local_n_iter}. Ignoring override."
333+
)
317334

318335
if max_param_space_iter_value is not None:
319336
if n_iter_v > max_param_space_iter_value:
@@ -420,7 +437,9 @@ def __init__(
420437

421438
except Exception as e:
422439
if "dual coefficients or intercepts are not finite" in str(e):
423-
self.logger.warning(f"SVC failed to fit due to data issues: {e}. Returning default score.")
440+
self.logger.warning(
441+
f"SVC failed to fit due to data issues: {e}. Returning default score."
442+
)
424443
self.grid_search_cross_validate_score_result = 0.5
425444
return
426445

@@ -476,9 +495,15 @@ def __init__(
476495
# Define default scores (e.g., mean score of 0.5 for binary classification)
477496
# Default scores if cross-validation fails
478497
default_scores = {
479-
"test_accuracy": np.array([0.5]), # Default to random classifier performance
480-
"test_f1": np.array([0.5]), # Default F1 score (again, 0.5 for random classification)
481-
"test_auc": np.array([0.5]), # Default ROC AUC score (0.5 for random classifier)
498+
"test_accuracy": np.array(
499+
[0.5]
500+
), # Default to random classifier performance
501+
"test_f1": np.array(
502+
[0.5]
503+
), # Default F1 score (again, 0.5 for random classification)
504+
"test_auc": np.array(
505+
[0.5]
506+
), # Default ROC AUC score (0.5 for random classifier)
482507
"fit_time": np.array([0]), # No fitting time if the model fails
483508
"score_time": np.array([0]), # No scoring time if the model fails
484509
"train_score": np.array([0.5]), # Default train score
@@ -535,11 +560,15 @@ def __init__(
535560
)
536561

537562
if force_second_cv:
538-
self.logger.info("force_second_cv is True. Skipping cached result extraction to run fresh cross-validation.")
563+
self.logger.info(
564+
"force_second_cv is True. Skipping cached result extraction to run fresh cross-validation."
565+
)
539566

540567
# Check if we can reuse results from HyperparameterSearch
541-
if not force_second_cv and hasattr(current_algorithm, "cv_results_") and hasattr(
542-
current_algorithm, "best_index_"
568+
if (
569+
not force_second_cv
570+
and hasattr(current_algorithm, "cv_results_")
571+
and hasattr(current_algorithm, "best_index_")
543572
):
544573
try:
545574
self.logger.info(
@@ -553,22 +582,33 @@ def __init__(
553582
# Extract fit and score times
554583
if "split0_fit_time" in results:
555584
temp_scores["fit_time"] = np.array(
556-
[results[f"split{k}_fit_time"][index] for k in range(n_splits)]
585+
[
586+
results[f"split{k}_fit_time"][index]
587+
for k in range(n_splits)
588+
]
557589
)
558590
else:
559591
# Fallback: Use mean time repeated if split times are missing (e.g. BayesSearchCV)
560-
temp_scores["fit_time"] = np.full(n_splits, results["mean_fit_time"][index])
592+
temp_scores["fit_time"] = np.full(
593+
n_splits, results["mean_fit_time"][index]
594+
)
561595

562596
if "split0_score_time" in results:
563597
temp_scores["score_time"] = np.array(
564-
[results[f"split{k}_score_time"][index] for k in range(n_splits)]
598+
[
599+
results[f"split{k}_score_time"][index]
600+
for k in range(n_splits)
601+
]
565602
)
566603
else:
567604
# Fallback: Use mean score time.
568605
# We use .get() with a default that is safe to index into if the key is missing.
569606
# If 'mean_score_time' is missing, we default to a list of 0s large enough to cover 'index'.
570607
default_times = np.zeros(index + 1)
571-
temp_scores["score_time"] = np.full(n_splits, results.get("mean_score_time", default_times)[index])
608+
temp_scores["score_time"] = np.full(
609+
n_splits,
610+
results.get("mean_score_time", default_times)[index],
611+
)
572612

573613
# Extract metric scores
574614
for metric in self.metric_list:
@@ -582,7 +622,9 @@ def __init__(
582622
)
583623
# Train scores (if available)
584624
train_key = f"train_{metric}"
585-
train_col = f"split0_train_{metric}" # Check existence on first split
625+
train_col = (
626+
f"split0_train_{metric}" # Check existence on first split
627+
)
586628
if train_col in results:
587629
temp_scores[train_key] = np.array(
588630
[
@@ -592,7 +634,9 @@ def __init__(
592634
)
593635
scores = temp_scores
594636
except Exception as e:
595-
self.logger.warning(f"Could not extract cached CV results: {e}. Falling back to standard CV.")
637+
self.logger.warning(
638+
f"Could not extract cached CV results: {e}. Falling back to standard CV."
639+
)
596640
scores = None
597641

598642
if scores is None:
@@ -628,7 +672,11 @@ def __init__(
628672
# This is done AFTER fitting and before cross-validation.
629673
if isinstance(
630674
current_algorithm,
631-
(KerasClassifier, KerasClassifierClass, NeuralNetworkClassifier),
675+
(
676+
KerasClassifier,
677+
KerasClassifierClass,
678+
NeuralNetworkClassifier,
679+
),
632680
):
633681
try:
634682
self.logger.debug(
@@ -637,7 +685,9 @@ def __init__(
637685
n_features = self.X_train.shape[1]
638686
# Define an input signature that allows for variable batch size.
639687
input_signature = [
640-
tf.TensorSpec(shape=(None, n_features), dtype=tf.float32)
688+
tf.TensorSpec(
689+
shape=(None, n_features), dtype=tf.float32
690+
)
641691
]
642692
# Access the underlying Keras model via .model_
643693
current_algorithm.model_.predict.get_concrete_function(

ml_grid/pipeline/main.py

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,5 @@
11
import logging
22
import traceback
3-
import glob
4-
import os
5-
import yaml
63
from typing import Any, Dict, List, Tuple
74

85
import numpy as np

ml_grid/util/project_score_save.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -274,7 +274,9 @@ def update_score_log(
274274
logger.error(f"Error processing scores for BayesSearch: {e}")
275275
logger.debug(f"Scores dictionary: {scores}")
276276
else:
277-
line["fit_time_m"] = np.array(scores["fit_time"]).mean() # deprecated for bayes
277+
line["fit_time_m"] = np.array(
278+
scores["fit_time"]
279+
).mean() # deprecated for bayes
278280
line["fit_time_std"] = np.array(scores["fit_time"]).std()
279281
line["score_time_m"] = np.array(scores["score_time"]).mean()
280282
line["score_time_std"] = np.array(scores["score_time"]).std()

tests/test_h2o_base_classifier.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import pytest
22
import pandas as pd
33
import numpy as np
4-
from unittest.mock import patch, MagicMock, ANY
4+
from unittest.mock import patch, MagicMock
55
import os
66
import shutil
77
from sklearn.base import clone
@@ -215,11 +215,11 @@ def test_predict_successful(
215215
mock_h2o_frame.assert_called_once_with(
216216
X, column_names=list(X.columns), column_types=classifier_instance.feature_types_
217217
)
218-
218+
219219
# Optimization: h2o.assign and h2o.get_frame should NO LONGER be called
220220
mock_h2o_assign.assert_not_called()
221221
mock_h2o_get_frame.assert_not_called()
222-
222+
223223
# Verify the model's predict method was called with the temporary frame directly
224224
mock_model.predict.assert_called_once_with(mock_tmp_frame)
225225

0 commit comments

Comments
 (0)