Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion configs/BENCH-CONFIG-SPEC.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ Configs have the three highest parameter keys:
| `data`:`id` | None | | OpenML data id for `fetch_openml` source. |
| `data`:`preprocessing_kwargs`:`replace_nan` | `median` | `median`, `mean` | Value to replace NaNs in preprocessed data. |
| `data`:`preprocessing_kwargs`:`category_encoding` | `ordinal` | `ordinal`, `onehot`, `drop`, `ignore` | How to encode categorical features in preprocessed data. |
| `data`:`preprocessing_kwargs`:`normalize` | False | | Enables normalization of preprocessed data. |
| `data`:`preprocessing_kwargs`:`normalize` | None | None, `mean`, `minmax`, `standard` | Normalization type to apply to preprocessed data (disabled when None). |
| `data`:`preprocessing_kwargs`:`force_for_sparse` | True | | Forces preprocessing for sparse data formats. |
| `data`:`split_kwargs` | Empty `dict` or default split from dataset description | | Data split parameters for `train_test_split` function. |
| `data`:`format` | `pandas` | `pandas`, `numpy`, `cudf` | Data format to use in benchmark. |
Expand Down
2 changes: 1 addition & 1 deletion configs/common/knn.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
}
},
"data": {
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
}
},
"sklearn knn parameters": {
Expand Down
2 changes: 1 addition & 1 deletion configs/common/svm.json
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
"max_iter": 10000
}
},
"data": { "preprocessing_kwargs": { "normalize": true } }
"data": { "preprocessing_kwargs": { "normalize": "standard" } }
},
"svm clsf parameters": {
"algorithm": { "estimator_params": { "random_state": 42 } }
Expand Down
4 changes: 2 additions & 2 deletions configs/regular/dbscan.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
"PARAMETERS_SETS": {
"dbscan datasets": {
"data": [
{ "dataset": "cifar", "split_kwargs": { "train_size": 15000 } },
{ "dataset": "cifar", "split_kwargs": { "train_size": 15000 }, "preprocessing_kwargs": { "normalize": "mean" } },
{ "dataset": "mnist", "split_kwargs": { "train_size": 40000 } },
{ "dataset": "sensit", "split_kwargs": { "ignore": true } },
{ "dataset": "susy", "split_kwargs": { "train_size": 100000 } },
{
"dataset": "skin_segmentation",
"split_kwargs": { "train_size": 100000 },
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
},
{
"source": "make_blobs",
Expand Down
14 changes: 10 additions & 4 deletions configs/regular/kmeans.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,21 @@
{
"dataset": "covtype",
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
},
{
"dataset": ["mnist", "gisette"],
"dataset": ["mnist"],
"split_kwargs": { "ignore": true }
},
{
"dataset" : "gisette",
"split_kwargs" : {"ignore" : true},
"preprocessing_kwargs": { "normalize": "standard" }
},
{
"dataset": "cifar",
"split_kwargs": { "train_size": 10000, "test_size": null }
"split_kwargs": { "train_size": 10000, "test_size": null },
"preprocessing_kwargs": { "normalize": "mean" }
}
]
},
Expand All @@ -28,7 +34,7 @@
"shuffle": true,
"random_state": 42
},
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
},
"algorithm": {
"estimator_params": { "n_clusters": [2, 50] }
Expand Down
2 changes: 1 addition & 1 deletion configs/regular/knn.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"data": [
{ "dataset": "susy", "split_kwargs": { "train_size": 80000, "test_size": 20000 } },
{ "dataset": "connect" },
{ "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } }
{ "dataset": "gisette", "preprocessing_kwargs": { "normalize": null } }
]
},
"kd_tree knn classification datasets": {
Expand Down
2 changes: 1 addition & 1 deletion configs/regular/linear_model.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
{
"data": {
"dataset": "year_prediction_msd",
"preprocessing_kwargs": { "normalize": true },
"preprocessing_kwargs": { "normalize": "standard" },
"split_kwargs": { "train_size": 0.5, "test_size": 0.5 }
}
},
Expand Down
9 changes: 6 additions & 3 deletions configs/regular/logreg.json
Original file line number Diff line number Diff line change
Expand Up @@ -61,21 +61,24 @@
{
"data": {
"dataset": "hepmass",
"split_kwargs": { "train_size": 0.1, "test_size": null }
"split_kwargs": { "train_size": 0.1, "test_size": null },
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like this is the only case where benchmark behavior changes - is it intended?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it was done for a reason but let me check the convergence for both options

"preprocessing_kwargs": { "normalize": "standard" }
},
"algorithm": { "estimator_params": {"C": 1e-5} }
},
{
"data": {
"dataset": "cifar",
"split_kwargs": { "train_size": 0.1, "test_size": null }
"split_kwargs": { "train_size": 0.1, "test_size": null },
"preprocessing_kwargs": { "normalize": "mean" }
},
"algorithm": { "estimator_params": {"C": 1e-9} }
},
{
"data": {
"dataset": "gisette",
"split_kwargs": { "train_size": 2000, "test_size": null }
"split_kwargs": { "train_size": 2000, "test_size": null },
"preprocessing_kwargs": { "normalize": "standard" }
},
"algorithm": { "estimator_params": {"C": 1e1} }
}
Expand Down
10 changes: 5 additions & 5 deletions configs/regular/svm.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"algorithm": { "estimator_params": { "C": 100.0, "kernel": "rbf" } }
},
{
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
"algorithm": {
"estimator_params": { "C": 100.0, "kernel": ["linear", "poly", "rbf"] }
}
Expand All @@ -30,7 +30,7 @@
"data": {
"dataset": "mnist",
"split_kwargs": { "train_size": 20000, "test_size": null },
"preprocessing_kwargs": { "normalize": false }
"preprocessing_kwargs": {"normalize" : null}
Copy link
Copy Markdown
Contributor

@ethanglaser ethanglaser May 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"preprocessing_kwargs": {"normalize" : null}
"preprocessing_kwargs": { "normalize": null }

Minor adjustment but looks like spaces have been added before colons in a few places throughout the configs

},
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
}
Expand All @@ -45,7 +45,7 @@
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["linear", "poly", "rbf"] } }
},
{
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
},
{
Expand Down Expand Up @@ -75,7 +75,7 @@
"algorithm": { "estimator_params": { "nu": 0.1, "kernel": "rbf" } }
},
{
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
"algorithm": { "estimator_params": { "nu": 0.9, "kernel": ["linear", "rbf"] } }
}
],
Expand All @@ -89,7 +89,7 @@
"algorithm": { "estimator_params": { "nu": 0.8, "C": 2.0, "kernel": "rbf" } }
},
{
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
"algorithm": { "estimator_params": { "nu": 0.9, "C": 1.0, "kernel": "rbf" } }
},
{
Expand Down
2 changes: 1 addition & 1 deletion configs/testing/azure-pipelines-ci.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"random_state": 42
},
"preprocessing_kwargs": {
"normalize": true
"normalize": "standard"
}
},
"bench": { "n_runs": 5 },
Expand Down
11 changes: 8 additions & 3 deletions configs/weekly/dbscan.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,19 @@
"high-load dbscan datasets": {
"data": [
{
"dataset": ["cifar", "road_network", "covtype"],
"dataset" : "cifar",
"split_kwargs": { "ignore" : true },
Comment on lines +7 to +8
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"dataset" : "cifar",
"split_kwargs": { "ignore" : true },
"dataset": "cifar",
"split_kwargs": { "ignore": true },

"preprocessing_kwargs": { "normalize": "mean" }
},
{
"dataset": ["road_network", "covtype"],
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
},
{
"dataset": "susy",
"split_kwargs": { "train_size": 800000 },
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
},
{
"source": "make_blobs",
Expand Down
5 changes: 3 additions & 2 deletions configs/weekly/kmeans.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"data": {
"dataset": ["susy", "hepmass"],
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
}
},
{
Expand All @@ -37,7 +37,8 @@
{
"data": {
"dataset": "cifar",
"split_kwargs": { "ignore": true }
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": "mean" }
}
}
]
Expand Down
2 changes: 1 addition & 1 deletion configs/weekly/linear_model.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
"susy"
],
"preprocessing_kwargs": {
"normalize": true
"normalize": "standard"
},
"split_kwargs": { "ignore": true }
}
Expand Down
2 changes: 1 addition & 1 deletion configs/weekly/svm.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"data": {
"dataset": "mnist",
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": false }
"preprocessing_kwargs": { "normalize": null }
},
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
}
Expand Down
9 changes: 8 additions & 1 deletion configs/weekly/tsne.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,16 @@
},
{
"data": {
"dataset": ["sensit", "mnist", "cifar"],
"dataset": ["sensit", "mnist"],
"split_kwargs": { "ignore": true }
}
},
{
"data": {
"dataset" : "cifar",
"split_kwargs": { "ignore" : true },
"preprocessing_kwargs": { "normalize": "mean" }
}
}
]
},
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ pandas
tabulate
fastparquet
h5py
openml
openpyxl
tqdm
psutil
Expand Down
45 changes: 45 additions & 0 deletions sklbench/benchmarks/sklearn_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,48 @@ def verify_patching(stream: io.StringIO, function_name) -> bool:
return acceleration_lines > 0 and fallback_lines == 0


def validate_estimator_params(estimator_class, estimator_params: Dict) -> Dict:
    """Return the subset of ``estimator_params`` accepted by ``estimator_class``.

    The parameters are checked against the signature of the estimator's
    ``__init__``. Constructors that declare ``**kwargs`` accept arbitrary
    keywords, so all parameters are kept in that case. Unsupported
    parameters are dropped with a warning. If signature introspection
    fails for any reason, the original parameters are returned unchanged.

    Args:
        estimator_class: The estimator class to validate against
        estimator_params: Dictionary of parameters to validate

    Returns:
        Dictionary with only valid parameters
    """
    try:
        signature = inspect.signature(estimator_class.__init__)
        parameters = signature.parameters

        # A VAR_KEYWORD parameter (**kwargs) means any keyword is accepted,
        # so no filtering is possible or necessary.
        accepts_kwargs = any(
            param.kind == inspect.Parameter.VAR_KEYWORD
            for param in parameters.values()
        )
        if accepts_kwargs:
            return estimator_params

        supported = set(parameters) - {"self"}

        # Keep supported parameters; warn about and drop the rest.
        validated = {}
        for name, value in estimator_params.items():
            if name in supported:
                validated[name] = value
            else:
                logger.warning(
                    f"Parameter '{name}' is not supported by "
                    f"{estimator_class.__name__} and will be ignored"
                )
        return validated

    except Exception as e:
        # Best-effort validation: on any introspection failure, fall back
        # to the caller-provided parameters rather than aborting the run.
        logger.debug(f"Could not validate parameters for {estimator_class.__name__}: {e}")
        return estimator_params


def create_online_function(method_instance, data_args, batch_size):
n_batches = data_args[0].shape[0] // batch_size

Expand Down Expand Up @@ -491,6 +533,9 @@ def main(bench_case: BenchCase, filters: List[BenchCase]):
bench_case, "algorithm:estimator_params", dict()
)

# validate and filter estimator parameters
estimator_params = validate_estimator_params(estimator_class, estimator_params)

# get estimator methods for measurement
estimator_methods = get_estimator_methods(bench_case)

Expand Down
22 changes: 18 additions & 4 deletions sklbench/datasets/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,12 @@
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import (
MinMaxScaler,
OneHotEncoder,
OrdinalEncoder,
StandardScaler,
)

from ..utils.custom_types import Array
from ..utils.logger import logger
Expand Down Expand Up @@ -167,7 +172,7 @@ def preprocess_x(
x: Array,
replace_nan="auto",
category_encoding="ordinal",
normalize=False,
normalize=None,
force_for_sparse=True,
**kwargs,
) -> Array:
Expand Down Expand Up @@ -219,9 +224,18 @@ def preprocess_x(
pass
else:
logger.warning(f'Unknown "{category_encoding}" category encoding type.')
# Mean-Standard normalization
# Normalization
if normalize:
x = (x - x.mean()) / x.std()
if normalize == "standard":
scaler = StandardScaler(with_mean=True, with_std=True)
elif normalize == "mean":
scaler = StandardScaler(with_mean=True, with_std=False)
elif normalize == "minmax":
scaler = MinMaxScaler(feature_range=(0, 1))
else:
logger.warning(f'Unknown "{normalize}" normalization type.')
if scaler is not None:
return pd.DataFrame(scaler.fit_transform(x), columns=x.columns, index=x.index)
if return_type == np.ndarray:
return x.values
else:
Expand Down
Loading
Loading