Merge pull request #5 from OpenTabular/main

AnFreTh · web-flow · commit 3cea0c1f6f60 · 2025-04-13T10:54:39.000+02:00
v0.0.2
diff --git a/README.md b/README.md
@@ -54,34 +54,43 @@ pip install -e .
 
 ```python
 import pandas as pd
-from pretab import Preprocessor
+import numpy as np
+from pretab.preprocessor import Preprocessor
 
+# Simulated tabular dataset
 df = pd.DataFrame({
-    "age": [22, 35, 46, 59],
-    "income": [40000, 52000, 98000, 87000],
-    "job": ["nurse", "engineer", "scientist", "teacher"]
+    "age": np.random.randint(18, 65, size=100),
+    "income": np.random.normal(60000, 15000, size=100).astype(int),
+    "job": np.random.choice(["nurse", "engineer", "scientist", "teacher", "artist", "manager"], size=100),
+    "city": np.random.choice(["Berlin", "Munich", "Hamburg", "Cologne"], size=100),
+    "experience": np.random.randint(0, 40, size=100)
 })
 
-# Optional feature-specific config
+y = np.random.randn(100, 1)
+
+# Optional feature-specific preprocessing config
 config = {
     "age": "ple",
     "income": "rbf",
-    "job": "one-hot"
+    "experience": "quantile",
+    "job": "one-hot",
+    "city": "none"
 }
 
+# Initialize Preprocessor
 preprocessor = Preprocessor(
     feature_preprocessing=config,
     task="regression"
 )
 
-# Fit and transform
-X_dict = preprocessor.fit_transform(df)
+# Fit and transform the data into a dictionary of feature arrays
+X_dict = preprocessor.fit_transform(df, y)
 
-# Optionally get stacked array
-X_array = preprocessor.transform(df, return_dict=False)
+# Optionally get a stacked array instead of a dictionary
+X_array = preprocessor.transform(df, return_array=True)
 
-# Get feature info
-preprocessor.get_feature_info()
+# Get feature metadata
+preprocessor.get_feature_info(verbose=True)
 ```
 
 ---
diff --git a/pretab/__version__.py b/pretab/__version__.py
@@ -1,4 +1,4 @@
 """Version information."""
 
 # The following line *must* be the last in the module, exactly as formatted:
-__version__ = "0.0.1"
+__version__ = "0.0.2"
diff --git a/pretab/utils/get_numerical.py b/pretab/utils/get_numerical.py
@@ -45,10 +45,11 @@ def get_numerical_transformer_steps(
             ("imputer", SimpleImputer(strategy=imputer_strategy, **imputer_kwargs))
         )
 
-    if scaling == "standardization":
-        steps.append(("scaler", StandardScaler()))
-    elif scaling == "minmax":
-        steps.append(("minmax", MinMaxScaler(feature_range=(-1, 1))))
+    # Optional scaling steps, keyed by the value of the `scaling` argument
+    scalers = {
+        "standardization": ("scaler", StandardScaler()),
+        "minmax": ("minmax", MinMaxScaler(feature_range=(-1, 1))),
+    }
 
     method_map = {
         "standardization": (StandardScaler, []),
@@ -93,6 +94,10 @@ def get_numerical_transformer_steps(
         "none": (NoTransformer, []),
     }
 
+    # Append the optional scaling step unless `method` names that same scaler
+    if scaling in scalers and scaling != method:
+        steps.append(scalers[scaling])
+
     if method not in method_map:
         raise ValueError(f"Unknown numerical transformer method: {method}")