Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 66 additions & 2 deletions feature_engine/encoding/one_hot.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause

import warnings
from typing import List, Optional, Union

import numpy as np
Expand Down Expand Up @@ -94,6 +95,19 @@ class OneHotEncoder(CategoricalMethodsMixin, CategoricalInitMixin):
to `True`, will ensure that for every binary variable in the dataset, only 1
dummy is created.

drop: str, default=None
Controls which category to drop when creating k-1 dummy variables. Only used
if `top_categories` is None. If `drop` is not None and `drop_last` is also
True, a `FutureWarning` is raised and `drop` takes precedence.

- ``None``: This parameter drops nothing; the legacy ``drop_last`` setting then
decides the outcome (k dummies if ``drop_last=False``, k-1 if ``drop_last=True``).
- ``'last'``: Drops the last category in alphabetical order.
- ``'first'``: Drops the first category in alphabetical order.
- ``'most_frequent'``: Drops the most frequent category found during ``fit()``.
If there is a tie, a ``UserWarning`` is raised and the first
category alphabetically among the tied categories is dropped.

{variables}

{ignore_format}
Expand Down Expand Up @@ -162,6 +176,7 @@ def __init__(
top_categories: Optional[int] = None,
drop_last: bool = False,
drop_last_binary: bool = False,
drop: Optional[str] = None,
variables: Union[None, int, str, List[Union[str, int]]] = None,
ignore_format: bool = False,
) -> None:
Expand All @@ -185,10 +200,26 @@ def __init__(
f"Got {drop_last_binary} instead."
)

if drop is not None and drop not in ("last", "first", "most_frequent"):
raise ValueError(
"drop takes only values None, 'last', 'first', or "
f"'most_frequent'. Got {drop} instead."
)

if drop is not None and drop_last is True:
warnings.warn(
"Both `drop_last` and `drop` were set. `drop_last` is deprecated "
"when used together with `drop`. `drop` will take precedence. "
"In future versions, `drop_last` will be removed.",
FutureWarning,
stacklevel=2,
)

super().__init__(variables, ignore_format)
self.top_categories = top_categories
self.drop_last = drop_last
self.drop_last_binary = drop_last_binary
self.drop = drop

def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""
Expand Down Expand Up @@ -230,8 +261,41 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
else:
category_ls = list(X[var].unique())

# return k-1 dummies
if self.drop_last:
if self.drop is not None:
sorted_cats = sorted(category_ls)

if self.drop == "last":
self.encoder_dict_[var] = sorted_cats[:-1]

elif self.drop == "first":
self.encoder_dict_[var] = sorted_cats[1:]

elif self.drop == "most_frequent":
freq = X[var].value_counts()
max_freq = freq.iloc[0]
most_frequent_cats = freq[
freq == max_freq
].index.tolist()

if len(most_frequent_cats) > 1:
cat_to_drop = sorted(most_frequent_cats)[0]
warnings.warn(
f"Variable '{var}': multiple categories "
f"share the highest frequency ({max_freq}). "
f"Dropping '{cat_to_drop}' (first "
f"alphabetically among ties).",
UserWarning,
stacklevel=2,
)
else:
cat_to_drop = most_frequent_cats[0]

self.encoder_dict_[var] = [
c for c in category_ls if c != cat_to_drop
]

# Legacy path: drop_last (backward compatible)
elif self.drop_last:
self.encoder_dict_[var] = category_ls[:-1]

# return k dummies
Expand Down
203 changes: 203 additions & 0 deletions tests/test_encoding/test_onehot_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,3 +534,206 @@ def test_inverse_transform_raises_not_implemented_error(df_enc_binary):
enc = OneHotEncoder().fit(df_enc_binary)
with pytest.raises(NotImplementedError):
enc.inverse_transform(df_enc_binary)


# ===========================================================================
# Tests for the new `drop` parameter (Issue #913)
# ===========================================================================


@pytest.fixture(scope="module")
def df_drop():
    """Build the shared frame used by the `drop` strategy tests.

    `x1` and `x2` each contain three distinct string levels that first appear
    in non-sorted order; `num` is a plain integer column.
    """
    columns = {
        "x1": ["c", "a", "b", "a", "c", "b", "a"],
        "x2": ["z", "y", "z", "x", "y", "z", "x"],
        "num": [1, 2, 3, 4, 5, 6, 7],
    }
    return pd.DataFrame(columns)


def test_drop_last_alphabetically(df_drop):
    """With drop='last', the final level in sorted order gets no dummy."""
    enc = OneHotEncoder(drop="last")
    enc.fit(df_drop)

    # Sorted levels are ['a', 'b', 'c'] and ['x', 'y', 'z']; the tail is removed.
    expected_levels = {"x1": ["a", "b"], "x2": ["x", "y"]}
    for var, levels in expected_levels.items():
        assert enc.encoder_dict_[var] == levels

    encoded_cols = enc.transform(df_drop).columns
    for dropped in ("x1_c", "x2_z"):
        assert dropped not in encoded_cols
    for kept in ("x1_a", "x1_b", "x2_x", "x2_y"):
        assert kept in encoded_cols


def test_drop_first_alphabetically(df_drop):
    """With drop='first', the leading level in sorted order gets no dummy."""
    enc = OneHotEncoder(drop="first")
    enc.fit(df_drop)

    # Sorted levels are ['a', 'b', 'c'] and ['x', 'y', 'z']; the head is removed.
    expected_levels = {"x1": ["b", "c"], "x2": ["y", "z"]}
    for var, levels in expected_levels.items():
        assert enc.encoder_dict_[var] == levels

    encoded_cols = enc.transform(df_drop).columns
    for dropped in ("x1_a", "x2_x"):
        assert dropped not in encoded_cols
    for kept in ("x1_b", "x1_c", "x2_y", "x2_z"):
        assert kept in encoded_cols


def test_drop_most_frequent():
    """With drop='most_frequent', the single dominant level gets no dummy."""
    # 'a' occurs 10 times, 'b' 5 times and 'c' 3 times: 'a' is the clear winner.
    values = ["a"] * 10 + ["b"] * 5 + ["c"] * 3
    data = pd.DataFrame({"x1": values})

    enc = OneHotEncoder(drop="most_frequent")
    enc.fit(data)

    learned = enc.encoder_dict_["x1"]
    assert "a" not in learned
    assert "b" in learned
    assert "c" in learned

    encoded_cols = enc.transform(data).columns
    assert "x1_a" not in encoded_cols
    for kept in ("x1_b", "x1_c"):
        assert kept in encoded_cols


def test_drop_most_frequent_with_tie():
    """A tie for the top frequency warns and drops the alphabetically first level."""
    # 'c' and 'a' both occur 5 times; 'b' trails with 3.
    values = ["c"] * 5 + ["a"] * 5 + ["b"] * 3
    data = pd.DataFrame({"x1": values})

    # Construction and fitting both run inside the warning-capture context.
    with pytest.warns(UserWarning, match="multiple categories share the highest"):
        enc = OneHotEncoder(drop="most_frequent")
        enc.fit(data)

    # Among the tied levels 'a' sorts first, so it is the one removed.
    learned = enc.encoder_dict_["x1"]
    assert "a" not in learned
    assert "b" in learned
    assert "c" in learned


def test_drop_ignored_when_top_categories_set():
    """When `top_categories` is given, `drop` has no effect on the result."""
    # Frequencies: a=10, b=5, c=3, d=1.
    values = ["a"] * 10 + ["b"] * 5 + ["c"] * 3 + ["d"] * 1
    data = pd.DataFrame({"x1": values})

    enc = OneHotEncoder(top_categories=2, drop="first")
    enc.fit(data)

    # The two most frequent levels are kept, including 'a', which
    # drop='first' would otherwise have removed.
    assert enc.encoder_dict_["x1"] == ["a", "b"]


def test_drop_overrides_drop_last():
    """Setting both options emits a FutureWarning and `drop` decides the result."""
    data = pd.DataFrame({"x1": ["c", "a", "b", "a", "c", "b", "a"]})

    # The warning is expected at construction time, before any fitting.
    with pytest.warns(FutureWarning, match="drop_last.*deprecated"):
        enc = OneHotEncoder(drop_last=True, drop="first")

    enc.fit(data)

    # Sorted levels are ['a', 'b', 'c']; drop='first' removes 'a'.
    assert enc.encoder_dict_["x1"] == ["b", "c"]


def test_drop_with_drop_last_binary():
    """`drop` and `drop_last_binary` can be combined on the same encoder."""
    data = pd.DataFrame(
        {
            "x1": ["a"] * 10 + ["b"] * 5 + ["c"] * 3,
            # Two-level (binary) variable.
            "x2": ["yes"] * 10 + ["no"] * 8,
        }
    )

    enc = OneHotEncoder(drop="first", drop_last_binary=True)
    enc.fit(data)
    learned = enc.encoder_dict_

    # Three-level variable: sorted ['a', 'b', 'c'] with 'a' removed.
    assert learned["x1"] == ["b", "c"]

    # Binary variable: exactly one level must remain.
    assert len(learned["x2"]) == 1


@pytest.mark.parametrize(
    "drop_value",
    [
        "empanada",
        "middle",
        123,
        True,
        ["last"],
    ],
)
def test_error_if_drop_not_valid_string(drop_value):
    """Any `drop` value outside the accepted set is rejected at construction."""
    expected_message = "drop takes only values"
    with pytest.raises(ValueError, match=expected_message):
        OneHotEncoder(drop=drop_value)


def test_get_feature_names_out_with_drop(df_enc_binary):
    """`get_feature_names_out` omits the dummy of the dropped category."""
    # Capture the input column index before fitting, as the names are
    # requested both implicitly (None) and explicitly below.
    input_features = df_enc_binary.columns

    # drop="first": e.g. var_A sorted as ['A', 'B', 'C'] loses 'A'.
    tr = OneHotEncoder(drop="first")
    tr.fit(df_enc_binary)

    dummy_names = [
        "var_A_B",
        "var_A_C",
        "var_B_B",
        "var_B_C",
        "var_C_UHU",
        "var_D_OHO",
    ]
    expected = ["var_num"] + dummy_names

    assert tr.get_feature_names_out(input_features=None) == expected
    assert tr.get_feature_names_out(input_features=input_features) == expected


def test_drop_none_produces_k_dummies(df_drop):
    """With drop=None and drop_last=False every category keeps its dummy."""
    enc = OneHotEncoder(drop=None, drop_last=False)
    enc.fit(df_drop)

    # Both categorical columns have 3 distinct levels, hence 3 dummies each.
    for var in ("x1", "x2"):
        assert len(enc.encoder_dict_[var]) == 3


def test_drop_last_backward_compatible(df_drop):
    """drop_last=True on its own keeps the legacy, appearance-ordered result."""
    enc = OneHotEncoder(drop_last=True)
    enc.fit(df_drop)

    # Legacy path: levels come from unique() (order of first appearance, not
    # sorted order) and the final element of that list is discarded.
    for var in ("x1", "x2"):
        seen_order = list(df_drop[var].unique())
        assert enc.encoder_dict_[var] == seen_order[:-1]