feature-engine · BALOGUN-DAVID · May 31, 2026 · May 31, 2026
diff --git a/feature_engine/encoding/one_hot.py b/feature_engine/encoding/one_hot.py
@@ -1,6 +1,7 @@
 # Authors: Soledad Galli <solegalli@protonmail.com>
 # License: BSD 3 clause
 
+import warnings
 from typing import List, Optional, Union
 
 import numpy as np
@@ -94,6 +95,19 @@ class OneHotEncoder(CategoricalMethodsMixin, CategoricalInitMixin):
         to `True`, will ensure that for every binary variable in the dataset, only 1
         dummy is created.
 
+    drop: str, default=None
+        Controls which category to drop when creating k-1 dummy variables. Only used
+        if `top_categories` is None. If `drop` is not None and `drop_last` is also
+        True, a `FutureWarning` is issued and `drop` takes precedence.
+
+        - ``None``: `drop` is not used and `drop_last` decides whether k or k-1
+          dummies are created.
+        - ``'last'``: Drops the last category in alphabetical order.
+        - ``'first'``: Drops the first category in alphabetical order.
+        - ``'most_frequent'``: Drops the most frequent category found during ``fit()``.
+          If there is a tie, a ``UserWarning`` is issued and the first
+          category alphabetically among the tied categories is dropped.
+
     {variables}
 
     {ignore_format}
@@ -162,6 +176,7 @@ def __init__(
         top_categories: Optional[int] = None,
         drop_last: bool = False,
         drop_last_binary: bool = False,
+        drop: Optional[str] = None,
         variables: Union[None, int, str, List[Union[str, int]]] = None,
         ignore_format: bool = False,
     ) -> None:
@@ -185,10 +200,26 @@ def __init__(
                 f"Got {drop_last_binary} instead."
             )
 
+        if drop is not None and drop not in ("last", "first", "most_frequent"):
+            raise ValueError(
+                "drop takes only values None, 'last', 'first', or "
+                f"'most_frequent'. Got {drop} instead."
+            )
+
+        if drop is not None and drop_last is True:
+            warnings.warn(
+                "Both `drop_last` and `drop` were set. `drop_last` is deprecated "
+                "when used together with `drop`. `drop` will take precedence. "
+                "In future versions, `drop_last` will be removed.",
+                FutureWarning,
+                stacklevel=2,
+            )
+
         super().__init__(variables, ignore_format)
         self.top_categories = top_categories
         self.drop_last = drop_last
         self.drop_last_binary = drop_last_binary
+        self.drop = drop
 
     def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         """
@@ -230,8 +261,41 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
             else:
                 category_ls = list(X[var].unique())
 
-                # return k-1 dummies
-                if self.drop_last:
+                if self.drop is not None:
+                    sorted_cats = sorted(category_ls)
+
+                    if self.drop == "last":
+                        self.encoder_dict_[var] = sorted_cats[:-1]
+
+                    elif self.drop == "first":
+                        self.encoder_dict_[var] = sorted_cats[1:]
+
+                    elif self.drop == "most_frequent":
+                        freq = X[var].value_counts()
+                        max_freq = freq.iloc[0]
+                        most_frequent_cats = freq[
+                            freq == max_freq
+                        ].index.tolist()
+
+                        if len(most_frequent_cats) > 1:
+                            cat_to_drop = sorted(most_frequent_cats)[0]
+                            warnings.warn(
+                                f"Variable '{var}': multiple categories "
+                                f"share the highest frequency ({max_freq}). "
+                                f"Dropping '{cat_to_drop}' (first "
+                                f"alphabetically among ties).",
+                                UserWarning,
+                                stacklevel=2,
+                            )
+                        else:
+                            cat_to_drop = most_frequent_cats[0]
+
+                        self.encoder_dict_[var] = [
+                            c for c in category_ls if c != cat_to_drop
+                        ]
+
+                # Legacy path: drop_last (backward compatible)
+                elif self.drop_last:
                     self.encoder_dict_[var] = category_ls[:-1]
 
                 # return k dummies

diff --git a/tests/test_encoding/test_onehot_encoder.py b/tests/test_encoding/test_onehot_encoder.py
@@ -534,3 +534,206 @@ def test_inverse_transform_raises_not_implemented_error(df_enc_binary):
     enc = OneHotEncoder().fit(df_enc_binary)
     with pytest.raises(NotImplementedError):
         enc.inverse_transform(df_enc_binary)
+
+
+# ===========================================================================
+# Tests for the new `drop` parameter (Issue #913)
+# ===========================================================================
+
+
+@pytest.fixture(scope="module")
+def df_drop():
+    """Return a small frame with two 3-category variables and a numeric one."""
+    data = {
+        # x1 -> a: 3, b: 2, c: 2; order of first appearance is c, a, b
+        "x1": ["c", "a", "b", "a", "c", "b", "a"],
+        # x2 -> z: 3, x: 2, y: 2; order of first appearance is z, y, x
+        "x2": ["z", "y", "z", "x", "y", "z", "x"],
+        "num": [1, 2, 3, 4, 5, 6, 7],
+    }
+    return pd.DataFrame(data)
+
+
+def test_drop_last_alphabetically(df_drop):
+    """With drop='last', the category that sorts last gets no dummy."""
+    enc = OneHotEncoder(drop="last")
+    enc.fit(df_drop)
+
+    # sorted categories are a, b, c and x, y, z; the final one is removed
+    assert enc.encoder_dict_["x1"] == ["a", "b"]
+    assert enc.encoder_dict_["x2"] == ["x", "y"]
+
+    transformed_cols = enc.transform(df_drop).columns
+
+    # the dropped categories must not appear among the dummies
+    for dropped in ("x1_c", "x2_z"):
+        assert dropped not in transformed_cols
+    # every remaining category gets its own dummy
+    for kept in ("x1_a", "x1_b", "x2_x", "x2_y"):
+        assert kept in transformed_cols
+
+
+def test_drop_first_alphabetically(df_drop):
+    """With drop='first', the category that sorts first gets no dummy."""
+    enc = OneHotEncoder(drop="first")
+    enc.fit(df_drop)
+
+    # sorted categories are a, b, c and x, y, z; the leading one is removed
+    assert enc.encoder_dict_["x1"] == ["b", "c"]
+    assert enc.encoder_dict_["x2"] == ["y", "z"]
+
+    transformed_cols = enc.transform(df_drop).columns
+
+    # the dropped categories must not appear among the dummies
+    for dropped in ("x1_a", "x2_x"):
+        assert dropped not in transformed_cols
+    # every remaining category gets its own dummy
+    for kept in ("x1_b", "x1_c", "x2_y", "x2_z"):
+        assert kept in transformed_cols
+
+
+def test_drop_most_frequent():
+    """With drop='most_frequent', the modal category gets no dummy."""
+    # counts: a -> 10, b -> 5, c -> 3, so 'a' is the unique mode
+    values = ["a"] * 10 + ["b"] * 5 + ["c"] * 3
+    df = pd.DataFrame({"x1": values})
+
+    enc = OneHotEncoder(drop="most_frequent")
+    enc.fit(df)
+
+    learned = enc.encoder_dict_["x1"]
+
+    # only the mode is missing from the learned categories
+    assert "a" not in learned
+    assert "b" in learned
+    assert "c" in learned
+
+    transformed_cols = enc.transform(df).columns
+    assert "x1_a" not in transformed_cols
+    assert "x1_b" in transformed_cols
+    assert "x1_c" in transformed_cols
+
+
+def test_drop_most_frequent_with_tie():
+    """A tie for the top frequency warns and drops the first one alphabetically."""
+    # counts: c -> 5, a -> 5, b -> 3, so 'a' and 'c' tie for the mode
+    values = ["c"] * 5 + ["a"] * 5 + ["b"] * 3
+    df = pd.DataFrame({"x1": values})
+
+    with pytest.warns(UserWarning, match="multiple categories share the highest"):
+        enc = OneHotEncoder(drop="most_frequent")
+        enc.fit(df)
+
+    learned = enc.encoder_dict_["x1"]
+
+    # among the tied categories, 'a' sorts first and is the one dropped
+    assert "a" not in learned
+    assert "b" in learned
+    assert "c" in learned
+
+
+def test_drop_ignored_when_top_categories_set():
+    """When top_categories is given, the drop strategy is not applied."""
+    # counts: a -> 10, b -> 5, c -> 3, d -> 1
+    values = ["a"] * 10 + ["b"] * 5 + ["c"] * 3 + ["d"] * 1
+    df = pd.DataFrame({"x1": values})
+
+    enc = OneHotEncoder(top_categories=2, drop="first")
+    enc.fit(df)
+
+    learned = enc.encoder_dict_["x1"]
+
+    # the 2 most frequent categories are kept; drop="first" removed nothing
+    assert learned == ["a", "b"]
+
+
+def test_drop_overrides_drop_last():
+    """Setting drop together with drop_last=True warns, and drop is applied."""
+    values = ["c", "a", "b", "a", "c", "b", "a"]
+    df = pd.DataFrame({"x1": values})
+
+    # the FutureWarning is emitted at construction time, before fit
+    with pytest.warns(FutureWarning, match="drop_last.*deprecated"):
+        enc = OneHotEncoder(drop_last=True, drop="first")
+
+    enc.fit(df)
+
+    learned = enc.encoder_dict_["x1"]
+
+    # sorted categories are a, b, c; drop="first" removes 'a'
+    assert learned == ["b", "c"]
+
+
+def test_drop_with_drop_last_binary():
+    """drop and drop_last_binary can be combined in the same encoder."""
+    data = {
+        # 3 categories: handled by the drop strategy
+        "x1": ["a"] * 10 + ["b"] * 5 + ["c"] * 3,
+        # 2 categories: a binary variable
+        "x2": ["yes"] * 10 + ["no"] * 8,
+    }
+    df = pd.DataFrame(data)
+
+    enc = OneHotEncoder(drop="first", drop_last_binary=True)
+    enc.fit(df)
+
+    # x1: sorted categories are a, b, c; drop="first" removes 'a'
+    assert enc.encoder_dict_["x1"] == ["b", "c"]
+    # x2: the binary variable ends up with a single dummy
+    assert len(enc.encoder_dict_["x2"]) == 1
+
+
+# unknown strings and non-string values are all rejected at construction
+@pytest.mark.parametrize("drop_value", ["empanada", "middle", 123, True, ["last"]])
+def test_error_if_drop_not_valid_string(drop_value):
+    """Anything other than None or a supported strategy raises ValueError."""
+    expected_msg = "drop takes only values"
+    with pytest.raises(ValueError, match=expected_msg):
+        OneHotEncoder(drop=drop_value)
+
+
+def test_get_feature_names_out_with_drop(df_enc_binary):
+    """Feature names returned after fit omit the dummy of the dropped category."""
+    input_features = df_enc_binary.columns
+    # drop="first" removes the category that sorts first in each variable,
+    # e.g. 'A' for var_A, so no 'var_A_A' name is expected below
+    tr = OneHotEncoder(drop="first")
+    tr.fit(df_enc_binary)
+
+    # the non-encoded column is listed first, followed by the dummies
+    feat_out = [
+        "var_num",
+        "var_A_B",
+        "var_A_C",
+        "var_B_B",
+        "var_B_C",
+        "var_C_UHU",
+        "var_D_OHO",
+    ]
+    assert tr.get_feature_names_out(input_features=None) == feat_out
+    assert tr.get_feature_names_out(input_features=input_features) == feat_out
+
+
+def test_drop_none_produces_k_dummies(df_drop):
+    """With drop=None and drop_last=False, each category keeps its dummy."""
+    enc = OneHotEncoder(drop=None, drop_last=False)
+    enc.fit(df_drop)
+
+    # both categorical variables have 3 unique categories,
+    # hence each of them is encoded into 3 dummies
+    for var in ("x1", "x2"):
+        assert len(enc.encoder_dict_[var]) == 3
+
+
+def test_drop_last_backward_compatible(df_drop):
+    """drop_last=True on its own keeps the behaviour it had before `drop`."""
+    enc = OneHotEncoder(drop_last=True)
+    enc.fit(df_drop)
+
+    # The legacy path works on the categories in order of appearance, as
+    # returned by unique(), and removes the final one. No sorting happens.
+    for var in ("x1", "x2"):
+        seen_order = list(df_drop[var].unique())
+        expected = seen_order[:-1]
+
+        assert enc.encoder_dict_[var] == expected