From 88d15bc7dd2c51f8f3edad45ba7f873b02911cf6 Mon Sep 17 00:00:00 2001
From: ankitlade12 <ankitlade12@gmail.com>
Date: Mon, 9 Feb 2026 08:24:26 -0600
Subject: [PATCH 1/2] Add TextFeatures transformer for extracting text features

- Add TextFeatures class to extract 19 text features from string columns
- Features include char_count, word_count, sentence_count, digit_count, etc.
- Add comprehensive test suite (16 tests)
---
 feature_engine/text/__init__.py       |   9 +
 feature_engine/text/text_features.py  | 327 ++++++++++++++++++++++++++
 tests/test_text/__init__.py           | 167 +++++++++++++
 tests/test_text/test_text_features.py | 167 +++++++++++++
 4 files changed, 670 insertions(+)
 create mode 100644 feature_engine/text/__init__.py
 create mode 100644 feature_engine/text/text_features.py
 create mode 100644 tests/test_text/__init__.py
 create mode 100644 tests/test_text/test_text_features.py

diff --git a/feature_engine/text/__init__.py b/feature_engine/text/__init__.py
new file mode 100644
index 000000000..14626b79c
--- /dev/null
+++ b/feature_engine/text/__init__.py
@@ -0,0 +1,9 @@
+"""
+The module text includes classes to extract features from text/string variables.
+"""
+
+from .text_features import TextFeatures
+
+__all__ = [
+    "TextFeatures",
+]
diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py
new file mode 100644
index 000000000..c06afdf79
--- /dev/null
+++ b/feature_engine/text/text_features.py
@@ -0,0 +1,327 @@
+# Authors: Ankit Hemant Lade (contributor)
+# License: BSD 3 clause
+
+from typing import List, Optional, Union
+
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils.validation import check_is_fitted
+
+from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin
+from feature_engine._check_init_parameters.check_init_input_params import (
+    _check_param_drop_original,
+)
+from feature_engine.dataframe_checks import _check_X_matches_training_df, check_X
+from feature_engine.tags import _return_tags
+
+# Maps each feature name to a function that turns a string Series into a feature
+TEXT_FEATURES = {
+    "char_count": lambda x: x.str.len(),
+    "word_count": lambda x: x.str.split().str.len(),
+    "sentence_count": lambda x: x.str.count(r"[.!?]+"),
+    "avg_word_length": lambda x: x.apply(
+        lambda s: sum(len(w) for w in str(s).split()) / max(len(str(s).split()), 1)
+    ),
+    "digit_count": lambda x: x.str.count(r"\d"),
+    "uppercase_count": lambda x: x.str.count(r"[A-Z]"),
+    "lowercase_count": lambda x: x.str.count(r"[a-z]"),
+    "special_char_count": lambda x: x.str.count(r"[^a-zA-Z0-9\s]"),
+    "whitespace_count": lambda x: x.str.count(r"\s"),
+    "whitespace_ratio": lambda x: x.str.count(r"\s") / x.str.len().replace(0, 1),
+    "digit_ratio": lambda x: x.str.count(r"\d") / x.str.len().replace(0, 1),
+    "uppercase_ratio": lambda x: x.str.count(r"[A-Z]") / x.str.len().replace(0, 1),
+    "has_digits": lambda x: x.str.contains(r"\d", regex=True).astype(int),
+    "has_uppercase": lambda x: x.str.contains(r"[A-Z]", regex=True).astype(int),
+    "is_empty": lambda x: (x.str.len() == 0).astype(int),
+    "starts_with_uppercase": lambda x: x.str.match(r"^[A-Z]").astype(int),
+    "ends_with_punctuation": lambda x: x.str.contains(r"[.!?]$").astype(int),
+    "unique_word_count": lambda x: x.apply(lambda s: len(set(str(s).lower().split()))),
+    "unique_word_ratio": lambda x: x.apply(
+        lambda s: len(set(str(s).lower().split())) / max(len(str(s).split()), 1)
+    ),
+}
+
+
+class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin):
+    """
+    TextFeatures() extracts numerical features from text/string variables. This
+    transformer is useful for extracting basic text statistics that can be used
+    as features in machine learning models.
+
+    The transformer can extract various text features including character counts,
+    word counts, sentence counts, and various ratios and indicators.
+
+    A list of variables can be passed as an argument. Alternatively, the transformer
+    will automatically select and transform all variables of type object (string).
+
+    More details in the :ref:`User Guide <text_features>`.
+
+    Parameters
+    ----------
+    variables: str or list, default=None
+        The text/string variable(s) to extract features from. If None, the
+        transformer will automatically select all object (string) columns.
+
+    features: list, default=None
+        List of text features to extract. Available features are:
+
+        - 'char_count': Number of characters in the text
+        - 'word_count': Number of words (whitespace-separated tokens)
+        - 'sentence_count': Number of sentences (counted as runs of .!? characters)
+        - 'avg_word_length': Average length of words
+        - 'digit_count': Number of digit characters
+        - 'uppercase_count': Number of uppercase letters (A-Z only)
+        - 'lowercase_count': Number of lowercase letters (a-z only)
+        - 'special_char_count': Number of non-alphanumeric, non-whitespace characters
+        - 'whitespace_count': Number of whitespace characters
+        - 'whitespace_ratio': Ratio of whitespace to total characters
+        - 'digit_ratio': Ratio of digits to total characters
+        - 'uppercase_ratio': Ratio of uppercase to total characters
+        - 'has_digits': Binary indicator if text contains digits
+        - 'has_uppercase': Binary indicator if text contains uppercase
+        - 'is_empty': Binary indicator if text is empty (missing values included)
+        - 'starts_with_uppercase': Binary indicator if text starts with uppercase
+        - 'ends_with_punctuation': Binary indicator if text ends with .!?
+        - 'unique_word_count': Number of unique words (case-insensitive)
+        - 'unique_word_ratio': Ratio of unique words to total words
+
+        If None, extracts all available features.
+
+    drop_original: bool, default=False
+        Whether to drop the original text columns after transformation.
+
+    Attributes
+    ----------
+    variables_:
+        The list of text variables that will be transformed.
+
+    features_:
+        The list of features that will be extracted.
+
+    feature_names_in_:
+        List with the names of features seen during fit.
+
+    n_features_in_:
+        The number of features in the train set used in fit.
+
+    Methods
+    -------
+    fit:
+        This transformer does not learn parameters. It stores the feature names
+        and validates input.
+
+    fit_transform:
+        Fit to data, then transform it.
+
+    transform:
+        Extract text features and add them to the dataframe.
+
+    get_feature_names_out:
+        Get output feature names for transformation.
+
+    See Also
+    --------
+    feature_engine.encoding.StringSimilarityEncoder :
+        Encodes categorical variables based on string similarity.
+
+    Examples
+    --------
+
+    >>> import pandas as pd
+    >>> from feature_engine.text import TextFeatures
+    >>> X = pd.DataFrame({
+    ...     'text': ['Hello World!', 'Python is GREAT.', 'ML rocks 123']
+    ... })
+    >>> tf = TextFeatures(features=['char_count', 'word_count', 'has_digits'])
+    >>> tf.fit(X)
+    >>> X = tf.transform(X)
+    >>> X
+                   text  text_char_count  text_word_count  text_has_digits
+    0      Hello World!               12                2                0
+    1  Python is GREAT.               16                3                0
+    2      ML rocks 123               12                3                1
+    """
+
+    def __init__(
+        self,
+        variables: Union[None, str, List[str]] = None,
+        features: Union[None, List[str]] = None,
+        drop_original: bool = False,
+    ) -> None:
+
+        # A single column name is shorthand for a one-element list
+        if isinstance(variables, str):
+            variables = [variables]
+        elif variables is not None:
+            if not isinstance(variables, list) or any(
+                not isinstance(name, str) for name in variables
+            ):
+                raise ValueError(
+                    "variables must be None, a string, or a list of strings. "
+                    f"Got {type(variables).__name__} instead."
+                )
+
+        # Requested features must be a list of keys of TEXT_FEATURES
+        if features is not None:
+            if not isinstance(features, list) or any(
+                not isinstance(name, str) for name in features
+            ):
+                raise ValueError(
+                    "features must be None or a list of strings. "
+                    f"Got {type(features).__name__} instead."
+                )
+            invalid_features = set(features).difference(TEXT_FEATURES)
+            if invalid_features:
+                raise ValueError(
+                    f"Invalid features: {invalid_features}. "
+                    f"Available features are: {list(TEXT_FEATURES.keys())}"
+                )
+
+        _check_param_drop_original(drop_original)
+
+        self.variables = variables
+        self.features = features
+        self.drop_original = drop_original
+
+    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
+        """
+        This transformer does not learn parameters.
+
+        Finds the object/string columns when `variables` is None; otherwise checks
+        that the given variables are in the dataframe. Their dtype is not validated.
+
+        Parameters
+        ----------
+        X: pandas dataframe of shape = [n_samples, n_features]
+            The training input samples.
+
+        y: pandas Series, or np.array. Defaults to None.
+            It is not needed in this transformer. You can pass y or None.
+
+        Returns
+        -------
+        self: TextFeatures
+            The fitted transformer.
+        """
+
+        # check input dataframe
+        X = check_X(X)
+
+        # Find or validate text variables
+        if self.variables is None:
+            # Select object/string columns
+            self.variables_ = [col for col in X.columns if X[col].dtype == "object"]
+            if len(self.variables_) == 0:
+                raise ValueError(
+                    "No object/string columns found in the dataframe. "
+                    "Please specify variables explicitly."
+                )
+        else:
+            # Validate user-specified variables exist
+            missing = set(self.variables) - set(X.columns)
+            if missing:
+                raise ValueError(
+                    f"Variables {missing} are not present in the dataframe."
+                )
+            self.variables_ = self.variables
+
+        # Set features to extract
+        if self.features is None:
+            self.features_ = list(TEXT_FEATURES.keys())
+        else:
+            self.features_ = self.features
+
+        # save input features
+        self.feature_names_in_ = X.columns.tolist()
+
+        # save train set shape
+        self.n_features_in_ = X.shape[1]
+
+        return self
+
+    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+        """
+        Add one new column per text variable and requested feature.
+
+        Parameters
+        ----------
+        X: pandas dataframe of shape = [n_samples, n_features]
+            The data to transform. Missing text is treated as an empty string.
+
+        Returns
+        -------
+        X_new: Pandas dataframe
+            The dataframe with the new `{variable}_{feature}` columns appended.
+        """
+
+        # fit must have been called first
+        check_is_fitted(self)
+
+        # validate the input dataframe
+        X = check_X(X)
+
+        # the number of columns must match the dataframe used to fit
+        _check_X_matches_training_df(X, self.n_features_in_)
+
+        # keep the columns in the order seen during fit
+        X = X[self.feature_names_in_]
+
+        # Build the new columns one text variable at a time
+        for var in self.variables_:
+            # Missing values are handled as empty strings
+            text_col = X[var].fillna("")
+
+            for feature_name in self.features_:
+                # Look up the extraction function registered under this name
+                extract = TEXT_FEATURES[feature_name]
+
+                # Any NaN produced by the computation becomes 0
+                X[f"{var}_{feature_name}"] = extract(text_col).fillna(0)
+
+        # Optionally remove the source text columns
+        if self.drop_original:
+            return X.drop(columns=self.variables_)
+
+        return X
+
+    def get_feature_names_out(self, input_features=None) -> List[str]:
+        """
+        Return the column names of the dataframe produced by transform.
+
+        Parameters
+        ----------
+        input_features : array-like of str or None, default=None
+            Not read by this method; the names stored during fit are always used.
+
+        Returns
+        -------
+        feature_names_out : list of str
+            Input names (minus dropped text variables) followed by the new names.
+        """
+        check_is_fitted(self)
+
+        # Original columns come first; text variables are skipped when dropped
+        kept = [
+            col
+            for col in self.feature_names_in_
+            if not (self.drop_original and col in self.variables_)
+        ]
+
+        # New names follow, grouped by variable, in the order transform adds them
+        created = [
+            f"{var}_{feature_name}"
+            for var in self.variables_
+            for feature_name in self.features_
+        ]
+        return kept + created
+
+    def _more_tags(self):
+        # Start from the shared default tags, then set the entries for this class
+        tags = _return_tags()
+        tags.update({"allow_nan": True, "variables": "categorical"})
+        return tags
+
+    def __sklearn_tags__(self):
+        sk_tags = super().__sklearn_tags__()
+        sk_tags.input_tags.allow_nan = True  # same flag as set in _more_tags
+        return sk_tags
diff --git a/tests/test_text/__init__.py b/tests/test_text/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py
new file mode 100644
index 000000000..f4e9de3ea
--- /dev/null
+++ b/tests/test_text/test_text_features.py
@@ -0,0 +1,167 @@
+import pandas as pd
+import pytest
+
+from feature_engine.text import TextFeatures
+
+
+class TestTextFeatures:
+    """Test cases for TextFeatures transformer."""
+
+    def test_default_all_features(self):
+        """Test extracting all features with default parameters."""
+        X = pd.DataFrame({"text": ["Hello World!", "Python 123", "AI"]})
+        transformer = TextFeatures()
+        X_tr = transformer.fit_transform(X)
+
+        # Check that new columns were added
+        assert "text_char_count" in X_tr.columns
+        assert "text_word_count" in X_tr.columns
+        assert "text_digit_count" in X_tr.columns
+
+        # Check char_count
+        assert X_tr["text_char_count"].tolist() == [12, 10, 2]
+
+        # Check word_count
+        assert X_tr["text_word_count"].tolist() == [2, 2, 1]
+
+        # Check digit_count
+        assert X_tr["text_digit_count"].tolist() == [0, 3, 0]
+
+    def test_specific_features(self):
+        """Test extracting specific features only."""
+        X = pd.DataFrame({"text": ["Hello", "World"]})
+        transformer = TextFeatures(features=["char_count", "word_count"])
+        X_tr = transformer.fit_transform(X)
+
+        # Check only specified features are extracted
+        assert "text_char_count" in X_tr.columns
+        assert "text_word_count" in X_tr.columns
+        assert "text_digit_count" not in X_tr.columns
+        assert "text_uppercase_count" not in X_tr.columns
+
+    def test_specific_variables(self):
+        """Test extracting features from specific variables only."""
+        X = pd.DataFrame(
+            {"text1": ["Hello", "World"], "text2": ["Foo", "Bar"], "numeric": [1, 2]}
+        )
+        transformer = TextFeatures(variables=["text1"], features=["char_count"])
+        X_tr = transformer.fit_transform(X)
+
+        # Only text1 should have features extracted
+        assert "text1_char_count" in X_tr.columns
+        assert "text2_char_count" not in X_tr.columns
+
+    def test_drop_original(self):
+        """Test drop_original parameter."""
+        X = pd.DataFrame({"text": ["Hello", "World"], "other": [1, 2]})
+        transformer = TextFeatures(features=["char_count"], drop_original=True)
+        X_tr = transformer.fit_transform(X)
+
+        assert "text" not in X_tr.columns
+        assert "text_char_count" in X_tr.columns
+        assert "other" in X_tr.columns
+
+    def test_empty_string_handling(self):
+        """Test handling of empty strings."""
+        X = pd.DataFrame({"text": ["", "Hello", ""]})
+        transformer = TextFeatures(features=["char_count", "word_count", "is_empty"])
+        X_tr = transformer.fit_transform(X)
+        assert X_tr["text_char_count"].tolist() == [0, 5, 0]
+        assert X_tr["text_word_count"].tolist() == [0, 1, 0]
+        assert X_tr["text_is_empty"].tolist() == [1, 0, 1]
+
+    def test_nan_handling(self):
+        """Test handling of NaN values."""
+        X = pd.DataFrame({"text": ["Hello", None, "World"]})
+        transformer = TextFeatures(features=["char_count"])
+        X_tr = transformer.fit_transform(X)
+
+        # NaN should be filled with empty string, resulting in char_count of 0
+        assert X_tr["text_char_count"].tolist() == [5, 0, 5]
+
+    def test_uppercase_features(self):
+        """Test uppercase-related features."""
+        X = pd.DataFrame({"text": ["HELLO", "hello", "HeLLo"]})
+        transformer = TextFeatures(
+            features=["uppercase_count", "has_uppercase", "starts_with_uppercase"]
+        )
+        X_tr = transformer.fit_transform(X)
+
+        assert X_tr["text_uppercase_count"].tolist() == [5, 0, 3]
+        assert X_tr["text_has_uppercase"].tolist() == [1, 0, 1]
+        assert X_tr["text_starts_with_uppercase"].tolist() == [1, 0, 1]
+
+    def test_sentence_count(self):
+        """Test sentence counting."""
+        X = pd.DataFrame({"text": ["Hello. World!", "One sentence", "A? B! C."]})
+        transformer = TextFeatures(features=["sentence_count"])
+        X_tr = transformer.fit_transform(X)
+
+        assert X_tr["text_sentence_count"].tolist() == [2, 0, 3]
+
+    def test_unique_word_features(self):
+        """Test unique word features."""
+        X = pd.DataFrame({"text": ["the the the", "a b c", "x"]})
+        transformer = TextFeatures(features=["unique_word_count", "unique_word_ratio"])
+        X_tr = transformer.fit_transform(X)
+
+        assert X_tr["text_unique_word_count"].tolist() == [1, 3, 1]
+        assert X_tr["text_unique_word_ratio"].tolist() == [1 / 3, 1.0, 1.0]
+
+    def test_invalid_feature_raises_error(self):
+        """Test that invalid feature name raises ValueError."""
+        with pytest.raises(ValueError, match="Invalid features"):
+            TextFeatures(features=["invalid_feature"])
+
+    def test_invalid_variables_raises_error(self):
+        """Test that invalid variables parameter raises ValueError."""
+        with pytest.raises(ValueError, match="variables must be"):
+            TextFeatures(variables=123)
+
+    def test_missing_variable_raises_error(self):
+        """Test that missing variable raises ValueError on fit."""
+        X = pd.DataFrame({"text": ["Hello"]})
+        transformer = TextFeatures(variables=["nonexistent"])
+        with pytest.raises(ValueError, match="not present in the dataframe"):
+            transformer.fit(X)
+
+    def test_no_text_columns_raises_error(self):
+        """Test that no text columns raises error when variables=None."""
+        X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+        transformer = TextFeatures()
+        with pytest.raises(ValueError, match="No object/string columns found"):
+            transformer.fit(X)
+
+    def test_fit_stores_attributes(self):
+        """Test that fit stores expected attributes."""
+        X = pd.DataFrame({"text": ["Hello"]})
+        transformer = TextFeatures()
+        transformer.fit(X)
+
+        assert hasattr(transformer, "variables_")
+        assert hasattr(transformer, "features_")
+        assert hasattr(transformer, "feature_names_in_")
+        assert hasattr(transformer, "n_features_in_")
+
+    def test_get_feature_names_out(self):
+        """Test get_feature_names_out returns correct names."""
+        X = pd.DataFrame({"text": ["Hello"], "other": [1]})
+        transformer = TextFeatures(features=["char_count", "word_count"])
+        transformer.fit(X)
+
+        feature_names = transformer.get_feature_names_out()
+        assert "text" in feature_names
+        assert "other" in feature_names
+        assert "text_char_count" in feature_names
+        assert "text_word_count" in feature_names
+
+    def test_get_feature_names_out_with_drop(self):
+        """Test get_feature_names_out with drop_original=True."""
+        X = pd.DataFrame({"text": ["Hello"], "other": [1]})
+        transformer = TextFeatures(features=["char_count"], drop_original=True)
+        transformer.fit(X)
+
+        feature_names = transformer.get_feature_names_out()
+        assert "text" not in feature_names
+        assert "other" in feature_names
+        assert "text_char_count" in feature_names

From 848670fbc2a9418c9e04e349f68f798ba1d55afc Mon Sep 17 00:00:00 2001
From: ankitlade12 <ankitlade12@gmail.com>
Date: Mon, 9 Feb 2026 08:34:47 -0600
Subject: [PATCH 2/2] Fix string dtype detection for pandas 2/3 compatibility

Use pd.api.types.is_string_dtype() and is_object_dtype() instead
of checking dtype == 'object' directly, which fails with StringDtype.
---
 feature_engine/text/text_features.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py
index c06afdf79..8a91f235e 100644
--- a/feature_engine/text/text_features.py
+++ b/feature_engine/text/text_features.py
@@ -209,8 +209,12 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
 
         # Find or validate text variables
         if self.variables is None:
-            # Select object/string columns
-            self.variables_ = [col for col in X.columns if X[col].dtype == "object"]
+            # Select object/string columns (handles both object dtype and StringDtype)
+            self.variables_ = [
+                col for col in X.columns
+                if pd.api.types.is_string_dtype(X[col])
+                or pd.api.types.is_object_dtype(X[col])
+            ]
             if len(self.variables_) == 0:
                 raise ValueError(
                     "No object/string columns found in the dataframe. "