From 88d15bc7dd2c51f8f3edad45ba7f873b02911cf6 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Mon, 9 Feb 2026 08:24:26 -0600 Subject: [PATCH 1/2] Add TextFeatures transformer for extracting text features - Add TextFeatures class to extract 19 text features from string columns - Features include char_count, word_count, sentence_count, digit_count, etc. - Add comprehensive test suite (16 tests) --- feature_engine/text/__init__.py | 9 + feature_engine/text/text_features.py | 327 ++++++++++++++++++++++++++ tests/test_text/__init__.py | 167 +++++++++++++ tests/test_text/test_text_features.py | 167 +++++++++++++ 4 files changed, 670 insertions(+) create mode 100644 feature_engine/text/__init__.py create mode 100644 feature_engine/text/text_features.py create mode 100644 tests/test_text/__init__.py create mode 100644 tests/test_text/test_text_features.py diff --git a/feature_engine/text/__init__.py b/feature_engine/text/__init__.py new file mode 100644 index 000000000..14626b79c --- /dev/null +++ b/feature_engine/text/__init__.py @@ -0,0 +1,9 @@ +""" +The module text includes classes to extract features from text/string variables. 
+""" + +from .text_features import TextFeatures + +__all__ = [ + "TextFeatures", +] diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py new file mode 100644 index 000000000..c06afdf79 --- /dev/null +++ b/feature_engine/text/text_features.py @@ -0,0 +1,327 @@ +# Authors: Ankit Hemant Lade (contributor) +# License: BSD 3 clause + +from typing import List, Optional, Union + +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_is_fitted + +from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin +from feature_engine._check_init_parameters.check_init_input_params import ( + _check_param_drop_original, +) +from feature_engine.dataframe_checks import _check_X_matches_training_df, check_X +from feature_engine.tags import _return_tags + +# Available text features and their computation functions +TEXT_FEATURES = { + "char_count": lambda x: x.str.len(), + "word_count": lambda x: x.str.split().str.len(), + "sentence_count": lambda x: x.str.count(r"[.!?]+"), + "avg_word_length": lambda x: x.apply( + lambda s: sum(len(w) for w in str(s).split()) / max(len(str(s).split()), 1) + ), + "digit_count": lambda x: x.str.count(r"\d"), + "uppercase_count": lambda x: x.str.count(r"[A-Z]"), + "lowercase_count": lambda x: x.str.count(r"[a-z]"), + "special_char_count": lambda x: x.str.count(r"[^a-zA-Z0-9\s]"), + "whitespace_count": lambda x: x.str.count(r"\s"), + "whitespace_ratio": lambda x: x.str.count(r"\s") / x.str.len().replace(0, 1), + "digit_ratio": lambda x: x.str.count(r"\d") / x.str.len().replace(0, 1), + "uppercase_ratio": lambda x: x.str.count(r"[A-Z]") / x.str.len().replace(0, 1), + "has_digits": lambda x: x.str.contains(r"\d", regex=True).astype(int), + "has_uppercase": lambda x: x.str.contains(r"[A-Z]", regex=True).astype(int), + "is_empty": lambda x: (x.str.len() == 0).astype(int), + "starts_with_uppercase": lambda x: 
x.str.match(r"^[A-Z]").astype(int), + "ends_with_punctuation": lambda x: x.str.match(r".*[.!?]$").astype(int), + "unique_word_count": lambda x: x.apply(lambda s: len(set(str(s).lower().split()))), + "unique_word_ratio": lambda x: x.apply( + lambda s: len(set(str(s).lower().split())) / max(len(str(s).split()), 1) + ), +} + + +class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): + """ + TextFeatures() extracts numerical features from text/string variables. This + transformer is useful for extracting basic text statistics that can be used + as features in machine learning models. + + The transformer can extract various text features including character counts, + word counts, sentence counts, and various ratios and indicators. + + A list of variables can be passed as an argument. Alternatively, the transformer + will automatically select and transform all variables of type object (string). + + More details in the :ref:`User Guide <text_features>`. + + Parameters + ---------- + variables: list, default=None + The list of text/string variables to extract features from. If None, the + transformer will automatically select all object (string) columns. + + features: list, default=None + List of text features to extract. Available features are: + + - 'char_count': Number of characters in the text + - 'word_count': Number of words (whitespace-separated tokens) + - 'sentence_count': Number of sentences (based on .!? 
punctuation) + - 'avg_word_length': Average length of words + - 'digit_count': Number of digit characters + - 'uppercase_count': Number of uppercase letters + - 'lowercase_count': Number of lowercase letters + - 'special_char_count': Number of non-alphanumeric, non-whitespace characters + - 'whitespace_count': Number of whitespace characters + - 'whitespace_ratio': Ratio of whitespace to total characters + - 'digit_ratio': Ratio of digits to total characters + - 'uppercase_ratio': Ratio of uppercase to total characters + - 'has_digits': Binary indicator if text contains digits + - 'has_uppercase': Binary indicator if text contains uppercase + - 'is_empty': Binary indicator if text is empty + - 'starts_with_uppercase': Binary indicator if text starts with uppercase + - 'ends_with_punctuation': Binary indicator if text ends with .!? + - 'unique_word_count': Number of unique words (case-insensitive) + - 'unique_word_ratio': Ratio of unique words to total words + + If None, extracts all available features. + + drop_original: bool, default=False + Whether to drop the original text columns after transformation. + + Attributes + ---------- + variables_: + The list of text variables that will be transformed. + + features_: + The list of features that will be extracted. + + feature_names_in_: + List with the names of features seen during fit. + + n_features_in_: + The number of features in the train set used in fit. + + Methods + ------- + fit: + This transformer does not learn parameters. It stores the feature names + and validates input. + + fit_transform: + Fit to data, then transform it. + + transform: + Extract text features and add them to the dataframe. + + get_feature_names_out: + Get output feature names for transformation. + + See Also + -------- + feature_engine.encoding.StringSimilarityEncoder : + Encodes categorical variables based on string similarity. 
+ + Examples + -------- + + >>> import pandas as pd + >>> from feature_engine.text import TextFeatures + >>> X = pd.DataFrame({ + ... 'text': ['Hello World!', 'Python is GREAT.', 'ML rocks 123'] + ... }) + >>> tf = TextFeatures(features=['char_count', 'word_count', 'has_digits']) + >>> tf.fit(X) + >>> X = tf.transform(X) + >>> X + text text_char_count text_word_count text_has_digits + 0 Hello World! 12 2 0 + 1 Python is GREAT. 16 3 0 + 2 ML rocks 123 12 3 1 + """ + + def __init__( + self, + variables: Union[None, str, List[str]] = None, + features: Union[None, List[str]] = None, + drop_original: bool = False, + ) -> None: + + # Validate variables + if variables is not None: + if isinstance(variables, str): + variables = [variables] + elif not isinstance(variables, list) or not all( + isinstance(v, str) for v in variables + ): + raise ValueError( + "variables must be None, a string, or a list of strings. " + f"Got {type(variables).__name__} instead." + ) + + # Validate features + if features is not None: + if not isinstance(features, list) or not all( + isinstance(f, str) for f in features + ): + raise ValueError( + "features must be None or a list of strings. " + f"Got {type(features).__name__} instead." + ) + invalid_features = set(features) - set(TEXT_FEATURES.keys()) + if invalid_features: + raise ValueError( + f"Invalid features: {invalid_features}. " + f"Available features are: {list(TEXT_FEATURES.keys())}" + ) + + _check_param_drop_original(drop_original) + + self.variables = variables + self.features = features + self.drop_original = drop_original + + def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): + """ + This transformer does not learn parameters. + + Stores feature names and validates that the specified variables are + present and are of string/object type. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The training input samples. + + y: pandas Series, or np.array. Defaults to None. 
+ It is not needed in this transformer. You can pass y or None. + + Returns + ------- + self: TextFeatures + The fitted transformer. + """ + + # check input dataframe + X = check_X(X) + + # Find or validate text variables + if self.variables is None: + # Select object/string columns + self.variables_ = [col for col in X.columns if X[col].dtype == "object"] + if len(self.variables_) == 0: + raise ValueError( + "No object/string columns found in the dataframe. " + "Please specify variables explicitly." + ) + else: + # Validate user-specified variables exist + missing = set(self.variables) - set(X.columns) + if missing: + raise ValueError( + f"Variables {missing} are not present in the dataframe." + ) + self.variables_ = self.variables + + # Set features to extract + if self.features is None: + self.features_ = list(TEXT_FEATURES.keys()) + else: + self.features_ = self.features + + # save input features + self.feature_names_in_ = X.columns.tolist() + + # save train set shape + self.n_features_in_ = X.shape[1] + + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Extract text features and add them to the dataframe. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The data to transform. + + Returns + ------- + X_new: Pandas dataframe + The dataframe with the original columns plus the new text features. + """ + + # Check method fit has been called + check_is_fitted(self) + + # check that input is a dataframe + X = check_X(X) + + # Check if input data contains same number of columns as dataframe used to fit. 
+ _check_X_matches_training_df(X, self.n_features_in_) + + # reorder variables to match train set + X = X[self.feature_names_in_] + + # Extract features for each text variable + for var in self.variables_: + # Fill NaN with empty string for feature extraction + text_col = X[var].fillna("") + + for feature_name in self.features_: + new_col_name = f"{var}_{feature_name}" + feature_func = TEXT_FEATURES[feature_name] + X[new_col_name] = feature_func(text_col) + + # Fill any NaN values resulting from computation with 0 + X[new_col_name] = X[new_col_name].fillna(0) + + if self.drop_original: + X = X.drop(columns=self.variables_) + + return X + + def get_feature_names_out(self, input_features=None) -> List[str]: + """ + Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. If None, uses feature_names_in_. + + Returns + ------- + feature_names_out : list of str + Output feature names. + """ + check_is_fitted(self) + + # Start with original features + if self.drop_original: + feature_names = [ + f for f in self.feature_names_in_ if f not in self.variables_ + ] + else: + feature_names = list(self.feature_names_in_) + + # Add new text feature names + for var in self.variables_: + for feature_name in self.features_: + feature_names.append(f"{var}_{feature_name}") + + return feature_names + + def _more_tags(self): + tags_dict = _return_tags() + tags_dict["allow_nan"] = True + tags_dict["variables"] = "categorical" + return tags_dict + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags diff --git a/tests/test_text/__init__.py b/tests/test_text/__init__.py new file mode 100644 index 000000000..f4e9de3ea --- /dev/null +++ b/tests/test_text/__init__.py @@ -0,0 +1,167 @@ +import pandas as pd +import pytest + +from feature_engine.text import TextFeatures + + +class TestTextFeatures: + """Test cases for TextFeatures 
transformer.""" + + def test_default_all_features(self): + """Test extracting all features with default parameters.""" + X = pd.DataFrame({"text": ["Hello World!", "Python 123", "AI"]}) + transformer = TextFeatures() + X_tr = transformer.fit_transform(X) + + # Check that new columns were added + assert "text_char_count" in X_tr.columns + assert "text_word_count" in X_tr.columns + assert "text_digit_count" in X_tr.columns + + # Check char_count + assert X_tr["text_char_count"].tolist() == [12, 10, 2] + + # Check word_count + assert X_tr["text_word_count"].tolist() == [2, 2, 1] + + # Check digit_count + assert X_tr["text_digit_count"].tolist() == [0, 3, 0] + + def test_specific_features(self): + """Test extracting specific features only.""" + X = pd.DataFrame({"text": ["Hello", "World"]}) + transformer = TextFeatures(features=["char_count", "word_count"]) + X_tr = transformer.fit_transform(X) + + # Check only specified features are extracted + assert "text_char_count" in X_tr.columns + assert "text_word_count" in X_tr.columns + assert "text_digit_count" not in X_tr.columns + assert "text_uppercase_count" not in X_tr.columns + + def test_specific_variables(self): + """Test extracting features from specific variables only.""" + X = pd.DataFrame( + {"text1": ["Hello", "World"], "text2": ["Foo", "Bar"], "numeric": [1, 2]} + ) + transformer = TextFeatures(variables=["text1"], features=["char_count"]) + X_tr = transformer.fit_transform(X) + + # Only text1 should have features extracted + assert "text1_char_count" in X_tr.columns + assert "text2_char_count" not in X_tr.columns + + def test_drop_original(self): + """Test drop_original parameter.""" + X = pd.DataFrame({"text": ["Hello", "World"], "other": [1, 2]}) + transformer = TextFeatures(features=["char_count"], drop_original=True) + X_tr = transformer.fit_transform(X) + + assert "text" not in X_tr.columns + assert "text_char_count" in X_tr.columns + assert "other" in X_tr.columns + + def 
test_empty_string_handling(self): + """Test handling of empty strings.""" + X = pd.DataFrame({"text": ["", "Hello", ""]}) + transformer = TextFeatures(features=["char_count", "word_count", "is_empty"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_char_count"].tolist() == [0, 5, 0] + assert X_tr["text_is_empty"].tolist() == [1, 0, 1] + + def test_nan_handling(self): + """Test handling of NaN values.""" + X = pd.DataFrame({"text": ["Hello", None, "World"]}) + transformer = TextFeatures(features=["char_count"]) + X_tr = transformer.fit_transform(X) + + # NaN should be filled with empty string, resulting in char_count of 0 + assert X_tr["text_char_count"].tolist() == [5, 0, 5] + + def test_uppercase_features(self): + """Test uppercase-related features.""" + X = pd.DataFrame({"text": ["HELLO", "hello", "HeLLo"]}) + transformer = TextFeatures( + features=["uppercase_count", "has_uppercase", "starts_with_uppercase"] + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_uppercase_count"].tolist() == [5, 0, 3] + assert X_tr["text_has_uppercase"].tolist() == [1, 0, 1] + assert X_tr["text_starts_with_uppercase"].tolist() == [1, 0, 1] + + def test_sentence_count(self): + """Test sentence counting.""" + X = pd.DataFrame({"text": ["Hello. World!", "One sentence", "A? B! 
C."]}) + transformer = TextFeatures(features=["sentence_count"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_sentence_count"].tolist() == [2, 0, 3] + + def test_unique_word_features(self): + """Test unique word features.""" + X = pd.DataFrame({"text": ["the the the", "a b c", "x"]}) + transformer = TextFeatures(features=["unique_word_count", "unique_word_ratio"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_unique_word_count"].tolist() == [1, 3, 1] + assert X_tr["text_unique_word_ratio"].tolist() == [1 / 3, 1.0, 1.0] + + def test_invalid_feature_raises_error(self): + """Test that invalid feature name raises ValueError.""" + with pytest.raises(ValueError, match="Invalid features"): + TextFeatures(features=["invalid_feature"]) + + def test_invalid_variables_raises_error(self): + """Test that invalid variables parameter raises ValueError.""" + with pytest.raises(ValueError, match="variables must be"): + TextFeatures(variables=123) + + def test_missing_variable_raises_error(self): + """Test that missing variable raises ValueError on fit.""" + X = pd.DataFrame({"text": ["Hello"]}) + transformer = TextFeatures(variables=["nonexistent"]) + with pytest.raises(ValueError, match="not present in the dataframe"): + transformer.fit(X) + + def test_no_text_columns_raises_error(self): + """Test that no text columns raises error when variables=None.""" + X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + transformer = TextFeatures() + with pytest.raises(ValueError, match="No object/string columns found"): + transformer.fit(X) + + def test_fit_stores_attributes(self): + """Test that fit stores expected attributes.""" + X = pd.DataFrame({"text": ["Hello"]}) + transformer = TextFeatures() + transformer.fit(X) + + assert hasattr(transformer, "variables_") + assert hasattr(transformer, "features_") + assert hasattr(transformer, "feature_names_in_") + assert hasattr(transformer, "n_features_in_") + + def test_get_feature_names_out(self): + """Test 
get_feature_names_out returns correct names.""" + X = pd.DataFrame({"text": ["Hello"], "other": [1]}) + transformer = TextFeatures(features=["char_count", "word_count"]) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "text" in feature_names + assert "other" in feature_names + assert "text_char_count" in feature_names + assert "text_word_count" in feature_names + + def test_get_feature_names_out_with_drop(self): + """Test get_feature_names_out with drop_original=True.""" + X = pd.DataFrame({"text": ["Hello"], "other": [1]}) + transformer = TextFeatures(features=["char_count"], drop_original=True) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "text" not in feature_names + assert "other" in feature_names + assert "text_char_count" in feature_names diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py new file mode 100644 index 000000000..f4e9de3ea --- /dev/null +++ b/tests/test_text/test_text_features.py @@ -0,0 +1,167 @@ +import pandas as pd +import pytest + +from feature_engine.text import TextFeatures + + +class TestTextFeatures: + """Test cases for TextFeatures transformer.""" + + def test_default_all_features(self): + """Test extracting all features with default parameters.""" + X = pd.DataFrame({"text": ["Hello World!", "Python 123", "AI"]}) + transformer = TextFeatures() + X_tr = transformer.fit_transform(X) + + # Check that new columns were added + assert "text_char_count" in X_tr.columns + assert "text_word_count" in X_tr.columns + assert "text_digit_count" in X_tr.columns + + # Check char_count + assert X_tr["text_char_count"].tolist() == [12, 10, 2] + + # Check word_count + assert X_tr["text_word_count"].tolist() == [2, 2, 1] + + # Check digit_count + assert X_tr["text_digit_count"].tolist() == [0, 3, 0] + + def test_specific_features(self): + """Test extracting specific features only.""" + X = pd.DataFrame({"text": ["Hello", "World"]}) + 
transformer = TextFeatures(features=["char_count", "word_count"]) + X_tr = transformer.fit_transform(X) + + # Check only specified features are extracted + assert "text_char_count" in X_tr.columns + assert "text_word_count" in X_tr.columns + assert "text_digit_count" not in X_tr.columns + assert "text_uppercase_count" not in X_tr.columns + + def test_specific_variables(self): + """Test extracting features from specific variables only.""" + X = pd.DataFrame( + {"text1": ["Hello", "World"], "text2": ["Foo", "Bar"], "numeric": [1, 2]} + ) + transformer = TextFeatures(variables=["text1"], features=["char_count"]) + X_tr = transformer.fit_transform(X) + + # Only text1 should have features extracted + assert "text1_char_count" in X_tr.columns + assert "text2_char_count" not in X_tr.columns + + def test_drop_original(self): + """Test drop_original parameter.""" + X = pd.DataFrame({"text": ["Hello", "World"], "other": [1, 2]}) + transformer = TextFeatures(features=["char_count"], drop_original=True) + X_tr = transformer.fit_transform(X) + + assert "text" not in X_tr.columns + assert "text_char_count" in X_tr.columns + assert "other" in X_tr.columns + + def test_empty_string_handling(self): + """Test handling of empty strings.""" + X = pd.DataFrame({"text": ["", "Hello", ""]}) + transformer = TextFeatures(features=["char_count", "word_count", "is_empty"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_char_count"].tolist() == [0, 5, 0] + assert X_tr["text_is_empty"].tolist() == [1, 0, 1] + + def test_nan_handling(self): + """Test handling of NaN values.""" + X = pd.DataFrame({"text": ["Hello", None, "World"]}) + transformer = TextFeatures(features=["char_count"]) + X_tr = transformer.fit_transform(X) + + # NaN should be filled with empty string, resulting in char_count of 0 + assert X_tr["text_char_count"].tolist() == [5, 0, 5] + + def test_uppercase_features(self): + """Test uppercase-related features.""" + X = pd.DataFrame({"text": ["HELLO", "hello", 
"HeLLo"]}) + transformer = TextFeatures( + features=["uppercase_count", "has_uppercase", "starts_with_uppercase"] + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_uppercase_count"].tolist() == [5, 0, 3] + assert X_tr["text_has_uppercase"].tolist() == [1, 0, 1] + assert X_tr["text_starts_with_uppercase"].tolist() == [1, 0, 1] + + def test_sentence_count(self): + """Test sentence counting.""" + X = pd.DataFrame({"text": ["Hello. World!", "One sentence", "A? B! C."]}) + transformer = TextFeatures(features=["sentence_count"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_sentence_count"].tolist() == [2, 0, 3] + + def test_unique_word_features(self): + """Test unique word features.""" + X = pd.DataFrame({"text": ["the the the", "a b c", "x"]}) + transformer = TextFeatures(features=["unique_word_count", "unique_word_ratio"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_unique_word_count"].tolist() == [1, 3, 1] + assert X_tr["text_unique_word_ratio"].tolist() == [1 / 3, 1.0, 1.0] + + def test_invalid_feature_raises_error(self): + """Test that invalid feature name raises ValueError.""" + with pytest.raises(ValueError, match="Invalid features"): + TextFeatures(features=["invalid_feature"]) + + def test_invalid_variables_raises_error(self): + """Test that invalid variables parameter raises ValueError.""" + with pytest.raises(ValueError, match="variables must be"): + TextFeatures(variables=123) + + def test_missing_variable_raises_error(self): + """Test that missing variable raises ValueError on fit.""" + X = pd.DataFrame({"text": ["Hello"]}) + transformer = TextFeatures(variables=["nonexistent"]) + with pytest.raises(ValueError, match="not present in the dataframe"): + transformer.fit(X) + + def test_no_text_columns_raises_error(self): + """Test that no text columns raises error when variables=None.""" + X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + transformer = TextFeatures() + with pytest.raises(ValueError, match="No 
object/string columns found"): + transformer.fit(X) + + def test_fit_stores_attributes(self): + """Test that fit stores expected attributes.""" + X = pd.DataFrame({"text": ["Hello"]}) + transformer = TextFeatures() + transformer.fit(X) + + assert hasattr(transformer, "variables_") + assert hasattr(transformer, "features_") + assert hasattr(transformer, "feature_names_in_") + assert hasattr(transformer, "n_features_in_") + + def test_get_feature_names_out(self): + """Test get_feature_names_out returns correct names.""" + X = pd.DataFrame({"text": ["Hello"], "other": [1]}) + transformer = TextFeatures(features=["char_count", "word_count"]) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "text" in feature_names + assert "other" in feature_names + assert "text_char_count" in feature_names + assert "text_word_count" in feature_names + + def test_get_feature_names_out_with_drop(self): + """Test get_feature_names_out with drop_original=True.""" + X = pd.DataFrame({"text": ["Hello"], "other": [1]}) + transformer = TextFeatures(features=["char_count"], drop_original=True) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "text" not in feature_names + assert "other" in feature_names + assert "text_char_count" in feature_names From 848670fbc2a9418c9e04e349f68f798ba1d55afc Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Mon, 9 Feb 2026 08:34:47 -0600 Subject: [PATCH 2/2] Fix string dtype detection for pandas 2/3 compatibility Use pd.api.types.is_string_dtype() and is_object_dtype() instead of checking dtype == 'object' directly, which fails with StringDtype. 
--- feature_engine/text/text_features.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index c06afdf79..8a91f235e 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -209,8 +209,12 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # Find or validate text variables if self.variables is None: - # Select object/string columns - self.variables_ = [col for col in X.columns if X[col].dtype == "object"] + # Select object/string columns (handles both object dtype and StringDtype) + self.variables_ = [ + col for col in X.columns + if pd.api.types.is_string_dtype(X[col]) + or pd.api.types.is_object_dtype(X[col]) + ] if len(self.variables_) == 0: raise ValueError( "No object/string columns found in the dataframe. "