MAINT add support for feature_names_in_ (#959)

glemaitre · web-flow · commit 54a7b5b4d6d6 · 2022-12-05T19:19:57.000+01:00
diff --git a/doc/whats_new/v0.10.rst b/doc/whats_new/v0.10.rst
@@ -22,6 +22,10 @@ Compatibility
 - Add support for automatic parameters validation as in scikit-learn >= 1.2.
   :pr:`955` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+- Add support for `feature_names_in_` as well as `get_feature_names_out` for
+  all samplers.
+  :pr:`959` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 Deprecation
 ...........
 
diff --git a/imblearn/base.py b/imblearn/base.py
@@ -8,6 +8,12 @@
 
 import numpy as np
 from sklearn.base import BaseEstimator
+
+try:
+    # scikit-learn >= 1.2
+    from sklearn.base import OneToOneFeatureMixin
+except ImportError:
+    from sklearn.base import _OneToOneFeatureMixin as OneToOneFeatureMixin
 from sklearn.preprocessing import label_binarize
 from sklearn.utils.multiclass import check_classification_targets
 
@@ -133,7 +139,7 @@ class attribute, which is a dictionary `param_name: list of constraints`. See
             )
 
 
-class BaseSampler(SamplerMixin, _ParamsValidationMixin):
+class BaseSampler(SamplerMixin, OneToOneFeatureMixin, _ParamsValidationMixin):
     """Base class for sampling algorithms.
 
     Warning: This class should not be used directly. Use the derived classes
@@ -260,6 +266,12 @@ class FunctionSampler(BaseSampler):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     sklearn.preprocessing.FunctionTransformer : Stateless transformer.
diff --git a/imblearn/combine/_smote_enn.py b/imblearn/combine/_smote_enn.py
@@ -67,6 +67,12 @@ class SMOTEENN(BaseSampler):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     SMOTETomek : Over-sample using SMOTE followed by under-sampling removing
diff --git a/imblearn/combine/_smote_tomek.py b/imblearn/combine/_smote_tomek.py
@@ -66,6 +66,12 @@ class SMOTETomek(BaseSampler):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     SMOTEENN : Over-sample using SMOTE followed by under-sampling using Edited
diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py
@@ -71,6 +71,17 @@ class ValueDifferenceMetric(BaseEstimator, _ParamsValidationMixin):
         List of length `n_features` containing the conditional probabilities
         for each category given a class.
 
+    n_features_in_ : int
+        Number of features in the input dataset.
+
+        .. versionadded:: 0.10
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     sklearn.neighbors.DistanceMetric : Interface for fast metric computation.
diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py
@@ -73,6 +73,12 @@ class ADASYN(BaseOverSampler):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     SMOTE : Over-sample using SMOTE.
diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py
@@ -76,6 +76,12 @@ class RandomOverSampler(BaseOverSampler):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     BorderlineSMOTE : Over-sample using the borderline-SMOTE variant.
diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py
@@ -264,6 +264,12 @@ class SMOTE(BaseSMOTE):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     SMOTENC : Over-sample using SMOTE for continuous and categorical features.
@@ -442,6 +448,12 @@ class SMOTENC(SMOTE):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     SMOTE : Over-sample using SMOTE.
@@ -759,6 +771,12 @@ class SMOTEN(SMOTE):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     SMOTE : Over-sample using SMOTE.
diff --git a/imblearn/over_sampling/_smote/cluster.py b/imblearn/over_sampling/_smote/cluster.py
@@ -93,6 +93,12 @@ class KMeansSMOTE(BaseSMOTE):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     SMOTE : Over-sample using SMOTE.
diff --git a/imblearn/over_sampling/_smote/filter.py b/imblearn/over_sampling/_smote/filter.py
@@ -100,6 +100,12 @@ class BorderlineSMOTE(BaseSMOTE):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     SMOTE : Over-sample using SMOTE.
@@ -352,6 +358,12 @@ class SVMSMOTE(BaseSMOTE):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     SMOTE : Over-sample using SMOTE.
diff --git a/imblearn/tests/test_common.py b/imblearn/tests/test_common.py
@@ -3,6 +3,7 @@
 #          Christos Aridas
 # License: MIT
 
+import warnings
 from collections import OrderedDict
 
 import numpy as np
@@ -19,6 +20,7 @@
 from imblearn.under_sampling import NearMiss, RandomUnderSampler
 from imblearn.utils.estimator_checks import (
     _set_checking_parameters,
+    check_dataframe_column_names_consistency,
     check_param_validation,
     parametrize_with_checks,
 )
@@ -92,3 +94,17 @@ def test_strategy_as_ordered_dict(Sampler):
     X_res, y_res = sampler.fit_resample(X, y)
     assert X_res.shape[0] == sum(strategy.values())
     assert y_res.shape[0] == sum(strategy.values())
+
+
+@pytest.mark.parametrize(
+    "estimator", _tested_estimators(), ids=_get_check_estimator_ids
+)
+def test_pandas_column_name_consistency(estimator):
+    _set_checking_parameters(estimator)
+    with ignore_warnings(category=(FutureWarning)):
+        with warnings.catch_warnings(record=True) as record:
+            check_dataframe_column_names_consistency(
+                estimator.__class__.__name__, estimator
+            )
+        for warning in record:
+            assert "was fitted without feature names" not in str(warning.message)
diff --git a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py
@@ -78,6 +78,12 @@ class ClusterCentroids(BaseUnderSampler):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     EditedNearestNeighbours : Under-sampling by editing samples.
diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py
@@ -69,6 +69,12 @@ class CondensedNearestNeighbour(BaseCleaningSampler):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     EditedNearestNeighbours : Undersample by editing samples.
diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py
@@ -76,6 +76,12 @@ class EditedNearestNeighbours(BaseCleaningSampler):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     CondensedNearestNeighbour : Undersample by condensing samples.
@@ -251,6 +257,12 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     CondensedNearestNeighbour : Undersample by condensing samples.
@@ -454,6 +466,12 @@ class without early stopping.
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     CondensedNearestNeighbour : Under-sampling by condensing samples.
diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py
@@ -67,6 +67,12 @@ class InstanceHardnessThreshold(BaseUnderSampler):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     NearMiss : Undersample based on near-miss search.
diff --git a/imblearn/under_sampling/_prototype_selection/_nearmiss.py b/imblearn/under_sampling/_prototype_selection/_nearmiss.py
@@ -72,6 +72,12 @@ class NearMiss(BaseUnderSampler):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     RandomUnderSampler : Random undersample the dataset.
diff --git a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py
@@ -83,6 +83,12 @@ class NeighbourhoodCleaningRule(BaseCleaningSampler):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     EditedNearestNeighbours : Undersample by editing noisy samples.
diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py
@@ -68,6 +68,12 @@ class OneSidedSelection(BaseCleaningSampler):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     EditedNearestNeighbours : Undersample by editing noisy samples.
diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
@@ -50,6 +50,12 @@ class RandomUnderSampler(BaseUnderSampler):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     NearMiss : Undersample using near-miss samples.
diff --git a/imblearn/under_sampling/_prototype_selection/_tomek_links.py b/imblearn/under_sampling/_prototype_selection/_tomek_links.py
@@ -48,6 +48,12 @@ class TomekLinks(BaseCleaningSampler):
 
         .. versionadded:: 0.9
 
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during `fit`. Defined only when `X` has feature
+        names that are all strings.
+
+        .. versionadded:: 0.10
+
     See Also
     --------
     EditedNearestNeighbours : Undersample by editing samples.
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py