Skip to content

Commit e9d120a

Browse files
authored
ENH add auto inference based on pd.CategoricalDtype in SMOTENC (#1009)
1 parent 704d106 commit e9d120a

File tree

4 files changed

+93
-12
lines changed

4 files changed

+93
-12
lines changed

doc/over_sampling.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,9 @@ which categorical data are treated differently::
193193
In this data set, the first and last features are considered as categorical
194194
features. One needs to provide this information to :class:`SMOTENC` via the
195195
parameters ``categorical_features`` either by passing the indices, the feature
196-
names when `X` is a pandas DataFrame, or a boolean mask marking these features::
196+
names when `X` is a pandas DataFrame, a boolean mask marking these features,
197+
or relying on `dtype` inference if the columns are using the
198+
:class:`pandas.CategoricalDtype`::
197199

198200
>>> from imblearn.over_sampling import SMOTENC
199201
>>> smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)

doc/whats_new/v0.11.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,3 +61,10 @@ Enhancements
6161
- :class:`~imblearn.over_sampling.SMOTENC` now supports passing array-like of `str`
6262
when passing the `categorical_features` parameter.
6363
:pr:`1008` by :user:`Guillaume Lemaitre <glemaitre>`.
64+
65+
66+
- :class:`~imblearn.over_sampling.SMOTENC` now supports automatic categorical inference
67+
when `categorical_features` is set to `"auto"`.
68+
:pr:`1009` by :user:`Guillaume Lemaitre <glemaitre>`.
69+
70+

imblearn/over_sampling/_smote/base.py

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@
3131
from ...metrics.pairwise import ValueDifferenceMetric
3232
from ...utils import Substitution, check_neighbors_object, check_target_type
3333
from ...utils._docstring import _n_jobs_docstring, _random_state_docstring
34-
from ...utils._param_validation import HasMethods, Interval
34+
from ...utils._param_validation import HasMethods, Interval, StrOptions
3535
from ...utils._validation import _check_X
36-
from ...utils.fixes import _mode
36+
from ...utils.fixes import _is_pandas_df, _mode
3737
from ..base import BaseOverSampler
3838

3939

@@ -395,10 +395,13 @@ class SMOTENC(SMOTE):
395395
396396
Parameters
397397
----------
398-
categorical_features : array-like of shape (n_cat_features,) or (n_features,), \
399-
dtype={{bool, int, str}}
398+
categorical_features : "auto" or array-like of shape (n_cat_features,) or \
399+
(n_features,), dtype={{bool, int, str}}
400400
Specifies which features are categorical. Can either be:
401401
402+
- "auto" (default) to automatically detect categorical features. Only
403+
supported when `X` is a :class:`pandas.DataFrame` and it corresponds
404+
to columns that have a :class:`pandas.CategoricalDtype`;
402405
- array of `int` corresponding to the indices specifying the categorical
403406
features;
404407
- array of `str` corresponding to the feature names. `X` should be a pandas
@@ -538,7 +541,7 @@ class SMOTENC(SMOTE):
538541

539542
_parameter_constraints: dict = {
540543
**SMOTE._parameter_constraints,
541-
"categorical_features": ["array-like"],
544+
"categorical_features": ["array-like", StrOptions({"auto"})],
542545
"categorical_encoder": [
543546
HasMethods(["fit_transform", "inverse_transform"]),
544547
None,
@@ -575,12 +578,27 @@ def _check_X_y(self, X, y):
575578
return X, y, binarize_y
576579

577580
def _validate_column_types(self, X):
578-
self.categorical_features_ = np.array(
579-
_get_column_indices(X, self.categorical_features)
580-
)
581-
self.continuous_features_ = np.setdiff1d(
582-
np.arange(self.n_features_), self.categorical_features_
583-
)
581+
"""Compute the indices of the categorical and continuous features."""
582+
if self.categorical_features == "auto":
583+
if not _is_pandas_df(X):
584+
raise ValueError(
585+
"When `categorical_features='auto'`, the input data "
586+
f"should be a pandas.DataFrame. Got {type(X)} instead."
587+
)
588+
import pandas as pd # safely import pandas now
589+
590+
are_columns_categorical = np.array(
591+
[isinstance(col_dtype, pd.CategoricalDtype) for col_dtype in X.dtypes]
592+
)
593+
self.categorical_features_ = np.flatnonzero(are_columns_categorical)
594+
self.continuous_features_ = np.flatnonzero(~are_columns_categorical)
595+
else:
596+
self.categorical_features_ = np.array(
597+
_get_column_indices(X, self.categorical_features)
598+
)
599+
self.continuous_features_ = np.setdiff1d(
600+
np.arange(self.n_features_), self.categorical_features_
601+
)
584602

585603
def _validate_estimator(self):
586604
super()._validate_estimator()
@@ -589,6 +607,11 @@ def _validate_estimator(self):
589607
"SMOTE-NC is not designed to work only with categorical "
590608
"features. It requires some numerical features."
591609
)
610+
elif self.categorical_features_.size == 0:
611+
raise ValueError(
612+
"SMOTE-NC is not designed to work only with numerical "
613+
"features. It requires some categorical features."
614+
)
592615

593616
def _fit_resample(self, X, y):
594617
# FIXME: to be removed in 0.12

imblearn/over_sampling/_smote/tests/test_smote_nc.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,3 +349,52 @@ def test_smotenc_categorical_features_str():
349349
assert counter[0] == counter[1] == 70
350350
assert_array_equal(smote.categorical_features_, [1, 2])
351351
assert_array_equal(smote.continuous_features_, [0])
352+
353+
354+
def test_smotenc_categorical_features_auto():
355+
"""Check that we can automatically detect categorical features based on pandas
356+
dataframe.
357+
"""
358+
pd = pytest.importorskip("pandas")
359+
360+
X = pd.DataFrame(
361+
{
362+
"A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
363+
"B": ["a", "b"] * 5,
364+
"C": ["a", "b", "c"] * 3 + ["a"],
365+
}
366+
)
367+
X = pd.concat([X] * 10, ignore_index=True)
368+
X["B"] = X["B"].astype("category")
369+
X["C"] = X["C"].astype("category")
370+
y = np.array([0] * 70 + [1] * 30)
371+
smote = SMOTENC(categorical_features="auto", random_state=0)
372+
X_res, y_res = smote.fit_resample(X, y)
373+
assert X_res["B"].isin(["a", "b"]).all()
374+
assert X_res["C"].isin(["a", "b", "c"]).all()
375+
counter = Counter(y_res)
376+
assert counter[0] == counter[1] == 70
377+
assert_array_equal(smote.categorical_features_, [1, 2])
378+
assert_array_equal(smote.continuous_features_, [0])
379+
380+
381+
def test_smote_nc_categorical_features_auto_error():
382+
"""Check that we raise a proper error when we cannot use the `'auto'` mode."""
383+
pd = pytest.importorskip("pandas")
384+
385+
X = pd.DataFrame(
386+
{
387+
"A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
388+
"B": ["a", "b"] * 5,
389+
"C": ["a", "b", "c"] * 3 + ["a"],
390+
}
391+
)
392+
y = np.array([0] * 70 + [1] * 30)
393+
smote = SMOTENC(categorical_features="auto", random_state=0)
394+
395+
with pytest.raises(ValueError, match="the input data should be a pandas.DataFrame"):
396+
smote.fit_resample(X.to_numpy(), y)
397+
398+
err_msg = "SMOTE-NC is not designed to work only with numerical features"
399+
with pytest.raises(ValueError, match=err_msg):
400+
smote.fit_resample(X, y)

0 commit comments

Comments
 (0)