31
31
from ...metrics .pairwise import ValueDifferenceMetric
32
32
from ...utils import Substitution , check_neighbors_object , check_target_type
33
33
from ...utils ._docstring import _n_jobs_docstring , _random_state_docstring
34
- from ...utils ._param_validation import HasMethods , Interval
34
+ from ...utils ._param_validation import HasMethods , Interval , StrOptions
35
35
from ...utils ._validation import _check_X
36
- from ...utils .fixes import _mode
36
+ from ...utils .fixes import _is_pandas_df , _mode
37
37
from ..base import BaseOverSampler
38
38
39
39
@@ -395,10 +395,13 @@ class SMOTENC(SMOTE):
395
395
396
396
Parameters
397
397
----------
398
- categorical_features : array-like of shape (n_cat_features,) or (n_features,), \
399
- dtype={{bool, int, str}}
398
+ categorical_features : "auto" or array-like of shape (n_cat_features,) or \
399
+ (n_features,), dtype={{bool, int, str}}
400
400
Specifies which features are categorical. Can either be:
401
401
402
+ - "auto" (default) to automatically detect categorical features. Only
403
+ supported when `X` is a :class:`pandas.DataFrame` and it corresponds
404
+ to columns that have a :class:`pandas.CategoricalDtype`;
402
405
- array of `int` corresponding to the indices specifying the categorical
403
406
features;
404
407
- array of `str` corresponding to the feature names. `X` should be a pandas
@@ -538,7 +541,7 @@ class SMOTENC(SMOTE):
538
541
539
542
_parameter_constraints : dict = {
540
543
** SMOTE ._parameter_constraints ,
541
- "categorical_features" : ["array-like" ],
544
+ "categorical_features" : ["array-like" , StrOptions ({ "auto" }) ],
542
545
"categorical_encoder" : [
543
546
HasMethods (["fit_transform" , "inverse_transform" ]),
544
547
None ,
@@ -575,12 +578,27 @@ def _check_X_y(self, X, y):
575
578
return X , y , binarize_y
576
579
577
580
def _validate_column_types(self, X):
    """Compute the indices of the categorical and continuous features.

    The result is stored on the estimator as ``categorical_features_`` and
    ``continuous_features_`` (both arrays of column indices).

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Input data. Must be a pandas DataFrame when
        ``categorical_features="auto"``.

    Raises
    ------
    ValueError
        If ``categorical_features="auto"`` and `X` is not a pandas
        DataFrame.
    """
    # Check the type before comparing with "auto": `categorical_features`
    # may be a NumPy array, and `array == "auto"` is then an element-wise
    # comparison whose result cannot be used as a single truth value.
    detect_from_dtypes = (
        isinstance(self.categorical_features, str)
        and self.categorical_features == "auto"
    )

    if detect_from_dtypes:
        if not _is_pandas_df(X):
            raise ValueError(
                "When `categorical_features='auto'`, the input data "
                f"should be a pandas.DataFrame. Got {type(X)} instead."
            )
        import pandas as pd  # safely import pandas now

        # Explicit bool dtype keeps `~` valid even if there are no columns.
        are_columns_categorical = np.array(
            [isinstance(col_dtype, pd.CategoricalDtype) for col_dtype in X.dtypes],
            dtype=bool,
        )
        self.categorical_features_ = np.flatnonzero(are_columns_categorical)
        self.continuous_features_ = np.flatnonzero(~are_columns_categorical)
    else:
        # Explicit specification: resolve it to integer column indices, then
        # treat every remaining column as continuous.
        self.categorical_features_ = np.array(
            _get_column_indices(X, self.categorical_features)
        )
        self.continuous_features_ = np.setdiff1d(
            np.arange(self.n_features_), self.categorical_features_
        )
584
602
585
603
def _validate_estimator (self ):
586
604
super ()._validate_estimator ()
@@ -589,6 +607,11 @@ def _validate_estimator(self):
589
607
"SMOTE-NC is not designed to work only with categorical "
590
608
"features. It requires some numerical features."
591
609
)
610
+ elif self .categorical_features_ .size == 0 :
611
+ raise ValueError (
612
+ "SMOTE-NC is not designed to work only with numerical "
613
+ "features. It requires some categorical features."
614
+ )
592
615
593
616
def _fit_resample (self , X , y ):
594
617
# FIXME: to be removed in 0.12
0 commit comments