Commit ca8e7f4
[MRG] Example for JMLR (#211)
* Add a classification report example
* Add an example for multiclass classification
* Finish the example
* Use signature instead of popping kwargs
* Solve the issue with the doc
* Correct misspellings
* Add a README for the dataset examples
1 parent ac16d91 commit ca8e7f4

7 files changed: +192 −23 lines

doc/api.rst

Lines changed: 2 additions & 1 deletion

@@ -133,7 +133,8 @@ Metrics
 Functions
 ---------
 .. autosummary::
-   :toctree: generated/
+   :toctree: generated/
+
    metrics.sensitivity_specificity_support
    metrics.sensitivity_score
    metrics.specificity_score

Lines changed: 40 additions & 0 deletions (new file)

"""
=============================================
Multiclass classification with under-sampling
=============================================

Some balancing methods allow for balancing datasets with multiple classes.
This example illustrates the use of such a method; it does not differ
from the binary case.

"""

from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

RANDOM_STATE = 42

# Load the Iris dataset
iris = load_iris()
# Make the dataset imbalanced:
# select only half of the first class
iris.data = iris.data[25:-1, :]
iris.target = iris.target[25:-1]

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                    random_state=RANDOM_STATE)

# Create a pipeline that under-samples before training the classifier
pipeline = make_pipeline(NearMiss(version=2, random_state=RANDOM_STATE),
                         LinearSVC(random_state=RANDOM_STATE))
pipeline.fit(X_train, y_train)

# Classify and report the results
print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))
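
To see what the under-sampler does on its own, here is a minimal sketch that
applies NearMiss-2 outside the pipeline and prints the class counts. It
assumes the sampler's `fit_sample` method of this era of imblearn (renamed
`fit_resample` in later releases):

from collections import Counter

from sklearn.datasets import load_iris

from imblearn.under_sampling import NearMiss

iris = load_iris()
X, y = iris.data[25:-1, :], iris.target[25:-1]
print('Original class counts: {}'.format(Counter(y)))

# NearMiss-2 keeps the majority samples whose average distance to the
# farthest samples of the other classes is smallest.
nm2 = NearMiss(version=2, random_state=42)
X_res, y_res = nm2.fit_sample(X, y)  # fit_resample in imblearn >= 0.4
print('Resampled class counts: {}'.format(Counter(y_res)))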

examples/datasets/README.txt

Lines changed: 6 additions & 0 deletions (new file)

.. _dataset_examples:

Dataset examples
----------------

Examples concerning the :mod:`imblearn.datasets` module.
Lines changed: 45 additions & 0 deletions (new file)

"""
=============================================
Evaluate classification by compiling a report
=============================================

Specific metrics have been developed to evaluate classifiers trained on
imbalanced data. `imblearn` provides a classification report similar to
`sklearn`, with additional metrics specific to imbalanced learning
problems.
"""

from sklearn import datasets
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn import over_sampling as os
from imblearn import pipeline as pl
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

RANDOM_STATE = 42

# Generate a dataset
X, y = datasets.make_classification(n_classes=2, class_sep=2,
                                    weights=[0.1, 0.9], n_informative=10,
                                    n_redundant=1, flip_y=0, n_features=20,
                                    n_clusters_per_class=4, n_samples=5000,
                                    random_state=RANDOM_STATE)

pipeline = pl.make_pipeline(os.SMOTE(random_state=RANDOM_STATE),
                            LinearSVC(random_state=RANDOM_STATE))

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=RANDOM_STATE)

# Train the classifier with balancing
pipeline.fit(X_train, y_train)

# Test the classifier and get the prediction
y_pred_bal = pipeline.predict(X_test)

# Show the classification report
print(classification_report_imbalanced(y_test, y_pred_bal))
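
The columns of the report can also be reproduced one metric at a time. A
minimal sketch, reusing `y_test` and `y_pred_bal` from the example above and
assuming the metric functions listed in doc/api.rst follow scikit-learn's
usual `average` conventions:

from imblearn.metrics import (geometric_mean_score, sensitivity_score,
                              specificity_score)

# Per-class sensitivity (recall) and specificity, as in the report
sen = sensitivity_score(y_test, y_pred_bal, average=None)
spe = specificity_score(y_test, y_pred_bal, average=None)
# Geometric mean for the binary positive class
gmean = geometric_mean_score(y_test, y_pred_bal, average='binary')
print('sensitivity: {}\nspecificity: {}\nG-mean: {}'.format(sen, spe, gmean))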

examples/evaluation/plot_metrics.py

Lines changed: 74 additions & 0 deletions (new file)

"""
=======================================
Metrics specific to imbalanced learning
=======================================

Specific metrics have been developed to evaluate classifiers trained on
imbalanced data. `imblearn` mainly provides two additional metrics that are
not implemented in `sklearn`: (i) the geometric mean and (ii) the index
balanced accuracy.
"""

from sklearn import datasets
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn import over_sampling as os
from imblearn import pipeline as pl
from imblearn.metrics import (geometric_mean_score,
                              make_index_balanced_accuracy)

print(__doc__)

RANDOM_STATE = 42

# Generate a dataset
# NOTE: with only two weights given for three classes, scikit-learn assigns
# the remaining class a weight of 1 - 0.1 - 0.9 = 0, so the third class is
# effectively empty.
X, y = datasets.make_classification(n_classes=3, class_sep=2,
                                    weights=[0.1, 0.9], n_informative=10,
                                    n_redundant=1, flip_y=0, n_features=20,
                                    n_clusters_per_class=4, n_samples=5000,
                                    random_state=RANDOM_STATE)

pipeline = pl.make_pipeline(os.SMOTE(random_state=RANDOM_STATE),
                            LinearSVC(random_state=RANDOM_STATE))

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                     random_state=RANDOM_STATE)

# Train the classifier with balancing
pipeline.fit(X_train, y_train)

# Test the classifier and get the prediction
y_pred_bal = pipeline.predict(X_test)

###############################################################################
# The geometric mean corresponds to the square root of the product of the
# sensitivity and specificity. Combining the two metrics accounts for
# performance on both classes of an imbalanced dataset.

print('The geometric mean is {}'.format(geometric_mean_score(
    y_test,
    y_pred_bal)))

###############################################################################
# The index balanced accuracy can weight any metric so that it can be used
# on imbalanced learning problems.

alpha = 0.1
geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)(
    geometric_mean_score)

print('The IBA using alpha = {} and the geometric mean: {}'.format(
    alpha, geo_mean(
        y_test,
        y_pred_bal)))

alpha = 0.5
geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)(
    geometric_mean_score)

print('The IBA using alpha = {} and the geometric mean: {}'.format(
    alpha, geo_mean(
        y_test,
        y_pred_bal)))
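
Given the weighting applied inside make_index_balanced_accuracy (see the
`dom = sen - spe` computation in imblearn/metrics/classification.py below),
the squared IBA of the geometric mean can be checked by hand:
IBA = (1 + alpha * (sensitivity - specificity)) * G-mean ** 2. A minimal
sketch, reusing `y_test` and `y_pred_bal` from the example above and
assuming binary averaging throughout:

import numpy as np

from imblearn.metrics import (geometric_mean_score,
                              make_index_balanced_accuracy,
                              sensitivity_score, specificity_score)

alpha = 0.1
sen = sensitivity_score(y_test, y_pred_bal, average='binary')
spe = specificity_score(y_test, y_pred_bal, average='binary')
gmean = geometric_mean_score(y_test, y_pred_bal, average='binary')

# The dominance (sen - spe) weights the squared score towards sensitivity
iba_manual = (1. + alpha * (sen - spe)) * gmean ** 2

iba_func = make_index_balanced_accuracy(alpha=alpha, squared=True)(
    geometric_mean_score)
assert np.isclose(iba_manual, iba_func(y_test, y_pred_bal, average='binary'))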

examples/model_selection/plot_validation_curve.py

Lines changed: 1 addition & 1 deletion

@@ -29,7 +29,7 @@
                                     weights=[0.1, 0.9], n_informative=10,
                                     n_redundant=1, flip_y=0, n_features=20,
                                     n_clusters_per_class=4, n_samples=5000,
-                                    random_state=10)
+                                    random_state=RANDOM_STATE)
 smote = os.SMOTE(random_state=RANDOM_STATE)
 cart = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)
 pipeline = pl.make_pipeline(smote, cart)

imblearn/metrics/classification.py

Lines changed: 24 additions & 21 deletions

@@ -14,6 +14,8 @@
 import logging
 import functools
 
+from inspect import getcallargs
+
 import numpy as np
 
 from sklearn.metrics.classification import (_check_targets, _prf_divide,
@@ -22,6 +24,12 @@
 from sklearn.utils.fixes import bincount
 from sklearn.utils.multiclass import unique_labels
 
+try:
+    from inspect import signature
+except ImportError:
+    from sklearn.externals.funcsigs import signature
+
+
 LOGGER = logging.getLogger(__name__)
 
 
@@ -563,10 +571,10 @@ def geometric_mean_score(y_true,
 
 
 def make_index_balanced_accuracy(alpha=0.1, squared=True):
-    """Balance any scoring function using the indexed balanced accuracy
+    """Balance any scoring function using the index balanced accuracy
 
     This factory function wraps scoring function to express it as the
-    indexed balanced accuracy (IBA). You need to use this function to
+    index balanced accuracy (IBA). You need to use this function to
     decorate any scoring function.
 
     Parameters
@@ -582,7 +590,7 @@ def make_index_balanced_accuracy(alpha=0.1, squared=True):
     -------
     iba_scoring_func : callable,
         Returns the scoring metric decorated which will automatically compute
-        the indexed balanced accuracy.
+        the index balanced accuracy.
 
     Examples
     --------
@@ -603,21 +611,16 @@ def compute_score(*args, **kwargs):
             # Square if desired
             if squared:
                 _score = np.power(_score, 2)
-            # args will contain the y_pred and y_true
-            # kwargs will contain the other parameters
-            labels = kwargs.get('labels', None)
-            pos_label = kwargs.get('pos_label', 1)
-            average = kwargs.get('average', 'binary')
-            sample_weight = kwargs.get('sample_weight', None)
-            # Compute the sensitivity and specificity
-            dict_sen_spe = {
-                'labels': labels,
-                'pos_label': pos_label,
-                'average': average,
-                'sample_weight': sample_weight
-            }
-            sen, spe, _ = sensitivity_specificity_support(*args,
-                                                          **dict_sen_spe)
+            # Collect the arguments of the scoring function call by name
+            tags_scoring_func = getcallargs(scoring_func, *args, **kwargs)
+            # Get the signature of the sens/spec function
+            sens_spec_sig = signature(sensitivity_specificity_support)
+            # Bind the inputs required by the sens/spec function
+            tags_sens_spec = sens_spec_sig.bind(**tags_scoring_func)
+            # Call the sens/spec function
+            sen, spe, _ = sensitivity_specificity_support(
+                *tags_sens_spec.args,
+                **tags_sens_spec.kwargs)
             # Compute the dominance
             dom = sen - spe
             return (1. + alpha * dom) * _score
@@ -640,7 +643,7 @@ def classification_report_imbalanced(y_true,
     Specific metrics have been proposed to evaluate the classification
     performed on imbalanced dataset. This report compiles the
     state-of-the-art metrics: precision/recall/specificity, geometric
-    mean, and indexed balanced accuracy of the
+    mean, and index balanced accuracy of the
     geometric mean.
 
     Parameters
@@ -674,7 +677,7 @@ def classification_report_imbalanced(y_true,
     -------
     report : string
         Text summary of the precision, recall, specificity, geometric mean,
-        and indexed balanced accuracy.
+        and index balanced accuracy.
 
     Examples
     --------
@@ -746,7 +749,7 @@ class 2 1.00 0.67 1.00 0.80 0.82 0.69\
         labels=labels,
         average=None,
         sample_weight=sample_weight)
-    # Indexed balanced accuracy
+    # Index balanced accuracy
     iba_gmean = make_index_balanced_accuracy(
         alpha=alpha, squared=True)(geometric_mean_score)
     iba = iba_gmean(
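
The refactored compute_score above replaces manual kwargs.get calls with
introspection: getcallargs recovers every argument of the scoring-function
call by name (defaults included), and Signature.bind re-dispatches them to
sensitivity_specificity_support. A minimal standalone sketch of the same
technique, using hypothetical stand-in functions:

from inspect import getcallargs, signature


def scoring_func(y_true, y_pred, average='binary'):
    """Hypothetical scoring function (stand-in for, e.g., a G-mean)."""
    return 0.0


def helper(y_true, y_pred, average='binary'):
    """Hypothetical stand-in for sensitivity_specificity_support; its
    parameters must cover scoring_func's for bind() to succeed."""
    return 'helper called with average={!r}'.format(average)


# Map every argument of the call to its parameter name, defaults included
call_args = getcallargs(scoring_func, [0, 1], [1, 1], average='macro')
# Validate the names against helper's signature and split them back into
# positional and keyword arguments
bound = signature(helper).bind(**call_args)
print(helper(*bound.args, **bound.kwargs))  # -> average='macro'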
