From 4afc7b9701c0a7e1cd7a6bbf01f4a69fee07abe6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 18 Dec 2016 17:16:16 +0100 Subject: [PATCH 01/21] Starting sensitivity specificity metric --- imblearn/__init__.py | 7 +- imblearn/metrics/__init__.py | 7 ++ imblearn/metrics/classification.py | 180 +++++++++++++++++++++++++++++ imblearn/setup.py | 2 + 4 files changed, 194 insertions(+), 2 deletions(-) create mode 100644 imblearn/metrics/__init__.py create mode 100644 imblearn/metrics/classification.py diff --git a/imblearn/__init__.py b/imblearn/__init__.py index bd58a9724..de1b4a106 100644 --- a/imblearn/__init__.py +++ b/imblearn/__init__.py @@ -10,6 +10,9 @@ ensemble Module which provides methods generating an ensemble of under-sampled subsets. +metrics + Module which provides metrics to quantified the classification performance + with imbalanced dataset. over_sampling Module which provides methods to under-sample a dataset. under-sampling @@ -31,6 +34,6 @@ # list all submodules available in imblearn and version __all__ = [ - 'combine', 'ensemble', 'over_sampling', 'under_sampling', 'pipeline', - '__version__' + 'combine', 'ensemble', 'metrics', 'over_sampling', 'under_sampling', + 'pipeline', '__version__' ] diff --git a/imblearn/metrics/__init__.py b/imblearn/metrics/__init__.py new file mode 100644 index 000000000..d7437fa7a --- /dev/null +++ b/imblearn/metrics/__init__.py @@ -0,0 +1,7 @@ +""" +The :mod:`imblearn.metrics` module includes score functions, performance +metrics and pairwise metrics and distance computations. +""" + +import numpy as np + diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py new file mode 100644 index 000000000..0688dad5f --- /dev/null +++ b/imblearn/metrics/classification.py @@ -0,0 +1,180 @@ +"""Metrics to assess performance on classification task given class prediction + +Functions named as ``*_score`` return a scalar value to maximize: the higher +the better + +Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize: +the lower the better +""" + +from __future__ import division + +import warnings + +import numpy as np + +from sklearn.metrics.classification import _check_targets +from sklearn.preprocessing import LabelEncoder +from sklearn.utils.fixes import bincount +from sklearn.utils.multiclass import unique_labels + + +def sensitivity_specificity_support(y_true, y_pred, labels=None, + pos_label=1, average=None, + warn_for=('sensitivity', 'specificity'), + sample_weight=None): + """Compute sensitivity, specificity, and support for each class + + The sensitivity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number + of true positives and ``fn`` the number of false negatives. The sensitivity + quantifies the ability to avoid false negatives_[1]. + + The specificity is the ratio ``tn / (tn + fp)`` where ``tn`` is the number + of true negatives and ``fn`` the number of false negatives. The specificity + quantifies the ability to avoid false positives_[1]. + + The support is the number of occurrences of each class in ``y_true``. + + If ``pos_label is None`` and in binary classification, this function + returns the average precision, recall and F-measure if ``average`` + is one of ``'micro'``, ``'macro'``, ``'weighted'`` or ``'samples'``. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. 
+ + beta : float, 1.0 by default + The strength of recall versus precision in the F-score. + + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. For multilabel targets, + labels are column indices. By default, all labels in ``y_true`` and + ``y_pred`` are used in sorted order. + + pos_label : str or int, 1 by default + The class to report if ``average='binary'`` and the data is binary. + If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. + + average : string, [None (default), 'binary', 'micro', 'macro', 'samples', \ + 'weighted'] + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + + warn_for : tuple or set, for internal use + This determines which warnings will be made in the case that this + function is being used to return only one of its metrics. + + Returns + ------- + sensitivity : float (if average is not None) or array of float, shape =\ + [n_unique_labels] + + specificity : float (if average is not None) or array of float, , shape =\ + [n_unique_labels] + + support : int (if average is not None) or array of int, shape =\ + [n_unique_labels] + The number of occurrences of each label in ``y_true``. + + References + ---------- + .. [1] `Wikipedia entry for the Sensitivity and specificity + `_ + + """ + + average_options = (None, 'micro', 'macro', 'weighted') + if average not in average_options and average != 'binary': + raise ValueError('average has to be one of ' + + str(average_options)) + + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + present_labels = unique_labels(y_true, y_pred) + + # We do not support multilabel for the moment + if y_type.startswith('multilabel'): + raise ValueError('Multilabel are not supported.') + + if average == 'binary': + if y_type == 'binary': + if pos_label not in present_labels: + if len(present_labels) < 2: + # Only negative labels + return (0., 0., 0) + else: + raise ValueError("pos_label=%r is not a valid label: %r" % + (pos_label, present_labels)) + labels = [pos_label] + else: + raise ValueError("Target is %s but average='binary'. Please " + "choose another average setting." % y_type) + elif pos_label not in (None, 1): + warnings.warn("Note that pos_label (set to %r) is ignored when " + "average != 'binary' (got %r). You may use " + "labels=[pos_label] to specify a single positive class." 
+ % (pos_label, average), UserWarning) + + if labels is None: + labels = present_labels + n_labels = None + else: + n_labels = len(labels) + labels = np.hstack([labels, np.setdiff1d(present_labels, labels, + assume_unique=True)]) + + le = LabelEncoder() + le.fit(labels) + y_true = le.transform(y_true) + y_pred = le.transform(y_pred) + sorted_labels = le.classes_ + + # In a leave out strategy and for each label, compute: + # TP, TN, FP, FN + list_tp = [(y_true == label) == (y_pred == label) + for label in sorted_labels] + list_tn = [(y_true != label) == (y_pred != label) + for label in sorted_labels] + list_fp = [(y_true == label) == (y_pred != label) + for label in sorted_labels] + list_fn = [(y_true != label) == (y_pred == label) + for label in sorted_labels] + + # Retain only selected labels + indices = np.searchsorted(sorted_labels, labels[:n_labels]) + list_tp = [tp[indices] for tp in list_tp] + list_tp = [tn[indices] for tn in list_tn] + list_tp = [fp[indices] for fp in list_fp] + list_tp = [fn[indices] for fn in list_fn] + + # # Compute the specificity and sensitivity for each label + # list_sp = [np.count_nonzero(tn) / (np.count_nonzero(tn) + + # np.count_nonzero(fp)) + # for tn, fp in zip(list_tn, list_fp)] + # list_se = [np.count_nonzero(tp) / (np.count_nonzero(tp) + + # np.count_nonzero(fn)) + # for tp, fn in zip(list_tp, list_fn)] diff --git a/imblearn/setup.py b/imblearn/setup.py index e9ca7f385..a5d543a81 100644 --- a/imblearn/setup.py +++ b/imblearn/setup.py @@ -7,6 +7,8 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('combine/tests') config.add_subpackage('ensemble') config.add_subpackage('ensemble/tests') + config.add_subpackage('metrics') + config.add_subpackage('metrics/tests') config.add_subpackage('over_sampling') config.add_subpackage('over_sampling/tests') config.add_subpackage('under_sampling') From 1bdcc186eb7115d3de17311fc51b9f1b76f93841 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 19 Dec 2016 22:34:41 +0100 Subject: [PATCH 02/21] start adding test --- imblearn/metrics/classification.py | 89 +++++--- imblearn/metrics/tests/test_classification.py | 204 ++++++++++++++++++ 2 files changed, 261 insertions(+), 32 deletions(-) create mode 100644 imblearn/metrics/tests/test_classification.py diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index 0688dad5f..bc39d0581 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -13,11 +13,13 @@ import numpy as np -from sklearn.metrics.classification import _check_targets +from sklearn.metrics.classification import (_check_targets, _prf_divide) from sklearn.preprocessing import LabelEncoder from sklearn.utils.fixes import bincount from sklearn.utils.multiclass import unique_labels +LOGGER = logging.getLogger(__name__) + def sensitivity_specificity_support(y_true, y_pred, labels=None, pos_label=1, average=None, @@ -36,8 +38,8 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, The support is the number of occurrences of each class in ``y_true``. If ``pos_label is None`` and in binary classification, this function - returns the average precision, recall and F-measure if ``average`` - is one of ``'micro'``, ``'macro'``, ``'weighted'`` or ``'samples'``. + returns the average sensitivity and specificity if ``average`` + is one of ``'micro'`` or 'weighted'``. 
Parameters ---------- @@ -47,9 +49,6 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, y_pred : 1d array-like, or label indicator array / sparse matrix Estimated targets as returned by a classifier. - beta : float, 1.0 by default - The strength of recall versus precision in the F-score. - labels : list, optional The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. Labels present in the data can be @@ -65,25 +64,20 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. - average : string, [None (default), 'binary', 'micro', 'macro', 'samples', \ - 'weighted'] + average : string, [None (default), 'binary', 'macro', 'weighted'] If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. - ``'micro'``: - Calculate metrics globally by counting the total true positives, - false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This - alters 'macro' to account for label imbalance; it can result in an - F-score that is not between precision and recall. + alters 'macro' to account for label imbalance. warn_for : tuple or set, for internal use This determines which warnings will be made in the case that this @@ -91,14 +85,14 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, Returns ------- - sensitivity : float (if average is not None) or array of float, shape =\ - [n_unique_labels] + sensitivity : float (if ``average`` = None) or ndarray, \ + shape(n_unique_labels,) - specificity : float (if average is not None) or array of float, , shape =\ - [n_unique_labels] + specificity : float (if ``average`` = None) or ndarray, \ + shape(n_unique_labels,) - support : int (if average is not None) or array of int, shape =\ - [n_unique_labels] + support : int (if ``average`` = None) or ndarray, \ + shape(n_unique_labels,) The number of occurrences of each label in ``y_true``. 
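A minimal, self-contained sketch of the two definitions above (the toy labels are invented for illustration and are not part of this patch); it computes sensitivity and specificity for a binary problem directly from the TP/FN/TN/FP counts:

import numpy as np

# Hypothetical binary ground truth and predictions; label 1 is the positive class.
y_true = np.array([0, 0, 0, 1, 1, 1, 1, 1])
y_pred = np.array([0, 0, 1, 1, 1, 1, 0, 0])

tp = np.sum((y_true == 1) & (y_pred == 1))  # 3 true positives
fn = np.sum((y_true == 1) & (y_pred == 0))  # 2 false negatives
tn = np.sum((y_true == 0) & (y_pred == 0))  # 2 true negatives
fp = np.sum((y_true == 0) & (y_pred == 1))  # 1 false positive

sensitivity = tp / (tp + fn)  # 3 / 5 = 0.6
specificity = tn / (tn + fp)  # 2 / 3 ~= 0.67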
References @@ -116,6 +110,9 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, y_type, y_true, y_pred = _check_targets(y_true, y_pred) present_labels = unique_labels(y_true, y_pred) + LOGGER.debug('The labels in the prediction and ground-truth are %s', + present_labels) + # We do not support multilabel for the moment if y_type.startswith('multilabel'): raise ValueError('Multilabel are not supported.') @@ -151,10 +148,12 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, le.fit(labels) y_true = le.transform(y_true) y_pred = le.transform(y_pred) - sorted_labels = le.classes_ + sorted_labels = le.classes_n # In a leave out strategy and for each label, compute: # TP, TN, FP, FN + # These list contain an array in which each sample is labeled as + # TP, TN, FP, FN list_tp = [(y_true == label) == (y_pred == label) for label in sorted_labels] list_tn = [(y_true != label) == (y_pred != label) @@ -164,17 +163,43 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, list_fn = [(y_true != label) == (y_pred == label) for label in sorted_labels] + # Compute the sum for each type + tp_sum = [bincount(tp, weights=sample_weight, minlength=len(labels)) + for tp in list_tp] + tn_sum = [bincount(tn, weights=sample_weight, minlength=len(labels)) + for tn in list_tn] + fp_sum = [bincount(fp, weights=sample_weight, minlength=len(labels)) + for fp in list_fp] + fn_sum = [bincount(fn, weights=sample_weight, minlength=len(labels)) + for fn in list_fn] + # Retain only selected labels indices = np.searchsorted(sorted_labels, labels[:n_labels]) - list_tp = [tp[indices] for tp in list_tp] - list_tp = [tn[indices] for tn in list_tn] - list_tp = [fp[indices] for fp in list_fp] - list_tp = [fn[indices] for fn in list_fn] - - # # Compute the specificity and sensitivity for each label - # list_sp = [np.count_nonzero(tn) / (np.count_nonzero(tn) + - # np.count_nonzero(fp)) - # for tn, fp in zip(list_tn, list_fp)] - # list_se = [np.count_nonzero(tp) / (np.count_nonzero(tp) + - # np.count_nonzero(fn)) - # for tp, fn in zip(list_tp, list_fn)] + tp_sum = [tp[indices] for tp in tp_sum] + tn_sum = [tn[indices] for tn in tn_sum] + fp_sum = [fp[indices] for fp in fp_sum] + fn_sum = [fn[indices] for fn in fn_sum] + + LOGGER.debug('Computed for each label the stats') + + # Compute the sensitivity and specificity + sensitivity = [_prf_divide(tp, tp + fn, 'sensitivity', 'tp + fn', average, + warn_for) for tp, fn in zip(tp_sum, fn_sum)] + specificity = [_prf_divide(tn, tn + fp, 'specificity', 'tn + fp', average, + warn_for) for tn, fp in zip(tn_sum, fp_sum)] + + # If we need to weight the results + if average == 'weighted': + weights = tp_sum + if weights.sum() == 0: + return 0, 0, None + else: + weights = None + + if average is not None: + assert average != 'binary' or len(sensitivity) == 1 + sensitivity = np.average(sensitivity, weights=weights) + specificity = np.average(specificity, weights=weights) + tp_sum = None + + return sensitivity, specificity, tp_sum diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py new file mode 100644 index 000000000..031a48887 --- /dev/null +++ b/imblearn/metrics/tests/test_classification.py @@ -0,0 +1,204 @@ +"""Testing the metric for classification with imbalanced dataset""" + +from __future__ import division, print_function + +import numpy as np + +from numpy.testing import (assert_array_almost_equal, assert_array_equal, + assert_no_warnings, assert_equal, + assert_almost_equal, assert_raises) +from 
sklearn.utils.testing import assert_warns_message, ignore_warnings + +from sklearn import datasets +from sklearn import svm + +from sklearn.utils.validation import check_random_state + +RND_SEED = 42 + +############################################################################### +# Utilities for testing + + +def make_prediction(dataset=None, binary=False): + """Make some classification predictions on a toy dataset using a SVC + If binary is True restrict to a binary classification problem instead of a + multiclass classification problem + """ + + if dataset is None: + # import some data to play with + dataset = datasets.load_iris() + + X = dataset.data + y = dataset.target + + if binary: + # restrict to a binary classification task + X, y = X[y < 2], y[y < 2] + + n_samples, n_features = X.shape + p = np.arange(n_samples) + + rng = check_random_state(37) + rng.shuffle(p) + X, y = X[p], y[p] + half = int(n_samples / 2) + + # add noisy features to make the problem harder and avoid perfect results + rng = np.random.RandomState(0) + X = np.c_[X, rng.randn(n_samples, 200 * n_features)] + + # run classifier, get class probabilities and label predictions + clf = svm.SVC(kernel='linear', probability=True, random_state=0) + probas_pred = clf.fit(X[:half], y[:half]).predict_proba(X[half:]) + + if binary: + # only interested in probabilities of the positive case + # XXX: do we really want a special API for the binary case? + probas_pred = probas_pred[:, 1] + + y_pred = clf.predict(X[half:]) + y_true = y[half:] + + return y_true, y_pred, probas_pred + + +############################################################################### +# Tests + +def test_precision_recall_f1_score_binary(): + # Test Precision Recall and F1 Score for binary classification task + y_true, y_pred, _ = make_prediction(binary=True) + + # detailed measures for each class + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) + assert_array_almost_equal(p, [0.73, 0.85], 2) + assert_array_almost_equal(r, [0.88, 0.68], 2) + assert_array_almost_equal(f, [0.80, 0.76], 2) + assert_array_equal(s, [25, 25]) + + # individual scoring function that can be used for grid search: in the + # binary class case the score is the value of the measure for the positive + # class (e.g. label == 1). This is deprecated for average != 'binary'. 
+ for kwargs, my_assert in [({}, assert_no_warnings), + ({'average': 'binary'}, assert_no_warnings)]: + ps = my_assert(precision_score, y_true, y_pred, **kwargs) + assert_array_almost_equal(ps, 0.85, 2) + + rs = my_assert(recall_score, y_true, y_pred, **kwargs) + assert_array_almost_equal(rs, 0.68, 2) + + fs = my_assert(f1_score, y_true, y_pred, **kwargs) + assert_array_almost_equal(fs, 0.76, 2) + + assert_almost_equal(my_assert(fbeta_score, y_true, y_pred, beta=2, + **kwargs), + (1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2) + + +def test_precision_recall_f_binary_single_class(): + # Test precision, recall and F1 score behave with a single positive or + # negative class + # Such a case may occur with non-stratified cross-validation + assert_equal(1., precision_score([1, 1], [1, 1])) + assert_equal(1., recall_score([1, 1], [1, 1])) + assert_equal(1., f1_score([1, 1], [1, 1])) + + assert_equal(0., precision_score([-1, -1], [-1, -1])) + assert_equal(0., recall_score([-1, -1], [-1, -1])) + assert_equal(0., f1_score([-1, -1], [-1, -1])) + + +@ignore_warnings +def test_precision_recall_f_extra_labels(): + # Test handling of explicit additional (not in input) labels to PRF + y_true = [1, 3, 3, 2] + y_pred = [1, 1, 3, 2] + y_true_bin = label_binarize(y_true, classes=np.arange(5)) + y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) + data = [(y_true, y_pred), + (y_true_bin, y_pred_bin)] + + for i, (y_true, y_pred) in enumerate(data): + # No average: zeros in array + actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], + average=None) + assert_array_almost_equal([0., 1., 1., .5, 0.], actual) + + # Macro average is changed + actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], + average='macro') + assert_array_almost_equal(np.mean([0., 1., 1., .5, 0.]), actual) + + # No effect otheriwse + for average in ['micro', 'weighted', 'samples']: + if average == 'samples' and i == 0: + continue + assert_almost_equal(recall_score(y_true, y_pred, + labels=[0, 1, 2, 3, 4], + average=average), + recall_score(y_true, y_pred, labels=None, + average=average)) + + # Error when introducing invalid label in multilabel case + # (although it would only affect performance if average='macro'/None) + for average in [None, 'macro', 'micro', 'samples']: + assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin, + labels=np.arange(6), average=average) + assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin, + labels=np.arange(-1, 4), average=average) + + +@ignore_warnings +def test_precision_recall_f_ignored_labels(): + # Test a subset of labels may be requested for PRF + y_true = [1, 1, 2, 3] + y_pred = [1, 3, 3, 3] + y_true_bin = label_binarize(y_true, classes=np.arange(5)) + y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) + data = [(y_true, y_pred), + (y_true_bin, y_pred_bin)] + + for i, (y_true, y_pred) in enumerate(data): + recall_13 = partial(recall_score, y_true, y_pred, labels=[1, 3]) + recall_all = partial(recall_score, y_true, y_pred, labels=None) + + assert_array_almost_equal([.5, 1.], recall_13(average=None)) + assert_almost_equal((.5 + 1.) / 2, recall_13(average='macro')) + assert_almost_equal((.5 * 2 + 1. * 1) / 3, + recall_13(average='weighted')) + assert_almost_equal(2. 
/ 3, recall_13(average='micro')) + + # ensure the above were meaningful tests: + for average in ['macro', 'weighted', 'micro']: + assert_not_equal(recall_13(average=average), + recall_all(average=average)) + + +@ignore_warnings +def test_precision_recall_fscore_support_errors(): + y_true, y_pred, _ = make_prediction(binary=True) + + # Bad beta + assert_raises(ValueError, precision_recall_fscore_support, + y_true, y_pred, beta=0.0) + + # Bad pos_label + assert_raises(ValueError, precision_recall_fscore_support, + y_true, y_pred, pos_label=2, average='binary') + + # Bad average option + assert_raises(ValueError, precision_recall_fscore_support, + [0, 1, 2], [1, 2, 0], average='mega') + + +def test_precision_recall_f_unused_pos_label(): + # Check warning that pos_label unused when set to non-default value + # but average != 'binary'; even if data is binary. + assert_warns_message(UserWarning, + "Note that pos_label (set to 2) is " + "ignored when average != 'binary' (got 'macro'). You " + "may use labels=[pos_label] to specify a single " + "positive class.", precision_recall_fscore_support, + [1, 2, 1], [1, 2, 2], pos_label=2, average='macro') From d0884dd128f7be435cbb8f399b6035db9041ec40 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 20 Dec 2016 00:05:36 +0100 Subject: [PATCH 03/21] advance the testing --- imblearn/metrics/__init__.py | 5 +- imblearn/metrics/classification.py | 25 +- imblearn/metrics/tests/test_classification.py | 266 +++++++++--------- 3 files changed, 158 insertions(+), 138 deletions(-) diff --git a/imblearn/metrics/__init__.py b/imblearn/metrics/__init__.py index d7437fa7a..9a4a3dc6e 100644 --- a/imblearn/metrics/__init__.py +++ b/imblearn/metrics/__init__.py @@ -3,5 +3,8 @@ metrics and pairwise metrics and distance computations. 
""" -import numpy as np +from .classification import sensitivity_specificity_support +__all__ = [ + 'sensitivity_specificity_support' +] diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index bc39d0581..2cfa84112 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -10,6 +10,7 @@ from __future__ import division import warnings +import logging import numpy as np @@ -148,21 +149,26 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, le.fit(labels) y_true = le.transform(y_true) y_pred = le.transform(y_pred) - sorted_labels = le.classes_n + sorted_labels = le.classes_ # In a leave out strategy and for each label, compute: # TP, TN, FP, FN # These list contain an array in which each sample is labeled as # TP, TN, FP, FN - list_tp = [(y_true == label) == (y_pred == label) + list_tp = [np.bitwise_and((y_true == label), (y_pred == label)) for label in sorted_labels] - list_tn = [(y_true != label) == (y_pred != label) + list_tn = [np.bitwise_and((y_true != label), (y_pred != label)) for label in sorted_labels] - list_fp = [(y_true == label) == (y_pred != label) + list_fp = [np.bitwise_and((y_true == label), (y_pred != label)) for label in sorted_labels] - list_fn = [(y_true != label) == (y_pred == label) + list_fn = [np.bitwise_and((y_true != label), (y_pred == label)) for label in sorted_labels] + LOGGER.debug(list_tp) + LOGGER.debug(list_tn) + LOGGER.debug(list_fn) + LOGGER.debug(list_fn) + # Compute the sum for each type tp_sum = [bincount(tp, weights=sample_weight, minlength=len(labels)) for tp in list_tp] @@ -173,6 +179,11 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, fn_sum = [bincount(fn, weights=sample_weight, minlength=len(labels)) for fn in list_fn] + LOGGER.debug(tp_sum) + LOGGER.debug(tn_sum) + LOGGER.debug(fp_sum) + LOGGER.debug(fn_sum) + # Retain only selected labels indices = np.searchsorted(sorted_labels, labels[:n_labels]) tp_sum = [tp[indices] for tp in tp_sum] @@ -188,6 +199,10 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, specificity = [_prf_divide(tn, tn + fp, 'specificity', 'tn + fp', average, warn_for) for tn, fp in zip(tn_sum, fp_sum)] + LOGGER.debug('Computed the sensitivity and specificity for each class') + LOGGER.debug('The lengths of those two metrics are: %s - %s', + len(sensitivity), len(specificity)) + # If we need to weight the results if average == 'weighted': weights = tp_sum diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index 031a48887..64ad4b9ad 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -14,6 +14,8 @@ from sklearn.utils.validation import check_random_state +from imblearn.metrics import sensitivity_specificity_support + RND_SEED = 42 ############################################################################### @@ -67,138 +69,138 @@ def make_prediction(dataset=None, binary=False): ############################################################################### # Tests -def test_precision_recall_f1_score_binary(): - # Test Precision Recall and F1 Score for binary classification task +def test_sensitivity_specificity_support_binary(): + """Test the sensitivity specificity for binary classification task""" y_true, y_pred, _ = make_prediction(binary=True) # detailed measures for each class - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) - assert_array_almost_equal(p, [0.73, 
0.85], 2) - assert_array_almost_equal(r, [0.88, 0.68], 2) - assert_array_almost_equal(f, [0.80, 0.76], 2) - assert_array_equal(s, [25, 25]) - - # individual scoring function that can be used for grid search: in the - # binary class case the score is the value of the measure for the positive - # class (e.g. label == 1). This is deprecated for average != 'binary'. - for kwargs, my_assert in [({}, assert_no_warnings), - ({'average': 'binary'}, assert_no_warnings)]: - ps = my_assert(precision_score, y_true, y_pred, **kwargs) - assert_array_almost_equal(ps, 0.85, 2) - - rs = my_assert(recall_score, y_true, y_pred, **kwargs) - assert_array_almost_equal(rs, 0.68, 2) - - fs = my_assert(f1_score, y_true, y_pred, **kwargs) - assert_array_almost_equal(fs, 0.76, 2) - - assert_almost_equal(my_assert(fbeta_score, y_true, y_pred, beta=2, - **kwargs), - (1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2) - - -def test_precision_recall_f_binary_single_class(): - # Test precision, recall and F1 score behave with a single positive or - # negative class - # Such a case may occur with non-stratified cross-validation - assert_equal(1., precision_score([1, 1], [1, 1])) - assert_equal(1., recall_score([1, 1], [1, 1])) - assert_equal(1., f1_score([1, 1], [1, 1])) - - assert_equal(0., precision_score([-1, -1], [-1, -1])) - assert_equal(0., recall_score([-1, -1], [-1, -1])) - assert_equal(0., f1_score([-1, -1], [-1, -1])) - - -@ignore_warnings -def test_precision_recall_f_extra_labels(): - # Test handling of explicit additional (not in input) labels to PRF - y_true = [1, 3, 3, 2] - y_pred = [1, 1, 3, 2] - y_true_bin = label_binarize(y_true, classes=np.arange(5)) - y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) - data = [(y_true, y_pred), - (y_true_bin, y_pred_bin)] - - for i, (y_true, y_pred) in enumerate(data): - # No average: zeros in array - actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], - average=None) - assert_array_almost_equal([0., 1., 1., .5, 0.], actual) - - # Macro average is changed - actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], - average='macro') - assert_array_almost_equal(np.mean([0., 1., 1., .5, 0.]), actual) - - # No effect otheriwse - for average in ['micro', 'weighted', 'samples']: - if average == 'samples' and i == 0: - continue - assert_almost_equal(recall_score(y_true, y_pred, - labels=[0, 1, 2, 3, 4], - average=average), - recall_score(y_true, y_pred, labels=None, - average=average)) - - # Error when introducing invalid label in multilabel case - # (although it would only affect performance if average='macro'/None) - for average in [None, 'macro', 'micro', 'samples']: - assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin, - labels=np.arange(6), average=average) - assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin, - labels=np.arange(-1, 4), average=average) - - -@ignore_warnings -def test_precision_recall_f_ignored_labels(): - # Test a subset of labels may be requested for PRF - y_true = [1, 1, 2, 3] - y_pred = [1, 3, 3, 3] - y_true_bin = label_binarize(y_true, classes=np.arange(5)) - y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) - data = [(y_true, y_pred), - (y_true_bin, y_pred_bin)] - - for i, (y_true, y_pred) in enumerate(data): - recall_13 = partial(recall_score, y_true, y_pred, labels=[1, 3]) - recall_all = partial(recall_score, y_true, y_pred, labels=None) - - assert_array_almost_equal([.5, 1.], recall_13(average=None)) - assert_almost_equal((.5 + 1.) 
/ 2, recall_13(average='macro')) - assert_almost_equal((.5 * 2 + 1. * 1) / 3, - recall_13(average='weighted')) - assert_almost_equal(2. / 3, recall_13(average='micro')) - - # ensure the above were meaningful tests: - for average in ['macro', 'weighted', 'micro']: - assert_not_equal(recall_13(average=average), - recall_all(average=average)) - - -@ignore_warnings -def test_precision_recall_fscore_support_errors(): - y_true, y_pred, _ = make_prediction(binary=True) - - # Bad beta - assert_raises(ValueError, precision_recall_fscore_support, - y_true, y_pred, beta=0.0) - - # Bad pos_label - assert_raises(ValueError, precision_recall_fscore_support, - y_true, y_pred, pos_label=2, average='binary') - - # Bad average option - assert_raises(ValueError, precision_recall_fscore_support, - [0, 1, 2], [1, 2, 0], average='mega') - - -def test_precision_recall_f_unused_pos_label(): - # Check warning that pos_label unused when set to non-default value - # but average != 'binary'; even if data is binary. - assert_warns_message(UserWarning, - "Note that pos_label (set to 2) is " - "ignored when average != 'binary' (got 'macro'). You " - "may use labels=[pos_label] to specify a single " - "positive class.", precision_recall_fscore_support, - [1, 2, 1], [1, 2, 2], pos_label=2, average='macro') + sens, spec, supp = sensitivity_specificity_support(y_true, y_pred, + average=None) + assert_array_almost_equal(sens, [0.88, 0.68], 2) + assert_array_almost_equal(spec, [0.73, 0.85], 2) + assert_array_equal(supp, [25, 25]) + + # # individual scoring function that can be used for grid search: in the + # # binary class case the score is the value of the measure for the positive + # # class (e.g. label == 1). This is deprecated for average != 'binary'. + # for kwargs, my_assert in [({}, assert_no_warnings), + # ({'average': 'binary'}, assert_no_warnings)]: + # ps = my_assert(precision_score, y_true, y_pred, **kwargs) + # assert_array_almost_equal(ps, 0.85, 2) + + # rs = my_assert(recall_score, y_true, y_pred, **kwargs) + # assert_array_almost_equal(rs, 0.68, 2) + + # fs = my_assert(f1_score, y_true, y_pred, **kwargs) + # assert_array_almost_equal(fs, 0.76, 2) + + # assert_almost_equal(my_assert(fbeta_score, y_true, y_pred, beta=2, + # **kwargs), + # (1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2) + + +# def test_precision_recall_f_binary_single_class(): +# # Test precision, recall and F1 score behave with a single positive or +# # negative class +# # Such a case may occur with non-stratified cross-validation +# assert_equal(1., precision_score([1, 1], [1, 1])) +# assert_equal(1., recall_score([1, 1], [1, 1])) +# assert_equal(1., f1_score([1, 1], [1, 1])) + +# assert_equal(0., precision_score([-1, -1], [-1, -1])) +# assert_equal(0., recall_score([-1, -1], [-1, -1])) +# assert_equal(0., f1_score([-1, -1], [-1, -1])) + + +# @ignore_warnings +# def test_precision_recall_f_extra_labels(): +# # Test handling of explicit additional (not in input) labels to PRF +# y_true = [1, 3, 3, 2] +# y_pred = [1, 1, 3, 2] +# y_true_bin = label_binarize(y_true, classes=np.arange(5)) +# y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) +# data = [(y_true, y_pred), +# (y_true_bin, y_pred_bin)] + +# for i, (y_true, y_pred) in enumerate(data): +# # No average: zeros in array +# actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], +# average=None) +# assert_array_almost_equal([0., 1., 1., .5, 0.], actual) + +# # Macro average is changed +# actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], +# average='macro') +# 
assert_array_almost_equal(np.mean([0., 1., 1., .5, 0.]), actual) + +# # No effect otheriwse +# for average in ['micro', 'weighted', 'samples']: +# if average == 'samples' and i == 0: +# continue +# assert_almost_equal(recall_score(y_true, y_pred, +# labels=[0, 1, 2, 3, 4], +# average=average), +# recall_score(y_true, y_pred, labels=None, +# average=average)) + +# # Error when introducing invalid label in multilabel case +# # (although it would only affect performance if average='macro'/None) +# for average in [None, 'macro', 'micro', 'samples']: +# assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin, +# labels=np.arange(6), average=average) +# assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin, +# labels=np.arange(-1, 4), average=average) + + +# @ignore_warnings +# def test_precision_recall_f_ignored_labels(): +# # Test a subset of labels may be requested for PRF +# y_true = [1, 1, 2, 3] +# y_pred = [1, 3, 3, 3] +# y_true_bin = label_binarize(y_true, classes=np.arange(5)) +# y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) +# data = [(y_true, y_pred), +# (y_true_bin, y_pred_bin)] + +# for i, (y_true, y_pred) in enumerate(data): +# recall_13 = partial(recall_score, y_true, y_pred, labels=[1, 3]) +# recall_all = partial(recall_score, y_true, y_pred, labels=None) + +# assert_array_almost_equal([.5, 1.], recall_13(average=None)) +# assert_almost_equal((.5 + 1.) / 2, recall_13(average='macro')) +# assert_almost_equal((.5 * 2 + 1. * 1) / 3, +# recall_13(average='weighted')) +# assert_almost_equal(2. / 3, recall_13(average='micro')) + +# # ensure the above were meaningful tests: +# for average in ['macro', 'weighted', 'micro']: +# assert_not_equal(recall_13(average=average), +# recall_all(average=average)) + + +# @ignore_warnings +# def test_precision_recall_fscore_support_errors(): +# y_true, y_pred, _ = make_prediction(binary=True) + +# # Bad beta +# assert_raises(ValueError, precision_recall_fscore_support, +# y_true, y_pred, beta=0.0) + +# # Bad pos_label +# assert_raises(ValueError, precision_recall_fscore_support, +# y_true, y_pred, pos_label=2, average='binary') + +# # Bad average option +# assert_raises(ValueError, precision_recall_fscore_support, +# [0, 1, 2], [1, 2, 0], average='mega') + + +# def test_precision_recall_f_unused_pos_label(): +# # Check warning that pos_label unused when set to non-default value +# # but average != 'binary'; even if data is binary. +# assert_warns_message(UserWarning, +# "Note that pos_label (set to 2) is " +# "ignored when average != 'binary' (got 'macro'). 
You " +# "may use labels=[pos_label] to specify a single " +# "positive class.", precision_recall_fscore_support, +# [1, 2, 1], [1, 2, 2], pos_label=2, average='macro') From 63a2aa2740939830922216ab6f20167d8faf2c5d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 21 Dec 2016 01:09:13 +0100 Subject: [PATCH 04/21] Finish the non-failure test --- imblearn/metrics/__init__.py | 6 +- imblearn/metrics/classification.py | 246 +++++++++++++++--- imblearn/metrics/tests/test_classification.py | 71 +++-- 3 files changed, 269 insertions(+), 54 deletions(-) diff --git a/imblearn/metrics/__init__.py b/imblearn/metrics/__init__.py index 9a4a3dc6e..d518d7a23 100644 --- a/imblearn/metrics/__init__.py +++ b/imblearn/metrics/__init__.py @@ -4,7 +4,11 @@ """ from .classification import sensitivity_specificity_support +from .classification import sensitivity_score +from .classification import specificity_score __all__ = [ - 'sensitivity_specificity_support' + 'sensitivity_specificity_support', + 'sensitivity_score', + 'specificity_score' ] diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index 2cfa84112..90ef0c736 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -14,7 +14,7 @@ import numpy as np -from sklearn.metrics.classification import (_check_targets, _prf_divide) +from sklearn.metrics.classification import _check_targets, _prf_divide from sklearn.preprocessing import LabelEncoder from sklearn.utils.fixes import bincount from sklearn.utils.multiclass import unique_labels @@ -44,10 +44,10 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, Parameters ---------- - y_true : 1d array-like, or label indicator array / sparse matrix + y_true : ndarray, shape (n_samples, ) Ground truth (correct) target values. - y_pred : 1d array-like, or label indicator array / sparse matrix + y_pred : ndarray, shape (n_samples, ) Estimated targets as returned by a classifier. labels : list, optional @@ -59,13 +59,13 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, labels are column indices. By default, all labels in ``y_true`` and ``y_pred`` are used in sorted order. - pos_label : str or int, 1 by default + pos_label : str or int, optional (default=1) The class to report if ``average='binary'`` and the data is binary. If the data are multiclass or multilabel, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. - average : string, [None (default), 'binary', 'macro', 'weighted'] + average : str or None, optional (default=None) If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -84,16 +84,19 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, This determines which warnings will be made in the case that this function is being used to return only one of its metrics. + sample_weight : ndarray, shape (n_samples, ) + Sample weights. + Returns ------- sensitivity : float (if ``average`` = None) or ndarray, \ - shape(n_unique_labels,) + shape (n_unique_labels, ) specificity : float (if ``average`` = None) or ndarray, \ - shape(n_unique_labels,) + shape (n_unique_labels, ) support : int (if ``average`` = None) or ndarray, \ - shape(n_unique_labels,) + shape (n_unique_labels, ) The number of occurrences of each label in ``y_true``. 
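To make the averaging options documented above concrete, here is a small hand-worked sketch; the per-class scores and supports below are toy numbers chosen only for illustration:

import numpy as np

per_class_sensitivity = np.array([0.8, 0.5, 0.9])  # one score per label
support = np.array([10, 40, 50])                   # label counts in y_true

macro_avg = per_class_sensitivity.mean()                           # ~0.733, ignores imbalance
weighted_avg = np.average(per_class_sensitivity, weights=support)  # 0.73, support-weighted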
References @@ -151,6 +154,12 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, y_pred = le.transform(y_pred) sorted_labels = le.classes_ + LOGGER.debug(y_true) + LOGGER.debug(y_pred) + LOGGER.debug(sorted_labels) + + LOGGER.debug('The number of labels is %s' % n_labels) + # In a leave out strategy and for each label, compute: # TP, TN, FP, FN # These list contain an array in which each sample is labeled as @@ -159,39 +168,58 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, for label in sorted_labels] list_tn = [np.bitwise_and((y_true != label), (y_pred != label)) for label in sorted_labels] - list_fp = [np.bitwise_and((y_true == label), (y_pred != label)) + list_fp = [np.bitwise_and((y_true != label), (y_pred == label)) for label in sorted_labels] - list_fn = [np.bitwise_and((y_true != label), (y_pred == label)) + list_fn = [np.bitwise_and((y_true == label), (y_pred != label)) for label in sorted_labels] - LOGGER.debug(list_tp) - LOGGER.debug(list_tn) - LOGGER.debug(list_fn) - LOGGER.debug(list_fn) - # Compute the sum for each type - tp_sum = [bincount(tp, weights=sample_weight, minlength=len(labels)) - for tp in list_tp] - tn_sum = [bincount(tn, weights=sample_weight, minlength=len(labels)) - for tn in list_tn] - fp_sum = [bincount(fp, weights=sample_weight, minlength=len(labels)) - for fp in list_fp] - fn_sum = [bincount(fn, weights=sample_weight, minlength=len(labels)) - for fn in list_fn] - - LOGGER.debug(tp_sum) - LOGGER.debug(tn_sum) - LOGGER.debug(fp_sum) - LOGGER.debug(fn_sum) + # We keep only the counting corresponding to True values + # We are using bincount since it allows to weight the samples + tp_sum = np.array([bincount(tp, weights=sample_weight, + minlength=2)[-1] + for tp in list_tp]) + tn_sum = np.array([bincount(tn, weights=sample_weight, + minlength=2)[-1] + for tn in list_tn]) + fp_sum = np.array([bincount(fp, weights=sample_weight, + minlength=2)[-1] + for fp in list_fp]) + fn_sum = np.array([bincount(fn, weights=sample_weight, + minlength=2)[-1] + for fn in list_fn]) # Retain only selected labels indices = np.searchsorted(sorted_labels, labels[:n_labels]) - tp_sum = [tp[indices] for tp in tp_sum] - tn_sum = [tn[indices] for tn in tn_sum] - fp_sum = [fp[indices] for fp in fp_sum] - fn_sum = [fn[indices] for fn in fn_sum] + # For support, we can count the number of occurrences of each label + support = np.array(bincount(y_true, weights=sample_weight, + minlength=len(labels))) + # Sort the support + support = support[indices] + + + LOGGER.debug('The indices which are retained are %s' % indices) + + LOGGER.debug('TP: %s' % tp_sum) + LOGGER.debug('TN: %s' % tn_sum) + LOGGER.debug('FP: %s' % fp_sum) + LOGGER.debug('FN: %s' % fn_sum) - LOGGER.debug('Computed for each label the stats') + tp_sum = tp_sum[indices] + tn_sum = tn_sum[indices] + fp_sum = fp_sum[indices] + fn_sum = fn_sum[indices] + + if average == 'micro': + tp_sum = np.array([tp_sum.sum()]) + tn_sum = np.array([tn_sum.sum()]) + fp_sum = np.array([fp_sum.sum()]) + fn_sum = np.array([fn_sum.sum()]) + + LOGGER.debug('Did we do the average micro %s' % tp_sum) + + LOGGER.debug('Computed the necessary stats for the sensitivity and' + ' specificity') # Compute the sensitivity and specificity sensitivity = [_prf_divide(tp, tp + fn, 'sensitivity', 'tp + fn', average, @@ -199,13 +227,16 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, specificity = [_prf_divide(tn, tn + fp, 'specificity', 'tn + fp', average, warn_for) for tn, fp in zip(tn_sum, fp_sum)] + 
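The one-vs-rest counting strategy used in the hunk above can be reproduced in isolation. The sketch below uses toy labels, plain ``numpy.bincount`` instead of the ``sklearn.utils.fixes`` wrapper, and a small hypothetical helper; it is only meant to show how the boolean masks turn into per-label TP/FN/TN/FP sums:

import numpy as np

y_true = np.array([0, 1, 2, 2, 1, 0])
y_pred = np.array([0, 2, 2, 1, 1, 0])
labels = np.unique(np.concatenate([y_true, y_pred]))

def count_true(mask, sample_weight=None):
    # bincount on the int-cast mask counts (and optionally weighs) the True
    # entries; minlength=2 guards against an all-False mask.
    return np.bincount(mask.astype(int), weights=sample_weight,
                       minlength=2)[-1]

tp_sum = np.array([count_true((y_true == lab) & (y_pred == lab))
                   for lab in labels])
fn_sum = np.array([count_true((y_true == lab) & (y_pred != lab))
                   for lab in labels])
tn_sum = np.array([count_true((y_true != lab) & (y_pred != lab))
                   for lab in labels])
fp_sum = np.array([count_true((y_true != lab) & (y_pred == lab))
                   for lab in labels])

sensitivity = tp_sum / (tp_sum + fn_sum)  # per label: [1.0, 0.5, 0.5]
specificity = tn_sum / (tn_sum + fp_sum)  # per label: [1.0, 0.75, 0.75]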
LOGGER.debug('Sensitivity = %s - Specificity = %s' % (sensitivity, + specificity)) + LOGGER.debug('Computed the sensitivity and specificity for each class') LOGGER.debug('The lengths of those two metrics are: %s - %s', len(sensitivity), len(specificity)) # If we need to weight the results if average == 'weighted': - weights = tp_sum + weights = support if weights.sum() == 0: return 0, 0, None else: @@ -215,6 +246,149 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, assert average != 'binary' or len(sensitivity) == 1 sensitivity = np.average(sensitivity, weights=weights) specificity = np.average(specificity, weights=weights) - tp_sum = None + support = None + + return sensitivity, specificity, support + + +def sensitivity_score(y_true, y_pred, labels=None, pos_label=1, + average='binary', sample_weight=None): + """Compute the sensitivity + + The sensitivity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number + of true positives and ``fn`` the number of false negatives. The sensitivity + quantifies the ability to avoid false negatives. + + The best value is 1 and the worst value is 0. + + Parameters + ---------- + y_true : ndarray, shape (n_samples, ) + Ground truth (correct) target values. + + y_pred : ndarray, shape (n_samples, ) + Estimated targets as returned by a classifier. + + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. For multilabel targets, + labels are column indices. By default, all labels in ``y_true`` and + ``y_pred`` are used in sorted order. + + pos_label : str or int, optional (default=1) + The class to report if ``average='binary'`` and the data is binary. + If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. + + average : str or None, optional (default=None) + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance. + + warn_for : tuple or set, for internal use + This determines which warnings will be made in the case that this + function is being used to return only one of its metrics. - return sensitivity, specificity, tp_sum + sample_weight : ndarray, shape (n_samples, ) + Sample weights. 
+ + Returns + ------- + specificity : float (if ``average`` = None) or ndarray, \ + shape (n_unique_labels, ) + + """ + s, _, _ = sensitivity_specificity_support(y_true, y_pred, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=('specificity',), + sample_weight=sample_weight) + + return s + + +def specificity_score(y_true, y_pred, labels=None, pos_label=1, + average='binary', sample_weight=None): + """Compute the specificity + + The specificity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number + of true positives and ``fn`` the number of false negatives. The specificity + is intuitively the ability of the classifier to find all the positive + samples. + + The best value is 1 and the worst value is 0. + + Parameters + ---------- + y_true : ndarray, shape (n_samples, ) + Ground truth (correct) target values. + + y_pred : ndarray, shape (n_samples, ) + Estimated targets as returned by a classifier. + + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. For multilabel targets, + labels are column indices. By default, all labels in ``y_true`` and + ``y_pred`` are used in sorted order. + + pos_label : str or int, optional (default=1) + The class to report if ``average='binary'`` and the data is binary. + If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. + + average : str or None, optional (default=None) + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance. + + warn_for : tuple or set, for internal use + This determines which warnings will be made in the case that this + function is being used to return only one of its metrics. + + sample_weight : ndarray, shape (n_samples, ) + Sample weights. 
+ + Returns + ------- + specificity : float (if ``average`` = None) or ndarray, \ + shape (n_unique_labels, ) + + """ + _, s, _ = sensitivity_specificity_support(y_true, y_pred, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=('specificity',), + sample_weight=sample_weight) + + return s diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index 64ad4b9ad..d7ce37caa 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -15,6 +15,8 @@ from sklearn.utils.validation import check_random_state from imblearn.metrics import sensitivity_specificity_support +from imblearn.metrics import sensitivity_score +from imblearn.metrics import specificity_score RND_SEED = 42 @@ -77,26 +79,19 @@ def test_sensitivity_specificity_support_binary(): sens, spec, supp = sensitivity_specificity_support(y_true, y_pred, average=None) assert_array_almost_equal(sens, [0.88, 0.68], 2) - assert_array_almost_equal(spec, [0.73, 0.85], 2) + assert_array_almost_equal(spec, [0.68, 0.88], 2) assert_array_equal(supp, [25, 25]) - # # individual scoring function that can be used for grid search: in the - # # binary class case the score is the value of the measure for the positive - # # class (e.g. label == 1). This is deprecated for average != 'binary'. - # for kwargs, my_assert in [({}, assert_no_warnings), - # ({'average': 'binary'}, assert_no_warnings)]: - # ps = my_assert(precision_score, y_true, y_pred, **kwargs) - # assert_array_almost_equal(ps, 0.85, 2) + # individual scoring function that can be used for grid search: in the + # binary class case the score is the value of the measure for the positive + # class (e.g. label == 1). This is deprecated for average != 'binary'. 
+ for kwargs, my_assert in [({}, assert_no_warnings), + ({'average': 'binary'}, assert_no_warnings)]: + sens = my_assert(sensitivity_score, y_true, y_pred, **kwargs) + assert_array_almost_equal(sens, 0.68, 2) - # rs = my_assert(recall_score, y_true, y_pred, **kwargs) - # assert_array_almost_equal(rs, 0.68, 2) - - # fs = my_assert(f1_score, y_true, y_pred, **kwargs) - # assert_array_almost_equal(fs, 0.76, 2) - - # assert_almost_equal(my_assert(fbeta_score, y_true, y_pred, beta=2, - # **kwargs), - # (1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2) + spec = my_assert(specificity_score, y_true, y_pred, **kwargs) + assert_array_almost_equal(spec, 0.88, 2) # def test_precision_recall_f_binary_single_class(): @@ -204,3 +199,45 @@ def test_sensitivity_specificity_support_binary(): # "may use labels=[pos_label] to specify a single " # "positive class.", precision_recall_fscore_support, # [1, 2, 1], [1, 2, 2], pos_label=2, average='macro') + +def test_precision_recall_f1_score_multiclass(): + # Test Precision Recall and F1 Score for multiclass classification task + y_true, y_pred, _ = make_prediction(binary=False) + + # compute scores with default labels introspection + sens, spec, supp = sensitivity_specificity_support(y_true, y_pred, + average=None) + assert_array_almost_equal(spec, [0.92, 0.86, 0.55], 2) + assert_array_almost_equal(sens, [0.79, 0.09, 0.90], 2) + assert_array_equal(supp, [24, 31, 20]) + + # averaging tests + spec = specificity_score(y_true, y_pred, pos_label=1, average='micro') + assert_array_almost_equal(spec, 0.77, 2) + + sens = sensitivity_score(y_true, y_pred, average='micro') + assert_array_almost_equal(sens, 0.53, 2) + + spec = specificity_score(y_true, y_pred, average='macro') + assert_array_almost_equal(spec, 0.77, 2) + + sens = sensitivity_score(y_true, y_pred, average='macro') + assert_array_almost_equal(sens, 0.60, 2) + + spec = specificity_score(y_true, y_pred, average='weighted') + assert_array_almost_equal(spec, 0.80, 2) + + sens = sensitivity_score(y_true, y_pred, average='weighted') + assert_array_almost_equal(sens, 0.53, 2) + + assert_raises(ValueError, sensitivity_score, y_true, y_pred, + average="samples") + assert_raises(ValueError, specificity_score, y_true, y_pred, + average="samples") + + # same prediction but with and explicit label ordering + sens, spec, supp = sensitivity_specificity_support( + y_true, y_pred, labels=[0, 2, 1], average=None) + assert_array_almost_equal(spec, [0.92, 0.55, 0.86], 2) + assert_array_almost_equal(sens, [0.79, 0.90, 0.10], 2) + assert_array_equal(supp, [24, 20, 31]) From d7333bd8a6128e2cf3702c0782331a20574873be Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 22 Dec 2016 18:59:36 +0100 Subject: [PATCH 05/21] Finish sensitivity and specificity --- imblearn/metrics/classification.py | 57 ++--- imblearn/metrics/tests/test_classification.py | 207 ++++++++---------- 2 files changed, 112 insertions(+), 152 deletions(-) diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index 90ef0c736..26ae88704 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -40,7 +40,7 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, If ``pos_label is None`` and in binary classification, this function returns the average sensitivity and specificity if ``average`` - is one of ``'micro'`` or 'weighted'``. + is one of ``'weighted'``. 
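For reference, a rough usage sketch of the public helpers these patches expose through ``imblearn.metrics``; the labels below are invented for illustration and the commented values simply follow the tp / (tp + fn) and tn / (tn + fp) definitions:

import numpy as np
from imblearn.metrics import (sensitivity_specificity_support,
                              sensitivity_score, specificity_score)

y_true = np.array([0, 0, 0, 1, 1, 1, 1])
y_pred = np.array([0, 1, 0, 1, 1, 1, 0])

# Per-class values plus the support (count) of each class.
sen, spe, sup = sensitivity_specificity_support(y_true, y_pred, average=None)

# Scalar scores for the positive class (default average='binary', pos_label=1):
sen_bin = sensitivity_score(y_true, y_pred)  # tp / (tp + fn) = 3 / 4
spe_bin = specificity_score(y_true, y_pred)  # tn / (tn + fp) = 2 / 3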
Parameters ---------- @@ -105,8 +105,7 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, `_ """ - - average_options = (None, 'micro', 'macro', 'weighted') + average_options = (None, 'macro', 'weighted') if average not in average_options and average != 'binary': raise ValueError('average has to be one of ' + str(average_options)) @@ -154,10 +153,6 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, y_pred = le.transform(y_pred) sorted_labels = le.classes_ - LOGGER.debug(y_true) - LOGGER.debug(y_pred) - LOGGER.debug(sorted_labels) - LOGGER.debug('The number of labels is %s' % n_labels) # In a leave out strategy and for each label, compute: @@ -165,13 +160,13 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, # These list contain an array in which each sample is labeled as # TP, TN, FP, FN list_tp = [np.bitwise_and((y_true == label), (y_pred == label)) - for label in sorted_labels] + for label in range(sorted_labels.size)] list_tn = [np.bitwise_and((y_true != label), (y_pred != label)) - for label in sorted_labels] + for label in range(sorted_labels.size)] list_fp = [np.bitwise_and((y_true != label), (y_pred == label)) - for label in sorted_labels] + for label in range(sorted_labels.size)] list_fn = [np.bitwise_and((y_true == label), (y_pred != label)) - for label in sorted_labels] + for label in range(sorted_labels.size)] # Compute the sum for each type # We keep only the counting corresponding to True values @@ -197,42 +192,32 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, # Sort the support support = support[indices] - LOGGER.debug('The indices which are retained are %s' % indices) - LOGGER.debug('TP: %s' % tp_sum) - LOGGER.debug('TN: %s' % tn_sum) - LOGGER.debug('FP: %s' % fp_sum) - LOGGER.debug('FN: %s' % fn_sum) - tp_sum = tp_sum[indices] tn_sum = tn_sum[indices] fp_sum = fp_sum[indices] fn_sum = fn_sum[indices] - if average == 'micro': - tp_sum = np.array([tp_sum.sum()]) - tn_sum = np.array([tn_sum.sum()]) - fp_sum = np.array([fp_sum.sum()]) - fn_sum = np.array([fn_sum.sum()]) - - LOGGER.debug('Did we do the average micro %s' % tp_sum) - LOGGER.debug('Computed the necessary stats for the sensitivity and' ' specificity') - # Compute the sensitivity and specificity - sensitivity = [_prf_divide(tp, tp + fn, 'sensitivity', 'tp + fn', average, - warn_for) for tp, fn in zip(tp_sum, fn_sum)] - specificity = [_prf_divide(tn, tn + fp, 'specificity', 'tn + fp', average, - warn_for) for tn, fp in zip(tn_sum, fp_sum)] - - LOGGER.debug('Sensitivity = %s - Specificity = %s' % (sensitivity, - specificity)) + LOGGER.debug(tp_sum) + LOGGER.debug(tn_sum) + LOGGER.debug(fp_sum) + LOGGER.debug(fn_sum) - LOGGER.debug('Computed the sensitivity and specificity for each class') - LOGGER.debug('The lengths of those two metrics are: %s - %s', - len(sensitivity), len(specificity)) + # Compute the sensitivity and specificity + with np.errstate(divide='ignore', invalid='ignore'): + sensitivity = _prf_divide(tp_sum, tp_sum + fn_sum, 'sensitivity', + 'tp + fn', average, warn_for) + specificity = _prf_divide(tn_sum, tn_sum + fp_sum, 'specificity', + 'tn + fp', average, warn_for) + + # sensitivity = [_prf_divide(tp, tp + fn, 'sensitivity', 'tp + fn', average, + # warn_for) for tp, fn in zip(tp_sum, fn_sum)] + # specificity = [_prf_divide(tn, tn + fp, 'specificity', 'tn + fp', average, + # warn_for) for tn, fp in zip(tn_sum, fp_sum)] # If we need to weight the results if average == 'weighted': diff --git 
a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index d7ce37caa..f998b64f0 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -2,6 +2,8 @@ from __future__ import division, print_function +from functools import partial + import numpy as np from numpy.testing import (assert_array_almost_equal, assert_array_equal, @@ -12,6 +14,8 @@ from sklearn import datasets from sklearn import svm +from sklearn.preprocessing import label_binarize +from sklearn.utils.testing import assert_not_equal from sklearn.utils.validation import check_random_state from imblearn.metrics import sensitivity_specificity_support @@ -94,113 +98,93 @@ def test_sensitivity_specificity_support_binary(): assert_array_almost_equal(spec, 0.88, 2) -# def test_precision_recall_f_binary_single_class(): -# # Test precision, recall and F1 score behave with a single positive or -# # negative class -# # Such a case may occur with non-stratified cross-validation -# assert_equal(1., precision_score([1, 1], [1, 1])) -# assert_equal(1., recall_score([1, 1], [1, 1])) -# assert_equal(1., f1_score([1, 1], [1, 1])) - -# assert_equal(0., precision_score([-1, -1], [-1, -1])) -# assert_equal(0., recall_score([-1, -1], [-1, -1])) -# assert_equal(0., f1_score([-1, -1], [-1, -1])) - - -# @ignore_warnings -# def test_precision_recall_f_extra_labels(): -# # Test handling of explicit additional (not in input) labels to PRF -# y_true = [1, 3, 3, 2] -# y_pred = [1, 1, 3, 2] -# y_true_bin = label_binarize(y_true, classes=np.arange(5)) -# y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) -# data = [(y_true, y_pred), -# (y_true_bin, y_pred_bin)] - -# for i, (y_true, y_pred) in enumerate(data): -# # No average: zeros in array -# actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], -# average=None) -# assert_array_almost_equal([0., 1., 1., .5, 0.], actual) - -# # Macro average is changed -# actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], -# average='macro') -# assert_array_almost_equal(np.mean([0., 1., 1., .5, 0.]), actual) - -# # No effect otheriwse -# for average in ['micro', 'weighted', 'samples']: -# if average == 'samples' and i == 0: -# continue -# assert_almost_equal(recall_score(y_true, y_pred, -# labels=[0, 1, 2, 3, 4], -# average=average), -# recall_score(y_true, y_pred, labels=None, -# average=average)) - -# # Error when introducing invalid label in multilabel case -# # (although it would only affect performance if average='macro'/None) -# for average in [None, 'macro', 'micro', 'samples']: -# assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin, -# labels=np.arange(6), average=average) -# assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin, -# labels=np.arange(-1, 4), average=average) - - -# @ignore_warnings -# def test_precision_recall_f_ignored_labels(): -# # Test a subset of labels may be requested for PRF -# y_true = [1, 1, 2, 3] -# y_pred = [1, 3, 3, 3] -# y_true_bin = label_binarize(y_true, classes=np.arange(5)) -# y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) -# data = [(y_true, y_pred), -# (y_true_bin, y_pred_bin)] - -# for i, (y_true, y_pred) in enumerate(data): -# recall_13 = partial(recall_score, y_true, y_pred, labels=[1, 3]) -# recall_all = partial(recall_score, y_true, y_pred, labels=None) - -# assert_array_almost_equal([.5, 1.], recall_13(average=None)) -# assert_almost_equal((.5 + 1.) / 2, recall_13(average='macro')) -# assert_almost_equal((.5 * 2 + 1. 
* 1) / 3, -# recall_13(average='weighted')) -# assert_almost_equal(2. / 3, recall_13(average='micro')) - -# # ensure the above were meaningful tests: -# for average in ['macro', 'weighted', 'micro']: -# assert_not_equal(recall_13(average=average), -# recall_all(average=average)) - - -# @ignore_warnings -# def test_precision_recall_fscore_support_errors(): -# y_true, y_pred, _ = make_prediction(binary=True) - -# # Bad beta -# assert_raises(ValueError, precision_recall_fscore_support, -# y_true, y_pred, beta=0.0) - -# # Bad pos_label -# assert_raises(ValueError, precision_recall_fscore_support, -# y_true, y_pred, pos_label=2, average='binary') - -# # Bad average option -# assert_raises(ValueError, precision_recall_fscore_support, -# [0, 1, 2], [1, 2, 0], average='mega') - - -# def test_precision_recall_f_unused_pos_label(): -# # Check warning that pos_label unused when set to non-default value -# # but average != 'binary'; even if data is binary. -# assert_warns_message(UserWarning, -# "Note that pos_label (set to 2) is " -# "ignored when average != 'binary' (got 'macro'). You " -# "may use labels=[pos_label] to specify a single " -# "positive class.", precision_recall_fscore_support, -# [1, 2, 1], [1, 2, 2], pos_label=2, average='macro') - -def test_precision_recall_f1_score_multiclass(): +def test_sensitivity_specificity_binary_single_class(): + # Test sensitivity and specificity score behave with a single positive or + # negative class + # Such a case may occur with non-stratified cross-validation + assert_equal(1., sensitivity_score([1, 1], [1, 1])) + assert_equal(0., specificity_score([1, 1], [1, 1])) + + assert_equal(0., sensitivity_score([-1, -1], [-1, -1])) + assert_equal(0., specificity_score([-1, -1], [-1, -1])) + + +def test_sensitivity_specificity_error_multilabels(): + # Test either if an error is raised when the input are multilabels + y_true = [1, 3, 3, 2] + y_pred = [1, 1, 3, 2] + y_true_bin = label_binarize(y_true, classes=np.arange(5)) + y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) + + assert_raises(ValueError, sensitivity_score, y_true_bin, y_pred_bin) + +@ignore_warnings +def test_sensitivity_specifiicity_extra_labels(): + # Test handling of explicit additional (not in input) labels to SS + y_true = [1, 3, 3, 2] + y_pred = [1, 1, 3, 2] + + actual = sensitivity_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], + average=None) + assert_array_almost_equal([0., 1., 1., .5, 0.], actual) + + # Macro average is changed + actual = sensitivity_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], + average='macro') + assert_array_almost_equal(np.mean([0., 1., 1., .5, 0.]), actual) + + # Weighted average is changed + assert_almost_equal(sensitivity_score(y_true, y_pred, + labels=[0, 1, 2, 3, 4], + average='weighted'), + sensitivity_score(y_true, y_pred, labels=None, + average='weighted')) + +@ignore_warnings +def test_sensitivity_specificity_f_ignored_labels(): + # Test a subset of labels may be requested for SS + y_true = [1, 1, 2, 3] + y_pred = [1, 3, 3, 3] + + sensitivity_13 = partial(sensitivity_score, y_true, y_pred, labels=[1, 3]) + sensitivity_all = partial(sensitivity_score, y_true, y_pred, labels=None) + + assert_array_almost_equal([.5, 1.], sensitivity_13(average=None)) + assert_almost_equal((.5 + 1.) / 2, sensitivity_13(average='macro')) + assert_almost_equal((.5 * 2 + 1. 
* 1) / 3, + sensitivity_13(average='weighted')) + + # ensure the above were meaningful tests: + for average in ['macro', 'weighted']: + assert_not_equal(sensitivity_13(average=average), + sensitivity_all(average=average)) + + +@ignore_warnings +def test_sensitivity_specificity_support_errors(): + y_true, y_pred, _ = make_prediction(binary=True) + + # Bad pos_label + assert_raises(ValueError, sensitivity_specificity_support, + y_true, y_pred, pos_label=2, average='binary') + + # Bad average option + assert_raises(ValueError, sensitivity_specificity_support, + [0, 1, 2], [1, 2, 0], average='mega') + + +def test_sensitivity_specificity_unused_pos_label(): + # Check warning that pos_label unused when set to non-default value + # but average != 'binary'; even if data is binary. + assert_warns_message(UserWarning, + "Note that pos_label (set to 2) is " + "ignored when average != 'binary' (got 'macro'). You " + "may use labels=[pos_label] to specify a single " + "positive class.", sensitivity_specificity_support, + [1, 2, 1], [1, 2, 2], pos_label=2, average='macro') + + +def test_sensitivity_specificity_multiclass(): # Test Precision Recall and F1 Score for multiclass classification task y_true, y_pred, _ = make_prediction(binary=False) @@ -212,15 +196,6 @@ def test_precision_recall_f1_score_multiclass(): assert_array_equal(supp, [24, 31, 20]) # averaging tests - spec = specificity_score(y_true, y_pred, pos_label=1, average='micro') - assert_array_almost_equal(spec, 0.77, 2) - - sens = sensitivity_score(y_true, y_pred, average='micro') - assert_array_almost_equal(sens, 0.53, 2) - - spec = specificity_score(y_true, y_pred, average='macro') - assert_array_almost_equal(spec, 0.77, 2) - sens = sensitivity_score(y_true, y_pred, average='macro') assert_array_almost_equal(sens, 0.60, 2) From d0769165d89db55f538314d91ef7b3612ff27bf0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 22 Dec 2016 19:32:12 +0100 Subject: [PATCH 06/21] Added geometric mean --- imblearn/metrics/classification.py | 110 ++++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 19 deletions(-) diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index 26ae88704..6955be47d 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -1,3 +1,5 @@ +# coding: utf-8 + """Metrics to assess performance on classification task given class prediction Functions named as ``*_score`` return a scalar value to maximize: the higher @@ -61,7 +63,7 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, pos_label : str or int, optional (default=1) The class to report if ``average='binary'`` and the data is binary. - If the data are multiclass or multilabel, this will be ignored; + If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. 
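A minimal sketch of the ``labels=[pos_label]`` idiom mentioned above; the data are illustrative and the expected value is computed by hand, assuming the function behaves as documented:

    from imblearn.metrics import sensitivity_score

    y_true = [0, 1, 2, 2, 1, 0]
    y_pred = [0, 1, 1, 2, 0, 0]

    # Multiclass input, so average='binary' does not apply; restricting
    # `labels` to a single class reports the score for that class only.
    sensitivity_score(y_true, y_pred, labels=[2], average='macro')  # 0.5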
@@ -202,11 +204,6 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, LOGGER.debug('Computed the necessary stats for the sensitivity and' ' specificity') - LOGGER.debug(tp_sum) - LOGGER.debug(tn_sum) - LOGGER.debug(fp_sum) - LOGGER.debug(fn_sum) - # Compute the sensitivity and specificity with np.errstate(divide='ignore', invalid='ignore'): sensitivity = _prf_divide(tp_sum, tp_sum + fn_sum, 'sensitivity', @@ -214,11 +211,6 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, specificity = _prf_divide(tn_sum, tn_sum + fp_sum, 'specificity', 'tn + fp', average, warn_for) - # sensitivity = [_prf_divide(tp, tp + fn, 'sensitivity', 'tp + fn', average, - # warn_for) for tp, fn in zip(tp_sum, fn_sum)] - # specificity = [_prf_divide(tn, tn + fp, 'specificity', 'tn + fp', average, - # warn_for) for tn, fp in zip(tn_sum, fp_sum)] - # If we need to weight the results if average == 'weighted': weights = support @@ -259,13 +251,11 @@ def sensitivity_score(y_true, y_pred, labels=None, pos_label=1, order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will - result in 0 components in a macro average. For multilabel targets, - labels are column indices. By default, all labels in ``y_true`` and - ``y_pred`` are used in sorted order. + result in 0 components in a macro average. pos_label : str or int, optional (default=1) The class to report if ``average='binary'`` and the data is binary. - If the data are multiclass or multilabel, this will be ignored; + If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. @@ -331,13 +321,11 @@ def specificity_score(y_true, y_pred, labels=None, pos_label=1, order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will - result in 0 components in a macro average. For multilabel targets, - labels are column indices. By default, all labels in ``y_true`` and - ``y_pred`` are used in sorted order. + result in 0 components in a macro average. pos_label : str or int, optional (default=1) The class to report if ``average='binary'`` and the data is binary. - If the data are multiclass or multilabel, this will be ignored; + If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. @@ -377,3 +365,87 @@ def specificity_score(y_true, y_pred, labels=None, pos_label=1, sample_weight=sample_weight) return s + + +def geometric_mean_score(y_true, y_pred, labels=None, pos_label=1, + average='binary', sample_weight=None): + """Compute the geometric mean + + The geometric mean is the squared root of the product of the sensitivity + and specificity. This measure tries to maximize the accuracy on each + of the two classes while keeping these accuracies balanced. + + The specificity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number + of true positives and ``fn`` the number of false negatives. The specificity + is intuitively the ability of the classifier to find all the positive + samples. + + The best value is 1 and the worst value is 0. + + Parameters + ---------- + y_true : ndarray, shape (n_samples, ) + Ground truth (correct) target values. 
+ + y_pred : ndarray, shape (n_samples, ) + Estimated targets as returned by a classifier. + + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. + + pos_label : str or int, optional (default=1) + The class to report if ``average='binary'`` and the data is binary. + If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. + + average : str or None, optional (default=None) + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance. + + warn_for : tuple or set, for internal use + This determines which warnings will be made in the case that this + function is being used to return only one of its metrics. + + sample_weight : ndarray, shape (n_samples, ) + Sample weights. + + Returns + ------- + geometric_mean : float (if ``average`` = None) or ndarray, \ + shape (n_unique_labels, ) + + References + ---------- + .. [1] Kubat, M. and Matwin, S. "Addressing the curse of + imbalanced training sets: one-sided selection" ICML (1997) + + .. [2] Barandela, R., Sánchez, J. S., Garcıa, V., & Rangel, E. "Strategies + for learning in class imbalance problems", Pattern Recognition, + 36(3), (2003), pp 849-851. 
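As a quick numerical check of the definition above (hand-picked scores, not taken from the patch):

    import numpy as np

    sensitivity, specificity = 0.8, 0.6
    g_mean = np.sqrt(sensitivity * specificity)  # ~0.693

    # A classifier that ignores the minority class entirely gets
    # sensitivity 0 for it, so its geometric mean collapses to 0.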
+ + """ + sen, spe, _ = sensitivity_specificity_support(y_true, y_pred, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=('specificity', + 'specificity'), + sample_weight=sample_weight) + + return np.sqrt(sen * spe) From ace6cabacac46b7ddee957c18f9c29932fa080b7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 22 Dec 2016 22:25:05 +0100 Subject: [PATCH 07/21] Add the testing for geometric mean --- imblearn/metrics/__init__.py | 4 ++- imblearn/metrics/classification.py | 2 ++ imblearn/metrics/tests/test_classification.py | 29 ++++++++++++++++++- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/imblearn/metrics/__init__.py b/imblearn/metrics/__init__.py index d518d7a23..13a17ae9d 100644 --- a/imblearn/metrics/__init__.py +++ b/imblearn/metrics/__init__.py @@ -6,9 +6,11 @@ from .classification import sensitivity_specificity_support from .classification import sensitivity_score from .classification import specificity_score +from .classification import geometric_mean_score __all__ = [ 'sensitivity_specificity_support', 'sensitivity_score', - 'specificity_score' + 'specificity_score', + 'geometric_mean_score' ] diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index 6955be47d..7656bbd53 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -448,4 +448,6 @@ def geometric_mean_score(y_true, y_pred, labels=None, pos_label=1, 'specificity'), sample_weight=sample_weight) + LOGGER.debug('The sensitivity and specificity are : %s - %s' % (sen, spe)) + return np.sqrt(sen * spe) diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index f998b64f0..58f9e81e0 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -21,6 +21,7 @@ from imblearn.metrics import sensitivity_specificity_support from imblearn.metrics import sensitivity_score from imblearn.metrics import specificity_score +from imblearn.metrics import geometric_mean_score RND_SEED = 42 @@ -185,7 +186,7 @@ def test_sensitivity_specificity_unused_pos_label(): def test_sensitivity_specificity_multiclass(): - # Test Precision Recall and F1 Score for multiclass classification task + # Test sensitivity and specificity for multiclass classification task y_true, y_pred, _ = make_prediction(binary=False) # compute scores with default labels introspection @@ -216,3 +217,29 @@ def test_sensitivity_specificity_multiclass(): assert_array_almost_equal(spec, [0.92, 0.55, 0.86], 2) assert_array_almost_equal(sens, [0.79, 0.90, 0.10], 2) assert_array_equal(supp, [24, 20, 31]) + + +def test_geometric_mean_support_binary(): + """Test the geometric mean for binary classification task""" + y_true, y_pred, _ = make_prediction(binary=True) + + # compute the geometric mean for the binary problem + geo_mean = geometric_mean_score(y_true, y_pred) + + assert_almost_equal(geo_mean, 0.77, 2) + + +def test_geometric_mean_multiclass(): + # Test geometric mean for multiclass classification task + y_true, y_pred, _ = make_prediction(binary=False) + + # Compute the geometric mean for each of the classes + geo_mean = geometric_mean_score(y_true, y_pred, average=None) + assert_array_almost_equal(geo_mean, [0.85, 0.29, 0.7], 2) + + # average tests + geo_mean = geometric_mean_score(y_true, y_pred, average='macro') + assert_almost_equal(geo_mean, 0.68, 2) + + geo_mean = geometric_mean_score(y_true, y_pred, average='weighted') + 
assert_array_almost_equal(geo_mean, 0.65, 2) From 73f226d4cde4a6f70f8024515e43257ca7517f7c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 22 Dec 2016 23:08:09 +0100 Subject: [PATCH 08/21] Change the computation of the specificity to fit with sklearn function --- imblearn/metrics/classification.py | 215 ++++++++++++++++++----------- 1 file changed, 135 insertions(+), 80 deletions(-) diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index 7656bbd53..c4c4ae525 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -74,14 +74,21 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This - alters 'macro' to account for label imbalance. - + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). warn_for : tuple or set, for internal use This determines which warnings will be made in the case that this function is being used to return only one of its metrics. @@ -107,7 +114,7 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, `_ """ - average_options = (None, 'macro', 'weighted') + average_options = (None, 'micro', 'macro', 'weighted', 'samples') if average not in average_options and average != 'binary': raise ValueError('average has to be one of ' + str(average_options)) @@ -115,13 +122,6 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, y_type, y_true, y_pred = _check_targets(y_true, y_pred) present_labels = unique_labels(y_true, y_pred) - LOGGER.debug('The labels in the prediction and ground-truth are %s', - present_labels) - - # We do not support multilabel for the moment - if y_type.startswith('multilabel'): - raise ValueError('Multilabel are not supported.') - if average == 'binary': if y_type == 'binary': if pos_label not in present_labels: @@ -149,83 +149,114 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, labels = np.hstack([labels, np.setdiff1d(present_labels, labels, assume_unique=True)]) - le = LabelEncoder() - le.fit(labels) - y_true = le.transform(y_true) - y_pred = le.transform(y_pred) - sorted_labels = le.classes_ - - LOGGER.debug('The number of labels is %s' % n_labels) - - # In a leave out strategy and for each label, compute: - # TP, TN, FP, FN - # These list contain an array in which each sample is labeled as - # TP, TN, FP, FN - list_tp = [np.bitwise_and((y_true == label), (y_pred == label)) - for label in range(sorted_labels.size)] - list_tn = [np.bitwise_and((y_true != label), (y_pred != label)) - for label in range(sorted_labels.size)] - list_fp = [np.bitwise_and((y_true != label), (y_pred == label)) - for label in range(sorted_labels.size)] - list_fn = [np.bitwise_and((y_true == label), (y_pred != label)) - for label in range(sorted_labels.size)] - - # Compute 
the sum for each type - # We keep only the counting corresponding to True values - # We are using bincount since it allows to weight the samples - tp_sum = np.array([bincount(tp, weights=sample_weight, - minlength=2)[-1] - for tp in list_tp]) - tn_sum = np.array([bincount(tn, weights=sample_weight, - minlength=2)[-1] - for tn in list_tn]) - fp_sum = np.array([bincount(fp, weights=sample_weight, - minlength=2)[-1] - for fp in list_fp]) - fn_sum = np.array([bincount(fn, weights=sample_weight, - minlength=2)[-1] - for fn in list_fn]) - - # Retain only selected labels - indices = np.searchsorted(sorted_labels, labels[:n_labels]) - # For support, we can count the number of occurrences of each label - support = np.array(bincount(y_true, weights=sample_weight, - minlength=len(labels))) - # Sort the support - support = support[indices] - - LOGGER.debug('The indices which are retained are %s' % indices) - - tp_sum = tp_sum[indices] - tn_sum = tn_sum[indices] - fp_sum = fp_sum[indices] - fn_sum = fn_sum[indices] - - LOGGER.debug('Computed the necessary stats for the sensitivity and' - ' specificity') - - # Compute the sensitivity and specificity + # Calculate tp_sum, pred_sum, true_sum ### + + if y_type.startswith('multilabel'): + sum_axis = 1 if average == 'samples' else 0 + + # All labels are index integers for multilabel. + # Select labels: + if not np.all(labels == present_labels): + if np.max(labels) > np.max(present_labels): + raise ValueError('All labels must be in [0, n labels). ' + 'Got %d > %d' % + (np.max(labels), np.max(present_labels))) + if np.min(labels) < 0: + raise ValueError('All labels must be in [0, n labels). ' + 'Got %d < 0' % np.min(labels)) + + y_true = y_true[:, labels[:n_labels]] + y_pred = y_pred[:, labels[:n_labels]] + + # calculate weighted counts + true_and_pred = y_true.multiply(y_pred) + tp_sum = count_nonzero(true_and_pred, axis=sum_axis, + sample_weight=sample_weight) + pred_sum = count_nonzero(y_pred, axis=sum_axis, + sample_weight=sample_weight) + true_sum = count_nonzero(y_true, axis=sum_axis, + sample_weight=sample_weight) + tn_sum = y_true.size - (pred_sum + true_sum - tp_sum) + + elif average == 'samples': + raise ValueError("Sample-based precision, recall, fscore is " + "not meaningful outside multilabel " + "classification. 
See the accuracy_score instead.") + else: + le = LabelEncoder() + le.fit(labels) + y_true = le.transform(y_true) + y_pred = le.transform(y_pred) + sorted_labels = le.classes_ + + # labels are now from 0 to len(labels) - 1 -> use bincount + tp = y_true == y_pred + tp_bins = y_true[tp] + if sample_weight is not None: + tp_bins_weights = np.asarray(sample_weight)[tp] + else: + tp_bins_weights = None + + if len(tp_bins): + tp_sum = bincount(tp_bins, weights=tp_bins_weights, + minlength=len(labels)) + else: + # Pathological case + true_sum = pred_sum = tp_sum = np.zeros(len(labels)) + if len(y_pred): + pred_sum = bincount(y_pred, weights=sample_weight, + minlength=len(labels)) + if len(y_true): + true_sum = bincount(y_true, weights=sample_weight, + minlength=len(labels)) + + # Compute the true negative + tn_sum = y_true.size - (pred_sum + true_sum - tp_sum) + + # Retain only selected labels + indices = np.searchsorted(sorted_labels, labels[:n_labels]) + tp_sum = tp_sum[indices] + true_sum = true_sum[indices] + pred_sum = pred_sum[indices] + tn_sum = tn_sum[indices] + + if average == 'micro': + tp_sum = np.array([tp_sum.sum()]) + pred_sum = np.array([pred_sum.sum()]) + true_sum = np.array([true_sum.sum()]) + tn_sum = np.array([tn_sum.sum()]) + + # Finally, we have all our sufficient statistics. Divide! # + with np.errstate(divide='ignore', invalid='ignore'): - sensitivity = _prf_divide(tp_sum, tp_sum + fn_sum, 'sensitivity', - 'tp + fn', average, warn_for) - specificity = _prf_divide(tn_sum, tn_sum + fp_sum, 'specificity', - 'tn + fp', average, warn_for) + # Divide, and on zero-division, set scores to 0 and warn: + + # Oddly, we may get an "invalid" rather than a "divide" error + # here. + specificity = _prf_divide(tn_sum, tn_sum + pred_sum - tp_sum, + 'specificity', 'predicted', average, + warn_for) + sensitivity = _prf_divide(tp_sum, true_sum, + 'sensitivity', 'true', average, warn_for) + + # Average the results - # If we need to weight the results if average == 'weighted': - weights = support + weights = true_sum if weights.sum() == 0: return 0, 0, None + elif average == 'samples': + weights = sample_weight else: weights = None if average is not None: - assert average != 'binary' or len(sensitivity) == 1 - sensitivity = np.average(sensitivity, weights=weights) + assert average != 'binary' or len(specificity) == 1 specificity = np.average(specificity, weights=weights) - support = None + sensitivity = np.average(sensitivity, weights=weights) + true_sum = None # return no support - return sensitivity, specificity, support + return sensitivity, specificity, true_sum def sensitivity_score(y_true, y_pred, labels=None, pos_label=1, @@ -266,13 +297,21 @@ def sensitivity_score(y_true, y_pred, labels=None, pos_label=1, ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This - alters 'macro' to account for label imbalance. + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. 
+ ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). warn_for : tuple or set, for internal use This determines which warnings will be made in the case that this @@ -291,7 +330,7 @@ def sensitivity_score(y_true, y_pred, labels=None, pos_label=1, labels=labels, pos_label=pos_label, average=average, - warn_for=('specificity',), + warn_for=('sensitivity',), sample_weight=sample_weight) return s @@ -336,13 +375,21 @@ def specificity_score(y_true, y_pred, labels=None, pos_label=1, ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This - alters 'macro' to account for label imbalance. + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). warn_for : tuple or set, for internal use This determines which warnings will be made in the case that this @@ -410,13 +457,21 @@ def geometric_mean_score(y_true, y_pred, labels=None, pos_label=1, ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This - alters 'macro' to account for label imbalance. + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). 
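To make the averaging options above concrete, here is a hand-worked comparison on illustrative per-class scores (not taken from the patch):

    import numpy as np

    per_class = np.array([0.9, 0.5])   # majority class, minority class
    support = np.array([90, 10])

    macro = per_class.mean()                           # 0.70, classes count equally
    weighted = np.average(per_class, weights=support)  # 0.86, dominated by the majority
    # 'micro' instead pools tp/fn/fp over all classes before dividing.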
warn_for : tuple or set, for internal use This determines which warnings will be made in the case that this From 03f10719d45582f3dd49f6d615da83abee6e252a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 23 Dec 2016 00:24:25 +0100 Subject: [PATCH 09/21] Update the test for the specificity --- imblearn/metrics/classification.py | 135 +++++++------- imblearn/metrics/tests/test_classification.py | 171 ++++++++---------- 2 files changed, 149 insertions(+), 157 deletions(-) diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index c4c4ae525..e9516ed0f 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -1,5 +1,4 @@ # coding: utf-8 - """Metrics to assess performance on classification task given class prediction Functions named as ``*_score`` return a scalar value to maximize: the higher @@ -20,12 +19,16 @@ from sklearn.preprocessing import LabelEncoder from sklearn.utils.fixes import bincount from sklearn.utils.multiclass import unique_labels +from sklearn.utils.sparsefuncs import count_nonzero LOGGER = logging.getLogger(__name__) -def sensitivity_specificity_support(y_true, y_pred, labels=None, - pos_label=1, average=None, +def sensitivity_specificity_support(y_true, + y_pred, + labels=None, + pos_label=1, + average=None, warn_for=('sensitivity', 'specificity'), sample_weight=None): """Compute sensitivity, specificity, and support for each class @@ -116,8 +119,7 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, """ average_options = (None, 'micro', 'macro', 'weighted', 'samples') if average not in average_options and average != 'binary': - raise ValueError('average has to be one of ' + - str(average_options)) + raise ValueError('average has to be one of ' + str(average_options)) y_type, y_true, y_pred = _check_targets(y_true, y_pred) present_labels = unique_labels(y_true, y_pred) @@ -146,38 +148,14 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, n_labels = None else: n_labels = len(labels) - labels = np.hstack([labels, np.setdiff1d(present_labels, labels, - assume_unique=True)]) + labels = np.hstack( + [labels, np.setdiff1d( + present_labels, labels, assume_unique=True)]) # Calculate tp_sum, pred_sum, true_sum ### if y_type.startswith('multilabel'): - sum_axis = 1 if average == 'samples' else 0 - - # All labels are index integers for multilabel. - # Select labels: - if not np.all(labels == present_labels): - if np.max(labels) > np.max(present_labels): - raise ValueError('All labels must be in [0, n labels). ' - 'Got %d > %d' % - (np.max(labels), np.max(present_labels))) - if np.min(labels) < 0: - raise ValueError('All labels must be in [0, n labels). 
' - 'Got %d < 0' % np.min(labels)) - - y_true = y_true[:, labels[:n_labels]] - y_pred = y_pred[:, labels[:n_labels]] - - # calculate weighted counts - true_and_pred = y_true.multiply(y_pred) - tp_sum = count_nonzero(true_and_pred, axis=sum_axis, - sample_weight=sample_weight) - pred_sum = count_nonzero(y_pred, axis=sum_axis, - sample_weight=sample_weight) - true_sum = count_nonzero(y_true, axis=sum_axis, - sample_weight=sample_weight) - tn_sum = y_true.size - (pred_sum + true_sum - tp_sum) - + raise ValueError('imblearn does not support multilabel') elif average == 'samples': raise ValueError("Sample-based precision, recall, fscore is " "not meaningful outside multilabel " @@ -198,17 +176,17 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, tp_bins_weights = None if len(tp_bins): - tp_sum = bincount(tp_bins, weights=tp_bins_weights, - minlength=len(labels)) + tp_sum = bincount( + tp_bins, weights=tp_bins_weights, minlength=len(labels)) else: # Pathological case true_sum = pred_sum = tp_sum = np.zeros(len(labels)) if len(y_pred): - pred_sum = bincount(y_pred, weights=sample_weight, - minlength=len(labels)) + pred_sum = bincount( + y_pred, weights=sample_weight, minlength=len(labels)) if len(y_true): - true_sum = bincount(y_true, weights=sample_weight, - minlength=len(labels)) + true_sum = bincount( + y_true, weights=sample_weight, minlength=len(labels)) # Compute the true negative tn_sum = y_true.size - (pred_sum + true_sum - tp_sum) @@ -220,6 +198,11 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, pred_sum = pred_sum[indices] tn_sum = tn_sum[indices] + LOGGER.debug('tp: %s' % tp_sum) + LOGGER.debug('tn: %s' % tn_sum) + LOGGER.debug('pred_sum: %s' % pred_sum) + LOGGER.debug('true_sum: %s' % true_sum) + if average == 'micro': tp_sum = np.array([tp_sum.sum()]) pred_sum = np.array([pred_sum.sum()]) @@ -236,8 +219,8 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, specificity = _prf_divide(tn_sum, tn_sum + pred_sum - tp_sum, 'specificity', 'predicted', average, warn_for) - sensitivity = _prf_divide(tp_sum, true_sum, - 'sensitivity', 'true', average, warn_for) + sensitivity = _prf_divide(tp_sum, true_sum, 'sensitivity', 'true', + average, warn_for) # Average the results @@ -250,6 +233,9 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, else: weights = None + LOGGER.debug(specificity) + LOGGER.debug(weights) + if average is not None: assert average != 'binary' or len(specificity) == 1 specificity = np.average(specificity, weights=weights) @@ -259,8 +245,12 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, return sensitivity, specificity, true_sum -def sensitivity_score(y_true, y_pred, labels=None, pos_label=1, - average='binary', sample_weight=None): +def sensitivity_score(y_true, + y_pred, + labels=None, + pos_label=1, + average='binary', + sample_weight=None): """Compute the sensitivity The sensitivity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number @@ -326,18 +316,24 @@ def sensitivity_score(y_true, y_pred, labels=None, pos_label=1, shape (n_unique_labels, ) """ - s, _, _ = sensitivity_specificity_support(y_true, y_pred, - labels=labels, - pos_label=pos_label, - average=average, - warn_for=('sensitivity',), - sample_weight=sample_weight) + s, _, _ = sensitivity_specificity_support( + y_true, + y_pred, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=('sensitivity', ), + sample_weight=sample_weight) return s -def specificity_score(y_true, y_pred, labels=None, pos_label=1, 
- average='binary', sample_weight=None): +def specificity_score(y_true, + y_pred, + labels=None, + pos_label=1, + average='binary', + sample_weight=None): """Compute the specificity The specificity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number @@ -404,18 +400,24 @@ def specificity_score(y_true, y_pred, labels=None, pos_label=1, shape (n_unique_labels, ) """ - _, s, _ = sensitivity_specificity_support(y_true, y_pred, - labels=labels, - pos_label=pos_label, - average=average, - warn_for=('specificity',), - sample_weight=sample_weight) + _, s, _ = sensitivity_specificity_support( + y_true, + y_pred, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=('specificity', ), + sample_weight=sample_weight) return s -def geometric_mean_score(y_true, y_pred, labels=None, pos_label=1, - average='binary', sample_weight=None): +def geometric_mean_score(y_true, + y_pred, + labels=None, + pos_label=1, + average='binary', + sample_weight=None): """Compute the geometric mean The geometric mean is the squared root of the product of the sensitivity @@ -495,13 +497,14 @@ def geometric_mean_score(y_true, y_pred, labels=None, pos_label=1, 36(3), (2003), pp 849-851. """ - sen, spe, _ = sensitivity_specificity_support(y_true, y_pred, - labels=labels, - pos_label=pos_label, - average=average, - warn_for=('specificity', - 'specificity'), - sample_weight=sample_weight) + sen, spe, _ = sensitivity_specificity_support( + y_true, + y_pred, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=('specificity', 'specificity'), + sample_weight=sample_weight) LOGGER.debug('The sensitivity and specificity are : %s - %s' % (sen, spe)) diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index 58f9e81e0..02c2edb53 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -76,31 +76,33 @@ def make_prediction(dataset=None, binary=False): ############################################################################### # Tests -def test_sensitivity_specificity_support_binary(): - """Test the sensitivity specificity for binary classification task""" + +def test_sensitivity_specificity_score_binary(): + # Test Sensitivity Specificity for binary classification task y_true, y_pred, _ = make_prediction(binary=True) # detailed measures for each class - sens, spec, supp = sensitivity_specificity_support(y_true, y_pred, - average=None) - assert_array_almost_equal(sens, [0.88, 0.68], 2) - assert_array_almost_equal(spec, [0.68, 0.88], 2) - assert_array_equal(supp, [25, 25]) + sen, spe, sup = sensitivity_specificity_support( + y_true, y_pred, average=None) + assert_array_almost_equal(sen, [0.88, 0.68], 2) + assert_array_almost_equal(spe, [0.68, 0.88], 2) + assert_array_equal(sup, [25, 25]) # individual scoring function that can be used for grid search: in the # binary class case the score is the value of the measure for the positive # class (e.g. label == 1). This is deprecated for average != 'binary'. 
- for kwargs, my_assert in [({}, assert_no_warnings), - ({'average': 'binary'}, assert_no_warnings)]: - sens = my_assert(sensitivity_score, y_true, y_pred, **kwargs) - assert_array_almost_equal(sens, 0.68, 2) + for kwargs, my_assert in [({}, assert_no_warnings), ({ + 'average': 'binary' + }, assert_no_warnings)]: + sen = my_assert(sensitivity_score, y_true, y_pred, **kwargs) + assert_array_almost_equal(sen, 0.68, 2) - spec = my_assert(specificity_score, y_true, y_pred, **kwargs) - assert_array_almost_equal(spec, 0.88, 2) + spe = my_assert(specificity_score, y_true, y_pred, **kwargs) + assert_array_almost_equal(spe, 0.88, 2) -def test_sensitivity_specificity_binary_single_class(): - # Test sensitivity and specificity score behave with a single positive or +def test_sensitivity_specificity_f_binary_single_class(): + # Test sensitivity and specificity behave with a single positive or # negative class # Such a case may occur with non-stratified cross-validation assert_equal(1., sensitivity_score([1, 1], [1, 1])) @@ -110,55 +112,66 @@ def test_sensitivity_specificity_binary_single_class(): assert_equal(0., specificity_score([-1, -1], [-1, -1])) -def test_sensitivity_specificity_error_multilabels(): - # Test either if an error is raised when the input are multilabels - y_true = [1, 3, 3, 2] - y_pred = [1, 1, 3, 2] - y_true_bin = label_binarize(y_true, classes=np.arange(5)) - y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) - - assert_raises(ValueError, sensitivity_score, y_true_bin, y_pred_bin) - @ignore_warnings -def test_sensitivity_specifiicity_extra_labels(): +def test_sensitivity_specificity_extra_labels(): # Test handling of explicit additional (not in input) labels to SS y_true = [1, 3, 3, 2] y_pred = [1, 1, 3, 2] - actual = sensitivity_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], - average=None) - assert_array_almost_equal([0., 1., 1., .5, 0.], actual) + # No average: zeros in array + actual = specificity_score( + y_true, y_pred, labels=[0, 1, 2, 3, 4], average=None) + assert_array_almost_equal([1., 0.67, 1., 1., 1.], actual, 2) # Macro average is changed - actual = sensitivity_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], - average='macro') - assert_array_almost_equal(np.mean([0., 1., 1., .5, 0.]), actual) + actual = specificity_score( + y_true, y_pred, labels=[0, 1, 2, 3, 4], average='macro') + assert_array_almost_equal(np.mean([1., 0.67, 1., 1., 1.]), actual, 2) + + # Check for micro + actual = specificity_score( + y_true, y_pred, labels=[0, 1, 2, 3, 4], average='micro') + assert_array_almost_equal(15. 
/ 16., actual) + + # Check for weighted + actual = specificity_score( + y_true, y_pred, labels=[0, 1, 2, 3, 4], average='macro') + assert_array_almost_equal(np.mean([1., 0.67, 1., 1., 1.]), actual, 2) - # Weighted average is changed - assert_almost_equal(sensitivity_score(y_true, y_pred, - labels=[0, 1, 2, 3, 4], - average='weighted'), - sensitivity_score(y_true, y_pred, labels=None, - average='weighted')) @ignore_warnings -def test_sensitivity_specificity_f_ignored_labels(): +def test_sensitivity_specificity_ignored_labels(): # Test a subset of labels may be requested for SS y_true = [1, 1, 2, 3] y_pred = [1, 3, 3, 3] - sensitivity_13 = partial(sensitivity_score, y_true, y_pred, labels=[1, 3]) - sensitivity_all = partial(sensitivity_score, y_true, y_pred, labels=None) + specificity_13 = partial(specificity_score, y_true, y_pred, labels=[1, 3]) + specificity_all = partial(specificity_score, y_true, y_pred, labels=None) - assert_array_almost_equal([.5, 1.], sensitivity_13(average=None)) - assert_almost_equal((.5 + 1.) / 2, sensitivity_13(average='macro')) - assert_almost_equal((.5 * 2 + 1. * 1) / 3, - sensitivity_13(average='weighted')) + assert_array_almost_equal([1., 0.33], specificity_13(average=None), 2) + assert_almost_equal( + np.mean([1., 0.33]), specificity_13(average='macro'), 2) + assert_almost_equal( + np.average( + [1., .33], weights=[2., 1.]), + specificity_13(average='weighted'), + 2) + assert_almost_equal(3. / (3. + 2.), specificity_13(average='micro'), 2) # ensure the above were meaningful tests: - for average in ['macro', 'weighted']: - assert_not_equal(sensitivity_13(average=average), - sensitivity_all(average=average)) + for average in ['macro', 'weighted', 'micro']: + assert_not_equal( + specificity_13(average=average), specificity_all(average=average)) + + +def test_sensitivity_specificity_error_multilabels(): + # Test either if an error is raised when the input are multilabels + y_true = [1, 3, 3, 2] + y_pred = [1, 1, 3, 2] + y_true_bin = label_binarize(y_true, classes=np.arange(5)) + y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) + + assert_raises(ValueError, sensitivity_score, y_true_bin, y_pred_bin) @ignore_warnings @@ -166,57 +179,33 @@ def test_sensitivity_specificity_support_errors(): y_true, y_pred, _ = make_prediction(binary=True) # Bad pos_label - assert_raises(ValueError, sensitivity_specificity_support, - y_true, y_pred, pos_label=2, average='binary') + assert_raises( + ValueError, + sensitivity_specificity_support, + y_true, + y_pred, + pos_label=2, + average='binary') # Bad average option - assert_raises(ValueError, sensitivity_specificity_support, - [0, 1, 2], [1, 2, 0], average='mega') + assert_raises( + ValueError, + sensitivity_specificity_support, [0, 1, 2], [1, 2, 0], + average='mega') def test_sensitivity_specificity_unused_pos_label(): # Check warning that pos_label unused when set to non-default value # but average != 'binary'; even if data is binary. - assert_warns_message(UserWarning, - "Note that pos_label (set to 2) is " - "ignored when average != 'binary' (got 'macro'). 
You " - "may use labels=[pos_label] to specify a single " - "positive class.", sensitivity_specificity_support, - [1, 2, 1], [1, 2, 2], pos_label=2, average='macro') - - -def test_sensitivity_specificity_multiclass(): - # Test sensitivity and specificity for multiclass classification task - y_true, y_pred, _ = make_prediction(binary=False) - - # compute scores with default labels introspection - sens, spec, supp = sensitivity_specificity_support(y_true, y_pred, - average=None) - assert_array_almost_equal(spec, [0.92, 0.86, 0.55], 2) - assert_array_almost_equal(sens, [0.79, 0.09, 0.90], 2) - assert_array_equal(supp, [24, 31, 20]) - - # averaging tests - sens = sensitivity_score(y_true, y_pred, average='macro') - assert_array_almost_equal(sens, 0.60, 2) - - spec = specificity_score(y_true, y_pred, average='weighted') - assert_array_almost_equal(spec, 0.80, 2) - - sens = sensitivity_score(y_true, y_pred, average='weighted') - assert_array_almost_equal(sens, 0.53, 2) - - assert_raises(ValueError, sensitivity_score, y_true, y_pred, - average="samples") - assert_raises(ValueError, specificity_score, y_true, y_pred, - average="samples") - - # same prediction but with and explicit label ordering - sens, spec, supp = sensitivity_specificity_support( - y_true, y_pred, labels=[0, 2, 1], average=None) - assert_array_almost_equal(spec, [0.92, 0.55, 0.86], 2) - assert_array_almost_equal(sens, [0.79, 0.90, 0.10], 2) - assert_array_equal(supp, [24, 20, 31]) + assert_warns_message( + UserWarning, + "Note that pos_label (set to 2) is " + "ignored when average != 'binary' (got 'macro'). You " + "may use labels=[pos_label] to specify a single " + "positive class.", + sensitivity_specificity_support, [1, 2, 1], [1, 2, 2], + pos_label=2, + average='macro') def test_geometric_mean_support_binary(): From 0674ada7218a699d1e06bbc9c709e65c68afc9b2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 23 Dec 2016 01:22:28 +0100 Subject: [PATCH 10/21] Added the IBA metric --- imblearn/metrics/classification.py | 76 ++++++++++++++++++++++++++---- 1 file changed, 68 insertions(+), 8 deletions(-) diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index e9516ed0f..20ffd4652 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -198,11 +198,6 @@ def sensitivity_specificity_support(y_true, pred_sum = pred_sum[indices] tn_sum = tn_sum[indices] - LOGGER.debug('tp: %s' % tp_sum) - LOGGER.debug('tn: %s' % tn_sum) - LOGGER.debug('pred_sum: %s' % pred_sum) - LOGGER.debug('true_sum: %s' % true_sum) - if average == 'micro': tp_sum = np.array([tp_sum.sum()]) pred_sum = np.array([pred_sum.sum()]) @@ -233,9 +228,6 @@ def sensitivity_specificity_support(y_true, else: weights = None - LOGGER.debug(specificity) - LOGGER.debug(weights) - if average is not None: assert average != 'binary' or len(specificity) == 1 specificity = np.average(specificity, weights=weights) @@ -509,3 +501,71 @@ def geometric_mean_score(y_true, LOGGER.debug('The sensitivity and specificity are : %s - %s' % (sen, spe)) return np.sqrt(sen * spe) + + +def indexed_balanced_accuracy_score(score_func, + y_true, + y_pred, + alpha=0.1, + squared=True, + **kwargs): + """ Compute the indexed balanced accuracy of a scoring function + + The indexed balanced accuracy (IBA) tends to weight a scoring function + to take into account the imbalancing of the data. + + Parameters + ---------- + score_func : callable, + Score function (or loss function) with signature + ``score_func(y, y_pred, **kwargs)``. 
+ + y_true : ndarray, shape (n_samples, ) + Ground truth (correct) target values. + + y_pred : ndarray, shape (n_samples, ) + Estimated targets as returned by a classifier. + + alpha : float, optional (default=0.1) + Weighting factor. + + squared : bool, optional (default=True) + If ``squared`` is True, then the metric computed will be squared + before to be weighted. + + **kwargs : additional arguments + Additional parameters to be passed to score_func. + + Returns + ------- + iba : float (if ``average`` = None) or ndarray, \ + shape (n_unique_labels, ) + + References + ---------- + .. [1] Garcia, V. and Mollineda, R.A. and Sanchez, J.S. "Theoretical + analysis of a performance measure for imbalanced data" ICPR (2010) + """ + + score = score_func(**kwargs) + + if squared: + score = np.power(score, 2) + + # Pop the arguments to have the proper average, etc. for the + # sensitivity and specificity + labels = kwargs.get('labels', None) + pos_label = kwargs.get('pos_label', 1) + average = kwargs.get('average', 'binary') + sample_weight = kwargs.get('sample_weight', None) + + # Compute the sensitivity and specificity + sen = sensitivity_score(y_true, y_pred, labels, pos_label, average, + sample_weight) + spe = specificity_score(y_true, y_pred, labels, pos_label, average, + sample_weight) + + # Compute the dominance + dom = sen - spe + + return (1. + alpha * dom) * score From 21d2c7d5099ebbd8c8e4866a73176c4a78255777 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 23 Dec 2016 01:31:07 +0100 Subject: [PATCH 11/21] Add a single test for IBA --- imblearn/metrics/__init__.py | 8 ++++---- imblearn/metrics/classification.py | 2 +- imblearn/metrics/tests/test_classification.py | 11 +++++++++++ 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/imblearn/metrics/__init__.py b/imblearn/metrics/__init__.py index 13a17ae9d..951829792 100644 --- a/imblearn/metrics/__init__.py +++ b/imblearn/metrics/__init__.py @@ -7,10 +7,10 @@ from .classification import sensitivity_score from .classification import specificity_score from .classification import geometric_mean_score +from .classification import indexed_balanced_accuracy_score __all__ = [ - 'sensitivity_specificity_support', - 'sensitivity_score', - 'specificity_score', - 'geometric_mean_score' + 'sensitivity_specificity_support', 'sensitivity_score', + 'specificity_score', 'geometric_mean_score', + 'indexed_balanced_accuracy_score' ] diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index 20ffd4652..6964a7d93 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -547,7 +547,7 @@ def indexed_balanced_accuracy_score(score_func, analysis of a performance measure for imbalanced data" ICPR (2010) """ - score = score_func(**kwargs) + score = score_func(y_true, y_pred, **kwargs) if squared: score = np.power(score, 2) diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index 02c2edb53..5d0bb94ce 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -22,6 +22,7 @@ from imblearn.metrics import sensitivity_score from imblearn.metrics import specificity_score from imblearn.metrics import geometric_mean_score +from imblearn.metrics import indexed_balanced_accuracy_score RND_SEED = 42 @@ -232,3 +233,13 @@ def test_geometric_mean_multiclass(): geo_mean = geometric_mean_score(y_true, y_pred, average='weighted') assert_array_almost_equal(geo_mean, 0.65, 2) + + +def 
test_iba_geo_mean_binary(): + """Test to test the iba using the geometric mean""" + y_true, y_pred, _ = make_prediction(binary=True) + + iba = indexed_balanced_accuracy_score( + geometric_mean_score, y_true, y_pred, alpha=0.5, squared=True) + + assert_almost_equal(iba, 0.54, 2) From 3a8fa2117a0b7e6e53129324154c06704cce8e44 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 23 Dec 2016 01:38:35 +0100 Subject: [PATCH 12/21] Update the doc --- doc/api.rst | 20 ++++++++++++++++++++ doc/whats_new.rst | 1 + 2 files changed, 21 insertions(+) diff --git a/doc/api.rst b/doc/api.rst index 77e96457b..272c7d22e 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -119,6 +119,26 @@ Functions pipeline.make_pipeline +.. _metrics_ref: + +Metrics +======= + +.. automodule:: imblearn.metrics + :no-members: + :no-inherited-members: + +.. currentmodule:: imblearn + +Functions +--------- +.. autosummary:: +:toctree: generated/ + metrics.sensitivity_specificity_support + metrics.sensitivity_score + metrics.specificity_score + metrics.geometric_mean_score + metrics.indexed_balanced_accuracy_score .. _datasets_ref: diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 40095d88d..bfe3c81a6 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -28,6 +28,7 @@ New features ~~~~~~~~~~~~ - Added AllKNN under sampling technique. By `Dayvid Oliveira`_. +- Added a module `metrics` implementing some specific scoring function for the problem of balancing. By `Guillaume Lemaitre`_ and `Christos Aridas`_. Enhancement ~~~~~~~~~~~ From f72fa66d83066ef23b5fda6603be811358c04008 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 23 Dec 2016 01:57:43 +0100 Subject: [PATCH 13/21] remove useless import --- imblearn/metrics/classification.py | 1 - 1 file changed, 1 deletion(-) diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index 6964a7d93..cc40bfc3e 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -19,7 +19,6 @@ from sklearn.preprocessing import LabelEncoder from sklearn.utils.fixes import bincount from sklearn.utils.multiclass import unique_labels -from sklearn.utils.sparsefuncs import count_nonzero LOGGER = logging.getLogger(__name__) From 17b322e1238c3acb10fece052c6da11f2329e7ae Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 23 Dec 2016 16:14:55 +0100 Subject: [PATCH 14/21] Change IBA to be a decorator instead of a score --- imblearn/metrics/__init__.py | 5 +- imblearn/metrics/classification.py | 201 ++++++++++++++---- imblearn/metrics/tests/test_classification.py | 161 ++++++++++++-- 3 files changed, 306 insertions(+), 61 deletions(-) diff --git a/imblearn/metrics/__init__.py b/imblearn/metrics/__init__.py index 951829792..5cbe3b2f1 100644 --- a/imblearn/metrics/__init__.py +++ b/imblearn/metrics/__init__.py @@ -7,10 +7,11 @@ from .classification import sensitivity_score from .classification import specificity_score from .classification import geometric_mean_score -from .classification import indexed_balanced_accuracy_score +from .classification import make_indexed_balanced_accuracy +from .classification import classification_report_imbalanced __all__ = [ 'sensitivity_specificity_support', 'sensitivity_score', 'specificity_score', 'geometric_mean_score', - 'indexed_balanced_accuracy_score' + 'make_indexed_balanced_accuracy', 'classification_report_imbalanced' ] diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index cc40bfc3e..dcc13f82a 100644 --- a/imblearn/metrics/classification.py 
+++ b/imblearn/metrics/classification.py @@ -12,10 +12,12 @@ import warnings import logging +import functools import numpy as np -from sklearn.metrics.classification import _check_targets, _prf_divide +from sklearn.metrics.classification import (_check_targets, _prf_divide, + precision_recall_fscore_support) from sklearn.preprocessing import LabelEncoder from sklearn.utils.fixes import bincount from sklearn.utils.multiclass import unique_labels @@ -502,29 +504,15 @@ def geometric_mean_score(y_true, return np.sqrt(sen * spe) -def indexed_balanced_accuracy_score(score_func, - y_true, - y_pred, - alpha=0.1, - squared=True, - **kwargs): - """ Compute the indexed balanced accuracy of a scoring function +def make_indexed_balanced_accuracy(alpha=0.1, squared=True): + """Balance any scoring function using the indexed balanced accuracy - The indexed balanced accuracy (IBA) tends to weight a scoring function - to take into account the imbalancing of the data. + This factory function wraps scoring function to express it as the + indexed balanced accuracy (IBA). You need to use this function to + decorate any scoring function. Parameters ---------- - score_func : callable, - Score function (or loss function) with signature - ``score_func(y, y_pred, **kwargs)``. - - y_true : ndarray, shape (n_samples, ) - Ground truth (correct) target values. - - y_pred : ndarray, shape (n_samples, ) - Estimated targets as returned by a classifier. - alpha : float, optional (default=0.1) Weighting factor. @@ -532,39 +520,160 @@ def indexed_balanced_accuracy_score(score_func, If ``squared`` is True, then the metric computed will be squared before to be weighted. - **kwargs : additional arguments - Additional parameters to be passed to score_func. - Returns ------- - iba : float (if ``average`` = None) or ndarray, \ - shape (n_unique_labels, ) + iba_scoring_func : callable, + Returns the scoring metric decorated which will automatically compute + the indexed balanced accuracy. + + Examples + -------- + >>> from imblearn.metrics import geometric_mean_score as gmean + >>> from imblearn.metrics import make_indexed_balanced_accuracy as iba + >>> gmean = iba(alpha=0.1, squared=True)(gmean) + >>> y_true = [1, 0, 0, 1, 0, 1] + >>> y_pred = [0, 0, 1, 1, 0, 1] + >>> print(gmean(y_true, y_pred, average=None)) + [ 0.44444444 0.44444444] + """ + def decorate(scoring_func): + @functools.wraps(scoring_func) + def compute_score(*args, **kwargs): + # Compute the score from the scoring function + _score = scoring_func(*args, **kwargs) + # Square if desired + if squared: + _score = np.power(_score, 2) + # args will contain the y_pred and y_true + # kwargs will contain the other parameters + labels = kwargs.get('labels', None) + pos_label = kwargs.get('pos_label', 1) + average = kwargs.get('average', 'binary') + sample_weight = kwargs.get('sample_weight', None) + # Compute the sensitivity and specificity + dict_sen_spe = {'labels': labels, 'pos_label': pos_label, + 'average': average, 'sample_weight': sample_weight} + sen, spe, _ = sensitivity_specificity_support(*args, + **dict_sen_spe) + # Compute the dominance + dom = sen - spe + return (1. + alpha * dom) * _score + return compute_score + return decorate + + +def classification_report_imbalanced(y_true, + y_pred, + labels=None, + target_names=None, + sample_weight=None, + digits=2, + alpha=0.1): + """Build a classification report based on metrics used with imbalanced + dataset + + Specific metrics have been proposed to evaluate the classification + performed on imbalanced dataset. 
This report compiles the + state-of-the-art metrics: precision/recall/specificity, geometric + mean, and indexed balanced accuracy of the + geometric mean. - References + Parameters ---------- - .. [1] Garcia, V. and Mollineda, R.A. and Sanchez, J.S. "Theoretical - analysis of a performance measure for imbalanced data" ICPR (2010) - """ + y_true : ndarray, shape (n_samples, ) + Ground truth (correct) target values. + + y_pred : ndarray, shape (n_samples, ) + Estimated targets as returned by a classifier. - score = score_func(y_true, y_pred, **kwargs) + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. - if squared: - score = np.power(score, 2) + target_names : list of strings, optional + Optional display names matching the labels (same order). - # Pop the arguments to have the proper average, etc. for the - # sensitivity and specificity - labels = kwargs.get('labels', None) - pos_label = kwargs.get('pos_label', 1) - average = kwargs.get('average', 'binary') - sample_weight = kwargs.get('sample_weight', None) + sample_weight : ndarray, shape (n_samples, ) + Sample weights. - # Compute the sensitivity and specificity - sen = sensitivity_score(y_true, y_pred, labels, pos_label, average, - sample_weight) - spe = specificity_score(y_true, y_pred, labels, pos_label, average, - sample_weight) + digits : int, optional (default=2) + Number of digits for formatting output floating point values - # Compute the dominance - dom = sen - spe + alpha : float, optional (default=0.1) + Weighting factor. - return (1. + alpha * dom) * score + Returns + ------- + report : string + Text summary of the precision, recall, specificity, geometric mean, + and indexed balanced accuracy. 
+ + """ + + if labels is None: + labels = unique_labels(y_true, y_pred) + else: + labels = np.asarray(labels) + + last_line_heading = 'avg / total' + + if target_names is None: + target_names = ['%s' % l for l in labels] + name_width = max(len(cn) for cn in target_names) + width = max(name_width, len(last_line_heading), digits) + + headers = ["pre", "rec", "spe", "f1", + "geo", "iba", "sup"] + fmt = '%% %ds' % width # first column: class name + fmt += ' ' + fmt += ' '.join(['% 9s' for _ in headers]) + fmt += '\n' + + headers = [""] + headers + report = fmt % tuple(headers) + report += '\n' + + # Compute the different metrics + # Precision/recall/f1 + precision, recall, f1, support = precision_recall_fscore_support( + y_true, y_pred, + labels=labels, + average=None, + sample_weight=sample_weight) + # Specificity + specificity = specificity_score(y_true, y_pred, labels=labels, + average=None, sample_weight=sample_weight) + # Geometric mean + geo_mean = geometric_mean_score(y_pred, y_true, labels=labels, + average=None, sample_weight=sample_weight) + # Indexed balanced accuracy + iba_gmean = make_indexed_balanced_accuracy(alpha=alpha, squared=True)( + geometric_mean_score) + iba = iba_gmean(y_pred, y_true, labels=labels, average=None, + sample_weight=sample_weight) + + for i, label in enumerate(labels): + values = [target_names[i]] + for v in (precision[i], recall[i], specificity[i], + f1[i], geo_mean[i], iba[i]): + values += ["{0:0.{1}f}".format(v, digits)] + values += ["{0}".format(support[i])] + report += fmt % tuple(values) + + report += '\n' + + # compute averages + values = [last_line_heading] + for v in (np.average(precision, weights=support), + np.average(recall, weights=support), + np.average(specificity, weights=support), + np.average(f1, weights=support), + np.average(geo_mean, weights=support), + np.average(iba, weights=support)): + values += ["{0:0.{1}f}".format(v, digits)] + values += ['{0}'.format(np.sum(support))] + report += fmt % tuple(values) + return report diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index 5d0bb94ce..af021c0cb 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -2,6 +2,8 @@ from __future__ import division, print_function +import re + from functools import partial import numpy as np @@ -15,14 +17,16 @@ from sklearn import svm from sklearn.preprocessing import label_binarize -from sklearn.utils.testing import assert_not_equal +from sklearn.utils.fixes import np_version +from sklearn.utils.testing import assert_not_equal, assert_raise_message from sklearn.utils.validation import check_random_state from imblearn.metrics import sensitivity_specificity_support from imblearn.metrics import sensitivity_score from imblearn.metrics import specificity_score from imblearn.metrics import geometric_mean_score -from imblearn.metrics import indexed_balanced_accuracy_score +from imblearn.metrics import make_indexed_balanced_accuracy +from imblearn.metrics import classification_report_imbalanced RND_SEED = 42 @@ -79,7 +83,7 @@ def make_prediction(dataset=None, binary=False): def test_sensitivity_specificity_score_binary(): - # Test Sensitivity Specificity for binary classification task + """Test Sensitivity Specificity for binary classification task""" y_true, y_pred, _ = make_prediction(binary=True) # detailed measures for each class @@ -103,8 +107,8 @@ def test_sensitivity_specificity_score_binary(): def 
test_sensitivity_specificity_f_binary_single_class(): - # Test sensitivity and specificity behave with a single positive or - # negative class + """Test sensitivity and specificity behave with a single positive or + negative class""" # Such a case may occur with non-stratified cross-validation assert_equal(1., sensitivity_score([1, 1], [1, 1])) assert_equal(0., specificity_score([1, 1], [1, 1])) @@ -115,7 +119,7 @@ def test_sensitivity_specificity_f_binary_single_class(): @ignore_warnings def test_sensitivity_specificity_extra_labels(): - # Test handling of explicit additional (not in input) labels to SS + """Test handling of explicit additional (not in input) labels to SS""" y_true = [1, 3, 3, 2] y_pred = [1, 1, 3, 2] @@ -142,7 +146,7 @@ def test_sensitivity_specificity_extra_labels(): @ignore_warnings def test_sensitivity_specificity_ignored_labels(): - # Test a subset of labels may be requested for SS + """Test a subset of labels may be requested for SS""" y_true = [1, 1, 2, 3] y_pred = [1, 3, 3, 3] @@ -166,7 +170,7 @@ def test_sensitivity_specificity_ignored_labels(): def test_sensitivity_specificity_error_multilabels(): - # Test either if an error is raised when the input are multilabels + """Test either if an error is raised when the input are multilabels""" y_true = [1, 3, 3, 2] y_pred = [1, 1, 3, 2] y_true_bin = label_binarize(y_true, classes=np.arange(5)) @@ -177,6 +181,7 @@ def test_sensitivity_specificity_error_multilabels(): @ignore_warnings def test_sensitivity_specificity_support_errors(): + """Test either if an error is raised depending on parameters""" y_true, y_pred, _ = make_prediction(binary=True) # Bad pos_label @@ -196,8 +201,8 @@ def test_sensitivity_specificity_support_errors(): def test_sensitivity_specificity_unused_pos_label(): - # Check warning that pos_label unused when set to non-default value - # but average != 'binary'; even if data is binary. 
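As an aside to the test rewritten in the hunk just below: a minimal, illustrative sketch of the behaviour it asserts, assuming the `imblearn.metrics` module built up in this series is importable. This snippet is not part of the patch; the input arrays are made up for demonstration.

import warnings

from imblearn.metrics import sensitivity_score

y_true = [0, 1, 1, 0]
y_pred = [0, 1, 0, 0]

# pos_label=2 is ignored because average='macro'; the metric still computes,
# but a UserWarning explains that labels=[2] would be needed to target one class.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    score = sensitivity_score(y_true, y_pred, pos_label=2, average='macro')

print(score)                                         # macro-averaged sensitivity
print(caught[0].category.__name__, caught[0].message)  # the pos_label warning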
+ """Check warning that pos_label unused when set to non-default value + # but average != 'binary'; even if data is binary""" assert_warns_message( UserWarning, "Note that pos_label (set to 2) is " @@ -220,7 +225,7 @@ def test_geometric_mean_support_binary(): def test_geometric_mean_multiclass(): - # Test geometric mean for multiclass classification task + """Test geometric mean for multiclass classification task""" y_true, y_pred, _ = make_prediction(binary=False) # Compute the geometric mean for each of the classes @@ -239,7 +244,137 @@ def test_iba_geo_mean_binary(): """Test to test the iba using the geometric mean""" y_true, y_pred, _ = make_prediction(binary=True) - iba = indexed_balanced_accuracy_score( - geometric_mean_score, y_true, y_pred, alpha=0.5, squared=True) + iba_gmean = make_indexed_balanced_accuracy(alpha=0.5, squared=True)( + geometric_mean_score) + iba = iba_gmean(y_true, y_pred) assert_almost_equal(iba, 0.54, 2) + +def _format_report(report): + """Private function to reformat the report for testing""" + + return ' '.join(report.split()) + + +def test_classification_report_imbalanced_multiclass(): + """Test classification report for multiclass problem""" + iris = datasets.load_iris() + y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) + + # print classification report with class names + expected_report = ("pre rec spe f1 geo iba sup setosa 0.83 0.79 0.92 0.81 " + "0.86 0.72 24 versicolor 0.33 0.10 0.86 0.15 0.44 " + "0.08 31 virginica 0.42 0.90 0.55 0.57 0.63 0.51 20 " + "avg / total 0.51 0.53 0.80 0.47 0.62 0.40 75") + + report = classification_report_imbalanced( + y_true, y_pred, labels=np.arange(len(iris.target_names)), + target_names=iris.target_names) + assert_equal(_format_report(report), expected_report) + # print classification report with label detection + expected_report = ("pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 " + "0.86 0.72 24 1 0.33 0.10 0.86 0.15 0.44 0.08 31 2 " + "0.42 0.90 0.55 0.57 0.63 0.51 20 avg / total 0.51 " + "0.53 0.80 0.47 0.62 0.40 75") + + report = classification_report_imbalanced(y_true, y_pred) + assert_equal(_format_report(report), expected_report) + + +def test_classification_report_imbalanced_multiclass_with_digits(): + """Test performance report with added digits in floating point values""" + iris = datasets.load_iris() + y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) + + # print classification report with class names + expected_report = ("pre rec spe f1 geo iba sup setosa 0.82609 0.79167 " + "0.92157 0.80851 0.86409 0.72010 24 versicolor 0.33333 " + "0.09677 0.86364 0.15000 0.43809 0.07717 31 virginica " + "0.41860 0.90000 0.54545 0.57143 0.62645 0.50831 20 " + "avg / total 0.51375 0.53333 0.79733 0.47310 0.62464 " + "0.39788 75") + report = classification_report_imbalanced( + y_true, y_pred, labels=np.arange(len(iris.target_names)), + target_names=iris.target_names, digits=5) + assert_equal(_format_report(report), expected_report) + # print classification report with label detection + expected_report = ("pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 " + "0.86 0.72 24 1 0.33 0.10 0.86 0.15 0.44 0.08 31 " + "2 0.42 0.90 0.55 0.57 0.63 0.51 20 " + "avg / total 0.51 0.53 0.80 0.47 0.62 0.40 75") + report = classification_report_imbalanced(y_true, y_pred) + assert_equal(_format_report(report), expected_report) + + +def test_classification_report_imbalanced_multiclass_with_string_label(): + """Test the report with string label""" + y_true, y_pred, _ = make_prediction(binary=False) + + y_true = 
np.array(["blue", "green", "red"])[y_true] + y_pred = np.array(["blue", "green", "red"])[y_pred] + + expected_report = """\ + precision recall f1-score support + blue 0.83 0.79 0.81 24 + green 0.33 0.10 0.15 31 + red 0.42 0.90 0.57 20 +avg / total 0.51 0.53 0.47 75 +""" + report = classification_report_imbalanced(y_true, y_pred) + assert_equal(report, expected_report) + + expected_report = """\ + precision recall f1-score support + a 0.83 0.79 0.81 24 + b 0.33 0.10 0.15 31 + c 0.42 0.90 0.57 20 +avg / total 0.51 0.53 0.47 75 +""" + report = classification_report_imbalanced(y_true, y_pred, + target_names=["a", "b", "c"]) + assert_equal(report, expected_report) + + +def test_classification_report_imbalanced_multiclass_with_unicode_label(): + """Test classification report with unicode label""" + y_true, y_pred, _ = make_prediction(binary=False) + + labels = np.array([u"blue\xa2", u"green\xa2", u"red\xa2"]) + y_true = labels[y_true] + y_pred = labels[y_pred] + + expected_report = u"""\ + precision recall f1-score support + blue\xa2 0.83 0.79 0.81 24 + green\xa2 0.33 0.10 0.15 31 + red\xa2 0.42 0.90 0.57 20 +avg / total 0.51 0.53 0.47 75 +""" + if np_version[:3] < (1, 7, 0): + expected_message = ("NumPy < 1.7.0 does not implement" + " searchsorted on unicode data correctly.") + assert_raise_message(RuntimeError, expected_message, + classification_report_imbalanced, y_true, y_pred) + else: + report = classification_report_imbalanced(y_true, y_pred) + assert_equal(report, expected_report) + + +def test_classification_report_imbalanced_multiclass_with_long_string_label(): + """Test classification report with long string label""" + y_true, y_pred, _ = make_prediction(binary=False) + + labels = np.array(["blue", "green"*5, "red"]) + y_true = labels[y_true] + y_pred = labels[y_pred] + + expected_report = """\ + precision recall f1-score support + blue 0.83 0.79 0.81 24 +greengreengreengreengreen 0.33 0.10 0.15 31 + red 0.42 0.90 0.57 20 + avg / total 0.51 0.53 0.47 75 +""" + + report = classification_report_imbalanced(y_true, y_pred) + assert_equal(report, expected_report) From 154749e638f173620163f433c502c496916d6756 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 23 Dec 2016 18:46:46 +0100 Subject: [PATCH 15/21] Implemented the test for the report --- doc/api.rst | 2 +- imblearn/metrics/tests/test_classification.py | 110 +++++++++--------- 2 files changed, 53 insertions(+), 59 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 272c7d22e..faec5f92b 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -138,7 +138,7 @@ Functions metrics.sensitivity_score metrics.specificity_score metrics.geometric_mean_score - metrics.indexed_balanced_accuracy_score + metrics.make_indexed_balanced_accuracy .. 
_datasets_ref: diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index af021c0cb..8142937d4 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -244,12 +244,13 @@ def test_iba_geo_mean_binary(): """Test to test the iba using the geometric mean""" y_true, y_pred, _ = make_prediction(binary=True) - iba_gmean = make_indexed_balanced_accuracy(alpha=0.5, squared=True)( - geometric_mean_score) + iba_gmean = make_indexed_balanced_accuracy( + alpha=0.5, squared=True)(geometric_mean_score) iba = iba_gmean(y_true, y_pred) assert_almost_equal(iba, 0.54, 2) + def _format_report(report): """Private function to reformat the report for testing""" @@ -262,20 +263,22 @@ def test_classification_report_imbalanced_multiclass(): y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) # print classification report with class names - expected_report = ("pre rec spe f1 geo iba sup setosa 0.83 0.79 0.92 0.81 " - "0.86 0.72 24 versicolor 0.33 0.10 0.86 0.15 0.44 " - "0.08 31 virginica 0.42 0.90 0.55 0.57 0.63 0.51 20 " - "avg / total 0.51 0.53 0.80 0.47 0.62 0.40 75") + expected_report = ('pre rec spe f1 geo iba sup setosa 0.83 0.79 0.92 ' + '0.81 0.86 0.74 24 versicolor 0.33 0.10 0.86 0.15 ' + '0.44 0.19 31 virginica 0.42 0.90 0.55 0.57 0.63 ' + '0.37 20 avg / total 0.51 0.53 0.80 0.47 0.62 0.41 75') report = classification_report_imbalanced( - y_true, y_pred, labels=np.arange(len(iris.target_names)), + y_true, + y_pred, + labels=np.arange(len(iris.target_names)), target_names=iris.target_names) assert_equal(_format_report(report), expected_report) # print classification report with label detection - expected_report = ("pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 " - "0.86 0.72 24 1 0.33 0.10 0.86 0.15 0.44 0.08 31 2 " - "0.42 0.90 0.55 0.57 0.63 0.51 20 avg / total 0.51 " - "0.53 0.80 0.47 0.62 0.40 75") + expected_report = ('pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 ' + '0.86 0.74 24 1 0.33 0.10 0.86 0.15 0.44 0.19 31 2 ' + '0.42 0.90 0.55 0.57 0.63 0.37 20 avg / total 0.51 ' + '0.53 0.80 0.47 0.62 0.41 75') report = classification_report_imbalanced(y_true, y_pred) assert_equal(_format_report(report), expected_report) @@ -287,21 +290,24 @@ def test_classification_report_imbalanced_multiclass_with_digits(): y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) # print classification report with class names - expected_report = ("pre rec spe f1 geo iba sup setosa 0.82609 0.79167 " - "0.92157 0.80851 0.86409 0.72010 24 versicolor 0.33333 " - "0.09677 0.86364 0.15000 0.43809 0.07717 31 virginica " - "0.41860 0.90000 0.54545 0.57143 0.62645 0.50831 20 " - "avg / total 0.51375 0.53333 0.79733 0.47310 0.62464 " - "0.39788 75") + expected_report = ('pre rec spe f1 geo iba sup setosa 0.82609 0.79167 ' + '0.92157 0.80851 0.86409 0.74085 24 versicolor ' + '0.33333 0.09677 0.86364 0.15000 0.43809 0.18727 31 ' + 'virginica 0.41860 0.90000 0.54545 0.57143 0.62645 ' + '0.37208 20 avg / total 0.51375 0.53333 0.79733 ' + '0.47310 0.62464 0.41370 75') report = classification_report_imbalanced( - y_true, y_pred, labels=np.arange(len(iris.target_names)), - target_names=iris.target_names, digits=5) + y_true, + y_pred, + labels=np.arange(len(iris.target_names)), + target_names=iris.target_names, + digits=5) assert_equal(_format_report(report), expected_report) # print classification report with label detection - expected_report = ("pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 " - 
"0.86 0.72 24 1 0.33 0.10 0.86 0.15 0.44 0.08 31 " - "2 0.42 0.90 0.55 0.57 0.63 0.51 20 " - "avg / total 0.51 0.53 0.80 0.47 0.62 0.40 75") + expected_report = ('pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 ' + '0.86 0.74 24 1 0.33 0.10 0.86 0.15 0.44 0.19 31 2 ' + '0.42 0.90 0.55 0.57 0.63 0.37 20 avg / total 0.51 ' + '0.53 0.80 0.47 0.62 0.41 75') report = classification_report_imbalanced(y_true, y_pred) assert_equal(_format_report(report), expected_report) @@ -313,26 +319,20 @@ def test_classification_report_imbalanced_multiclass_with_string_label(): y_true = np.array(["blue", "green", "red"])[y_true] y_pred = np.array(["blue", "green", "red"])[y_pred] - expected_report = """\ - precision recall f1-score support - blue 0.83 0.79 0.81 24 - green 0.33 0.10 0.15 31 - red 0.42 0.90 0.57 20 -avg / total 0.51 0.53 0.47 75 -""" + expected_report = ('pre rec spe f1 geo iba sup blue 0.83 0.79 0.92 ' + '0.81 0.86 0.74 24 green 0.33 0.10 0.86 0.15 0.44 ' + '0.19 31 red 0.42 0.90 0.55 0.57 0.63 0.37 20 ' + 'avg / total 0.51 0.53 0.80 0.47 0.62 0.41 75') report = classification_report_imbalanced(y_true, y_pred) - assert_equal(report, expected_report) + assert_equal(_format_report(report), expected_report) - expected_report = """\ - precision recall f1-score support - a 0.83 0.79 0.81 24 - b 0.33 0.10 0.15 31 - c 0.42 0.90 0.57 20 -avg / total 0.51 0.53 0.47 75 -""" - report = classification_report_imbalanced(y_true, y_pred, - target_names=["a", "b", "c"]) - assert_equal(report, expected_report) + expected_report = ('pre rec spe f1 geo iba sup a 0.83 0.79 0.92 0.81 ' + '0.86 0.74 24 b 0.33 0.10 0.86 0.15 0.44 0.19 31 ' + 'c 0.42 0.90 0.55 0.57 0.63 0.37 20 avg / total ' + '0.51 0.53 0.80 0.47 0.62 0.41 75') + report = classification_report_imbalanced( + y_true, y_pred, target_names=["a", "b", "c"]) + assert_equal(_format_report(report), expected_report) def test_classification_report_imbalanced_multiclass_with_unicode_label(): @@ -343,13 +343,10 @@ def test_classification_report_imbalanced_multiclass_with_unicode_label(): y_true = labels[y_true] y_pred = labels[y_pred] - expected_report = u"""\ - precision recall f1-score support - blue\xa2 0.83 0.79 0.81 24 - green\xa2 0.33 0.10 0.15 31 - red\xa2 0.42 0.90 0.57 20 -avg / total 0.51 0.53 0.47 75 -""" + expected_report = (u'pre rec spe f1 geo iba sup blue\xa2 0.83 0.79 ' + u'0.92 0.81 0.86 0.74 24 green\xa2 0.33 0.10 0.86 ' + u'0.15 0.44 0.19 31 red\xa2 0.42 0.90 0.55 0.57 0.63 ' + u'0.37 20 avg / total 0.51 0.53 0.80 0.47 0.62 0.41 75') if np_version[:3] < (1, 7, 0): expected_message = ("NumPy < 1.7.0 does not implement" " searchsorted on unicode data correctly.") @@ -357,24 +354,21 @@ def test_classification_report_imbalanced_multiclass_with_unicode_label(): classification_report_imbalanced, y_true, y_pred) else: report = classification_report_imbalanced(y_true, y_pred) - assert_equal(report, expected_report) + assert_equal(_format_report(report), expected_report) def test_classification_report_imbalanced_multiclass_with_long_string_label(): """Test classification report with long string label""" y_true, y_pred, _ = make_prediction(binary=False) - labels = np.array(["blue", "green"*5, "red"]) + labels = np.array(["blue", "green" * 5, "red"]) y_true = labels[y_true] y_pred = labels[y_pred] - expected_report = """\ - precision recall f1-score support - blue 0.83 0.79 0.81 24 -greengreengreengreengreen 0.33 0.10 0.15 31 - red 0.42 0.90 0.57 20 - avg / total 0.51 0.53 0.47 75 -""" + expected_report = ('pre rec spe f1 geo iba sup blue 0.83 0.79 0.92 
0.81 ' + '0.86 0.74 24 greengreengreengreengreen 0.33 0.10 ' + '0.86 0.15 0.44 0.19 31 red 0.42 0.90 0.55 0.57 0.63 ' + '0.37 20 avg / total 0.51 0.53 0.80 0.47 0.62 0.41 75') report = classification_report_imbalanced(y_true, y_pred) - assert_equal(report, expected_report) + assert_equal(_format_report(report), expected_report) From 3da16046539d546bf145bd6127cc02eefaa3d11c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 23 Dec 2016 18:56:28 +0100 Subject: [PATCH 16/21] PEP8 --- imblearn/metrics/classification.py | 62 ++++++++++++------- imblearn/metrics/tests/test_classification.py | 2 - 2 files changed, 41 insertions(+), 23 deletions(-) diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index dcc13f82a..f3b7a58f4 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -536,6 +536,7 @@ def make_indexed_balanced_accuracy(alpha=0.1, squared=True): >>> print(gmean(y_true, y_pred, average=None)) [ 0.44444444 0.44444444] """ + def decorate(scoring_func): @functools.wraps(scoring_func) def compute_score(*args, **kwargs): @@ -551,14 +552,20 @@ def compute_score(*args, **kwargs): average = kwargs.get('average', 'binary') sample_weight = kwargs.get('sample_weight', None) # Compute the sensitivity and specificity - dict_sen_spe = {'labels': labels, 'pos_label': pos_label, - 'average': average, 'sample_weight': sample_weight} + dict_sen_spe = { + 'labels': labels, + 'pos_label': pos_label, + 'average': average, + 'sample_weight': sample_weight + } sen, spe, _ = sensitivity_specificity_support(*args, **dict_sen_spe) # Compute the dominance dom = sen - spe return (1. + alpha * dom) * _score + return compute_score + return decorate @@ -625,8 +632,7 @@ def classification_report_imbalanced(y_true, name_width = max(len(cn) for cn in target_names) width = max(name_width, len(last_line_heading), digits) - headers = ["pre", "rec", "spe", "f1", - "geo", "iba", "sup"] + headers = ["pre", "rec", "spe", "f1", "geo", "iba", "sup"] fmt = '%% %ds' % width # first column: class name fmt += ' ' fmt += ' '.join(['% 9s' for _ in headers]) @@ -639,26 +645,39 @@ def classification_report_imbalanced(y_true, # Compute the different metrics # Precision/recall/f1 precision, recall, f1, support = precision_recall_fscore_support( - y_true, y_pred, + y_true, + y_pred, labels=labels, average=None, sample_weight=sample_weight) # Specificity - specificity = specificity_score(y_true, y_pred, labels=labels, - average=None, sample_weight=sample_weight) + specificity = specificity_score( + y_true, + y_pred, + labels=labels, + average=None, + sample_weight=sample_weight) # Geometric mean - geo_mean = geometric_mean_score(y_pred, y_true, labels=labels, - average=None, sample_weight=sample_weight) + geo_mean = geometric_mean_score( + y_pred, + y_true, + labels=labels, + average=None, + sample_weight=sample_weight) # Indexed balanced accuracy - iba_gmean = make_indexed_balanced_accuracy(alpha=alpha, squared=True)( - geometric_mean_score) - iba = iba_gmean(y_pred, y_true, labels=labels, average=None, - sample_weight=sample_weight) + iba_gmean = make_indexed_balanced_accuracy( + alpha=alpha, squared=True)(geometric_mean_score) + iba = iba_gmean( + y_pred, + y_true, + labels=labels, + average=None, + sample_weight=sample_weight) for i, label in enumerate(labels): values = [target_names[i]] - for v in (precision[i], recall[i], specificity[i], - f1[i], geo_mean[i], iba[i]): + for v in (precision[i], recall[i], specificity[i], f1[i], geo_mean[i], + iba[i]): values 
+= ["{0:0.{1}f}".format(v, digits)] values += ["{0}".format(support[i])] report += fmt % tuple(values) @@ -667,12 +686,13 @@ def classification_report_imbalanced(y_true, # compute averages values = [last_line_heading] - for v in (np.average(precision, weights=support), - np.average(recall, weights=support), - np.average(specificity, weights=support), - np.average(f1, weights=support), - np.average(geo_mean, weights=support), - np.average(iba, weights=support)): + for v in (np.average( + precision, weights=support), np.average( + recall, weights=support), np.average( + specificity, weights=support), np.average( + f1, weights=support), np.average( + geo_mean, weights=support), np.average( + iba, weights=support)): values += ["{0:0.{1}f}".format(v, digits)] values += ['{0}'.format(np.sum(support))] report += fmt % tuple(values) diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index 8142937d4..a9a938c1b 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -2,8 +2,6 @@ from __future__ import division, print_function -import re - from functools import partial import numpy as np From ca145e9c6e2ea81dedebd0956cf7a09989545737 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 25 Dec 2016 18:22:31 +0100 Subject: [PATCH 17/21] Modify the test for scorer --- imblearn/metrics/tests/test_score_objects.py | 104 +++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 imblearn/metrics/tests/test_score_objects.py diff --git a/imblearn/metrics/tests/test_score_objects.py b/imblearn/metrics/tests/test_score_objects.py new file mode 100644 index 000000000..e22a64495 --- /dev/null +++ b/imblearn/metrics/tests/test_score_objects.py @@ -0,0 +1,104 @@ +from numpy.testing import assert_almost_equal + +from sklearn.datasets import make_blobs +from sklearn.model_selection import train_test_split, GridSearchCV +from sklearn.metrics import make_scorer +from sklearn.svm import LinearSVC + +from imblearn.metrics import (sensitivity_score, specificity_score, + geometric_mean_score, + make_indexed_balanced_accuracy) + + +def test_imblearn_classification_scorers(): + """Test if the implemented scorer can be used in scikit-learn""" + X, y = make_blobs(random_state=0, centers=2) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf = LinearSVC(random_state=0) + clf.fit(X_train, y_train) + + # sensitivity scorer + scorer = make_scorer(sensitivity_score, pos_label=None, average='macro') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + scorer = make_scorer(sensitivity_score, pos_label=None, average='weighted') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + scorer = make_scorer(sensitivity_score, pos_label=None, average='micro') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + scorer = make_scorer(sensitivity_score, pos_label=1) + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + # specificity scorer + scorer = make_scorer(specificity_score, pos_label=None, 
average='macro') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + scorer = make_scorer(specificity_score, pos_label=None, average='weighted') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + scorer = make_scorer(specificity_score, pos_label=None, average='micro') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + scorer = make_scorer(specificity_score, pos_label=1) + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.95, 2) + + # geometric_mean scorer + scorer = make_scorer(geometric_mean_score, pos_label=None, average='macro') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + scorer = make_scorer( + geometric_mean_score, pos_label=None, average='weighted') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + scorer = make_scorer(geometric_mean_score, pos_label=None, average='micro') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + scorer = make_scorer(geometric_mean_score, pos_label=1) + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + # make a iba metric before a scorer + geo_mean_iba = make_indexed_balanced_accuracy()(geometric_mean_score) + scorer = make_scorer(geo_mean_iba, pos_label=None, average='macro') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.85, 2) + + scorer = make_scorer(geo_mean_iba, pos_label=None, average='weighted') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.85, 2) + + scorer = make_scorer(geo_mean_iba, pos_label=None, average='micro') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.85, 2) + + scorer = make_scorer(geo_mean_iba, pos_label=1) + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.84, 2) From 115e817b7bb8618048590a29b1cd32ffb9d3a3d3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 25 Dec 2016 18:48:34 +0100 Subject: [PATCH 18/21] handle the model_selection in the testing --- imblearn/metrics/tests/test_score_objects.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/imblearn/metrics/tests/test_score_objects.py b/imblearn/metrics/tests/test_score_objects.py index e22a64495..1faf35191 100644 --- a/imblearn/metrics/tests/test_score_objects.py +++ 
b/imblearn/metrics/tests/test_score_objects.py @@ -1,7 +1,15 @@ from numpy.testing import assert_almost_equal +import sklearn +# Get the version +(major, minor, _) = sklearn.__version__.split('.') +if minor < 18: + from sklearn.cross_validation import train_test_split + from sklearn.grid_search import GridSearchCV +else: + from sklearn.model_selection import train_test_split, GridSearchCV + from sklearn.datasets import make_blobs -from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.metrics import make_scorer from sklearn.svm import LinearSVC From a740ddf6540403d6164bfbea2488a71426033e5f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 25 Dec 2016 19:08:25 +0100 Subject: [PATCH 19/21] Solve the import issue --- imblearn/metrics/tests/test_score_objects.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/imblearn/metrics/tests/test_score_objects.py b/imblearn/metrics/tests/test_score_objects.py index 1faf35191..7232264f3 100644 --- a/imblearn/metrics/tests/test_score_objects.py +++ b/imblearn/metrics/tests/test_score_objects.py @@ -1,13 +1,6 @@ from numpy.testing import assert_almost_equal import sklearn -# Get the version -(major, minor, _) = sklearn.__version__.split('.') -if minor < 18: - from sklearn.cross_validation import train_test_split - from sklearn.grid_search import GridSearchCV -else: - from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.datasets import make_blobs from sklearn.metrics import make_scorer @@ -16,6 +9,13 @@ from imblearn.metrics import (sensitivity_score, specificity_score, geometric_mean_score, make_indexed_balanced_accuracy) +# Get the version +(major, minor, _) = sklearn.__version__.split('.') +if int(minor) < 18: + from sklearn.cross_validation import train_test_split + from sklearn.grid_search import GridSearchCV +else: + from sklearn.model_selection import train_test_split, GridSearchCV def test_imblearn_classification_scorers(): From 0b36f6e6c1e383f4904c861c091c866211570755 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 28 Dec 2016 00:36:11 +0100 Subject: [PATCH 20/21] correct the name of IBA --- doc/api.rst | 2 +- imblearn/metrics/__init__.py | 4 ++-- imblearn/metrics/classification.py | 6 +++--- imblearn/metrics/tests/test_classification.py | 4 ++-- imblearn/metrics/tests/test_score_objects.py | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index faec5f92b..cdd56c6ce 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -138,7 +138,7 @@ Functions metrics.sensitivity_score metrics.specificity_score metrics.geometric_mean_score - metrics.make_indexed_balanced_accuracy + metrics.make_index_balanced_accuracy .. 
_datasets_ref: diff --git a/imblearn/metrics/__init__.py b/imblearn/metrics/__init__.py index 5cbe3b2f1..037a200d9 100644 --- a/imblearn/metrics/__init__.py +++ b/imblearn/metrics/__init__.py @@ -7,11 +7,11 @@ from .classification import sensitivity_score from .classification import specificity_score from .classification import geometric_mean_score -from .classification import make_indexed_balanced_accuracy +from .classification import make_index_balanced_accuracy from .classification import classification_report_imbalanced __all__ = [ 'sensitivity_specificity_support', 'sensitivity_score', 'specificity_score', 'geometric_mean_score', - 'make_indexed_balanced_accuracy', 'classification_report_imbalanced' + 'make_index_balanced_accuracy', 'classification_report_imbalanced' ] diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index f3b7a58f4..e145a26f7 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -504,7 +504,7 @@ def geometric_mean_score(y_true, return np.sqrt(sen * spe) -def make_indexed_balanced_accuracy(alpha=0.1, squared=True): +def make_index_balanced_accuracy(alpha=0.1, squared=True): """Balance any scoring function using the indexed balanced accuracy This factory function wraps scoring function to express it as the @@ -529,7 +529,7 @@ def make_indexed_balanced_accuracy(alpha=0.1, squared=True): Examples -------- >>> from imblearn.metrics import geometric_mean_score as gmean - >>> from imblearn.metrics import make_indexed_balanced_accuracy as iba + >>> from imblearn.metrics import make_index_balanced_accuracy as iba >>> gmean = iba(alpha=0.1, squared=True)(gmean) >>> y_true = [1, 0, 0, 1, 0, 1] >>> y_pred = [0, 0, 1, 1, 0, 1] @@ -665,7 +665,7 @@ def classification_report_imbalanced(y_true, average=None, sample_weight=sample_weight) # Indexed balanced accuracy - iba_gmean = make_indexed_balanced_accuracy( + iba_gmean = make_index_balanced_accuracy( alpha=alpha, squared=True)(geometric_mean_score) iba = iba_gmean( y_pred, diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index a9a938c1b..ab858fe81 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -23,7 +23,7 @@ from imblearn.metrics import sensitivity_score from imblearn.metrics import specificity_score from imblearn.metrics import geometric_mean_score -from imblearn.metrics import make_indexed_balanced_accuracy +from imblearn.metrics import make_index_balanced_accuracy from imblearn.metrics import classification_report_imbalanced RND_SEED = 42 @@ -242,7 +242,7 @@ def test_iba_geo_mean_binary(): """Test to test the iba using the geometric mean""" y_true, y_pred, _ = make_prediction(binary=True) - iba_gmean = make_indexed_balanced_accuracy( + iba_gmean = make_index_balanced_accuracy( alpha=0.5, squared=True)(geometric_mean_score) iba = iba_gmean(y_true, y_pred) diff --git a/imblearn/metrics/tests/test_score_objects.py b/imblearn/metrics/tests/test_score_objects.py index 7232264f3..65a28cc98 100644 --- a/imblearn/metrics/tests/test_score_objects.py +++ b/imblearn/metrics/tests/test_score_objects.py @@ -8,7 +8,7 @@ from imblearn.metrics import (sensitivity_score, specificity_score, geometric_mean_score, - make_indexed_balanced_accuracy) + make_index_balanced_accuracy) # Get the version (major, minor, _) = sklearn.__version__.split('.') if int(minor) < 18: @@ -90,7 +90,7 @@ def test_imblearn_classification_scorers(): 
assert_almost_equal(grid.best_score_, 0.92, 2) # make a iba metric before a scorer - geo_mean_iba = make_indexed_balanced_accuracy()(geometric_mean_score) + geo_mean_iba = make_index_balanced_accuracy()(geometric_mean_score) scorer = make_scorer(geo_mean_iba, pos_label=None, average='macro') grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) grid.fit(X_train, y_train).predict(X_test) From c54197c3ba25ad5866bab803241772f5a1b1903e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 28 Dec 2016 01:07:28 +0100 Subject: [PATCH 21/21] Add example for each function --- imblearn/metrics/classification.py | 82 ++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index e145a26f7..6937c892c 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -112,6 +112,19 @@ def sensitivity_specificity_support(y_true, shape (n_unique_labels, ) The number of occurrences of each label in ``y_true``. + Examples + -------- + >>> import numpy as np + >>> from imblearn.metrics import sensitivity_specificity_support + >>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig']) + >>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog']) + >>> sensitivity_specificity_support(y_true, y_pred, average='macro') + (0.33333333333333331, 0.66666666666666663, None) + >>> sensitivity_specificity_support(y_true, y_pred, average='micro') + (0.33333333333333331, 0.66666666666666663, None) + >>> sensitivity_specificity_support(y_true, y_pred, average='weighted') + (0.33333333333333331, 0.66666666666666663, None) + References ---------- .. [1] `Wikipedia entry for the Sensitivity and specificity @@ -303,6 +316,21 @@ def sensitivity_score(y_true, sample_weight : ndarray, shape (n_samples, ) Sample weights. + Examples + -------- + >>> import numpy as np + >>> from imblearn.metrics import sensitivity_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> sensitivity_score(y_true, y_pred, average='macro') + 0.33333333333333331 + >>> sensitivity_score(y_true, y_pred, average='micro') + 0.33333333333333331 + >>> sensitivity_score(y_true, y_pred, average='weighted') + 0.33333333333333331 + >>> sensitivity_score(y_true, y_pred, average=None) + array([ 1., 0., 0.]) + Returns ------- specificity : float (if ``average`` = None) or ndarray, \ @@ -387,6 +415,21 @@ def specificity_score(y_true, sample_weight : ndarray, shape (n_samples, ) Sample weights. 
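Before the `Examples` block that the next hunk adds for `specificity_score`, a small cross-check sketch (not part of the patch, and assuming the API as defined in this series): in the binary case the specificity of the positive class coincides with the sensitivity of the negative class, since both count correctly retrieved negatives.

from imblearn.metrics import sensitivity_score, specificity_score

y_true = [0, 1, 1, 0, 1, 0]
y_pred = [0, 1, 0, 0, 0, 1]

# Specificity of class 1 is tn / (tn + fp), i.e. the fraction of class-0
# samples predicted as 0 -- exactly the sensitivity of class 0.
print(specificity_score(y_true, y_pred, pos_label=1))
print(sensitivity_score(y_true, y_pred, pos_label=0))  # same value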
+ Examples + -------- + >>> import numpy as np + >>> from imblearn.metrics import specificity_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> specificity_score(y_true, y_pred, average='macro') + 0.66666666666666663 + >>> specificity_score(y_true, y_pred, average='micro') + 0.66666666666666663 + >>> specificity_score(y_true, y_pred, average='weighted') + 0.66666666666666663 + >>> specificity_score(y_true, y_pred, average=None) + array([ 0.75, 0.5 , 0.75]) + Returns ------- specificity : float (if ``average`` = None) or ndarray, \ @@ -480,6 +523,21 @@ def geometric_mean_score(y_true, geometric_mean : float (if ``average`` = None) or ndarray, \ shape (n_unique_labels, ) + Examples + -------- + >>> import numpy as np + >>> from imblearn.metrics import geometric_mean_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> geometric_mean_score(y_true, y_pred, average='macro') + 0.47140452079103168 + >>> geometric_mean_score(y_true, y_pred, average='micro') + 0.47140452079103168 + >>> geometric_mean_score(y_true, y_pred, average='weighted') + 0.47140452079103168 + >>> geometric_mean_score(y_true, y_pred, average=None) + array([ 0.8660254, 0. , 0. ]) + References ---------- .. [1] Kubat, M. and Matwin, S. "Addressing the curse of @@ -618,6 +676,30 @@ def classification_report_imbalanced(y_true, Text summary of the precision, recall, specificity, geometric mean, and indexed balanced accuracy. + Examples + -------- + >>> import numpy as np + >>> from imblearn.metrics import classification_report_imbalanced + >>> y_true = [0, 1, 2, 2, 2] + >>> y_pred = [0, 0, 2, 2, 1] # doctest : +NORMALIZE_WHITESPACE + >>> target_names = ['class 0', 'class 1', \ + 'class 2'] # doctest : +NORMALIZE_WHITESPACE + >>> print(classification_report_imbalanced(y_true, y_pred, \ + target_names=target_names)) + pre rec spe f1 geo iba\ + sup + + class 0 0.50 1.00 0.75 0.67 0.71 0.48\ + 1 + class 1 0.00 0.00 0.75 0.00 0.00 0.00\ + 1 + class 2 1.00 0.67 1.00 0.80 0.82 0.69\ + 3 + + avg / total 0.70 0.60 0.90 0.61 0.63 0.51\ + 5 + + """ if labels is None:
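To close the section, a hedged end-to-end sketch of how the pieces above are meant to compose, modelled on the scorer tests introduced in PATCH 17 and using the post-rename name `make_index_balanced_accuracy` from PATCH 20. It assumes scikit-learn >= 0.18 import paths and is illustrative rather than code taken from the patches.

from sklearn.datasets import make_blobs
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import LinearSVC

from imblearn.metrics import geometric_mean_score, make_index_balanced_accuracy

X, y = make_blobs(random_state=0, centers=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Decorate the geometric mean with the IBA factory, then wrap it as a
# scikit-learn scorer so it can drive model selection.
iba_gmean = make_index_balanced_accuracy(alpha=0.1, squared=True)(geometric_mean_score)
scorer = make_scorer(iba_gmean, average='macro')

grid = GridSearchCV(LinearSVC(random_state=0),
                    param_grid={'C': [1, 10]},
                    scoring=scorer)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)
print(iba_gmean(y_test, grid.predict(X_test), average='macro'))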