From 4afc7b9701c0a7e1cd7a6bbf01f4a69fee07abe6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 18 Dec 2016 17:16:16 +0100 Subject: [PATCH 01/21] Starting sensitivity specificity metric --- imblearn/__init__.py | 7 +- imblearn/metrics/__init__.py | 7 ++ imblearn/metrics/classification.py | 180 +++++++++++++++++++++++++++++ imblearn/setup.py | 2 + 4 files changed, 194 insertions(+), 2 deletions(-) create mode 100644 imblearn/metrics/__init__.py create mode 100644 imblearn/metrics/classification.py diff --git a/imblearn/__init__.py b/imblearn/__init__.py index bd58a9724..de1b4a106 100644 --- a/imblearn/__init__.py +++ b/imblearn/__init__.py @@ -10,6 +10,9 @@ ensemble Module which provides methods generating an ensemble of under-sampled subsets. +metrics + Module which provides metrics to quantified the classification performance + with imbalanced dataset. over_sampling Module which provides methods to under-sample a dataset. under-sampling @@ -31,6 +34,6 @@ # list all submodules available in imblearn and version __all__ = [ - 'combine', 'ensemble', 'over_sampling', 'under_sampling', 'pipeline', - '__version__' + 'combine', 'ensemble', 'metrics', 'over_sampling', 'under_sampling', + 'pipeline', '__version__' ] diff --git a/imblearn/metrics/__init__.py b/imblearn/metrics/__init__.py new file mode 100644 index 000000000..d7437fa7a --- /dev/null +++ b/imblearn/metrics/__init__.py @@ -0,0 +1,7 @@ +""" +The :mod:`imblearn.metrics` module includes score functions, performance +metrics and pairwise metrics and distance computations. +""" + +import numpy as np + diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py new file mode 100644 index 000000000..0688dad5f --- /dev/null +++ b/imblearn/metrics/classification.py @@ -0,0 +1,180 @@ +"""Metrics to assess performance on classification task given class prediction + +Functions named as ``*_score`` return a scalar value to maximize: the higher +the better + +Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize: +the lower the better +""" + +from __future__ import division + +import warnings + +import numpy as np + +from sklearn.metrics.classification import _check_targets +from sklearn.preprocessing import LabelEncoder +from sklearn.utils.fixes import bincount +from sklearn.utils.multiclass import unique_labels + + +def sensitivity_specificity_support(y_true, y_pred, labels=None, + pos_label=1, average=None, + warn_for=('sensitivity', 'specificity'), + sample_weight=None): + """Compute sensitivity, specificity, and support for each class + + The sensitivity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number + of true positives and ``fn`` the number of false negatives. The sensitivity + quantifies the ability to avoid false negatives_[1]. + + The specificity is the ratio ``tn / (tn + fp)`` where ``tn`` is the number + of true negatives and ``fn`` the number of false negatives. The specificity + quantifies the ability to avoid false positives_[1]. + + The support is the number of occurrences of each class in ``y_true``. + + If ``pos_label is None`` and in binary classification, this function + returns the average precision, recall and F-measure if ``average`` + is one of ``'micro'``, ``'macro'``, ``'weighted'`` or ``'samples'``. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. 
+ + beta : float, 1.0 by default + The strength of recall versus precision in the F-score. + + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. For multilabel targets, + labels are column indices. By default, all labels in ``y_true`` and + ``y_pred`` are used in sorted order. + + pos_label : str or int, 1 by default + The class to report if ``average='binary'`` and the data is binary. + If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. + + average : string, [None (default), 'binary', 'micro', 'macro', 'samples', \ + 'weighted'] + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + + warn_for : tuple or set, for internal use + This determines which warnings will be made in the case that this + function is being used to return only one of its metrics. + + Returns + ------- + sensitivity : float (if average is not None) or array of float, shape =\ + [n_unique_labels] + + specificity : float (if average is not None) or array of float, , shape =\ + [n_unique_labels] + + support : int (if average is not None) or array of int, shape =\ + [n_unique_labels] + The number of occurrences of each label in ``y_true``. + + References + ---------- + .. [1] `Wikipedia entry for the Sensitivity and specificity + `_ + + """ + + average_options = (None, 'micro', 'macro', 'weighted') + if average not in average_options and average != 'binary': + raise ValueError('average has to be one of ' + + str(average_options)) + + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + present_labels = unique_labels(y_true, y_pred) + + # We do not support multilabel for the moment + if y_type.startswith('multilabel'): + raise ValueError('Multilabel are not supported.') + + if average == 'binary': + if y_type == 'binary': + if pos_label not in present_labels: + if len(present_labels) < 2: + # Only negative labels + return (0., 0., 0) + else: + raise ValueError("pos_label=%r is not a valid label: %r" % + (pos_label, present_labels)) + labels = [pos_label] + else: + raise ValueError("Target is %s but average='binary'. Please " + "choose another average setting." % y_type) + elif pos_label not in (None, 1): + warnings.warn("Note that pos_label (set to %r) is ignored when " + "average != 'binary' (got %r). You may use " + "labels=[pos_label] to specify a single positive class." 
+ % (pos_label, average), UserWarning) + + if labels is None: + labels = present_labels + n_labels = None + else: + n_labels = len(labels) + labels = np.hstack([labels, np.setdiff1d(present_labels, labels, + assume_unique=True)]) + + le = LabelEncoder() + le.fit(labels) + y_true = le.transform(y_true) + y_pred = le.transform(y_pred) + sorted_labels = le.classes_ + + # In a leave out strategy and for each label, compute: + # TP, TN, FP, FN + list_tp = [(y_true == label) == (y_pred == label) + for label in sorted_labels] + list_tn = [(y_true != label) == (y_pred != label) + for label in sorted_labels] + list_fp = [(y_true == label) == (y_pred != label) + for label in sorted_labels] + list_fn = [(y_true != label) == (y_pred == label) + for label in sorted_labels] + + # Retain only selected labels + indices = np.searchsorted(sorted_labels, labels[:n_labels]) + list_tp = [tp[indices] for tp in list_tp] + list_tp = [tn[indices] for tn in list_tn] + list_tp = [fp[indices] for fp in list_fp] + list_tp = [fn[indices] for fn in list_fn] + + # # Compute the specificity and sensitivity for each label + # list_sp = [np.count_nonzero(tn) / (np.count_nonzero(tn) + + # np.count_nonzero(fp)) + # for tn, fp in zip(list_tn, list_fp)] + # list_se = [np.count_nonzero(tp) / (np.count_nonzero(tp) + + # np.count_nonzero(fn)) + # for tp, fn in zip(list_tp, list_fn)] diff --git a/imblearn/setup.py b/imblearn/setup.py index e9ca7f385..a5d543a81 100644 --- a/imblearn/setup.py +++ b/imblearn/setup.py @@ -7,6 +7,8 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('combine/tests') config.add_subpackage('ensemble') config.add_subpackage('ensemble/tests') + config.add_subpackage('metrics') + config.add_subpackage('metrics/tests') config.add_subpackage('over_sampling') config.add_subpackage('over_sampling/tests') config.add_subpackage('under_sampling') From 1bdcc186eb7115d3de17311fc51b9f1b76f93841 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 19 Dec 2016 22:34:41 +0100 Subject: [PATCH 02/21] start adding test --- imblearn/metrics/classification.py | 89 +++++--- imblearn/metrics/tests/test_classification.py | 204 ++++++++++++++++++ 2 files changed, 261 insertions(+), 32 deletions(-) create mode 100644 imblearn/metrics/tests/test_classification.py diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index 0688dad5f..bc39d0581 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -13,11 +13,13 @@ import numpy as np -from sklearn.metrics.classification import _check_targets +from sklearn.metrics.classification import (_check_targets, _prf_divide) from sklearn.preprocessing import LabelEncoder from sklearn.utils.fixes import bincount from sklearn.utils.multiclass import unique_labels +LOGGER = logging.getLogger(__name__) + def sensitivity_specificity_support(y_true, y_pred, labels=None, pos_label=1, average=None, @@ -36,8 +38,8 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, The support is the number of occurrences of each class in ``y_true``. If ``pos_label is None`` and in binary classification, this function - returns the average precision, recall and F-measure if ``average`` - is one of ``'micro'``, ``'macro'``, ``'weighted'`` or ``'samples'``. + returns the average sensitivity and specificity if ``average`` + is one of ``'micro'`` or 'weighted'``. 
Parameters ---------- @@ -47,9 +49,6 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, y_pred : 1d array-like, or label indicator array / sparse matrix Estimated targets as returned by a classifier. - beta : float, 1.0 by default - The strength of recall versus precision in the F-score. - labels : list, optional The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. Labels present in the data can be @@ -65,25 +64,20 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. - average : string, [None (default), 'binary', 'micro', 'macro', 'samples', \ - 'weighted'] + average : string, [None (default), 'binary', 'macro', 'weighted'] If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. - ``'micro'``: - Calculate metrics globally by counting the total true positives, - false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This - alters 'macro' to account for label imbalance; it can result in an - F-score that is not between precision and recall. + alters 'macro' to account for label imbalance. warn_for : tuple or set, for internal use This determines which warnings will be made in the case that this @@ -91,14 +85,14 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, Returns ------- - sensitivity : float (if average is not None) or array of float, shape =\ - [n_unique_labels] + sensitivity : float (if ``average`` = None) or ndarray, \ + shape(n_unique_labels,) - specificity : float (if average is not None) or array of float, , shape =\ - [n_unique_labels] + specificity : float (if ``average`` = None) or ndarray, \ + shape(n_unique_labels,) - support : int (if average is not None) or array of int, shape =\ - [n_unique_labels] + support : int (if ``average`` = None) or ndarray, \ + shape(n_unique_labels,) The number of occurrences of each label in ``y_true``. 
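A minimal, self-contained sketch of the two definitions above (the toy labels are invented for illustration and are not part of this patch); it computes sensitivity and specificity for a binary problem directly from the TP/FN/TN/FP counts:

import numpy as np

# Hypothetical binary ground truth and predictions; label 1 is the positive class.
y_true = np.array([0, 0, 0, 1, 1, 1, 1, 1])
y_pred = np.array([0, 0, 1, 1, 1, 1, 0, 0])

tp = np.sum((y_true == 1) & (y_pred == 1))  # 3 true positives
fn = np.sum((y_true == 1) & (y_pred == 0))  # 2 false negatives
tn = np.sum((y_true == 0) & (y_pred == 0))  # 2 true negatives
fp = np.sum((y_true == 0) & (y_pred == 1))  # 1 false positive

sensitivity = tp / (tp + fn)  # 3 / 5 = 0.6
specificity = tn / (tn + fp)  # 2 / 3 ~= 0.67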
References @@ -116,6 +110,9 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, y_type, y_true, y_pred = _check_targets(y_true, y_pred) present_labels = unique_labels(y_true, y_pred) + LOGGER.debug('The labels in the prediction and ground-truth are %s', + present_labels) + # We do not support multilabel for the moment if y_type.startswith('multilabel'): raise ValueError('Multilabel are not supported.') @@ -151,10 +148,12 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, le.fit(labels) y_true = le.transform(y_true) y_pred = le.transform(y_pred) - sorted_labels = le.classes_ + sorted_labels = le.classes_n # In a leave out strategy and for each label, compute: # TP, TN, FP, FN + # These list contain an array in which each sample is labeled as + # TP, TN, FP, FN list_tp = [(y_true == label) == (y_pred == label) for label in sorted_labels] list_tn = [(y_true != label) == (y_pred != label) @@ -164,17 +163,43 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, list_fn = [(y_true != label) == (y_pred == label) for label in sorted_labels] + # Compute the sum for each type + tp_sum = [bincount(tp, weights=sample_weight, minlength=len(labels)) + for tp in list_tp] + tn_sum = [bincount(tn, weights=sample_weight, minlength=len(labels)) + for tn in list_tn] + fp_sum = [bincount(fp, weights=sample_weight, minlength=len(labels)) + for fp in list_fp] + fn_sum = [bincount(fn, weights=sample_weight, minlength=len(labels)) + for fn in list_fn] + # Retain only selected labels indices = np.searchsorted(sorted_labels, labels[:n_labels]) - list_tp = [tp[indices] for tp in list_tp] - list_tp = [tn[indices] for tn in list_tn] - list_tp = [fp[indices] for fp in list_fp] - list_tp = [fn[indices] for fn in list_fn] - - # # Compute the specificity and sensitivity for each label - # list_sp = [np.count_nonzero(tn) / (np.count_nonzero(tn) + - # np.count_nonzero(fp)) - # for tn, fp in zip(list_tn, list_fp)] - # list_se = [np.count_nonzero(tp) / (np.count_nonzero(tp) + - # np.count_nonzero(fn)) - # for tp, fn in zip(list_tp, list_fn)] + tp_sum = [tp[indices] for tp in tp_sum] + tn_sum = [tn[indices] for tn in tn_sum] + fp_sum = [fp[indices] for fp in fp_sum] + fn_sum = [fn[indices] for fn in fn_sum] + + LOGGER.debug('Computed for each label the stats') + + # Compute the sensitivity and specificity + sensitivity = [_prf_divide(tp, tp + fn, 'sensitivity', 'tp + fn', average, + warn_for) for tp, fn in zip(tp_sum, fn_sum)] + specificity = [_prf_divide(tn, tn + fp, 'specificity', 'tn + fp', average, + warn_for) for tn, fp in zip(tn_sum, fp_sum)] + + # If we need to weight the results + if average == 'weighted': + weights = tp_sum + if weights.sum() == 0: + return 0, 0, None + else: + weights = None + + if average is not None: + assert average != 'binary' or len(sensitivity) == 1 + sensitivity = np.average(sensitivity, weights=weights) + specificity = np.average(specificity, weights=weights) + tp_sum = None + + return sensitivity, specificity, tp_sum diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py new file mode 100644 index 000000000..031a48887 --- /dev/null +++ b/imblearn/metrics/tests/test_classification.py @@ -0,0 +1,204 @@ +"""Testing the metric for classification with imbalanced dataset""" + +from __future__ import division, print_function + +import numpy as np + +from numpy.testing import (assert_array_almost_equal, assert_array_equal, + assert_no_warnings, assert_equal, + assert_almost_equal, assert_raises) +from 
sklearn.utils.testing import assert_warns_message, ignore_warnings + +from sklearn import datasets +from sklearn import svm + +from sklearn.utils.validation import check_random_state + +RND_SEED = 42 + +############################################################################### +# Utilities for testing + + +def make_prediction(dataset=None, binary=False): + """Make some classification predictions on a toy dataset using a SVC + If binary is True restrict to a binary classification problem instead of a + multiclass classification problem + """ + + if dataset is None: + # import some data to play with + dataset = datasets.load_iris() + + X = dataset.data + y = dataset.target + + if binary: + # restrict to a binary classification task + X, y = X[y < 2], y[y < 2] + + n_samples, n_features = X.shape + p = np.arange(n_samples) + + rng = check_random_state(37) + rng.shuffle(p) + X, y = X[p], y[p] + half = int(n_samples / 2) + + # add noisy features to make the problem harder and avoid perfect results + rng = np.random.RandomState(0) + X = np.c_[X, rng.randn(n_samples, 200 * n_features)] + + # run classifier, get class probabilities and label predictions + clf = svm.SVC(kernel='linear', probability=True, random_state=0) + probas_pred = clf.fit(X[:half], y[:half]).predict_proba(X[half:]) + + if binary: + # only interested in probabilities of the positive case + # XXX: do we really want a special API for the binary case? + probas_pred = probas_pred[:, 1] + + y_pred = clf.predict(X[half:]) + y_true = y[half:] + + return y_true, y_pred, probas_pred + + +############################################################################### +# Tests + +def test_precision_recall_f1_score_binary(): + # Test Precision Recall and F1 Score for binary classification task + y_true, y_pred, _ = make_prediction(binary=True) + + # detailed measures for each class + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) + assert_array_almost_equal(p, [0.73, 0.85], 2) + assert_array_almost_equal(r, [0.88, 0.68], 2) + assert_array_almost_equal(f, [0.80, 0.76], 2) + assert_array_equal(s, [25, 25]) + + # individual scoring function that can be used for grid search: in the + # binary class case the score is the value of the measure for the positive + # class (e.g. label == 1). This is deprecated for average != 'binary'. 
+ for kwargs, my_assert in [({}, assert_no_warnings), + ({'average': 'binary'}, assert_no_warnings)]: + ps = my_assert(precision_score, y_true, y_pred, **kwargs) + assert_array_almost_equal(ps, 0.85, 2) + + rs = my_assert(recall_score, y_true, y_pred, **kwargs) + assert_array_almost_equal(rs, 0.68, 2) + + fs = my_assert(f1_score, y_true, y_pred, **kwargs) + assert_array_almost_equal(fs, 0.76, 2) + + assert_almost_equal(my_assert(fbeta_score, y_true, y_pred, beta=2, + **kwargs), + (1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2) + + +def test_precision_recall_f_binary_single_class(): + # Test precision, recall and F1 score behave with a single positive or + # negative class + # Such a case may occur with non-stratified cross-validation + assert_equal(1., precision_score([1, 1], [1, 1])) + assert_equal(1., recall_score([1, 1], [1, 1])) + assert_equal(1., f1_score([1, 1], [1, 1])) + + assert_equal(0., precision_score([-1, -1], [-1, -1])) + assert_equal(0., recall_score([-1, -1], [-1, -1])) + assert_equal(0., f1_score([-1, -1], [-1, -1])) + + +@ignore_warnings +def test_precision_recall_f_extra_labels(): + # Test handling of explicit additional (not in input) labels to PRF + y_true = [1, 3, 3, 2] + y_pred = [1, 1, 3, 2] + y_true_bin = label_binarize(y_true, classes=np.arange(5)) + y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) + data = [(y_true, y_pred), + (y_true_bin, y_pred_bin)] + + for i, (y_true, y_pred) in enumerate(data): + # No average: zeros in array + actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], + average=None) + assert_array_almost_equal([0., 1., 1., .5, 0.], actual) + + # Macro average is changed + actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], + average='macro') + assert_array_almost_equal(np.mean([0., 1., 1., .5, 0.]), actual) + + # No effect otheriwse + for average in ['micro', 'weighted', 'samples']: + if average == 'samples' and i == 0: + continue + assert_almost_equal(recall_score(y_true, y_pred, + labels=[0, 1, 2, 3, 4], + average=average), + recall_score(y_true, y_pred, labels=None, + average=average)) + + # Error when introducing invalid label in multilabel case + # (although it would only affect performance if average='macro'/None) + for average in [None, 'macro', 'micro', 'samples']: + assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin, + labels=np.arange(6), average=average) + assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin, + labels=np.arange(-1, 4), average=average) + + +@ignore_warnings +def test_precision_recall_f_ignored_labels(): + # Test a subset of labels may be requested for PRF + y_true = [1, 1, 2, 3] + y_pred = [1, 3, 3, 3] + y_true_bin = label_binarize(y_true, classes=np.arange(5)) + y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) + data = [(y_true, y_pred), + (y_true_bin, y_pred_bin)] + + for i, (y_true, y_pred) in enumerate(data): + recall_13 = partial(recall_score, y_true, y_pred, labels=[1, 3]) + recall_all = partial(recall_score, y_true, y_pred, labels=None) + + assert_array_almost_equal([.5, 1.], recall_13(average=None)) + assert_almost_equal((.5 + 1.) / 2, recall_13(average='macro')) + assert_almost_equal((.5 * 2 + 1. * 1) / 3, + recall_13(average='weighted')) + assert_almost_equal(2. 
/ 3, recall_13(average='micro')) + + # ensure the above were meaningful tests: + for average in ['macro', 'weighted', 'micro']: + assert_not_equal(recall_13(average=average), + recall_all(average=average)) + + +@ignore_warnings +def test_precision_recall_fscore_support_errors(): + y_true, y_pred, _ = make_prediction(binary=True) + + # Bad beta + assert_raises(ValueError, precision_recall_fscore_support, + y_true, y_pred, beta=0.0) + + # Bad pos_label + assert_raises(ValueError, precision_recall_fscore_support, + y_true, y_pred, pos_label=2, average='binary') + + # Bad average option + assert_raises(ValueError, precision_recall_fscore_support, + [0, 1, 2], [1, 2, 0], average='mega') + + +def test_precision_recall_f_unused_pos_label(): + # Check warning that pos_label unused when set to non-default value + # but average != 'binary'; even if data is binary. + assert_warns_message(UserWarning, + "Note that pos_label (set to 2) is " + "ignored when average != 'binary' (got 'macro'). You " + "may use labels=[pos_label] to specify a single " + "positive class.", precision_recall_fscore_support, + [1, 2, 1], [1, 2, 2], pos_label=2, average='macro') From d0884dd128f7be435cbb8f399b6035db9041ec40 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 20 Dec 2016 00:05:36 +0100 Subject: [PATCH 03/21] advance the testing --- imblearn/metrics/__init__.py | 5 +- imblearn/metrics/classification.py | 25 +- imblearn/metrics/tests/test_classification.py | 266 +++++++++--------- 3 files changed, 158 insertions(+), 138 deletions(-) diff --git a/imblearn/metrics/__init__.py b/imblearn/metrics/__init__.py index d7437fa7a..9a4a3dc6e 100644 --- a/imblearn/metrics/__init__.py +++ b/imblearn/metrics/__init__.py @@ -3,5 +3,8 @@ metrics and pairwise metrics and distance computations. 
""" -import numpy as np +from .classification import sensitivity_specificity_support +__all__ = [ + 'sensitivity_specificity_support' +] diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index bc39d0581..2cfa84112 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -10,6 +10,7 @@ from __future__ import division import warnings +import logging import numpy as np @@ -148,21 +149,26 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, le.fit(labels) y_true = le.transform(y_true) y_pred = le.transform(y_pred) - sorted_labels = le.classes_n + sorted_labels = le.classes_ # In a leave out strategy and for each label, compute: # TP, TN, FP, FN # These list contain an array in which each sample is labeled as # TP, TN, FP, FN - list_tp = [(y_true == label) == (y_pred == label) + list_tp = [np.bitwise_and((y_true == label), (y_pred == label)) for label in sorted_labels] - list_tn = [(y_true != label) == (y_pred != label) + list_tn = [np.bitwise_and((y_true != label), (y_pred != label)) for label in sorted_labels] - list_fp = [(y_true == label) == (y_pred != label) + list_fp = [np.bitwise_and((y_true == label), (y_pred != label)) for label in sorted_labels] - list_fn = [(y_true != label) == (y_pred == label) + list_fn = [np.bitwise_and((y_true != label), (y_pred == label)) for label in sorted_labels] + LOGGER.debug(list_tp) + LOGGER.debug(list_tn) + LOGGER.debug(list_fn) + LOGGER.debug(list_fn) + # Compute the sum for each type tp_sum = [bincount(tp, weights=sample_weight, minlength=len(labels)) for tp in list_tp] @@ -173,6 +179,11 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, fn_sum = [bincount(fn, weights=sample_weight, minlength=len(labels)) for fn in list_fn] + LOGGER.debug(tp_sum) + LOGGER.debug(tn_sum) + LOGGER.debug(fp_sum) + LOGGER.debug(fn_sum) + # Retain only selected labels indices = np.searchsorted(sorted_labels, labels[:n_labels]) tp_sum = [tp[indices] for tp in tp_sum] @@ -188,6 +199,10 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, specificity = [_prf_divide(tn, tn + fp, 'specificity', 'tn + fp', average, warn_for) for tn, fp in zip(tn_sum, fp_sum)] + LOGGER.debug('Computed the sensitivity and specificity for each class') + LOGGER.debug('The lengths of those two metrics are: %s - %s', + len(sensitivity), len(specificity)) + # If we need to weight the results if average == 'weighted': weights = tp_sum diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index 031a48887..64ad4b9ad 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -14,6 +14,8 @@ from sklearn.utils.validation import check_random_state +from imblearn.metrics import sensitivity_specificity_support + RND_SEED = 42 ############################################################################### @@ -67,138 +69,138 @@ def make_prediction(dataset=None, binary=False): ############################################################################### # Tests -def test_precision_recall_f1_score_binary(): - # Test Precision Recall and F1 Score for binary classification task +def test_sensitivity_specificity_support_binary(): + """Test the sensitivity specificity for binary classification task""" y_true, y_pred, _ = make_prediction(binary=True) # detailed measures for each class - p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) - assert_array_almost_equal(p, [0.73, 
0.85], 2) - assert_array_almost_equal(r, [0.88, 0.68], 2) - assert_array_almost_equal(f, [0.80, 0.76], 2) - assert_array_equal(s, [25, 25]) - - # individual scoring function that can be used for grid search: in the - # binary class case the score is the value of the measure for the positive - # class (e.g. label == 1). This is deprecated for average != 'binary'. - for kwargs, my_assert in [({}, assert_no_warnings), - ({'average': 'binary'}, assert_no_warnings)]: - ps = my_assert(precision_score, y_true, y_pred, **kwargs) - assert_array_almost_equal(ps, 0.85, 2) - - rs = my_assert(recall_score, y_true, y_pred, **kwargs) - assert_array_almost_equal(rs, 0.68, 2) - - fs = my_assert(f1_score, y_true, y_pred, **kwargs) - assert_array_almost_equal(fs, 0.76, 2) - - assert_almost_equal(my_assert(fbeta_score, y_true, y_pred, beta=2, - **kwargs), - (1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2) - - -def test_precision_recall_f_binary_single_class(): - # Test precision, recall and F1 score behave with a single positive or - # negative class - # Such a case may occur with non-stratified cross-validation - assert_equal(1., precision_score([1, 1], [1, 1])) - assert_equal(1., recall_score([1, 1], [1, 1])) - assert_equal(1., f1_score([1, 1], [1, 1])) - - assert_equal(0., precision_score([-1, -1], [-1, -1])) - assert_equal(0., recall_score([-1, -1], [-1, -1])) - assert_equal(0., f1_score([-1, -1], [-1, -1])) - - -@ignore_warnings -def test_precision_recall_f_extra_labels(): - # Test handling of explicit additional (not in input) labels to PRF - y_true = [1, 3, 3, 2] - y_pred = [1, 1, 3, 2] - y_true_bin = label_binarize(y_true, classes=np.arange(5)) - y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) - data = [(y_true, y_pred), - (y_true_bin, y_pred_bin)] - - for i, (y_true, y_pred) in enumerate(data): - # No average: zeros in array - actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], - average=None) - assert_array_almost_equal([0., 1., 1., .5, 0.], actual) - - # Macro average is changed - actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], - average='macro') - assert_array_almost_equal(np.mean([0., 1., 1., .5, 0.]), actual) - - # No effect otheriwse - for average in ['micro', 'weighted', 'samples']: - if average == 'samples' and i == 0: - continue - assert_almost_equal(recall_score(y_true, y_pred, - labels=[0, 1, 2, 3, 4], - average=average), - recall_score(y_true, y_pred, labels=None, - average=average)) - - # Error when introducing invalid label in multilabel case - # (although it would only affect performance if average='macro'/None) - for average in [None, 'macro', 'micro', 'samples']: - assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin, - labels=np.arange(6), average=average) - assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin, - labels=np.arange(-1, 4), average=average) - - -@ignore_warnings -def test_precision_recall_f_ignored_labels(): - # Test a subset of labels may be requested for PRF - y_true = [1, 1, 2, 3] - y_pred = [1, 3, 3, 3] - y_true_bin = label_binarize(y_true, classes=np.arange(5)) - y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) - data = [(y_true, y_pred), - (y_true_bin, y_pred_bin)] - - for i, (y_true, y_pred) in enumerate(data): - recall_13 = partial(recall_score, y_true, y_pred, labels=[1, 3]) - recall_all = partial(recall_score, y_true, y_pred, labels=None) - - assert_array_almost_equal([.5, 1.], recall_13(average=None)) - assert_almost_equal((.5 + 1.) 
/ 2, recall_13(average='macro')) - assert_almost_equal((.5 * 2 + 1. * 1) / 3, - recall_13(average='weighted')) - assert_almost_equal(2. / 3, recall_13(average='micro')) - - # ensure the above were meaningful tests: - for average in ['macro', 'weighted', 'micro']: - assert_not_equal(recall_13(average=average), - recall_all(average=average)) - - -@ignore_warnings -def test_precision_recall_fscore_support_errors(): - y_true, y_pred, _ = make_prediction(binary=True) - - # Bad beta - assert_raises(ValueError, precision_recall_fscore_support, - y_true, y_pred, beta=0.0) - - # Bad pos_label - assert_raises(ValueError, precision_recall_fscore_support, - y_true, y_pred, pos_label=2, average='binary') - - # Bad average option - assert_raises(ValueError, precision_recall_fscore_support, - [0, 1, 2], [1, 2, 0], average='mega') - - -def test_precision_recall_f_unused_pos_label(): - # Check warning that pos_label unused when set to non-default value - # but average != 'binary'; even if data is binary. - assert_warns_message(UserWarning, - "Note that pos_label (set to 2) is " - "ignored when average != 'binary' (got 'macro'). You " - "may use labels=[pos_label] to specify a single " - "positive class.", precision_recall_fscore_support, - [1, 2, 1], [1, 2, 2], pos_label=2, average='macro') + sens, spec, supp = sensitivity_specificity_support(y_true, y_pred, + average=None) + assert_array_almost_equal(sens, [0.88, 0.68], 2) + assert_array_almost_equal(spec, [0.73, 0.85], 2) + assert_array_equal(supp, [25, 25]) + + # # individual scoring function that can be used for grid search: in the + # # binary class case the score is the value of the measure for the positive + # # class (e.g. label == 1). This is deprecated for average != 'binary'. + # for kwargs, my_assert in [({}, assert_no_warnings), + # ({'average': 'binary'}, assert_no_warnings)]: + # ps = my_assert(precision_score, y_true, y_pred, **kwargs) + # assert_array_almost_equal(ps, 0.85, 2) + + # rs = my_assert(recall_score, y_true, y_pred, **kwargs) + # assert_array_almost_equal(rs, 0.68, 2) + + # fs = my_assert(f1_score, y_true, y_pred, **kwargs) + # assert_array_almost_equal(fs, 0.76, 2) + + # assert_almost_equal(my_assert(fbeta_score, y_true, y_pred, beta=2, + # **kwargs), + # (1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2) + + +# def test_precision_recall_f_binary_single_class(): +# # Test precision, recall and F1 score behave with a single positive or +# # negative class +# # Such a case may occur with non-stratified cross-validation +# assert_equal(1., precision_score([1, 1], [1, 1])) +# assert_equal(1., recall_score([1, 1], [1, 1])) +# assert_equal(1., f1_score([1, 1], [1, 1])) + +# assert_equal(0., precision_score([-1, -1], [-1, -1])) +# assert_equal(0., recall_score([-1, -1], [-1, -1])) +# assert_equal(0., f1_score([-1, -1], [-1, -1])) + + +# @ignore_warnings +# def test_precision_recall_f_extra_labels(): +# # Test handling of explicit additional (not in input) labels to PRF +# y_true = [1, 3, 3, 2] +# y_pred = [1, 1, 3, 2] +# y_true_bin = label_binarize(y_true, classes=np.arange(5)) +# y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) +# data = [(y_true, y_pred), +# (y_true_bin, y_pred_bin)] + +# for i, (y_true, y_pred) in enumerate(data): +# # No average: zeros in array +# actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], +# average=None) +# assert_array_almost_equal([0., 1., 1., .5, 0.], actual) + +# # Macro average is changed +# actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], +# average='macro') +# 
assert_array_almost_equal(np.mean([0., 1., 1., .5, 0.]), actual) + +# # No effect otheriwse +# for average in ['micro', 'weighted', 'samples']: +# if average == 'samples' and i == 0: +# continue +# assert_almost_equal(recall_score(y_true, y_pred, +# labels=[0, 1, 2, 3, 4], +# average=average), +# recall_score(y_true, y_pred, labels=None, +# average=average)) + +# # Error when introducing invalid label in multilabel case +# # (although it would only affect performance if average='macro'/None) +# for average in [None, 'macro', 'micro', 'samples']: +# assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin, +# labels=np.arange(6), average=average) +# assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin, +# labels=np.arange(-1, 4), average=average) + + +# @ignore_warnings +# def test_precision_recall_f_ignored_labels(): +# # Test a subset of labels may be requested for PRF +# y_true = [1, 1, 2, 3] +# y_pred = [1, 3, 3, 3] +# y_true_bin = label_binarize(y_true, classes=np.arange(5)) +# y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) +# data = [(y_true, y_pred), +# (y_true_bin, y_pred_bin)] + +# for i, (y_true, y_pred) in enumerate(data): +# recall_13 = partial(recall_score, y_true, y_pred, labels=[1, 3]) +# recall_all = partial(recall_score, y_true, y_pred, labels=None) + +# assert_array_almost_equal([.5, 1.], recall_13(average=None)) +# assert_almost_equal((.5 + 1.) / 2, recall_13(average='macro')) +# assert_almost_equal((.5 * 2 + 1. * 1) / 3, +# recall_13(average='weighted')) +# assert_almost_equal(2. / 3, recall_13(average='micro')) + +# # ensure the above were meaningful tests: +# for average in ['macro', 'weighted', 'micro']: +# assert_not_equal(recall_13(average=average), +# recall_all(average=average)) + + +# @ignore_warnings +# def test_precision_recall_fscore_support_errors(): +# y_true, y_pred, _ = make_prediction(binary=True) + +# # Bad beta +# assert_raises(ValueError, precision_recall_fscore_support, +# y_true, y_pred, beta=0.0) + +# # Bad pos_label +# assert_raises(ValueError, precision_recall_fscore_support, +# y_true, y_pred, pos_label=2, average='binary') + +# # Bad average option +# assert_raises(ValueError, precision_recall_fscore_support, +# [0, 1, 2], [1, 2, 0], average='mega') + + +# def test_precision_recall_f_unused_pos_label(): +# # Check warning that pos_label unused when set to non-default value +# # but average != 'binary'; even if data is binary. +# assert_warns_message(UserWarning, +# "Note that pos_label (set to 2) is " +# "ignored when average != 'binary' (got 'macro'). 
You " +# "may use labels=[pos_label] to specify a single " +# "positive class.", precision_recall_fscore_support, +# [1, 2, 1], [1, 2, 2], pos_label=2, average='macro') From 63a2aa2740939830922216ab6f20167d8faf2c5d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 21 Dec 2016 01:09:13 +0100 Subject: [PATCH 04/21] Finish the non-failure test --- imblearn/metrics/__init__.py | 6 +- imblearn/metrics/classification.py | 246 +++++++++++++++--- imblearn/metrics/tests/test_classification.py | 71 +++-- 3 files changed, 269 insertions(+), 54 deletions(-) diff --git a/imblearn/metrics/__init__.py b/imblearn/metrics/__init__.py index 9a4a3dc6e..d518d7a23 100644 --- a/imblearn/metrics/__init__.py +++ b/imblearn/metrics/__init__.py @@ -4,7 +4,11 @@ """ from .classification import sensitivity_specificity_support +from .classification import sensitivity_score +from .classification import specificity_score __all__ = [ - 'sensitivity_specificity_support' + 'sensitivity_specificity_support', + 'sensitivity_score', + 'specificity_score' ] diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index 2cfa84112..90ef0c736 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -14,7 +14,7 @@ import numpy as np -from sklearn.metrics.classification import (_check_targets, _prf_divide) +from sklearn.metrics.classification import _check_targets, _prf_divide from sklearn.preprocessing import LabelEncoder from sklearn.utils.fixes import bincount from sklearn.utils.multiclass import unique_labels @@ -44,10 +44,10 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, Parameters ---------- - y_true : 1d array-like, or label indicator array / sparse matrix + y_true : ndarray, shape (n_samples, ) Ground truth (correct) target values. - y_pred : 1d array-like, or label indicator array / sparse matrix + y_pred : ndarray, shape (n_samples, ) Estimated targets as returned by a classifier. labels : list, optional @@ -59,13 +59,13 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, labels are column indices. By default, all labels in ``y_true`` and ``y_pred`` are used in sorted order. - pos_label : str or int, 1 by default + pos_label : str or int, optional (default=1) The class to report if ``average='binary'`` and the data is binary. If the data are multiclass or multilabel, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. - average : string, [None (default), 'binary', 'macro', 'weighted'] + average : str or None, optional (default=None) If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -84,16 +84,19 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, This determines which warnings will be made in the case that this function is being used to return only one of its metrics. + sample_weight : ndarray, shape (n_samples, ) + Sample weights. + Returns ------- sensitivity : float (if ``average`` = None) or ndarray, \ - shape(n_unique_labels,) + shape (n_unique_labels, ) specificity : float (if ``average`` = None) or ndarray, \ - shape(n_unique_labels,) + shape (n_unique_labels, ) support : int (if ``average`` = None) or ndarray, \ - shape(n_unique_labels,) + shape (n_unique_labels, ) The number of occurrences of each label in ``y_true``. 
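To make the averaging options documented above concrete, here is a small hand-worked sketch; the per-class scores and supports below are toy numbers chosen only for illustration:

import numpy as np

per_class_sensitivity = np.array([0.8, 0.5, 0.9])  # one score per label
support = np.array([10, 40, 50])                   # label counts in y_true

macro_avg = per_class_sensitivity.mean()                           # ~0.733, ignores imbalance
weighted_avg = np.average(per_class_sensitivity, weights=support)  # 0.73, support-weighted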
References @@ -151,6 +154,12 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, y_pred = le.transform(y_pred) sorted_labels = le.classes_ + LOGGER.debug(y_true) + LOGGER.debug(y_pred) + LOGGER.debug(sorted_labels) + + LOGGER.debug('The number of labels is %s' % n_labels) + # In a leave out strategy and for each label, compute: # TP, TN, FP, FN # These list contain an array in which each sample is labeled as @@ -159,39 +168,58 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, for label in sorted_labels] list_tn = [np.bitwise_and((y_true != label), (y_pred != label)) for label in sorted_labels] - list_fp = [np.bitwise_and((y_true == label), (y_pred != label)) + list_fp = [np.bitwise_and((y_true != label), (y_pred == label)) for label in sorted_labels] - list_fn = [np.bitwise_and((y_true != label), (y_pred == label)) + list_fn = [np.bitwise_and((y_true == label), (y_pred != label)) for label in sorted_labels] - LOGGER.debug(list_tp) - LOGGER.debug(list_tn) - LOGGER.debug(list_fn) - LOGGER.debug(list_fn) - # Compute the sum for each type - tp_sum = [bincount(tp, weights=sample_weight, minlength=len(labels)) - for tp in list_tp] - tn_sum = [bincount(tn, weights=sample_weight, minlength=len(labels)) - for tn in list_tn] - fp_sum = [bincount(fp, weights=sample_weight, minlength=len(labels)) - for fp in list_fp] - fn_sum = [bincount(fn, weights=sample_weight, minlength=len(labels)) - for fn in list_fn] - - LOGGER.debug(tp_sum) - LOGGER.debug(tn_sum) - LOGGER.debug(fp_sum) - LOGGER.debug(fn_sum) + # We keep only the counting corresponding to True values + # We are using bincount since it allows to weight the samples + tp_sum = np.array([bincount(tp, weights=sample_weight, + minlength=2)[-1] + for tp in list_tp]) + tn_sum = np.array([bincount(tn, weights=sample_weight, + minlength=2)[-1] + for tn in list_tn]) + fp_sum = np.array([bincount(fp, weights=sample_weight, + minlength=2)[-1] + for fp in list_fp]) + fn_sum = np.array([bincount(fn, weights=sample_weight, + minlength=2)[-1] + for fn in list_fn]) # Retain only selected labels indices = np.searchsorted(sorted_labels, labels[:n_labels]) - tp_sum = [tp[indices] for tp in tp_sum] - tn_sum = [tn[indices] for tn in tn_sum] - fp_sum = [fp[indices] for fp in fp_sum] - fn_sum = [fn[indices] for fn in fn_sum] + # For support, we can count the number of occurrences of each label + support = np.array(bincount(y_true, weights=sample_weight, + minlength=len(labels))) + # Sort the support + support = support[indices] + + + LOGGER.debug('The indices which are retained are %s' % indices) + + LOGGER.debug('TP: %s' % tp_sum) + LOGGER.debug('TN: %s' % tn_sum) + LOGGER.debug('FP: %s' % fp_sum) + LOGGER.debug('FN: %s' % fn_sum) - LOGGER.debug('Computed for each label the stats') + tp_sum = tp_sum[indices] + tn_sum = tn_sum[indices] + fp_sum = fp_sum[indices] + fn_sum = fn_sum[indices] + + if average == 'micro': + tp_sum = np.array([tp_sum.sum()]) + tn_sum = np.array([tn_sum.sum()]) + fp_sum = np.array([fp_sum.sum()]) + fn_sum = np.array([fn_sum.sum()]) + + LOGGER.debug('Did we do the average micro %s' % tp_sum) + + LOGGER.debug('Computed the necessary stats for the sensitivity and' + ' specificity') # Compute the sensitivity and specificity sensitivity = [_prf_divide(tp, tp + fn, 'sensitivity', 'tp + fn', average, @@ -199,13 +227,16 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, specificity = [_prf_divide(tn, tn + fp, 'specificity', 'tn + fp', average, warn_for) for tn, fp in zip(tn_sum, fp_sum)] + 
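The one-vs-rest counting strategy used in the hunk above can be reproduced in isolation. The sketch below uses toy labels, plain ``numpy.bincount`` instead of the ``sklearn.utils.fixes`` wrapper, and a small hypothetical helper; it is only meant to show how the boolean masks turn into per-label TP/FN/TN/FP sums:

import numpy as np

y_true = np.array([0, 1, 2, 2, 1, 0])
y_pred = np.array([0, 2, 2, 1, 1, 0])
labels = np.unique(np.concatenate([y_true, y_pred]))

def count_true(mask, sample_weight=None):
    # bincount on the int-cast mask counts (and optionally weighs) the True
    # entries; minlength=2 guards against an all-False mask.
    return np.bincount(mask.astype(int), weights=sample_weight,
                       minlength=2)[-1]

tp_sum = np.array([count_true((y_true == lab) & (y_pred == lab))
                   for lab in labels])
fn_sum = np.array([count_true((y_true == lab) & (y_pred != lab))
                   for lab in labels])
tn_sum = np.array([count_true((y_true != lab) & (y_pred != lab))
                   for lab in labels])
fp_sum = np.array([count_true((y_true != lab) & (y_pred == lab))
                   for lab in labels])

sensitivity = tp_sum / (tp_sum + fn_sum)  # per label: [1.0, 0.5, 0.5]
specificity = tn_sum / (tn_sum + fp_sum)  # per label: [1.0, 0.75, 0.75]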
LOGGER.debug('Sensitivity = %s - Specificity = %s' % (sensitivity, + specificity)) + LOGGER.debug('Computed the sensitivity and specificity for each class') LOGGER.debug('The lengths of those two metrics are: %s - %s', len(sensitivity), len(specificity)) # If we need to weight the results if average == 'weighted': - weights = tp_sum + weights = support if weights.sum() == 0: return 0, 0, None else: @@ -215,6 +246,149 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, assert average != 'binary' or len(sensitivity) == 1 sensitivity = np.average(sensitivity, weights=weights) specificity = np.average(specificity, weights=weights) - tp_sum = None + support = None + + return sensitivity, specificity, support + + +def sensitivity_score(y_true, y_pred, labels=None, pos_label=1, + average='binary', sample_weight=None): + """Compute the sensitivity + + The sensitivity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number + of true positives and ``fn`` the number of false negatives. The sensitivity + quantifies the ability to avoid false negatives. + + The best value is 1 and the worst value is 0. + + Parameters + ---------- + y_true : ndarray, shape (n_samples, ) + Ground truth (correct) target values. + + y_pred : ndarray, shape (n_samples, ) + Estimated targets as returned by a classifier. + + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. For multilabel targets, + labels are column indices. By default, all labels in ``y_true`` and + ``y_pred`` are used in sorted order. + + pos_label : str or int, optional (default=1) + The class to report if ``average='binary'`` and the data is binary. + If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. + + average : str or None, optional (default=None) + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance. + + warn_for : tuple or set, for internal use + This determines which warnings will be made in the case that this + function is being used to return only one of its metrics. - return sensitivity, specificity, tp_sum + sample_weight : ndarray, shape (n_samples, ) + Sample weights. 
+ + Returns + ------- + specificity : float (if ``average`` = None) or ndarray, \ + shape (n_unique_labels, ) + + """ + s, _, _ = sensitivity_specificity_support(y_true, y_pred, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=('specificity',), + sample_weight=sample_weight) + + return s + + +def specificity_score(y_true, y_pred, labels=None, pos_label=1, + average='binary', sample_weight=None): + """Compute the specificity + + The specificity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number + of true positives and ``fn`` the number of false negatives. The specificity + is intuitively the ability of the classifier to find all the positive + samples. + + The best value is 1 and the worst value is 0. + + Parameters + ---------- + y_true : ndarray, shape (n_samples, ) + Ground truth (correct) target values. + + y_pred : ndarray, shape (n_samples, ) + Estimated targets as returned by a classifier. + + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. For multilabel targets, + labels are column indices. By default, all labels in ``y_true`` and + ``y_pred`` are used in sorted order. + + pos_label : str or int, optional (default=1) + The class to report if ``average='binary'`` and the data is binary. + If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. + + average : str or None, optional (default=None) + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance. + + warn_for : tuple or set, for internal use + This determines which warnings will be made in the case that this + function is being used to return only one of its metrics. + + sample_weight : ndarray, shape (n_samples, ) + Sample weights. 
+ + Returns + ------- + specificity : float (if ``average`` = None) or ndarray, \ + shape (n_unique_labels, ) + + """ + _, s, _ = sensitivity_specificity_support(y_true, y_pred, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=('specificity',), + sample_weight=sample_weight) + + return s diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index 64ad4b9ad..d7ce37caa 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -15,6 +15,8 @@ from sklearn.utils.validation import check_random_state from imblearn.metrics import sensitivity_specificity_support +from imblearn.metrics import sensitivity_score +from imblearn.metrics import specificity_score RND_SEED = 42 @@ -77,26 +79,19 @@ def test_sensitivity_specificity_support_binary(): sens, spec, supp = sensitivity_specificity_support(y_true, y_pred, average=None) assert_array_almost_equal(sens, [0.88, 0.68], 2) - assert_array_almost_equal(spec, [0.73, 0.85], 2) + assert_array_almost_equal(spec, [0.68, 0.88], 2) assert_array_equal(supp, [25, 25]) - # # individual scoring function that can be used for grid search: in the - # # binary class case the score is the value of the measure for the positive - # # class (e.g. label == 1). This is deprecated for average != 'binary'. - # for kwargs, my_assert in [({}, assert_no_warnings), - # ({'average': 'binary'}, assert_no_warnings)]: - # ps = my_assert(precision_score, y_true, y_pred, **kwargs) - # assert_array_almost_equal(ps, 0.85, 2) + # individual scoring function that can be used for grid search: in the + # binary class case the score is the value of the measure for the positive + # class (e.g. label == 1). This is deprecated for average != 'binary'. 
+ for kwargs, my_assert in [({}, assert_no_warnings), + ({'average': 'binary'}, assert_no_warnings)]: + sens = my_assert(sensitivity_score, y_true, y_pred, **kwargs) + assert_array_almost_equal(sens, 0.68, 2) - # rs = my_assert(recall_score, y_true, y_pred, **kwargs) - # assert_array_almost_equal(rs, 0.68, 2) - - # fs = my_assert(f1_score, y_true, y_pred, **kwargs) - # assert_array_almost_equal(fs, 0.76, 2) - - # assert_almost_equal(my_assert(fbeta_score, y_true, y_pred, beta=2, - # **kwargs), - # (1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2) + spec = my_assert(specificity_score, y_true, y_pred, **kwargs) + assert_array_almost_equal(spec, 0.88, 2) # def test_precision_recall_f_binary_single_class(): @@ -204,3 +199,45 @@ def test_sensitivity_specificity_support_binary(): # "may use labels=[pos_label] to specify a single " # "positive class.", precision_recall_fscore_support, # [1, 2, 1], [1, 2, 2], pos_label=2, average='macro') + +def test_precision_recall_f1_score_multiclass(): + # Test Precision Recall and F1 Score for multiclass classification task + y_true, y_pred, _ = make_prediction(binary=False) + + # compute scores with default labels introspection + sens, spec, supp = sensitivity_specificity_support(y_true, y_pred, + average=None) + assert_array_almost_equal(spec, [0.92, 0.86, 0.55], 2) + assert_array_almost_equal(sens, [0.79, 0.09, 0.90], 2) + assert_array_equal(supp, [24, 31, 20]) + + # averaging tests + spec = specificity_score(y_true, y_pred, pos_label=1, average='micro') + assert_array_almost_equal(spec, 0.77, 2) + + sens = sensitivity_score(y_true, y_pred, average='micro') + assert_array_almost_equal(sens, 0.53, 2) + + spec = specificity_score(y_true, y_pred, average='macro') + assert_array_almost_equal(spec, 0.77, 2) + + sens = sensitivity_score(y_true, y_pred, average='macro') + assert_array_almost_equal(sens, 0.60, 2) + + spec = specificity_score(y_true, y_pred, average='weighted') + assert_array_almost_equal(spec, 0.80, 2) + + sens = sensitivity_score(y_true, y_pred, average='weighted') + assert_array_almost_equal(sens, 0.53, 2) + + assert_raises(ValueError, sensitivity_score, y_true, y_pred, + average="samples") + assert_raises(ValueError, specificity_score, y_true, y_pred, + average="samples") + + # same prediction but with and explicit label ordering + sens, spec, supp = sensitivity_specificity_support( + y_true, y_pred, labels=[0, 2, 1], average=None) + assert_array_almost_equal(spec, [0.92, 0.55, 0.86], 2) + assert_array_almost_equal(sens, [0.79, 0.90, 0.10], 2) + assert_array_equal(supp, [24, 20, 31]) From d7333bd8a6128e2cf3702c0782331a20574873be Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 22 Dec 2016 18:59:36 +0100 Subject: [PATCH 05/21] Finish sensitivity and specificity --- imblearn/metrics/classification.py | 57 ++--- imblearn/metrics/tests/test_classification.py | 207 ++++++++---------- 2 files changed, 112 insertions(+), 152 deletions(-) diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index 90ef0c736..26ae88704 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -40,7 +40,7 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, If ``pos_label is None`` and in binary classification, this function returns the average sensitivity and specificity if ``average`` - is one of ``'micro'`` or 'weighted'``. + is one of ``'weighted'``. 
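For reference, a rough usage sketch of the public helpers these patches expose through ``imblearn.metrics``; the labels below are invented for illustration and the commented values simply follow the tp / (tp + fn) and tn / (tn + fp) definitions:

import numpy as np
from imblearn.metrics import (sensitivity_specificity_support,
                              sensitivity_score, specificity_score)

y_true = np.array([0, 0, 0, 1, 1, 1, 1])
y_pred = np.array([0, 1, 0, 1, 1, 1, 0])

# Per-class values plus the support (count) of each class.
sen, spe, sup = sensitivity_specificity_support(y_true, y_pred, average=None)

# Scalar scores for the positive class (default average='binary', pos_label=1):
sen_bin = sensitivity_score(y_true, y_pred)  # tp / (tp + fn) = 3 / 4
spe_bin = specificity_score(y_true, y_pred)  # tn / (tn + fp) = 2 / 3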
Parameters ---------- @@ -105,8 +105,7 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, `_ """ - - average_options = (None, 'micro', 'macro', 'weighted') + average_options = (None, 'macro', 'weighted') if average not in average_options and average != 'binary': raise ValueError('average has to be one of ' + str(average_options)) @@ -154,10 +153,6 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, y_pred = le.transform(y_pred) sorted_labels = le.classes_ - LOGGER.debug(y_true) - LOGGER.debug(y_pred) - LOGGER.debug(sorted_labels) - LOGGER.debug('The number of labels is %s' % n_labels) # In a leave out strategy and for each label, compute: @@ -165,13 +160,13 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, # These list contain an array in which each sample is labeled as # TP, TN, FP, FN list_tp = [np.bitwise_and((y_true == label), (y_pred == label)) - for label in sorted_labels] + for label in range(sorted_labels.size)] list_tn = [np.bitwise_and((y_true != label), (y_pred != label)) - for label in sorted_labels] + for label in range(sorted_labels.size)] list_fp = [np.bitwise_and((y_true != label), (y_pred == label)) - for label in sorted_labels] + for label in range(sorted_labels.size)] list_fn = [np.bitwise_and((y_true == label), (y_pred != label)) - for label in sorted_labels] + for label in range(sorted_labels.size)] # Compute the sum for each type # We keep only the counting corresponding to True values @@ -197,42 +192,32 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, # Sort the support support = support[indices] - LOGGER.debug('The indices which are retained are %s' % indices) - LOGGER.debug('TP: %s' % tp_sum) - LOGGER.debug('TN: %s' % tn_sum) - LOGGER.debug('FP: %s' % fp_sum) - LOGGER.debug('FN: %s' % fn_sum) - tp_sum = tp_sum[indices] tn_sum = tn_sum[indices] fp_sum = fp_sum[indices] fn_sum = fn_sum[indices] - if average == 'micro': - tp_sum = np.array([tp_sum.sum()]) - tn_sum = np.array([tn_sum.sum()]) - fp_sum = np.array([fp_sum.sum()]) - fn_sum = np.array([fn_sum.sum()]) - - LOGGER.debug('Did we do the average micro %s' % tp_sum) - LOGGER.debug('Computed the necessary stats for the sensitivity and' ' specificity') - # Compute the sensitivity and specificity - sensitivity = [_prf_divide(tp, tp + fn, 'sensitivity', 'tp + fn', average, - warn_for) for tp, fn in zip(tp_sum, fn_sum)] - specificity = [_prf_divide(tn, tn + fp, 'specificity', 'tn + fp', average, - warn_for) for tn, fp in zip(tn_sum, fp_sum)] - - LOGGER.debug('Sensitivity = %s - Specificity = %s' % (sensitivity, - specificity)) + LOGGER.debug(tp_sum) + LOGGER.debug(tn_sum) + LOGGER.debug(fp_sum) + LOGGER.debug(fn_sum) - LOGGER.debug('Computed the sensitivity and specificity for each class') - LOGGER.debug('The lengths of those two metrics are: %s - %s', - len(sensitivity), len(specificity)) + # Compute the sensitivity and specificity + with np.errstate(divide='ignore', invalid='ignore'): + sensitivity = _prf_divide(tp_sum, tp_sum + fn_sum, 'sensitivity', + 'tp + fn', average, warn_for) + specificity = _prf_divide(tn_sum, tn_sum + fp_sum, 'specificity', + 'tn + fp', average, warn_for) + + # sensitivity = [_prf_divide(tp, tp + fn, 'sensitivity', 'tp + fn', average, + # warn_for) for tp, fn in zip(tp_sum, fn_sum)] + # specificity = [_prf_divide(tn, tn + fp, 'specificity', 'tn + fp', average, + # warn_for) for tn, fp in zip(tn_sum, fp_sum)] # If we need to weight the results if average == 'weighted': diff --git 
a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index d7ce37caa..f998b64f0 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -2,6 +2,8 @@ from __future__ import division, print_function +from functools import partial + import numpy as np from numpy.testing import (assert_array_almost_equal, assert_array_equal, @@ -12,6 +14,8 @@ from sklearn import datasets from sklearn import svm +from sklearn.preprocessing import label_binarize +from sklearn.utils.testing import assert_not_equal from sklearn.utils.validation import check_random_state from imblearn.metrics import sensitivity_specificity_support @@ -94,113 +98,93 @@ def test_sensitivity_specificity_support_binary(): assert_array_almost_equal(spec, 0.88, 2) -# def test_precision_recall_f_binary_single_class(): -# # Test precision, recall and F1 score behave with a single positive or -# # negative class -# # Such a case may occur with non-stratified cross-validation -# assert_equal(1., precision_score([1, 1], [1, 1])) -# assert_equal(1., recall_score([1, 1], [1, 1])) -# assert_equal(1., f1_score([1, 1], [1, 1])) - -# assert_equal(0., precision_score([-1, -1], [-1, -1])) -# assert_equal(0., recall_score([-1, -1], [-1, -1])) -# assert_equal(0., f1_score([-1, -1], [-1, -1])) - - -# @ignore_warnings -# def test_precision_recall_f_extra_labels(): -# # Test handling of explicit additional (not in input) labels to PRF -# y_true = [1, 3, 3, 2] -# y_pred = [1, 1, 3, 2] -# y_true_bin = label_binarize(y_true, classes=np.arange(5)) -# y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) -# data = [(y_true, y_pred), -# (y_true_bin, y_pred_bin)] - -# for i, (y_true, y_pred) in enumerate(data): -# # No average: zeros in array -# actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], -# average=None) -# assert_array_almost_equal([0., 1., 1., .5, 0.], actual) - -# # Macro average is changed -# actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], -# average='macro') -# assert_array_almost_equal(np.mean([0., 1., 1., .5, 0.]), actual) - -# # No effect otheriwse -# for average in ['micro', 'weighted', 'samples']: -# if average == 'samples' and i == 0: -# continue -# assert_almost_equal(recall_score(y_true, y_pred, -# labels=[0, 1, 2, 3, 4], -# average=average), -# recall_score(y_true, y_pred, labels=None, -# average=average)) - -# # Error when introducing invalid label in multilabel case -# # (although it would only affect performance if average='macro'/None) -# for average in [None, 'macro', 'micro', 'samples']: -# assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin, -# labels=np.arange(6), average=average) -# assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin, -# labels=np.arange(-1, 4), average=average) - - -# @ignore_warnings -# def test_precision_recall_f_ignored_labels(): -# # Test a subset of labels may be requested for PRF -# y_true = [1, 1, 2, 3] -# y_pred = [1, 3, 3, 3] -# y_true_bin = label_binarize(y_true, classes=np.arange(5)) -# y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) -# data = [(y_true, y_pred), -# (y_true_bin, y_pred_bin)] - -# for i, (y_true, y_pred) in enumerate(data): -# recall_13 = partial(recall_score, y_true, y_pred, labels=[1, 3]) -# recall_all = partial(recall_score, y_true, y_pred, labels=None) - -# assert_array_almost_equal([.5, 1.], recall_13(average=None)) -# assert_almost_equal((.5 + 1.) / 2, recall_13(average='macro')) -# assert_almost_equal((.5 * 2 + 1. 
* 1) / 3, -# recall_13(average='weighted')) -# assert_almost_equal(2. / 3, recall_13(average='micro')) - -# # ensure the above were meaningful tests: -# for average in ['macro', 'weighted', 'micro']: -# assert_not_equal(recall_13(average=average), -# recall_all(average=average)) - - -# @ignore_warnings -# def test_precision_recall_fscore_support_errors(): -# y_true, y_pred, _ = make_prediction(binary=True) - -# # Bad beta -# assert_raises(ValueError, precision_recall_fscore_support, -# y_true, y_pred, beta=0.0) - -# # Bad pos_label -# assert_raises(ValueError, precision_recall_fscore_support, -# y_true, y_pred, pos_label=2, average='binary') - -# # Bad average option -# assert_raises(ValueError, precision_recall_fscore_support, -# [0, 1, 2], [1, 2, 0], average='mega') - - -# def test_precision_recall_f_unused_pos_label(): -# # Check warning that pos_label unused when set to non-default value -# # but average != 'binary'; even if data is binary. -# assert_warns_message(UserWarning, -# "Note that pos_label (set to 2) is " -# "ignored when average != 'binary' (got 'macro'). You " -# "may use labels=[pos_label] to specify a single " -# "positive class.", precision_recall_fscore_support, -# [1, 2, 1], [1, 2, 2], pos_label=2, average='macro') - -def test_precision_recall_f1_score_multiclass(): +def test_sensitivity_specificity_binary_single_class(): + # Test sensitivity and specificity score behave with a single positive or + # negative class + # Such a case may occur with non-stratified cross-validation + assert_equal(1., sensitivity_score([1, 1], [1, 1])) + assert_equal(0., specificity_score([1, 1], [1, 1])) + + assert_equal(0., sensitivity_score([-1, -1], [-1, -1])) + assert_equal(0., specificity_score([-1, -1], [-1, -1])) + + +def test_sensitivity_specificity_error_multilabels(): + # Test either if an error is raised when the input are multilabels + y_true = [1, 3, 3, 2] + y_pred = [1, 1, 3, 2] + y_true_bin = label_binarize(y_true, classes=np.arange(5)) + y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) + + assert_raises(ValueError, sensitivity_score, y_true_bin, y_pred_bin) + +@ignore_warnings +def test_sensitivity_specifiicity_extra_labels(): + # Test handling of explicit additional (not in input) labels to SS + y_true = [1, 3, 3, 2] + y_pred = [1, 1, 3, 2] + + actual = sensitivity_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], + average=None) + assert_array_almost_equal([0., 1., 1., .5, 0.], actual) + + # Macro average is changed + actual = sensitivity_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], + average='macro') + assert_array_almost_equal(np.mean([0., 1., 1., .5, 0.]), actual) + + # Weighted average is changed + assert_almost_equal(sensitivity_score(y_true, y_pred, + labels=[0, 1, 2, 3, 4], + average='weighted'), + sensitivity_score(y_true, y_pred, labels=None, + average='weighted')) + +@ignore_warnings +def test_sensitivity_specificity_f_ignored_labels(): + # Test a subset of labels may be requested for SS + y_true = [1, 1, 2, 3] + y_pred = [1, 3, 3, 3] + + sensitivity_13 = partial(sensitivity_score, y_true, y_pred, labels=[1, 3]) + sensitivity_all = partial(sensitivity_score, y_true, y_pred, labels=None) + + assert_array_almost_equal([.5, 1.], sensitivity_13(average=None)) + assert_almost_equal((.5 + 1.) / 2, sensitivity_13(average='macro')) + assert_almost_equal((.5 * 2 + 1. 
* 1) / 3, + sensitivity_13(average='weighted')) + + # ensure the above were meaningful tests: + for average in ['macro', 'weighted']: + assert_not_equal(sensitivity_13(average=average), + sensitivity_all(average=average)) + + +@ignore_warnings +def test_sensitivity_specificity_support_errors(): + y_true, y_pred, _ = make_prediction(binary=True) + + # Bad pos_label + assert_raises(ValueError, sensitivity_specificity_support, + y_true, y_pred, pos_label=2, average='binary') + + # Bad average option + assert_raises(ValueError, sensitivity_specificity_support, + [0, 1, 2], [1, 2, 0], average='mega') + + +def test_sensitivity_specificity_unused_pos_label(): + # Check warning that pos_label unused when set to non-default value + # but average != 'binary'; even if data is binary. + assert_warns_message(UserWarning, + "Note that pos_label (set to 2) is " + "ignored when average != 'binary' (got 'macro'). You " + "may use labels=[pos_label] to specify a single " + "positive class.", sensitivity_specificity_support, + [1, 2, 1], [1, 2, 2], pos_label=2, average='macro') + + +def test_sensitivity_specificity_multiclass(): # Test Precision Recall and F1 Score for multiclass classification task y_true, y_pred, _ = make_prediction(binary=False) @@ -212,15 +196,6 @@ def test_precision_recall_f1_score_multiclass(): assert_array_equal(supp, [24, 31, 20]) # averaging tests - spec = specificity_score(y_true, y_pred, pos_label=1, average='micro') - assert_array_almost_equal(spec, 0.77, 2) - - sens = sensitivity_score(y_true, y_pred, average='micro') - assert_array_almost_equal(sens, 0.53, 2) - - spec = specificity_score(y_true, y_pred, average='macro') - assert_array_almost_equal(spec, 0.77, 2) - sens = sensitivity_score(y_true, y_pred, average='macro') assert_array_almost_equal(sens, 0.60, 2) From d0769165d89db55f538314d91ef7b3612ff27bf0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 22 Dec 2016 19:32:12 +0100 Subject: [PATCH 06/21] Added geometric mean --- imblearn/metrics/classification.py | 110 ++++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 19 deletions(-) diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index 26ae88704..6955be47d 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -1,3 +1,5 @@ +# coding: utf-8 + """Metrics to assess performance on classification task given class prediction Functions named as ``*_score`` return a scalar value to maximize: the higher @@ -61,7 +63,7 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, pos_label : str or int, optional (default=1) The class to report if ``average='binary'`` and the data is binary. - If the data are multiclass or multilabel, this will be ignored; + If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. 
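A minimal sketch of the ``labels=[pos_label]`` idiom mentioned above; the data are illustrative and the expected value is computed by hand, assuming the function behaves as documented:

    from imblearn.metrics import sensitivity_score

    y_true = [0, 1, 2, 2, 1, 0]
    y_pred = [0, 1, 1, 2, 0, 0]

    # Multiclass input, so average='binary' does not apply; restricting
    # `labels` to a single class reports the score for that class only.
    sensitivity_score(y_true, y_pred, labels=[2], average='macro')  # 0.5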
@@ -202,11 +204,6 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, LOGGER.debug('Computed the necessary stats for the sensitivity and' ' specificity') - LOGGER.debug(tp_sum) - LOGGER.debug(tn_sum) - LOGGER.debug(fp_sum) - LOGGER.debug(fn_sum) - # Compute the sensitivity and specificity with np.errstate(divide='ignore', invalid='ignore'): sensitivity = _prf_divide(tp_sum, tp_sum + fn_sum, 'sensitivity', @@ -214,11 +211,6 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, specificity = _prf_divide(tn_sum, tn_sum + fp_sum, 'specificity', 'tn + fp', average, warn_for) - # sensitivity = [_prf_divide(tp, tp + fn, 'sensitivity', 'tp + fn', average, - # warn_for) for tp, fn in zip(tp_sum, fn_sum)] - # specificity = [_prf_divide(tn, tn + fp, 'specificity', 'tn + fp', average, - # warn_for) for tn, fp in zip(tn_sum, fp_sum)] - # If we need to weight the results if average == 'weighted': weights = support @@ -259,13 +251,11 @@ def sensitivity_score(y_true, y_pred, labels=None, pos_label=1, order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will - result in 0 components in a macro average. For multilabel targets, - labels are column indices. By default, all labels in ``y_true`` and - ``y_pred`` are used in sorted order. + result in 0 components in a macro average. pos_label : str or int, optional (default=1) The class to report if ``average='binary'`` and the data is binary. - If the data are multiclass or multilabel, this will be ignored; + If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. @@ -331,13 +321,11 @@ def specificity_score(y_true, y_pred, labels=None, pos_label=1, order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will - result in 0 components in a macro average. For multilabel targets, - labels are column indices. By default, all labels in ``y_true`` and - ``y_pred`` are used in sorted order. + result in 0 components in a macro average. pos_label : str or int, optional (default=1) The class to report if ``average='binary'`` and the data is binary. - If the data are multiclass or multilabel, this will be ignored; + If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. @@ -377,3 +365,87 @@ def specificity_score(y_true, y_pred, labels=None, pos_label=1, sample_weight=sample_weight) return s + + +def geometric_mean_score(y_true, y_pred, labels=None, pos_label=1, + average='binary', sample_weight=None): + """Compute the geometric mean + + The geometric mean is the squared root of the product of the sensitivity + and specificity. This measure tries to maximize the accuracy on each + of the two classes while keeping these accuracies balanced. + + The specificity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number + of true positives and ``fn`` the number of false negatives. The specificity + is intuitively the ability of the classifier to find all the positive + samples. + + The best value is 1 and the worst value is 0. + + Parameters + ---------- + y_true : ndarray, shape (n_samples, ) + Ground truth (correct) target values. 
+ + y_pred : ndarray, shape (n_samples, ) + Estimated targets as returned by a classifier. + + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. + + pos_label : str or int, optional (default=1) + The class to report if ``average='binary'`` and the data is binary. + If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. + + average : str or None, optional (default=None) + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance. + + warn_for : tuple or set, for internal use + This determines which warnings will be made in the case that this + function is being used to return only one of its metrics. + + sample_weight : ndarray, shape (n_samples, ) + Sample weights. + + Returns + ------- + geometric_mean : float (if ``average`` = None) or ndarray, \ + shape (n_unique_labels, ) + + References + ---------- + .. [1] Kubat, M. and Matwin, S. "Addressing the curse of + imbalanced training sets: one-sided selection" ICML (1997) + + .. [2] Barandela, R., Sánchez, J. S., Garcıa, V., & Rangel, E. "Strategies + for learning in class imbalance problems", Pattern Recognition, + 36(3), (2003), pp 849-851. 
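As a quick numerical check of the definition above (hand-picked scores, not taken from the patch):

    import numpy as np

    sensitivity, specificity = 0.8, 0.6
    g_mean = np.sqrt(sensitivity * specificity)  # ~0.693

    # A classifier that ignores the minority class entirely gets
    # sensitivity 0 for it, so its geometric mean collapses to 0.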
+ + """ + sen, spe, _ = sensitivity_specificity_support(y_true, y_pred, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=('specificity', + 'specificity'), + sample_weight=sample_weight) + + return np.sqrt(sen * spe) From ace6cabacac46b7ddee957c18f9c29932fa080b7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 22 Dec 2016 22:25:05 +0100 Subject: [PATCH 07/21] Add the testing for geometric mean --- imblearn/metrics/__init__.py | 4 ++- imblearn/metrics/classification.py | 2 ++ imblearn/metrics/tests/test_classification.py | 29 ++++++++++++++++++- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/imblearn/metrics/__init__.py b/imblearn/metrics/__init__.py index d518d7a23..13a17ae9d 100644 --- a/imblearn/metrics/__init__.py +++ b/imblearn/metrics/__init__.py @@ -6,9 +6,11 @@ from .classification import sensitivity_specificity_support from .classification import sensitivity_score from .classification import specificity_score +from .classification import geometric_mean_score __all__ = [ 'sensitivity_specificity_support', 'sensitivity_score', - 'specificity_score' + 'specificity_score', + 'geometric_mean_score' ] diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index 6955be47d..7656bbd53 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -448,4 +448,6 @@ def geometric_mean_score(y_true, y_pred, labels=None, pos_label=1, 'specificity'), sample_weight=sample_weight) + LOGGER.debug('The sensitivity and specificity are : %s - %s' % (sen, spe)) + return np.sqrt(sen * spe) diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index f998b64f0..58f9e81e0 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -21,6 +21,7 @@ from imblearn.metrics import sensitivity_specificity_support from imblearn.metrics import sensitivity_score from imblearn.metrics import specificity_score +from imblearn.metrics import geometric_mean_score RND_SEED = 42 @@ -185,7 +186,7 @@ def test_sensitivity_specificity_unused_pos_label(): def test_sensitivity_specificity_multiclass(): - # Test Precision Recall and F1 Score for multiclass classification task + # Test sensitivity and specificity for multiclass classification task y_true, y_pred, _ = make_prediction(binary=False) # compute scores with default labels introspection @@ -216,3 +217,29 @@ def test_sensitivity_specificity_multiclass(): assert_array_almost_equal(spec, [0.92, 0.55, 0.86], 2) assert_array_almost_equal(sens, [0.79, 0.90, 0.10], 2) assert_array_equal(supp, [24, 20, 31]) + + +def test_geometric_mean_support_binary(): + """Test the geometric mean for binary classification task""" + y_true, y_pred, _ = make_prediction(binary=True) + + # compute the geometric mean for the binary problem + geo_mean = geometric_mean_score(y_true, y_pred) + + assert_almost_equal(geo_mean, 0.77, 2) + + +def test_geometric_mean_multiclass(): + # Test geometric mean for multiclass classification task + y_true, y_pred, _ = make_prediction(binary=False) + + # Compute the geometric mean for each of the classes + geo_mean = geometric_mean_score(y_true, y_pred, average=None) + assert_array_almost_equal(geo_mean, [0.85, 0.29, 0.7], 2) + + # average tests + geo_mean = geometric_mean_score(y_true, y_pred, average='macro') + assert_almost_equal(geo_mean, 0.68, 2) + + geo_mean = geometric_mean_score(y_true, y_pred, average='weighted') + 
assert_array_almost_equal(geo_mean, 0.65, 2) From 73f226d4cde4a6f70f8024515e43257ca7517f7c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 22 Dec 2016 23:08:09 +0100 Subject: [PATCH 08/21] Change the computation of the specificity to fit with sklearn function --- imblearn/metrics/classification.py | 215 ++++++++++++++++++----------- 1 file changed, 135 insertions(+), 80 deletions(-) diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index 7656bbd53..c4c4ae525 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -74,14 +74,21 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This - alters 'macro' to account for label imbalance. - + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). warn_for : tuple or set, for internal use This determines which warnings will be made in the case that this function is being used to return only one of its metrics. @@ -107,7 +114,7 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, `_ """ - average_options = (None, 'macro', 'weighted') + average_options = (None, 'micro', 'macro', 'weighted', 'samples') if average not in average_options and average != 'binary': raise ValueError('average has to be one of ' + str(average_options)) @@ -115,13 +122,6 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, y_type, y_true, y_pred = _check_targets(y_true, y_pred) present_labels = unique_labels(y_true, y_pred) - LOGGER.debug('The labels in the prediction and ground-truth are %s', - present_labels) - - # We do not support multilabel for the moment - if y_type.startswith('multilabel'): - raise ValueError('Multilabel are not supported.') - if average == 'binary': if y_type == 'binary': if pos_label not in present_labels: @@ -149,83 +149,114 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, labels = np.hstack([labels, np.setdiff1d(present_labels, labels, assume_unique=True)]) - le = LabelEncoder() - le.fit(labels) - y_true = le.transform(y_true) - y_pred = le.transform(y_pred) - sorted_labels = le.classes_ - - LOGGER.debug('The number of labels is %s' % n_labels) - - # In a leave out strategy and for each label, compute: - # TP, TN, FP, FN - # These list contain an array in which each sample is labeled as - # TP, TN, FP, FN - list_tp = [np.bitwise_and((y_true == label), (y_pred == label)) - for label in range(sorted_labels.size)] - list_tn = [np.bitwise_and((y_true != label), (y_pred != label)) - for label in range(sorted_labels.size)] - list_fp = [np.bitwise_and((y_true != label), (y_pred == label)) - for label in range(sorted_labels.size)] - list_fn = [np.bitwise_and((y_true == label), (y_pred != label)) - for label in range(sorted_labels.size)] - - # Compute 
the sum for each type - # We keep only the counting corresponding to True values - # We are using bincount since it allows to weight the samples - tp_sum = np.array([bincount(tp, weights=sample_weight, - minlength=2)[-1] - for tp in list_tp]) - tn_sum = np.array([bincount(tn, weights=sample_weight, - minlength=2)[-1] - for tn in list_tn]) - fp_sum = np.array([bincount(fp, weights=sample_weight, - minlength=2)[-1] - for fp in list_fp]) - fn_sum = np.array([bincount(fn, weights=sample_weight, - minlength=2)[-1] - for fn in list_fn]) - - # Retain only selected labels - indices = np.searchsorted(sorted_labels, labels[:n_labels]) - # For support, we can count the number of occurrences of each label - support = np.array(bincount(y_true, weights=sample_weight, - minlength=len(labels))) - # Sort the support - support = support[indices] - - LOGGER.debug('The indices which are retained are %s' % indices) - - tp_sum = tp_sum[indices] - tn_sum = tn_sum[indices] - fp_sum = fp_sum[indices] - fn_sum = fn_sum[indices] - - LOGGER.debug('Computed the necessary stats for the sensitivity and' - ' specificity') - - # Compute the sensitivity and specificity + # Calculate tp_sum, pred_sum, true_sum ### + + if y_type.startswith('multilabel'): + sum_axis = 1 if average == 'samples' else 0 + + # All labels are index integers for multilabel. + # Select labels: + if not np.all(labels == present_labels): + if np.max(labels) > np.max(present_labels): + raise ValueError('All labels must be in [0, n labels). ' + 'Got %d > %d' % + (np.max(labels), np.max(present_labels))) + if np.min(labels) < 0: + raise ValueError('All labels must be in [0, n labels). ' + 'Got %d < 0' % np.min(labels)) + + y_true = y_true[:, labels[:n_labels]] + y_pred = y_pred[:, labels[:n_labels]] + + # calculate weighted counts + true_and_pred = y_true.multiply(y_pred) + tp_sum = count_nonzero(true_and_pred, axis=sum_axis, + sample_weight=sample_weight) + pred_sum = count_nonzero(y_pred, axis=sum_axis, + sample_weight=sample_weight) + true_sum = count_nonzero(y_true, axis=sum_axis, + sample_weight=sample_weight) + tn_sum = y_true.size - (pred_sum + true_sum - tp_sum) + + elif average == 'samples': + raise ValueError("Sample-based precision, recall, fscore is " + "not meaningful outside multilabel " + "classification. 
See the accuracy_score instead.") + else: + le = LabelEncoder() + le.fit(labels) + y_true = le.transform(y_true) + y_pred = le.transform(y_pred) + sorted_labels = le.classes_ + + # labels are now from 0 to len(labels) - 1 -> use bincount + tp = y_true == y_pred + tp_bins = y_true[tp] + if sample_weight is not None: + tp_bins_weights = np.asarray(sample_weight)[tp] + else: + tp_bins_weights = None + + if len(tp_bins): + tp_sum = bincount(tp_bins, weights=tp_bins_weights, + minlength=len(labels)) + else: + # Pathological case + true_sum = pred_sum = tp_sum = np.zeros(len(labels)) + if len(y_pred): + pred_sum = bincount(y_pred, weights=sample_weight, + minlength=len(labels)) + if len(y_true): + true_sum = bincount(y_true, weights=sample_weight, + minlength=len(labels)) + + # Compute the true negative + tn_sum = y_true.size - (pred_sum + true_sum - tp_sum) + + # Retain only selected labels + indices = np.searchsorted(sorted_labels, labels[:n_labels]) + tp_sum = tp_sum[indices] + true_sum = true_sum[indices] + pred_sum = pred_sum[indices] + tn_sum = tn_sum[indices] + + if average == 'micro': + tp_sum = np.array([tp_sum.sum()]) + pred_sum = np.array([pred_sum.sum()]) + true_sum = np.array([true_sum.sum()]) + tn_sum = np.array([tn_sum.sum()]) + + # Finally, we have all our sufficient statistics. Divide! # + with np.errstate(divide='ignore', invalid='ignore'): - sensitivity = _prf_divide(tp_sum, tp_sum + fn_sum, 'sensitivity', - 'tp + fn', average, warn_for) - specificity = _prf_divide(tn_sum, tn_sum + fp_sum, 'specificity', - 'tn + fp', average, warn_for) + # Divide, and on zero-division, set scores to 0 and warn: + + # Oddly, we may get an "invalid" rather than a "divide" error + # here. + specificity = _prf_divide(tn_sum, tn_sum + pred_sum - tp_sum, + 'specificity', 'predicted', average, + warn_for) + sensitivity = _prf_divide(tp_sum, true_sum, + 'sensitivity', 'true', average, warn_for) + + # Average the results - # If we need to weight the results if average == 'weighted': - weights = support + weights = true_sum if weights.sum() == 0: return 0, 0, None + elif average == 'samples': + weights = sample_weight else: weights = None if average is not None: - assert average != 'binary' or len(sensitivity) == 1 - sensitivity = np.average(sensitivity, weights=weights) + assert average != 'binary' or len(specificity) == 1 specificity = np.average(specificity, weights=weights) - support = None + sensitivity = np.average(sensitivity, weights=weights) + true_sum = None # return no support - return sensitivity, specificity, support + return sensitivity, specificity, true_sum def sensitivity_score(y_true, y_pred, labels=None, pos_label=1, @@ -266,13 +297,21 @@ def sensitivity_score(y_true, y_pred, labels=None, pos_label=1, ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This - alters 'macro' to account for label imbalance. + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. 
+ ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). warn_for : tuple or set, for internal use This determines which warnings will be made in the case that this @@ -291,7 +330,7 @@ def sensitivity_score(y_true, y_pred, labels=None, pos_label=1, labels=labels, pos_label=pos_label, average=average, - warn_for=('specificity',), + warn_for=('sensitivity',), sample_weight=sample_weight) return s @@ -336,13 +375,21 @@ def specificity_score(y_true, y_pred, labels=None, pos_label=1, ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This - alters 'macro' to account for label imbalance. + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). warn_for : tuple or set, for internal use This determines which warnings will be made in the case that this @@ -410,13 +457,21 @@ def geometric_mean_score(y_true, y_pred, labels=None, pos_label=1, ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This - alters 'macro' to account for label imbalance. + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). 
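To make the averaging options above concrete, here is a hand-worked comparison on illustrative per-class scores (not taken from the patch):

    import numpy as np

    per_class = np.array([0.9, 0.5])   # majority class, minority class
    support = np.array([90, 10])

    macro = per_class.mean()                           # 0.70, classes count equally
    weighted = np.average(per_class, weights=support)  # 0.86, dominated by the majority
    # 'micro' instead pools tp/fn/fp over all classes before dividing.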
warn_for : tuple or set, for internal use This determines which warnings will be made in the case that this From 03f10719d45582f3dd49f6d615da83abee6e252a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 23 Dec 2016 00:24:25 +0100 Subject: [PATCH 09/21] Update the test for the specificity --- imblearn/metrics/classification.py | 135 +++++++------- imblearn/metrics/tests/test_classification.py | 171 ++++++++---------- 2 files changed, 149 insertions(+), 157 deletions(-) diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index c4c4ae525..e9516ed0f 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -1,5 +1,4 @@ # coding: utf-8 - """Metrics to assess performance on classification task given class prediction Functions named as ``*_score`` return a scalar value to maximize: the higher @@ -20,12 +19,16 @@ from sklearn.preprocessing import LabelEncoder from sklearn.utils.fixes import bincount from sklearn.utils.multiclass import unique_labels +from sklearn.utils.sparsefuncs import count_nonzero LOGGER = logging.getLogger(__name__) -def sensitivity_specificity_support(y_true, y_pred, labels=None, - pos_label=1, average=None, +def sensitivity_specificity_support(y_true, + y_pred, + labels=None, + pos_label=1, + average=None, warn_for=('sensitivity', 'specificity'), sample_weight=None): """Compute sensitivity, specificity, and support for each class @@ -116,8 +119,7 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, """ average_options = (None, 'micro', 'macro', 'weighted', 'samples') if average not in average_options and average != 'binary': - raise ValueError('average has to be one of ' + - str(average_options)) + raise ValueError('average has to be one of ' + str(average_options)) y_type, y_true, y_pred = _check_targets(y_true, y_pred) present_labels = unique_labels(y_true, y_pred) @@ -146,38 +148,14 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, n_labels = None else: n_labels = len(labels) - labels = np.hstack([labels, np.setdiff1d(present_labels, labels, - assume_unique=True)]) + labels = np.hstack( + [labels, np.setdiff1d( + present_labels, labels, assume_unique=True)]) # Calculate tp_sum, pred_sum, true_sum ### if y_type.startswith('multilabel'): - sum_axis = 1 if average == 'samples' else 0 - - # All labels are index integers for multilabel. - # Select labels: - if not np.all(labels == present_labels): - if np.max(labels) > np.max(present_labels): - raise ValueError('All labels must be in [0, n labels). ' - 'Got %d > %d' % - (np.max(labels), np.max(present_labels))) - if np.min(labels) < 0: - raise ValueError('All labels must be in [0, n labels). 
' - 'Got %d < 0' % np.min(labels)) - - y_true = y_true[:, labels[:n_labels]] - y_pred = y_pred[:, labels[:n_labels]] - - # calculate weighted counts - true_and_pred = y_true.multiply(y_pred) - tp_sum = count_nonzero(true_and_pred, axis=sum_axis, - sample_weight=sample_weight) - pred_sum = count_nonzero(y_pred, axis=sum_axis, - sample_weight=sample_weight) - true_sum = count_nonzero(y_true, axis=sum_axis, - sample_weight=sample_weight) - tn_sum = y_true.size - (pred_sum + true_sum - tp_sum) - + raise ValueError('imblearn does not support multilabel') elif average == 'samples': raise ValueError("Sample-based precision, recall, fscore is " "not meaningful outside multilabel " @@ -198,17 +176,17 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, tp_bins_weights = None if len(tp_bins): - tp_sum = bincount(tp_bins, weights=tp_bins_weights, - minlength=len(labels)) + tp_sum = bincount( + tp_bins, weights=tp_bins_weights, minlength=len(labels)) else: # Pathological case true_sum = pred_sum = tp_sum = np.zeros(len(labels)) if len(y_pred): - pred_sum = bincount(y_pred, weights=sample_weight, - minlength=len(labels)) + pred_sum = bincount( + y_pred, weights=sample_weight, minlength=len(labels)) if len(y_true): - true_sum = bincount(y_true, weights=sample_weight, - minlength=len(labels)) + true_sum = bincount( + y_true, weights=sample_weight, minlength=len(labels)) # Compute the true negative tn_sum = y_true.size - (pred_sum + true_sum - tp_sum) @@ -220,6 +198,11 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, pred_sum = pred_sum[indices] tn_sum = tn_sum[indices] + LOGGER.debug('tp: %s' % tp_sum) + LOGGER.debug('tn: %s' % tn_sum) + LOGGER.debug('pred_sum: %s' % pred_sum) + LOGGER.debug('true_sum: %s' % true_sum) + if average == 'micro': tp_sum = np.array([tp_sum.sum()]) pred_sum = np.array([pred_sum.sum()]) @@ -236,8 +219,8 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, specificity = _prf_divide(tn_sum, tn_sum + pred_sum - tp_sum, 'specificity', 'predicted', average, warn_for) - sensitivity = _prf_divide(tp_sum, true_sum, - 'sensitivity', 'true', average, warn_for) + sensitivity = _prf_divide(tp_sum, true_sum, 'sensitivity', 'true', + average, warn_for) # Average the results @@ -250,6 +233,9 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, else: weights = None + LOGGER.debug(specificity) + LOGGER.debug(weights) + if average is not None: assert average != 'binary' or len(specificity) == 1 specificity = np.average(specificity, weights=weights) @@ -259,8 +245,12 @@ def sensitivity_specificity_support(y_true, y_pred, labels=None, return sensitivity, specificity, true_sum -def sensitivity_score(y_true, y_pred, labels=None, pos_label=1, - average='binary', sample_weight=None): +def sensitivity_score(y_true, + y_pred, + labels=None, + pos_label=1, + average='binary', + sample_weight=None): """Compute the sensitivity The sensitivity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number @@ -326,18 +316,24 @@ def sensitivity_score(y_true, y_pred, labels=None, pos_label=1, shape (n_unique_labels, ) """ - s, _, _ = sensitivity_specificity_support(y_true, y_pred, - labels=labels, - pos_label=pos_label, - average=average, - warn_for=('sensitivity',), - sample_weight=sample_weight) + s, _, _ = sensitivity_specificity_support( + y_true, + y_pred, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=('sensitivity', ), + sample_weight=sample_weight) return s -def specificity_score(y_true, y_pred, labels=None, pos_label=1, 
- average='binary', sample_weight=None): +def specificity_score(y_true, + y_pred, + labels=None, + pos_label=1, + average='binary', + sample_weight=None): """Compute the specificity The specificity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number @@ -404,18 +400,24 @@ def specificity_score(y_true, y_pred, labels=None, pos_label=1, shape (n_unique_labels, ) """ - _, s, _ = sensitivity_specificity_support(y_true, y_pred, - labels=labels, - pos_label=pos_label, - average=average, - warn_for=('specificity',), - sample_weight=sample_weight) + _, s, _ = sensitivity_specificity_support( + y_true, + y_pred, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=('specificity', ), + sample_weight=sample_weight) return s -def geometric_mean_score(y_true, y_pred, labels=None, pos_label=1, - average='binary', sample_weight=None): +def geometric_mean_score(y_true, + y_pred, + labels=None, + pos_label=1, + average='binary', + sample_weight=None): """Compute the geometric mean The geometric mean is the squared root of the product of the sensitivity @@ -495,13 +497,14 @@ def geometric_mean_score(y_true, y_pred, labels=None, pos_label=1, 36(3), (2003), pp 849-851. """ - sen, spe, _ = sensitivity_specificity_support(y_true, y_pred, - labels=labels, - pos_label=pos_label, - average=average, - warn_for=('specificity', - 'specificity'), - sample_weight=sample_weight) + sen, spe, _ = sensitivity_specificity_support( + y_true, + y_pred, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=('specificity', 'specificity'), + sample_weight=sample_weight) LOGGER.debug('The sensitivity and specificity are : %s - %s' % (sen, spe)) diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index 58f9e81e0..02c2edb53 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -76,31 +76,33 @@ def make_prediction(dataset=None, binary=False): ############################################################################### # Tests -def test_sensitivity_specificity_support_binary(): - """Test the sensitivity specificity for binary classification task""" + +def test_sensitivity_specificity_score_binary(): + # Test Sensitivity Specificity for binary classification task y_true, y_pred, _ = make_prediction(binary=True) # detailed measures for each class - sens, spec, supp = sensitivity_specificity_support(y_true, y_pred, - average=None) - assert_array_almost_equal(sens, [0.88, 0.68], 2) - assert_array_almost_equal(spec, [0.68, 0.88], 2) - assert_array_equal(supp, [25, 25]) + sen, spe, sup = sensitivity_specificity_support( + y_true, y_pred, average=None) + assert_array_almost_equal(sen, [0.88, 0.68], 2) + assert_array_almost_equal(spe, [0.68, 0.88], 2) + assert_array_equal(sup, [25, 25]) # individual scoring function that can be used for grid search: in the # binary class case the score is the value of the measure for the positive # class (e.g. label == 1). This is deprecated for average != 'binary'. 
- for kwargs, my_assert in [({}, assert_no_warnings), - ({'average': 'binary'}, assert_no_warnings)]: - sens = my_assert(sensitivity_score, y_true, y_pred, **kwargs) - assert_array_almost_equal(sens, 0.68, 2) + for kwargs, my_assert in [({}, assert_no_warnings), ({ + 'average': 'binary' + }, assert_no_warnings)]: + sen = my_assert(sensitivity_score, y_true, y_pred, **kwargs) + assert_array_almost_equal(sen, 0.68, 2) - spec = my_assert(specificity_score, y_true, y_pred, **kwargs) - assert_array_almost_equal(spec, 0.88, 2) + spe = my_assert(specificity_score, y_true, y_pred, **kwargs) + assert_array_almost_equal(spe, 0.88, 2) -def test_sensitivity_specificity_binary_single_class(): - # Test sensitivity and specificity score behave with a single positive or +def test_sensitivity_specificity_f_binary_single_class(): + # Test sensitivity and specificity behave with a single positive or # negative class # Such a case may occur with non-stratified cross-validation assert_equal(1., sensitivity_score([1, 1], [1, 1])) @@ -110,55 +112,66 @@ def test_sensitivity_specificity_binary_single_class(): assert_equal(0., specificity_score([-1, -1], [-1, -1])) -def test_sensitivity_specificity_error_multilabels(): - # Test either if an error is raised when the input are multilabels - y_true = [1, 3, 3, 2] - y_pred = [1, 1, 3, 2] - y_true_bin = label_binarize(y_true, classes=np.arange(5)) - y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) - - assert_raises(ValueError, sensitivity_score, y_true_bin, y_pred_bin) - @ignore_warnings -def test_sensitivity_specifiicity_extra_labels(): +def test_sensitivity_specificity_extra_labels(): # Test handling of explicit additional (not in input) labels to SS y_true = [1, 3, 3, 2] y_pred = [1, 1, 3, 2] - actual = sensitivity_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], - average=None) - assert_array_almost_equal([0., 1., 1., .5, 0.], actual) + # No average: zeros in array + actual = specificity_score( + y_true, y_pred, labels=[0, 1, 2, 3, 4], average=None) + assert_array_almost_equal([1., 0.67, 1., 1., 1.], actual, 2) # Macro average is changed - actual = sensitivity_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], - average='macro') - assert_array_almost_equal(np.mean([0., 1., 1., .5, 0.]), actual) + actual = specificity_score( + y_true, y_pred, labels=[0, 1, 2, 3, 4], average='macro') + assert_array_almost_equal(np.mean([1., 0.67, 1., 1., 1.]), actual, 2) + + # Check for micro + actual = specificity_score( + y_true, y_pred, labels=[0, 1, 2, 3, 4], average='micro') + assert_array_almost_equal(15. 
/ 16., actual) + + # Check for weighted + actual = specificity_score( + y_true, y_pred, labels=[0, 1, 2, 3, 4], average='macro') + assert_array_almost_equal(np.mean([1., 0.67, 1., 1., 1.]), actual, 2) - # Weighted average is changed - assert_almost_equal(sensitivity_score(y_true, y_pred, - labels=[0, 1, 2, 3, 4], - average='weighted'), - sensitivity_score(y_true, y_pred, labels=None, - average='weighted')) @ignore_warnings -def test_sensitivity_specificity_f_ignored_labels(): +def test_sensitivity_specificity_ignored_labels(): # Test a subset of labels may be requested for SS y_true = [1, 1, 2, 3] y_pred = [1, 3, 3, 3] - sensitivity_13 = partial(sensitivity_score, y_true, y_pred, labels=[1, 3]) - sensitivity_all = partial(sensitivity_score, y_true, y_pred, labels=None) + specificity_13 = partial(specificity_score, y_true, y_pred, labels=[1, 3]) + specificity_all = partial(specificity_score, y_true, y_pred, labels=None) - assert_array_almost_equal([.5, 1.], sensitivity_13(average=None)) - assert_almost_equal((.5 + 1.) / 2, sensitivity_13(average='macro')) - assert_almost_equal((.5 * 2 + 1. * 1) / 3, - sensitivity_13(average='weighted')) + assert_array_almost_equal([1., 0.33], specificity_13(average=None), 2) + assert_almost_equal( + np.mean([1., 0.33]), specificity_13(average='macro'), 2) + assert_almost_equal( + np.average( + [1., .33], weights=[2., 1.]), + specificity_13(average='weighted'), + 2) + assert_almost_equal(3. / (3. + 2.), specificity_13(average='micro'), 2) # ensure the above were meaningful tests: - for average in ['macro', 'weighted']: - assert_not_equal(sensitivity_13(average=average), - sensitivity_all(average=average)) + for average in ['macro', 'weighted', 'micro']: + assert_not_equal( + specificity_13(average=average), specificity_all(average=average)) + + +def test_sensitivity_specificity_error_multilabels(): + # Test either if an error is raised when the input are multilabels + y_true = [1, 3, 3, 2] + y_pred = [1, 1, 3, 2] + y_true_bin = label_binarize(y_true, classes=np.arange(5)) + y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) + + assert_raises(ValueError, sensitivity_score, y_true_bin, y_pred_bin) @ignore_warnings @@ -166,57 +179,33 @@ def test_sensitivity_specificity_support_errors(): y_true, y_pred, _ = make_prediction(binary=True) # Bad pos_label - assert_raises(ValueError, sensitivity_specificity_support, - y_true, y_pred, pos_label=2, average='binary') + assert_raises( + ValueError, + sensitivity_specificity_support, + y_true, + y_pred, + pos_label=2, + average='binary') # Bad average option - assert_raises(ValueError, sensitivity_specificity_support, - [0, 1, 2], [1, 2, 0], average='mega') + assert_raises( + ValueError, + sensitivity_specificity_support, [0, 1, 2], [1, 2, 0], + average='mega') def test_sensitivity_specificity_unused_pos_label(): # Check warning that pos_label unused when set to non-default value # but average != 'binary'; even if data is binary. - assert_warns_message(UserWarning, - "Note that pos_label (set to 2) is " - "ignored when average != 'binary' (got 'macro'). 
You " - "may use labels=[pos_label] to specify a single " - "positive class.", sensitivity_specificity_support, - [1, 2, 1], [1, 2, 2], pos_label=2, average='macro') - - -def test_sensitivity_specificity_multiclass(): - # Test sensitivity and specificity for multiclass classification task - y_true, y_pred, _ = make_prediction(binary=False) - - # compute scores with default labels introspection - sens, spec, supp = sensitivity_specificity_support(y_true, y_pred, - average=None) - assert_array_almost_equal(spec, [0.92, 0.86, 0.55], 2) - assert_array_almost_equal(sens, [0.79, 0.09, 0.90], 2) - assert_array_equal(supp, [24, 31, 20]) - - # averaging tests - sens = sensitivity_score(y_true, y_pred, average='macro') - assert_array_almost_equal(sens, 0.60, 2) - - spec = specificity_score(y_true, y_pred, average='weighted') - assert_array_almost_equal(spec, 0.80, 2) - - sens = sensitivity_score(y_true, y_pred, average='weighted') - assert_array_almost_equal(sens, 0.53, 2) - - assert_raises(ValueError, sensitivity_score, y_true, y_pred, - average="samples") - assert_raises(ValueError, specificity_score, y_true, y_pred, - average="samples") - - # same prediction but with and explicit label ordering - sens, spec, supp = sensitivity_specificity_support( - y_true, y_pred, labels=[0, 2, 1], average=None) - assert_array_almost_equal(spec, [0.92, 0.55, 0.86], 2) - assert_array_almost_equal(sens, [0.79, 0.90, 0.10], 2) - assert_array_equal(supp, [24, 20, 31]) + assert_warns_message( + UserWarning, + "Note that pos_label (set to 2) is " + "ignored when average != 'binary' (got 'macro'). You " + "may use labels=[pos_label] to specify a single " + "positive class.", + sensitivity_specificity_support, [1, 2, 1], [1, 2, 2], + pos_label=2, + average='macro') def test_geometric_mean_support_binary(): From 0674ada7218a699d1e06bbc9c709e65c68afc9b2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 23 Dec 2016 01:22:28 +0100 Subject: [PATCH 10/21] Added the IBA metric --- imblearn/metrics/classification.py | 76 ++++++++++++++++++++++++++---- 1 file changed, 68 insertions(+), 8 deletions(-) diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index e9516ed0f..20ffd4652 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -198,11 +198,6 @@ def sensitivity_specificity_support(y_true, pred_sum = pred_sum[indices] tn_sum = tn_sum[indices] - LOGGER.debug('tp: %s' % tp_sum) - LOGGER.debug('tn: %s' % tn_sum) - LOGGER.debug('pred_sum: %s' % pred_sum) - LOGGER.debug('true_sum: %s' % true_sum) - if average == 'micro': tp_sum = np.array([tp_sum.sum()]) pred_sum = np.array([pred_sum.sum()]) @@ -233,9 +228,6 @@ def sensitivity_specificity_support(y_true, else: weights = None - LOGGER.debug(specificity) - LOGGER.debug(weights) - if average is not None: assert average != 'binary' or len(specificity) == 1 specificity = np.average(specificity, weights=weights) @@ -509,3 +501,71 @@ def geometric_mean_score(y_true, LOGGER.debug('The sensitivity and specificity are : %s - %s' % (sen, spe)) return np.sqrt(sen * spe) + + +def indexed_balanced_accuracy_score(score_func, + y_true, + y_pred, + alpha=0.1, + squared=True, + **kwargs): + """ Compute the indexed balanced accuracy of a scoring function + + The indexed balanced accuracy (IBA) tends to weight a scoring function + to take into account the imbalancing of the data. + + Parameters + ---------- + score_func : callable, + Score function (or loss function) with signature + ``score_func(y, y_pred, **kwargs)``. 
+ + y_true : ndarray, shape (n_samples, ) + Ground truth (correct) target values. + + y_pred : ndarray, shape (n_samples, ) + Estimated targets as returned by a classifier. + + alpha : float, optional (default=0.1) + Weighting factor. + + squared : bool, optional (default=True) + If ``squared`` is True, then the metric computed will be squared + before to be weighted. + + **kwargs : additional arguments + Additional parameters to be passed to score_func. + + Returns + ------- + iba : float (if ``average`` = None) or ndarray, \ + shape (n_unique_labels, ) + + References + ---------- + .. [1] Garcia, V. and Mollineda, R.A. and Sanchez, J.S. "Theoretical + analysis of a performance measure for imbalanced data" ICPR (2010) + """ + + score = score_func(**kwargs) + + if squared: + score = np.power(score, 2) + + # Pop the arguments to have the proper average, etc. for the + # sensitivity and specificity + labels = kwargs.get('labels', None) + pos_label = kwargs.get('pos_label', 1) + average = kwargs.get('average', 'binary') + sample_weight = kwargs.get('sample_weight', None) + + # Compute the sensitivity and specificity + sen = sensitivity_score(y_true, y_pred, labels, pos_label, average, + sample_weight) + spe = specificity_score(y_true, y_pred, labels, pos_label, average, + sample_weight) + + # Compute the dominance + dom = sen - spe + + return (1. + alpha * dom) * score From 21d2c7d5099ebbd8c8e4866a73176c4a78255777 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 23 Dec 2016 01:31:07 +0100 Subject: [PATCH 11/21] Add a single test for IBA --- imblearn/metrics/__init__.py | 8 ++++---- imblearn/metrics/classification.py | 2 +- imblearn/metrics/tests/test_classification.py | 11 +++++++++++ 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/imblearn/metrics/__init__.py b/imblearn/metrics/__init__.py index 13a17ae9d..951829792 100644 --- a/imblearn/metrics/__init__.py +++ b/imblearn/metrics/__init__.py @@ -7,10 +7,10 @@ from .classification import sensitivity_score from .classification import specificity_score from .classification import geometric_mean_score +from .classification import indexed_balanced_accuracy_score __all__ = [ - 'sensitivity_specificity_support', - 'sensitivity_score', - 'specificity_score', - 'geometric_mean_score' + 'sensitivity_specificity_support', 'sensitivity_score', + 'specificity_score', 'geometric_mean_score', + 'indexed_balanced_accuracy_score' ] diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index 20ffd4652..6964a7d93 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -547,7 +547,7 @@ def indexed_balanced_accuracy_score(score_func, analysis of a performance measure for imbalanced data" ICPR (2010) """ - score = score_func(**kwargs) + score = score_func(y_true, y_pred, **kwargs) if squared: score = np.power(score, 2) diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index 02c2edb53..5d0bb94ce 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -22,6 +22,7 @@ from imblearn.metrics import sensitivity_score from imblearn.metrics import specificity_score from imblearn.metrics import geometric_mean_score +from imblearn.metrics import indexed_balanced_accuracy_score RND_SEED = 42 @@ -232,3 +233,13 @@ def test_geometric_mean_multiclass(): geo_mean = geometric_mean_score(y_true, y_pred, average='weighted') assert_array_almost_equal(geo_mean, 0.65, 2) + + +def 
test_iba_geo_mean_binary(): + """Test to test the iba using the geometric mean""" + y_true, y_pred, _ = make_prediction(binary=True) + + iba = indexed_balanced_accuracy_score( + geometric_mean_score, y_true, y_pred, alpha=0.5, squared=True) + + assert_almost_equal(iba, 0.54, 2) From 3a8fa2117a0b7e6e53129324154c06704cce8e44 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 23 Dec 2016 01:38:35 +0100 Subject: [PATCH 12/21] Update the doc --- doc/api.rst | 20 ++++++++++++++++++++ doc/whats_new.rst | 1 + 2 files changed, 21 insertions(+) diff --git a/doc/api.rst b/doc/api.rst index 77e96457b..272c7d22e 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -119,6 +119,26 @@ Functions pipeline.make_pipeline +.. _metrics_ref: + +Metrics +======= + +.. automodule:: imblearn.metrics + :no-members: + :no-inherited-members: + +.. currentmodule:: imblearn + +Functions +--------- +.. autosummary:: +:toctree: generated/ + metrics.sensitivity_specificity_support + metrics.sensitivity_score + metrics.specificity_score + metrics.geometric_mean_score + metrics.indexed_balanced_accuracy_score .. _datasets_ref: diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 40095d88d..bfe3c81a6 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -28,6 +28,7 @@ New features ~~~~~~~~~~~~ - Added AllKNN under sampling technique. By `Dayvid Oliveira`_. +- Added a module `metrics` implementing some specific scoring function for the problem of balancing. By `Guillaume Lemaitre`_ and `Christos Aridas`_. Enhancement ~~~~~~~~~~~ From f72fa66d83066ef23b5fda6603be811358c04008 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 23 Dec 2016 01:57:43 +0100 Subject: [PATCH 13/21] remove useless import --- imblearn/metrics/classification.py | 1 - 1 file changed, 1 deletion(-) diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index 6964a7d93..cc40bfc3e 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -19,7 +19,6 @@ from sklearn.preprocessing import LabelEncoder from sklearn.utils.fixes import bincount from sklearn.utils.multiclass import unique_labels -from sklearn.utils.sparsefuncs import count_nonzero LOGGER = logging.getLogger(__name__) From 17b322e1238c3acb10fece052c6da11f2329e7ae Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 23 Dec 2016 16:14:55 +0100 Subject: [PATCH 14/21] Change IBA to be a decorator instead of a score --- imblearn/metrics/__init__.py | 5 +- imblearn/metrics/classification.py | 201 ++++++++++++++---- imblearn/metrics/tests/test_classification.py | 161 ++++++++++++-- 3 files changed, 306 insertions(+), 61 deletions(-) diff --git a/imblearn/metrics/__init__.py b/imblearn/metrics/__init__.py index 951829792..5cbe3b2f1 100644 --- a/imblearn/metrics/__init__.py +++ b/imblearn/metrics/__init__.py @@ -7,10 +7,11 @@ from .classification import sensitivity_score from .classification import specificity_score from .classification import geometric_mean_score -from .classification import indexed_balanced_accuracy_score +from .classification import make_indexed_balanced_accuracy +from .classification import classification_report_imbalanced __all__ = [ 'sensitivity_specificity_support', 'sensitivity_score', 'specificity_score', 'geometric_mean_score', - 'indexed_balanced_accuracy_score' + 'make_indexed_balanced_accuracy', 'classification_report_imbalanced' ] diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index cc40bfc3e..dcc13f82a 100644 --- a/imblearn/metrics/classification.py 
+++ b/imblearn/metrics/classification.py @@ -12,10 +12,12 @@ import warnings import logging +import functools import numpy as np -from sklearn.metrics.classification import _check_targets, _prf_divide +from sklearn.metrics.classification import (_check_targets, _prf_divide, + precision_recall_fscore_support) from sklearn.preprocessing import LabelEncoder from sklearn.utils.fixes import bincount from sklearn.utils.multiclass import unique_labels @@ -502,29 +504,15 @@ def geometric_mean_score(y_true, return np.sqrt(sen * spe) -def indexed_balanced_accuracy_score(score_func, - y_true, - y_pred, - alpha=0.1, - squared=True, - **kwargs): - """ Compute the indexed balanced accuracy of a scoring function +def make_indexed_balanced_accuracy(alpha=0.1, squared=True): + """Balance any scoring function using the indexed balanced accuracy - The indexed balanced accuracy (IBA) tends to weight a scoring function - to take into account the imbalancing of the data. + This factory function wraps scoring function to express it as the + indexed balanced accuracy (IBA). You need to use this function to + decorate any scoring function. Parameters ---------- - score_func : callable, - Score function (or loss function) with signature - ``score_func(y, y_pred, **kwargs)``. - - y_true : ndarray, shape (n_samples, ) - Ground truth (correct) target values. - - y_pred : ndarray, shape (n_samples, ) - Estimated targets as returned by a classifier. - alpha : float, optional (default=0.1) Weighting factor. @@ -532,39 +520,160 @@ def indexed_balanced_accuracy_score(score_func, If ``squared`` is True, then the metric computed will be squared before to be weighted. - **kwargs : additional arguments - Additional parameters to be passed to score_func. - Returns ------- - iba : float (if ``average`` = None) or ndarray, \ - shape (n_unique_labels, ) + iba_scoring_func : callable, + Returns the scoring metric decorated which will automatically compute + the indexed balanced accuracy. + + Examples + -------- + >>> from imblearn.metrics import geometric_mean_score as gmean + >>> from imblearn.metrics import make_indexed_balanced_accuracy as iba + >>> gmean = iba(alpha=0.1, squared=True)(gmean) + >>> y_true = [1, 0, 0, 1, 0, 1] + >>> y_pred = [0, 0, 1, 1, 0, 1] + >>> print(gmean(y_true, y_pred, average=None)) + [ 0.44444444 0.44444444] + """ + def decorate(scoring_func): + @functools.wraps(scoring_func) + def compute_score(*args, **kwargs): + # Compute the score from the scoring function + _score = scoring_func(*args, **kwargs) + # Square if desired + if squared: + _score = np.power(_score, 2) + # args will contain the y_pred and y_true + # kwargs will contain the other parameters + labels = kwargs.get('labels', None) + pos_label = kwargs.get('pos_label', 1) + average = kwargs.get('average', 'binary') + sample_weight = kwargs.get('sample_weight', None) + # Compute the sensitivity and specificity + dict_sen_spe = {'labels': labels, 'pos_label': pos_label, + 'average': average, 'sample_weight': sample_weight} + sen, spe, _ = sensitivity_specificity_support(*args, + **dict_sen_spe) + # Compute the dominance + dom = sen - spe + return (1. + alpha * dom) * _score + return compute_score + return decorate + + +def classification_report_imbalanced(y_true, + y_pred, + labels=None, + target_names=None, + sample_weight=None, + digits=2, + alpha=0.1): + """Build a classification report based on metrics used with imbalanced + dataset + + Specific metrics have been proposed to evaluate the classification + performed on imbalanced dataset. 
This report compiles the + state-of-the-art metrics: precision/recall/specificity, geometric + mean, and indexed balanced accuracy of the + geometric mean. - References + Parameters ---------- - .. [1] Garcia, V. and Mollineda, R.A. and Sanchez, J.S. "Theoretical - analysis of a performance measure for imbalanced data" ICPR (2010) - """ + y_true : ndarray, shape (n_samples, ) + Ground truth (correct) target values. + + y_pred : ndarray, shape (n_samples, ) + Estimated targets as returned by a classifier. - score = score_func(y_true, y_pred, **kwargs) + labels : list, optional + The set of labels to include when ``average != 'binary'``, and their + order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. - if squared: - score = np.power(score, 2) + target_names : list of strings, optional + Optional display names matching the labels (same order). - # Pop the arguments to have the proper average, etc. for the - # sensitivity and specificity - labels = kwargs.get('labels', None) - pos_label = kwargs.get('pos_label', 1) - average = kwargs.get('average', 'binary') - sample_weight = kwargs.get('sample_weight', None) + sample_weight : ndarray, shape (n_samples, ) + Sample weights. - # Compute the sensitivity and specificity - sen = sensitivity_score(y_true, y_pred, labels, pos_label, average, - sample_weight) - spe = specificity_score(y_true, y_pred, labels, pos_label, average, - sample_weight) + digits : int, optional (default=2) + Number of digits for formatting output floating point values - # Compute the dominance - dom = sen - spe + alpha : float, optional (default=0.1) + Weighting factor. - return (1. + alpha * dom) * score + Returns + ------- + report : string + Text summary of the precision, recall, specificity, geometric mean, + and indexed balanced accuracy. 
+ + """ + + if labels is None: + labels = unique_labels(y_true, y_pred) + else: + labels = np.asarray(labels) + + last_line_heading = 'avg / total' + + if target_names is None: + target_names = ['%s' % l for l in labels] + name_width = max(len(cn) for cn in target_names) + width = max(name_width, len(last_line_heading), digits) + + headers = ["pre", "rec", "spe", "f1", + "geo", "iba", "sup"] + fmt = '%% %ds' % width # first column: class name + fmt += ' ' + fmt += ' '.join(['% 9s' for _ in headers]) + fmt += '\n' + + headers = [""] + headers + report = fmt % tuple(headers) + report += '\n' + + # Compute the different metrics + # Precision/recall/f1 + precision, recall, f1, support = precision_recall_fscore_support( + y_true, y_pred, + labels=labels, + average=None, + sample_weight=sample_weight) + # Specificity + specificity = specificity_score(y_true, y_pred, labels=labels, + average=None, sample_weight=sample_weight) + # Geometric mean + geo_mean = geometric_mean_score(y_pred, y_true, labels=labels, + average=None, sample_weight=sample_weight) + # Indexed balanced accuracy + iba_gmean = make_indexed_balanced_accuracy(alpha=alpha, squared=True)( + geometric_mean_score) + iba = iba_gmean(y_pred, y_true, labels=labels, average=None, + sample_weight=sample_weight) + + for i, label in enumerate(labels): + values = [target_names[i]] + for v in (precision[i], recall[i], specificity[i], + f1[i], geo_mean[i], iba[i]): + values += ["{0:0.{1}f}".format(v, digits)] + values += ["{0}".format(support[i])] + report += fmt % tuple(values) + + report += '\n' + + # compute averages + values = [last_line_heading] + for v in (np.average(precision, weights=support), + np.average(recall, weights=support), + np.average(specificity, weights=support), + np.average(f1, weights=support), + np.average(geo_mean, weights=support), + np.average(iba, weights=support)): + values += ["{0:0.{1}f}".format(v, digits)] + values += ['{0}'.format(np.sum(support))] + report += fmt % tuple(values) + return report diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index 5d0bb94ce..af021c0cb 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -2,6 +2,8 @@ from __future__ import division, print_function +import re + from functools import partial import numpy as np @@ -15,14 +17,16 @@ from sklearn import svm from sklearn.preprocessing import label_binarize -from sklearn.utils.testing import assert_not_equal +from sklearn.utils.fixes import np_version +from sklearn.utils.testing import assert_not_equal, assert_raise_message from sklearn.utils.validation import check_random_state from imblearn.metrics import sensitivity_specificity_support from imblearn.metrics import sensitivity_score from imblearn.metrics import specificity_score from imblearn.metrics import geometric_mean_score -from imblearn.metrics import indexed_balanced_accuracy_score +from imblearn.metrics import make_indexed_balanced_accuracy +from imblearn.metrics import classification_report_imbalanced RND_SEED = 42 @@ -79,7 +83,7 @@ def make_prediction(dataset=None, binary=False): def test_sensitivity_specificity_score_binary(): - # Test Sensitivity Specificity for binary classification task + """Test Sensitivity Specificity for binary classification task""" y_true, y_pred, _ = make_prediction(binary=True) # detailed measures for each class @@ -103,8 +107,8 @@ def test_sensitivity_specificity_score_binary(): def 
test_sensitivity_specificity_f_binary_single_class(): - # Test sensitivity and specificity behave with a single positive or - # negative class + """Test sensitivity and specificity behave with a single positive or + negative class""" # Such a case may occur with non-stratified cross-validation assert_equal(1., sensitivity_score([1, 1], [1, 1])) assert_equal(0., specificity_score([1, 1], [1, 1])) @@ -115,7 +119,7 @@ def test_sensitivity_specificity_f_binary_single_class(): @ignore_warnings def test_sensitivity_specificity_extra_labels(): - # Test handling of explicit additional (not in input) labels to SS + """Test handling of explicit additional (not in input) labels to SS""" y_true = [1, 3, 3, 2] y_pred = [1, 1, 3, 2] @@ -142,7 +146,7 @@ def test_sensitivity_specificity_extra_labels(): @ignore_warnings def test_sensitivity_specificity_ignored_labels(): - # Test a subset of labels may be requested for SS + """Test a subset of labels may be requested for SS""" y_true = [1, 1, 2, 3] y_pred = [1, 3, 3, 3] @@ -166,7 +170,7 @@ def test_sensitivity_specificity_ignored_labels(): def test_sensitivity_specificity_error_multilabels(): - # Test either if an error is raised when the input are multilabels + """Test either if an error is raised when the input are multilabels""" y_true = [1, 3, 3, 2] y_pred = [1, 1, 3, 2] y_true_bin = label_binarize(y_true, classes=np.arange(5)) @@ -177,6 +181,7 @@ def test_sensitivity_specificity_error_multilabels(): @ignore_warnings def test_sensitivity_specificity_support_errors(): + """Test either if an error is raised depending on parameters""" y_true, y_pred, _ = make_prediction(binary=True) # Bad pos_label @@ -196,8 +201,8 @@ def test_sensitivity_specificity_support_errors(): def test_sensitivity_specificity_unused_pos_label(): - # Check warning that pos_label unused when set to non-default value - # but average != 'binary'; even if data is binary. 
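As an aside to the test rewritten in the hunk just below: a minimal, illustrative sketch of the behaviour it asserts, assuming the `imblearn.metrics` module built up in this series is importable. This snippet is not part of the patch; the input arrays are made up for demonstration.

import warnings

from imblearn.metrics import sensitivity_score

y_true = [0, 1, 1, 0]
y_pred = [0, 1, 0, 0]

# pos_label=2 is ignored because average='macro'; the metric still computes,
# but a UserWarning explains that labels=[2] would be needed to target one class.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    score = sensitivity_score(y_true, y_pred, pos_label=2, average='macro')

print(score)                                         # macro-averaged sensitivity
print(caught[0].category.__name__, caught[0].message)  # the pos_label warning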
+ """Check warning that pos_label unused when set to non-default value + # but average != 'binary'; even if data is binary""" assert_warns_message( UserWarning, "Note that pos_label (set to 2) is " @@ -220,7 +225,7 @@ def test_geometric_mean_support_binary(): def test_geometric_mean_multiclass(): - # Test geometric mean for multiclass classification task + """Test geometric mean for multiclass classification task""" y_true, y_pred, _ = make_prediction(binary=False) # Compute the geometric mean for each of the classes @@ -239,7 +244,137 @@ def test_iba_geo_mean_binary(): """Test to test the iba using the geometric mean""" y_true, y_pred, _ = make_prediction(binary=True) - iba = indexed_balanced_accuracy_score( - geometric_mean_score, y_true, y_pred, alpha=0.5, squared=True) + iba_gmean = make_indexed_balanced_accuracy(alpha=0.5, squared=True)( + geometric_mean_score) + iba = iba_gmean(y_true, y_pred) assert_almost_equal(iba, 0.54, 2) + +def _format_report(report): + """Private function to reformat the report for testing""" + + return ' '.join(report.split()) + + +def test_classification_report_imbalanced_multiclass(): + """Test classification report for multiclass problem""" + iris = datasets.load_iris() + y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) + + # print classification report with class names + expected_report = ("pre rec spe f1 geo iba sup setosa 0.83 0.79 0.92 0.81 " + "0.86 0.72 24 versicolor 0.33 0.10 0.86 0.15 0.44 " + "0.08 31 virginica 0.42 0.90 0.55 0.57 0.63 0.51 20 " + "avg / total 0.51 0.53 0.80 0.47 0.62 0.40 75") + + report = classification_report_imbalanced( + y_true, y_pred, labels=np.arange(len(iris.target_names)), + target_names=iris.target_names) + assert_equal(_format_report(report), expected_report) + # print classification report with label detection + expected_report = ("pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 " + "0.86 0.72 24 1 0.33 0.10 0.86 0.15 0.44 0.08 31 2 " + "0.42 0.90 0.55 0.57 0.63 0.51 20 avg / total 0.51 " + "0.53 0.80 0.47 0.62 0.40 75") + + report = classification_report_imbalanced(y_true, y_pred) + assert_equal(_format_report(report), expected_report) + + +def test_classification_report_imbalanced_multiclass_with_digits(): + """Test performance report with added digits in floating point values""" + iris = datasets.load_iris() + y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) + + # print classification report with class names + expected_report = ("pre rec spe f1 geo iba sup setosa 0.82609 0.79167 " + "0.92157 0.80851 0.86409 0.72010 24 versicolor 0.33333 " + "0.09677 0.86364 0.15000 0.43809 0.07717 31 virginica " + "0.41860 0.90000 0.54545 0.57143 0.62645 0.50831 20 " + "avg / total 0.51375 0.53333 0.79733 0.47310 0.62464 " + "0.39788 75") + report = classification_report_imbalanced( + y_true, y_pred, labels=np.arange(len(iris.target_names)), + target_names=iris.target_names, digits=5) + assert_equal(_format_report(report), expected_report) + # print classification report with label detection + expected_report = ("pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 " + "0.86 0.72 24 1 0.33 0.10 0.86 0.15 0.44 0.08 31 " + "2 0.42 0.90 0.55 0.57 0.63 0.51 20 " + "avg / total 0.51 0.53 0.80 0.47 0.62 0.40 75") + report = classification_report_imbalanced(y_true, y_pred) + assert_equal(_format_report(report), expected_report) + + +def test_classification_report_imbalanced_multiclass_with_string_label(): + """Test the report with string label""" + y_true, y_pred, _ = make_prediction(binary=False) + + y_true = 
np.array(["blue", "green", "red"])[y_true] + y_pred = np.array(["blue", "green", "red"])[y_pred] + + expected_report = """\ + precision recall f1-score support + blue 0.83 0.79 0.81 24 + green 0.33 0.10 0.15 31 + red 0.42 0.90 0.57 20 +avg / total 0.51 0.53 0.47 75 +""" + report = classification_report_imbalanced(y_true, y_pred) + assert_equal(report, expected_report) + + expected_report = """\ + precision recall f1-score support + a 0.83 0.79 0.81 24 + b 0.33 0.10 0.15 31 + c 0.42 0.90 0.57 20 +avg / total 0.51 0.53 0.47 75 +""" + report = classification_report_imbalanced(y_true, y_pred, + target_names=["a", "b", "c"]) + assert_equal(report, expected_report) + + +def test_classification_report_imbalanced_multiclass_with_unicode_label(): + """Test classification report with unicode label""" + y_true, y_pred, _ = make_prediction(binary=False) + + labels = np.array([u"blue\xa2", u"green\xa2", u"red\xa2"]) + y_true = labels[y_true] + y_pred = labels[y_pred] + + expected_report = u"""\ + precision recall f1-score support + blue\xa2 0.83 0.79 0.81 24 + green\xa2 0.33 0.10 0.15 31 + red\xa2 0.42 0.90 0.57 20 +avg / total 0.51 0.53 0.47 75 +""" + if np_version[:3] < (1, 7, 0): + expected_message = ("NumPy < 1.7.0 does not implement" + " searchsorted on unicode data correctly.") + assert_raise_message(RuntimeError, expected_message, + classification_report_imbalanced, y_true, y_pred) + else: + report = classification_report_imbalanced(y_true, y_pred) + assert_equal(report, expected_report) + + +def test_classification_report_imbalanced_multiclass_with_long_string_label(): + """Test classification report with long string label""" + y_true, y_pred, _ = make_prediction(binary=False) + + labels = np.array(["blue", "green"*5, "red"]) + y_true = labels[y_true] + y_pred = labels[y_pred] + + expected_report = """\ + precision recall f1-score support + blue 0.83 0.79 0.81 24 +greengreengreengreengreen 0.33 0.10 0.15 31 + red 0.42 0.90 0.57 20 + avg / total 0.51 0.53 0.47 75 +""" + + report = classification_report_imbalanced(y_true, y_pred) + assert_equal(report, expected_report) From 154749e638f173620163f433c502c496916d6756 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 23 Dec 2016 18:46:46 +0100 Subject: [PATCH 15/21] Implemented the test for the report --- doc/api.rst | 2 +- imblearn/metrics/tests/test_classification.py | 110 +++++++++--------- 2 files changed, 53 insertions(+), 59 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 272c7d22e..faec5f92b 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -138,7 +138,7 @@ Functions metrics.sensitivity_score metrics.specificity_score metrics.geometric_mean_score - metrics.indexed_balanced_accuracy_score + metrics.make_indexed_balanced_accuracy .. 
_datasets_ref: diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index af021c0cb..8142937d4 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -244,12 +244,13 @@ def test_iba_geo_mean_binary(): """Test to test the iba using the geometric mean""" y_true, y_pred, _ = make_prediction(binary=True) - iba_gmean = make_indexed_balanced_accuracy(alpha=0.5, squared=True)( - geometric_mean_score) + iba_gmean = make_indexed_balanced_accuracy( + alpha=0.5, squared=True)(geometric_mean_score) iba = iba_gmean(y_true, y_pred) assert_almost_equal(iba, 0.54, 2) + def _format_report(report): """Private function to reformat the report for testing""" @@ -262,20 +263,22 @@ def test_classification_report_imbalanced_multiclass(): y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) # print classification report with class names - expected_report = ("pre rec spe f1 geo iba sup setosa 0.83 0.79 0.92 0.81 " - "0.86 0.72 24 versicolor 0.33 0.10 0.86 0.15 0.44 " - "0.08 31 virginica 0.42 0.90 0.55 0.57 0.63 0.51 20 " - "avg / total 0.51 0.53 0.80 0.47 0.62 0.40 75") + expected_report = ('pre rec spe f1 geo iba sup setosa 0.83 0.79 0.92 ' + '0.81 0.86 0.74 24 versicolor 0.33 0.10 0.86 0.15 ' + '0.44 0.19 31 virginica 0.42 0.90 0.55 0.57 0.63 ' + '0.37 20 avg / total 0.51 0.53 0.80 0.47 0.62 0.41 75') report = classification_report_imbalanced( - y_true, y_pred, labels=np.arange(len(iris.target_names)), + y_true, + y_pred, + labels=np.arange(len(iris.target_names)), target_names=iris.target_names) assert_equal(_format_report(report), expected_report) # print classification report with label detection - expected_report = ("pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 " - "0.86 0.72 24 1 0.33 0.10 0.86 0.15 0.44 0.08 31 2 " - "0.42 0.90 0.55 0.57 0.63 0.51 20 avg / total 0.51 " - "0.53 0.80 0.47 0.62 0.40 75") + expected_report = ('pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 ' + '0.86 0.74 24 1 0.33 0.10 0.86 0.15 0.44 0.19 31 2 ' + '0.42 0.90 0.55 0.57 0.63 0.37 20 avg / total 0.51 ' + '0.53 0.80 0.47 0.62 0.41 75') report = classification_report_imbalanced(y_true, y_pred) assert_equal(_format_report(report), expected_report) @@ -287,21 +290,24 @@ def test_classification_report_imbalanced_multiclass_with_digits(): y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) # print classification report with class names - expected_report = ("pre rec spe f1 geo iba sup setosa 0.82609 0.79167 " - "0.92157 0.80851 0.86409 0.72010 24 versicolor 0.33333 " - "0.09677 0.86364 0.15000 0.43809 0.07717 31 virginica " - "0.41860 0.90000 0.54545 0.57143 0.62645 0.50831 20 " - "avg / total 0.51375 0.53333 0.79733 0.47310 0.62464 " - "0.39788 75") + expected_report = ('pre rec spe f1 geo iba sup setosa 0.82609 0.79167 ' + '0.92157 0.80851 0.86409 0.74085 24 versicolor ' + '0.33333 0.09677 0.86364 0.15000 0.43809 0.18727 31 ' + 'virginica 0.41860 0.90000 0.54545 0.57143 0.62645 ' + '0.37208 20 avg / total 0.51375 0.53333 0.79733 ' + '0.47310 0.62464 0.41370 75') report = classification_report_imbalanced( - y_true, y_pred, labels=np.arange(len(iris.target_names)), - target_names=iris.target_names, digits=5) + y_true, + y_pred, + labels=np.arange(len(iris.target_names)), + target_names=iris.target_names, + digits=5) assert_equal(_format_report(report), expected_report) # print classification report with label detection - expected_report = ("pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 " - 
"0.86 0.72 24 1 0.33 0.10 0.86 0.15 0.44 0.08 31 " - "2 0.42 0.90 0.55 0.57 0.63 0.51 20 " - "avg / total 0.51 0.53 0.80 0.47 0.62 0.40 75") + expected_report = ('pre rec spe f1 geo iba sup 0 0.83 0.79 0.92 0.81 ' + '0.86 0.74 24 1 0.33 0.10 0.86 0.15 0.44 0.19 31 2 ' + '0.42 0.90 0.55 0.57 0.63 0.37 20 avg / total 0.51 ' + '0.53 0.80 0.47 0.62 0.41 75') report = classification_report_imbalanced(y_true, y_pred) assert_equal(_format_report(report), expected_report) @@ -313,26 +319,20 @@ def test_classification_report_imbalanced_multiclass_with_string_label(): y_true = np.array(["blue", "green", "red"])[y_true] y_pred = np.array(["blue", "green", "red"])[y_pred] - expected_report = """\ - precision recall f1-score support - blue 0.83 0.79 0.81 24 - green 0.33 0.10 0.15 31 - red 0.42 0.90 0.57 20 -avg / total 0.51 0.53 0.47 75 -""" + expected_report = ('pre rec spe f1 geo iba sup blue 0.83 0.79 0.92 ' + '0.81 0.86 0.74 24 green 0.33 0.10 0.86 0.15 0.44 ' + '0.19 31 red 0.42 0.90 0.55 0.57 0.63 0.37 20 ' + 'avg / total 0.51 0.53 0.80 0.47 0.62 0.41 75') report = classification_report_imbalanced(y_true, y_pred) - assert_equal(report, expected_report) + assert_equal(_format_report(report), expected_report) - expected_report = """\ - precision recall f1-score support - a 0.83 0.79 0.81 24 - b 0.33 0.10 0.15 31 - c 0.42 0.90 0.57 20 -avg / total 0.51 0.53 0.47 75 -""" - report = classification_report_imbalanced(y_true, y_pred, - target_names=["a", "b", "c"]) - assert_equal(report, expected_report) + expected_report = ('pre rec spe f1 geo iba sup a 0.83 0.79 0.92 0.81 ' + '0.86 0.74 24 b 0.33 0.10 0.86 0.15 0.44 0.19 31 ' + 'c 0.42 0.90 0.55 0.57 0.63 0.37 20 avg / total ' + '0.51 0.53 0.80 0.47 0.62 0.41 75') + report = classification_report_imbalanced( + y_true, y_pred, target_names=["a", "b", "c"]) + assert_equal(_format_report(report), expected_report) def test_classification_report_imbalanced_multiclass_with_unicode_label(): @@ -343,13 +343,10 @@ def test_classification_report_imbalanced_multiclass_with_unicode_label(): y_true = labels[y_true] y_pred = labels[y_pred] - expected_report = u"""\ - precision recall f1-score support - blue\xa2 0.83 0.79 0.81 24 - green\xa2 0.33 0.10 0.15 31 - red\xa2 0.42 0.90 0.57 20 -avg / total 0.51 0.53 0.47 75 -""" + expected_report = (u'pre rec spe f1 geo iba sup blue\xa2 0.83 0.79 ' + u'0.92 0.81 0.86 0.74 24 green\xa2 0.33 0.10 0.86 ' + u'0.15 0.44 0.19 31 red\xa2 0.42 0.90 0.55 0.57 0.63 ' + u'0.37 20 avg / total 0.51 0.53 0.80 0.47 0.62 0.41 75') if np_version[:3] < (1, 7, 0): expected_message = ("NumPy < 1.7.0 does not implement" " searchsorted on unicode data correctly.") @@ -357,24 +354,21 @@ def test_classification_report_imbalanced_multiclass_with_unicode_label(): classification_report_imbalanced, y_true, y_pred) else: report = classification_report_imbalanced(y_true, y_pred) - assert_equal(report, expected_report) + assert_equal(_format_report(report), expected_report) def test_classification_report_imbalanced_multiclass_with_long_string_label(): """Test classification report with long string label""" y_true, y_pred, _ = make_prediction(binary=False) - labels = np.array(["blue", "green"*5, "red"]) + labels = np.array(["blue", "green" * 5, "red"]) y_true = labels[y_true] y_pred = labels[y_pred] - expected_report = """\ - precision recall f1-score support - blue 0.83 0.79 0.81 24 -greengreengreengreengreen 0.33 0.10 0.15 31 - red 0.42 0.90 0.57 20 - avg / total 0.51 0.53 0.47 75 -""" + expected_report = ('pre rec spe f1 geo iba sup blue 0.83 0.79 0.92 
0.81 ' + '0.86 0.74 24 greengreengreengreengreen 0.33 0.10 ' + '0.86 0.15 0.44 0.19 31 red 0.42 0.90 0.55 0.57 0.63 ' + '0.37 20 avg / total 0.51 0.53 0.80 0.47 0.62 0.41 75') report = classification_report_imbalanced(y_true, y_pred) - assert_equal(report, expected_report) + assert_equal(_format_report(report), expected_report) From 3da16046539d546bf145bd6127cc02eefaa3d11c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 23 Dec 2016 18:56:28 +0100 Subject: [PATCH 16/21] PEP8 --- imblearn/metrics/classification.py | 62 ++++++++++++------- imblearn/metrics/tests/test_classification.py | 2 - 2 files changed, 41 insertions(+), 23 deletions(-) diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index dcc13f82a..f3b7a58f4 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -536,6 +536,7 @@ def make_indexed_balanced_accuracy(alpha=0.1, squared=True): >>> print(gmean(y_true, y_pred, average=None)) [ 0.44444444 0.44444444] """ + def decorate(scoring_func): @functools.wraps(scoring_func) def compute_score(*args, **kwargs): @@ -551,14 +552,20 @@ def compute_score(*args, **kwargs): average = kwargs.get('average', 'binary') sample_weight = kwargs.get('sample_weight', None) # Compute the sensitivity and specificity - dict_sen_spe = {'labels': labels, 'pos_label': pos_label, - 'average': average, 'sample_weight': sample_weight} + dict_sen_spe = { + 'labels': labels, + 'pos_label': pos_label, + 'average': average, + 'sample_weight': sample_weight + } sen, spe, _ = sensitivity_specificity_support(*args, **dict_sen_spe) # Compute the dominance dom = sen - spe return (1. + alpha * dom) * _score + return compute_score + return decorate @@ -625,8 +632,7 @@ def classification_report_imbalanced(y_true, name_width = max(len(cn) for cn in target_names) width = max(name_width, len(last_line_heading), digits) - headers = ["pre", "rec", "spe", "f1", - "geo", "iba", "sup"] + headers = ["pre", "rec", "spe", "f1", "geo", "iba", "sup"] fmt = '%% %ds' % width # first column: class name fmt += ' ' fmt += ' '.join(['% 9s' for _ in headers]) @@ -639,26 +645,39 @@ def classification_report_imbalanced(y_true, # Compute the different metrics # Precision/recall/f1 precision, recall, f1, support = precision_recall_fscore_support( - y_true, y_pred, + y_true, + y_pred, labels=labels, average=None, sample_weight=sample_weight) # Specificity - specificity = specificity_score(y_true, y_pred, labels=labels, - average=None, sample_weight=sample_weight) + specificity = specificity_score( + y_true, + y_pred, + labels=labels, + average=None, + sample_weight=sample_weight) # Geometric mean - geo_mean = geometric_mean_score(y_pred, y_true, labels=labels, - average=None, sample_weight=sample_weight) + geo_mean = geometric_mean_score( + y_pred, + y_true, + labels=labels, + average=None, + sample_weight=sample_weight) # Indexed balanced accuracy - iba_gmean = make_indexed_balanced_accuracy(alpha=alpha, squared=True)( - geometric_mean_score) - iba = iba_gmean(y_pred, y_true, labels=labels, average=None, - sample_weight=sample_weight) + iba_gmean = make_indexed_balanced_accuracy( + alpha=alpha, squared=True)(geometric_mean_score) + iba = iba_gmean( + y_pred, + y_true, + labels=labels, + average=None, + sample_weight=sample_weight) for i, label in enumerate(labels): values = [target_names[i]] - for v in (precision[i], recall[i], specificity[i], - f1[i], geo_mean[i], iba[i]): + for v in (precision[i], recall[i], specificity[i], f1[i], geo_mean[i], + iba[i]): values 
+= ["{0:0.{1}f}".format(v, digits)] values += ["{0}".format(support[i])] report += fmt % tuple(values) @@ -667,12 +686,13 @@ def classification_report_imbalanced(y_true, # compute averages values = [last_line_heading] - for v in (np.average(precision, weights=support), - np.average(recall, weights=support), - np.average(specificity, weights=support), - np.average(f1, weights=support), - np.average(geo_mean, weights=support), - np.average(iba, weights=support)): + for v in (np.average( + precision, weights=support), np.average( + recall, weights=support), np.average( + specificity, weights=support), np.average( + f1, weights=support), np.average( + geo_mean, weights=support), np.average( + iba, weights=support)): values += ["{0:0.{1}f}".format(v, digits)] values += ['{0}'.format(np.sum(support))] report += fmt % tuple(values) diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index 8142937d4..a9a938c1b 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -2,8 +2,6 @@ from __future__ import division, print_function -import re - from functools import partial import numpy as np From ca145e9c6e2ea81dedebd0956cf7a09989545737 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 25 Dec 2016 18:22:31 +0100 Subject: [PATCH 17/21] Modify the test for scorer --- imblearn/metrics/tests/test_score_objects.py | 104 +++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 imblearn/metrics/tests/test_score_objects.py diff --git a/imblearn/metrics/tests/test_score_objects.py b/imblearn/metrics/tests/test_score_objects.py new file mode 100644 index 000000000..e22a64495 --- /dev/null +++ b/imblearn/metrics/tests/test_score_objects.py @@ -0,0 +1,104 @@ +from numpy.testing import assert_almost_equal + +from sklearn.datasets import make_blobs +from sklearn.model_selection import train_test_split, GridSearchCV +from sklearn.metrics import make_scorer +from sklearn.svm import LinearSVC + +from imblearn.metrics import (sensitivity_score, specificity_score, + geometric_mean_score, + make_indexed_balanced_accuracy) + + +def test_imblearn_classification_scorers(): + """Test if the implemented scorer can be used in scikit-learn""" + X, y = make_blobs(random_state=0, centers=2) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf = LinearSVC(random_state=0) + clf.fit(X_train, y_train) + + # sensitivity scorer + scorer = make_scorer(sensitivity_score, pos_label=None, average='macro') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + scorer = make_scorer(sensitivity_score, pos_label=None, average='weighted') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + scorer = make_scorer(sensitivity_score, pos_label=None, average='micro') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + scorer = make_scorer(sensitivity_score, pos_label=1) + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + # specificity scorer + scorer = make_scorer(specificity_score, pos_label=None, 
average='macro') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + scorer = make_scorer(specificity_score, pos_label=None, average='weighted') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + scorer = make_scorer(specificity_score, pos_label=None, average='micro') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + scorer = make_scorer(specificity_score, pos_label=1) + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.95, 2) + + # geometric_mean scorer + scorer = make_scorer(geometric_mean_score, pos_label=None, average='macro') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + scorer = make_scorer( + geometric_mean_score, pos_label=None, average='weighted') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + scorer = make_scorer(geometric_mean_score, pos_label=None, average='micro') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + scorer = make_scorer(geometric_mean_score, pos_label=1) + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.92, 2) + + # make a iba metric before a scorer + geo_mean_iba = make_indexed_balanced_accuracy()(geometric_mean_score) + scorer = make_scorer(geo_mean_iba, pos_label=None, average='macro') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.85, 2) + + scorer = make_scorer(geo_mean_iba, pos_label=None, average='weighted') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.85, 2) + + scorer = make_scorer(geo_mean_iba, pos_label=None, average='micro') + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.85, 2) + + scorer = make_scorer(geo_mean_iba, pos_label=1) + grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) + grid.fit(X_train, y_train).predict(X_test) + assert_almost_equal(grid.best_score_, 0.84, 2) From 115e817b7bb8618048590a29b1cd32ffb9d3a3d3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 25 Dec 2016 18:48:34 +0100 Subject: [PATCH 18/21] handle the model_selection in the testing --- imblearn/metrics/tests/test_score_objects.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/imblearn/metrics/tests/test_score_objects.py b/imblearn/metrics/tests/test_score_objects.py index e22a64495..1faf35191 100644 --- a/imblearn/metrics/tests/test_score_objects.py +++ 
b/imblearn/metrics/tests/test_score_objects.py @@ -1,7 +1,15 @@ from numpy.testing import assert_almost_equal +import sklearn +# Get the version +(major, minor, _) = sklearn.__version__.split('.') +if minor < 18: + from sklearn.cross_validation import train_test_split + from sklearn.grid_search import GridSearchCV +else: + from sklearn.model_selection import train_test_split, GridSearchCV + from sklearn.datasets import make_blobs -from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.metrics import make_scorer from sklearn.svm import LinearSVC From a740ddf6540403d6164bfbea2488a71426033e5f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 25 Dec 2016 19:08:25 +0100 Subject: [PATCH 19/21] Solve the import issue --- imblearn/metrics/tests/test_score_objects.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/imblearn/metrics/tests/test_score_objects.py b/imblearn/metrics/tests/test_score_objects.py index 1faf35191..7232264f3 100644 --- a/imblearn/metrics/tests/test_score_objects.py +++ b/imblearn/metrics/tests/test_score_objects.py @@ -1,13 +1,6 @@ from numpy.testing import assert_almost_equal import sklearn -# Get the version -(major, minor, _) = sklearn.__version__.split('.') -if minor < 18: - from sklearn.cross_validation import train_test_split - from sklearn.grid_search import GridSearchCV -else: - from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.datasets import make_blobs from sklearn.metrics import make_scorer @@ -16,6 +9,13 @@ from imblearn.metrics import (sensitivity_score, specificity_score, geometric_mean_score, make_indexed_balanced_accuracy) +# Get the version +(major, minor, _) = sklearn.__version__.split('.') +if int(minor) < 18: + from sklearn.cross_validation import train_test_split + from sklearn.grid_search import GridSearchCV +else: + from sklearn.model_selection import train_test_split, GridSearchCV def test_imblearn_classification_scorers(): From 0b36f6e6c1e383f4904c861c091c866211570755 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 28 Dec 2016 00:36:11 +0100 Subject: [PATCH 20/21] correct the name of IBA --- doc/api.rst | 2 +- imblearn/metrics/__init__.py | 4 ++-- imblearn/metrics/classification.py | 6 +++--- imblearn/metrics/tests/test_classification.py | 4 ++-- imblearn/metrics/tests/test_score_objects.py | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index faec5f92b..cdd56c6ce 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -138,7 +138,7 @@ Functions metrics.sensitivity_score metrics.specificity_score metrics.geometric_mean_score - metrics.make_indexed_balanced_accuracy + metrics.make_index_balanced_accuracy .. 
_datasets_ref: diff --git a/imblearn/metrics/__init__.py b/imblearn/metrics/__init__.py index 5cbe3b2f1..037a200d9 100644 --- a/imblearn/metrics/__init__.py +++ b/imblearn/metrics/__init__.py @@ -7,11 +7,11 @@ from .classification import sensitivity_score from .classification import specificity_score from .classification import geometric_mean_score -from .classification import make_indexed_balanced_accuracy +from .classification import make_index_balanced_accuracy from .classification import classification_report_imbalanced __all__ = [ 'sensitivity_specificity_support', 'sensitivity_score', 'specificity_score', 'geometric_mean_score', - 'make_indexed_balanced_accuracy', 'classification_report_imbalanced' + 'make_index_balanced_accuracy', 'classification_report_imbalanced' ] diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index f3b7a58f4..e145a26f7 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -504,7 +504,7 @@ def geometric_mean_score(y_true, return np.sqrt(sen * spe) -def make_indexed_balanced_accuracy(alpha=0.1, squared=True): +def make_index_balanced_accuracy(alpha=0.1, squared=True): """Balance any scoring function using the indexed balanced accuracy This factory function wraps scoring function to express it as the @@ -529,7 +529,7 @@ def make_indexed_balanced_accuracy(alpha=0.1, squared=True): Examples -------- >>> from imblearn.metrics import geometric_mean_score as gmean - >>> from imblearn.metrics import make_indexed_balanced_accuracy as iba + >>> from imblearn.metrics import make_index_balanced_accuracy as iba >>> gmean = iba(alpha=0.1, squared=True)(gmean) >>> y_true = [1, 0, 0, 1, 0, 1] >>> y_pred = [0, 0, 1, 1, 0, 1] @@ -665,7 +665,7 @@ def classification_report_imbalanced(y_true, average=None, sample_weight=sample_weight) # Indexed balanced accuracy - iba_gmean = make_indexed_balanced_accuracy( + iba_gmean = make_index_balanced_accuracy( alpha=alpha, squared=True)(geometric_mean_score) iba = iba_gmean( y_pred, diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index a9a938c1b..ab858fe81 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -23,7 +23,7 @@ from imblearn.metrics import sensitivity_score from imblearn.metrics import specificity_score from imblearn.metrics import geometric_mean_score -from imblearn.metrics import make_indexed_balanced_accuracy +from imblearn.metrics import make_index_balanced_accuracy from imblearn.metrics import classification_report_imbalanced RND_SEED = 42 @@ -242,7 +242,7 @@ def test_iba_geo_mean_binary(): """Test to test the iba using the geometric mean""" y_true, y_pred, _ = make_prediction(binary=True) - iba_gmean = make_indexed_balanced_accuracy( + iba_gmean = make_index_balanced_accuracy( alpha=0.5, squared=True)(geometric_mean_score) iba = iba_gmean(y_true, y_pred) diff --git a/imblearn/metrics/tests/test_score_objects.py b/imblearn/metrics/tests/test_score_objects.py index 7232264f3..65a28cc98 100644 --- a/imblearn/metrics/tests/test_score_objects.py +++ b/imblearn/metrics/tests/test_score_objects.py @@ -8,7 +8,7 @@ from imblearn.metrics import (sensitivity_score, specificity_score, geometric_mean_score, - make_indexed_balanced_accuracy) + make_index_balanced_accuracy) # Get the version (major, minor, _) = sklearn.__version__.split('.') if int(minor) < 18: @@ -90,7 +90,7 @@ def test_imblearn_classification_scorers(): 
assert_almost_equal(grid.best_score_, 0.92, 2) # make a iba metric before a scorer - geo_mean_iba = make_indexed_balanced_accuracy()(geometric_mean_score) + geo_mean_iba = make_index_balanced_accuracy()(geometric_mean_score) scorer = make_scorer(geo_mean_iba, pos_label=None, average='macro') grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=scorer) grid.fit(X_train, y_train).predict(X_test) From c54197c3ba25ad5866bab803241772f5a1b1903e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 28 Dec 2016 01:07:28 +0100 Subject: [PATCH 21/21] Add example for each function --- imblearn/metrics/classification.py | 82 ++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index e145a26f7..6937c892c 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -112,6 +112,19 @@ def sensitivity_specificity_support(y_true, shape (n_unique_labels, ) The number of occurrences of each label in ``y_true``. + Examples + -------- + >>> import numpy as np + >>> from imblearn.metrics import sensitivity_specificity_support + >>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig']) + >>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog']) + >>> sensitivity_specificity_support(y_true, y_pred, average='macro') + (0.33333333333333331, 0.66666666666666663, None) + >>> sensitivity_specificity_support(y_true, y_pred, average='micro') + (0.33333333333333331, 0.66666666666666663, None) + >>> sensitivity_specificity_support(y_true, y_pred, average='weighted') + (0.33333333333333331, 0.66666666666666663, None) + References ---------- .. [1] `Wikipedia entry for the Sensitivity and specificity @@ -303,6 +316,21 @@ def sensitivity_score(y_true, sample_weight : ndarray, shape (n_samples, ) Sample weights. + Examples + -------- + >>> import numpy as np + >>> from imblearn.metrics import sensitivity_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> sensitivity_score(y_true, y_pred, average='macro') + 0.33333333333333331 + >>> sensitivity_score(y_true, y_pred, average='micro') + 0.33333333333333331 + >>> sensitivity_score(y_true, y_pred, average='weighted') + 0.33333333333333331 + >>> sensitivity_score(y_true, y_pred, average=None) + array([ 1., 0., 0.]) + Returns ------- specificity : float (if ``average`` = None) or ndarray, \ @@ -387,6 +415,21 @@ def specificity_score(y_true, sample_weight : ndarray, shape (n_samples, ) Sample weights. 
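Before the `Examples` block that the next hunk adds for `specificity_score`, a small cross-check sketch (not part of the patch, and assuming the API as defined in this series): in the binary case the specificity of the positive class coincides with the sensitivity of the negative class, since both count correctly retrieved negatives.

from imblearn.metrics import sensitivity_score, specificity_score

y_true = [0, 1, 1, 0, 1, 0]
y_pred = [0, 1, 0, 0, 0, 1]

# Specificity of class 1 is tn / (tn + fp), i.e. the fraction of class-0
# samples predicted as 0 -- exactly the sensitivity of class 0.
print(specificity_score(y_true, y_pred, pos_label=1))
print(sensitivity_score(y_true, y_pred, pos_label=0))  # same value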
+ Examples + -------- + >>> import numpy as np + >>> from imblearn.metrics import specificity_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> specificity_score(y_true, y_pred, average='macro') + 0.66666666666666663 + >>> specificity_score(y_true, y_pred, average='micro') + 0.66666666666666663 + >>> specificity_score(y_true, y_pred, average='weighted') + 0.66666666666666663 + >>> specificity_score(y_true, y_pred, average=None) + array([ 0.75, 0.5 , 0.75]) + Returns ------- specificity : float (if ``average`` = None) or ndarray, \ @@ -480,6 +523,21 @@ def geometric_mean_score(y_true, geometric_mean : float (if ``average`` = None) or ndarray, \ shape (n_unique_labels, ) + Examples + -------- + >>> import numpy as np + >>> from imblearn.metrics import geometric_mean_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> geometric_mean_score(y_true, y_pred, average='macro') + 0.47140452079103168 + >>> geometric_mean_score(y_true, y_pred, average='micro') + 0.47140452079103168 + >>> geometric_mean_score(y_true, y_pred, average='weighted') + 0.47140452079103168 + >>> geometric_mean_score(y_true, y_pred, average=None) + array([ 0.8660254, 0. , 0. ]) + References ---------- .. [1] Kubat, M. and Matwin, S. "Addressing the curse of @@ -618,6 +676,30 @@ def classification_report_imbalanced(y_true, Text summary of the precision, recall, specificity, geometric mean, and indexed balanced accuracy. + Examples + -------- + >>> import numpy as np + >>> from imblearn.metrics import classification_report_imbalanced + >>> y_true = [0, 1, 2, 2, 2] + >>> y_pred = [0, 0, 2, 2, 1] # doctest : +NORMALIZE_WHITESPACE + >>> target_names = ['class 0', 'class 1', \ + 'class 2'] # doctest : +NORMALIZE_WHITESPACE + >>> print(classification_report_imbalanced(y_true, y_pred, \ + target_names=target_names)) + pre rec spe f1 geo iba\ + sup + + class 0 0.50 1.00 0.75 0.67 0.71 0.48\ + 1 + class 1 0.00 0.00 0.75 0.00 0.00 0.00\ + 1 + class 2 1.00 0.67 1.00 0.80 0.82 0.69\ + 3 + + avg / total 0.70 0.60 0.90 0.61 0.63 0.51\ + 5 + + """ if labels is None:
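To close the section, a hedged end-to-end sketch of how the pieces above are meant to compose, modelled on the scorer tests introduced in PATCH 17 and using the post-rename name `make_index_balanced_accuracy` from PATCH 20. It assumes scikit-learn >= 0.18 import paths and is illustrative rather than code taken from the patches.

from sklearn.datasets import make_blobs
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import LinearSVC

from imblearn.metrics import geometric_mean_score, make_index_balanced_accuracy

X, y = make_blobs(random_state=0, centers=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Decorate the geometric mean with the IBA factory, then wrap it as a
# scikit-learn scorer so it can drive model selection.
iba_gmean = make_index_balanced_accuracy(alpha=0.1, squared=True)(geometric_mean_score)
scorer = make_scorer(iba_gmean, average='macro')

grid = GridSearchCV(LinearSVC(random_state=0),
                    param_grid={'C': [1, 10]},
                    scoring=scorer)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)
print(iba_gmean(y_test, grid.predict(X_test), average='macro'))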