Skip to content

Commit 334bee0

Browse files
author
Guillaume Lemaitre
committed
Add missing folder
1 parent 5c58dd7 commit 334bee0

File tree

3 files changed

+112
-0
lines changed

3 files changed

+112
-0
lines changed

examples/applications/README.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
.. _realword_examples:
2+
3+
Examples based on real world datasets
4+
-------------------------------------
5+
6+
Examples which use real-world datasets.
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
"""
2+
==========================================================
3+
Benchmark over-sampling methods in a face recognition task
4+
==========================================================
5+
6+
In this face recognition example two faces are used from the LFW
7+
(Labeled Faces in the Wild) dataset. Several implemented over-sampling
8+
methods are used in conjunction with a 3NN classifier in order
9+
to examine the improvement of the classifier's output quality
10+
by using an over-sampler.
11+
12+
"""
13+
14+
import matplotlib.pyplot as plt
15+
import numpy as np
16+
from scipy import interp
17+
from sklearn import datasets, neighbors
18+
from sklearn.metrics import auc, roc_curve
19+
from sklearn.model_selection import StratifiedKFold
20+
21+
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
22+
from imblearn.pipeline import make_pipeline
23+
24+
print(__doc__)
25+
26+
LW = 2
27+
RANDOM_STATE = 42
28+
29+
30+
class DummySampler(object):
    """Pass-through sampler that hands the data back untouched.

    It offers the ``fit`` / ``sample`` / ``fit_sample`` methods so that it
    can occupy the sampler slot of a pipeline and act as the
    "no resampling" baseline.
    """

    def fit(self, X, y):
        """Do nothing with ``X`` and ``y`` and return the sampler itself."""
        return self

    def sample(self, X, y):
        """Return ``X`` and ``y`` exactly as they were given."""
        return X, y

    def fit_sample(self, X, y):
        """Return ``X`` and ``y`` unchanged by delegating to ``sample``."""
        return self.sample(X, y)
40+
41+
42+
# The same 3-fold stratified splitter is used for every pipeline below.
cv = StratifiedKFold(n_splits=3)

# Load the dataset and keep only the samples of the two selected persons.
data = datasets.fetch_lfw_people()
majority_person = 1871  # 530 photos of George W Bush
minority_person = 531  # 29 photos of Bill Clinton
majority_idxs = np.flatnonzero(data.target == majority_person)
minority_idxs = np.flatnonzero(data.target == minority_person)
idxs = np.hstack((majority_idxs, minority_idxs))

# Indexing with an integer array yields copies, so relabelling ``y`` below
# leaves ``data.target`` untouched.
X, y = data.data[idxs], data.target[idxs]

# Turn the person ids into binary labels: majority -> 0, minority -> 1.
y[y == majority_person] = 0
y[y == minority_person] = 1

# [display name, estimator] pair shared by all the pipelines.
classifier = ['3NN', neighbors.KNeighborsClassifier(n_neighbors=3)]

# [display name, sampler] pairs; 'Standard' is the no-resampling baseline.
samplers = [
    ['Standard', DummySampler()],
    ['ADASYN', ADASYN(random_state=RANDOM_STATE)],
    ['ROS', RandomOverSampler(random_state=RANDOM_STATE)],
    ['SMOTE', SMOTE(random_state=RANDOM_STATE)],
]

# One ['<sampler>-<classifier>', pipeline] pair per sampler.
pipelines = [
    ['{}-{}'.format(sampler_name, classifier[0]),
     make_pipeline(sampler_obj, classifier[1])]
    for sampler_name, sampler_obj in samplers
]
72+
73+
74+
# Plot the cross-validated mean ROC curve of every sampler/classifier
# pipeline on a common grid of false positive rates.
mean_fpr = np.linspace(0, 1, 100)
for name, pipeline in pipelines:
    mean_tpr = np.zeros_like(mean_fpr)
    for train, test in cv.split(X, y):
        probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test])
        fpr, tpr, _ = roc_curve(y[test], probas_[:, 1])
        # np.interp is used instead of scipy's ``interp``, a deprecated
        # alias of the same function that newer SciPy releases no longer
        # provide.
        mean_tpr += np.interp(mean_fpr, fpr, tpr)

    # Average over the folds and pin the curve to (0, 0) and (1, 1).
    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[0] = 0.0
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, linestyle='--',
             label='{} (area = {:0.2f})'.format(name, mean_auc), lw=LW)
89+
90+
# Draw the chance-level diagonal BEFORE building the legend: the legend only
# lists the labelled lines that exist at the time plt.legend() is called, so
# plotting 'Luck' afterwards would leave it out.
plt.plot([0, 1], [0, 1], linestyle='--', lw=LW, color='k',
         label='Luck')

# Leave a small margin around the unit square so the curves are not clipped.
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")

plt.show()

examples/evaluation/README.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
.. _evaluation_examples:
2+
3+
Evaluation examples
4+
-------------------
5+
6+
Examples illustrating how classification on imbalanced datasets can be done.

0 commit comments

Comments
 (0)