Skip to content

Commit 6495632

Browse files
committed
Enable pivot bootstrap intervals
1 parent ba1446c commit 6495632

File tree

2 files changed

+78
-60
lines changed

2 files changed

+78
-60
lines changed

econml/bootstrap.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import numpy as np
66
from joblib import Parallel, delayed
77
from sklearn.base import clone
8+
from scipy.stats import norm
89

910

1011
class BootstrapEstimator:
@@ -44,14 +45,22 @@ class BootstrapEstimator:
4445
In case a method ending in '_interval' exists on the wrapped object, whether
4546
that should be preferred (meaning this wrapper will compute the mean of it).
4647
This option only affects behavior if `compute_means` is set to ``True``.
48+
49+
bootstrap_type: 'percentile' or 'standard', default 'percentile'
50+
Bootstrap method used to compute results. 'percentile' will result in using the empirical CDF of
51+
the replicated computations of the statistic. 'standard' will instead compute a pivot interval
52+
assuming the replicates are normally distributed.
4753
"""
4854

49-
def __init__(self, wrapped, n_bootstrap_samples=1000, n_jobs=None, compute_means=True, prefer_wrapped=False):
55+
def __init__(self, wrapped, n_bootstrap_samples=1000, n_jobs=None, compute_means=True, prefer_wrapped=False,
56+
bootstrap_type='percentile'):
5057
self._instances = [clone(wrapped, safe=False) for _ in range(n_bootstrap_samples)]
5158
self._n_bootstrap_samples = n_bootstrap_samples
5259
self._n_jobs = n_jobs
5360
self._compute_means = compute_means
5461
self._prefer_wrapped = prefer_wrapped
62+
self._boostrap_type = bootstrap_type
63+
self._wrapped = wrapped
5564

5665
# TODO: Add a __dir__ implementation?
5766

@@ -87,7 +96,7 @@ def __getattr__(self, name):
8796
def proxy(make_call, name, summary):
8897
def summarize_with(f):
8998
return summary(np.array(Parallel(n_jobs=self._n_jobs, prefer='threads', verbose=3)(
90-
(f, (obj, name), {}) for obj in self._instances)))
99+
(f, (obj, name), {}) for obj in self._instances)), f(self._wrapped, name))
91100
if make_call:
92101
def call(*args, **kwargs):
93102
return summarize_with(lambda obj, name: getattr(obj, name)(*args, **kwargs))
@@ -97,16 +106,24 @@ def call(*args, **kwargs):
97106

98107
def get_mean():
99108
# for attributes that exist on the wrapped object, just compute the mean of the wrapped calls
100-
return proxy(callable(getattr(self._instances[0], name)), name, lambda arr: np.mean(arr, axis=0))
109+
return proxy(callable(getattr(self._instances[0], name)), name, lambda arr, _: np.mean(arr, axis=0))
101110

102-
def get_interval():
111+
def get_interval(kind=None):
103112
# if the attribute exists on the wrapped object once we remove the suffix,
104113
# then we should be computing a confidence interval for the wrapped calls
105114
prefix = name[: - len("_interval")]
106115

107116
def call_with_bounds(can_call, lower, upper):
108-
return proxy(can_call, prefix,
109-
lambda arr: (np.percentile(arr, lower, axis=0), np.percentile(arr, upper, axis=0)))
117+
def percentile_bootstrap(arr, _):
118+
return np.percentile(arr, lower, axis=0), np.percentile(arr, upper, axis=0)
119+
120+
def pivot_bootstrap(arr, est):
121+
std = np.std(arr, axis=0)
122+
return est - norm.ppf(upper / 100) * std, est - norm.ppf(lower / 100) * std
123+
# TODO: studentized bootstrap? would be more accurate in most cases but can we avoid
124+
# second level bootstrap which would be prohibitive computationally
125+
fn = {'percentile': percentile_bootstrap, 'standard': pivot_bootstrap}[self._boostrap_type]
126+
return proxy(can_call, prefix, fn)
110127

111128
can_call = callable(getattr(self._instances[0], prefix))
112129
if can_call:

econml/tests/test_bootstrap.py

Lines changed: 55 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -17,60 +17,61 @@ class TestBootstrap(unittest.TestCase):
1717
def test_with_sklearn(self):
1818
"""Test that we can bootstrap sklearn estimators."""
1919
for n_jobs in [None, -1]: # test parallelism
20-
x = np.random.normal(size=(1000, 1))
21-
y = x * 0.5 + np.random.normal(size=(1000, 1))
22-
y = y.flatten()
23-
24-
est = LinearRegression()
25-
est.fit(x, y)
26-
27-
bs = BootstrapEstimator(est, 50, n_jobs=n_jobs)
28-
# test that we can fit with the same arguments as the base estimator
29-
bs.fit(x, y)
30-
31-
# test that we can get the same attribute for the bootstrap as the original, with the same shape
32-
self.assertEqual(np.shape(est.coef_), np.shape(bs.coef_))
33-
34-
# test that we can get an interval for the same attribute for the bootstrap as the original,
35-
# with the same shape for the lower and upper bounds
36-
lower, upper = bs.coef__interval()
37-
for bound in [lower, upper]:
38-
self.assertEqual(np.shape(est.coef_), np.shape(bound))
39-
40-
# test that the lower and upper bounds differ
41-
assert (lower <= upper).all()
42-
assert (lower < upper).any()
43-
44-
# test that we can do the same thing once we provide percentile bounds
45-
lower, upper = bs.coef__interval(lower=10, upper=90)
46-
for bound in [lower, upper]:
47-
self.assertEqual(np.shape(est.coef_), np.shape(bound))
48-
49-
# test that the lower and upper bounds differ
50-
assert (lower <= upper).all()
51-
assert (lower < upper).any()
52-
53-
# test that we can do the same thing with the results of a method, rather than an attribute
54-
self.assertEqual(np.shape(est.predict(x)), np.shape(bs.predict(x)))
55-
56-
# test that we can get an interval for the same attribute for the bootstrap as the original,
57-
# with the same shape for the lower and upper bounds
58-
lower, upper = bs.predict_interval(x)
59-
for bound in [lower, upper]:
60-
self.assertEqual(np.shape(est.predict(x)), np.shape(bound))
61-
62-
# test that the lower and upper bounds differ
63-
assert (lower <= upper).all()
64-
assert (lower < upper).any()
65-
66-
# test that we can do the same thing once we provide percentile bounds
67-
lower, upper = bs.predict_interval(x, lower=10, upper=90)
68-
for bound in [lower, upper]:
69-
self.assertEqual(np.shape(est.predict(x)), np.shape(bound))
70-
71-
# test that the lower and upper bounds differ
72-
assert (lower <= upper).all()
73-
assert (lower < upper).any()
20+
for kind in ['percentile', 'standard']: # test both percentile and pivot intervals
21+
x = np.random.normal(size=(1000, 1))
22+
y = x * 0.5 + np.random.normal(size=(1000, 1))
23+
y = y.flatten()
24+
25+
est = LinearRegression()
26+
est.fit(x, y)
27+
28+
bs = BootstrapEstimator(est, 50, n_jobs=n_jobs, bootstrap_type=kind)
29+
# test that we can fit with the same arguments as the base estimator
30+
bs.fit(x, y)
31+
32+
# test that we can get the same attribute for the bootstrap as the original, with the same shape
33+
self.assertEqual(np.shape(est.coef_), np.shape(bs.coef_))
34+
35+
# test that we can get an interval for the same attribute for the bootstrap as the original,
36+
# with the same shape for the lower and upper bounds
37+
lower, upper = bs.coef__interval()
38+
for bound in [lower, upper]:
39+
self.assertEqual(np.shape(est.coef_), np.shape(bound))
40+
41+
# test that the lower and upper bounds differ
42+
assert (lower <= upper).all()
43+
assert (lower < upper).any()
44+
45+
# test that we can do the same thing once we provide percentile bounds
46+
lower, upper = bs.coef__interval(lower=10, upper=90)
47+
for bound in [lower, upper]:
48+
self.assertEqual(np.shape(est.coef_), np.shape(bound))
49+
50+
# test that the lower and upper bounds differ
51+
assert (lower <= upper).all()
52+
assert (lower < upper).any()
53+
54+
# test that we can do the same thing with the results of a method, rather than an attribute
55+
self.assertEqual(np.shape(est.predict(x)), np.shape(bs.predict(x)))
56+
57+
# test that we can get an interval for the same attribute for the bootstrap as the original,
58+
# with the same shape for the lower and upper bounds
59+
lower, upper = bs.predict_interval(x)
60+
for bound in [lower, upper]:
61+
self.assertEqual(np.shape(est.predict(x)), np.shape(bound))
62+
63+
# test that the lower and upper bounds differ
64+
assert (lower <= upper).all()
65+
assert (lower < upper).any()
66+
67+
# test that we can do the same thing once we provide percentile bounds
68+
lower, upper = bs.predict_interval(x, lower=10, upper=90)
69+
for bound in [lower, upper]:
70+
self.assertEqual(np.shape(est.predict(x)), np.shape(bound))
71+
72+
# test that the lower and upper bounds differ
73+
assert (lower <= upper).all()
74+
assert (lower < upper).any()
7475

7576
def test_with_econml(self):
7677
"""Test that we can bootstrap econml estimators."""

0 commit comments

Comments
 (0)