diff --git a/doc/reference.rst b/doc/reference.rst index 92bc265d8..c067d3202 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -4,6 +4,7 @@ Public Module Reference .. autosummary:: :toctree: _autosummary + econml.automated_ml econml.bootstrap econml.cate_estimator econml.cate_interpreter @@ -15,9 +16,9 @@ Public Module Reference econml.inference econml.ortho_forest econml.metalearners + econml.ortho_iv econml.two_stage_least_squares econml.utilities - econml.automated_ml Private Module Reference ======================== diff --git a/econml/_ortho_learner.py b/econml/_ortho_learner.py index b14e9c20e..418a528c6 100644 --- a/econml/_ortho_learner.py +++ b/econml/_ortho_learner.py @@ -55,12 +55,13 @@ def _crossfit(model, folds, *args, **kwargs): function estimates a model of the nuisance function, based on the input data to fit. Predict evaluates the fitted nuisance function on the input data to predict. - folds : list of tuples + folds : list of tuples or None The crossfitting fold structure. Every entry in the list is a tuple whose first element are the training indices of the args and kwargs data and the second entry are the test indices. If the union of the test indices is not the full set of all indices, then the remaining nuisance parameters - for the missing indices have value NaN. + for the missing indices have value NaN. If folds is None, then cross fitting + is not performed; all indices are used for both model fitting and prediction args : a sequence of (numpy matrices or None) Each matrix is a data variable whose first index corresponds to a sample kwargs : a sequence of key-value args, with values being (numpy matrices or None) @@ -120,6 +121,19 @@ def predict(self, X, y, W=None): """ model_list = [] fitted_inds = [] + + if folds is None: # skip crossfitting + model_list.append(clone(model, safe=False)) + kwargs = {k: v for k, v in kwargs.items() if v is not None} + model_list[0].fit(*args, **kwargs) + nuisances = model_list[0].predict(*args, **kwargs) + + if not isinstance(nuisances, tuple): + nuisances = (nuisances,) + + first_arr = args[0] if args else kwargs.items()[0][1] + return nuisances, model_list, range(first_arr.shape[0]) + for idx, (train_idxs, test_idxs) in enumerate(folds): model_list.append(clone(model, safe=False)) if len(np.intersect1d(train_idxs, test_idxs)) > 0: @@ -129,18 +143,11 @@ def predict(self, X, y, W=None): raise AttributeError("Invalid crossfitting fold structure. 
The same index appears in two test folds.") fitted_inds = np.concatenate((fitted_inds, test_idxs)) - args_train = () - args_test = () - for var in args: - args_train += (var[train_idxs],) if var is not None else (None,) - args_test += (var[test_idxs],) if var is not None else (None,) + args_train = tuple(var[train_idxs] if var is not None else None for var in args) + args_test = tuple(var[test_idxs] if var is not None else None for var in args) - kwargs_train = {} - kwargs_test = {} - for key, var in kwargs.items(): - if var is not None: - kwargs_train[key] = var[train_idxs] - kwargs_test[key] = var[test_idxs] + kwargs_train = {key: var[train_idxs] for key, var in kwargs.items() if var is not None} + kwargs_test = {key: var[test_idxs] for key, var in kwargs.items() if var is not None} model_list[idx].fit(*args_train, **kwargs_train) @@ -246,6 +253,9 @@ class _OrthoLearner(TreatmentExpansionMixin, LinearCateEstimator): discrete_treatment: bool Whether the treatment values should be treated as categorical, rather than continuous, quantities + discrete_instrument: bool + Whether the instrument values should be treated as categorical, rather than continuous, quantities + n_splits: int, cross-validation generator or an iterable Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -310,7 +320,8 @@ def score(self, Y, T, W=None, nuisances=None): y = X[:, 0] + X[:, 1] + np.random.normal(0, 0.1, size=(100,)) est = _OrthoLearner(ModelNuisance(LinearRegression(), LinearRegression()), ModelFinal(), - n_splits=2, discrete_treatment=False, random_state=None) + n_splits=2, discrete_treatment=False, discrete_instrument=False, + random_state=None) est.fit(y, X[:, 0], W=X[:, 1:]) >>> est.score_ @@ -372,7 +383,8 @@ def score(self, Y, T, W=None, nuisances=None): T = np.random.binomial(1, scipy.special.expit(W[:, 0])) y = T + W[:, 0] + np.random.normal(0, 0.01, size=(100,)) est = _OrthoLearner(ModelNuisance(LogisticRegression(solver='lbfgs'), LinearRegression()), - ModelFinal(), n_splits=2, discrete_treatment=True, random_state=None) + ModelFinal(), n_splits=2, discrete_treatment=True, discrete_instrument=False, + random_state=None) est.fit(y, T, W=W) >>> est.score_ @@ -399,13 +411,14 @@ def score(self, Y, T, W=None, nuisances=None): of the final CATE model. 
""" - def __init__(self, model_nuisance, model_final, - discrete_treatment, n_splits, random_state): + def __init__(self, model_nuisance, model_final, *, + discrete_treatment, discrete_instrument, n_splits, random_state): self._model_nuisance = clone(model_nuisance, safe=False) self._models_nuisance = None self._model_final = clone(model_final, safe=False) self._n_splits = n_splits self._discrete_treatment = discrete_treatment + self._discrete_instrument = discrete_instrument self._random_state = check_random_state(random_state) if discrete_treatment: self._label_encoder = LabelEncoder() @@ -490,23 +503,51 @@ def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, sample_var=None, def _fit_nuisances(self, Y, T, X=None, W=None, Z=None, sample_weight=None): # use a binary array to get stratified split in case of discrete treatment - splitter = check_cv(self._n_splits, [0], classifier=self._discrete_treatment) - # if check_cv produced a new KFold or StratifiedKFold object, we need to set shuffle and random_state - if splitter != self._n_splits and isinstance(splitter, (KFold, StratifiedKFold)): - splitter.shuffle = True - splitter.random_state = self._random_state - - all_vars = [var if np.ndim(var) == 2 else var.reshape(-1, 1) for var in [Z, W, X] if var is not None] - if all_vars: - all_vars = np.hstack(all_vars) - folds = splitter.split(all_vars, T) - else: - folds = splitter.split(np.ones((T.shape[0], 1)), T) + stratify = self._discrete_treatment or self._discrete_instrument if self._discrete_treatment: T = self._label_encoder.fit_transform(T.ravel()) + + if self._discrete_instrument: + z_enc = LabelEncoder() + Z = z_enc.fit_transform(Z.ravel()) + + if self._discrete_treatment: # need to stratify on combination of Z and T + to_split = T + Z * len(self._label_encoder.classes_) + else: + to_split = Z # just stratify on Z + + z_ohe = OneHotEncoder(categories='auto', sparse=False) + Z = z_ohe.fit_transform(reshape(Z, (-1, 1)))[:, 1:] + self.z_transformer = FunctionTransformer( + func=(lambda Z: + z_ohe.transform( + reshape(z_enc.transform(Z.ravel()), (-1, 1)))[:, 1:]), + validate=False) + else: + to_split = T # stratify on T if discrete, and fine to pass T as second arg to KFold.split even when not + self.z_transformer = None + + if self._n_splits == 1: # special case, no cross validation + folds = None + else: + splitter = check_cv(self._n_splits, [0], classifier=stratify) + # if check_cv produced a new KFold or StratifiedKFold object, we need to set shuffle and random_state + if splitter != self._n_splits and isinstance(splitter, (KFold, StratifiedKFold)): + splitter.shuffle = True + splitter.random_state = self._random_state + + all_vars = [var if np.ndim(var) == 2 else var.reshape(-1, 1) for var in [Z, W, X] if var is not None] + if all_vars: + all_vars = np.hstack(all_vars) + folds = splitter.split(all_vars, to_split) + else: + folds = splitter.split(np.ones((T.shape[0], 1)), to_split) + + if self._discrete_treatment: # drop first column since all columns sum to one T = self._one_hot_encoder.fit_transform(reshape(T, (-1, 1)))[:, 1:] + self._d_t = shape(T)[1:] self.transformer = FunctionTransformer( func=(lambda T: @@ -592,6 +633,8 @@ def score(self, Y, T, X=None, W=None, Z=None): self._check_fitted_dims(X) self._check_fitted_dims_w_z(W, Z) X, T = self._expand_treatments(X, T) + if self.z_transformer is not None: + Z = self.z_transformer.transform(Z) n_splits = len(self._models_nuisance) for idx, mdl in enumerate(self._models_nuisance): nuisance_temp = mdl.predict(Y, T, 
**self._filter_none_kwargs(X=X, W=W, Z=Z)) diff --git a/econml/_rlearner.py b/econml/_rlearner.py index b500d8cf2..d7747cf7e 100644 --- a/econml/_rlearner.py +++ b/econml/_rlearner.py @@ -28,19 +28,9 @@ import numpy as np import copy from warnings import warn -from .utilities import (shape, reshape, ndim, hstack, cross_product, transpose, - broadcast_unit_treatments, reshape_treatmentwise_effects, - StatsModelsLinearRegression, LassoCVWrapper) -from sklearn.model_selection import KFold, StratifiedKFold, check_cv -from sklearn.linear_model import LinearRegression, LassoCV -from sklearn.preprocessing import (PolynomialFeatures, LabelEncoder, OneHotEncoder, - FunctionTransformer) -from sklearn.base import clone, TransformerMixin -from sklearn.pipeline import Pipeline -from sklearn.utils import check_random_state -from .cate_estimator import (BaseCateEstimator, LinearCateEstimator, - TreatmentExpansionMixin, StatsModelsCateEstimatorMixin) -from .inference import StatsModelsInference +from .utilities import (shape, reshape, ndim, hstack) +from sklearn.linear_model import LinearRegression +from sklearn.base import clone from ._ortho_learner import _OrthoLearner @@ -270,7 +260,11 @@ def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None return np.mean((Y_res - Y_res_pred)**2) super().__init__(ModelNuisance(model_y, model_t), - ModelFinal(model_final), discrete_treatment, n_splits, random_state) + ModelFinal(model_final), + discrete_treatment=discrete_treatment, + discrete_instrument=False, # no instrument, so doesn't matter + n_splits=n_splits, + random_state=random_state) def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, inference=None): """ diff --git a/econml/drlearner.py b/econml/drlearner.py index 12f6107ff..538e11ec6 100644 --- a/econml/drlearner.py +++ b/econml/drlearner.py @@ -348,6 +348,7 @@ def score(self, Y, T, X=None, W=None, *, nuisances, sample_weight=None, sample_v super().__init__(ModelNuisance(model_propensity, model_regression), ModelFinal(model_final, featurizer, multitask_model_final), n_splits=n_splits, discrete_treatment=True, + discrete_instrument=False, # no instrument, so doesn't matter random_state=random_state) def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, inference=None): diff --git a/econml/inference.py b/econml/inference.py index c8d7aa762..8da590961 100644 --- a/econml/inference.py +++ b/econml/inference.py @@ -326,6 +326,8 @@ def __init__(self, cov_type='HC1'): def prefit(self, estimator, *args, **kwargs): super().prefit(estimator, *args, **kwargs) + assert not (self.model_final.fit_intercept), ("Inference can only be performed on models linear in " + "their features, but here fit_intercept is True") self.model_final.cov_type = self.cov_type diff --git a/econml/ortho_iv.py b/econml/ortho_iv.py new file mode 100644 index 000000000..13e42b578 --- /dev/null +++ b/econml/ortho_iv.py @@ -0,0 +1,1146 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +"""Orthogonal IV for Heterogeneous Treatment Effects. + +A Double/Orthogonal machine learning approach to estimation of heterogeneous +treatment effect with an endogenous treatment and an instrument. 
It +implements the DMLIV and related algorithms from the paper: + +Machine Learning Estimation of Heterogeneous Treatment Effects with Instruments +Vasilis Syrgkanis, Victor Lei, Miruna Oprescu, Maggie Hei, Keith Battocchi, Greg Lewis +https://arxiv.org/abs/1905.10176 + +""" + +import numpy as np +from sklearn.base import clone +from sklearn.linear_model import LinearRegression +from sklearn.preprocessing import FunctionTransformer +from sklearn.pipeline import Pipeline + +from ._ortho_learner import _OrthoLearner +from .dml import _FinalWrapper +from .utilities import (hstack, StatsModelsLinearRegression, inverse_onehot) +from .inference import StatsModelsInference +from .cate_estimator import StatsModelsCateEstimatorMixin + + +# A cut-down version of the DML first stage wrapper, since we don't need to support W or linear first stages +class _FirstStageWrapper: + def __init__(self, model, discrete_target): + self._model = clone(model, safe=False) + self._discrete_target = discrete_target + + def _combine(self, X, Z, n_samples, fitting=True): + if Z is not None: + Z = Z.reshape(n_samples, -1) + if X is None: + # if both X and Z are None, just return a column of ones + return (Z if Z is not None else np.ones((n_samples, 1))) + XZ = hstack([X, Z]) if Z is not None else X + return XZ + + def fit(self, X, *args, sample_weight=None): + if len(args) == 1: + Target, = args + Z = None + else: + (Z, Target) = args + if self._discrete_target: + # In this case, the Target is the one-hot-encoding of the treatment variable + # We need to go back to the label representation of the one-hot so as to call + # the classifier. + if np.any(np.all(Target == 0, axis=0)) or (not np.any(np.all(Target == 0, axis=1))): + raise AttributeError("Provided crossfit folds contain training splits that " + + "don't contain all treatments") + Target = inverse_onehot(Target) + + if sample_weight is not None: + self._model.fit(self._combine(X, Z, Target.shape[0]), Target, sample_weight=sample_weight) + else: + self._model.fit(self._combine(X, Z, Target.shape[0]), Target) + + def predict(self, X, Z=None): + n_samples = X.shape[0] if X is not None else (Z.shape[0] if Z is not None else 1) + if self._discrete_target: + return self._model.predict_proba(self._combine(X, Z, n_samples, fitting=False))[:, 1:] + else: + return self._model.predict(self._combine(X, Z, n_samples, fitting=False)) + + +class _BaseDMLATEIV(_OrthoLearner): + def __init__(self, model_nuisance, + discrete_instrument=False, discrete_treatment=False, + n_splits=2, random_state=None): + class ModelFinal: + def __init__(self): + self._first_stage = LinearRegression(fit_intercept=False) + self._model_final = _FinalWrapper(LinearRegression(fit_intercept=False), + fit_cate_intercept=True, featurizer=None, use_weight_trick=False) + + def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + Y_res, T_res, Z_res = nuisances + if Z_res.ndim == 1: + Z_res = Z_res.reshape(-1, 1) + # DMLATEIV is just like 2SLS; first regress T_res on Z_res, then regress Y_res on predicted T_res + T_res_pred = self._first_stage.fit(Z_res, T_res, + sample_weight=sample_weight).predict(Z_res) + # TODO: allow the final model to actually use X? Then we'd need to rename the class + # since we would actually be calculating a CATE rather than ATE. + self._model_final.fit(X=None, T_res=T_res_pred, Y_res=Y_res, sample_weight=sample_weight) + return self + + def predict(self, X=None): + # TODO: allow the final model to actually use X? 
+ return self._model_final.predict(X=None) + + def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + Y_res, T_res, Z_res = nuisances + if Y_res.ndim == 1: + Y_res = Y_res.reshape((-1, 1)) + if T_res.ndim == 1: + T_res = T_res.reshape((-1, 1)) + # TODO: allow the final model to actually use X? + effects = self._model_final.predict(X=None).reshape((-1, Y_res.shape[1], T_res.shape[1])) + Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) + if sample_weight is not None: + return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) + else: + return np.mean((Y_res - Y_res_pred) ** 2) + + super().__init__(model_nuisance, ModelFinal(), + discrete_treatment=discrete_treatment, discrete_instrument=discrete_instrument, + n_splits=n_splits, random_state=random_state) + + def fit(self, Y, T, Z, W=None, *, sample_weight=None, sample_var=None, inference=None): + """ + Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. + + Parameters + ---------- + Y: (n, d_y) matrix or vector of length n + Outcomes for each sample + T: (n, d_t) matrix or vector of length n + Treatments for each sample + Z: (n, d_z) matrix + Instruments for each sample + X: optional(n, d_x) matrix or None (Default=None) + Features for each sample + sample_weight: optional(n,) vector or None (Default=None) + Weights for each samples + sample_var: optional(n,) vector or None (Default=None) + Sample variance for each sample + inference: string,:class:`.Inference` instance, or None + Method for performing inference. This estimator supports 'bootstrap' + (or an instance of:class:`.BootstrapInference`). + + Returns + ------- + self: _BaseDMLATEIV instance + """ + # Replacing fit from _OrthoLearner, to enforce W=None and improve the docstring + return super().fit(Y, T, W=W, Z=Z, + sample_weight=sample_weight, sample_var=sample_var, inference=inference) + + def score(self, Y, T, Z, W=None): + """ + Score the fitted CATE model on a new data set. Generates nuisance parameters + for the new data set based on the fitted residual nuisance models created at fit time. + It uses the mean prediction of the models fitted by the different crossfit folds. + Then calculates the MSE of the final residual Y on residual T regression. + + If model_final does not have a score method, then it raises an :exc:`.AttributeError` + + Parameters + ---------- + Y: (n, d_y) matrix or vector of length n + Outcomes for each sample + T: (n, d_t) matrix or vector of length n + Treatments for each sample + Z: optional(n, d_z) matrix + Instruments for each sample + X: optional(n, d_x) matrix or None (Default=None) + Features for each sample + + + Returns + ------- + score: float + The MSE of the final CATE model on the new data. + """ + # Replacing score from _OrthoLearner, to enforce X=None and improve the docstring + return super().score(Y, T, W=W, Z=Z) + + +class DMLATEIV(_BaseDMLATEIV): + """ + Implementation of the orthogonal/double ml method for ATE estimation with + IV as described in + + Double/Debiased Machine Learning for Treatment and Causal Parameters + Victor Chernozhukov, Denis Chetverikov, Mert Demirer, Esther Duflo, Christian Hansen, Whitney Newey, James Robins + https://arxiv.org/abs/1608.00060 + + Requires that either co-variance of T, Z is independent of X or that effect + is not heterogeneous in X for correct recovery. Otherwise it estimates + a biased ATE. 
+ """ + + def __init__(self, model_Y_X, model_T_X, model_Z_X, + discrete_treatment=False, discrete_instrument=False, + n_splits=2, random_state=None): + class ModelNuisance: + def __init__(self, model_Y_X, model_T_X, model_Z_X): + self._model_Y_X = clone(model_Y_X, safe=False) + self._model_T_X = clone(model_T_X, safe=False) + self._model_Z_X = clone(model_Z_X, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert X is None, "DML ATE IV does not accept features" + self._model_Y_X.fit(W, Y, sample_weight=sample_weight) + self._model_T_X.fit(W, T, sample_weight=sample_weight) + self._model_Z_X.fit(W, Z, sample_weight=sample_weight) + return self + + def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + Y_pred = self._model_Y_X.predict(W) + T_pred = self._model_T_X.predict(W) + Z_pred = self._model_Z_X.predict(W) + if W is None: # In this case predict above returns a single row + Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) + T_pred = np.tile(T_pred.reshape(1, -1), (T.shape[0], 1)) + Z_pred = np.tile(Z_pred.reshape(1, -1), (Z.shape[0], 1)) + Y_res = Y - Y_pred.reshape(Y.shape) + T_res = T - T_pred.reshape(T.shape) + Z_res = Z - Z_pred.reshape(Z.shape) + return Y_res, T_res, Z_res + super().__init__(ModelNuisance(model_Y_X=_FirstStageWrapper(model_Y_X, discrete_target=False), + model_T_X=_FirstStageWrapper(model_T_X, discrete_target=discrete_treatment), + model_Z_X=_FirstStageWrapper(model_Z_X, discrete_target=discrete_instrument)), + discrete_instrument=discrete_instrument, discrete_treatment=discrete_treatment, + n_splits=n_splits, random_state=random_state) + + +class ProjectedDMLATEIV(_BaseDMLATEIV): + def __init__(self, model_Y_X, model_T_X, model_T_XZ, + discrete_treatment=False, discrete_instrument=False, + n_splits=2, random_state=None): + class ModelNuisance: + def __init__(self, model_Y_X, model_T_X, model_T_XZ): + self._model_Y_X = clone(model_Y_X, safe=False) + self._model_T_X = clone(model_T_X, safe=False) + self._model_T_XZ = clone(model_T_XZ, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert X is None, "DML ATE IV does not accept features" + self._model_Y_X.fit(W, Y, sample_weight=sample_weight) + self._model_T_X.fit(W, T, sample_weight=sample_weight) + self._model_T_XZ.fit(W, Z, T, sample_weight=sample_weight) + return self + + def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + Y_pred = self._model_Y_X.predict(W) + TX_pred = self._model_T_X.predict(W) + TXZ_pred = self._model_T_XZ.predict(W, Z) + if W is None: # In this case predict above returns a single row + Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) + TX_pred = np.tile(TX_pred.reshape(1, -1), (T.shape[0], 1)) + Y_res = Y - Y_pred.reshape(Y.shape) + T_res = T - TX_pred.reshape(T.shape) + Z_res = TXZ_pred.reshape(T.shape) - TX_pred.reshape(T.shape) + return Y_res, T_res, Z_res + + super().__init__(ModelNuisance(model_Y_X=_FirstStageWrapper(model_Y_X, discrete_target=False), + model_T_X=_FirstStageWrapper(model_T_X, discrete_target=discrete_treatment), + model_T_XZ=_FirstStageWrapper(model_T_XZ, discrete_target=discrete_treatment)), + discrete_treatment=discrete_treatment, discrete_instrument=discrete_instrument, + n_splits=n_splits, random_state=random_state) + + +class _BaseDMLIV(_OrthoLearner): + """ + The class _BaseDMLIV implements the base class of the DMLIV + algorithm for estimating a CATE. 
It accepts three generic machine + learning models: + 1) model_Y_X that estimates :math:`\\E[Y | X]` + 2) model_T_X that estimates :math:`\\E[T | X]` + 3) model_T_XZ that estimates :math:`\\E[T | X, Z]` + These are estimated in a cross-fitting manner for each sample in the training set. + Then it minimizes the square loss: + + .. math:: + \\sum_i (Y_i - \\E[Y|X_i] - \theta(X) * (\\E[T|X_i, Z_i] - \\E[T|X_i]))^2 + + This loss is minimized by the model_final class, which is passed as an input. + In the two children classes {DMLIV, GenericDMLIV}, we implement different strategies of how to invoke + machine learning algorithms to minimize this final square loss. + + + Parameters + ---------- + model_Y_X : estimator + model to predict :math:`\\E[Y | X]` + + model_T_X : estimator + model to predict :math:`\\E[T | X]` + + model_T_XZ : estimator + model to predict :math:`\\E[T | X, Z]` + + model_final : estimator + final model that at fit time takes as input :math:`(Y-\\E[Y|X])`, :math:`(\\E[T|X,Z]-\\E[T|X])` and X + and supports method predict(X) that produces the CATE at X + + discrete_instrument: bool, optional, default False + Whether the instrument values should be treated as categorical, rather than continuous, quantities + + discrete_treatment: bool, optional, default False + Whether the treatment values should be treated as categorical, rather than continuous, quantities + + n_splits: int, cross-validation generator or an iterable, optional, default 2 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter` + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the treatment is discrete + :class:`~sklearn.model_selection.StratifiedKFold` is used, else, + :class:`~sklearn.model_selection.KFold` is used + (with a random shuffle in either case). + + Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all + W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; + If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used + by :mod:`np.random`. + """ + + def __init__(self, model_Y_X, model_T_X, model_T_XZ, model_final, + discrete_instrument=False, discrete_treatment=False, n_splits=2, random_state=None): + class ModelNuisance: + """ + Nuisance model fits the three models at fit time and at predict time + returns :math:`Y-\\E[Y|X]` and :math:`\\E[T|X,Z]-\\E[T|X]` as residuals. + """ + + def __init__(self, model_Y_X, model_T_X, model_T_XZ): + self._model_Y_X = clone(model_Y_X, safe=False) + self._model_T_X = clone(model_T_X, safe=False) + self._model_T_XZ = clone(model_T_XZ, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + # TODO: would it be useful to extend to handle controls ala vanilla DML? 
+ assert W is None, "DML IV does not accept controls" + self._model_Y_X.fit(X, Y, sample_weight=sample_weight) + self._model_T_X.fit(X, T, sample_weight=sample_weight) + self._model_T_XZ.fit(X, Z, T, sample_weight=sample_weight) + return self + + def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + Y_pred = self._model_Y_X.predict(X) + TXZ_pred = self._model_T_XZ.predict(X, Z) + TX_pred = self._model_T_X.predict(X) + if X is None: # In this case predict above returns a single row + Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) + TX_pred = np.tile(TX_pred.reshape(1, -1), (T.shape[0], 1)) + Y_res = Y - Y_pred.reshape(Y.shape) + T_res = TXZ_pred.reshape(T.shape) - TX_pred.reshape(T.shape) + return Y_res, T_res + + class ModelFinal: + """ + Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient + that depends on X, i.e. + + .. math :: + Y - \\E[Y | X] = \\theta(X) \\cdot (\\E[T | X, Z] - \\E[T | X]) + \\epsilon + + and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final + residual on residual regression. + """ + + def __init__(self, model_final): + self._model_final = clone(model_final, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + Y_res, T_res = nuisances + self._model_final.fit(X, T_res, Y_res, sample_weight=sample_weight, sample_var=sample_var) + return self + + def predict(self, X=None): + return self._model_final.predict(X) + + def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + Y_res, T_res = nuisances + if Y_res.ndim == 1: + Y_res = Y_res.reshape((-1, 1)) + if T_res.ndim == 1: + T_res = T_res.reshape((-1, 1)) + effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1])) + Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) + if sample_weight is not None: + return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) + else: + return np.mean((Y_res - Y_res_pred)**2) + + super().__init__(ModelNuisance(model_Y_X, model_T_X, model_T_XZ), ModelFinal(model_final), + discrete_treatment=discrete_treatment, discrete_instrument=discrete_instrument, + n_splits=n_splits, random_state=random_state) + + def fit(self, Y, T, Z, X=None, *, sample_weight=None, sample_var=None, inference=None): + """ + Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. + + Parameters + ---------- + Y: (n, d_y) matrix or vector of length n + Outcomes for each sample + T: (n, d_t) matrix or vector of length n + Treatments for each sample + Z: (n, d_z) matrix + Instruments for each sample + X: optional(n, d_x) matrix or None (Default=None) + Features for each sample + sample_weight: optional(n,) vector or None (Default=None) + Weights for each samples + sample_var: optional(n,) vector or None (Default=None) + Sample variance for each sample + inference: string,:class:`.Inference` instance, or None + Method for performing inference. This estimator supports 'bootstrap' + (or an instance of:class:`.BootstrapInference`). + + Returns + ------- + self: _BaseDMLIV + """ + # Replacing fit from _OrthoLearner, to enforce W=None and improve the docstring + return super().fit(Y, T, X=X, Z=Z, + sample_weight=sample_weight, sample_var=sample_var, inference=inference) + + def score(self, Y, T, Z, X=None): + """ + Score the fitted CATE model on a new data set. 
Generates nuisance parameters + for the new data set based on the fitted residual nuisance models created at fit time. + It uses the mean prediction of the models fitted by the different crossfit folds. + Then calculates the MSE of the final residual Y on residual T regression. + + If model_final does not have a score method, then it raises an :exc:`.AttributeError` + + Parameters + ---------- + Y: (n, d_y) matrix or vector of length n + Outcomes for each sample + T: (n, d_t) matrix or vector of length n + Treatments for each sample + Z: optional(n, d_z) matrix + Instruments for each sample + X: optional(n, d_x) matrix or None (Default=None) + Features for each sample + + + Returns + ------- + score: float + The MSE of the final CATE model on the new data. + """ + # Replacing score from _OrthoLearner, to enforce W=None and improve the docstring + return super().score(Y, T, X=X, Z=Z) + + @property + def original_featurizer(self): + return super().model_final._model_final._original_featurizer + + @property + def featurizer(self): + # NOTE This is used by the inference methods and has to be the overall featurizer. intended + # for internal use by the library + return super().model_final._model_final._featurizer + + @property + def model_final(self): + # NOTE This is used by the inference methods and is more for internal use to the library + return super().model_final._model_final._model + + @property + def model_cate(self): + """ + Get the fitted final CATE model. + + Returns + ------- + model_cate: object of type(model_final) + An instance of the model_final object that was fitted after calling fit which corresponds + to the constant marginal CATE model. + """ + return super().model_final._model_final._model + + @property + def models_Y_X(self): + """ + Get the fitted models for :math:`\\E[Y | X]`. + + Returns + ------- + models_Y_X: list of objects of type(`model_Y_X`) + A list of instances of the `model_Y_X` object. Each element corresponds to a crossfitting + fold and is the model instance that was fitted for that training fold. + """ + return [mdl._model for mdl in super().models_Y_X] + + @property + def models_T_X(self): + """ + Get the fitted models for :math:`\\E[T | X]`. + + Returns + ------- + models_T_X: list of objects of type(`model_T_X`) + A list of instances of the `model_T_X` object. Each element corresponds to a crossfitting + fold and is the model instance that was fitted for that training fold. + """ + return [mdl._model for mdl in super().models_T_X] + + @property + def models_T_XZ(self): + """ + Get the fitted models for :math:`\\E[T | X, Z]`. + + Returns + ------- + models_T_XZ: list of objects of type(`model_T_XZ`) + A list of instances of the `model_T_XZ` object. Each element corresponds to a crossfitting + fold and is the model instance that was fitted for that training fold. + """ + return [mdl._model for mdl in super().models_T_XZ] + + def cate_feature_names(self, input_feature_names=None): + """ + Get the output feature names. + + Parameters + ---------- + input_feature_names: list of strings of length X.shape[1] or None + The names of the input features + + Returns + ------- + out_feature_names: list of strings or None + The names of the output features :math:`\\phi(X)`, i.e. the features with respect to which the + final constant marginal CATE model is linear. It is the names of the features that are associated + with each entry of the :meth:`coef_` parameter. 
Not available when the featurizer is not None and + does not have a method: `get_feature_names(input_feature_names)`. Otherwise None is returned. + """ + if self.original_featurizer is None: + return input_feature_names + elif hasattr(self.original_featurizer, 'get_feature_names'): + return self.original_featurizer.get_feature_names(input_feature_names) + else: + raise AttributeError("Featurizer does not have a method: get_feature_names!") + + +class DMLIV(_BaseDMLIV): + """ + A child of the _BaseDMLIV class that specifies a particular effect model + where the treatment effect is linear in some featurization of the variable X + The features are created by a provided featurizer that supports fit_transform. + Then an arbitrary model fits on the composite set of features. + + Concretely, it assumes that :math:`\\theta(X)=<\\theta, \\phi(X)>` for some features :math:`\\phi(X)` + and runs a linear model regression of :math:`Y-\\E[Y|X]` on :math:`phi(X)*(\\E[T|X,Z]-\\E[T|X])`. + The features are created by the featurizer provided by the user. The particular + linear model regression is also specified by the user (e.g. Lasso, ElasticNet) + + Parameters + ---------- + model_Y_X : estimator + model to predict :math:`\\E[Y | X]` + + model_T_X : estimator + model to predict :math:`\\E[T | X]` + + model_T_XZ : estimator + model to predict :math:`\\E[T | X, Z]` + + model_final : estimator + final linear model for predicting :math:`(Y-\\E[Y|X])` from :math:`\\phi(X) \\cdot (\\E[T|X,Z]-\\E[T|X])` + Method is incorrect if this model is not linear (e.g. Lasso, ElasticNet, LinearRegression). + + featurizer: :term:`transformer`, optional, default None + Must support fit_transform and transform. Used to create composite features in the final CATE regression. + It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X). + If featurizer=None, then CATE is trained on X. + + fit_cate_intercept : bool, optional, default True + Whether the linear CATE model should have a constant term. + + n_splits: int, cross-validation generator or an iterable, optional, default 2 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter` + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the treatment is discrete + :class:`~sklearn.model_selection.StratifiedKFold` is used, else, + :class:`~sklearn.model_selection.KFold` is used + (with a random shuffle in either case). + + Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all + W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. 
+ + discrete_instrument: bool, optional, default False + Whether the instrument values should be treated as categorical, rather than continuous, quantities + + discrete_treatment: bool, optional, default False + Whether the treatment values should be treated as categorical, rather than continuous, quantities + """ + + def __init__(self, model_Y_X, model_T_X, model_T_XZ, model_final, featurizer=None, + fit_cate_intercept=True, + n_splits=2, discrete_instrument=False, discrete_treatment=False, random_state=None): + self.bias_part_of_coef = fit_cate_intercept + self.fit_cate_intercept = fit_cate_intercept + super().__init__(_FirstStageWrapper(model_Y_X, False), + _FirstStageWrapper(model_T_X, discrete_treatment), + _FirstStageWrapper(model_T_XZ, discrete_treatment), + _FinalWrapper(model_final, + fit_cate_intercept=fit_cate_intercept, + featurizer=featurizer, + use_weight_trick=False), + n_splits=n_splits, + discrete_instrument=discrete_instrument, + discrete_treatment=discrete_treatment, + random_state=random_state) + + +class NonParamDMLIV(_BaseDMLIV): + """ + A child of the _BaseDMLIV class that allows for an arbitrary square loss based ML + method in the final stage of the DMLIV algorithm. The method has to support + sample weights and the fit method has to take as input sample_weights (e.g. random forests), i.e. + fit(X, y, sample_weight=None) + It achieves this by re-writing the final stage square loss of the DMLIV algorithm as: + + .. math :: + \\sum_i (\\E[T|X_i, Z_i] - \\E[T|X_i])^2 * ((Y_i - \\E[Y|X_i])/(\\E[T|X_i, Z_i] - \\E[T|X_i]) - \\theta(X))^2 + + Then this can be viewed as a weighted square loss regression, where the target label is + + .. math :: + \\tilde{Y}_i = (Y_i - \\E[Y|X_i])/(\\E[T|X_i, Z_i] - \\E[T|X_i]) + + and each sample has a weight of + + .. math :: + V(X_i) = (\\E[T|X_i, Z_i] - \\E[T|X_i])^2 + + Thus we can call any regression model with inputs: + + fit(X, :math:`\\tilde{Y}_i`, sample_weight= :math:`V(X_i)`) + + Parameters + ---------- + model_Y_X : estimator + model to predict :math:`\\E[Y | X]` + + model_T_X : estimator + model to predict :math:`\\E[T | X]` + + model_T_XZ : estimator + model to predict :math:`\\E[T | X, Z]` + + model_final : estimator + final model for predicting :math:`\\tilde{Y}` from X with sample weights V(X) + + n_splits: int, cross-validation generator or an iterable, optional, default 2 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter` + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the treatment is discrete + :class:`~sklearn.model_selection.StratifiedKFold` is used, else, + :class:`~sklearn.model_selection.KFold` is used + (with a random shuffle in either case). + + Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all + W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. 
+ + discrete_instrument: bool, optional, default False + Whether the instrument values should be treated as categorical, rather than continuous, quantities + + discrete_treatment: bool, optional, default False + Whether the treatment values should be treated as categorical, rather than continuous, quantities + """ + + def __init__(self, model_Y_X, model_T_X, model_T_XZ, model_final, + featurizer=None, fit_cate_intercept=True, + n_splits=2, discrete_instrument=False, discrete_treatment=False): + super().__init__(_FirstStageWrapper(model_Y_X, False), + _FirstStageWrapper(model_T_X, discrete_treatment), + _FirstStageWrapper(model_T_XZ, discrete_treatment), + _FinalWrapper(model_final, + fit_cate_intercept=fit_cate_intercept, + featurizer=featurizer, + use_weight_trick=True), + n_splits=n_splits, + discrete_instrument=discrete_instrument, + discrete_treatment=discrete_treatment) + + +class _BaseDRIV(_OrthoLearner): + + """ + The _BaseDRIV algorithm for estimating CATE with IVs. It is the parent of the + two public classes {DRIV, ProjectedDRIV} + + Parameters + ---------- + nuisance_models : dictionary of nuisance models, with {'name_of_model' : EstimatorObject, ...} + + model_final : estimator + model compatible with the sklearn regression API, used to fit the effect on X + + cov_clip : float, optional, default 0.1 + clipping of the covariate for regions with low "overlap", to reduce variance + + opt_reweighted : bool, optional, default False + Whether to reweight the samples to minimize variance. If True then + model_final.fit must accept sample_weight as a kw argument. If True then + assumes the model_final is flexible enough to fit the true CATE model. Otherwise, + it method will return a biased projection to the model_final space, biased + to give more weight on parts of the feature space where the instrument is strong. + + discrete_instrument: bool, optional, default False + Whether the instrument values should be treated as categorical, rather than continuous, quantities + + discrete_treatment: bool, optional, default False + Whether the treatment values should be treated as categorical, rather than continuous, quantities + + n_splits: int, cross-validation generator or an iterable, optional, default 2 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter` + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the treatment is discrete + :class:`~sklearn.model_selection.StratifiedKFold` is used, else, + :class:`~sklearn.model_selection.KFold` is used + (with a random shuffle in either case). + + Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all + W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; + If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used + by :mod:`np.random`. 
+ """ + + def __init__(self, + nuisance_models, + model_final, + featurizer=None, + cov_clip=0.1, opt_reweighted=False, + discrete_instrument=False, discrete_treatment=False, + n_splits=2, random_state=None): + class ModelFinal: + """ + Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient + that depends on X, i.e. + + .. math :: + Y - \\E[Y | X] = \\theta(X) \\cdot (\\E[T | X, Z] - \\E[T | X]) + \\epsilon + + and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final + residual on residual regression. + """ + + def __init__(self, model_final, featurizer): + self._model_final = clone(model_final, safe=False) + self._featurizer = clone(featurizer, safe=False) + + @staticmethod + def _effect_estimate(nuisances): + prel_theta, res_t, res_y, res_z, cov = [nuisance.reshape(nuisances[0].shape) for nuisance in nuisances] + + # Estimate final model of theta(X) by minimizing the square loss: + # (prel_theta(X) + (Y_res - prel_theta(X) * T_res) * Z_res / cov[T,Z | X] - theta(X))^2 + # We clip the covariance so that it is bounded away from zero, so as to reduce variance + # at the expense of some small bias. For points with very small covariance we revert + # to the model-based preliminary estimate and do not add the correction term. + cov_sign = np.sign(cov) + cov_sign[cov_sign == 0] = 1 + clipped_cov = cov_sign * np.clip(np.abs(cov), + self.cov_clip, np.inf) + return prel_theta + (res_y - prel_theta * res_t) * res_z / clipped_cov, clipped_cov + + def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + self.d_y = Y.shape[1:] + self.d_t = nuisances[1].shape[1:] + self.d_z = nuisances[3].shape[1:] + + # TODO: if opt_reweighted is False, we could change the logic to support multidimensional treatments, + # instruments, and outcomes + if self.d_y and self.d_y[0] > 2: + raise AttributeError("DRIV only supports a single outcome") + + if self.d_t and self.d_t[0] > 1: + if discrete_treatment: + raise AttributeError("DRIV only supports binary treatments") + else: + raise AttributeError("DRIV only supports single-dimensional continuous treatments") + + if self.d_z and self.d_z[0] > 1: + if discrete_instrument: + raise AttributeError("DRIV only supports binary instruments") + else: + raise AttributeError("DRIV only supports single-dimensional continuous instruments") + + theta_dr, clipped_cov = self._effect_estimate(nuisances) + + if (X is not None) and (self._featurizer is not None): + X = self._featurizer.fit_transform(X) + # TODO: how do we incorporate the sample_weight and sample_var passed into this method + # as arguments? + if opt_reweighted: + self._model_final.fit(X, theta_dr, sample_weight=clipped_cov.ravel()**2) + else: + self._model_final.fit(X, theta_dr) + return self + + def predict(self, X=None): + if (X is not None) and (self._featurizer is not None): + X = self._featurizer.transform(X) + return self._model_final.predict(X).reshape((-1,) + self.d_y + self.d_t) + + def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + # TODO: is there a good way to incorporate the other nuisance terms in the score? 
+ _, T_res, Y_res, _, _ = nuisances + + if Y_res.ndim == 1: + Y_res = Y_res.reshape((-1, 1)) + if T_res.ndim == 1: + T_res = T_res.reshape((-1, 1)) + if (X is not None) and (self._featurizer is not None): + X = self._featurizer.transform(X) + effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1])) + Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) + + if sample_weight is not None: + return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) + else: + return np.mean((Y_res - Y_res_pred)**2) + self.cov_clip = cov_clip + self.opt_reweighted = opt_reweighted + super().__init__(nuisance_models, ModelFinal(model_final, featurizer), + discrete_instrument=discrete_instrument, discrete_treatment=discrete_treatment, + n_splits=n_splits, random_state=random_state) + + def fit(self, Y, T, Z, X=None, *, sample_weight=None, sample_var=None, inference=None): + """ + Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. + + Parameters + ---------- + Y: (n, d_y) matrix or vector of length n + Outcomes for each sample + T: (n, d_t) matrix or vector of length n + Treatments for each sample + Z: (n, d_z) matrix + Instruments for each sample + X: optional(n, d_x) matrix or None (Default=None) + Features for each sample + sample_weight: optional(n,) vector or None (Default=None) + Weights for each samples + sample_var: optional(n,) vector or None (Default=None) + Sample variance for each sample + inference: string,:class:`.Inference` instance, or None + Method for performing inference. This estimator supports 'bootstrap' + (or an instance of:class:`.BootstrapInference`). + + Returns + ------- + self: _BaseDRIV instance + """ + # Replacing fit from _OrthoLearner, to enforce W=None and improve the docstring + return super().fit(Y, T, X=X, W=None, Z=Z, + sample_weight=sample_weight, sample_var=sample_var, inference=inference) + + @property + def featurizer(self): + # NOTE This is used by the inference methods and has to be the overall featurizer. intended + # for internal use by the library + return super().model_final._featurizer + + @property + def model_final(self): + # NOTE This is used by the inference methods and is more for internal use to the library + return super().model_final._model_final + + +class _IntentToTreatDRIV(_BaseDRIV): + """ + Helper class for the DRIV algorithm for the intent-to-treat A/B test setting + """ + + def __init__(self, model_Y_X, model_T_XZ, + prel_model_effect, + model_effect, + featurizer=None, + cov_clip=.1, + n_splits=3, + opt_reweighted=False): + """ + """ + + class ModelNuisance: + """ + Nuisance model fits the three models at fit time and at predict time + returns :math:`Y-\\E[Y|X]` and :math:`\\E[T|X,Z]-\\E[T|X]` as residuals. 
+ """ + + def __init__(self, model_Y_X, model_T_XZ, prel_model_effect): + self._model_Y_X = clone(model_Y_X, safe=False) + self._model_T_XZ = clone(model_T_XZ, safe=False) + self._prel_model_effect = clone(prel_model_effect, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert W is None, "IntentToTreatDRIV does not accept controls" + self._model_Y_X.fit(X, Y, sample_weight=sample_weight) + self._model_T_XZ.fit(X, Z, T, sample_weight=sample_weight) + # we need to undo the one-hot encoding for calling effect, + # since it expects raw values + self._prel_model_effect.fit(Y, inverse_onehot(T), inverse_onehot(Z), X=X) + return self + + def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + Y_pred = self._model_Y_X.predict(X) + T_pred_zero = self._model_T_XZ.predict(X, np.zeros(Z.shape)) + T_pred_one = self._model_T_XZ.predict(X, np.ones(Z.shape)) + delta = (T_pred_one - T_pred_zero) / 2 + T_pred_mean = (T_pred_one + T_pred_zero) / 2 + prel_theta = self._prel_model_effect.effect(X) + if X is None: # In this case predict above returns a single row + Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) + prel_theta = np.tile(prel_theta.reshape(1, -1), (T.shape[0], 1)) + Y_res = Y - Y_pred.reshape(Y.shape) + T_res = T - T_pred_mean.reshape(T.shape) + return prel_theta, T_res, Y_res, 2 * Z - 1, delta + # TODO: check that Y, T, Z do not have multiple columns + super().__init__(ModelNuisance(model_Y_X, model_T_XZ, prel_model_effect), model_effect, + featurizer=featurizer, + cov_clip=cov_clip, + n_splits=n_splits, + discrete_instrument=True, discrete_treatment=True, + opt_reweighted=opt_reweighted) + + +class _DummyCATE: + """ + A dummy cate effect model that always returns zero effect + """ + + def __init__(self): + return + + def fit(self, y, T, Z, X): + return self + + def effect(self, X): + if X is None: + return np.zeros(1) + return np.zeros(X.shape[0]) + + +class IntentToTreatDRIV(_IntentToTreatDRIV): + """ + Implements the DRIV algorithm for the intent-to-treat A/B test setting + + Parameters + ---------- + + Parameters + ---------- + model_Y_X : estimator + model to predict :math:`\\E[Y | X]` + + model_T_XZ : estimator + model to predict :math:`\\E[T | X, Z]` + + flexible_model_effect : estimator + a flexible model for a preliminary version of the CATE, must accept sample_weight at fit time. + + final_model_effect : estimator, optional + a final model for the CATE and projections. If None, then flexible_model_effect is also used as a final model + + cov_clip : float, optional, default 0.1 + clipping of the covariate for regions with low "overlap", to reduce variance + + n_splits: int, cross-validation generator or an iterable, optional, default 3 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter` + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the treatment is discrete + :class:`~sklearn.model_selection.StratifiedKFold` is used, else, + :class:`~sklearn.model_selection.KFold` is used + (with a random shuffle in either case). + + Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all + W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + + opt_reweighted : bool, optional, default False + Whether to reweight the samples to minimize variance. 
If True then + final_model_effect.fit must accept sample_weight as a kw argument (WeightWrapper from + utilities can be used for any linear model to enable sample_weights). If True then + assumes the final_model_effect is flexible enough to fit the true CATE model. Otherwise, + it method will return a biased projection to the model_effect space, biased + to give more weight on parts of the feature space where the instrument is strong. + """ + + def __init__(self, model_Y_X, model_T_XZ, + flexible_model_effect, + final_model_effect=None, + featurizer=None, + cov_clip=.1, + n_splits=3, + opt_reweighted=False): + model_Y_X = _FirstStageWrapper(model_Y_X, discrete_target=False) + model_T_XZ = _FirstStageWrapper(model_T_XZ, discrete_target=True) + prel_model_effect = _IntentToTreatDRIV(model_Y_X, + model_T_XZ, + _DummyCATE(), + flexible_model_effect, + cov_clip=1e-7, n_splits=1, opt_reweighted=True) + if final_model_effect is None: + final_model_effect = flexible_model_effect + super().__init__(model_Y_X, model_T_XZ, prel_model_effect, + final_model_effect, + featurizer=featurizer, + cov_clip=cov_clip, + n_splits=n_splits, + opt_reweighted=opt_reweighted) + + +class LinearIntentToTreatDRIV(StatsModelsCateEstimatorMixin, IntentToTreatDRIV): + """ + Implements the DRIV algorithm for the intent-to-treat A/B test setting + + Parameters + ---------- + model_Y_X : estimator + model to predict :math:`\\E[Y | X]` + + model_T_XZ : estimator + model to predict :math:`\\E[T | X, Z]` + + flexible_model_effect : estimator + a flexible model for a preliminary version of the CATE, must accept sample_weight at fit time. + + final_model_effect : estimator, optional + a final model for the CATE and projections. If None, then flexible_model_effect is also used as a final model + + cov_clip : float, optional, default 0.1 + clipping of the covariate for regions with low "overlap", to reduce variance + + n_splits: int, cross-validation generator or an iterable, optional, default 3 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter` + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the treatment is discrete + :class:`~sklearn.model_selection.StratifiedKFold` is used, else, + :class:`~sklearn.model_selection.KFold` is used + (with a random shuffle in either case). + + Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all + W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + + opt_reweighted : bool, optional, default False + Whether to reweight the samples to minimize variance. If True then + final_model_effect.fit must accept sample_weight as a kw argument (WeightWrapper from + utilities can be used for any linear model to enable sample_weights). If True then + assumes the final_model_effect is flexible enough to fit the true CATE model. Otherwise, + it method will return a biased projection to the model_effect space, biased + to give more weight on parts of the feature space where the instrument is strong. 
+ """ + + def __init__(self, model_Y_X, model_T_XZ, + flexible_model_effect, + featurizer=None, + fit_cate_intercept=True, + cov_clip=.1, + n_splits=3, + opt_reweighted=False): + self.fit_cate_intercept = fit_cate_intercept + self.bias_part_of_coef = fit_cate_intercept + featurizer = clone(featurizer, safe=False) + if fit_cate_intercept: + add_intercept = FunctionTransformer(lambda F: + hstack([np.ones((F.shape[0], 1)), F]), + validate=True) + if featurizer: + featurizer = Pipeline([('featurize', featurizer), + ('add_intercept', add_intercept)]) + else: + featurizer = add_intercept + super().__init__(model_Y_X, model_T_XZ, + flexible_model_effect=flexible_model_effect, + featurizer=featurizer, + final_model_effect=StatsModelsLinearRegression(fit_intercept=False), + cov_clip=cov_clip, n_splits=n_splits, opt_reweighted=opt_reweighted) + + # override only so that we can update the docstring to indicate support for `StatsModelsInference` + def fit(self, Y, T, Z, X=None, sample_weight=None, sample_var=None, inference=None): + """ + Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. + + Parameters + ---------- + Y: (n, d_y) matrix or vector of length n + Outcomes for each sample + T: (n, d_t) matrix or vector of length n + Treatments for each sample + Z: (n, d_z) matrix + Instruments for each sample + X: optional(n, d_x) matrix or None (Default=None) + Features for each sample + sample_weight: optional(n,) vector or None (Default=None) + Weights for each samples + sample_var: optional(n,) vector or None (Default=None) + Sample variance for each sample + inference: string,:class:`.Inference` instance, or None + Method for performing inference. This estimator supports 'bootstrap' + (or an instance of:class:`.BootstrapInference`) and 'statsmodels' + (or an instance of :class:`.StatsModelsInference`). + + Returns + ------- + self : instance + """ + return super().fit(Y, T, Z, X=X, sample_weight=sample_weight, sample_var=sample_var, inference=inference) diff --git a/econml/tests/test_ortho_iv.py b/econml/tests/test_ortho_iv.py new file mode 100644 index 000000000..1646867d3 --- /dev/null +++ b/econml/tests/test_ortho_iv.py @@ -0,0 +1,357 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +import unittest +import pytest +from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, PolynomialFeatures +from sklearn.model_selection import KFold +from econml.ortho_iv import (DMLATEIV, ProjectedDMLATEIV, DMLIV, NonParamDMLIV, + IntentToTreatDRIV, LinearIntentToTreatDRIV) +import numpy as np +from econml.utilities import shape, hstack, vstack, reshape, cross_product +from econml.inference import BootstrapInference +from contextlib import ExitStack +from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier +import itertools +from econml.sklearn_extensions.linear_model import WeightedLasso +from econml.tests.test_statsmodels import _summarize +import econml.tests.utilities # bugfix for assertWarns + + +class TestDML(unittest.TestCase): + + def test_cate_api(self): + """Test that we correctly implement the CATE API.""" + n = 30 + + def size(n, d): + return (n, d) if d >= 0 else (n,) + + def make_random(is_discrete, d): + if d is None: + return None + sz = size(n, d) + if is_discrete: + while True: + arr = np.random.choice(['a', 'b', 'c'], size=sz) + # ensure that we've got at least two of every row + _, counts = np.unique(arr, return_counts=True, axis=0) + if len(counts) == 3**(d if d > 0 else 1) and counts.min() > 1: + return arr + else: + return np.random.normal(size=sz) + + def eff_shape(n, d_y): + return (n,) + ((d_y,) if d_y > 0 else()) + + def marg_eff_shape(n, d_y, d_t_final): + return ((n,) + + ((d_y,) if d_y > 0 else ()) + + ((d_t_final,) if d_t_final > 0 else())) + + # since T isn't passed to const_marginal_effect, defaults to one row if X is None + def const_marg_eff_shape(n, d_x, d_y, d_t_final): + return ((n if d_x else 1,) + + ((d_y,) if d_y > 0 else ()) + + ((d_t_final,) if d_t_final > 0 else())) + + for d_t in [2, 1, -1]: + n_t = d_t if d_t > 0 else 1 + for discrete_t in [True, False] if n_t == 1 else [False]: + for d_y in [3, 1, -1]: + for d_q in [2, None]: + for d_z in [2, 1]: + if d_z < n_t: + continue + for discrete_z in [True, False] if d_z == 1 else[False]: + Z1, Q, Y, T1 = [make_random(is_discrete, d) + for is_discrete, d in [(discrete_z, d_z), + (False, d_q), + (False, d_y), + (discrete_t, d_t)]] + if discrete_t and discrete_z: + # need to make sure we get all *joint* combinations + arr = make_random(True, 2) + Z1 = arr[:, 0].reshape(size(n, d_z)) + T1 = arr[:, 0].reshape(size(n, d_t)) + + d_t_final1 = 2 if discrete_t else d_t + + if discrete_t: + # IntentToTreat only supports binary treatments/instruments + T2 = T1.copy() + T2[T1 == 'c'] = np.random.choice(['a', 'b'], size=np.count_nonzero(T1 == 'c')) + d_t_final2 = 1 + if discrete_z: + # IntentToTreat only supports binary treatments/instruments + Z2 = Z1.copy() + Z2[Z1 == 'c'] = np.random.choice(['a', 'b'], size=np.count_nonzero(Z1 == 'c')) + + effect_shape = eff_shape(n, d_y) + + model_t = LogisticRegression() if discrete_t else Lasso() + model_z = LogisticRegression() if discrete_z else Lasso() + + # TODO: add stratification to bootstrap so that we can use it + # even with discrete treatments + all_infs = [None] + if not (discrete_t or discrete_z): + all_infs.append(BootstrapInference(1)) + + estimators = [(DMLATEIV(model_Y_X=Lasso(), + model_T_X=model_t, + model_Z_X=model_z, + discrete_treatment=discrete_t, + discrete_instrument=discrete_z), + True, + all_infs), + (ProjectedDMLATEIV(model_Y_X=Lasso(), + model_T_X=model_t, + 
model_T_XZ=model_t, + discrete_treatment=discrete_t, + discrete_instrument=discrete_z), + False, + all_infs), + (DMLIV(model_Y_X=Lasso(), model_T_X=model_t, model_T_XZ=model_t, + model_final=Lasso(), + discrete_treatment=discrete_t, discrete_instrument=discrete_z), + False, + all_infs)] + + if d_q and discrete_t and discrete_z: + # IntentToTreat requires X + estimators.append(( + LinearIntentToTreatDRIV(model_Y_X=Lasso(), model_T_XZ=model_t, + flexible_model_effect=WeightedLasso(), + n_splits=2), + False, + all_infs + ['statsmodels'])) + + for est, multi, infs in estimators: + if (not multi and d_y > 1) or d_t > 1 or d_z > 1: + continue + + for inf in infs: + with self.subTest(d_z=d_z, d_x=d_q, d_y=d_y, d_t=d_t, + discrete_t=discrete_t, discrete_z=discrete_z, + est=est, inf=inf): + Z = Z1 + T = T1 + d_t_final = d_t_final1 + X = Q + d_x = d_q + W = None + + if isinstance(est, (DMLATEIV, ProjectedDMLATEIV)): + # these support only W but not X + W = Q + X = None + d_x = None + + def fit(): + return est.fit(Y, T, Z=Z, W=W, inference=inf) + + def score(): + return est.score(Y, T, Z=Z, W=W) + else: + # these support only binary, not general discrete T and Z + if discrete_t: + T = T2 + d_t_final = d_t_final2 + + if discrete_z: + Z = Z2 + + def fit(): + return est.fit(Y, T, Z=Z, X=X, inference=inf) + + def score(): + return est.score(Y, T, Z=Z, X=X) + + marginal_effect_shape = marg_eff_shape(n, d_y, d_t_final) + const_marginal_effect_shape = const_marg_eff_shape(n, d_x, d_y, d_t_final) + + fit() + # make sure we can call the marginal_effect and effect methods + const_marg_eff = est.const_marginal_effect(X) + marg_eff = est.marginal_effect(T, X) + self.assertEqual(shape(marg_eff), marginal_effect_shape) + self.assertEqual(shape(const_marg_eff), const_marginal_effect_shape) + + np.testing.assert_array_equal( + marg_eff if d_x else marg_eff[0:1], const_marg_eff) + + T0 = np.full_like(T, 'a') if discrete_t else np.zeros_like(T) + eff = est.effect(X, T0=T0, T1=T) + self.assertEqual(shape(eff), effect_shape) + + # TODO: add tests for extra properties like coef_ where they exist + + if inf is not None: + const_marg_eff_int = est.const_marginal_effect_interval(X) + marg_eff_int = est.marginal_effect_interval(T, X) + self.assertEqual(shape(marg_eff_int), + (2,) + marginal_effect_shape) + self.assertEqual(shape(const_marg_eff_int), + (2,) + const_marginal_effect_shape) + self.assertEqual(shape(est.effect_interval(X, T0=T0, T1=T)), + (2,) + effect_shape) + + # TODO: add tests for extra properties like coef_ where they exist + + score() + + # make sure we can call effect with implied scalar treatments, + # no matter the dimensions of T, and also that we warn when there + # are multiple treatments + if d_t > 1: + cm = self.assertWarns(Warning) + else: + # ExitStack can be used as a "do nothing" ContextManager + cm = ExitStack() + with cm: + effect_shape2 = (n if d_x else 1,) + ((d_y,) if d_y > 0 else ()) + eff = est.effect(X) if not discrete_t else est.effect( + X, T0='a', T1='b') + self.assertEqual(shape(eff), effect_shape2) + + def test_bad_splits_discrete(self): + """ + Tests that an error is raised when some training splits in a crossfit fold + don't contain all treatments.
+ """ + Y = np.array([2, 3, 1, 3, 2, 1, 1, 1]) + bad = np.array([2, 2, 1, 2, 1, 1, 1, 1]) + W = np.ones((8, 1)) + ok = np.array([1, 2, 3, 1, 2, 3, 1, 2]) + models = [Lasso(), Lasso(), Lasso()] + est = DMLATEIV(*models, n_splits=[(np.arange(4, 8), np.arange(4))]) + est.fit(Y, T=bad, Z=bad, W=W) # imbalance ok with continuous instrument/treatment + + models = [Lasso(), LogisticRegression(), Lasso()] + est = DMLATEIV(*models, n_splits=[(np.arange(4, 8), np.arange(4))], discrete_treatment=True) + with pytest.raises(AttributeError): + est.fit(Y, T=bad, Z=ok, W=W) + + models = [Lasso(), Lasso(), LogisticRegression()] + est = DMLATEIV(*models, n_splits=[(np.arange(4, 8), np.arange(4))], discrete_instrument=True) + with pytest.raises(AttributeError): + est.fit(Y, T=ok, Z=bad, W=W) + + # TODO: ideally we could also test whether Z and X are jointly okay when both discrete + # however, with custom splits the checking happens in the first stage wrapper + # where we don't have all of the required information to do this; + # we'd probably need to add it to _crossfit instead + + def test_multidim_arrays_fail(self): + + Y = np.array([2, 3, 1, 3, 2, 1, 1, 1]) + three_class = np.array([1, 2, 3, 1, 2, 3, 1, 2]) + two_class = np.array([1, 2, 1, 1, 2, 1, 1, 2]) + + est = NonParamDMLIV(Lasso(), LogisticRegression(), LogisticRegression(), + WeightedLasso(), discrete_treatment=True) + + with pytest.raises(AttributeError): + est.fit(Y, T=three_class, Z=two_class) + + est = IntentToTreatDRIV(Lasso(), LogisticRegression(), WeightedLasso()) + + with pytest.raises(AttributeError): + est.fit(Y, T=three_class, Z=two_class) + + with pytest.raises(AttributeError): + est.fit(Y, T=two_class, Z=three_class) + + # TODO: make IV related + def test_access_to_internal_models(self): + """ + Test that API related to accessing the nuisance models, cate_model and featurizer is working. 
+ """ + from econml.dml import DMLCateEstimator + + Y = np.array([2, 3, 1, 3, 2, 1, 1, 1]) + T = np.array([3, 2, 1, 2, 1, 2, 1, 3]) + X = np.ones((8, 1)) + est = DMLCateEstimator(model_y=WeightedLasso(), + model_t=LogisticRegression(), + model_final=WeightedLasso(), + featurizer=PolynomialFeatures(degree=2, include_bias=False), + fit_cate_intercept=True, + discrete_treatment=True) + est.fit(Y, T, X) + assert isinstance(est.original_featurizer, PolynomialFeatures) + assert isinstance(est.featurizer, Pipeline) + assert isinstance(est.model_cate, WeightedLasso) + for mdl in est.models_y: + assert isinstance(mdl, WeightedLasso) + for mdl in est.models_t: + assert isinstance(mdl, LogisticRegression) + np.testing.assert_array_equal(est.cate_feature_names(['A']), ['A', 'A^2']) + np.testing.assert_array_equal(est.cate_feature_names(), ['x0', 'x0^2']) + est = DMLCateEstimator(model_y=WeightedLasso(), + model_t=LogisticRegression(), + model_final=WeightedLasso(), + featurizer=None, + fit_cate_intercept=True, + discrete_treatment=True) + est.fit(Y, T, X) + assert est.original_featurizer is None + assert isinstance(est.featurizer, FunctionTransformer) + assert isinstance(est.model_cate, WeightedLasso) + for mdl in est.models_y: + assert isinstance(mdl, WeightedLasso) + for mdl in est.models_t: + assert isinstance(mdl, LogisticRegression) + np.testing.assert_array_equal(est.cate_feature_names(['A']), ['A']) + + def test_can_use_statsmodel_inference(self): + """Test that we can use statsmodels to generate confidence intervals""" + est = LinearIntentToTreatDRIV(LinearRegression(), LogisticRegression(C=1000), WeightedLasso()) + est.fit(np.array([1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2]), + np.array([1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2]), + np.array([1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2]), + X=np.array([1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]).reshape(-1, 1), + inference='statsmodels') + interval = est.effect_interval(np.ones((9, 1)), + T0=np.array([1, 1, 1, 2, 2, 2, 1, 1, 1]), + T1=np.array([1, 2, 1, 1, 2, 2, 2, 2, 1]), + alpha=0.05) + point = est.effect(np.ones((9, 1)), + T0=np.array([1, 1, 1, 2, 2, 2, 1, 1, 1]), + T1=np.array([1, 2, 1, 1, 2, 2, 2, 2, 1])) + + assert len(interval) == 2 + lo, hi = interval + assert lo.shape == hi.shape == point.shape + assert np.all(lo <= point) + assert np.all(point <= hi) + assert np.any(lo < hi) # for at least some of the examples, the CI should have nonzero width + + interval = est.const_marginal_effect_interval(np.ones((9, 1)), alpha=0.05) + point = est.const_marginal_effect(np.ones((9, 1))) + assert len(interval) == 2 + lo, hi = interval + assert lo.shape == hi.shape == point.shape + assert np.all(lo <= point) + assert np.all(point <= hi) + assert np.any(lo < hi) # for at least some of the examples, the CI should have nonzero width + + interval = est.coef__interval(alpha=0.05) + point = est.coef_ + assert len(interval) == 2 + lo, hi = interval + assert lo.shape == hi.shape == point.shape + assert np.all(lo <= point) + assert np.all(point <= hi) + assert np.any(lo < hi) # for at least some of the examples, the CI should have nonzero width + + interval = est.intercept__interval(alpha=0.05) + point = est.intercept_ + assert len(interval) == 2 + lo, hi = interval + assert np.all(lo <= point) + assert np.all(point <= hi) + assert np.any(lo < hi) # for at least some of the examples, the CI should have nonzero width diff --git a/econml/tests/test_ortho_learner.py b/econml/tests/test_ortho_learner.py index 42c1c2916..62db46d55 100644 --- a/econml/tests/test_ortho_learner.py +++ 
b/econml/tests/test_ortho_learner.py @@ -149,7 +149,7 @@ def score(self, Y, T, W=None, nuisances=None): sigma = 0.1 y = X[:, 0] + X[:, 1] + np.random.normal(0, sigma, size=(10000,)) est = _OrthoLearner(ModelNuisance(LinearRegression(), LinearRegression()), ModelFinal(), - n_splits=2, discrete_treatment=False, random_state=None) + n_splits=2, discrete_treatment=False, discrete_instrument=False, random_state=None) est.fit(y, X[:, 0], W=X[:, 1:]) np.testing.assert_almost_equal(est.const_marginal_effect(), 1, decimal=3) np.testing.assert_array_almost_equal(est.effect(), np.ones(1), decimal=3) @@ -164,7 +164,7 @@ def score(self, Y, T, W=None, nuisances=None): sigma = 0.1 y = X[:, 0] + X[:, 1] + np.random.normal(0, sigma, size=(10000,)) est = _OrthoLearner(ModelNuisance(LinearRegression(), LinearRegression()), ModelFinal(), - n_splits=2, discrete_treatment=False, random_state=None) + n_splits=2, discrete_treatment=False, discrete_instrument=False, random_state=None) est.fit(y, X[:, 0], None, X[:, 1:]) np.testing.assert_almost_equal(est.const_marginal_effect(), 1, decimal=3) np.testing.assert_array_almost_equal(est.effect(), np.ones(1), decimal=3) @@ -179,7 +179,8 @@ def score(self, Y, T, W=None, nuisances=None): sigma = 0.1 y = X[:, 0] + X[:, 1] + np.random.normal(0, sigma, size=(10000,)) est = _OrthoLearner(ModelNuisance(LinearRegression(), LinearRegression()), ModelFinal(), - n_splits=KFold(n_splits=3), discrete_treatment=False, random_state=None) + n_splits=KFold(n_splits=3), + discrete_treatment=False, discrete_instrument=False, random_state=None) est.fit(y, X[:, 0], None, X[:, 1:]) np.testing.assert_almost_equal(est.const_marginal_effect(), 1, decimal=3) np.testing.assert_array_almost_equal(est.effect(), np.ones(1), decimal=3) @@ -195,7 +196,8 @@ def score(self, Y, T, W=None, nuisances=None): y = X[:, 0] + X[:, 1] + np.random.normal(0, sigma, size=(10000,)) folds = [(np.arange(X.shape[0] // 2), np.arange(X.shape[0] // 2, X.shape[0]))] est = _OrthoLearner(ModelNuisance(LinearRegression(), LinearRegression()), ModelFinal(), - n_splits=KFold(n_splits=3), discrete_treatment=False, random_state=None) + n_splits=KFold(n_splits=3), discrete_treatment=False, + discrete_instrument=False, random_state=None) est.fit(y, X[:, 0], None, X[:, 1:]) np.testing.assert_almost_equal(est.const_marginal_effect(), 1, decimal=3) np.testing.assert_array_almost_equal(est.effect(), np.ones(1), decimal=3) @@ -236,7 +238,7 @@ def predict(self, X=None): sigma = 0.1 y = X[:, 0] + X[:, 1] + np.random.normal(0, sigma, size=(10000,)) est = _OrthoLearner(ModelNuisance(LinearRegression(), LinearRegression()), ModelFinal(), - n_splits=2, discrete_treatment=False, random_state=None) + n_splits=2, discrete_treatment=False, discrete_instrument=False, random_state=None) est.fit(y, X[:, 0], W=X[:, 1:]) np.testing.assert_almost_equal(est.const_marginal_effect(), 1, decimal=3) np.testing.assert_array_almost_equal(est.effect(), np.ones(1), decimal=3) @@ -284,7 +286,7 @@ def score(self, Y, T, W=None, nuisances=None): sigma = 0.01 y = T + X[:, 0] + np.random.normal(0, sigma, size=(10000,)) est = _OrthoLearner(ModelNuisance(LogisticRegression(solver='lbfgs'), LinearRegression()), ModelFinal(), - n_splits=2, discrete_treatment=True, random_state=None) + n_splits=2, discrete_treatment=True, discrete_instrument=False, random_state=None) est.fit(y, T, W=X) np.testing.assert_almost_equal(est.const_marginal_effect(), 1, decimal=3) np.testing.assert_array_almost_equal(est.effect(), np.ones(1), decimal=3) diff --git a/econml/utilities.py 
b/econml/utilities.py index 640db1216..c36a10df0 100644 --- a/econml/utilities.py +++ b/econml/utilities.py @@ -1340,3 +1340,33 @@ def as_html(self): if self.extra_txt is not None: html = html + '<br/><br/>' + self.extra_txt.replace('\n', '<br/>') return html + + +class SeparateModel: + """ + Splits the data based on the last feature and trains + a separate model on each subsample. At predict time, the + last feature selects which model is used for each row. + """ + + def __init__(self, *models): + self.models = [clone(model) for model in models] + + def fit(self, XZ, T): + # model i is fit on the rows whose last feature equals i + for (i, m) in enumerate(self.models): + inds = (XZ[:, -1] == i) + m.fit(XZ[inds, :-1], T[inds]) + return self + + def predict(self, XZ): + t_pred = np.zeros(XZ.shape[0]) + for (i, m) in enumerate(self.models): + inds = (XZ[:, -1] == i) + if np.any(inds): + t_pred[inds] = m.predict(XZ[inds, :-1]) + return t_pred + + @property + def coef_(self): + return np.concatenate([model.coef_ for model in self.models])
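A minimal sketch of how the new SeparateModel helper is intended to be used (the data below is made up for illustration; the last column of the feature matrix is the integer group index that selects which model handles each row):

    import numpy as np
    from sklearn.linear_model import LinearRegression
    from econml.utilities import SeparateModel

    n = 200
    X = np.random.normal(size=(n, 2))
    group = np.random.binomial(1, 0.5, size=(n, 1))    # last feature: 0 or 1
    XZ = np.hstack([X, group])
    # each group follows a different linear model of the remaining features
    T = np.where(group[:, 0] == 0, X[:, 0], -X[:, 1]) + np.random.normal(0, 0.1, size=(n,))

    sep = SeparateModel(LinearRegression(), LinearRegression()).fit(XZ, T)
    T_pred = sep.predict(XZ)   # routes each row to its group's fitted model
    print(sep.coef_)           # coefficients of both fitted models, concatenated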