Commit ac12f54
Handle pandas categorical types for categorical columns in _causal_analysis.py (#602)
If a treatment column is explicitly given a categorical dtype, the `CausalAnalysis` class fails:

```
~\AppData\Local\Continuum\miniconda3\envs\nhs-hips\lib\site-packages\econml\solutions\causal_analysis\_causal_analysis.py in individualized_policy(self, Xtest, feature_index, n_rows, treatment_costs, alpha)
   1714         all_costs = np.array([0] + [treatment_costs] * (len(treatment_arr) - 1))
   1715         # construct index of current treatment
-> 1716         current_ind = (current_treatment.reshape(-1, 1) ==
   1717                        treatment_arr.reshape(1, -1)) @ np.arange(len(treatment_arr))
   1718         current_cost = all_costs[current_ind]

~\AppData\Local\Continuum\miniconda3\envs\nhs-hips\lib\site-packages\pandas\core\ops\common.py in new_method(self, other)
     67     other = item_from_zerodim(other)
     68
---> 69     return method(self, other)
     70
     71     return new_method

~\AppData\Local\Continuum\miniconda3\envs\nhs-hips\lib\site-packages\pandas\core\arrays\categorical.py in func(self, other)
    131     if is_list_like(other) and len(other) != len(self) and not hashable:
    132         # in hashable case we may have a tuple that is itself a category
--> 133         raise ValueError("Lengths must match.")
    134
    135     if not self.ordered:
```

The solution is to check whether the treatment column is of type `pd.core.arrays.categorical.Categorical` and, if so, extract the underlying numpy array with its `to_numpy()` method.
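To make the failure concrete, here is a minimal sketch of the broken pattern and the workaround (hypothetical data; only the public pandas and NumPy APIs are assumed):

```python
import numpy as np
import pandas as pd

# Hypothetical current-treatment column stored with an explicit categorical dtype;
# .values then yields a pandas Categorical rather than an np.ndarray.
current = pd.Series(['a', 'b', 'a', 'c'], dtype='category').values

treatments = np.array(['a', 'b', 'c'])

# Comparing a Categorical against a list-like of a different length raises
# ValueError("Lengths must match.") instead of broadcasting like NumPy does.
try:
    current == treatments
except ValueError as e:
    print(e)  # Lengths must match.

# After extracting the underlying ndarray, NumPy broadcasting applies, so the
# one-hot trick used by individualized_policy works as intended:
cur = current.to_numpy()
onehot = cur.reshape(-1, 1) == treatments.reshape(1, -1)  # boolean, shape (4, 3)
current_ind = onehot @ np.arange(len(treatments))         # array([0, 1, 0, 2])
```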
1 parent: 5cf6920

2 files changed: +122 −104 lines
econml/solutions/causal_analysis/_causal_analysis.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -1701,6 +1701,8 @@ def individualized_policy(self, Xtest, feature_index, *, n_rows=None, treatment_
         effect = result.estimator.effect_inference(Xtest, T0=orig_df['Current treatment'], T1=rec)
         # we now need to construct the delta in the cost between the two treatments and translate the effect
         current_treatment = orig_df['Current treatment'].values
+        if isinstance(current_treatment, pd.core.arrays.categorical.Categorical):
+            current_treatment = current_treatment.to_numpy()
         if np.ndim(treatment_costs) >= 2:
             # remove third dimension potentially added
             if multi_y:  # y was an array, not a vector
```
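For reference, `pd.Categorical` is the public alias of `pd.core.arrays.categorical.Categorical`, and `Series.values` returns such an object for category-dtype columns. A minimal sketch (hypothetical column values) of what the new guard normalizes:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'Current treatment': pd.Series(['a', 'b', 'a'], dtype='category')})

values = df['Current treatment'].values
# For a category-dtype column, .values is a pandas Categorical, not an
# np.ndarray, which is what trips up the downstream one-hot comparison.
assert isinstance(values, pd.core.arrays.categorical.Categorical)

# The added guard converts it to a plain ndarray before any NumPy arithmetic.
if isinstance(values, pd.core.arrays.categorical.Categorical):
    values = values.to_numpy()
assert isinstance(values, np.ndarray)
```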

econml/tests/test_causal_analysis.py

Lines changed: 120 additions & 104 deletions
```diff
@@ -85,6 +85,8 @@ def test_basic_array(self):
                     # policy value should exceed always treating with any treatment
                     assert_less_close(np.array(list(always_trt.values())), policy_val)
 
+                    ind_pol = ca.individualized_policy(X, inds[idx])
+
                 # global shape is (d_y, sum(d_t))
                 assert glo_point_est.shape == coh_point_est.shape == (1, 5)
                 assert loc_point_est.shape == (2,) + glo_point_est.shape
@@ -128,113 +130,121 @@ def test_basic_array(self):
 
     def test_basic_pandas(self):
         for classification in [False, True]:
-            y = pd.Series(np.random.choice([0, 1], size=(500,)))
-            X = pd.DataFrame({'a': np.random.normal(size=500),
-                              'b': np.random.normal(size=500),
-                              'c': np.random.choice([0, 1], size=500),
-                              'd': np.random.choice(['a', 'b', 'c'], size=500)})
-            n_inds = [0, 1, 2, 3]
-            t_inds = ['a', 'b', 'c', 'd']
-            n_cats = [2, 3]
-            t_cats = ['c', 'd']
-            n_hinds = [0, 3]
-            t_hinds = ['a', 'd']
-            for (inds, cats, hinds) in [(n_inds, n_cats, n_hinds), (t_inds, t_cats, t_hinds)]:
-                ca = CausalAnalysis(inds, cats, hinds, classification=classification)
-                ca.fit(X, y)
-                glo = ca.global_causal_effect()
-                coh = ca.cohort_causal_effect(X[:2])
-                loc = ca.local_causal_effect(X[:2])
-
-                # global and cohort data should have exactly the same structure, but different values
-                assert glo.index.equals(coh.index)
-
-                # local index should have as many times entries as global as there were rows passed in
-                assert len(loc.index) == 2 * len(glo.index)
-
-                assert glo.index.names == ['feature', 'feature_value']
-                assert loc.index.names == ['sample'] + glo.index.names
-
-                # features; for categoricals they should appear #cats-1 times each
-                fts = ['a', 'b', 'c', 'd', 'd']
-
-                for i in range(len(fts)):
-                    assert fts[i] == glo.index[i][0] == loc.index[i][1] == loc.index[len(fts) + i][1]
-
-                glo_dict = ca._global_causal_effect_dict()
-                glo_dict2 = ca._global_causal_effect_dict(row_wise=True)
-
-                coh_dict = ca._cohort_causal_effect_dict(X[:2])
-                coh_dict2 = ca._cohort_causal_effect_dict(X[:2], row_wise=True)
-
-                loc_dict = ca._local_causal_effect_dict(X[:2])
-                loc_dict2 = ca._local_causal_effect_dict(X[:2], row_wise=True)
-
-                glo_point_est = np.array(glo_dict[_CausalInsightsConstants.PointEstimateKey])
-                coh_point_est = np.array(coh_dict[_CausalInsightsConstants.PointEstimateKey])
-                loc_point_est = np.array(loc_dict[_CausalInsightsConstants.PointEstimateKey])
-
-                # global shape is (d_y, sum(d_t))
-                assert glo_point_est.shape == coh_point_est.shape == (1, 5)
-                assert loc_point_est.shape == (2,) + glo_point_est.shape
-
-                # global and cohort row-wise dicts have d_y * d_t entries
-                assert len(
-                    glo_dict2[_CausalInsightsConstants.RowData]) == len(
-                    coh_dict2[_CausalInsightsConstants.RowData]) == 5
-                # local dictionary is flattened to n_rows * d_y * d_t
-                assert len(loc_dict2[_CausalInsightsConstants.RowData]) == 10
-
-                pto = ca._policy_tree_output(X, inds[1])
-                ca._heterogeneity_tree_output(X, inds[1])
-                ca._heterogeneity_tree_output(X, inds[3])
-
-                # continuous treatments have typical treatment values equal to
-                # the mean of the absolute value of non-zero entries
-                np.testing.assert_allclose(ca.typical_treatment_value(inds[0]), np.mean(np.abs(X['a'])))
-                np.testing.assert_allclose(ca.typical_treatment_value(inds[1]), np.mean(np.abs(X['b'])))
-                # discrete treatments have typical treatment value 1
-                assert ca.typical_treatment_value(inds[2]) == ca.typical_treatment_value(inds[3]) == 1
-
-                # Make sure we handle continuous, binary, and multi-class treatments
-                # For multiple discrete treatments, one "always treat" value per non-default treatment
-                for (idx, length) in [(0, 1), (1, 1), (2, 1), (3, 2)]:
-                    pto = ca._policy_tree_output(X, inds[idx])
-                    policy_val = pto.policy_value
-                    always_trt = pto.always_treat
-                    assert isinstance(pto.control_name, str)
-                    assert isinstance(always_trt, dict)
-                    assert np.array(policy_val).shape == ()
-                    assert len(always_trt) == length
-                    for val in always_trt.values():
-                        assert np.array(val).shape == ()
-
-                    # policy value should exceed always treating with any treatment
-                    assert_less_close(np.array(list(always_trt.values())), policy_val)
-
-                if not classification:
-                    # ExitStack can be used as a "do nothing" ContextManager
-                    cm = ExitStack()
-                else:
-                    cm = self.assertRaises(Exception)
-                with cm:
-                    inf = ca.whatif(X[:2], np.ones(shape=(2,)), inds[1], y[:2])
-                    assert np.shape(inf.point_estimate) == np.shape(y[:2])
-                    inf = ca.whatif(X[:2], np.ones(shape=(2,)), inds[2], y[:2])
-                    assert np.shape(inf.point_estimate) == np.shape(y[:2])
+            for category in [False, True]:
+                y = pd.Series(np.random.choice([0, 1], size=(500,)))
+                X = pd.DataFrame({'a': np.random.normal(size=500),
+                                  'b': np.random.normal(size=500),
+                                  'c': np.random.choice([0, 1], size=500),
+                                  'd': np.random.choice(['a', 'b', 'c'], size=500)})
+
+                if category:
+                    X['c'] = X['c'].astype('category')
+                    X['d'] = X['d'].astype('category')
+
+                n_inds = [0, 1, 2, 3]
+                t_inds = ['a', 'b', 'c', 'd']
+                n_cats = [2, 3]
+                t_cats = ['c', 'd']
+                n_hinds = [0, 3]
+                t_hinds = ['a', 'd']
+                for (inds, cats, hinds) in [(n_inds, n_cats, n_hinds), (t_inds, t_cats, t_hinds)]:
+                    ca = CausalAnalysis(inds, cats, hinds, classification=classification)
+                    ca.fit(X, y)
+                    glo = ca.global_causal_effect()
+                    coh = ca.cohort_causal_effect(X[:2])
+                    loc = ca.local_causal_effect(X[:2])
+
+                    # global and cohort data should have exactly the same structure, but different values
+                    assert glo.index.equals(coh.index)
+
+                    # local index should have as many times entries as global as there were rows passed in
+                    assert len(loc.index) == 2 * len(glo.index)
+
+                    assert glo.index.names == ['feature', 'feature_value']
+                    assert loc.index.names == ['sample'] + glo.index.names
+
+                    # features; for categoricals they should appear #cats-1 times each
+                    fts = ['a', 'b', 'c', 'd', 'd']
+
+                    for i in range(len(fts)):
+                        assert fts[i] == glo.index[i][0] == loc.index[i][1] == loc.index[len(fts) + i][1]
+
+                    glo_dict = ca._global_causal_effect_dict()
+                    glo_dict2 = ca._global_causal_effect_dict(row_wise=True)
+
+                    coh_dict = ca._cohort_causal_effect_dict(X[:2])
+                    coh_dict2 = ca._cohort_causal_effect_dict(X[:2], row_wise=True)
+
+                    loc_dict = ca._local_causal_effect_dict(X[:2])
+                    loc_dict2 = ca._local_causal_effect_dict(X[:2], row_wise=True)
+
+                    glo_point_est = np.array(glo_dict[_CausalInsightsConstants.PointEstimateKey])
+                    coh_point_est = np.array(coh_dict[_CausalInsightsConstants.PointEstimateKey])
+                    loc_point_est = np.array(loc_dict[_CausalInsightsConstants.PointEstimateKey])
+
+                    # global shape is (d_y, sum(d_t))
+                    assert glo_point_est.shape == coh_point_est.shape == (1, 5)
+                    assert loc_point_est.shape == (2,) + glo_point_est.shape
+
+                    # global and cohort row-wise dicts have d_y * d_t entries
+                    assert len(
+                        glo_dict2[_CausalInsightsConstants.RowData]) == len(
+                        coh_dict2[_CausalInsightsConstants.RowData]) == 5
+                    # local dictionary is flattened to n_rows * d_y * d_t
+                    assert len(loc_dict2[_CausalInsightsConstants.RowData]) == 10
+
+                    pto = ca._policy_tree_output(X, inds[1])
+                    ca._heterogeneity_tree_output(X, inds[1])
+                    ca._heterogeneity_tree_output(X, inds[3])
+
+                    # continuous treatments have typical treatment values equal to
+                    # the mean of the absolute value of non-zero entries
+                    np.testing.assert_allclose(ca.typical_treatment_value(inds[0]), np.mean(np.abs(X['a'])))
+                    np.testing.assert_allclose(ca.typical_treatment_value(inds[1]), np.mean(np.abs(X['b'])))
+                    # discrete treatments have typical treatment value 1
+                    assert ca.typical_treatment_value(inds[2]) == ca.typical_treatment_value(inds[3]) == 1
+
+                    # Make sure we handle continuous, binary, and multi-class treatments
+                    # For multiple discrete treatments, one "always treat" value per non-default treatment
+                    for (idx, length) in [(0, 1), (1, 1), (2, 1), (3, 2)]:
+                        pto = ca._policy_tree_output(X, inds[idx])
+                        policy_val = pto.policy_value
+                        always_trt = pto.always_treat
+                        assert isinstance(pto.control_name, str)
+                        assert isinstance(always_trt, dict)
+                        assert np.array(policy_val).shape == ()
+                        assert len(always_trt) == length
+                        for val in always_trt.values():
+                            assert np.array(val).shape == ()
+
+                        # policy value should exceed always treating with any treatment
+                        assert_less_close(np.array(list(always_trt.values())), policy_val)
+
+                        ind_pol = ca.individualized_policy(X, inds[idx])
+
+                    if not classification:
+                        # ExitStack can be used as a "do nothing" ContextManager
+                        cm = ExitStack()
+                    else:
+                        cm = self.assertRaises(Exception)
+                    with cm:
+                        inf = ca.whatif(X[:2], np.ones(shape=(2,)), inds[1], y[:2])
+                        assert np.shape(inf.point_estimate) == np.shape(y[:2])
+                        inf = ca.whatif(X[:2], np.ones(shape=(2,)), inds[2], y[:2])
+                        assert np.shape(inf.point_estimate) == np.shape(y[:2])
 
-                    ca._whatif_dict(X[:2], np.ones(shape=(2,)), inds[1], y[:2])
-                    ca._whatif_dict(X[:2], np.ones(shape=(2,)), inds[1], y[:2], row_wise=True)
+                        ca._whatif_dict(X[:2], np.ones(shape=(2,)), inds[1], y[:2])
+                        ca._whatif_dict(X[:2], np.ones(shape=(2,)), inds[1], y[:2], row_wise=True)
 
-            badargs = [
-                (n_inds, n_cats, [4]),  # hinds out of range
-                (n_inds, n_cats, ["test"])  # hinds out of range
-            ]
+                badargs = [
+                    (n_inds, n_cats, [4]),  # hinds out of range
+                    (n_inds, n_cats, ["test"])  # hinds out of range
+                ]
 
-            for args in badargs:
-                with self.assertRaises(Exception):
-                    ca = CausalAnalysis(*args)
-                    ca.fit(X, y)
+                for args in badargs:
+                    with self.assertRaises(Exception):
+                        ca = CausalAnalysis(*args)
+                        ca.fit(X, y)
 
     def test_automl_first_stage(self):
         d_y = (1,)
@@ -294,6 +304,8 @@ def test_automl_first_stage(self):
                 # policy value should exceed always treating with any treatment
                 assert_less_close(np.array(list(always_trt.values())), policy_val)
 
+                ind_pol = ca.individualized_policy(X, inds[idx])
+
             # global shape is (d_y, sum(d_t))
             assert glo_point_est.shape == coh_point_est.shape == (1, 5)
             assert loc_point_est.shape == (2,) + glo_point_est.shape
@@ -436,6 +448,8 @@ def test_final_models(self):
                     # policy value should exceed always treating with any treatment
                     assert_less_close(np.array(list(always_trt.values())), policy_val)
 
+                    ind_pol = ca.individualized_policy(X, inds[idx])
+
                 if not classification:
                     # ExitStack can be used as a "do nothing" ContextManager
                     cm = ExitStack()
@@ -526,6 +540,8 @@ def test_forest_with_pandas(self):
                 # policy value should exceed always treating with any treatment
                 assert_less_close(np.array(list(always_trt.values())), policy_val)
 
+                ind_pol = ca.individualized_policy(X, inds[idx])
+
     def test_warm_start(self):
         for classification in [True, False]:
             # dgp
```
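Distilled from the updated tests, a minimal end-to-end sketch of the scenario this commit fixes; it is not a verbatim excerpt and assumes an econml build that includes this change:

```python
import numpy as np
import pandas as pd
from econml.solutions.causal_analysis import CausalAnalysis

# Treatment columns with an explicitly categorical dtype, as in test_basic_pandas.
y = pd.Series(np.random.choice([0, 1], size=(500,)))
X = pd.DataFrame({'a': np.random.normal(size=500),
                  'b': np.random.normal(size=500),
                  'c': np.random.choice([0, 1], size=500),
                  'd': np.random.choice(['a', 'b', 'c'], size=500)})
X['c'] = X['c'].astype('category')
X['d'] = X['d'].astype('category')

# feature columns, categorical columns, heterogeneity columns
ca = CausalAnalysis(['a', 'b', 'c', 'd'], ['c', 'd'], ['a', 'd'], classification=False)
ca.fit(X, y)

# Previously raised ValueError("Lengths must match.") for categorical columns.
ind_pol = ca.individualized_policy(X, 'd')
```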
