From 9f3861271e7a5000d80864fa9e7a783a4a8d5a8b Mon Sep 17 00:00:00 2001 From: TomAugspurger Date: Thu, 6 Feb 2014 15:41:22 -0600 Subject: [PATCH 1/2] BUG: preserve dtypes in interpolate --- doc/source/release.rst | 7 +++- doc/source/v0.14.0.txt | 3 ++ pandas/core/generic.py | 6 +-- pandas/tests/test_generic.py | 79 ++++++++++++++++++++++++++++-------- vb_suite/frame_methods.py | 26 ++++++++++++ 5 files changed, 98 insertions(+), 23 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 6e1632f036f38..31d3b88094d37 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -66,7 +66,8 @@ API Changes - ``df['col'] = value`` and ``df.loc[:,'col'] = value`` are now completely equivalent; previously the ``.loc`` would not necessarily coerce the dtype of the resultant series (:issue:`6149`) - ``dtypes`` and ``ftypes`` now return a series with ``dtype=object`` on empty containers (:issue:`5740`) - +- The ``interpolate`` ``downcast`` keyword default has been changed from ``infer`` to + ``None``. This is to preserve the original dtype unless explicitly requested otherwise (:issue:`6290`). Experimental Features ~~~~~~~~~~~~~~~~~~~~~ @@ -115,12 +116,16 @@ Bug Fixes - TimeGrouper has a more compatible API to the rest of the groupers (e.g. ``groups`` was missing) (:issue:`3881`) - Bug in ``pd.eval`` when parsing strings with possible tokens like ``'&'`` (:issue:`6351`) +<<<<<<< HEAD - Bug correctly handle placements of ``-inf`` in Panels when dividing by integer 0 (:issue:`6178`) - ``DataFrame.shift`` with ``axis=1`` was raising (:issue:`6371`) - Disabled clipboard tests until release time (run locally with ``nosetests -A disabled`` (:issue:`6048`). - Bug in ``DataFrame.replace()`` when passing a nested ``dict`` that contained keys not in the values to be replaced (:issue:`6342`) - Bug in take with duplicate columns not consolidated (:issue:`6240`) +======= +- Bug in interpolate changing dtypes (:issue:`6290`) +>>>>>>> 336b309... 
BUG: preserve dtypes in interpolate pandas 0.13.1 ------------- diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 58ae5084c4827..a3839542dafcc 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -29,6 +29,9 @@ API changes df.iloc[:,2:3] df.iloc[:,1:3] +- The ``DataFrame.interpolate()`` ``downcast`` keyword default has been changed from ``infer`` to + ``None``. This is to preserve the original dtype unless explicitly requested otherwise (:issue:`6290`). + MultiIndexing Using Slicers ~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f8dbe079610c0..b9ffeb636615b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2435,7 +2435,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, return self._constructor(new_data).__finalize__(self) def interpolate(self, method='linear', axis=0, limit=None, inplace=False, - downcast='infer', **kwargs): + downcast=None, **kwargs): """ Interpolate values according to different methods. @@ -2468,7 +2468,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, Maximum number of consecutive NaNs to fill. inplace : bool, default False Update the NDFrame in place if possible. - downcast : optional, 'infer' or None, defaults to 'infer' + downcast : optional, 'infer' or None, defaults to None Downcast dtypes if possible. 
Returns @@ -2492,7 +2492,6 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, dtype: float64 """ - if self.ndim > 2: raise NotImplementedError("Interpolate has not been implemented " "on Panel and Panel 4D objects.") @@ -2534,7 +2533,6 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, inplace=inplace, downcast=downcast, **kwargs) - if inplace: if axis == 1: self._update_inplace(new_data) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index d694efff9b351..7e4b23b633477 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -459,7 +459,10 @@ def test_interpolate(self): self.assert_numpy_array_equal(time_interp, ord_ts) # try time interpolation on a non-TimeSeries - self.assertRaises(ValueError, self.series.interpolate, method='time') + # Only raises ValueError if there are NaNs. + non_ts = self.series.copy() + non_ts[0] = np.NaN + self.assertRaises(ValueError, non_ts.interpolate, method='time') def test_interp_regression(self): _skip_if_no_scipy() @@ -512,7 +515,7 @@ def test_interpolate_non_ts(self): def test_nan_interpolate(self): s = Series([0, 1, np.nan, 3]) result = s.interpolate() - expected = Series([0, 1, 2, 3]) + expected = Series([0., 1., 2., 3.]) assert_series_equal(result, expected) _skip_if_no_scipy() @@ -522,20 +525,20 @@ def test_nan_interpolate(self): def test_nan_irregular_index(self): s = Series([1, 2, np.nan, 4], index=[1, 3, 5, 9]) result = s.interpolate() - expected = Series([1, 2, 3, 4], index=[1, 3, 5, 9]) + expected = Series([1., 2., 3., 4.], index=[1, 3, 5, 9]) assert_series_equal(result, expected) def test_nan_str_index(self): s = Series([0, 1, 2, np.nan], index=list('abcd')) result = s.interpolate() - expected = Series([0, 1, 2, 2], index=list('abcd')) + expected = Series([0., 1., 2., 2.], index=list('abcd')) assert_series_equal(result, expected) def test_interp_quad(self): _skip_if_no_scipy() sq = Series([1, 4, np.nan, 16], index=[1, 2, 
3, 4]) result = sq.interpolate(method='quadratic') - expected = Series([1, 4, 9, 16], index=[1, 2, 3, 4]) + expected = Series([1., 4., 9., 16.], index=[1, 2, 3, 4]) assert_series_equal(result, expected) def test_interp_scipy_basic(self): @@ -545,18 +548,30 @@ def test_interp_scipy_basic(self): expected = Series([1., 3., 7.5, 12., 18.5, 25.]) result = s.interpolate(method='slinear') assert_series_equal(result, expected) + + result = s.interpolate(method='slinear', donwcast='infer') + assert_series_equal(result, expected) # nearest expected = Series([1, 3, 3, 12, 12, 25]) result = s.interpolate(method='nearest') + assert_series_equal(result, expected.astype('float')) + + result = s.interpolate(method='nearest', downcast='infer') assert_series_equal(result, expected) # zero expected = Series([1, 3, 3, 12, 12, 25]) result = s.interpolate(method='zero') + assert_series_equal(result, expected.astype('float')) + + result = s.interpolate(method='zero', downcast='infer') assert_series_equal(result, expected) # quadratic expected = Series([1, 3., 6.769231, 12., 18.230769, 25.]) result = s.interpolate(method='quadratic') assert_series_equal(result, expected) + + result = s.interpolate(method='quadratic', downcast='infer') + assert_series_equal(result, expected) # cubic expected = Series([1., 3., 6.8, 12., 18.2, 25.]) result = s.interpolate(method='cubic') @@ -585,7 +600,6 @@ def test_interp_multiIndex(self): expected = s.copy() expected.loc[2] = 2 - expected = expected.astype(np.int64) result = s.interpolate() assert_series_equal(result, expected) @@ -595,7 +609,7 @@ def test_interp_multiIndex(self): def test_interp_nonmono_raise(self): _skip_if_no_scipy() - s = pd.Series([1, 2, 3], index=[0, 2, 1]) + s = Series([1, np.nan, 3], index=[0, 2, 1]) with tm.assertRaises(ValueError): s.interpolate(method='krogh') @@ -603,7 +617,7 @@ def test_interp_datetime64(self): _skip_if_no_scipy() df = Series([1, np.nan, 3], index=date_range('1/1/2000', periods=3)) result = 
df.interpolate(method='nearest') - expected = Series([1, 1, 3], index=date_range('1/1/2000', periods=3)) + expected = Series([1., 1., 3.], index=date_range('1/1/2000', periods=3)) assert_series_equal(result, expected) class TestDataFrame(tm.TestCase, Generic): @@ -639,7 +653,7 @@ def test_get_numeric_data_preserve_dtype(self): def test_interp_basic(self): df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [1, 4, 9, np.nan], 'C': [1, 2, 3, 5], 'D': list('abcd')}) - expected = DataFrame({'A': [1, 2, 3, 4], 'B': [1, 4, 9, 9], + expected = DataFrame({'A': [1., 2., 3., 4.], 'B': [1., 4., 9., 9.], 'C': [1, 2, 3, 5], 'D': list('abcd')}) result = df.interpolate() assert_frame_equal(result, expected) @@ -648,8 +662,6 @@ def test_interp_basic(self): expected = df.set_index('C') expected.A.loc[3] = 3 expected.B.loc[5] = 9 - expected[['A', 'B']] = expected[['A', 'B']].astype(np.int64) - assert_frame_equal(result, expected) def test_interp_bad_method(self): @@ -663,9 +675,14 @@ def test_interp_combo(self): 'C': [1, 2, 3, 5], 'D': list('abcd')}) result = df['A'].interpolate() + expected = Series([1., 2., 3., 4.]) + assert_series_equal(result, expected) + + result = df['A'].interpolate(downcast='infer') expected = Series([1, 2, 3, 4]) assert_series_equal(result, expected) + def test_interp_nan_idx(self): df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]}) df = df.set_index('A') @@ -722,13 +739,16 @@ def test_interp_alt_scipy(self): expected = df.copy() expected['A'].iloc[2] = 3 expected['A'].iloc[5] = 6 + assert_frame_equal(result, expected) + + result = df.interpolate(method='barycentric', downcast='infer') assert_frame_equal(result, expected.astype(np.int64)) result = df.interpolate(method='krogh') expectedk = df.copy() - expectedk['A'].iloc[2] = 3 - expectedk['A'].iloc[5] = 6 - expectedk['A'] = expected['A'].astype(np.int64) + # expectedk['A'].iloc[2] = 3 + # expectedk['A'].iloc[5] = 6 + expectedk['A'] = expected['A'] assert_frame_equal(result, expectedk) 
_skip_if_no_pchip() @@ -786,9 +806,32 @@ def test_interp_raise_on_only_mixed(self): def test_interp_inplace(self): df = DataFrame({'a': [1., 2., np.nan, 4.]}) - expected = DataFrame({'a': [1, 2, 3, 4]}) - df['a'].interpolate(inplace=True) - assert_frame_equal(df, expected) + expected = DataFrame({'a': [1., 2., 3., 4.]}) + result = df.copy() + result['a'].interpolate(inplace=True) + assert_frame_equal(result, expected) + + result = df.copy() + result['a'].interpolate(inplace=True, downcast='infer') + assert_frame_equal(result, expected.astype('int')) + + def test_interp_ignore_all_good(self): + # GH + df = DataFrame({'A': [1, 2, np.nan, 4], + 'B': [1, 2, 3, 4], + 'C': [1., 2., np.nan, 4.], + 'D': [1., 2., 3., 4.]}) + expected = DataFrame({'A': np.array([1, 2, 3, 4], dtype='float'), + 'B': np.array([1, 2, 3, 4], dtype='int'), + 'C': np.array([1., 2., 3, 4.], dtype='float'), + 'D': np.array([1., 2., 3., 4.], dtype='float')}) + + result = df.interpolate(downcast=None) + assert_frame_equal(result, expected) + + # all good + result = df[['B', 'D']].interpolate(downcast=None) + assert_frame_equal(result, df[['B', 'D']]) def test_no_order(self): _skip_if_no_scipy() @@ -802,7 +845,7 @@ def test_spline(self): _skip_if_no_scipy() s = Series([1, 2, np.nan, 4, 5, np.nan, 7]) result = s.interpolate(method='spline', order=1) - expected = Series([1, 2, 3, 4, 5, 6, 7]) + expected = Series([1., 2., 3., 4., 5., 6., 7.]) assert_series_equal(result, expected) def test_metadata_propagation_indiv(self): diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py index e658ce75247b4..a70d756c82b0a 100644 --- a/vb_suite/frame_methods.py +++ b/vb_suite/frame_methods.py @@ -403,3 +403,29 @@ def test_unequal(name): frame_object_unequal = Benchmark('test_unequal("object_df")', setup) frame_nonunique_unequal = Benchmark('test_unequal("nonunique_cols")', setup) +#----------------------------------------------------------------------------- +# interpolate +# this is the worst case, where 
every column has NaNs. +setup = common_setup + """ +df = DataFrame(randn(10000, 100)) +df.values[::2] = np.nan +""" + +frame_interpolate = Benchmark('df.interpolate()', setup, + start_date=datetime(2014, 2, 7)) + +setup = common_setup + """ +df = DataFrame({'A': np.arange(0, 10000), + 'B': np.random.randint(0, 100, 10000), + 'C': randn(10000), + 'D': randn(10000)}) +df.loc[1::5, 'A'] = np.nan +df.loc[1::5, 'C'] = np.nan +""" + +frame_interpolate_some_good = Benchmark('df.interpolate()', setup, + start_date=datetime(2014, 2, 7)) +frame_interpolate_some_good_infer = Benchmark('df.interpolate(downcast="infer")', + setup, + start_date=datetime(2014, 2, 7)) + From 8d8d7a3cc6fd3a8e761be77f62e3b621fd63a306 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 14 Feb 2014 10:36:23 -0600 Subject: [PATCH 2/2] check in interp_with_fill too --- doc/source/release.rst | 3 --- pandas/core/internals.py | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 31d3b88094d37..965ee1dc8e8d9 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -116,16 +116,13 @@ Bug Fixes - TimeGrouper has a more compatible API to the rest of the groupers (e.g. ``groups`` was missing) (:issue:`3881`) - Bug in ``pd.eval`` when parsing strings with possible tokens like ``'&'`` (:issue:`6351`) -<<<<<<< HEAD - Bug correctly handle placements of ``-inf`` in Panels when dividing by integer 0 (:issue:`6178`) - ``DataFrame.shift`` with ``axis=1`` was raising (:issue:`6371`) - Disabled clipboard tests until release time (run locally with ``nosetests -A disabled`` (:issue:`6048`). - Bug in ``DataFrame.replace()`` when passing a nested ``dict`` that contained keys not in the values to be replaced (:issue:`6342`) - Bug in take with duplicate columns not consolidated (:issue:`6240`) -======= - Bug in interpolate changing dtypes (:issue:`6290`) ->>>>>>> 336b309... 
BUG: preserve dtypes in interpolate pandas 0.13.1 ------------- diff --git a/pandas/core/internals.py b/pandas/core/internals.py index c89aac0fa7923..e68fc8da6a5db 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -805,6 +805,15 @@ def interpolate(self, method='pad', axis=0, index=None, values=None, inplace=False, limit=None, fill_value=None, coerce=False, downcast=None, **kwargs): + def check_int_bool(self, inplace): + # Only FloatBlocks will contain NaNs. + # timedelta subclasses IntBlock + if (self.is_bool or self.is_integer) and not self.is_timedelta: + if inplace: + return self + else: + return self.copy() + # a fill na type method try: m = com._clean_fill_method(method) @@ -812,6 +821,9 @@ def interpolate(self, method='pad', axis=0, index=None, m = None if m is not None: + r = check_int_bool(self, inplace) + if r is not None: + return r return self._interpolate_with_fill(method=m, axis=axis, inplace=inplace, @@ -826,6 +838,9 @@ def interpolate(self, method='pad', axis=0, index=None, m = None if m is not None: + r = check_int_bool(self, inplace) + if r is not None: + return r return self._interpolate(method=m, index=index, values=values,