diff --git a/doc/source/release.rst b/doc/source/release.rst index 4e6ac7240512c..f492570c9bb0b 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -109,6 +109,9 @@ pandas 0.13 be raised if you try to use ``mode='w'`` with an OPEN file handle (:issue:`4367`) - allow a passed locations array or mask as a ``where`` condition (:issue:`4467`) - the ``fmt`` keyword now replaces the ``table`` keyword; allowed values are ``s|t`` + - add the keyword ``dropna=True`` to ``append`` to control whether ALL nan rows are written + to the store (default is ``True``, ALL nan rows are NOT written), also settable + via the option ``io.hdf.dropna_table`` (:issue:`4625`) - ``JSON`` - added ``date_unit`` parameter to specify resolution of timestamps. Options diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index 022799cd88014..f8a565157a04c 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -98,6 +98,9 @@ API changes import os os.remove(path) + - add the keyword ``dropna=True`` to ``append`` to control whether ALL nan rows are written + to the store (default is ``True``, ALL nan rows are NOT written), also settable + via the option ``io.hdf.dropna_table`` (:issue:`4625`) - Changes to how ``Index`` and ``MultiIndex`` handle metadata (``levels``, ``labels``, and ``names``) (:issue:`4039`): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 1eb8b0f266f68..33921b7e534e5 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -32,6 +32,7 @@ from pandas.tools.merge import concat from pandas import compat from pandas.io.common import PerformanceWarning +from pandas.core.config import get_option import pandas.lib as lib import pandas.algos as algos @@ -165,6 +166,17 @@ class DuplicateWarning(Warning): Panel4D: [1, 2, 3], } +# register our configuration options +from pandas.core import config +dropna_doc = """ +: boolean + drop ALL nan rows when appending to a table +""" + +with config.config_prefix('io.hdf'): + 
config.register_option('dropna_table', True, dropna_doc, + validator=config.is_bool) + # oh the troubles to reduce import time _table_mod = None _table_supports_index = False @@ -730,7 +742,7 @@ def remove(self, key, where=None, start=None, stop=None): 'can only remove with where on objects written as tables') return s.delete(where=where, start=start, stop=stop) - def append(self, key, value, fmt=None, append=True, columns=None, **kwargs): + def append(self, key, value, fmt=None, append=True, columns=None, dropna=None, **kwargs): """ Append to Table in file. Node must already exist and be Table format. @@ -751,7 +763,8 @@ def append(self, key, value, fmt=None, append=True, columns=None, **kwargs): chunksize : size to chunk the writing expectedrows : expected TOTAL row size of this table encoding : default None, provide an encoding for strings - + dropna : boolean, default True, do not write an ALL nan row to the store + settable by the option 'io.hdf.dropna_table' Notes ----- Does *not* check if data being appended overlaps with existing @@ -761,8 +774,10 @@ def append(self, key, value, fmt=None, append=True, columns=None, **kwargs): raise Exception( "columns is not a supported keyword in append, try data_columns") + if dropna is None: + dropna = get_option("io.hdf.dropna_table") kwargs = self._validate_format(fmt or 't', kwargs) - self._write_to_group(key, value, append=append, **kwargs) + self._write_to_group(key, value, append=append, dropna=dropna, **kwargs) def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, **kwargs): """ @@ -3219,7 +3234,7 @@ class AppendableTable(LegacyTable): def write(self, obj, axes=None, append=False, complib=None, complevel=None, fletcher32=None, min_itemsize=None, chunksize=None, - expectedrows=None, **kwargs): + expectedrows=None, dropna=True, **kwargs): if not append and self.is_exists: self._handle.removeNode(self.group, 'table') @@ -3254,29 +3269,36 @@ def write(self, obj, axes=None, append=False, 
complib=None, a.validate_and_set(table, append) # add the rows - self.write_data(chunksize) + self.write_data(chunksize, dropna=dropna) - def write_data(self, chunksize): + def write_data(self, chunksize, dropna=True): """ we form the data into a 2-d including indexes,values,mask write chunk-by-chunk """ names = self.dtype.names nrows = self.nrows_expected - # create the masks & values - masks = [] - for a in self.values_axes: + # if dropna==True, then drop ALL nan rows + if dropna: + + masks = [] + for a in self.values_axes: + + # figure the mask: only do if we can successfully process this + # column, otherwise ignore the mask + mask = com.isnull(a.data).all(axis=0) + masks.append(mask.astype('u1')) - # figure the mask: only do if we can successfully process this - # column, otherwise ignore the mask - mask = com.isnull(a.data).all(axis=0) - masks.append(mask.astype('u1')) + # consolidate masks + mask = masks[0] + for m in masks[1:]: + mask = mask & m + mask = mask.ravel() + + else: - # consolidate masks - mask = masks[0] - for m in masks[1:]: - mask = mask & m - mask = mask.ravel() + mask = np.empty(nrows, dtype='u1') + mask.fill(False) # broadcast the indexes if needed indexes = [a.cvalues for a in self.index_axes] diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index e2d9235510f83..ab7e5cf813b24 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -757,19 +757,40 @@ def test_append_some_nans(self): store.append('df3', df3[10:]) tm.assert_frame_equal(store['df3'], df3) - ##### THIS IS A BUG, should not drop these all-nan rows - ##### BUT need to store the index which we don't want to do.... 
- # nan some entire rows + def test_append_all_nans(self): + + with ensure_clean(self.path) as store: + df = DataFrame({'A1' : np.random.randn(20), 'A2' : np.random.randn(20)}, index=np.arange(20)) + df.ix[0:15,:] = np.nan + + + # nan some entire rows (dropna=True) + _maybe_remove(store, 'df') + store.append('df', df[:10], dropna=True) + store.append('df', df[10:], dropna=True) + tm.assert_frame_equal(store['df'], df[-4:]) + + # nan some entire rows (dropna=False) + _maybe_remove(store, 'df2') + store.append('df2', df[:10], dropna=False) + store.append('df2', df[10:], dropna=False) + tm.assert_frame_equal(store['df2'], df) + + # tests the option io.hdf.dropna_table + pandas.set_option('io.hdf.dropna_table',False) + _maybe_remove(store, 'df3') + store.append('df3', df[:10]) + store.append('df3', df[10:]) + tm.assert_frame_equal(store['df3'], df) + pandas.set_option('io.hdf.dropna_table',True) _maybe_remove(store, 'df4') - df.ix[0:15,:] = np.nan store.append('df4', df[:10]) store.append('df4', df[10:]) tm.assert_frame_equal(store['df4'], df[-4:]) - self.assert_(store.get_storer('df4').nrows == 4) # nan some entire rows (string are still written!) df = DataFrame({'A1' : np.random.randn(20), @@ -777,12 +798,17 @@ def test_append_some_nans(self): 'B' : 'foo', 'C' : 'bar'}, index=np.arange(20)) - _maybe_remove(store, 'df5') df.ix[0:15,:] = np.nan - store.append('df5', df[:10]) - store.append('df5', df[10:]) - tm.assert_frame_equal(store['df5'], df) - self.assert_(store.get_storer('df5').nrows == 20) + + _maybe_remove(store, 'df') + store.append('df', df[:10], dropna=True) + store.append('df', df[10:], dropna=True) + tm.assert_frame_equal(store['df'], df) + + _maybe_remove(store, 'df2') + store.append('df2', df[:10], dropna=False) + store.append('df2', df[10:], dropna=False) + tm.assert_frame_equal(store['df2'], df) # nan some entire rows (but since we have dates they are still written!) 
df = DataFrame({'A1' : np.random.randn(20), @@ -790,12 +816,17 @@ def test_append_some_nans(self): 'B' : 'foo', 'C' : 'bar', 'D' : Timestamp("20010101"), 'E' : datetime.datetime(2001,1,2,0,0) }, index=np.arange(20)) - _maybe_remove(store, 'df6') df.ix[0:15,:] = np.nan - store.append('df6', df[:10]) - store.append('df6', df[10:]) - tm.assert_frame_equal(store['df6'], df) - self.assert_(store.get_storer('df6').nrows == 20) + + _maybe_remove(store, 'df') + store.append('df', df[:10], dropna=True) + store.append('df', df[10:], dropna=True) + tm.assert_frame_equal(store['df'], df) + + _maybe_remove(store, 'df2') + store.append('df2', df[:10], dropna=False) + store.append('df2', df[10:], dropna=False) + tm.assert_frame_equal(store['df2'], df) def test_append_frame_column_oriented(self):