From 5722e6df769515a3ecd57ae82722443722400861 Mon Sep 17 00:00:00 2001 From: 1MLightyears <1MLightyears@gmail.com> Date: Sun, 3 Jan 2021 19:13:06 +0800 Subject: [PATCH 1/7] BUG: fix the bad error raised by HDFStore.put() --- pandas/io/pytables.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 341a8a9f90b96..54214e0919021 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4908,14 +4908,19 @@ def _maybe_convert_for_string_atom( # we cannot serialize this data, so report an exception on a column # by column basis - for i in range(len(block.shape[0])): + + # expected behaviour: + # search block for a non-string object column by column + for i in range(block.shape[0]): col = block.iget(i) inferred_type = lib.infer_dtype(col, skipna=False) if inferred_type != "string": - iloc = block.mgr_locs.indexer[i] + # it seems to be no approach for an ObjectBlock to get the + # column name for {col}, so the col number is given as a + # compromise. raise TypeError( - f"Cannot serialize the column [{iloc}] because\n" - f"its data contents are [{inferred_type}] object dtype" + f"Cannot serialize the column [{i}] because\n" + f"its data contents are not string but [{inferred_type}] object dtype" ) # itemsize is the maximum length of a string (along any dimension) From ba55c20d8f633d53b765da385966ee62575fdfa4 Mon Sep 17 00:00:00 2001 From: 1MLightyears <1MLightyears@gmail.com> Date: Sun, 3 Jan 2021 20:02:44 +0800 Subject: [PATCH 2/7] fix flake8:line too long --- pandas/io/pytables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 54214e0919021..c448b7e70d3d1 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4919,8 +4919,8 @@ def _maybe_convert_for_string_atom( # column name for {col}, so the col number is given as a # compromise. raise TypeError( - f"Cannot serialize the column [{i}] because\n" - f"its data contents are not string but [{inferred_type}] object dtype" + f"Cannot serialize the column [{i}] because its data contents \n" + f"are not string but [{inferred_type}] object dtype" ) # itemsize is the maximum length of a string (along any dimension) From 06ee87cde00e1952a94d8ee34b6ed83720b6012a Mon Sep 17 00:00:00 2001 From: 1MLightyears <1MLightyears@gmail.com> Date: Mon, 4 Jan 2021 19:49:53 +0800 Subject: [PATCH 3/7] now column label can be showed in error message; update test cases --- pandas/io/pytables.py | 16 ++++++++++------ pandas/tests/io/pytables/test_store.py | 6 ++++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index c448b7e70d3d1..28fd41e1013c4 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3916,6 +3916,7 @@ def get_blk_items(mgr, blocks): nan_rep=nan_rep, encoding=self.encoding, errors=self.errors, + block_columns=b_items ) adj_name = _maybe_adjust_name(new_name, self.version) @@ -4875,8 +4876,10 @@ def _unconvert_index( def _maybe_convert_for_string_atom( - name: str, block, existing_col, min_itemsize, nan_rep, encoding, errors + name: str, block, existing_col, min_itemsize, nan_rep, encoding, errors, + block_columns: List[str] = [] ): + # block_columns(list[str]): the label of columns for debug info use. if not block.is_object: return block.values @@ -4915,12 +4918,13 @@ def _maybe_convert_for_string_atom( col = block.iget(i) inferred_type = lib.infer_dtype(col, skipna=False) if inferred_type != "string": - # it seems to be no approach for an ObjectBlock to get the - # column name for {col}, so the col number is given as a - # compromise. + error_column_label = block_columns[i] \ + if len(block_columns) > i \ + else f"No.{i}" raise TypeError( - f"Cannot serialize the column [{i}] because its data contents \n" - f"are not string but [{inferred_type}] object dtype" + f"Cannot serialize the column [{error_column_label}]\n" + f"because its data contents are not [string] but " + f"[{inferred_type}] object dtype" ) # itemsize is the maximum length of a string (along any dimension) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 3f4c21389daed..bfef505c77086 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -2055,7 +2055,8 @@ def test_append_raise(self, setup_path): df = tm.makeDataFrame() df["invalid"] = [["a"]] * len(df) assert df.dtypes["invalid"] == np.object_ - msg = re.escape("object of type 'int' has no len()") + msg = re.escape("""Cannot serialize the column [invalid] +because its data contents are not [string] but [mixed] object dtype""") with pytest.raises(TypeError, match=msg): store.append("df", df) @@ -2221,7 +2222,8 @@ def test_unimplemented_dtypes_table_columns(self, setup_path): with ensure_clean_store(setup_path) as store: # this fails because we have a date in the object block...... - msg = "object of type 'int' has no len()" + msg = re.escape("""Cannot serialize the column [datetime1] +because its data contents are not [string] but [date] object dtype""") with pytest.raises(TypeError, match=msg): store.append("df_unimplemented", df) From 38cf514af8e33d5715303a132f507f956a2698f7 Mon Sep 17 00:00:00 2001 From: 1MLightyears <1MLightyears@gmail.com> Date: Mon, 4 Jan 2021 21:01:34 +0800 Subject: [PATCH 4/7] fix format for black --- pandas/io/pytables.py | 18 ++++++++++++------ pandas/tests/io/pytables/test_store.py | 12 ++++++++---- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 28fd41e1013c4..c5c3e1f7700b7 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3916,7 +3916,7 @@ def get_blk_items(mgr, blocks): nan_rep=nan_rep, encoding=self.encoding, errors=self.errors, - block_columns=b_items + block_columns=b_items, ) adj_name = _maybe_adjust_name(new_name, self.version) @@ -4876,8 +4876,14 @@ def _unconvert_index( def _maybe_convert_for_string_atom( - name: str, block, existing_col, min_itemsize, nan_rep, encoding, errors, - block_columns: List[str] = [] + name: str, + block, + existing_col, + min_itemsize, + nan_rep, + encoding, + errors, + block_columns: List[str] = [], ): # block_columns(list[str]): the label of columns for debug info use. if not block.is_object: @@ -4918,9 +4924,9 @@ def _maybe_convert_for_string_atom( col = block.iget(i) inferred_type = lib.infer_dtype(col, skipna=False) if inferred_type != "string": - error_column_label = block_columns[i] \ - if len(block_columns) > i \ - else f"No.{i}" + error_column_label = ( + block_columns[i] if len(block_columns) > i else f"No.{i}" + ) raise TypeError( f"Cannot serialize the column [{error_column_label}]\n" f"because its data contents are not [string] but " diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index a7608c66a82e5..3f0fd6e7483f8 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -2055,8 +2055,10 @@ def test_append_raise(self, setup_path): df = tm.makeDataFrame() df["invalid"] = [["a"]] * len(df) assert df.dtypes["invalid"] == np.object_ - msg = re.escape("""Cannot serialize the column [invalid] -because its data contents are not [string] but [mixed] object dtype""") + msg = re.escape( + """Cannot serialize the column [invalid] +because its data contents are not [string] but [mixed] object dtype""" + ) with pytest.raises(TypeError, match=msg): store.append("df", df) @@ -2222,8 +2224,10 @@ def test_unimplemented_dtypes_table_columns(self, setup_path): with ensure_clean_store(setup_path) as store: # this fails because we have a date in the object block...... - msg = re.escape("""Cannot serialize the column [datetime1] -because its data contents are not [string] but [date] object dtype""") + msg = re.escape( + """Cannot serialize the column [datetime1] +because its data contents are not [string] but [date] object dtype""" + ) with pytest.raises(TypeError, match=msg): store.append("df_unimplemented", df) From d9f330a67c62ab295e47463ff98492fc5ed9e866 Mon Sep 17 00:00:00 2001 From: 1MLightyears <1MLightyears@gmail.com> Date: Mon, 4 Jan 2021 22:52:14 +0800 Subject: [PATCH 5/7] add docstring; update to whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/io/pytables.py | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index af11b6543a74b..3442575146e8d 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -271,6 +271,7 @@ I/O - Allow custom error values for parse_dates argument of :func:`read_sql`, :func:`read_sql_query` and :func:`read_sql_table` (:issue:`35185`) - Bug in :func:`to_hdf` raising ``KeyError`` when trying to apply for subclasses of ``DataFrame`` or ``Series`` (:issue:`33748`). +- Bug in :func:`put` raising a wrong ``TypeError`` when saving a DataFrame with non-string dtype (:issue:`34274`) - Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`) - Bug in :func:`read_excel` forward filling :class:`MultiIndex` names with multiple header and index columns specified (:issue:`34673`) - :func:`pandas.read_excel` now respects :func:``pandas.set_option`` (:issue:`34252`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index c5c3e1f7700b7..38fbc660b209c 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4883,9 +4883,27 @@ def _maybe_convert_for_string_atom( nan_rep, encoding, errors, - block_columns: List[str] = [], + block_columns: List[str], ): - # block_columns(list[str]): the label of columns for debug info use. + """ + Ensure all elements in the given block are "string". + + Parameter + ------ + name(str): + block: + existing_col: + min_itemsize: + nan_rep: + encoding: + errors: + block_columns(list[str]): the label of columns for debug info use. + + Return + ------ + Value of a block, if its dtype is object and constructed by "string" + elements. + """ if not block.is_object: return block.values From 6a93c8e5a507cbc5f4553938ae6514c4db2f7d59 Mon Sep 17 00:00:00 2001 From: 1MLightyears <1MLightyears@gmail.com> Date: Mon, 4 Jan 2021 22:59:24 +0800 Subject: [PATCH 6/7] docstring of _maybe_convert_for_string_atom() removed --- pandas/io/pytables.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 38fbc660b209c..40d4018940b59 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4885,25 +4885,6 @@ def _maybe_convert_for_string_atom( errors, block_columns: List[str], ): - """ - Ensure all elements in the given block are "string". - - Parameter - ------ - name(str): - block: - existing_col: - min_itemsize: - nan_rep: - encoding: - errors: - block_columns(list[str]): the label of columns for debug info use. - - Return - ------ - Value of a block, if its dtype is object and constructed by "string" - elements. - """ if not block.is_object: return block.values From 4778ba95b6775fddaa5fa1c0d9d35725ebc8d697 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 4 Jan 2021 19:13:09 -0500 Subject: [PATCH 7/7] Update doc/source/whatsnew/v1.3.0.rst --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 3442575146e8d..288be390e0c18 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -271,7 +271,7 @@ I/O - Allow custom error values for parse_dates argument of :func:`read_sql`, :func:`read_sql_query` and :func:`read_sql_table` (:issue:`35185`) - Bug in :func:`to_hdf` raising ``KeyError`` when trying to apply for subclasses of ``DataFrame`` or ``Series`` (:issue:`33748`). -- Bug in :func:`put` raising a wrong ``TypeError`` when saving a DataFrame with non-string dtype (:issue:`34274`) +- Bug in :meth:`~HDFStore.put` raising a wrong ``TypeError`` when saving a DataFrame with non-string dtype (:issue:`34274`) - Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`) - Bug in :func:`read_excel` forward filling :class:`MultiIndex` names with multiple header and index columns specified (:issue:`34673`) - :func:`pandas.read_excel` now respects :func:``pandas.set_option`` (:issue:`34252`)