From b9ecdb6005f39b6e7c1f1ab70cc2e6529a9bfa96 Mon Sep 17 00:00:00 2001 From: nakatomotoi Date: Wed, 8 Sep 2021 23:02:05 +0900 Subject: [PATCH 01/11] TST: add test to read zero-chunked array --- pandas/tests/io/test_parquet.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 01715ee133e96..dc6014bce815a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -931,6 +931,14 @@ def test_read_parquet_manager(self, pa, using_array_manager): else: assert isinstance(result._mgr, pd.core.internals.BlockManager) + def test_read_zero_chunked_array(self, pa): + df = pd.DataFrame({"value": pd.array([], dtype=pd.Int64Dtype())}) + print(pa) + with tm.ensure_clean() as path: + df.to_parquet(path, pa) + result = read_parquet(path, pa) + check_round_trip(df, pa, expected=result) + class TestParquetFastParquet(Base): def test_basic(self, fp, df_full): From 427eee15240cb98d863824ee8fe17e80d4edc1bd Mon Sep 17 00:00:00 2001 From: nakatomotoi Date: Thu, 9 Sep 2021 02:18:39 +0900 Subject: [PATCH 02/11] Modify: change test function --- pandas/tests/io/test_parquet.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index dc6014bce815a..4af72a529d9d5 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -933,11 +933,10 @@ def test_read_parquet_manager(self, pa, using_array_manager): def test_read_zero_chunked_array(self, pa): df = pd.DataFrame({"value": pd.array([], dtype=pd.Int64Dtype())}) - print(pa) with tm.ensure_clean() as path: df.to_parquet(path, pa) result = read_parquet(path, pa) - check_round_trip(df, pa, expected=result) + tm.assert_frame_equal(result, df) class TestParquetFastParquet(Base): From 6419677c2f9917f853bf3a8593d3b121867a73bf Mon Sep 17 00:00:00 2001 From: nakatomotoi Date: Sun, 12 Sep 2021 15:25:37 +0900 Subject: [PATCH 03/11] Modify: add several dtypes --- pandas/tests/io/test_parquet.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 4af72a529d9d5..be98467f4b19a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -931,12 +931,16 @@ def test_read_parquet_manager(self, pa, using_array_manager): else: assert isinstance(result._mgr, pd.core.internals.BlockManager) - def test_read_zero_chunked_array(self, pa): - df = pd.DataFrame({"value": pd.array([], dtype=pd.Int64Dtype())}) - with tm.ensure_clean() as path: - df.to_parquet(path, pa) - result = read_parquet(path, pa) - tm.assert_frame_equal(result, df) + def test_read_empty_array(self, pa): + df = pd.DataFrame( + { + "a": pd.array([], dtype="Int64"), + "b": pd.array([], dtype="UInt8"), + "c": pd.array([], dtype="string"), + "d": pd.array([], dtype="boolean"), + } + ) + check_round_trip(df, pa) class TestParquetFastParquet(Base): From a48a0a58d1f4fb0c7083ae2a2bdcbcaec8f25e04 Mon Sep 17 00:00:00 2001 From: nakatomotoi Date: Tue, 14 Sep 2021 23:05:15 +0900 Subject: [PATCH 04/11] Modify: set dtypes by @pytest.mark.parametrize --- pandas/tests/io/test_parquet.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index be98467f4b19a..8018ff1d0a5e8 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -931,13 +931,13 @@ def test_read_parquet_manager(self, pa, using_array_manager): else: assert isinstance(result._mgr, pd.core.internals.BlockManager) - def test_read_empty_array(self, pa): + @pytest.mark.parametrize( + "dtype", ["int64", "float64", "object", "datetime64[ns, UTC]", "bool"] + ) + def test_read_empty_array(self, pa, dtype): df = pd.DataFrame( { - "a": pd.array([], dtype="Int64"), - "b": pd.array([], dtype="UInt8"), - "c": pd.array([], dtype="string"), - "d": pd.array([], dtype="boolean"), + "value": pd.array([], dtype=dtype), } ) check_round_trip(df, pa) From e72733cb629fe6ffa46adc4d226bab2b260edad3 Mon Sep 17 00:00:00 2001 From: nakatomotoi Date: Wed, 15 Sep 2021 08:19:40 +0900 Subject: [PATCH 05/11] Modify: add Int64 and UInt8 --- pandas/tests/io/test_parquet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 8018ff1d0a5e8..6d282322fa8a2 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -932,7 +932,8 @@ def test_read_parquet_manager(self, pa, using_array_manager): assert isinstance(result._mgr, pd.core.internals.BlockManager) @pytest.mark.parametrize( - "dtype", ["int64", "float64", "object", "datetime64[ns, UTC]", "bool"] + "dtype", + ["int64", "float64", "object", "datetime64[ns, UTC]", "bool", "Int64", "UInt8"], ) def test_read_empty_array(self, pa, dtype): df = pd.DataFrame( From 38fa455325c8dbf9d95a99a698694c5d8a8f5131 Mon Sep 17 00:00:00 2001 From: nakatomotoi Date: Wed, 15 Sep 2021 09:16:32 +0900 Subject: [PATCH 06/11] Modify: add comment of original issue number --- pandas/tests/io/test_parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 6d282322fa8a2..7ead581301400 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -936,6 +936,7 @@ def test_read_parquet_manager(self, pa, using_array_manager): ["int64", "float64", "object", "datetime64[ns, UTC]", "bool", "Int64", "UInt8"], ) def test_read_empty_array(self, pa, dtype): + # GH #41241 df = pd.DataFrame( { "value": pd.array([], dtype=dtype), From 242c81d5b14c38d3014cbf2c2de24fda614ef7e3 Mon Sep 17 00:00:00 2001 From: nakatomotoi Date: Wed, 15 Sep 2021 15:42:35 +0900 Subject: [PATCH 07/11] Modify: add use_nullable_dtypes as read_kwargs --- pandas/tests/io/test_parquet.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 7ead581301400..e588065abd10a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -931,10 +931,7 @@ def test_read_parquet_manager(self, pa, using_array_manager): else: assert isinstance(result._mgr, pd.core.internals.BlockManager) - @pytest.mark.parametrize( - "dtype", - ["int64", "float64", "object", "datetime64[ns, UTC]", "bool", "Int64", "UInt8"], - ) + @pytest.mark.parametrize("dtype", ["Int64", "UInt8"]) def test_read_empty_array(self, pa, dtype): # GH #41241 df = pd.DataFrame( @@ -942,7 +939,7 @@ def test_read_empty_array(self, pa, dtype): "value": pd.array([], dtype=dtype), } ) - check_round_trip(df, pa) + check_round_trip(df, pa, read_kwargs={"use_nullable_dtypes": True}) class TestParquetFastParquet(Base): From 74154ad9e7769909e5e0d7439c7e7f151e7510bf Mon Sep 17 00:00:00 2001 From: nakatomotoi Date: Wed, 15 Sep 2021 22:12:00 +0900 Subject: [PATCH 08/11] Modify: add boolean --- pandas/tests/io/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index e588065abd10a..574350c71c336 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -931,7 +931,7 @@ def test_read_parquet_manager(self, pa, using_array_manager): else: assert isinstance(result._mgr, pd.core.internals.BlockManager) - @pytest.mark.parametrize("dtype", ["Int64", "UInt8"]) + @pytest.mark.parametrize("dtype", ["Int64", "UInt8", "boolean"]) def test_read_empty_array(self, pa, dtype): # GH #41241 df = pd.DataFrame( From d1e114f291cd22e8c9e1eff7c29daccea9073944 Mon Sep 17 00:00:00 2001 From: nakatomotoi Date: Sun, 19 Sep 2021 15:25:39 +0900 Subject: [PATCH 09/11] Modify: remove use_nullable_dtypes --- pandas/tests/io/test_parquet.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 574350c71c336..a1a52cf0c67d6 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -931,7 +931,9 @@ def test_read_parquet_manager(self, pa, using_array_manager): else: assert isinstance(result._mgr, pd.core.internals.BlockManager) - @pytest.mark.parametrize("dtype", ["Int64", "UInt8", "boolean"]) + @pytest.mark.parametrize( + "dtype", ["Int64", "UInt8", "boolean", "object", "datetime64[ns, UTC]"] + ) def test_read_empty_array(self, pa, dtype): # GH #41241 df = pd.DataFrame( @@ -939,7 +941,7 @@ def test_read_empty_array(self, pa, dtype): "value": pd.array([], dtype=dtype), } ) - check_round_trip(df, pa, read_kwargs={"use_nullable_dtypes": True}) + check_round_trip(df, pa) class TestParquetFastParquet(Base): From 90b79427c407243573a1dfa781db3e7396148661 Mon Sep 17 00:00:00 2001 From: nakatomotoi Date: Fri, 24 Sep 2021 13:26:32 +0900 Subject: [PATCH 10/11] Modify: add float, int, period[D], Float64 and string --- pandas/tests/io/test_parquet.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index a1a52cf0c67d6..88c46919bdf36 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -932,7 +932,19 @@ def test_read_parquet_manager(self, pa, using_array_manager): assert isinstance(result._mgr, pd.core.internals.BlockManager) @pytest.mark.parametrize( - "dtype", ["Int64", "UInt8", "boolean", "object", "datetime64[ns, UTC]"] + "dtype", + [ + "Int64", + "UInt8", + "boolean", + "object", + "datetime64[ns, UTC]", + "float", + "int", + "period[D]", + "Float64", + "string", + ], ) def test_read_empty_array(self, pa, dtype): # GH #41241 From 9fe7498bbc1aec0affcdac9da34aef7c0ae94a24 Mon Sep 17 00:00:00 2001 From: nakatomotoi Date: Sun, 26 Sep 2021 14:57:01 +0900 Subject: [PATCH 11/11] Modify: move test near test_use_nullable_dtypes --- pandas/tests/io/test_parquet.py | 47 ++++++++++++++++----------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 88c46919bdf36..ec724602c5249 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -637,6 +637,29 @@ def test_use_nullable_dtypes(self, engine): expected = expected.drop("c", axis=1) tm.assert_frame_equal(result2, expected) + @pytest.mark.parametrize( + "dtype", + [ + "Int64", + "UInt8", + "boolean", + "object", + "datetime64[ns, UTC]", + "float", + "period[D]", + "Float64", + "string", + ], + ) + def test_read_empty_array(self, pa, dtype): + # GH #41241 + df = pd.DataFrame( + { + "value": pd.array([], dtype=dtype), + } + ) + check_round_trip(df, pa, read_kwargs={"use_nullable_dtypes": True}) + @pytest.mark.filterwarnings("ignore:CategoricalBlock is deprecated:DeprecationWarning") class TestParquetPyArrow(Base): @@ -931,30 +954,6 @@ def test_read_parquet_manager(self, pa, using_array_manager): else: assert isinstance(result._mgr, pd.core.internals.BlockManager) - @pytest.mark.parametrize( - "dtype", - [ - "Int64", - "UInt8", - "boolean", - "object", - "datetime64[ns, UTC]", - "float", - "int", - "period[D]", - "Float64", - "string", - ], - ) - def test_read_empty_array(self, pa, dtype): - # GH #41241 - df = pd.DataFrame( - { - "value": pd.array([], dtype=dtype), - } - ) - check_round_trip(df, pa) - class TestParquetFastParquet(Base): def test_basic(self, fp, df_full):