From 0f370f0d9bd1b18d52be8f06d6e0f9cf31801a02 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 5 Sep 2022 20:39:03 +0200 Subject: [PATCH 1/8] ENH: Add use_nullable_dtypes in csv internals --- pandas/_libs/parsers.pyx | 58 +++++++++++++++++++++++---- pandas/tests/io/parser/test_upcast.py | 57 ++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 8 deletions(-) create mode 100644 pandas/tests/io/parser/test_upcast.py diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index e8b7160af9b2c..da6c7dc0a449c 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -15,6 +15,13 @@ import warnings from pandas.util._exceptions import find_stack_level +from pandas import StringDtype +from pandas.core.arrays import ( + BooleanArray, + FloatingArray, + IntegerArray, +) + cimport cython from cpython.bytes cimport ( PyBytes_AsString, @@ -1378,18 +1385,53 @@ STR_NA_VALUES = { _NA_VALUES = _ensure_encoded(list(STR_NA_VALUES)) -def _maybe_upcast(arr): - """ +def _maybe_upcast(arr, use_nullable_dtypes: bool = False): + """Sets nullable dtypes or upcasts if nans are present. + Upcast, if use_nullable_dtypes is false and nans are present so that the + current dtype can not hold the na value. We use nullable dtypes if the + flag is true for every array. + + Parameters + ---------- + arr: ndarray + Numpy array that is potentially being upcast. + + use_nullable_dtypes: bool, default False + If true, we cast to the associated nullable dtypes. + + Returns + ------- + The casted array. """ + na_value = na_values[arr.dtype] + if issubclass(arr.dtype.type, np.integer): - na_value = na_values[arr.dtype] - arr = arr.astype(float) - np.putmask(arr, arr == na_value, np.nan) + mask = arr == na_value + + if use_nullable_dtypes: + arr = IntegerArray(arr, mask) + else: + arr = arr.astype(float) + np.putmask(arr, mask, np.nan) + elif arr.dtype == np.bool_: - mask = arr.view(np.uint8) == na_values[np.uint8] - arr = arr.astype(object) - np.putmask(arr, mask, np.nan) + mask = arr.view(np.uint8) == na_value + + if use_nullable_dtypes: + arr = BooleanArray(arr, mask) + else: + arr = arr.astype(object) + np.putmask(arr, mask, np.nan) + + elif issubclass(arr.dtype.type, np.float): + if use_nullable_dtypes: + mask = np.isnan(arr) + arr = FloatingArray(arr, mask) + + elif arr.dtype == np.object: + if use_nullable_dtypes: + arr = StringDtype.construct_array_type()._from_sequence(arr) return arr diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py new file mode 100644 index 0000000000000..3a13a734e6f56 --- /dev/null +++ b/pandas/tests/io/parser/test_upcast.py @@ -0,0 +1,57 @@ +import numpy as np +import pytest + +from pandas._libs.parsers import ( + _maybe_upcast, + na_values, +) + +import pandas._testing as tm +from pandas.core.arrays import ( + BooleanArray, + FloatingArray, + IntegerArray, +) + + +def test_maybe_upcast(any_real_numpy_dtype): + # GH#36712 + if any_real_numpy_dtype == "float32": + pytest.skip() + + dtype = np.dtype(any_real_numpy_dtype) + na_value = na_values[dtype] + arr = np.array([1, 2, na_value], dtype=dtype) + result = _maybe_upcast(arr, use_nullable_dtypes=True) + + expected_mask = np.array([False, False, True]) + if issubclass(dtype.type, np.integer): + expected = IntegerArray(arr, mask=expected_mask) + else: + expected = FloatingArray(arr, mask=expected_mask) + + tm.assert_extension_array_equal(result, expected) + + +def test_maybe_upcaste_bool(): + # GH#36712 + dtype = np.bool_ + na_value = na_values[dtype] + arr = np.array([True, False, na_value], dtype="uint8").view(np.bool_) + result = _maybe_upcast(arr, use_nullable_dtypes=True) + + expected_mask = np.array([False, False, True]) + expected = BooleanArray(arr, mask=expected_mask) + tm.assert_extension_array_equal(result, expected) + + +def test_maybe_upcaste_all_nan(): + # GH#36712 + dtype = np.int64 + na_value = na_values[dtype] + arr = np.array([na_value, na_value], dtype=dtype) + result = _maybe_upcast(arr, use_nullable_dtypes=True) + + expected_mask = np.array([True, True]) + expected = IntegerArray(arr, mask=expected_mask) + tm.assert_extension_array_equal(result, expected) From f5e2015ce999422d52e998ca1f563a29d53ad8df Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 5 Sep 2022 20:53:34 +0200 Subject: [PATCH 2/8] Add tests --- pandas/_libs/parsers.pyx | 6 ++-- pandas/tests/io/parser/test_upcast.py | 43 ++++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index da6c7dc0a449c..a3765418f60fa 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1424,14 +1424,14 @@ def _maybe_upcast(arr, use_nullable_dtypes: bool = False): arr = arr.astype(object) np.putmask(arr, mask, np.nan) - elif issubclass(arr.dtype.type, np.float): + elif issubclass(arr.dtype.type, float): if use_nullable_dtypes: mask = np.isnan(arr) arr = FloatingArray(arr, mask) - elif arr.dtype == np.object: + elif arr.dtype == np.object_: if use_nullable_dtypes: - arr = StringDtype.construct_array_type()._from_sequence(arr) + arr = StringDtype().construct_array_type()._from_sequence(arr) return arr diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index 3a13a734e6f56..5710745513a7b 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -6,11 +6,13 @@ na_values, ) +from pandas import NA import pandas._testing as tm from pandas.core.arrays import ( BooleanArray, FloatingArray, IntegerArray, + StringArray, ) @@ -33,11 +35,28 @@ def test_maybe_upcast(any_real_numpy_dtype): tm.assert_extension_array_equal(result, expected) +def test_maybe_upcast_no_na(any_real_numpy_dtype): + # GH#36712 + if any_real_numpy_dtype == "float32": + pytest.skip() + + arr = np.array([1, 2, 3], dtype=any_real_numpy_dtype) + result = _maybe_upcast(arr, use_nullable_dtypes=True) + + expected_mask = np.array([False, False, False]) + if issubclass(np.dtype(any_real_numpy_dtype).type, np.integer): + expected = IntegerArray(arr, mask=expected_mask) + else: + expected = FloatingArray(arr, mask=expected_mask) + + tm.assert_extension_array_equal(result, expected) + + def test_maybe_upcaste_bool(): # GH#36712 dtype = np.bool_ na_value = na_values[dtype] - arr = np.array([True, False, na_value], dtype="uint8").view(np.bool_) + arr = np.array([True, False, na_value], dtype="uint8").view(dtype) result = _maybe_upcast(arr, use_nullable_dtypes=True) expected_mask = np.array([False, False, True]) @@ -45,6 +64,17 @@ def test_maybe_upcaste_bool(): tm.assert_extension_array_equal(result, expected) +def test_maybe_upcaste_bool_no_nan(): + # GH#36712 + dtype = np.bool_ + arr = np.array([True, False, False], dtype="uint8").view(dtype) + result = _maybe_upcast(arr, use_nullable_dtypes=True) + + expected_mask = np.array([False, False, False]) + expected = BooleanArray(arr, mask=expected_mask) + tm.assert_extension_array_equal(result, expected) + + def test_maybe_upcaste_all_nan(): # GH#36712 dtype = np.int64 @@ -55,3 +85,14 @@ def test_maybe_upcaste_all_nan(): expected_mask = np.array([True, True]) expected = IntegerArray(arr, mask=expected_mask) tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize("val", [na_values[np.object_], "c"]) +def test_maybe_upcast_object(val): + # GH#36712 + arr = np.array(["a", "b", val], dtype=np.object_) + result = _maybe_upcast(arr, use_nullable_dtypes=True) + + exp_val = "c" if val == "c" else NA + expected = StringArray(np.array(["a", "b", exp_val], dtype=np.object_)) + tm.assert_extension_array_equal(result, expected) From 373a17bc74602d2cb961cc2d55d0204725ff162e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 5 Sep 2022 21:44:04 +0200 Subject: [PATCH 3/8] Fix mypy --- pandas/tests/io/parser/test_upcast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index 5710745513a7b..248d9c15cab65 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._libs.parsers import ( +from pandas._libs.parsers import ( # type: ignore[attr-defined] _maybe_upcast, na_values, ) From baa5310ff358f69e79d3b28e978aef85fc66978c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 12 Sep 2022 20:27:29 +0200 Subject: [PATCH 4/8] Add comment --- pandas/tests/io/parser/test_upcast.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index 248d9c15cab65..4de11b3fb2320 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -19,6 +19,7 @@ def test_maybe_upcast(any_real_numpy_dtype): # GH#36712 if any_real_numpy_dtype == "float32": + # na values not defined for float32 pytest.skip() dtype = np.dtype(any_real_numpy_dtype) From 709e068a6bc4cab416a0f4296618781cefaa5a89 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 12 Sep 2022 21:34:22 +0200 Subject: [PATCH 5/8] Add pyarrow test --- pandas/tests/io/parser/test_upcast.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index 4de11b3fb2320..2cae309b3787f 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -5,10 +5,15 @@ _maybe_upcast, na_values, ) +import pandas.util._test_decorators as td -from pandas import NA +from pandas import ( + NA, + set_option, +) import pandas._testing as tm from pandas.core.arrays import ( + ArrowStringArray, BooleanArray, FloatingArray, IntegerArray, @@ -88,12 +93,21 @@ def test_maybe_upcaste_all_nan(): tm.assert_extension_array_equal(result, expected) +@td.skip_if_no("pyarrow") +@pytest.mark.parametrize("storage", ["pyarrow", "python"]) @pytest.mark.parametrize("val", [na_values[np.object_], "c"]) -def test_maybe_upcast_object(val): +def test_maybe_upcast_object(val, storage): # GH#36712 + import pyarrow as pa + + set_option("mode.string_storage", storage) arr = np.array(["a", "b", val], dtype=np.object_) result = _maybe_upcast(arr, use_nullable_dtypes=True) - exp_val = "c" if val == "c" else NA - expected = StringArray(np.array(["a", "b", exp_val], dtype=np.object_)) + if storage == "python": + exp_val = "c" if val == "c" else NA + expected = StringArray(np.array(["a", "b", exp_val], dtype=np.object_)) + else: + exp_val = "c" if val == "c" else None + expected = ArrowStringArray(pa.array(["a", "b", exp_val])) tm.assert_extension_array_equal(result, expected) From 4100ad0f6ef05804ae6cbf99d00d638f6ffffdac Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 12 Sep 2022 22:43:52 +0200 Subject: [PATCH 6/8] Fix float32 --- pandas/_libs/parsers.pyx | 6 ++++-- pandas/tests/io/parser/test_upcast.py | 3 --- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index a3765418f60fa..bcfee7eb5fd4e 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1405,7 +1405,8 @@ def _maybe_upcast(arr, use_nullable_dtypes: bool = False): The casted array. """ na_value = na_values[arr.dtype] - + print(arr.dtype.type) + print(issubclass(arr.dtype.type, float)) if issubclass(arr.dtype.type, np.integer): mask = arr == na_value @@ -1424,7 +1425,7 @@ def _maybe_upcast(arr, use_nullable_dtypes: bool = False): arr = arr.astype(object) np.putmask(arr, mask, np.nan) - elif issubclass(arr.dtype.type, float): + elif issubclass(arr.dtype.type, float) or arr.dtype.type == np.float32: if use_nullable_dtypes: mask = np.isnan(arr) arr = FloatingArray(arr, mask) @@ -2027,6 +2028,7 @@ def _compute_na_values(): uint16info = np.iinfo(np.uint16) uint8info = np.iinfo(np.uint8) na_values = { + np.float32: np.nan, np.float64: np.nan, np.int64: int64info.min, np.int32: int32info.min, diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index 2cae309b3787f..bb9449da75920 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -23,9 +23,6 @@ def test_maybe_upcast(any_real_numpy_dtype): # GH#36712 - if any_real_numpy_dtype == "float32": - # na values not defined for float32 - pytest.skip() dtype = np.dtype(any_real_numpy_dtype) na_value = na_values[dtype] From 757f113ba38d057064c9390dac1e9a67117602f8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 12 Sep 2022 22:44:02 +0200 Subject: [PATCH 7/8] Fix float32 --- pandas/_libs/parsers.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index bcfee7eb5fd4e..07bf7f69ec907 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1405,8 +1405,7 @@ def _maybe_upcast(arr, use_nullable_dtypes: bool = False): The casted array. """ na_value = na_values[arr.dtype] - print(arr.dtype.type) - print(issubclass(arr.dtype.type, float)) + if issubclass(arr.dtype.type, np.integer): mask = arr == na_value From 9e85b391406947c8abf775a7fcf2761498c30e03 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 13 Sep 2022 09:37:17 +0200 Subject: [PATCH 8/8] Add contextmanager --- pandas/tests/io/parser/test_upcast.py | 28 +++++++++++++-------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index bb9449da75920..428050ac01b58 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -7,10 +7,8 @@ ) import pandas.util._test_decorators as td -from pandas import ( - NA, - set_option, -) +import pandas as pd +from pandas import NA import pandas._testing as tm from pandas.core.arrays import ( ArrowStringArray, @@ -97,14 +95,14 @@ def test_maybe_upcast_object(val, storage): # GH#36712 import pyarrow as pa - set_option("mode.string_storage", storage) - arr = np.array(["a", "b", val], dtype=np.object_) - result = _maybe_upcast(arr, use_nullable_dtypes=True) - - if storage == "python": - exp_val = "c" if val == "c" else NA - expected = StringArray(np.array(["a", "b", exp_val], dtype=np.object_)) - else: - exp_val = "c" if val == "c" else None - expected = ArrowStringArray(pa.array(["a", "b", exp_val])) - tm.assert_extension_array_equal(result, expected) + with pd.option_context("mode.string_storage", storage): + arr = np.array(["a", "b", val], dtype=np.object_) + result = _maybe_upcast(arr, use_nullable_dtypes=True) + + if storage == "python": + exp_val = "c" if val == "c" else NA + expected = StringArray(np.array(["a", "b", exp_val], dtype=np.object_)) + else: + exp_val = "c" if val == "c" else None + expected = ArrowStringArray(pa.array(["a", "b", exp_val])) + tm.assert_extension_array_equal(result, expected)