
POC: consistent NaN treatment for pyarrow dtypes #61732


Draft: wants to merge 15 commits into main
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -706,6 +706,8 @@ Datetimelike
- Bug in :meth:`to_datetime` reports incorrect index in case of any failure scenario. (:issue:`58298`)
- Bug in :meth:`to_datetime` with ``format="ISO8601"`` and ``utc=True`` where naive timestamps incorrectly inherited timezone offset from previous timestamps in a series. (:issue:`61389`)
- Bug in :meth:`to_datetime` wrongly converts when ``arg`` is a ``np.datetime64`` object with unit of ``ps``. (:issue:`60341`)
- Bug in constructing arrays with :class:`ArrowDtype` with ``timestamp`` type incorrectly allowing ``Decimal("NaN")`` (:issue:`61773`)
- Bug in constructing arrays with a timezone-aware :class:`ArrowDtype` from timezone-naive datetime objects incorrectly treating those as UTC times instead of wall times like :class:`DatetimeTZDtype` (:issue:`61775`)
- Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)

Timedelta
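A minimal repro sketch for the ``Decimal("NaN")`` entry above (GH#61773). The dtype string is standard ArrowDtype syntax; the expectation that construction now raises is inferred from the whatsnew wording ("incorrectly allowing"), not from released pandas:

```python
# Sketch for GH#61773: Decimal("NaN") should no longer slip through as a
# null when constructing a pyarrow timestamp array.
from decimal import Decimal

import pandas as pd

pd.array([Decimal("NaN")], dtype="timestamp[ns][pyarrow]")
# expected under this PR: raises (previously produced a null silently)
```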
5 changes: 5 additions & 0 deletions pandas/_config/__init__.py
@@ -33,3 +33,8 @@
def using_string_dtype() -> bool:
_mode_options = _global_config["future"]
return _mode_options["infer_string"]


def using_pyarrow_strict_nans() -> bool:
_mode_options = _global_config["mode"]
return _mode_options["pyarrow_strict_nans"]
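A usage sketch for the new accessor. This assumes the option itself is registered as ``mode.pyarrow_strict_nans`` elsewhere in the PR; the registration is not part of this hunk:

```python
import pandas as pd

from pandas._config import using_pyarrow_strict_nans

# The helper just reads the "mode" config namespace shown above.
with pd.option_context("mode.pyarrow_strict_nans", True):
    assert using_pyarrow_strict_nans()
assert not using_pyarrow_strict_nans()  # default assumed to be False
```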
1 change: 1 addition & 0 deletions pandas/_libs/missing.pyi
@@ -14,3 +14,4 @@ def isneginf_scalar(val: object) -> bool: ...
def checknull(val: object) -> bool: ...
def isnaobj(arr: np.ndarray) -> npt.NDArray[np.bool_]: ...
def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
def is_pdna_or_none(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
18 changes: 18 additions & 0 deletions pandas/_libs/missing.pyx
@@ -249,6 +249,24 @@ cdef bint checknull_with_nat_and_na(object obj):
return checknull_with_nat(obj) or obj is C_NA


@cython.wraparound(False)
@cython.boundscheck(False)
def is_pdna_or_none(values: ndarray) -> ndarray:
# True where the element is pd.NA or None; NaN and NaT are deliberately
# excluded so they can be treated as values rather than as missing
cdef:
ndarray[uint8_t] result
Py_ssize_t i, N
object val

N = len(values)
result = np.zeros(N, dtype=np.uint8)

for i in range(N):
val = values[i]
if val is None or val is C_NA:
result[i] = True
return result.view(bool)


@cython.wraparound(False)
@cython.boundscheck(False)
def is_numeric_na(values: ndarray) -> ndarray:
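Illustrative behavior of the new helper (internal API, shown as a sketch): only ``None`` and ``pd.NA`` are flagged, while ``np.nan`` and ``pd.NaT`` are left alone, which is the whole point of the POC:

```python
import numpy as np
import pandas as pd

from pandas._libs.missing import is_pdna_or_none

values = np.array([pd.NA, None, np.nan, pd.NaT, 1.5], dtype=object)
print(is_pdna_or_none(values))
# [ True  True False False False] -- NaN/NaT count as values, not missing
```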
2 changes: 1 addition & 1 deletion pandas/_libs/parsers.pyx
@@ -1456,7 +1456,7 @@ def _maybe_upcast(
if isinstance(arr, IntegerArray) and arr.isna().all():
# use null instead of int64 in pyarrow
arr = arr.to_numpy(na_value=None)
arr = ArrowExtensionArray(pa.array(arr, from_pandas=True))
arr = ArrowExtensionArray(pa.array(arr))

return arr

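The change above is one instance of a pattern repeated throughout the PR: dropping ``from_pandas=True`` so pyarrow stops coercing float ``NaN`` to null. A standalone pyarrow sketch of the difference:

```python
import pyarrow as pa

vals = [1.0, float("nan")]
print(pa.array(vals, from_pandas=True).null_count)  # 1 -- NaN became null
print(pa.array(vals).null_count)                    # 0 -- NaN kept as a value
```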
15 changes: 13 additions & 2 deletions pandas/core/arrays/_utils.py
@@ -7,7 +7,10 @@

import numpy as np

from pandas._config import using_pyarrow_strict_nans

from pandas._libs import lib
from pandas._libs.missing import NA
from pandas.errors import LossySetitemError

from pandas.core.dtypes.cast import np_can_hold_element
@@ -21,7 +24,11 @@


def to_numpy_dtype_inference(
arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool
arr: ArrayLike,
dtype: npt.DTypeLike | None,
na_value,
hasna: bool,
is_pyarrow: bool = True,
) -> tuple[npt.DTypeLike, Any]:
if dtype is None and is_numeric_dtype(arr.dtype):
dtype_given = False
Expand All @@ -34,7 +41,11 @@ def to_numpy_dtype_inference(
else:
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
if na_value is lib.no_default:
na_value = np.nan
if is_pyarrow and using_pyarrow_strict_nans():
na_value = NA
dtype = np.dtype(object)
else:
na_value = np.nan
else:
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
elif dtype is not None:
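A sketch of the intended effect on the ``to_numpy()`` default for pyarrow-backed arrays when the mode is active (option name assumed as above; this is the POC's intent, not released behavior):

```python
import pandas as pd

arr = pd.array([1.5, None], dtype="float64[pyarrow]")
with pd.option_context("mode.pyarrow_strict_nans", True):
    res = arr.to_numpy()  # no explicit dtype or na_value
# Missing values now surface as pd.NA in an object array rather than
# being coerced to np.nan in a float64 array.
print(res.dtype, res[1] is pd.NA)  # expected: object True
```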
100 changes: 86 additions & 14 deletions pandas/core/arrays/arrow/array.py
@@ -15,7 +15,10 @@

import numpy as np

from pandas._config import using_pyarrow_strict_nans

from pandas._libs import lib
from pandas._libs.missing import is_pdna_or_none
from pandas._libs.tslibs import (
Timedelta,
Timestamp,
@@ -63,6 +66,7 @@
from pandas.core.arrays.masked import BaseMaskedArray
from pandas.core.arrays.string_ import StringDtype
import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.indexers import (
check_array_indexer,
unpack_tuple_and_ellipses,
@@ -322,6 +326,11 @@ def _from_sequence_of_strings(
"""
Construct a new ExtensionArray from a sequence of strings.
"""
mask = isna(strings)

if isinstance(strings, cls):
strings = strings._pa_array

pa_type = to_pyarrow_type(dtype)
if (
pa_type is None
@@ -340,22 +349,27 @@
from pandas.core.tools.datetimes import to_datetime

scalars = to_datetime(strings, errors="raise").date

scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type)

elif pa.types.is_duration(pa_type):
from pandas.core.tools.timedeltas import to_timedelta

scalars = to_timedelta(strings, errors="raise")

if pa_type.unit != "ns":
# GH51175: test_from_sequence_of_strings_pa_array
# attempt to parse as int64 reflecting pyarrow's
# duration to string casting behavior
mask = isna(scalars)
if not isinstance(strings, (pa.Array, pa.ChunkedArray)):
strings = pa.array(strings, type=pa.string(), from_pandas=True)
strings = pa.array(strings, type=pa.string(), mask=mask)
strings = pc.if_else(mask, None, strings)
try:
scalars = strings.cast(pa.int64())
except pa.ArrowInvalid:
pass

elif pa.types.is_time(pa_type):
from pandas.core.tools.times import to_time

@@ -371,7 +385,7 @@ def _from_sequence_of_strings(
if isinstance(strings, (pa.Array, pa.ChunkedArray)):
scalars = strings
else:
scalars = pa.array(strings, type=pa.string(), from_pandas=True)
scalars = pa.array(strings, type=pa.string(), mask=mask)
scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars)
scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars)
scalars = scalars.cast(pa.bool_())
Expand All @@ -383,6 +397,19 @@ def _from_sequence_of_strings(
from pandas.core.tools.numeric import to_numeric

scalars = to_numeric(strings, errors="raise")
if not pa.types.is_decimal(pa_type) and isinstance(
strings, (pa.Array, pa.ChunkedArray)
):
# TODO: figure out why doing this cast breaks with decimal dtype
# in test_from_sequence_of_strings_pa_array
mask = strings.is_null()
scalars = pa.array(scalars, mask=np.array(mask), type=pa_type)
# TODO: could we just do strings.cast(pa_type)?
elif isinstance(strings, (pa.Array, pa.ChunkedArray)):
scalars = strings.cast(pa_type)
elif mask is not None:
scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type)

else:
raise NotImplementedError(
f"Converting strings to {pa_type} is not implemented."
@@ -425,7 +452,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
"""
if isinstance(value, pa.Scalar):
pa_scalar = value
elif isna(value):
elif isna(value) and not lib.is_float(value):
pa_scalar = pa.scalar(None, type=pa_type)
else:
# Workaround https://github.com/apache/arrow/issues/37291
@@ -442,7 +469,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
value = value.as_unit(pa_type.unit)
value = value._value

pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True)
pa_scalar = pa.scalar(value, type=pa_type)

if pa_type is not None and pa_scalar.type != pa_type:
pa_scalar = pa_scalar.cast(pa_type)
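With the new ``not lib.is_float(value)`` guard in ``_box_pa_scalar``, a float ``NaN`` no longer takes the ``pa.scalar(None, ...)`` path. A sketch of the intended boxing (``_box_pa_scalar`` is internal API):

```python
import pyarrow as pa

from pandas.arrays import ArrowExtensionArray

s = ArrowExtensionArray._box_pa_scalar(float("nan"), pa_type=pa.float64())
print(s.is_valid)  # True -- boxed as a NaN value, not a null scalar
```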
@@ -474,6 +501,13 @@ def _box_pa_array(
if copy:
value = value.copy()
pa_array = value.__arrow_array__()

elif hasattr(value, "__arrow_array__"):
# e.g. StringArray
if copy:
value = value.copy()
pa_array = value.__arrow_array__()

else:
if (
isinstance(value, np.ndarray)
@@ -500,19 +534,52 @@
value = to_timedelta(value, unit=pa_type.unit).as_unit(pa_type.unit)
value = value.to_numpy()

if pa_type is not None and pa.types.is_timestamp(pa_type):
# Use DatetimeArray to exclude Decimal(NaN) (GH#61774) and
# ensure constructor treats tznaive the same as non-pyarrow
# dtypes (GH#61775)
from pandas.core.arrays.datetimes import (
DatetimeArray,
tz_to_dtype,
)

pass_dtype = tz_to_dtype(tz=pa_type.tz, unit=pa_type.unit)
value = extract_array(value, extract_numpy=True)
if isinstance(value, DatetimeArray):
dta = value
else:
dta = DatetimeArray._from_sequence(
value, copy=copy, dtype=pass_dtype
)
dta_mask = dta.isna()
value_i8 = cast("npt.NDArray", dta.view("i8"))
if not value_i8.flags["WRITEABLE"]:
# e.g. test_setitem_frame_2d_values
value_i8 = value_i8.copy()
dta = DatetimeArray._from_sequence(value_i8, dtype=dta.dtype)
value_i8[dta_mask] = 0 # GH#61776 avoid __sub__ overflow
pa_array = pa.array(dta._ndarray, type=pa_type, mask=dta_mask)
return pa_array

mask = None
if getattr(value, "dtype", None) is None or value.dtype.kind not in "iumMf":
arr_value = np.asarray(value, dtype=object)
# similar to isna(value) but excluding NaN, NaT, nat-like, nan-like
mask = is_pdna_or_none(arr_value)

try:
pa_array = pa.array(value, type=pa_type, from_pandas=True)
pa_array = pa.array(value, type=pa_type, mask=mask)
except (pa.ArrowInvalid, pa.ArrowTypeError):
# GH50430: let pyarrow infer type, then cast
pa_array = pa.array(value, from_pandas=True)
pa_array = pa.array(value, mask=mask)

if pa_type is None and pa.types.is_duration(pa_array.type):
# Workaround https://github.com/apache/arrow/issues/37291
from pandas.core.tools.timedeltas import to_timedelta

value = to_timedelta(value)
value = value.to_numpy()
pa_array = pa.array(value, type=pa_type, from_pandas=True)
pa_array = pa.array(value, type=pa_type)

if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0:
# GH52843: upstream bug for duration types when originally
@@ -1159,7 +1226,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
if not len(values):
return np.zeros(len(self), dtype=bool)

result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True))
result = pc.is_in(self._pa_array, value_set=pa.array(values))
# pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
# to False
return np.array(result, dtype=np.bool_)
@@ -1440,7 +1507,7 @@ def to_numpy(
pa.types.is_floating(pa_type)
and (
na_value is np.nan
or (original_na_value is lib.no_default and is_float_dtype(dtype))
or (
original_na_value is lib.no_default
and is_float_dtype(dtype)
and not using_pyarrow_strict_nans()
)
)
):
result = data._pa_array.to_numpy()
@@ -1966,7 +2037,7 @@ def __setitem__(self, key, value) -> None:
raise ValueError("Length of indexer and values mismatch")
chunks = [
*self._pa_array[:key].chunks,
pa.array([value], type=self._pa_array.type, from_pandas=True),
pa.array([value], type=self._pa_array.type),
*self._pa_array[key + 1 :].chunks,
]
data = pa.chunked_array(chunks).combine_chunks()
@@ -2020,7 +2091,7 @@ def _rank_calc(
pa_type = pa.float64()
else:
pa_type = pa.uint64()
result = pa.array(ranked, type=pa_type, from_pandas=True)
result = pa.array(ranked, type=pa_type)
return result

data = self._pa_array.combine_chunks()
@@ -2272,7 +2343,7 @@ def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
right, right_type = _to_numpy_and_type(right)
pa_type = left_type or right_type
result = np.where(cond, left, right)
return pa.array(result, type=pa_type, from_pandas=True)
return pa.array(result, type=pa_type)

@classmethod
def _replace_with_mask(
@@ -2313,9 +2384,10 @@ def _replace_with_mask(
replacements = np.array(replacements, dtype=object)
elif isinstance(replacements, pa.Scalar):
replacements = replacements.as_py()

result = np.array(values, dtype=object)
result[mask] = replacements
return pa.array(result, type=values.type, from_pandas=True)
return pa.array(result, type=values.type)

# ------------------------------------------------------------------
# GroupBy Methods
@@ -2394,7 +2466,7 @@ def _groupby_op(
return type(self)(pa_result)
else:
# DatetimeArray, TimedeltaArray
pa_result = pa.array(result, from_pandas=True)
pa_result = pa.array(result)
return type(self)(pa_result)

def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
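A sketch of the wall-time semantics the timestamp branch of ``_box_pa_array`` above is after (GH#61775): naive datetime inputs should be localized to the dtype's timezone, matching :class:`DatetimeTZDtype`, rather than being read as UTC instants:

```python
import datetime

import pandas as pd

naive = datetime.datetime(2025, 1, 1, 12, 0)
arr = pd.array([naive], dtype="timestamp[us, tz=US/Eastern][pyarrow]")
# expected under this PR: 12:00 wall time in US/Eastern (-05:00), the
# same as pd.array([naive], dtype="datetime64[us, US/Eastern]")
print(arr[0])
```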
8 changes: 8 additions & 0 deletions pandas/core/arrays/base.py
@@ -2539,6 +2539,14 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
if result is not NotImplemented:
return result

# TODO: putting this here is hacky as heck
if self.dtype == "float64[pyarrow]":
# e.g. test_log_arrow_backed_missing_value
new_inputs = [
x if x is not self else x.to_numpy(na_value=np.nan) for x in inputs
]
return getattr(ufunc, method)(*new_inputs, **kwargs)

return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs)

def map(self, mapper, na_action: Literal["ignore"] | None = None):
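A sketch of what the ``float64[pyarrow]`` special case above enables (cf. the referenced ``test_log_arrow_backed_missing_value``): the pyarrow-backed array is handed to the ufunc as numpy floats with ``np.nan`` standing in for missing entries:

```python
import numpy as np
import pandas as pd

arr = pd.array([1.0, np.e, None], dtype="float64[pyarrow]")
# The ufunc sees to_numpy(na_value=np.nan), i.e. [1.0, 2.718..., nan],
# so a plain float64 ndarray comes back.
print(np.log(arr))  # expected: [0. 1. nan]
```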
4 changes: 3 additions & 1 deletion pandas/core/arrays/masked.py
@@ -484,7 +484,9 @@ def to_numpy(
array([ True, False, False])
"""
hasna = self._hasna
dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, hasna)
dtype, na_value = to_numpy_dtype_inference(
self, dtype, na_value, hasna, is_pyarrow=False
)
if dtype is None:
dtype = object

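Because masked arrays pass ``is_pyarrow=False``, their ``to_numpy()`` default is untouched by the new mode; a sketch:

```python
import pandas as pd

arr = pd.array([1.0, None], dtype="Float64")  # masked, not pyarrow-backed
with pd.option_context("mode.pyarrow_strict_nans", True):
    print(arr.to_numpy())  # [ 1. nan] -- still float64 with np.nan
```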
8 changes: 7 additions & 1 deletion pandas/core/arrays/string_.py
@@ -481,6 +481,12 @@ def _str_map_str_or_object(
if self.dtype.storage == "pyarrow":
import pyarrow as pa

# TODO: shouldn't this already be caught by the passed mask?
# it isn't in test_extract_expand_capture_groups_index
# mask = mask | np.array(
# [x is libmissing.NA for x in result], dtype=bool
# )

result = pa.array(
result, mask=mask, type=pa.large_string(), from_pandas=True
)
@@ -733,7 +739,7 @@ def __arrow_array__(self, type=None):

values = self._ndarray.copy()
values[self.isna()] = None
return pa.array(values, type=type, from_pandas=True)
return pa.array(values, type=type)

def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]: # type: ignore[override]
arr = self._ndarray
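The ``__arrow_array__`` change above uses the idiom adopted throughout the PR: mark missing positions explicitly (here with ``None``, elsewhere with ``mask=``) instead of letting ``from_pandas=True`` infer them. A standalone sketch:

```python
import numpy as np
import pyarrow as pa

values = np.array(["a", "b", None], dtype=object)
print(pa.array(values, type=pa.string()).null_count)  # 1 -- explicit None
```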