From 460a978c6dfb8ac11b97995f5e035e0d72730eb8 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 20 Feb 2021 19:26:47 -0800 Subject: [PATCH 1/5] TYP: maybe_cast_to_integer_array --- pandas/core/dtypes/cast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 669bfe08d42b0..8f346faa2d6b1 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1834,7 +1834,7 @@ def construct_1d_ndarray_preserving_na( return subarr -def maybe_cast_to_integer_array(arr, dtype: Dtype, copy: bool = False): +def maybe_cast_to_integer_array(arr, dtype: np.dtype, copy: bool = False): """ Takes any dtype and returns the casted version, raising for when data is incompatible with integer/unsigned integer dtypes. @@ -1845,7 +1845,7 @@ def maybe_cast_to_integer_array(arr, dtype: Dtype, copy: bool = False): ---------- arr : array-like The array to cast. - dtype : str, np.dtype + dtype : np.dtype The integer dtype to cast the array to. copy: bool, default False Whether to make a copy of the array before returning. From 854f23fee311ecf9ed5681a4dd205912dd0fac41 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 20 Feb 2021 20:18:14 -0800 Subject: [PATCH 2/5] CLN: tighten types in construction._try_cast --- pandas/core/construction.py | 16 ++++++++++++++-- pandas/core/dtypes/cast.py | 17 +++++++++++------ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index dd75473da6d78..1172fd25de3e4 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -521,6 +521,9 @@ def sanitize_array( subarr = construct_1d_arraylike_from_scalar(data, len(index), dtype) else: + # realize e.g. generators + # TODO: non-standard array-likes we can convert to ndarray more efficiently? + data = list(data) subarr = _try_cast(data, dtype, copy, raise_cast_failure) subarr = _sanitize_ndim(subarr, data, dtype, index) @@ -594,13 +597,18 @@ def _maybe_repeat(arr: ArrayLike, index: Optional[Index]) -> ArrayLike: return arr -def _try_cast(arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bool): +def _try_cast( + arr: Union[list, np.ndarray], + dtype: Optional[DtypeObj], + copy: bool, + raise_cast_failure: bool, +) -> ArrayLike: """ Convert input to numpy ndarray and optionally cast to a given dtype. Parameters ---------- - arr : ndarray, list, tuple, iterator (catchall) + arr : ndarray or list Excludes: ExtensionArray, Series, Index. dtype : np.dtype, ExtensionDtype or None copy : bool @@ -608,6 +616,10 @@ def _try_cast(arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bo raise_cast_failure : bool If True, and if a dtype is specified, raise errors during casting. Otherwise an object array is returned. + + Returns + ------- + np.ndarray or ExtensionArray """ # perf shortcut as this is the most common case if ( diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 8f346faa2d6b1..78752527168e9 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1432,7 +1432,7 @@ def maybe_infer_to_datetimelike( if not len(v): return value - def try_datetime(v): + def try_datetime(v: np.ndarray) -> ArrayLike: # safe coerce to datetime64 try: # GH19671 @@ -1451,14 +1451,15 @@ def try_datetime(v): except (ValueError, TypeError): pass else: - return DatetimeIndex(values).tz_localize("UTC").tz_convert(tz=tz) + dti = DatetimeIndex(values).tz_localize("UTC").tz_convert(tz=tz) + return dti._data except TypeError: # e.g. is not convertible to datetime pass return v.reshape(shape) - def try_timedelta(v): + def try_timedelta(v: np.ndarray) -> np.ndarray: # safe coerce to timedelta64 # will try first with a string & object conversion @@ -1498,7 +1499,9 @@ def try_timedelta(v): return value -def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]): +def maybe_cast_to_datetime( + value: Union[ArrayLike, list], dtype: Optional[DtypeObj] +) -> Union[ArrayLike, list]: """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT @@ -1834,7 +1837,9 @@ def construct_1d_ndarray_preserving_na( return subarr -def maybe_cast_to_integer_array(arr, dtype: np.dtype, copy: bool = False): +def maybe_cast_to_integer_array( + arr: Union[list, np.ndarray], dtype: np.dtype, copy: bool = False +): """ Takes any dtype and returns the casted version, raising for when data is incompatible with integer/unsigned integer dtypes. @@ -1843,7 +1848,7 @@ def maybe_cast_to_integer_array(arr, dtype: np.dtype, copy: bool = False): Parameters ---------- - arr : array-like + arr : np.ndarray or list The array to cast. dtype : np.dtype The integer dtype to cast the array to. From b3d53eabcd49e8e2961b15839c441ee8f882d8ae Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 21 Feb 2021 12:16:45 -0800 Subject: [PATCH 3/5] mypy fixup, update following maybe_infer_objects return type change --- pandas/core/arrays/categorical.py | 7 +++---- pandas/core/dtypes/cast.py | 10 +++++++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 916d4f9f2fd28..ff835eb32f6df 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -399,7 +399,9 @@ def __init__( # sanitize_array coerces np.nan to a string under certain versions # of numpy values = maybe_infer_to_datetimelike(values) - if not isinstance(values, (np.ndarray, ExtensionArray)): + if isinstance(values, np.ndarray): + values = sanitize_to_nanoseconds(values) + elif not isinstance(values, ExtensionArray): values = com.convert_to_list_like(values) # By convention, empty lists result in object dtype: @@ -409,9 +411,6 @@ def __init__( values = [values[idx] for idx in np.where(~null_mask)[0]] values = sanitize_array(values, None, dtype=sanitize_dtype) - else: - values = sanitize_to_nanoseconds(values) - if dtype.categories is None: try: codes, categories = factorize(values, sort=True) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 78752527168e9..fbdfd20b9511c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1500,7 +1500,7 @@ def try_timedelta(v: np.ndarray) -> np.ndarray: def maybe_cast_to_datetime( - value: Union[ArrayLike, list], dtype: Optional[DtypeObj] + raw_value: Union[ArrayLike, list], dtype: Optional[DtypeObj] ) -> Union[ArrayLike, list]: """ try to cast the array/value to a datetimelike dtype, converting float @@ -1509,8 +1509,12 @@ def maybe_cast_to_datetime( from pandas.core.tools.datetimes import to_datetime from pandas.core.tools.timedeltas import to_timedelta - if not is_list_like(value): + if not is_list_like(raw_value): raise TypeError("value must be listlike") + elif isinstance(raw_value, list): + value = np.array(raw_value, copy=False) + else: + value = raw_value if dtype is not None: is_datetime64 = is_datetime64_dtype(dtype) @@ -1884,7 +1888,7 @@ def maybe_cast_to_integer_array( assert is_integer_dtype(dtype) try: - if not hasattr(arr, "astype"): + if not isinstance(arr, np.ndarray): casted = np.array(arr, dtype=dtype, copy=copy) else: casted = arr.astype(dtype, copy=copy) From 2065566dc61b4f09060eda0e1985848b8e3c8f20 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 21 Feb 2021 19:50:01 -0800 Subject: [PATCH 4/5] fix failing tests --- pandas/core/dtypes/cast.py | 35 ++++++++++++++--------------------- pandas/core/frame.py | 2 +- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index fbdfd20b9511c..ab5cbdd823476 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1422,7 +1422,7 @@ def maybe_infer_to_datetimelike( v = np.array(v, copy=False) # we only care about object dtypes - if not is_object_dtype(v): + if not is_object_dtype(v.dtype): return value shape = v.shape @@ -1500,7 +1500,7 @@ def try_timedelta(v: np.ndarray) -> np.ndarray: def maybe_cast_to_datetime( - raw_value: Union[ArrayLike, list], dtype: Optional[DtypeObj] + value: Union[ArrayLike, list], dtype: Optional[DtypeObj] ) -> Union[ArrayLike, list]: """ try to cast the array/value to a datetimelike dtype, converting float @@ -1509,12 +1509,8 @@ def maybe_cast_to_datetime( from pandas.core.tools.datetimes import to_datetime from pandas.core.tools.timedeltas import to_timedelta - if not is_list_like(raw_value): + if not is_list_like(value): raise TypeError("value must be listlike") - elif isinstance(raw_value, list): - value = np.array(raw_value, copy=False) - else: - value = raw_value if dtype is not None: is_datetime64 = is_datetime64_dtype(dtype) @@ -1569,26 +1565,28 @@ def maybe_cast_to_datetime( try: if is_datetime64: - value = to_datetime(value, errors="raise") + dti = to_datetime(value, errors="raise") # GH 25843: Remove tz information since the dtype # didn't specify one - if value.tz is not None: - value = value.tz_localize(None) - value = value._values + if dti.tz is not None: + dti = dti.tz_localize(None) + value = dti._values elif is_datetime64tz: # The string check can be removed once issue #13712 # is solved. String data that is passed with a # datetime64tz is assumed to be naive which should # be localized to the timezone. is_dt_string = is_string_dtype(value.dtype) - value = to_datetime(value, errors="raise").array - if is_dt_string: + dta = to_datetime(value, errors="raise").array + if dta.tz is not None: + value = dta.astype(dtype, copy=False) + elif is_dt_string: # Strings here are naive, so directly localize - value = value.tz_localize(dtype.tz) + value = dta.tz_localize(dtype.tz) else: # Numeric values are UTC at this point, # so localize and convert - value = value.tz_localize("UTC").tz_convert(dtype.tz) + value = dta.tz_localize("UTC").tz_convert(dtype.tz) elif is_timedelta64: value = to_timedelta(value, errors="raise")._values except OutOfBoundsDatetime: @@ -1621,12 +1619,7 @@ def maybe_cast_to_datetime( # only do this if we have an array and the dtype of the array is not # setup already we are not an integer/object, so don't bother with this # conversion - elif not ( - is_array - and not ( - issubclass(value.dtype.type, np.integer) or value.dtype == np.object_ - ) - ): + elif not is_array or value.dtype == object: value = maybe_infer_to_datetimelike(value) return value diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2d6cfff561aab..9112853c46b8b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4035,7 +4035,7 @@ def _sanitize_column(self, value): else: value = com.asarray_tuplesafe(value) elif isinstance(value, Index): - value = value.copy(deep=True) + value = value.copy(deep=True)._values else: value = value.copy() From 2ed2ac9adef1afa7a4deba97cf47281af4a27304 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 22 Feb 2021 07:52:40 -0800 Subject: [PATCH 5/5] mypy fixup --- pandas/core/dtypes/cast.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index cd3345b2a3238..531d784925e9d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1500,8 +1500,8 @@ def try_timedelta(v: np.ndarray) -> np.ndarray: def maybe_cast_to_datetime( - value: Union[ArrayLike, list], dtype: Optional[DtypeObj] -) -> Union[ArrayLike, list]: + value: Union[ExtensionArray, np.ndarray, list], dtype: Optional[DtypeObj] +) -> Union[ExtensionArray, np.ndarray, list]: """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT @@ -1599,6 +1599,8 @@ def maybe_cast_to_datetime( getattr(value, "dtype", None) ) and not is_datetime64_dtype(dtype): if is_object_dtype(dtype): + value = cast(np.ndarray, value) + if value.dtype != DT64NS_DTYPE: value = value.astype(DT64NS_DTYPE) ints = np.asarray(value).view("i8") @@ -1607,20 +1609,20 @@ def maybe_cast_to_datetime( # we have a non-castable dtype that was passed raise TypeError(f"Cannot cast datetime64 to {dtype}") - else: - - is_array = isinstance(value, np.ndarray) - - # catch a datetime/timedelta that is not of ns variety - # and no coercion specified - if is_array and value.dtype.kind in ["M", "m"]: + elif isinstance(value, np.ndarray): + if value.dtype.kind in ["M", "m"]: + # catch a datetime/timedelta that is not of ns variety + # and no coercion specified value = sanitize_to_nanoseconds(value) + elif value.dtype == object: + value = maybe_infer_to_datetimelike(value) + + else: # only do this if we have an array and the dtype of the array is not # setup already we are not an integer/object, so don't bother with this # conversion - elif not is_array or value.dtype == object: - value = maybe_infer_to_datetimelike(value) + value = maybe_infer_to_datetimelike(value) return value