From 6f52f7d8d2c795075869b9ca9970f433eb3f9949 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 30 Oct 2020 16:25:42 -0700 Subject: [PATCH 1/3] BUG: isin incorrectly casting ints to datetimes --- doc/source/whatsnew/v1.2.0.rst | 4 ++ pandas/core/algorithms.py | 6 +++ pandas/core/indexes/datetimelike.py | 28 ++++++++++++ pandas/tests/series/methods/test_isin.py | 55 ++++++++++++++++++++++++ 4 files changed, 93 insertions(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 4c970eea60f40..b063a6b679cd6 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -380,6 +380,10 @@ Datetimelike - :class:`Timestamp` and :class:`DatetimeIndex` comparisons between timezone-aware and timezone-naive objects now follow the standard library ``datetime`` behavior, returning ``True``/``False`` for ``!=``/``==`` and raising for inequality comparisons (:issue:`28507`) - Bug in :meth:`DatetimeIndex.equals` and :meth:`TimedeltaIndex.equals` incorrectly considering ``int64`` indexes as equal (:issue:`36744`) - Bug in :meth:`TimedeltaIndex.sum` and :meth:`Series.sum` with ``timedelta64`` dtype on an empty index or series returning ``NaT`` instead of ``Timedelta(0)`` (:issue:`31751`) +- Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`DatetimeIndex.isin` incorrectly casting integers to datetimes (:issue:`36621`) +- Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`DatetimeIndex.isin` failing to consider timezone-aware and timezone-naive datetimes as always different (:issue:`???`) +- Bug in :meth:`Series.isin` with ``PeriodDtype`` dtype and :meth:`PeriodIndex.isin` failing to consider arguments with different ``PeriodDtype`` as always different (:issue:`???`) + Timedelta ^^^^^^^^^ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index a310ec5312cf4..4abcd7de3ac51 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -430,6 +430,12 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: # handle categoricals return cast("Categorical", comps).isin(values) + if needs_i8_conversion(comps): + # Dispatch to DatetimeLikeIndexMixin.isin + from pandas import Index + + return Index(comps).isin(values) + comps, dtype = _ensure_data(comps) values, _ = _ensure_data(values, dtype=dtype) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 25c6de3d255e3..1c6b51775d74b 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -473,12 +473,40 @@ def isin(self, values, level=None): if level is not None: self._validate_index_level(level) + if not hasattr(values, "dtype"): + values = np.asarray(values) + + if values.dtype.kind in ["f", "i", "u", "c"]: + # TODO: de-duplicate with equals, validate_comparison_value + return np.zeros(self.shape, dtype=bool) + if not isinstance(values, type(self)): + inferrable = [ + "timedelta", + "timedelta64", + "datetime", + "datetime64", + "date", + "period", + ] + if values.dtype == object: + inferred = lib.infer_dtype(values, skipna=False) + if inferred not in inferrable: + if "mixed" in inferred: + return self.astype(object).isin(values) + return np.zeros(self.shape, dtype=bool) + try: values = type(self)(values) except ValueError: return self.astype(object).isin(values) + try: + self._data._check_compatible_with(values) + except (TypeError, ValueError): + # Includes tzawareness mismatch and IncompatibleFrequencyError + return np.zeros(self.shape, dtype=bool) + return algorithms.isin(self.asi8, values.asi8) @Appender(Index.where.__doc__) diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index 86ea2b2f02a4d..071b1f3f75f44 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -4,6 +4,7 @@ import pandas as pd from pandas import Series, date_range import pandas._testing as tm +from pandas.core.arrays import PeriodArray class TestSeriesIsIn: @@ -90,6 +91,60 @@ def test_isin_read_only(self): expected = Series([True, True, True]) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("dtype", [object, None]) + def test_isin_dt64_values_vs_ints(self, dtype): + # GH#36621 dont cast integers to datetimes for isin + dti = date_range("2013-01-01", "2013-01-05") + ser = Series(dti) + + comps = np.asarray([1356998400000000000], dtype=dtype) + + res = dti.isin(comps) + expected = np.array([False] * len(dti), dtype=bool) + tm.assert_numpy_array_equal(res, expected) + + res = ser.isin(comps) + tm.assert_series_equal(res, Series(expected)) + + res = pd.core.algorithms.isin(ser, comps) + tm.assert_numpy_array_equal(res, expected) + + def test_isin_tzawareness_mismatch(self): + dti = date_range("2013-01-01", "2013-01-05") + ser = Series(dti) + + other = dti.tz_localize("UTC") + + res = dti.isin(other) + expected = np.array([False] * len(dti), dtype=bool) + tm.assert_numpy_array_equal(res, expected) + + res = ser.isin(other) + tm.assert_series_equal(res, Series(expected)) + + res = pd.core.algorithms.isin(ser, other) + tm.assert_numpy_array_equal(res, expected) + + def test_isin_period_freq_mismatch(self): + dti = date_range("2013-01-01", "2013-01-05") + pi = dti.to_period("M") + ser = Series(pi) + + # We construct another PeriodIndex with the same i8 values + # but different dtype + dtype = dti.to_period("Y").dtype + other = PeriodArray._simple_new(pi.asi8, dtype=dtype) + + res = pi.isin(other) + expected = np.array([False] * len(pi), dtype=bool) + tm.assert_numpy_array_equal(res, expected) + + res = ser.isin(other) + tm.assert_series_equal(res, Series(expected)) + + res = pd.core.algorithms.isin(ser, other) + tm.assert_numpy_array_equal(res, expected) + @pytest.mark.slow def test_isin_large_series_mixed_dtypes_and_nan(): From 023b4373893d0459f192d14760a18a100bb7b87c Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 30 Oct 2020 16:27:10 -0700 Subject: [PATCH 2/3] GH ref --- doc/source/whatsnew/v1.2.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index b063a6b679cd6..d9379057ecc5d 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -381,8 +381,8 @@ Datetimelike - Bug in :meth:`DatetimeIndex.equals` and :meth:`TimedeltaIndex.equals` incorrectly considering ``int64`` indexes as equal (:issue:`36744`) - Bug in :meth:`TimedeltaIndex.sum` and :meth:`Series.sum` with ``timedelta64`` dtype on an empty index or series returning ``NaT`` instead of ``Timedelta(0)`` (:issue:`31751`) - Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`DatetimeIndex.isin` incorrectly casting integers to datetimes (:issue:`36621`) -- Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`DatetimeIndex.isin` failing to consider timezone-aware and timezone-naive datetimes as always different (:issue:`???`) -- Bug in :meth:`Series.isin` with ``PeriodDtype`` dtype and :meth:`PeriodIndex.isin` failing to consider arguments with different ``PeriodDtype`` as always different (:issue:`???`) +- Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`DatetimeIndex.isin` failing to consider timezone-aware and timezone-naive datetimes as always different (:issue:`35728`) +- Bug in :meth:`Series.isin` with ``PeriodDtype`` dtype and :meth:`PeriodIndex.isin` failing to consider arguments with different ``PeriodDtype`` as always different (:issue:`37528`) Timedelta From ca75491e5c4c5abd96c3cfd9df1f73ea89285a4d Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 21 Nov 2020 10:03:53 -0800 Subject: [PATCH 3/3] add asvs --- asv_bench/benchmarks/series_methods.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 3b65bccd48aee..2db46abca119c 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -2,7 +2,7 @@ import numpy as np -from pandas import NaT, Series, date_range +from pandas import Categorical, NaT, Series, date_range from .pandas_vb_common import tm @@ -36,6 +36,28 @@ def time_isin(self, dtypes): self.s.isin(self.values) +class IsInDatetime64: + def setup(self): + dti = date_range( + start=datetime(2015, 10, 26), end=datetime(2016, 1, 1), freq="50s" + ) + self.ser = Series(dti) + self.subset = self.ser._values[::3] + self.cat_subset = Categorical(self.subset) + + def time_isin(self): + self.ser.isin(self.subset) + + def time_isin_cat_values(self): + self.ser.isin(self.cat_subset) + + def time_isin_mismatched_dtype(self): + self.ser.isin([1, 2]) + + def time_isin_empty(self): + self.ser.isin([]) + + class IsInFloat64: def setup(self): self.small = Series([1, 2], dtype=np.float64)