From 3cc9846e30524a9f9ad883a240ce4d7af2df6b5b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 16 Jun 2023 22:28:47 +0200 Subject: [PATCH 1/7] BUG: IntervalIndex.get_indexer raising for read only array --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/_libs/intervaltree.pxi.in | 4 ++-- pandas/tests/indexes/interval/test_indexing.py | 12 ++++++++++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index e2f0904a78cf9..997ddff91969e 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -400,7 +400,7 @@ Strings Interval ^^^^^^^^ -- +- :meth:`pd.IntervalIndex.get_indexer` and :meth:`pd.IntervalIndex.get_indexer_nonunique` raising if ``target`` is read-only array (:issue:`53703`) - Indexing diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index 67fee7c5fbadd..0b99aebbd3816 100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -125,7 +125,7 @@ cdef class IntervalTree(IntervalMixin): sort_order = self.left_sorter return is_monotonic(sort_order, False)[0] - def get_indexer(self, scalar_t[:] target) -> np.ndarray: + def get_indexer(self, ndarray[scalar_t, ndim=1] target) -> np.ndarray: """Return the positions corresponding to unique intervals that overlap with the given array of scalar targets. """ @@ -153,7 +153,7 @@ cdef class IntervalTree(IntervalMixin): old_len = result.data.n return result.to_array().astype('intp') - def get_indexer_non_unique(self, scalar_t[:] target): + def get_indexer_non_unique(self, ndarray[scalar_t, ndim=1] target): """Return the positions corresponding to intervals that overlap with the given array of scalar targets. Non-unique positions are repeated. """ diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index e7db8076efa2b..78e232fcc9f92 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -424,6 +424,18 @@ def test_get_indexer_interval_index(self, box): expected = np.array([-1, -1, -1], dtype=np.intp) tm.assert_numpy_array_equal(actual, expected) + def test_get_indexer_read_only(self): + idx = interval_range(start=0, end=5) + arr = np.array([1, 2]) + arr.flags.writeable = False + result = idx.get_indexer(arr) + expected = np.array([0, 1]) + tm.assert_numpy_array_equal(result, expected) + + result = idx.get_indexer_non_unique(arr)[0] + expected = np.array([0, 1]) + tm.assert_numpy_array_equal(result, expected) + class TestSliceLocs: def test_slice_locs_with_interval(self): From 6d050d89fa7c6f5a349894c9883b5e570b94a548 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 17 Jun 2023 00:06:42 +0200 Subject: [PATCH 2/7] Fix windows failure --- pandas/tests/indexes/interval/test_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index 78e232fcc9f92..e06c887f9f423 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -434,7 +434,7 @@ def test_get_indexer_read_only(self): result = idx.get_indexer_non_unique(arr)[0] expected = np.array([0, 1]) - tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected, check_dtype=False) class TestSliceLocs: From 4c2dac9bec0dc23cca45e8c1b3a8e7a36ae6cffe Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 17 Jun 2023 00:09:41 +0200 Subject: [PATCH 3/7] CoW: Return read-only array in Index.values --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/conftest.py | 2 +- pandas/core/indexes/base.py | 11 ++++++++++- pandas/core/indexes/datetimelike.py | 8 +++++++- pandas/tests/copy_view/index/test_datetimeindex.py | 9 +++++++++ pandas/tests/copy_view/index/test_index.py | 9 +++++++++ pandas/tests/copy_view/test_setitem.py | 9 +++++++-- pandas/tests/indexes/numeric/test_numeric.py | 10 +++++++--- pandas/tests/io/test_parquet.py | 9 ++++++++- 9 files changed, 59 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 997ddff91969e..96c9376062a76 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -19,6 +19,7 @@ Enhancements Copy-on-Write improvements ^^^^^^^^^^^^^^^^^^^^^^^^^^ +- Calling :meth:`Index.values` will now return a read-only NumPy array (:issue:`53704`) - Setting a :class:`Series` into a :class:`DataFrame` now creates a lazy instead of a deep copy (:issue:`53142`) .. _whatsnew_210.enhancements.enhancement2: diff --git a/pandas/conftest.py b/pandas/conftest.py index ed05ddd1b2f31..1dcfc88eb1bfd 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -685,7 +685,7 @@ def index_with_missing(request): # GH 35538. Use deep copy to avoid illusive bug on np-dev # GHA pipeline that writes into indices_dict despite copy ind = indices_dict[request.param].copy(deep=True) - vals = ind.values + vals = ind.values.copy() if request.param in ["tuples", "mi-with-dt64tz-level", "multi"]: # For setting missing values in the top level of MultiIndex vals = ind.tolist() diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2c6c6dd0e6ed1..86838ae3051ae 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -22,7 +22,10 @@ import numpy as np -from pandas._config import get_option +from pandas._config import ( + get_option, + using_copy_on_write, +) from pandas._libs import ( NaT, @@ -5055,6 +5058,12 @@ def values(self) -> ArrayLike: >>> idx.values array([1, 2, 3]) """ + if using_copy_on_write(): + data = self._data + if isinstance(data, np.ndarray): + data = data.view() + data.flags.writeable = False + return data return self._data @cache_readonly diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index d818e1e862c12..2125c1aef6162 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -18,6 +18,8 @@ import numpy as np +from pandas._config import using_copy_on_write + from pandas._libs import ( NaT, Timedelta, @@ -451,7 +453,11 @@ def _with_freq(self, freq): @property def values(self) -> np.ndarray: # NB: For Datetime64TZ this is lossy - return self._data._ndarray + data = self._data._ndarray + if using_copy_on_write(): + data = data.view() + data.flags.writeable = False + return data @doc(DatetimeIndexOpsMixin.shift) def shift(self, periods: int = 1, freq=None) -> Self: diff --git a/pandas/tests/copy_view/index/test_datetimeindex.py b/pandas/tests/copy_view/index/test_datetimeindex.py index f691d5589f48c..f54beca4cc414 100644 --- a/pandas/tests/copy_view/index/test_datetimeindex.py +++ b/pandas/tests/copy_view/index/test_datetimeindex.py @@ -54,3 +54,12 @@ def test_datetimeindex_isocalendar(using_copy_on_write): ser.iloc[0] = Timestamp("2020-12-31") if using_copy_on_write: tm.assert_index_equal(df.index, expected) + + +def test_index_values(using_copy_on_write): + idx = date_range("2019-12-31", periods=3, freq="D") + result = idx.values + if using_copy_on_write: + assert result.flags.writeable is False + else: + assert result.flags.writeable is True diff --git a/pandas/tests/copy_view/index/test_index.py b/pandas/tests/copy_view/index/test_index.py index 5e9c04c0adfc3..5e1d48f5dfdb3 100644 --- a/pandas/tests/copy_view/index/test_index.py +++ b/pandas/tests/copy_view/index/test_index.py @@ -153,3 +153,12 @@ def test_infer_objects(using_copy_on_write): view_.iloc[0, 0] = "aaaa" if using_copy_on_write: tm.assert_index_equal(idx, expected, check_names=False) + + +def test_index_values(using_copy_on_write): + idx = Index([1, 2, 3]) + result = idx.values + if using_copy_on_write: + assert result.flags.writeable is False + else: + assert result.flags.writeable is True diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py index 2a99a00e249fa..0c5a895953836 100644 --- a/pandas/tests/copy_view/test_setitem.py +++ b/pandas/tests/copy_view/test_setitem.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas import ( DataFrame, @@ -59,8 +60,12 @@ def test_set_column_with_index(using_copy_on_write): assert not np.shares_memory(get_array(df, "c"), idx.values) # and thus modifying the index does not modify the DataFrame - idx.values[0] = 0 - tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c")) + if using_copy_on_write: + with pytest.raises(ValueError, match="assignment"): + idx.values[0] = 0 + else: + idx.values[0] = 0 + tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c")) idx = RangeIndex(1, 4) arr = idx.values diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py index 4080dc7081771..92a8cad8e2a74 100644 --- a/pandas/tests/indexes/numeric/test_numeric.py +++ b/pandas/tests/indexes/numeric/test_numeric.py @@ -327,7 +327,7 @@ def test_constructor_from_list_no_dtype(self): index = Index([1, 2, 3]) assert index.dtype == np.int64 - def test_constructor(self, dtype): + def test_constructor(self, dtype, using_copy_on_write): index_cls = Index # scalar raise Exception @@ -347,8 +347,12 @@ def test_constructor(self, dtype): val = arr[0] + 3000 # this should not change index - arr[0] = val - assert new_index[0] != val + if not using_copy_on_write: + arr[0] = val + assert new_index[0] != val + else: + with pytest.raises(ValueError, match="assignment"): + arr[0] = val if dtype == np.int64: # pass list, coerce fine diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 57ef03b380601..10fce6b5bf43d 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -433,8 +433,12 @@ def test_read_columns(self, engine): df, engine, expected=expected, read_kwargs={"columns": ["string"]} ) - def test_write_index(self, engine): + def test_write_index(self, engine, using_copy_on_write, request): check_names = engine != "fastparquet" + if using_copy_on_write and engine == "fastparquet": + request.node.add_marker( + pytest.mark.xfail(reason="fastparquet write into index") + ) df = pd.DataFrame({"A": [1, 2, 3]}) check_round_trip(df, engine) @@ -1213,12 +1217,14 @@ def test_error_on_using_partition_cols_and_partition_on( partition_cols=partition_cols, ) + @pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index") def test_empty_dataframe(self, fp): # GH #27339 df = pd.DataFrame() expected = df.copy() check_round_trip(df, fp, expected=expected) + @pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index") def test_timezone_aware_index(self, fp, timezone_aware_date_list): idx = 5 * [timezone_aware_date_list] @@ -1328,6 +1334,7 @@ def test_invalid_dtype_backend(self, engine): with pytest.raises(ValueError, match=msg): read_parquet(path, dtype_backend="numpy") + @pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index") def test_empty_columns(self, fp): # GH 52034 df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name")) From 2e451775e3bf060a321b1b5acf8d432c9c32dd4f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 18 Jun 2023 16:29:33 +0200 Subject: [PATCH 4/7] Fix test --- pandas/tests/copy_view/test_setitem.py | 8 ++++++-- pandas/tests/indexes/interval/test_indexing.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py index 0c5a895953836..e3055600cf85c 100644 --- a/pandas/tests/copy_view/test_setitem.py +++ b/pandas/tests/copy_view/test_setitem.py @@ -73,8 +73,12 @@ def test_set_column_with_index(using_copy_on_write): df["d"] = idx assert not np.shares_memory(get_array(df, "d"), arr) - arr[0] = 0 - tm.assert_series_equal(df["d"], Series([1, 2, 3], name="d")) + if using_copy_on_write: + with pytest.raises(ValueError, match="assignment"): + arr[0] = 0 + else: + arr[0] = 0 + tm.assert_series_equal(df["d"], Series([1, 2, 3], name="d")) def test_set_columns_with_dataframe(using_copy_on_write): diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index e06c887f9f423..2a43e7329321f 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -430,7 +430,7 @@ def test_get_indexer_read_only(self): arr.flags.writeable = False result = idx.get_indexer(arr) expected = np.array([0, 1]) - tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected, check_dtype=False) result = idx.get_indexer_non_unique(arr)[0] expected = np.array([0, 1]) From 6e19b8d6e004f677202d719fc65bfdcceb367c54 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 19 Jun 2023 22:10:11 +0200 Subject: [PATCH 5/7] Update --- pandas/tests/copy_view/test_setitem.py | 9 --------- pandas/tests/indexes/numeric/test_numeric.py | 10 +++------- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py index e3055600cf85c..702a63104e1b8 100644 --- a/pandas/tests/copy_view/test_setitem.py +++ b/pandas/tests/copy_view/test_setitem.py @@ -1,5 +1,4 @@ import numpy as np -import pytest from pandas import ( DataFrame, @@ -59,14 +58,6 @@ def test_set_column_with_index(using_copy_on_write): # the index data is copied assert not np.shares_memory(get_array(df, "c"), idx.values) - # and thus modifying the index does not modify the DataFrame - if using_copy_on_write: - with pytest.raises(ValueError, match="assignment"): - idx.values[0] = 0 - else: - idx.values[0] = 0 - tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c")) - idx = RangeIndex(1, 4) arr = idx.values diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py index 92a8cad8e2a74..e465ac1e868d0 100644 --- a/pandas/tests/indexes/numeric/test_numeric.py +++ b/pandas/tests/indexes/numeric/test_numeric.py @@ -341,18 +341,14 @@ def test_constructor(self, dtype, using_copy_on_write): # copy # pass list, coerce fine index = index_cls([-5, 0, 1, 2], dtype=dtype) - arr = index.values + arr = index.values.copy() new_index = index_cls(arr, copy=True) tm.assert_index_equal(new_index, index, exact=True) val = arr[0] + 3000 # this should not change index - if not using_copy_on_write: - arr[0] = val - assert new_index[0] != val - else: - with pytest.raises(ValueError, match="assignment"): - arr[0] = val + arr[0] = val + assert new_index[0] != val if dtype == np.int64: # pass list, coerce fine From 2f1623a9a0b77082f7bd335870f89d51360d5d89 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 20 Jun 2023 09:36:36 +0200 Subject: [PATCH 6/7] Update pandas/tests/copy_view/test_setitem.py --- pandas/tests/copy_view/test_setitem.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py index 702a63104e1b8..5016b57bdd0b7 100644 --- a/pandas/tests/copy_view/test_setitem.py +++ b/pandas/tests/copy_view/test_setitem.py @@ -64,12 +64,6 @@ def test_set_column_with_index(using_copy_on_write): df["d"] = idx assert not np.shares_memory(get_array(df, "d"), arr) - if using_copy_on_write: - with pytest.raises(ValueError, match="assignment"): - arr[0] = 0 - else: - arr[0] = 0 - tm.assert_series_equal(df["d"], Series([1, 2, 3], name="d")) def test_set_columns_with_dataframe(using_copy_on_write): From 7fe0283c1ec695de9437320e53800270b4f37cfc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 20 Jun 2023 09:37:10 +0200 Subject: [PATCH 7/7] Update pandas/tests/indexes/numeric/test_numeric.py --- pandas/tests/indexes/numeric/test_numeric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py index e465ac1e868d0..977c7da7d866f 100644 --- a/pandas/tests/indexes/numeric/test_numeric.py +++ b/pandas/tests/indexes/numeric/test_numeric.py @@ -327,7 +327,7 @@ def test_constructor_from_list_no_dtype(self): index = Index([1, 2, 3]) assert index.dtype == np.int64 - def test_constructor(self, dtype, using_copy_on_write): + def test_constructor(self, dtype): index_cls = Index # scalar raise Exception