# Patch commentary (free text ahead of the first "diff --git" header; not part of any hunk).
#
# Purpose: speed up DataFrame.__setitem__ with a list-like indexer by adding the
# missing columns in one reindex step instead of one column assignment per key.
#
# Files touched:
#   asv_bench/benchmarks/indexing.py
#       Adds two benchmarks: time_assign_list_like_with_setitem (assigns 100
#       columns at once through a list-like key) and
#       time_assign_list_of_columns_concat (builds a 100-column frame and
#       concatenates it along axis=1).
#   doc/source/whatsnew/v1.2.0.rst
#       Adds one bullet under "Performance improvements" (issue 37954).
#   pandas/core/indexing.py
#       In _ensure_listlike_indexer, removes the per-key loop that assigned
#       self.obj[k] for every key not yet in self.obj, and instead rebinds
#       self.obj._mgr to the result of reindex_axis over
#       self.obj.columns.union(key, sort=False), called with copy=False,
#       consolidate=False, only_slice=True.
#       NOTE(review): the removed loop filled new columns from `value`; the
#       added lines do not reference `value`. Presumably the caller writes the
#       values afterwards -- confirm against the full file.
#   pandas/core/internals/managers.py
#       reindex_axis gains keyword parameters consolidate (default True) and
#       only_slice (default False) and forwards both to reindex_indexer.
#       reindex_indexer gains only_slice (default False) and passes it to
#       _slice_take_blocks_ax0 on the axis == 0 branch.
#   pandas/tests/frame/indexing/test_setitem.py
#       Adds TestDataFrameSetItemWithExpansion.test_setitem_listlike_views: a
#       Series taken from df["a"] before a list-like setitem must still show
#       the value later written with df.iloc[0, 0] = 100.
#
# NOTE(review): line breaks, indentation and blank context lines below were
# restored from a whitespace-collapsed copy. They agree with every hunk's
# "-a,b +c,d" line counts, but exact context whitespace should be confirmed
# against the target tree before applying.
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index 74e0a3a434cde..4fd91c8aafe4b 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -358,6 +358,14 @@ def time_assign_with_setitem(self):
         for i in range(100):
             self.df[i] = np.random.randn(self.N)
 
+    def time_assign_list_like_with_setitem(self):
+        np.random.seed(1234)
+        self.df[list(range(100))] = np.random.randn(self.N, 100)
+
+    def time_assign_list_of_columns_concat(self):
+        df = DataFrame(np.random.randn(self.N, 100))
+        concat([self.df, df], axis=1)
+
 
 class ChainIndexing:
 
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 24db70481c136..270de56a000a0 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -548,6 +548,7 @@ Performance improvements
 - Performance improvement in :meth:`Series.astype` and :meth:`DataFrame.astype` for :class:`Categorical` (:issue:`8628`)
 - Performance improvement in :meth:`DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`Index.value_counts`)
 - Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements (:issue:`36611`)
+- Performance improvement for :meth:`DataFrame.__setitem__` with list-like indexers (:issue:`37954`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index f6cf691ea911c..e7cf8cae28b88 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -672,17 +672,12 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None):
             and not com.is_bool_indexer(key)
             and all(is_hashable(k) for k in key)
         ):
-            for i, k in enumerate(key):
-                if k not in self.obj:
-                    if value is None:
-                        self.obj[k] = np.nan
-                    elif is_array_like(value) and value.ndim == 2:
-                        # GH#37964 have to select columnwise in case of array
-                        self.obj[k] = value[:, i]
-                    elif is_list_like(value):
-                        self.obj[k] = value[i]
-                    else:
-                        self.obj[k] = value
+            # GH#38148
+            keys = self.obj.columns.union(key, sort=False)
+
+            self.obj._mgr = self.obj._mgr.reindex_axis(
+                keys, axis=0, copy=False, consolidate=False, only_slice=True
+            )
 
     def __setitem__(self, key, value):
         if isinstance(key, tuple):
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 168dba25ba29c..93ab207d8ce12 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -1236,6 +1236,8 @@ def reindex_axis(
         limit=None,
         fill_value=None,
         copy: bool = True,
+        consolidate: bool = True,
+        only_slice: bool = False,
     ):
         """
         Conform block manager to new index.
@@ -1246,7 +1248,13 @@ def reindex_axis(
         )
 
         return self.reindex_indexer(
-            new_index, indexer, axis=axis, fill_value=fill_value, copy=copy
+            new_index,
+            indexer,
+            axis=axis,
+            fill_value=fill_value,
+            copy=copy,
+            consolidate=consolidate,
+            only_slice=only_slice,
         )
 
     def reindex_indexer(
@@ -1258,6 +1266,7 @@ def reindex_indexer(
         allow_dups: bool = False,
         copy: bool = True,
         consolidate: bool = True,
+        only_slice: bool = False,
     ) -> T:
         """
         Parameters
@@ -1270,6 +1279,8 @@ def reindex_indexer(
         copy : bool, default True
         consolidate: bool, default True
             Whether to consolidate inplace before reindexing.
+        only_slice : bool, default False
+            Whether to take views, not copies, along columns.
 
         pandas-indexer with -1's only.
         """
@@ -1293,7 +1304,9 @@ def reindex_indexer(
             raise IndexError("Requested axis not found in manager")
 
         if axis == 0:
-            new_blocks = self._slice_take_blocks_ax0(indexer, fill_value=fill_value)
+            new_blocks = self._slice_take_blocks_ax0(
+                indexer, fill_value=fill_value, only_slice=only_slice
+            )
         else:
             new_blocks = [
                 blk.take_nd(
diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
index e4a66ea9133dd..884cb6c20b77e 100644
--- a/pandas/tests/frame/indexing/test_setitem.py
+++ b/pandas/tests/frame/indexing/test_setitem.py
@@ -319,6 +319,24 @@ def test_setitem_bool_with_numeric_index(self, dtype):
         tm.assert_index_equal(df.columns, expected_cols)
 
 
+class TestDataFrameSetItemWithExpansion:
+    def test_setitem_listlike_views(self):
+        # GH#38148
+        df = DataFrame({"a": [1, 2, 3], "b": [4, 4, 6]})
+
+        # get one column as a view of df
+        ser = df["a"]
+
+        # add columns with list-like indexer
+        df[["c", "d"]] = np.array([[0.1, 0.2], [0.3, 0.4], [0.4, 0.5]])
+
+        # edit in place the first column to check view semantics
+        df.iloc[0, 0] = 100
+
+        expected = Series([100, 2, 3], name="a")
+        tm.assert_series_equal(ser, expected)
+
+
 class TestDataFrameSetItemSlicing:
     def test_setitem_slice_position(self):
         # GH#31469