From f953ff4a5655ef0e7a4c8351a0e4f8d5df0b63cf Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 29 Nov 2020 14:43:52 +0100 Subject: [PATCH 1/3] Fix performance problems for df.__setitem__ with list-like indexers --- asv_bench/benchmarks/indexing.py | 4 ++++ doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/indexing.py | 15 ++++----------- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 74e0a3a434cde..04b17090e27b7 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -358,6 +358,10 @@ def time_assign_with_setitem(self): for i in range(100): self.df[i] = np.random.randn(self.N) + def time_assign_list_like_with_setitem(self): + np.random.seed(1234) + self.df[list(range(100))] = np.random.randn(self.N, 100) + class ChainIndexing: diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 5d36c52da9f0d..ea81c2cc5f7d2 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -257,6 +257,7 @@ Other enhancements - Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`) - Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) - Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`) +- Improve performance for :meth:`DataFrame.__setitem__` with list-like indexers (:issue:`37954`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 6aa031af64833..16afbab6997ed 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -663,17 +663,10 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None): and not com.is_bool_indexer(key) and all(is_hashable(k) for k in key) ): - for i, k in enumerate(key): - if k not in self.obj: - if value is None: - self.obj[k] = np.nan - elif is_array_like(value) and value.ndim == 2: - # GH#37964 have to select columnwise in case of array - self.obj[k] = value[:, i] - elif is_list_like(value): - self.obj[k] = value[i] - else: - self.obj[k] = value + keys = self.obj.columns.tolist() + keys.extend([k for k in key if k not in self.obj]) + data = self.obj._mgr.reindex_axis(keys, 0) + object.__setattr__(self.obj, "_mgr", data) def __setitem__(self, key, value): if isinstance(key, tuple): From 322feb509810e38c4564bf46ba48813df0cb3f53 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 29 Nov 2020 15:25:35 +0100 Subject: [PATCH 2/3] Improve code --- pandas/core/indexing.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 16afbab6997ed..29666e709000c 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -665,8 +665,7 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None): ): keys = self.obj.columns.tolist() keys.extend([k for k in key if k not in self.obj]) - data = self.obj._mgr.reindex_axis(keys, 0) - object.__setattr__(self.obj, "_mgr", data) + self.obj._mgr = self.obj._mgr.reindex_axis(keys, 0) def __setitem__(self, key, value): if isinstance(key, tuple): From 07b1c52892ba5db9d0d727fd99b8b932ae5a4b56 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 29 Nov 2020 17:17:15 +0100 Subject: [PATCH 3/3] Address review comments --- asv_bench/benchmarks/indexing.py | 4 ++++ doc/source/whatsnew/v1.1.5.rst | 1 + 
doc/source/whatsnew/v1.2.0.rst | 1 - pandas/core/indexing.py | 3 +-- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 04b17090e27b7..4fd91c8aafe4b 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -362,6 +362,10 @@ def time_assign_list_like_with_setitem(self): np.random.seed(1234) self.df[list(range(100))] = np.random.randn(self.N, 100) + def time_assign_list_of_columns_concat(self): + df = DataFrame(np.random.randn(self.N, 100)) + concat([self.df, df], axis=1) + class ChainIndexing: diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index 46c4ad4f35fe4..958fc762903f4 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -23,6 +23,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`) - Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`) - Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`). +- Fixed performance regression for :meth:`DataFrame.__setitem__` with list-like indexers (:issue:`37954`) .. 
--------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ea81c2cc5f7d2..5d36c52da9f0d 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -257,7 +257,6 @@ Other enhancements - Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`) - Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) - Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`) -- Improve performance for :meth:`DataFrame.__setitem__` with list-like indexers (:issue:`37954`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 29666e709000c..3df96f2cbe40d 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -663,8 +663,7 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None): and not com.is_bool_indexer(key) and all(is_hashable(k) for k in key) ): - keys = self.obj.columns.tolist() - keys.extend([k for k in key if k not in self.obj]) + keys = self.obj.columns.union(key, sort=False) self.obj._mgr = self.obj._mgr.reindex_axis(keys, 0) def __setitem__(self, key, value):