diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 46428dcf462ea..80d8f3380dd88 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -948,6 +948,8 @@ Performance improvements
 - Performance improvement in :meth:`.SeriesGroupBy.value_counts` with categorical dtype (:issue:`46202`)
 - Fixed a reference leak in :func:`read_hdf` (:issue:`37441`)
 - Fixed a memory leak in :meth:`DataFrame.to_json` and :meth:`Series.to_json` when serializing datetimes and timedeltas (:issue:`40443`)
+- Decreased memory usage in many :class:`DataFrameGroupBy` methods (:issue:`51090`)
+-
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.bug_fixes:
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 9cf93ea8eee2f..548420afcecb6 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -213,12 +213,12 @@ def ndim(self) -> int:
     @final
     @cache_readonly
     def _obj_with_exclusions(self):
-        if self._selection is not None and isinstance(self.obj, ABCDataFrame):
-            return self.obj[self._selection_list]
-
         if isinstance(self.obj, ABCSeries):
             return self.obj
 
+        if self._selection is not None:
+            return self.obj._getitem_nocopy(self._selection_list)
+
         if len(self.exclusions) > 0:
             # equivalent to `self.obj.drop(self.exclusions, axis=1)
             # but this avoids consolidating and making a copy
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 69c2476681021..a1536a671639e 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3663,6 +3663,25 @@ def _iter_column_arrays(self) -> Iterator[ArrayLike]:
         for i in range(len(self.columns)):
             yield self._get_column_array(i)
 
+    def _getitem_nocopy(self, key: list):
+        """
+        Behaves like __getitem__, but returns a view in cases where __getitem__
+        would make a copy.
+        """
+        # TODO(CoW): can be removed if/when we are always Copy-on-Write
+        indexer = self.columns._get_indexer_strict(key, "columns")[1]
+        new_axis = self.columns[indexer]
+
+        new_mgr = self._mgr.reindex_indexer(
+            new_axis,
+            indexer,
+            axis=0,
+            allow_dups=True,
+            copy=False,
+            only_slice=True,
+        )
+        return self._constructor(new_mgr)
+
     def __getitem__(self, key):
         check_dict_or_set_indexers(key)
         key = lib.item_from_zerodim(key)
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 2be73d21e3d88..eb7f0117b8e34 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -76,6 +76,7 @@ class providing the base-class of operations.
     is_bool_dtype,
     is_datetime64_dtype,
     is_float_dtype,
+    is_hashable,
     is_integer,
     is_integer_dtype,
     is_numeric_dtype,
@@ -723,13 +724,24 @@ def _get_index(self, name):
     @cache_readonly
     def _selected_obj(self):
         # Note: _selected_obj is always just `self.obj` for SeriesGroupBy
-
-        if self._selection is None or isinstance(self.obj, Series):
-            if self._group_selection is not None:
-                return self.obj[self._group_selection]
+        if isinstance(self.obj, Series):
             return self.obj
-        else:
-            return self.obj[self._selection]
+
+        if self._selection is not None:
+            if is_hashable(self._selection):
+                # i.e. a single key, so selecting it will return a Series.
+                # In this case, _obj_with_exclusions would wrap the key
+                # in a list and return a single-column DataFrame.
+                return self.obj[self._selection]
+
+            # Otherwise _selection is equivalent to _selection_list, so
+            # _selected_obj matches _obj_with_exclusions, so we can re-use
+            # that and avoid making a copy.
+            return self._obj_with_exclusions
+
+        if self._group_selection is not None:
+            return self._obj_with_exclusions
+        return self.obj
 
     @final
     def _dir_additions(self) -> set[str]: