diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py
index 297681f1e10f5..2a2671374efc4 100644
--- a/pandas/core/groupby/categorical.py
+++ b/pandas/core/groupby/categorical.py
@@ -1,7 +1,4 @@
-from typing import (
-    Optional,
-    Tuple,
-)
+from __future__ import annotations

 import numpy as np

@@ -16,7 +13,7 @@


 def recode_for_groupby(
     c: Categorical, sort: bool, observed: bool
-) -> Tuple[Categorical, Optional[Categorical]]:
+) -> tuple[Categorical, Categorical | None]:
     """
     Code the categories to ensure we can groupby for categoricals.
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index fd4b0cfa87950..8c402cf345095 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -366,7 +366,7 @@ def array_func(values: ArrayLike) -> ArrayLike:
                 )
             except NotImplementedError:
                 ser = Series(values)  # equiv 'obj' from outer frame
-                if self.grouper.ngroups > 0:
+                if self.ngroups > 0:
                     res_values, _ = self.grouper.agg_series(ser, alt)
                 else:
                     # equiv: res_values = self._python_agg_general(alt)
@@ -604,12 +604,12 @@ def _transform_fast(self, result) -> Series:
         """
         fast version of transform, only applicable to builtin/cythonizable functions
         """
-        ids, _, ngroup = self.grouper.group_info
+        ids, _, _ = self.grouper.group_info
         result = result.reindex(self.grouper.result_index, copy=False)
         out = algorithms.take_nd(result._values, ids)
         return self.obj._constructor(out, index=self.obj.index, name=self.obj.name)

-    def filter(self, func, dropna=True, *args, **kwargs):
+    def filter(self, func, dropna: bool = True, *args, **kwargs):
         """
         Return a copy of a Series excluding elements from groups that
         do not satisfy the boolean criterion specified by func.
@@ -1445,7 +1445,7 @@ def _transform_fast(self, result: DataFrame) -> DataFrame:
         obj = self._obj_with_exclusions

         # for each col, reshape to size of original frame by take operation
-        ids, _, ngroup = self.grouper.group_info
+        ids, _, _ = self.grouper.group_info
         result = result.reindex(self.grouper.result_index, copy=False)
         output = [
             algorithms.take_nd(result.iloc[:, i].values, ids)
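Note on the typing changes above: with `from __future__ import annotations` (PEP 563), annotations are stored as strings and never evaluated at runtime, which is what makes the PEP 585/604 spellings `tuple[...]` and `Categorical | None` safe on the older Python versions pandas still supported at the time. A minimal standalone illustration (not pandas code; the function name is made up):

```python
from __future__ import annotations  # PEP 563: lazy, string-valued annotations


def first_or_none(values: list[int]) -> int | None:
    # list[int] and int | None are fine here even on Python 3.7/3.8,
    # because the annotation is only stored as text, never evaluated.
    return values[0] if values else None


print(first_or_none.__annotations__)
# {'values': 'list[int]', 'return': 'int | None'}
```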
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 55c3bfd316462..620668dadc32d 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1103,13 +1103,13 @@ def _numba_prep(self, func, data):
             raise NotImplementedError(
                 "Numba engine can only be used with a single function."
             )
-        labels, _, n_groups = self.grouper.group_info
-        sorted_index = get_group_index_sorter(labels, n_groups)
-        sorted_labels = algorithms.take_nd(labels, sorted_index, allow_fill=False)
+        ids, _, ngroups = self.grouper.group_info
+        sorted_index = get_group_index_sorter(ids, ngroups)
+        sorted_ids = algorithms.take_nd(ids, sorted_index, allow_fill=False)

         sorted_data = data.take(sorted_index, axis=self.axis).to_numpy()

-        starts, ends = lib.generate_slices(sorted_labels, n_groups)
+        starts, ends = lib.generate_slices(sorted_ids, ngroups)
         return starts, ends, sorted_index, sorted_data

     @final
@@ -1253,11 +1253,12 @@ def _python_agg_general(self, func, *args, **kwargs):
         # iterate through "columns" ex exclusions to populate output dict
         output: dict[base.OutputKey, ArrayLike] = {}

+        if self.ngroups == 0:
+            # agg_series below assumes ngroups > 0
+            return self._python_apply_general(f, self._selected_obj)
+
         for idx, obj in enumerate(self._iterate_slices()):
             name = obj.name
-            if self.grouper.ngroups == 0:
-                # agg_series below assumes ngroups > 0
-                continue

             try:
                 # if this function is invalid for this dtype, we will ignore it.
@@ -1368,7 +1369,7 @@ def _apply_filter(self, indices, dropna):
         return filtered

     @final
-    def _cumcount_array(self, ascending: bool = True):
+    def _cumcount_array(self, ascending: bool = True) -> np.ndarray:
         """
         Parameters
         ----------
@@ -2721,7 +2722,7 @@ def _get_cythonized_result(

         grouper = self.grouper

-        labels, _, ngroups = grouper.group_info
+        ids, _, ngroups = grouper.group_info
         output: dict[base.OutputKey, np.ndarray] = {}
         base_func = getattr(libgroupby, how)

@@ -2754,15 +2755,15 @@ def _get_cythonized_result(
             if pre_processing:
                 try:
                     vals, inferences = pre_processing(vals)
-                except TypeError as e:
-                    error_msg = str(e)
+                except TypeError as err:
+                    error_msg = str(err)
                     continue
                 vals = vals.astype(cython_dtype, copy=False)
                 if needs_2d:
                     vals = vals.reshape((-1, 1))
                 func = partial(func, vals)

-            func = partial(func, labels)
+            func = partial(func, ids)

             if min_count is not None:
                 func = partial(func, min_count)
diff --git a/pandas/core/groupby/numba_.py b/pandas/core/groupby/numba_.py
index 3ba70baec1561..26070fcb5e89c 100644
--- a/pandas/core/groupby/numba_.py
+++ b/pandas/core/groupby/numba_.py
@@ -1,11 +1,10 @@
 """Common utilities for Numba operations with groupby ops"""
+from __future__ import annotations
+
 import inspect
 from typing import (
     Any,
     Callable,
-    Dict,
-    Optional,
-    Tuple,
 )

 import numpy as np
@@ -57,10 +56,10 @@ def f(values, index, ...):


 def generate_numba_agg_func(
-    args: Tuple,
-    kwargs: Dict[str, Any],
+    args: tuple,
+    kwargs: dict[str, Any],
     func: Callable[..., Scalar],
-    engine_kwargs: Optional[Dict[str, bool]],
+    engine_kwargs: dict[str, bool] | None,
 ) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], np.ndarray]:
     """
     Generate a numba jitted agg function specified by values from engine_kwargs.
@@ -117,10 +116,10 @@ def group_agg(


 def generate_numba_transform_func(
-    args: Tuple,
-    kwargs: Dict[str, Any],
+    args: tuple,
+    kwargs: dict[str, Any],
     func: Callable[..., np.ndarray],
-    engine_kwargs: Optional[Dict[str, bool]],
+    engine_kwargs: dict[str, bool] | None,
 ) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], np.ndarray]:
     """
     Generate a numba jitted transform function specified by values from engine_kwargs.
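Note on `_numba_prep`: the renamed `ids`/`ngroups` pipeline sorts the rows by group id so that each group becomes one contiguous block, then computes per-group start/end offsets for the jitted kernel. A rough standalone sketch of that idea in plain NumPy (`get_group_index_sorter` and `lib.generate_slices` are pandas Cython internals; `argsort` and `bincount` stand in for them here, assuming every id is in `[0, ngroups)`):

```python
import numpy as np

ids = np.array([1, 0, 2, 0, 1, 2])
ngroups = 3

sorted_index = np.argsort(ids, kind="stable")  # stable sort, like the groupsort indexer
sorted_ids = ids[sorted_index]                 # now [0, 0, 1, 1, 2, 2]

counts = np.bincount(sorted_ids, minlength=ngroups)
ends = counts.cumsum()
starts = ends - counts  # group g occupies sorted rows starts[g]:ends[g]

for g in range(ngroups):
    # original row positions belonging to group g
    print(g, sorted_index[starts[g]:ends[g]])
```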
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 9edbeb412026d..975a902f49db9 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -723,8 +723,8 @@ def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> DataSplitter:

         __finalize__ has not been called for the subsetted objects returned.
         """
-        comp_ids, _, ngroups = self.group_info
-        return get_splitter(data, comp_ids, ngroups, axis=axis)
+        ids, _, ngroups = self.group_info
+        return get_splitter(data, ids, ngroups, axis=axis)

     def _get_grouper(self):
         """
@@ -740,10 +740,10 @@ def _get_group_keys(self):
         if len(self.groupings) == 1:
             return self.levels[0]
         else:
-            comp_ids, _, ngroups = self.group_info
+            ids, _, ngroups = self.group_info

             # provide "flattened" iterator for multi-group setting
-            return get_flattened_list(comp_ids, ngroups, self.levels, self.codes)
+            return get_flattened_list(ids, ngroups, self.levels, self.codes)

     @final
     def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
@@ -846,9 +846,9 @@ def size(self) -> Series:
         """
         Compute group sizes.
         """
-        ids, _, ngroup = self.group_info
-        if ngroup:
-            out = np.bincount(ids[ids != -1], minlength=ngroup)
+        ids, _, ngroups = self.group_info
+        if ngroups:
+            out = np.bincount(ids[ids != -1], minlength=ngroups)
         else:
             out = []
         return Series(out, index=self.result_index, dtype="int64")
@@ -882,11 +882,11 @@ def group_info(self):
     @cache_readonly
     def codes_info(self) -> np.ndarray:
         # return the codes of items in original grouped axis
-        codes, _, _ = self.group_info
+        ids, _, _ = self.group_info
         if self.indexer is not None:
-            sorter = np.lexsort((codes, self.indexer))
-            codes = codes[sorter]
-        return codes
+            sorter = np.lexsort((ids, self.indexer))
+            ids = ids[sorter]
+        return ids

     @final
     def _get_compressed_codes(self) -> tuple[np.ndarray, np.ndarray]:
@@ -906,8 +906,8 @@ def ngroups(self) -> int:
     @property
     def reconstructed_codes(self) -> list[np.ndarray]:
         codes = self.codes
-        comp_ids, obs_ids, _ = self.group_info
-        return decons_obs_group_ids(comp_ids, obs_ids, self.shape, codes, xnull=True)
+        ids, obs_ids, _ = self.group_info
+        return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True)

     @cache_readonly
     def result_index(self) -> Index:
@@ -954,13 +954,13 @@ def _cython_operation(

         cy_op = WrappedCythonOp(kind=kind, how=how)

-        comp_ids, _, _ = self.group_info
+        ids, _, _ = self.group_info
         ngroups = self.ngroups
         return cy_op.cython_operation(
             values=values,
             axis=axis,
             min_count=min_count,
-            comp_ids=comp_ids,
+            comp_ids=ids,
             ngroups=ngroups,
             **kwargs,
         )
@@ -997,26 +997,26 @@ def _aggregate_series_fast(
         #  - ngroups != 0
         func = com.is_builtin_func(func)

-        group_index, _, ngroups = self.group_info
+        ids, _, ngroups = self.group_info

         # avoids object / Series creation overhead
-        indexer = get_group_index_sorter(group_index, ngroups)
+        indexer = get_group_index_sorter(ids, ngroups)
         obj = obj.take(indexer)
-        group_index = group_index.take(indexer)
-        sgrouper = libreduction.SeriesGrouper(obj, func, group_index, ngroups)
+        ids = ids.take(indexer)
+        sgrouper = libreduction.SeriesGrouper(obj, func, ids, ngroups)
         result, counts = sgrouper.get_result()
         return result, counts

     @final
     def _aggregate_series_pure_python(self, obj: Series, func: F):
-        group_index, _, ngroups = self.group_info
+        ids, _, ngroups = self.group_info

         counts = np.zeros(ngroups, dtype=int)
         result = np.empty(ngroups, dtype="O")
         initialized = False

         # equiv: splitter = self._get_splitter(obj, axis=0)
-        splitter = get_splitter(obj, group_index, ngroups, axis=0)
+        splitter = get_splitter(obj, ids, ngroups, axis=0)

         for i, group in enumerate(splitter):
@@ -1152,7 +1152,7 @@ def indices(self):

     @cache_readonly
     def group_info(self):
         ngroups = self.ngroups
-        obs_group_ids = np.arange(ngroups)
+        obs_group_ids = np.arange(ngroups, dtype=np.int64)
         rep = np.diff(np.r_[0, self.bins])
         rep = ensure_platform_int(rep)
@@ -1163,7 +1163,7 @@ def group_info(self):

         return (
             ensure_platform_int(comp_ids),
-            obs_group_ids.astype("int64", copy=False),
+            obs_group_ids,
             ngroups,
         )
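Note on the `size()` cleanup in ops.py: `ids` carries one group id per original row, with `-1` marking rows dropped from grouping (e.g. NaN keys), so filtering `ids != -1` before `np.bincount(..., minlength=ngroups)` counts group sizes while keeping unobserved groups at zero. A small self-contained sketch (made-up data, not the pandas method itself):

```python
import numpy as np

# One group id per row; -1 flags rows excluded from grouping (NaN keys).
ids = np.array([0, 2, -1, 0, 2])
ngroups = 4  # includes two groups that never appear in ids

out = np.bincount(ids[ids != -1], minlength=ngroups)
print(out)  # [2 0 2 0] -> sizes per group, empty groups kept as 0
```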