From fa3f428f3919efe7133ccfb5eb19480c77ed8203 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 11 Mar 2021 11:57:27 -0800 Subject: [PATCH 1/6] PERF: self._name instead of self.name --- pandas/core/indexes/base.py | 8 ++++---- pandas/core/indexes/category.py | 2 +- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/indexes/extension.py | 2 +- pandas/core/indexes/numeric.py | 2 +- pandas/core/indexes/range.py | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9543b11ad4de1..27e54f3a980b7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -665,7 +665,7 @@ def _shallow_copy(self: _IndexT, values, name: Hashable = no_default) -> _IndexT values : the values to create the new Index, optional name : Label, defaults to self.name """ - name = self.name if name is no_default else name + name = self._name if name is no_default else name return self._simple_new(values, name=name) @@ -673,7 +673,7 @@ def _view(self: _IndexT) -> _IndexT: """ fastpath to make a shallow copy, i.e. new object with same data. """ - result = self._simple_new(self._values, name=self.name) + result = self._simple_new(self._values, name=self._name) result._cache = self._cache return result @@ -4623,7 +4623,7 @@ def __getitem__(self, key): # pessimization of basic indexing. result = getitem(key) # Going through simple_new for performance. - return type(self)._simple_new(result, name=self.name) + return type(self)._simple_new(result, name=self._name) if com.is_bool_indexer(key): key = np.asarray(key, dtype=bool) @@ -4639,7 +4639,7 @@ def __getitem__(self, key): return result # NB: Using _constructor._simple_new would break if MultiIndex # didn't override __getitem__ - return self._constructor._simple_new(result, name=self.name) + return self._constructor._simple_new(result, name=self._name) else: return result diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index a38ef55614638..138d1edebedd4 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -239,7 +239,7 @@ def _shallow_copy( values: Categorical, name: Hashable = no_default, ): - name = self.name if name is no_default else name + name = self._name if name is no_default else name if values is not None: # In tests we only get here with Categorical objects that diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 793dd041fbf6f..d8e744a0b79d5 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -649,7 +649,7 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin): def _with_freq(self, freq): arr = self._data._with_freq(freq) - return type(self)._simple_new(arr, name=self.name) + return type(self)._simple_new(arr, name=self._name) @property def _has_complex_internals(self) -> bool: diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 4c15e9df534ba..e1856621bbd7d 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -250,7 +250,7 @@ def __getitem__(self, key): result = self._data[key] if isinstance(result, type(self._data)): if result.ndim == 1: - return type(self)(result, name=self.name) + return type(self)(result, name=self._name) # Unpack to ndarray for MPL compat result = result._ndarray diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index b6f476d864011..b3e4abc6c4040 100644 --- a/pandas/core/indexes/numeric.py +++ 
b/pandas/core/indexes/numeric.py @@ -125,7 +125,7 @@ def _maybe_cast_slice_bound(self, label, side: str, kind): @doc(Index._shallow_copy) def _shallow_copy(self, values, name: Hashable = lib.no_default): if not self._can_hold_na and values.dtype.kind == "f": - name = self.name if name is lib.no_default else name + name = self._name if name is lib.no_default else name # Ensure we are not returning an Int64Index with float data: return Float64Index._simple_new(values, name=name) return super()._shallow_copy(values=values, name=name) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 05bb32dad6cab..1aa892146e2f3 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -457,7 +457,7 @@ def _shallow_copy(self, values, name: Hashable = no_default): return Int64Index._simple_new(values, name=name) def _view(self: RangeIndex) -> RangeIndex: - result = type(self)._simple_new(self._range, name=self.name) + result = type(self)._simple_new(self._range, name=self._name) result._cache = self._cache return result @@ -808,7 +808,7 @@ def __getitem__(self, key): """ if isinstance(key, slice): new_range = self._range[key] - return self._simple_new(new_range, name=self.name) + return self._simple_new(new_range, name=self._name) elif is_integer(key): new_key = int(key) try: From 85d24f78a4f55921e1873d09482ddd22a4191ef2 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Mar 2021 15:45:06 -0800 Subject: [PATCH 2/6] PERF/TYP lookup attributes instead of properties --- pandas/_libs/index.pyx | 3 +++ pandas/core/array_algos/take.py | 6 ++++-- pandas/core/internals/blocks.py | 31 ++++++++++++++++--------------- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index cb7b9f990a98e..f6f36f6ad523b 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1,5 +1,7 @@ import warnings +cimport cython + import numpy as np cimport numpy as cnp @@ -47,6 +49,7 @@ cdef inline bint is_definitely_invalid_key(object val): _SIZE_CUTOFF = 1_000_000 +@cython.freelist(32) cdef class IndexEngine: cdef readonly: diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index 110b47a11c3a9..a8d0a7cbfd17a 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -256,7 +256,9 @@ def take_2d_multi( @functools.lru_cache(maxsize=128) -def _get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis): +def _get_take_nd_function_cached( + ndim: int, arr_dtype: np.dtype, out_dtype: np.dtype, axis: int +): """ Part of _get_take_nd_function below that doesn't need `mask_info` and thus can be cached (mask_info potentially contains a numpy ndarray which is not @@ -289,7 +291,7 @@ def _get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis): def _get_take_nd_function( - ndim: int, arr_dtype, out_dtype, axis: int = 0, mask_info=None + ndim: int, arr_dtype: np.dtype, out_dtype: np.dtype, axis: int = 0, mask_info=None ): """ Get the appropriate "take" implementation for the given dimension, axis diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1bcddee4d726e..fc95851987def 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -276,7 +276,7 @@ def make_block(self, values, placement=None) -> Block: not specified """ if placement is None: - placement = self.mgr_locs + placement = self._mgr_locs if self.is_extension: values = ensure_block_shape(values, ndim=self.ndim) @@ -286,7 +286,7 @@ def make_block(self, 
values, placement=None) -> Block: def make_block_same_class(self, values, placement=None) -> Block: """ Wrap given values in a block of same type as self. """ if placement is None: - placement = self.mgr_locs + placement = self._mgr_locs return type(self)(values, placement=placement, ndim=self.ndim) @final @@ -330,7 +330,7 @@ def getitem_block(self, slicer, new_mgr_locs=None) -> Block: """ if new_mgr_locs is None: axis0_slicer = slicer[0] if isinstance(slicer, tuple) else slicer - new_mgr_locs = self.mgr_locs[axis0_slicer] + new_mgr_locs = self._mgr_locs[axis0_slicer] elif not isinstance(new_mgr_locs, BlockPlacement): new_mgr_locs = BlockPlacement(new_mgr_locs) @@ -370,7 +370,7 @@ def delete(self, loc) -> None: Delete given loc(-s) from block in-place. """ self.values = np.delete(self.values, loc, 0) - self.mgr_locs = self.mgr_locs.delete(loc) + self.mgr_locs = self._mgr_locs.delete(loc) @final def apply(self, func, **kwargs) -> List[Block]: @@ -411,7 +411,7 @@ def _split_op_result(self, result) -> List[Block]: # TODO(EA2D): unnecessary with 2D EAs # if we get a 2D ExtensionArray, we need to split it into 1D pieces nbs = [] - for i, loc in enumerate(self.mgr_locs): + for i, loc in enumerate(self._mgr_locs): vals = result[i] block = self.make_block(values=vals, placement=loc) nbs.append(block) @@ -473,7 +473,7 @@ def _split(self) -> List[Block]: assert self.ndim == 2 new_blocks = [] - for i, ref_loc in enumerate(self.mgr_locs): + for i, ref_loc in enumerate(self._mgr_locs): vals = self.values[slice(i, i + 1)] nb = self.make_block(vals, BlockPlacement(ref_loc)) @@ -523,12 +523,12 @@ def make_a_block(nv, ref_loc): nv = f(mask, new_values, None) else: nv = new_values if inplace else new_values.copy() - block = make_a_block(nv, self.mgr_locs) + block = make_a_block(nv, self._mgr_locs) return [block] # ndim > 1 new_blocks = [] - for i, ref_loc in enumerate(self.mgr_locs): + for i, ref_loc in enumerate(self._mgr_locs): m = mask[i] v = new_values[i] @@ -1263,7 +1263,7 @@ def take_nd( # this assertion assert not (axis == 0 and new_mgr_locs is None) if new_mgr_locs is None: - new_mgr_locs = self.mgr_locs + new_mgr_locs = self._mgr_locs if not is_dtype_equal(new_values.dtype, self.dtype): return self.make_block(new_values, new_mgr_locs) @@ -1371,7 +1371,7 @@ def where(self, other, cond, errors="raise", axis: int = 0) -> List[Block]: result = cast(np.ndarray, result) # EABlock overrides where taken = result.take(m.nonzero()[0], axis=axis) r = maybe_downcast_numeric(taken, self.dtype) - nb = self.make_block(r.T, placement=self.mgr_locs[m]) + nb = self.make_block(r.T, placement=self._mgr_locs[m]) result_blocks.append(nb) return result_blocks @@ -1432,7 +1432,7 @@ def quantile( result = quantile_compat(self.values, qs, interpolation, axis) - return new_block(result, placement=self.mgr_locs, ndim=2) + return new_block(result, placement=self._mgr_locs, ndim=2) class ExtensionBlock(Block): @@ -1458,7 +1458,7 @@ def shape(self) -> Shape: # TODO(EA2D): override unnecessary with 2D EAs if self.ndim == 1: return (len(self.values),) - return len(self.mgr_locs), len(self.values) + return len(self._mgr_locs), len(self.values) def iget(self, col): @@ -1626,7 +1626,7 @@ def take_nd( # this assertion assert not (self.ndim == 1 and new_mgr_locs is None) if new_mgr_locs is None: - new_mgr_locs = self.mgr_locs + new_mgr_locs = self._mgr_locs return self.make_block_same_class(new_values, new_mgr_locs) @@ -1662,7 +1662,7 @@ def _slice(self, slicer): ) # GH#32959 only full-slicers along fake-dim0 are valid # TODO(EA2D): 
won't be necessary with 2D EAs - new_locs = self.mgr_locs[first] + new_locs = self._mgr_locs[first] if len(new_locs): # effectively slice(None) slicer = slicer[1] @@ -1773,9 +1773,10 @@ def _unstack(self, unstacker, fill_value, new_placement): # TODO: in all tests we have mask.all(); can we rely on that? blocks = [ + # TODO: could cast to object depending on fill_value? self.make_block_same_class( self.values.take(indices, allow_fill=True, fill_value=fill_value), - [place], + BlockPlacement(place), ) for indices, place in zip(new_values.T, new_placement) ] From 493cfb44dfc603cb58429a92dd22603757a92b6d Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Mar 2021 15:47:03 -0800 Subject: [PATCH 3/6] annotations --- pandas/core/indexes/multi.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 7bb3dc5ab4545..0d89e75c097c1 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1718,7 +1718,7 @@ def unique(self, level=None): level = self._get_level_number(level) return self._get_level_values(level=level, unique=True) - def to_frame(self, index=True, name=None) -> DataFrame: + def to_frame(self, index: bool = True, name=None) -> DataFrame: """ Create a DataFrame with the levels of the MultiIndex as columns. @@ -2123,7 +2123,12 @@ def _getitem_slice(self: MultiIndex, slobj: slice) -> MultiIndex: @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take( - self: MultiIndex, indices, axis=0, allow_fill=True, fill_value=None, **kwargs + self: MultiIndex, + indices, + axis: int = 0, + allow_fill: bool = True, + fill_value=None, + **kwargs, ) -> MultiIndex: nv.validate_take((), kwargs) indices = ensure_platform_int(indices) @@ -3647,7 +3652,7 @@ def _intersection(self, other, sort=False) -> MultiIndex: zip(*uniq_tuples), sortorder=0, names=result_names ) - def _difference(self, other, sort): + def _difference(self, other, sort) -> MultiIndex: other, result_names = self._convert_can_do_setop(other) this = self._get_unique_index() @@ -3705,7 +3710,7 @@ def symmetric_difference(self, other, result_name=None, sort=None): # -------------------------------------------------------------------- @doc(Index.astype) - def astype(self, dtype, copy=True): + def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype) if is_categorical_dtype(dtype): msg = "> 1 ndim Categorical are not supported at this time" From 8e9efe1af9c1f73e70a0241e4a10efa682c2007c Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Mar 2021 15:52:59 -0800 Subject: [PATCH 4/6] docstring improvement, check for ndarray instead of EA --- pandas/_libs/missing.pyx | 10 ++++++++++ pandas/core/array_algos/putmask.py | 4 +--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index d2f47c9d25496..bd749d6eca18e 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -104,6 +104,7 @@ cpdef bint checknull(object val): - np.datetime64 representation of NaT - np.timedelta64 representation of NaT - NA + - Decimal("NaN") Parameters ---------- @@ -143,6 +144,8 @@ cpdef bint checknull_old(object val): - NaT - np.datetime64 representation of NaT - np.timedelta64 representation of NaT + - NA + - Decimal("NaN") Parameters ---------- @@ -175,6 +178,8 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr): - NaT - np.datetime64 representation of NaT - np.timedelta64 representation of NaT + - NA + - Decimal("NaN") Parameters ---------- @@ -211,6 +216,7 @@ def 
isnaobj_old(arr: ndarray) -> ndarray: - NEGINF - NaT - NA + - Decimal("NaN") Parameters ---------- @@ -249,6 +255,8 @@ def isnaobj2d(arr: ndarray) -> ndarray: - NaT - np.datetime64 representation of NaT - np.timedelta64 representation of NaT + - NA + - Decimal("NaN") Parameters ---------- @@ -293,6 +301,8 @@ def isnaobj2d_old(arr: ndarray) -> ndarray: - NaT - np.datetime64 representation of NaT - np.timedelta64 representation of NaT + - NA + - Decimal("NaN") Parameters ---------- diff --git a/pandas/core/array_algos/putmask.py b/pandas/core/array_algos/putmask.py index 3daf1b3ae3902..023957e92551c 100644 --- a/pandas/core/array_algos/putmask.py +++ b/pandas/core/array_algos/putmask.py @@ -24,8 +24,6 @@ ) from pandas.core.dtypes.missing import isna_compat -from pandas.core.arrays import ExtensionArray - def putmask_inplace(values: ArrayLike, mask: np.ndarray, value: Any) -> None: """ @@ -187,7 +185,7 @@ def extract_bool_array(mask: ArrayLike) -> np.ndarray: """ If we have a SparseArray or BooleanArray, convert it to ndarray[bool]. """ - if isinstance(mask, ExtensionArray): + if not isinstance(mask, np.ndarray): # We could have BooleanArray, Sparse[bool], ... # Except for BooleanArray, this is equivalent to just # np.asarray(mask, dtype=bool) From 6086542aa87162c05ace5ca6cff1765d2648851f Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Mar 2021 15:55:22 -0800 Subject: [PATCH 5/6] BUG: fix raising corner cases --- pandas/compat/pickle_compat.py | 2 +- pandas/util/_exceptions.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 9d48035213126..25ebd3d3ddc62 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -49,7 +49,7 @@ def load_reduce(self): return except TypeError: pass - elif args and issubclass(args[0], BaseOffset): + elif args and isinstance(args[0], type) and issubclass(args[0], BaseOffset): # TypeError: object.__new__(Day) is not safe, use Day.__new__() cls = args[0] stack[-1] = cls.__new__(*args) diff --git a/pandas/util/_exceptions.py b/pandas/util/_exceptions.py index c31c421ee1445..e70c185628f71 100644 --- a/pandas/util/_exceptions.py +++ b/pandas/util/_exceptions.py @@ -11,6 +11,8 @@ def rewrite_exception(old_name: str, new_name: str): try: yield except Exception as err: + if not err.args: + raise msg = str(err.args[0]) msg = msg.replace(old_name, new_name) args: Tuple[str, ...] = (msg,) From 1a3de1fc25884029854bf4bf8eff4d5f4e414f2a Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Mar 2021 16:29:37 -0800 Subject: [PATCH 6/6] revert --- pandas/core/array_algos/putmask.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/array_algos/putmask.py b/pandas/core/array_algos/putmask.py index 023957e92551c..3daf1b3ae3902 100644 --- a/pandas/core/array_algos/putmask.py +++ b/pandas/core/array_algos/putmask.py @@ -24,6 +24,8 @@ ) from pandas.core.dtypes.missing import isna_compat +from pandas.core.arrays import ExtensionArray + def putmask_inplace(values: ArrayLike, mask: np.ndarray, value: Any) -> None: """ @@ -185,7 +187,7 @@ def extract_bool_array(mask: ArrayLike) -> np.ndarray: """ If we have a SparseArray or BooleanArray, convert it to ndarray[bool]. """ - if not isinstance(mask, np.ndarray): + if isinstance(mask, ExtensionArray): # We could have BooleanArray, Sparse[bool], ... # Except for BooleanArray, this is equivalent to just # np.asarray(mask, dtype=bool)
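
The recurring change across PATCH 1/6 and PATCH 2/6 is to read the backing attribute directly (self._name, self._mgr_locs) on hot paths instead of going through the public property (self.name, self.mgr_locs), so each access skips the property's descriptor call. Below is a minimal, standalone sketch of that cost; the Indexish class is an illustrative stand-in, not pandas code, and absolute timings will vary by machine.

import timeit


class Indexish:
    """Toy stand-in: a plain attribute wrapped by a read-only property."""

    def __init__(self, name):
        self._name = name

    @property
    def name(self):
        # Mirrors the pattern the patches bypass: the property adds a
        # descriptor call on top of the plain attribute lookup.
        return self._name


obj = Indexish("x")
n = 1_000_000

via_property = timeit.timeit(lambda: obj.name, number=n)
via_attribute = timeit.timeit(lambda: obj._name, number=n)

print(f"property access : {via_property:.3f}s for {n:,} lookups")
print(f"attribute access: {via_attribute:.3f}s for {n:,} lookups")

The same reasoning motivates the mgr_locs -> _mgr_locs swaps in blocks.py, while the @cython.freelist(32) decorator on IndexEngine and the functools.lru_cache wrapper around _get_take_nd_function_cached in PATCH 2/6 instead avoid repeated object allocation and repeated dtype-based dispatch.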