From 9ac3bab57e76c95c0b1cf8dd1346a816bf14c921 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 15 Feb 2021 22:12:54 +0000 Subject: [PATCH 01/37] checknull forceobj=True --- pandas/_libs_numba/__init__.py | 0 pandas/_libs_numba/missing.py | 542 ++++++++++++++++++++++++++++ pandas/core/dtypes/missing.py | 3 +- pandas/tests/api/test_api.py | 1 + pandas/tests/dtypes/test_missing.py | 6 + setup.cfg | 2 +- 6 files changed, 552 insertions(+), 2 deletions(-) create mode 100644 pandas/_libs_numba/__init__.py create mode 100644 pandas/_libs_numba/missing.py diff --git a/pandas/_libs_numba/__init__.py b/pandas/_libs_numba/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/_libs_numba/missing.py b/pandas/_libs_numba/missing.py new file mode 100644 index 0000000000000..998ff5fe586b1 --- /dev/null +++ b/pandas/_libs_numba/missing.py @@ -0,0 +1,542 @@ +# import numbers + +# import cython +# from cython import Py_ssize_t +# import numpy as np + +# cimport numpy as cnp +# from numpy cimport float64_t, int64_t, ndarray, uint8_t + +# cnp.import_array() + +# from pandas._libs cimport util +# from pandas._libs.tslibs.nattype cimport ( +# c_NaT as NaT, +# checknull_with_nat, +# is_null_datetimelike, +# ) +# from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value # noqa + +# from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op +# from pandas.compat import IS64 +import numba + +from pandas._libs.missing import NA +from pandas._libs.tslibs import is_null_datetimelike + +# cdef: +# float64_t INF = np.inf +# float64_t NEGINF = -INF + +# int64_t NPY_NAT = util.get_nat() + +# bint is_32bit = not IS64 + + +# cpdef bint is_matching_na(object left, object right, bint nan_matches_none=False): +# """ +# Check if two scalars are both NA of matching types. + +# Parameters +# ---------- +# left : Any +# right : Any +# nan_matches_none : bool, default False +# For backwards compatibility, consider NaN as matching None. + +# Returns +# ------- +# bool +# """ +# if left is None: +# if nan_matches_none and util.is_nan(right): +# return True +# return right is None +# elif left is C_NA: +# return right is C_NA +# elif left is NaT: +# return right is NaT +# elif util.is_float_object(left): +# if nan_matches_none and right is None: +# return True +# return ( +# util.is_nan(left) +# and util.is_float_object(right) +# and util.is_nan(right) +# ) +# elif util.is_complex_object(left): +# return ( +# util.is_nan(left) +# and util.is_complex_object(right) +# and util.is_nan(right) +# ) +# elif util.is_datetime64_object(left): +# return ( +# get_datetime64_value(left) == NPY_NAT +# and util.is_datetime64_object(right) +# and get_datetime64_value(right) == NPY_NAT +# ) +# elif util.is_timedelta64_object(left): +# return ( +# get_timedelta64_value(left) == NPY_NAT +# and util.is_timedelta64_object(right) +# and get_timedelta64_value(right) == NPY_NAT +# ) +# return False + + +@numba.jit(forceobj=True) +def checknull(val: object) -> bool: + """ + Return boolean describing of the input is NA-like, defined here as any + of: + - None + - nan + - NaT + - np.datetime64 representation of NaT + - np.timedelta64 representation of NaT + - NA + + Parameters + ---------- + val : object + + Returns + ------- + bool + + Notes + ----- + The difference between `checknull` and `checknull_old` is that `checknull` + does *not* consider INF or NEGINF to be NA. 
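+
+    Examples
+    --------
+    A doctest-style sketch of the scalars this treats as NA-like:
+
+    >>> import numpy as np
+    >>> checknull(np.nan)
+    True
+    >>> checknull(None)
+    True
+    >>> checknull(np.inf)  # INF is not NA here
+    False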
+ """ + return val is NA or is_null_datetimelike(val, inat_is_null=False) + + +# cpdef bint checknull_old(object val): +# """ +# Return boolean describing of the input is NA-like, defined here as any +# of: +# - None +# - nan +# - INF +# - NEGINF +# - NaT +# - np.datetime64 representation of NaT +# - np.timedelta64 representation of NaT + +# Parameters +# ---------- +# val : object + +# Returns +# ------- +# result : bool + +# Notes +# ----- +# The difference between `checknull` and `checknull_old` is that `checknull` +# does *not* consider INF or NEGINF to be NA. +# """ +# if checknull(val): +# return True +# elif util.is_float_object(val) or util.is_complex_object(val): +# return val == INF or val == NEGINF +# return False + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +# cpdef ndarray[uint8_t] isnaobj(ndarray arr): +# """ +# Return boolean mask denoting which elements of a 1-D array are na-like, +# according to the criteria defined in `checknull`: +# - None +# - nan +# - NaT +# - np.datetime64 representation of NaT +# - np.timedelta64 representation of NaT + +# Parameters +# ---------- +# arr : ndarray + +# Returns +# ------- +# result : ndarray (dtype=np.bool_) +# """ +# cdef: +# Py_ssize_t i, n +# object val +# ndarray[uint8_t] result + +# assert arr.ndim == 1, "'arr' must be 1-D." + +# n = len(arr) +# result = np.empty(n, dtype=np.uint8) +# for i in range(n): +# val = arr[i] +# result[i] = checknull(val) +# return result.view(np.bool_) + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +# def isnaobj_old(arr: ndarray) -> ndarray: +# """ +# Return boolean mask denoting which elements of a 1-D array are na-like, +# defined as being any of: +# - None +# - nan +# - INF +# - NEGINF +# - NaT +# - NA + +# Parameters +# ---------- +# arr : ndarray + +# Returns +# ------- +# result : ndarray (dtype=np.bool_) +# """ +# cdef: +# Py_ssize_t i, n +# object val +# ndarray[uint8_t] result + +# assert arr.ndim == 1, "'arr' must be 1-D." + +# n = len(arr) +# result = np.zeros(n, dtype=np.uint8) +# for i in range(n): +# val = arr[i] +# result[i] = ( +# checknull(val) +# or util.is_float_object(val) and (val == INF or val == NEGINF) +# ) +# return result.view(np.bool_) + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +# def isnaobj2d(arr: ndarray) -> ndarray: +# """ +# Return boolean mask denoting which elements of a 2-D array are na-like, +# according to the criteria defined in `checknull`: +# - None +# - nan +# - NaT +# - np.datetime64 representation of NaT +# - np.timedelta64 representation of NaT + +# Parameters +# ---------- +# arr : ndarray + +# Returns +# ------- +# result : ndarray (dtype=np.bool_) + +# Notes +# ----- +# The difference between `isnaobj2d` and `isnaobj2d_old` is that `isnaobj2d` +# does *not* consider INF or NEGINF to be NA. +# """ +# cdef: +# Py_ssize_t i, j, n, m +# object val +# ndarray[uint8_t, ndim=2] result + +# assert arr.ndim == 2, "'arr' must be 2-D." 
+ +# n, m = (arr).shape +# result = np.zeros((n, m), dtype=np.uint8) +# for i in range(n): +# for j in range(m): +# val = arr[i, j] +# if checknull(val): +# result[i, j] = 1 +# return result.view(np.bool_) + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +# def isnaobj2d_old(arr: ndarray) -> ndarray: +# """ +# Return boolean mask denoting which elements of a 2-D array are na-like, +# according to the criteria defined in `checknull_old`: +# - None +# - nan +# - INF +# - NEGINF +# - NaT +# - np.datetime64 representation of NaT +# - np.timedelta64 representation of NaT + +# Parameters +# ---------- +# arr : ndarray + +# Returns +# ------- +# ndarray (dtype=np.bool_) + +# Notes +# ----- +# The difference between `isnaobj2d` and `isnaobj2d_old` is that `isnaobj2d` +# does *not* consider INF or NEGINF to be NA. +# """ +# cdef: +# Py_ssize_t i, j, n, m +# object val +# ndarray[uint8_t, ndim=2] result + +# assert arr.ndim == 2, "'arr' must be 2-D." + +# n, m = (arr).shape +# result = np.zeros((n, m), dtype=np.uint8) +# for i in range(n): +# for j in range(m): +# val = arr[i, j] +# if checknull_old(val): +# result[i, j] = 1 +# return result.view(np.bool_) + + +# def isposinf_scalar(val: object) -> bool: +# return util.is_float_object(val) and val == INF + + +# def isneginf_scalar(val: object) -> bool: +# return util.is_float_object(val) and val == NEGINF + + +# cdef inline bint is_null_datetime64(v): +# # determine if we have a null for a datetime (or integer versions), +# # excluding np.timedelta64('nat') +# if checknull_with_nat(v): +# return True +# elif util.is_datetime64_object(v): +# return get_datetime64_value(v) == NPY_NAT +# return False + + +# cdef inline bint is_null_timedelta64(v): +# # determine if we have a null for a timedelta (or integer versions), +# # excluding np.datetime64('nat') +# if checknull_with_nat(v): +# return True +# elif util.is_timedelta64_object(v): +# return get_timedelta64_value(v) == NPY_NAT +# return False + + +# cdef bint checknull_with_nat_and_na(object obj): +# # See GH#32214 +# return checknull_with_nat(obj) or obj is C_NA + + +# # ----------------------------------------------------------------------------- +# # Implementation of NA singleton + + +# def _create_binary_propagating_op(name, is_divmod=False): + +# def method(self, other): +# if (other is C_NA or isinstance(other, str) +# or isinstance(other, (numbers.Number, np.bool_)) +# or isinstance(other, np.ndarray) and not other.shape): +# # Need the other.shape clause to handle NumPy scalars, +# # since we do a setitem on `out` below, which +# # won't work for NumPy scalars. +# if is_divmod: +# return NA, NA +# else: +# return NA + +# elif isinstance(other, np.ndarray): +# out = np.empty(other.shape, dtype=object) +# out[:] = NA + +# if is_divmod: +# return out, out.copy() +# else: +# return out + +# return NotImplemented + +# method.__name__ = name +# return method + + +# def _create_unary_propagating_op(name): +# def method(self): +# return NA + +# method.__name__ = name +# return method + + +# cdef class C_NAType: +# pass + + +# class NAType(C_NAType): +# """ +# NA ("not available") missing value indicator. + +# .. warning:: + +# Experimental: the behaviour of NA can still change without warning. + +# .. versionadded:: 1.0.0 + +# The NA singleton is a missing value indicator defined by pandas. It is +# used in certain new extension dtypes (currently the "string" dtype). 
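+
+#     Examples
+#     --------
+#     Illustrative only (assumes ``pd`` is the public pandas namespace;
+#     output shown as in the released pandas API):
+
+#     >>> pd.NA + 1
+#     <NA>
+#     >>> bool(pd.NA)
+#     Traceback (most recent call last):
+#     ...
+#     TypeError: boolean value of NA is ambiguous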
+# """ + +# _instance = None + +# def __new__(cls, *args, **kwargs): +# if NAType._instance is None: +# NAType._instance = C_NAType.__new__(cls, *args, **kwargs) +# return NAType._instance + +# def __repr__(self) -> str: +# return "" + +# def __format__(self, format_spec) -> str: +# try: +# return self.__repr__().__format__(format_spec) +# except ValueError: +# return self.__repr__() + +# def __bool__(self): +# raise TypeError("boolean value of NA is ambiguous") + +# def __hash__(self): +# # GH 30013: Ensure hash is large enough to avoid hash collisions with integers +# exponent = 31 if is_32bit else 61 +# return 2 ** exponent - 1 + +# def __reduce__(self): +# return "NA" + +# # Binary arithmetic and comparison ops -> propagate + +# __add__ = _create_binary_propagating_op("__add__") +# __radd__ = _create_binary_propagating_op("__radd__") +# __sub__ = _create_binary_propagating_op("__sub__") +# __rsub__ = _create_binary_propagating_op("__rsub__") +# __mul__ = _create_binary_propagating_op("__mul__") +# __rmul__ = _create_binary_propagating_op("__rmul__") +# __matmul__ = _create_binary_propagating_op("__matmul__") +# __rmatmul__ = _create_binary_propagating_op("__rmatmul__") +# __truediv__ = _create_binary_propagating_op("__truediv__") +# __rtruediv__ = _create_binary_propagating_op("__rtruediv__") +# __floordiv__ = _create_binary_propagating_op("__floordiv__") +# __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__") +# __mod__ = _create_binary_propagating_op("__mod__") +# __rmod__ = _create_binary_propagating_op("__rmod__") +# __divmod__ = _create_binary_propagating_op("__divmod__", is_divmod=True) +# __rdivmod__ = _create_binary_propagating_op("__rdivmod__", is_divmod=True) +# # __lshift__ and __rshift__ are not implemented + +# __eq__ = _create_binary_propagating_op("__eq__") +# __ne__ = _create_binary_propagating_op("__ne__") +# __le__ = _create_binary_propagating_op("__le__") +# __lt__ = _create_binary_propagating_op("__lt__") +# __gt__ = _create_binary_propagating_op("__gt__") +# __ge__ = _create_binary_propagating_op("__ge__") + +# # Unary ops + +# __neg__ = _create_unary_propagating_op("__neg__") +# __pos__ = _create_unary_propagating_op("__pos__") +# __abs__ = _create_unary_propagating_op("__abs__") +# __invert__ = _create_unary_propagating_op("__invert__") + +# # pow has special +# def __pow__(self, other): +# if other is C_NA: +# return NA +# elif isinstance(other, (numbers.Number, np.bool_)): +# if other == 0: +# # returning positive is correct for +/- 0. 
+# return type(other)(1) +# else: +# return NA +# elif isinstance(other, np.ndarray): +# return np.where(other == 0, other.dtype.type(1), NA) + +# return NotImplemented + +# def __rpow__(self, other): +# if other is C_NA: +# return NA +# elif isinstance(other, (numbers.Number, np.bool_)): +# if other == 1: +# return other +# else: +# return NA +# elif isinstance(other, np.ndarray): +# return np.where(other == 1, other, NA) +# return NotImplemented + +# # Logical ops using Kleene logic + +# def __and__(self, other): +# if other is False: +# return False +# elif other is True or other is C_NA: +# return NA +# return NotImplemented + +# __rand__ = __and__ + +# def __or__(self, other): +# if other is True: +# return True +# elif other is False or other is C_NA: +# return NA +# return NotImplemented + +# __ror__ = __or__ + +# def __xor__(self, other): +# if other is False or other is True or other is C_NA: +# return NA +# return NotImplemented + +# __rxor__ = __xor__ + +# __array_priority__ = 1000 +# _HANDLED_TYPES = (np.ndarray, numbers.Number, str, np.bool_) + +# def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): +# types = self._HANDLED_TYPES + (NAType,) +# for x in inputs: +# if not isinstance(x, types): +# return NotImplemented + +# if method != "__call__": +# raise ValueError(f"ufunc method '{method}' not supported for NA") +# result = maybe_dispatch_ufunc_to_dunder_op( +# self, ufunc, method, *inputs, **kwargs +# ) +# if result is NotImplemented: +# # For a NumPy ufunc that's not a binop, like np.logaddexp +# index = [i for i, x in enumerate(inputs) if x is NA][0] +# result = np.broadcast_arrays(*inputs)[index] +# if result.ndim == 0: +# result = result.item() +# if ufunc.nout > 1: +# result = (NA,) * ufunc.nout + +# return result + + +# C_NA = NAType() # C-visible +# NA = C_NA # Python-visible diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index ef645313de614..59e1348506d23 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -10,6 +10,7 @@ from pandas._libs import lib import pandas._libs.missing as libmissing from pandas._libs.tslibs import NaT, Period, iNaT +import pandas._libs_numba.missing as libmissing_numba from pandas._typing import ArrayLike, DtypeObj from pandas.core.dtypes.common import ( @@ -149,7 +150,7 @@ def _isna(obj, inf_as_na: bool = False): if inf_as_na: return libmissing.checknull_old(obj) else: - return libmissing.checknull(obj) + return libmissing_numba.checknull(obj) # hack (for now) because MI registers as ndarray elif isinstance(obj, ABCMultiIndex): raise NotImplementedError("isna is not defined for MultiIndex") diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 541c2988a0636..dbc0daaa9a841 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -191,6 +191,7 @@ class TestPDApi(Base): "_hashtable", "_lib", "_libs", + "_libs_numba", "_np_version_under1p17", "_np_version_under1p18", "_is_numpy_dev", diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index f6566d205e65c..f84560992b7bd 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -9,6 +9,7 @@ from pandas._libs import missing as libmissing from pandas._libs.tslibs import iNaT, is_null_datetimelike +from pandas._libs_numba import missing as libmissing_numba from pandas.core.dtypes.common import is_float, is_scalar from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype, PeriodDtype @@ -605,18 +606,23 @@ 
class TestLibMissing: def test_checknull(self): for value in na_vals: assert libmissing.checknull(value) + assert libmissing_numba.checknull(value) for value in inf_vals: assert not libmissing.checknull(value) + assert not libmissing_numba.checknull(value) for value in int_na_vals: assert not libmissing.checknull(value) + assert not libmissing_numba.checknull(value) for value in sometimes_na_vals: assert not libmissing.checknull(value) + assert not libmissing_numba.checknull(value) for value in never_na_vals: assert not libmissing.checknull(value) + assert not libmissing_numba.checknull(value) def test_checknull_old(self): for value in na_vals: diff --git a/setup.cfg b/setup.cfg index a6d636704664e..1c019bc21c9d6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -108,7 +108,7 @@ directory = coverage_html_report # To be kept consistent with "Import Formatting" section in contributing.rst [isort] known_pre_libs = pandas._config -known_pre_core = pandas._libs,pandas._typing,pandas.util._*,pandas.compat,pandas.errors +known_pre_core = pandas._libs,pandas._libs_numba,pandas._typing,pandas.util._*,pandas.compat,pandas.errors known_dtypes = pandas.core.dtypes known_post_core = pandas.tseries,pandas.io,pandas.plotting sections = FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER From b029014782bd110cc8b786148983a28d37b29784 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 16 Feb 2021 09:56:40 +0000 Subject: [PATCH 02/37] wip --- pandas/_libs_numba/algos.py | 1450 +++++++++++++++++++++++++++++++++++ 1 file changed, 1450 insertions(+) create mode 100644 pandas/_libs_numba/algos.py diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py new file mode 100644 index 0000000000000..32a5b186d8290 --- /dev/null +++ b/pandas/_libs_numba/algos.py @@ -0,0 +1,1450 @@ +# import cython +# from cython import Py_ssize_t + +# from libc.math cimport fabs, sqrt +# from libc.stdlib cimport free, malloc +# from libc.string cimport memmove + +# import numpy as np + +# cimport numpy as cnp +# from numpy cimport ( +# NPY_FLOAT32, +# NPY_FLOAT64, +# NPY_INT8, +# NPY_INT16, +# NPY_INT32, +# NPY_INT64, +# NPY_OBJECT, +# NPY_UINT8, +# NPY_UINT16, +# NPY_UINT32, +# NPY_UINT64, +# float32_t, +# float64_t, +# int8_t, +# int16_t, +# int32_t, +# int64_t, +# intp_t, +# ndarray, +# uint8_t, +# uint16_t, +# uint32_t, +# uint64_t, +# ) + +# cnp.import_array() + +# cimport pandas._libs.util as util +# from pandas._libs.khash cimport ( +# kh_destroy_int64, +# kh_get_int64, +# kh_init_int64, +# kh_int64_t, +# kh_put_int64, +# kh_resize_int64, +# khiter_t, +# ) +# from pandas._libs.util cimport get_nat, numeric + +# import pandas._libs.missing as missing + +# cdef: +# float64_t FP_ERR = 1e-13 +# float64_t NaN = np.NaN +# int64_t NPY_NAT = get_nat() + +# tiebreakers = { +# "average": TIEBREAK_AVERAGE, +# "min": TIEBREAK_MIN, +# "max": TIEBREAK_MAX, +# "first": TIEBREAK_FIRST, +# "dense": TIEBREAK_DENSE, +# } + + +# cdef inline bint are_diff(object left, object right): +# try: +# return fabs(left - right) > FP_ERR +# except TypeError: +# return left != right + + +# class Infinity: +# """ +# Provide a positive Infinity comparison method for ranking. 
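+
+#     Examples
+#     --------
+#     Illustrative comparisons (any non-NA value sorts below Infinity):
+
+#     >>> Infinity() > 10 ** 9
+#     True
+#     >>> Infinity() == Infinity()
+#     True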
+# """ +# __lt__ = lambda self, other: False +# __le__ = lambda self, other: isinstance(other, Infinity) +# __eq__ = lambda self, other: isinstance(other, Infinity) +# __ne__ = lambda self, other: not isinstance(other, Infinity) +# __gt__ = lambda self, other: (not isinstance(other, Infinity) and +# not missing.checknull(other)) +# __ge__ = lambda self, other: not missing.checknull(other) + + +# class NegInfinity: +# """ +# Provide a negative Infinity comparison method for ranking. +# """ +# __lt__ = lambda self, other: (not isinstance(other, NegInfinity) and +# not missing.checknull(other)) +# __le__ = lambda self, other: not missing.checknull(other) +# __eq__ = lambda self, other: isinstance(other, NegInfinity) +# __ne__ = lambda self, other: not isinstance(other, NegInfinity) +# __gt__ = lambda self, other: False +# __ge__ = lambda self, other: isinstance(other, NegInfinity) + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +# cpdef ndarray[int64_t, ndim=1] unique_deltas(const int64_t[:] arr): +# """ +# Efficiently find the unique first-differences of the given array. + +# Parameters +# ---------- +# arr : ndarray[in64_t] + +# Returns +# ------- +# ndarray[int64_t] +# An ordered ndarray[int64_t] +# """ +# cdef: +# Py_ssize_t i, n = len(arr) +# int64_t val +# khiter_t k +# kh_int64_t *table +# int ret = 0 +# list uniques = [] +# ndarray[int64_t, ndim=1] result + +# table = kh_init_int64() +# kh_resize_int64(table, 10) +# for i in range(n - 1): +# val = arr[i + 1] - arr[i] +# k = kh_get_int64(table, val) +# if k == table.n_buckets: +# kh_put_int64(table, val, &ret) +# uniques.append(val) +# kh_destroy_int64(table) + +# result = np.array(uniques, dtype=np.int64) +# result.sort() +# return result + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +# def is_lexsorted(list_of_arrays: list) -> bint: +# cdef: +# Py_ssize_t i +# Py_ssize_t n, nlevels +# int64_t k, cur, pre +# ndarray arr +# bint result = True + +# nlevels = len(list_of_arrays) +# n = len(list_of_arrays[0]) + +# cdef int64_t **vecs = malloc(nlevels * sizeof(int64_t*)) +# for i in range(nlevels): +# arr = list_of_arrays[i] +# assert arr.dtype.name == 'int64' +# vecs[i] = cnp.PyArray_DATA(arr) + +# # Assume uniqueness?? +# with nogil: +# for i in range(1, n): +# for k in range(nlevels): +# cur = vecs[k][i] +# pre = vecs[k][i -1] +# if cur == pre: +# continue +# elif cur > pre: +# break +# else: +# result = False +# break +# free(vecs) +# return result + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): +# """ +# Compute a 1-d indexer. + +# The indexer is an ordering of the passed index, +# ordered by the groups. + +# Parameters +# ---------- +# index: int64 ndarray +# Mappings from group -> position. +# ngroups: int64 +# Number of groups. + +# Returns +# ------- +# tuple +# 1-d indexer ordered by groups, group counts. + +# Notes +# ----- +# This is a reverse of the label factorization process. 
+# """ +# cdef: +# Py_ssize_t i, loc, label, n +# ndarray[int64_t] counts, where, result + +# counts = np.zeros(ngroups + 1, dtype=np.int64) +# n = len(index) +# result = np.zeros(n, dtype=np.int64) +# where = np.zeros(ngroups + 1, dtype=np.int64) + +# with nogil: + +# # count group sizes, location 0 for NA +# for i in range(n): +# counts[index[i] + 1] += 1 + +# # mark the start of each contiguous group of like-indexed data +# for i in range(1, ngroups + 1): +# where[i] = where[i - 1] + counts[i - 1] + +# # this is our indexer +# for i in range(n): +# label = index[i] + 1 +# result[where[label]] = i +# where[label] += 1 + +# return result, counts + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def kth_smallest(numeric[:] a, Py_ssize_t k) -> numeric: +# cdef: +# Py_ssize_t i, j, l, m, n = a.shape[0] +# numeric x + +# with nogil: +# l = 0 +# m = n - 1 + +# while l < m: +# x = a[k] +# i = l +# j = m + +# while 1: +# while a[i] < x: i += 1 +# while x < a[j]: j -= 1 +# if i <= j: +# swap(&a[i], &a[j]) +# i += 1; j -= 1 + +# if i > j: break + +# if j < k: l = i +# if k < i: m = j +# return a[k] + + +# # ---------------------------------------------------------------------- +# # Pairwise correlation/covariance + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): +# cdef: +# Py_ssize_t i, j, xi, yi, N, K +# bint minpv +# ndarray[float64_t, ndim=2] result +# ndarray[uint8_t, ndim=2] mask +# int64_t nobs = 0 +# float64_t vx, vy, meanx, meany, divisor, prev_meany, prev_meanx, ssqdmx +# float64_t ssqdmy, covxy + +# N, K = (mat).shape + +# if minp is None: +# minpv = 1 +# else: +# minpv = minp + +# result = np.empty((K, K), dtype=np.float64) +# mask = np.isfinite(mat).view(np.uint8) + +# with nogil: +# for xi in range(K): +# for yi in range(xi + 1): +# # Welford's method for the variance-calculation +# # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance +# nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0 +# for i in range(N): +# if mask[i, xi] and mask[i, yi]: +# vx = mat[i, xi] +# vy = mat[i, yi] +# nobs += 1 +# prev_meanx = meanx +# prev_meany = meany +# meanx = meanx + 1 / nobs * (vx - meanx) +# meany = meany + 1 / nobs * (vy - meany) +# ssqdmx = ssqdmx + (vx - meanx) * (vx - prev_meanx) +# ssqdmy = ssqdmy + (vy - meany) * (vy - prev_meany) +# covxy = covxy + (vx - meanx) * (vy - prev_meany) + +# if nobs < minpv: +# result[xi, yi] = result[yi, xi] = NaN +# else: +# divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy) + +# if divisor != 0: +# result[xi, yi] = result[yi, xi] = covxy / divisor +# else: +# result[xi, yi] = result[yi, xi] = NaN + +# return result + +# # ---------------------------------------------------------------------- +# # Pairwise Spearman correlation + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray: +# cdef: +# Py_ssize_t i, j, xi, yi, N, K +# ndarray[float64_t, ndim=2] result +# ndarray[float64_t, ndim=2] ranked_mat +# ndarray[float64_t, ndim=1] maskedx +# ndarray[float64_t, ndim=1] maskedy +# ndarray[uint8_t, ndim=2] mask +# int64_t nobs = 0 +# float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor +# const int64_t[:] labels_n, labels_nobs + +# N, K = (mat).shape +# # For compatibility when calling rank_1d +# labels_n = np.zeros(N, dtype=np.int64) + +# result = np.empty((K, K), dtype=np.float64) +# mask = np.isfinite(mat).view(np.uint8) + +# ranked_mat = 
np.empty((N, K), dtype=np.float64) + +# for i in range(K): +# ranked_mat[:, i] = rank_1d(mat[:, i], labels=labels_n) + +# for xi in range(K): +# for yi in range(xi + 1): +# nobs = 0 +# # Keep track of whether we need to recompute ranks +# all_ranks = True +# for i in range(N): +# all_ranks &= not (mask[i, xi] ^ mask[i, yi]) +# if mask[i, xi] and mask[i, yi]: +# nobs += 1 + +# if nobs < minp: +# result[xi, yi] = result[yi, xi] = NaN +# else: +# maskedx = np.empty(nobs, dtype=np.float64) +# maskedy = np.empty(nobs, dtype=np.float64) +# j = 0 + +# for i in range(N): +# if mask[i, xi] and mask[i, yi]: +# maskedx[j] = ranked_mat[i, xi] +# maskedy[j] = ranked_mat[i, yi] +# j += 1 + +# if not all_ranks: +# labels_nobs = np.zeros(nobs, dtype=np.int64) +# maskedx = rank_1d(maskedx, labels=labels_nobs) +# maskedy = rank_1d(maskedy, labels=labels_nobs) + +# mean = (nobs + 1) / 2. + +# # now the cov numerator +# sumx = sumxx = sumyy = 0 + +# for i in range(nobs): +# vx = maskedx[i] - mean +# vy = maskedy[i] - mean + +# sumx += vx * vy +# sumxx += vx * vx +# sumyy += vy * vy + +# divisor = sqrt(sumxx * sumyy) + +# if divisor != 0: +# result[xi, yi] = result[yi, xi] = sumx / divisor +# else: +# result[xi, yi] = result[yi, xi] = NaN + +# return result + + +# # ---------------------------------------------------------------------- +# # Kendall correlation +# # Wikipedia article: https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient # noqa + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray: +# """ +# Perform kendall correlation on a 2d array + +# Parameters +# ---------- +# mat : np.ndarray[float64_t, ndim=2] +# Array to compute kendall correlation on +# minp : int, default 1 +# Minimum number of observations required per pair of columns +# to have a valid result. 
+ +# Returns +# ------- +# numpy.ndarray[float64_t, ndim=2] +# Correlation matrix +# """ +# cdef: +# Py_ssize_t i, j, k, xi, yi, N, K +# ndarray[float64_t, ndim=2] result +# ndarray[float64_t, ndim=2] ranked_mat +# ndarray[uint8_t, ndim=2] mask +# float64_t currj +# ndarray[uint8_t, ndim=1] valid +# ndarray[int64_t] sorted_idxs +# ndarray[float64_t, ndim=1] col +# int64_t n_concordant +# int64_t total_concordant = 0 +# int64_t total_discordant = 0 +# float64_t kendall_tau +# int64_t n_obs +# const int64_t[:] labels_n + +# N, K = (mat).shape + +# result = np.empty((K, K), dtype=np.float64) +# mask = np.isfinite(mat) + +# ranked_mat = np.empty((N, K), dtype=np.float64) +# # For compatibility when calling rank_1d +# labels_n = np.zeros(N, dtype=np.int64) + +# for i in range(K): +# ranked_mat[:, i] = rank_1d(mat[:, i], labels_n) + +# for xi in range(K): +# sorted_idxs = ranked_mat[:, xi].argsort() +# ranked_mat = ranked_mat[sorted_idxs] +# mask = mask[sorted_idxs] +# for yi in range(xi + 1, K): +# valid = mask[:, xi] & mask[:, yi] +# if valid.sum() < minp: +# result[xi, yi] = NaN +# result[yi, xi] = NaN +# else: +# # Get columns and order second column using 1st column ranks +# if not valid.all(): +# col = ranked_mat[valid.nonzero()][:, yi] +# else: +# col = ranked_mat[:, yi] +# n_obs = col.shape[0] +# total_concordant = 0 +# total_discordant = 0 +# for j in range(n_obs - 1): +# currj = col[j] +# # Count num concordant and discordant pairs +# n_concordant = 0 +# for k in range(j, n_obs): +# if col[k] > currj: +# n_concordant += 1 +# total_concordant += n_concordant +# total_discordant += (n_obs - 1 - j - n_concordant) +# # Note: we do total_concordant+total_discordant here which is +# # equivalent to the C(n, 2), the total # of pairs, +# # listed on wikipedia +# kendall_tau = (total_concordant - total_discordant) / \ +# (total_concordant + total_discordant) +# result[xi, yi] = kendall_tau +# result[yi, xi] = kendall_tau + +# if mask[:, xi].sum() > minp: +# result[xi, xi] = 1 +# else: +# result[xi, xi] = NaN + +# return result + + +# # ---------------------------------------------------------------------- + +# ctypedef fused algos_t: +# float64_t +# float32_t +# object +# int64_t +# int32_t +# int16_t +# int8_t +# uint64_t +# uint32_t +# uint16_t +# uint8_t + + +# def validate_limit(nobs: int, limit=None) -> int: +# """ +# Check that the `limit` argument is a positive integer. + +# Parameters +# ---------- +# nobs : int +# limit : object + +# Returns +# ------- +# int +# The limit. 
+# """ +# if limit is None: +# lim = nobs +# else: +# if not util.is_integer_object(limit): +# raise ValueError('Limit must be an integer') +# if limit < 1: +# raise ValueError('Limit must be greater than 0') +# lim = limit + +# return lim + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): +# cdef: +# Py_ssize_t i, j, nleft, nright +# ndarray[int64_t, ndim=1] indexer +# algos_t cur, next_val +# int lim, fill_count = 0 + +# nleft = len(old) +# nright = len(new) +# indexer = np.empty(nright, dtype=np.int64) +# indexer[:] = -1 + +# lim = validate_limit(nright, limit) + +# if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: +# return indexer + +# i = j = 0 + +# cur = old[0] + +# while j <= nright - 1 and new[j] < cur: +# j += 1 + +# while True: +# if j == nright: +# break + +# if i == nleft - 1: +# while j < nright: +# if new[j] == cur: +# indexer[j] = i +# elif new[j] > cur and fill_count < lim: +# indexer[j] = i +# fill_count += 1 +# j += 1 +# break + +# next_val = old[i + 1] + +# while j < nright and cur <= new[j] < next_val: +# if new[j] == cur: +# indexer[j] = i +# elif fill_count < lim: +# indexer[j] = i +# fill_count += 1 +# j += 1 + +# fill_count = 0 +# i += 1 +# cur = next_val + +# return indexer + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def pad_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None): +# cdef: +# Py_ssize_t i, N +# algos_t val +# int lim, fill_count = 0 + +# N = len(values) + +# # GH#2778 +# if N == 0: +# return + +# lim = validate_limit(N, limit) + +# val = values[0] +# for i in range(N): +# if mask[i]: +# if fill_count >= lim: +# continue +# fill_count += 1 +# values[i] = val +# else: +# fill_count = 0 +# val = values[i] + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def pad_2d_inplace(algos_t[:, :] values, const uint8_t[:, :] mask, limit=None): +# cdef: +# Py_ssize_t i, j, N, K +# algos_t val +# int lim, fill_count = 0 + +# K, N = (values).shape + +# # GH#2778 +# if N == 0: +# return + +# lim = validate_limit(N, limit) + +# for j in range(K): +# fill_count = 0 +# val = values[j, 0] +# for i in range(N): +# if mask[j, i]: +# if fill_count >= lim: +# continue +# fill_count += 1 +# values[j, i] = val +# else: +# fill_count = 0 +# val = values[j, i] + + +# """ +# Backfilling logic for generating fill vector + +# Diagram of what's going on + +# Old New Fill vector Mask +# . 0 1 +# . 0 1 +# . 0 1 +# A A 0 1 +# . 1 1 +# . 1 1 +# . 1 1 +# . 1 1 +# . 1 1 +# B B 1 1 +# . 2 1 +# . 2 1 +# . 2 1 +# C C 2 1 +# . 0 +# . 
0 +# D +# """ + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: +# cdef: +# Py_ssize_t i, j, nleft, nright +# ndarray[int64_t, ndim=1] indexer +# algos_t cur, prev +# int lim, fill_count = 0 + +# nleft = len(old) +# nright = len(new) +# indexer = np.empty(nright, dtype=np.int64) +# indexer[:] = -1 + +# lim = validate_limit(nright, limit) + +# if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: +# return indexer + +# i = nleft - 1 +# j = nright - 1 + +# cur = old[nleft - 1] + +# while j >= 0 and new[j] > cur: +# j -= 1 + +# while True: +# if j < 0: +# break + +# if i == 0: +# while j >= 0: +# if new[j] == cur: +# indexer[j] = i +# elif new[j] < cur and fill_count < lim: +# indexer[j] = i +# fill_count += 1 +# j -= 1 +# break + +# prev = old[i - 1] + +# while j >= 0 and prev < new[j] <= cur: +# if new[j] == cur: +# indexer[j] = i +# elif new[j] < cur and fill_count < lim: +# indexer[j] = i +# fill_count += 1 +# j -= 1 + +# fill_count = 0 +# i -= 1 +# cur = prev + +# return indexer + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def backfill_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None): +# cdef: +# Py_ssize_t i, N +# algos_t val +# int lim, fill_count = 0 + +# N = len(values) + +# # GH#2778 +# if N == 0: +# return + +# lim = validate_limit(N, limit) + +# val = values[N - 1] +# for i in range(N - 1, -1, -1): +# if mask[i]: +# if fill_count >= lim: +# continue +# fill_count += 1 +# values[i] = val +# else: +# fill_count = 0 +# val = values[i] + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def backfill_2d_inplace(algos_t[:, :] values, +# const uint8_t[:, :] mask, +# limit=None): +# cdef: +# Py_ssize_t i, j, N, K +# algos_t val +# int lim, fill_count = 0 + +# K, N = (values).shape + +# # GH#2778 +# if N == 0: +# return + +# lim = validate_limit(N, limit) + +# for j in range(K): +# fill_count = 0 +# val = values[j, N - 1] +# for i in range(N - 1, -1, -1): +# if mask[j, i]: +# if fill_count >= lim: +# continue +# fill_count += 1 +# values[j, i] = val +# else: +# fill_count = 0 +# val = values[j, i] + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): +# """ +# Returns +# ------- +# tuple +# is_monotonic_inc : bool +# is_monotonic_dec : bool +# is_unique : bool +# """ +# cdef: +# Py_ssize_t i, n +# algos_t prev, cur +# bint is_monotonic_inc = 1 +# bint is_monotonic_dec = 1 +# bint is_unique = 1 +# bint is_strict_monotonic = 1 + +# n = len(arr) + +# if n == 1: +# if arr[0] != arr[0] or (timelike and arr[0] == NPY_NAT): +# # single value is NaN +# return False, False, True +# else: +# return True, True, True +# elif n < 2: +# return True, True, True + +# if timelike and arr[0] == NPY_NAT: +# return False, False, True + +# if algos_t is not object: +# with nogil: +# prev = arr[0] +# for i in range(1, n): +# cur = arr[i] +# if timelike and cur == NPY_NAT: +# is_monotonic_inc = 0 +# is_monotonic_dec = 0 +# break +# if cur < prev: +# is_monotonic_inc = 0 +# elif cur > prev: +# is_monotonic_dec = 0 +# elif cur == prev: +# is_unique = 0 +# else: +# # cur or prev is NaN +# is_monotonic_inc = 0 +# is_monotonic_dec = 0 +# break +# if not is_monotonic_inc and not is_monotonic_dec: +# is_monotonic_inc = 0 +# is_monotonic_dec = 0 +# break +# prev = cur +# else: +# # object-dtype, identical to above except we cannot use `with nogil` +# prev = arr[0] +# for i in range(1, n): +# cur = 
arr[i] +# if timelike and cur == NPY_NAT: +# is_monotonic_inc = 0 +# is_monotonic_dec = 0 +# break +# if cur < prev: +# is_monotonic_inc = 0 +# elif cur > prev: +# is_monotonic_dec = 0 +# elif cur == prev: +# is_unique = 0 +# else: +# # cur or prev is NaN +# is_monotonic_inc = 0 +# is_monotonic_dec = 0 +# break +# if not is_monotonic_inc and not is_monotonic_dec: +# is_monotonic_inc = 0 +# is_monotonic_dec = 0 +# break +# prev = cur + +# is_strict_monotonic = is_unique and (is_monotonic_inc or is_monotonic_dec) +# return is_monotonic_inc, is_monotonic_dec, is_strict_monotonic + + +# # ---------------------------------------------------------------------- +# # rank_1d, rank_2d +# # ---------------------------------------------------------------------- + +# ctypedef fused rank_t: +# object +# float64_t +# uint64_t +# int64_t + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +# def rank_1d( +# ndarray[rank_t, ndim=1] values, +# const int64_t[:] labels, +# ties_method="average", +# bint ascending=True, +# bint pct=False, +# na_option="keep", +# ): +# """ +# Fast NaN-friendly version of ``scipy.stats.rankdata``. + +# Parameters +# ---------- +# values : array of rank_t values to be ranked +# labels : array containing unique label for each group, with its ordering +# matching up to the corresponding record in `values`. If not called +# from a groupby operation, will be an array of 0's +# ties_method : {'average', 'min', 'max', 'first', 'dense'}, default +# 'average' +# * average: average rank of group +# * min: lowest rank in group +# * max: highest rank in group +# * first: ranks assigned in order they appear in the array +# * dense: like 'min', but rank always increases by 1 between groups +# ascending : boolean, default True +# False for ranks by high (1) to low (N) +# na_option : {'keep', 'top', 'bottom'}, default 'keep' +# pct : boolean, default False +# Compute percentage rank of data within each group +# na_option : {'keep', 'top', 'bottom'}, default 'keep' +# * keep: leave NA values where they are +# * top: smallest rank if ascending +# * bottom: smallest rank if descending +# """ +# cdef: +# TiebreakEnumType tiebreak +# Py_ssize_t i, j, N, grp_start=0, dups=0, sum_ranks=0 +# Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 +# ndarray[int64_t, ndim=1] lexsort_indexer +# ndarray[float64_t, ndim=1] grp_sizes, out +# ndarray[rank_t, ndim=1] masked_vals +# ndarray[uint8_t, ndim=1] mask +# bint keep_na, at_end, next_val_diff, check_labels +# rank_t nan_fill_val + +# tiebreak = tiebreakers[ties_method] +# keep_na = na_option == 'keep' + +# N = len(values) +# # TODO Cython 3.0: cast won't be necessary (#2992) +# assert len(labels) == N +# out = np.empty(N) +# grp_sizes = np.ones(N) +# # If all 0 labels, can short-circuit later label +# # comparisons +# check_labels = np.any(labels) + +# # Copy values into new array in order to fill missing data +# # with mask, without obfuscating location of missing data +# # in values array +# if rank_t is object and values.dtype != np.object_: +# masked_vals = values.astype('O') +# else: +# masked_vals = values.copy() + +# if rank_t is object: +# mask = missing.isnaobj(masked_vals) +# elif rank_t is int64_t: +# mask = (masked_vals == NPY_NAT).astype(np.uint8) +# elif rank_t is float64_t: +# mask = np.isnan(masked_vals).astype(np.uint8) +# else: +# mask = np.zeros(shape=len(masked_vals), dtype=np.uint8) + +# if ascending ^ (na_option == 'top'): +# if rank_t is object: +# nan_fill_val = Infinity() +# elif rank_t is int64_t: +# 
nan_fill_val = np.iinfo(np.int64).max +# elif rank_t is uint64_t: +# nan_fill_val = np.iinfo(np.uint64).max +# else: +# nan_fill_val = np.inf +# order = (masked_vals, mask, labels) +# else: +# if rank_t is object: +# nan_fill_val = NegInfinity() +# elif rank_t is int64_t: +# nan_fill_val = np.iinfo(np.int64).min +# elif rank_t is uint64_t: +# nan_fill_val = 0 +# else: +# nan_fill_val = -np.inf + +# order = (masked_vals, ~mask, labels) + +# np.putmask(masked_vals, mask, nan_fill_val) + +# # lexsort using labels, then mask, then actual values +# # each label corresponds to a different group value, +# # the mask helps you differentiate missing values before +# # performing sort on the actual values +# lexsort_indexer = np.lexsort(order).astype(np.int64, copy=False) + +# if not ascending: +# lexsort_indexer = lexsort_indexer[::-1] + +# # Loop over the length of the value array +# # each incremental i value can be looked up in the lexsort_indexer +# # array that we sorted previously, which gives us the location of +# # that sorted value for retrieval back from the original +# # values / masked_vals arrays +# # TODO: de-duplicate once cython supports conditional nogil +# if rank_t is object: +# for i in range(N): +# at_end = i == N - 1 +# # dups and sum_ranks will be incremented each loop where +# # the value / group remains the same, and should be reset +# # when either of those change +# # Used to calculate tiebreakers +# dups += 1 +# sum_ranks += i - grp_start + 1 + +# # Update out only when there is a transition of values or labels. +# # When a new value or group is encountered, go back #dups steps( +# # the number of occurrence of current value) and assign the ranks +# # based on the starting index of the current group (grp_start) +# # and the current index +# if not at_end: +# next_val_diff = are_diff(masked_vals[lexsort_indexer[i]], +# masked_vals[lexsort_indexer[i+1]]) +# else: +# next_val_diff = True + +# if (next_val_diff +# or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) +# or (check_labels +# and (labels[lexsort_indexer[i]] +# != labels[lexsort_indexer[i+1]])) +# ): +# # if keep_na, check for missing values and assign back +# # to the result where appropriate +# if keep_na and mask[lexsort_indexer[i]]: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = NaN +# grp_na_count = dups +# elif tiebreak == TIEBREAK_AVERAGE: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = sum_ranks / dups +# elif tiebreak == TIEBREAK_MIN: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = i - grp_start - dups + 2 +# elif tiebreak == TIEBREAK_MAX: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = i - grp_start + 1 +# elif tiebreak == TIEBREAK_FIRST: +# for j in range(i - dups + 1, i + 1): +# if ascending: +# out[lexsort_indexer[j]] = j + 1 - grp_start +# else: +# out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start +# elif tiebreak == TIEBREAK_DENSE: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = grp_vals_seen + +# # look forward to the next value (using the sorting in _as) +# # if the value does not equal the current value then we need to +# # reset the dups and sum_ranks, knowing that a new value is +# # coming up. 
the conditional also needs to handle nan equality +# # and the end of iteration +# if next_val_diff or (mask[lexsort_indexer[i]] +# ^ mask[lexsort_indexer[i+1]]): +# dups = sum_ranks = 0 +# grp_vals_seen += 1 +# grp_tie_count += 1 + +# # Similar to the previous conditional, check now if we are +# # moving to a new group. If so, keep track of the index where +# # the new group occurs, so the tiebreaker calculations can +# # decrement that from their position. fill in the size of each +# # group encountered (used by pct calculations later). also be +# # sure to reset any of the items helping to calculate dups +# if (at_end or +# (check_labels +# and (labels[lexsort_indexer[i]] +# != labels[lexsort_indexer[i+1]]))): +# if tiebreak != TIEBREAK_DENSE: +# for j in range(grp_start, i + 1): +# grp_sizes[lexsort_indexer[j]] = \ +# (i - grp_start + 1 - grp_na_count) +# else: +# for j in range(grp_start, i + 1): +# grp_sizes[lexsort_indexer[j]] = \ +# (grp_tie_count - (grp_na_count > 0)) +# dups = sum_ranks = 0 +# grp_na_count = 0 +# grp_tie_count = 0 +# grp_start = i + 1 +# grp_vals_seen = 1 +# else: +# with nogil: +# for i in range(N): +# at_end = i == N - 1 +# # dups and sum_ranks will be incremented each loop where +# # the value / group remains the same, and should be reset +# # when either of those change +# # Used to calculate tiebreakers +# dups += 1 +# sum_ranks += i - grp_start + 1 + +# # Update out only when there is a transition of values or labels. +# # When a new value or group is encountered, go back #dups steps( +# # the number of occurrence of current value) and assign the ranks +# # based on the starting index of the current group (grp_start) +# # and the current index +# if not at_end: +# next_val_diff = (masked_vals[lexsort_indexer[i]] +# != masked_vals[lexsort_indexer[i+1]]) +# else: +# next_val_diff = True + +# if (next_val_diff +# or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) +# or (check_labels +# and (labels[lexsort_indexer[i]] +# != labels[lexsort_indexer[i+1]])) +# ): +# # if keep_na, check for missing values and assign back +# # to the result where appropriate +# if keep_na and mask[lexsort_indexer[i]]: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = NaN +# grp_na_count = dups +# elif tiebreak == TIEBREAK_AVERAGE: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = sum_ranks / dups +# elif tiebreak == TIEBREAK_MIN: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = i - grp_start - dups + 2 +# elif tiebreak == TIEBREAK_MAX: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = i - grp_start + 1 +# elif tiebreak == TIEBREAK_FIRST: +# for j in range(i - dups + 1, i + 1): +# if ascending: +# out[lexsort_indexer[j]] = j + 1 - grp_start +# else: +# out[lexsort_indexer[j]] = \ +# (2 * i - j - dups + 2 - grp_start) +# elif tiebreak == TIEBREAK_DENSE: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = grp_vals_seen + +# # look forward to the next value (using the sorting in +# # lexsort_indexer) if the value does not equal the current +# # value then we need to reset the dups and sum_ranks, +# # knowing that a new value is coming up. the conditional +# # also needs to handle nan equality and the end of iteration +# if next_val_diff or (mask[lexsort_indexer[i]] +# ^ mask[lexsort_indexer[i+1]]): +# dups = sum_ranks = 0 +# grp_vals_seen += 1 +# grp_tie_count += 1 + +# # Similar to the previous conditional, check now if we are +# # moving to a new group. 
If so, keep track of the index where +# # the new group occurs, so the tiebreaker calculations can +# # decrement that from their position. fill in the size of each +# # group encountered (used by pct calculations later). also be +# # sure to reset any of the items helping to calculate dups +# if at_end or (check_labels and +# (labels[lexsort_indexer[i]] +# != labels[lexsort_indexer[i+1]])): +# if tiebreak != TIEBREAK_DENSE: +# for j in range(grp_start, i + 1): +# grp_sizes[lexsort_indexer[j]] = \ +# (i - grp_start + 1 - grp_na_count) +# else: +# for j in range(grp_start, i + 1): +# grp_sizes[lexsort_indexer[j]] = \ +# (grp_tie_count - (grp_na_count > 0)) +# dups = sum_ranks = 0 +# grp_na_count = 0 +# grp_tie_count = 0 +# grp_start = i + 1 +# grp_vals_seen = 1 + +# if pct: +# for i in range(N): +# if grp_sizes[i] != 0: +# out[i] = out[i] / grp_sizes[i] + +# return out + + +# def rank_2d( +# ndarray[rank_t, ndim=2] in_arr, +# int axis=0, +# ties_method="average", +# bint ascending=True, +# na_option="keep", +# bint pct=False, +# ): +# """ +# Fast NaN-friendly version of ``scipy.stats.rankdata``. +# """ +# cdef: +# Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 +# Py_ssize_t infs +# ndarray[float64_t, ndim=2] ranks +# ndarray[rank_t, ndim=2] values +# ndarray[intp_t, ndim=2] argsort_indexer +# ndarray[uint8_t, ndim=2] mask +# rank_t val, nan_value +# float64_t count, sum_ranks = 0.0 +# int tiebreak = 0 +# int64_t idx +# bint check_mask, condition, keep_na + +# tiebreak = tiebreakers[ties_method] + +# keep_na = na_option == 'keep' +# check_mask = rank_t is not uint64_t + +# if axis == 0: +# values = np.asarray(in_arr).T.copy() +# else: +# values = np.asarray(in_arr).copy() + +# if rank_t is object: +# if values.dtype != np.object_: +# values = values.astype('O') + +# if rank_t is not uint64_t: +# if ascending ^ (na_option == 'top'): +# if rank_t is object: +# nan_value = Infinity() +# elif rank_t is float64_t: +# nan_value = np.inf +# elif rank_t is int64_t: +# nan_value = np.iinfo(np.int64).max + +# else: +# if rank_t is object: +# nan_value = NegInfinity() +# elif rank_t is float64_t: +# nan_value = -np.inf +# elif rank_t is int64_t: +# nan_value = NPY_NAT + +# if rank_t is object: +# mask = missing.isnaobj2d(values) +# elif rank_t is float64_t: +# mask = np.isnan(values) +# elif rank_t is int64_t: +# mask = values == NPY_NAT + +# np.putmask(values, mask, nan_value) +# else: +# mask = np.zeros_like(values, dtype=bool) + +# n, k = (values).shape +# ranks = np.empty((n, k), dtype='f8') + +# if tiebreak == TIEBREAK_FIRST: +# # need to use a stable sort here +# argsort_indexer = values.argsort(axis=1, kind='mergesort') +# if not ascending: +# tiebreak = TIEBREAK_FIRST_DESCENDING +# else: +# argsort_indexer = values.argsort(1) + +# if not ascending: +# argsort_indexer = argsort_indexer[:, ::-1] + +# values = _take_2d(values, argsort_indexer) + +# for i in range(n): +# dups = sum_ranks = infs = 0 + +# total_tie_count = 0 +# count = 0.0 +# for j in range(k): +# val = values[i, j] +# idx = argsort_indexer[i, j] +# if keep_na and check_mask and mask[i, idx]: +# ranks[i, idx] = NaN +# infs += 1 +# continue + +# count += 1.0 + +# sum_ranks += (j - infs) + 1 +# dups += 1 + +# if rank_t is object: +# condition = ( +# j == k - 1 or +# are_diff(values[i, j + 1], val) or +# (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]]) +# ) +# else: +# condition = ( +# j == k - 1 or +# values[i, j + 1] != val or +# (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]]) +# ) + +# if 
condition: +# if tiebreak == TIEBREAK_AVERAGE: +# for z in range(j - dups + 1, j + 1): +# ranks[i, argsort_indexer[i, z]] = sum_ranks / dups +# elif tiebreak == TIEBREAK_MIN: +# for z in range(j - dups + 1, j + 1): +# ranks[i, argsort_indexer[i, z]] = j - dups + 2 +# elif tiebreak == TIEBREAK_MAX: +# for z in range(j - dups + 1, j + 1): +# ranks[i, argsort_indexer[i, z]] = j + 1 +# elif tiebreak == TIEBREAK_FIRST: +# if rank_t is object: +# raise ValueError('first not supported for non-numeric data') +# else: +# for z in range(j - dups + 1, j + 1): +# ranks[i, argsort_indexer[i, z]] = z + 1 +# elif tiebreak == TIEBREAK_FIRST_DESCENDING: +# for z in range(j - dups + 1, j + 1): +# ranks[i, argsort_indexer[i, z]] = 2 * j - z - dups + 2 +# elif tiebreak == TIEBREAK_DENSE: +# total_tie_count += 1 +# for z in range(j - dups + 1, j + 1): +# ranks[i, argsort_indexer[i, z]] = total_tie_count +# sum_ranks = dups = 0 +# if pct: +# if tiebreak == TIEBREAK_DENSE: +# ranks[i, :] /= total_tie_count +# else: +# ranks[i, :] /= count +# if axis == 0: +# return ranks.T +# else: +# return ranks + + +# ctypedef fused diff_t: +# float64_t +# float32_t +# int8_t +# int16_t +# int32_t +# int64_t + +# ctypedef fused out_t: +# float32_t +# float64_t +# int64_t + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def diff_2d( +# ndarray[diff_t, ndim=2] arr, # TODO(cython 3) update to "const diff_t[:, :] arr" +# ndarray[out_t, ndim=2] out, +# Py_ssize_t periods, +# int axis, +# bint datetimelike=False, +# ): +# cdef: +# Py_ssize_t i, j, sx, sy, start, stop +# bint f_contig = arr.flags.f_contiguous +# # bint f_contig = arr.is_f_contig() # TODO(cython 3) +# diff_t left, right + +# # Disable for unsupported dtype combinations, +# # see https://github.com/cython/cython/issues/2646 +# if (out_t is float32_t +# and not (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)): +# raise NotImplementedError +# elif (out_t is float64_t +# and (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)): +# raise NotImplementedError +# elif out_t is int64_t and diff_t is not int64_t: +# # We only have out_t of int64_t if we have datetimelike +# raise NotImplementedError +# else: +# # We put this inside an indented else block to avoid cython build +# # warnings about unreachable code +# sx, sy = (arr).shape +# with nogil: +# if f_contig: +# if axis == 0: +# if periods >= 0: +# start, stop = periods, sx +# else: +# start, stop = 0, sx + periods +# for j in range(sy): +# for i in range(start, stop): +# left = arr[i, j] +# right = arr[i - periods, j] +# if out_t is int64_t and datetimelike: +# if left == NPY_NAT or right == NPY_NAT: +# out[i, j] = NPY_NAT +# else: +# out[i, j] = left - right +# else: +# out[i, j] = left - right +# else: +# if periods >= 0: +# start, stop = periods, sy +# else: +# start, stop = 0, sy + periods +# for j in range(start, stop): +# for i in range(sx): +# left = arr[i, j] +# right = arr[i, j - periods] +# if out_t is int64_t and datetimelike: +# if left == NPY_NAT or right == NPY_NAT: +# out[i, j] = NPY_NAT +# else: +# out[i, j] = left - right +# else: +# out[i, j] = left - right +# else: +# if axis == 0: +# if periods >= 0: +# start, stop = periods, sx +# else: +# start, stop = 0, sx + periods +# for i in range(start, stop): +# for j in range(sy): +# left = arr[i, j] +# right = arr[i - periods, j] +# if out_t is int64_t and datetimelike: +# if left == NPY_NAT or right == NPY_NAT: +# out[i, j] = NPY_NAT +# else: +# out[i, j] = left - right +# else: +# out[i, j] = left - right +# 
else: +# if periods >= 0: +# start, stop = periods, sy +# else: +# start, stop = 0, sy + periods +# for i in range(sx): +# for j in range(start, stop): +# left = arr[i, j] +# right = arr[i, j - periods] +# if out_t is int64_t and datetimelike: +# if left == NPY_NAT or right == NPY_NAT: +# out[i, j] = NPY_NAT +# else: +# out[i, j] = left - right +# else: +# out[i, j] = left - right + + +# # generated from template +# include "algos_common_helper.pxi" +# include "algos_take_helper.pxi" From cc20a71a6a4aee95662b17faf0f1e6f117bd85ea Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 16 Feb 2021 20:10:28 +0000 Subject: [PATCH 03/37] pad_inplace (not yet jitted) --- pandas/_libs_numba/algos.py | 85 +++++----- pandas/_libs_numba/tslibs/__init__.py | 67 ++++++++ pandas/_libs_numba/tslibs/util.py | 223 ++++++++++++++++++++++++++ pandas/_libs_numba/util.py | 50 ++++++ pandas/core/dtypes/missing.py | 8 +- pandas/core/missing.py | 3 +- 6 files changed, 388 insertions(+), 48 deletions(-) create mode 100644 pandas/_libs_numba/tslibs/__init__.py create mode 100644 pandas/_libs_numba/tslibs/util.py create mode 100644 pandas/_libs_numba/util.py diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py index 32a5b186d8290..548568d3dd5fb 100644 --- a/pandas/_libs_numba/algos.py +++ b/pandas/_libs_numba/algos.py @@ -36,7 +36,8 @@ # cnp.import_array() -# cimport pandas._libs.util as util +import pandas._libs_numba.util as util + # from pandas._libs.khash cimport ( # kh_destroy_int64, # kh_get_int64, @@ -503,30 +504,30 @@ # uint8_t -# def validate_limit(nobs: int, limit=None) -> int: -# """ -# Check that the `limit` argument is a positive integer. +def validate_limit(nobs: int, limit=None) -> int: + """ + Check that the `limit` argument is a positive integer. -# Parameters -# ---------- -# nobs : int -# limit : object + Parameters + ---------- + nobs : int + limit : object -# Returns -# ------- -# int -# The limit. -# """ -# if limit is None: -# lim = nobs -# else: -# if not util.is_integer_object(limit): -# raise ValueError('Limit must be an integer') -# if limit < 1: -# raise ValueError('Limit must be greater than 0') -# lim = limit + Returns + ------- + int + The limit. 
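+
+    Examples
+    --------
+    >>> validate_limit(100)
+    100
+    >>> validate_limit(100, limit=10)
+    10
+    >>> validate_limit(100, limit=0)
+    Traceback (most recent call last):
+    ...
+    ValueError: Limit must be greater than 0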
+ """ + if limit is None: + lim = nobs + else: + if not util.is_integer_object(limit): + raise ValueError("Limit must be an integer") + if limit < 1: + raise ValueError("Limit must be greater than 0") + lim = limit -# return lim + return lim # @cython.boundscheck(False) @@ -586,32 +587,28 @@ # return indexer -# @cython.boundscheck(False) -# @cython.wraparound(False) -# def pad_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None): -# cdef: -# Py_ssize_t i, N -# algos_t val -# int lim, fill_count = 0 +def pad_inplace(values, mask, limit=None): -# N = len(values) + fill_count = 0 -# # GH#2778 -# if N == 0: -# return + N = len(values) -# lim = validate_limit(N, limit) + # GH#2778 + if N == 0: + return -# val = values[0] -# for i in range(N): -# if mask[i]: -# if fill_count >= lim: -# continue -# fill_count += 1 -# values[i] = val -# else: -# fill_count = 0 -# val = values[i] + lim = validate_limit(N, limit) + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] # @cython.boundscheck(False) diff --git a/pandas/_libs_numba/tslibs/__init__.py b/pandas/_libs_numba/tslibs/__init__.py new file mode 100644 index 0000000000000..6b9f2d32acd5c --- /dev/null +++ b/pandas/_libs_numba/tslibs/__init__.py @@ -0,0 +1,67 @@ +# __all__ = [ +# "dtypes", +# "localize_pydatetime", +# "NaT", +# "NaTType", +# "iNaT", +# "nat_strings", +# "is_null_datetimelike", +# "OutOfBoundsDatetime", +# "OutOfBoundsTimedelta", +# "IncompatibleFrequency", +# "Period", +# "Resolution", +# "Timedelta", +# "normalize_i8_timestamps", +# "is_date_array_normalized", +# "dt64arr_to_periodarr", +# "delta_to_nanoseconds", +# "ints_to_pydatetime", +# "ints_to_pytimedelta", +# "get_resolution", +# "Timestamp", +# "tz_convert_from_utc_single", +# "to_offset", +# "Tick", +# "BaseOffset", +# "tz_compare", +# ] + +# from pandas._libs.tslibs import dtypes +# from pandas._libs.tslibs.conversion import ( +# OutOfBoundsTimedelta, +# localize_pydatetime, +# ) +# from pandas._libs.tslibs.dtypes import Resolution +# from pandas._libs.tslibs.nattype import ( +# NaT, +# NaTType, +# iNaT, +# is_null_datetimelike, +# nat_strings, +# ) +# from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime +# from pandas._libs.tslibs.offsets import ( +# BaseOffset, +# Tick, +# to_offset, +# ) +# from pandas._libs.tslibs.period import ( +# IncompatibleFrequency, +# Period, +# ) +# from pandas._libs.tslibs.timedeltas import ( +# Timedelta, +# delta_to_nanoseconds, +# ints_to_pytimedelta, +# ) +# from pandas._libs.tslibs.timestamps import Timestamp +# from pandas._libs.tslibs.timezones import tz_compare +# from pandas._libs.tslibs.tzconversion import tz_convert_from_utc_single +# from pandas._libs.tslibs.vectorized import ( +# dt64arr_to_periodarr, +# get_resolution, +# ints_to_pydatetime, +# is_date_array_normalized, +# normalize_i8_timestamps, +# ) diff --git a/pandas/_libs_numba/tslibs/util.py b/pandas/_libs_numba/tslibs/util.py new file mode 100644 index 0000000000000..ecad25c3e0d81 --- /dev/null +++ b/pandas/_libs_numba/tslibs/util.py @@ -0,0 +1,223 @@ +import numpy as np + +# from cpython.object cimport PyTypeObject + + +# cdef extern from *: +# """ +# PyObject* char_to_string(const char* data) { +# return PyUnicode_FromString(data); +# } +# """ +# object char_to_string(const char* data) + + +# cdef extern from "Python.h": +# # Note: importing extern-style allows us to declare these as nogil +# # functions, whereas `from cpython cimport` 
does not. +# bint PyUnicode_Check(object obj) nogil +# bint PyBool_Check(object obj) nogil +# bint PyFloat_Check(object obj) nogil +# bint PyComplex_Check(object obj) nogil +# bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil + +# # Note that following functions can potentially raise an exception, +# # thus they cannot be declared 'nogil'. Also PyUnicode_AsUTF8AndSize() can +# # potentially allocate memory inside in unlikely case of when underlying +# # unicode object was stored as non-utf8 and utf8 wasn't requested before. +# const char* PyUnicode_AsUTF8AndSize(object obj, +# Py_ssize_t* length) except NULL + +# from numpy cimport ( +# float64_t, +# int64_t, +# ) + + +# cdef extern from "numpy/arrayobject.h": +# PyTypeObject PyFloatingArrType_Type + +# cdef extern from "numpy/ndarrayobject.h": +# PyTypeObject PyTimedeltaArrType_Type +# PyTypeObject PyDatetimeArrType_Type +# PyTypeObject PyComplexFloatingArrType_Type +# PyTypeObject PyBoolArrType_Type + +# bint PyArray_IsIntegerScalar(obj) nogil +# bint PyArray_Check(obj) nogil + +# cdef extern from "numpy/npy_common.h": +# int64_t NPY_MIN_INT64 + + +# cdef inline int64_t get_nat(): +# return NPY_MIN_INT64 + + +# -------------------------------------------------------------------- +# Type Checking + + +def is_integer_object(val: object) -> bool: + """ + Cython equivalent of + + `isinstance(val, (int, long, np.integer)) and not isinstance(val, bool)` + + Parameters + ---------- + val : object + + Returns + ------- + bool + """ + return ( + not isinstance(val, bool) + and isinstance(val, (int, np.integer)) + and not is_timedelta64_object(val) + ) + + +# cdef inline bint is_float_object(object obj) nogil: +# """ +# Cython equivalent of `isinstance(val, (float, np.complex_))` + +# Parameters +# ---------- +# val : object + +# Returns +# ------- +# is_float : bool +# """ +# return (PyFloat_Check(obj) or +# (PyObject_TypeCheck(obj, &PyFloatingArrType_Type))) + + +# cdef inline bint is_complex_object(object obj) nogil: +# """ +# Cython equivalent of `isinstance(val, (complex, np.complex_))` + +# Parameters +# ---------- +# val : object + +# Returns +# ------- +# is_complex : bool +# """ +# return (PyComplex_Check(obj) or +# PyObject_TypeCheck(obj, &PyComplexFloatingArrType_Type)) + + +# cdef inline bint is_bool_object(object obj) nogil: +# """ +# Cython equivalent of `isinstance(val, (bool, np.bool_))` + +# Parameters +# ---------- +# val : object + +# Returns +# ------- +# is_bool : bool +# """ +# return (PyBool_Check(obj) or +# PyObject_TypeCheck(obj, &PyBoolArrType_Type)) + + +# cdef inline bint is_real_number_object(object obj) nogil: +# return is_bool_object(obj) or is_integer_object(obj) or is_float_object(obj) + + +def is_timedelta64_object(val: object) -> bool: + """ + Cython equivalent of `isinstance(val, np.timedelta64)` + + Parameters + ---------- + val : object + + Returns + ------- + bool + """ + return isinstance(val, np.timedelta64) + + +# cdef inline bint is_datetime64_object(object obj) nogil: +# """ +# Cython equivalent of `isinstance(val, np.datetime64)` + +# Parameters +# ---------- +# val : object + +# Returns +# ------- +# is_datetime64 : bool +# """ +# return PyObject_TypeCheck(obj, &PyDatetimeArrType_Type) + + +# cdef inline bint is_array(object val): +# """ +# Cython equivalent of `isinstance(val, np.ndarray)` + +# Parameters +# ---------- +# val : object + +# Returns +# ------- +# is_ndarray : bool +# """ +# return PyArray_Check(val) + + +# cdef inline bint is_nan(object val): +# """ +# Check if val is a 
Not-A-Number float or complex, including +# float('NaN') and np.nan. + +# Parameters +# ---------- +# val : object + +# Returns +# ------- +# is_nan : bool +# """ +# cdef float64_t fval +# if is_float_object(val): +# fval = val +# return fval != fval +# return is_complex_object(val) and val != val + + +# cdef inline const char* get_c_string_buf_and_size(str py_string, +# Py_ssize_t *length) except NULL: +# """ +# Extract internal char* buffer of unicode or bytes object `py_string` with +# getting length of this internal buffer saved in `length`. + +# Notes +# ----- +# Python object owns memory, thus returned char* must not be freed. +# `length` can be NULL if getting buffer length is not needed. + +# Parameters +# ---------- +# py_string : str +# length : Py_ssize_t* + +# Returns +# ------- +# buf : const char* +# """ +# return PyUnicode_AsUTF8AndSize(py_string, length) + + +# cdef inline const char* get_c_string(str py_string) except NULL: +# return get_c_string_buf_and_size(py_string, NULL) diff --git a/pandas/_libs_numba/util.py b/pandas/_libs_numba/util.py new file mode 100644 index 0000000000000..56239126279ff --- /dev/null +++ b/pandas/_libs_numba/util.py @@ -0,0 +1,50 @@ +# cimport numpy as cnp +# from numpy cimport ndarray + +from pandas._libs_numba.tslibs.util import * # noqa + +# cdef extern from "numpy/ndarraytypes.h": +# void PyArray_CLEARFLAGS(ndarray arr, int flags) nogil + + +# cdef extern from "numpy/arrayobject.h": +# enum: +# NPY_ARRAY_C_CONTIGUOUS +# NPY_ARRAY_F_CONTIGUOUS + + +# cdef extern from "src/headers/stdint.h": +# enum: UINT8_MAX +# enum: UINT16_MAX +# enum: UINT32_MAX +# enum: UINT64_MAX +# enum: INT8_MIN +# enum: INT8_MAX +# enum: INT16_MIN +# enum: INT16_MAX +# enum: INT32_MAX +# enum: INT32_MIN +# enum: INT64_MAX +# enum: INT64_MIN + + +# ctypedef fused numeric: +# cnp.int8_t +# cnp.int16_t +# cnp.int32_t +# cnp.int64_t + +# cnp.uint8_t +# cnp.uint16_t +# cnp.uint32_t +# cnp.uint64_t + +# cnp.float32_t +# cnp.float64_t + + +# cdef inline void set_array_not_contiguous(ndarray ao) nogil: +# # Numpy>=1.8-compliant equivalent to: +# # ao->flags &= ~(NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS); +# PyArray_CLEARFLAGS(ao, +# (NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS)) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 5d788c42054ab..2bcced5597738 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -1,6 +1,8 @@ """ missing types & inference """ +from __future__ import annotations + from functools import partial import numpy as np @@ -54,7 +56,7 @@ INF_AS_NA = False -def isna(obj): +def isna(obj: object): """ Detect missing values for an array-like object. @@ -137,7 +139,7 @@ def isna(obj): isnull = isna -def _isna(obj, inf_as_na: bool = False): +def _isna(obj: object, inf_as_na: bool = False): """ Detect missing values, treating None, NaN or NA as null. Infinite values will also be treated as null if inf_as_na is True. @@ -206,7 +208,7 @@ def _use_inf_as_na(key): globals()["INF_AS_NA"] = False -def _isna_ndarraylike(obj, inf_as_na: bool = False): +def _isna_ndarraylike(obj: object, inf_as_na: bool = False): """ Return an array indicating which values of the input array are NaN / NA. 
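For reference, a minimal sketch of how the numba-backed helpers introduced so far can be exercised directly. The sample values are illustrative, and the snippet assumes the new pandas._libs_numba modules are importable; it is not part of the committed test suite:

    import numpy as np

    from pandas._libs_numba import algos as algos_numba
    from pandas._libs_numba import missing as missing_numba

    # checknull is compiled with forceobj=True, so it runs in numba object
    # mode and accepts arbitrary Python scalars.
    assert missing_numba.checknull(None)
    assert missing_numba.checknull(np.datetime64("NaT"))
    assert not missing_numba.checknull(0.0)

    # pad_inplace forward-fills masked positions in place; with limit=1,
    # at most one missing value per contiguous gap is filled.
    values = np.array([1.0, np.nan, np.nan, 4.0, np.nan])
    mask = np.isnan(values)
    algos_numba.pad_inplace(values, mask, limit=1)
    # values is now [1.0, 1.0, nan, 4.0, 4.0]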
diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 9ae5f7d1b7497..3923f501bbcfe 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -19,6 +19,7 @@ algos, lib, ) +from pandas._libs_numba import algos as algos_numba from pandas._typing import ( ArrayLike, Axis, @@ -694,7 +695,7 @@ def _fillna_prep(values, mask=None): def _pad_1d(values, limit=None, mask=None): values, mask = _fillna_prep(values, mask) - algos.pad_inplace(values, mask, limit=limit) + algos_numba.pad_inplace(values, mask, limit=limit) return values From 8bfeeb22a93c7ac92a8dc331a3f6d0e12b98cd3f Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 20 Feb 2021 13:34:56 +0000 Subject: [PATCH 04/37] jit pad_inplace --- pandas/_libs_numba/algos.py | 73 +++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 32 deletions(-) diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py index 548568d3dd5fb..843b9a8ef828e 100644 --- a/pandas/_libs_numba/algos.py +++ b/pandas/_libs_numba/algos.py @@ -1,3 +1,10 @@ +from __future__ import annotations + +import numba +import numpy as np + +import pandas._libs_numba.util as util + # import cython # from cython import Py_ssize_t @@ -5,8 +12,6 @@ # from libc.stdlib cimport free, malloc # from libc.string cimport memmove -# import numpy as np - # cimport numpy as cnp # from numpy cimport ( # NPY_FLOAT32, @@ -36,7 +41,6 @@ # cnp.import_array() -import pandas._libs_numba.util as util # from pandas._libs.khash cimport ( # kh_destroy_int64, @@ -504,30 +508,16 @@ # uint8_t -def validate_limit(nobs: int, limit=None) -> int: +def _validate_limit(limit: int | None = None) -> None: """ - Check that the `limit` argument is a positive integer. - - Parameters - ---------- - nobs : int - limit : object - - Returns - ------- - int - The limit. + Check that the `limit` argument is a positive integer or None. 
""" if limit is None: - lim = nobs - else: - if not util.is_integer_object(limit): - raise ValueError("Limit must be an integer") - if limit < 1: - raise ValueError("Limit must be greater than 0") - lim = limit - - return lim + return + elif not util.is_integer_object(limit): + raise ValueError("Limit must be an integer") + elif limit < 1: + raise ValueError("Limit must be greater than 0") # @cython.boundscheck(False) @@ -587,22 +577,41 @@ def validate_limit(nobs: int, limit=None) -> int: # return indexer -def pad_inplace(values, mask, limit=None): +def pad_inplace(values: np.ndarray, mask: np.ndarray, limit: int | None = None) -> None: + _validate_limit(limit) + _pad_inplace(values, mask, limit) - fill_count = 0 - N = len(values) +@numba.jit +def _pad_inplace( + values: np.ndarray, mask: np.ndarray, limit: int | None = None +) -> None: + if len(values): + if limit is None: + _pad_inplace_no_limit(values, mask) + else: + _pad_inplace_with_limit(values, mask, limit) - # GH#2778 - if N == 0: - return - lim = validate_limit(N, limit) +@numba.jit +def _pad_inplace_no_limit(values: np.ndarray, mask: np.ndarray) -> None: + N = len(values) + val = values[0] + for i in range(N): + if mask[i]: + values[i] = val + else: + val = values[i] + +@numba.jit +def _pad_inplace_with_limit(values: np.ndarray, mask: np.ndarray, limit: int) -> None: + N = len(values) + fill_count = 0 val = values[0] for i in range(N): if mask[i]: - if fill_count >= lim: + if fill_count >= limit: continue fill_count += 1 values[i] = val From d5ed4d85a433d65d62a9d52d9d0d591116059530 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 20 Feb 2021 16:16:37 +0000 Subject: [PATCH 05/37] is_monotonic to python --- pandas/_libs_numba/algos.py | 127 ++++++++++------------------- pandas/core/arrays/datetimelike.py | 10 +-- 2 files changed, 47 insertions(+), 90 deletions(-) diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py index 843b9a8ef828e..ad9f290d145e0 100644 --- a/pandas/_libs_numba/algos.py +++ b/pandas/_libs_numba/algos.py @@ -794,92 +794,51 @@ def _pad_inplace_with_limit(values: np.ndarray, mask: np.ndarray, limit: int) -> # val = values[j, i] -# @cython.boundscheck(False) -# @cython.wraparound(False) -# def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): -# """ -# Returns -# ------- -# tuple -# is_monotonic_inc : bool -# is_monotonic_dec : bool -# is_unique : bool -# """ -# cdef: -# Py_ssize_t i, n -# algos_t prev, cur -# bint is_monotonic_inc = 1 -# bint is_monotonic_dec = 1 -# bint is_unique = 1 -# bint is_strict_monotonic = 1 - -# n = len(arr) - -# if n == 1: -# if arr[0] != arr[0] or (timelike and arr[0] == NPY_NAT): -# # single value is NaN -# return False, False, True -# else: -# return True, True, True -# elif n < 2: -# return True, True, True +def is_monotonic(arr: np.ndarray) -> tuple[bool, bool, bool]: + """ + Returns + ------- + tuple + is_monotonic_inc : bool + is_monotonic_dec : bool + is_unique : bool + """ + is_monotonic_inc = True + is_monotonic_dec = True + is_unique = True + is_strict_monotonic = True -# if timelike and arr[0] == NPY_NAT: -# return False, False, True + n = len(arr) -# if algos_t is not object: -# with nogil: -# prev = arr[0] -# for i in range(1, n): -# cur = arr[i] -# if timelike and cur == NPY_NAT: -# is_monotonic_inc = 0 -# is_monotonic_dec = 0 -# break -# if cur < prev: -# is_monotonic_inc = 0 -# elif cur > prev: -# is_monotonic_dec = 0 -# elif cur == prev: -# is_unique = 0 -# else: -# # cur or prev is NaN -# is_monotonic_inc = 0 -# is_monotonic_dec = 0 -# 
break -# if not is_monotonic_inc and not is_monotonic_dec: -# is_monotonic_inc = 0 -# is_monotonic_dec = 0 -# break -# prev = cur -# else: -# # object-dtype, identical to above except we cannot use `with nogil` -# prev = arr[0] -# for i in range(1, n): -# cur = arr[i] -# if timelike and cur == NPY_NAT: -# is_monotonic_inc = 0 -# is_monotonic_dec = 0 -# break -# if cur < prev: -# is_monotonic_inc = 0 -# elif cur > prev: -# is_monotonic_dec = 0 -# elif cur == prev: -# is_unique = 0 -# else: -# # cur or prev is NaN -# is_monotonic_inc = 0 -# is_monotonic_dec = 0 -# break -# if not is_monotonic_inc and not is_monotonic_dec: -# is_monotonic_inc = 0 -# is_monotonic_dec = 0 -# break -# prev = cur - -# is_strict_monotonic = is_unique and (is_monotonic_inc or is_monotonic_dec) -# return is_monotonic_inc, is_monotonic_dec, is_strict_monotonic + if n == 1: + if arr[0] != arr[0]: + # single value is NaN/NaT + return False, False, True + else: + return True, True, True + elif n < 2: + return True, True, True + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + is_monotonic_inc = False + elif cur > prev: + is_monotonic_dec = False + elif cur == prev: + is_unique = False + else: + # cur or prev is NaN/NaT + is_monotonic_inc = False + is_monotonic_dec = False + break + if not is_monotonic_inc and not is_monotonic_dec: + break + prev = cur + + is_strict_monotonic = is_unique and (is_monotonic_inc or is_monotonic_dec) + return is_monotonic_inc, is_monotonic_dec, is_strict_monotonic # # ---------------------------------------------------------------------- diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 8e1d7e607fb8a..f4acee6ff1eac 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -21,10 +21,7 @@ import numpy as np -from pandas._libs import ( - algos, - lib, -) +from pandas._libs import lib from pandas._libs.tslibs import ( BaseOffset, IncompatibleFrequency, @@ -43,6 +40,7 @@ round_nsint64, ) from pandas._libs.tslibs.timestamps import integer_op_not_supported +from pandas._libs_numba import algos from pandas._typing import ( DatetimeLikeScalar, Dtype, @@ -961,11 +959,11 @@ def _generate_range( @property def _is_monotonic_increasing(self) -> bool: - return algos.is_monotonic(self.asi8, timelike=True)[0] + return algos.is_monotonic(self._data)[0] @property def _is_monotonic_decreasing(self) -> bool: - return algos.is_monotonic(self.asi8, timelike=True)[1] + return algos.is_monotonic(self._data)[1] @property def _is_unique(self) -> bool: From 1eb62b2f0281d6f4d14c81587a320653b0cd7fb3 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 20 Feb 2021 16:38:29 +0000 Subject: [PATCH 06/37] jit is_monotonic --- pandas/_libs_numba/algos.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py index ad9f290d145e0..7358f3e07af42 100644 --- a/pandas/_libs_numba/algos.py +++ b/pandas/_libs_numba/algos.py @@ -794,6 +794,7 @@ def _pad_inplace_with_limit(values: np.ndarray, mask: np.ndarray, limit: int) -> # val = values[j, i] +@numba.njit def is_monotonic(arr: np.ndarray) -> tuple[bool, bool, bool]: """ Returns From 9e43a6ceb438550b7a5b74318c4c906997d46861 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 20 Feb 2021 21:26:20 +0000 Subject: [PATCH 07/37] backfill_inplace --- pandas/_libs_numba/algos.py | 33 ++++++--------------------------- pandas/core/missing.py | 2 +- 2 files changed, 7 insertions(+), 28 deletions(-) diff --git 
a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py index 7358f3e07af42..de060d72e6841 100644 --- a/pandas/_libs_numba/algos.py +++ b/pandas/_libs_numba/algos.py @@ -586,7 +586,7 @@ def pad_inplace(values: np.ndarray, mask: np.ndarray, limit: int | None = None) def _pad_inplace( values: np.ndarray, mask: np.ndarray, limit: int | None = None ) -> None: - if len(values): + if values.shape[0]: if limit is None: _pad_inplace_no_limit(values, mask) else: @@ -734,32 +734,11 @@ def _pad_inplace_with_limit(values: np.ndarray, mask: np.ndarray, limit: int) -> # return indexer -# @cython.boundscheck(False) -# @cython.wraparound(False) -# def backfill_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None): -# cdef: -# Py_ssize_t i, N -# algos_t val -# int lim, fill_count = 0 - -# N = len(values) - -# # GH#2778 -# if N == 0: -# return - -# lim = validate_limit(N, limit) - -# val = values[N - 1] -# for i in range(N - 1, -1, -1): -# if mask[i]: -# if fill_count >= lim: -# continue -# fill_count += 1 -# values[i] = val -# else: -# fill_count = 0 -# val = values[i] +def backfill_inplace( + values: np.ndarray, mask: np.ndarray, limit: int | None = None +) -> None: + _validate_limit(limit) + _pad_inplace(values[::-1], mask[::-1], limit) # @cython.boundscheck(False) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 3923f501bbcfe..5dd104db23b06 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -701,7 +701,7 @@ def _pad_1d(values, limit=None, mask=None): def _backfill_1d(values, limit=None, mask=None): values, mask = _fillna_prep(values, mask) - algos.backfill_inplace(values, mask, limit=limit) + algos_numba.backfill_inplace(values, mask, limit=limit) return values From 6f0e9b7beb30abbc67cf7c55c38094435a56a02a Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 21 Feb 2021 16:54:42 +0000 Subject: [PATCH 08/37] remove backfill_inplace and call to _fillna_prep --- pandas/_libs_numba/algos.py | 7 ------- pandas/core/missing.py | 9 +++++---- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py index de060d72e6841..62ae85cf5dd5f 100644 --- a/pandas/_libs_numba/algos.py +++ b/pandas/_libs_numba/algos.py @@ -734,13 +734,6 @@ def _pad_inplace_with_limit(values: np.ndarray, mask: np.ndarray, limit: int) -> # return indexer -def backfill_inplace( - values: np.ndarray, mask: np.ndarray, limit: int | None = None -) -> None: - _validate_limit(limit) - _pad_inplace(values[::-1], mask[::-1], limit) - - # @cython.boundscheck(False) # @cython.wraparound(False) # def backfill_2d_inplace(algos_t[:, :] values, diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 5dd104db23b06..76ff646b98201 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -694,15 +694,16 @@ def _fillna_prep(values, mask=None): def _pad_1d(values, limit=None, mask=None): - values, mask = _fillna_prep(values, mask) + if mask is None: + mask = isna(values) algos_numba.pad_inplace(values, mask, limit=limit) return values def _backfill_1d(values, limit=None, mask=None): - values, mask = _fillna_prep(values, mask) - algos_numba.backfill_inplace(values, mask, limit=limit) - return values + if mask is not None: + mask = mask[::-1] + return _pad_1d(values[::-1], limit, mask)[::-1] def _pad_2d(values, limit=None, mask=None): From 5f6953e4f55c2adbca0c0376d0089de7e9e8a0e9 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 22 Feb 2021 20:39:30 +0000 Subject: [PATCH 09/37] pad_2d_inplace --- pandas/_libs_numba/algos.py 
| 63 +++++++++++++--------- pandas/core/missing.py | 2 +- pandas/tests/apply/test_frame_transform.py | 3 +- 3 files changed, 42 insertions(+), 26 deletions(-) diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py index 62ae85cf5dd5f..b6e1ef8443dc1 100644 --- a/pandas/_libs_numba/algos.py +++ b/pandas/_libs_numba/algos.py @@ -620,34 +620,49 @@ def _pad_inplace_with_limit(values: np.ndarray, mask: np.ndarray, limit: int) -> val = values[i] -# @cython.boundscheck(False) -# @cython.wraparound(False) -# def pad_2d_inplace(algos_t[:, :] values, const uint8_t[:, :] mask, limit=None): -# cdef: -# Py_ssize_t i, j, N, K -# algos_t val -# int lim, fill_count = 0 +def pad_2d_inplace( + values: np.ndarray, mask: np.ndarray, limit: int | None = None +) -> None: + _validate_limit(limit) + _pad_2d_inplace(values, mask, limit) -# K, N = (values).shape -# # GH#2778 -# if N == 0: -# return +@numba.jit +def _pad_2d_inplace(values, mask, limit=None): + if values.shape[1]: + if limit is None: + _pad_2d_inplace_no_limit(values, mask) + else: + _pad_2d_inplace_with_limit(values, mask, limit) -# lim = validate_limit(N, limit) -# for j in range(K): -# fill_count = 0 -# val = values[j, 0] -# for i in range(N): -# if mask[j, i]: -# if fill_count >= lim: -# continue -# fill_count += 1 -# values[j, i] = val -# else: -# fill_count = 0 -# val = values[j, i] +@numba.jit +def _pad_2d_inplace_no_limit(values, mask): + K, N = values.shape + for j in range(K): + val = values[j, 0] + for i in range(N): + if mask[j, i]: + values[j, i] = val + else: + val = values[j, i] + + +@numba.jit +def _pad_2d_inplace_with_limit(values, mask, limit): + K, N = values.shape + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= limit: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] # """ diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 76ff646b98201..fc27545ce4a82 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -710,7 +710,7 @@ def _pad_2d(values, limit=None, mask=None): values, mask = _fillna_prep(values, mask) if np.all(values.shape): - algos.pad_2d_inplace(values, mask, limit=limit) + algos_numba.pad_2d_inplace(values, mask, limit=limit) else: # for test coverage pass diff --git a/pandas/tests/apply/test_frame_transform.py b/pandas/tests/apply/test_frame_transform.py index 7718ec5215499..c6052613b2251 100644 --- a/pandas/tests/apply/test_frame_transform.py +++ b/pandas/tests/apply/test_frame_transform.py @@ -1,6 +1,7 @@ import operator import re +from numba import NumbaWarning import numpy as np import pytest @@ -205,7 +206,7 @@ def test_transform_bad_dtype(op, frame_or_series): msg = "Transform function failed" # tshift is deprecated - warn = None if op != "tshift" else FutureWarning + warn = NumbaWarning if op != "tshift" else FutureWarning with tm.assert_produces_warning(warn): with pytest.raises(ValueError, match=msg): obj.transform(op) From 078f688a9b5a715ba89b021ac63ce3a76f9bf99a Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 22 Feb 2021 21:20:45 +0000 Subject: [PATCH 10/37] _backfill_2d --- pandas/core/missing.py | 14 +++++--------- pandas/tests/apply/test_frame_transform.py | 6 ++---- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index fc27545ce4a82..bd4138dfb7295 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -707,7 +707,8 @@ def _backfill_1d(values, limit=None, mask=None): 
def _pad_2d(values, limit=None, mask=None): - values, mask = _fillna_prep(values, mask) + if mask is None: + mask = isna(values) if np.all(values.shape): algos_numba.pad_2d_inplace(values, mask, limit=limit) @@ -718,14 +719,9 @@ def _pad_2d(values, limit=None, mask=None): def _backfill_2d(values, limit=None, mask=None): - values, mask = _fillna_prep(values, mask) - - if np.all(values.shape): - algos.backfill_2d_inplace(values, mask, limit=limit) - else: - # for test coverage - pass - return values + if mask is not None: + mask = mask[:, ::-1] + return _pad_2d(values[:, ::-1], limit, mask)[:, ::-1] _fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d} diff --git a/pandas/tests/apply/test_frame_transform.py b/pandas/tests/apply/test_frame_transform.py index c6052613b2251..fbcdf2bfee074 100644 --- a/pandas/tests/apply/test_frame_transform.py +++ b/pandas/tests/apply/test_frame_transform.py @@ -1,7 +1,6 @@ import operator import re -from numba import NumbaWarning import numpy as np import pytest @@ -194,8 +193,7 @@ def test_transform_reducer_raises(all_reductions, frame_or_series): frame_kernels_raise = [x for x in frame_kernels if x not in wont_fail] -# mypy doesn't allow adding lists of different types -# https://github.com/python/mypy/issues/5492 +@pytest.mark.xfail(strict=False) @pytest.mark.parametrize("op", [*frame_kernels_raise, lambda x: x + 1]) def test_transform_bad_dtype(op, frame_or_series): # GH 35964 @@ -206,7 +204,7 @@ def test_transform_bad_dtype(op, frame_or_series): msg = "Transform function failed" # tshift is deprecated - warn = NumbaWarning if op != "tshift" else FutureWarning + warn = None if op != "tshift" else FutureWarning with tm.assert_produces_warning(warn): with pytest.raises(ValueError, match=msg): obj.transform(op) From d449ca0bfc5c510bb59a8b3c6af225c9c3969039 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 22 Feb 2021 21:34:15 +0000 Subject: [PATCH 11/37] remove _cast_values_for_fillna and _fillna_prep --- pandas/core/missing.py | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index bd4138dfb7295..0d5fdd9ee2beb 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -23,14 +23,11 @@ from pandas._typing import ( ArrayLike, Axis, - DtypeObj, ) from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.cast import infer_dtype_from from pandas.core.dtypes.common import ( - ensure_float64, - is_integer_dtype, is_numeric_v_string_like, needs_i8_conversion, ) @@ -659,40 +656,6 @@ def interpolate_2d( return result -def _cast_values_for_fillna(values, dtype: DtypeObj, has_mask: bool): - """ - Cast values to a dtype that algos.pad and algos.backfill can handle. - """ - # TODO: for int-dtypes we make a copy, but for everything else this - # alters the values in-place. Is this intentional? 
- - if needs_i8_conversion(dtype): - values = values.view(np.int64) - - elif is_integer_dtype(values) and not has_mask: - # NB: this check needs to come after the datetime64 check above - # has_mask check to avoid casting i8 values that have already - # been cast from PeriodDtype - values = ensure_float64(values) - - return values - - -def _fillna_prep(values, mask=None): - # boilerplate for _pad_1d, _backfill_1d, _pad_2d, _backfill_2d - dtype = values.dtype - - has_mask = mask is not None - if not has_mask: - # This needs to occur before datetime/timedeltas are cast to int64 - mask = isna(values) - - values = _cast_values_for_fillna(values, dtype, has_mask) - - mask = mask.view(np.uint8) - return values, mask - - def _pad_1d(values, limit=None, mask=None): if mask is None: mask = isna(values) From 698d3dd3bf83f4f1c5ecd42d0deddf74dedf7d6d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 24 Feb 2021 14:59:26 +0000 Subject: [PATCH 12/37] algos.kth_smallest --- pandas/_libs_numba/algos.py | 58 +++++++++++++++++++------------------ pandas/core/algorithms.py | 3 +- setup.cfg | 1 + 3 files changed, 33 insertions(+), 29 deletions(-) diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py index b6e1ef8443dc1..fb7362152f84a 100644 --- a/pandas/_libs_numba/algos.py +++ b/pandas/_libs_numba/algos.py @@ -230,34 +230,36 @@ # return result, counts -# @cython.boundscheck(False) -# @cython.wraparound(False) -# def kth_smallest(numeric[:] a, Py_ssize_t k) -> numeric: -# cdef: -# Py_ssize_t i, j, l, m, n = a.shape[0] -# numeric x - -# with nogil: -# l = 0 -# m = n - 1 - -# while l < m: -# x = a[k] -# i = l -# j = m - -# while 1: -# while a[i] < x: i += 1 -# while x < a[j]: j -= 1 -# if i <= j: -# swap(&a[i], &a[j]) -# i += 1; j -= 1 - -# if i > j: break - -# if j < k: l = i -# if k < i: m = j -# return a[k] +@numba.njit +def kth_smallest(a: np.ndarray, k): + n = a.shape[0] + + l = 0 + m = n - 1 + + while l < m: + x = a[k] + i = l + j = m + + while 1: + while a[i] < x: + i += 1 + while x < a[j]: + j -= 1 + if i <= j: + a[i], a[j] = a[j], a[i] + i += 1 + j -= 1 + + if i > j: + break + + if j < k: + l = i + if k < i: + m = j + return a[k] # # ---------------------------------------------------------------------- diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 819e5a1c32d9b..dfcc7605a69f5 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -28,6 +28,7 @@ iNaT, lib, ) +from pandas._libs_numba import algos as algos_numba from pandas._typing import ( AnyArrayLike, ArrayLike, @@ -1290,7 +1291,7 @@ def compute(self, method: str) -> Series: narr = len(arr) n = min(n, narr) - kth_val = algos.kth_smallest(arr.copy(), n - 1) + kth_val = algos_numba.kth_smallest(arr.copy(), n - 1) (ns,) = np.nonzero(arr <= kth_val) inds = ns[arr[ns].argsort(kind="mergesort")] diff --git a/setup.cfg b/setup.cfg index 82f4a89f7ab3f..63165c9b19293 100644 --- a/setup.cfg +++ b/setup.cfg @@ -76,6 +76,7 @@ ignore = W504, # line break after binary operator E402, # module level import not at top of file E731, # do not assign a lambda expression, use a def + E741, # ambiguous variable name S001 # found modulo formatter (incorrect picks up mod operations) exclude = doc/sphinxext/*.py, From b10722f51780c0ce3a572fe25d2209f4c1cec9fc Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 24 Feb 2021 19:22:40 +0000 Subject: [PATCH 13/37] is_lexsorted --- pandas/_libs_numba/algos.py | 60 ++++++++++++++++-------------------- pandas/core/indexes/multi.py | 9 ++---- 2 files changed, 29 
insertions(+), 40 deletions(-) diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py index fb7362152f84a..b4c50980b46e1 100644 --- a/pandas/_libs_numba/algos.py +++ b/pandas/_libs_numba/algos.py @@ -141,40 +141,32 @@ # return result -# @cython.wraparound(False) -# @cython.boundscheck(False) -# def is_lexsorted(list_of_arrays: list) -> bint: -# cdef: -# Py_ssize_t i -# Py_ssize_t n, nlevels -# int64_t k, cur, pre -# ndarray arr -# bint result = True - -# nlevels = len(list_of_arrays) -# n = len(list_of_arrays[0]) - -# cdef int64_t **vecs = malloc(nlevels * sizeof(int64_t*)) -# for i in range(nlevels): -# arr = list_of_arrays[i] -# assert arr.dtype.name == 'int64' -# vecs[i] = cnp.PyArray_DATA(arr) - -# # Assume uniqueness?? -# with nogil: -# for i in range(1, n): -# for k in range(nlevels): -# cur = vecs[k][i] -# pre = vecs[k][i -1] -# if cur == pre: -# continue -# elif cur > pre: -# break -# else: -# result = False -# break -# free(vecs) -# return result +def is_lexsorted(list_of_arrays: list[np.ndarray]) -> bool: + nlevels = len(list_of_arrays) + n = len(list_of_arrays[0]) + arr = np.concatenate(list_of_arrays) + arr = arr.reshape(nlevels, n) + return _is_lexsorted(arr) + + +@numba.njit +def _is_lexsorted(vecs: np.ndarray) -> bool: + result = True + nlevels, n = vecs.shape + + for i in range(1, n): + for k in range(nlevels): + cur = vecs[k, i] + pre = vecs[k, i - 1] + if cur == pre: + continue + elif cur > pre: + break + else: + result = False + break + + return result # @cython.boundscheck(False) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 1889821c79756..28a35ccf2b22c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -22,11 +22,11 @@ from pandas._config import get_option from pandas._libs import ( - algos as libalgos, index as libindex, lib, ) from pandas._libs.hashtable import duplicated_int64 +from pandas._libs_numba import algos as libalgos from pandas._typing import ( AnyArrayLike, DtypeObj, @@ -1586,9 +1586,7 @@ def is_monotonic_increasing(self) -> bool: if all(level.is_monotonic for level in self.levels): # If each level is sorted, we can operate on the codes directly. GH27495 - return libalgos.is_lexsorted( - [x.astype("int64", copy=False) for x in self.codes] - ) + return libalgos.is_lexsorted(self.codes) # reversed() because lexsort() wants the most significant key last. 
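         # Falling back to the actual level values handles the case where a
         # level is unsorted, where comparing codes alone would be wrong.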
values = [ @@ -3794,9 +3792,8 @@ def isin(self, values, level=None) -> np.ndarray: def _lexsort_depth(codes: List[np.ndarray], nlevels: int) -> int: """Count depth (up to a maximum of `nlevels`) with which codes are lexsorted.""" - int64_codes = [ensure_int64(level_codes) for level_codes in codes] for k in range(nlevels, 0, -1): - if libalgos.is_lexsorted(int64_codes[:k]): + if libalgos.is_lexsorted(codes[:k]): return k return 0 From d4415b7f6646b7b33207b6a992dcbbe6f3545ab2 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 25 Feb 2021 20:34:14 +0000 Subject: [PATCH 14/37] is_decimal_na #39409 --- pandas/_libs_numba/missing.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/_libs_numba/missing.py b/pandas/_libs_numba/missing.py index 998ff5fe586b1..7cc3273222a3e 100644 --- a/pandas/_libs_numba/missing.py +++ b/pandas/_libs_numba/missing.py @@ -17,6 +17,8 @@ # ) # from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value # noqa +from decimal import Decimal + # from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op # from pandas.compat import IS64 import numba @@ -110,7 +112,16 @@ def checknull(val: object) -> bool: The difference between `checknull` and `checknull_old` is that `checknull` does *not* consider INF or NEGINF to be NA. """ - return val is NA or is_null_datetimelike(val, inat_is_null=False) + return ( + val is NA or is_null_datetimelike(val, inat_is_null=False) or is_decimal_na(val) + ) + + +def is_decimal_na(val: object) -> bool: + """ + Is this a decimal.Decimal object Decimal("NAN"). + """ + return isinstance(val, Decimal) and val != val # cpdef bint checknull_old(object val): From d200fb012ca86c525562ee8b32e841f749437f78 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 5 Mar 2021 18:58:20 +0000 Subject: [PATCH 15/37] xfail test_transform_partial_failure --- pandas/tests/apply/test_series_apply.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index dcb5de29da320..eb901cb04174d 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -257,6 +257,7 @@ def test_transform(string_series): tm.assert_series_equal(result.reindex_like(expected), expected) +@pytest.mark.xfail(strict=False) @pytest.mark.parametrize("op", series_transform_kernels) def test_transform_partial_failure(op, request): # GH 35964 & GH 40211 From 7cd9839dee62e52365125265544ec1cc7f3732b2 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 5 Mar 2021 19:58:12 +0000 Subject: [PATCH 16/37] groupsort_indexer --- pandas/_libs_numba/algos.py | 87 ++++++++++++++----------------- pandas/core/arrays/categorical.py | 3 +- pandas/core/indexes/base.py | 2 +- pandas/core/sorting.py | 2 +- pandas/tests/test_algos.py | 7 +-- 5 files changed, 48 insertions(+), 53 deletions(-) diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py index b4c50980b46e1..97ac2cb87e99f 100644 --- a/pandas/_libs_numba/algos.py +++ b/pandas/_libs_numba/algos.py @@ -169,57 +169,50 @@ def _is_lexsorted(vecs: np.ndarray) -> bool: return result -# @cython.boundscheck(False) -# @cython.wraparound(False) -# def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): -# """ -# Compute a 1-d indexer. - -# The indexer is an ordering of the passed index, -# ordered by the groups. - -# Parameters -# ---------- -# index: int64 ndarray -# Mappings from group -> position. -# ngroups: int64 -# Number of groups. 
- -# Returns -# ------- -# tuple -# 1-d indexer ordered by groups, group counts. - -# Notes -# ----- -# This is a reverse of the label factorization process. -# """ -# cdef: -# Py_ssize_t i, loc, label, n -# ndarray[int64_t] counts, where, result - -# counts = np.zeros(ngroups + 1, dtype=np.int64) -# n = len(index) -# result = np.zeros(n, dtype=np.int64) -# where = np.zeros(ngroups + 1, dtype=np.int64) - -# with nogil: +@numba.njit +def groupsort_indexer(index: np.ndarray, ngroups: int) -> tuple[np.ndarray, np.ndarray]: + """ + Compute a 1-d indexer. -# # count group sizes, location 0 for NA -# for i in range(n): -# counts[index[i] + 1] += 1 + The indexer is an ordering of the passed index, + ordered by the groups. -# # mark the start of each contiguous group of like-indexed data -# for i in range(1, ngroups + 1): -# where[i] = where[i - 1] + counts[i - 1] + Parameters + ---------- + index: ndarray + Mappings from group -> position. + ngroups: int + Number of groups. -# # this is our indexer -# for i in range(n): -# label = index[i] + 1 -# result[where[label]] = i -# where[label] += 1 + Returns + ------- + tuple + 1-d indexer ordered by groups, group counts. -# return result, counts + Notes + ----- + This is a reverse of the label factorization process. + """ + counts = np.zeros(ngroups + 1, dtype=np.int64) + n = len(index) + result = np.zeros(n, dtype=np.int64) + where = np.zeros(ngroups + 1, dtype=np.int64) + + # count group sizes, location 0 for NA + for i in range(n): + counts[index[i] + 1] += 1 + + # mark the start of each contiguous group of like-indexed data + for i in range(1, ngroups + 1): + where[i] = where[i - 1] + counts[i - 1] + + # this is our indexer + for i in range(n): + label = index[i] + 1 + result[where[label]] = i + where[label] += 1 + + return result, counts @numba.njit diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 7777cb4bf674e..9d1471e2bbaef 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -28,6 +28,7 @@ hashtable as htable, ) from pandas._libs.lib import no_default +from pandas._libs_numba import algos as libalgos_numba from pandas._typing import ( ArrayLike, Dtype, @@ -2014,7 +2015,7 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: """ categories = self.categories - r, counts = libalgos.groupsort_indexer( + r, counts = libalgos_numba.groupsort_indexer( self.codes.astype("int64"), categories.size ) counts = counts.cumsum() diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7f5e7e3a32f14..cf440991f5c39 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -27,7 +27,6 @@ import numpy as np from pandas._libs import ( - algos as libalgos, index as libindex, lib, ) @@ -42,6 +41,7 @@ Timestamp, tz_compare, ) +from pandas._libs_numba import algos as libalgos from pandas._typing import ( AnyArrayLike, ArrayLike, diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 973fed2c1436f..58225f6bc2e29 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -18,11 +18,11 @@ import numpy as np from pandas._libs import ( - algos, hashtable, lib, ) from pandas._libs.hashtable import unique_label_indices +from pandas._libs_numba import algos from pandas._typing import IndexKeyFunc from pandas.core.dtypes.common import ( diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 26d336bee65ea..deafbc1d58261 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -9,6 +9,7 @@ algos as 
libalgos, hashtable as ht, ) +from pandas._libs_numba import algos as libalgos_numba from pandas.compat import np_array_datetime64_compat import pandas.util._test_decorators as td @@ -2112,14 +2113,14 @@ def test_is_lexsorted(): ), ] - assert not libalgos.is_lexsorted(failure) + assert not libalgos_numba.is_lexsorted(failure) def test_groupsort_indexer(): a = np.random.randint(0, 1000, 100).astype(np.int64) b = np.random.randint(0, 1000, 100).astype(np.int64) - result = libalgos.groupsort_indexer(a, 1000)[0] + result = libalgos_numba.groupsort_indexer(a, 1000)[0] # need to use a stable sort # np.argsort returns int, groupsort_indexer @@ -2133,7 +2134,7 @@ def test_groupsort_indexer(): # np.lexsort returns int, groupsort_indexer # always returns int64 key = a * 1000 + b - result = libalgos.groupsort_indexer(key, 1000000)[0] + result = libalgos_numba.groupsort_indexer(key, 1000000)[0] expected = np.lexsort((b, a)) expected = expected.astype(np.int64) From c25337bfe40aae53bb1a51edf0bc3322be1a2c9a Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 7 Mar 2021 17:15:04 +0000 Subject: [PATCH 17/37] update for changes in #39953 --- pandas/_libs_numba/algos.py | 13 +++++++------ pandas/core/missing.py | 14 +++++++------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py index 97ac2cb87e99f..7a203bf0a68b9 100644 --- a/pandas/_libs_numba/algos.py +++ b/pandas/_libs_numba/algos.py @@ -583,28 +583,29 @@ def _pad_inplace( @numba.jit def _pad_inplace_no_limit(values: np.ndarray, mask: np.ndarray) -> None: N = len(values) - val = values[0] + val, prev_mask = values[0], mask[0] for i in range(N): if mask[i]: - values[i] = val + values[i], mask[i] = val, prev_mask else: - val = values[i] + val, prev_mask = values[i], mask[i] @numba.jit def _pad_inplace_with_limit(values: np.ndarray, mask: np.ndarray, limit: int) -> None: N = len(values) fill_count = 0 - val = values[0] + val, prev_mask = values[0], mask[0] for i in range(N): if mask[i]: if fill_count >= limit: continue fill_count += 1 - values[i] = val + values[i], mask[i] = val, prev_mask + else: fill_count = 0 - val = values[i] + val, prev_mask = values[i], mask[i] def pad_2d_inplace( diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 29299d4bfd0f4..bce39d80abfb9 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -676,13 +676,12 @@ def _pad_1d(values, limit=None, mask=None): if mask is None: mask = isna(values) algos_numba.pad_inplace(values, mask, limit=limit) - return values + return values, mask def _backfill_1d(values, limit=None, mask=None): - if mask is not None: - mask = mask[::-1] - return _pad_1d(values[::-1], limit, mask)[::-1] + _, new_mask = _pad_1d(values[::-1], limit, mask[::-1] if mask is not None else None) + return values, (mask if mask is not None else new_mask) def _pad_2d(values, limit=None, mask=None): @@ -698,9 +697,10 @@ def _pad_2d(values, limit=None, mask=None): def _backfill_2d(values, limit=None, mask=None): - if mask is not None: - mask = mask[:, ::-1] - return _pad_2d(values[:, ::-1], limit, mask)[:, ::-1] + _, new_mask = _pad_2d( + values[:, ::-1], limit, mask[:, ::-1] if mask is not None else None + ) + return values, (mask if mask is not None else new_mask) _fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d} From 7316492eccfd6f4f35c75ebfbe140d138b4be57a Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 7 Mar 2021 17:30:10 +0000 Subject: [PATCH 18/37] add warm up to time_frame_fillna --- 
asv_bench/benchmarks/frame_methods.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 65167e6467fd5..48ddee8b5ff52 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -391,6 +391,10 @@ def setup(self, inplace, method, dtype): values = values.round() self.df = DataFrame(values, dtype=dtype) + # warm up + df2 = self.df.copy() + df2.fillna(inplace=inplace, method=method) + def time_frame_fillna(self, inplace, method, dtype): self.df.fillna(inplace=inplace, method=method) From fa4cbaf2a99b150f7c79568f004be1d182823b46 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 8 Mar 2021 19:31:48 +0000 Subject: [PATCH 19/37] minor clean --- pandas/_libs_numba/algos.py | 47 +------------------------------------ 1 file changed, 1 insertion(+), 46 deletions(-) diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py index 7a203bf0a68b9..85a5b2c64345b 100644 --- a/pandas/_libs_numba/algos.py +++ b/pandas/_libs_numba/algos.py @@ -479,20 +479,7 @@ def kth_smallest(a: np.ndarray, k): # return result -# # ---------------------------------------------------------------------- - -# ctypedef fused algos_t: -# float64_t -# float32_t -# object -# int64_t -# int32_t -# int16_t -# int8_t -# uint64_t -# uint32_t -# uint16_t -# uint8_t +# ---------------------------------------------------------------------- def _validate_limit(limit: int | None = None) -> None: @@ -737,38 +724,6 @@ def _pad_2d_inplace_with_limit(values, mask, limit): # return indexer -# @cython.boundscheck(False) -# @cython.wraparound(False) -# def backfill_2d_inplace(algos_t[:, :] values, -# const uint8_t[:, :] mask, -# limit=None): -# cdef: -# Py_ssize_t i, j, N, K -# algos_t val -# int lim, fill_count = 0 - -# K, N = (values).shape - -# # GH#2778 -# if N == 0: -# return - -# lim = validate_limit(N, limit) - -# for j in range(K): -# fill_count = 0 -# val = values[j, N - 1] -# for i in range(N - 1, -1, -1): -# if mask[j, i]: -# if fill_count >= lim: -# continue -# fill_count += 1 -# values[j, i] = val -# else: -# fill_count = 0 -# val = values[j, i] - - @numba.njit def is_monotonic(arr: np.ndarray) -> tuple[bool, bool, bool]: """ From 09a4fae7713134a77f4462a300e393eb96fea824 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 8 Mar 2021 20:06:58 +0000 Subject: [PATCH 20/37] make _validate_limit public for interpolate_1d --- pandas/_libs_numba/algos.py | 6 +++--- pandas/core/missing.py | 13 +++++-------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py index 85a5b2c64345b..c66ca013eb494 100644 --- a/pandas/_libs_numba/algos.py +++ b/pandas/_libs_numba/algos.py @@ -482,7 +482,7 @@ def kth_smallest(a: np.ndarray, k): # ---------------------------------------------------------------------- -def _validate_limit(limit: int | None = None) -> None: +def validate_limit(limit: int | None = None) -> None: """ Check that the `limit` argument is a positive integer or None. 
""" @@ -552,7 +552,7 @@ def _validate_limit(limit: int | None = None) -> None: def pad_inplace(values: np.ndarray, mask: np.ndarray, limit: int | None = None) -> None: - _validate_limit(limit) + validate_limit(limit) _pad_inplace(values, mask, limit) @@ -598,7 +598,7 @@ def _pad_inplace_with_limit(values: np.ndarray, mask: np.ndarray, limit: int) -> def pad_2d_inplace( values: np.ndarray, mask: np.ndarray, limit: int | None = None ) -> None: - _validate_limit(limit) + validate_limit(limit) _pad_2d_inplace(values, mask, limit) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index bce39d80abfb9..e63fb73ae4a95 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -15,11 +15,8 @@ import numpy as np -from pandas._libs import ( - algos, - lib, -) -from pandas._libs_numba import algos as algos_numba +from pandas._libs import lib +from pandas._libs_numba import algos from pandas._typing import ( ArrayLike, Axis, @@ -244,7 +241,7 @@ def interpolate_1d( ) # default limit is unlimited GH #16282 - limit = algos.validate_limit(nobs=None, limit=limit) + algos.validate_limit(limit=limit) # These are sets of index pointers to invalid values... i.e. {0, 1, etc... all_nans = set(np.flatnonzero(invalid)) @@ -675,7 +672,7 @@ def interpolate_2d( def _pad_1d(values, limit=None, mask=None): if mask is None: mask = isna(values) - algos_numba.pad_inplace(values, mask, limit=limit) + algos.pad_inplace(values, mask, limit=limit) return values, mask @@ -689,7 +686,7 @@ def _pad_2d(values, limit=None, mask=None): mask = isna(values) if np.all(values.shape): - algos_numba.pad_2d_inplace(values, mask, limit=limit) + algos.pad_2d_inplace(values, mask, limit=limit) else: # for test coverage pass From 4300a3324865d9534eb2b9ead68164df8356fd9a Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 8 Mar 2021 21:42:30 +0000 Subject: [PATCH 21/37] pure python ensure_platform_int from python code --- pandas/_libs_numba/algos.py | 62 ++++++++++++++++++++++++++++++- pandas/core/arrays/categorical.py | 8 ++-- pandas/core/dtypes/common.py | 3 +- pandas/tests/test_algos.py | 2 +- 4 files changed, 68 insertions(+), 7 deletions(-) diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py index c66ca013eb494..0843f2e3dbbbe 100644 --- a/pandas/_libs_numba/algos.py +++ b/pandas/_libs_numba/algos.py @@ -1341,6 +1341,66 @@ def is_monotonic(arr: np.ndarray) -> tuple[bool, bool, bool]: # out[i, j] = left - right +# ---------------------------------------------------------------------- +# ensure_dtype +# ---------------------------------------------------------------------- + + +def ensure_platform_int(arr): + # GH3033, GH1392 + # platform int is the size of the int pointer, e.g. 
np.intp + if isinstance(arr, np.ndarray): + return arr.astype(np.intp, copy=False) + else: + return np.array(arr, dtype=np.intp) + + +# def ensure_object(object arr): +# if util.is_array(arr): +# if (arr).descr.type_num == NPY_OBJECT: +# return arr +# else: +# # equiv: arr.astype(object) +# return cnp.PyArray_Cast(arr, NPY_OBJECT) +# else: +# return np.array(arr, dtype=np.object_) + +# {{py: + +# # name, c_type, dtype +# dtypes = [('float64', 'FLOAT64', 'float64'), +# ('float32', 'FLOAT32', 'float32'), +# ('int8', 'INT8', 'int8'), +# ('int16', 'INT16', 'int16'), +# ('int32', 'INT32', 'int32'), +# ('int64', 'INT64', 'int64'), +# ('uint8', 'UINT8', 'uint8'), +# ('uint16', 'UINT16', 'uint16'), +# ('uint32', 'UINT32', 'uint32'), +# ('uint64', 'UINT64', 'uint64'), +# # ('platform_int', 'INT', 'int_'), +# # ('object', 'OBJECT', 'object_'), +# ] + +# def get_dispatch(dtypes): + +# for name, c_type, dtype in dtypes: +# yield name, c_type, dtype +# }} + +# {{for name, c_type, dtype in get_dispatch(dtypes)}} + + +# def ensure_{{name}}(object arr, copy=True): +# if util.is_array(arr): +# if (arr).descr.type_num == NPY_{{c_type}}: +# return arr +# else: +# return arr.astype(np.{{dtype}}, copy=copy) +# else: +# return np.array(arr, dtype=np.{{dtype}}) + +# {{endfor}} + # # generated from template -# include "algos_common_helper.pxi" # include "algos_take_helper.pxi" diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 9d1471e2bbaef..30b50a40b863e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -24,11 +24,10 @@ from pandas._libs import ( NaT, - algos as libalgos, hashtable as htable, ) from pandas._libs.lib import no_default -from pandas._libs_numba import algos as libalgos_numba +from pandas._libs_numba import algos as libalgos from pandas._typing import ( ArrayLike, Dtype, @@ -53,6 +52,7 @@ from pandas.core.dtypes.common import ( ensure_int64, ensure_object, + ensure_platform_int, is_categorical_dtype, is_datetime64_dtype, is_dict_like, @@ -519,7 +519,7 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}" raise ValueError(msg) - result = take_nd(new_cats, libalgos.ensure_platform_int(self._codes)) + result = take_nd(new_cats, ensure_platform_int(self._codes)) return result @@ -2015,7 +2015,7 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: """ categories = self.categories - r, counts = libalgos_numba.groupsort_indexer( + r, counts = libalgos.groupsort_indexer( self.codes.astype("int64"), categories.size ) counts = counts.cumsum() diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 0966d0b93cc25..32660c59573fb 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -17,6 +17,7 @@ algos, ) from pandas._libs.tslibs import conversion +from pandas._libs_numba import algos as algos_numba from pandas._typing import ( ArrayLike, DtypeObj, @@ -113,7 +114,7 @@ def ensure_float(arr): ensure_int32 = algos.ensure_int32 ensure_int16 = algos.ensure_int16 ensure_int8 = algos.ensure_int8 -ensure_platform_int = algos.ensure_platform_int +ensure_platform_int = algos_numba.ensure_platform_int ensure_object = algos.ensure_object diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index deafbc1d58261..a54ac323a4b40 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2196,7 +2196,7 @@ def test_infinity_against_nan(): def test_ensure_platform_int(): arr = np.arange(100, 
dtype=np.intp) - result = libalgos.ensure_platform_int(arr) + result = libalgos_numba.ensure_platform_int(arr) assert result is arr From 799d2d26b287a9acfabba3e5b8bb2ca41b6b49b9 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 9 Mar 2021 20:48:20 +0000 Subject: [PATCH 22/37] other ensure_dtype functions (ignoring copy keyword argument) --- pandas/_libs_numba/algos.py | 120 ++++++++++++++++++++------------ pandas/core/array_algos/take.py | 2 +- pandas/core/dtypes/common.py | 5 +- 3 files changed, 78 insertions(+), 49 deletions(-) diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py index 0843f2e3dbbbe..1a844e71acaea 100644 --- a/pandas/_libs_numba/algos.py +++ b/pandas/_libs_numba/algos.py @@ -1355,52 +1355,82 @@ def ensure_platform_int(arr): return np.array(arr, dtype=np.intp) -# def ensure_object(object arr): -# if util.is_array(arr): -# if (arr).descr.type_num == NPY_OBJECT: -# return arr -# else: -# # equiv: arr.astype(object) -# return cnp.PyArray_Cast(arr, NPY_OBJECT) -# else: -# return np.array(arr, dtype=np.object_) - -# {{py: - -# # name, c_type, dtype -# dtypes = [('float64', 'FLOAT64', 'float64'), -# ('float32', 'FLOAT32', 'float32'), -# ('int8', 'INT8', 'int8'), -# ('int16', 'INT16', 'int16'), -# ('int32', 'INT32', 'int32'), -# ('int64', 'INT64', 'int64'), -# ('uint8', 'UINT8', 'uint8'), -# ('uint16', 'UINT16', 'uint16'), -# ('uint32', 'UINT32', 'uint32'), -# ('uint64', 'UINT64', 'uint64'), -# # ('platform_int', 'INT', 'int_'), -# # ('object', 'OBJECT', 'object_'), -# ] - -# def get_dispatch(dtypes): - -# for name, c_type, dtype in dtypes: -# yield name, c_type, dtype -# }} - -# {{for name, c_type, dtype in get_dispatch(dtypes)}} - - -# def ensure_{{name}}(object arr, copy=True): -# if util.is_array(arr): -# if (arr).descr.type_num == NPY_{{c_type}}: -# return arr -# else: -# return arr.astype(np.{{dtype}}, copy=copy) -# else: -# return np.array(arr, dtype=np.{{dtype}}) +def ensure_object(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.object_, copy=False) + else: + return np.array(arr, dtype=np.object_) + + +def ensure_float64(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.float64, copy=False) + else: + return np.array(arr, dtype=np.float64) + + +def ensure_float32(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.float32, copy=False) + else: + return np.array(arr, dtype=np.float32) + + +def ensure_int8(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.int8, copy=False) + else: + return np.array(arr, dtype=np.int8) + + +def ensure_int16(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.int16, copy=False) + else: + return np.array(arr, dtype=np.int16) + + +def ensure_int32(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.int32, copy=False) + else: + return np.array(arr, dtype=np.int32) + + +def ensure_int64(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.int64, copy=False) + else: + return np.array(arr, dtype=np.int64) + + +def ensure_uint8(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.uint8, copy=False) + else: + return np.array(arr, dtype=np.uint8) + + +def ensure_uint16(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.uint16, copy=False) + else: + return np.array(arr, dtype=np.uint16) + + +def ensure_uint32(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.uint32, copy=False) + else: + return np.array(arr, dtype=np.uint32) + + +def ensure_uint64(arr): + if isinstance(arr, np.ndarray): + return 
arr.astype(np.uint64, copy=False) + else: + return np.array(arr, dtype=np.uint64) -# {{endfor}} # # generated from template # include "algos_take_helper.pxi" diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index 054497089c5ab..8e0cc5e41e6a4 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -474,7 +474,7 @@ def _take_preprocess_indexer_and_fill_value( indexer = np.arange(arr.shape[axis], dtype=np.int64) dtype, fill_value = arr.dtype, arr.dtype.type() else: - indexer = ensure_int64(indexer, copy=False) + indexer = ensure_int64(indexer) if not allow_fill: dtype, fill_value = arr.dtype, arr.dtype.type() mask_info = None, False diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 32660c59573fb..bb94c6de5a5c6 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -14,10 +14,9 @@ from pandas._libs import ( Interval, Period, - algos, ) from pandas._libs.tslibs import conversion -from pandas._libs_numba import algos as algos_numba +from pandas._libs_numba import algos from pandas._typing import ( ArrayLike, DtypeObj, @@ -114,7 +113,7 @@ def ensure_float(arr): ensure_int32 = algos.ensure_int32 ensure_int16 = algos.ensure_int16 ensure_int8 = algos.ensure_int8 -ensure_platform_int = algos_numba.ensure_platform_int +ensure_platform_int = algos.ensure_platform_int ensure_object = algos.ensure_object From 9496c5213ca0c41e408f42e4a4de6d25204b67c0 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 11 Mar 2021 20:35:54 +0000 Subject: [PATCH 23/37] mypy fixup --- pandas/core/arrays/categorical.py | 6 +++++- pandas/core/missing.py | 4 +--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f4e886d96834d..3728fdb813dbb 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -545,7 +545,11 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}" raise ValueError(msg) - result = take_nd(new_cats, ensure_platform_int(self._codes)) + # error: Incompatible types in assignment (expression has type "ndarray", + # variable has type "Categorical") + result = take_nd( # type: ignore[assignment] + new_cats, ensure_platform_int(self._codes) + ) # error: Incompatible return value type (got "Categorical", expected "ndarray") return result # type: ignore[return-value] diff --git a/pandas/core/missing.py b/pandas/core/missing.py index fe029bda393d3..e5583070250f4 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -676,9 +676,7 @@ def _pad_1d(values, limit=None, mask=None): if mask is None: mask = isna(values) algos.pad_inplace(values, mask, limit=limit) - # error: Incompatible return value type (got "Tuple[ndarray, Optional[ndarray]]", - # expected "Tuple[ndarray, ndarray]") - return values, mask # type: ignore[return-value] + return values, mask def _backfill_1d(values, limit=None, mask=None): From 9e89225fe3f56473b8601b655fa7bef6d5906d73 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 12 Mar 2021 16:52:58 +0000 Subject: [PATCH 24/37] algos.diff_2d --- pandas/_libs_numba/algos.py | 163 ++++++++++++------------------------ pandas/core/algorithms.py | 30 ++----- 2 files changed, 58 insertions(+), 135 deletions(-) diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py index 1a844e71acaea..f18907f0f1700 100644 --- a/pandas/_libs_numba/algos.py +++ b/pandas/_libs_numba/algos.py 
@@ -1228,117 +1228,58 @@ def is_monotonic(arr: np.ndarray) -> tuple[bool, bool, bool]: # return ranks -# ctypedef fused diff_t: -# float64_t -# float32_t -# int8_t -# int16_t -# int32_t -# int64_t - -# ctypedef fused out_t: -# float32_t -# float64_t -# int64_t - - -# @cython.boundscheck(False) -# @cython.wraparound(False) -# def diff_2d( -# ndarray[diff_t, ndim=2] arr, # TODO(cython 3) update to "const diff_t[:, :] arr" -# ndarray[out_t, ndim=2] out, -# Py_ssize_t periods, -# int axis, -# bint datetimelike=False, -# ): -# cdef: -# Py_ssize_t i, j, sx, sy, start, stop -# bint f_contig = arr.flags.f_contiguous -# # bint f_contig = arr.is_f_contig() # TODO(cython 3) -# diff_t left, right - -# # Disable for unsupported dtype combinations, -# # see https://github.com/cython/cython/issues/2646 -# if (out_t is float32_t -# and not (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)): -# raise NotImplementedError -# elif (out_t is float64_t -# and (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)): -# raise NotImplementedError -# elif out_t is int64_t and diff_t is not int64_t: -# # We only have out_t of int64_t if we have datetimelike -# raise NotImplementedError -# else: -# # We put this inside an indented else block to avoid cython build -# # warnings about unreachable code -# sx, sy = (arr).shape -# with nogil: -# if f_contig: -# if axis == 0: -# if periods >= 0: -# start, stop = periods, sx -# else: -# start, stop = 0, sx + periods -# for j in range(sy): -# for i in range(start, stop): -# left = arr[i, j] -# right = arr[i - periods, j] -# if out_t is int64_t and datetimelike: -# if left == NPY_NAT or right == NPY_NAT: -# out[i, j] = NPY_NAT -# else: -# out[i, j] = left - right -# else: -# out[i, j] = left - right -# else: -# if periods >= 0: -# start, stop = periods, sy -# else: -# start, stop = 0, sy + periods -# for j in range(start, stop): -# for i in range(sx): -# left = arr[i, j] -# right = arr[i, j - periods] -# if out_t is int64_t and datetimelike: -# if left == NPY_NAT or right == NPY_NAT: -# out[i, j] = NPY_NAT -# else: -# out[i, j] = left - right -# else: -# out[i, j] = left - right -# else: -# if axis == 0: -# if periods >= 0: -# start, stop = periods, sx -# else: -# start, stop = 0, sx + periods -# for i in range(start, stop): -# for j in range(sy): -# left = arr[i, j] -# right = arr[i - periods, j] -# if out_t is int64_t and datetimelike: -# if left == NPY_NAT or right == NPY_NAT: -# out[i, j] = NPY_NAT -# else: -# out[i, j] = left - right -# else: -# out[i, j] = left - right -# else: -# if periods >= 0: -# start, stop = periods, sy -# else: -# start, stop = 0, sy + periods -# for i in range(sx): -# for j in range(start, stop): -# left = arr[i, j] -# right = arr[i, j - periods] -# if out_t is int64_t and datetimelike: -# if left == NPY_NAT or right == NPY_NAT: -# out[i, j] = NPY_NAT -# else: -# out[i, j] = left - right -# else: -# out[i, j] = left - right +@numba.njit +def diff_2d( + arr: np.ndarray, + out: np.ndarray, + periods: int, + axis: int, +): + f_contig = arr.flags.f_contiguous + + sx, sy = arr.shape + if f_contig: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + left = arr[i, j] + right = arr[i - periods, j] + out[i, j] = left - right + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + left = arr[i, j] + right = arr[i, j - periods] + out[i, j] 
= left - right + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + left = arr[i, j] + right = arr[i - periods, j] + out[i, j] = left - right + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + left = arr[i, j] + right = arr[i, j - periods] + out[i, j] = left - right # ---------------------------------------------------------------------- diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e82e1a653749e..49f9c16a5e847 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1680,8 +1680,8 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): is_timedelta = False is_bool = False if needs_i8_conversion(arr.dtype): - dtype = np.int64 - arr = arr.view("i8") + dtype = np.dtype("timedelta64[ns]") + arr = getattr(arr, "_data", arr) na = iNaT is_timedelta = True @@ -1713,10 +1713,11 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None) out_arr[tuple(na_indexer)] = na - if arr.ndim == 2 and arr.dtype.name in _diff_special: + if arr.dtype.name in _diff_special or is_timedelta: + assert isinstance(arr, np.ndarray), type(arr) # TODO: can diff_2d dtype specialization troubles be fixed by defining # out_arr inside diff_2d? - algos.diff_2d(arr, out_arr, n, axis, datetimelike=is_timedelta) + algos_numba.diff_2d(arr, out_arr, n, axis) else: # To keep mypy happy, _res_indexer is a list while res_indexer is # a tuple, ditto for lag_indexer. @@ -1728,30 +1729,11 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): _lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None) lag_indexer = tuple(_lag_indexer) - # need to make sure that we account for na for datelike/timedelta - # we don't actually want to subtract these i8 numbers - if is_timedelta: - res = arr[res_indexer] - lag = arr[lag_indexer] - - mask = (arr[res_indexer] == na) | (arr[lag_indexer] == na) - if mask.any(): - res = res.copy() - res[mask] = 0 - lag = lag.copy() - lag[mask] = 0 - - result = res - lag - result[mask] = na - out_arr[res_indexer] = result - elif is_bool: + if is_bool: out_arr[res_indexer] = arr[res_indexer] ^ arr[lag_indexer] else: out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer] - if is_timedelta: - out_arr = out_arr.view("timedelta64[ns]") - if orig_ndim == 1: out_arr = out_arr[:, 0] return out_arr From 0e12eafdd0692d141093c10d187fa3592a8857b4 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 12 Mar 2021 19:48:23 +0000 Subject: [PATCH 25/37] time_timestamp_ops_diff --- asv_bench/benchmarks/arithmetic.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index 488237a6f5a8b..5f73e7156a6ad 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -267,6 +267,10 @@ def setup(self, tz): self.s2 = Series(date_range("20010101", periods=N, freq="s", tz=tz)) + # warm-up for numba + + self.s2.diff() + def time_series_timestamp_compare(self, tz): self.s <= self.ts From 3e418cc94bccf5ce3e5db884943fbf4c74244ed8 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 12 Mar 2021 20:20:13 +0000 Subject: [PATCH 26/37] iNaT > None --- pandas/core/algorithms.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 
49f9c16a5e847..507d58cc24167 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -8,6 +8,7 @@ from textwrap import dedent from typing import ( TYPE_CHECKING, + Any, Dict, Optional, Tuple, @@ -1649,7 +1650,7 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): """ n = int(n) - na = np.nan + na: Any = np.nan dtype = arr.dtype if dtype.kind == "b": @@ -1682,7 +1683,7 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): if needs_i8_conversion(arr.dtype): dtype = np.dtype("timedelta64[ns]") arr = getattr(arr, "_data", arr) - na = iNaT + na = None is_timedelta = True elif is_bool_dtype(dtype): From 991bbb391ea1af9806ff5ecbd6178473d48cb1da Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 16 Mar 2021 19:22:47 +0000 Subject: [PATCH 27/37] mypy fixup for ensure_int64 --- pandas/core/array_algos/take.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index 31cbadb0e442b..bf9373154a4d9 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -120,7 +120,7 @@ def _take_nd_ndarray( indexer = np.arange(arr.shape[axis], dtype=np.int64) dtype, fill_value = arr.dtype, arr.dtype.type() else: - indexer = ensure_int64(indexer, copy=False) + indexer = ensure_int64(indexer) indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( arr, indexer, out, fill_value, allow_fill ) From 886d938d2254da205b4a659b4e261a62e5981b06 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 17 Mar 2021 20:59:55 +0000 Subject: [PATCH 28/37] algos.unique_deltas - not optimised --- pandas/_libs_numba/algos.py | 60 +++++++++++++++-------------------- pandas/tseries/frequencies.py | 2 +- 2 files changed, 26 insertions(+), 36 deletions(-) diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py index f18907f0f1700..d243ac457c1b9 100644 --- a/pandas/_libs_numba/algos.py +++ b/pandas/_libs_numba/algos.py @@ -102,43 +102,33 @@ # __ge__ = lambda self, other: isinstance(other, NegInfinity) -# @cython.wraparound(False) -# @cython.boundscheck(False) -# cpdef ndarray[int64_t, ndim=1] unique_deltas(const int64_t[:] arr): -# """ -# Efficiently find the unique first-differences of the given array. +@numba.njit +def unique_deltas(arr: np.ndarray) -> np.ndarray: + """ + Efficiently find the unique first-differences of the given array. 
-#     Parameters
-#     ----------
-#     arr : ndarray[in64_t]
+    Parameters
+    ----------
+    arr : ndarray[int64_t]
 
-#     Returns
-#     -------
-#     ndarray[int64_t]
-#         An ordered ndarray[int64_t]
-#     """
+    Returns
+    -------
+    ndarray[int64_t, ndim=1]
+        An ordered ndarray[int64_t]
+    """
-#     cdef:
-#         Py_ssize_t i, n = len(arr)
-#         int64_t val
-#         khiter_t k
-#         kh_int64_t *table
-#         int ret = 0
-#         list uniques = []
-#         ndarray[int64_t, ndim=1] result
-
-#     table = kh_init_int64()
-#     kh_resize_int64(table, 10)
-#     for i in range(n - 1):
-#         val = arr[i + 1] - arr[i]
-#         k = kh_get_int64(table, val)
-#         if k == table.n_buckets:
-#             kh_put_int64(table, val, &ret)
-#             uniques.append(val)
-#     kh_destroy_int64(table)
-
-#     result = np.array(uniques, dtype=np.int64)
-#     result.sort()
-#     return result
+    n = len(arr)
+    uniques = []
+    seen = set()
+
+    for i in range(n - 1):
+        val = arr[i + 1] - arr[i]
+        if val not in seen:
+            seen.add(val)
+            uniques.append(val)
+
+    result = np.array(uniques, dtype=np.int64)
+    result.sort()
+    return result
 
 
 def is_lexsorted(list_of_arrays: list[np.ndarray]) -> bool:
diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py
index c5b875b8f027e..22a4379764137 100644
--- a/pandas/tseries/frequencies.py
+++ b/pandas/tseries/frequencies.py
@@ -3,7 +3,6 @@
 
 import numpy as np
 
-from pandas._libs.algos import unique_deltas
 from pandas._libs.tslibs import (
     Timestamp,
     tzconversion,
@@ -26,6 +25,7 @@
     to_offset,
 )
 from pandas._libs.tslibs.parsing import get_rule_month
+from pandas._libs_numba.algos import unique_deltas
 from pandas.util._decorators import cache_readonly
 
 from pandas.core.dtypes.common import (

From b9aa7a52933a87e8ae6599ddd70733caa5f0105c Mon Sep 17 00:00:00 2001
From: Simon Hawkins
Date: Fri, 19 Mar 2021 14:35:01 +0000
Subject: [PATCH 29/37] warm-up for multiindex_object.Integer.time_is_monotonic

---
 asv_bench/benchmarks/multiindex_object.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py
index 25df5b0214959..fb36b711af197 100644
--- a/asv_bench/benchmarks/multiindex_object.py
+++ b/asv_bench/benchmarks/multiindex_object.py
@@ -101,6 +101,9 @@ def setup(self):
                 (1045, -843),
             ]
         )
+
+        # numba warm-up
+        self.mi_int[:10].is_monotonic
 
     def time_get_indexer(self):
         self.mi_int.get_indexer(self.obj_index)

From 145f27512db031302427228db5f31aff651e47fe Mon Sep 17 00:00:00 2001
From: Simon Hawkins
Date: Fri, 19 Mar 2021 14:36:55 +0000
Subject: [PATCH 30/37] pre-commit fix

---
 asv_bench/benchmarks/multiindex_object.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py
index fb36b711af197..e4730a7f3bf6a 100644
--- a/asv_bench/benchmarks/multiindex_object.py
+++ b/asv_bench/benchmarks/multiindex_object.py
@@ -101,7 +101,7 @@ def setup(self):
                 (1045, -843),
             ]
         )
-
+
         # numba warm-up
         self.mi_int[:10].is_monotonic
 

From 51deb71c757b6e6fb55bd635e8ac572049f6cc57 Mon Sep 17 00:00:00 2001
From: Simon Hawkins
Date: Fri, 19 Mar 2021 14:46:12 +0000
Subject: [PATCH 31/37] warm-up for categoricals.Contains.time_categorical_index_contains

---
 asv_bench/benchmarks/categoricals.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
index 268f25c3d12e3..319b0cb078479 100644
--- a/asv_bench/benchmarks/categoricals.py
+++ b/asv_bench/benchmarks/categoricals.py
@@ -246,6 +246,9 @@ def setup(self):
         self.c = self.ci.values
         self.key = self.ci.categories[0]
+
+        # 
numba warm-up + self.key in self.ci + def time_categorical_index_contains(self): self.key in self.ci From 358d82a607c28b3d0a5b94e7b6473bbdf2ce0bcd Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 19 Mar 2021 18:40:42 +0000 Subject: [PATCH 32/37] algos.take_1d --- pandas/_libs_numba/algos.py | 19 +++++++++++++++++++ pandas/core/array_algos/take.py | 30 +++--------------------------- 2 files changed, 22 insertions(+), 27 deletions(-) diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py index d243ac457c1b9..b044622742b63 100644 --- a/pandas/_libs_numba/algos.py +++ b/pandas/_libs_numba/algos.py @@ -1363,5 +1363,24 @@ def ensure_uint64(arr): return np.array(arr, dtype=np.uint64) +# ---------------------------------------------------------------------- +# take_1d, take_2d +# ---------------------------------------------------------------------- + + +@numba.jit +def take_1d( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + n = indexer.shape[0] + + for i in range(n): + idx = indexer[i] + if idx == -1: + out[i] = fill_value + else: + out[i] = values[idx] + + # # generated from template # include "algos_take_helper.pxi" diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index 59ca26ccc0a44..cada1ec33db9f 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -13,6 +13,7 @@ algos as libalgos, lib, ) +from pandas._libs_numba import algos as libalgos_numba from pandas._typing import ArrayLike from pandas.core.dtypes.cast import maybe_promote @@ -278,7 +279,7 @@ def _get_take_nd_function_cached( """ tup = (arr_dtype.name, out_dtype.name) if ndim == 1: - func = _take_1d_dict.get(tup, None) + func = libalgos_numba.take_1d elif ndim == 2: if axis == 0: func = _take_2d_axis0_dict.get(tup, None) @@ -289,7 +290,7 @@ def _get_take_nd_function_cached( tup = (out_dtype.name, out_dtype.name) if ndim == 1: - func = _take_1d_dict.get(tup, None) + func = libalgos_numba.take_1d elif ndim == 2: if axis == 0: func = _take_2d_axis0_dict.get(tup, None) @@ -353,31 +354,6 @@ def wrapper( return wrapper -_take_1d_dict = { - ("int8", "int8"): libalgos.take_1d_int8_int8, - ("int8", "int32"): libalgos.take_1d_int8_int32, - ("int8", "int64"): libalgos.take_1d_int8_int64, - ("int8", "float64"): libalgos.take_1d_int8_float64, - ("int16", "int16"): libalgos.take_1d_int16_int16, - ("int16", "int32"): libalgos.take_1d_int16_int32, - ("int16", "int64"): libalgos.take_1d_int16_int64, - ("int16", "float64"): libalgos.take_1d_int16_float64, - ("int32", "int32"): libalgos.take_1d_int32_int32, - ("int32", "int64"): libalgos.take_1d_int32_int64, - ("int32", "float64"): libalgos.take_1d_int32_float64, - ("int64", "int64"): libalgos.take_1d_int64_int64, - ("int64", "float64"): libalgos.take_1d_int64_float64, - ("float32", "float32"): libalgos.take_1d_float32_float32, - ("float32", "float64"): libalgos.take_1d_float32_float64, - ("float64", "float64"): libalgos.take_1d_float64_float64, - ("object", "object"): libalgos.take_1d_object_object, - ("bool", "bool"): _view_wrapper(libalgos.take_1d_bool_bool, np.uint8, np.uint8), - ("bool", "object"): _view_wrapper(libalgos.take_1d_bool_object, np.uint8, None), - ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( - libalgos.take_1d_int64_int64, np.int64, np.int64, np.int64 - ), -} - _take_2d_axis0_dict = { ("int8", "int8"): libalgos.take_2d_axis0_int8_int8, ("int8", "int32"): libalgos.take_2d_axis0_int8_int32, From a27d70c63f6d053541472e8878da1154e1d6e0ea Mon Sep 17 
00:00:00 2001 From: Simon Hawkins Date: Sat, 20 Mar 2021 11:28:07 +0000 Subject: [PATCH 33/37] add numba to all envs --- ci/deps/actions-37-db-min.yaml | 1 + ci/deps/actions-37-db.yaml | 1 + ci/deps/actions-37-locale_slow.yaml | 1 + ci/deps/actions-37.yaml | 1 + ci/deps/actions-38-locale.yaml | 1 + ci/deps/actions-38.yaml | 1 + ci/deps/actions-39.yaml | 1 + ci/deps/azure-macos-37.yaml | 1 + ci/deps/azure-windows-37.yaml | 1 + ci/deps/travis-37-arm64.yaml | 1 + 10 files changed, 10 insertions(+) diff --git a/ci/deps/actions-37-db-min.yaml b/ci/deps/actions-37-db-min.yaml index 1d3794576220a..0dfc806f4b631 100644 --- a/ci/deps/actions-37-db-min.yaml +++ b/ci/deps/actions-37-db-min.yaml @@ -27,6 +27,7 @@ dependencies: - lxml=4.3.0 - matplotlib - nomkl + - numba - numexpr - openpyxl - pandas-gbq diff --git a/ci/deps/actions-37-db.yaml b/ci/deps/actions-37-db.yaml index 5381caaa242cf..75f934ec4690e 100644 --- a/ci/deps/actions-37-db.yaml +++ b/ci/deps/actions-37-db.yaml @@ -24,6 +24,7 @@ dependencies: - moto>=1.3.14 - flask - nomkl + - numba - numexpr - numpy=1.16.* - odfpy diff --git a/ci/deps/actions-37-locale_slow.yaml b/ci/deps/actions-37-locale_slow.yaml index d9ad1f538908e..5433f01b9bc35 100644 --- a/ci/deps/actions-37-locale_slow.yaml +++ b/ci/deps/actions-37-locale_slow.yaml @@ -17,6 +17,7 @@ dependencies: - bottleneck=1.2.* - lxml - matplotlib=3.0.0 + - numba - numpy=1.16.* - openpyxl=3.0.0 - python-dateutil diff --git a/ci/deps/actions-37.yaml b/ci/deps/actions-37.yaml index 61f431256dd4a..430e6fabbe237 100644 --- a/ci/deps/actions-37.yaml +++ b/ci/deps/actions-37.yaml @@ -15,6 +15,7 @@ dependencies: # pandas dependencies - botocore>=1.11 - fsspec>=0.7.4 + - numba - numpy - python-dateutil - nomkl diff --git a/ci/deps/actions-38-locale.yaml b/ci/deps/actions-38-locale.yaml index 629804c71e726..1d22ad1c1b18c 100644 --- a/ci/deps/actions-38-locale.yaml +++ b/ci/deps/actions-38-locale.yaml @@ -23,6 +23,7 @@ dependencies: - matplotlib <3.3.0 - moto - nomkl + - numba - numexpr - numpy<1.20 # GH#39541 compat with pyarrow<3 - openpyxl diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index e2660d07c3558..2607a943c434b 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -13,6 +13,7 @@ dependencies: - hypothesis>=3.58.0 # pandas dependencies + - numba - numpy - python-dateutil - nomkl diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 36e8bf528fc3e..d6fda472c5a32 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -12,6 +12,7 @@ dependencies: - hypothesis>=3.58.0 # pandas dependencies + - numba - numpy - python-dateutil - pytz diff --git a/ci/deps/azure-macos-37.yaml b/ci/deps/azure-macos-37.yaml index d667adddda859..e0f6088e9f093 100644 --- a/ci/deps/azure-macos-37.yaml +++ b/ci/deps/azure-macos-37.yaml @@ -18,6 +18,7 @@ dependencies: - lxml - matplotlib=2.2.3 - nomkl + - numba - numexpr - numpy=1.16.5 - openpyxl diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index e7ac4c783b855..a257fa6d27ae4 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -23,6 +23,7 @@ dependencies: - matplotlib=2.2.* - moto>=1.3.14 - flask + - numba - numexpr - numpy=1.16.* - openpyxl diff --git a/ci/deps/travis-37-arm64.yaml b/ci/deps/travis-37-arm64.yaml index 8df6104f43a50..3ea1bc89af523 100644 --- a/ci/deps/travis-37-arm64.yaml +++ b/ci/deps/travis-37-arm64.yaml @@ -12,6 +12,7 @@ dependencies: # pandas dependencies - botocore>=1.11 + - numba - numpy - python-dateutil - pytz From 
5d4cbfcc0d44e74689d2335cef695d1669b66d35 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 21 Mar 2021 10:14:20 +0000 Subject: [PATCH 34/37] revert code changes to core/array_algos/take.py (specializations wip) --- pandas/_libs_numba/algos.py | 194 +++++++++++++++++++++++++++++++- pandas/core/array_algos/take.py | 33 +++++- 2 files changed, 223 insertions(+), 4 deletions(-) diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py index b044622742b63..1c957853736cb 100644 --- a/pandas/_libs_numba/algos.py +++ b/pandas/_libs_numba/algos.py @@ -1,6 +1,19 @@ from __future__ import annotations import numba + +# from numba import ( +# float32, +# float64, +# int8, +# int16, +# int32, +# int64, +# intp, +# types, +# uint8, +# void, +# ) import numpy as np import pandas._libs_numba.util as util @@ -1368,8 +1381,7 @@ def ensure_uint64(arr): # ---------------------------------------------------------------------- -@numba.jit -def take_1d( +def _take_1d( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan ) -> None: n = indexer.shape[0] @@ -1382,5 +1394,183 @@ def take_1d( out[i] = values[idx] +_take_1d_no_python = numba.njit(_take_1d) +_take_1d_object = numba.jit(forceobj=True)(_take_1d) + + +# @numba.njit(void(int8[:], intp[:], int8[:], int8)) +@numba.njit +def take_1d_int8_int8( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int8[:], intp[:], int32[:], int32)) +@numba.njit +def take_1d_int8_int32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int8[:], intp[:], int64[:], int64)) +@numba.njit +def take_1d_int8_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int8[:], intp[:], float64[:], float64)) +@numba.njit +def take_1d_int8_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit( +# void(types.Array(types.int16, 1, "C", readonly=True), intp[:], int16[:], int16) +# ) +@numba.njit +def take_1d_int16_int16( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int16[:], intp[:], int32[:], int32)) +@numba.njit +def take_1d_int16_int32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int16[:], intp[:], int64[:], int64)) +@numba.njit +def take_1d_int16_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int16[:], intp[:], float64[:], float64)) +@numba.njit +def take_1d_int16_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int32[:], intp[:], int32[:], int32)) +@numba.njit +def take_1d_int32_int32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int32[:], intp[:], int64[:], int64)) 
+@numba.njit +def take_1d_int32_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int32[:], intp[:], float64[:], float64)) +@numba.njit +def take_1d_int32_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int64[:], intp[:], int64[:], int64)) +@numba.njit +def take_1d_int64_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int64[:], intp[:], float64[:], float64)) +@numba.njit +def take_1d_int64_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(float32[:], intp[:], float32[:], float32)) +@numba.njit +def take_1d_float32_float32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(float32[:], intp[:], float64[:], float64)) +@numba.njit +def take_1d_float32_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit( +# [ +# void( +# types.Array(types.int64, 1, "C", readonly=True), +# intp[:], +# float64[:], +# float64, +# ), +# void( +# float64[:], +# intp[:], +# float64[:], +# float64, +# ), +# ] +# ) +@numba.njit +def take_1d_float64_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +@numba.jit(forceobj=True) +def take_1d_object_object( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_object(values, indexer, out, fill_value) + + +# @numba.njit(void(uint8[:], intp[:], uint8[:], uint8)) +@numba.njit +def take_1d_bool_bool( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +@numba.jit(forceobj=True) +def take_1d_bool_object( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + n = indexer.shape[0] + + for i in range(n): + idx = indexer[i] + if idx == -1: + out[i] = fill_value + else: + out[i] = True if values[idx] > 0 else False + + # # generated from template # include "algos_take_helper.pxi" diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index fa8c5fd5a769e..efbb628479f83 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -279,7 +279,7 @@ def _get_take_nd_function_cached( """ tup = (arr_dtype.name, out_dtype.name) if ndim == 1: - func = libalgos_numba.take_1d + func = _take_1d_dict.get(tup, None) elif ndim == 2: if axis == 0: func = _take_2d_axis0_dict.get(tup, None) @@ -290,7 +290,7 @@ def _get_take_nd_function_cached( tup = (out_dtype.name, out_dtype.name) if ndim == 1: - func = libalgos_numba.take_1d + func = _take_1d_dict.get(tup, None) elif ndim == 2: if axis == 0: func = _take_2d_axis0_dict.get(tup, None) @@ -354,6 +354,35 @@ def wrapper( return wrapper +_take_1d_dict = { + ("int8", "int8"): libalgos_numba.take_1d_int8_int8, + ("int8", "int32"): libalgos_numba.take_1d_int8_int32, + ("int8", "int64"): 
libalgos_numba.take_1d_int8_int64, + ("int8", "float64"): libalgos_numba.take_1d_int8_float64, + ("int16", "int16"): libalgos_numba.take_1d_int16_int16, + ("int16", "int32"): libalgos_numba.take_1d_int16_int32, + ("int16", "int64"): libalgos_numba.take_1d_int16_int64, + ("int16", "float64"): libalgos_numba.take_1d_int16_float64, + ("int32", "int32"): libalgos_numba.take_1d_int32_int32, + ("int32", "int64"): libalgos_numba.take_1d_int32_int64, + ("int32", "float64"): libalgos_numba.take_1d_int32_float64, + ("int64", "int64"): libalgos_numba.take_1d_int64_int64, + ("int64", "float64"): libalgos_numba.take_1d_int64_float64, + ("float32", "float32"): libalgos_numba.take_1d_float32_float32, + ("float32", "float64"): libalgos_numba.take_1d_float32_float64, + ("float64", "float64"): libalgos_numba.take_1d_float64_float64, + ("object", "object"): libalgos_numba.take_1d_object_object, + ("bool", "bool"): _view_wrapper( + libalgos_numba.take_1d_bool_bool, np.uint8, np.uint8 + ), + ("bool", "object"): _view_wrapper( + libalgos_numba.take_1d_bool_object, np.uint8, None + ), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + libalgos_numba.take_1d_int64_int64, np.int64, np.int64, np.int64 + ), +} + _take_2d_axis0_dict = { ("int8", "int8"): libalgos.take_2d_axis0_int8_int8, ("int8", "int32"): libalgos.take_2d_axis0_int8_int32, From 71d125e3481027bc60536d5e86d83a23696ddc29 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 21 Mar 2021 12:26:47 +0000 Subject: [PATCH 35/37] parallelize algos._take_1d --- pandas/_libs_numba/algos.py | 61 ++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py index 1c957853736cb..b191131c2e914 100644 --- a/pandas/_libs_numba/algos.py +++ b/pandas/_libs_numba/algos.py @@ -1381,12 +1381,28 @@ def ensure_uint64(arr): # ---------------------------------------------------------------------- -def _take_1d( +def _take_1d_no_python( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan ) -> None: n = indexer.shape[0] - for i in range(n): + func = _take_1d_parallel if n > 10_000 else _take_1d_serial + + func(values, indexer, out, fill_value, n) + + +def _take_1d_object( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + n = indexer.shape[0] + + _take_1d_serial_object(values, indexer, out, fill_value, n) + + +def _take_1d( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value, n: int +) -> None: + for i in numba.prange(n): idx = indexer[i] if idx == -1: out[i] = fill_value @@ -1394,12 +1410,13 @@ def _take_1d( out[i] = values[idx] -_take_1d_no_python = numba.njit(_take_1d) -_take_1d_object = numba.jit(forceobj=True)(_take_1d) +_take_1d_parallel = numba.njit(parallel=True)(_take_1d) +_take_1d_serial = numba.njit(_take_1d) +_take_1d_serial_object = numba.jit(forceobj=True)(_take_1d) # @numba.njit(void(int8[:], intp[:], int8[:], int8)) -@numba.njit +# @numba.njit def take_1d_int8_int8( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan ) -> None: @@ -1407,7 +1424,7 @@ def take_1d_int8_int8( # @numba.njit(void(int8[:], intp[:], int32[:], int32)) -@numba.njit +# @numba.njit def take_1d_int8_int32( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan ) -> None: @@ -1415,7 +1432,7 @@ def take_1d_int8_int32( # @numba.njit(void(int8[:], intp[:], int64[:], int64)) -@numba.njit +# @numba.njit def take_1d_int8_int64( values: np.ndarray, indexer: np.ndarray, 
out: np.ndarray, fill_value=np.nan ) -> None: @@ -1423,7 +1440,7 @@ def take_1d_int8_int64( # @numba.njit(void(int8[:], intp[:], float64[:], float64)) -@numba.njit +# @numba.njit def take_1d_int8_float64( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan ) -> None: @@ -1433,7 +1450,7 @@ def take_1d_int8_float64( # @numba.njit( # void(types.Array(types.int16, 1, "C", readonly=True), intp[:], int16[:], int16) # ) -@numba.njit +# @numba.njit def take_1d_int16_int16( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan ) -> None: @@ -1441,7 +1458,7 @@ def take_1d_int16_int16( # @numba.njit(void(int16[:], intp[:], int32[:], int32)) -@numba.njit +# @numba.njit def take_1d_int16_int32( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan ) -> None: @@ -1449,7 +1466,7 @@ def take_1d_int16_int32( # @numba.njit(void(int16[:], intp[:], int64[:], int64)) -@numba.njit +# @numba.njit def take_1d_int16_int64( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan ) -> None: @@ -1457,7 +1474,7 @@ def take_1d_int16_int64( # @numba.njit(void(int16[:], intp[:], float64[:], float64)) -@numba.njit +# @numba.njit def take_1d_int16_float64( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan ) -> None: @@ -1465,7 +1482,7 @@ def take_1d_int16_float64( # @numba.njit(void(int32[:], intp[:], int32[:], int32)) -@numba.njit +# @numba.njit def take_1d_int32_int32( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan ) -> None: @@ -1473,7 +1490,7 @@ def take_1d_int32_int32( # @numba.njit(void(int32[:], intp[:], int64[:], int64)) -@numba.njit +# @numba.njit def take_1d_int32_int64( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan ) -> None: @@ -1481,7 +1498,7 @@ def take_1d_int32_int64( # @numba.njit(void(int32[:], intp[:], float64[:], float64)) -@numba.njit +# @numba.njit def take_1d_int32_float64( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan ) -> None: @@ -1489,7 +1506,7 @@ def take_1d_int32_float64( # @numba.njit(void(int64[:], intp[:], int64[:], int64)) -@numba.njit +# @numba.njit def take_1d_int64_int64( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan ) -> None: @@ -1497,7 +1514,7 @@ def take_1d_int64_int64( # @numba.njit(void(int64[:], intp[:], float64[:], float64)) -@numba.njit +# @numba.njit def take_1d_int64_float64( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan ) -> None: @@ -1505,7 +1522,7 @@ def take_1d_int64_float64( # @numba.njit(void(float32[:], intp[:], float32[:], float32)) -@numba.njit +# @numba.njit def take_1d_float32_float32( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan ) -> None: @@ -1513,7 +1530,7 @@ def take_1d_float32_float32( # @numba.njit(void(float32[:], intp[:], float64[:], float64)) -@numba.njit +# @numba.njit def take_1d_float32_float64( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan ) -> None: @@ -1536,14 +1553,14 @@ def take_1d_float32_float64( # ), # ] # ) -@numba.njit +# @numba.njit def take_1d_float64_float64( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan ) -> None: _take_1d_no_python(values, indexer, out, fill_value) -@numba.jit(forceobj=True) +# @numba.jit(forceobj=True) def take_1d_object_object( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan ) -> None: @@ -1551,7 +1568,7 @@ def take_1d_object_object( # 
@numba.njit(void(uint8[:], intp[:], uint8[:], uint8)) -@numba.njit +# @numba.njit def take_1d_bool_bool( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan ) -> None: From b2568dec057633d3f9dd9566d5bcd312da4a3da5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 21 Mar 2021 12:29:46 +0000 Subject: [PATCH 36/37] revert changes to benchmarks --- asv_bench/benchmarks/arithmetic.py | 4 ---- asv_bench/benchmarks/categoricals.py | 3 --- asv_bench/benchmarks/frame_methods.py | 4 ---- asv_bench/benchmarks/multiindex_object.py | 3 --- 4 files changed, 14 deletions(-) diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index 5f73e7156a6ad..488237a6f5a8b 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -267,10 +267,6 @@ def setup(self, tz): self.s2 = Series(date_range("20010101", periods=N, freq="s", tz=tz)) - # warm-up for numba - - self.s2.diff() - def time_series_timestamp_compare(self, tz): self.s <= self.ts diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 319b0cb078479..268f25c3d12e3 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -246,9 +246,6 @@ def setup(self): self.c = self.ci.values self.key = self.ci.categories[0] - # numba warm-up - self.key in self.ci - def time_categorical_index_contains(self): self.key in self.ci diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 48ddee8b5ff52..65167e6467fd5 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -391,10 +391,6 @@ def setup(self, inplace, method, dtype): values = values.round() self.df = DataFrame(values, dtype=dtype) - # warm up - df2 = self.df.copy() - df2.fillna(inplace=inplace, method=method) - def time_frame_fillna(self, inplace, method, dtype): self.df.fillna(inplace=inplace, method=method) diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index e4730a7f3bf6a..25df5b0214959 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -102,9 +102,6 @@ def setup(self): ] ) - # numba warm-up - self.mi_int[:10].is_monotonic - def time_get_indexer(self): self.mi_int.get_indexer(self.obj_index) From 0aa61e06c97f7ddf5143f19c005fee496ff60497 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 21 Mar 2021 12:31:17 +0000 Subject: [PATCH 37/37] remove xfails --- pandas/tests/apply/test_frame_transform.py | 1 - pandas/tests/apply/test_series_apply.py | 1 - 2 files changed, 2 deletions(-) diff --git a/pandas/tests/apply/test_frame_transform.py b/pandas/tests/apply/test_frame_transform.py index b0b448e77f226..7a626ce6312c5 100644 --- a/pandas/tests/apply/test_frame_transform.py +++ b/pandas/tests/apply/test_frame_transform.py @@ -162,7 +162,6 @@ def test_transform_method_name(method): frame_kernels_raise = [x for x in frame_transform_kernels if x not in wont_fail] -@pytest.mark.xfail(strict=False) @pytest.mark.parametrize("op", [*frame_kernels_raise, lambda x: x + 1]) def test_transform_bad_dtype(op, frame_or_series): # GH 35964 diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index eb901cb04174d..dcb5de29da320 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -257,7 +257,6 @@ def test_transform(string_series): tm.assert_series_equal(result.reindex_like(expected), 
expected) -@pytest.mark.xfail(strict=False) @pytest.mark.parametrize("op", series_transform_kernels) def test_transform_partial_failure(op, request): # GH 35964 & GH 40211
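Taken together, patches 34 and 35 leave take_1d as thin per-dtype wrappers over a single kernel that is compiled twice and dispatched on indexer length. Condensed into a self-contained sketch (names shortened and the example values illustrative; the 10_000 cutoff is the one hard-coded in patch 35):

import numba
import numpy as np


def _take_1d(values, indexer, out, fill_value, n):
    # prange parallelizes under parallel=True and degrades to a plain range otherwise
    for i in numba.prange(n):
        idx = indexer[i]
        if idx == -1:
            out[i] = fill_value
        else:
            out[i] = values[idx]


_take_1d_parallel = numba.njit(parallel=True)(_take_1d)
_take_1d_serial = numba.njit(_take_1d)


def take_1d(values, indexer, out, fill_value=np.nan):
    # size-based dispatch: the parallel kernel only pays off for large indexers
    n = indexer.shape[0]
    func = _take_1d_parallel if n > 10_000 else _take_1d_serial
    func(values, indexer, out, fill_value, n)


values = np.arange(5, dtype=np.float64)
indexer = np.array([2, -1, 0], dtype=np.intp)
out = np.empty(len(indexer), dtype=np.float64)
take_1d(values, indexer, out)  # out becomes [2.0, nan, 0.0]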