
Commit a831aa4

merging master

2 parents: ec67841 + 4ec6925

109 files changed: +2706, -1660 lines


asv_bench/benchmarks/strings.py

Lines changed: 11 additions & 6 deletions
@@ -230,16 +230,21 @@ def time_contains(self, dtype, regex):

 class Split:

-    params = [True, False]
-    param_names = ["expand"]
+    params = (["str", "string", "arrow_string"], [True, False])
+    param_names = ["dtype", "expand"]
+
+    def setup(self, dtype, expand):
+        from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401

-    def setup(self, expand):
-        self.s = Series(tm.makeStringIndex(10 ** 5)).str.join("--")
+        try:
+            self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype).str.join("--")
+        except ImportError:
+            raise NotImplementedError

-    def time_split(self, expand):
+    def time_split(self, dtype, expand):
         self.s.str.split("--", expand=expand)

-    def time_rsplit(self, expand):
+    def time_rsplit(self, dtype, expand):
         self.s.str.rsplit("--", expand=expand)
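A note for reviewers on the pattern introduced above (editorial context, not part of the diff): asv runs setup once per parameter combination, and a NotImplementedError raised from setup tells asv to skip that combination rather than record a failure, so the arrow_string case degrades gracefully when pyarrow is not installed. A minimal self-contained sketch of the same convention, using a hypothetical optional dependency in place of pyarrow:

    class OptionalBackendBench:
        # asv benchmarks the cartesian product of the params tuples
        params = (["plain", "optional"],)
        param_names = ["backend"]

        def setup(self, backend):
            if backend == "optional":
                try:
                    import some_optional_dep  # hypothetical optional dependency
                except ImportError:
                    # asv treats NotImplementedError from setup() as "skip this case"
                    raise NotImplementedError
            self.data = list(range(1000))

        def time_sum(self, backend):
            sum(self.data)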

doc/source/ecosystem.rst

Lines changed: 29 additions & 0 deletions
@@ -405,6 +405,35 @@ Blaze provides a standard API for doing computations with various
 in-memory and on-disk backends: NumPy, pandas, SQLAlchemy, MongoDB, PyTables,
 PySpark.

+`Cylon <https://cylondata.org/>`__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Cylon is a fast, scalable, distributed-memory parallel runtime with a
+pandas-like Python DataFrame API. "Core Cylon" is implemented in C++, using the
+Apache Arrow format to represent the data in memory. The Cylon DataFrame API
+implements most of the core operators of pandas, such as merge, filter, join,
+concat, group-by, and drop_duplicates. These operators are designed to work
+across thousands of cores to scale applications. Cylon can interoperate with
+pandas DataFrames by reading data from pandas or converting data to pandas, so
+users can selectively scale parts of their pandas DataFrame applications.
+
+.. code:: python
+
+    from pycylon import read_csv, DataFrame, CylonEnv
+    from pycylon.net import MPIConfig
+
+    # Initialize the Cylon distributed environment
+    config: MPIConfig = MPIConfig()
+    env: CylonEnv = CylonEnv(config=config, distributed=True)
+
+    df1: DataFrame = read_csv('/tmp/csv1.csv')
+    df2: DataFrame = read_csv('/tmp/csv2.csv')
+
+    # Use thousands of cores across the cluster to compute the join
+    df3: DataFrame = df1.join(other=df2, on=[0], algorithm="hash", env=env)
+
+    print(df3)
+
 `Dask <https://dask.readthedocs.io/en/latest/>`__
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

doc/source/user_guide/basics.rst

Lines changed: 0 additions & 2 deletions
@@ -1184,11 +1184,9 @@ a single value and returning a single value. For example:

     df4

-
     def f(x):
         return len(str(x))

-
     df4["one"].map(f)
     df4.applymap(f)

doc/source/user_guide/cookbook.rst

Lines changed: 0 additions & 13 deletions
@@ -494,15 +494,12 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to

     S = pd.Series([i / 100.0 for i in range(1, 11)])

-
     def cum_ret(x, y):
         return x * (1 + y)

-
     def red(x):
         return functools.reduce(cum_ret, x, 1.0)

-
     S.expanding().apply(red, raw=True)


@@ -514,12 +511,10 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
     df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, -1, 1, 2]})
     gb = df.groupby("A")

-
     def replace(g):
         mask = g < 0
         return g.where(mask, g[~mask].mean())

-
     gb.transform(replace)

 `Sort groups by aggregated data
@@ -551,13 +546,11 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
     rng = pd.date_range(start="2014-10-07", periods=10, freq="2min")
     ts = pd.Series(data=list(range(10)), index=rng)

-
     def MyCust(x):
         if len(x) > 2:
             return x[1] * 1.234
         return pd.NaT

-
     mhc = {"Mean": np.mean, "Max": np.max, "Custom": MyCust}
     ts.resample("5min").apply(mhc)
     ts
@@ -803,11 +796,9 @@ Apply
         index=["I", "II", "III"],
     )

-
     def SeriesFromSubList(aList):
         return pd.Series(aList)

-
     df_orgz = pd.concat(
         {ind: row.apply(SeriesFromSubList) for ind, row in df.iterrows()}
     )
@@ -827,12 +818,10 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc
     )
     df

-
     def gm(df, const):
         v = ((((df["A"] + df["B"]) + 1).cumprod()) - 1) * const
         return v.iloc[-1]

-
     s = pd.Series(
         {
             df.index[i]: gm(df.iloc[i: min(i + 51, len(df) - 1)], 5)
@@ -859,11 +848,9 @@ Rolling Apply to multiple columns where function returns a Scalar (Volume Weight
     )
     df

-
     def vwap(bars):
         return (bars.Close * bars.Volume).sum() / bars.Volume.sum()

-
     window = 5
     s = pd.concat(
         [

doc/source/user_guide/groupby.rst

Lines changed: 0 additions & 2 deletions
@@ -1617,12 +1617,10 @@ column index name will be used as the name of the inserted column:
         }
     )

-
     def compute_metrics(x):
         result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()}
         return pd.Series(result, name="metrics")

-
     result = df.groupby("a").apply(compute_metrics)

     result

doc/source/user_guide/io.rst

Lines changed: 0 additions & 2 deletions
@@ -4648,11 +4648,9 @@ chunks.

     store.append("dfeq", dfeq, data_columns=["number"])

-
     def chunks(l, n):
         return [l[i: i + n] for i in range(0, len(l), n)]

-
     evens = [2, 4, 6, 8, 10]
     coordinates = store.select_as_coordinates("dfeq", "number=evens")
     for c in chunks(coordinates, 2):
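A quick standalone check of the chunks helper above (editorial context, not part of the diff), since the hunk cuts off before the store.select calls that consume the batches:

    def chunks(l, n):
        # slice l into consecutive batches of at most n items
        return [l[i: i + n] for i in range(0, len(l), n)]

    print(chunks([2, 4, 6, 8, 10], 2))  # [[2, 4], [6, 8], [10]]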

doc/source/user_guide/merging.rst

Lines changed: 1 addition & 0 deletions
@@ -1578,4 +1578,5 @@ to ``True``.
 You may also keep all the original values even if they are equal.

 .. ipython:: python
+
     df.compare(df2, keep_shape=True, keep_equal=True)
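For context on what this example demonstrates (editorial, not part of the diff): DataFrame.compare returns only the differing cells, as ('self', 'other') column pairs; keep_shape=True retains every row and column, and keep_equal=True reports equal values instead of NaN. A minimal sketch:

    import pandas as pd

    df = pd.DataFrame({"col1": ["a", "b"], "col2": [1.0, 2.0]})
    df2 = df.copy()
    df2.loc[0, "col1"] = "c"

    df.compare(df2)  # only the differing cell
    df.compare(df2, keep_shape=True, keep_equal=True)  # full shape, equal values shown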

doc/source/user_guide/reshaping.rst

Lines changed: 0 additions & 2 deletions
@@ -18,7 +18,6 @@ Reshaping by pivoting DataFrame objects

     import pandas._testing as tm

-
     def unpivot(frame):
         N, K = frame.shape
         data = {
@@ -29,7 +28,6 @@ Reshaping by pivoting DataFrame objects
         columns = ["date", "variable", "value"]
         return pd.DataFrame(data, columns=columns)

-
     df = unpivot(tm.makeTimeDataFrame(3))

 Data is often stored in so-called "stacked" or "record" format:

doc/source/user_guide/scale.rst

Lines changed: 1 addition & 0 deletions
@@ -345,6 +345,7 @@ we need to supply the divisions manually.
 Now we can do things like fast random access with ``.loc``.

 .. ipython:: python
+    :okwarning:

     ddf.loc["2002-01-01 12:01":"2002-01-01 12:05"].compute()
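For context (editorial, not part of the diff): the :okwarning: option of the IPython Sphinx directive lets the block emit warnings without failing the documentation build, which the pandas doc build otherwise treats as an error; presumably the dask .loc/.compute() call here began emitting a warning.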

doc/source/user_guide/sparse.rst

Lines changed: 0 additions & 1 deletion
@@ -325,7 +325,6 @@ In the example below, we transform the ``Series`` to a sparse representation of
         row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
     )

-
     A
     A.todense()
     rows
