
Commit a831aa4

merging master

2 parents: ec67841 + 4ec6925

109 files changed: +2706, -1660 lines


asv_bench/benchmarks/strings.py

Lines changed: 11 additions & 6 deletions
@@ -230,16 +230,21 @@ def time_contains(self, dtype, regex):

 class Split:

-    params = [True, False]
-    param_names = ["expand"]
+    params = (["str", "string", "arrow_string"], [True, False])
+    param_names = ["dtype", "expand"]
+
+    def setup(self, dtype, expand):
+        from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401

-    def setup(self, expand):
-        self.s = Series(tm.makeStringIndex(10 ** 5)).str.join("--")
+        try:
+            self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype).str.join("--")
+        except ImportError:
+            raise NotImplementedError

-    def time_split(self, expand):
+    def time_split(self, dtype, expand):
         self.s.str.split("--", expand=expand)

-    def time_rsplit(self, expand):
+    def time_rsplit(self, dtype, expand):
         self.s.str.rsplit("--", expand=expand)
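A note for reviewers on the pattern introduced above (editorial context, not part of the diff): asv runs setup once per parameter combination, and a NotImplementedError raised from setup tells asv to skip that combination rather than record a failure, so the arrow_string case degrades gracefully when pyarrow is not installed. A minimal self-contained sketch of the same convention, using a hypothetical optional dependency in place of pyarrow:

    class OptionalBackendBench:
        # asv benchmarks the cartesian product of the params tuples
        params = (["plain", "optional"],)
        param_names = ["backend"]

        def setup(self, backend):
            if backend == "optional":
                try:
                    import some_optional_dep  # hypothetical optional dependency
                except ImportError:
                    # asv treats NotImplementedError from setup() as "skip this case"
                    raise NotImplementedError
            self.data = list(range(1000))

        def time_sum(self, backend):
            sum(self.data)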

doc/source/ecosystem.rst

Lines changed: 29 additions & 0 deletions
@@ -405,6 +405,35 @@ Blaze provides a standard API for doing computations with various
 in-memory and on-disk backends: NumPy, pandas, SQLAlchemy, MongoDB, PyTables,
 PySpark.

+`Cylon <https://cylondata.org/>`__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Cylon is a fast, scalable, distributed-memory parallel runtime with a
+pandas-like Python DataFrame API. "Core Cylon" is implemented in C++, using the
+Apache Arrow format to represent the data in memory. The Cylon DataFrame API
+implements most of the core operators of pandas, such as merge, filter, join,
+concat, group-by, and drop_duplicates. These operators are designed to work
+across thousands of cores to scale applications. Cylon can interoperate with
+pandas DataFrames by reading data from pandas or converting data to pandas, so
+users can selectively scale parts of their pandas DataFrame applications.
+
+.. code:: python
+
+    from pycylon import read_csv, DataFrame, CylonEnv
+    from pycylon.net import MPIConfig
+
+    # Initialize the Cylon distributed environment
+    config: MPIConfig = MPIConfig()
+    env: CylonEnv = CylonEnv(config=config, distributed=True)
+
+    df1: DataFrame = read_csv('/tmp/csv1.csv')
+    df2: DataFrame = read_csv('/tmp/csv2.csv')
+
+    # Use thousands of cores across the cluster to compute the join
+    df3: DataFrame = df1.join(other=df2, on=[0], algorithm="hash", env=env)
+
+    print(df3)
+
 `Dask <https://dask.readthedocs.io/en/latest/>`__
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

doc/source/user_guide/basics.rst

Lines changed: 0 additions & 2 deletions
@@ -1184,11 +1184,9 @@ a single value and returning a single value. For example:

     df4

-
     def f(x):
         return len(str(x))

-
     df4["one"].map(f)
     df4.applymap(f)

doc/source/user_guide/cookbook.rst

Lines changed: 0 additions & 13 deletions
@@ -494,15 +494,12 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to

     S = pd.Series([i / 100.0 for i in range(1, 11)])

-
     def cum_ret(x, y):
         return x * (1 + y)

-
     def red(x):
         return functools.reduce(cum_ret, x, 1.0)

-
     S.expanding().apply(red, raw=True)


@@ -514,12 +511,10 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
     df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, -1, 1, 2]})
     gb = df.groupby("A")

-
     def replace(g):
         mask = g < 0
         return g.where(mask, g[~mask].mean())

-
     gb.transform(replace)

 `Sort groups by aggregated data
@@ -551,13 +546,11 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
     rng = pd.date_range(start="2014-10-07", periods=10, freq="2min")
     ts = pd.Series(data=list(range(10)), index=rng)

-
     def MyCust(x):
         if len(x) > 2:
             return x[1] * 1.234
         return pd.NaT

-
     mhc = {"Mean": np.mean, "Max": np.max, "Custom": MyCust}
     ts.resample("5min").apply(mhc)
     ts
@@ -803,11 +796,9 @@ Apply
         index=["I", "II", "III"],
     )

-
     def SeriesFromSubList(aList):
         return pd.Series(aList)

-
     df_orgz = pd.concat(
         {ind: row.apply(SeriesFromSubList) for ind, row in df.iterrows()}
     )
@@ -827,12 +818,10 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc
     )
     df

-
     def gm(df, const):
         v = ((((df["A"] + df["B"]) + 1).cumprod()) - 1) * const
         return v.iloc[-1]

-
     s = pd.Series(
         {
             df.index[i]: gm(df.iloc[i: min(i + 51, len(df) - 1)], 5)
@@ -859,11 +848,9 @@ Rolling Apply to multiple columns where function returns a Scalar (Volume Weight
     )
     df

-
     def vwap(bars):
         return (bars.Close * bars.Volume).sum() / bars.Volume.sum()

-
     window = 5
     s = pd.concat(
         [

doc/source/user_guide/groupby.rst

Lines changed: 0 additions & 2 deletions
@@ -1617,12 +1617,10 @@ column index name will be used as the name of the inserted column:
         }
     )

-
     def compute_metrics(x):
         result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()}
         return pd.Series(result, name="metrics")

-
     result = df.groupby("a").apply(compute_metrics)

     result

doc/source/user_guide/io.rst

Lines changed: 0 additions & 2 deletions
@@ -4648,11 +4648,9 @@ chunks.

     store.append("dfeq", dfeq, data_columns=["number"])

-
     def chunks(l, n):
         return [l[i: i + n] for i in range(0, len(l), n)]

-
     evens = [2, 4, 6, 8, 10]
     coordinates = store.select_as_coordinates("dfeq", "number=evens")
     for c in chunks(coordinates, 2):
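A quick standalone check of the chunks helper above (editorial context, not part of the diff), since the hunk cuts off before the store.select calls that consume the batches:

    def chunks(l, n):
        # slice l into consecutive batches of at most n items
        return [l[i: i + n] for i in range(0, len(l), n)]

    print(chunks([2, 4, 6, 8, 10], 2))  # [[2, 4], [6, 8], [10]]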

doc/source/user_guide/merging.rst

Lines changed: 1 addition & 0 deletions
@@ -1578,4 +1578,5 @@ to ``True``.
 You may also keep all the original values even if they are equal.

 .. ipython:: python
+
     df.compare(df2, keep_shape=True, keep_equal=True)
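For context on what this example demonstrates (editorial, not part of the diff): DataFrame.compare returns only the differing cells, as ('self', 'other') column pairs; keep_shape=True retains every row and column, and keep_equal=True reports equal values instead of NaN. A minimal sketch:

    import pandas as pd

    df = pd.DataFrame({"col1": ["a", "b"], "col2": [1.0, 2.0]})
    df2 = df.copy()
    df2.loc[0, "col1"] = "c"

    df.compare(df2)  # only the differing cell
    df.compare(df2, keep_shape=True, keep_equal=True)  # full shape, equal values shown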

doc/source/user_guide/reshaping.rst

Lines changed: 0 additions & 2 deletions
@@ -18,7 +18,6 @@ Reshaping by pivoting DataFrame objects

     import pandas._testing as tm

-
     def unpivot(frame):
         N, K = frame.shape
         data = {
@@ -29,7 +28,6 @@ Reshaping by pivoting DataFrame objects
         columns = ["date", "variable", "value"]
         return pd.DataFrame(data, columns=columns)

-
     df = unpivot(tm.makeTimeDataFrame(3))

 Data is often stored in so-called "stacked" or "record" format:

doc/source/user_guide/scale.rst

Lines changed: 1 addition & 0 deletions
@@ -345,6 +345,7 @@ we need to supply the divisions manually.
 Now we can do things like fast random access with ``.loc``.

 .. ipython:: python
+    :okwarning:

     ddf.loc["2002-01-01 12:01":"2002-01-01 12:05"].compute()
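For context (editorial, not part of the diff): the :okwarning: option of the IPython Sphinx directive lets the block emit warnings without failing the documentation build, which the pandas doc build otherwise treats as an error; presumably the dask .loc/.compute() call here began emitting a warning.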

doc/source/user_guide/sparse.rst

Lines changed: 0 additions & 1 deletion
@@ -325,7 +325,6 @@ In the example below, we transform the ``Series`` to a sparse representation of
         row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
     )

-
     A
     A.todense()
     rows
