From 0c264e3e116e862b361ac3cfc685ac472010a179 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 9 Sep 2018 21:13:58 +0200 Subject: [PATCH 1/5] ASV: more for str.cat [ci skip] --- asv_bench/benchmarks/strings.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index b203c8b0fa5c9..2a3b2c79072d1 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -1,7 +1,7 @@ import warnings import numpy as np -from pandas import Series +from pandas import Series, DataFrame import pandas.util.testing as tm @@ -12,9 +12,6 @@ class Methods(object): def setup(self): self.s = Series(tm.makeStringIndex(10**5)) - def time_cat(self): - self.s.str.cat(sep=',') - def time_center(self): self.s.str.center(100) @@ -87,6 +84,24 @@ def time_repeat(self, repeats): self.s.str.repeat(self.repeat) +class Cat(object): + + goal_time = 0.2 + params = ([None, 5], [None, ','], [None, '-']) + param_names = ['others', 'sep', 'na_rep'] + + def setup(self, others, sep, na_rep): + N = int(5e5) + mask_gen = lambda: np.random.choice([True, False], N, p=[0.9, 0.1]) + self.s = Series(tm.makeStringIndex(N)).where(mask_gen()) + self.others = (DataFrame({i: tm.makeStringIndex(N).where(mask_gen()) + for i in range(others)}) + if others is not None else None) + + def time_cat(self, others, sep, na_rep): + self.s.str.cat(self.others, sep=sep, na_rep=na_rep) + + class Contains(object): goal_time = 0.2 From dadeeda4fa2dcab5eb9c27e8677e122a927a23a1 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 10 Sep 2018 08:19:01 +0200 Subject: [PATCH 2/5] Parametrize na_fraction [ci skip] --- asv_bench/benchmarks/strings.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 2a3b2c79072d1..7305deb606537 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -87,18 +87,19 @@ def time_repeat(self, repeats): class Cat(object): goal_time = 0.2 - params = ([None, 5], [None, ','], [None, '-']) - param_names = ['others', 'sep', 'na_rep'] + params = ([None, 5], [None, ','], [None, '-'], [0.0, 1e-4, 0.1]) + param_names = ['others', 'sep', 'na_rep', 'na_frac'] - def setup(self, others, sep, na_rep): + def setup(self, others, sep, na_rep, na_frac): N = int(5e5) - mask_gen = lambda: np.random.choice([True, False], N, p=[0.9, 0.1]) + mask_gen = lambda: np.random.choice([True, False], N, + p=[1 - na_frac, na_frac]) self.s = Series(tm.makeStringIndex(N)).where(mask_gen()) self.others = (DataFrame({i: tm.makeStringIndex(N).where(mask_gen()) for i in range(others)}) if others is not None else None) - def time_cat(self, others, sep, na_rep): + def time_cat(self, others, sep, na_rep, na_frac): self.s.str.cat(self.others, sep=sep, na_rep=na_rep) From 4d14cfaecd1521ebc276d606cab86b07f227d6a5 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 11 Sep 2018 00:43:41 +0200 Subject: [PATCH 3/5] Add comment to explain choice of parameters [ci skip] --- asv_bench/benchmarks/strings.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 7305deb606537..783b06989051f 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -100,6 +100,10 @@ def setup(self, others, sep, na_rep, na_frac): if others is not None else None) def time_cat(self, others, sep, na_rep, na_frac): + # before the concatenation (one caller + others columns), the total + # expected fraction of rows containing any NaN is: + # reduce(lambda t, _: t + (1 - t) * na_frac, range(others + 1), 0) + # for others=5 and na_frac=0.1, this works out to ~47% self.s.str.cat(self.others, sep=sep, na_rep=na_rep) From d92f5f147bb9d8f41517829906993d6add976f2e Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 11 Sep 2018 07:43:34 +0200 Subject: [PATCH 4/5] Review (WillAyd) [ci skip] --- asv_bench/benchmarks/strings.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 783b06989051f..763207443769f 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -87,24 +87,24 @@ def time_repeat(self, repeats): class Cat(object): goal_time = 0.2 - params = ([None, 5], [None, ','], [None, '-'], [0.0, 1e-4, 0.1]) - param_names = ['others', 'sep', 'na_rep', 'na_frac'] + params = ([0, 3], [None, ','], [None, '-'], [0.0, 0.001, 0.15]) + param_names = ['other_cols', 'sep', 'na_rep', 'na_frac'] - def setup(self, others, sep, na_rep, na_frac): - N = int(5e5) + def setup(self, other_cols, sep, na_rep, na_frac): + N = 10 ** 5 mask_gen = lambda: np.random.choice([True, False], N, p=[1 - na_frac, na_frac]) self.s = Series(tm.makeStringIndex(N)).where(mask_gen()) self.others = (DataFrame({i: tm.makeStringIndex(N).where(mask_gen()) - for i in range(others)}) - if others is not None else None) + for i in range(other_cols)}) + if other_cols > 0 else None) - def time_cat(self, others, sep, na_rep, na_frac): - # before the concatenation (one caller + others columns), the total + def time_cat(self, other_cols, sep, na_rep, na_frac): + # before the concatenation (one caller + other_cols columns), the total # expected fraction of rows containing any NaN is: - # reduce(lambda t, _: t + (1 - t) * na_frac, range(others + 1), 0) - # for others=5 and na_frac=0.1, this works out to ~47% - self.s.str.cat(self.others, sep=sep, na_rep=na_rep) + # reduce(lambda t, _: t + (1 - t) * na_frac, range(other_cols + 1), 0) + # for other_cols=3 and na_frac=0.15, this works out to ~48% + self.s.str.cat(others=self.others, sep=sep, na_rep=na_rep) class Contains(object): From ddc130ef21eff0d052bd515a01dff6a0dca24189 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 13 Sep 2018 12:24:40 +0200 Subject: [PATCH 5/5] Review (WillAyd) [ci skip] --- asv_bench/benchmarks/strings.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 763207443769f..ccfac2f73f14d 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -95,9 +95,12 @@ def setup(self, other_cols, sep, na_rep, na_frac): mask_gen = lambda: np.random.choice([True, False], N, p=[1 - na_frac, na_frac]) self.s = Series(tm.makeStringIndex(N)).where(mask_gen()) - self.others = (DataFrame({i: tm.makeStringIndex(N).where(mask_gen()) - for i in range(other_cols)}) - if other_cols > 0 else None) + if other_cols == 0: + # str.cat self-concatenates only for others=None + self.others = None + else: + self.others = DataFrame({i: tm.makeStringIndex(N).where(mask_gen()) + for i in range(other_cols)}) def time_cat(self, other_cols, sep, na_rep, na_frac): # before the concatenation (one caller + other_cols columns), the total