From 0c264e3e116e862b361ac3cfc685ac472010a179 Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Sun, 9 Sep 2018 21:13:58 +0200
Subject: [PATCH 1/5] ASV: add parametrized benchmarks for str.cat [ci skip]

---
 asv_bench/benchmarks/strings.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index b203c8b0fa5c9..2a3b2c79072d1 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -1,7 +1,7 @@
 import warnings
 
 import numpy as np
-from pandas import Series
+from pandas import Series, DataFrame
 import pandas.util.testing as tm
 
 
@@ -12,9 +12,6 @@ class Methods(object):
     def setup(self):
         self.s = Series(tm.makeStringIndex(10**5))
 
-    def time_cat(self):
-        self.s.str.cat(sep=',')
-
     def time_center(self):
         self.s.str.center(100)
 
@@ -87,6 +84,24 @@ def time_repeat(self, repeats):
         self.s.str.repeat(self.repeat)
 
 
+class Cat(object):
+
+    goal_time = 0.2
+    params = ([None, 5], [None, ','], [None, '-'])
+    param_names = ['others', 'sep', 'na_rep']
+
+    def setup(self, others, sep, na_rep):
+        N = int(5e5)
+        mask_gen = lambda: np.random.choice([True, False], N, p=[0.9, 0.1])
+        self.s = Series(tm.makeStringIndex(N)).where(mask_gen())
+        self.others = (DataFrame({i: tm.makeStringIndex(N).where(mask_gen())
+                                  for i in range(others)})
+                       if others is not None else None)
+
+    def time_cat(self, others, sep, na_rep):
+        self.s.str.cat(self.others, sep=sep, na_rep=na_rep)
+
+
 class Contains(object):
 
     goal_time = 0.2

From dadeeda4fa2dcab5eb9c27e8677e122a927a23a1 Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Mon, 10 Sep 2018 08:19:01 +0200
Subject: [PATCH 2/5] Parametrize na_fraction [ci skip]

---
 asv_bench/benchmarks/strings.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index 2a3b2c79072d1..7305deb606537 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -87,18 +87,19 @@ def time_repeat(self, repeats):
 class Cat(object):
 
     goal_time = 0.2
-    params = ([None, 5], [None, ','], [None, '-'])
-    param_names = ['others', 'sep', 'na_rep']
+    params = ([None, 5], [None, ','], [None, '-'], [0.0, 1e-4, 0.1])
+    param_names = ['others', 'sep', 'na_rep', 'na_frac']
 
-    def setup(self, others, sep, na_rep):
+    def setup(self, others, sep, na_rep, na_frac):
         N = int(5e5)
-        mask_gen = lambda: np.random.choice([True, False], N, p=[0.9, 0.1])
+        mask_gen = lambda: np.random.choice([True, False], N,
+                                            p=[1 - na_frac, na_frac])
         self.s = Series(tm.makeStringIndex(N)).where(mask_gen())
         self.others = (DataFrame({i: tm.makeStringIndex(N).where(mask_gen())
                                   for i in range(others)})
                        if others is not None else None)
 
-    def time_cat(self, others, sep, na_rep):
+    def time_cat(self, others, sep, na_rep, na_frac):
         self.s.str.cat(self.others, sep=sep, na_rep=na_rep)
 
 

From 4d14cfaecd1521ebc276d606cab86b07f227d6a5 Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Tue, 11 Sep 2018 00:43:41 +0200
Subject: [PATCH 3/5] Add comment to explain choice of parameters [ci skip]

---
 asv_bench/benchmarks/strings.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index 7305deb606537..783b06989051f 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -100,6 +100,10 @@ def setup(self, others, sep, na_rep, na_frac):
                        if others is not None else None)
 
     def time_cat(self, others, sep, na_rep, na_frac):
+        # before the concatenation (one caller + others columns), the total
+        # expected fraction of rows containing any NaN is:
+        #     reduce(lambda t, _: t + (1 - t) * na_frac, range(others + 1), 0)
+        # for others=5 and na_frac=0.1, this works out to ~47%
         self.s.str.cat(self.others, sep=sep, na_rep=na_rep)
 
 

From d92f5f147bb9d8f41517829906993d6add976f2e Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Tue, 11 Sep 2018 07:43:34 +0200
Subject: [PATCH 4/5] Review (WillAyd): rename others to other_cols, reduce N [ci skip]

---
 asv_bench/benchmarks/strings.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index 783b06989051f..763207443769f 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -87,24 +87,24 @@ def time_repeat(self, repeats):
 class Cat(object):
 
     goal_time = 0.2
-    params = ([None, 5], [None, ','], [None, '-'], [0.0, 1e-4, 0.1])
-    param_names = ['others', 'sep', 'na_rep', 'na_frac']
+    params = ([0, 3], [None, ','], [None, '-'], [0.0, 0.001, 0.15])
+    param_names = ['other_cols', 'sep', 'na_rep', 'na_frac']
 
-    def setup(self, others, sep, na_rep, na_frac):
-        N = int(5e5)
+    def setup(self, other_cols, sep, na_rep, na_frac):
+        N = 10 ** 5
         mask_gen = lambda: np.random.choice([True, False], N,
                                             p=[1 - na_frac, na_frac])
         self.s = Series(tm.makeStringIndex(N)).where(mask_gen())
         self.others = (DataFrame({i: tm.makeStringIndex(N).where(mask_gen())
-                                  for i in range(others)})
-                       if others is not None else None)
+                                  for i in range(other_cols)})
+                       if other_cols > 0 else None)
 
-    def time_cat(self, others, sep, na_rep, na_frac):
-        # before the concatenation (one caller + others columns), the total
+    def time_cat(self, other_cols, sep, na_rep, na_frac):
+        # before the concatenation (one caller + other_cols columns), the total
         # expected fraction of rows containing any NaN is:
-        #     reduce(lambda t, _: t + (1 - t) * na_frac, range(others + 1), 0)
-        # for others=5 and na_frac=0.1, this works out to ~47%
-        self.s.str.cat(self.others, sep=sep, na_rep=na_rep)
+        # reduce(lambda t, _: t + (1 - t) * na_frac, range(other_cols + 1), 0)
+        # for other_cols=3 and na_frac=0.15, this works out to ~48%
+        self.s.str.cat(others=self.others, sep=sep, na_rep=na_rep)
 
 
 class Contains(object):

From ddc130ef21eff0d052bd515a01dff6a0dca24189 Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Thu, 13 Sep 2018 12:24:40 +0200
Subject: [PATCH 5/5] Review (WillAyd): build others in explicit if/else [ci skip]

---
 asv_bench/benchmarks/strings.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index 763207443769f..ccfac2f73f14d 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -95,9 +95,12 @@ def setup(self, other_cols, sep, na_rep, na_frac):
         mask_gen = lambda: np.random.choice([True, False], N,
                                             p=[1 - na_frac, na_frac])
         self.s = Series(tm.makeStringIndex(N)).where(mask_gen())
-        self.others = (DataFrame({i: tm.makeStringIndex(N).where(mask_gen())
-                                  for i in range(other_cols)})
-                       if other_cols > 0 else None)
+        if other_cols == 0:
+            # str.cat self-concatenates only for others=None
+            self.others = None
+        else:
+            self.others = DataFrame({i: tm.makeStringIndex(N).where(mask_gen())
+                                     for i in range(other_cols)})
 
     def time_cat(self, other_cols, sep, na_rep, na_frac):
         # before the concatenation (one caller + other_cols columns), the total