diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 84dfe73adefe3..ba272fa364145 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -353,3 +353,4 @@ Bug Fixes - Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`) +- Bug in ``pd.DataFrame.to_records`` which failed with unicode characters in column names (:issue:`11879`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d96fb094f5d5c..efbb5d6f892f5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1105,13 +1105,17 @@ def to_records(self, index=True, convert_datetime64=True): count += 1 elif index_names[0] is None: index_names = ['index'] - names = lmap(str, index_names) + lmap(str, self.columns) + names = (lmap(compat.text_type, index_names) + + lmap(compat.text_type, self.columns)) else: arrays = [self[c].get_values() for c in self.columns] - names = lmap(str, self.columns) + names = lmap(compat.text_type, self.columns) - dtype = np.dtype([(x, v.dtype) for x, v in zip(names, arrays)]) - return np.rec.fromarrays(arrays, dtype=dtype, names=names) + formats = [v.dtype for v in arrays] + return np.rec.fromarrays( + arrays, + dtype={'names': names, 'formats': formats} + ) @classmethod def from_items(cls, items, columns=None, orient='columns'): diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index 53083a602e183..fb82b0598bb0a 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -179,3 +179,16 @@ def test_to_records_with_unicode_index(self): .to_records() expected = np.rec.array([('x', 'y')], dtype=[('a', 'O'), ('b', 'O')]) tm.assert_almost_equal(result, expected) + + def test_to_records_with_unicode_column_names(self): + # Issue #11879. 
to_records used to raise an exception when used + # with column names containing non-ASCII characters in Python 2 + result = DataFrame(data={u"accented_name_é": [1.0]}).to_records() + # Note that numpy allows for unicode field names but dtypes need + # to be specified using dictionary instead of a list of tuples. + expected = np.rec.array( + [(0, 1.0)], + dtype={"names": ["index", u"accented_name_é"], + "formats": ['