From 5e9dc6b6c074389802bc9bafdd3c99c9c61c576a Mon Sep 17 00:00:00 2001 From: Jakub Nowacki Date: Thu, 4 Jan 2018 13:15:44 +0100 Subject: [PATCH 1/4] str.extractall with no match returns appropriate MultiIndex now. --- pandas/core/strings.py | 10 ++++------ pandas/tests/test_strings.py | 14 ++++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index fab4e77ce4467..3b7ec2ad8a508 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -794,12 +794,10 @@ def str_extractall(arr, pat, flags=0): result_key = tuple(subject_key + (match_i, )) index_list.append(result_key) - if 0 < len(index_list): - from pandas import MultiIndex - index = MultiIndex.from_tuples( - index_list, names=arr.index.names + ["match"]) - else: - index = None + from pandas import MultiIndex + index = MultiIndex.from_tuples( + index_list, names=arr.index.names + ["match"]) + result = arr._constructor_expanddim(match_list, index=index, columns=columns) return result diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 8aa69bcbfdf7f..f2317aa2ef4a0 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1074,26 +1074,28 @@ def test_extractall_single_group_with_quantifier(self): def test_extractall_no_matches(self): s = Series(['a3', 'b3', 'd4c2'], name='series_name') + i = MultiIndex(levels=[[], []], + labels=[[], []], + names=[None, 'match']) # one un-named group. r = s.str.extractall('(z)') - e = DataFrame(columns=[0]) + e = DataFrame(columns=[0], index=i) tm.assert_frame_equal(r, e) # two un-named groups. r = s.str.extractall('(z)(z)') - e = DataFrame(columns=[0, 1]) + e = DataFrame(columns=[0, 1], index=i) tm.assert_frame_equal(r, e) # one named group. r = s.str.extractall('(?P<first>z)') - e = DataFrame(columns=["first"]) + e = DataFrame(columns=["first"], index=i) tm.assert_frame_equal(r, e) # two named groups. 
r = s.str.extractall('(?P<first>z)(?P<second>z)') - e = DataFrame(columns=["first", "second"]) + e = DataFrame(columns=["first", "second"], index=i) tm.assert_frame_equal(r, e) # one named, one un-named. r = s.str.extractall('(z)(?P<second>z)') - e = DataFrame(columns=[0, - "second"]) + e = DataFrame(columns=[0, "second"], index=i) tm.assert_frame_equal(r, e) def test_extractall_stringindex(self): From b8dcd9c6c1c9a821ab41d55ddcf21df6958d5608 Mon Sep 17 00:00:00 2001 From: Jakub Nowacki Date: Thu, 4 Jan 2018 21:12:02 +0100 Subject: [PATCH 2/4] tests parametrized; whatsnew entry added --- doc/source/whatsnew/v0.23.0.txt | 2 +- pandas/tests/test_strings.py | 34 +++++++++++++++++++++++---------- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 6a48abb6c6592..a692d1e60fd24 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -325,7 +325,7 @@ Indexing - Bug in indexing non-scalar value from ``Series`` having non-unique ``Index`` will return value flattened (:issue:`17610`) - Bug in :func:`DatetimeIndex.insert` where inserting ``NaT`` into a timezone-aware index incorrectly raised (:issue:`16357`) - Bug in ``__setitem__`` when indexing a :class:`DataFrame` with a 2-d boolean ndarray (:issue:`18582`) - +- Bug in ``str.extractall`` where an empty :class:`Index` was returned instead of an appropriate :class:`MultiIndex` when there were no matches (:issue:`19034`) I/O ^^^ diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index f2317aa2ef4a0..595953b9289da 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1072,30 +1072,44 @@ def test_extractall_single_group_with_quantifier(self): e = DataFrame(['ab', 'abc', 'd', 'cd'], i) tm.assert_frame_equal(r, e) - def test_extractall_no_matches(self): - s = Series(['a3', 'b3', 'd4c2'], name='series_name') - i = MultiIndex(levels=[[], []], - labels=[[], []], - names=[None, 'match']) + 
@pytest.mark.parametrize('data, names', [ + ([], (None, )), + ([], ('i1', )), + ([], (None, 'i2')), + ([], ('i1', 'i2')), + (['a3', 'b3', 'd4c2'], (None, )), + (['a3', 'b3', 'd4c2'], ('i1', 'i2')), + (['a3', 'b3', 'd4c2'], (None, 'i2')), + (['a3', 'b3', 'd4c2'], ('i1', 'i2')), + ]) + def test_extractall_no_matches(self, data, names): + n = len(data) + if len(names) == 1: + i = Index(range(n), name=names[0]) + else: + a = (tuple([i] * (n - 1)) for i in range(n)) + i = MultiIndex.from_tuples(a, names=names) + s = Series(data, name='series_name', index=i, dtype='object') + ei = MultiIndex.from_tuples([], names=(names + ('match',))) # one un-named group. r = s.str.extractall('(z)') - e = DataFrame(columns=[0], index=i) + e = DataFrame(columns=[0], index=ei) tm.assert_frame_equal(r, e) # two un-named groups. r = s.str.extractall('(z)(z)') - e = DataFrame(columns=[0, 1], index=i) + e = DataFrame(columns=[0, 1], index=ei) tm.assert_frame_equal(r, e) # one named group. r = s.str.extractall('(?P<first>z)') - e = DataFrame(columns=["first"], index=i) + e = DataFrame(columns=["first"], index=ei) tm.assert_frame_equal(r, e) # two named groups. r = s.str.extractall('(?P<first>z)(?P<second>z)') - e = DataFrame(columns=["first", "second"], index=i) + e = DataFrame(columns=["first", "second"], index=ei) tm.assert_frame_equal(r, e) # one named, one un-named. 
r = s.str.extractall('(z)(?P<second>z)') - e = DataFrame(columns=[0, "second"], index=i) + e = DataFrame(columns=[0, "second"], index=ei) tm.assert_frame_equal(r, e) def test_extractall_stringindex(self): From 4a099735ae8a51c31a8ed1a8f424cd5fca8af824 Mon Sep 17 00:00:00 2001 From: Jakub Nowacki Date: Fri, 5 Jan 2018 10:04:45 +0100 Subject: [PATCH 3/4] test comment added --- pandas/tests/test_strings.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 595953b9289da..91fa6ff3b25f7 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1083,6 +1083,7 @@ def test_extractall_single_group_with_quantifier(self): (['a3', 'b3', 'd4c2'], ('i1', 'i2')), ]) def test_extractall_no_matches(self, data, names): + # GH19075 extractall with no matches should return a valid MultiIndex n = len(data) if len(names) == 1: i = Index(range(n), name=names[0]) From 12f94de43c870ab036510396d7572ebaf9bd26ea Mon Sep 17 00:00:00 2001 From: Jakub Nowacki Date: Fri, 5 Jan 2018 16:49:58 +0100 Subject: [PATCH 4/4] test comments newlines added --- pandas/tests/test_strings.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 91fa6ff3b25f7..973fe74429551 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1092,22 +1092,27 @@ def test_extractall_no_matches(self, data, names): i = MultiIndex.from_tuples(a, names=names) s = Series(data, name='series_name', index=i, dtype='object') ei = MultiIndex.from_tuples([], names=(names + ('match',))) + # one un-named group. r = s.str.extractall('(z)') e = DataFrame(columns=[0], index=ei) tm.assert_frame_equal(r, e) + # two un-named groups. r = s.str.extractall('(z)(z)') e = DataFrame(columns=[0, 1], index=ei) tm.assert_frame_equal(r, e) + # one named group. r = s.str.extractall('(?P<first>z)') e = DataFrame(columns=["first"], index=ei) tm.assert_frame_equal(r, e) + # two named groups. 
r = s.str.extractall('(?P<first>z)(?P<second>z)') e = DataFrame(columns=["first", "second"], index=ei) tm.assert_frame_equal(r, e) + # one named, one un-named. r = s.str.extractall('(z)(?P<second>z)') e = DataFrame(columns=[0, "second"], index=ei)