From 369702c29839ac6fdfca195667978c05c0c6ccbe Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 20 May 2021 20:53:45 +0100 Subject: [PATCH 1/5] [ArrowStringArray] use pyarrow.compute.replace_substring(_regex) if available --- pandas/core/arrays/string_arrow.py | 23 ++ pandas/core/strings/accessor.py | 64 +-- pandas/core/strings/base.py | 20 +- pandas/core/strings/object_array.py | 36 +- pandas/tests/series/methods/test_replace.py | 11 - pandas/tests/strings/test_find_replace.py | 416 ++++++++++++-------- 6 files changed, 345 insertions(+), 225 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index d5ee28eb7017e..0ad2df77165a7 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections.abc import Callable # noqa: PDF001 import re from typing import ( TYPE_CHECKING, @@ -834,6 +835,28 @@ def _str_endswith(self, pat: str, na=None): pat = re.escape(pat) + "$" return self._str_contains(pat, na=na, regex=True) + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ): + if ( + pa_version_under4p0 + or isinstance(pat, re.Pattern) + or callable(repl) + or not case + or flags + ): + return super()._str_replace(pat, repl, n, case, flags, regex) + + func = pc.replace_substring_regex if regex else pc.replace_substring + result = func(self._data, pattern=pat, replacement=repl, max_replacements=n) + return type(self)(result) + def _str_match( self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None ): diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 43df34a7ecbb2..f48b38d38c53b 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1,14 +1,12 @@ +from __future__ import annotations + import codecs +from collections.abc import Callable # noqa: PDF001 from 
functools import wraps import re from typing import ( TYPE_CHECKING, - Dict, Hashable, - List, - Optional, - Pattern, - Union, ) import warnings @@ -43,7 +41,7 @@ if TYPE_CHECKING: from pandas import Index -_shared_docs: Dict[str, str] = {} +_shared_docs: dict[str, str] = {} _cpython_optimized_encoders = ( "utf-8", "utf8", @@ -325,7 +323,7 @@ def cons_row(x): else: index = self._orig.index # This is a mess. - dtype: Optional[str] + dtype: str | None if self._is_string and returns_string: dtype = self._orig.dtype else: @@ -391,7 +389,7 @@ def _get_series_list(self, others): or (isinstance(x, np.ndarray) and x.ndim == 1) for x in others ): - los: List[Series] = [] + los: list[Series] = [] while others: # iterate through list and append each element los = los + self._get_series_list(others.pop(0)) return los @@ -1219,7 +1217,15 @@ def fullmatch(self, pat, case=True, flags=0, na=None): return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) - def replace(self, pat, repl, n=-1, case=None, flags=0, regex=None): + def replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool | None = None, + flags: int = 0, + regex: bool | None = None, + ): r""" Replace each occurrence of pattern/regex in the Series/Index. @@ -1348,26 +1354,21 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=None): ) if len(pat) == 1: msg += ( - " In addition, single character regular expressions will" + " In addition, single character regular expressions will " "*not* be treated as literal strings when regex=True." 
) warnings.warn(msg, FutureWarning, stacklevel=3) - regex = True # Check whether repl is valid (GH 13438, GH 15055) if not (isinstance(repl, str) or callable(repl)): raise TypeError("repl must be a string or callable") is_compiled_re = is_re(pat) - if regex: - if is_compiled_re: - if (case is not None) or (flags != 0): - raise ValueError( - "case and flags cannot be set when pat is a compiled regex" - ) - elif case is None: - # not a compiled regex, set default case - case = True + if regex or regex is None: + if is_compiled_re and (case is not None or flags != 0): + raise ValueError( + "case and flags cannot be set when pat is a compiled regex" + ) elif is_compiled_re: raise ValueError( @@ -1376,6 +1377,15 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=None): elif callable(repl): raise ValueError("Cannot use a callable replacement when regex=False") + if regex is None: + if isinstance(pat, str) and len(pat) == 1: + regex = False + else: + regex = True + + if case is None: + case = True + result = self._data.array._str_replace( pat, repl, n=n, case=case, flags=flags, regex=regex ) @@ -2292,7 +2302,7 @@ def findall(self, pat, flags=0): @forbid_nonstring_types(["bytes"]) def extract( self, pat: str, flags: int = 0, expand: bool = True - ) -> Union[FrameOrSeriesUnion, "Index"]: + ) -> FrameOrSeriesUnion | Index: r""" Extract capture groups in the regex `pat` as columns in a DataFrame. 
@@ -2733,7 +2743,7 @@ def len(self): # boolean: # isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle # _doc_args holds dict of strings to use in substituting casemethod docs - _doc_args: Dict[str, Dict[str, str]] = {} + _doc_args: dict[str, dict[str, str]] = {} _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""} _doc_args["upper"] = {"type": "uppercase", "method": "upper", "version": ""} _doc_args["title"] = {"type": "titlecase", "method": "title", "version": ""} @@ -2971,7 +2981,7 @@ def casefold(self): ) -def cat_safe(list_of_columns: List, sep: str): +def cat_safe(list_of_columns: list, sep: str): """ Auxiliary function for :meth:`str.cat`. @@ -3007,7 +3017,7 @@ def cat_safe(list_of_columns: List, sep: str): return result -def cat_core(list_of_columns: List, sep: str): +def cat_core(list_of_columns: list, sep: str): """ Auxiliary function for :meth:`str.cat` @@ -3046,14 +3056,14 @@ def _result_dtype(arr): return object -def _get_single_group_name(regex: Pattern) -> Hashable: +def _get_single_group_name(regex: re.Pattern) -> Hashable: if regex.groupindex: return next(iter(regex.groupindex)) else: return None -def _get_group_names(regex: Pattern) -> List[Hashable]: +def _get_group_names(regex: re.Pattern) -> list[Hashable]: """ Get named groups from compiled regex. 
@@ -3119,7 +3129,7 @@ def str_extract(accessor: StringMethods, pat: str, flags: int = 0, expand: bool else: result_list = _str_extract(obj.array, pat, flags=flags, expand=returns_df) - result_index: Optional["Index"] + result_index: Index | None if isinstance(obj, ABCSeries): result_index = obj.index else: diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index a77f8861a7c02..730870b448cb2 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -1,8 +1,8 @@ +from __future__ import annotations + import abc -from typing import ( - Pattern, - Union, -) +from collections.abc import Callable # noqa: PDF001 +import re import numpy as np @@ -52,7 +52,15 @@ def _str_endswith(self, pat, na=None): pass @abc.abstractmethod - def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ): pass @abc.abstractmethod @@ -68,7 +76,7 @@ def _str_match( @abc.abstractmethod def _str_fullmatch( self, - pat: Union[str, Pattern], + pat: str | re.Pattern, case: bool = True, flags: int = 0, na: Scalar = np.nan, diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 869eabc76b555..8505a88adc212 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -1,11 +1,8 @@ +from __future__ import annotations + +from collections.abc import Callable # noqa: PDF001 import re import textwrap -from typing import ( - Optional, - Pattern, - Set, - Union, -) import unicodedata import numpy as np @@ -18,10 +15,7 @@ Scalar, ) -from pandas.core.dtypes.common import ( - is_re, - is_scalar, -) +from pandas.core.dtypes.common import is_scalar from pandas.core.dtypes.missing import isna from pandas.core.strings.base import BaseStringArrayMethods @@ -38,7 +32,7 @@ def __len__(self): # For typing, _str_map relies on the object being 
sized. raise NotImplementedError - def _str_map(self, f, na_value=None, dtype: Optional[Dtype] = None): + def _str_map(self, f, na_value=None, dtype: Dtype | None = None): """ Map a callable over valid element of the array. @@ -138,15 +132,21 @@ def _str_endswith(self, pat, na=None): f = lambda x: x.endswith(pat) return self._str_map(f, na_value=na, dtype=np.dtype(bool)) - def _str_replace(self, pat, repl, n=-1, case: bool = True, flags=0, regex=True): - is_compiled_re = is_re(pat) - + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ): if case is False: # add case flag, if provided flags |= re.IGNORECASE - if regex and (is_compiled_re or len(pat) > 1 or flags or callable(repl)): - if not is_compiled_re: + if regex or flags or callable(repl): + if not isinstance(pat, re.Pattern): pat = re.compile(pat, flags=flags) n = n if n >= 0 else 0 @@ -198,7 +198,7 @@ def _str_match( def _str_fullmatch( self, - pat: Union[str, Pattern], + pat: str | re.Pattern, case: bool = True, flags: int = 0, na: Scalar = None, @@ -339,7 +339,7 @@ def _str_get_dummies(self, sep="|"): except TypeError: arr = sep + arr.astype(str) + sep - tags: Set[str] = set() + tags: set[str] = set() for ts in Series(arr).str.split(sep): tags.update(ts) tags2 = sorted(tags - {""}) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index b21a2c54ae615..c32d74c17a47e 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -449,14 +449,3 @@ def test_replace_with_compiled_regex(self): result = s.replace({regex: "z"}, regex=True) expected = pd.Series(["z", "b", "c"]) tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("pattern", ["^.$", "."]) - def test_str_replace_regex_default_raises_warning(self, pattern): - # https://github.com/pandas-dev/pandas/pull/24809 - s = pd.Series(["a", "b", 
"c"]) - msg = r"The default value of regex will change from True to False" - if len(pattern) == 1: - msg += r".*single character regular expressions.*not.*literal strings" - with tm.assert_produces_warning(FutureWarning) as w: - s.str.replace(pattern, "") - assert re.match(msg, str(w[0].message)) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 0815d23f2b493..f9054b3de0a72 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -10,6 +10,10 @@ _testing as tm, ) +# -------------------------------------------------------------------------------------- +# str.contains +# -------------------------------------------------------------------------------------- + def test_contains(any_string_dtype): values = np.array( @@ -148,6 +152,81 @@ def test_contains_na_kwarg_for_nullable_string_dtype( tm.assert_series_equal(result, expected) +def test_contains_moar(any_string_dtype): + # PR #1179 + s = Series( + ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"], + dtype=any_string_dtype, + ) + + result = s.str.contains("a") + expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected = Series( + [False, False, False, True, True, False, np.nan, False, False, True], + dtype=expected_dtype, + ) + tm.assert_series_equal(result, expected) + + result = s.str.contains("a", case=False) + expected = Series( + [True, False, False, True, True, False, np.nan, True, False, True], + dtype=expected_dtype, + ) + tm.assert_series_equal(result, expected) + + result = s.str.contains("Aa") + expected = Series( + [False, False, False, True, False, False, np.nan, False, False, False], + dtype=expected_dtype, + ) + tm.assert_series_equal(result, expected) + + result = s.str.contains("ba") + expected = Series( + [False, False, False, True, False, False, np.nan, False, False, False], + dtype=expected_dtype, + ) + tm.assert_series_equal(result, expected) + + 
result = s.str.contains("ba", case=False) + expected = Series( + [False, False, False, True, True, False, np.nan, True, False, False], + dtype=expected_dtype, + ) + tm.assert_series_equal(result, expected) + + +def test_contains_nan(any_string_dtype): + # PR #14171 + s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype) + + result = s.str.contains("foo", na=False) + expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected = Series([False, False, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + result = s.str.contains("foo", na=True) + expected = Series([True, True, True], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + result = s.str.contains("foo", na="foo") + if any_string_dtype == "object": + expected = Series(["foo", "foo", "foo"], dtype=np.object_) + else: + expected = Series([True, True, True], dtype="boolean") + tm.assert_series_equal(result, expected) + + result = s.str.contains("foo") + expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + +# -------------------------------------------------------------------------------------- +# str.startswith +# -------------------------------------------------------------------------------------- + + @pytest.mark.parametrize("dtype", [None, "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) @@ -195,6 +274,11 @@ def test_startswith_nullable_string_dtype(nullable_string_dtype, na): tm.assert_series_equal(result, exp) +# -------------------------------------------------------------------------------------- +# str.endswith +# -------------------------------------------------------------------------------------- + + @pytest.mark.parametrize("dtype", [None, "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) 
@pytest.mark.parametrize("na", [True, False]) @@ -242,39 +326,50 @@ def test_endswith_nullable_string_dtype(nullable_string_dtype, na): tm.assert_series_equal(result, exp) +# -------------------------------------------------------------------------------------- +# str.replace +# -------------------------------------------------------------------------------------- + + def test_replace(any_string_dtype): - values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) + ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) - result = values.str.replace("BAD[_]*", "", regex=True) + result = ser.str.replace("BAD[_]*", "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - result = values.str.replace("BAD[_]*", "", n=1, regex=True) + +def test_replace_max_replacements(any_string_dtype): + ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) + expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) + result = ser.str.replace("BAD[_]*", "", n=1, regex=True) + tm.assert_series_equal(result, expected) + + expected = Series(["foo__barBAD", np.nan], dtype=any_string_dtype) + result = ser.str.replace("BAD", "", n=1, regex=False) tm.assert_series_equal(result, expected) def test_replace_mixed_object(): - mixed = Series( + ser = Series( ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) - - result = Series(mixed).str.replace("BAD[_]*", "", regex=True) + result = Series(ser).str.replace("BAD[_]*", "", regex=True) expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) - assert isinstance(result, Series) - tm.assert_almost_equal(result, expected) + tm.assert_series_equal(result, expected) def test_replace_unicode(any_string_dtype): - values = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) + ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, 
\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) - result = values.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) + result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("repl", [None, 3, {"a": "b"}]) @pytest.mark.parametrize("data", [["a", "b", None], ["a", "b", "c", "ad"]]) -def test_replace_raises(any_string_dtype, index_or_series, repl, data): +def test_replace_wrong_repl_type_raises(any_string_dtype, index_or_series, repl, data): # https://github.com/pandas-dev/pandas/issues/13438 msg = "repl must be a string or callable" obj = index_or_series(data, dtype=any_string_dtype) @@ -284,11 +379,11 @@ def test_replace_raises(any_string_dtype, index_or_series, repl, data): def test_replace_callable(any_string_dtype): # GH 15055 - values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) + ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) # test with callable repl = lambda m: m.group(0).swapcase() - result = values.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) + result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -311,100 +406,193 @@ def test_replace_callable_raises(any_string_dtype, repl): def test_replace_callable_named_groups(any_string_dtype): # test regex named groups - values = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype) + ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype) pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)" repl = lambda m: m.group("middle").swapcase() - result = values.str.replace(pat, repl, regex=True) + result = ser.str.replace(pat, repl, regex=True) expected = Series(["bAR", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) def test_replace_compiled_regex(any_string_dtype): # GH 15446 - values = Series(["fooBAD__barBAD", np.nan], 
dtype=any_string_dtype) + ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) # test with compiled regex pat = re.compile(r"BAD_*") - result = values.str.replace(pat, "", regex=True) + result = ser.str.replace(pat, "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - result = values.str.replace(pat, "", n=1, regex=True) + result = ser.str.replace(pat, "", n=1, regex=True) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) def test_replace_compiled_regex_mixed_object(): pat = re.compile(r"BAD_*") - mixed = Series( + ser = Series( ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) - - result = Series(mixed).str.replace(pat, "", regex=True) + result = Series(ser).str.replace(pat, "", regex=True) expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) - assert isinstance(result, Series) - tm.assert_almost_equal(result, expected) + tm.assert_series_equal(result, expected) def test_replace_compiled_regex_unicode(any_string_dtype): - values = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) + ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) - result = values.str.replace(pat, ", ") + result = ser.str.replace(pat, ", ") tm.assert_series_equal(result, expected) def test_replace_compiled_regex_raises(any_string_dtype): # case and flags provided to str.replace will have no effect # and will produce warnings - values = Series(["fooBAD__barBAD__bad", np.nan], dtype=any_string_dtype) + ser = Series(["fooBAD__barBAD__bad", np.nan], dtype=any_string_dtype) pat = re.compile(r"BAD_*") msg = "case and flags cannot be set when pat is a compiled regex" with pytest.raises(ValueError, match=msg): - values.str.replace(pat, "", 
flags=re.IGNORECASE) + ser.str.replace(pat, "", flags=re.IGNORECASE) with pytest.raises(ValueError, match=msg): - values.str.replace(pat, "", case=False) + ser.str.replace(pat, "", case=False) with pytest.raises(ValueError, match=msg): - values.str.replace(pat, "", case=True) + ser.str.replace(pat, "", case=True) def test_replace_compiled_regex_callable(any_string_dtype): # test with callable - values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) + ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) repl = lambda m: m.group(0).swapcase() pat = re.compile("[a-z][A-Z]{2}") - result = values.str.replace(pat, repl, n=2) + result = ser.str.replace(pat, repl, n=2) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) -def test_replace_literal(any_string_dtype): +@pytest.mark.parametrize( + "regex,expected", [(True, ["bao", "bao", np.nan]), (False, ["bao", "foo", np.nan])] +) +def test_replace_literal(regex, expected, any_string_dtype): # GH16808 literal replace (regex=False vs regex=True) - values = Series(["f.o", "foo", np.nan], dtype=any_string_dtype) - expected = Series(["bao", "bao", np.nan], dtype=any_string_dtype) - result = values.str.replace("f.", "ba", regex=True) + ser = Series(["f.o", "foo", np.nan], dtype=any_string_dtype) + expected = Series(expected, dtype=any_string_dtype) + result = ser.str.replace("f.", "ba", regex=regex) tm.assert_series_equal(result, expected) - expected = Series(["bao", "foo", np.nan], dtype=any_string_dtype) - result = values.str.replace("f.", "ba", regex=False) - tm.assert_series_equal(result, expected) - # Cannot do a literal replace if given a callable repl or compiled - # pattern - callable_repl = lambda m: m.group(0).swapcase() - compiled_pat = re.compile("[a-z][A-Z]{2}") +def test_replace_literal_callable_raises(any_string_dtype): + ser = Series([], dtype=any_string_dtype) + repl = lambda m: m.group(0).swapcase() msg = "Cannot use a callable 
replacement when regex=False" with pytest.raises(ValueError, match=msg): - values.str.replace("abc", callable_repl, regex=False) + ser.str.replace("abc", repl, regex=False) + + +def test_replace_literal_compiled_raises(any_string_dtype): + ser = Series([], dtype=any_string_dtype) + pat = re.compile("[a-z][A-Z]{2}") msg = "Cannot use a compiled regex as replacement pattern with regex=False" with pytest.raises(ValueError, match=msg): - values.str.replace(compiled_pat, "", regex=False) + ser.str.replace(pat, "", regex=False) + + +def test_replace_moar(any_string_dtype): + # PR #1179 + ser = Series( + ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"], + dtype=any_string_dtype, + ) + + result = ser.str.replace("A", "YYY") + expected = Series( + ["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"], + dtype=any_string_dtype, + ) + tm.assert_series_equal(result, expected) + + result = ser.str.replace("A", "YYY", case=False) + expected = Series( + [ + "YYY", + "B", + "C", + "YYYYYYbYYY", + "BYYYcYYY", + "", + np.nan, + "CYYYBYYY", + "dog", + "cYYYt", + ], + dtype=any_string_dtype, + ) + tm.assert_series_equal(result, expected) + + result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) + expected = Series( + [ + "A", + "B", + "C", + "XX-XX ba", + "XX-XX ca", + "", + np.nan, + "XX-XX BA", + "XX-XX ", + "XX-XX t", + ], + dtype=any_string_dtype, + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("regex", [True, False]) +def test_replace_not_case_sensitive(regex, any_string_dtype): + ser = Series(["A", "a", np.nan], dtype=any_string_dtype) + result = ser.str.replace("A", "A", case=False, regex=regex) + expected = Series(["A", "A", np.nan], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) + + +def test_replace_regex_default_warning(any_string_dtype): + # https://github.com/pandas-dev/pandas/pull/24809 + s = Series(["a", "b", "ac", np.nan, ""], dtype=any_string_dtype) + msg = ( + "The 
default value of regex will change from True to False in a " + "future version\\.$" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.str.replace("^.$", "a") + expected = Series(["a", "a", "ac", np.nan, ""], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) + + +def test_replace_regex_default_warning_single_character(any_string_dtype): + # https://github.com/pandas-dev/pandas/pull/24809 + s = Series(["a.b", ".", "b", np.nan, ""], dtype=any_string_dtype) + msg = re.escape( + "The default value of regex will change from True to False in a " + "future version. In addition, single character regular expressions will *not* " + "be treated as literal strings when regex=True." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.str.replace(".", "a") + expected = Series(["aab", "a", "b", np.nan, ""], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) + + +# -------------------------------------------------------------------------------------- +# str.match +# -------------------------------------------------------------------------------------- def test_match(any_string_dtype): @@ -484,6 +672,11 @@ def test_match_case_kwarg(any_string_dtype): tm.assert_series_equal(result, expected) +# -------------------------------------------------------------------------------------- +# str.fullmatch +# -------------------------------------------------------------------------------------- + + def test_fullmatch(any_string_dtype): # GH 32806 ser = Series( @@ -523,6 +716,11 @@ def test_fullmatch_case_kwarg(any_string_dtype): tm.assert_series_equal(result, expected) +# -------------------------------------------------------------------------------------- +# str.findall +# -------------------------------------------------------------------------------------- + + def test_findall(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan, "foo", "BAD"], dtype=any_string_dtype) result = 
ser.str.findall("BAD[_]*") @@ -563,6 +761,11 @@ def test_findall_mixed_object(): tm.assert_series_equal(result, expected) +# -------------------------------------------------------------------------------------- +# str.find +# -------------------------------------------------------------------------------------- + + def test_find(any_string_dtype): ser = Series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"], dtype=any_string_dtype @@ -646,6 +849,11 @@ def test_find_nan(any_string_dtype): tm.assert_series_equal(result, expected) +# -------------------------------------------------------------------------------------- +# str.translate +# -------------------------------------------------------------------------------------- + + def test_translate(index_or_series, any_string_dtype): obj = index_or_series( ["abcdefg", "abcc", "cdddfg", "cdefggg"], dtype=any_string_dtype @@ -670,125 +878,7 @@ def test_translate_mixed_object(): tm.assert_series_equal(result, expected) -def test_contains_moar(any_string_dtype): - # PR #1179 - s = Series( - ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"], - dtype=any_string_dtype, - ) - - result = s.str.contains("a") - expected_dtype = "object" if any_string_dtype == "object" else "boolean" - expected = Series( - [False, False, False, True, True, False, np.nan, False, False, True], - dtype=expected_dtype, - ) - tm.assert_series_equal(result, expected) - - result = s.str.contains("a", case=False) - expected = Series( - [True, False, False, True, True, False, np.nan, True, False, True], - dtype=expected_dtype, - ) - tm.assert_series_equal(result, expected) - - result = s.str.contains("Aa") - expected = Series( - [False, False, False, True, False, False, np.nan, False, False, False], - dtype=expected_dtype, - ) - tm.assert_series_equal(result, expected) - - result = s.str.contains("ba") - expected = Series( - [False, False, False, True, False, False, np.nan, False, False, False], - dtype=expected_dtype, - ) - 
tm.assert_series_equal(result, expected) - - result = s.str.contains("ba", case=False) - expected = Series( - [False, False, False, True, True, False, np.nan, True, False, False], - dtype=expected_dtype, - ) - tm.assert_series_equal(result, expected) - - -def test_contains_nan(any_string_dtype): - # PR #14171 - s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype) - - result = s.str.contains("foo", na=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" - expected = Series([False, False, False], dtype=expected_dtype) - tm.assert_series_equal(result, expected) - - result = s.str.contains("foo", na=True) - expected = Series([True, True, True], dtype=expected_dtype) - tm.assert_series_equal(result, expected) - - result = s.str.contains("foo", na="foo") - if any_string_dtype == "object": - expected = Series(["foo", "foo", "foo"], dtype=np.object_) - else: - expected = Series([True, True, True], dtype="boolean") - tm.assert_series_equal(result, expected) - - result = s.str.contains("foo") - expected_dtype = "object" if any_string_dtype == "object" else "boolean" - expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) - tm.assert_series_equal(result, expected) - - -def test_replace_moar(any_string_dtype): - # PR #1179 - s = Series( - ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"], - dtype=any_string_dtype, - ) - - result = s.str.replace("A", "YYY") - expected = Series( - ["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"], - dtype=any_string_dtype, - ) - tm.assert_series_equal(result, expected) - - result = s.str.replace("A", "YYY", case=False) - expected = Series( - [ - "YYY", - "B", - "C", - "YYYYYYbYYY", - "BYYYcYYY", - "", - np.nan, - "CYYYBYYY", - "dog", - "cYYYt", - ], - dtype=any_string_dtype, - ) - tm.assert_series_equal(result, expected) - - result = s.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) - expected = Series( - [ - "A", - "B", - "C", - "XX-XX ba", - 
"XX-XX ca", - "", - np.nan, - "XX-XX BA", - "XX-XX ", - "XX-XX t", - ], - dtype=any_string_dtype, - ) - tm.assert_series_equal(result, expected) +# -------------------------------------------------------------------------------------- def test_flags_kwarg(any_string_dtype): From 19ee363dc4b4301ea73c1cf50462fea3aae62d5c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 21 May 2021 11:55:12 +0100 Subject: [PATCH 2/5] parametrize test_replace_regex_single_character --- pandas/tests/strings/test_find_replace.py | 26 ++++++++++++++++------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index f9054b3de0a72..80be19438f39d 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -576,16 +576,26 @@ def test_replace_regex_default_warning(any_string_dtype): tm.assert_series_equal(result, expected) -def test_replace_regex_default_warning_single_character(any_string_dtype): +@pytest.mark.parametrize("regex", [True, False, None]) +def test_replace_regex_single_character(regex, any_string_dtype): # https://github.com/pandas-dev/pandas/pull/24809 + + # The current behavior is to treat single character patterns as literal strings, + # even when ``regex`` is set to ``True``. + s = Series(["a.b", ".", "b", np.nan, ""], dtype=any_string_dtype) - msg = re.escape( - "The default value of regex will change from True to False in a " - "future version. In addition, single character regular expressions will *not* " - "be treated as literal strings when regex=True." - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s.str.replace(".", "a") + + if regex is None: + msg = re.escape( + "The default value of regex will change from True to False in a future " + "version. In addition, single character regular expressions will *not* " + "be treated as literal strings when regex=True." 
+ ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.str.replace(".", "a", regex=regex) + else: + result = s.str.replace(".", "a", regex=regex) + expected = Series(["aab", "a", "b", np.nan, ""], dtype=any_string_dtype) tm.assert_series_equal(result, expected) From fa666c24e849e2545e0fa4bf9fa3fa67f8ba9928 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 21 May 2021 12:01:32 +0100 Subject: [PATCH 3/5] undo bugfix that's not supposed to be fixed yet and add test --- pandas/core/strings/accessor.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index f48b38d38c53b..ca0067b77ee23 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1377,11 +1377,13 @@ def replace( elif callable(repl): raise ValueError("Cannot use a callable replacement when regex=False") + # The current behavior is to treat single character patterns as literal strings, + # even when ``regex`` is set to ``True``. 
+ if isinstance(pat, str) and len(pat) == 1: + regex = False + if regex is None: - if isinstance(pat, str) and len(pat) == 1: - regex = False - else: - regex = True + regex = True if case is None: case = True From 2505ca41945820307ade2b6d6ccaaae626fa8f36 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 21 May 2021 13:00:03 +0100 Subject: [PATCH 4/5] fix case flag with regex=False --- pandas/core/strings/object_array.py | 2 ++ pandas/tests/strings/test_find_replace.py | 14 +++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 8505a88adc212..c214ada9c1ada 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -147,6 +147,8 @@ def _str_replace( if regex or flags or callable(repl): if not isinstance(pat, re.Pattern): + if regex is False: + pat = re.escape(pat) pat = re.compile(pat, flags=flags) n = n if n >= 0 else 0 diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 80be19438f39d..ecc15c8b32004 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -555,11 +555,15 @@ def test_replace_moar(any_string_dtype): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("regex", [True, False]) -def test_replace_not_case_sensitive(regex, any_string_dtype): - ser = Series(["A", "a", np.nan], dtype=any_string_dtype) - result = ser.str.replace("A", "A", case=False, regex=regex) - expected = Series(["A", "A", np.nan], dtype=any_string_dtype) +def test_replace_not_case_sensitive_not_regex(any_string_dtype): + ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype) + + result = ser.str.replace("a", "c", case=False, regex=False) + expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) + + result = ser.str.replace("a.", "c.", case=False, 
regex=False) + expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) From 26bd1efb43d22d078995f91926cfc8eaac8225fa Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 21 May 2021 13:16:21 +0100 Subject: [PATCH 5/5] add issue number --- pandas/tests/strings/test_find_replace.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index ecc15c8b32004..391c71e57399a 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -556,6 +556,7 @@ def test_replace_moar(any_string_dtype): def test_replace_not_case_sensitive_not_regex(any_string_dtype): + # https://github.com/pandas-dev/pandas/issues/41602 ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype) result = ser.str.replace("a", "c", case=False, regex=False)