Infer strings as pyarrow_numpy backed strings (#54720)

phofl · web-flow · commit 1ecfa6749d07 · 2023-08-24T21:30:03.000+02:00
* Update v2.1.0.rst
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -21,7 +21,7 @@ PyArrow will become a required dependency with pandas 3.0
 
 `PyArrow <https://arrow.apache.org/docs/python/index.html>`_ will become a required
 dependency of pandas starting with pandas 3.0. This decision was made based on
-`PDEP 12 <https://pandas.pydata.org/pdeps/0010-required-pyarrow-dependency.html>`_.
+`PDEP 10 <https://pandas.pydata.org/pdeps/0010-required-pyarrow-dependency.html>`_.
 
 This will enable more changes that are hugely beneficial to pandas users, including
 but not limited to:
@@ -41,7 +41,9 @@ Avoid NumPy object dtype for strings by default
 
 Previously, all strings were stored in columns with NumPy object dtype.
 This release introduces an option ``future.infer_string`` that infers all
-strings as PyArrow backed strings with dtype ``pd.ArrowDtype(pa.string())`` instead.
+strings as PyArrow backed strings with dtype ``"string[pyarrow_numpy]"`` instead.
+This is a new string dtype implementation that follows NumPy semantics in comparison
+operations and will return ``np.nan`` as the missing value indicator.
 This option only works if PyArrow is installed. PyArrow backed strings have a
 significantly reduced memory footprint and provide a big performance improvement
 compared to NumPy object (:issue:`54430`).
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -2682,11 +2682,9 @@ def maybe_convert_objects(ndarray[object] objects,
 
     elif seen.str_:
         if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True):
-            import pyarrow as pa
+            from pandas.core.arrays.string_ import StringDtype
 
-            from pandas.core.dtypes.dtypes import ArrowDtype
-
-            dtype = ArrowDtype(pa.string())
+            dtype = StringDtype(storage="pyarrow_numpy")
             return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
 
         seen.object_ = True
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
@@ -51,10 +51,7 @@
     is_object_dtype,
     pandas_dtype,
 )
-from pandas.core.dtypes.dtypes import (
-    ArrowDtype,
-    NumpyEADtype,
-)
+from pandas.core.dtypes.dtypes import NumpyEADtype
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
     ABCExtensionArray,
@@ -595,9 +592,9 @@ def sanitize_array(
             if data.dtype == object:
                 subarr = maybe_infer_to_datetimelike(data)
             elif data.dtype.kind == "U" and using_pyarrow_string_dtype():
-                import pyarrow as pa
+                from pandas.core.arrays.string_ import StringDtype
 
-                dtype = ArrowDtype(pa.string())
+                dtype = StringDtype(storage="pyarrow_numpy")
                 subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype)
 
             if subarr is data and copy:
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -799,10 +799,9 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
 
         dtype = _dtype_obj
         if using_pyarrow_string_dtype():
-            import pyarrow as pa
+            from pandas.core.arrays.string_ import StringDtype
 
-            pa_dtype = pa.string()
-            dtype = ArrowDtype(pa_dtype)
+            dtype = StringDtype(storage="pyarrow_numpy")
 
     elif isinstance(val, (np.datetime64, dt.datetime)):
         try:
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -32,10 +32,7 @@
     is_named_tuple,
     is_object_dtype,
 )
-from pandas.core.dtypes.dtypes import (
-    ArrowDtype,
-    ExtensionDtype,
-)
+from pandas.core.dtypes.dtypes import ExtensionDtype
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
     ABCSeries,
@@ -379,10 +376,9 @@ def ndarray_to_mgr(
             nb = new_block_2d(values, placement=bp, refs=refs)
             block_values = [nb]
     elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype():
-        import pyarrow as pa
+        dtype = StringDtype(storage="pyarrow_numpy")
 
         obj_columns = list(values)
-        dtype = ArrowDtype(pa.string())
         block_values = [
             new_block(
                 dtype.construct_array_type()._from_sequence(data, dtype=dtype),
diff --git a/pandas/io/_util.py b/pandas/io/_util.py
@@ -28,4 +28,4 @@ def _arrow_dtype_mapping() -> dict:
 def arrow_string_types_mapper() -> Callable:
     pa = import_optional_dependency("pyarrow")
 
-    return {pa.string(): pd.ArrowDtype(pa.string())}.get
+    return {pa.string(): pd.StringDtype(storage="pyarrow_numpy")}.get
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -68,7 +68,6 @@
 )
 from pandas.core.dtypes.missing import array_equivalent
 
-import pandas as pd
 from pandas import (
     DataFrame,
     DatetimeIndex,
@@ -3224,9 +3223,7 @@ def read(
         values = self.read_array("values", start=start, stop=stop)
         result = Series(values, index=index, name=self.name, copy=False)
         if using_pyarrow_string_dtype() and is_string_array(values, skipna=True):
-            import pyarrow as pa
-
-            result = result.astype(pd.ArrowDtype(pa.string()))
+            result = result.astype("string[pyarrow_numpy]")
         return result
 
     # error: Signature of "write" incompatible with supertype "Fixed"
@@ -3296,9 +3293,7 @@ def read(
             columns = items[items.get_indexer(blk_items)]
             df = DataFrame(values.T, columns=columns, index=axes[1], copy=False)
             if using_pyarrow_string_dtype() and is_string_array(values, skipna=True):
-                import pyarrow as pa
-
-                df = df.astype(pd.ArrowDtype(pa.string()))
+                df = df.astype("string[pyarrow_numpy]")
             dfs.append(df)
 
         if len(dfs) > 0:
@@ -4686,9 +4681,7 @@ def read(
                 values,  # type: ignore[arg-type]
                 skipna=True,
             ):
-                import pyarrow as pa
-
-                df = df.astype(pd.ArrowDtype(pa.string()))
+                df = df.astype("string[pyarrow_numpy]")
             frames.append(df)
 
         if len(frames) == 1:
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -2685,8 +2685,8 @@ def test_construct_with_strings_and_none(self):
 
     def test_frame_string_inference(self):
         # GH#54430
-        pa = pytest.importorskip("pyarrow")
-        dtype = pd.ArrowDtype(pa.string())
+        pytest.importorskip("pyarrow")
+        dtype = "string[pyarrow_numpy]"
         expected = DataFrame(
             {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
         )
@@ -2720,8 +2720,8 @@ def test_frame_string_inference(self):
 
     def test_frame_string_inference_array_string_dtype(self):
         # GH#54496
-        pa = pytest.importorskip("pyarrow")
-        dtype = pd.ArrowDtype(pa.string())
+        pytest.importorskip("pyarrow")
+        dtype = "string[pyarrow_numpy]"
         expected = DataFrame(
             {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
         )
diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py
@@ -46,8 +46,8 @@ def test_construct_empty_tuples(self, tuple_list):
 
     def test_index_string_inference(self):
         # GH#54430
-        pa = pytest.importorskip("pyarrow")
-        dtype = pd.ArrowDtype(pa.string())
+        pytest.importorskip("pyarrow")
+        dtype = "string[pyarrow_numpy]"
         expected = Index(["a", "b"], dtype=dtype)
         with pd.option_context("future.infer_string", True):
             ser = Index(["a", "b"])
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
@@ -2095,7 +2095,7 @@ def test_pyarrow_engine_lines_false():
 
 
 def test_json_roundtrip_string_inference(orient):
-    pa = pytest.importorskip("pyarrow")
+    pytest.importorskip("pyarrow")
     df = DataFrame(
         [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"]
     )
@@ -2104,9 +2104,9 @@ def test_json_roundtrip_string_inference(orient):
         result = read_json(StringIO(out))
     expected = DataFrame(
         [["a", "b"], ["c", "d"]],
-        dtype=pd.ArrowDtype(pa.string()),
-        index=pd.Index(["row 1", "row 2"], dtype=pd.ArrowDtype(pa.string())),
-        columns=pd.Index(["col 1", "col 2"], dtype=pd.ArrowDtype(pa.string())),
+        dtype="string[pyarrow_numpy]",
+        index=pd.Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"),
+        columns=pd.Index(["col 1", "col 2"], dtype="string[pyarrow_numpy]"),
     )
     tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -542,8 +542,8 @@ def test_ea_int_avoid_overflow(all_parsers):
 
 def test_string_inference(all_parsers):
     # GH#54430
-    pa = pytest.importorskip("pyarrow")
-    dtype = pd.ArrowDtype(pa.string())
+    pytest.importorskip("pyarrow")
+    dtype = "string[pyarrow_numpy]"
 
     data = """a,b
 x,1
diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py
@@ -392,15 +392,15 @@ def test_read_py2_hdf_file_in_py3(datapath):
 
 def test_read_infer_string(tmp_path, setup_path):
     # GH#54431
-    pa = pytest.importorskip("pyarrow")
+    pytest.importorskip("pyarrow")
     df = DataFrame({"a": ["a", "b", None]})
     path = tmp_path / setup_path
     df.to_hdf(path, key="data", format="table")
     with pd.option_context("future.infer_string", True):
         result = read_hdf(path, key="data", mode="r")
     expected = DataFrame(
         {"a": ["a", "b", None]},
-        dtype=pd.ArrowDtype(pa.string()),
-        columns=Index(["a"], dtype=pd.ArrowDtype(pa.string())),
+        dtype="string[pyarrow_numpy]",
+        columns=Index(["a"], dtype="string[pyarrow_numpy]"),
     )
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
@@ -222,14 +222,10 @@ def test_invalid_dtype_backend(self):
 
     def test_string_inference(self, tmp_path):
         # GH#54431
-        import pyarrow as pa
-
         path = tmp_path / "test_string_inference.p"
         df = pd.DataFrame(data={"a": ["x", "y"]})
         df.to_feather(path)
         with pd.option_context("future.infer_string", True):
             result = read_feather(path)
-        expected = pd.DataFrame(
-            data={"a": ["x", "y"]}, dtype=pd.ArrowDtype(pa.string())
-        )
+        expected = pd.DataFrame(data={"a": ["x", "y"]}, dtype="string[pyarrow_numpy]")
         tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
@@ -426,7 +426,7 @@ def test_string_inference(tmp_path):
         result = read_orc(path)
     expected = pd.DataFrame(
         data={"a": ["x", "y"]},
-        dtype=pd.ArrowDtype(pa.string()),
-        columns=pd.Index(["a"], dtype=pd.ArrowDtype(pa.string())),
+        dtype="string[pyarrow_numpy]",
+        columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"),
     )
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
@@ -1113,17 +1113,15 @@ def test_df_attrs_persistence(self, tmp_path, pa):
 
     def test_string_inference(self, tmp_path, pa):
         # GH#54431
-        import pyarrow as pa
-
         path = tmp_path / "test_string_inference.p"
         df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"])
         df.to_parquet(path, engine="pyarrow")
         with pd.option_context("future.infer_string", True):
             result = read_parquet(path, engine="pyarrow")
         expected = pd.DataFrame(
             data={"a": ["x", "y"]},
-            dtype=pd.ArrowDtype(pa.string()),
-            index=pd.Index(["a", "b"], dtype=pd.ArrowDtype(pa.string())),
+            dtype="string[pyarrow_numpy]",
+            index=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
         )
         tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
@@ -2946,15 +2946,15 @@ def test_read_sql_dtype_backend_table(self, string_storage, func):
 
     def test_read_sql_string_inference(self):
         # GH#54430
-        pa = pytest.importorskip("pyarrow")
+        pytest.importorskip("pyarrow")
         table = "test"
         df = DataFrame({"a": ["x", "y"]})
         df.to_sql(table, con=self.conn, index=False, if_exists="replace")
 
         with pd.option_context("future.infer_string", True):
             result = read_sql_table(table, self.conn)
 
-        dtype = pd.ArrowDtype(pa.string())
+        dtype = "string[pyarrow_numpy]"
         expected = DataFrame(
             {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
         )
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -2077,8 +2077,8 @@ def test_series_from_index_dtype_equal_does_not_copy(self):
 
     def test_series_string_inference(self):
         # GH#54430
-        pa = pytest.importorskip("pyarrow")
-        dtype = pd.ArrowDtype(pa.string())
+        pytest.importorskip("pyarrow")
+        dtype = "string[pyarrow_numpy]"
         expected = Series(["a", "b"], dtype=dtype)
         with pd.option_context("future.infer_string", True):
             ser = Series(["a", "b"])
@@ -2092,25 +2092,25 @@ def test_series_string_inference(self):
     @pytest.mark.parametrize("na_value", [None, np.nan, pd.NA])
     def test_series_string_with_na_inference(self, na_value):
         # GH#54430
-        pa = pytest.importorskip("pyarrow")
-        dtype = pd.ArrowDtype(pa.string())
+        pytest.importorskip("pyarrow")
+        dtype = "string[pyarrow_numpy]"
         expected = Series(["a", na_value], dtype=dtype)
         with pd.option_context("future.infer_string", True):
             ser = Series(["a", na_value])
         tm.assert_series_equal(ser, expected)
 
     def test_series_string_inference_scalar(self):
         # GH#54430
-        pa = pytest.importorskip("pyarrow")
-        expected = Series("a", index=[1], dtype=pd.ArrowDtype(pa.string()))
+        pytest.importorskip("pyarrow")
+        expected = Series("a", index=[1], dtype="string[pyarrow_numpy]")
         with pd.option_context("future.infer_string", True):
             ser = Series("a", index=[1])
         tm.assert_series_equal(ser, expected)
 
     def test_series_string_inference_array_string_dtype(self):
         # GH#54496
-        pa = pytest.importorskip("pyarrow")
-        expected = Series(["a", "b"], dtype=pd.ArrowDtype(pa.string()))
+        pytest.importorskip("pyarrow")
+        expected = Series(["a", "b"], dtype="string[pyarrow_numpy]")
         with pd.option_context("future.infer_string", True):
             ser = Series(np.array(["a", "b"]))
         tm.assert_series_equal(ser, expected)