Skip to content

Commit 1ecfa67

Browse files
authored
Infer strings as pyarrow_numpy backed strings (#54720)
* Update v2.1.0.rst
1 parent 766e2fc commit 1ecfa67

File tree

17 files changed

+47
-68
lines changed

17 files changed

+47
-68
lines changed

doc/source/whatsnew/v2.1.0.rst

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ PyArrow will become a required dependency with pandas 3.0
2121

2222
`PyArrow <https://arrow.apache.org/docs/python/index.html>`_ will become a required
2323
dependency of pandas starting with pandas 3.0. This decision was made based on
24-
`PDEP 12 <https://pandas.pydata.org/pdeps/0010-required-pyarrow-dependency.html>`_.
24+
`PDEP 10 <https://pandas.pydata.org/pdeps/0010-required-pyarrow-dependency.html>`_.
2525

2626
This will enable more changes that are hugely beneficial to pandas users, including
2727
but not limited to:
@@ -41,7 +41,9 @@ Avoid NumPy object dtype for strings by default
4141

4242
Previously, all strings were stored in columns with NumPy object dtype.
4343
This release introduces an option ``future.infer_string`` that infers all
44-
strings as PyArrow backed strings with dtype ``pd.ArrowDtype(pa.string())`` instead.
44+
strings as PyArrow-backed strings with dtype ``"string[pyarrow_numpy]"`` instead.
45+
This is a new string dtype implementation that follows NumPy semantics in comparison
46+
operations and uses ``np.nan`` as the missing value indicator.
4547
This option only works if PyArrow is installed. PyArrow-backed strings have a
4648
significantly reduced memory footprint and provide a big performance improvement
4749
compared to NumPy object dtype (:issue:`54430`).

pandas/_libs/lib.pyx

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2682,11 +2682,9 @@ def maybe_convert_objects(ndarray[object] objects,
26822682

26832683
elif seen.str_:
26842684
if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True):
2685-
import pyarrow as pa
2685+
from pandas.core.arrays.string_ import StringDtype
26862686

2687-
from pandas.core.dtypes.dtypes import ArrowDtype
2688-
2689-
dtype = ArrowDtype(pa.string())
2687+
dtype = StringDtype(storage="pyarrow_numpy")
26902688
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
26912689

26922690
seen.object_ = True

pandas/core/construction.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,7 @@
5151
is_object_dtype,
5252
pandas_dtype,
5353
)
54-
from pandas.core.dtypes.dtypes import (
55-
ArrowDtype,
56-
NumpyEADtype,
57-
)
54+
from pandas.core.dtypes.dtypes import NumpyEADtype
5855
from pandas.core.dtypes.generic import (
5956
ABCDataFrame,
6057
ABCExtensionArray,
@@ -595,9 +592,9 @@ def sanitize_array(
595592
if data.dtype == object:
596593
subarr = maybe_infer_to_datetimelike(data)
597594
elif data.dtype.kind == "U" and using_pyarrow_string_dtype():
598-
import pyarrow as pa
595+
from pandas.core.arrays.string_ import StringDtype
599596

600-
dtype = ArrowDtype(pa.string())
597+
dtype = StringDtype(storage="pyarrow_numpy")
601598
subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype)
602599

603600
if subarr is data and copy:

pandas/core/dtypes/cast.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -799,10 +799,9 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
799799

800800
dtype = _dtype_obj
801801
if using_pyarrow_string_dtype():
802-
import pyarrow as pa
802+
from pandas.core.arrays.string_ import StringDtype
803803

804-
pa_dtype = pa.string()
805-
dtype = ArrowDtype(pa_dtype)
804+
dtype = StringDtype(storage="pyarrow_numpy")
806805

807806
elif isinstance(val, (np.datetime64, dt.datetime)):
808807
try:

pandas/core/internals/construction.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,7 @@
3232
is_named_tuple,
3333
is_object_dtype,
3434
)
35-
from pandas.core.dtypes.dtypes import (
36-
ArrowDtype,
37-
ExtensionDtype,
38-
)
35+
from pandas.core.dtypes.dtypes import ExtensionDtype
3936
from pandas.core.dtypes.generic import (
4037
ABCDataFrame,
4138
ABCSeries,
@@ -379,10 +376,9 @@ def ndarray_to_mgr(
379376
nb = new_block_2d(values, placement=bp, refs=refs)
380377
block_values = [nb]
381378
elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype():
382-
import pyarrow as pa
379+
dtype = StringDtype(storage="pyarrow_numpy")
383380

384381
obj_columns = list(values)
385-
dtype = ArrowDtype(pa.string())
386382
block_values = [
387383
new_block(
388384
dtype.construct_array_type()._from_sequence(data, dtype=dtype),

pandas/io/_util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,4 @@ def _arrow_dtype_mapping() -> dict:
2828
def arrow_string_types_mapper() -> Callable:
2929
pa = import_optional_dependency("pyarrow")
3030

31-
return {pa.string(): pd.ArrowDtype(pa.string())}.get
31+
return {pa.string(): pd.StringDtype(storage="pyarrow_numpy")}.get

pandas/io/pytables.py

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@
6868
)
6969
from pandas.core.dtypes.missing import array_equivalent
7070

71-
import pandas as pd
7271
from pandas import (
7372
DataFrame,
7473
DatetimeIndex,
@@ -3224,9 +3223,7 @@ def read(
32243223
values = self.read_array("values", start=start, stop=stop)
32253224
result = Series(values, index=index, name=self.name, copy=False)
32263225
if using_pyarrow_string_dtype() and is_string_array(values, skipna=True):
3227-
import pyarrow as pa
3228-
3229-
result = result.astype(pd.ArrowDtype(pa.string()))
3226+
result = result.astype("string[pyarrow_numpy]")
32303227
return result
32313228

32323229
# error: Signature of "write" incompatible with supertype "Fixed"
@@ -3296,9 +3293,7 @@ def read(
32963293
columns = items[items.get_indexer(blk_items)]
32973294
df = DataFrame(values.T, columns=columns, index=axes[1], copy=False)
32983295
if using_pyarrow_string_dtype() and is_string_array(values, skipna=True):
3299-
import pyarrow as pa
3300-
3301-
df = df.astype(pd.ArrowDtype(pa.string()))
3296+
df = df.astype("string[pyarrow_numpy]")
33023297
dfs.append(df)
33033298

33043299
if len(dfs) > 0:
@@ -4686,9 +4681,7 @@ def read(
46864681
values, # type: ignore[arg-type]
46874682
skipna=True,
46884683
):
4689-
import pyarrow as pa
4690-
4691-
df = df.astype(pd.ArrowDtype(pa.string()))
4684+
df = df.astype("string[pyarrow_numpy]")
46924685
frames.append(df)
46934686

46944687
if len(frames) == 1:

pandas/tests/frame/test_constructors.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2685,8 +2685,8 @@ def test_construct_with_strings_and_none(self):
26852685

26862686
def test_frame_string_inference(self):
26872687
# GH#54430
2688-
pa = pytest.importorskip("pyarrow")
2689-
dtype = pd.ArrowDtype(pa.string())
2688+
pytest.importorskip("pyarrow")
2689+
dtype = "string[pyarrow_numpy]"
26902690
expected = DataFrame(
26912691
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
26922692
)
@@ -2720,8 +2720,8 @@ def test_frame_string_inference(self):
27202720

27212721
def test_frame_string_inference_array_string_dtype(self):
27222722
# GH#54496
2723-
pa = pytest.importorskip("pyarrow")
2724-
dtype = pd.ArrowDtype(pa.string())
2723+
pytest.importorskip("pyarrow")
2724+
dtype = "string[pyarrow_numpy]"
27252725
expected = DataFrame(
27262726
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
27272727
)

pandas/tests/indexes/base_class/test_constructors.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ def test_construct_empty_tuples(self, tuple_list):
4646

4747
def test_index_string_inference(self):
4848
# GH#54430
49-
pa = pytest.importorskip("pyarrow")
50-
dtype = pd.ArrowDtype(pa.string())
49+
pytest.importorskip("pyarrow")
50+
dtype = "string[pyarrow_numpy]"
5151
expected = Index(["a", "b"], dtype=dtype)
5252
with pd.option_context("future.infer_string", True):
5353
ser = Index(["a", "b"])

pandas/tests/io/json/test_pandas.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2095,7 +2095,7 @@ def test_pyarrow_engine_lines_false():
20952095

20962096

20972097
def test_json_roundtrip_string_inference(orient):
2098-
pa = pytest.importorskip("pyarrow")
2098+
pytest.importorskip("pyarrow")
20992099
df = DataFrame(
21002100
[["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"]
21012101
)
@@ -2104,9 +2104,9 @@ def test_json_roundtrip_string_inference(orient):
21042104
result = read_json(StringIO(out))
21052105
expected = DataFrame(
21062106
[["a", "b"], ["c", "d"]],
2107-
dtype=pd.ArrowDtype(pa.string()),
2108-
index=pd.Index(["row 1", "row 2"], dtype=pd.ArrowDtype(pa.string())),
2109-
columns=pd.Index(["col 1", "col 2"], dtype=pd.ArrowDtype(pa.string())),
2107+
dtype="string[pyarrow_numpy]",
2108+
index=pd.Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"),
2109+
columns=pd.Index(["col 1", "col 2"], dtype="string[pyarrow_numpy]"),
21102110
)
21112111
tm.assert_frame_equal(result, expected)
21122112

0 commit comments

Comments
 (0)