Skip to content

DEPS: bump pyarrow minimum version from 10.0 to 12.0 #61723

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ci/deps/actions-310-minimum_versions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ dependencies:
- qtpy=2.3.0
- openpyxl=3.1.2
- psycopg2=2.9.6
- pyarrow=10.0.1
- pyarrow=12.0.1
- pyiceberg=0.7.1
- pymysql=1.1.0
- pyqt=5.15.9
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-310.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ dependencies:
- qtpy>=2.3.0
- openpyxl>=3.1.2
- psycopg2>=2.9.6
- pyarrow>=10.0.1
- pyarrow>=12.0.1
- pyiceberg>=0.7.1
- pymysql>=1.1.0
- pyqt>=5.15.9
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-311-downstream_compat.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ dependencies:
- qtpy>=2.3.0
- openpyxl>=3.1.2
- psycopg2>=2.9.6
- pyarrow>=10.0.1
- pyarrow>=12.0.1
- pyiceberg>=0.7.1
- pymysql>=1.1.0
- pyqt>=5.15.9
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-311.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ dependencies:
- pyqt>=5.15.9
- openpyxl>=3.1.2
- psycopg2>=2.9.6
- pyarrow>=10.0.1
- pyarrow>=12.0.1
- pyiceberg>=0.7.1
- pymysql>=1.1.0
- pyreadstat>=1.2.6
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-312.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ dependencies:
- pyqt>=5.15.9
- openpyxl>=3.1.2
- psycopg2>=2.9.6
- pyarrow>=10.0.1
- pyarrow>=12.0.1
- pyiceberg>=0.7.1
- pymysql>=1.1.0
- pyreadstat>=1.2.6
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-313.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ dependencies:
- pyqt>=5.15.9
- openpyxl>=3.1.2
- psycopg2>=2.9.6
- pyarrow>=10.0.1
- pyarrow>=12.0.1
- pymysql>=1.1.0
- pyreadstat>=1.2.6
- pytables>=3.8.0
Expand Down
2 changes: 1 addition & 1 deletion doc/source/getting_started/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ Dependency Minimum Version pip ex
`PyTables <https://github.com/PyTables/PyTables>`__ 3.8.0 hdf5 HDF5-based reading / writing
`zlib <https://github.com/madler/zlib>`__ hdf5 Compression for HDF5
`fastparquet <https://github.com/dask/fastparquet>`__ 2024.2.0 - Parquet reading / writing (pyarrow is default)
`pyarrow <https://github.com/apache/arrow>`__ 10.0.1 parquet, feather Parquet, ORC, and feather reading / writing
`pyarrow <https://github.com/apache/arrow>`__ 12.0.1 parquet, feather Parquet, ORC, and feather reading / writing
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so this is for the "Other data sources" section.

As it now also improves performance for a default dtype, should it also be added to the "Performance dependencies (recommended)" section, or will this be done in another PR, #61722?

`PyIceberg <https://py.iceberg.apache.org/>`__ 0.7.1 iceberg Apache Iceberg reading / writing
`pyreadstat <https://github.com/Roche/pyreadstat>`__ 1.2.6 spss SPSS files (.sav) reading
`odfpy <https://github.com/eea/odfpy>`__ 1.4.1 excel Open document format (.odf, .ods, .odt) reading / writing
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ dependencies:
- openpyxl>=3.1.2
- odfpy>=1.4.1
- psycopg2>=2.9.6
- pyarrow>=10.0.1
- pyarrow>=12.0.1
- pyiceberg>=0.7.1
- pymysql>=1.1.0
- pyreadstat>=1.2.6
Expand Down
4 changes: 2 additions & 2 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
set_locale,
)

from pandas.compat import pa_version_under10p1
from pandas.compat import HAS_PYARROW

import pandas as pd
from pandas import (
Expand Down Expand Up @@ -183,7 +183,7 @@
]
]

if not pa_version_under10p1:
if HAS_PYARROW:
import pyarrow as pa

UNSIGNED_INT_PYARROW_DTYPES = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]
Expand Down
6 changes: 2 additions & 4 deletions pandas/compat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,7 @@
from pandas.compat.numpy import is_numpy_dev
from pandas.compat.pyarrow import (
HAS_PYARROW,
pa_version_under10p1,
pa_version_under11p0,
pa_version_under12p1,
pa_version_under13p0,
pa_version_under14p0,
pa_version_under14p1,
Expand Down Expand Up @@ -160,8 +159,7 @@ def is_ci_environment() -> bool:
"PYPY",
"WASM",
"is_numpy_dev",
"pa_version_under10p1",
"pa_version_under11p0",
"pa_version_under12p1",
"pa_version_under13p0",
"pa_version_under14p0",
"pa_version_under14p1",
Expand Down
2 changes: 1 addition & 1 deletion pandas/compat/_optional.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
"openpyxl": "3.1.2",
"psycopg2": "2.9.6", # (dt dec pq3 ext lo64)
"pymysql": "1.1.0",
"pyarrow": "10.0.1",
"pyarrow": "12.0.1",
"pyiceberg": "0.7.1",
"pyreadstat": "1.2.6",
"pytest": "7.3.2",
Expand Down
10 changes: 3 additions & 7 deletions pandas/compat/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,7 @@
import pyarrow as pa

_palv = Version(Version(pa.__version__).base_version)
pa_version_under10p1 = _palv < Version("10.0.1")
pa_version_under11p0 = _palv < Version("11.0.0")
pa_version_under12p0 = _palv < Version("12.0.0")
pa_version_under12p1 = _palv < Version("12.0.1")
pa_version_under13p0 = _palv < Version("13.0.0")
pa_version_under14p0 = _palv < Version("14.0.0")
pa_version_under14p1 = _palv < Version("14.0.1")
Expand All @@ -20,11 +18,9 @@
pa_version_under18p0 = _palv < Version("18.0.0")
pa_version_under19p0 = _palv < Version("19.0.0")
pa_version_under20p0 = _palv < Version("20.0.0")
HAS_PYARROW = True
HAS_PYARROW = _palv >= Version("12.0.1")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I checked the current usages of HAS_PYARROW, and essentially everywhere we mean it to indicate a supported version of pyarrow (I didn't check the tests, but we only run those with supported versions anyway).

By changing the definition here, we can use HAS_PYARROW in other places to protect imports (the ones that currently use `if not pa_version_under10p1`), and then we don't have to update those every time we update the minimum version.

except ImportError:
pa_version_under10p1 = True
pa_version_under11p0 = True
pa_version_under12p0 = True
pa_version_under12p1 = True
pa_version_under13p0 = True
pa_version_under14p0 = True
pa_version_under14p1 = True
Expand Down
9 changes: 2 additions & 7 deletions pandas/core/arrays/_arrow_string_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,12 @@

from pandas._libs import lib
from pandas.compat import (
pa_version_under10p1,
pa_version_under11p0,
HAS_PYARROW,
pa_version_under13p0,
pa_version_under17p0,
)

if not pa_version_under10p1:
if HAS_PYARROW:
import pyarrow as pa
import pyarrow.compute as pc

Expand Down Expand Up @@ -132,10 +131,6 @@ def _str_get(self, i: int) -> Self:
def _str_slice(
self, start: int | None = None, stop: int | None = None, step: int | None = None
) -> Self:
if pa_version_under11p0:
# GH#59724
result = self._apply_elementwise(lambda val: val[start:stop:step])
return type(self)(pa.chunked_array(result, type=self._pa_array.type))
if start is None:
if step is not None and step < 0:
# GH#59710
Expand Down
14 changes: 3 additions & 11 deletions pandas/core/arrays/arrow/accessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,11 @@
cast,
)

from pandas.compat import (
pa_version_under10p1,
pa_version_under11p0,
)
from pandas.compat import HAS_PYARROW

from pandas.core.dtypes.common import is_list_like

if not pa_version_under10p1:
if HAS_PYARROW:
import pyarrow as pa
import pyarrow.compute as pc

Expand Down Expand Up @@ -46,7 +43,7 @@ def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:

def _validate(self, data) -> None:
dtype = data.dtype
if pa_version_under10p1 or not isinstance(dtype, ArrowDtype):
if not HAS_PYARROW or not isinstance(dtype, ArrowDtype):
# Raise AttributeError so that inspect can handle non-struct Series.
raise AttributeError(self._validation_msg.format(dtype=dtype))

Expand Down Expand Up @@ -171,11 +168,6 @@ def __getitem__(self, key: int | slice) -> Series:
name=self._data.name,
)
elif isinstance(key, slice):
if pa_version_under11p0:
raise NotImplementedError(
f"List slice not supported by pyarrow {pa.__version__}."
)

# TODO: Support negative start/stop/step, ideally this would be added
# upstream in pyarrow.
start, stop, step = key.start, key.stop, key.step
Expand Down
52 changes: 7 additions & 45 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
timezones,
)
from pandas.compat import (
pa_version_under10p1,
pa_version_under11p0,
HAS_PYARROW,
pa_version_under12p1,
pa_version_under13p0,
)
from pandas.util._decorators import doc
Expand Down Expand Up @@ -74,7 +74,7 @@
from pandas.io._util import _arrow_dtype_mapping
from pandas.tseries.frequencies import to_offset

if not pa_version_under10p1:
if HAS_PYARROW:
import pyarrow as pa
import pyarrow.compute as pc

Expand Down Expand Up @@ -208,16 +208,6 @@ def floordiv_compat(
from pandas.core.arrays.timedeltas import TimedeltaArray


def get_unit_from_pa_dtype(pa_dtype) -> str:
# https://github.com/pandas-dev/pandas/pull/50998#discussion_r1100344804
if pa_version_under11p0:
unit = str(pa_dtype).split("[", 1)[-1][:-1]
if unit not in ["s", "ms", "us", "ns"]:
raise ValueError(pa_dtype)
return unit
return pa_dtype.unit


def to_pyarrow_type(
dtype: ArrowDtype | pa.DataType | Dtype | None,
) -> pa.DataType | None:
Expand Down Expand Up @@ -300,7 +290,7 @@ class ArrowExtensionArray(
_dtype: ArrowDtype

def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
if pa_version_under10p1:
if pa_version_under12p1:
msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray."
raise ImportError(msg)
if isinstance(values, pa.Array):
Expand Down Expand Up @@ -1199,10 +1189,6 @@ def factorize(
null_encoding = "mask" if use_na_sentinel else "encode"

data = self._pa_array
pa_type = data.type
if pa_version_under11p0 and pa.types.is_duration(pa_type):
# https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
data = data.cast(pa.int64())

if pa.types.is_dictionary(data.type):
if null_encoding == "encode":
Expand All @@ -1227,8 +1213,6 @@ def factorize(
)
uniques = type(self)(combined.dictionary)

if pa_version_under11p0 and pa.types.is_duration(pa_type):
uniques = cast(ArrowExtensionArray, uniques.astype(self.dtype))
return indices, uniques

def reshape(self, *args, **kwargs):
Expand Down Expand Up @@ -1515,19 +1499,7 @@ def unique(self) -> Self:
-------
ArrowExtensionArray
"""
pa_type = self._pa_array.type

if pa_version_under11p0 and pa.types.is_duration(pa_type):
# https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
data = self._pa_array.cast(pa.int64())
else:
data = self._pa_array

pa_result = pc.unique(data)

if pa_version_under11p0 and pa.types.is_duration(pa_type):
pa_result = pa_result.cast(pa_type)

pa_result = pc.unique(self._pa_array)
return type(self)(pa_result)

def value_counts(self, dropna: bool = True) -> Series:
Expand All @@ -1547,18 +1519,12 @@ def value_counts(self, dropna: bool = True) -> Series:
--------
Series.value_counts
"""
pa_type = self._pa_array.type
if pa_version_under11p0 and pa.types.is_duration(pa_type):
# https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
data = self._pa_array.cast(pa.int64())
else:
data = self._pa_array

from pandas import (
Index,
Series,
)

data = self._pa_array
vc = data.value_counts()

values = vc.field(0)
Expand All @@ -1568,9 +1534,6 @@ def value_counts(self, dropna: bool = True) -> Series:
values = values.filter(mask)
counts = counts.filter(mask)

if pa_version_under11p0 and pa.types.is_duration(pa_type):
values = values.cast(pa_type)

counts = ArrowExtensionArray(counts)

index = Index(type(self)(values))
Expand Down Expand Up @@ -1864,8 +1827,7 @@ def pyarrow_meth(data, skip_nulls, min_count=0): # type: ignore[misc]
if pa.types.is_duration(pa_type):
result = result.cast(pa_type)
elif pa.types.is_time(pa_type):
unit = get_unit_from_pa_dtype(pa_type)
result = result.cast(pa.duration(unit))
result = result.cast(pa.duration(pa_type.unit))
elif pa.types.is_date(pa_type):
# go with closest available unit, i.e. "s"
result = result.cast(pa.duration("s"))
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from pandas._libs.lib import ensure_string_array
from pandas.compat import (
HAS_PYARROW,
pa_version_under10p1,
pa_version_under12p1,
)
from pandas.compat.numpy import function as nv
from pandas.util._decorators import (
Expand Down Expand Up @@ -182,9 +182,9 @@ def __init__(
raise ValueError(
f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
)
if storage == "pyarrow" and pa_version_under10p1:
if storage == "pyarrow" and pa_version_under12p1:
raise ImportError(
"pyarrow>=10.0.1 is required for PyArrow backed StringArray."
"pyarrow>=12.0.1 is required for PyArrow backed StringArray."
)

if isinstance(na_value, float) and np.isnan(na_value):
Expand Down
9 changes: 5 additions & 4 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
missing as libmissing,
)
from pandas.compat import (
pa_version_under10p1,
HAS_PYARROW,
pa_version_under12p1,
pa_version_under13p0,
pa_version_under16p0,
)
Expand All @@ -38,7 +39,7 @@
)
from pandas.core.strings.object_array import ObjectStringArrayMixin

if not pa_version_under10p1:
if HAS_PYARROW:
import pyarrow as pa
import pyarrow.compute as pc

Expand All @@ -63,8 +64,8 @@


def _chk_pyarrow_available() -> None:
if pa_version_under10p1:
msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray."
if pa_version_under12p1:
msg = "pyarrow>=12.0.1 is required for PyArrow backed ArrowExtensionArray."
raise ImportError(msg)


Expand Down
Loading
Loading