From e50fa37c8038feaacbdea92c76e560ead736de5f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 27 Jun 2025 16:45:29 +0200 Subject: [PATCH 01/14] bump pyarrow minimum version to 12.0 --- ci/deps/actions-310-minimum_versions.yaml | 2 +- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311-downstream_compat.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-312.yaml | 2 +- ci/deps/actions-313.yaml | 2 +- doc/source/getting_started/install.rst | 2 +- environment.yml | 2 +- pandas/compat/_optional.py | 2 +- pyproject.toml | 8 ++++---- requirements-dev.txt | 2 +- 11 files changed, 14 insertions(+), 14 deletions(-) diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml index 9f12fe941d488..a9ea6a639043b 100644 --- a/ci/deps/actions-310-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -41,7 +41,7 @@ dependencies: - qtpy=2.3.0 - openpyxl=3.1.2 - psycopg2=2.9.6 - - pyarrow=10.0.1 + - pyarrow=12.0.1 - pyiceberg=0.7.1 - pymysql=1.1.0 - pyqt=5.15.9 diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 66d49475bf34b..4904140f2e70b 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -39,7 +39,7 @@ dependencies: - qtpy>=2.3.0 - openpyxl>=3.1.2 - psycopg2>=2.9.6 - - pyarrow>=10.0.1 + - pyarrow>=12.0.1 - pyiceberg>=0.7.1 - pymysql>=1.1.0 - pyqt>=5.15.9 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 100a250f0bf01..1c6bece3374b5 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -40,7 +40,7 @@ dependencies: - qtpy>=2.3.0 - openpyxl>=3.1.2 - psycopg2>=2.9.6 - - pyarrow>=10.0.1 + - pyarrow>=12.0.1 - pyiceberg>=0.7.1 - pymysql>=1.1.0 - pyqt>=5.15.9 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 9669c1e29a435..deb646a7ba86a 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -40,7 +40,7 @@ dependencies: - 
pyqt>=5.15.9 - openpyxl>=3.1.2 - psycopg2>=2.9.6 - - pyarrow>=10.0.1 + - pyarrow>=12.0.1 - pyiceberg>=0.7.1 - pymysql>=1.1.0 - pyreadstat>=1.2.6 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index 61f1d602bb241..97b582b80fb8f 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -40,7 +40,7 @@ dependencies: - pyqt>=5.15.9 - openpyxl>=3.1.2 - psycopg2>=2.9.6 - - pyarrow>=10.0.1 + - pyarrow>=12.0.1 - pyiceberg>=0.7.1 - pymysql>=1.1.0 - pyreadstat>=1.2.6 diff --git a/ci/deps/actions-313.yaml b/ci/deps/actions-313.yaml index 11f4428be27e5..4bc363dc4a27e 100644 --- a/ci/deps/actions-313.yaml +++ b/ci/deps/actions-313.yaml @@ -41,7 +41,7 @@ dependencies: - pyqt>=5.15.9 - openpyxl>=3.1.2 - psycopg2>=2.9.6 - - pyarrow>=10.0.1 + - pyarrow>=12.0.1 - pymysql>=1.1.0 - pyreadstat>=1.2.6 - pytables>=3.8.0 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 1589fea5f8953..ed0c8bd05098d 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -307,7 +307,7 @@ Dependency Minimum Version pip ex `PyTables `__ 3.8.0 hdf5 HDF5-based reading / writing `zlib `__ hdf5 Compression for HDF5 `fastparquet `__ 2024.2.0 - Parquet reading / writing (pyarrow is default) -`pyarrow `__ 10.0.1 parquet, feather Parquet, ORC, and feather reading / writing +`pyarrow `__ 12.0.1 parquet, feather Parquet, ORC, and feather reading / writing `PyIceberg `__ 0.7.1 iceberg Apache Iceberg reading / writing `pyreadstat `__ 1.2.6 spss SPSS files (.sav) reading `odfpy `__ 1.4.1 excel Open document format (.odf, .ods, .odt) reading / writing diff --git a/environment.yml b/environment.yml index 74186bd2581c4..80a1b720ae2ad 100644 --- a/environment.yml +++ b/environment.yml @@ -43,7 +43,7 @@ dependencies: - openpyxl>=3.1.2 - odfpy>=1.4.1 - psycopg2>=2.9.6 - - pyarrow>=10.0.1 + - pyarrow>=12.0.1 - pyiceberg>=0.7.1 - pymysql>=1.1.0 - pyreadstat>=1.2.6 diff --git a/pandas/compat/_optional.py 
b/pandas/compat/_optional.py index f01dfab0de829..068219443799d 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -38,7 +38,7 @@ "openpyxl": "3.1.2", "psycopg2": "2.9.6", # (dt dec pq3 ext lo64) "pymysql": "1.1.0", - "pyarrow": "10.0.1", + "pyarrow": "12.0.1", "pyiceberg": "0.7.1", "pyreadstat": "1.2.6", "pytest": "7.3.2", diff --git a/pyproject.toml b/pyproject.toml index b17a1eacfa717..7582e2bce3879 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,15 +59,15 @@ matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] test = ['hypothesis>=6.84.0', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0'] -pyarrow = ['pyarrow>=10.0.1'] +pyarrow = ['pyarrow>=12.0.1'] performance = ['bottleneck>=1.3.6', 'numba>=0.59.0', 'numexpr>=2.9.0'] computation = ['scipy>=1.12.0', 'xarray>=2024.1.1'] fss = ['fsspec>=2023.12.2'] aws = ['s3fs>=2023.12.2'] gcp = ['gcsfs>=2023.12.2'] excel = ['odfpy>=1.4.1', 'openpyxl>=3.1.2', 'python-calamine>=0.1.7', 'pyxlsb>=1.0.10', 'xlrd>=2.0.1', 'xlsxwriter>=3.2.0'] -parquet = ['pyarrow>=10.0.1'] -feather = ['pyarrow>=10.0.1'] +parquet = ['pyarrow>=12.0.1'] +feather = ['pyarrow>=12.0.1'] iceberg = ['pyiceberg>=0.7.1'] hdf5 = ['tables>=3.8.0'] spss = ['pyreadstat>=1.2.6'] @@ -98,7 +98,7 @@ all = ['adbc-driver-postgresql>=0.10.0', 'odfpy>=1.4.1', 'openpyxl>=3.1.2', 'psycopg2>=2.9.6', - 'pyarrow>=10.0.1', + 'pyarrow>=12.0.1', 'pyiceberg>=0.7.1', 'pymysql>=1.1.0', 'PyQt5>=5.15.9', diff --git a/requirements-dev.txt b/requirements-dev.txt index 6515797bc3b9d..b4e977e1b0b1b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -32,7 +32,7 @@ numexpr>=2.9.0 openpyxl>=3.1.2 odfpy>=1.4.1 psycopg2-binary>=2.9.6 -pyarrow>=10.0.1 +pyarrow>=12.0.1 pyiceberg>=0.7.1 pymysql>=1.1.0 pyreadstat>=1.2.6 From f9389e1d30f5a146020df22ba6f50e1ce6601349 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 27 Jun 2025 17:34:53 +0200 Subject: [PATCH 02/14] update usage of pa_version_under10p1 --- 
pandas/_testing/__init__.py | 4 ++-- pandas/compat/__init__.py | 2 -- pandas/compat/pyarrow.py | 6 ++---- pandas/core/arrays/_arrow_string_mixins.py | 4 ++-- pandas/core/arrays/arrow/accessors.py | 6 +++--- pandas/core/arrays/arrow/array.py | 7 ++++--- pandas/core/arrays/string_.py | 6 +++--- pandas/core/arrays/string_arrow.py | 9 +++++---- pandas/core/dtypes/dtypes.py | 13 ++++++++----- pandas/tests/arrays/period/test_arrow_compat.py | 3 --- pandas/tests/arrays/string_/test_string.py | 4 ++-- pandas/tests/copy_view/test_astype.py | 4 ++-- pandas/tests/groupby/test_groupby_dropna.py | 9 +-------- 13 files changed, 34 insertions(+), 43 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index ec9b5098c97c9..fc447aaba37db 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -18,7 +18,7 @@ set_locale, ) -from pandas.compat import pa_version_under10p1 +from pandas.compat import HAS_PYARROW import pandas as pd from pandas import ( @@ -183,7 +183,7 @@ ] ] -if not pa_version_under10p1: +if HAS_PYARROW: import pyarrow as pa UNSIGNED_INT_PYARROW_DTYPES = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()] diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 9f3bfdc205498..901ff3d9c161c 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -26,7 +26,6 @@ from pandas.compat.numpy import is_numpy_dev from pandas.compat.pyarrow import ( HAS_PYARROW, - pa_version_under10p1, pa_version_under11p0, pa_version_under13p0, pa_version_under14p0, @@ -160,7 +159,6 @@ def is_ci_environment() -> bool: "PYPY", "WASM", "is_numpy_dev", - "pa_version_under10p1", "pa_version_under11p0", "pa_version_under13p0", "pa_version_under14p0", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 163934bee509c..ad28597e14cb0 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -8,9 +8,8 @@ import pyarrow as pa _palv = Version(Version(pa.__version__).base_version) - 
pa_version_under10p1 = _palv < Version("10.0.1") pa_version_under11p0 = _palv < Version("11.0.0") - pa_version_under12p0 = _palv < Version("12.0.0") + pa_version_under12p1 = _palv < Version("12.0.1") pa_version_under13p0 = _palv < Version("13.0.0") pa_version_under14p0 = _palv < Version("14.0.0") pa_version_under14p1 = _palv < Version("14.0.1") @@ -22,9 +21,8 @@ pa_version_under20p0 = _palv < Version("20.0.0") HAS_PYARROW = True except ImportError: - pa_version_under10p1 = True pa_version_under11p0 = True - pa_version_under12p0 = True + pa_version_under12p1 = True pa_version_under13p0 = True pa_version_under14p0 = True pa_version_under14p1 = True diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 1ca52ce64bd77..4ca7530e134f0 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -12,13 +12,13 @@ from pandas._libs import lib from pandas.compat import ( - pa_version_under10p1, + HAS_PYARROW, pa_version_under11p0, pa_version_under13p0, pa_version_under17p0, ) -if not pa_version_under10p1: +if HAS_PYARROW: import pyarrow as pa import pyarrow.compute as pc diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index b220a94d032b5..419dcc9396c0c 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -12,13 +12,13 @@ ) from pandas.compat import ( - pa_version_under10p1, + HAS_PYARROW, pa_version_under11p0, ) from pandas.core.dtypes.common import is_list_like -if not pa_version_under10p1: +if HAS_PYARROW: import pyarrow as pa import pyarrow.compute as pc @@ -46,7 +46,7 @@ def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: def _validate(self, data) -> None: dtype = data.dtype - if pa_version_under10p1 or not isinstance(dtype, ArrowDtype): + if not HAS_PYARROW or not isinstance(dtype, ArrowDtype): # Raise AttributeError so that inspect can handle non-struct Series. 
raise AttributeError(self._validation_msg.format(dtype=dtype)) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index c18f06c3a126d..a1171690ff86d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -22,8 +22,9 @@ timezones, ) from pandas.compat import ( - pa_version_under10p1, + HAS_PYARROW, pa_version_under11p0, + pa_version_under12p1, pa_version_under13p0, ) from pandas.util._decorators import doc @@ -74,7 +75,7 @@ from pandas.io._util import _arrow_dtype_mapping from pandas.tseries.frequencies import to_offset -if not pa_version_under10p1: +if HAS_PYARROW: import pyarrow as pa import pyarrow.compute as pc @@ -300,7 +301,7 @@ class ArrowExtensionArray( _dtype: ArrowDtype def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: - if pa_version_under10p1: + if pa_version_under12p1: - msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray." + msg = "pyarrow>=12.0.1 is required for PyArrow backed ArrowExtensionArray." raise ImportError(msg) if isinstance(values, pa.Array): diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 8048306df91a2..6087e42cf273d 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -25,7 +25,7 @@ from pandas._libs.lib import ensure_string_array from pandas.compat import ( HAS_PYARROW, - pa_version_under10p1, + pa_version_under12p1, ) from pandas.compat.numpy import function as nv from pandas.util._decorators import ( @@ -182,9 +182,9 @@ def __init__( raise ValueError( f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." ) - if storage == "pyarrow" and pa_version_under10p1: + if storage == "pyarrow" and pa_version_under12p1: raise ImportError( - "pyarrow>=10.0.1 is required for PyArrow backed StringArray." + "pyarrow>=12.0.1 is required for PyArrow backed StringArray."
) if isinstance(na_value, float) and np.isnan(na_value): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 9668981df827b..624b5bf6d8b36 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -15,7 +15,8 @@ missing as libmissing, ) from pandas.compat import ( - pa_version_under10p1, + HAS_PYARROW, + pa_version_under12p1, pa_version_under13p0, pa_version_under16p0, ) @@ -39,7 +40,7 @@ ) from pandas.core.strings.object_array import ObjectStringArrayMixin -if not pa_version_under10p1: +if HAS_PYARROW: import pyarrow as pa import pyarrow.compute as pc @@ -67,8 +68,8 @@ def _chk_pyarrow_available() -> None: - if pa_version_under10p1: - msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray." + if pa_version_under12p1: + msg = "pyarrow>=12.0.1 is required for PyArrow backed ArrowExtensionArray." raise ImportError(msg) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 570074e047da6..3986392774f28 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -46,7 +46,10 @@ abbrev_to_npy_unit, ) from pandas._libs.tslibs.offsets import BDay -from pandas.compat import pa_version_under10p1 +from pandas.compat import ( + HAS_PYARROW, + pa_version_under12p1, +) from pandas.errors import PerformanceWarning from pandas.util._decorators import set_module from pandas.util._exceptions import find_stack_level @@ -66,7 +69,7 @@ is_list_like, ) -if not pa_version_under10p1: +if HAS_PYARROW: import pyarrow as pa if TYPE_CHECKING: @@ -2193,8 +2196,8 @@ class ArrowDtype(StorageExtensionDtype): def __init__(self, pyarrow_dtype: pa.DataType) -> None: super().__init__("pyarrow") - if pa_version_under10p1: - raise ImportError("pyarrow>=10.0.1 is required for ArrowDtype") + if pa_version_under12p1: + raise ImportError("pyarrow>=12.0.1 is required for ArrowDtype") if not isinstance(pyarrow_dtype, pa.DataType): raise ValueError( f"pyarrow_dtype 
({pyarrow_dtype}) must be an instance " @@ -2346,7 +2349,7 @@ def construct_from_string(cls, string: str) -> ArrowDtype: if string in ("string[pyarrow]", "str[pyarrow]"): # Ensure Registry.find skips ArrowDtype to use StringDtype instead raise TypeError("string[pyarrow] should be constructed by StringDtype") - if pa_version_under10p1: + if pa_version_under12p1: - raise ImportError("pyarrow>=10.0.1 is required for ArrowDtype") + raise ImportError("pyarrow>=12.0.1 is required for ArrowDtype") base_type = string[:-9] # get rid of "[pyarrow]" diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index 431309aca0df2..f47103cf04762 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -1,7 +1,5 @@ import pytest -from pandas.compat.pyarrow import pa_version_under10p1 - from pandas.core.dtypes.dtypes import PeriodDtype import pandas as pd @@ -33,7 +31,6 @@ def test_arrow_extension_type(): assert hash(p1) != hash(p3) -@pytest.mark.xfail(not pa_version_under10p1, reason="Wrong behavior with pyarrow 10") @pytest.mark.parametrize( "data, freq", [ diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 736c0e1782fc0..96e1cc05e284c 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -12,7 +12,7 @@ from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import ( - pa_version_under12p0, + pa_version_under12p1, pa_version_under19p0, ) import pandas.util._test_decorators as td @@ -600,7 +600,7 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) - if dtype.storage == "pyarrow" and pa_version_under12p0: + if dtype.storage == "pyarrow" and pa_version_under12p1: expected = pa.chunked_array(expected) if dtype.storage == "python": expected = pc.cast(expected, pa.string()) diff --git
a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 91f5badeb9728..90f662eeec5ca 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -4,7 +4,7 @@ import pytest from pandas.compat import HAS_PYARROW -from pandas.compat.pyarrow import pa_version_under12p0 +from pandas.compat.pyarrow import pa_version_under12p1 from pandas import ( DataFrame, @@ -196,7 +196,7 @@ def test_astype_arrow_timestamp(): ) result = df.astype("timestamp[ns][pyarrow]") assert not result._mgr._has_no_reference(0) - if pa_version_under12p0: + if pa_version_under12p1: assert not np.shares_memory( get_array(df, "a"), get_array(result, "a")._pa_array ) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 8c4ab42b7be7a..b9e95c49673e0 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas.compat.pyarrow import pa_version_under10p1 - from pandas.core.dtypes.missing import na_value_for_dtype import pandas as pd @@ -411,12 +409,7 @@ def test_groupby_drop_nan_with_multi_index(): "Float64", "category", "string", - pytest.param( - "string[pyarrow]", - marks=pytest.mark.skipif( - pa_version_under10p1, reason="pyarrow is not installed" - ), - ), + "string[pyarrow]", "datetime64[ns]", "period[D]", "Sparse[float]", From b8bc90e91fe7b45cd48dfaad1af185b35a45f1bf Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 27 Jun 2025 17:38:34 +0200 Subject: [PATCH 03/14] define HAS_PYARROW as having minimum version of pyarrow --- pandas/compat/pyarrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index ad28597e14cb0..1f20ab6a02130 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -19,7 +19,7 @@ pa_version_under18p0 = _palv < Version("18.0.0") pa_version_under19p0 = 
_palv < Version("19.0.0") pa_version_under20p0 = _palv < Version("20.0.0") - HAS_PYARROW = True + HAS_PYARROW = _palv >= Version("12.0.1") except ImportError: pa_version_under11p0 = True pa_version_under12p1 = True From f16bf766013d8739bb49a2051234584db92ff1dc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 30 Jun 2025 09:20:36 +0200 Subject: [PATCH 04/14] remove usage of pa_version_under11p0 --- pandas/compat/__init__.py | 4 +- pandas/compat/pyarrow.py | 2 - pandas/core/arrays/_arrow_string_mixins.py | 5 --- pandas/core/arrays/arrow/accessors.py | 10 +---- pandas/core/arrays/arrow/array.py | 45 ++----------------- pandas/core/strings/accessor.py | 28 +++--------- pandas/tests/extension/test_arrow.py | 16 ------- pandas/tests/io/test_parquet.py | 13 +----- .../series/accessors/test_list_accessor.py | 34 +++++--------- .../series/accessors/test_struct_accessor.py | 6 +-- 10 files changed, 25 insertions(+), 138 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 901ff3d9c161c..8ed19f97958b9 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -26,7 +26,7 @@ from pandas.compat.numpy import is_numpy_dev from pandas.compat.pyarrow import ( HAS_PYARROW, - pa_version_under11p0, + pa_version_under12p1, pa_version_under13p0, pa_version_under14p0, pa_version_under14p1, @@ -159,7 +159,7 @@ def is_ci_environment() -> bool: "PYPY", "WASM", "is_numpy_dev", - "pa_version_under11p0", + "pa_version_under12p1", "pa_version_under13p0", "pa_version_under14p0", "pa_version_under14p1", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 1f20ab6a02130..569d702592982 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -8,7 +8,6 @@ import pyarrow as pa _palv = Version(Version(pa.__version__).base_version) - pa_version_under11p0 = _palv < Version("11.0.0") pa_version_under12p1 = _palv < Version("12.0.1") pa_version_under13p0 = _palv < Version("13.0.0") pa_version_under14p0 = _palv < 
Version("14.0.0") @@ -21,7 +20,6 @@ pa_version_under20p0 = _palv < Version("20.0.0") HAS_PYARROW = _palv >= Version("12.0.1") except ImportError: - pa_version_under11p0 = True pa_version_under12p1 = True pa_version_under13p0 = True pa_version_under14p0 = True diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 4ca7530e134f0..e109f88b03e37 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -13,7 +13,6 @@ from pandas._libs import lib from pandas.compat import ( HAS_PYARROW, - pa_version_under11p0, pa_version_under13p0, pa_version_under17p0, ) @@ -132,10 +131,6 @@ def _str_get(self, i: int) -> Self: def _str_slice( self, start: int | None = None, stop: int | None = None, step: int | None = None ) -> Self: - if pa_version_under11p0: - # GH#59724 - result = self._apply_elementwise(lambda val: val[start:stop:step]) - return type(self)(pa.chunked_array(result, type=self._pa_array.type)) if start is None: if step is not None and step < 0: # GH#59710 diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 419dcc9396c0c..7f3da9be0c03d 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -11,10 +11,7 @@ cast, ) -from pandas.compat import ( - HAS_PYARROW, - pa_version_under11p0, -) +from pandas.compat import HAS_PYARROW from pandas.core.dtypes.common import is_list_like @@ -171,11 +168,6 @@ def __getitem__(self, key: int | slice) -> Series: name=self._data.name, ) elif isinstance(key, slice): - if pa_version_under11p0: - raise NotImplementedError( - f"List slice not supported by pyarrow {pa.__version__}." - ) - # TODO: Support negative start/stop/step, ideally this would be added # upstream in pyarrow. 
start, stop, step = key.start, key.stop, key.step diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a1171690ff86d..b4e60819b033f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -23,7 +23,6 @@ ) from pandas.compat import ( HAS_PYARROW, - pa_version_under11p0, pa_version_under12p1, pa_version_under13p0, ) @@ -209,16 +208,6 @@ def floordiv_compat( from pandas.core.arrays.timedeltas import TimedeltaArray -def get_unit_from_pa_dtype(pa_dtype) -> str: - # https://github.com/pandas-dev/pandas/pull/50998#discussion_r1100344804 - if pa_version_under11p0: - unit = str(pa_dtype).split("[", 1)[-1][:-1] - if unit not in ["s", "ms", "us", "ns"]: - raise ValueError(pa_dtype) - return unit - return pa_dtype.unit - - def to_pyarrow_type( dtype: ArrowDtype | pa.DataType | Dtype | None, ) -> pa.DataType | None: @@ -1200,10 +1189,6 @@ def factorize( null_encoding = "mask" if use_na_sentinel else "encode" data = self._pa_array - pa_type = data.type - if pa_version_under11p0 and pa.types.is_duration(pa_type): - # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 - data = data.cast(pa.int64()) if pa.types.is_dictionary(data.type): if null_encoding == "encode": @@ -1228,8 +1213,6 @@ def factorize( ) uniques = type(self)(combined.dictionary) - if pa_version_under11p0 and pa.types.is_duration(pa_type): - uniques = cast(ArrowExtensionArray, uniques.astype(self.dtype)) return indices, uniques def reshape(self, *args, **kwargs): @@ -1516,19 +1499,7 @@ def unique(self) -> Self: ------- ArrowExtensionArray """ - pa_type = self._pa_array.type - - if pa_version_under11p0 and pa.types.is_duration(pa_type): - # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 - data = self._pa_array.cast(pa.int64()) - else: - data = self._pa_array - - pa_result = pc.unique(data) - - if pa_version_under11p0 and pa.types.is_duration(pa_type): - pa_result = pa_result.cast(pa_type) - + pa_result = 
pc.unique(self._pa_array) return type(self)(pa_result) def value_counts(self, dropna: bool = True) -> Series: @@ -1548,18 +1519,12 @@ def value_counts(self, dropna: bool = True) -> Series: -------- Series.value_counts """ - pa_type = self._pa_array.type - if pa_version_under11p0 and pa.types.is_duration(pa_type): - # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 - data = self._pa_array.cast(pa.int64()) - else: - data = self._pa_array - from pandas import ( Index, Series, ) + data = self._pa_array vc = data.value_counts() values = vc.field(0) @@ -1569,9 +1534,6 @@ def value_counts(self, dropna: bool = True) -> Series: values = values.filter(mask) counts = counts.filter(mask) - if pa_version_under11p0 and pa.types.is_duration(pa_type): - values = values.cast(pa_type) - counts = ArrowExtensionArray(counts) index = Index(type(self)(values)) @@ -1865,8 +1827,7 @@ def pyarrow_meth(data, skip_nulls, min_count=0): # type: ignore[misc] if pa.types.is_duration(pa_type): result = result.cast(pa_type) elif pa.types.is_time(pa_type): - unit = get_unit_from_pa_dtype(pa_type) - result = result.cast(pa.duration(unit)) + result = result.cast(pa.duration(pa_type.unit)) elif pa.types.is_date(pa_type): # go with closest available unit, i.e. 
"s" result = result.cast(pa.duration("s")) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 81f7441846589..d1cf1e7504ece 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -305,8 +305,6 @@ def _wrap_result( if isinstance(result.dtype, ArrowDtype): import pyarrow as pa - from pandas.compat import pa_version_under11p0 - from pandas.core.arrays.arrow.array import ArrowExtensionArray value_lengths = pa.compute.list_value_length(result._pa_array) @@ -319,26 +317,14 @@ def _wrap_result( ) if min_len < max_len: # append nulls to each scalar list element up to max_len - if not pa_version_under11p0: - result = ArrowExtensionArray( - pa.compute.list_slice( - result._pa_array, - start=0, - stop=max_len, - return_fixed_size_list=True, - ) + result = ArrowExtensionArray( + pa.compute.list_slice( + result._pa_array, + start=0, + stop=max_len, + return_fixed_size_list=True, ) - else: - all_null = np.full(max_len, fill_value=None, dtype=object) - values = result.to_numpy() - new_values = [] - for row in values: - if len(row) < max_len: - nulls = all_null[: max_len - len(row)] - row = np.append(row, nulls) - new_values.append(row) - pa_type = result._pa_array.type - result = ArrowExtensionArray(pa.array(new_values, type=pa_type)) + ) if name is None: name = range(max_len) result = ( diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index fc5930ebcd8ac..3170f3a151663 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -39,7 +39,6 @@ PY312, is_ci_environment, is_platform_windows, - pa_version_under11p0, pa_version_under13p0, pa_version_under14p0, pa_version_under19p0, @@ -353,15 +352,6 @@ def test_from_sequence_of_strings_pa_array(self, data, request): reason="Nanosecond time parsing not supported.", ) ) - elif pa_version_under11p0 and ( - pa.types.is_duration(pa_dtype) or pa.types.is_decimal(pa_dtype) - ): - request.applymarker( - 
pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, - reason=f"pyarrow doesn't support parsing {pa_dtype}", - ) - ) elif pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None: _require_timezone_database(request) @@ -3288,9 +3278,6 @@ def test_pow_missing_operand(): tm.assert_series_equal(result, expected) -@pytest.mark.skipif( - pa_version_under11p0, reason="Decimal128 to string cast implemented in pyarrow 11" -) def test_decimal_parse_raises(): # GH 56984 ser = pd.Series(["1.2345"], dtype=ArrowDtype(pa.string())) @@ -3300,9 +3287,6 @@ def test_decimal_parse_raises(): ser.astype(ArrowDtype(pa.decimal128(1, 0))) -@pytest.mark.skipif( - pa_version_under11p0, reason="Decimal128 to string cast implemented in pyarrow 11" -) def test_decimal_parse_succeeds(): # GH 56984 ser = pd.Series(["1.2345"], dtype=ArrowDtype(pa.string())) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 78f39b649cb9a..d17b5b54f3f06 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -13,7 +13,6 @@ from pandas.compat import is_platform_windows from pandas.compat.pyarrow import ( - pa_version_under11p0, pa_version_under13p0, pa_version_under15p0, pa_version_under17p0, @@ -729,14 +728,7 @@ def test_to_bytes_without_path_or_buf_provided(self, pa, df_full): expected = df_full.copy() expected.loc[1, "string_with_nan"] = None - if pa_version_under11p0: - expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( - "M8[ns]" - ) - else: - expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( - "M8[ms]" - ) + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype("M8[ms]") tm.assert_frame_equal(res, expected) def test_duplicate_columns(self, pa): @@ -1003,8 +995,6 @@ def test_timezone_aware_index(self, pa, timezone_aware_date_list): # this use-case sets the resolution to 1 minute expected = df[:] - if pa_version_under11p0: - expected.index = expected.index.as_unit("ns") if 
timezone_aware_date_list.tzinfo != datetime.timezone.utc: # pyarrow returns pytz.FixedOffset while pandas constructs datetime.timezone # https://github.com/pandas-dev/pandas/issues/37286 @@ -1140,7 +1130,6 @@ def test_string_inference(self, tmp_path, pa, using_infer_string): ) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif(pa_version_under11p0, reason="not supported before 11.0") def test_roundtrip_decimal(self, tmp_path, pa): # GH#54768 import pyarrow as pa diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py index bec8ca13a2f5f..3541592e7c51e 100644 --- a/pandas/tests/series/accessors/test_list_accessor.py +++ b/pandas/tests/series/accessors/test_list_accessor.py @@ -10,8 +10,6 @@ pa = pytest.importorskip("pyarrow") -from pandas.compat import pa_version_under11p0 - @pytest.mark.parametrize( "list_dtype", @@ -57,20 +55,14 @@ def test_list_getitem_slice(): index=[1, 3, 7], name="a", ) - if pa_version_under11p0: - with pytest.raises( - NotImplementedError, match="List slice not supported by pyarrow " - ): - ser.list[1:None:None] - else: - actual = ser.list[1:None:None] - expected = Series( - [[2, 3], [None, 5], None], - dtype=ArrowDtype(pa.list_(pa.int64())), - index=[1, 3, 7], - name="a", - ) - tm.assert_series_equal(actual, expected) + actual = ser.list[1:None:None] + expected = Series( + [[2, 3], [None, 5], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + index=[1, 3, 7], + name="a", + ) + tm.assert_series_equal(actual, expected) def test_list_len(): @@ -105,14 +97,8 @@ def test_list_getitem_slice_invalid(): [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), ) - if pa_version_under11p0: - with pytest.raises( - NotImplementedError, match="List slice not supported by pyarrow " - ): - ser.list[1:None:0] - else: - with pytest.raises(pa.lib.ArrowInvalid, match=re.escape("`step` must be >= 1")): - ser.list[1:None:0] + with pytest.raises(pa.lib.ArrowInvalid, 
match=re.escape("`step` must be >= 1")): + ser.list[1:None:0] def test_list_accessor_non_list_dtype(): diff --git a/pandas/tests/series/accessors/test_struct_accessor.py b/pandas/tests/series/accessors/test_struct_accessor.py index 80aea75fda406..c1ef1b14ec3d0 100644 --- a/pandas/tests/series/accessors/test_struct_accessor.py +++ b/pandas/tests/series/accessors/test_struct_accessor.py @@ -2,10 +2,7 @@ import pytest -from pandas.compat.pyarrow import ( - pa_version_under11p0, - pa_version_under13p0, -) +from pandas.compat.pyarrow import pa_version_under13p0 from pandas import ( ArrowDtype, @@ -105,7 +102,6 @@ def test_struct_accessor_field_with_invalid_name_or_index(): ser.struct.field(1.1) -@pytest.mark.skipif(pa_version_under11p0, reason="pyarrow>=11.0.0 required") def test_struct_accessor_explode(): index = Index([-100, 42, 123]) ser = Series( From a6f464ae7d3a91da57026c353827691f6132cf57 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 30 Jun 2025 11:39:53 +0200 Subject: [PATCH 05/14] cleanup usage get_unit_from_pa_dtype --- pandas/tests/extension/test_arrow.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 3170f3a151663..ae51e8f1554af 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -67,10 +67,7 @@ pa = pytest.importorskip("pyarrow") -from pandas.core.arrays.arrow.array import ( - ArrowExtensionArray, - get_unit_from_pa_dtype, -) +from pandas.core.arrays.arrow.array import ArrowExtensionArray from pandas.core.arrays.arrow.extension_types import ArrowPeriodType @@ -539,8 +536,7 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): elif pa.types.is_date(pa_type): cmp_dtype = ArrowDtype(pa.duration("s")) elif pa.types.is_time(pa_type): - unit = get_unit_from_pa_dtype(pa_type) - cmp_dtype = ArrowDtype(pa.duration(unit)) + cmp_dtype = ArrowDtype(pa.duration(pa_type.unit)) 
else: cmp_dtype = ArrowDtype(pa.duration(pa_type.unit)) else: From 671fc1691086810e0db4cf39230a52902442b919 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 30 Jun 2025 22:27:49 +0200 Subject: [PATCH 06/14] update period arrow compat test to remove the failing cast --- pandas/tests/arrays/period/test_arrow_compat.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index f47103cf04762..c1d9ac0d1d273 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -57,9 +57,6 @@ def test_arrow_array(data, freq): with pytest.raises(TypeError, match=msg): pa.array(periods, type="float64") - with pytest.raises(TypeError, match="different 'freq'"): - pa.array(periods, type=ArrowPeriodType("T")) - def test_arrow_array_missing(): from pandas.core.arrays.arrow.extension_types import ArrowPeriodType From 2495ad20dc0b2e59a7181ac9c7c94d9042dbf0e7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 1 Jul 2025 08:59:46 +0200 Subject: [PATCH 07/14] fixup skip if no pyarrow --- pandas/tests/groupby/test_groupby_dropna.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index b9e95c49673e0..724ee0489f0a0 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.missing import na_value_for_dtype import pandas as pd @@ -409,7 +411,7 @@ def test_groupby_drop_nan_with_multi_index(): "Float64", "category", "string", - "string[pyarrow]", + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), "datetime64[ns]", "period[D]", "Sparse[float]", From a7cee8a2bcf1497f94f6abac356b26888461cdca Mon Sep 17 00:00:00 2001 From: Joris Van 
den Bossche Date: Tue, 1 Jul 2025 09:38:14 +0200 Subject: [PATCH 08/14] update expected message --- .../tests/arrays/string_/test_string_arrow.py | 2 +- pandas/tests/base/test_misc.py | 21 ++++++++++--------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index e6103da5021bb..2b5f60ce70b4c 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -178,7 +178,7 @@ def test_from_sequence_wrong_dtype_raises(using_infer_string): @td.skip_if_installed("pyarrow") def test_pyarrow_not_installed_raises(): - msg = re.escape("pyarrow>=10.0.1 is required for PyArrow backed") + msg = re.escape("pyarrow>=12.0.1 is required for PyArrow backed") with pytest.raises(ImportError, match=msg): StringDtype(storage="pyarrow") diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 7819b7b75f065..e6e8a6a029b17 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -3,10 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import PYPY - from pandas.core.dtypes.common import ( is_dtype_equal, is_object_dtype, @@ -81,10 +77,10 @@ def test_ndarray_compat_properties(index_or_series_obj): assert Series([1]).item() == 1 -@pytest.mark.skipif( - PYPY or using_string_dtype(), - reason="not relevant for PyPy doesn't work properly for arrow strings", -) +# @pytest.mark.skipif( +# PYPY or using_string_dtype(), +# reason="not relevant for PyPy doesn't work properly for arrow strings", +# ) def test_memory_usage(index_or_series_memory_obj): obj = index_or_series_memory_obj # Clear index caches so that len(obj) == 0 report 0 memory usage @@ -98,7 +94,12 @@ def test_memory_usage(index_or_series_memory_obj): res = obj.memory_usage() res_deep = obj.memory_usage(deep=True) - is_object = is_object_dtype(obj) or (is_ser 
and is_object_dtype(obj.index)) + def _is_object_dtype(obj): + if isinstance(obj, pd.MultiIndex): + return any(is_object_dtype(level) for level in obj.levels) + return is_object_dtype(obj) + + is_object = _is_object_dtype(obj) or (is_ser and _is_object_dtype(obj.index)) is_categorical = isinstance(obj.dtype, pd.CategoricalDtype) or ( is_ser and isinstance(obj.index.dtype, pd.CategoricalDtype) ) @@ -111,7 +112,7 @@ def test_memory_usage(index_or_series_memory_obj): assert res_deep == res == expected elif is_object or is_categorical or is_object_string: # only deep will pick them up - assert res_deep > res + assert res_deep > res, (res_deep, res) else: assert res == res_deep From 99ba44f7ec997530881d8893c00233cee6502998 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 1 Jul 2025 22:06:14 +0200 Subject: [PATCH 09/14] add back import of DatetimeTZBlock for compat with pyarrow version 12 to 15 --- pandas/core/internals/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index d64c7e33657d4..aae0b23023970 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -8,6 +8,7 @@ __all__ = [ "Block", # pyright:ignore[reportUnsupportedDunderAll)] "BlockManager", + "DatetimeTZBlock", # pyright:ignore[reportUnsupportedDunderAll)] "ExtensionBlock", # pyright:ignore[reportUnsupportedDunderAll)] "SingleBlockManager", "concatenate_managers", @@ -36,6 +37,7 @@ def __getattr__(name: str): if name in [ "Block", "ExtensionBlock", + "DatetimeTZBlock", ]: warnings.warn( f"{name} is deprecated and will be removed in a future version. 
" @@ -45,6 +47,10 @@ def __getattr__(name: str): # on hard-coding stacklevel stacklevel=2, ) + if name == "DatetimeTZBlock": + from pandas.core.internals.blocks import DatetimeLikeBlock + + return DatetimeLikeBlock if name == "ExtensionBlock": from pandas.core.internals.blocks import ExtensionBlock From 066af94ad4afcfb4d221cb40c2522a24fba2fbb3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Jul 2025 09:09:00 +0200 Subject: [PATCH 10/14] fixup pyarrow datetimetz compat and update parquet tests --- pandas/core/internals/__init__.py | 4 ++-- pandas/core/internals/api.py | 19 +++++++++++++++++++ pandas/tests/io/test_parquet.py | 18 ++++++++++++------ scripts/validate_unwanted_patterns.py | 1 + 4 files changed, 34 insertions(+), 8 deletions(-) diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index aae0b23023970..12999a44a446b 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -48,9 +48,9 @@ def __getattr__(name: str): stacklevel=2, ) if name == "DatetimeTZBlock": - from pandas.core.internals.blocks import DatetimeLikeBlock + from pandas.core.internals.api import _DatetimeTZBlock as DatetimeTZBlock - return DatetimeLikeBlock + return DatetimeTZBlock if name == "ExtensionBlock": from pandas.core.internals.blocks import ExtensionBlock diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index 04944db2ebd9c..c5d6a2fe7a6a6 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -29,6 +29,7 @@ ) from pandas.core.construction import extract_array from pandas.core.internals.blocks import ( + DatetimeLikeBlock, check_ndim, ensure_block_shape, extract_pandas_array, @@ -74,6 +75,14 @@ def _make_block(values: ArrayLike, placement: np.ndarray) -> Block: return klass(values, ndim=2, placement=placement_obj) +class _DatetimeTZBlock(DatetimeLikeBlock): + """implement a datetime64 block with a tz attribute""" + + values: DatetimeArray + + __slots__ = () 
+ + def make_block( values, placement, klass=None, ndim=None, dtype: Dtype | None = None ) -> Block: @@ -114,6 +123,16 @@ def make_block( dtype = dtype or values.dtype klass = get_block_type(dtype) + elif klass is _DatetimeTZBlock and not isinstance(values.dtype, DatetimeTZDtype): + # pyarrow calls get here (pyarrow<15) + values = DatetimeArray._simple_new( + # error: Argument "dtype" to "_simple_new" of "DatetimeArray" has + # incompatible type "Union[ExtensionDtype, dtype[Any], None]"; + # expected "Union[dtype[datetime64], DatetimeTZDtype]" + values, + dtype=dtype, # type: ignore[arg-type] + ) + if not isinstance(placement, BlockPlacement): placement = BlockPlacement(placement) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index f1547c16799fc..9f9304c8d1664 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -728,7 +728,14 @@ def test_to_bytes_without_path_or_buf_provided(self, pa, df_full): expected = df_full.copy() expected.loc[1, "string_with_nan"] = None - expected["datetime_with_nat"] = expected["datetime_with_nat"].astype("M8[ms]") + if pa_version_under13p0: + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( + "M8[ns]" + ) + else: + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( + "M8[ms]" + ) tm.assert_frame_equal(res, expected) def test_duplicate_columns(self, pa): @@ -972,15 +979,12 @@ def test_additional_extension_types(self, pa): def test_timestamp_nanoseconds(self, pa): # with version 2.6, pyarrow defaults to writing the nanoseconds, so - # this should work without error - # Note in previous pyarrows(<7.0.0), only the pseudo-version 2.0 was available + # this should work without error, even for pyarrow < 13 ver = "2.6" df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1ns", periods=10)}) check_round_trip(df, pa, write_kwargs={"version": ver}) def test_timezone_aware_index(self, pa, timezone_aware_date_list): - 
pytest.importorskip("pyarrow", "11.0.0") - idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) @@ -995,6 +999,8 @@ def test_timezone_aware_index(self, pa, timezone_aware_date_list): # this use-case sets the resolution to 1 minute expected = df[:] + if pa_version_under13p0: + expected.index = expected.index.as_unit("ns") if timezone_aware_date_list.tzinfo != datetime.timezone.utc: # pyarrow returns pytz.FixedOffset while pandas constructs datetime.timezone # https://github.com/pandas-dev/pandas/issues/37286 @@ -1178,7 +1184,7 @@ def test_infer_string_large_string_type(self, tmp_path, pa): def test_non_nanosecond_timestamps(self, temp_file): # GH#49236 - pa = pytest.importorskip("pyarrow", "11.0.0") + pa = pytest.importorskip("pyarrow", "13.0.0") pq = pytest.importorskip("pyarrow.parquet") arr = pa.array([datetime.datetime(1600, 1, 1)], type=pa.timestamp("us")) diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index d804e15f6d48f..8475747a80367 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -53,6 +53,7 @@ "_get_option", "_fill_limit_area_1d", "_make_block", + "_DatetimeTZBlock", } From 6fd257af0fa8e4f9094761d527714b653586ea8f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Jul 2025 09:15:19 +0200 Subject: [PATCH 11/14] add back str_slice compat --- pandas/core/arrays/_arrow_string_mixins.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index e109f88b03e37..07cbf489cfe1c 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -131,6 +131,10 @@ def _str_get(self, i: int) -> Self: def _str_slice( self, start: int | None = None, stop: int | None = None, step: int | None = None ) -> Self: + if pa_version_under13p0: + # GH#59724 + result = self._apply_elementwise(lambda val: 
val[start:stop:step]) + return type(self)(pa.chunked_array(result, type=self._pa_array.type)) if start is None: if step is not None and step < 0: # GH#59710 From 23783cbfa75e6a7e2a8de61add55d5111f26587c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Jul 2025 09:21:18 +0200 Subject: [PATCH 12/14] update skip versions for dataframe interchange tests --- pandas/tests/interchange/test_impl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index a41d7dec8b496..bf746a9eaa976 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -282,7 +282,7 @@ def test_empty_pyarrow(data): def test_multi_chunk_pyarrow() -> None: - pa = pytest.importorskip("pyarrow", "11.0.0") + pa = pytest.importorskip("pyarrow", "14.0.0") n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) names = ["n_legs"] table = pa.table([n_legs], names=names) @@ -488,7 +488,7 @@ def test_pandas_nullable_with_missing_values( ) -> None: # https://github.com/pandas-dev/pandas/issues/57643 # https://github.com/pandas-dev/pandas/issues/57664 - pa = pytest.importorskip("pyarrow", "11.0.0") + pa = pytest.importorskip("pyarrow", "14.0.0") import pyarrow.interchange as pai if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]": @@ -554,7 +554,7 @@ def test_pandas_nullable_without_missing_values( data: list, dtype: str, expected_dtype: str ) -> None: # https://github.com/pandas-dev/pandas/issues/57643 - pa = pytest.importorskip("pyarrow", "11.0.0") + pa = pytest.importorskip("pyarrow", "14.0.0") import pyarrow.interchange as pai if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]": From 4d79c27b33effbb72d92d912580034888211cbfa Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Jul 2025 11:05:22 +0200 Subject: [PATCH 13/14] cleanup --- pandas/tests/base/test_misc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 4f82ab8e67a0b..0a820c0d3e0bd 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -108,7 +108,7 @@ def _is_object_dtype(obj): assert res_deep == res == expected elif has_objects: # only deep will pick them up - assert res_deep > res, (res_deep, res) + assert res_deep > res else: assert res == res_deep From fef968109bfada86291823668124047f47c09607 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 3 Jul 2025 09:10:16 +0200 Subject: [PATCH 14/14] add increased version to v3.0.0 --- doc/source/whatsnew/v3.0.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 5ff1ea9d194f6..57dce003c2846 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -321,6 +321,8 @@ Optional libraries below the lowest tested version may still work, but are not c +------------------------+---------------------+ | Package | New Minimum Version | +========================+=====================+ +| pyarrow | 12.0.1 | ++------------------------+---------------------+ | pytz | 2023.4 | +------------------------+---------------------+ | fastparquet | 2024.2.0 |