diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml index 9f12fe941d488..a9ea6a639043b 100644 --- a/ci/deps/actions-310-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -41,7 +41,7 @@ dependencies: - qtpy=2.3.0 - openpyxl=3.1.2 - psycopg2=2.9.6 - - pyarrow=10.0.1 + - pyarrow=12.0.1 - pyiceberg=0.7.1 - pymysql=1.1.0 - pyqt=5.15.9 diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 66d49475bf34b..4904140f2e70b 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -39,7 +39,7 @@ dependencies: - qtpy>=2.3.0 - openpyxl>=3.1.2 - psycopg2>=2.9.6 - - pyarrow>=10.0.1 + - pyarrow>=12.0.1 - pyiceberg>=0.7.1 - pymysql>=1.1.0 - pyqt>=5.15.9 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 70e66a18daba9..1fc8a9ed21777 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -40,7 +40,7 @@ dependencies: - qtpy>=2.3.0 - openpyxl>=3.1.2 - psycopg2>=2.9.6 - - pyarrow>=10.0.1 + - pyarrow>=12.0.1 - pyiceberg>=0.7.1 - pymysql>=1.1.0 - pyqt>=5.15.9 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 9669c1e29a435..deb646a7ba86a 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -40,7 +40,7 @@ dependencies: - pyqt>=5.15.9 - openpyxl>=3.1.2 - psycopg2>=2.9.6 - - pyarrow>=10.0.1 + - pyarrow>=12.0.1 - pyiceberg>=0.7.1 - pymysql>=1.1.0 - pyreadstat>=1.2.6 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index 61f1d602bb241..97b582b80fb8f 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -40,7 +40,7 @@ dependencies: - pyqt>=5.15.9 - openpyxl>=3.1.2 - psycopg2>=2.9.6 - - pyarrow>=10.0.1 + - pyarrow>=12.0.1 - pyiceberg>=0.7.1 - pymysql>=1.1.0 - pyreadstat>=1.2.6 diff --git a/ci/deps/actions-313.yaml b/ci/deps/actions-313.yaml index 11f4428be27e5..4bc363dc4a27e 100644 --- a/ci/deps/actions-313.yaml +++ b/ci/deps/actions-313.yaml @@ -41,7 +41,7 @@ dependencies: - pyqt>=5.15.9 - openpyxl>=3.1.2 - psycopg2>=2.9.6 - - pyarrow>=10.0.1 + - pyarrow>=12.0.1 - pymysql>=1.1.0 - pyreadstat>=1.2.6 - pytables>=3.8.0 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 1589fea5f8953..ed0c8bd05098d 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -307,7 +307,7 @@ Dependency Minimum Version pip ex `PyTables `__ 3.8.0 hdf5 HDF5-based reading / writing `zlib `__ hdf5 Compression for HDF5 `fastparquet `__ 2024.2.0 - Parquet reading / writing (pyarrow is default) -`pyarrow `__ 10.0.1 parquet, feather Parquet, ORC, and feather reading / writing +`pyarrow `__ 12.0.1 parquet, feather Parquet, ORC, and feather reading / writing `PyIceberg `__ 0.7.1 iceberg Apache Iceberg reading / writing `pyreadstat `__ 1.2.6 spss SPSS files (.sav) reading `odfpy `__ 1.4.1 excel Open document format (.odf, .ods, .odt) reading / writing diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 5ff1ea9d194f6..57dce003c2846 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -321,6 +321,8 @@ Optional libraries below the lowest tested version may still work, but are not c +------------------------+---------------------+ | Package | New Minimum Version | +========================+=====================+ +| pyarrow | 12.0.1 | ++------------------------+---------------------+ | pytz | 2023.4 | 
+------------------------+---------------------+ | fastparquet | 2024.2.0 | diff --git a/environment.yml b/environment.yml index b698c4c2ec131..d89a788827109 100644 --- a/environment.yml +++ b/environment.yml @@ -43,7 +43,7 @@ dependencies: - openpyxl>=3.1.2 - odfpy>=1.4.1 - psycopg2>=2.9.6 - - pyarrow>=10.0.1 + - pyarrow>=12.0.1 - pyiceberg>=0.7.1 - pymysql>=1.1.0 - pyreadstat>=1.2.6 diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index ec9b5098c97c9..fc447aaba37db 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -18,7 +18,7 @@ set_locale, ) -from pandas.compat import pa_version_under10p1 +from pandas.compat import HAS_PYARROW import pandas as pd from pandas import ( @@ -183,7 +183,7 @@ ] ] -if not pa_version_under10p1: +if HAS_PYARROW: import pyarrow as pa UNSIGNED_INT_PYARROW_DTYPES = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()] diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 9f3bfdc205498..8ed19f97958b9 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -26,8 +26,7 @@ from pandas.compat.numpy import is_numpy_dev from pandas.compat.pyarrow import ( HAS_PYARROW, - pa_version_under10p1, - pa_version_under11p0, + pa_version_under12p1, pa_version_under13p0, pa_version_under14p0, pa_version_under14p1, @@ -160,8 +159,7 @@ def is_ci_environment() -> bool: "PYPY", "WASM", "is_numpy_dev", - "pa_version_under10p1", - "pa_version_under11p0", + "pa_version_under12p1", "pa_version_under13p0", "pa_version_under14p0", "pa_version_under14p1", diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 07a07ba4ab60c..c2a232d55d8e2 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -38,7 +38,7 @@ "openpyxl": "3.1.2", "psycopg2": "2.9.6", # (dt dec pq3 ext lo64) "pymysql": "1.1.0", - "pyarrow": "10.0.1", + "pyarrow": "12.0.1", "pyiceberg": "0.7.1", "pyreadstat": "1.2.6", "pytest": "7.3.2", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 163934bee509c..569d702592982 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -8,9 +8,7 @@ import pyarrow as pa _palv = Version(Version(pa.__version__).base_version) - pa_version_under10p1 = _palv < Version("10.0.1") - pa_version_under11p0 = _palv < Version("11.0.0") - pa_version_under12p0 = _palv < Version("12.0.0") + pa_version_under12p1 = _palv < Version("12.0.1") pa_version_under13p0 = _palv < Version("13.0.0") pa_version_under14p0 = _palv < Version("14.0.0") pa_version_under14p1 = _palv < Version("14.0.1") @@ -20,11 +18,9 @@ pa_version_under18p0 = _palv < Version("18.0.0") pa_version_under19p0 = _palv < Version("19.0.0") pa_version_under20p0 = _palv < Version("20.0.0") - HAS_PYARROW = True + HAS_PYARROW = _palv >= Version("12.0.1") except ImportError: - pa_version_under10p1 = True - pa_version_under11p0 = True - pa_version_under12p0 = True + pa_version_under12p1 = True pa_version_under13p0 = True pa_version_under14p0 = True pa_version_under14p1 = True diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 1ca52ce64bd77..07cbf489cfe1c 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -12,13 +12,12 @@ from pandas._libs import lib from pandas.compat import ( - pa_version_under10p1, - pa_version_under11p0, + HAS_PYARROW, pa_version_under13p0, pa_version_under17p0, ) -if not pa_version_under10p1: +if HAS_PYARROW: import pyarrow as pa import pyarrow.compute as pc @@ -132,7 
+131,7 @@ def _str_get(self, i: int) -> Self: def _str_slice( self, start: int | None = None, stop: int | None = None, step: int | None = None ) -> Self: - if pa_version_under11p0: + if pa_version_under13p0: # GH#59724 result = self._apply_elementwise(lambda val: val[start:stop:step]) return type(self)(pa.chunked_array(result, type=self._pa_array.type)) diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index b220a94d032b5..7f3da9be0c03d 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -11,14 +11,11 @@ cast, ) -from pandas.compat import ( - pa_version_under10p1, - pa_version_under11p0, -) +from pandas.compat import HAS_PYARROW from pandas.core.dtypes.common import is_list_like -if not pa_version_under10p1: +if HAS_PYARROW: import pyarrow as pa import pyarrow.compute as pc @@ -46,7 +43,7 @@ def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: def _validate(self, data) -> None: dtype = data.dtype - if pa_version_under10p1 or not isinstance(dtype, ArrowDtype): + if not HAS_PYARROW or not isinstance(dtype, ArrowDtype): # Raise AttributeError so that inspect can handle non-struct Series. raise AttributeError(self._validation_msg.format(dtype=dtype)) @@ -171,11 +168,6 @@ def __getitem__(self, key: int | slice) -> Series: name=self._data.name, ) elif isinstance(key, slice): - if pa_version_under11p0: - raise NotImplementedError( - f"List slice not supported by pyarrow {pa.__version__}." - ) - # TODO: Support negative start/stop/step, ideally this would be added # upstream in pyarrow. start, stop, step = key.start, key.stop, key.step diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index c18f06c3a126d..b4e60819b033f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -22,8 +22,8 @@ timezones, ) from pandas.compat import ( - pa_version_under10p1, - pa_version_under11p0, + HAS_PYARROW, + pa_version_under12p1, pa_version_under13p0, ) from pandas.util._decorators import doc @@ -74,7 +74,7 @@ from pandas.io._util import _arrow_dtype_mapping from pandas.tseries.frequencies import to_offset -if not pa_version_under10p1: +if HAS_PYARROW: import pyarrow as pa import pyarrow.compute as pc @@ -208,16 +208,6 @@ def floordiv_compat( from pandas.core.arrays.timedeltas import TimedeltaArray -def get_unit_from_pa_dtype(pa_dtype) -> str: - # https://github.com/pandas-dev/pandas/pull/50998#discussion_r1100344804 - if pa_version_under11p0: - unit = str(pa_dtype).split("[", 1)[-1][:-1] - if unit not in ["s", "ms", "us", "ns"]: - raise ValueError(pa_dtype) - return unit - return pa_dtype.unit - - def to_pyarrow_type( dtype: ArrowDtype | pa.DataType | Dtype | None, ) -> pa.DataType | None: @@ -300,7 +290,7 @@ class ArrowExtensionArray( _dtype: ArrowDtype def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: - if pa_version_under10p1: - msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray." + if pa_version_under12p1: + msg = "pyarrow>=12.0.1 is required for PyArrow backed ArrowExtensionArray."
raise ImportError(msg) if isinstance(values, pa.Array): @@ -1199,10 +1189,6 @@ def factorize( null_encoding = "mask" if use_na_sentinel else "encode" data = self._pa_array - pa_type = data.type - if pa_version_under11p0 and pa.types.is_duration(pa_type): - # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 - data = data.cast(pa.int64()) if pa.types.is_dictionary(data.type): if null_encoding == "encode": @@ -1227,8 +1213,6 @@ def factorize( ) uniques = type(self)(combined.dictionary) - if pa_version_under11p0 and pa.types.is_duration(pa_type): - uniques = cast(ArrowExtensionArray, uniques.astype(self.dtype)) return indices, uniques def reshape(self, *args, **kwargs): @@ -1515,19 +1499,7 @@ def unique(self) -> Self: ------- ArrowExtensionArray """ - pa_type = self._pa_array.type - - if pa_version_under11p0 and pa.types.is_duration(pa_type): - # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 - data = self._pa_array.cast(pa.int64()) - else: - data = self._pa_array - - pa_result = pc.unique(data) - - if pa_version_under11p0 and pa.types.is_duration(pa_type): - pa_result = pa_result.cast(pa_type) - + pa_result = pc.unique(self._pa_array) return type(self)(pa_result) def value_counts(self, dropna: bool = True) -> Series: @@ -1547,18 +1519,12 @@ def value_counts(self, dropna: bool = True) -> Series: -------- Series.value_counts """ - pa_type = self._pa_array.type - if pa_version_under11p0 and pa.types.is_duration(pa_type): - # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 - data = self._pa_array.cast(pa.int64()) - else: - data = self._pa_array - from pandas import ( Index, Series, ) + data = self._pa_array vc = data.value_counts() values = vc.field(0) @@ -1568,9 +1534,6 @@ def value_counts(self, dropna: bool = True) -> Series: values = values.filter(mask) counts = counts.filter(mask) - if pa_version_under11p0 and pa.types.is_duration(pa_type): - values = values.cast(pa_type) - counts = ArrowExtensionArray(counts) index = Index(type(self)(values)) @@ -1864,8 +1827,7 @@ def pyarrow_meth(data, skip_nulls, min_count=0): # type: ignore[misc] if pa.types.is_duration(pa_type): result = result.cast(pa_type) elif pa.types.is_time(pa_type): - unit = get_unit_from_pa_dtype(pa_type) - result = result.cast(pa.duration(unit)) + result = result.cast(pa.duration(pa_type.unit)) elif pa.types.is_date(pa_type): # go with closest available unit, i.e. "s" result = result.cast(pa.duration("s")) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 8048306df91a2..6087e42cf273d 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -25,7 +25,7 @@ from pandas._libs.lib import ensure_string_array from pandas.compat import ( HAS_PYARROW, - pa_version_under10p1, + pa_version_under12p1, ) from pandas.compat.numpy import function as nv from pandas.util._decorators import ( @@ -182,9 +182,9 @@ def __init__( raise ValueError( f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." ) - if storage == "pyarrow" and pa_version_under10p1: + if storage == "pyarrow" and pa_version_under12p1: raise ImportError( - "pyarrow>=10.0.1 is required for PyArrow backed StringArray." + "pyarrow>=12.0.1 is required for PyArrow backed StringArray." 
) if isinstance(na_value, float) and np.isnan(na_value): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 7264efa3298d9..2ca12870709f0 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -14,7 +14,8 @@ missing as libmissing, ) from pandas.compat import ( - pa_version_under10p1, + HAS_PYARROW, + pa_version_under12p1, pa_version_under13p0, pa_version_under16p0, ) @@ -38,7 +39,7 @@ ) from pandas.core.strings.object_array import ObjectStringArrayMixin -if not pa_version_under10p1: +if HAS_PYARROW: import pyarrow as pa import pyarrow.compute as pc @@ -63,8 +64,8 @@ def _chk_pyarrow_available() -> None: - if pa_version_under10p1: - msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray." + if pa_version_under12p1: + msg = "pyarrow>=12.0.1 is required for PyArrow backed ArrowExtensionArray." raise ImportError(msg) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 570074e047da6..3986392774f28 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -46,7 +46,10 @@ abbrev_to_npy_unit, ) from pandas._libs.tslibs.offsets import BDay -from pandas.compat import pa_version_under10p1 +from pandas.compat import ( + HAS_PYARROW, + pa_version_under12p1, +) from pandas.errors import PerformanceWarning from pandas.util._decorators import set_module from pandas.util._exceptions import find_stack_level @@ -66,7 +69,7 @@ is_list_like, ) -if not pa_version_under10p1: +if HAS_PYARROW: import pyarrow as pa if TYPE_CHECKING: @@ -2193,8 +2196,8 @@ class ArrowDtype(StorageExtensionDtype): def __init__(self, pyarrow_dtype: pa.DataType) -> None: super().__init__("pyarrow") - if pa_version_under10p1: - raise ImportError("pyarrow>=10.0.1 is required for ArrowDtype") + if pa_version_under12p1: + raise ImportError("pyarrow>=12.0.1 is required for ArrowDtype") if not isinstance(pyarrow_dtype, pa.DataType): raise ValueError( f"pyarrow_dtype ({pyarrow_dtype}) must be an instance " @@ -2346,7 +2349,7 @@ def construct_from_string(cls, string: str) -> ArrowDtype: if string in ("string[pyarrow]", "str[pyarrow]"): # Ensure Registry.find skips ArrowDtype to use StringDtype instead raise TypeError("string[pyarrow] should be constructed by StringDtype") - if pa_version_under10p1: - raise ImportError("pyarrow>=10.0.1 is required for ArrowDtype") + if pa_version_under12p1: + raise ImportError("pyarrow>=12.0.1 is required for ArrowDtype") base_type = string[:-9] # get rid of "[pyarrow]" diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index d64c7e33657d4..12999a44a446b 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -8,6 +8,7 @@ __all__ = [ "Block", # pyright:ignore[reportUnsupportedDunderAll)] "BlockManager", + "DatetimeTZBlock", # pyright:ignore[reportUnsupportedDunderAll)] "ExtensionBlock", # pyright:ignore[reportUnsupportedDunderAll)] "SingleBlockManager", "concatenate_managers", @@ -36,6 +37,7 @@ def __getattr__(name: str): if name in [ "Block", "ExtensionBlock", + "DatetimeTZBlock", ]: warnings.warn( f"{name} is deprecated and will be removed in a future version. 
" @@ -45,6 +47,10 @@ def __getattr__(name: str): # on hard-coding stacklevel stacklevel=2, ) + if name == "DatetimeTZBlock": + from pandas.core.internals.api import _DatetimeTZBlock as DatetimeTZBlock + + return DatetimeTZBlock if name == "ExtensionBlock": from pandas.core.internals.blocks import ExtensionBlock diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index 04944db2ebd9c..c5d6a2fe7a6a6 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -29,6 +29,7 @@ ) from pandas.core.construction import extract_array from pandas.core.internals.blocks import ( + DatetimeLikeBlock, check_ndim, ensure_block_shape, extract_pandas_array, @@ -74,6 +75,14 @@ def _make_block(values: ArrayLike, placement: np.ndarray) -> Block: return klass(values, ndim=2, placement=placement_obj) +class _DatetimeTZBlock(DatetimeLikeBlock): + """implement a datetime64 block with a tz attribute""" + + values: DatetimeArray + + __slots__ = () + + def make_block( values, placement, klass=None, ndim=None, dtype: Dtype | None = None ) -> Block: @@ -114,6 +123,16 @@ def make_block( dtype = dtype or values.dtype klass = get_block_type(dtype) + elif klass is _DatetimeTZBlock and not isinstance(values.dtype, DatetimeTZDtype): + # pyarrow calls get here (pyarrow<15) + values = DatetimeArray._simple_new( + # error: Argument "dtype" to "_simple_new" of "DatetimeArray" has + # incompatible type "Union[ExtensionDtype, dtype[Any], None]"; + # expected "Union[dtype[datetime64], DatetimeTZDtype]" + values, + dtype=dtype, # type: ignore[arg-type] + ) + if not isinstance(placement, BlockPlacement): placement = BlockPlacement(placement) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 81f7441846589..d1cf1e7504ece 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -305,8 +305,6 @@ def _wrap_result( if isinstance(result.dtype, ArrowDtype): import pyarrow as pa - from pandas.compat import pa_version_under11p0 - from pandas.core.arrays.arrow.array import ArrowExtensionArray value_lengths = pa.compute.list_value_length(result._pa_array) @@ -319,26 +317,14 @@ def _wrap_result( ) if min_len < max_len: # append nulls to each scalar list element up to max_len - if not pa_version_under11p0: - result = ArrowExtensionArray( - pa.compute.list_slice( - result._pa_array, - start=0, - stop=max_len, - return_fixed_size_list=True, - ) + result = ArrowExtensionArray( + pa.compute.list_slice( + result._pa_array, + start=0, + stop=max_len, + return_fixed_size_list=True, ) - else: - all_null = np.full(max_len, fill_value=None, dtype=object) - values = result.to_numpy() - new_values = [] - for row in values: - if len(row) < max_len: - nulls = all_null[: max_len - len(row)] - row = np.append(row, nulls) - new_values.append(row) - pa_type = result._pa_array.type - result = ArrowExtensionArray(pa.array(new_values, type=pa_type)) + ) if name is None: name = range(max_len) result = ( diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index 431309aca0df2..c1d9ac0d1d273 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -1,7 +1,5 @@ import pytest -from pandas.compat.pyarrow import pa_version_under10p1 - from pandas.core.dtypes.dtypes import PeriodDtype import pandas as pd @@ -33,7 +31,6 @@ def test_arrow_extension_type(): assert hash(p1) != hash(p3) -@pytest.mark.xfail(not pa_version_under10p1, reason="Wrong behavior with 
pyarrow 10") @pytest.mark.parametrize( "data, freq", [ @@ -60,9 +57,6 @@ def test_arrow_array(data, freq): with pytest.raises(TypeError, match=msg): pa.array(periods, type="float64") - with pytest.raises(TypeError, match="different 'freq'"): - pa.array(periods, type=ArrowPeriodType("T")) - def test_arrow_array_missing(): from pandas.core.arrays.arrow.extension_types import ArrowPeriodType diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 736c0e1782fc0..96e1cc05e284c 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -12,7 +12,7 @@ from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import ( - pa_version_under12p0, + pa_version_under12p1, pa_version_under19p0, ) import pandas.util._test_decorators as td @@ -600,7 +600,7 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) - if dtype.storage == "pyarrow" and pa_version_under12p0: + if dtype.storage == "pyarrow" and pa_version_under12p1: expected = pa.chunked_array(expected) if dtype.storage == "python": expected = pc.cast(expected, pa.string()) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index e6103da5021bb..2b5f60ce70b4c 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -178,7 +178,7 @@ def test_from_sequence_wrong_dtype_raises(using_infer_string): @td.skip_if_installed("pyarrow") def test_pyarrow_not_installed_raises(): - msg = re.escape("pyarrow>=10.0.1 is required for PyArrow backed") + msg = re.escape("pyarrow>=12.0.1 is required for PyArrow backed") with pytest.raises(ImportError, match=msg): StringDtype(storage="pyarrow") diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 91f5badeb9728..90f662eeec5ca 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -4,7 +4,7 @@ import pytest from pandas.compat import HAS_PYARROW -from pandas.compat.pyarrow import pa_version_under12p0 +from pandas.compat.pyarrow import pa_version_under12p1 from pandas import ( DataFrame, @@ -196,7 +196,7 @@ def test_astype_arrow_timestamp(): ) result = df.astype("timestamp[ns][pyarrow]") assert not result._mgr._has_no_reference(0) - if pa_version_under12p0: + if pa_version_under12p1: assert not np.shares_memory( get_array(df, "a"), get_array(result, "a")._pa_array ) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index e0632722df808..1bec5f7303355 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -39,7 +39,6 @@ PY312, is_ci_environment, is_platform_windows, - pa_version_under11p0, pa_version_under13p0, pa_version_under14p0, pa_version_under19p0, @@ -68,10 +67,7 @@ pa = pytest.importorskip("pyarrow") -from pandas.core.arrays.arrow.array import ( - ArrowExtensionArray, - get_unit_from_pa_dtype, -) +from pandas.core.arrays.arrow.array import ArrowExtensionArray from pandas.core.arrays.arrow.extension_types import ArrowPeriodType @@ -353,15 +349,6 @@ def test_from_sequence_of_strings_pa_array(self, data, request): reason="Nanosecond time parsing not supported.", ) ) - elif pa_version_under11p0 and ( - pa.types.is_duration(pa_dtype) or pa.types.is_decimal(pa_dtype) - ): - request.applymarker( - pytest.mark.xfail( - 
raises=pa.ArrowNotImplementedError, - reason=f"pyarrow doesn't support parsing {pa_dtype}", - ) - ) elif pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None: _require_timezone_database(request) @@ -549,8 +536,7 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): elif pa.types.is_date(pa_type): cmp_dtype = ArrowDtype(pa.duration("s")) elif pa.types.is_time(pa_type): - unit = get_unit_from_pa_dtype(pa_type) - cmp_dtype = ArrowDtype(pa.duration(unit)) + cmp_dtype = ArrowDtype(pa.duration(pa_type.unit)) else: cmp_dtype = ArrowDtype(pa.duration(pa_type.unit)) else: @@ -3288,9 +3274,6 @@ def test_pow_missing_operand(): tm.assert_series_equal(result, expected) -@pytest.mark.skipif( - pa_version_under11p0, reason="Decimal128 to string cast implemented in pyarrow 11" -) def test_decimal_parse_raises(): # GH 56984 ser = pd.Series(["1.2345"], dtype=ArrowDtype(pa.string())) @@ -3300,9 +3283,6 @@ def test_decimal_parse_raises(): ser.astype(ArrowDtype(pa.decimal128(1, 0))) -@pytest.mark.skipif( - pa_version_under11p0, reason="Decimal128 to string cast implemented in pyarrow 11" -) def test_decimal_parse_succeeds(): # GH 56984 ser = pd.Series(["1.2345"], dtype=ArrowDtype(pa.string())) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 8c4ab42b7be7a..724ee0489f0a0 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas.compat.pyarrow import pa_version_under10p1 +import pandas.util._test_decorators as td from pandas.core.dtypes.missing import na_value_for_dtype @@ -411,12 +411,7 @@ def test_groupby_drop_nan_with_multi_index(): "Float64", "category", "string", - pytest.param( - "string[pyarrow]", - marks=pytest.mark.skipif( - pa_version_under10p1, reason="pyarrow is not installed" - ), - ), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), "datetime64[ns]", "period[D]", "Sparse[float]", diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index a41d7dec8b496..bf746a9eaa976 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -282,7 +282,7 @@ def test_empty_pyarrow(data): def test_multi_chunk_pyarrow() -> None: - pa = pytest.importorskip("pyarrow", "11.0.0") + pa = pytest.importorskip("pyarrow", "14.0.0") n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) names = ["n_legs"] table = pa.table([n_legs], names=names) @@ -488,7 +488,7 @@ def test_pandas_nullable_with_missing_values( ) -> None: # https://github.com/pandas-dev/pandas/issues/57643 # https://github.com/pandas-dev/pandas/issues/57664 - pa = pytest.importorskip("pyarrow", "11.0.0") + pa = pytest.importorskip("pyarrow", "14.0.0") import pyarrow.interchange as pai if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]": @@ -554,7 +554,7 @@ def test_pandas_nullable_without_missing_values( data: list, dtype: str, expected_dtype: str ) -> None: # https://github.com/pandas-dev/pandas/issues/57643 - pa = pytest.importorskip("pyarrow", "11.0.0") + pa = pytest.importorskip("pyarrow", "14.0.0") import pyarrow.interchange as pai if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]": diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index fefed34894cf3..9f9304c8d1664 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -13,7 +13,6 @@ from pandas.compat import is_platform_windows from 
pandas.compat.pyarrow import ( - pa_version_under11p0, pa_version_under13p0, pa_version_under15p0, pa_version_under17p0, @@ -729,7 +728,7 @@ def test_to_bytes_without_path_or_buf_provided(self, pa, df_full): expected = df_full.copy() expected.loc[1, "string_with_nan"] = None - if pa_version_under11p0: + if pa_version_under13p0: expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( "M8[ns]" ) @@ -980,15 +979,12 @@ def test_additional_extension_types(self, pa): def test_timestamp_nanoseconds(self, pa): # with version 2.6, pyarrow defaults to writing the nanoseconds, so - # this should work without error - # Note in previous pyarrows(<7.0.0), only the pseudo-version 2.0 was available + # this should work without error, even for pyarrow < 13 ver = "2.6" df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1ns", periods=10)}) check_round_trip(df, pa, write_kwargs={"version": ver}) def test_timezone_aware_index(self, pa, timezone_aware_date_list): - pytest.importorskip("pyarrow", "11.0.0") - idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) @@ -1003,7 +999,7 @@ def test_timezone_aware_index(self, pa, timezone_aware_date_list): # this use-case sets the resolution to 1 minute expected = df[:] - if pa_version_under11p0: + if pa_version_under13p0: expected.index = expected.index.as_unit("ns") if timezone_aware_date_list.tzinfo != datetime.timezone.utc: # pyarrow returns pytz.FixedOffset while pandas constructs datetime.timezone @@ -1140,7 +1136,6 @@ def test_string_inference(self, tmp_path, pa, using_infer_string): ) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif(pa_version_under11p0, reason="not supported before 11.0") def test_roundtrip_decimal(self, tmp_path, pa): # GH#54768 import pyarrow as pa @@ -1189,7 +1184,7 @@ def test_infer_string_large_string_type(self, tmp_path, pa): def test_non_nanosecond_timestamps(self, temp_file): # GH#49236 - pa = pytest.importorskip("pyarrow", "11.0.0") + pa = pytest.importorskip("pyarrow", "13.0.0") pq = pytest.importorskip("pyarrow.parquet") arr = pa.array([datetime.datetime(1600, 1, 1)], type=pa.timestamp("us")) diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py index bec8ca13a2f5f..3541592e7c51e 100644 --- a/pandas/tests/series/accessors/test_list_accessor.py +++ b/pandas/tests/series/accessors/test_list_accessor.py @@ -10,8 +10,6 @@ pa = pytest.importorskip("pyarrow") -from pandas.compat import pa_version_under11p0 - @pytest.mark.parametrize( "list_dtype", @@ -57,20 +55,14 @@ def test_list_getitem_slice(): index=[1, 3, 7], name="a", ) - if pa_version_under11p0: - with pytest.raises( - NotImplementedError, match="List slice not supported by pyarrow " - ): - ser.list[1:None:None] - else: - actual = ser.list[1:None:None] - expected = Series( - [[2, 3], [None, 5], None], - dtype=ArrowDtype(pa.list_(pa.int64())), - index=[1, 3, 7], - name="a", - ) - tm.assert_series_equal(actual, expected) + actual = ser.list[1:None:None] + expected = Series( + [[2, 3], [None, 5], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + index=[1, 3, 7], + name="a", + ) + tm.assert_series_equal(actual, expected) def test_list_len(): @@ -105,14 +97,8 @@ def test_list_getitem_slice_invalid(): [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), ) - if pa_version_under11p0: - with pytest.raises( - NotImplementedError, match="List slice not supported by pyarrow " - ): - ser.list[1:None:0] - else: - with 
pytest.raises(pa.lib.ArrowInvalid, match=re.escape("`step` must be >= 1")): - ser.list[1:None:0] + with pytest.raises(pa.lib.ArrowInvalid, match=re.escape("`step` must be >= 1")): + ser.list[1:None:0] def test_list_accessor_non_list_dtype(): diff --git a/pandas/tests/series/accessors/test_struct_accessor.py b/pandas/tests/series/accessors/test_struct_accessor.py index 80aea75fda406..c1ef1b14ec3d0 100644 --- a/pandas/tests/series/accessors/test_struct_accessor.py +++ b/pandas/tests/series/accessors/test_struct_accessor.py @@ -2,10 +2,7 @@ import pytest -from pandas.compat.pyarrow import ( - pa_version_under11p0, - pa_version_under13p0, -) +from pandas.compat.pyarrow import pa_version_under13p0 from pandas import ( ArrowDtype, @@ -105,7 +102,6 @@ def test_struct_accessor_field_with_invalid_name_or_index(): ser.struct.field(1.1) -@pytest.mark.skipif(pa_version_under11p0, reason="pyarrow>=11.0.0 required") def test_struct_accessor_explode(): index = Index([-100, 42, 123]) ser = Series( diff --git a/pyproject.toml b/pyproject.toml index b17a1eacfa717..7582e2bce3879 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,15 +59,15 @@ matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] test = ['hypothesis>=6.84.0', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0'] -pyarrow = ['pyarrow>=10.0.1'] +pyarrow = ['pyarrow>=12.0.1'] performance = ['bottleneck>=1.3.6', 'numba>=0.59.0', 'numexpr>=2.9.0'] computation = ['scipy>=1.12.0', 'xarray>=2024.1.1'] fss = ['fsspec>=2023.12.2'] aws = ['s3fs>=2023.12.2'] gcp = ['gcsfs>=2023.12.2'] excel = ['odfpy>=1.4.1', 'openpyxl>=3.1.2', 'python-calamine>=0.1.7', 'pyxlsb>=1.0.10', 'xlrd>=2.0.1', 'xlsxwriter>=3.2.0'] -parquet = ['pyarrow>=10.0.1'] -feather = ['pyarrow>=10.0.1'] +parquet = ['pyarrow>=12.0.1'] +feather = ['pyarrow>=12.0.1'] iceberg = ['pyiceberg>=0.7.1'] hdf5 = ['tables>=3.8.0'] spss = ['pyreadstat>=1.2.6'] @@ -98,7 +98,7 @@ all = ['adbc-driver-postgresql>=0.10.0', 'odfpy>=1.4.1', 'openpyxl>=3.1.2', 'psycopg2>=2.9.6', - 'pyarrow>=10.0.1', + 'pyarrow>=12.0.1', 'pyiceberg>=0.7.1', 'pymysql>=1.1.0', 'PyQt5>=5.15.9', diff --git a/requirements-dev.txt b/requirements-dev.txt index 64a9ecdacfb45..b0f8819befbe9 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -32,7 +32,7 @@ numexpr>=2.9.0 openpyxl>=3.1.2 odfpy>=1.4.1 psycopg2-binary>=2.9.6 -pyarrow>=10.0.1 +pyarrow>=12.0.1 pyiceberg>=0.7.1 pymysql>=1.1.0 pyreadstat>=1.2.6 diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index d804e15f6d48f..8475747a80367 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -53,6 +53,7 @@ "_get_option", "_fill_limit_area_1d", "_make_block", + "_DatetimeTZBlock", }
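
With the floor at 12.0.1, HAS_PYARROW and pa_version_under12p1 become the two canonical gates exported by pandas.compat, and the retired pa_version_under10p1/11p0/12p0 flags have no remaining users. The sketch below (not part of the patch) shows how a caller is expected to consume them after this change; the require_pyarrow helper is hypothetical and simply mirrors _chk_pyarrow_available() from pandas/core/arrays/string_arrow.py.

    from pandas.compat import HAS_PYARROW, pa_version_under12p1

    if HAS_PYARROW:
        # HAS_PYARROW is now True only when an importable pyarrow is >= 12.0.1.
        import pyarrow as pa


    def require_pyarrow() -> None:
        # Hypothetical helper mirroring _chk_pyarrow_available().
        if pa_version_under12p1:
            raise ImportError(
                "pyarrow>=12.0.1 is required for PyArrow backed ArrowExtensionArray."
            )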
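
get_unit_from_pa_dtype() existed only because pyarrow < 11 temporal types had no usable .unit attribute; with 12.0.1 guaranteed, the reduction path can read pa_type.unit directly. A quick sanity check of that assumption on a current pyarrow:

    import pyarrow as pa

    # Temporal dtypes expose .unit directly on pyarrow >= 11, so the
    # string-parsing fallback the deleted helper carried is dead code now.
    for typ in (pa.time32("s"), pa.time64("us"), pa.duration("ns"), pa.timestamp("ms")):
        print(typ, "->", typ.unit)            # e.g. time64[us] -> us
    print(pa.duration(pa.time64("us").unit))  # duration[us], as used in the reductions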
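
The list accessor changes assume pyarrow.compute.list_slice is always available, which holds for any pyarrow >= 11 and therefore for the new minimum, so the NotImplementedError branch and its test go away. A usage sketch matching the expectations in the updated test_list_getitem_slice:

    import pandas as pd
    import pyarrow as pa

    ser = pd.Series(
        [[1, 2, 3], [4, None, 5], None],
        dtype=pd.ArrowDtype(pa.list_(pa.int64())),
    )
    # Slicing now always goes through pyarrow.compute.list_slice.
    print(ser.list[1:None:None])  # [[2, 3], [None, 5], None] with list[int64] dtype
    print(ser.list.len())         # 3, 3, <NA>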
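
The simplified Series.str path in _wrap_result leans on list_slice(return_fixed_size_list=True) to null-pad ragged lists up to max_len in a single kernel call, replacing the removed per-row Python loop. A small demonstration of that kernel, assuming the padding semantics are exactly what the deleted fallback emulated:

    import pyarrow as pa
    import pyarrow.compute as pc

    ragged = pa.chunked_array([[[1], [2, 3], [4, 5, 6]]])
    padded = pc.list_slice(ragged, start=0, stop=3, return_fixed_size_list=True)
    print(padded.type)         # fixed_size_list<item: int64>[3]
    print(padded.to_pylist())  # [[1, None, None], [2, 3, None], [4, 5, 6]]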
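
The _DatetimeTZBlock shim and the new "DatetimeTZBlock" entry in pandas.core.internals.__getattr__ are kept because pyarrow < 15 still requests that class when materializing tz-aware columns in Table.to_pandas(); the name keeps resolving, but only through the deprecation path. A sketch of the access pattern being preserved (the warning text comes from the hunk above; the exact warning category is not visible in this diff):

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # Resolved lazily via pandas.core.internals.__getattr__, which returns
        # the _DatetimeTZBlock shim defined in pandas/core/internals/api.py.
        from pandas.core.internals import DatetimeTZBlock  # noqa: F401

    assert any("deprecated" in str(w.message) for w in caught)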