diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index dc05745c8c0e5..5450fdf6b1923 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -49,6 +49,16 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following * :func:`read_feather` * :func:`to_numeric` +To simplify opting-in to nullable dtypes for these functions, a new option ``nullable_dtypes`` was added that allows setting +the keyword argument globally to ``True`` if not specified directly. The option can be enabled +through: + +.. ipython:: python + + pd.options.mode.nullable_dtypes = True + +The option will only work for functions with the keyword ``use_nullable_dtypes``. + Additionally a new global configuration, ``mode.dtype_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions to select the nullable dtypes implementation. diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index 5219abc697dbd..d12dd3b4cb8aa 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -33,3 +33,8 @@ def using_copy_on_write(): _mode_options = _global_config["mode"] return _mode_options["copy_on_write"] and _mode_options["data_manager"] == "block" + + +def using_nullable_dtypes(): + _mode_options = _global_config["mode"] + return _mode_options["nullable_dtypes"] diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index da9e7de9821b1..2e1ddb3c0a628 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -560,6 +560,22 @@ def use_inf_as_na_cb(key) -> None: validator=is_one_of_factory(["pandas", "pyarrow"]), ) + +nullable_dtypes_doc = """ +: bool + If nullable dtypes should be returned. This is only applicable to functions + where the ``use_nullable_dtypes`` keyword is implemented. 
+""" + +with cf.config_prefix("mode"): + cf.register_option( + "nullable_dtypes", + False, + nullable_dtypes_doc, + validator=is_bool, + ) + + # Set up the io.excel specific reader configuration. reader_engine_doc = """ : string diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 64bb34241d956..a2bebc7dd3217 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -4,7 +4,10 @@ import numpy as np -from pandas._config import get_option +from pandas._config import ( + get_option, + using_nullable_dtypes, +) from pandas._libs import lib from pandas._typing import ( @@ -38,7 +41,7 @@ def to_numeric( arg, errors: DateTimeErrorChoices = "raise", downcast: Literal["integer", "signed", "unsigned", "float"] | None = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ): """ Convert argument to a numeric type. @@ -157,6 +160,12 @@ def to_numeric( if errors not in ("ignore", "raise", "coerce"): raise ValueError("invalid error value specified") + _use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.no_default + else using_nullable_dtypes() + ) + is_series = False is_index = False is_scalars = False @@ -204,11 +213,11 @@ def to_numeric( values = ensure_object(values) coerce_numeric = errors not in ("ignore", "raise") try: - values, new_mask = lib.maybe_convert_numeric( # type: ignore[call-overload] + values, new_mask = lib.maybe_convert_numeric( values, set(), coerce_numeric=coerce_numeric, - convert_to_masked_nullable=use_nullable_dtypes, + convert_to_masked_nullable=_use_nullable_dtypes, ) except (ValueError, TypeError): if errors == "raise": @@ -218,7 +227,7 @@ def to_numeric( # Remove unnecessary values, is expected later anyway and enables # downcasting values = values[~new_mask] - elif use_nullable_dtypes and new_mask is None: + elif _use_nullable_dtypes and new_mask is None: new_mask = np.zeros(values.shape, dtype=np.bool_) # attempt 
downcast only if the data has been successfully converted diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 44bee11518cd3..0ba3846f415ad 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -4,6 +4,9 @@ from io import StringIO import warnings +from pandas._config import using_nullable_dtypes + +from pandas._libs import lib from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.generic import ABCDataFrame @@ -15,7 +18,9 @@ def read_clipboard( - sep: str = r"\s+", use_nullable_dtypes: bool = False, **kwargs + sep: str = r"\s+", + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + **kwargs, ): # pragma: no cover r""" Read text from clipboard and pass to read_csv. @@ -56,6 +61,12 @@ def read_clipboard( if encoding is not None and encoding.lower().replace("-", "") != "utf8": raise NotImplementedError("reading from clipboard only supports utf-8 encoding") + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.no_default + else using_nullable_dtypes() + ) + from pandas.io.clipboard import clipboard_get from pandas.io.parsers import read_csv diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 6f706a4554855..d44bdc466aed9 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -23,8 +23,12 @@ ) import zipfile -from pandas._config import config +from pandas._config import ( + config, + using_nullable_dtypes, +) +from pandas._libs import lib from pandas._libs.parsers import STR_NA_VALUES from pandas._typing import ( DtypeArg, @@ -380,7 +384,7 @@ def read_excel( comment: str | None = ..., skipfooter: int = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> DataFrame: ... 
@@ -419,7 +423,7 @@ def read_excel( comment: str | None = ..., skipfooter: int = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> dict[IntStrT, DataFrame]: ... @@ -458,7 +462,7 @@ def read_excel( comment: str | None = None, skipfooter: int = 0, storage_options: StorageOptions = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | dict[IntStrT, DataFrame]: should_close = False @@ -471,6 +475,12 @@ def read_excel( "an ExcelFile - ExcelFile already has the engine set" ) + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.no_default + else using_nullable_dtypes() + ) + try: data = io.parse( sheet_name=sheet_name, diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index cb2890777621a..136f49fef156e 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -6,6 +6,9 @@ Sequence, ) +from pandas._config import using_nullable_dtypes + +from pandas._libs import lib from pandas._typing import ( FilePath, ReadBuffer, @@ -103,7 +106,7 @@ def read_feather( columns: Sequence[Hashable] | None = None, use_threads: bool = True, storage_options: StorageOptions = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ): """ Load a feather-format object from the file path. 
@@ -143,6 +146,12 @@ def read_feather( import_optional_dependency("pyarrow") from pyarrow import feather + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.no_default + else using_nullable_dtypes() + ) + with get_handle( path, "rb", storage_options=storage_options, is_text=False ) as handles: diff --git a/pandas/io/html.py b/pandas/io/html.py index f025e12bd0f55..dd1f7ef239f73 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -18,6 +18,9 @@ cast, ) +from pandas._config import using_nullable_dtypes + +from pandas._libs import lib from pandas._typing import ( BaseBuffer, FilePath, @@ -1036,7 +1039,7 @@ def read_html( keep_default_na: bool = True, displayed_only: bool = True, extract_links: Literal[None, "header", "footer", "body", "all"] = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ) -> list[DataFrame]: r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. @@ -1206,6 +1209,12 @@ def read_html( ) validate_header_arg(header) + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.no_default + else using_nullable_dtypes() + ) + io = stringify_path(io) return _parse( diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index aa1342d0f135f..afb0be0729344 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -21,6 +21,9 @@ import numpy as np +from pandas._config import using_nullable_dtypes + +from pandas._libs import lib from pandas._libs.json import ( dumps, loads, @@ -496,7 +499,7 @@ def read_json( compression: CompressionOptions = "infer", nrows: int | None = None, storage_options: StorageOptions = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | Series | JsonReader: """ Convert a JSON string to pandas object. 
@@ -732,6 +735,12 @@ def read_json( if orient == "table" and convert_axes: raise ValueError("cannot pass both convert_axes and orient='table'") + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.no_default + else using_nullable_dtypes() + ) + if dtype is None and orient != "table": # error: Incompatible types in assignment (expression has type "bool", variable # has type "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float], diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 169cb5d16da8d..ccc7afe7ee0f7 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -8,8 +8,12 @@ Literal, ) -from pandas._config import get_option +from pandas._config import ( + get_option, + using_nullable_dtypes, +) +from pandas._libs import lib from pandas._typing import ( FilePath, ReadBuffer, @@ -33,7 +37,7 @@ def read_orc( path: FilePath | ReadBuffer[bytes], columns: list[str] | None = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, **kwargs, ) -> DataFrame: """ @@ -86,6 +90,12 @@ def read_orc( orc = import_optional_dependency("pyarrow.orc") + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.no_default + else using_nullable_dtypes() + ) + with get_handle(path, "rb", is_text=False) as handles: orc_file = orc.ORCFile(handles.handle) pa_table = orc_file.read(columns=columns, **kwargs) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index cb66d1a422811..2a33ec8969838 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -9,6 +9,9 @@ ) from warnings import catch_warnings +from pandas._config import using_nullable_dtypes + +from pandas._libs import lib from pandas._typing import ( FilePath, ReadBuffer, @@ -453,7 +456,7 @@ def read_parquet( engine: str = "auto", columns: list[str] | None = None, storage_options: StorageOptions = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, 
**kwargs, ) -> DataFrame: """ @@ -511,6 +514,12 @@ def read_parquet( """ impl = get_engine(engine) + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.no_default + else using_nullable_dtypes() + ) + return impl.read( path, columns=columns, diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index b97c0161958fa..410b4fc0bf9c0 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -24,7 +24,10 @@ import numpy as np -from pandas._config import get_option +from pandas._config import ( + get_option, + using_nullable_dtypes, +) from pandas._libs import lib from pandas._libs.parsers import STR_NA_VALUES @@ -639,7 +642,7 @@ def read_csv( memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> TextFileReader: ... @@ -695,7 +698,7 @@ def read_csv( memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> TextFileReader: ... @@ -751,7 +754,7 @@ def read_csv( memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> DataFrame: ... @@ -807,7 +810,7 @@ def read_csv( memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> DataFrame | TextFileReader: ... 
@@ -879,7 +882,7 @@ def read_csv( memory_map: bool = False, float_precision: Literal["high", "legacy"] | None = None, storage_options: StorageOptions = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: if infer_datetime_format is not lib.no_default: warnings.warn( @@ -904,6 +907,7 @@ def read_csv( on_bad_lines, names, defaults={"delimiter": ","}, + use_nullable_dtypes=use_nullable_dtypes, ) kwds.update(kwds_defaults) @@ -961,7 +965,7 @@ def read_table( memory_map: bool = ..., float_precision: str | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> TextFileReader: ... @@ -1017,7 +1021,7 @@ def read_table( memory_map: bool = ..., float_precision: str | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> TextFileReader: ... @@ -1073,7 +1077,7 @@ def read_table( memory_map: bool = ..., float_precision: str | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> DataFrame: ... @@ -1129,7 +1133,7 @@ def read_table( memory_map: bool = ..., float_precision: str | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> DataFrame | TextFileReader: ... 
@@ -1201,7 +1205,7 @@ def read_table( memory_map: bool = False, float_precision: str | None = None, storage_options: StorageOptions = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: # locals() should never be modified kwds = locals().copy() @@ -1217,6 +1221,7 @@ def read_table( on_bad_lines, names, defaults={"delimiter": "\t"}, + use_nullable_dtypes=use_nullable_dtypes, ) kwds.update(kwds_defaults) @@ -1229,7 +1234,7 @@ def read_fwf( colspecs: Sequence[tuple[int, int]] | str | None = "infer", widths: Sequence[int] | None = None, infer_nrows: int = 100, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, **kwds, ) -> DataFrame | TextFileReader: r""" @@ -1292,6 +1297,12 @@ def read_fwf( if colspecs not in (None, "infer") and widths is not None: raise ValueError("You must specify only one of 'widths' and 'colspecs'") + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.no_default + else using_nullable_dtypes() + ) + # Compute 'colspecs' from 'widths', if specified. if widths is not None: colspecs, col = [], 0 @@ -1858,6 +1869,7 @@ def _refine_defaults_read( on_bad_lines: str | Callable, names: Sequence[Hashable] | None | lib.NoDefault, defaults: dict[str, Any], + use_nullable_dtypes: bool | lib.NoDefault, ): """Validate/refine default values of input parameters of read_csv, read_table. 
@@ -1971,6 +1983,13 @@ def _refine_defaults_read( else: raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.no_default + else using_nullable_dtypes() + ) + kwds["use_nullable_dtypes"] = use_nullable_dtypes + return kwds diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 8acc57685dc16..8ba208aa84286 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -29,6 +29,8 @@ import numpy as np +from pandas._config import using_nullable_dtypes + from pandas._libs import lib from pandas._typing import ( DateTimeErrorChoices, @@ -230,7 +232,7 @@ def read_sql_table( parse_dates: list[str] | dict[str, str] | None = ..., columns: list[str] | None = ..., chunksize: None = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> DataFrame: ... @@ -245,7 +247,7 @@ def read_sql_table( parse_dates: list[str] | dict[str, str] | None = ..., columns: list[str] | None = ..., chunksize: int = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> Iterator[DataFrame]: ... @@ -259,7 +261,7 @@ def read_sql_table( parse_dates: list[str] | dict[str, str] | None = None, columns: list[str] | None = None, chunksize: int | None = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | Iterator[DataFrame]: """ Read SQL database table into a DataFrame. 
@@ -322,6 +324,12 @@ def read_sql_table( -------- >>> pd.read_sql_table('table_name', 'postgres:///db_name') # doctest:+SKIP """ + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.no_default + else using_nullable_dtypes() + ) + with pandasSQL_builder(con, schema=schema) as pandas_sql: if not pandas_sql.has_table(table_name): raise ValueError(f"Table {table_name} not found") @@ -352,7 +360,7 @@ def read_sql_query( parse_dates: list[str] | dict[str, str] | None = ..., chunksize: None = ..., dtype: DtypeArg | None = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> DataFrame: ... @@ -367,7 +375,7 @@ def read_sql_query( parse_dates: list[str] | dict[str, str] | None = ..., chunksize: int = ..., dtype: DtypeArg | None = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., ) -> Iterator[DataFrame]: ... @@ -381,7 +389,7 @@ def read_sql_query( parse_dates: list[str] | dict[str, str] | None = None, chunksize: int | None = None, dtype: DtypeArg | None = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | Iterator[DataFrame]: """ Read SQL query into a DataFrame. @@ -446,6 +454,12 @@ def read_sql_query( Any datetime values with time zone information parsed via the `parse_dates` parameter will be converted to UTC. """ + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.no_default + else using_nullable_dtypes() + ) + with pandasSQL_builder(con) as pandas_sql: return pandas_sql.read_query( sql, @@ -469,7 +483,7 @@ def read_sql( parse_dates=..., columns: list[str] = ..., chunksize: None = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., dtype: DtypeArg | None = None, ) -> DataFrame: ... 
@@ -485,7 +499,7 @@ def read_sql( parse_dates=..., columns: list[str] = ..., chunksize: int = ..., - use_nullable_dtypes: bool = ..., + use_nullable_dtypes: bool | lib.NoDefault = ..., dtype: DtypeArg | None = None, ) -> Iterator[DataFrame]: ... @@ -500,7 +514,7 @@ def read_sql( parse_dates=None, columns: list[str] | None = None, chunksize: int | None = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, dtype: DtypeArg | None = None, ) -> DataFrame | Iterator[DataFrame]: """ @@ -630,6 +644,12 @@ def read_sql( 0 0 2012-11-10 1 1 2010-11-12 """ + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.no_default + else using_nullable_dtypes() + ) + with pandasSQL_builder(con) as pandas_sql: if isinstance(pandas_sql, SQLiteDatabase): diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 1b32cf252d315..4e6afbf4f4164 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -11,6 +11,9 @@ Sequence, ) +from pandas._config import using_nullable_dtypes + +from pandas._libs import lib from pandas._typing import ( TYPE_CHECKING, CompressionOptions, @@ -868,7 +871,7 @@ def read_xml( iterparse: dict[str, list[str]] | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, ) -> DataFrame: r""" Read XML document into a ``DataFrame`` object. 
@@ -1110,6 +1113,12 @@ def read_xml( 2 triangle 180 3.0 """ + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.no_default + else using_nullable_dtypes() + ) + return _parse( path_or_buffer=path_or_buffer, xpath=xpath, diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 5899125ca2904..f194cadbc73d8 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -540,7 +540,8 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): "dtype_backend", ["pandas", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow"))], ) - def test_use_nullable_dtypes(self, read_ext, dtype_backend): + @pytest.mark.parametrize("option", [True, False]) + def test_use_nullable_dtypes(self, read_ext, dtype_backend, option): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") @@ -562,9 +563,13 @@ def test_use_nullable_dtypes(self, read_ext, dtype_backend): with tm.ensure_clean(read_ext) as file_path: df.to_excel(file_path, "test", index=False) with pd.option_context("mode.dtype_backend", dtype_backend): - result = pd.read_excel( - file_path, sheet_name="test", use_nullable_dtypes=True - ) + if not option: + result = pd.read_excel( + file_path, sheet_name="test", use_nullable_dtypes=True + ) + else: + with pd.option_context("mode.nullable_dtypes", True): + result = pd.read_excel(file_path, sheet_name="test") if dtype_backend == "pyarrow": import pyarrow as pa diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 31566f67bef2c..7b473a56aa200 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1873,7 +1873,8 @@ def test_json_uint64(self): @pytest.mark.parametrize( "orient", ["split", "records", "values", "index", "columns"] ) - def test_read_json_nullable(self, string_storage, dtype_backend, orient): + @pytest.mark.parametrize("option", [True, False]) + 
def test_read_json_nullable(self, string_storage, dtype_backend, orient, option): # GH#50750 pa = pytest.importorskip("pyarrow") df = DataFrame( @@ -1900,7 +1901,11 @@ def test_read_json_nullable(self, string_storage, dtype_backend, orient): out = df.to_json(orient=orient) with pd.option_context("mode.string_storage", string_storage): with pd.option_context("mode.dtype_backend", dtype_backend): - result = read_json(out, use_nullable_dtypes=True, orient=orient) + if option: + with pd.option_context("mode.nullable_dtypes", option): + result = read_json(out, orient=orient) + else: + result = read_json(out, use_nullable_dtypes=True, orient=orient) expected = DataFrame( { diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 52b142d81cd5e..ca12b1ce4b967 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -530,6 +530,22 @@ def test_use_nullable_dtypes_pyarrow_backend(all_parsers, request): tm.assert_frame_equal(result, expected) +@pytest.mark.usefixtures("pyarrow_xfail") +def test_use_nullable_dtypes_option(all_parsers): + # GH#50748 + + parser = all_parsers + + data = """a +1 +3 +""" + with pd.option_context("mode.nullable_dtypes", True): + result = parser.read_csv(StringIO(data)) + expected = DataFrame({"a": pd.Series([1, 3], dtype="Int64")}) + tm.assert_frame_equal(result, expected) + + def test_ea_int_avoid_overflow(all_parsers): # GH#32134 parser = all_parsers diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index f4320f6480517..434e617ff05f9 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -994,3 +994,16 @@ def test_use_nullable_dtypes(string_storage, dtype_backend): expected["i"] = ArrowExtensionArray(pa.array([None, None])) tm.assert_frame_equal(result, expected) + + +def test_use_nullable_dtypes_option(): + # GH#50748 + + data 
= """a +1 +3""" + with pd.option_context("mode.nullable_dtypes", True): + result = read_fwf(StringIO(data)) + + expected = DataFrame({"a": pd.Series([1, 3], dtype="Int64")}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index ae9c5aacf6e6b..5e4b2c1ebad9d 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -466,3 +466,20 @@ def test_read_clipboard_nullable_dtypes( expected["g"] = ArrowExtensionArray(pa.array([None, None])) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_read_clipboard_nullable_dtypes_option( + self, request, mock_clipboard, engine + ): + # GH#50748 + + text = """a +1 +2""" + mock_clipboard[request.node.name] = text + + with pd.option_context("mode.nullable_dtypes", True): + result = read_clipboard(sep=",", engine=engine) + + expected = DataFrame({"a": Series([1, 2], dtype="Int64")}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 28a6054098a6f..7e07ad0ec2ad3 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -200,7 +200,8 @@ def test_http_path(self, feather_file): tm.assert_frame_equal(expected, res) @pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"]) - def test_read_json_nullable(self, string_storage, dtype_backend): + @pytest.mark.parametrize("option", [True, False]) + def test_read_json_nullable(self, string_storage, dtype_backend, option): # GH#50765 pa = pytest.importorskip("pyarrow") df = pd.DataFrame( @@ -228,7 +229,11 @@ def test_read_json_nullable(self, string_storage, dtype_backend): to_feather(df, path) with pd.option_context("mode.string_storage", string_storage): with pd.option_context("mode.dtype_backend", dtype_backend): - result = read_feather(path, use_nullable_dtypes=True) + if option: + with pd.option_context("mode.nullable_dtypes", 
option): + result = read_feather(path) + else: + result = read_feather(path, use_nullable_dtypes=True) expected = pd.DataFrame( { diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index f8284b5ab1c65..de36548f08a12 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -196,6 +196,17 @@ def test_use_nullable_dtypes(self, storage, dtype_backend): tm.assert_frame_equal(result, expected) + def test_use_nullable_dtypes_option(self): + # GH#50748 + df = DataFrame({"a": Series([1, np.nan, 3], dtype="Int64")}) + + out = df.to_html(index=False) + with pd.option_context("mode.nullable_dtypes", True): + result = self.read_html(out)[0] + + expected = DataFrame({"a": Series([1, np.nan, 3], dtype="Int64")}) + tm.assert_frame_equal(result, expected) + @pytest.mark.network @tm.network( url=( diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index a519d9536eb32..2a95240a5f83d 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -383,3 +383,16 @@ def test_orc_use_nullable_dtypes_pandas_backend(): ) tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("pyarrow", min_version="7.0.0") +def test_orc_use_nullable_dtypes_option(): + # GH#50748 + df = pd.DataFrame({"int": list(range(1, 4))}) + + bytes_data = df.copy().to_orc() + with pd.option_context("mode.nullable_dtypes", True): + result = read_orc(BytesIO(bytes_data)) + + expected = pd.DataFrame({"int": pd.Series([1, 2, 3], dtype="Int64")}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index b5841593d4f45..4c884e20cf423 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -640,6 +640,28 @@ def test_use_nullable_dtypes(self, engine, request): expected = expected.drop("c", axis=1) tm.assert_frame_equal(result2, expected) + def test_use_nullable_dtypes_option(self, engine, request): + # GH#50748 + import pyarrow.parquet as pq + + if 
engine == "fastparquet": + # We are manually disabling fastparquet's + # nullable dtype support pending discussion + mark = pytest.mark.xfail( + reason="Fastparquet nullable dtype support is disabled" + ) + request.node.add_marker(mark) + + table = pyarrow.table({"a": pyarrow.array([1, 2, 3, None], "int64")}) + with tm.ensure_clean() as path: + # write manually with pyarrow to write integers + pq.write_table(table, path) + with pd.option_context("mode.nullable_dtypes", True): + result2 = read_parquet(path, engine=engine) + + expected = pd.DataFrame({"a": pd.array([1, 2, 3, None], dtype="Int64")}) + tm.assert_frame_equal(result2, expected) + @pytest.mark.parametrize( "dtype", [ diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index a1dbec1bb2f44..a5bcfa8845785 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2291,17 +2291,22 @@ def test_get_engine_auto_error_message(self): pass # TODO(GH#36893) fill this in when we add more engines + @pytest.mark.parametrize("option", [True, False]) @pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) - def test_read_sql_nullable_dtypes(self, string_storage, func): + def test_read_sql_nullable_dtypes(self, string_storage, func, option): # GH#50048 table = "test" df = self.nullable_data() df.to_sql(table, self.conn, index=False, if_exists="replace") with pd.option_context("mode.string_storage", string_storage): - result = getattr(pd, func)( - f"Select * from {table}", self.conn, use_nullable_dtypes=True - ) + if option: + with pd.option_context("mode.nullable_dtypes", True): + result = getattr(pd, func)(f"Select * from {table}", self.conn) + else: + result = getattr(pd, func)( + f"Select * from {table}", self.conn, use_nullable_dtypes=True + ) expected = self.nullable_expected(string_storage) tm.assert_frame_equal(result, expected) @@ -2316,15 +2321,20 @@ def test_read_sql_nullable_dtypes(self, string_storage, func): for result in iterator: tm.assert_frame_equal(result, 
expected) + @pytest.mark.parametrize("option", [True, False]) @pytest.mark.parametrize("func", ["read_sql", "read_sql_table"]) - def test_read_sql_nullable_dtypes_table(self, string_storage, func): + def test_read_sql_nullable_dtypes_table(self, string_storage, func, option): # GH#50048 table = "test" df = self.nullable_data() df.to_sql(table, self.conn, index=False, if_exists="replace") with pd.option_context("mode.string_storage", string_storage): - result = getattr(pd, func)(table, self.conn, use_nullable_dtypes=True) + if option: + with pd.option_context("mode.nullable_dtypes", True): + result = getattr(pd, func)(table, self.conn) + else: + result = getattr(pd, func)(table, self.conn, use_nullable_dtypes=True) expected = self.nullable_expected(string_storage) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 6aaa11866a584..2d3435eab9f60 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1842,3 +1842,21 @@ def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend): expected["g"] = ArrowExtensionArray(pa.array([None, None])) tm.assert_frame_equal(result, expected) + + +def test_use_nullable_dtypes_option(parser): + # GH#50748 + + data = """<?xml version='1.0' encoding='utf-8'?> +<data> +  <row> +    <a>1</a> +  </row> +  <row> +    <a>3</a> +  </row> +</data>""" + with pd.option_context("mode.nullable_dtypes", True): + result = read_xml(data, parser=parser) + expected = DataFrame({"a": Series([1, 3], dtype="Int64")}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index a2b94883d457d..8b57bbe03f9e7 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -838,6 +838,15 @@ def test_to_numeric_use_nullable_dtypes_na(val, dtype): tm.assert_series_equal(result, expected) +def test_to_numeric_use_nullable_dtypes_option(): + # GH#50748 + ser = Series([1, None], dtype=object) + with option_context("mode.nullable_dtypes", 
True): + result = to_numeric(ser) + expected = Series([1, pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "val, dtype, downcast", [