From 6fe0831fdf3d71a845125f3b7b39e2563d40819a Mon Sep 17 00:00:00 2001
From: Saul Shanabrook
Date: Thu, 19 Mar 2020 17:39:52 -0400
Subject: [PATCH 01/12] Switch dataframe constructor to use dispatch

---
 pandas/core/frame.py | 219 +++++++++++++++++++++++++------------------
 1 file changed, 130 insertions(+), 89 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index b9e43b1cd9b05..bf2c893143a02 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -14,6 +14,7 @@
 import datetime
 from io import StringIO
 import itertools
+import functools
 from textwrap import dedent
 from typing import (
     IO,
@@ -36,6 +37,7 @@
 import numpy as np
 import numpy.ma as ma
+import numpy.ma.mrecords as mrecords

 from pandas._config import get_option

@@ -427,97 +429,9 @@ def __init__(
         dtype: Optional[Dtype] = None,
         copy: bool = False,
     ):
-        if data is None:
-            data = {}
         if dtype is not None:
             dtype = self._validate_dtype(dtype)
-
-        if isinstance(data, DataFrame):
-            data = data._data
-
-        if isinstance(data, BlockManager):
-            mgr = self._init_mgr(
-                data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
-            )
-        elif isinstance(data, dict):
-            mgr = init_dict(data, index, columns, dtype=dtype)
-        elif isinstance(data, ma.MaskedArray):
-            import numpy.ma.mrecords as mrecords
-
-            # masked recarray
-            if isinstance(data, mrecords.MaskedRecords):
-                mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy)
-
-            # a masked array
-            else:
-                mask = ma.getmaskarray(data)
-                if mask.any():
-                    data, fill_value = maybe_upcast(data, copy=True)
-                    data.soften_mask()  # set hardmask False if it was True
-                    data[mask] = fill_value
-                else:
-                    data = data.copy()
-                mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
-
-        elif isinstance(data, (np.ndarray, Series, Index)):
-            if data.dtype.names:
-                data_columns = list(data.dtype.names)
-                data = {k: data[k] for k in data_columns}
-                if columns is None:
-                    columns = data_columns
-                mgr = init_dict(data, index, columns, dtype=dtype)
-            elif getattr(data, "name", None) is not None:
-                mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
-            else:
-                mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
-
-        # For data is list-like, or Iterable (will consume into list)
-        elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
-            if not isinstance(data, (abc.Sequence, ExtensionArray)):
-                data = list(data)
-            if len(data) > 0:
-                if is_dataclass(data[0]):
-                    data = dataclasses_to_dicts(data)
-                if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
-                    if is_named_tuple(data[0]) and columns is None:
-                        columns = data[0]._fields
-                    arrays, columns = to_arrays(data, columns, dtype=dtype)
-                    columns = ensure_index(columns)
-
-                    # set the index
-                    if index is None:
-                        if isinstance(data[0], Series):
-                            index = get_names_from_index(data)
-                        elif isinstance(data[0], Categorical):
-                            index = ibase.default_index(len(data[0]))
-                        else:
-                            index = ibase.default_index(len(data))
-
-                    mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
-                else:
-                    mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
-            else:
-                mgr = init_dict({}, index, columns, dtype=dtype)
-        else:
-            try:
-                arr = np.array(data, dtype=dtype, copy=copy)
-            except (ValueError, TypeError) as err:
-                exc = TypeError(
-                    "DataFrame constructor called with "
-                    f"incompatible data and dtype: {err}"
-                )
-                raise exc from err
-
-            if arr.ndim == 0 and index is not None and columns is not None:
-                values = cast_scalar_to_array(
-                    (len(index), len(columns)), data, dtype=dtype
-                )
-                mgr = init_ndarray(
-                    values, index, columns, dtype=values.dtype, copy=False
-                )
-            else:
-                raise ValueError("DataFrame constructor not properly called!")
-
+        mgr = create_block_manager(data, self, index, columns, dtype, copy)
         NDFrame.__init__(self, mgr)

     # ----------------------------------------------------------------------
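The hunk above collapses the old chain of ``isinstance`` checks into a single call whose behaviour is chosen by ``functools.singledispatch``. For readers unfamiliar with that mechanism, the minimal sketch below (an illustrative toy, not part of the patch; Python 3.7+ for annotation-based registration) shows how the generic function's body acts as the fallback and how each ``register`` call attaches an implementation selected by the runtime type of the first argument:

    import functools

    @functools.singledispatch
    def describe(obj):
        # Fallback used when no registered implementation matches type(obj)
        return f"generic object: {obj!r}"

    @describe.register
    def _describe_dict(obj: dict):
        # Chosen for dict (and dict subclasses) because of the annotation
        return f"dict with keys {list(obj)}"

    print(describe({"a": 1}))  # dict with keys ['a']
    print(describe(42))        # generic object: 42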
@@ -8548,6 +8462,133 @@ def isin(self, values) -> "DataFrame":
 ops.add_special_arithmetic_methods(DataFrame)


+@functools.singledispatch
+def create_block_manager(
+        data: Any,
+        df: DataFrame,
+        index: Optional[Axes],
+        columns: Optional[Axes],
+        dtype: Optional[Dtype],
+        copy: bool
+    ) -> BlockManager:
+    """
+    Convert an object into a BlockManager. Used inside the DataFrame constructor
+    so if you want to provide a custom way to convert from your object to a DataFrame
+    you can register a dispatch on this method.
+    """
+    # Base case is to try to cast to NumPy array
+    try:
+        arr = np.array(data, dtype=dtype, copy=copy)
+    except (ValueError, TypeError) as err:
+        exc = TypeError(
+            "DataFrame constructor called with "
+            f"incompatible data and dtype: {err}"
+        )
+        raise exc from err
+
+    if arr.ndim == 0 and index is not None and columns is not None:
+        values = cast_scalar_to_array(
+            (len(index), len(columns)), data, dtype=dtype
+        )
+        return init_ndarray(
+            values, index, columns, dtype=values.dtype, copy=False
+        )
+    else:
+        raise ValueError("DataFrame constructor not properly called!")
+
+@create_block_manager.register
+def _create_block_manager_none(data: None, *args, **kwargs):
+    return create_block_manager({}, *args, **kwargs)
+
+@create_block_manager.register
+def _create_block_manager_dataframe(data: DataFrame, *args, **kwargs):
+    return create_block_manager(data._data, *args, **kwargs)
+
+
+@create_block_manager.register
+def _create_block_manager_dataframe(data: BlockManager, df, index, columns, dtype, copy):
+    mgr = df._init_mgr(
+        data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
+    )
+    return mgr
+
+@create_block_manager.register
+def _create_block_manager_dict(data: dict, df, index, columns, dtype, copy):
+    return init_dict(data, index, columns, dtype=dtype)
+
+
+@create_block_manager.register
+def _create_block_manager_masked_array(data: ma.MaskedArray, df, index, columns, dtype, copy):
+    mask = ma.getmaskarray(data)
+    if mask.any():
+        data, fill_value = maybe_upcast(data, copy=True)
+        data.soften_mask()  # set hardmask False if it was True
+        data[mask] = fill_value
+    else:
+        data = data.copy()
+    return init_ndarray(data, index, columns, dtype=dtype, copy=copy)
+
+
+@create_block_manager.register
+def _create_block_manager_masked_record(data: mrecords.MaskedRecords, df, index, columns, dtype, copy):
+    return masked_rec_array_to_mgr(data, index, columns, dtype, copy)
+
+@create_block_manager.register(np.ndarray)
+@create_block_manager.register(Series)
+@create_block_manager.register(Index)
+def _create_block_manager_array_series_index(data: Union[np.ndarray, Series, Index], df, index, columns, dtype, copy):
+    if data.dtype.names:
+        data_columns = list(data.dtype.names)
+        data = {k: data[k] for k in data_columns}
+        if columns is None:
+            columns = data_columns
+        return init_dict(data, index, columns, dtype=dtype)
+    elif getattr(data, "name", None) is not None:
+        return init_dict({data.name: data}, index, columns, dtype=dtype)
+    return init_ndarray(data, index, columns, dtype=dtype, copy=copy)
+
+class _IterableExceptStringOrBytesMeta(type):
+    def __subclasscheck__(cls, sub: Type) -> bool:
+        return (
+            not issubclass(sub, (str, bytes))
+            and issubclass(sub, abc.Iterable)
+        )
+
+class _IterableExceptStringOrBytes(metaclass=_IterableExceptStringOrBytesMeta):
+    """
+    Class that is subclass of iterable but not of str or bytes to use for singledispatch
+    registration
+    """
+    pass
+
+
+@create_block_manager.register
+def _create_block_manager_iterable(data: _IterableExceptStringOrBytes, df, index, columns, dtype, copy):
+    if not isinstance(data, (abc.Sequence, ExtensionArray)):
+        data = list(data)
+    if len(data) > 0:
+        if is_dataclass(data[0]):
+            data = dataclasses_to_dicts(data)
+        if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
+            if is_named_tuple(data[0]) and columns is None:
+                columns = data[0]._fields
+            arrays, columns = to_arrays(data, columns, dtype=dtype)
+            columns = ensure_index(columns)
+
+            # set the index
+            if index is None:
+                if isinstance(data[0], Series):
+                    index = get_names_from_index(data)
+                elif isinstance(data[0], Categorical):
+                    index = ibase.default_index(len(data[0]))
+                else:
+                    index = ibase.default_index(len(data))
+
+            return arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
+        return init_ndarray(data, index, columns, dtype=dtype, copy=copy)
+    return init_dict({}, index, columns, dtype=dtype)
+
+
 def _from_nested_dict(data):
     # TODO: this should be seriously cythonized
     new_data = collections.defaultdict(dict)
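The extension point this first patch adds is easiest to see with a concrete, purely hypothetical downstream registration against the signature as it stands here, ``(data, df, index, columns, dtype, copy)``. The container class and its ``to_dict`` method below are invented for illustration; only the ``register`` call reflects the actual hook:

    import pandas as pd
    from pandas.core.frame import create_block_manager  # defined in frame.py as of this patch

    class TableLikeObject:
        """Hypothetical third-party container."""
        def __init__(self, columns):
            self._columns = dict(columns)
        def to_dict(self):
            return dict(self._columns)

    @create_block_manager.register
    def _create_block_manager_tablelike(data: TableLikeObject, df, index, columns, dtype, copy):
        # Re-dispatch with a type the generic function already understands.
        return create_block_manager(data.to_dict(), df, index, columns, dtype, copy)

    pd.DataFrame(TableLikeObject({"a": [1, 2, 3]}))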
From e123ab4ff2403bb2a518e25b3505bc9a8036ebec Mon Sep 17 00:00:00 2001
From: Saul Shanabrook
Date: Thu, 19 Mar 2020 17:41:04 -0400
Subject: [PATCH 02/12] blacken

---
 pandas/core/frame.py | 57 +++++++++++++++++++++++++------------------
 1 file changed, 33 insertions(+), 24 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index bf2c893143a02..f577994410bd4 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -8464,13 +8464,13 @@ def isin(self, values) -> "DataFrame":

 @functools.singledispatch
 def create_block_manager(
-        data: Any,
-        df: DataFrame,
-        index: Optional[Axes],
-        columns: Optional[Axes],
-        dtype: Optional[Dtype],
-        copy: bool
-    ) -> BlockManager:
+    data: Any,
+    df: DataFrame,
+    index: Optional[Axes],
+    columns: Optional[Axes],
+    dtype: Optional[Dtype],
+    copy: bool,
+) -> BlockManager:
     """
     Convert an object into a BlockManager. Used inside the DataFrame constructor
     so if you want to provide a custom way to convert from your object to a DataFrame
@@ -8481,44 +8481,46 @@ def create_block_manager(
         arr = np.array(data, dtype=dtype, copy=copy)
     except (ValueError, TypeError) as err:
         exc = TypeError(
-            "DataFrame constructor called with "
-            f"incompatible data and dtype: {err}"
+            "DataFrame constructor called with " f"incompatible data and dtype: {err}"
         )
         raise exc from err

     if arr.ndim == 0 and index is not None and columns is not None:
-        values = cast_scalar_to_array(
-            (len(index), len(columns)), data, dtype=dtype
-        )
-        return init_ndarray(
-            values, index, columns, dtype=values.dtype, copy=False
-        )
+        values = cast_scalar_to_array((len(index), len(columns)), data, dtype=dtype)
+        return init_ndarray(values, index, columns, dtype=values.dtype, copy=False)
     else:
         raise ValueError("DataFrame constructor not properly called!")

+
 @create_block_manager.register
 def _create_block_manager_none(data: None, *args, **kwargs):
     return create_block_manager({}, *args, **kwargs)

+
 @create_block_manager.register
 def _create_block_manager_dataframe(data: DataFrame, *args, **kwargs):
     return create_block_manager(data._data, *args, **kwargs)


 @create_block_manager.register
-def _create_block_manager_dataframe(data: BlockManager, df, index, columns, dtype, copy):
+def _create_block_manager_dataframe(
+    data: BlockManager, df, index, columns, dtype, copy
+):
     mgr = df._init_mgr(
         data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
     )
     return mgr

+
 @create_block_manager.register
 def _create_block_manager_dict(data: dict, df, index, columns, dtype, copy):
     return init_dict(data, index, columns, dtype=dtype)


 @create_block_manager.register
-def _create_block_manager_masked_array(data: ma.MaskedArray, df, index, columns, dtype, copy):
+def _create_block_manager_masked_array(
+    data: ma.MaskedArray, df, index, columns, dtype, copy
+):
     mask = ma.getmaskarray(data)
     if mask.any():
         data, fill_value = maybe_upcast(data, copy=True)
@@ -8530,13 +8532,18 @@ def _create_block_manager_masked_array(data: ma.MaskedArray, df, index, columns,


 @create_block_manager.register
-def _create_block_manager_masked_record(data: mrecords.MaskedRecords, df, index, columns, dtype, copy):
+def _create_block_manager_masked_record(
+    data: mrecords.MaskedRecords, df, index, columns, dtype, copy
+):
     return masked_rec_array_to_mgr(data, index, columns, dtype, copy)

+
 @create_block_manager.register(np.ndarray)
 @create_block_manager.register(Series)
 @create_block_manager.register(Index)
-def _create_block_manager_array_series_index(data: Union[np.ndarray, Series, Index], df, index, columns, dtype, copy):
+def _create_block_manager_array_series_index(
+    data: Union[np.ndarray, Series, Index], df, index, columns, dtype, copy
+):
     if data.dtype.names:
         data_columns = list(data.dtype.names)
         data = {k: data[k] for k in data_columns}
@@ -8547,23 +8554,25 @@ def _create_block_manager_array_series_index(data: Union[np.ndarray, Series, Ind
         return init_dict({data.name: data}, index, columns, dtype=dtype)
     return init_ndarray(data, index, columns, dtype=dtype, copy=copy)

+
 class _IterableExceptStringOrBytesMeta(type):
     def __subclasscheck__(cls, sub: Type) -> bool:
-        return (
-            not issubclass(sub, (str, bytes))
-            and issubclass(sub, abc.Iterable)
-        )
+        return not issubclass(sub, (str, bytes)) and issubclass(sub, abc.Iterable)
+

 class _IterableExceptStringOrBytes(metaclass=_IterableExceptStringOrBytesMeta):
     """
     Class that is subclass of iterable but not of str or bytes to use for singledispatch
     registration
     """
+
     pass


 @create_block_manager.register
-def _create_block_manager_iterable(data: _IterableExceptStringOrBytes, df, index, columns, dtype, copy):
+def _create_block_manager_iterable(
+    data: _IterableExceptStringOrBytes, df, index, columns, dtype, copy
+):
     if not isinstance(data, (abc.Sequence, ExtensionArray)):
         data = list(data)
     if len(data) > 0:
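The ``_IterableExceptStringOrBytes`` helper that black just reformatted works because ``functools.singledispatch`` consults ``issubclass`` when it considers registered classes that are not in the argument's MRO, so a metaclass-level ``__subclasscheck__`` can stand in for "any iterable except str/bytes". A standalone toy version of the same trick (illustrative names only, not pandas code):

    import functools
    from collections import abc

    class _NonStringIterableMeta(type):
        def __subclasscheck__(cls, sub):
            return not issubclass(sub, (str, bytes)) and issubclass(sub, abc.Iterable)

    class NonStringIterable(metaclass=_NonStringIterableMeta):
        """Virtual class: any iterable except str/bytes."""

    @functools.singledispatch
    def kind(obj):
        return "scalar-like"

    @kind.register(NonStringIterable)
    def _kind_iterable(obj):
        return "non-string iterable"

    print(kind([1, 2, 3]))  # non-string iterable
    print(kind("abc"))      # scalar-like (str is excluded by the subclass check)
    print(kind(42))         # scalar-like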
From 4dfb4b73c3e5602a6a69a570c913a10c6a40de36 Mon Sep 17 00:00:00 2001
From: Saul Shanabrook
Date: Thu, 19 Mar 2020 17:41:49 -0400
Subject: [PATCH 03/12] flake8

---
 pandas/core/frame.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index f577994410bd4..f87dd3ac71fb0 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -8503,7 +8503,7 @@ def _create_block_manager_dataframe(data: DataFrame, *args, **kwargs):


 @create_block_manager.register
-def _create_block_manager_dataframe(
+def _create_block_manager_blockmanager(
     data: BlockManager, df, index, columns, dtype, copy
 ):
     mgr = df._init_mgr(

From 522029d34726f348619dde5c4c7de82561e23cae Mon Sep 17 00:00:00 2001
From: Saul Shanabrook
Date: Thu, 19 Mar 2020 18:10:54 -0400
Subject: [PATCH 04/12] mypy fixes

---
 pandas/core/frame.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index f87dd3ac71fb0..5b4b14b120016 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -8569,15 +8569,13 @@ class _IterableExceptStringOrBytes(metaclass=_IterableExceptStringOrBytesMeta):
     pass


-@create_block_manager.register
-def _create_block_manager_iterable(
-    data: _IterableExceptStringOrBytes, df, index, columns, dtype, copy
-):
+@create_block_manager.register(_IterableExceptStringOrBytes)
+def _create_block_manager_iterable(data: abc.Iterable, df, index, columns, dtype, copy):
     if not isinstance(data, (abc.Sequence, ExtensionArray)):
         data = list(data)
     if len(data) > 0:
         if is_dataclass(data[0]):
-            data = dataclasses_to_dicts(data)
+            data = cast(List[dict], dataclasses_to_dicts(data))
         if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
             if is_named_tuple(data[0]) and columns is None:
                 columns = data[0]._fields

From b8dd353a0bb13e33bc7ccaa547c5fe6d6297663a Mon Sep 17 00:00:00 2001
From: Saul Shanabrook
Date: Thu, 19 Mar 2020 19:13:44 -0400
Subject: [PATCH 05/12] style fixes

---
 pandas/core/frame.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 5b4b14b120016..3638bb3a8ac72 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -12,9 +12,9 @@
 import collections
 from collections import abc
 import datetime
+import functools
 from io import StringIO
 import itertools
-import functools
 from textwrap import dedent
 from typing import (
     IO,
@@ -8481,7 +8481,7 @@ def create_block_manager(
         arr = np.array(data, dtype=dtype, copy=copy)
     except (ValueError, TypeError) as err:
         exc = TypeError(
-            "DataFrame constructor called with " f"incompatible data and dtype: {err}"
+            f"DataFrame constructor called with incompatible data and dtype: {err}"
         )
         raise exc from err

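The mypy fix above switches the iterable handler from annotation-based registration to passing the class to ``register`` explicitly, because annotating the parameter with the private virtual class confuses the type checker. Both spellings are standard ``functools`` behaviour; a small illustrative comparison (toy function, not pandas code):

    import functools

    @functools.singledispatch
    def handle(obj):
        return "fallback"

    # Style 1: the dispatch type is taken from the annotation (Python 3.7+).
    @handle.register
    def _handle_int(obj: int):
        return "int"

    # Style 2: the dispatch type is passed explicitly, so the annotation is free
    # to describe what the body actually accepts.
    @handle.register(float)
    def _handle_float(obj) -> str:
        return "float"

    print(handle(1), handle(1.5), handle("x"))  # int float fallback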
From 3e4b466a20fecbda52ff6f15223cb1bdedaceaa1 Mon Sep 17 00:00:00 2001
From: Saul Shanabrook
Date: Thu, 19 Mar 2020 19:41:13 -0400
Subject: [PATCH 06/12] rename and take class instead of instance

---
 pandas/core/frame.py | 64 ++++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 3638bb3a8ac72..76008104aaf45 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -431,7 +431,7 @@ def __init__(
     ):
         if dtype is not None:
             dtype = self._validate_dtype(dtype)
-        mgr = create_block_manager(data, self, index, columns, dtype, copy)
+        mgr = create_dataframe(data, self, index, columns, dtype, copy)
         NDFrame.__init__(self, mgr)

     # ----------------------------------------------------------------------
@@ -8463,18 +8463,19 @@ def isin(self, values) -> "DataFrame":


 @functools.singledispatch
-def create_block_manager(
+def create_dataframe(
     data: Any,
-    df: DataFrame,
     index: Optional[Axes],
     columns: Optional[Axes],
     dtype: Optional[Dtype],
     copy: bool,
+    cls: Type[DataFrame],
 ) -> BlockManager:
     """
-    Convert an object into a BlockManager. Used inside the DataFrame constructor
-    so if you want to provide a custom way to convert from your object to a DataFrame
-    you can register a dispatch on this method.
+    Create a BlockManager for some given data. Used inside the DataFrame constructor
+    to convert different input types.
+    If you want to provide a custom way to convert from your object to a DataFrame
+    you can register a dispatch on this function.
     """
     # Base case is to try to cast to NumPy array
     try:
@@ -8492,34 +8493,33 @@ def create_block_manager(
         raise ValueError("DataFrame constructor not properly called!")


-@create_block_manager.register
-def _create_block_manager_none(data: None, *args, **kwargs):
-    return create_block_manager({}, *args, **kwargs)
+@create_dataframe.register
+def _create_dataframe_none(data: None, *args, **kwargs):
+    return create_dataframe({}, *args, **kwargs)


-@create_block_manager.register
-def _create_block_manager_dataframe(data: DataFrame, *args, **kwargs):
-    return create_block_manager(data._data, *args, **kwargs)
+@create_dataframe.register
+def _create_dataframe_dataframe(data: DataFrame, *args, **kwargs):
+    return create_dataframe(data._data, *args, **kwargs)


-@create_block_manager.register
-def _create_block_manager_blockmanager(
-    data: BlockManager, df, index, columns, dtype, copy
+@create_dataframe.register
+def _create_dataframe_blockmanager(
+    data: BlockManager, index, columns, dtype, copy, cls
 ):
-    mgr = df._init_mgr(
+    return cls._init_mgr(
         data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
     )
-    return mgr


-@create_block_manager.register
-def _create_block_manager_dict(data: dict, df, index, columns, dtype, copy):
+@create_dataframe.register
+def _create_dataframe_dict(data: dict, index, columns, dtype, copy, cls):
     return init_dict(data, index, columns, dtype=dtype)


-@create_block_manager.register
-def _create_block_manager_masked_array(
-    data: ma.MaskedArray, df, index, columns, dtype, copy
+@create_dataframe.register
+def _create_dataframe_masked_array(
+    data: ma.MaskedArray, index, columns, dtype, copy, cls
 ):
     mask = ma.getmaskarray(data)
     if mask.any():
         data, fill_value = maybe_upcast(data, copy=True)
@@ -8531,18 +8531,18 @@ def _create_block_manager_masked_array(
     return init_ndarray(data, index, columns, dtype=dtype, copy=copy)


-@create_block_manager.register
-def _create_block_manager_masked_record(
-    data: mrecords.MaskedRecords, df, index, columns, dtype, copy
+@create_dataframe.register
+def _create_dataframe_masked_record(
+    data: mrecords.MaskedRecords, index, columns, dtype, copy, cls
 ):
     return masked_rec_array_to_mgr(data, index, columns, dtype, copy)


-@create_block_manager.register(np.ndarray)
-@create_block_manager.register(Series)
-@create_block_manager.register(Index)
-def _create_block_manager_array_series_index(
-    data: Union[np.ndarray, Series, Index], df, index, columns, dtype, copy
+@create_dataframe.register(np.ndarray)
+@create_dataframe.register(Series)
+@create_dataframe.register(Index)
+def _create_dataframe_array_series_index(
+    data: Union[np.ndarray, Series, Index], index, columns, dtype, copy, cls
 ):
     if data.dtype.names:
         data_columns = list(data.dtype.names)
         data = {k: data[k] for k in data_columns}
@@ -8569,8 +8569,8 @@ class _IterableExceptStringOrBytes(metaclass=_IterableExceptStringOrBytesMeta):
     pass


-@create_block_manager.register(_IterableExceptStringOrBytes)
-def _create_block_manager_iterable(data: abc.Iterable, df, index, columns, dtype, copy):
+@create_dataframe.register(_IterableExceptStringOrBytes)
+def _create_dataframe_iterable(data: abc.Iterable, index, columns, dtype, copy, cls):
     if not isinstance(data, (abc.Sequence, ExtensionArray)):
         data = list(data)
     if len(data) > 0:
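After this rename the hook receives the calling DataFrame class (not the instance) as the final ``cls`` argument, so registered implementations can forward it when they delegate. A hedged sketch of what a registration would now look like; ``types.MappingProxyType`` is just an arbitrary example type, not something this PR registers, and the import path reflects that ``create_dataframe`` still lives in ``pandas/core/frame.py`` at this point in the series:

    import types
    from pandas.core.frame import create_dataframe

    @create_dataframe.register
    def _create_dataframe_mappingproxy(
        data: types.MappingProxyType, index, columns, dtype, copy, cls
    ):
        # Convert to a plain dict and re-dispatch, forwarding cls unchanged so a
        # DataFrame subclass still controls how the BlockManager is initialised.
        return create_dataframe(dict(data), index, columns, dtype, copy, cls)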
From 898e3d705a2ce0112a819797e607969d4b2cf5e4 Mon Sep 17 00:00:00 2001
From: Saul Shanabrook
Date: Thu, 19 Mar 2020 19:48:58 -0400
Subject: [PATCH 07/12] Move create_dataframe to construction

---
 pandas/core/frame.py                  | 194 ++++----------------------
 pandas/core/internals/construction.py | 141 ++++++++++++++++++-
 2 files changed, 168 insertions(+), 167 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 76008104aaf45..5e051d5ee4d2b 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -10,11 +10,11 @@
 """
 import collections
-from collections import abc
 import datetime
-import functools
-from io import StringIO
 import itertools
+import warnings
+from collections import abc
+from io import StringIO
 from textwrap import dedent
 from typing import (
     IO,
@@ -33,32 +33,24 @@
     Union,
     cast,
 )
-import warnings

 import numpy as np
-import numpy.ma as ma
-import numpy.ma.mrecords as mrecords
+import pandas.plotting

 from pandas._config import get_option
-
-from pandas._libs import algos as libalgos, lib, properties
+from pandas._libs import algos as libalgos
+from pandas._libs import lib, properties
 from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Label, Level, Renamer
 from pandas.compat import PY37
 from pandas.compat._optional import import_optional_dependency
 from pandas.compat.numpy import function as nv
-from pandas.util._decorators import (
-    Appender,
-    Substitution,
-    deprecate_kwarg,
-    doc,
-    rewrite_axis_style_signature,
-)
-from pandas.util._validators import (
-    validate_axis_style_args,
-    validate_bool_kwarg,
-    validate_percentile,
-)
-
+from pandas.core import algorithms
+from pandas.core import common as com
+from pandas.core import nanops, ops
+from pandas.core.accessor import CachedAccessor
+from pandas.core.arrays import ExtensionArray
+from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray
+from pandas.core.arrays.sparse import SparseFrameAccessor
 from pandas.core.dtypes.cast import (
     cast_scalar_to_array,
     coerce_to_dtypes,
@@ -69,7 +61,6 @@
     maybe_convert_platform,
     maybe_downcast_to_dtype,
     maybe_infer_to_datetimelike,
-    maybe_upcast,
     maybe_upcast_putmask,
     validate_numeric_casting,
 )
@@ -104,12 +95,6 @@
     ABCSeries,
 )
 from pandas.core.dtypes.missing import isna, notna
-
-from pandas.core import algorithms, common as com, nanops, ops
-from pandas.core.accessor import CachedAccessor
-from pandas.core.arrays import Categorical, ExtensionArray
-from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray
-from pandas.core.arrays.sparse import SparseFrameAccessor
 from pandas.core.generic import NDFrame, _shared_docs
 from pandas.core.indexes import base as ibase
 from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences
@@ -117,25 +102,31 @@
 from pandas.core.indexes.multi import MultiIndex, maybe_droplevels
 from pandas.core.indexes.period import PeriodIndex
 from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
-from pandas.core.internals import BlockManager
 from pandas.core.internals.construction import (
     arrays_to_mgr,
-    dataclasses_to_dicts,
-    get_names_from_index,
-    init_dict,
-    init_ndarray,
-    masked_rec_array_to_mgr,
+    create_dataframe,
     reorder_arrays,
     sanitize_index,
     to_arrays,
 )
 from pandas.core.ops.missing import dispatch_fill_zeros
 from pandas.core.series import Series
-
 from pandas.io.common import get_filepath_or_buffer
-from pandas.io.formats import console, format as fmt
+from pandas.io.formats import console
+from pandas.io.formats import format as fmt
 from pandas.io.formats.info import info
-import pandas.plotting
+from pandas.util._decorators import (
+    Appender,
+    Substitution,
+    deprecate_kwarg,
+    doc,
+    rewrite_axis_style_signature,
+)
+from pandas.util._validators import (
+    validate_axis_style_args,
+    validate_bool_kwarg,
+    validate_percentile,
+)

 if TYPE_CHECKING:
     from pandas.core.groupby.generic import DataFrameGroupBy
@@ -8462,140 +8453,11 @@ def isin(self, values) -> "DataFrame":
 ops.add_special_arithmetic_methods(DataFrame)


-@functools.singledispatch
-def create_dataframe(
-    data: Any,
-    index: Optional[Axes],
-    columns: Optional[Axes],
-    dtype: Optional[Dtype],
-    copy: bool,
-    cls: Type[DataFrame],
-) -> BlockManager:
-    """
-    Create a BlockManager for some given data. Used inside the DataFrame constructor
-    to convert different input types.
-    If you want to provide a custom way to convert from your object to a DataFrame
-    you can register a dispatch on this function.
-    """
-    # Base case is to try to cast to NumPy array
-    try:
-        arr = np.array(data, dtype=dtype, copy=copy)
-    except (ValueError, TypeError) as err:
-        exc = TypeError(
-            f"DataFrame constructor called with incompatible data and dtype: {err}"
-        )
-        raise exc from err
-
-    if arr.ndim == 0 and index is not None and columns is not None:
-        values = cast_scalar_to_array((len(index), len(columns)), data, dtype=dtype)
-        return init_ndarray(values, index, columns, dtype=values.dtype, copy=False)
-    else:
-        raise ValueError("DataFrame constructor not properly called!")
-
-
 @create_dataframe.register
 def _create_dataframe_dataframe(data: DataFrame, *args, **kwargs):
     return create_dataframe(data._data, *args, **kwargs)


-@create_dataframe.register
-def _create_dataframe_blockmanager(
-    data: BlockManager, index, columns, dtype, copy, cls
-):
-    return cls._init_mgr(
-        data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
-    )
-
-
-@create_dataframe.register
-def _create_dataframe_dict(data: dict, index, columns, dtype, copy, cls):
-    return init_dict(data, index, columns, dtype=dtype)
-
-
-@create_dataframe.register
-def _create_dataframe_masked_array(
-    data: ma.MaskedArray, index, columns, dtype, copy, cls
-):
-    mask = ma.getmaskarray(data)
-    if mask.any():
-        data, fill_value = maybe_upcast(data, copy=True)
-        data.soften_mask()  # set hardmask False if it was True
-        data[mask] = fill_value
-    else:
-        data = data.copy()
-    return init_ndarray(data, index, columns, dtype=dtype, copy=copy)
-
-
-@create_dataframe.register
-def _create_dataframe_masked_record(
-    data: mrecords.MaskedRecords, index, columns, dtype, copy, cls
-):
-    return masked_rec_array_to_mgr(data, index, columns, dtype, copy)
-
-
-@create_dataframe.register(np.ndarray)
-@create_dataframe.register(Series)
-@create_dataframe.register(Index)
-def _create_dataframe_array_series_index(
-    data: Union[np.ndarray, Series, Index], index, columns, dtype, copy, cls
-):
-    if data.dtype.names:
-        data_columns = list(data.dtype.names)
-        data = {k: data[k] for k in data_columns}
-        if columns is None:
-            columns = data_columns
-        return init_dict(data, index, columns, dtype=dtype)
-    elif getattr(data, "name", None) is not None:
-        return init_dict({data.name: data}, index, columns, dtype=dtype)
-    return init_ndarray(data, index, columns, dtype=dtype, copy=copy)
-
-
-class _IterableExceptStringOrBytesMeta(type):
-    def __subclasscheck__(cls, sub: Type) -> bool:
-        return not issubclass(sub, (str, bytes)) and issubclass(sub, abc.Iterable)
-
-
-class _IterableExceptStringOrBytes(metaclass=_IterableExceptStringOrBytesMeta):
-    """
-    Class that is subclass of iterable but not of str or bytes to use for singledispatch
-    registration
-    """
-
-    pass
-
-
-@create_dataframe.register(_IterableExceptStringOrBytes)
-def _create_dataframe_iterable(data: abc.Iterable, index, columns, dtype, copy, cls):
-    if not isinstance(data, (abc.Sequence, ExtensionArray)):
-        data = list(data)
-    if len(data) > 0:
-        if is_dataclass(data[0]):
-            data = cast(List[dict], dataclasses_to_dicts(data))
-        if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
-            if is_named_tuple(data[0]) and columns is None:
-                columns = data[0]._fields
-            arrays, columns = to_arrays(data, columns, dtype=dtype)
-            columns = ensure_index(columns)
-
-            # set the index
-            if index is None:
-                if isinstance(data[0], Series):
-                    index = get_names_from_index(data)
-                elif isinstance(data[0], Categorical):
-                    index = ibase.default_index(len(data[0]))
-                else:
-                    index = ibase.default_index(len(data))
-
-            return arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
-        return init_ndarray(data, index, columns, dtype=dtype, copy=copy)
-    return init_dict({}, index, columns, dtype=dtype)
-
-
 def _from_nested_dict(data):
     # TODO: this should be seriously cythonized
     new_data = collections.defaultdict(dict)
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index c4416472d451c..0394cef38ba49 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -3,13 +3,18 @@
 constructors before passing them to a BlockManager.
""" from collections import abc +import functools +from typing import Any, List, Optional, Type, Union, cast import numpy as np import numpy.ma as ma +import numpy.ma.mrecords as mrecords from pandas._libs import lib +from pandas._typing import Axes, Dtype from pandas.core.dtypes.cast import ( + cast_scalar_to_array, construct_1d_arraylike_from_scalar, maybe_cast_to_datetime, maybe_convert_platform, @@ -18,11 +23,13 @@ ) from pandas.core.dtypes.common import ( is_categorical_dtype, + is_dataclass, is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, is_integer_dtype, is_list_like, + is_named_tuple, is_object_dtype, ) from pandas.core.dtypes.generic import ( @@ -35,8 +42,9 @@ ) from pandas.core import algorithms, common as com -from pandas.core.arrays import Categorical +from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.construction import extract_array, sanitize_array +from pandas.core.generic import NDFrame from pandas.core.indexes import base as ibase from pandas.core.indexes.api import ( Index, @@ -45,9 +53,11 @@ union_indexes, ) from pandas.core.internals import ( + BlockManager, create_block_manager_from_arrays, create_block_manager_from_blocks, ) +from pandas.core.series import Series # --------------------------------------------------------------------- # BlockManager Interface @@ -115,6 +125,135 @@ def masked_rec_array_to_mgr(data, index, columns, dtype, copy: bool): return mgr +@functools.singledispatch +def create_dataframe( + data: Any, + index: Optional[Axes], + columns: Optional[Axes], + dtype: Optional[Dtype], + copy: bool, + cls: Type[NDFrame], +) -> BlockManager: + """ + Create a BlockManager for some given data. Used inside the DataFrame constructor + to convert different input types. + If you want to provide a custom way to convert from your objec to a DataFrame + you can register a dispatch on this function. 
+ """ + # Base case is to try to cast to NumPy array + try: + arr = np.array(data, dtype=dtype, copy=copy) + except (ValueError, TypeError) as err: + exc = TypeError( + f"DataFrame constructor called with incompatible data and dtype: {err}" + ) + raise exc from err + + if arr.ndim == 0 and index is not None and columns is not None: + values = cast_scalar_to_array((len(index), len(columns)), data, dtype=dtype) + return init_ndarray(values, index, columns, dtype=values.dtype, copy=False) + else: + raise ValueError("DataFrame constructor not properly called!") + + +@create_dataframe.register +def _create_dataframe_none(data: None, *args, **kwargs): + return create_dataframe({}, *args, **kwargs) + + +@create_dataframe.register +def _create_dataframe_blockmanager( + data: BlockManager, index, columns, dtype, copy, cls +): + return cls._init_mgr( + data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy + ) + + +@create_dataframe.register +def _create_dataframe_dict(data: dict, index, columns, dtype, copy, cls): + return init_dict(data, index, columns, dtype=dtype) + + +@create_dataframe.register +def _create_dataframe_masked_array( + data: ma.MaskedArray, index, columns, dtype, copy, cls +): + mask = ma.getmaskarray(data) + if mask.any(): + data, fill_value = maybe_upcast(data, copy=True) + data.soften_mask() # set hardmask False if it was True + data[mask] = fill_value + else: + data = data.copy() + return init_ndarray(data, index, columns, dtype=dtype, copy=copy) + + +@create_dataframe.register +def _create_dataframe_masked_record( + data: mrecords.MaskedRecords, index, columns, dtype, copy, cls +): + return masked_rec_array_to_mgr(data, index, columns, dtype, copy) + + +@create_dataframe.register(np.ndarray) +@create_dataframe.register(Series) +@create_dataframe.register(Index) +def _create_dataframe_array_series_index( + data: Union[np.ndarray, Series, Index], index, columns, dtype, copy, cls +): + if data.dtype.names: + data_columns = list(data.dtype.names) + data = {k: data[k] for k in data_columns} + if columns is None: + columns = data_columns + return init_dict(data, index, columns, dtype=dtype) + elif getattr(data, "name", None) is not None: + return init_dict({data.name: data}, index, columns, dtype=dtype) + return init_ndarray(data, index, columns, dtype=dtype, copy=copy) + + +class _IterableExceptStringOrBytesMeta(type): + def __subclasscheck__(cls, sub: Type) -> bool: + return not issubclass(sub, (str, bytes)) and issubclass(sub, abc.Iterable) + + +class _IterableExceptStringOrBytes(metaclass=_IterableExceptStringOrBytesMeta): + """ + Class that is subclass of iterable but not of str or bytes to use for singledispatch + registration + """ + + pass + + +@create_dataframe.register(_IterableExceptStringOrBytes) +def _create_dataframe_iterable(data: abc.Iterable, index, columns, dtype, copy, cls): + if not isinstance(data, (abc.Sequence, ExtensionArray)): + data = list(data) + if len(data) > 0: + if is_dataclass(data[0]): + data = cast(List[dict], dataclasses_to_dicts(data)) + if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1: + if is_named_tuple(data[0]) and columns is None: + columns = data[0]._fields + arrays, columns = to_arrays(data, columns, dtype=dtype) + columns = ensure_index(columns) + + # set the index + if index is None: + if isinstance(data[0], Series): + index = get_names_from_index(data) + elif isinstance(data[0], Categorical): + index = ibase.default_index(len(data[0])) + else: + index = ibase.default_index(len(data)) + + return 
+            return arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
+        return init_ndarray(data, index, columns, dtype=dtype, copy=copy)
+    return init_dict({}, index, columns, dtype=dtype)
+
+
 # ---------------------------------------------------------------------
 # DataFrame Constructor Interface

From 7e8282671f9c1f89b2a53addb3820c3296233330 Mon Sep 17 00:00:00 2001
From: Saul Shanabrook
Date: Thu, 19 Mar 2020 19:53:22 -0400
Subject: [PATCH 08/12] Sort imports

---
 pandas/core/frame.py | 54 ++++++++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 5e051d5ee4d2b..31befd3a0d604 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -10,11 +10,10 @@
 """
 import collections
-import datetime
-import itertools
-import warnings
 from collections import abc
+import datetime
 from io import StringIO
+import itertools
 from textwrap import dedent
 from typing import (
     IO,
@@ -33,24 +32,30 @@
     Union,
     cast,
 )
+import warnings

 import numpy as np
-import pandas.plotting

 from pandas._config import get_option
-from pandas._libs import algos as libalgos
-from pandas._libs import lib, properties
+
+from pandas._libs import algos as libalgos, lib, properties
 from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Label, Level, Renamer
 from pandas.compat import PY37
 from pandas.compat._optional import import_optional_dependency
 from pandas.compat.numpy import function as nv
-from pandas.core import algorithms
-from pandas.core import common as com
-from pandas.core import nanops, ops
-from pandas.core.accessor import CachedAccessor
-from pandas.core.arrays import ExtensionArray
-from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray
-from pandas.core.arrays.sparse import SparseFrameAccessor
+from pandas.util._decorators import (
+    Appender,
+    Substitution,
+    deprecate_kwarg,
+    doc,
+    rewrite_axis_style_signature,
+)
+from pandas.util._validators import (
+    validate_axis_style_args,
+    validate_bool_kwarg,
+    validate_percentile,
+)
+
 from pandas.core.dtypes.cast import (
     cast_scalar_to_array,
     coerce_to_dtypes,
@@ -95,6 +100,12 @@
     ABCSeries,
 )
 from pandas.core.dtypes.missing import isna, notna
+
+from pandas.core import algorithms, common as com, nanops, ops
+from pandas.core.accessor import CachedAccessor
+from pandas.core.arrays import ExtensionArray
+from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray
+from pandas.core.arrays.sparse import SparseFrameAccessor
 from pandas.core.generic import NDFrame, _shared_docs
 from pandas.core.indexes import base as ibase
 from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences
@@ -111,22 +122,11 @@
 )
 from pandas.core.ops.missing import dispatch_fill_zeros
 from pandas.core.series import Series
+
 from pandas.io.common import get_filepath_or_buffer
-from pandas.io.formats import console
-from pandas.io.formats import format as fmt
+from pandas.io.formats import console, format as fmt
 from pandas.io.formats.info import info
-from pandas.util._decorators import (
-    Appender,
-    Substitution,
-    deprecate_kwarg,
-    doc,
-    rewrite_axis_style_signature,
-)
-from pandas.util._validators import (
-    validate_axis_style_args,
-    validate_bool_kwarg,
-    validate_percentile,
-)
+import pandas.plotting

 if TYPE_CHECKING:
     from pandas.core.groupby.generic import DataFrameGroupBy
From b307c4b01f8c70ce35fe3ae3659bec3163260b34 Mon Sep 17 00:00:00 2001
From: Saul Shanabrook
Date: Thu, 19 Mar 2020 19:57:59 -0400
Subject: [PATCH 09/12] Fix calling

---
 pandas/core/frame.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 31befd3a0d604..5cbbf6842f4df 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -422,7 +422,7 @@ def __init__(
     ):
         if dtype is not None:
             dtype = self._validate_dtype(dtype)
-        mgr = create_dataframe(data, self, index, columns, dtype, copy)
+        mgr = create_dataframe(data, index, columns, dtype, copy, type(self))
         NDFrame.__init__(self, mgr)

     # ----------------------------------------------------------------------

From edd85c3ccbb5e965ef3a55c17e1efb25a441443a Mon Sep 17 00:00:00 2001
From: Saul Shanabrook
Date: Thu, 19 Mar 2020 20:05:29 -0400
Subject: [PATCH 10/12] Add test for custom constructor

---
 pandas/tests/generic/test_frame.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py
index 631f484cfc22a..dab49c147d44d 100644
--- a/pandas/tests/generic/test_frame.py
+++ b/pandas/tests/generic/test_frame.py
@@ -10,6 +10,7 @@
 import pandas as pd
 from pandas import DataFrame, MultiIndex, Series, date_range
 import pandas._testing as tm
+from pandas.core.internals.construction import create_dataframe

 from .test_generic import Generic

@@ -169,6 +170,7 @@ def test_set_attribute(self):
         df = DataFrame({"x": [1, 2, 3]})

         df.y = 2
+
         df["y"] = [2, 4, 6]
         df.y = 5

@@ -183,6 +185,25 @@ def test_deepcopy_empty(self):

         self._compare(empty_frame_copy, empty_frame)

+    def test_register_constructor(self):
+        # Verify that if you register a custom `create_dataframe` implementation
+        # this will be used in the constructor
+        class MyCustomObject:
+            pass
+
+        o = MyCustomObject()
+
+        with pytest.raises(ValueError):
+            DataFrame(o)
+
+        @create_dataframe.register
+        def _create_dataframe_custom(o: MyCustomObject, *args, **kwargs):
+            return create_dataframe(None, *args, **kwargs)
+
+        result = DataFrame(o)
+        expected = DataFrame(None)
+        self._compare(result, expected)
+

 # formerly in Generic but only test DataFrame class
 class TestDataFrame2:

From 8240d8603200b0287ea432c6f7dab84aa063c964 Mon Sep 17 00:00:00 2001
From: Saul Shanabrook
Date: Thu, 19 Mar 2020 20:09:29 -0400
Subject: [PATCH 11/12] Added whats new

---
 doc/source/whatsnew/v1.1.0.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 720ce7af47a18..c2ff1b7243af3 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -70,7 +70,8 @@ Other enhancements
 - :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`)
 - Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`)
 - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
--
+- You can now override how pandas constructs DataFrames from custom objects, by registering a new function on the
+  ``pandas.core.internals.construction.create_dataframe`` ``singledispatch`` function.

 .. ---------------------------------------------------------------------------
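Putting the new test and the release note together, the end-to-end usage the whatsnew entry describes would look roughly like the sketch below. The custom class and registration are illustrative, mirroring the test added above; the import path is the one documented after the move to ``pandas.core.internals.construction``:

    import pandas as pd
    from pandas.core.internals.construction import create_dataframe

    class MyCustomObject:
        def __init__(self, data):
            self.data = data

    @create_dataframe.register
    def _create_dataframe_custom(obj: MyCustomObject, *args, **kwargs):
        # Delegate to the generic function with data pandas already understands.
        return create_dataframe(obj.data, *args, **kwargs)

    result = pd.DataFrame(MyCustomObject({"a": [1, 2, 3]}))
    expected = pd.DataFrame({"a": [1, 2, 3]})
    pd.testing.assert_frame_equal(result, expected)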
From b856214d5d89022958ebcaef5267915fa7bbe266 Mon Sep 17 00:00:00 2001
From: Saul Shanabrook
Date: Thu, 19 Mar 2020 20:20:02 -0400
Subject: [PATCH 12/12] unused imports

---
 doc/source/whatsnew/v1.1.0.rst | 1 +
 pandas/core/frame.py           | 2 --
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index c2ff1b7243af3..d4e1539404ead 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -72,6 +72,7 @@ Other enhancements
 - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
 - You can now override how pandas constructs DataFrames from custom objects, by registering a new function on the
   ``pandas.core.internals.construction.create_dataframe`` ``singledispatch`` function.
+-

 .. ---------------------------------------------------------------------------

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 5cbbf6842f4df..f1eb32b5ce830 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -75,7 +75,6 @@
     ensure_platform_int,
     infer_dtype_from_object,
     is_bool_dtype,
-    is_dataclass,
     is_datetime64_any_dtype,
     is_dict_like,
     is_dtype_equal,
@@ -86,7 +85,6 @@
     is_integer_dtype,
     is_iterator,
     is_list_like,
-    is_named_tuple,
     is_object_dtype,
     is_period_dtype,
     is_scalar,