From 5773efcf73b565c6d3b01f75e75ed3a69515fac6 Mon Sep 17 00:00:00 2001 From: Stephannie Jimenez Date: Thu, 24 Jun 2021 00:52:02 -0500 Subject: [PATCH 1/3] Add zero-copy flag --- protocol/dataframe_protocol.py | 8 ++++- protocol/pandas_implementation.py | 60 +++++++++++++++++++------------ 2 files changed, 45 insertions(+), 23 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index 00cf5b12..d8f37366 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -335,7 +335,8 @@ class DataFrame: ``__dataframe__`` method of a public data frame class in a library adhering to the dataframe interchange protocol specification. """ - def __dataframe__(self, nan_as_null : bool = False) -> dict: + def __dataframe__(self, nan_as_null : bool = False, + allow_zero_copy : bool = True) -> dict: """ Produces a dictionary object following the dataframe protocol spec @@ -343,8 +344,13 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict: producer to overwrite null values in the data with ``NaN`` (or ``NaT``). It is intended for cases where the consumer does not support the bit mask or byte mask that is the producer's native representation. + + ``allow_zero_copy`` is a keyword that defines if the given implementation + is going to support striding buffers. It is optional, and the libraries + do not need to implement it. 
""" self._nan_as_null = nan_as_null + self._allow_zero_zopy = allow_zero_copy return { "dataframe": self, # DataFrame object adhering to the protocol "version": 0 # Version number of the protocol diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index e3e3e62e..2d2c8f0e 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -35,7 +35,8 @@ ColumnObject = Any -def from_dataframe(df : DataFrameObject) -> pd.DataFrame: +def from_dataframe(df : DataFrameObject, + allow_zero_copy : bool = False) -> pd.DataFrame: """ Construct a pandas DataFrame from ``df`` if it supports ``__dataframe__`` """ @@ -46,7 +47,7 @@ def from_dataframe(df : DataFrameObject) -> pd.DataFrame: if not hasattr(df, '__dataframe__'): raise ValueError("`df` does not support __dataframe__") - return _from_dataframe(df.__dataframe__()) + return _from_dataframe(df.__dataframe__(allow_zero_copy=allow_zero_copy)) def _from_dataframe(df : DataFrameObject) -> pd.DataFrame: @@ -160,7 +161,8 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series: return series -def __dataframe__(cls, nan_as_null : bool = False) -> dict: +def __dataframe__(cls, nan_as_null : bool = False, + allow_zero_copy : bool = False) -> dict: """ The public method to attach to pd.DataFrame @@ -171,8 +173,14 @@ def __dataframe__(cls, nan_as_null : bool = False) -> dict: producer to overwrite null values in the data with ``NaN`` (or ``NaT``). This currently has no effect; once support for nullable extension dtypes is added, this value should be propagated to columns. + + ``allow_zero_copy`` is a keyword that defines if the given implementation + is going to support striding buffers. It is optional, and the libraries + do not need to implement it. Currently, if the flag is set to ``True`` it + will raise a ``RuntimeError``. 
""" - return _PandasDataFrame(cls, nan_as_null=nan_as_null) + return _PandasDataFrame( + cls, nan_as_null=nan_as_null, allow_zero_copy=allow_zero_copy) # Monkeypatch the Pandas DataFrame class to support the interchange protocol @@ -187,16 +195,16 @@ class _PandasBuffer: Data in the buffer is guaranteed to be contiguous in memory. """ - def __init__(self, x : np.ndarray) -> None: + def __init__(self, x : np.ndarray, allow_zero_copy : bool = False) -> None: """ Handle only regular columns (= numpy arrays) for now. """ - if not x.strides == (x.dtype.itemsize,): - # Array is not contiguous - this is possible to get in Pandas, - # there was some discussion on whether to support it. Som extra - # complexity for libraries that don't support it (e.g. Arrow), - # but would help with numpy-based libraries like Pandas. - raise RuntimeError("Design needs fixing - non-contiguous buffer") + if allow_zero_copy: + # Array is not contiguous and strided buffers do not need to be + # supported. It brings some extra complexity for libraries that + # don't support it (e.g. Arrow). + raise RuntimeError( + "Exports cannot be zero-copy in the case of a non-contiguous buffer") # Store the numpy array in which the data resides as a private # attribute, so we can use it to retrieve the public attributes @@ -251,7 +259,8 @@ class _PandasColumn: """ - def __init__(self, column : pd.Series) -> None: + def __init__(self, column : pd.Series, + allow_zero_copy : bool = False) -> None: """ Note: doesn't deal with extension arrays yet, just assume a regular Series/ndarray for now. 
@@ -262,6 +271,7 @@ def __init__(self, column : pd.Series) -> None: # Store the column as a private attribute self._col = column + self._allow_zero_copy = allow_zero_copy @property def size(self) -> int: @@ -446,11 +456,13 @@ def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype """ _k = _DtypeKind if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): - buffer = _PandasBuffer(self._col.to_numpy()) + buffer = _PandasBuffer( + self._col.to_numpy(), allow_zero_copy=self._allow_zero_copy) dtype = self.dtype elif self.dtype[0] == _k.CATEGORICAL: codes = self._col.values.codes - buffer = _PandasBuffer(codes) + buffer = _PandasBuffer( + codes, allow_zero_copy=self._allow_zero_copy) dtype = self._dtype_from_pandasdtype(codes.dtype) else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") @@ -483,7 +495,8 @@ class _PandasDataFrame: ``pd.DataFrame.__dataframe__`` as objects with the methods and attributes defined on this class. """ - def __init__(self, df : pd.DataFrame, nan_as_null : bool = False) -> None: + def __init__(self, df : pd.DataFrame, nan_as_null : bool = False, + allow_zero_copy : bool = False) -> None: """ Constructor - an instance of this (private) class is returned from `pd.DataFrame.__dataframe__`. @@ -494,6 +507,7 @@ def __init__(self, df : pd.DataFrame, nan_as_null : bool = False) -> None: # This currently has no effect; once support for nullable extension # dtypes is added, this value should be propagated to columns. 
self._nan_as_null = nan_as_null + self._allow_zero_copy = allow_zero_copy def num_columns(self) -> int: return len(self._df.columns) @@ -508,13 +522,16 @@ def column_names(self) -> Iterable[str]: return self._df.columns.tolist() def get_column(self, i: int) -> _PandasColumn: - return _PandasColumn(self._df.iloc[:, i]) + return _PandasColumn( + self._df.iloc[:, i], allow_zero_copy=self._allow_zero_copy) def get_column_by_name(self, name: str) -> _PandasColumn: - return _PandasColumn(self._df[name]) + return _PandasColumn( + self._df[name], allow_zero_copy=self._allow_zero_copy) def get_columns(self) -> Iterable[_PandasColumn]: - return [_PandasColumn(self._df[name]) for name in self._df.columns] + return [_PandasColumn(self._df[name], allow_zero_copy=self._allow_zero_copy) + for name in self._df.columns] def select_columns(self, indices: Sequence[int]) -> '_PandasDataFrame': if not isinstance(indices, collections.Sequence): @@ -552,13 +569,12 @@ def test_mixed_intfloat(): def test_noncontiguous_columns(): - # Currently raises: TBD whether it should work or not, see code comment - # where the RuntimeError is raised. + # Currently raises if the flag of allow zero copy is True. 
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) df = pd.DataFrame(arr) assert df[0].to_numpy().strides == (24,) - pytest.raises(RuntimeError, from_dataframe, df) - #df2 = from_dataframe(df) + with pytest.raises(RuntimeError): + df2 = from_dataframe(df, allow_zero_copy=True) def test_categorical_dtype(): From 04bbe1be6a7a0b25f145559a09a4e04a8a3fc73a Mon Sep 17 00:00:00 2001 From: Stephannie Jimenez Date: Thu, 24 Jun 2021 14:45:41 -0500 Subject: [PATCH 2/3] Rename keyword to allow_copy --- protocol/dataframe_protocol.py | 6 +++--- protocol/pandas_implementation.py | 34 +++++++++++++++---------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index d8f37366..b200285c 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -336,7 +336,7 @@ class DataFrame: to the dataframe interchange protocol specification. """ def __dataframe__(self, nan_as_null : bool = False, - allow_zero_copy : bool = True) -> dict: + allow_copy : bool = True) -> dict: """ Produces a dictionary object following the dataframe protocol spec @@ -345,12 +345,12 @@ def __dataframe__(self, nan_as_null : bool = False, It is intended for cases where the consumer does not support the bit mask or byte mask that is the producer's native representation. - ``allow_zero_copy`` is a keyword that defines if the given implementation + ``allow_copy`` is a keyword that defines if the given implementation is going to support striding buffers. It is optional, and the libraries do not need to implement it. 
""" self._nan_as_null = nan_as_null - self._allow_zero_zopy = allow_zero_copy + self._allow_zero_zopy = allow_copy return { "dataframe": self, # DataFrame object adhering to the protocol "version": 0 # Version number of the protocol diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 2d2c8f0e..fedb65ab 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -36,7 +36,7 @@ def from_dataframe(df : DataFrameObject, - allow_zero_copy : bool = False) -> pd.DataFrame: + allow_copy : bool = False) -> pd.DataFrame: """ Construct a pandas DataFrame from ``df`` if it supports ``__dataframe__`` """ @@ -47,7 +47,7 @@ def from_dataframe(df : DataFrameObject, if not hasattr(df, '__dataframe__'): raise ValueError("`df` does not support __dataframe__") - return _from_dataframe(df.__dataframe__(allow_zero_copy=allow_zero_copy)) + return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) def _from_dataframe(df : DataFrameObject) -> pd.DataFrame: @@ -162,7 +162,7 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series: def __dataframe__(cls, nan_as_null : bool = False, - allow_zero_copy : bool = False) -> dict: + allow_copy : bool = False) -> dict: """ The public method to attach to pd.DataFrame @@ -174,13 +174,13 @@ def __dataframe__(cls, nan_as_null : bool = False, This currently has no effect; once support for nullable extension dtypes is added, this value should be propagated to columns. - ``allow_zero_copy`` is a keyword that defines if the given implementation + ``allow_copy`` is a keyword that defines if the given implementation is going to support striding buffers. It is optional, and the libraries do not need to implement it. Currently, if the flag is set to ``True`` it will raise a ``RuntimeError``. 
""" return _PandasDataFrame( - cls, nan_as_null=nan_as_null, allow_zero_copy=allow_zero_copy) + cls, nan_as_null=nan_as_null, allow_copy=allow_copy) # Monkeypatch the Pandas DataFrame class to support the interchange protocol @@ -195,11 +195,11 @@ class _PandasBuffer: Data in the buffer is guaranteed to be contiguous in memory. """ - def __init__(self, x : np.ndarray, allow_zero_copy : bool = False) -> None: + def __init__(self, x : np.ndarray, allow_copy : bool = False) -> None: """ Handle only regular columns (= numpy arrays) for now. """ - if allow_zero_copy: + if allow_copy: # Array is not contiguous and strided buffers do not need to be # supported. It brings some extra complexity for libraries that # don't support it (e.g. Arrow). @@ -260,7 +260,7 @@ class _PandasColumn: """ def __init__(self, column : pd.Series, - allow_zero_copy : bool = False) -> None: + allow_copy : bool = False) -> None: """ Note: doesn't deal with extension arrays yet, just assume a regular Series/ndarray for now. @@ -271,7 +271,7 @@ def __init__(self, column : pd.Series, # Store the column as a private attribute self._col = column - self._allow_zero_copy = allow_zero_copy + self._allow_copy = allow_copy @property def size(self) -> int: @@ -457,12 +457,12 @@ def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype _k = _DtypeKind if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): buffer = _PandasBuffer( - self._col.to_numpy(), allow_zero_copy=self._allow_zero_copy) + self._col.to_numpy(), allow_copy=self._allow_copy) dtype = self.dtype elif self.dtype[0] == _k.CATEGORICAL: codes = self._col.values.codes buffer = _PandasBuffer( - codes, allow_zero_copy=self._allow_zero_copy) + codes, allow_copy=self._allow_copy) dtype = self._dtype_from_pandasdtype(codes.dtype) else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") @@ -496,7 +496,7 @@ class _PandasDataFrame: attributes defined on this class. 
""" def __init__(self, df : pd.DataFrame, nan_as_null : bool = False, - allow_zero_copy : bool = False) -> None: + allow_copy : bool = False) -> None: """ Constructor - an instance of this (private) class is returned from `pd.DataFrame.__dataframe__`. @@ -507,7 +507,7 @@ def __init__(self, df : pd.DataFrame, nan_as_null : bool = False, # This currently has no effect; once support for nullable extension # dtypes is added, this value should be propagated to columns. self._nan_as_null = nan_as_null - self._allow_zero_copy = allow_zero_copy + self._allow_copy = allow_copy def num_columns(self) -> int: return len(self._df.columns) @@ -523,14 +523,14 @@ def column_names(self) -> Iterable[str]: def get_column(self, i: int) -> _PandasColumn: return _PandasColumn( - self._df.iloc[:, i], allow_zero_copy=self._allow_zero_copy) + self._df.iloc[:, i], allow_copy=self._allow_copy) def get_column_by_name(self, name: str) -> _PandasColumn: return _PandasColumn( - self._df[name], allow_zero_copy=self._allow_zero_copy) + self._df[name], allow_copy=self._allow_copy) def get_columns(self) -> Iterable[_PandasColumn]: - return [_PandasColumn(self._df[name], allow_zero_copy=self._allow_zero_copy) + return [_PandasColumn(self._df[name], allow_copy=self._allow_copy) for name in self._df.columns] def select_columns(self, indices: Sequence[int]) -> '_PandasDataFrame': @@ -574,7 +574,7 @@ def test_noncontiguous_columns(): df = pd.DataFrame(arr) assert df[0].to_numpy().strides == (24,) with pytest.raises(RuntimeError): - df2 = from_dataframe(df, allow_zero_copy=True) + df2 = from_dataframe(df, allow_copy=True) def test_categorical_dtype(): From c82e90ba14f6ac9c3d72a626cbc3e9af3c37dda7 Mon Sep 17 00:00:00 2001 From: Stephannie Jimenez Date: Fri, 25 Jun 2021 17:17:50 -0500 Subject: [PATCH 3/3] Change default value and update the requirements doc --- protocol/dataframe_protocol_summary.md | 19 +++++++++---------- protocol/pandas_implementation.py | 14 +++++++------- 2 files changed, 16 
insertions(+), 17 deletions(-) diff --git a/protocol/dataframe_protocol_summary.md b/protocol/dataframe_protocol_summary.md index 91583bc4..57152b12 100644 --- a/protocol/dataframe_protocol_summary.md +++ b/protocol/dataframe_protocol_summary.md @@ -40,8 +40,8 @@ libraries, the example above can change to: def get_df_module(df): """Utility function to support programming against a dataframe API""" if hasattr(df, '__dataframe_namespace__'): - # Retrieve the namespace - pdx = df.__dataframe_namespace__() + # Retrieve the namespace + pdx = df.__dataframe_namespace__() else: # Here we can raise an exception if we only want to support compliant dataframes, # or convert to our default choice of dataframe if we want to accept (e.g.) dicts @@ -168,13 +168,12 @@ We'll also list some things that were discussed but are not requirements: 3. Extension dtypes, i.e. a way to extend the set of dtypes that is explicitly support, are out of scope. _Rationale: complex to support, not used enough to justify that complexity._ -4. "virtual columns", i.e. columns for which the data is not yet in memory - because it uses lazy evaluation, are not supported other than through - letting the producer materialize the data in memory when the consumer - calls `__dataframe__`. - _Rationale: the full dataframe API will support this use case by - "programming to an interface"; this data interchange protocol is - fundamentally built around describing data in memory_. +4. Support for strided storage in buffers. + _Rationale: this is supported by a subset of dataframes only, mainly those + that use NumPy arrays. 
In many real-world use cases, strided arrays will + force a copy at some point, so requiring contiguous memory layout (and hence + an extra copy at the moment `__dataframe__` is used) is considered a good + trade-off for reduced implementation complexity._ ### To be decided @@ -245,7 +244,7 @@ library that implements `__array__` must depend (optionally at least) on NumPy, and call a NumPy `ndarray` constructor itself from within `__array__`. -### What is wrong with `.to_numpy?` and `.to_arrow()`? +### What is wrong with `.to_numpy()` and `.to_arrow()`? Such methods ask the object it is attached to to turn itself into a NumPy or Arrow array. Which means each library must have at least an optional diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index fedb65ab..867f86dd 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -36,7 +36,7 @@ def from_dataframe(df : DataFrameObject, - allow_copy : bool = False) -> pd.DataFrame: + allow_copy : bool = True) -> pd.DataFrame: """ Construct a pandas DataFrame from ``df`` if it supports ``__dataframe__`` """ @@ -162,7 +162,7 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series: def __dataframe__(cls, nan_as_null : bool = False, - allow_copy : bool = False) -> dict: + allow_copy : bool = True) -> dict: """ The public method to attach to pd.DataFrame @@ -195,11 +195,11 @@ class _PandasBuffer: Data in the buffer is guaranteed to be contiguous in memory. """ - def __init__(self, x : np.ndarray, allow_copy : bool = False) -> None: + def __init__(self, x : np.ndarray, allow_copy : bool = True) -> None: """ Handle only regular columns (= numpy arrays) for now. """ - if allow_copy: + if not allow_copy and x.strides != (x.dtype.itemsize,): # Array is not contiguous and strided buffers do not need to be # supported. It brings some extra complexity for libraries that # don't support it (e.g. Arrow). 
@@ -260,7 +260,7 @@ class _PandasColumn: """ def __init__(self, column : pd.Series, - allow_copy : bool = False) -> None: + allow_copy : bool = True) -> None: """ Note: doesn't deal with extension arrays yet, just assume a regular Series/ndarray for now. @@ -496,7 +496,7 @@ class _PandasDataFrame: attributes defined on this class. """ def __init__(self, df : pd.DataFrame, nan_as_null : bool = False, - allow_copy : bool = False) -> None: + allow_copy : bool = True) -> None: """ Constructor - an instance of this (private) class is returned from `pd.DataFrame.__dataframe__`. @@ -574,7 +574,7 @@ def test_noncontiguous_columns(): df = pd.DataFrame(arr) assert df[0].to_numpy().strides == (24,) with pytest.raises(RuntimeError): - df2 = from_dataframe(df, allow_copy=True) + df2 = from_dataframe(df, allow_copy=False) def test_categorical_dtype():