From b9fd585f0e25c68f832359abb09ad9e19e242bcc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 25 Mar 2023 13:12:05 -0400 Subject: [PATCH 1/4] ENH: Add dtype of categories to repr of CategoricalDtype --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/dtypes/dtypes.py | 8 +++++++- pandas/tests/dtypes/test_dtypes.py | 19 +++++++++++++++++-- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 1f5c3c88c5ff5..6a15c052cca5c 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -37,6 +37,7 @@ Other enhancements - Improve error message when having incompatible columns using :meth:`DataFrame.merge` (:issue:`51861`) - Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns. (:issue:`52084`) - :meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`). +- Add dtype of categories to display information of :class:`CategoricalDtype` (:issue:`52179`) .. --------------------------------------------------------------------------- .. _whatsnew_210.notable_bug_fixes: diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 54f6d84c8dc2a..5b7ff8aaf74ad 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -429,13 +429,19 @@ def __eq__(self, other: Any) -> bool: def __repr__(self) -> str_type: if self.categories is None: data = "None" + dtype = "None" else: data = self.categories._format_data(name=type(self).__name__) if data is None: # self.categories is RangeIndex data = str(self.categories._range) data = data.rstrip(", ") - return f"CategoricalDtype(categories={data}, ordered={self.ordered})" + dtype = self.categories.dtype + + return ( + f"CategoricalDtype(categories={data}, ordered={self.ordered}, " + f"categories_dtype={dtype})" + ) @cache_readonly def _hash_categories(self) -> int: diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 9057d91b1960a..3841e626ed05e 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -211,7 +211,10 @@ def test_repr_range_categories(self): dtype = CategoricalDtype(categories=rng, ordered=False) result = repr(dtype) - expected = "CategoricalDtype(categories=range(0, 3), ordered=False)" + expected = ( + "CategoricalDtype(categories=range(0, 3), ordered=False, " + "categories_dtype=int64)" + ) assert result == expected def test_update_dtype(self): @@ -220,6 +223,15 @@ def test_update_dtype(self): expected = CategoricalDtype(["b"], ordered=True) assert result == expected + def test_repr(self): + cat = Categorical(pd.Index([1, 2, 3], dtype="int32")) + result = cat.dtype.__repr__() + expected = ( + "CategoricalDtype(categories=[1, 2, 3], ordered=False, " + "categories_dtype=int32)" + ) + assert result == expected + class TestDatetimeTZDtype(Base): @pytest.fixture @@ -980,7 +992,10 @@ def test_str_vs_repr(self, ordered): c1 = CategoricalDtype(["a", "b"], ordered=ordered) assert str(c1) == "category" # Py2 will have unicode prefixes - pat = r"CategoricalDtype\(categories=\[.*\], ordered={ordered}\)" + pat = ( + r"CategoricalDtype\(categories=\[.*\], ordered={ordered}, " + r"categories_dtype=object\)" + ) assert re.match(pat.format(ordered=ordered), repr(c1)) def test_categorical_categories(self): From 608927ec17adba28c80bbcd5f09f225c1803197f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 25 Mar 2023 23:46:16 -0400 Subject: [PATCH 2/4] Fix --- pandas/tests/util/test_assert_index_equal.py | 5 +++-- pandas/tests/util/test_assert_series_equal.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index f7d41ed536a40..eff83282de182 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -209,9 +209,10 @@ def test_index_equal_category_mismatch(check_categorical): msg = """Index are different Attribute "dtype" are different -\\[left\\]: CategoricalDtype\\(categories=\\['a', 'b'\\], ordered=False\\) +\\[left\\]: CategoricalDtype\\(categories=\\['a', 'b'\\], ordered=False, \ +categories_dtype=object\\) \\[right\\]: CategoricalDtype\\(categories=\\['a', 'b', 'c'\\], \ -ordered=False\\)""" +ordered=False, categories_dtype=object\\)""" idx1 = Index(Categorical(["a", "b"])) idx2 = Index(Categorical(["a", "b"], categories=["a", "b", "c"])) diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 835f710842cc0..dd28773f08cc4 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -250,9 +250,10 @@ def test_series_equal_categorical_mismatch(check_categorical): msg = """Attributes of Series are different Attribute "dtype" are different -\\[left\\]: CategoricalDtype\\(categories=\\['a', 'b'\\], ordered=False\\) +\\[left\\]: CategoricalDtype\\(categories=\\['a', 'b'\\], ordered=False, \ +categories_dtype=object\\) \\[right\\]: CategoricalDtype\\(categories=\\['a', 'b', 'c'\\], \ -ordered=False\\)""" +ordered=False, categories_dtype=object\\)""" s1 = Series(Categorical(["a", "b"])) s2 = Series(Categorical(["a", "b"], categories=list("abc"))) From abdaeeee399eedfd7f6310bc3dc0ef585fd47dd5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 29 Mar 2023 10:15:40 -0400 Subject: [PATCH 3/4] Update doc/source/whatsnew/v2.1.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 6a15c052cca5c..15f7a2056f41d 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -37,7 +37,7 @@ Other enhancements - Improve error message when having incompatible columns using :meth:`DataFrame.merge` (:issue:`51861`) - Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns. (:issue:`52084`) - :meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`). -- Add dtype of categories to display information of :class:`CategoricalDtype` (:issue:`52179`) +- Add dtype of categories to ``repr`` information of :class:`CategoricalDtype` (:issue:`52179`) .. --------------------------------------------------------------------------- .. _whatsnew_210.notable_bug_fixes: From 93db4d5bfc5be8b50f416028f938af9faef7fb15 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 29 Mar 2023 10:21:22 -0400 Subject: [PATCH 4/4] Fix docs --- pandas/core/dtypes/dtypes.py | 6 +++--- pandas/io/json/_table_schema.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 5b7ff8aaf74ad..422302dc2fed7 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -253,11 +253,11 @@ def _from_values_or_dtype( Examples -------- >>> pd.CategoricalDtype._from_values_or_dtype() - CategoricalDtype(categories=None, ordered=None) + CategoricalDtype(categories=None, ordered=None, categories_dtype=None) >>> pd.CategoricalDtype._from_values_or_dtype( ... categories=['a', 'b'], ordered=True ... ) - CategoricalDtype(categories=['a', 'b'], ordered=True) + CategoricalDtype(categories=['a', 'b'], ordered=True, categories_dtype=object) >>> dtype1 = pd.CategoricalDtype(['a', 'b'], ordered=True) >>> dtype2 = pd.CategoricalDtype(['x', 'y'], ordered=False) >>> c = pd.Categorical([0, 1], dtype=dtype1, fastpath=True) @@ -272,7 +272,7 @@ def _from_values_or_dtype( The supplied dtype takes precedence over values' dtype: >>> pd.CategoricalDtype._from_values_or_dtype(c, dtype=dtype2) - CategoricalDtype(categories=['x', 'y'], ordered=False) + CategoricalDtype(categories=['x', 'y'], ordered=False, categories_dtype=object) """ if dtype is not None: diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 35ea4dc911fa8..41a969839c9bd 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -181,7 +181,7 @@ def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype: ... "ordered": True, ... } ... ) - CategoricalDtype(categories=['a', 'b', 'c'], ordered=True) + CategoricalDtype(categories=['a', 'b', 'c'], ordered=True, categories_dtype=object) >>> convert_json_field_to_pandas_type({"name": "a_datetime", "type": "datetime"}) 'datetime64[ns]'