Skip to content

ENH: Add dtype of categories to repr of CategoricalDtype #52202

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Mar 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ Other enhancements
- Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns. (:issue:`52084`)
- :meth:`DataFrame.applymap` now uses the :meth:`~api.extensions.ExtensionArray.map` method of underlying :class:`api.extensions.ExtensionArray` instances (:issue:`52219`)
- :meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`).
- Added the dtype of the categories to the ``repr`` of :class:`CategoricalDtype` (:issue:`52179`)

.. ---------------------------------------------------------------------------
.. _whatsnew_210.notable_bug_fixes:
Expand Down
14 changes: 10 additions & 4 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,11 +253,11 @@ def _from_values_or_dtype(
Examples
--------
>>> pd.CategoricalDtype._from_values_or_dtype()
CategoricalDtype(categories=None, ordered=None)
CategoricalDtype(categories=None, ordered=None, categories_dtype=None)
>>> pd.CategoricalDtype._from_values_or_dtype(
... categories=['a', 'b'], ordered=True
... )
CategoricalDtype(categories=['a', 'b'], ordered=True)
CategoricalDtype(categories=['a', 'b'], ordered=True, categories_dtype=object)
>>> dtype1 = pd.CategoricalDtype(['a', 'b'], ordered=True)
>>> dtype2 = pd.CategoricalDtype(['x', 'y'], ordered=False)
>>> c = pd.Categorical([0, 1], dtype=dtype1, fastpath=True)
Expand All @@ -272,7 +272,7 @@ def _from_values_or_dtype(
The supplied dtype takes precedence over values' dtype:

>>> pd.CategoricalDtype._from_values_or_dtype(c, dtype=dtype2)
CategoricalDtype(categories=['x', 'y'], ordered=False)
CategoricalDtype(categories=['x', 'y'], ordered=False, categories_dtype=object)
"""

if dtype is not None:
Expand Down Expand Up @@ -429,13 +429,19 @@ def __eq__(self, other: Any) -> bool:
def __repr__(self) -> str_type:
if self.categories is None:
data = "None"
dtype = "None"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have a test that hits this branch?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. I can't recall exactly which one, but I got a failure before I added this.

else:
data = self.categories._format_data(name=type(self).__name__)
if data is None:
# self.categories is RangeIndex
data = str(self.categories._range)
data = data.rstrip(", ")
return f"CategoricalDtype(categories={data}, ordered={self.ordered})"
dtype = self.categories.dtype

return (
f"CategoricalDtype(categories={data}, ordered={self.ordered}, "
f"categories_dtype={dtype})"
)

@cache_readonly
def _hash_categories(self) -> int:
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/json/_table_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype:
... "ordered": True,
... }
... )
CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)
CategoricalDtype(categories=['a', 'b', 'c'], ordered=True, categories_dtype=object)

>>> convert_json_field_to_pandas_type({"name": "a_datetime", "type": "datetime"})
'datetime64[ns]'
Expand Down
19 changes: 17 additions & 2 deletions pandas/tests/dtypes/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,10 @@ def test_repr_range_categories(self):
dtype = CategoricalDtype(categories=rng, ordered=False)
result = repr(dtype)

expected = "CategoricalDtype(categories=range(0, 3), ordered=False)"
expected = (
"CategoricalDtype(categories=range(0, 3), ordered=False, "
"categories_dtype=int64)"
)
assert result == expected

def test_update_dtype(self):
Expand All @@ -220,6 +223,15 @@ def test_update_dtype(self):
expected = CategoricalDtype(["b"], ordered=True)
assert result == expected

def test_repr(self):
cat = Categorical(pd.Index([1, 2, 3], dtype="int32"))
result = cat.dtype.__repr__()
expected = (
"CategoricalDtype(categories=[1, 2, 3], ordered=False, "
"categories_dtype=int32)"
)
assert result == expected


class TestDatetimeTZDtype(Base):
@pytest.fixture
Expand Down Expand Up @@ -980,7 +992,10 @@ def test_str_vs_repr(self, ordered):
c1 = CategoricalDtype(["a", "b"], ordered=ordered)
assert str(c1) == "category"
# Py2 will have unicode prefixes
pat = r"CategoricalDtype\(categories=\[.*\], ordered={ordered}\)"
pat = (
r"CategoricalDtype\(categories=\[.*\], ordered={ordered}, "
r"categories_dtype=object\)"
)
assert re.match(pat.format(ordered=ordered), repr(c1))

def test_categorical_categories(self):
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/util/test_assert_index_equal.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,9 +209,10 @@ def test_index_equal_category_mismatch(check_categorical):
msg = """Index are different

Attribute "dtype" are different
\\[left\\]: CategoricalDtype\\(categories=\\['a', 'b'\\], ordered=False\\)
\\[left\\]: CategoricalDtype\\(categories=\\['a', 'b'\\], ordered=False, \
categories_dtype=object\\)
\\[right\\]: CategoricalDtype\\(categories=\\['a', 'b', 'c'\\], \
ordered=False\\)"""
ordered=False, categories_dtype=object\\)"""

idx1 = Index(Categorical(["a", "b"]))
idx2 = Index(Categorical(["a", "b"], categories=["a", "b", "c"]))
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/util/test_assert_series_equal.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,9 +250,10 @@ def test_series_equal_categorical_mismatch(check_categorical):
msg = """Attributes of Series are different

Attribute "dtype" are different
\\[left\\]: CategoricalDtype\\(categories=\\['a', 'b'\\], ordered=False\\)
\\[left\\]: CategoricalDtype\\(categories=\\['a', 'b'\\], ordered=False, \
categories_dtype=object\\)
\\[right\\]: CategoricalDtype\\(categories=\\['a', 'b', 'c'\\], \
ordered=False\\)"""
ordered=False, categories_dtype=object\\)"""

s1 = Series(Categorical(["a", "b"]))
s2 = Series(Categorical(["a", "b"], categories=list("abc")))
Expand Down