diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 6fc1ec9c6ff90..2f5c2bb36f405 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -995,6 +995,7 @@ Reshaping - Bug in :func:`get_dummies` that selected object and categorical dtypes but not string (:issue:`44965`) - Bug in :meth:`DataFrame.align` when aligning a :class:`MultiIndex` to a :class:`Series` with another :class:`MultiIndex` (:issue:`46001`) - Bug in concatenation with ``IntegerDtype``, or ``FloatingDtype`` arrays where the resulting dtype did not mirror the behavior of the non-nullable dtypes (:issue:`46379`) +- Bug in :func:`concat` losing dtype of columns when ``join="outer"`` and ``sort=True`` (:issue:`47329`) - Bug in :func:`concat` not sorting the column names when ``None`` is included (:issue:`47331`) - Bug in :func:`concat` with identical key leads to error when indexing :class:`MultiIndex` (:issue:`46519`) - Bug in :meth:`DataFrame.join` with a list when using suffixes to join DataFrames with duplicate column names (:issue:`46396`) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index a9e24bd64922b..6820b7b5360a5 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -11,6 +11,7 @@ ) from pandas.errors import InvalidIndexError +from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import is_dtype_equal from pandas.core.algorithms import safe_sort @@ -223,7 +224,7 @@ def union_indexes(indexes, sort: bool | None = True) -> Index: indexes, kind = _sanitize_and_check(indexes) - def _unique_indices(inds) -> Index: + def _unique_indices(inds, dtype) -> Index: """ Convert indexes to lists and concatenate them, removing duplicates. 
@@ -243,7 +244,30 @@ def conv(i): i = i.tolist() return i - return Index(lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort)) + return Index( + lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort), + dtype=dtype, + ) + + def _find_common_index_dtype(inds): + """ + Finds a common type for the indexes to pass through to resulting index. + + Parameters + ---------- + inds : list of Index or list objects + + Returns + ------- + The common dtype, or None if ``inds`` contains no Index objects + """ + dtypes = [idx.dtype for idx in inds if isinstance(idx, Index)] + if dtypes: + dtype = find_common_type(dtypes) + else: + dtype = None + + return dtype if kind == "special": result = indexes[0] @@ -283,16 +307,18 @@ def conv(i): return result elif kind == "array": + dtype = _find_common_index_dtype(indexes) index = indexes[0] if not all(index.equals(other) for other in indexes[1:]): - index = _unique_indices(indexes) + index = _unique_indices(indexes, dtype) name = get_unanimous_names(*indexes)[0] if name != index.name: index = index.rename(name) return index else: # kind='list' - return _unique_indices(indexes) + dtype = _find_common_index_dtype(indexes) + return _unique_indices(indexes, dtype) def _sanitize_and_check(indexes): diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index 74a3e93c32ebe..66382eb0e95a9 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -398,3 +398,14 @@ def test_concat_range_index_result(self): tm.assert_frame_equal(result, expected) expected_index = pd.RangeIndex(0, 2) tm.assert_index_equal(result.index, expected_index, exact=True) + + @pytest.mark.parametrize("dtype", ["Int64", "object"]) + def test_concat_index_keep_dtype(self, dtype): + # GH#47329 + df1 = DataFrame([[0, 1, 1]], columns=Index([1, 2, 3], dtype=dtype)) + df2 = DataFrame([[0, 1]], columns=Index([1, 2], dtype=dtype)) + result = concat([df1, df2], ignore_index=True, 
join="outer", sort=True) + expected = DataFrame( + [[0, 1, 1.0], [0, 1, np.nan]], columns=Index([1, 2, 3], dtype=dtype) + ) + tm.assert_frame_equal(result, expected)