From 57616c92cde9f4b54ed2a9c91fd2dfbc7dd972b3 Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Wed, 19 Oct 2022 13:32:20 -0300 Subject: [PATCH 1/6] Add sheet to error message when reading Excel When reading Excel files with `BaseExcelReader`, is possible that `TextParser` raises `ValueError` on different situations. As the reader works on a sheet basis, including the name of the sheet that caused the exception can be useful for debugging. The implemented solution captures any `ValueError` that gets raised while parsing and reading a sheet from an Excel file, then adds the name of the sheet to the exception message and "re-raises" the exception using the `Exception.with_traceback` method. Why use `with_traceback`? This allow-us to maintin the focus on the actual line that raised the ValueError instead of directing the attention to the introduced block of code. Signed-off-by: Antonio Ossa Guerra --- pandas/io/excel/_base.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index f555e7c5f5d95..82b2e88f80efe 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -901,6 +901,10 @@ def parse( # No Data, return an empty DataFrame output[asheetname] = DataFrame() + except ValueError as err: + err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:]) + raise err.with_traceback(err.__traceback__) + if ret_dict: return output else: From 1f1f02d007bfdd4056b3641f1ed51c54e6f54e6a Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Wed, 19 Oct 2022 17:43:09 -0300 Subject: [PATCH 2/6] Test sheet name presence in `ValueError` message The test performs a `read_excel` operation on a blank Excel file while also passing `[1]` as header value. This operations results in a `ValueError`. Before the fix, the error message did not display any information about the offending sheet, making it harder to debug Excel files with several sheets. For this reason, the value of `sheet_name` was explicitly set to its default value (`None`). The test compares the `ValueError` message with the expected message that contains the offending sheet name. Signed-off-by: Antonio Ossa Guerra --- pandas/tests/io/excel/test_readers.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index fa1d6bbfd5a7e..c725325086b6e 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -655,6 +655,14 @@ def test_read_excel_blank_with_header(self, read_ext): actual = pd.read_excel("blank_with_header" + read_ext, sheet_name="Sheet1") tm.assert_frame_equal(actual, expected) + def test_value_error_message_includes_sheet_name(self, read_ext): + # GH 48706 + msg = ( + r"Passed header=\[1\], len of 1, but only 1 lines in file \(sheet: Sheet1\)" + ) + with pytest.raises(ValueError, match=msg): + pd.read_excel("blank_with_header" + read_ext, header=[1], sheet_name=None) + @pytest.mark.filterwarnings("ignore:Cell A4 is marked:UserWarning:openpyxl") def test_date_conversion_overflow(self, request, engine, read_ext): # GH 10001 : pandas.ExcelFile ignore parse_dates=False From 9acb7654be8383ad4d774e8fe20b271920f567fc Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Wed, 19 Oct 2022 22:31:24 -0300 Subject: [PATCH 3/6] Add whatsnew not on improved error message The introduced enhancement of the error message on `read_excel` is now included in the whatsnew Signed-off-by: Antonio Ossa Guerra --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 5e8677e2ae7a6..0887ae29ddf03 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -252,6 +252,7 @@ MultiIndex I/O ^^^ - Bug in :func:`read_sas` caused fragmentation of :class:`DataFrame` and raised :class:`.errors.PerformanceWarning` (:issue:`48595`) +- Improved error message in :func:`read_excel` by including the offending sheet name when a ``ValueError`` is raised while reading a file (:issue:`48706`) - Period From 4ec4f83eadec3090bfa064d06d85c890f10ddf3c Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Mon, 24 Oct 2022 10:28:58 -0300 Subject: [PATCH 4/6] Use simpler syntax for exception chaining Instead of handling traceback of the capture exception manually, this simpler syntax looks better, is clearer, and produces the same result Signed-off-by: Antonio Ossa Guerra --- pandas/io/excel/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 5bb1327c2a6bc..29b1fbb41309c 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -865,7 +865,7 @@ def parse( except ValueError as err: err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:]) - raise err.with_traceback(err.__traceback__) + raise err if ret_dict: return output From 4a75b61f1ac5876f5778aa97f4f0a6ee5b8acbb4 Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Mon, 24 Oct 2022 10:31:29 -0300 Subject: [PATCH 5/6] Add sheet name to any Exception traceback Instead of including the sheet name on the traceback when a ValueError is raised, we'll do it for any kind of Exception. In particular, this is because some of the arguments for `read_excel` are callables, and they can raise any Exception. Including the sheet name might be helpful in some of those cases, so we'll capture them too Signed-off-by: Antonio Ossa Guerra --- pandas/io/excel/_base.py | 2 +- pandas/tests/io/excel/test_readers.py | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 29b1fbb41309c..92a0326eac4e9 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -863,7 +863,7 @@ def parse( # No Data, return an empty DataFrame output[asheetname] = DataFrame() - except ValueError as err: + except Exception as err: err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:]) raise err diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 6cd7bb34a5163..497d8f83de33b 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -624,13 +624,12 @@ def test_read_excel_blank_with_header(self, read_ext): actual = pd.read_excel("blank_with_header" + read_ext, sheet_name="Sheet1") tm.assert_frame_equal(actual, expected) - def test_value_error_message_includes_sheet_name(self, read_ext): + def test_exception_message_includes_sheet_name(self, read_ext): # GH 48706 - msg = ( - r"Passed header=\[1\], len of 1, but only 1 lines in file \(sheet: Sheet1\)" - ) - with pytest.raises(ValueError, match=msg): + with pytest.raises(ValueError, match=r" \(sheet: Sheet1\)$"): pd.read_excel("blank_with_header" + read_ext, header=[1], sheet_name=None) + with pytest.raises(ZeroDivisionError, match=r" \(sheet: Sheet1\)$"): + pd.read_excel("test1" + read_ext, usecols=lambda x: 1 / 0, sheet_name=None) @pytest.mark.filterwarnings("ignore:Cell A4 is marked:UserWarning:openpyxl") def test_date_conversion_overflow(self, request, engine, read_ext): From 75c1753f5f820dd173dc2600bcea6b0476e74c84 Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Mon, 24 Oct 2022 10:35:59 -0300 Subject: [PATCH 6/6] Update whatsnew about enhanced error message The previous message only included `ValueError` as the messages that displayed the sheet name on their traceback, but that behavior is now present on any exception Signed-off-by: Antonio Ossa Guerra --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index ca1932351740d..56b4e8ae851eb 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -323,7 +323,7 @@ MultiIndex I/O ^^^ - Bug in :func:`read_sas` caused fragmentation of :class:`DataFrame` and raised :class:`.errors.PerformanceWarning` (:issue:`48595`) -- Improved error message in :func:`read_excel` by including the offending sheet name when a ``ValueError`` is raised while reading a file (:issue:`48706`) +- Improved error message in :func:`read_excel` by including the offending sheet name when an exception is raised while reading a file (:issue:`48706`) - Bug when a pickling a subset PyArrow-backed data that would serialize the entire data instead of the subset (:issue:`42600`) - Bug in :func:`read_csv` for a single-line csv with fewer columns than ``names`` raised :class:`.errors.ParserError` with ``engine="c"`` (:issue:`47566`) -