From 67a6dce23b686c38dedf8bc77ad999f9a3b3adb6 Mon Sep 17 00:00:00 2001 From: "C.A.M. Gerlach" Date: Wed, 16 Aug 2023 16:55:00 -0500 Subject: [PATCH 1/5] Only show GitHub check annotations on changed doc paragraphs --- .github/workflows/reusable-docs.yml | 11 +- Doc/tools/check-warnings.py | 182 +++++++++++++++++++++++++--- 2 files changed, 165 insertions(+), 28 deletions(-) diff --git a/.github/workflows/reusable-docs.yml b/.github/workflows/reusable-docs.yml index 39e5ad62924ad3..b0f608c29afe41 100644 --- a/.github/workflows/reusable-docs.yml +++ b/.github/workflows/reusable-docs.yml @@ -18,6 +18,8 @@ jobs: timeout-minutes: 60 steps: - uses: actions/checkout@v3 + with: + fetch-depth: 0 - name: 'Set up Python' uses: actions/setup-python@v4 with: @@ -28,13 +30,6 @@ jobs: run: make -C Doc/ venv # To annotate PRs with Sphinx nitpicks (missing references) - - name: 'Get list of changed files' - if: github.event_name == 'pull_request' - id: changed_files - uses: Ana06/get-changed-files@v2.2.0 - with: - filter: "Doc/**" - format: csv # works for paths with spaces - name: 'Build HTML documentation' continue-on-error: true run: | @@ -45,7 +40,7 @@ jobs: if: github.event_name == 'pull_request' run: | python Doc/tools/check-warnings.py \ - --check-and-annotate '${{ steps.changed_files.outputs.added_modified }}' \ + --check-and-annotate-changed 'origin/${{ github.base_ref }}' '${{ github.sha }}' \ --fail-if-regression \ --fail-if-improved diff --git a/Doc/tools/check-warnings.py b/Doc/tools/check-warnings.py index c17d0f51cd1272..f07357e8586c19 100644 --- a/Doc/tools/check-warnings.py +++ b/Doc/tools/check-warnings.py @@ -3,11 +3,13 @@ Check the output of running Sphinx in nit-picky mode (missing references). """ import argparse -import csv +import itertools import os import re +import subprocess import sys from pathlib import Path +from typing import TextIO # Exclude these whether they're dirty or clean, # because they trigger a rebuild of dirty files. @@ -24,28 +26,167 @@ "venv", } -PATTERN = re.compile(r"(?P[^:]+):(?P\d+): WARNING: (?P.+)") +# Regex pattern to match the parts of a Sphinx warning +WARNING_PATTERN = re.compile( + r"(?P([A-Za-z]:[\\/])?[^:]+):(?P\d+): WARNING: (?P.+)" +) +# Regex pattern to match the line numbers in a Git unified diff +DIFF_PATTERN = re.compile( + r"^@@ -(?P\d+)(?:,(?P\d+))? \+(?P\d+)(?:,(?P\d+))? @@", + flags=re.MULTILINE, +) -def check_and_annotate(warnings: list[str], files_to_check: str) -> None: + +def get_diff_files(ref_a: str, ref_b: str, filter_mode: str = "") -> set[Path]: + """List the files changed between two Gif refs, filtered by change type.""" + added_files_result = subprocess.run( + [ + "git", + "diff", + f"--diff-filter={filter_mode}", + "--name-only", + f"{ref_a}...{ref_b}", + "--", + ], + stdout=subprocess.PIPE, + check=True, + text=True, + encoding="UTF-8", + ) + + added_files = added_files_result.stdout.strip().split("\n") + return {Path(file.strip()) for file in added_files if file.strip()} + + +def get_diff_lines(ref_a: str, ref_b: str, file: Path) -> list[int]: + """List the lines changed between two Gif refs for a specific file.""" + diff_output = subprocess.run( + [ + "git", + "diff", + "--unified=0", + f"{ref_a}...{ref_b}", + "--", + str(file), + ], + stdout=subprocess.PIPE, + check=True, + text=True, + encoding="UTF-8", + ) + + # Scrape line offsets + lengths from diff and convert to line numbers + line_matches = DIFF_PATTERN.finditer(diff_output.stdout) + line_match_values = [ + line_match.groupdict(default=1) for line_match in line_matches + ] + line_ints = [ + (int(match_value["lineb"]), int(match_value["added"])) + for match_value in line_match_values + ] + line_ranges = [ + range(line_b, line_b + added) for line_b, added in line_ints + ] + line_numbers = list(itertools.chain(*line_ranges)) + + return line_numbers + + +def get_para_line_numbers(file_obj: TextIO) -> list[list[int]]: + """Get the line numbers of text in a file object, grouped by paragraph.""" + paragraphs = [] + prev_line = None + for lineno, line in enumerate(file_obj): + lineno = lineno + 1 + if prev_line is None or (line.strip() and not prev_line.strip()): + paragraph = [lineno - 1] + paragraphs.append(paragraph) + paragraph.append(lineno) + prev_line = line + return paragraphs + + +def parse_and_filter_warnings( + warnings: list[str], files: set[Path] +) -> list[re.Match[str]]: + """Get the warnings matching passed files and parse them with regex.""" + filtered_warnings = [ + warning + for warning in warnings + if any(str(file) in warning for file in files) + ] + warning_matches = [ + WARNING_PATTERN.fullmatch(warning.strip()) + for warning in filtered_warnings + ] + non_null_matches = [warning for warning in warning_matches if warning] + return non_null_matches + + +def process_touched_warnings( + warnings: list[str], ref_a: str, ref_b: str +) -> list[re.Match[str]]: + """Filter a list of Sphinx warnings to those affecting touched lines.""" + added_files, modified_files = tuple( + get_diff_files(ref_a, ref_b, filter_mode=mode) for mode in ("A", "M") + ) + + warnings_added = parse_and_filter_warnings(warnings, added_files) + warnings_modified = parse_and_filter_warnings(warnings, modified_files) + + modified_files_warned = { + file + for file in modified_files + if any(str(file) in warning["file"] for warning in warnings_modified) + } + + warnings_touched = warnings_added.copy() + for file in modified_files_warned: + diff_lines = get_diff_lines(ref_a, ref_b, file) + with file.open(encoding="UTF-8") as file_obj: + paragraphs = get_para_line_numbers(file_obj) + touched_paras = [ + para_lines + for para_lines in paragraphs + if set(diff_lines) & set(para_lines) + ] + touched_para_lines = set(itertools.chain(*touched_paras)) + warnings_infile = [ + warning + for warning in warnings_modified + if str(file) in warning["file"] + ] + warnings_touched += [ + warning + for warning in warnings_infile + if int(warning["line"]) in touched_para_lines + ] + + return warnings_touched + + +def check_and_annotate_changed( + warnings: list[str], ref_a: str = "main", ref_b: str = "HEAD" +) -> None: """ - Convert Sphinx warning messages to GitHub Actions. + Convert Sphinx warning messages to GitHub Actions for changed paragraphs. Converts lines like: .../Doc/library/cgi.rst:98: WARNING: reference target not found to: ::warning file=.../Doc/library/cgi.rst,line=98::reference target not found - Non-matching lines are echoed unchanged. - - see: + See: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-a-warning-message """ - files_to_check = next(csv.reader([files_to_check])) - for warning in warnings: - if any(filename in warning for filename in files_to_check): - if match := PATTERN.fullmatch(warning): - print("::warning file={file},line={line}::{msg}".format_map(match)) + warnings_touched = process_touched_warnings(warnings, ref_a, ref_b) + print("Emitting doc warnings matching modified lines:") + for warning in warnings_touched: + print(warning[0]) + print("::warning file={file},line={line}::{msg}".format_map(warning)) + if not warnings_touched: + print("None") def fail_if_regression( @@ -68,7 +209,7 @@ def fail_if_regression( print(filename) for warning in warnings: if filename in warning: - if match := PATTERN.fullmatch(warning): + if match := WARNING_PATTERN.fullmatch(warning): print(" {line}: {msg}".format_map(match)) return -1 return 0 @@ -94,9 +235,10 @@ def fail_if_improved( def main() -> int: parser = argparse.ArgumentParser() parser.add_argument( - "--check-and-annotate", - help="Comma-separated list of files to check, " - "and annotate those with warnings on GitHub Actions", + "--check-and-annotate-changed", + nargs=2, + help="Annotate lines changed between two refs " + "with warnings on GitHub Actions", ) parser.add_argument( "--fail-if-regression", @@ -114,7 +256,7 @@ def main() -> int: wrong_directory_msg = "Must run this script from the repo root" assert Path("Doc").exists() and Path("Doc").is_dir(), wrong_directory_msg - with Path("Doc/sphinx-warnings.txt").open() as f: + with Path("Doc/sphinx-warnings.txt").open(encoding="UTF-8") as f: warnings = f.read().splitlines() cwd = str(Path.cwd()) + os.path.sep @@ -124,15 +266,15 @@ def main() -> int: if "Doc/" in warning } - with Path("Doc/tools/.nitignore").open() as clean_files: + with Path("Doc/tools/.nitignore").open(encoding="UTF-8") as clean_files: files_with_expected_nits = { filename.strip() for filename in clean_files if filename.strip() and not filename.startswith("#") } - if args.check_and_annotate: - check_and_annotate(warnings, args.check_and_annotate) + if args.check_and_annotate_changed: + check_and_annotate_changed(warnings, *args.check_and_annotate_changed) if args.fail_if_regression: exit_code += fail_if_regression( From 887ca3f7c21bfe09f8a95e2872eacfd2f9709045 Mon Sep 17 00:00:00 2001 From: "C.A.M. Gerlach" Date: Thu, 17 Aug 2023 11:26:17 -0500 Subject: [PATCH 2/5] Improve check-warnings script arg parsing following Hugo's suggestions Co-authored-by: Hugo van Kemenade --- .github/workflows/reusable-docs.yml | 2 +- Doc/tools/check-warnings.py | 31 ++++++++++++++++++----------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/.github/workflows/reusable-docs.yml b/.github/workflows/reusable-docs.yml index b0f608c29afe41..fdc5f9eb3cc604 100644 --- a/.github/workflows/reusable-docs.yml +++ b/.github/workflows/reusable-docs.yml @@ -40,7 +40,7 @@ jobs: if: github.event_name == 'pull_request' run: | python Doc/tools/check-warnings.py \ - --check-and-annotate-changed 'origin/${{ github.base_ref }}' '${{ github.sha }}' \ + --annotate-diff 'origin/${{ github.base_ref }}' '${{ github.sha }}' \ --fail-if-regression \ --fail-if-improved diff --git a/Doc/tools/check-warnings.py b/Doc/tools/check-warnings.py index f07357e8586c19..91fb9be31db088 100644 --- a/Doc/tools/check-warnings.py +++ b/Doc/tools/check-warnings.py @@ -39,7 +39,7 @@ def get_diff_files(ref_a: str, ref_b: str, filter_mode: str = "") -> set[Path]: - """List the files changed between two Gif refs, filtered by change type.""" + """List the files changed between two Git refs, filtered by change type.""" added_files_result = subprocess.run( [ "git", @@ -60,7 +60,7 @@ def get_diff_files(ref_a: str, ref_b: str, filter_mode: str = "") -> set[Path]: def get_diff_lines(ref_a: str, ref_b: str, file: Path) -> list[int]: - """List the lines changed between two Gif refs for a specific file.""" + """List the lines changed between two Git refs for a specific file.""" diff_output = subprocess.run( [ "git", @@ -166,7 +166,7 @@ def process_touched_warnings( return warnings_touched -def check_and_annotate_changed( +def annotate_diff( warnings: list[str], ref_a: str = "main", ref_b: str = "HEAD" ) -> None: """ @@ -183,8 +183,8 @@ def check_and_annotate_changed( warnings_touched = process_touched_warnings(warnings, ref_a, ref_b) print("Emitting doc warnings matching modified lines:") for warning in warnings_touched: - print(warning[0]) print("::warning file={file},line={line}::{msg}".format_map(warning)) + print(warning[0]) if not warnings_touched: print("None") @@ -232,13 +232,14 @@ def fail_if_improved( return 0 -def main() -> int: +def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser() parser.add_argument( - "--check-and-annotate-changed", - nargs=2, - help="Annotate lines changed between two refs " - "with warnings on GitHub Actions", + "--annotate-diff", + nargs="*", + metavar=("BASE_REF", "HEAD_REF"), + help="Add GitHub Actions annotations on the diff for warnings on " + "lines changed between the given refs (main and HEAD, by default)", ) parser.add_argument( "--fail-if-regression", @@ -250,7 +251,13 @@ def main() -> int: action="store_true", help="Fail if new files with no nits are found", ) - args = parser.parse_args() + + args = parser.parse_args(argv) + if args.annotate_diff is not None and len(args.annotate_diff) > 2: + parser.error( + "--annotate-diff takes between 0 and 2 ref args, not " + f"{len(args.annotate_diff)} {tuple(args.annotate_diff)}" + ) exit_code = 0 wrong_directory_msg = "Must run this script from the repo root" @@ -273,8 +280,8 @@ def main() -> int: if filename.strip() and not filename.startswith("#") } - if args.check_and_annotate_changed: - check_and_annotate_changed(warnings, *args.check_and_annotate_changed) + if args.annotate_diff is not None: + annotate_diff(warnings, *args.annotate_diff) if args.fail_if_regression: exit_code += fail_if_regression( From 70459ca8a0682c536f14bf37cf691ce9850da498 Mon Sep 17 00:00:00 2001 From: "C.A.M. Gerlach" Date: Thu, 17 Aug 2023 12:34:07 -0500 Subject: [PATCH 3/5] Factor filtering warnings by modified diffs into helper function --- Doc/tools/check-warnings.py | 58 ++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/Doc/tools/check-warnings.py b/Doc/tools/check-warnings.py index 91fb9be31db088..4a5ce358b1d6f0 100644 --- a/Doc/tools/check-warnings.py +++ b/Doc/tools/check-warnings.py @@ -107,7 +107,7 @@ def get_para_line_numbers(file_obj: TextIO) -> list[list[int]]: return paragraphs -def parse_and_filter_warnings( +def filter_and_parse_warnings( warnings: list[str], files: set[Path] ) -> list[re.Match[str]]: """Get the warnings matching passed files and parse them with regex.""" @@ -124,6 +124,30 @@ def parse_and_filter_warnings( return non_null_matches +def filter_warnings_by_diff( + warnings: list[re.Match[str]], ref_a: str, ref_b: str, file: Path +) -> list[re.Match[str]]: + """Filter the passed per-file warnings to just those on changed lines.""" + diff_lines = get_diff_lines(ref_a, ref_b, file) + with file.open(encoding="UTF-8") as file_obj: + paragraphs = get_para_line_numbers(file_obj) + touched_paras = [ + para_lines + for para_lines in paragraphs + if set(diff_lines) & set(para_lines) + ] + touched_para_lines = set(itertools.chain(*touched_paras)) + warnings_infile = [ + warning for warning in warnings if str(file) in warning["file"] + ] + warnings_touched = [ + warning + for warning in warnings_infile + if int(warning["line"]) in touched_para_lines + ] + return warnings_touched + + def process_touched_warnings( warnings: list[str], ref_a: str, ref_b: str ) -> list[re.Match[str]]: @@ -132,8 +156,8 @@ def process_touched_warnings( get_diff_files(ref_a, ref_b, filter_mode=mode) for mode in ("A", "M") ) - warnings_added = parse_and_filter_warnings(warnings, added_files) - warnings_modified = parse_and_filter_warnings(warnings, modified_files) + warnings_added = filter_and_parse_warnings(warnings, added_files) + warnings_modified = filter_and_parse_warnings(warnings, modified_files) modified_files_warned = { file @@ -141,27 +165,13 @@ def process_touched_warnings( if any(str(file) in warning["file"] for warning in warnings_modified) } - warnings_touched = warnings_added.copy() - for file in modified_files_warned: - diff_lines = get_diff_lines(ref_a, ref_b, file) - with file.open(encoding="UTF-8") as file_obj: - paragraphs = get_para_line_numbers(file_obj) - touched_paras = [ - para_lines - for para_lines in paragraphs - if set(diff_lines) & set(para_lines) - ] - touched_para_lines = set(itertools.chain(*touched_paras)) - warnings_infile = [ - warning - for warning in warnings_modified - if str(file) in warning["file"] - ] - warnings_touched += [ - warning - for warning in warnings_infile - if int(warning["line"]) in touched_para_lines - ] + warnings_modified_touched = [ + filter_warnings_by_diff(warnings_modified, ref_a, ref_b, file) + for file in modified_files_warned + ] + warnings_touched = warnings_added + list( + itertools.chain(*warnings_modified_touched) + ) return warnings_touched From d8344a9dc9f2ad6116d6d24877dc75ad9a2c6362 Mon Sep 17 00:00:00 2001 From: "C.A.M. Gerlach" Date: Thu, 17 Aug 2023 22:42:28 -0500 Subject: [PATCH 4/5] Build docs on unmerged branch so warning lines match & avoid deep clone --- .github/workflows/reusable-docs.yml | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/.github/workflows/reusable-docs.yml b/.github/workflows/reusable-docs.yml index fdc5f9eb3cc604..6150b1a7d416a3 100644 --- a/.github/workflows/reusable-docs.yml +++ b/.github/workflows/reusable-docs.yml @@ -16,10 +16,30 @@ jobs: name: 'Docs' runs-on: ubuntu-latest timeout-minutes: 60 + env: + branch_base: 'origin/${{ github.event.pull_request.base.ref }}' + branch_pr: 'origin/${{ github.event.pull_request.head.ref }}' + refspec_base: '+${{ github.event.pull_request.base.sha }}:remotes/origin/${{ github.event.pull_request.base.ref }}' + refspec_pr: '+${{ github.event.pull_request.head.sha }}:remotes/origin/${{ github.event.pull_request.head.ref }}' steps: - - uses: actions/checkout@v3 + - name: 'Check out latest PR branch commit' + uses: actions/checkout@v3 with: - fetch-depth: 0 + ref: ${{ github.event.pull_request.head.sha }} + # Adapted from https://github.com/actions/checkout/issues/520#issuecomment-1167205721 + - name: 'Fetch commits to get branch diff' + run: | + # Fetch enough history to find a common ancestor commit (aka merge-base): + git fetch origin ${{ env.refspec_pr }} --depth=$(( ${{ github.event.pull_request.commits }} + 1 )) \ + --no-tags --prune --no-recurse-submodules + + # This should get the oldest commit in the local fetched history (which may not be the commit the PR branched from): + COMMON_ANCESTOR=$( git rev-list --first-parent --max-parents=0 --max-count=1 ${{ env.branch_pr }} ) + DATE=$( git log --date=iso8601 --format=%cd "${COMMON_ANCESTOR}" ) + + # Get all commits since that commit date from the base branch (eg: master or main): + git fetch origin ${{ env.refspec_base }} --shallow-since="${DATE}" \ + --no-tags --prune --no-recurse-submodules - name: 'Set up Python' uses: actions/setup-python@v4 with: @@ -40,7 +60,7 @@ jobs: if: github.event_name == 'pull_request' run: | python Doc/tools/check-warnings.py \ - --annotate-diff 'origin/${{ github.base_ref }}' '${{ github.sha }}' \ + --annotate-diff '${{ env.branch_base }}' '${{ env.branch_pr }}' \ --fail-if-regression \ --fail-if-improved From b006d01508860ff0f6f0da8751ec5243183d519e Mon Sep 17 00:00:00 2001 From: "C.A.M. Gerlach" Date: Fri, 18 Aug 2023 00:11:11 -0500 Subject: [PATCH 5/5] Apply a few final review suggestions from Adam Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com> --- Doc/tools/check-warnings.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Doc/tools/check-warnings.py b/Doc/tools/check-warnings.py index 4a5ce358b1d6f0..809a8d63087e12 100644 --- a/Doc/tools/check-warnings.py +++ b/Doc/tools/check-warnings.py @@ -2,6 +2,8 @@ """ Check the output of running Sphinx in nit-picky mode (missing references). """ +from __future__ import annotations + import argparse import itertools import os @@ -78,6 +80,7 @@ def get_diff_lines(ref_a: str, ref_b: str, file: Path) -> list[int]: # Scrape line offsets + lengths from diff and convert to line numbers line_matches = DIFF_PATTERN.finditer(diff_output.stdout) + # Removed and added line counts are 1 if not printed line_match_values = [ line_match.groupdict(default=1) for line_match in line_matches ]