Skip to content

Recognize full-string "boundary checks" in string_boundary_linter() #2372

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Dec 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
* `unnecessary_lambda_linter` is extended to encourage vectorized comparisons where possible, e.g. `sapply(x, sum) > 0` instead of `sapply(x, function(x) sum(x) > 0)` (part of #884, @MichaelChirico). Toggle this behavior with argument `allow_comparison`.
* `backport_linter()` is slightly faster by moving expensive computations outside the linting function (#2339, #2348, @AshesITR and @MichaelChirico).
* `Linter()` has a new argument `linter_level` (default `NA`). This is used by `lint()` to more efficiently check for expression levels than the idiom `if (!is_lint_level(...)) { return(list()) }` (#2351, @AshesITR).
* `string_boundary_linter()` recognizes regular expression calls like `grepl("^abc$", x)` that can be replaced by using `==` instead (#1613, @MichaelChirico).
* `unreachable_code_linter()` has an argument `allow_comment_regex` for customizing which "terminal" comments to exclude (#2327, @MichaelChirico). `# nolint end` comments are always excluded, as are {covr} exclusions (e.g. `# nocov end`) by default.
* `format()` and `print()` methods for `lint` and `lints` classes get a new option `width` to control the printing width of lint messages (#1884, @MichaelChirico). The default is controlled by a new option `lintr.format_width`; if unset, no wrapping occurs (matching earlier behavior).

Expand Down
50 changes: 29 additions & 21 deletions R/string_boundary_linter.R
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,11 @@ string_boundary_linter <- function(allow_grepl = FALSE) {
/following-sibling::expr[2]
/STR_CONST[ {str_cond} ]
")
str_detect_message_map <- c(
both = "Use == to check for an exact string match.",
initial = "Use startsWith() to detect a fixed initial substring.",
terminal = "Use endsWith() to detect a fixed terminal substring."
)

if (!allow_grepl) {
grepl_xpath <- glue("
Expand All @@ -84,16 +89,34 @@ string_boundary_linter <- function(allow_grepl = FALSE) {
/expr[2]
/STR_CONST[ {str_cond} ]
")
grepl_lint_fmt <- paste(
"Use !is.na(x) & %1$s(x, string) to detect a fixed %2$s substring, or,",
"if missingness is not a concern, just %1$s()."
)
grepl_message_map <- c(
both = "Use == to check for an exact string match.",
initial = sprintf(grepl_lint_fmt, "startsWith", "initial"),
terminal = sprintf(grepl_lint_fmt, "endsWith", "terminal")
)
}

get_regex_lint_data <- function(xml, xpath) {
expr <- xml_find_all(xml, xpath)
patterns <- get_r_string(expr)
initial_anchor <- startsWith(patterns, "^")
terminal_anchor <- endsWith(patterns, "$")
search_start <- 1L + initial_anchor
search_end <- nchar(patterns) - 1L + initial_anchor
search_end <- nchar(patterns) - terminal_anchor
can_replace <- is_not_regex(substr(patterns, search_start, search_end))
list(lint_expr = expr[can_replace], initial_anchor = initial_anchor[can_replace])
initial_anchor <- initial_anchor[can_replace]
terminal_anchor <- terminal_anchor[can_replace]

lint_type <- character(length(initial_anchor))

lint_type[initial_anchor & terminal_anchor] <- "both"
lint_type[initial_anchor & !terminal_anchor] <- "initial"
lint_type[!initial_anchor & terminal_anchor] <- "terminal"
list(lint_expr = expr[can_replace], lint_type = lint_type)
}

substr_xpath_parts <- glue("
Expand Down Expand Up @@ -125,38 +148,23 @@ string_boundary_linter <- function(allow_grepl = FALSE) {
lints <- list()

str_detect_lint_data <- get_regex_lint_data(xml, str_detect_xpath)
str_detect_lint_message <- paste(
ifelse(
str_detect_lint_data$initial_anchor,
"Use startsWith() to detect a fixed initial substring.",
"Use endsWith() to detect a fixed terminal substring."
),
"Doing so is more readable and more efficient."
)
str_detect_lint_message <- str_detect_message_map[str_detect_lint_data$lint_type]

lints <- c(lints, xml_nodes_to_lints(
str_detect_lint_data$lint_expr,
source_expression = source_expression,
lint_message = str_detect_lint_message,
lint_message = paste(str_detect_lint_message, "Doing so is more readable and more efficient."),
type = "warning"
))

if (!allow_grepl) {
grepl_lint_data <- get_regex_lint_data(xml, grepl_xpath)
grepl_replacement <- ifelse(grepl_lint_data$initial_anchor, "startsWith", "endsWith")
grepl_type <- ifelse(grepl_lint_data$initial_anchor, "initial", "terminal")
grepl_lint_message <- paste(
sprintf(
"Use !is.na(x) & %s(x, string) to detect a fixed %s substring, or, if missingness is not a concern, just %s.",
grepl_replacement, grepl_type, grepl_replacement
),
"Doing so is more readable and more efficient."
)
grepl_lint_message <- grepl_message_map[grepl_lint_data$lint_type]

lints <- c(lints, xml_nodes_to_lints(
grepl_lint_data$lint_expr,
source_expression = source_expression,
lint_message = grepl_lint_message,
lint_message = paste(grepl_lint_message, "Doing so is more readable and more efficient."),
type = "warning"
))
}
Expand Down
104 changes: 69 additions & 35 deletions tests/testthat/test-string_boundary_linter.R
Original file line number Diff line number Diff line change
Expand Up @@ -71,78 +71,112 @@ test_that("string_boundary_linter blocks simple disallowed grepl() usages", {
})

test_that("string_boundary_linter blocks simple disallowed str_detect() usages", {
linter <- string_boundary_linter()

expect_lint(
"str_detect(x, '^a')",
rex::rex("Use startsWith() to detect a fixed initial substring."),
string_boundary_linter()
linter
)
expect_lint(
"str_detect(x, 'a$')",
rex::rex("Use endsWith() to detect a fixed terminal substring."),
string_boundary_linter()
linter
)
})

test_that("string_boundary_linter blocks disallowed substr()/substring() usage", {
expect_lint(
"substr(x, 1L, 2L) == 'ab'",
rex::rex("Use startsWith() to detect an initial substring."),
string_boundary_linter()
)
linter <- string_boundary_linter()
starts_message <- rex::rex("Use startsWith() to detect an initial substring.")
ends_message <- rex::rex("Use endsWith() to detect a terminal substring.")

expect_lint("substr(x, 1L, 2L) == 'ab'", starts_message, linter)
# end doesn't matter, just anchoring to 1L
expect_lint(
"substr(x, 1L, end) == 'ab'",
rex::rex("Use startsWith() to detect an initial substring."),
string_boundary_linter()
)
expect_lint(
"substring(x, nchar(x) - 4L, nchar(x)) == 'abcde'",
rex::rex("Use endsWith() to detect a terminal substring."),
string_boundary_linter()
)
expect_lint("substr(x, 1L, end) == 'ab'", starts_message, linter)
expect_lint("substring(x, nchar(x) - 4L, nchar(x)) == 'abcde'", ends_message, linter)
# start doesn't matter, just anchoring to nchar(x)
expect_lint(
"substring(x, start, nchar(x)) == 'abcde'",
rex::rex("Use endsWith() to detect a terminal substring."),
string_boundary_linter()
)
expect_lint("substring(x, start, nchar(x)) == 'abcde'", ends_message, linter)
# more complicated expressions
expect_lint(
"substring(colnames(x), start, nchar(colnames(x))) == 'abc'",
rex::rex("Use endsWith() to detect a terminal substring."),
string_boundary_linter()
)
expect_lint("substring(colnames(x), start, nchar(colnames(x))) == 'abc'", ends_message, linter)
})

test_that("plain ^ or $ are skipped", {
expect_lint('grepl("^", x)', NULL, string_boundary_linter())
expect_lint('grepl("$", x)', NULL, string_boundary_linter())
linter <- string_boundary_linter()

expect_lint('grepl("^", x)', NULL, linter)
expect_lint('grepl("$", x)', NULL, linter)
})

test_that("substr inverted tests are caught as well", {
linter <- string_boundary_linter()

expect_lint(
"substr(x, 1L, 2L) != 'ab'",
rex::rex("Use startsWith() to detect an initial substring."),
string_boundary_linter()
linter
)
expect_lint(
"substring(x, nchar(x) - 4L, nchar(x)) != 'abcde'",
rex::rex("Use endsWith() to detect a terminal substring."),
string_boundary_linter()
linter
)
})

test_that("R>=4 raw strings are detected", {
linter <- string_boundary_linter()

skip_if_not_r_version("4.0.0")
expect_lint('grepl(R"(^.{3})", x)', NULL, string_boundary_linter())
expect_lint('grepl(R"(^.{3})", x)', NULL, linter)
expect_lint(
'grepl(R"(^abc)", x)',
rex::rex("Use !is.na(x) & startsWith(x, string) to detect a fixed initial substring,"),
string_boundary_linter()
linter
)
})

test_that("grepl() can optionally be ignored", {
expect_lint("grepl('^abc', x)", NULL, string_boundary_linter(allow_grepl = TRUE))
expect_lint("grepl('xyz$', x)", NULL, string_boundary_linter(allow_grepl = TRUE))
linter <- string_boundary_linter(allow_grepl = TRUE)

expect_lint("grepl('^abc', x)", NULL, linter)
expect_lint("grepl('xyz$', x)", NULL, linter)
})

test_that("whole-string regex recommends ==, not {starts,ends}With()", {
linter <- string_boundary_linter()
lint_msg <- rex::rex("Use == to check for an exact string match.")

expect_lint("grepl('^abc$', x)", lint_msg, linter)
expect_lint("grepl('^a\\\\.b$', x)", lint_msg, linter)
expect_lint("str_detect(x, '^abc$')", lint_msg, linter)
expect_lint("str_detect(x, '^a[.]b$')", lint_msg, linter)
})

test_that("vectorization + metadata work as intended", {
expect_lint(
trim_some("{
substring(a, 1, 3) == 'abc'
substring(b, nchar(b) - 3, nchar(b)) == 'defg'
substr(c, 1, 3) == 'hij'
substr(d, nchar(d) - 3, nchar(d)) == 'klmn'
grepl('^abc', e)
grepl('abc$', f)
grepl('^abc$', g)
str_detect(h, '^abc')
str_detect(i, 'abc$')
str_detect(j, '^abc$')
}"),
list(
list("startsWith", line_number = 2L),
list("endsWith", line_number = 3L),
list("startsWith", line_number = 4L),
list("endsWith", line_number = 5L),
list("startsWith", line_number = 6L),
list("endsWith", line_number = 7L),
list("==", line_number = 8L),
list("startsWith", line_number = 9L),
list("endsWith", line_number = 10L),
list("==", line_number = 11L)
),
string_boundary_linter()
)
})