Skip to content

Handle boundaries when matching in substrings #675

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Aug 11, 2023
16 changes: 5 additions & 11 deletions Sources/_StringProcessing/Engine/MEBuiltins.swift
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ extension Processor {
isStrictASCII: Bool,
isScalarSemantics: Bool
) -> Bool {
guard let next = input.matchBuiltinCC(
guard currentPosition < end, let next = input.matchBuiltinCC(
cc,
at: currentPosition,
isInverted: isInverted,
Expand Down Expand Up @@ -102,18 +102,16 @@ extension Processor {

case .wordBoundary:
if payload.usesSimpleUnicodeBoundaries {
// TODO: How should we handle bounds?
return atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel)
} else {
return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex)
return input.isOnWordBoundary(at: currentPosition, in: searchBounds, using: &wordIndexCache, &wordIndexMaxIndex)
}

case .notWordBoundary:
if payload.usesSimpleUnicodeBoundaries {
// TODO: How should we handle bounds?
return !atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel)
} else {
return !input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex)
return !input.isOnWordBoundary(at: currentPosition, in: searchBounds, using: &wordIndexCache, &wordIndexMaxIndex)
}
}
}
Expand All @@ -127,9 +125,7 @@ extension String {
at currentPosition: String.Index,
isScalarSemantics: Bool
) -> String.Index? {
guard currentPosition < endIndex else {
return nil
}
assert(currentPosition < endIndex)
if case .definite(let result) = _quickMatchAnyNonNewline(
at: currentPosition,
isScalarSemantics: isScalarSemantics
Expand Down Expand Up @@ -194,9 +190,7 @@ extension String {
isStrictASCII: Bool,
isScalarSemantics: Bool
) -> String.Index? {
guard currentPosition < endIndex else {
return nil
}
assert(currentPosition < endIndex)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Out of curiosity, why hoist these out? We'll probably want a naming convention for the assumption in this case, such as atAssumingInBounds currentPosition.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As written, the check can't be done in this method, since we're missing the processor's end. With the limitedBy parameter we can add the guard back in.

if case .definite(let result) = _quickMatchBuiltinCC(
cc,
at: currentPosition,
Expand Down
5 changes: 2 additions & 3 deletions Sources/_StringProcessing/Engine/MEQuantify.swift
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ extension Processor {
boundaryCheck: !isScalarSemantics,
isCaseInsensitive: false)
case .builtin:
// FIXME: bounds check? endIndex or end?
guard currentPosition < end else { return nil }

// We only emit .quantify if it consumes a single character
return input.matchBuiltinCC(
Expand All @@ -27,8 +27,7 @@ extension Processor {
isStrictASCII: payload.builtinIsStrict,
isScalarSemantics: isScalarSemantics)
case .any:
// FIXME: endIndex or end?
guard currentPosition < input.endIndex else { return nil }
guard currentPosition < end else { return nil }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Were you able to write a test case for this one?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is hit by the existing "match any" test cases when you pass a substring instead of a string.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we hoist this bounds check to the top of the method, or else leave it as a concern for the String methods? Either way, we want a consistent place and the body of the switch would ideally just be doing dispatch.


if payload.anyMatchesNewline {
if isScalarSemantics {
Expand Down
15 changes: 8 additions & 7 deletions Sources/_StringProcessing/Engine/Processor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ extension Processor {
mutating func match(
_ e: Element, isCaseInsensitive: Bool
) -> Bool {
guard let next = input.match(
guard currentPosition < end, let next = input.match(
e,
at: currentPosition,
limitedBy: end,
Expand All @@ -251,7 +251,7 @@ extension Processor {
_ seq: Substring,
isScalarSemantics: Bool
) -> Bool {
guard let next = input.matchSeq(
guard currentPosition < end, let next = input.matchSeq(
seq,
at: currentPosition,
limitedBy: end,
Expand All @@ -270,7 +270,7 @@ extension Processor {
boundaryCheck: Bool,
isCaseInsensitive: Bool
) -> Bool {
guard let next = input.matchScalar(
guard currentPosition < end, let next = input.matchScalar(
s,
at: currentPosition,
limitedBy: end,
Expand All @@ -291,7 +291,7 @@ extension Processor {
_ bitset: DSLTree.CustomCharacterClass.AsciiBitset,
isScalarSemantics: Bool
) -> Bool {
guard let next = input.matchBitset(
guard currentPosition < end, let next = input.matchBitset(
bitset,
at: currentPosition,
limitedBy: end,
Expand All @@ -308,7 +308,7 @@ extension Processor {
mutating func matchAnyNonNewline(
isScalarSemantics: Bool
) -> Bool {
guard let next = input.matchAnyNonNewline(
guard currentPosition < end, let next = input.matchAnyNonNewline(
at: currentPosition,
isScalarSemantics: isScalarSemantics
) else {
Expand Down Expand Up @@ -538,7 +538,8 @@ extension Processor {
let reg = payload.consumer
guard currentPosition < searchBounds.upperBound,
let nextIndex = registers[reg](
input, currentPosition..<searchBounds.upperBound)
input, currentPosition..<searchBounds.upperBound),
nextIndex <= end
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm ok with being extra cautious, especially in the context of 3rd party consumer code. Technically this might mean that they are breaking the contract by not doing their own bounds checking, but that's such an onerous requirement that I think this is better.

else {
signalFailure()
return
Expand All @@ -565,7 +566,7 @@ extension Processor {
do {
guard let (nextIdx, val) = try matcher(
input, currentPosition, searchBounds
) else {
), nextIdx <= end else {
signalFailure()
return
}
Expand Down
11 changes: 7 additions & 4 deletions Sources/_StringProcessing/Unicode/WordBreaking.swift
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ extension Processor {
if currentPosition == subjectBounds.lowerBound {
return matchesWord(at: currentPosition)
}
let priorIdx = input.index(before: currentPosition)
let priorIdx = semanticLevel == .graphemeCluster
? input.index(before: currentPosition)
: input.unicodeScalars.index(before: currentPosition)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have any new test cases that hit this?

if currentPosition == subjectBounds.upperBound {
return matchesWord(at: priorIdx)
}
Expand All @@ -51,11 +53,12 @@ extension Processor {
extension String {
func isOnWordBoundary(
at i: String.Index,
in range: Range<String.Index>,
using cache: inout Set<String.Index>?,
_ maxIndex: inout String.Index?
) -> Bool {
// TODO: needs benchmark coverage
guard i != startIndex, i != endIndex else {
guard i != range.lowerBound, i != range.upperBound else {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if i is outside of range?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a programming error, added an assertion for that here.

return true
}

Expand All @@ -76,9 +79,9 @@ extension String {

if #available(SwiftStdlib 5.7, *) {
var indices: Set<String.Index> = []
var j = maxIndex ?? startIndex
var j = maxIndex ?? range.lowerBound

while j < endIndex, j <= i {
while j < range.upperBound, j <= i {
indices.insert(j)
j = _wordIndex(after: j)
}
Expand Down
56 changes: 56 additions & 0 deletions Tests/RegexTests/MatchTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,61 @@ func _firstMatch(
) throws -> (String, [String?])? {
var regex = try Regex(regexStr, syntax: syntax).matchingSemantics(semanticLevel)
let result = try regex.firstMatch(in: input)

func validateSubstring(_ substringInput: Substring) throws {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Post-PR testing: let's make a test input that contains many kinds of complex grapheme clusters and, for each test regex and each substring of that input, any match of that substring is equal to the match of a String copy of that substring.

// Sometimes the characters we add to a substring merge with existing
// string members. This messes up cross-validation, so skip the test.
guard input == substringInput else { return }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that we will want to programmatically test these kinds of situations, but maybe check with @lorentey for what the semantics should be when a substring splits a grapheme cluster


let substringResult = try regex.firstMatch(in: substringInput)
switch (result, substringResult) {
case (nil, nil):
break
case let (result?, substringResult?):
if substringResult.range.upperBound > substringInput.endIndex {
throw MatchError("Range exceeded substring upper bound for \(input) and \(regexStr)")
}
let stringMatch = input[result.range]
let substringMatch = substringInput[substringResult.range]
if stringMatch != substringMatch {
throw MatchError("""
Pattern: '\(regexStr)'
String match returned: '\(stringMatch)'
Substring match returned: '\(substringMatch)'
""")
}
case (.some(let result), nil):
throw MatchError("""
Pattern: '\(regexStr)'
Input: '\(input)'
Substring '\(substringInput)' ('\(substringInput.base)')
String match returned: '\(input[result.range])'
Substring match returned: nil
""")
case (nil, .some(let substringResult)):
throw MatchError("""
Pattern: '\(regexStr)'
Input: '\(input)'
Substring '\(substringInput)' ('\(substringInput.base)')
String match returned: nil
Substring match returned: '\(substringInput[substringResult.range])'
""")
}
}

if !input.isEmpty {
try validateSubstring("\(input)\(input.last!)".dropLast())
}
try validateSubstring("\(input)\n".dropLast())
try validateSubstring("A\(input)Z".dropFirst().dropLast())

// This is causing out-of-bounds indexing errors:
// do {
// // Test sub-character slicing
// let substr = input + "\n"
// let prevIndex = substr.unicodeScalars.index(substr.endIndex, offsetBy: -1)
// try validateSubstring(substr[..<prevIndex])
// }

if validateOptimizations {
assert(regex._forceAction(.addOptions(.disableOptimizations)))
Expand All @@ -52,6 +107,7 @@ func _firstMatch(
}
}
}

guard let result = result else { return nil }
let caps = result.output.slices(from: input)
return (String(input[result.range]), caps.map { $0.map(String.init) })
Expand Down