-
Notifications
You must be signed in to change notification settings - Fork 50
Handle boundaries when matching in substrings #675
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
5b157cf
e694693
9024254
6160220
b331bc0
3cd121a
5114ea4
656c388
0f4f005
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,7 +15,7 @@ extension Processor { | |
isStrictASCII: Bool, | ||
isScalarSemantics: Bool | ||
) -> Bool { | ||
guard let next = input.matchBuiltinCC( | ||
guard currentPosition < end, let next = input.matchBuiltinCC( | ||
cc, | ||
at: currentPosition, | ||
isInverted: isInverted, | ||
|
@@ -102,18 +102,16 @@ extension Processor { | |
|
||
case .wordBoundary: | ||
if payload.usesSimpleUnicodeBoundaries { | ||
// TODO: How should we handle bounds? | ||
return atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel) | ||
} else { | ||
return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) | ||
return input.isOnWordBoundary(at: currentPosition, in: searchBounds, using: &wordIndexCache, &wordIndexMaxIndex) | ||
} | ||
|
||
case .notWordBoundary: | ||
if payload.usesSimpleUnicodeBoundaries { | ||
// TODO: How should we handle bounds? | ||
return !atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel) | ||
} else { | ||
return !input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) | ||
return !input.isOnWordBoundary(at: currentPosition, in: searchBounds, using: &wordIndexCache, &wordIndexMaxIndex) | ||
} | ||
} | ||
} | ||
|
@@ -127,9 +125,7 @@ extension String { | |
at currentPosition: String.Index, | ||
isScalarSemantics: Bool | ||
) -> String.Index? { | ||
guard currentPosition < endIndex else { | ||
return nil | ||
} | ||
assert(currentPosition < endIndex) | ||
if case .definite(let result) = _quickMatchAnyNonNewline( | ||
at: currentPosition, | ||
isScalarSemantics: isScalarSemantics | ||
|
@@ -194,9 +190,7 @@ extension String { | |
isStrictASCII: Bool, | ||
isScalarSemantics: Bool | ||
) -> String.Index? { | ||
guard currentPosition < endIndex else { | ||
return nil | ||
} | ||
assert(currentPosition < endIndex) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Out of curiosity, why hoist these out? We'll probably want a naming convention for the assumption in this case, such as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As written, the check can't be done in this method, since we're missing the processor's |
||
if case .definite(let result) = _quickMatchBuiltinCC( | ||
cc, | ||
at: currentPosition, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,7 +17,7 @@ extension Processor { | |
boundaryCheck: !isScalarSemantics, | ||
isCaseInsensitive: false) | ||
case .builtin: | ||
// FIXME: bounds check? endIndex or end? | ||
guard currentPosition < end else { return nil } | ||
|
||
// We only emit .quantify if it consumes a single character | ||
return input.matchBuiltinCC( | ||
|
@@ -27,8 +27,7 @@ extension Processor { | |
isStrictASCII: payload.builtinIsStrict, | ||
isScalarSemantics: isScalarSemantics) | ||
case .any: | ||
// FIXME: endIndex or end? | ||
guard currentPosition < input.endIndex else { return nil } | ||
guard currentPosition < end else { return nil } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Were you able to write a test case for this one? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is hit by the existing "match any" test cases when you pass a substring instead of a string. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we hoist this bounds check to the top of the method, or else leave it as a concern for the String methods? Either way, we want a consistent place and the body of the switch would ideally just be doing dispatch. |
||
|
||
if payload.anyMatchesNewline { | ||
if isScalarSemantics { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -232,7 +232,7 @@ extension Processor { | |
mutating func match( | ||
_ e: Element, isCaseInsensitive: Bool | ||
) -> Bool { | ||
guard let next = input.match( | ||
guard currentPosition < end, let next = input.match( | ||
e, | ||
at: currentPosition, | ||
limitedBy: end, | ||
|
@@ -251,7 +251,7 @@ extension Processor { | |
_ seq: Substring, | ||
isScalarSemantics: Bool | ||
) -> Bool { | ||
guard let next = input.matchSeq( | ||
guard currentPosition < end, let next = input.matchSeq( | ||
seq, | ||
at: currentPosition, | ||
limitedBy: end, | ||
|
@@ -270,7 +270,7 @@ extension Processor { | |
boundaryCheck: Bool, | ||
isCaseInsensitive: Bool | ||
) -> Bool { | ||
guard let next = input.matchScalar( | ||
guard currentPosition < end, let next = input.matchScalar( | ||
s, | ||
at: currentPosition, | ||
limitedBy: end, | ||
|
@@ -291,7 +291,7 @@ extension Processor { | |
_ bitset: DSLTree.CustomCharacterClass.AsciiBitset, | ||
isScalarSemantics: Bool | ||
) -> Bool { | ||
guard let next = input.matchBitset( | ||
guard currentPosition < end, let next = input.matchBitset( | ||
bitset, | ||
at: currentPosition, | ||
limitedBy: end, | ||
|
@@ -308,7 +308,7 @@ extension Processor { | |
mutating func matchAnyNonNewline( | ||
isScalarSemantics: Bool | ||
) -> Bool { | ||
guard let next = input.matchAnyNonNewline( | ||
guard currentPosition < end, let next = input.matchAnyNonNewline( | ||
at: currentPosition, | ||
isScalarSemantics: isScalarSemantics | ||
) else { | ||
|
@@ -538,7 +538,8 @@ extension Processor { | |
let reg = payload.consumer | ||
guard currentPosition < searchBounds.upperBound, | ||
let nextIndex = registers[reg]( | ||
input, currentPosition..<searchBounds.upperBound) | ||
input, currentPosition..<searchBounds.upperBound), | ||
nextIndex <= end | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm ok with being extra cautious, especially in the context of 3rd party consumer code. Technically this might mean that they are breaking the contract by not doing their own bounds checking, but that's such an onerous requirement that I think this is better. |
||
else { | ||
signalFailure() | ||
return | ||
|
@@ -565,7 +566,7 @@ extension Processor { | |
do { | ||
guard let (nextIdx, val) = try matcher( | ||
input, currentPosition, searchBounds | ||
) else { | ||
), nextIdx <= end else { | ||
signalFailure() | ||
return | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,7 +37,9 @@ extension Processor { | |
if currentPosition == subjectBounds.lowerBound { | ||
return matchesWord(at: currentPosition) | ||
} | ||
let priorIdx = input.index(before: currentPosition) | ||
let priorIdx = semanticLevel == .graphemeCluster | ||
? input.index(before: currentPosition) | ||
: input.unicodeScalars.index(before: currentPosition) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we have any new test cases that hit this? |
||
if currentPosition == subjectBounds.upperBound { | ||
return matchesWord(at: priorIdx) | ||
} | ||
|
@@ -51,11 +53,12 @@ extension Processor { | |
extension String { | ||
func isOnWordBoundary( | ||
at i: String.Index, | ||
in range: Range<String.Index>, | ||
using cache: inout Set<String.Index>?, | ||
_ maxIndex: inout String.Index? | ||
) -> Bool { | ||
// TODO: needs benchmark coverage | ||
guard i != startIndex, i != endIndex else { | ||
guard i != range.lowerBound, i != range.upperBound else { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's a programming error, added an assertion for that here. |
||
return true | ||
} | ||
|
||
|
@@ -76,9 +79,9 @@ extension String { | |
|
||
if #available(SwiftStdlib 5.7, *) { | ||
var indices: Set<String.Index> = [] | ||
var j = maxIndex ?? startIndex | ||
var j = maxIndex ?? range.lowerBound | ||
|
||
while j < endIndex, j <= i { | ||
while j < range.upperBound, j <= i { | ||
indices.insert(j) | ||
j = _wordIndex(after: j) | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,6 +30,61 @@ func _firstMatch( | |
) throws -> (String, [String?])? { | ||
var regex = try Regex(regexStr, syntax: syntax).matchingSemantics(semanticLevel) | ||
let result = try regex.firstMatch(in: input) | ||
|
||
func validateSubstring(_ substringInput: Substring) throws { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Post-PR testing: let's make a test input that contains many kinds of complex grapheme clusters and, for each test regex and each substring of that input, any match of that substring is equal to the match of a |
||
// Sometimes the characters we add to a substring merge with existing | ||
// string members. This messes up cross-validation, so skip the test. | ||
guard input == substringInput else { return } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that we will want to programmatically test these kinds of situations, but maybe check with @lorentey for what the semantics should be when a substring splits a grapheme cluster |
||
|
||
let substringResult = try regex.firstMatch(in: substringInput) | ||
switch (result, substringResult) { | ||
case (nil, nil): | ||
break | ||
case let (result?, substringResult?): | ||
if substringResult.range.upperBound > substringInput.endIndex { | ||
throw MatchError("Range exceeded substring upper bound for \(input) and \(regexStr)") | ||
} | ||
let stringMatch = input[result.range] | ||
let substringMatch = substringInput[substringResult.range] | ||
if stringMatch != substringMatch { | ||
throw MatchError(""" | ||
Pattern: '\(regexStr)' | ||
String match returned: '\(stringMatch)' | ||
Substring match returned: '\(substringMatch)' | ||
""") | ||
} | ||
case (.some(let result), nil): | ||
throw MatchError(""" | ||
Pattern: '\(regexStr)' | ||
Input: '\(input)' | ||
Substring '\(substringInput)' ('\(substringInput.base)') | ||
String match returned: '\(input[result.range])' | ||
Substring match returned: nil | ||
""") | ||
case (nil, .some(let substringResult)): | ||
throw MatchError(""" | ||
Pattern: '\(regexStr)' | ||
Input: '\(input)' | ||
Substring '\(substringInput)' ('\(substringInput.base)') | ||
String match returned: nil | ||
Substring match returned: '\(substringInput[substringResult.range])' | ||
""") | ||
} | ||
} | ||
|
||
if !input.isEmpty { | ||
try validateSubstring("\(input)\(input.last!)".dropLast()) | ||
} | ||
try validateSubstring("\(input)\n".dropLast()) | ||
try validateSubstring("A\(input)Z".dropFirst().dropLast()) | ||
|
||
// This is causing out-of-bounds indexing errors: | ||
// do { | ||
// // Test sub-character slicing | ||
// let substr = input + "\n" | ||
// let prevIndex = substr.unicodeScalars.index(substr.endIndex, offsetBy: -1) | ||
// try validateSubstring(substr[..<prevIndex]) | ||
// } | ||
|
||
if validateOptimizations { | ||
assert(regex._forceAction(.addOptions(.disableOptimizations))) | ||
|
@@ -52,6 +107,7 @@ func _firstMatch( | |
} | ||
} | ||
} | ||
|
||
guard let result = result else { return nil } | ||
let caps = result.output.slices(from: input) | ||
return (String(input[result.range]), caps.map { $0.map(String.init) }) | ||
|
Uh oh!
There was an error while loading. Please reload this page.