From b384858cf14b9d1a5fc67f9f543665426fe1606e Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 19 Jul 2023 15:47:44 -0500 Subject: [PATCH 1/5] Optimize search for start-anchored regexes (#682) When a regex is anchored to the start of a subject, there's no need to search throughout a string for the pattern when searching for the first match: a prefix match is sufficient. This adds a regex compilation-time check about whether a match can only be found at the start of a subject, and then uses that to choose whether to defer to `prefixMatch` from within `firstMatch`. --- Sources/RegexBenchmark/Suite/NotFound.swift | 2 +- Sources/_StringProcessing/ByteCodeGen.swift | 1 + .../_StringProcessing/Engine/MEBuilder.swift | 6 +- .../_StringProcessing/Engine/MEProgram.swift | 1 + Sources/_StringProcessing/Regex/DSLTree.swift | 110 ++++++++++++++++++ Sources/_StringProcessing/Regex/Match.swift | 4 +- Tests/RegexBuilderTests/RegexDSLTests.swift | 51 +++++++- Tests/RegexTests/CompileTests.swift | 38 ++++++ 8 files changed, 209 insertions(+), 4 deletions(-) diff --git a/Sources/RegexBenchmark/Suite/NotFound.swift b/Sources/RegexBenchmark/Suite/NotFound.swift index a1ed7eae0..be2e67e79 100644 --- a/Sources/RegexBenchmark/Suite/NotFound.swift +++ b/Sources/RegexBenchmark/Suite/NotFound.swift @@ -13,7 +13,7 @@ extension BenchmarkRunner { baseName: "AnchoredNotFound", regex: "^ +a", input: input, - isWhole: true) + includeFirst: true) anchoredNotFound.register(&self) } } diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 00ce0d5f6..cb2e9ed04 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -46,6 +46,7 @@ extension Compiler.ByteCodeGen { // The whole match (`.0` element of output) is equivalent to an implicit // capture over the entire regex. try emitNode(.capture(name: nil, reference: nil, root)) + builder.canOnlyMatchAtStart = root.canOnlyMatchAtStart() builder.buildAccept() return try builder.assemble() } diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 93801aeec..e26a00fb1 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -43,6 +43,9 @@ extension MEProgram { var captureList = CaptureList() var initialOptions = MatchingOptions() + // Starting constraint + var canOnlyMatchAtStart = false + // Symbolic reference resolution var unresolvedReferences: [ReferenceID: [InstructionAddress]] = [:] var referencedCaptureOffsets: [ReferenceID: Int] = [:] @@ -404,7 +407,8 @@ extension MEProgram.Builder { enableMetrics: enableMetrics, captureList: captureList, referencedCaptureOffsets: referencedCaptureOffsets, - initialOptions: initialOptions) + initialOptions: initialOptions, + canOnlyMatchAtStart: canOnlyMatchAtStart) } mutating func reset() { self = Self() } diff --git a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift index 67f5a8bc9..3107d5ef7 100644 --- a/Sources/_StringProcessing/Engine/MEProgram.swift +++ b/Sources/_StringProcessing/Engine/MEProgram.swift @@ -38,6 +38,7 @@ struct MEProgram { let referencedCaptureOffsets: [ReferenceID: Int] var initialOptions: MatchingOptions + var canOnlyMatchAtStart: Bool } extension MEProgram: CustomStringConvertible { diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index b784e2382..f24b87d09 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -711,6 +711,105 @@ extension DSLTree.Node { } } +extension DSLTree.Node { + /// Implementation for `canOnlyMatchAtStart`, which maintains the option + /// state. + /// + /// For a given specific node, this method can return one of three values: + /// + /// - `true`: This node is guaranteed to match only at the start of a subject. + /// - `false`: This node can match anywhere in the subject. + /// - `nil`: This node is inconclusive about where it can match. + /// + /// In particular, non-required groups and option-setting groups are + /// inconclusive about where they can match. + private func _canOnlyMatchAtStartImpl(_ options: inout MatchingOptions) -> Bool? { + switch self { + // Defining cases + case .atom(.assertion(.startOfSubject)): + return true + case .atom(.assertion(.caretAnchor)): + return !options.anchorsMatchNewlines + + // Changing options doesn't determine `true`/`false`. + case .atom(.changeMatchingOptions(let sequence)): + options.apply(sequence.ast) + return nil + + // Any other atom or consuming node returns `false`. + case .atom, .customCharacterClass, .quotedLiteral: + return false + + // Trivia/empty have no effect. + case .trivia, .empty: + return nil + + // In an alternation, all of its children must match only at start. + case .orderedChoice(let children): + return children.allSatisfy { $0._canOnlyMatchAtStartImpl(&options) == true } + + // In a concatenation, the first definitive child provides the answer. + case .concatenation(let children): + for child in children { + if let result = child._canOnlyMatchAtStartImpl(&options) { + return result + } + } + return false + + // Groups (and other parent nodes) defer to the child. + case .nonCapturingGroup(let kind, let child): + options.beginScope() + defer { options.endScope() } + if case .changeMatchingOptions(let sequence) = kind.ast { + options.apply(sequence) + } + return child._canOnlyMatchAtStartImpl(&options) + case .capture(_, _, let child, _): + options.beginScope() + defer { options.endScope() } + return child._canOnlyMatchAtStartImpl(&options) + case .ignoreCapturesInTypedOutput(let child), + .convertedRegexLiteral(let child, _): + return child._canOnlyMatchAtStartImpl(&options) + + // A quantification that doesn't require its child to exist can still + // allow a start-only match. (e.g. `/(foo)?^bar/`) + case .quantification(let amount, _, let child): + return amount.requiresAtLeastOne + ? child._canOnlyMatchAtStartImpl(&options) + : nil + + // For conditional nodes, both sides must require matching at start. + case .conditional(_, let child1, let child2): + return child1._canOnlyMatchAtStartImpl(&options) == true + && child2._canOnlyMatchAtStartImpl(&options) == true + + // Extended behavior isn't known, so we return `false` for safety. + case .consumer, .matcher, .characterPredicate, .absentFunction: + return false + } + } + + /// Returns a Boolean value indicating whether the regex with this node as + /// the root can _only_ match at the start of a subject. + /// + /// For example, these regexes can only match at the start of a subject: + /// + /// - `/^foo/` + /// - `/(^foo|^bar)/` (both sides of the alternation start with `^`) + /// + /// These can match other places in a subject: + /// + /// - `/(^foo)?bar/` (`^` is in an optional group) + /// - `/(^foo|bar)/` (only one side of the alternation starts with `^`) + /// - `/(?m)^foo/` (`^` means "the start of a line" due to `(?m)`) + internal func canOnlyMatchAtStart() -> Bool { + var options = MatchingOptions() + return _canOnlyMatchAtStartImpl(&options) ?? false + } +} + // MARK: AST wrapper types // // These wrapper types are required because even @_spi-marked public APIs can't @@ -818,6 +917,17 @@ extension DSLTree { public static func range(_ lower: Int, _ upper: Int) -> Self { .init(ast: .range(.init(lower, at: .fake), .init(upper, at: .fake))) } + + internal var requiresAtLeastOne: Bool { + switch ast { + case .zeroOrOne, .zeroOrMore, .upToN: + return false + case .oneOrMore: + return true + case .exactly(let num), .nOrMore(let num), .range(let num, _): + return num.value.map { $0 > 0 } ?? false + } + } } @_spi(RegexBuilder) diff --git a/Sources/_StringProcessing/Regex/Match.swift b/Sources/_StringProcessing/Regex/Match.swift index e5e899ced..f13b01a85 100644 --- a/Sources/_StringProcessing/Regex/Match.swift +++ b/Sources/_StringProcessing/Regex/Match.swift @@ -273,7 +273,9 @@ extension Regex { _ input: String, in subjectBounds: Range ) throws -> Regex.Match? { - try _firstMatch(input, subjectBounds: subjectBounds, searchBounds: subjectBounds) + try regex.program.loweredProgram.canOnlyMatchAtStart + ? _match(input, in: subjectBounds, mode: .partialFromFront) + : _firstMatch(input, subjectBounds: subjectBounds, searchBounds: subjectBounds) } func _firstMatch( diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 19ac675dc..06b6ff1a3 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -10,7 +10,7 @@ //===----------------------------------------------------------------------===// import XCTest -import _StringProcessing +@testable import _StringProcessing import RegexBuilder import TestSupport @@ -973,6 +973,55 @@ class RegexDSLTests: XCTestCase { } } + func testCanOnlyMatchAtStart() throws { + func expectCanOnlyMatchAtStart( + _ expectation: Bool, + file: StaticString = #file, line: UInt = #line, + @RegexComponentBuilder _ content: () -> some RegexComponent + ) { + let regex = content().regex + XCTAssertEqual(regex.program.loweredProgram.canOnlyMatchAtStart, expectation, file: file, line: line) + } + + expectCanOnlyMatchAtStart(true) { + Anchor.startOfSubject + "foo" + } + expectCanOnlyMatchAtStart(false) { + "foo" + } + expectCanOnlyMatchAtStart(true) { + Optionally { "foo" } + Anchor.startOfSubject + "bar" + } + + expectCanOnlyMatchAtStart(true) { + ChoiceOf { + Regex { + Anchor.startOfSubject + "foo" + } + Regex { + Anchor.startOfSubject + "bar" + } + } + } + expectCanOnlyMatchAtStart(false) { + ChoiceOf { + Regex { + Anchor.startOfSubject + "foo" + } + Regex { + Anchor.startOfLine + "bar" + } + } + } + } + func testNestedGroups() throws { return; diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 752921e19..aafe752bc 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -484,4 +484,42 @@ extension RegexTests { expectProgram(for: #"(a+)*"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) expectProgram(for: #"(a{1,})*"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) } + + func testCanOnlyMatchAtStart() throws { + func expectCanOnlyMatchAtStart( + _ regexStr: String, + _ expectTrue: Bool, + file: StaticString = #file, + line: UInt = #line + ) throws { + let regex = try Regex(regexStr) + XCTAssertEqual( + regex.program.loweredProgram.canOnlyMatchAtStart, expectTrue, + file: file, line: line) + } + + try expectCanOnlyMatchAtStart("^foo", true) // anchor + try expectCanOnlyMatchAtStart("\\Afoo", true) // more specific anchor + try expectCanOnlyMatchAtStart("foo", false) // no anchor + + try expectCanOnlyMatchAtStart("(?i)^foo", true) // unrelated option + try expectCanOnlyMatchAtStart("(?m)^foo", false) // anchors match newlines + try expectCanOnlyMatchAtStart("(?i:^foo)", true) // unrelated option + try expectCanOnlyMatchAtStart("(?m:^foo)", false) // anchors match newlines + + try expectCanOnlyMatchAtStart("(^foo|bar)", false) // one side of alternation + try expectCanOnlyMatchAtStart("(foo|^bar)", false) // other side of alternation + try expectCanOnlyMatchAtStart("(^foo|^bar)", true) // both sides of alternation + + // Test quantifiers that include the anchor + try expectCanOnlyMatchAtStart("(^foo)?bar", false) + try expectCanOnlyMatchAtStart("(^foo)*bar", false) + try expectCanOnlyMatchAtStart("(^foo)+bar", true) + try expectCanOnlyMatchAtStart("(?:^foo)+bar", true) + + // Test quantifiers before the anchor + try expectCanOnlyMatchAtStart("(foo)?^bar", true) // The initial group must match "" + try expectCanOnlyMatchAtStart("(?:foo)?^bar", true) + try expectCanOnlyMatchAtStart("(foo)+^bar", false) // This can't actually match anywhere + } } From b58c24c5227ca4b5daa68e20d23c49e87b182a83 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 20 Jul 2023 11:13:05 -0500 Subject: [PATCH 2/5] Refactor DSL test to explore XCTest issue --- Tests/RegexBuilderTests/RegexDSLTests.swift | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 06b6ff1a3..11f4f1a09 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -980,6 +980,17 @@ class RegexDSLTests: XCTestCase { @RegexComponentBuilder _ content: () -> some RegexComponent ) { let regex = content().regex + print(""" + canOnlyMatchAtStart: \(regex.program.loweredProgram.canOnlyMatchAtStart) + expectation: \(expectation) + equal? \(regex.program.loweredProgram.canOnlyMatchAtStart == expectation) + """) + + if expectation { + XCTAssertTrue(regex.program.loweredProgram.canOnlyMatchAtStart, file: file, line: line) + } else { + XCTAssertFalse(regex.program.loweredProgram.canOnlyMatchAtStart, file: file, line: line) + } XCTAssertEqual(regex.program.loweredProgram.canOnlyMatchAtStart, expectation, file: file, line: line) } From a243f60b82e1c792ab1dcdcb033ba3a1eae02568 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 20 Jul 2023 12:28:59 -0500 Subject: [PATCH 3/5] Include some more logging in strange test? --- Tests/RegexBuilderTests/RegexDSLTests.swift | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 11f4f1a09..dd3bc852b 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -14,6 +14,10 @@ import XCTest import RegexBuilder import TestSupport +extension Bool { + var baseValue: UInt8 { withUnsafeBytes(of: self) { $0.load(as: UInt8.self) } } +} + @available(SwiftStdlib 5.7, *) class RegexDSLTests: XCTestCase { func _testDSLCaptures( @@ -991,7 +995,17 @@ class RegexDSLTests: XCTestCase { } else { XCTAssertFalse(regex.program.loweredProgram.canOnlyMatchAtStart, file: file, line: line) } + + XCTAssertTrue(expectation == regex.program.loweredProgram.canOnlyMatchAtStart, file: file, line: line) + XCTAssertEqual(regex.program.loweredProgram.canOnlyMatchAtStart, expectation, file: file, line: line) + XCTAssertEqual( + regex.program.loweredProgram.canOnlyMatchAtStart.baseValue, + expectation.baseValue, + file: file, line: line) + print( + regex.program.loweredProgram.canOnlyMatchAtStart.baseValue, + expectation.baseValue) } expectCanOnlyMatchAtStart(true) { From 12a8c9be58e1f7c844a179623ffcb04cde133b18 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 20 Jul 2023 16:46:57 -0500 Subject: [PATCH 4/5] ... --- Sources/_StringProcessing/Regex/DSLTree.swift | 1 + Tests/RegexBuilderTests/RegexDSLTests.swift | 21 ++++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index f24b87d09..65984210e 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -724,6 +724,7 @@ extension DSLTree.Node { /// In particular, non-required groups and option-setting groups are /// inconclusive about where they can match. private func _canOnlyMatchAtStartImpl(_ options: inout MatchingOptions) -> Bool? { + print(self) switch self { // Defining cases case .atom(.assertion(.startOfSubject)): diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index dd3bc852b..660e9de32 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -984,28 +984,29 @@ class RegexDSLTests: XCTestCase { @RegexComponentBuilder _ content: () -> some RegexComponent ) { let regex = content().regex + let result = regex.program.loweredProgram.canOnlyMatchAtStart print(""" - canOnlyMatchAtStart: \(regex.program.loweredProgram.canOnlyMatchAtStart) + canOnlyMatchAtStart: \(result) expectation: \(expectation) - equal? \(regex.program.loweredProgram.canOnlyMatchAtStart == expectation) + equal? \(result == expectation) """) + XCTAssertEqual(result ? 1 : 0, expectation ? 1 : 0, file: file, line: line) + if expectation { - XCTAssertTrue(regex.program.loweredProgram.canOnlyMatchAtStart, file: file, line: line) + XCTAssertTrue(result, file: file, line: line) } else { - XCTAssertFalse(regex.program.loweredProgram.canOnlyMatchAtStart, file: file, line: line) + XCTAssertFalse(result, file: file, line: line) } - XCTAssertTrue(expectation == regex.program.loweredProgram.canOnlyMatchAtStart, file: file, line: line) + XCTAssertTrue(expectation == result, file: file, line: line) - XCTAssertEqual(regex.program.loweredProgram.canOnlyMatchAtStart, expectation, file: file, line: line) + XCTAssertEqual(result, expectation, file: file, line: line) XCTAssertEqual( - regex.program.loweredProgram.canOnlyMatchAtStart.baseValue, + result.baseValue, expectation.baseValue, file: file, line: line) - print( - regex.program.loweredProgram.canOnlyMatchAtStart.baseValue, - expectation.baseValue) + fflush(stdout) } expectCanOnlyMatchAtStart(true) { From d1cd5b49ef13e8f35699ea118dd0c20c4e356292 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 21 Jul 2023 10:36:54 -0500 Subject: [PATCH 5/5] Change position of `canOnlyMatchAtStart` property The order of this property in MEProgram seems to determine whether or not it persists from the time it's stored to when it's accessed in RegexDSLTests. Clearly something else is going on here, but this works around the issue for now. --- .../_StringProcessing/Engine/MEBuilder.swift | 4 ++-- .../_StringProcessing/Engine/MEProgram.swift | 3 ++- Sources/_StringProcessing/Regex/DSLTree.swift | 1 - Tests/RegexBuilderTests/RegexDSLTests.swift | 21 ------------------- 4 files changed, 4 insertions(+), 25 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index e26a00fb1..ccbbd4440 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -395,6 +395,7 @@ extension MEProgram.Builder { regInfo.captures = nextCaptureRegister.rawValue return MEProgram( + canOnlyMatchAtStart: canOnlyMatchAtStart, instructions: InstructionList(instructions), staticElements: elements.stored, staticSequences: sequences.stored, @@ -407,8 +408,7 @@ extension MEProgram.Builder { enableMetrics: enableMetrics, captureList: captureList, referencedCaptureOffsets: referencedCaptureOffsets, - initialOptions: initialOptions, - canOnlyMatchAtStart: canOnlyMatchAtStart) + initialOptions: initialOptions) } mutating func reset() { self = Self() } diff --git a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift index 3107d5ef7..6c80ffc53 100644 --- a/Sources/_StringProcessing/Engine/MEProgram.swift +++ b/Sources/_StringProcessing/Engine/MEProgram.swift @@ -20,6 +20,8 @@ struct MEProgram { typealias MatcherFunction = (Input, Input.Index, Range) throws -> (Input.Index, Any)? + var canOnlyMatchAtStart: Bool + var instructions: InstructionList var staticElements: [Input.Element] @@ -38,7 +40,6 @@ struct MEProgram { let referencedCaptureOffsets: [ReferenceID: Int] var initialOptions: MatchingOptions - var canOnlyMatchAtStart: Bool } extension MEProgram: CustomStringConvertible { diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 65984210e..f24b87d09 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -724,7 +724,6 @@ extension DSLTree.Node { /// In particular, non-required groups and option-setting groups are /// inconclusive about where they can match. private func _canOnlyMatchAtStartImpl(_ options: inout MatchingOptions) -> Bool? { - print(self) switch self { // Defining cases case .atom(.assertion(.startOfSubject)): diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 660e9de32..013e5d851 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -985,28 +985,7 @@ class RegexDSLTests: XCTestCase { ) { let regex = content().regex let result = regex.program.loweredProgram.canOnlyMatchAtStart - print(""" - canOnlyMatchAtStart: \(result) - expectation: \(expectation) - equal? \(result == expectation) - """) - - XCTAssertEqual(result ? 1 : 0, expectation ? 1 : 0, file: file, line: line) - - if expectation { - XCTAssertTrue(result, file: file, line: line) - } else { - XCTAssertFalse(result, file: file, line: line) - } - - XCTAssertTrue(expectation == result, file: file, line: line) - XCTAssertEqual(result, expectation, file: file, line: line) - XCTAssertEqual( - result.baseValue, - expectation.baseValue, - file: file, line: line) - fflush(stdout) } expectCanOnlyMatchAtStart(true) {