diff --git a/Sources/RegexBuilder/Anchor.swift b/Sources/RegexBuilder/Anchor.swift index 31a3e8a0d..4508e3dd7 100644 --- a/Sources/RegexBuilder/Anchor.swift +++ b/Sources/RegexBuilder/Anchor.swift @@ -37,16 +37,30 @@ public struct Anchor { @available(SwiftStdlib 5.7, *) extension Anchor: RegexComponent { - var baseAssertion: DSLTree._AST.AssertionKind { + var baseAssertion: DSLTree.Atom.Assertion { switch kind { - case .startOfSubject: return .startOfSubject(isInverted) - case .endOfSubjectBeforeNewline: return .endOfSubjectBeforeNewline(isInverted) - case .endOfSubject: return .endOfSubject(isInverted) - case .firstMatchingPositionInSubject: return .firstMatchingPositionInSubject(isInverted) - case .textSegmentBoundary: return .textSegmentBoundary(isInverted) - case .startOfLine: return .startOfLine(isInverted) - case .endOfLine: return .endOfLine(isInverted) - case .wordBoundary: return .wordBoundary(isInverted) + case .startOfSubject: + // FIXME: Inverted? + return .startOfSubject + case .endOfSubjectBeforeNewline: + // FIXME: Inverted? + return .endOfSubjectBeforeNewline + case .endOfSubject: + // FIXME: Inverted? + return .endOfSubject + case .firstMatchingPositionInSubject: + // FIXME: Inverted? + return .firstMatchingPositionInSubject + case .textSegmentBoundary: + return isInverted ? .notTextSegment : .textSegment + case .startOfLine: + // FIXME: Inverted? + return .startOfLine + case .endOfLine: + // FIXME: Inverted? + return .endOfLine + case .wordBoundary: + return isInverted ? .notWordBoundary : .wordBoundary } } diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index 4e96e510d..ea52c28f3 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -45,6 +45,10 @@ extension RegexComponent where Self == CharacterClass { .init(DSLTree.CustomCharacterClass(members: [.atom(.any)])) } + public static var anyNonNewline: CharacterClass { + .init(DSLTree.CustomCharacterClass(members: [.atom(.anyNonNewline)])) + } + public static var anyGraphemeCluster: CharacterClass { .init(unconverted: ._anyGrapheme) } diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index f1419ad78..b03ce8c39 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -60,13 +60,13 @@ extension AST { case namedCharacter(String) /// . - case any + case dot /// ^ - case startOfLine + case caretAnchor /// $ - case endOfLine + case dollarAnchor // References case backreference(Reference) @@ -104,9 +104,9 @@ extension AST.Atom { case .callout(let v): return v case .backtrackingDirective(let v): return v case .changeMatchingOptions(let v): return v - case .any: return nil - case .startOfLine: return nil - case .endOfLine: return nil + case .dot: return nil + case .caretAnchor: return nil + case .dollarAnchor: return nil case .invalid: return nil } } @@ -511,67 +511,6 @@ extension AST.Atom.CharacterProperty { } } -extension AST.Atom { - /// Anchors and other built-in zero-width assertions. - public enum AssertionKind: String, Hashable { - /// \A - case startOfSubject = #"\A"# - - /// \Z - case endOfSubjectBeforeNewline = #"\Z"# - - /// \z - case endOfSubject = #"\z"# - - /// \K - case resetStartOfMatch = #"\K"# - - /// \G - case firstMatchingPositionInSubject = #"\G"# - - /// \y - case textSegment = #"\y"# - - /// \Y - case notTextSegment = #"\Y"# - - /// ^ - case startOfLine = #"^"# - - /// $ - case endOfLine = #"$"# - - /// \b (from outside a custom character class) - case wordBoundary = #"\b"# - - /// \B - case notWordBoundary = #"\B"# - - } - - public var assertionKind: AssertionKind? { - switch kind { - case .startOfLine: return .startOfLine - case .endOfLine: return .endOfLine - - case .escaped(.wordBoundary): return .wordBoundary - case .escaped(.notWordBoundary): return .notWordBoundary - case .escaped(.startOfSubject): return .startOfSubject - case .escaped(.endOfSubject): return .endOfSubject - case .escaped(.textSegment): return .textSegment - case .escaped(.notTextSegment): return .notTextSegment - case .escaped(.endOfSubjectBeforeNewline): - return .endOfSubjectBeforeNewline - case .escaped(.firstMatchingPositionInSubject): - return .firstMatchingPositionInSubject - - case .escaped(.resetStartOfMatch): return .resetStartOfMatch - - default: return nil - } - } -} - extension AST.Atom { public enum Callout: Hashable { /// A PCRE callout written `(?C...)` @@ -806,9 +745,9 @@ extension AST.Atom { // the AST? Or defer for the matching engine? return nil - case .scalarSequence, .property, .any, .startOfLine, .endOfLine, - .backreference, .subpattern, .callout, .backtrackingDirective, - .changeMatchingOptions, .invalid: + case .scalarSequence, .property, .dot, .caretAnchor, + .dollarAnchor, .backreference, .subpattern, .callout, + .backtrackingDirective, .changeMatchingOptions, .invalid: return nil } } @@ -858,7 +797,7 @@ extension AST.Atom { case .keyboardMetaControl(let x): return "\\M-\\C-\(x)" - case .property, .escaped, .any, .startOfLine, .endOfLine, + case .property, .escaped, .dot, .caretAnchor, .dollarAnchor, .backreference, .subpattern, .namedCharacter, .callout, .backtrackingDirective, .changeMatchingOptions, .invalid: return nil @@ -874,7 +813,7 @@ extension AST.Atom { // TODO: Are callouts quantifiable? case .escaped(let esc): return esc.isQuantifiable - case .startOfLine, .endOfLine: + case .caretAnchor, .dollarAnchor: return false default: return true diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 2168dbb03..4a4f5c05f 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -2073,9 +2073,9 @@ extension Parser { p.unreachable("Should have lexed a group or group-like atom") // (sometimes) special metacharacters - case ".": return customCC ? .char(".") : .any - case "^": return customCC ? .char("^") : .startOfLine - case "$": return customCC ? .char("$") : .endOfLine + case ".": return customCC ? .char(".") : .dot + case "^": return customCC ? .char("^") : .caretAnchor + case "$": return customCC ? .char("$") : .dollarAnchor // Escaped case "\\": return p.expectEscaped().value diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 0aeee282d..ea541fba7 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -221,7 +221,7 @@ extension RegexValidator { ) { switch esc { case .resetStartOfMatch, .singleDataUnit, .trueAnychar, - // '\N' needs to be emitted using 'emitAny'. + // '\N' needs to be emitted using 'emitDot'. .notNewline: error(.unsupported("'\\\(esc.character)'"), at: loc) @@ -288,7 +288,7 @@ extension RegexValidator { at: atom.location) } - case .char, .scalar, .startOfLine, .endOfLine, .any: + case .char, .scalar, .caretAnchor, .dollarAnchor, .dot: break case .invalid: diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 48a2512cf..cf5a56721 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -153,9 +153,9 @@ extension AST.Atom { case .keyboardControl, .keyboardMeta, .keyboardMetaControl: fatalError("TODO") - case .any: return "." - case .startOfLine: return "^" - case .endOfLine: return "$" + case .dot: return "." + case .caretAnchor: return "^" + case .dollarAnchor: return "$" case .backreference(let r), .subpattern(let r): return "\(r._dumpBase)" diff --git a/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift b/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift index 0e7cfb1d3..6b8c8ab93 100644 --- a/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift +++ b/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift @@ -237,9 +237,6 @@ extension AST.Atom.Number { extension AST.Atom { var _canonicalBase: String { - if let anchor = self.assertionKind { - return anchor.rawValue - } if let lit = self.literalStringValue { // FIXME: We may have to re-introduce escapes // For example, `\.` will come back as "." instead @@ -248,6 +245,10 @@ extension AST.Atom { return lit } switch self.kind { + case .caretAnchor: + return "^" + case .dollarAnchor: + return "$" case .escaped(let e): return "\\\(e.character)" case .backreference(let br): diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index beba6101b..477760ef8 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -58,6 +58,12 @@ fileprivate extension Compiler.ByteCodeGen { case .any: emitAny() + case .anyNonNewline: + emitAnyNonNewline() + + case .dot: + emitDot() + case let .char(c): emitCharacter(c) @@ -69,7 +75,7 @@ fileprivate extension Compiler.ByteCodeGen { } case let .assertion(kind): - try emitAssertion(kind.ast) + try emitAssertion(kind) case let .backreference(ref): try emitBackreference(ref.ast) @@ -142,8 +148,34 @@ fileprivate extension Compiler.ByteCodeGen { } } + mutating func emitStartOfLine() { + builder.buildAssert { [semanticLevel = options.semanticLevel] + (_, _, input, pos, subjectBounds) in + if pos == subjectBounds.lowerBound { return true } + switch semanticLevel { + case .graphemeCluster: + return input[input.index(before: pos)].isNewline + case .unicodeScalar: + return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline + } + } + } + + mutating func emitEndOfLine() { + builder.buildAssert { [semanticLevel = options.semanticLevel] + (_, _, input, pos, subjectBounds) in + if pos == subjectBounds.upperBound { return true } + switch semanticLevel { + case .graphemeCluster: + return input[pos].isNewline + case .unicodeScalar: + return input.unicodeScalars[pos].isNewline + } + } + } + mutating func emitAssertion( - _ kind: AST.Atom.AssertionKind + _ kind: DSLTree.Atom.Assertion ) throws { // FIXME: Depends on API model we have... We may want to // think through some of these with API interactions in mind @@ -200,43 +232,23 @@ fileprivate extension Compiler.ByteCodeGen { } case .startOfLine: - // FIXME: Anchor.startOfLine must always use this first branch - // The behavior of `^` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.startOfLine` anchor should always match the start - // of a line. Right now we don't distinguish between those anchors. + emitStartOfLine() + + case .endOfLine: + emitEndOfLine() + + case .caretAnchor: if options.anchorsMatchNewlines { - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.lowerBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[input.index(before: pos)].isNewline - case .unicodeScalar: - return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline - } - } + emitStartOfLine() } else { builder.buildAssert { (_, _, input, pos, subjectBounds) in pos == subjectBounds.lowerBound } } - - case .endOfLine: - // FIXME: Anchor.endOfLine must always use this first branch - // The behavior of `$` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.endOfLine` anchor should always match the end - // of a line. Right now we don't distinguish between those anchors. + + case .dollarAnchor: if options.anchorsMatchNewlines { - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.upperBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[pos].isNewline - case .unicodeScalar: - return input.unicodeScalars[pos].isNewline - } - } + emitEndOfLine() } else { builder.buildAssert { (_, _, input, pos, subjectBounds) in pos == subjectBounds.upperBound @@ -321,22 +333,26 @@ fileprivate extension Compiler.ByteCodeGen { } mutating func emitAny() { - switch (options.semanticLevel, options.dotMatchesNewline) { - case (.graphemeCluster, true): + switch options.semanticLevel { + case .graphemeCluster: builder.buildAdvance(1) - case (.graphemeCluster, false): + case .unicodeScalar: + // TODO: builder.buildAdvanceUnicodeScalar(1) builder.buildConsume { input, bounds in - input[bounds.lowerBound].isNewline - ? nil - : input.index(after: bounds.lowerBound) + input.unicodeScalars.index(after: bounds.lowerBound) } + } + } - case (.unicodeScalar, true): - // TODO: builder.buildAdvanceUnicodeScalar(1) + mutating func emitAnyNonNewline() { + switch options.semanticLevel { + case .graphemeCluster: builder.buildConsume { input, bounds in - input.unicodeScalars.index(after: bounds.lowerBound) + input[bounds.lowerBound].isNewline + ? nil + : input.index(after: bounds.lowerBound) } - case (.unicodeScalar, false): + case .unicodeScalar: builder.buildConsume { input, bounds in input[bounds.lowerBound].isNewline ? nil @@ -345,6 +361,14 @@ fileprivate extension Compiler.ByteCodeGen { } } + mutating func emitDot() { + if options.dotMatchesNewline { + emitAny() + } else { + emitAnyNonNewline() + } + } + mutating func emitAlternation( _ children: [DSLTree.Node] ) throws { @@ -823,9 +847,9 @@ fileprivate extension Compiler.ByteCodeGen { try emitQuantification(amt.ast, kind, child) case let .customCharacterClass(ccc): - if ccc.containsAny { + if ccc.containsDot { if !ccc.isInverted { - emitAny() + emitDot() } else { throw Unsupported("Inverted any") } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 4b98bc17c..668d16eb6 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -140,6 +140,25 @@ extension DSLTree.Atom { } } + case .anyNonNewline: + switch opts.semanticLevel { + case .graphemeCluster: + return { input, bounds in + input[bounds.lowerBound].isNewline + ? nil + : input.index(after: bounds.lowerBound) + } + case .unicodeScalar: + return { input, bounds in + input[bounds.lowerBound].isNewline + ? nil + : input.unicodeScalars.index(after: bounds.lowerBound) + } + } + + case .dot: + throw Unreachable(".atom(.dot) should be handled by emitDot") + case .assertion: // TODO: We could handle, should this be total? return nil @@ -285,12 +304,12 @@ extension AST.Atom { case let .namedCharacter(name): return consumeName(name, opts: opts) - case .any: + case .dot: assertionFailure( "Should have been handled by tree conversion") - fatalError(".atom(.any) is handled in emitAny") + fatalError(".atom(.dot) is handled in emitDot") - case .startOfLine, .endOfLine: + case .caretAnchor, .dollarAnchor: // handled in emitAssertion return nil diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 119a5d14f..80f2e7697 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -647,7 +647,7 @@ extension StringLiteralBuilder: CustomStringConvertible { var description: String { result } } -extension AST.Atom.AssertionKind { +extension DSLTree.Atom.Assertion { // TODO: Some way to integrate this with conversion... var _patternBase: String { switch self { @@ -655,6 +655,12 @@ extension AST.Atom.AssertionKind { return "Anchor.startOfLine" case .endOfLine: return "Anchor.endOfLine" + case .caretAnchor: + // The DSL doesn't have an equivalent to this, so print as regex. + return "/^/" + case .dollarAnchor: + // The DSL doesn't have an equivalent to this, so print as regex. + return "/$/" case .wordBoundary: return "Anchor.wordBoundary" case .notWordBoundary: @@ -833,7 +839,7 @@ extension AST.Atom { /// /// TODO: Some way to integrate this with conversion... var _patternBase: (String, canBeWrapped: Bool) { - if let anchor = self.assertionKind { + if let anchor = self.dslAssertionKind { return (anchor._patternBase, false) } @@ -919,10 +925,11 @@ extension AST.Atom { case .namedCharacter: return (" /* TODO: named character */", false) - case .any: - return (".any", true) + case .dot: + // The DSL does not have an equivalent to '.', print as a regex. + return ("/./", false) - case .startOfLine, .endOfLine: + case .caretAnchor, .dollarAnchor: fatalError("unreachable") case .backreference: @@ -974,10 +981,10 @@ extension AST.Atom { case .namedCharacter(let n): return "\\N{\(n)}" - case .any: + case .dot: return "." - case .startOfLine, .endOfLine: + case .caretAnchor, .dollarAnchor: fatalError("unreachable") case .backreference: @@ -1125,6 +1132,13 @@ extension DSLTree.Atom { switch self { case .any: return (".any", true) + + case .anyNonNewline: + return (".anyNonNewline", true) + + case .dot: + // The DSL does not have an equivalent to '.', print as a regex. + return ("/./", false) case let .char(c): return (String(c)._quoted, false) @@ -1141,7 +1155,7 @@ extension DSLTree.Atom { } case .assertion(let a): - return (a.ast._patternBase, false) + return (a._patternBase, false) case .backreference(_): return ("/* TOOD: backreferences */", false) @@ -1166,6 +1180,12 @@ extension DSLTree.Atom { var _regexBase: String { switch self { case .any: + return "(?s:.)" + + case .anyNonNewline: + return "(?-s:.)" + + case .dot: return "." case let .char(c): diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 8e58280c0..c4ac8e759 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -208,16 +208,44 @@ extension AST.CustomCharacterClass { } } +extension AST.Atom.EscapedBuiltin { + var dslAssertionKind: DSLTree.Atom.Assertion? { + switch self { + case .wordBoundary: return .wordBoundary + case .notWordBoundary: return .notWordBoundary + case .startOfSubject: return .startOfSubject + case .endOfSubject: return .endOfSubject + case .textSegment: return .textSegment + case .notTextSegment: return .notTextSegment + case .endOfSubjectBeforeNewline: return .endOfSubjectBeforeNewline + case .firstMatchingPositionInSubject: return .firstMatchingPositionInSubject + case .resetStartOfMatch: return .resetStartOfMatch + default: return nil + } + } +} + +extension AST.Atom { + var dslAssertionKind: DSLTree.Atom.Assertion? { + switch kind { + case .caretAnchor: return .caretAnchor + case .dollarAnchor: return .dollarAnchor + case .escaped(let b): return b.dslAssertionKind + default: return nil + } + } +} + extension AST.Atom { var dslTreeAtom: DSLTree.Atom { - if let kind = assertionKind { - return .assertion(.init(ast: kind)) + if let kind = dslAssertionKind { + return .assertion(kind) } switch self.kind { case let .char(c): return .char(c) case let .scalar(s): return .scalar(s.value) - case .any: return .any + case .dot: return .dot case let .backreference(r): return .backreference(.init(ast: r)) case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq)) diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index d52f30f2e..4ea905fd5 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -117,11 +117,11 @@ extension DSLTree { var members: [Member] var isInverted: Bool - var containsAny: Bool { + var containsDot: Bool { members.contains { member in switch member { - case .atom(.any): return true - case .custom(let ccc): return ccc.containsAny + case .atom(.dot): return true + case .custom(let ccc): return ccc.containsDot default: return false } @@ -165,9 +165,19 @@ extension DSLTree { public enum Atom { case char(Character) case scalar(Unicode.Scalar) + + /// Any character, including newlines. case any - case assertion(_AST.AssertionKind) + /// Any character, excluding newlines. This differs from '.', as it is not + /// affected by single line mode. + case anyNonNewline + + /// The DSL representation of '.' in a regex literal. This does not match + /// newlines unless single line mode is enabled. + case dot + + case assertion(Assertion) case backreference(_AST.Reference) case symbolicReference(ReferenceID) @@ -177,6 +187,52 @@ extension DSLTree { } } +extension DSLTree.Atom { + @_spi(RegexBuilder) + public enum Assertion: Hashable { + /// \A + case startOfSubject + + /// \Z + case endOfSubjectBeforeNewline + + /// \z + case endOfSubject + + /// \K + case resetStartOfMatch + + /// \G + case firstMatchingPositionInSubject + + /// \y + case textSegment + + /// \Y + case notTextSegment + + /// The DSL's Anchor.startOfLine, which matches the start of a line + /// even if `anchorsMatchNewlines` is false. + case startOfLine + + /// The DSL's Anchor.endOfLine, which matches the end of a line + /// even if `anchorsMatchNewlines` is false. + case endOfLine + + /// ^ + case caretAnchor + + /// $ + case dollarAnchor + + /// \b (from outside a custom character class) + case wordBoundary + + /// \B + case notWordBoundary + } +} + extension Unicode.GeneralCategory { var extendedGeneralCategory: Unicode.ExtendedGeneralCategory? { switch self { @@ -693,40 +749,6 @@ extension DSLTree { internal var ast: AST.AbsentFunction } - @_spi(RegexBuilder) - public struct AssertionKind { - internal var ast: AST.Atom.AssertionKind - - public static func startOfSubject(_ inverted: Bool = false) -> Self { - .init(ast: .startOfSubject) - } - public static func endOfSubjectBeforeNewline(_ inverted: Bool = false) -> Self { - .init(ast: .endOfSubjectBeforeNewline) - } - public static func endOfSubject(_ inverted: Bool = false) -> Self { - .init(ast: .endOfSubject) - } - public static func firstMatchingPositionInSubject(_ inverted: Bool = false) -> Self { - .init(ast: .firstMatchingPositionInSubject) - } - public static func textSegmentBoundary(_ inverted: Bool = false) -> Self { - inverted - ? .init(ast: .notTextSegment) - : .init(ast: .textSegment) - } - public static func startOfLine(_ inverted: Bool = false) -> Self { - .init(ast: .startOfLine) - } - public static func endOfLine(_ inverted: Bool = false) -> Self { - .init(ast: .endOfLine) - } - public static func wordBoundary(_ inverted: Bool = false) -> Self { - inverted - ? .init(ast: .notWordBoundary) - : .init(ast: .wordBoundary) - } - } - @_spi(RegexBuilder) public struct Reference { internal var ast: AST.Reference @@ -757,8 +779,7 @@ extension DSLTree { .init(ast: .init(.escaped(.horizontalWhitespace), .fake)) } public static var _newlineSequence: Self { - // FIXME: newline sequence is not same as \n - .init(ast: .init(.escaped(.newline), .fake)) + .init(ast: .init(.escaped(.newlineSequence), .fake)) } public static var _verticalWhitespace: Self { .init(ast: .init(.escaped(.verticalTab), .fake)) @@ -778,7 +799,8 @@ extension DSLTree.Atom { switch self { case .changeMatchingOptions, .assertion: return false - case .char, .scalar, .any, .backreference, .symbolicReference, .unconverted: + case .char, .scalar, .any, .anyNonNewline, .dot, .backreference, + .symbolicReference, .unconverted: return true } } diff --git a/Sources/_StringProcessing/Utility/RegexFactory.swift b/Sources/_StringProcessing/Utility/RegexFactory.swift index 693b04966..31245c0f7 100644 --- a/Sources/_StringProcessing/Utility/RegexFactory.swift +++ b/Sources/_StringProcessing/Utility/RegexFactory.swift @@ -40,7 +40,7 @@ public struct _RegexFactory { @_spi(RegexBuilder) @available(SwiftStdlib 5.7, *) public func assertion( - _ kind: DSLTree._AST.AssertionKind + _ kind: DSLTree.Atom.Assertion ) -> Regex { .init(node: .atom(.assertion(kind))) } diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index c0de6ebaa..9f515f220 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -245,8 +245,8 @@ extension AST.Atom { // this? Or does grapheme-semantic mode complicate that? return nil - case .any: - // `.any` is handled in the matching engine by Compiler.emitAny() and in + case .dot: + // `.dot` is handled in the matching engine by Compiler.emitDot() and in // the legacy compiler by the `.any` instruction, which can provide lower // level instructions than the CharacterClass-generated consumer closure // @@ -275,7 +275,7 @@ extension AST.Atom.EscapedBuiltin { // FIXME: This is more like '.' than inverted '\R', as it is affected // by e.g (*CR). We should therefore really be emitting it through - // emitAny(). For now we treat it as semantically invalid. + // emitDot(). For now we treat it as semantically invalid. case .notNewline: return .newlineSequence.inverted case .whitespace: return .whitespace diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index bf6e48607..84a2d11ad 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -73,6 +73,9 @@ class RegexDSLTests: XCTestCase { XCTAssertTrue(match.output == substringMatch.output) } + let allNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n\u{85}\u{2028}\u{2029}" + let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n" + func testCharacterClasses() throws { try _testDSLCaptures( ("a c", ("a c", " ", "c")), @@ -114,6 +117,137 @@ class RegexDSLTests: XCTestCase { CharacterClass.whitespace.inverted } } + + // `.newlineSequence` and `.verticalWhitespace` match the same set of + // newlines in grapheme semantic mode, and scalar mode when applied with + // OneOrMore. + for cc in [CharacterClass.newlineSequence, .verticalWhitespace] { + for mode in [RegexSemanticLevel.unicodeScalar, .graphemeCluster] { + try _testDSLCaptures( + ("\n", ("\n", "\n")), + ("\r", ("\r", "\r")), + ("\r\n", ("\r\n", "\r\n")), + (allNewlines, (allNewlines[...], allNewlines[...])), + ("abc\ndef", ("abc\ndef", "\n")), + ("abc\n\r\ndef", ("abc\n\r\ndef", "\n\r\n")), + ("abc\(allNewlines)def", ("abc\(allNewlines)def", allNewlines[...])), + ("abc", nil), + matchType: (Substring, Substring).self, ==) + { + Regex { + ZeroOrMore { + cc.inverted + } + Capture { + OneOrMore(cc) + } + ZeroOrMore { + cc.inverted + } + }.matchingSemantics(mode) + } + + // Try with ASCII-only whitespace. + try _testDSLCaptures( + ("\n", ("\n", "\n")), + ("\r", ("\r", "\r")), + ("\r\n", ("\r\n", "\r\n")), + (allNewlines, (allNewlines[...], asciiNewlines[...])), + ("abc\ndef", ("abc\ndef", "\n")), + ("abc\n\r\ndef", ("abc\n\r\ndef", "\n\r\n")), + ("abc\(allNewlines)def", ("abc\(allNewlines)def", asciiNewlines[...])), + ("abc", nil), + matchType: (Substring, Substring).self, ==) + { + Regex { + ZeroOrMore { + cc.inverted + } + Capture { + OneOrMore(cc) + } + ZeroOrMore { + cc.inverted + } + }.matchingSemantics(mode).asciiOnlyWhitespace() + } + } + } + + // `.newlineSequence` in scalar mode may match a single `\r\n`. + // `.verticalWhitespace` may not. + for asciiOnly in [true, false] { + try _testDSLCaptures( + ("\r", "\r"), + ("\r\n", "\r\n"), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.newlineSequence + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + try _testDSLCaptures( + ("\r", nil), + ("\r\n", nil), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.newlineSequence.inverted + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + try _testDSLCaptures( + ("\r", "\r"), + ("\r\n", nil), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.verticalWhitespace + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + try _testDSLCaptures( + ("\r", nil), + ("\r\n", nil), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.verticalWhitespace.inverted + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + try _testDSLCaptures( + ("\r", nil), + ("\r\n", nil), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.verticalWhitespace.inverted + "\n" + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + } + + // Make sure horizontal whitespace does not match newlines or other + // vertical whitespace. + try _testDSLCaptures( + (" \u{A0} \u{9} \t ", " \u{A0} \u{9} \t "), + (" \n", nil), + (" \r", nil), + (" \r\n", nil), + (" \u{2028}", nil), + matchType: Substring.self, ==) + { + OneOrMore(.horizontalWhitespace) + } + + // Horizontal whitespace in ASCII mode. + try _testDSLCaptures( + (" \u{9} \t ", " \u{9} \t "), + ("\u{A0}", nil), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.horizontalWhitespace) + }.asciiOnlyWhitespace() + } } func testCharacterClassOperations() throws { @@ -137,6 +271,105 @@ class RegexDSLTests: XCTestCase { } } + func testAny() throws { + // .any matches newlines regardless of matching options. + for dotMatchesNewline in [true, false] { + try _testDSLCaptures( + ("abc\(allNewlines)def", "abc\(allNewlines)def"), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.any) + }.dotMatchesNewlines(dotMatchesNewline) + } + } + + // `.anyGraphemeCluster` is the same as `.any` in grapheme mode. + for mode in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { + try _testDSLCaptures( + ("a", "a"), + ("\r\n", "\r\n"), + ("e\u{301}", "e\u{301}"), + ("e\u{301}f", nil), + ("e\u{303}\u{301}\u{302}", "e\u{303}\u{301}\u{302}"), + matchType: Substring.self, ==) + { + Regex { + One(.anyGraphemeCluster) + }.matchingSemantics(mode) + } + + // Like `.any` it also always matches newlines. + for dotMatchesNewline in [true, false] { + try _testDSLCaptures( + ("abc\(allNewlines)def", "abc\(allNewlines)def"), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.anyGraphemeCluster) + }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) + } + } + } + } + + func testAnyNonNewline() throws { + // `.anyNonNewline` is `.` without single-line mode. + for mode in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { + for dotMatchesNewline in [true, false] { + try _testDSLCaptures( + ("abcdef", "abcdef"), + ("abcdef\n", nil), + ("\r\n", nil), + ("\r", nil), + ("\n", nil), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.anyNonNewline) + }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) + } + + try _testDSLCaptures( + ("abcdef", nil), + ("abcdef\n", nil), + ("\r\n", "\r\n"), + ("\r", "\r"), + ("\n", "\n"), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.anyNonNewline.inverted) + }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) + } + + try _testDSLCaptures( + ("abc", "abc"), + ("abcd", nil), + ("\r\n", nil), + ("\r", nil), + ("\n", nil), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(CharacterClass.anyNonNewline.intersection(.anyOf("\n\rabc"))) + }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) + } + } + } + + try _testDSLCaptures( + ("\r\n", "\r\n"), matchType: Substring.self, ==) { + CharacterClass.anyNonNewline.inverted + } + try _testDSLCaptures( + ("\r\n", nil), matchType: Substring.self, ==) { + Regex { + CharacterClass.anyNonNewline.inverted + }.matchingSemantics(.unicodeScalar) + } + } + func testMatchResultDotZeroWithoutCapture() throws { let match = try XCTUnwrap("aaa".wholeMatch { OneOrMore { "a" } }) XCTAssertEqual(match.0, "aaa") @@ -678,19 +911,40 @@ class RegexDSLTests: XCTestCase { Anchor.endOfSubject }.anchorsMatchLineEndings() } - - // FIXME: Anchor.start/endOfLine needs to always match line endings, - // even when the `anchorsMatchLineEndings()` option is turned off. + try _testDSLCaptures( - ("\naaa", "aaa"), - ("aaa\n", "aaa"), - ("\naaa\n", "aaa"), - matchType: Substring.self, ==, xfail: true) + ("\naaa", "\naaa"), + ("aaa\n", "aaa\n"), + ("\naaa\n", "\naaa\n"), + matchType: Substring.self, ==) { Regex { + Optionally { "\n" } Anchor.startOfLine Repeat("a", count: 3) Anchor.endOfLine + Optionally { "\n" } + } + } + + // startOfLine/endOfLine apply regardless of mode. + for matchLineEndings in [true, false] { + for mode in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { + let r = Regex { + Anchor.startOfLine + Repeat("a", count: 3) + Anchor.endOfLine + }.anchorsMatchLineEndings(matchLineEndings).matchingSemantics(mode) + + XCTAssertNotNil(try r.firstMatch(in: "\naaa")) + XCTAssertNotNil(try r.firstMatch(in: "aaa\n")) + XCTAssertNotNil(try r.firstMatch(in: "\naaa\n")) + XCTAssertNotNil(try r.firstMatch(in: "\naaa\r\n")) + XCTAssertNotNil(try r.firstMatch(in: "\r\naaa\n")) + XCTAssertNotNil(try r.firstMatch(in: "\r\naaa\r\n")) + + XCTAssertNil(try r.firstMatch(in: "\nbaaa\n")) + XCTAssertNil(try r.firstMatch(in: "\naaab\n")) } } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index f2715eac1..c087974a7 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -641,7 +641,41 @@ extension RegexTests { ("\n", true), ("\r", true), ("\r\n", false)) - + + let allNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n\u{85}\u{2028}\u{2029}" + let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n" + + for level in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { + firstMatchTest( + #"\R+"#, + input: "abc\(allNewlines)def", match: allNewlines, + semanticLevel: level + ) + firstMatchTest( + #"\v+"#, + input: "abc\(allNewlines)def", match: allNewlines, + semanticLevel: level + ) + } + + // In scalar mode, \R can match \r\n, \v cannot. + firstMatchTest( + #"\R"#, input: "\r\n", match: "\r\n", semanticLevel: .unicodeScalar) + firstMatchTest( + #"\v"#, input: "\r\n", match: "\r", semanticLevel: .unicodeScalar) + firstMatchTest( + #"\v\v"#, input: "\r\n", match: "\r\n", semanticLevel: .unicodeScalar) + firstMatchTest( + #"[^\v]"#, input: "\r\n", match: nil, semanticLevel: .unicodeScalar) + + // ASCII-only spaces. + firstMatchTest(#"(?S)\R+"#, input: allNewlines, match: asciiNewlines) + firstMatchTest(#"(?S)\v+"#, input: allNewlines, match: asciiNewlines) + firstMatchTest( + #"(?S)\R"#, input: "\r\n", match: "\r\n", semanticLevel: .unicodeScalar) + firstMatchTest( + #"(?S)\v"#, input: "\r\n", match: "\r", semanticLevel: .unicodeScalar) + matchTest( #"[a]\u0301"#, ("a\u{301}", false), diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 3c43f27af..52a272915 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -359,14 +359,14 @@ extension RegexTests { parseTest( "(.)*(.*)", concat( - zeroOrMore(of: capture(atom(.any))), - capture(zeroOrMore(of: atom(.any)))), + zeroOrMore(of: capture(atom(.dot))), + capture(zeroOrMore(of: atom(.dot)))), captures: [.opt, .cap]) parseTest( "((.))*((.)?)", concat( - zeroOrMore(of: capture(capture(atom(.any)))), - capture(zeroOrOne(of: capture(atom(.any))))), + zeroOrMore(of: capture(capture(atom(.dot)))), + capture(zeroOrOne(of: capture(atom(.dot))))), captures: [.opt, .opt, .cap, .opt]) parseTest( #"abc\d"#, @@ -479,7 +479,7 @@ extension RegexTests { parseTest(#"abc\d"#, concat("a", "b", "c", escaped(.decimalDigit))) - // FIXME: '\N' should be emitted through 'emitAny', not through the + // FIXME: '\N' should be emitted through 'emitDot', not through the // _CharacterClassModel model. parseTest(#"\N"#, escaped(.notNewline), unsupported: true) diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index 6822330f3..3b0a8d5b3 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -68,7 +68,38 @@ extension RenderDSLTests { } """) } - + + func testDot() throws { + try testConversion(#".+"#, #""" + Regex { + OneOrMore { + /./ + } + } + """#) + try testConversion(#"a.c"#, #""" + Regex { + "a" + /./ + "c" + } + """#) + } + + func testAnchor() throws { + try testConversion(#"^(?:a|b|c)$"#, #""" + Regex { + /^/ + ChoiceOf { + "a" + "b" + "c" + } + /$/ + } + """#) + } + func testOptions() throws { try XCTExpectFailure("Options like '(?i)' aren't converted") { try testConversion(#"(?i)abc"#, """