From 8f934987975a3f5fd5303284af39c5aa67dc3ec1 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 14 Jul 2022 17:53:38 +0100 Subject: [PATCH 1/3] Rip out unused _CharacterClassModel API Remove the DSL -> _CharacterClassModel conversion, and _CharacterClassModel's custom character class matching logic, none of which is being used. --- .../_CharacterClassModel.swift | 192 ------------------ 1 file changed, 192 deletions(-) diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index db2088782..c1183972b 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -50,74 +50,6 @@ public struct _CharacterClassModel: Hashable { case whitespace /// Character.isLetter or Character.isDigit or Character == "_" case word - /// One of the custom character set. - case custom([CharacterSetComponent]) - } - - public enum SetOperator: Hashable { - case subtraction - case intersection - case symmetricDifference - } - - /// A binary set operation that forms a character class component. - public struct SetOperation: Hashable { - var lhs: CharacterSetComponent - var op: SetOperator - var rhs: CharacterSetComponent - - func matches(_ c: Character, with options: MatchingOptions) -> Bool { - switch op { - case .intersection: - return lhs.matches(c, with: options) && rhs.matches(c, with: options) - case .subtraction: - return lhs.matches(c, with: options) && !rhs.matches(c, with: options) - case .symmetricDifference: - return lhs.matches(c, with: options) != rhs.matches(c, with: options) - } - } - } - - public enum CharacterSetComponent: Hashable { - case character(Character) - case range(ClosedRange<Character>) - - /// A nested character class. - case characterClass(_CharacterClassModel) - - /// A binary set operation of character class components. 
- indirect case setOperation(SetOperation) - - public static func setOperation( - lhs: CharacterSetComponent, op: SetOperator, rhs: CharacterSetComponent - ) -> CharacterSetComponent { - .setOperation(.init(lhs: lhs, op: op, rhs: rhs)) - } - - func matches(_ character: Character, with options: MatchingOptions) -> Bool { - switch self { - case .character(let c): - if options.isCaseInsensitive { - return c.lowercased() == character.lowercased() - } else { - return c == character - } - case .range(let range): - if options.isCaseInsensitive { - let newLower = range.lowerBound.lowercased() - let newUpper = range.upperBound.lowercased() - // FIXME: Is failing this possible? Is this the right behavior if so? - guard newLower <= newUpper else { return false } - return (newLower...newUpper).contains(character.lowercased()) - } else { - return range.contains(character) - } - case .characterClass(let custom): - let str = String(character) - return custom.matches(in: str, at: str.startIndex, with: options) != nil - case .setOperation(let op): return op.matches(character, with: options) - } - } } enum MatchLevel: Hashable { @@ -188,8 +120,6 @@ public struct _CharacterClassModel: Hashable { matched = c.isWhitespace && (c.isASCII || !options.usesASCIISpaces) case .word: matched = c.isWordCharacter && (c.isASCII || !options.usesASCIIWord) - case .custom(let set): - matched = set.any { $0.matches(c, with: options) } } if isInverted { matched.toggle() @@ -222,8 +152,6 @@ public struct _CharacterClassModel: Hashable { matched = c.properties.isWhitespace && (c.isASCII || !options.usesASCIISpaces) case .word: matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !options.usesASCIIWord) - case .custom(let set): - matched = set.any { $0.matches(Character(c), with: options) } } if isInverted { matched.toggle() @@ -286,23 +214,6 @@ extension _CharacterClassModel { public static var word: _CharacterClassModel { .init(cc: .word, matchLevel: .graphemeCluster) } - - public static 
func custom( - _ components: [_CharacterClassModel.CharacterSetComponent] - ) -> _CharacterClassModel { - .init(cc: .custom(components), matchLevel: .graphemeCluster) - } -} - -extension _CharacterClassModel.CharacterSetComponent: CustomStringConvertible { - public var description: String { - switch self { - case .range(let range): return "" - case .character(let character): return "" - case .characterClass(let custom): return "\(custom)" - case .setOperation(let op): return "<\(op.lhs) \(op.op) \(op.rhs)>" - } - } } extension _CharacterClassModel.Representation: CustomStringConvertible { @@ -318,7 +229,6 @@ extension _CharacterClassModel.Representation: CustomStringConvertible { case .verticalWhitespace: return "vertical whitespace" case .whitespace: return "" case .word: return "" - case .custom(let set): return "" } } } @@ -391,22 +301,6 @@ extension _CharacterClassModel { } } -extension DSLTree.Node { - var characterClass: _CharacterClassModel? { - switch self { - case let .customCharacterClass(ccc): - return ccc.modelCharacterClass - case let .atom(a): - return a.characterClass - case .characterPredicate: - // FIXME: Do we make one from this? - return nil - default: - return nil - } - } -} - extension _CharacterClassModel { func withMatchLevel( _ level: _CharacterClassModel.MatchLevel @@ -417,17 +311,6 @@ extension _CharacterClassModel { } } -extension DSLTree.Atom { - var characterClass: _CharacterClassModel? { - switch self { - case let .unconverted(a): - return a.ast.characterClass - - default: return nil - } - } -} - extension AST.Atom { var characterClass: _CharacterClassModel? { switch kind { @@ -489,81 +372,6 @@ extension AST.Atom.EscapedBuiltin { } } -extension DSLTree.CustomCharacterClass { - // TODO: Refactor a bit, and... can we drop this type? - var modelCharacterClass: _CharacterClassModel? 
{ - var result = - Array<_CharacterClassModel.CharacterSetComponent>() - for m in members { - switch m { - case let .atom(a): - if let cc = a.characterClass { - result.append(.characterClass(cc)) - } else if let c = a.literalCharacterValue { - result.append(.character(c)) - } else { - return nil - } - case let .range(low, high): - guard let lhs = low.literalCharacterValue, - let rhs = high.literalCharacterValue - else { - return nil - } - result.append(.range(lhs...rhs)) - - case let .custom(ccc): - guard let cc = ccc.modelCharacterClass else { - return nil - } - result.append(.characterClass(cc)) - - case let .intersection(lhs, rhs): - guard let lhs = lhs.modelCharacterClass, - let rhs = rhs.modelCharacterClass - else { - return nil - } - result.append(.setOperation( - lhs: .characterClass(lhs), - op: .intersection, - rhs: .characterClass(rhs))) - - case let .subtraction(lhs, rhs): - guard let lhs = lhs.modelCharacterClass, - let rhs = rhs.modelCharacterClass - else { - return nil - } - result.append(.setOperation( - lhs: .characterClass(lhs), - op: .subtraction, - rhs: .characterClass(rhs))) - - case let .symmetricDifference(lhs, rhs): - guard let lhs = lhs.modelCharacterClass, - let rhs = rhs.modelCharacterClass - else { - return nil - } - result.append(.setOperation( - lhs: .characterClass(lhs), - op: .symmetricDifference, - rhs: .characterClass(rhs))) - - case let .quotedLiteral(s): - // Decompose quoted literal into literal characters. - result += s.map { .character($0) } - - case .trivia: - break - } - } - let cc = _CharacterClassModel.custom(result) - return isInverted ? 
cc.inverted : cc - } -} - extension _CharacterClassModel { // FIXME: Calling on inverted sets wont be the same as the // inverse of a boundary if at the start or end of the From 297a69d9c134f02ffd7bc7d52db2e7e5adec2672 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 14 Jul 2022 17:53:38 +0100 Subject: [PATCH 2/3] Remove _CharacterClassModel conformance to RegexComponent --- Sources/_StringProcessing/_CharacterClassModel.swift | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index c1183972b..e280ba473 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -161,18 +161,6 @@ public struct _CharacterClassModel: Hashable { } } -@available(SwiftStdlib 5.7, *) -extension _CharacterClassModel: RegexComponent { - public typealias RegexOutput = Substring - - public var regex: Regex<RegexOutput> { - guard let ast = self.makeAST() else { - fatalError("FIXME: extended AST?") - } - return Regex(ast: ast) - } -} - @_spi(RegexBuilder) extension _CharacterClassModel { public static var any: _CharacterClassModel { From 7d5e86d2cdc1d5c862d44bc40edfa4eebae70666 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 14 Jul 2022 17:53:39 +0100 Subject: [PATCH 3/3] Internalize `_CharacterClassModel` `makeDSLTreeCharacterClass` was the last API that required it to be public. Remove it, and replace it with some static members on `_AST.Atom`. 
--- Sources/RegexBuilder/CharacterClass.swift | 21 ++--- Sources/_StringProcessing/Regex/DSLTree.swift | 26 +++++ .../_CharacterClassModel.swift | 94 +++---------------- 3 files changed, 50 insertions(+), 91 deletions(-) diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index a6d18b2cf..4e96e510d 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -20,11 +20,8 @@ public struct CharacterClass { self.ccc = ccc } - init(unconverted model: _CharacterClassModel) { - guard let ccc = model.makeDSLTreeCharacterClass() else { - fatalError("Unsupported character class") - } - self.ccc = ccc + init(unconverted atom: DSLTree._AST.Atom) { + self.ccc = .init(members: [.atom(.unconverted(atom))]) } } @@ -49,15 +46,15 @@ extension RegexComponent where Self == CharacterClass { } public static var anyGraphemeCluster: CharacterClass { - .init(unconverted: .anyGrapheme) + .init(unconverted: ._anyGrapheme) } public static var whitespace: CharacterClass { - .init(unconverted: .whitespace) + .init(unconverted: ._whitespace) } public static var digit: CharacterClass { - .init(unconverted: .digit) + .init(unconverted: ._digit) } public static var hexDigit: CharacterClass { @@ -69,19 +66,19 @@ extension RegexComponent where Self == CharacterClass { } public static var horizontalWhitespace: CharacterClass { - .init(unconverted: .horizontalWhitespace) + .init(unconverted: ._horizontalWhitespace) } public static var newlineSequence: CharacterClass { - .init(unconverted: .newlineSequence) + .init(unconverted: ._newlineSequence) } public static var verticalWhitespace: CharacterClass { - .init(unconverted: .verticalWhitespace) + .init(unconverted: ._verticalWhitespace) } public static var word: CharacterClass { - .init(unconverted: .word) + .init(unconverted: ._word) } } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 
c251dded7..d52f30f2e 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -740,6 +740,32 @@ extension DSLTree { @_spi(RegexBuilder) public struct Atom { internal var ast: AST.Atom + + // FIXME: The below APIs should be removed once the DSL tree has been + // migrated to use proper DSL atoms for them. + + public static var _anyGrapheme: Self { + .init(ast: .init(.escaped(.graphemeCluster), .fake)) + } + public static var _whitespace: Self { + .init(ast: .init(.escaped(.whitespace), .fake)) + } + public static var _digit: Self { + .init(ast: .init(.escaped(.decimalDigit), .fake)) + } + public static var _horizontalWhitespace: Self { + .init(ast: .init(.escaped(.horizontalWhitespace), .fake)) + } + public static var _newlineSequence: Self { + // FIXME: newline sequence is not same as \n + .init(ast: .init(.escaped(.newline), .fake)) + } + public static var _verticalWhitespace: Self { + .init(ast: .init(.escaped(.verticalTab), .fake)) + } + public static var _word: Self { + .init(ast: .init(.escaped(.wordCharacter), .fake)) + } } } } diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index e280ba473..c0de6ebaa 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -15,8 +15,7 @@ // an AST, but this isn't a natural thing to produce in the context // of parsing or to store in an AST -@_spi(RegexBuilder) -public struct _CharacterClassModel: Hashable { +struct _CharacterClassModel: Hashable { /// The actual character class to match. var cc: Representation @@ -28,7 +27,7 @@ public struct _CharacterClassModel: Hashable { var isInverted: Bool = false // TODO: Split out builtin character classes into their own type? 
- public enum Representation: Hashable { + enum Representation: Hashable { /// Any character case any /// Any grapheme cluster @@ -85,7 +84,7 @@ public struct _CharacterClassModel: Hashable { } /// Inverts a character class. - public var inverted: Self { + var inverted: Self { return withInversion(true) } @@ -161,51 +160,50 @@ public struct _CharacterClassModel: Hashable { } } -@_spi(RegexBuilder) extension _CharacterClassModel { - public static var any: _CharacterClassModel { + static var any: _CharacterClassModel { .init(cc: .any, matchLevel: .graphemeCluster) } - public static var anyGrapheme: _CharacterClassModel { + static var anyGrapheme: _CharacterClassModel { .init(cc: .anyGrapheme, matchLevel: .graphemeCluster) } - public static var anyUnicodeScalar: _CharacterClassModel { + static var anyUnicodeScalar: _CharacterClassModel { .init(cc: .any, matchLevel: .unicodeScalar) } - public static var whitespace: _CharacterClassModel { + static var whitespace: _CharacterClassModel { .init(cc: .whitespace, matchLevel: .graphemeCluster) } - public static var digit: _CharacterClassModel { + static var digit: _CharacterClassModel { .init(cc: .digit, matchLevel: .graphemeCluster) } - public static var hexDigit: _CharacterClassModel { + static var hexDigit: _CharacterClassModel { .init(cc: .hexDigit, matchLevel: .graphemeCluster) } - public static var horizontalWhitespace: _CharacterClassModel { + static var horizontalWhitespace: _CharacterClassModel { .init(cc: .horizontalWhitespace, matchLevel: .graphemeCluster) } - public static var newlineSequence: _CharacterClassModel { + static var newlineSequence: _CharacterClassModel { .init(cc: .newlineSequence, matchLevel: .graphemeCluster) } - public static var verticalWhitespace: _CharacterClassModel { + static var verticalWhitespace: _CharacterClassModel { .init(cc: .verticalWhitespace, matchLevel: .graphemeCluster) } - public static var word: _CharacterClassModel { + static var word: _CharacterClassModel { .init(cc: .word, 
matchLevel: .graphemeCluster) } } extension _CharacterClassModel.Representation: CustomStringConvertible { - public var description: String { + var description: String { switch self { case .any: return "" case .anyGrapheme: return "" @@ -222,73 +220,11 @@ extension _CharacterClassModel.Representation: CustomStringConvertible { } extension _CharacterClassModel: CustomStringConvertible { - public var description: String { + var description: String { return "\(isInverted ? "not " : "")\(cc)" } } -extension _CharacterClassModel { - public func makeDSLTreeCharacterClass() -> DSLTree.CustomCharacterClass? { - // FIXME: Implement in DSLTree instead of wrapping an AST atom - switch makeAST() { - case .atom(let atom): - return .init(members: [.atom(.unconverted(.init(ast: atom)))]) - default: - return nil - } - } - - internal func makeAST() -> AST.Node? { - let inv = isInverted - - func esc(_ b: AST.Atom.EscapedBuiltin) -> AST.Node { - escaped(b) - } - - switch cc { - case .any: return atom(.any) - - case .digit: - return esc(inv ? .notDecimalDigit : .decimalDigit) - - case .horizontalWhitespace: - return esc( - inv ? .notHorizontalWhitespace : .horizontalWhitespace) - - // FIXME: newline sequence is not same as \n - case .newlineSequence: - return esc(inv ? .notNewline : .newline) - - case .whitespace: - return esc(inv ? .notWhitespace : .whitespace) - - case .verticalWhitespace: - return esc(inv ? .notVerticalTab : .verticalTab) - - case .word: - return esc(inv ? .notWordCharacter : .wordCharacter) - - case .anyGrapheme: - return esc(.graphemeCluster) - - case .hexDigit: - let members: [AST.CustomCharacterClass.Member] = [ - range_m(.char("a"), .char("f")), - range_m(.char("A"), .char("F")), - range_m(.char("0"), .char("9")), - ] - let ccc = AST.CustomCharacterClass( - .init(faking: inv ? 
.inverted : .normal), - members, - .fake) - - return .customCharacterClass(ccc) - - default: return nil - } - } -} - extension _CharacterClassModel { func withMatchLevel( _ level: _CharacterClassModel.MatchLevel