Last active
February 6, 2019 09:10
-
-
Save algal/45e2efaf0c7ef1cd7372dc4caa847542 to your computer and use it in GitHub Desktop.
Swift Regular Expression Helpers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// known-good: Swift 4.2 | |
// expected good: Swift 5.0 | |
import Foundation | |
/*
 This file defines three helpers for matching regular expressions against strings and inspecting the capture groups of each match.

 1. `RegexMatches(ofPattern:againstString:)` provides a lazy `Sequence` of matches, where every match is an array holding that match's capture groups.
 2. `captureGroupInMatches(of:against:)` immediately returns an array of matches, where every match is an array of capture groups.
 3. `namedCaptureGroupsInMatches(of:against:)` immediately returns an array of matches, where every match is a dictionary mapping capture group names to the captured substrings.

 All of these return `Substring`s rather than `String`s, for storage efficiency.

 I (Alexis Gallagher) have not tested this, but I believe these will work fine for Swift 5, since these methods do not assume a UTF-16 encoding when converting between `Swift.Range` and `NSRange` in order to access the Foundation regular expression functionality. 2019-02-02
 */
/**
 A lazy `Sequence` of the matches of a regular expression against a string.

 Each element of the sequence is a `[Substring]` holding the capture groups of one match. Element 0 is the substring matched by the entire expression; element `i` is the text captured by group `i`. A group that did not participate in the match (for example `(b)?` when there is no "b") is reported as an empty `Substring`, because the element type cannot express "absent".

 Matches are produced one at a time, and the returned `Substring`s share storage with the `String` being searched, so this type is suitable for searching large strings and for keeping many (possibly long) results.

 The sequence is intended to yield the same matches as `NSRegularExpression.matches(in:options:range:)` over the whole string: anchors, look-behind and word boundaries are evaluated against the entire string, and a zero-length match advances the search by one Unicode scalar so that iteration always terminates.

 Example:

     let s = "😄😄😄Long paragraph!"
     for match in RegexMatches(ofPattern: "ra(.)", againstString: s) {
         print(match)
     }
     // ["rag", "g"]
     // ["rap", "p"]
 */
struct RegexMatches: Sequence, IteratorProtocol
{
    let re: NSRegularExpression
    let s: String

    /// Tracks the start point of the suffix of the string still to be searched for matches.
    private var startSearchableRange: String.Index

    /// Becomes `true` once no further match can exist, so that `next()` keeps returning `nil`.
    private var isExhausted = false

    /// The part of the string that has not yet been searched.
    var searchableRange: Range<String.Index> {
        return startSearchableRange..<s.endIndex
    }

    /**
     Initializes a sequence of matches of the regular expression against a string.

     - parameter regularExpression: the compiled expression to search with
     - parameter string: the string to search
     - parameter options: currently ignored. These are compile-time options and `regularExpression` is already compiled; the parameter is kept only for source compatibility.
     */
    init(regularExpression: NSRegularExpression,
         string: String,
         options: NSRegularExpression.Options = [])
    {
        self.re = regularExpression
        self.s = string
        self.startSearchableRange = string.startIndex
    }

    /**
     Creates a sequence of matches of the regular expression pattern against a string.

     Traps if handed a pattern which is not a valid regular expression.
     */
    init(ofPattern pattern: String,
         againstString string: String)
    {
        let compiled = try! NSRegularExpression(pattern: pattern, options: [])
        self.init(regularExpression: compiled, string: string)
    }

    /// Returns the capture groups of the next match, or `nil` when there are no more matches.
    mutating func next() -> [Substring]?
    {
        guard !isExhausted else { return nil }

        // Search only the unsearched suffix, but let the engine look at the whole string
        // for context: without these options `^` would match at the start of every
        // suffix and look-behind / `\b` could not see the text before it.
        let searchOptions: NSRegularExpression.MatchingOptions = [.withTransparentBounds, .withoutAnchoringBounds]
        guard let match = re.firstMatch(in: s,
                                        options: searchOptions,
                                        range: NSRange(searchableRange, in: s))
        else {
            isExhausted = true
            return nil
        }

        let groups = (0..<match.numberOfRanges).map { substring(for: match.range(at: $0)) }

        // Resume after this match. NSRange offsets count UTF-16 code units, so the
        // position is resolved through the UTF-16 view.
        let utf16 = s.utf16
        var resumeAt = utf16.index(utf16.startIndex,
                                   offsetBy: match.range.location + match.range.length)
        if match.range.length == 0 {
            // A zero-length match would be found again at the same spot forever:
            // step over one Unicode scalar, or stop if already at the end.
            if resumeAt == s.endIndex {
                isExhausted = true
            } else {
                resumeAt = s.unicodeScalars.index(after: resumeAt)
            }
        }
        startSearchableRange = resumeAt

        return groups
    }

    /// Returns the slice of `s` covered by `nsRange`, or an empty `Substring` when the
    /// range is `NSNotFound` (the capture group did not participate in the match).
    private func substring(for nsRange: NSRange) -> Substring
    {
        guard nsRange.location != NSNotFound else { return Substring() }
        let utf16 = s.utf16
        let lower = utf16.index(utf16.startIndex, offsetBy: nsRange.location)
        let upper = utf16.index(lower, offsetBy: nsRange.length)
        return s[lower..<upper]
    }
}
/**
 Returns a `[[Substring]]` of all matches of a `NSRegularExpression` against a `String`.

 - parameter regularExpression: the regular expression
 - parameter string: the String to match against
 - returns: a `[[Substring]]`, where every element represents a distinct match of the entire regular expression against `string`.

 Every element of the return value is itself a `[Substring]`, where each `Substring` is the text of one capture group within that match. The first capture group (i.e., the 0th) is the text matched by the entire regular expression. A capture group that did not participate in a match (for example `(b)?` when there is no "b") is reported as an empty `Substring`.

 So for example, a match on the regular expression "a(.)z" produces two capture groups: the expression as a whole and the middle character. The expression would match three times against the string "aaz abz acz". This would be expressed as the array [["aaz","a"], ["abz","b"], ["acz","c"]]
 */
func captureGroupInMatches(of regularExpression:NSRegularExpression,
                           against string:String) -> [[Substring]]
{
    let utf16 = string.utf16

    // Slices `string` by an NSRange, whose offsets count UTF-16 code units.
    // A range located at NSNotFound marks a group that did not participate.
    func substring(for groupRange: NSRange) -> Substring {
        guard groupRange.location != NSNotFound else { return Substring() }
        let lower = utf16.index(utf16.startIndex, offsetBy: groupRange.location)
        let upper = utf16.index(lower, offsetBy: groupRange.length)
        return string[lower..<upper]
    }

    let wholeString = NSRange(string.startIndex..<string.endIndex, in: string)
    let matches = regularExpression.matches(in: string, options: [], range: wholeString)

    return matches.map {
        (match: NSTextCheckingResult) -> [Substring] in
        return (0..<match.numberOfRanges).map { substring(for: match.range(at: $0)) }
    }
}
/// Returns the names of the named capture groups in the regular expression.
///
/// The names are found by scanning the expression's pattern text for `(?<name>`
/// openers and collecting each `name`, in the order the openers are found.
func namedCaptureGroups(inRegularExpression regularExpression:NSRegularExpression) -> [String]
{
    let pattern = regularExpression.pattern
    // NSString view of the pattern, so NSRange results can be sliced directly.
    let bridgedPattern = pattern as NSString
    let groupOpener = try! NSRegularExpression(pattern: "\\(\\?\\<(\\w+)\\>", options: [])
    let entirePattern = NSRange(pattern.startIndex..<pattern.endIndex, in: pattern)

    var names: [String] = []
    for opener in groupOpener.matches(in: pattern, options: [], range: entirePattern) {
        // Group 1 of the opener regex is the name between `<` and `>`.
        names.append(bridgedPattern.substring(with: opener.range(at: 1)))
    }
    return names
}
/**
 Returns a `[[String:Substring?]]` of all matches of a `NSRegularExpression` against a `String`.

 - parameter regularExpression: the regular expression
 - parameter string: the String to match against
 - returns: a `[[String:Substring?]]`, where every element represents a distinct match of the entire regular expression against `string`.

 Every element of the return value is itself a `[String:Substring?]`, mapping the name of each named capture group to the `Substring` which matched that capture group. A group that did not participate in a match is still present as a key, with the value `nil`.

 So for example, the regular expression "a(?<middleChar>.)z" includes one capture group named "middleChar". It would match three times against the string "aaz abz acz". This would be expressed as the array [["middleChar":"a"], ["middleChar":"b"], ["middleChar":"c"]]
 */
func namedCaptureGroupsInMatches(of regularExpression:NSRegularExpression,
                                 against string:String) -> [[String:Substring?]]
{
    let names = namedCaptureGroups(inRegularExpression: regularExpression)
    let utf16 = string.utf16
    let wholeString = NSRange(string.startIndex..<string.endIndex, in: string)
    let matches = regularExpression.matches(in: string, options: [], range: wholeString)

    return matches.map {
        (match: NSTextCheckingResult) -> [String:Substring?] in
        var captures: [String:Substring?] = [:]
        for name in names {
            let groupRange = match.range(withName: name)
            if groupRange.location == NSNotFound {
                // `updateValue` stores an explicit nil; `captures[name] = nil` would
                // remove the key instead.
                captures.updateValue(nil, forKey: name)
            }
            else {
                // NSRange offsets count UTF-16 code units, so slice via the UTF-16 view.
                let lower = utf16.index(utf16.startIndex, offsetBy: groupRange.location)
                let upper = utf16.index(lower, offsetBy: groupRange.length)
                captures.updateValue(string[lower..<upper], forKey: name)
            }
        }
        // Filling the dictionary key by key means a name reported twice by the textual
        // scan simply overwrites itself rather than trapping on a duplicate key.
        return captures
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you