Last active
May 4, 2021 18:56
-
-
Save DanielCardonaRojas/6bff2705e331485f9b024e4b4e300a43 to your computer and use it in GitHub Desktop.
HTMLParser #Swift
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Foundation | |
/// A constraint on a single attribute of an XML/HTML element.
///
/// A matcher pairs an attribute `name` with an expected `value` and a
/// `comparison` that decides how a candidate attribute value is tested.
struct AttributeMatcher {
    /// How `value` is compared against a candidate attribute value.
    enum Comparison {
        /// The candidate must be exactly equal to `value`.
        case equal
        /// `value` is a regular-expression pattern that must match somewhere
        /// inside the candidate (anchor with `^`/`$` to require a full match).
        case regex
    }

    /// Name of the attribute this matcher applies to (for example `"class"`).
    let name: String
    /// Expected value, or a regular-expression pattern when `comparison` is `.regex`.
    let value: String
    /// Strategy used by `isMatch(_:)`.
    let comparison: Comparison

    /// Tests a candidate attribute value against this matcher.
    ///
    /// - Parameter attributeValue: The attribute value found on an element.
    /// - Returns: `true` when the candidate satisfies `comparison`; for
    ///   `.regex`, `false` is returned if `value` is not a valid pattern.
    func isMatch(_ attributeValue: String) -> Bool {
        switch comparison {
        case .equal:
            return attributeValue == value
        case .regex:
            // A pattern that fails to compile cannot match anything.
            guard let expression = try? NSRegularExpression(pattern: value) else {
                return false
            }
            let wholeValue = NSRange(attributeValue.startIndex..., in: attributeValue)
            return expression.firstMatch(in: attributeValue, options: [], range: wholeValue) != nil
        }
    }

    /// Builds a matcher requiring the `class` attribute to equal `className` exactly.
    static func class_(_ className: String) -> AttributeMatcher {
        AttributeMatcher(name: "class", value: className, comparison: .equal)
    }

    /// Builds a matcher requiring the `id` attribute to equal `identifier` exactly.
    static func id(_ identifier: String) -> AttributeMatcher {
        AttributeMatcher(name: "id", value: identifier, comparison: .equal)
    }
}
/// Describes which elements a selector step should accept: an optional tag
/// name plus an optional list of attribute constraints.
struct XMLElementMatcher: CustomStringConvertible {
    /// Selector-like rendering, e.g. `div[class="row"]` or `*[id=~"^item"]`.
    /// `*` stands for "any tag"; `=~` marks a regular-expression constraint.
    var description: String {
        let renderedAttributes = (attributeMatchers ?? []).map { matcher -> String in
            switch matcher.comparison {
            case .equal:
                return "[\(matcher.name)=\"\(matcher.value)\"]"
            case .regex:
                return "[\(matcher.name)=~\"\(matcher.value)\"]"
            }
        }
        return (tagName ?? "*") + renderedAttributes.joined()
    }

    /// Tag name to accept; `nil` accepts every tag.
    let tagName: String?
    /// Attribute constraints that must all hold; `nil` means no constraints.
    var attributeMatchers: [AttributeMatcher]?
    /// Read by `XMLSelector` to decide whether matching continues after the
    /// first captured value.
    var multiMatch: Bool = true

    // The default for `multiMatch` mirrors the property default so that the
    // factory functions below keep producing multi-match matchers.
    private init(tagName: String?, attributeMatchers: [AttributeMatcher]? = nil, multiMatch: Bool = true) {
        self.tagName = tagName
        self.attributeMatchers = attributeMatchers
        self.multiMatch = multiMatch
    }

    /// Matches elements whose tag name equals `named`.
    static func tag(_ named: String) -> XMLElementMatcher {
        XMLElementMatcher(tagName: named, attributeMatchers: nil)
    }

    /// Matches elements of any tag whose `id` attribute equals `named`.
    static func id(_ named: String) -> XMLElementMatcher {
        XMLElementMatcher(tagName: nil, attributeMatchers: [.id(named)])
    }

    /// Matches elements of any tag whose `class` attribute equals `named`.
    static func class_(_ named: String) -> XMLElementMatcher {
        XMLElementMatcher(tagName: nil, attributeMatchers: [.class_(named)])
    }

    /// Matches every element.
    static func any() -> XMLElementMatcher {
        XMLElementMatcher(tagName: nil, attributeMatchers: nil)
    }

    /// Adds further attribute constraints to this matcher.
    ///
    /// - Parameter attributes: Constraints that must hold in addition to the
    ///   existing ones.
    /// - Returns: A copy of the updated matcher.
    @discardableResult
    mutating func and(_ attributes: AttributeMatcher...) -> XMLElementMatcher {
        guard !attributes.isEmpty else { return self }
        // Start from an empty list when none exists yet; optional chaining on
        // a nil list would silently discard the new constraints.
        attributeMatchers = (attributeMatchers ?? []) + attributes
        return self
    }

    /// Tests an opening tag against this matcher.
    ///
    /// - Parameters:
    ///   - tag: Name of the element being opened.
    ///   - attributes: Attributes present on that element.
    /// - Returns: `true` when the tag name is accepted and every attribute
    ///   constraint finds a matching attribute value.
    func matches(tag: String, attributes: [String: String]) -> Bool {
        guard matchesEnd(tag: tag) else { return false }
        return (attributeMatchers ?? []).allSatisfy { matcher in
            // A constrained attribute that is absent counts as a mismatch.
            guard let candidate = attributes[matcher.name] else { return false }
            return matcher.isMatch(candidate)
        }
    }

    /// Tests a closing tag against this matcher; only the tag name is compared.
    func matchesEnd(tag: String) -> Bool {
        guard let expected = tagName else { return true }
        return expected == tag
    }
}
/// Walks a chain of `XMLElementMatcher`s over parser events and stores the
/// text reported after the last matcher in the chain has matched.
class XMLSelector: NSObject, XMLParserDelegate {
    /// `true` once the selector has stopped capturing (single-match selector
    /// captured its value, or the document ended).
    public private(set) var done = false
    /// Text chunks captured so far, in document order.
    var content = [String]()
    /// Matchers that must match in sequence, outermost first.
    private(set) var matchers: [XMLElementMatcher]
    /// Position in `matchers` of the step currently waited for.
    private var currentMatcherIndex: Int = 0

    /// Matcher at `currentMatcherIndex`, or `nil` when there is none
    /// (for example when the selector was created without matchers).
    private var currentMatcher: XMLElementMatcher? {
        matchers.indices.contains(currentMatcherIndex) ? matchers[currentMatcherIndex] : nil
    }

    /// `true` while the current step is the last matcher in the chain.
    private var hasExhaustedTags: Bool {
        currentMatcherIndex == (matchers.count - 1)
    }

    /// When `true`, the next `foundCharacters` event is stored in `content`.
    var shouldCapture = false

    /// Creates a selector from a chain of matchers, outermost first.
    init(_ matchers: XMLElementMatcher...) {
        self.matchers = matchers
    }

    // MARK: XMLParserDelegate

    /// Advances the chain when the opened element satisfies the current step;
    /// arms capturing when that step is the last one.
    func parser(_ parser: XMLParser,
                didStartElement elementName: String,
                namespaceURI: String?,
                qualifiedName qName: String?,
                attributes attributeDict: [String: String] = [:]) {
        guard !done,
              let matcher = currentMatcher,
              matcher.matches(tag: elementName, attributes: attributeDict) else {
            return
        }
        let reachedLastStep = hasExhaustedTags
        shouldCapture = reachedLastStep
        // Only move forward while another step exists; stepping past the last
        // matcher would leave the index out of bounds for later events.
        if !reachedLastStep {
            currentMatcherIndex += 1
        }
    }

    /// Disarms capturing when the element of the last step closes and, for
    /// multi-match matchers, restarts the chain from the first step.
    func parser(_ parser: XMLParser, didEndElement elementName: String, namespaceURI: String?, qualifiedName qName: String?) {
        guard let matcher = currentMatcher,
              hasExhaustedTags,
              matcher.matchesEnd(tag: elementName) else {
            return
        }
        shouldCapture = false
        if matcher.multiMatch {
            currentMatcherIndex = 0
        }
    }

    /// Stores the reported text when capturing is armed. Capturing is
    /// disarmed immediately, so only the first chunk after a match is kept.
    func parser(_ parser: XMLParser, foundCharacters string: String) {
        guard shouldCapture, let matcher = currentMatcher else { return }
        content.append(string)
        shouldCapture = false
        done = !matcher.multiMatch
    }

    /// Marks the selector as finished once the document has been consumed.
    func parserDidEndDocument(_ parser: XMLParser) {
        done = true
    }
}
/// Drives an `XMLParser` over a document and fans every parser event out to
/// a set of `XMLSelector`s, which collect the text they are interested in.
class HTMLParser: NSObject, XMLParserDelegate {
    private var currentElement: String?
    /// Selectors that receive the parser events.
    public private(set) var selectors = [XMLSelector]()
    var currentTag: String?

    /// Captured content of every selector that reports `done`, in selector order.
    var captured: [[String]] {
        var finished = [[String]]()
        for selector in selectors where selector.done {
            finished.append(selector.content)
        }
        return finished
    }

    /// Underlying parser; `nil` when it could not be created from the URL.
    var parser: XMLParser?

    /// Creates a parser that reads the document at `url`.
    init(url: URL) {
        super.init()
        let xmlParser = XMLParser(contentsOf: url)
        xmlParser?.delegate = self
        parser = xmlParser
    }

    /// Creates a parser that reads the document held in `data`.
    init(data: Data) {
        super.init()
        let xmlParser = XMLParser(data: data)
        xmlParser.delegate = self
        parser = xmlParser
    }

    /// Replaces the current selectors with `selectors`.
    func scrapeWith(_ selectors: [XMLSelector]) {
        self.selectors = selectors
    }

    /// Replaces the current selectors with one tag-name selector per entry in `tags`.
    func scrapeTags(_ tags: [String]) {
        var tagSelectors = [XMLSelector]()
        for tag in tags {
            tagSelectors.append(XMLSelector(XMLElementMatcher.tag(tag)))
        }
        selectors = tagSelectors
    }

    // MARK: Public API

    /// Parses the document and prints whether parsing finished successfully.
    public func run() {
        let didFinishParsing = parser?.parse() ?? false
        print("Did finish parsing \(didFinishParsing)")
    }

    /// Prints the captured content of every finished selector, one per line.
    public func printScrapped() {
        captured.forEach { print("\($0)") }
    }

    // MARK: - XMLParserDelegate

    /// Forwards the start-element event to every selector.
    public func parser(_ parser: XMLParser, didStartElement elementName: String, namespaceURI: String?, qualifiedName qName: String?, attributes attributeDict: [String: String] = [:]) {
        for selector in selectors {
            selector.parser(parser,
                            didStartElement: elementName,
                            namespaceURI: namespaceURI,
                            qualifiedName: qName,
                            attributes: attributeDict)
        }
    }

    /// Forwards the end-element event to every selector.
    public func parser(_ parser: XMLParser, didEndElement elementName: String, namespaceURI: String?, qualifiedName qName: String?) {
        for selector in selectors {
            selector.parser(parser,
                            didEndElement: elementName,
                            namespaceURI: namespaceURI,
                            qualifiedName: qName)
        }
    }

    /// Forwards the found-characters event to every selector.
    public func parser(_ parser: XMLParser, foundCharacters string: String) {
        for selector in selectors {
            selector.parser(parser, foundCharacters: string)
        }
    }

    /// Forwards the end-of-document event to every selector.
    public func parserDidEndDocument(_ parser: XMLParser) {
        for selector in selectors {
            selector.parserDidEndDocument(parser)
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment