Skip to content

Instantly share code, notes, and snippets.

@DanielCardonaRojas
Last active May 4, 2021 18:56
Show Gist options
  • Save DanielCardonaRojas/6bff2705e331485f9b024e4b4e300a43 to your computer and use it in GitHub Desktop.
Save DanielCardonaRojas/6bff2705e331485f9b024e4b4e300a43 to your computer and use it in GitHub Desktop.
HTMLParser #Swift
import Foundation
/// A constraint on a single attribute of an XML/HTML element.
///
/// The matcher names the attribute it applies to (`name`) and describes how the
/// attribute's actual value is compared with `value`.
struct AttributeMatcher {
    /// How `value` is compared with an attribute's actual value.
    enum Comparison {
        /// The attribute value must be exactly equal to `value`.
        case equal
        /// `value` is a regular-expression pattern that must match somewhere
        /// inside the attribute value.
        case regex
    }

    /// Name of the attribute this matcher inspects (e.g. `"class"`).
    let name: String
    /// Expected value, or a regular-expression pattern when `comparison` is `.regex`.
    let value: String
    /// Comparison strategy used by `isMatch(_:)`.
    let comparison: Comparison

    /// Tests an attribute's actual value against this matcher.
    ///
    /// - Parameter attributeValue: The value found on the element.
    /// - Returns: `true` when the value satisfies the matcher.
    ///
    /// For `.regex` the search is unanchored; use `^` and `$` in the pattern to
    /// require that the whole value matches.
    func isMatch(_ attributeValue: String) -> Bool {
        switch comparison {
        case .equal:
            return attributeValue == value
        case .regex:
            // Previously this branch accepted every value; actually evaluate the pattern.
            return attributeValue.range(of: value, options: .regularExpression) != nil
        }
    }

    /// Matcher requiring the `class` attribute to be exactly `className`.
    static func class_(_ className: String) -> AttributeMatcher {
        AttributeMatcher(name: "class", value: className, comparison: .equal)
    }

    /// Matcher requiring the `id` attribute to be exactly `identifier`.
    static func id(_ identifier: String) -> AttributeMatcher {
        AttributeMatcher(name: "id", value: identifier, comparison: .equal)
    }
}
/// Matches an XML/HTML element by an optional tag name and an optional list of
/// attribute constraints. A `nil` tag name matches every tag.
struct XMLElementMatcher: CustomStringConvertible {
    /// CSS-like summary of the matcher, e.g. `div[class="title"]`.
    /// `*` stands for "any tag"; regex constraints are rendered as `[name=~/pattern/]`.
    var description: String {
        let constraints = (attributeMatchers ?? [])
            .map { matcher -> String in
                switch matcher.comparison {
                case .equal:
                    return "[\(matcher.name)=\"\(matcher.value)\"]"
                case .regex:
                    return "[\(matcher.name)=~/\(matcher.value)/]"
                }
            }
            .joined()
        return (tagName ?? "*") + constraints
    }

    /// Required tag name, or `nil` to accept any tag.
    let tagName: String?
    /// Attribute constraints that must all hold; `nil` or empty means no constraints.
    var attributeMatchers: [AttributeMatcher]?
    /// Flag read by `XMLSelector` to decide whether to keep matching after the first capture.
    var multiMatch: Bool = true

    /// - Parameters:
    ///   - tagName: Tag to match, or `nil` for any tag.
    ///   - attributeMatchers: Attribute constraints, if any.
    ///   - multiMatch: Stored in `multiMatch`. Defaults to `true`, which is the value
    ///     every factory-built matcher already had because the argument used to be ignored.
    private init(tagName: String?, attributeMatchers: [AttributeMatcher]? = nil, multiMatch: Bool = true) {
        self.tagName = tagName
        self.attributeMatchers = attributeMatchers
        self.multiMatch = multiMatch
    }

    /// Matcher for elements with the given tag name.
    static func tag(_ named: String) -> XMLElementMatcher {
        XMLElementMatcher(tagName: named, attributeMatchers: nil)
    }

    /// Matcher for elements of any tag whose `id` attribute equals `named`.
    static func id(_ named: String) -> XMLElementMatcher {
        XMLElementMatcher(tagName: nil, attributeMatchers: [.id(named)])
    }

    /// Matcher for elements of any tag whose `class` attribute equals `named`.
    static func class_(_ named: String) -> XMLElementMatcher {
        XMLElementMatcher(tagName: nil, attributeMatchers: [.class_(named)])
    }

    /// Matcher that accepts every element.
    static func any() -> XMLElementMatcher {
        XMLElementMatcher(tagName: nil, attributeMatchers: nil)
    }

    /// Adds attribute constraints to this matcher in place and returns the updated value.
    ///
    /// - Parameter attributes: Constraints to append to `attributeMatchers`.
    /// - Returns: A copy of the matcher after the constraints were added.
    @discardableResult
    mutating func and(_ attributes: AttributeMatcher...) -> XMLElementMatcher {
        guard !attributes.isEmpty else { return self }
        // Optional chaining (`attributeMatchers?.append`) silently dropped the new
        // constraints when no list existed yet (e.g. after `.tag(...)`); start from
        // an empty list instead.
        attributeMatchers = (attributeMatchers ?? []) + attributes
        return self
    }

    /// Tests an opening tag against this matcher.
    ///
    /// - Parameters:
    ///   - tag: Name of the element being opened.
    ///   - attributes: The element's attributes keyed by attribute name.
    /// - Returns: `true` when the tag name (if one is required) is equal and every
    ///   attribute constraint finds its attribute present with a matching value.
    func matches(tag: String, attributes: [String: String]) -> Bool {
        if let required = tagName, required != tag {
            return false
        }
        return (attributeMatchers ?? []).allSatisfy { matcher in
            // A missing attribute fails the constraint.
            guard let actual = attributes[matcher.name] else { return false }
            return matcher.isMatch(actual)
        }
    }

    /// Tests a closing tag; only the tag name is compared, and a `nil` tag name matches any tag.
    func matchesEnd(tag: String) -> Bool {
        tagName.map({ $0 == tag }) ?? true
    }
}
/// Follows a chain of `XMLElementMatcher`s through a stream of `XMLParserDelegate`
/// events and records text found inside elements that match the last matcher of
/// the chain.
///
/// Matchers are consumed in order: each matching opening tag advances to the next
/// matcher, and once the last matcher matches, the next `foundCharacters` callback
/// is appended to `content`.
class XMLSelector: NSObject, XMLParserDelegate {
    /// `true` once the selector has stopped collecting: after the first capture of a
    /// non-`multiMatch` matcher, or when the document ends.
    public private(set) var done = false
    /// Text chunks captured so far, in document order.
    var content = [String]()
    /// The matcher chain supplied at initialisation.
    private(set) var matchers: [XMLElementMatcher]
    private var currentMatcherIndex: Int = 0

    /// Matcher currently being looked for, or `nil` when the chain is empty.
    /// Bounds-checked so that a selector created without matchers ignores events
    /// instead of trapping on an out-of-range subscript.
    private var currentMatcher: XMLElementMatcher? {
        matchers.indices.contains(currentMatcherIndex) ? matchers[currentMatcherIndex] : nil
    }

    /// `true` while the current matcher is the last one in the chain.
    private var hasExhaustedTags: Bool {
        currentMatcherIndex == (matchers.count - 1)
    }

    /// `true` between a match of the last matcher and the next text chunk (or closing tag).
    var shouldCapture = false

    /// - Parameter matchers: Chain of matchers, outermost element first.
    init(_ matchers: XMLElementMatcher...) {
        self.matchers = matchers
    }

    // MARK: XMLParserDelegate

    /// Advances the matcher chain when the opened element satisfies the current matcher.
    func parser(_ parser: XMLParser,
                didStartElement elementName: String,
                namespaceURI: String?,
                qualifiedName qName: String?,
                attributes attributeDict: [String : String] = [:]) {
        // Check `done` before touching the matcher so a finished selector does no work.
        guard !done,
              let matcher = currentMatcher,
              matcher.matches(tag: elementName, attributes: attributeDict) else {
            return
        }
        let isLastMatcher = hasExhaustedTags
        shouldCapture = isLastMatcher
        // Never step past the last matcher: doing so for a non-`multiMatch` matcher
        // left the index out of range and crashed on the following event.
        if !isLastMatcher {
            currentMatcherIndex += 1
        }
    }

    /// Stops capturing when the element of the last matcher closes and, for
    /// `multiMatch` matchers, restarts the chain from the first matcher.
    func parser(_ parser: XMLParser, didEndElement elementName: String, namespaceURI: String?, qualifiedName qName: String?) {
        guard let matcher = currentMatcher,
              hasExhaustedTags,
              matcher.matchesEnd(tag: elementName) else {
            return
        }
        shouldCapture = false
        if matcher.multiMatch {
            // TODO (kept from the original): reset strategy for multi-match chains.
            currentMatcherIndex = 0
        }
    }

    /// Records the first text chunk delivered after the last matcher matched.
    ///
    /// NOTE(review): only one `foundCharacters` callback is captured per match; if the
    /// parser splits an element's text across several callbacks, later chunks are
    /// skipped — confirm this is intended.
    func parser(_ parser: XMLParser, foundCharacters string: String) {
        guard shouldCapture else { return }
        content.append(string)
        shouldCapture = false
        if let matcher = currentMatcher {
            done = !matcher.multiMatch
        }
    }

    /// Marks the selector as finished so its content becomes available to readers that check `done`.
    func parserDidEndDocument(_ parser: XMLParser) {
        done = true
    }
}
/// Runs an `XMLParser` over a URL or an in-memory buffer and forwards every
/// delegate callback it receives to a list of `XMLSelector`s.
class HTMLParser: NSObject, XMLParserDelegate {
    private var currentElement: String?
    /// Selectors that receive the parser events.
    public private(set) var selectors = [XMLSelector]()
    var currentTag: String?

    /// Content of every selector that reports `done`, in selector order.
    var captured: [[String]] {
        selectors
            .filter { $0.done }
            .map { $0.content }
    }

    /// Underlying parser; `nil` when it could not be created for the given URL.
    var parser: XMLParser?

    /// Creates a parser that reads the document at `url`.
    init(url: URL) {
        super.init()
        let xmlParser = XMLParser(contentsOf: url)
        xmlParser?.delegate = self
        parser = xmlParser
    }

    /// Creates a parser that reads the document held in `data`.
    init(data: Data) {
        super.init()
        let xmlParser = XMLParser(data: data)
        xmlParser.delegate = self
        parser = xmlParser
    }

    /// Replaces the current selectors with `selectors`.
    func scrapeWith(_ selectors: [XMLSelector]) {
        self.selectors = selectors
    }

    /// Replaces the current selectors with one single-tag selector per entry of `tags`.
    func scrapeTags(_ tags: [String]) {
        var tagSelectors = [XMLSelector]()
        for tag in tags {
            tagSelectors.append(XMLSelector(.tag(tag)))
        }
        selectors = tagSelectors
    }

    // MARK: Public API

    /// Parses the document and prints whether parsing finished successfully.
    public func run() {
        let succeeded = parser?.parse() ?? false
        print("Did finish parsing \(succeeded)")
    }

    /// Prints each captured content array on its own line.
    public func printScrapped() {
        captured.forEach { print("\($0)") }
    }

    // MARK: - XMLParserDelegate

    /// Forwards the opening-tag event to every selector.
    public func parser(_ parser: XMLParser, didStartElement elementName: String, namespaceURI: String?, qualifiedName qName: String?, attributes attributeDict: [String : String] = [:]) {
        for selector in selectors {
            selector.parser(parser, didStartElement: elementName, namespaceURI: namespaceURI, qualifiedName: qName, attributes: attributeDict)
        }
    }

    /// Forwards the closing-tag event to every selector.
    public func parser(_ parser: XMLParser, didEndElement elementName: String, namespaceURI: String?, qualifiedName qName: String?) {
        for selector in selectors {
            selector.parser(parser, didEndElement: elementName, namespaceURI: namespaceURI, qualifiedName: qName)
        }
    }

    /// Forwards a chunk of character data to every selector.
    public func parser(_ parser: XMLParser, foundCharacters string: String) {
        for selector in selectors {
            selector.parser(parser, foundCharacters: string)
        }
    }

    /// Tells every selector that the document has ended.
    public func parserDidEndDocument(_ parser: XMLParser) {
        for selector in selectors {
            selector.parserDidEndDocument(parser)
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment