Skip to content

Instantly share code, notes, and snippets.

@lf-araujo
Created September 20, 2018 12:44
Show Gist options
  • Save lf-araujo/ec17e95219fa0fee4ae8c8a3d11f23b4 to your computer and use it in GitHub Desktop.
Save lf-araujo/ec17e95219fa0fee4ae8c8a3d11f23b4 to your computer and use it in GitHub Desktop.
A simple Swift web crawler that looks for a word of your choosing on the web, starting from a given webpage. Uses Swift, docopt and works on Linux.
/**
* Selenops
* Copyright (c) Federico Zanetello 2017
* Licensed under the MIT license. See LICENSE file.
*/
import Foundation
import Docopt // marathon: https://github.com/lf-araujo/docopt.swift.git
// MARK: - Safe collection indexing

extension Collection {
    /// Returns the element at the specified index iff it is within bounds, otherwise nil.
    ///
    /// - Parameter index: The index to look up; may be out of bounds.
    /// - Returns: The element at `index`, or `nil` when `index` is not a valid index.
    /// - Note: The Swift 3-era constraint `Indices.Iterator.Element == Index` is
    ///   guaranteed by `Collection` itself since Swift 4, so it is dropped here.
    subscript(safe index: Index) -> Element? {
        // `indices.contains` avoids the trap that plain subscripting would hit.
        return indices.contains(index) ? self[index] : nil
    }
}
// MARK: - CLI setup

/// docopt usage text; doubles as the --help output.
let manual: String = """
selenops is a simple swift web crawler that look for a word of your choosing on the web, starting from a given webpage.
By default it will crawl 10 webpages, use the third optional parameter to change this behaviour.
Usage:
selenops WORD WEB [-m MAX]
Options:
-h, --help
-v, --version
-m MAX [default: 10]
"""
var args = CommandLine.arguments
args.remove(at: 0) // drop the executable path; docopt expects only the arguments
let result = Docopt.parse(manual, argv: args, help: true, version: "0.0.3")
// Conditional casts: the original `as! String ?? ""` force cast both defeats the
// fallback and crashes (or fails to compile) on unexpected docopt output.
let wordToSearch2: String = result["WORD"] as? String ?? ""
let startUrlString: String = result["WEB"] as? String ?? ""
guard let startUrl = URL(string: startUrlString) else {
print("🚫 Bad url!")
exit(1)
}
// docopt typically returns option values as strings, so `as? Int` alone would
// always fall through to the default; accept either representation.
// NOTE(review): exact docopt.swift value types unverified — confirm against the fork.
let maximumPagesToVisit = (result["-m"] as? Int)
    ?? (result["-m"] as? String).flatMap(Int.init)
    ?? 10
// Crawler Parameters
let semaphore = DispatchSemaphore(value: 0) // parks main thread until the crawl finishes
var visitedPages: Set<URL> = []
var pagesToVisit: Set<URL> = [startUrl]
// Crawler Core
/// Pops the next page off the work queue and visits it.
///
/// Terminates the crawl — signalling the main thread through `semaphore` —
/// once either the page budget is exhausted or the queue runs dry.
func crawl() {
    guard visitedPages.count < maximumPagesToVisit else {
        print("🏁 Reached max number of pages to visit")
        semaphore.signal()
        return
    }
    guard let nextPage = pagesToVisit.popFirst() else {
        print("🏁 No more pages to visit")
        semaphore.signal()
        return
    }
    // Already-seen pages are skipped by recursing for the next candidate.
    guard !visitedPages.contains(nextPage) else {
        crawl()
        return
    }
    visit(page: nextPage)
}
/// Marks `url` as visited and fetches it asynchronously.
///
/// On completion the body is decoded as UTF-8 and handed to
/// `parse(document:url:)`; regardless of success or failure the crawl loop
/// is resumed via the deferred `crawl()` call.
///
/// - Parameter url: The page to download.
func visit(page url: URL) {
    visitedPages.insert(url)
    let task = URLSession.shared.dataTask(with: url) { data, response, error in
        // Keep crawling no matter how this request ends.
        defer { crawl() }
        guard
            let data = data,
            error == nil,
            // Non-text bodies (images, binaries) are silently skipped.
            let document = String(data: data, encoding: .utf8) else { return }
        parse(document: document, url: url)
    }
    print("🔎 Visiting page: \(url)") // fixed mojibake: literal was mis-encoded as "πŸ”Ž"
    task.resume()
}
/// Scans a fetched document: reports whether the search word occurs and
/// queues every absolute http(s) link found in an `href` attribute.
///
/// - Parameters:
///   - document: The raw HTML (or any text) of the fetched page.
///   - url: The page the document came from; used only for reporting.
func parse(document: String, url: URL) {
    /// Prints a hit when `word` occurs anywhere in the document.
    func find(word: String) {
        if document.contains(word) {
            // fixed mojibake: literal was mis-encoded as "βœ…"
            print("✅ Word '\(word)' found at page \(url)")
        }
    }
    /// Extracts every absolute http(s) URL referenced by an href attribute.
    /// Replaces the Swift 3 `.characters`/`substring` plumbing (removed in
    /// Swift 5) with `Range(_:in:)` bridging and the regex capture group, so
    /// no manual trimming of `href="` and the closing quote is needed.
    func collectLinks() -> [URL] {
        let pattern = "href=\"(http://.*?|https://.*?)\""
        let regex = try! NSRegularExpression(pattern: pattern, options: [.caseInsensitive])
        let fullRange = NSRange(document.startIndex..<document.endIndex, in: document)
        let matches = regex.matches(in: document, options: [], range: fullRange)
        // Capture group 1 holds the bare URL; drop matches that fail to bridge
        // back to a String range or to parse as a URL.
        return matches.compactMap { match in
            Range(match.range(at: 1), in: document)
                .map { String(document[$0]) }
                .flatMap(URL.init(string:))
        }
    }
    find(word: wordToSearch2)
    collectLinks().forEach { pagesToVisit.insert($0) }
}
// Kick off the crawl, then park the main thread until the crawler signals
// completion (page budget reached or queue drained) from a URLSession callback.
crawl()
semaphore.wait()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment