Created September 20, 2018 12:44
A simple Swift web crawler that looks for a word of your choosing on the web, starting from a given webpage. Uses Swift, docopt and works on Linux.
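For example, assuming the script has been built into a selenops executable (e.g. via marathon), running selenops swift https://example.com -m 20 would search up to 20 pages reachable from example.com for the word "swift".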
/**
 * Selenops
 * Copyright (c) Federico Zanetello 2017
 * Licensed under the MIT license. See LICENSE file.
 */
import Foundation
#if canImport(FoundationNetworking)
import FoundationNetworking // URLSession lives here on newer Linux toolchains
#endif
import Docopt // marathon: https://github.com/lf-araujo/docopt.swift.git

extension Collection where Indices.Iterator.Element == Index {
  /// Returns the element at the specified index iff it is within bounds, otherwise nil.
  subscript (safe index: Index) -> Iterator.Element? {
    return indices.contains(index) ? self[index] : nil
  }
}
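// For example: ["a", "b", "c"][safe: 1] returns Optional("b"), while
// ["a", "b", "c"][safe: 5] returns nil instead of trapping.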
let manual: String = """
selenops is a simple Swift web crawler that looks for a word of your choosing on the web, starting from a given webpage.
By default it will crawl 10 webpages; use the -m option to change this behaviour.

Usage:
  selenops WORD WEB [-m MAX]

Options:
  -h, --help
  -v, --version
  -m MAX  [default: 10]
"""
var args = CommandLine.arguments
args.remove(at: 0) // drop the executable path
let result = Docopt.parse(manual, argv: args, help: true, version: "0.0.3")
let wordToSearch2: String = result["WORD"] as? String ?? ""
let startUrlString: String = result["WEB"] as? String ?? ""
guard let startUrl = URL(string: startUrlString) else {
  print("🚫 Bad url!")
  exit(1)
}
// docopt may hand the option value back as a String rather than an Int, so accept either.
let maximumPagesToVisit = (result["-m"] as? Int) ?? Int(result["-m"] as? String ?? "") ?? 10
// Crawler Parameters
let semaphore = DispatchSemaphore(value: 0)
var visitedPages: Set<URL> = []
var pagesToVisit: Set<URL> = [startUrl]
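// The semaphore keeps this command-line process alive while URLSession does its
// work on background threads: the main thread blocks on wait() at the bottom of
// the script and is released once the crawler signals completion.
// visitedPages guards against fetching the same URL twice; pagesToVisit is the
// frontier of links still to be crawled.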
// Crawler Core
func crawl() {
  guard visitedPages.count < maximumPagesToVisit else {
    print("🏁 Reached max number of pages to visit")
    semaphore.signal()
    return
  }
  guard let pageToVisit = pagesToVisit.popFirst() else {
    print("🏁 No more pages to visit")
    semaphore.signal()
    return
  }
  if visitedPages.contains(pageToVisit) {
    crawl()
  } else {
    visit(page: pageToVisit)
  }
}
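// Marks the URL as visited, fetches it asynchronously, and hands the body to
// parse(document:url:); the deferred crawl() call then moves on to the next
// page whether or not the request succeeded.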
func visit(page url: URL) {
  visitedPages.insert(url)
  let task = URLSession.shared.dataTask(with: url) { data, response, error in
    defer { crawl() }
    guard
      let data = data,
      error == nil,
      let document = String(data: data, encoding: .utf8) else { return }
    parse(document: document, url: url)
  }
  print("🔎 Visiting page: \(url)")
  task.resume()
}
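// Scans the downloaded HTML for the search word and collects every absolute
// http(s) link found in an href attribute, feeding it back into pagesToVisit.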
func parse(document: String, url: URL) {
  func find(word: String) {
    if document.contains(word) {
      print("✅ Word '\(word)' found at page \(url)")
    }
  }
  func collectLinks() -> [URL] {
    func getMatches(pattern: String, text: String) -> [String] {
      // used to remove the 'href="' & '"' from the matches
      func trim(url: String) -> String {
        return String(url.dropLast().dropFirst("href=\"".count))
      }
      let regex = try! NSRegularExpression(pattern: pattern, options: [.caseInsensitive])
      let matches = regex.matches(in: text, options: [.reportCompletion], range: NSRange(text.startIndex..<text.endIndex, in: text))
      return matches.map { trim(url: (text as NSString).substring(with: $0.range)) }
    }
    // Only absolute http(s) links inside double-quoted href attributes are captured.
    let pattern = "href=\"(http://.*?|https://.*?)\""
    let matches = getMatches(pattern: pattern, text: document)
    return matches.compactMap { URL(string: $0) }
  }
  find(word: wordToSearch2)
  collectLinks().forEach { pagesToVisit.insert($0) }
}
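// Start crawling and block the main thread until the crawler signals that it
// has hit the page limit or run out of links to follow.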
crawl()
semaphore.wait()