Created September 20, 2018 12:44
A simple Swift web crawler that looks for a word of your choosing on the web, starting from a given webpage. Uses Swift, docopt and works on Linux.
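For example, assuming the script has been built into a selenops executable (e.g. via marathon), running selenops swift https://example.com -m 20 would search up to 20 pages reachable from example.com for the word "swift".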
/**
 * Selenops
 * Copyright (c) Federico Zanetello 2017
 * Licensed under the MIT license. See LICENSE file.
 */
import Foundation
#if canImport(FoundationNetworking)
import FoundationNetworking // URLSession lives here on newer Linux toolchains
#endif
import Docopt // marathon: https://github.com/lf-araujo/docopt.swift.git

extension Collection where Indices.Iterator.Element == Index {
  /// Returns the element at the specified index iff it is within bounds, otherwise nil.
  subscript (safe index: Index) -> Iterator.Element? {
    return indices.contains(index) ? self[index] : nil
  }
}
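// For example: ["a", "b", "c"][safe: 1] returns Optional("b"), while
// ["a", "b", "c"][safe: 5] returns nil instead of trapping.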
let manual: String = """
selenops is a simple Swift web crawler that looks for a word of your choosing on the web, starting from a given webpage.
By default it will crawl 10 webpages; use the -m option to change this behaviour.

Usage:
  selenops WORD WEB [-m MAX]

Options:
  -h, --help
  -v, --version
  -m MAX  [default: 10]
"""
var args = CommandLine.arguments
args.remove(at: 0) // drop the executable path
let result = Docopt.parse(manual, argv: args, help: true, version: "0.0.3")
let wordToSearch2: String = result["WORD"] as? String ?? ""
let startUrlString: String = result["WEB"] as? String ?? ""
guard let startUrl = URL(string: startUrlString) else {
  print("🚫 Bad url!")
  exit(1)
}
// docopt may hand the option value back as a String rather than an Int, so accept either.
let maximumPagesToVisit = (result["-m"] as? Int) ?? Int(result["-m"] as? String ?? "") ?? 10
// Crawler Parameters
let semaphore = DispatchSemaphore(value: 0)
var visitedPages: Set<URL> = []
var pagesToVisit: Set<URL> = [startUrl]
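// The semaphore keeps this command-line process alive while URLSession does its
// work on background threads: the main thread blocks on wait() at the bottom of
// the script and is released once the crawler signals completion.
// visitedPages guards against fetching the same URL twice; pagesToVisit is the
// frontier of links still to be crawled.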
// Crawler Core
func crawl() {
  guard visitedPages.count < maximumPagesToVisit else {
    print("🏁 Reached max number of pages to visit")
    semaphore.signal()
    return
  }
  guard let pageToVisit = pagesToVisit.popFirst() else {
    print("🏁 No more pages to visit")
    semaphore.signal()
    return
  }
  if visitedPages.contains(pageToVisit) {
    crawl()
  } else {
    visit(page: pageToVisit)
  }
}
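// Marks the URL as visited, fetches it asynchronously, and hands the body to
// parse(document:url:); the deferred crawl() call then moves on to the next
// page whether or not the request succeeded.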
func visit(page url: URL) {
  visitedPages.insert(url)
  let task = URLSession.shared.dataTask(with: url) { data, response, error in
    defer { crawl() }
    guard
      let data = data,
      error == nil,
      let document = String(data: data, encoding: .utf8) else { return }
    parse(document: document, url: url)
  }
  print("🔎 Visiting page: \(url)")
  task.resume()
}
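// Scans the downloaded HTML for the search word and collects every absolute
// http(s) link found in an href attribute, feeding it back into pagesToVisit.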
func parse(document: String, url: URL) {
  func find(word: String) {
    if document.contains(word) {
      print("✅ Word '\(word)' found at page \(url)")
    }
  }
  func collectLinks() -> [URL] {
    func getMatches(pattern: String, text: String) -> [String] {
      // used to remove the 'href="' & '"' from the matches
      func trim(url: String) -> String {
        return String(url.dropLast().dropFirst("href=\"".count))
      }
      let regex = try! NSRegularExpression(pattern: pattern, options: [.caseInsensitive])
      let matches = regex.matches(in: text, options: [.reportCompletion], range: NSRange(text.startIndex..<text.endIndex, in: text))
      return matches.map { trim(url: (text as NSString).substring(with: $0.range)) }
    }
    // Only absolute http(s) links inside double-quoted href attributes are captured.
    let pattern = "href=\"(http://.*?|https://.*?)\""
    let matches = getMatches(pattern: pattern, text: document)
    return matches.compactMap { URL(string: $0) }
  }
  find(word: wordToSearch2)
  collectLinks().forEach { pagesToVisit.insert($0) }
}
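// Start crawling and block the main thread until the crawler signals that it
// has hit the page limit or run out of links to follow.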
crawl()
semaphore.wait()