Created
September 1, 2017 13:21
-
-
Save tmountain/04937a84e299730b814b194616e44330 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns clj-sleuth.crawl
  (:require [clojure.java.io]
            [clojure.pprint :as p]
            [clojure.string]
            [clojure.tools.logging :as log]
            [jsoup.soup :as c]))
;; Install a JVM-wide default handler so that an exception escaping any thread
;; is written to the log together with the name of the thread it escaped from.
(Thread/setDefaultUncaughtExceptionHandler
  (reify Thread$UncaughtExceptionHandler
    (uncaughtException [_this failed-thread cause]
      (let [thread-name (.getName failed-thread)]
        (log/error cause "Uncaught exception on" thread-name)))))
(defn prepend-http
  "Return `domain` as a URL string that carries an explicit scheme.

  When `domain` already begins with an http or https scheme (matched
  case-insensitively) it is returned unchanged; otherwise the plain http
  scheme is put in front of it. A nil `domain` is treated as the empty
  string, so the result is then just the bare scheme prefix."
  [domain]
  (let [s (str domain)]
    ;; Guard against producing a doubled scheme for entries that already
    ;; include one.
    (if (re-find #"(?i)^https?://" s)
      s
      (str "http://" s))))
(defn url-to-file
  "Derive a file name from `url`: every http/https scheme prefix occurring in
  the string is removed, then each dot is replaced by an underscore."
  [url]
  (let [without-scheme (clojure.string/replace url #"https?://" "")]
    (clojure.string/replace without-scheme #"[.]" "_")))
(defn get-url
  "Fetch `site` and return a map with two keys:

  :url  - `site` exactly as it was passed in.
  :data - the result of requesting the site (5000 ms timeout, HTTP error
          statuses ignored) and calling `.text` on what comes back; if any
          Exception is thrown along the way, a string beginning with
          `Caught exception: ` followed by the exception message instead."
  [site]
  (let [body (try
               (-> (c/get! (prepend-http site) :timeout 5000 :ignore-http-errors true)
                   (c/$ (.text)))
               (catch Exception e
                 (str "Caught exception: " (.getMessage e))))]
    {:url site
     :data body}))
(defn do-result
  "Write a crawl result to disk.

  `result` is only written when it is a map holding both :url and :data; the
  target path is `outpath` concatenated with the file name derived from the
  URL, and the file content is the :data value. Anything else is ignored.
  Returns nil."
  [outpath result]
  (when (and (map? result)
             (every? #(contains? result %) [:url :data]))
    (let [target (str outpath (url-to-file (:url result)))]
      (spit target (:data result)))))
(defn file-exists?
  "Return true when something exists on disk at `filepath`, false otherwise."
  [filepath]
  (let [candidate (clojure.java.io/as-file filepath)]
    (.exists candidate)))
(defn make-dispatcher
  "Build a one-argument function that crawls a single domain into `outpath`.

  The returned function skips (returns nil for) any domain whose result file
  already exists under `outpath`; otherwise it fetches the domain and writes
  the result. An Exception raised while handling one domain is logged and
  swallowed (nil is returned), so a single bad entry or failed write does not
  abort the remaining domains processed by the same caller."
  [outpath]
  (fn [domain]
    (try
      (let [file (str outpath (url-to-file domain))]
        (when-not (file-exists? file)
          (->> (get-url domain)
               (do-result outpath))))
      (catch Exception e
        (log/error e "Failed to crawl" domain)
        nil))))
(defn run-crawl
  "Crawl every domain listed under :bucket in `data`, writing results beneath
  the directory prefix given by :outpath. The whole bucket is processed
  eagerly; the realized sequence of per-domain results is returned."
  [data]
  (let [crawl-one (make-dispatcher (get data :outpath))]
    (->> (get data :bucket)
         (map crawl-one)
         doall)))
(defn -main
  "Entry point: crawl every domain listed in data/domains_to_crawl.txt.

  Reads one domain per line (trimmed, blank lines dropped), splits the list
  into at most `num-agents` buckets, hands each bucket to its own agent via
  `send-off`, waits for all of them to finish and then shuts the agent
  thread pools down. Command-line `args` are accepted but not used."
  [& args]
  (let [outpath "data/crawl_results/"
        num-agents 256]
    ;; Make sure the output directory exists before anything tries to write.
    (.mkdirs (clojure.java.io/file outpath))
    (try
      (let [;; Read the whole list up front so the reader is closed before
            ;; the (potentially long) crawl starts.
            urls (with-open [rdr (clojure.java.io/reader "data/domains_to_crawl.txt")]
                   (->> (line-seq rdr)
                        (map clojure.string/trim)
                        ;; An empty domain would map to the output directory
                        ;; itself as its result file.
                        (remove clojure.string/blank?)
                        vec))
            num-items (count urls)
            ;; Ceiling division caps the number of buckets at num-agents;
            ;; (max 1 ...) keeps the size positive, because a bucket size
            ;; of 0 makes `partition` produce an endless run of empty
            ;; buckets.
            bucket-size (max 1 (quot (+ num-items (dec num-agents)) num-agents))
            buckets (partition-all bucket-size urls)
            ;; :continue keeps a failing action from parking the agent in a
            ;; failed state, which would leave `await` blocked forever.
            agents (mapv #(agent {:bucket % :outpath outpath}
                                 :error-mode :continue
                                 :error-handler (fn [_ ex]
                                                  (log/error ex "Crawl agent action failed")))
                         buckets)]
        (doseq [a agents]
          (send-off a run-crawl))
        (apply await agents))
      (finally
        ;; Always release the agent thread pools so the JVM can exit.
        (shutdown-agents)))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment