Created
February 27, 2010 11:31
-
-
Save maryrosecook/316641 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defn request-url
  "Creates an HTTP agent for `url` via http/http-agent, installing a handler
  that reads the agent's response stream (http/stream) to the end with
  duck-streams/slurp*. Returns whatever http/http-agent returns."
  [url]
  (let [slurp-response (fn [http-agnt]
                         (duck-streams/slurp* (http/stream http-agnt)))]
    (http/http-agent url :handler slurp-response)))
(defn crawl-batch-of-urls
  "Creates one crawl agent per url in `urls-to-crawl` (via request-url),
  waits up to 10000 ms for the agents, and returns the seq of agents.

  The result of await-for is ignored, so the agents are returned whether or
  not they all finished inside the timeout."
  [urls-to-crawl]
  ;; Bind locally with let: a def here would intern a namespace-global var
  ;; and rebind it on every call.
  ;; doall creates every agent up front rather than lazily during the apply.
  (let [url-crawl-agents (doall (map request-url urls-to-crawl))]
    (apply await-for 10000 url-crawl-agents) ; wait for the agents to finish their crawling
    url-crawl-agents))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Seed the host score table in two passes over the url collections:
;;   1. every url in urls-crawled contributes -1 to its host, starting from an
;;      empty map;
;;   2. every url in urls-saved then contributes +1 on top of that result.
(def crawled-host-scores
  (gen-host-scores urls-crawled
                   -1
                   (hash-map)))

(def host-scores
  (gen-host-scores urls-saved
                   1
                   crawled-host-scores))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defn gen-host-scores
  "Applies update-host-scores with `score` once for each url in `urls`,
  threading the accumulated map through, and returns the final map.
  Returns `host-scores` unchanged when `urls` is empty or nil.

  Blank urls (nil or \"\") are skipped. Previously the first blank url
  silently ended processing of every url after it."
  [urls score host-scores]
  ;; reduce replaces the hand-written self-recursion, which consumed one stack
  ;; frame per url, and the in-function `def url`, which overwrote a
  ;; namespace-global var on each step.
  (reduce (fn [scores url]
            (update-host-scores url score scores))
          host-scores
          (remove empty? urls)))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defn remove-dupes-and-unwanted
  "Returns a lazy seq of the items of `seq` that are not present in
  `already-got` and for which predicate `f` returns logical true.

  `already-got` may be nil or empty, in which case only `f` filters.
  Duplicates within `seq` itself are not removed."
  [f seq already-got]
  ;; Build the lookup set once: membership checks become hash lookups instead
  ;; of a reflective linear .contains scan per item, and a nil `already-got`
  ;; no longer throws. contains? (rather than calling the set) keeps nil/false
  ;; items handled correctly.
  (let [seen (set already-got)]
    (filter f (remove #(contains? seen %) seq))))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defn scrawl
  "Main crawl loop. Works through `url-crawl-agents` one agent at a time and,
  when that batch is used up, starts a new batch of `batch-size` urls taken
  from the front of `urls-to-crawl`.

  Arguments:
    url-crawl-agents - agents of the batch currently being processed
    urls-crawled     - urls processed so far
    urls-to-crawl    - queue of urls still to be crawled
    urls-saved       - urls that passed the mp3? filter so far
    host-scores      - map of host -> score

  For each agent without an error: prints the host's current score and the
  crawled url, collects the distinct linked urls, keeps the new ones that
  satisfy mp3?, adds their count to the host's score, queues the linked urls
  that satisfy crawl? and were not crawled before, and calls save-data.
  Agents with an error are skipped.

  Returns nil once there are no agents left and nothing queued."
  [url-crawl-agents urls-crawled urls-to-crawl urls-saved host-scores]
  ;; All intermediate values are let-bound (the original def'd them, creating
  ;; namespace globals), and every self-call is a recur so the loop runs in
  ;; constant stack space instead of one frame per url.
  (if (empty? url-crawl-agents)
    (if (empty? urls-to-crawl)
      ;; nothing in flight and nothing queued: the crawl is finished
      nil
      ;; batch used up, so crawl a new batch of urls and carry on
      (let [batch-to-crawl (take batch-size urls-to-crawl)
            next-url-crawl-agents (crawl-batch-of-urls batch-to-crawl)
            next-urls-to-crawl (drop batch-size urls-to-crawl)]
        (recur next-url-crawl-agents urls-crawled next-urls-to-crawl urls-saved host-scores)))
    ;; not empty, so get next agent and extract data from it
    (let [next-url-crawl-agent (first url-crawl-agents)]
      (if (agent-error next-url-crawl-agent)
        ;; agent failed - move to next
        (recur (rest url-crawl-agents) urls-crawled urls-to-crawl urls-saved host-scores)
        ;; agent succeeded
        (let [next-url (http/request-uri next-url-crawl-agent) ; url that was crawled
              all-linked-urls (seq (into #{} (get-unique-linked-urls next-url-crawl-agent)))]
          ;; log the host's score so far together with the url just crawled
          (println (get host-scores (get-host next-url)) " " next-url)
          (let [next-urls-crawled (cons next-url urls-crawled)
                latest-urls-to-save (remove-dupes-and-unwanted #(mp3? %) all-linked-urls urls-saved)
                next-host-scores (update-host-scores next-url (count latest-urls-to-save) host-scores)
                next-urls-saved (concat urls-saved latest-urls-to-save)
                ;; checked against next-urls-crawled so that a page linking to
                ;; itself is not queued for a second crawl
                latest-urls-to-crawl (remove-dupes-and-unwanted #(crawl? % host-scores) all-linked-urls next-urls-crawled)
                next-urls-to-crawl (concat urls-to-crawl latest-urls-to-crawl)]
            (save-data next-url latest-urls-to-save next-urls-to-crawl) ; save key seqs to disk
            (recur (rest url-crawl-agents) next-urls-crawled next-urls-to-crawl next-urls-saved next-host-scores)))))))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defn update-host-scores
  "Returns `host-scores` with `score` added to the entry for the host of
  `url` (as computed by get-host). When the host has no entry yet (or its
  entry is nil/false) the entry is set to `score`."
  [url score host-scores]
  ;; let/if-let keep the host and its current score local; the original
  ;; `def host` overwrote a namespace-global var on every call.
  (let [host (get-host url)]
    (if-let [current-score (get host-scores host)]
      (assoc host-scores host (+ score current-score))
      (assoc host-scores host score))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment