;; Gist 316641 by @maryrosecook, created February 27, 2010.
;; A little crawler that follows links looking for mp3s.
(ns scrawl
  (:require [clojure.contrib.http.agent :as http]
            [clojure.contrib.duck-streams :as duck-streams]))

;; Start an asynchronous http agent that fetches url and slurps the
;; response body into a string.
(defn request-url [url]
  (http/http-agent url :handler #(duck-streams/slurp* (http/stream %))))
;; Kick off an http agent for each url in the batch, then block while
;; they crawl.
(defn crawl-batch-of-urls [urls-to-crawl]
  (let [url-crawl-agents (map request-url urls-to-crawl)]
    (apply await-for 10000 url-crawl-agents) ; wait up to 10s for the bastard agents to finish their crawling
    url-crawl-agents))
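
;; The gist calls several helpers -- batch-size, get-host, mp3?, crawl?,
;; get-unique-linked-urls and save-data -- and two seqs, urls-crawled and
;; urls-saved, that it never defines. What follows is a minimal sketch of
;; what they might look like, just so the file loads; these are
;; assumptions, not part of the original gist.

(def batch-size 10) ; urls crawled per batch (assumed value)

;; Assumed to start empty for a fresh crawl; presumably the real code
;; reads these back from the files save-data writes.
(def urls-crawled '())
(def urls-saved '())

(defn get-host [url]
  (try (.getHost (java.net.URL. url))
       (catch Exception _ nil)))

(defn mp3? [url]
  (.endsWith (.toLowerCase url) ".mp3"))

(defn crawl? [url host-scores]
  (>= (get host-scores (get-host url) 0) 0)) ; skip hosts with negative scores

(defn get-unique-linked-urls [url-crawl-agent]
  ;; the agent's :handler slurped the body, so http/result is the html
  (map second (re-seq #"href=\"(http[^\"]+)\"" (http/result url-crawl-agent))))

(defn save-data [url urls-to-save urls-to-crawl]
  (duck-streams/append-spit "crawled.txt" (str url "\n"))
  (doseq [u urls-to-save]
    (duck-streams/append-spit "mp3s.txt" (str u "\n")))
  (duck-streams/spit "to-crawl.txt" (apply str (interpose "\n" urls-to-crawl))))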
;; Add score to the running total for url's host, creating an entry the
;; first time a host is seen.
(defn update-host-scores [url score host-scores]
  (let [host (get-host url)]
    (assoc host-scores host (+ score (get host-scores host 0)))))

;; Walk a seq of urls, adding score to each url's host.
(defn gen-host-scores [urls score host-scores]
  (if (seq urls)
    (recur (rest urls) score (update-host-scores (first urls) score host-scores))
    host-scores))
;; Drop anything we've already got, then keep only the items f wants.
(defn remove-dupes-and-unwanted [f coll already-got]
  (filter f (remove #(.contains already-got %) coll)))
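
;; Note: .contains does a linear scan of already-got for every element,
;; so this is O(n*m). If the crawl grows large, a set-based variant (an
;; alternative, not in the original) would be faster:
;;
;; (defn remove-dupes-and-unwanted [f coll already-got]
;;   (filter f (remove (set already-got) coll)))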
;; The main crawl loop. Works through the current batch of agents,
;; harvesting links from each crawled page; when the batch runs out,
;; starts a new one from urls-to-crawl.
(defn scrawl [url-crawl-agents urls-crawled urls-to-crawl urls-saved host-scores]
  (if (empty? url-crawl-agents)
    ;; batch exhausted, so crawl a new batch of urls and carry on
    (let [batch-to-crawl (take batch-size urls-to-crawl)
          next-url-crawl-agents (crawl-batch-of-urls batch-to-crawl)
          next-urls-to-crawl (drop batch-size urls-to-crawl)]
      (recur next-url-crawl-agents urls-crawled next-urls-to-crawl urls-saved host-scores))
    ;; batch not empty, so take the next agent and extract its data
    (let [next-url-crawl-agent (first url-crawl-agents)]
      (if (agent-error next-url-crawl-agent)
        ;; agent failed - skip to the next one
        (recur (rest url-crawl-agents) urls-crawled urls-to-crawl urls-saved host-scores)
        ;; agent succeeded
        (let [next-url (http/request-uri next-url-crawl-agent) ; url that was crawled
              all-linked-urls (seq (into #{} (get-unique-linked-urls next-url-crawl-agent)))
              next-urls-crawled (cons next-url urls-crawled)
              latest-urls-to-save (remove-dupes-and-unwanted mp3? all-linked-urls urls-saved)
              next-host-scores (update-host-scores next-url (count latest-urls-to-save) host-scores)
              next-urls-saved (concat urls-saved latest-urls-to-save)
              latest-urls-to-crawl (remove-dupes-and-unwanted #(crawl? % host-scores) all-linked-urls urls-crawled)
              next-urls-to-crawl (concat urls-to-crawl latest-urls-to-crawl)]
          (println (get host-scores (get-host next-url)) " " next-url) ; the host's mp3 score and the url just crawled
          (save-data next-url latest-urls-to-save next-urls-to-crawl) ; save key seqs to disk
          (recur (rest url-crawl-agents) next-urls-crawled next-urls-to-crawl next-urls-saved next-host-scores))))))
;; Seed the host scores: each url already crawled counts -1 against its
;; host, each mp3 already saved counts +1 for its host.
(def crawled-host-scores (gen-host-scores urls-crawled -1 (hash-map)))
(def host-scores (gen-host-scores urls-saved 1 crawled-host-scores))
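
;; To kick things off -- a sketch, with a made-up seed url:
;; (scrawl '() urls-crawled '("http://example.com/") urls-saved host-scores)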