Created
January 15, 2017 02:15
-
-
Save jmorton/236f740062359c50428bc6bfd0170044 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns scrape.core | |
(:require [clojure.edn :as edn] | |
[org.httpkit.client :as client] | |
[net.cgrand.enlive-html :as html])) | |
(defn get-links
  "Get all links at URL as absolute URL strings.

  Fetches and parses the page at `url`, selects every <a> element and
  resolves each href against `url`. Anchors that carry no href attribute
  are skipped. Returns a lazy seq of strings in document order."
  [url]
  (let [base    (java.net.URL. url)
        ;; Resolve href relative to the page URL. Plain concatenation is only
        ;; correct for simple relative hrefs; resolution also handles absolute,
        ;; root-relative (\"/x\") and parent (\"../\") hrefs. If the href cannot be
        ;; parsed as a URL, fall back to concatenation rather than throwing.
        resolve (fn [href]
                  (try
                    (str (java.net.URL. base href))
                    (catch java.net.MalformedURLException _
                      (str url href))))]
    (->> ;; first get all <a href="">...</a> elements...
         (html/select (html/html-resource base) [:a])
         ;; ...then pluck out the href attributes, dropping anchors without one...
         (keep (comp :href :attrs))
         ;; ...and make an absolute URL.
         (map resolve))))
(defn get-body
  "GET the body of URL as a string, after sleeping `delay` milliseconds.

  With one argument the delay defaults to 250 ms. The sleep happens before
  the request is issued and the call blocks until the response arrives.
  Throws ex-info (with the underlying error as cause) when the response
  map contains an :error entry."
  ([url]
   (get-body url 250))
  ([url delay]
   (Thread/sleep delay)
   (let [{:keys [body error]} @(client/get url)]
     (when error
       ;; Surface the transport failure itself instead of the confusing
       ;; error slurp would raise on a missing body.
       (throw (ex-info (str "GET failed: " url) {:url url} error)))
     ;; The body may arrive as a String or as a stream; slurp on a
     ;; String would treat it as a file/URL name, so only slurp non-strings.
     (if (string? body)
       body
       (slurp body)))))
(defn base->batch
  "Return the batch links found at `base-url`: every link on the page
  except the first one."
  [base-url]
  (let [links (get-links base-url)]
    ;; The first link is discarded (presumably a parent-directory entry
    ;; in the listing -- confirm against the actual page).
    (rest links)))
(defn batch->scenes
  "Return the links at `scene-list-url`, minus the first, grouped into
  consecutive pairs. A trailing unpaired link is dropped.

  NOTE(review): scene->source destructures each pair as
  [checksum-url scene-url]; confirm the listing really orders them so."
  [scene-list-url]
  (let [links (rest (get-links scene-list-url))]
    (partition 2 links)))
(defn scene->source
  "Build a 'source' map for a checksum/scene URL pair.

  Downloads the body at `checksum-url` and parses it as two
  whitespace-separated tokens: the checksum followed by the scene id.
  Returns {:id scene-id :url scene-url :checksum checksum}; :id and
  :checksum are nil when the body does not have that shape."
  [[checksum-url scene-url]]
  (let [content (get-body checksum-url)
        ;; Tolerate leading/trailing whitespace (e.g. a final newline) and
        ;; more than one separator character, as produced by md5sum-style
        ;; files. re-matches must consume the whole string, so a strict
        ;; single-space pattern would silently yield nils on such input.
        [_ checksum scene-id] (re-matches #"\s*(\S+)\s+(\S+)\s*" content)]
    {:id       scene-id
     :url      scene-url
     :checksum checksum}))
;; REPL walkthrough: evaluate the forms one at a time. Nothing in here runs
;; when the namespace is loaded.
(comment
  (def base-url "https://edclpdsftp.cr.usgs.gov/downloads/lcmap/sites/washington/")
  (def batch-list (base->batch base-url))
  ;; Sources for the first three pairs of the first batch.
  (def source-sample (map scene->source (take 3 (batch->scenes (first batch-list)))))
  ;; spit writes (str content); for a lazy seq that is the object's name, not
  ;; its elements, so print the data to a readable EDN string first.
  (spit "source-sample.edn" (pr-str source-sample)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment