Last active
November 29, 2015 22:48
-
-
Save revivek/d6559b3fd5b9f661e095 to your computer and use it in GitHub Desktop.
Clojure scraping of campaignmonitor's email client CSS support
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Create a JSON representation of campaignmonitor's email CSS support table.
;; https://www.campaignmonitor.com/css/
(ns scrape.main | |
(:require [net.cgrand.enlive-html :as html]) | |
(:require [clojure.data.json :as json]) | |
(:require [clojure.core.matrix :as matrix])) | |
(def ^:dynamic *base-url*
  "URL of the page that is fetched and scraped."
  "https://www.campaignmonitor.com/css/")

(def ^:dynamic *feature-names-selector*
  "Enlive selector: td.element-style cells inside the tbody of #csstable."
  [:#csstable :tbody :td.element-style])

(def ^:dynamic *email-clients-selector*
  "Enlive selector: td.client cells inside the thead of #csstable."
  [:#csstable :thead :td.client])

(def ^:dynamic *support-selector*
  "Enlive selector for the support cells of #csstable: td elements without the
  element-style class, inside tr elements without the short class, where the
  tr is reached from tbody through the :> (child) combinator."
  [:#csstable
   :tbody
   :>
   [:tr (html/but :.short)]
   [:td (html/but :.element-style)]])
(defn fetch-url
  "Turn the string `url` into a java.net.URL and hand it to Enlive's
  html-resource, returning whatever parsed resource Enlive produces."
  [url]
  (-> url
      (java.net.URL.)
      (html/html-resource)))
(defn extract-text
  "Select the nodes of `html-resource` matching `selector` and return a vector
  holding the text of each selected node, in selection order."
  [html-resource selector]
  (->> (html/select html-resource selector)
       (mapv html/text)))
(defn extract-matrix
  "Extract a 2D matrix of cell texts from `html-resource`.

  html-resource    - parsed resource to select from.
  partition-length - number of cells per row; must be positive.
  selector         - Enlive selector identifying the cells, passed to
                     extract-text.

  Returns a sequence of rows, each a sequence of `partition-length` strings.

  Throws AssertionError when `partition-length` is not positive, or when the
  resulting rows do not all share exactly one length."
  [html-resource partition-length selector]
  {:pre  [(pos? partition-length)]
   ;; The original wrote (= (1 ...)), which invokes the number 1 as a function
   ;; and throws ClassCastException on every call. The intended check is that
   ;; there is exactly one distinct row length.
   :post [(= 1 (count (distinct (map count %))))]}
  ;; `partition` drops a trailing group shorter than partition-length, so the
  ;; post-condition in practice fails only when no complete row was produced.
  (partition partition-length (extract-text html-resource selector)))
(defn build
  "Build the final map that is serialised to JSON.

  clients         - sequence of email client names.
  clients-support - client-indexed matrix: one row per client, each row holding
                    one support value per feature.
  feature-names   - sequence of feature names, in the same order as the values
                    inside each row of `clients-support`.

  Returns {:clients [{:client <name>, :features {<feature-name> <support>}} ...]}.

  Throws AssertionError when `clients-support` is empty, when the number of
  rows differs from the number of clients, or when the first row does not have
  one value per feature name."
  [clients clients-support feature-names]
  ;; The original pre-conditions referred to an undefined symbol
  ;; `client-support` and applied `pos?` to the collection itself; both are
  ;; corrected to check the `clients-support` parameter's size.
  {:pre [(pos? (count clients-support))
         (= (count clients) (count clients-support))
         (= (count (first clients-support)) (count feature-names))]}
  (hash-map :clients
            (mapv (fn [client support-row]
                    (hash-map :client client
                              :features (zipmap feature-names support-row)))
                  clients
                  clients-support)))
(defn -main
  "Scrape the page at *base-url* and write the result as JSON to main.json.

  Cell texts are converted to numbers: No -> 0, Info -> 1, Yes -> 2. The `case`
  has no default branch, so any other cell text raises IllegalArgumentException."
  []
  (let [page           (fetch-url *base-url*)
        feature-names  (extract-text page *feature-names-selector*)
        clients        (extract-text page *email-clients-selector*)
        support->score (fn [label]
                         (case label
                           "No"   0
                           "Info" 1
                           "Yes"  2))
        ;; One row per feature, one column per client.
        by-feature     (extract-matrix page (count clients) *support-selector*)
        ;; Transposed so that each row belongs to one client.
        by-client      (matrix/transpose (matrix/emap support->score by-feature))]
    (->> (build clients by-client feature-names)
         (json/write-str)
         (spit "main.json"))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment