Created
May 2, 2015 07:41
-
-
Save bakyeono/5d8b1e8058d5e5bb672c to your computer and use it in GitHub Desktop.
Clojure Data Analysis snippets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns da | |
(:use [clojure repl pprint]) | |
(:require [clojure string xml zip]) | |
(:require [clojure.data json]) | |
(:require [clojure.java jdbc]) | |
(:require [net.cgrand enlive-html]) | |
(:require [incanter core io excel]) | |
(:import [java.net URL])) | |
;;; utility | |
(defn main
  "Placeholder entry point. Takes no arguments, does nothing, and
  returns nil."
  [])
(defn reload
  "REPL helper: calls require on the da namespace with the :reload
  flag."
  []
  (require (quote da) :reload))
(defn reload-all
  "REPL helper: calls require on the da namespace with the
  :reload-all flag."
  []
  (require (quote da) :reload-all))
;;; example: reading data | |
(defn read-csv
  "Reads data/small-sample.csv with incanter.io/read-dataset and
  returns the result. No :header option is passed."
  []
  (let [path "data/small-sample.csv"]
    (incanter.io/read-dataset path)))
(defn read-csv-with-header
  "Reads data/small-sample-header.csv with incanter.io/read-dataset,
  passing :header true, and returns the result."
  []
  (let [path "data/small-sample-header.csv"]
    (incanter.io/read-dataset path :header true)))
(defn read-json
  "Slurps data/small-sample.json, parses the text with
  clojure.data.json/read-json, and hands the parsed value to
  incanter.core/to-dataset."
  []
  (let [text   (slurp "data/small-sample.json")
        parsed (clojure.data.json/read-json text)]
    (incanter.core/to-dataset parsed)))
(defn read-xls
  "Reads data/small-sample-header.xls with incanter.excel/read-xls
  and returns the result."
  []
  (let [path "data/small-sample-header.xls"]
    (incanter.excel/read-xls path)))
(defn read-jdbc
  "Selects every row of the people table from the SQLite database at
  data/small-sample.sqlite and converts the rows with
  incanter.core/to-dataset."
  []
  (let [table-name 'people
        query      (str "SELECT * FROM " table-name ";")
        db-spec    {:classname   "org.sqlite.JDBC"
                    :subprotocol "sqlite"
                    :subname     "data/small-sample.sqlite"}]
    (clojure.java.jdbc/with-connection db-spec
      (clojure.java.jdbc/with-query-results rows [query]
        ;; doall forces the result seq while still inside the
        ;; with-connection / with-query-results scope.
        (-> rows
            doall
            incanter.core/to-dataset)))))
(defn read-xml
  "Parses data/small-sample.xml and converts it with
  incanter.core/to-dataset. Each child of the root element becomes
  one row: for every child node of that element, the node's tag is
  the key and the first item of its content is the value."
  []
  (let [xml-file   "data/small-sample.xml"
        ;; Turns one node into a [tag first-content-item] pair.
        node->pair (fn [node]
                     [(:tag node) (first (:content node))])
        ;; 1. Parse the XML data file and wrap it in a zipper.
        root-loc   (clojure.zip/xml-zip (clojure.xml/parse xml-file))
        ;; 2. Start at the root's first child and step right until
        ;;    clojure.zip/right returns nil.
        record-locs (take-while (complement nil?)
                                (iterate clojure.zip/right
                                         (clojure.zip/down root-loc)))]
    ;; 3. Build an array-map per record from its children, and
    ;; 4. convert the sequence of maps into an Incanter dataset.
    (incanter.core/to-dataset
      (for [loc record-locs]
        (apply array-map
               (mapcat node->pair (clojure.zip/children loc)))))))
(defn str->kwd
  "Lower-cases the given string, replaces every space character with
  a hyphen, and returns the result as a keyword."
  [str]
  (let [lowered    (clojure.string/lower-case str)
        hyphenated (clojure.string/replace lowered \space \-)]
    (keyword hyphenated)))
(defn read-html-table
  "Fetches small-sample-table.html from www.ericrochester.com, selects
  the table with id \"data\", and builds a dataset with
  incanter.core/dataset. Column names are the :th texts passed
  through str->kwd; each row is the :td texts of one :tr. Rows with
  no :td cells are dropped."
  []
  (let [url      (str "http://"
                      "www.ericrochester.com"
                      "/clj-data-analysis/data/small-sample-table.html")
        page     (net.cgrand.enlive-html/html-resource (java.net.URL. url))
        table    (net.cgrand.enlive-html/select page [:table#data])
        ;; Texts of the :td cells within a single :tr node.
        row-text (fn [tr]
                   (map net.cgrand.enlive-html/text
                        (net.cgrand.enlive-html/select tr [:td])))
        headers  (vec (map (comp str->kwd net.cgrand.enlive-html/text)
                           (net.cgrand.enlive-html/select table [:tr :th])))
        ;; (filter seq ...) discards rows that yielded no :td cells.
        rows     (filter seq
                         (map row-text
                              (net.cgrand.enlive-html/select table [:tr])))]
    (incanter.core/dataset headers rows)))
(defn get-family
  "Takes an article element and returns the family name: the
  concatenated text of every node matched by [:header :h2]."
  [article]
  (->> (net.cgrand.enlive-html/select article [:header :h2])
       (map net.cgrand.enlive-html/text)
       clojure.string/join))
(defn get-person
  "Takes a list item node and returns a map of the person's name and
  relationship.

  The first child of the item is expected to be an element whose
  :content holds the name pieces; these are concatenated into :name.
  The second child is expected to be the relationship text, which is
  trimmed into :relationship. When the second child is missing or is
  not a string, :relationship is nil instead of an exception being
  thrown."
  [li]
  (let [[{pnames :content} rel] (:content li)]
    {:name         (apply str pnames)
     ;; Only trim real strings: rel is nil when the item has no
     ;; trailing text, and a map when the second child is an element.
     :relationship (when (string? rel)
                     (clojure.string/trim rel))}))
(defn get-rows
  "Takes an article and returns one person map per node matched by
  [:ul :li], each with the article's family name added under
  :family."
  [article]
  (let [family (get-family article)]
    (for [li (net.cgrand.enlive-html/select article [:ul :li])]
      (assoc (get-person li) :family family))))
(defn read-html-list
  "Fetches small-sample-list.html from www.ericrochester.com, collects
  the rows that get-rows produces for every :article node, and
  converts them with incanter.core/to-dataset."
  []
  (let [url      (str "http://"
                      "www.ericrochester.com"
                      "/clj-data-analysis/data/small-sample-list.html")
        page     (net.cgrand.enlive-html/html-resource (java.net.URL. url))
        articles (net.cgrand.enlive-html/select page [:article])
        people   (mapcat get-rows articles)]
    (incanter.core/to-dataset people)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment