Skip to content

Instantly share code, notes, and snippets.

@stantont
Created June 8, 2010 12:22
Show Gist options
  • Save stantont/429941 to your computer and use it in GitHub Desktop.
Save stantont/429941 to your computer and use it in GitHub Desktop.
1997 Ford E350 ac, abs, moon 3000.00
1999 Chevy Venture "Extended Edition" 4900.00
1999 Chevy Venture "Extended Edition, Very Large" 5000.00
1996 Jeep Grand Cherokee MUST SELL! air, moon roof, loaded 4799.00
; Copyright (c) Jonas Enlund. All rights reserved.
; The use and distribution terms for this software are covered by the
; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php)
; which can be found in the file epl-v10.html at the root of this distribution.
; By using this software in any fashion, you are agreeing to be bound by
; the terms of this license.
; You must not remove this notice, or any other, from this software.
;; Have the compiler print a warning wherever a Java interop call cannot be
;; resolved at compile time and has to fall back to reflection.
(set! *warn-on-reflection* true)
(defn- partition-on
  "Lazily splits `coll` into groups, cutting at every element for which
  `pred` is truthy. The delimiting elements themselves are dropped; a
  delimiter at the very start yields an empty first group, and a delimiter
  at the very end does not produce a trailing empty group."
  [pred coll]
  (lazy-seq
    (when (seq coll)
      (let [keep? (complement pred)
            group (take-while keep? coll)
            ;; remainder starts at the delimiter (if any); `next` skips it
            remainder (drop-while keep? coll)]
        (cons group (partition-on pred (next remainder)))))))
;; Code point of the newline character as an int, so it can be compared
;; directly against the values java.io.Reader.read returns in read-cell.
(def ^{:private true}
nline (int \newline))
;; Value java.io.Reader.read returns once the end of the stream is reached.
(def ^{:private true}
eof -1)
(defn- read-cell
  "Reads a single cell from `reader` and returns `[text sentinel]`.

  `sep` and `quote` are the separator and quote characters as ints.
  `sentinel` tells what terminated the cell:
    :sep - the separator was read, more cells follow on this row
    :eol - an unquoted line break (LF or CR LF) was read
    :eof - the end of the stream was reached

  Inside a quoted section separators and line breaks are kept as data, and
  a doubled quote character stands for one literal quote character."
  [^java.io.Reader reader sep quote]
  (let [sb (StringBuilder.)
        cr (int \return)]
    (loop [ch (.read reader) in-quotes? false]
      (condp == ch
        sep
        (if in-quotes?
          (do (.append sb (char sep))
              (recur (.read reader) true))
          [(.toString sb) :sep])
        quote
        (if in-quotes?
          ;; Look one character ahead: a second quote is an escaped quote,
          ;; anything else means the quoted section just ended.
          (let [next-ch (.read reader)]
            (if (== quote next-ch)
              (do (.append sb (char quote))
                  (recur (.read reader) true))
              (recur next-ch false)))
          (recur (.read reader) true))
        nline
        (if in-quotes?
          (do (.append sb \newline)
              (recur (.read reader) true))
          [(.toString sb) :eol])
        cr
        (if in-quotes?
          (do (.append sb \return)
              (recur (.read reader) true))
          ;; Outside quotes CR LF is a single line break, so the CR must not
          ;; end up in the cell text. A CR not followed by LF stays data.
          (let [next-ch (.read reader)]
            (if (== nline next-ch)
              [(.toString sb) :eol]
              (do (.append sb \return)
                  (recur next-ch false)))))
        eof
        [(.toString sb) :eof]
        ;; else
        (do (.append sb (char ch))
            (recur (.read reader) in-quotes?))))))
(defn- read-csv
  "Returns a lazy, flat seq of the cells read from `reader`, with a nil
  inserted after the last cell of every row that ended in a line break.

  `sep` and `quote` are the separator and quote characters as ints and are
  passed straight to read-cell. `after-sep?` is internal: it is true when
  the previously read cell was terminated by a separator."
  ([reader sep quote]
   (read-csv reader sep quote false))
  ([reader sep quote after-sep?]
   (lazy-seq
     (let [[cell sentinel] (read-cell reader sep quote)]
       (case sentinel
         :sep
         (cons cell (read-csv reader sep quote true))
         :eol
         ;; nil marks the row boundary for the consumer
         (list* cell nil (read-csv reader sep quote false))
         :eof
         ;; An empty cell at EOF is a real (empty) field only when a
         ;; separator came right before it; otherwise it is just the
         ;; nothing that follows the final line break.
         (when (or after-sep? (seq cell))
           (list cell)))))))
(defn parse
  "Returns a lazy seq with one element per CSV row read from `reader`; each
  element is the result of applying `f` to the cells of that row.

  Keyword options:
    :f           function applied to the cells of a row (default: vector)
    :skip-header when truthy, the first row is dropped (default: false)
    :sep         separator character (default: comma)
    :quote       quote character (default: double quote)

  The result is lazy, so it has to be consumed before `reader` is closed."
  [reader & {:keys [f skip-header sep quote]
             :or {f vector
                  skip-header false
                  sep \,
                  quote \"}}]
  (let [cells (read-csv reader (int sep) (int quote))
        ;; read-csv marks row boundaries with nil
        rows (partition-on nil? cells)
        data-rows (if skip-header
                    (next rows)
                    rows)]
    (map (fn [row] (apply f row)) data-rows)))
;; tests
;(comment
(use 'clojure.pprint)
(use 'clojure.java.io)
;; see simple.csv
(defn simple
  "Parses simple.csv, skipping its header row, and returns the fully
  realized rows."
  []
  (with-open [src (reader "simple.csv")]
    (let [rows (parse src :skip-header true)]
      ;; realize everything before with-open closes the reader
      (doall rows))))
;; see complicated.csv
(defn complicated
  "Parses complicated.csv with the default options and pretty-prints the
  resulting rows. Returns nil."
  []
  (with-open [r (reader "complicated.csv")]
    ;; pprint consumes the lazy rows itself while the reader is still open,
    ;; so no doall is needed here.
    (pprint (parse r))))
;; using a big file (~6 GB) generated with gencsv.clj
(defn max-big
  "Looks at the first `n` rows of big.csv and returns the pair
  [first-column second-column-as-int] whose second element is largest."
  [n]
  (letfn [(row->pair [id value & _]
            ;; only the first two columns matter; the rest are ignored
            [id (Integer/parseInt value)])]
    (with-open [src (reader "big.csv")]
      (->> (parse src :f row->pair)
           (take n)
           (apply max-key second)))))
;; Currently (time (max-big 1000000)) in ~8500msecs
;);comment
(use 'clojure.java.io)
(defn gencsv
  "Writes `rows` lines of random CSV data to big.csv, replacing the file.

  Each line holds five comma-separated values: the 1-based row number,
  (rand-int (rand-int 60000)), two (rand) doubles and (rand-int 60000)."
  [rows]
  (let [rstr (fn [r]
               (str r
                    "," (-> 60000 rand-int rand-int)
                    "," (rand) "," (rand)
                    "," (rand-int 60000) "\n"))]
    (with-open [^java.io.Writer w (writer "big.csv")]
      (dotimes [r rows]
        ;; Writer.write has several one-argument overloads; the hints let
        ;; the compiler pick write(String) instead of reflecting per row.
        (.write w ^String (rstr (inc r)))))))
Year Make Model
1997 Ford E350
2000 Mercury Cougar
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment