Skip to content

Instantly share code, notes, and snippets.

@stantont
Created May 26, 2010 11:20
Show Gist options
  • Select an option

  • Save stantont/414359 to your computer and use it in GitHub Desktop.

Select an option

Save stantont/414359 to your computer and use it in GitHub Desktop.
; Copyright (c) Jonas Enlund. All rights reserved.
; The use and distribution terms for this software are covered by the
; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php)
; which can be found in the file epl-v10.html at the root of this distribution.
; By using this software in any fashion, you are agreeing to be bound by
; the terms of this license.
; You must not remove this notice, or any other, from this software.
; Have the compiler print a warning for every Java interop call it cannot
; resolve at compile time, so missing type hints in the reader loop are visible.
(set! *warn-on-reflection* true)
(defn- partition-on
  "Lazily splits coll into runs of consecutive items for which pred is false.
  Each item satisfying pred acts as a delimiter and is dropped from the output.
  Two adjacent delimiters (or a leading one) produce an empty run; a single
  trailing delimiter produces nothing extra."
  [pred coll]
  (lazy-seq
    (when (seq coll)
      (let [keep?     (complement pred)
            run       (take-while keep? coll)
            remainder (drop-while keep? coll)]
        ;; `next` skips the delimiter that ended this run (if there was one).
        (cons run (partition-on pred (next remainder)))))))
;; Character code of the row terminator, as returned by Reader.read.
(def ^{:private true}
  nline (int \newline))

;; Value Reader.read returns once the stream is exhausted.
(def ^{:private true}
  eof -1)

(defn- read-cell
  "Reads one cell from reader and returns [text sentinel].

  sep and quote are the integer character codes of the separator and the
  quote character. sentinel tells the caller what ended the cell:
    :sep - an unquoted separator,
    :eol - an unquoted newline,
    :eof - the end of the stream (also when it is hit inside quotes).

  Inside quotes, separators and newlines are kept as data, and a doubled
  quote character stands for one literal quote character. Only \\newline
  terminates a row; any other character (including \\return) is kept as
  cell text."
  [^java.io.Reader reader sep quote]
  (let [sb (StringBuilder.)]
    (loop [ch (.read reader) in-quotes? false]
      (condp == ch
        sep
        (if in-quotes?
          (do (.append sb (char ch))
              (recur (.read reader) true))
          [(.toString sb) :sep])

        quote
        (if in-quotes?
          ;; Look one character ahead: a second quote is an escaped quote,
          ;; anything else means the quoted section has just ended.
          (let [next-ch (.read reader)]
            (if (== quote next-ch)
              ;; Append the configured quote character, not a hard-coded \".
              (do (.append sb (char quote))
                  (recur (.read reader) true))
              (recur next-ch false)))
          (recur (.read reader) true))

        nline
        (if in-quotes?
          (do (.append sb (char ch))
              (recur (.read reader) true))
          [(.toString sb) :eol])

        eof
        [(.toString sb) :eof]

        ;; else: ordinary character
        (do (.append sb (char ch))
            (recur (.read reader) in-quotes?))))))
(defn- read-csv
  "Returns a lazy, flat seq of the cells read from reader, with a nil inserted
  after the last cell of every newline-terminated row.

  sep and quote are integer character codes and are handed to read-cell.
  The four-argument arity is internal: prev is the sentinel that ended the
  previous cell (:eol at the start of input). It distinguishes a genuinely
  empty last cell (input ending in a separator, e.g. \"a,b,\") from the empty
  read that follows a final newline, so the former is kept and the latter
  is dropped."
  ([reader sep quote]
     (read-csv reader sep quote :eol))
  ([reader sep quote prev]
     (lazy-seq
       (let [[cell sentinel] (read-cell reader sep quote)]
         (case sentinel
           :sep
           (cons cell (read-csv reader sep quote :sep))
           :eol
           (list* cell nil (read-csv reader sep quote :eol))
           :eof
           ;; Keep the cell if it has text, or if a separator promised one.
           (when (or (= prev :sep) (seq cell))
             (cons cell nil)))))))
(defn parse
  "Parses CSV text from reader into a lazy seq with one element per row.

  Keyword options:
    :f           - function applied to the cells of each row (as separate
                   arguments); defaults to vector
    :skip-header - when true, the first row is discarded; defaults to false
    :sep         - separator character; defaults to \\,
    :quote       - quote character; defaults to \\\"

  The result is lazy, so it has to be consumed while reader is still open."
  [reader & {:keys [f skip-header sep quote]
             :or {f vector
                  skip-header false
                  sep \,
                  quote \"}}]
  (let [cells     (read-csv reader (int sep) (int quote))
        ;; read-csv marks the end of each row with nil
        rows      (partition-on nil? cells)
        data-rows (if skip-header (next rows) rows)]
    (map (fn [row] (apply f row)) data-rows)))
;; tests
;(comment
(use 'clojure.pprint)
(use 'clojure.java.io)
;;;; simple.csv ;;;;
;Year,Make,Model
;1997,Ford,E350
;2000,Mercury,Cougar
;;;;
(defn simple
  "Reads simple.csv from the working directory and returns its rows, minus the
  header row, fully realized before the file is closed."
  []
  (with-open [csv-in (reader "simple.csv")]
    (let [rows (parse csv-in :skip-header true)]
      ;; Force the lazy result now; the reader is closed on exit.
      (doall rows))))
;;;; complicated.csv ;;;;
;1997,Ford,E350,"ac, abs, moon",3000.00
;1999,Chevy,"Venture ""Extended Edition""","",4900.00
;1999,Chevy,"Venture ""Extended Edition, Very Large""","",5000.00
;1996,Jeep,Grand Cherokee,"MUST SELL!
;air, moon roof, loaded",4799.00
;;;;
(defn complicated
  "Pretty-prints every parsed row of complicated.csv (read from the working
  directory). Returns nil."
  []
  (with-open [r (reader "complicated.csv")]
    ;; pprint walks the whole lazy result while the reader is still open and
    ;; returns nil, so wrapping it in doall would have no effect.
    (pprint (parse r))))
;;;; big file (~ 6Gb) generated with: ;;;;
;(use 'clojure.java.io)
;(defn gencsv [rows]
; (let [rstr (fn [row]
; (str row "," (rand-int (rand-int 60000)) "," (rand)
; "," (rand) "," (rand-int 60000) "\n"))]
; (with-open [w (writer "big.csv")]
; (dotimes [r rows] (.write w (rstr (inc r)))))))
(defn max-big
  "Scans the first n rows of big.csv (read from the working directory) and
  returns [first-cell second-cell-as-int] for the row whose second cell,
  parsed as an integer, is largest. Returns nil when there are no rows to
  compare (n is zero or the file is empty)."
  [n]
  (let [f (fn [i d & _]
            ;; keep the first cell and the numeric value of the second
            [i (Integer/parseInt d)])]
    (with-open [r (reader "big.csv")]
      ;; max-key needs at least one item, so guard against an empty selection.
      (when-let [rows (seq (take n (parse r :f f)))]
        (apply max-key second rows)))))
;; Currently (time (max-big 1000000)) in ~8500msecs
;);comment
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment