Created
May 26, 2010 11:20
-
-
Save stantont/414359 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ; Copyright (c) Jonas Enlund. All rights reserved. | |
| ; The use and distribution terms for this software are covered by the | |
| ; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) | |
| ; which can be found in the file epl-v10.html at the root of this distribution. | |
| ; By using this software in any fashion, you are agreeing to be bound by | |
| ; the terms of this license. | |
| ; You must not remove this notice, or any other, from this software. | |
| (set! *warn-on-reflection* true) | |
(defn- partition-on
  "Lazily splits `coll` into sub-sequences, cutting at every element for
  which `pred` is truthy. The matching elements act as delimiters and are
  not included in the result."
  [pred coll]
  (lazy-seq
    (when-let [s (seq coll)]
      (let [keep? (complement pred)
            chunk (take-while keep? s)
            ;; Starts at the delimiter that ended `chunk`, if there was one.
            remainder (drop-while keep? s)]
        ;; `next` steps over the delimiter itself before recursing.
        (cons chunk (partition-on pred (next remainder)))))))
;; Newline as an int, so it can be compared with `==` against the int
;; returned by java.io.Reader's .read.
(def ^{:private true} nline (int \newline))

;; Value that .read yields once the stream is exhausted.
(def ^{:private true} eof -1)
(defn- read-cell
  "Reads a single CSV cell from `reader`, one character at a time.

  `sep` and `quote` are the separator and quote characters given as ints,
  so they can be compared directly with the int returned by `.read`.

  Returns a vector `[text sentinel]`, where `sentinel` tells what ended
  the cell:
    :sep - the separator character,
    :eol - a line break (newline, or carriage return followed by newline)
           outside of quotes,
    :eof - the end of the stream.

  Inside a quoted section, separators and line breaks are kept as literal
  text, and a doubled quote character stands for one literal quote
  character. A carriage return that is not followed by a newline is kept
  as ordinary cell text."
  [^java.io.Reader reader sep quote]
  (let [sb (StringBuilder.)
        cr (int \return)]
    (loop [ch (.read reader) in-quotes? false]
      (condp == ch
        sep
        (if in-quotes?
          (do (.append sb (char sep))
              (recur (.read reader) true))
          [(.toString sb) :sep])

        quote
        (if in-quotes?
          ;; Look one character ahead: a second quote is an escaped quote,
          ;; anything else means the quoted section has just ended.
          (let [next-ch (.read reader)]
            (if (== quote next-ch)
              ;; Emit the configured quote character rather than a
              ;; hard-coded double quote, so custom quote chars round-trip.
              (do (.append sb (char quote))
                  (recur (.read reader) true))
              (recur next-ch false)))
          (recur (.read reader) true))

        nline
        (if in-quotes?
          (do (.append sb \newline)
              (recur (.read reader) true))
          [(.toString sb) :eol])

        cr
        (if in-quotes?
          (do (.append sb \return)
              (recur (.read reader) true))
          ;; Treat CR LF as a single line break; a lone CR stays in the cell.
          (let [next-ch (.read reader)]
            (if (== nline next-ch)
              [(.toString sb) :eol]
              (do (.append sb \return)
                  (recur next-ch false)))))

        eof
        [(.toString sb) :eof]

        ;; else: ordinary character, keep accumulating
        (do (.append sb (char ch))
            (recur (.read reader) in-quotes?))))))
(defn- read-csv
  "Returns a lazy, flat sequence of the cell strings read from `reader`,
  with a `nil` inserted after the last cell of every line that ended in a
  line break. `sep` and `quote` are passed through to `read-cell`.

  At end of input an empty final cell is dropped unless it directly follows
  a separator: that way a trailing newline does not produce a spurious
  empty row, while an input ending in `a,b,` still yields three cells,
  the same as it would with a trailing newline."
  [reader sep quote]
  (letfn [(cells [after-sep?]
            (lazy-seq
              (let [[cell sentinel] (read-cell reader sep quote)]
                (case sentinel
                  :sep
                  (cons cell (cells true))
                  :eol
                  ;; nil marks the row boundary for the consumer.
                  (list* cell nil (cells false))
                  :eof
                  (when (or after-sep? (seq cell))
                    (list cell))))))]
    (cells false)))
(defn parse
  "Parses CSV text from `reader` and returns a lazy sequence with one
  element per row.

  Keyword options:
    :f           - function applied to the cells of each row (default `vector`)
    :skip-header - when truthy, the first row is dropped (default false)
    :sep         - separator character (default comma)
    :quote       - quote character (default double quote)

  The result is lazy and reads from `reader` as it is consumed, so it has
  to be realized before the reader is closed."
  [reader & {:keys [f skip-header sep quote]
             :or {f vector
                  skip-header false
                  sep \,
                  quote \"}}]
  (let [cells (read-csv reader (int sep) (int quote))
        ;; read-csv marks the end of each row with nil.
        rows (partition-on nil? cells)
        data-rows (if skip-header
                    (next rows)
                    rows)]
    (map (fn [row] (apply f row)) data-rows)))
| ;; tests | |
| ;(comment | |
| (use 'clojure.pprint) | |
| (use 'clojure.java.io) | |
| ;;;; simple.csv ;;;; | |
| ;Year,Make,Model | |
| ;1997,Ford,E350 | |
| ;2000,Mercury,Cougar | |
| ;;;; | |
(defn simple
  "Parses simple.csv, skipping its header row, and returns the fully
  realized rows."
  []
  (with-open [r (reader "simple.csv")]
    (let [rows (parse r :skip-header true)]
      ;; Force every row while the reader is still open.
      (doall rows))))
| ;;;; complicated.csv ;;;; | |
| ;1997,Ford,E350,"ac, abs, moon",3000.00 | |
| ;1999,Chevy,"Venture ""Extended Edition""","",4900.00 | |
| ;1999,Chevy,"Venture ""Extended Edition, Very Large""","",5000.00 | |
| ;1996,Jeep,Grand Cherokee,"MUST SELL! | |
| ;air, moon roof, loaded",4799.00 | |
| ;;;; | |
(defn complicated
  "Parses complicated.csv and pretty-prints the resulting rows.
  Returns nil."
  []
  (with-open [r (reader "complicated.csv")]
    ;; pprint walks the whole lazy sequence while the reader is still open,
    ;; so no doall is needed (wrapping pprint's nil result in doall did
    ;; nothing).
    (pprint (parse r))))
| ;;;; big file (~ 6Gb) generated with: ;;;; | |
| ;(use 'clojure.java.io) | |
| ;(defn gencsv [rows] | |
| ; (let [rstr (fn [row] | |
| ; (str row "," (rand-int (rand-int 60000)) "," (rand) | |
| ; "," (rand) "," (rand-int 60000) "\n"))] | |
| ; (with-open [w (writer "big.csv")] | |
| ; (dotimes [r rows] (.write w (rstr (inc r))))))) | |
(defn max-big
  "Reads the first `n` rows of big.csv and returns the `[first-cell n2]`
  pair whose `n2` (the second cell parsed as an int) is largest."
  [n]
  (letfn [(row->pair [i d & _]
            ;; Only the first two cells matter; the rest are ignored.
            [i (Integer/parseInt d)])]
    (with-open [r (reader "big.csv")]
      ;; max-key consumes the rows eagerly, inside with-open.
      (->> (parse r :f row->pair)
           (take n)
           (apply max-key second)))))
| ;; Currently (time (max-big 1000000)) runs in ~8500 msecs | |
| ;);comment |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment