Last active
April 20, 2020 10:55
-
-
Save joinr/050a536b7ac01b50ae3dfa00cb7e5a74 to your computer and use it in GitHub Desktop.
A simple implementation of string pooling for clojure.data.csv that nets about 4x compression over the naive implementation.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Copyright (c) Jonas Enlund. All rights reserved. The use and | |
;; distribution terms for this software are covered by the Eclipse | |
;; Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) | |
;; which can be found in the file epl-v10.html at the root of this | |
;; distribution. By using this software in any fashion, you are | |
;; agreeing to be bound by the terms of this license. You must not | |
;; remove this notice, or any other, from this software. | |
;;string pool modification fork by joinr | |
(ns clojure.data.csv.pool | |
(:require [clojure [string :as str]]) | |
(:import [java.util.concurrent ConcurrentHashMap] | |
[java.io PushbackReader Reader Writer StringReader EOFException])) | |
;; Ask the compiler to warn about reflective Java interop calls in this file.
(set! *warn-on-reflection* true)

;; Initial capacity passed to the string pool's backing ConcurrentHashMap.
(def ^{:dynamic true} *pool-size* 100)

;; Entry count above which the string pool is emptied on its next lookup.
(def ^{:dynamic true} *pool-bound* 10000)
(defn ->string-pool
  "Builds a canonicalizing pool backed by a ConcurrentHashMap created with
  initial capacity n.

  The returned object is callable with one argument: if an equal value is
  already pooled, the already-pooled instance is returned; otherwise the
  argument is stored and returned as-is. Before each lookup, if the pool
  holds more than bound entries it is cleared entirely, so references
  handed out earlier are no longer canonical for later lookups.

  Dereferencing the pool (@pool) yields a lazy seq of the currently
  pooled values. For data with few distinct strings this gives a lot of
  referential sharing; typically only used during parsing."
  [n bound]
  (let [^ConcurrentHashMap interned (ConcurrentHashMap. (int n))]
    (reify
      clojure.lang.IFn
      (invoke [_ s]
        ;; Crude eviction: once the bound is exceeded, drop everything.
        (when (> (.size interned) (int bound))
          (.clear interned))
        ;; putIfAbsent returns the previously pooled instance, or nil when
        ;; s has just been inserted.
        (or (.putIfAbsent interned s s) s))
      clojure.lang.IDeref
      (deref [_]
        (->> (seq interned)
             (map key))))))
;(set! *warn-on-reflection* true) | |
;; Reading | |
;; Integer character codes compared against the results of Reader.read().
(def ^:private lf (int \newline))
(def ^:private cr (int \return))
;; Value Reader.read() returns once the end of the stream is reached.
(def ^:private eof -1)
(defn- read-quoted-cell
  "Reads the rest of a quoted cell from reader into sb. The opening quote
  must already have been consumed by the caller. sep and quote are integer
  character codes.

  Returns :sep, :eol or :eof according to what follows the closing quote.
  Throws Exception if the closing quote is followed by any other
  character, and EOFException if the input ends inside the quoted cell."
  [^PushbackReader reader ^StringBuilder sb sep quote]
  (loop [ch (.read reader)]
    (condp == ch
      quote (let [next-ch (.read reader)]
              (condp == next-ch
                ;; A doubled quote is an escaped literal quote character.
                quote (do (.append sb (char quote))
                          (recur (.read reader)))
                sep :sep
                lf :eol
                cr (let [next-next-ch (.read reader)]
                     ;; Swallow the LF of a CR+LF pair; push anything else
                     ;; back. EOF (-1) must never be pushed back:
                     ;; PushbackReader.unread casts it to \uffff, which
                     ;; would later be read as a real character and
                     ;; produce a bogus trailing record.
                     (when-not (or (== next-next-ch lf)
                                   (== next-next-ch eof))
                       (.unread reader next-next-ch))
                     :eol)
                eof :eof
                (throw (Exception. ^String (format "CSV error (unexpected character: %c)" next-ch)))))
      eof (throw (EOFException. "CSV error (unexpected end of file)"))
      (do (.append sb (char ch))
          (recur (.read reader))))))
(defn- read-cell
  "Reads one cell from reader into sb. sep and quote are integer character
  codes. A cell whose first character is the quote character is delegated
  to read-quoted-cell.

  Returns :sep, :eol or :eof according to what terminated the cell."
  [^PushbackReader reader ^StringBuilder sb sep quote]
  (let [first-ch (.read reader)]
    (if (== first-ch quote)
      (read-quoted-cell reader sb sep quote)
      (loop [ch first-ch]
        (condp == ch
          sep :sep
          lf :eol
          cr (let [next-ch (.read reader)]
               ;; Swallow the LF of a CR+LF pair; push anything else back.
               ;; EOF (-1) must never be pushed back: PushbackReader.unread
               ;; casts it to \uffff, which would later be read as a real
               ;; character and produce a bogus trailing record.
               (when-not (or (== next-ch lf)
                             (== next-ch eof))
                 (.unread reader next-ch))
               :eol)
          eof :eof
          (do (.append sb (char ch))
              (recur (.read reader))))))))
(defn- read-record
  "Reads one record (line) of cells from reader.

  Each cell is accumulated in a fresh StringBuilder and converted with
  to-str before being added to the record. The 3-argument arity uses str
  as to-str.

  Returns [cells sentinel], where cells is a vector of converted cells
  and sentinel is the non-:sep value (:eol or :eof) that read-cell
  returned for the last cell."
  ([reader sep quote to-str]
   (loop [record (transient [])]
     (let [cell (StringBuilder.)
           sentinel (read-cell reader cell sep quote)]
       (if (= sentinel :sep)
         (recur (conj! record (to-str cell)))
         [(persistent! (conj! record (to-str cell))) sentinel]))))
  ;; Must forward reader: dropping it shifts every argument by one and
  ;; re-enters this same arity indefinitely.
  ([reader sep quote] (read-record reader sep quote str)))
(defprotocol Read-CSV-From
  "Input sources from which CSV records can be read."
  (read-csv-from [input sep quote]
    "Reads CSV records from input. sep and quote are the integer character
    codes of the separator and quote characters."))
(defn pooled-read
  "Returns a lazy sequence of the records read from reader. pool is the
  function handed to read-record to convert each cell's StringBuilder
  into the value stored in the record.

  Reading stops at end of input; a final record equal to [\"\"] is
  dropped."
  [reader sep quote pool]
  (lazy-seq
    (let [[row terminator] (read-record reader sep quote pool)]
      (case terminator
        :eol (cons row (pooled-read reader sep quote pool))
        :eof (if (= row [""])
               nil
               (list row))))))
;; Strings and plain Readers are wrapped in a PushbackReader and
;; re-dispatched; the PushbackReader implementation does the actual parse.
(extend-protocol Read-CSV-From
  String
  (read-csv-from [text sep quote]
    (-> text
        StringReader.
        PushbackReader.
        (read-csv-from sep quote)))

  Reader
  (read-csv-from [rdr sep quote]
    (-> rdr
        PushbackReader.
        (read-csv-from sep quote)))

  PushbackReader
  (read-csv-from [rdr sep quote]
    ;; A fresh pool per call, sized from the dynamic vars; each cell is
    ;; turned into a string and then canonicalized through the pool.
    (let [pool (->string-pool *pool-size* *pool-bound*)]
      (pooled-read rdr sep quote (comp pool str)))))
(defn read-csv
  "Parses CSV data from input (a String or java.io.Reader) and returns a
  lazy sequence of vectors, one per record.

  Valid options are
    :separator (default \\,)
    :quote (default \\\")"
  [input & options]
  (let [{:keys [separator quote] :or {separator \, quote \"}} options
        sep-code (int separator)
        quote-code (int quote)]
    (read-csv-from input sep-code quote-code)))
;; Writing | |
(defn- write-cell
  "Writes (str obj) to writer as a single cell. When (quote? text) is
  truthy the text is wrapped in the quote character and every embedded
  quote character is doubled. sep is accepted but not used here."
  [^Writer writer obj sep quote quote?]
  (let [text (str obj)]
    (if (quote? text)
      (let [q (int quote)]
        (.write writer q)
        (.write writer ^String (str/escape text {quote (str quote quote)}))
        (.write writer q))
      (.write writer text))))
(defn- write-record
  "Writes the cells of record to writer, placing the separator character
  between consecutive cells. Writes nothing for an empty record."
  [^Writer writer record sep quote quote?]
  (when-let [cells (seq record)]
    (write-cell writer (first cells) sep quote quote?)
    ;; Every cell after the first is preceded by the separator.
    (doseq [cell (rest cells)]
      (.write writer (int sep))
      (write-cell writer cell sep quote quote?))))
(defn- write-csv*
  "Writes each record in records to writer, following every record
  (including the last) with the newline string."
  [^Writer writer records sep quote quote? ^String newline]
  (doseq [row records]
    (write-record writer row sep quote quote?)
    (.write writer newline)))
(defn write-csv
  "Writes data, a sequence of records (each a sequence of cells), to
  writer in CSV format.

  Valid options are
    :separator (Default \\,)
    :quote (Default \\\")
    :quote? (A predicate function which determines if a string should be quoted. Defaults to quoting only when necessary.)
    :newline (:lf (default) or :cr+lf)"
  [writer data & options]
  (let [{sep-opt :separator
         quote-opt :quote
         pred-opt :quote?
         nl-opt :newline} (apply hash-map options)
        separator (or sep-opt \,)
        quote (or quote-opt \")
        ;; Default predicate: quote only cells containing the separator,
        ;; the quote character, or a line-break character.
        quote? (or pred-opt
                   (fn [s] (some #{separator quote \return \newline} s)))
        line-end (get {:lf "\n" :cr+lf "\r\n"} (or nl-opt :lf))]
    (write-csv* writer data separator quote quote? line-end)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment