Skip to content

Instantly share code, notes, and snippets.

@joinr
Last active March 9, 2020 18:42
Show Gist options
  • Save joinr/8c8c82f0e9d814d2162f77c3eda3e0bd to your computer and use it in GitHub Desktop.
Save joinr/8c8c82f0e9d814d2162f77c3eda3e0bd to your computer and use it in GitHub Desktop.
No-seq parsing implementation
;;extracted from the original function and refined to
;;work with iterables via eduction.
(defn derive-parsers
  "Builds one column parser per header.

  headers         - sequence of column headers; its count is the column count.
  parser-fn       - optional function invoked as (parser-fn header column-sample),
                    where column-sample is the sampled cell values of that column.
                    When nil, `default-column-parser` is called once per column.
  parser-scan-len - maximum number of rows taken from `data` for sampling.
  data            - reducible collection of rows (each row a sequence of cells).

  Returns a vector of parsers. When `parser-fn` is supplied and `data` has no
  rows, the result is empty."
  [headers parser-fn parser-scan-len data]
  (let [n-cols (count headers)]
    (if-not parser-fn
      ;; a bunch of default parsers, realized eagerly so indexed access is cheap
      (vec (repeatedly n-cols default-column-parser))
      ;; only sample what we need; `into` with a transducer reduces over `data`
      ;; directly, so no seq head is retained.
      (let [sample (into [] (take parser-scan-len) data)
            n-rows (count sample)]
        (if (zero? n-rows)
          []
          ;; interleaving the rows yields all first cells, then all second
          ;; cells, and so on; each column therefore spans exactly n-rows
          ;; entries. Partitioning by the requested scan length instead would
          ;; mix columns whenever fewer rows than that are available.
          (->> sample
               (apply interleave)
               (partition n-rows)
               (mapv parser-fn headers)))))))
;;our reducing function.
;;updates the column parsers (really the parse state) through
;;side effects, then returns them.
(defn parse-row
  "Reducing step: feeds one row of cell strings into the column parsers.

  parsers - java.util.List of column parsers, one per column.
  row     - String array holding the cells of a single row.

  For each column, a cell that is nil, empty, or equal to \"na\" (ignoring
  case) is reported through `missing!`; any other cell is handed to `parse!`.
  A row with fewer cells than there are parsers has its absent cells reported
  as missing; cells beyond the last parser are ignored.

  Returns `parsers` so the function can be used directly with `reduce`."
  [^List parsers ^"[Ljava.lang.String;" row]
  ;; The column count comes from the parser list itself; the function takes
  ;; no other source for it.
  (let [n-cols (.size parsers)
        n-cells (alength row)]
    (loop [col-idx 0]
      (when (< col-idx n-cols)
        (let [^String row-data (when (< col-idx n-cells)
                                 (aget row col-idx))
              parser (.get parsers col-idx)]
          (if (and row-data
                   (> (.length row-data) 0)
                   (not (.equalsIgnoreCase "na" row-data)))
            (parse! parser row-data)
            (missing! parser))
          (recur (unchecked-inc col-idx))))))
  parsers)
;;slight modification to original implementation.
;;avoids retaining head of the seq. uses
;;iterables to generate eduction recipes,
;;and reduce instead of generating intermediate
;;seqs.
(defn csv->columns
  "Parses `input` into a vector of column descriptions.

  Options (keyword arguments):
    :header-row?     - when true (default), the first row supplies the column
                       names and is skipped when parsing data.
    :parser-fn       - optional function passed through to `derive-parsers`.
    :parser-scan-len - number of rows sampled by `derive-parsers` (default 100).

  Returns a vector with one entry per column: the value of `column-data` for
  that column's parser with :name associated. Names are the header cells when
  `header-row?` is true, otherwise the integer column indices."
  [input & {:keys [header-row? parser-fn
                   parser-scan-len]
            :or {header-row? true
                 parser-scan-len 100}}]
  (let [;; I think we have to create a new iterator each time, so use a function.
        ->rows (fn [] (raw-row-iterable input))
        initial-row (->> (->rows) (eduction (take 1)) first)
        n-cols (count initial-row)
        ->data (fn [] (if header-row?
                        (eduction (drop 1) (->rows))
                        (->rows)))
        ;; Without a header row, synthesize "col_0", "col_1", ... Calling
        ;; `map` with a single argument would return a transducer rather than
        ;; a sequence, so map an explicit function over the index range.
        header-row (or (and header-row? initial-row)
                       (mapv #(str "col_" %) (range n-cols)))
        ;; Force the parsers into a vector so sampling finishes before the
        ;; main pass starts and indexed access by parse-row stays cheap.
        ^List column-parsers
        (vec (derive-parsers header-row parser-fn parser-scan-len (->data)))]
    (->> (->data)
         (reduce parse-row column-parsers)
         (mapv (fn [init-row-data parser]
                 (assoc (column-data parser)
                        :name init-row-data))
               (if header-row?
                 initial-row
                 (range n-cols))))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment