Last active
March 9, 2020 18:42
-
-
Save joinr/8c8c82f0e9d814d2162f77c3eda3e0bd to your computer and use it in GitHub Desktop.
No-seq parsing implementation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;;extracted from the original function, refined to
;;work with iterables via eduction.
(defn derive-parsers
  "Builds one column parser per entry in `headers` and returns them as a vector.

  headers         - sequence of column headers; its count is the column count.
  parser-fn       - optional fn called as (parser-fn header column-sample);
                    when nil, `default-column-parser` is called once per column.
  parser-scan-len - maximum number of rows of `data` to sample.
  data            - reducible/seqable collection of rows.

  Only the first `parser-scan-len` rows of `data` are consumed, so the head
  of the full data set is never retained."
  [headers parser-fn parser-scan-len data]
  (let [n-cols (count headers)]
    (if-not parser-fn
      ;; A bunch of default parsers. Realized into a vector so callers that
      ;; index into the result (e.g. via java.util.List/.get) get O(1) access.
      (vec (repeatedly n-cols default-column-parser))
      (let [;; Only sample what we need.
            scan-rows (into [] (take parser-scan-len) data)
            ;; The data may hold fewer rows than parser-scan-len; the
            ;; transposition below must use the number of rows actually read,
            ;; otherwise cells from different columns get grouped together.
            n-rows (count scan-rows)
            scan-cols (if (zero? n-rows)
                        ;; No data rows: hand every parser-fn an empty sample.
                        (repeat n-cols [])
                        ;; Interleaving the rows yields cells in column-major
                        ;; order; each run of n-rows cells is one column.
                        (->> (apply interleave scan-rows)
                             (partition n-rows)))]
        (mapv parser-fn headers scan-cols)))))
;;our reducing function. | |
;;returns our column parsers (really parse state), uses | |
;;side effects. | |
(defn parse-row
  "Reducing function: feeds one row of string cells to the column parsers.

  parsers - java.util.List of column parsers, one per column.
  row     - String array holding the cells of a single row.

  For each column, a cell that is non-nil, non-empty and not \"na\"
  (case-insensitive) is handed to `parse!`; anything else is reported through
  `missing!`. Rows shorter than the parser list are treated as missing for the
  trailing columns. Works by side effect on the parsers and returns `parsers`
  so it can be used directly with `reduce`."
  [^List parsers ^"[Ljava.lang.String;" row]
  (let [;; The column count is defined by the parsers; the original referenced
        ;; an unbound `n-cols` left over from the enclosing function.
        n-cols (.size parsers)
        n-cells (alength row)]
    (loop [col-idx 0]
      (when (< col-idx n-cols)
        (let [;; Guard against ragged rows so a short row cannot throw
              ;; ArrayIndexOutOfBoundsException.
              ^String row-data (when (< col-idx n-cells)
                                 (aget row col-idx))
              parser (.get parsers col-idx)]
          (if (and row-data
                   (> (.length row-data) 0)
                   (not (.equalsIgnoreCase "na" row-data)))
            (parse! parser row-data)
            (missing! parser))
          (recur (unchecked-inc col-idx)))))
    parsers))
;;slight modification to original implementation. | |
;;avoids retaining head of the seq. uses | |
;;iterables to generate eduction recipes, | |
;;and reduce instead of generating intermediate | |
;;seqs. | |
(defn csv->columns
  "Parses `input` into a vector of column maps, one per column.

  Options (keyword arguments):
    :header-row?     - when true (default) the first row supplies the column
                       names and is excluded from the data.
    :parser-fn       - optional fn passed through to `derive-parsers`.
    :parser-scan-len - number of rows sampled by `derive-parsers`
                       (default 100).

  Each returned element is `(column-data parser)` with a :name key added.
  The name is the header cell when :header-row? is true, otherwise the
  integer column index.

  Rows are consumed through eductions and `reduce`, so no intermediate seq
  of the whole input is built."
  [input & {:keys [header-row? parser-fn
                   parser-scan-len]
            :or {header-row? true
                 parser-scan-len 100}}]
  (let [;; A new iterable is requested for each pass over the input, hence a
        ;; function rather than a value.
        ->rows (fn [] (raw-row-iterable input))
        initial-row (->> (->rows) (eduction (take 1)) first)
        n-cols (count initial-row)
        ->data (fn [] (if header-row?
                        (eduction (drop 1) (->rows))
                        (->rows)))
        ;; Headers handed to derive-parsers. Without a header row, synthesize
        ;; "col_0", "col_1", ... (the original called one-arg `map` on a
        ;; string here, which produces a transducer and makes `vec` throw).
        header-row (or (and header-row? initial-row)
                       (mapv #(str "col_" %) (range n-cols)))
        ^List column-parsers
        (derive-parsers header-row parser-fn parser-scan-len (->data))]
    (->> (->data)
         ;; parse-row works by side effect on the parsers and returns them.
         (reduce parse-row column-parsers)
         ;; ->> threads the parsers in as the last collection argument.
         (mapv (fn [init-row-data parser]
                 (assoc (column-data parser)
                        :name init-row-data))
               (if header-row?
                 initial-row
                 (range n-cols))))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment