Created
December 26, 2014 19:40
-
-
Save favila/035718ab762c6adfc8dc to your computer and use it in GitHub Desktop.
Using iota to process an XYZ file in parallel over its chunks. Easily generalized to any situation where you want to fold over groups where the source is not grouped. Prompted by this post: https://groups.google.com/forum/#!topic/clojure/OkxAshQ0JTU
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns xyz | |
"Utilities for determining chunk position in XYZ files and processing | |
chunks using reducers. | |
Uses iota, which uses mmap()." | |
(:require [clojure.core.reducers :as r] | |
iota)) | |
(defn- step-chunk-starts
  "Reducing step of a small state machine that records the line index at
  which each XYZ chunk begins.

  The accumulator is `[starts i state]`:
    starts - `(vector-of :int)` of chunk-start line indexes found so far
    i      - index of the line about to be consumed
    state  - `:find-start` while scanning for an atom-count line, or
             `:comment` for the line directly after an atom-count line

  Arities:
    []       initial accumulator.
    [acc]    completion; returns `starts` with the total line count appended.
    [acc l]  consumes line `l` and returns the next accumulator."
  ([] [(vector-of :int) 0 :find-start])
  ([[v i _]] (conj v i))
  ([[v i state] ^String l]
   (case state
     ;; An atom-count line holds nothing but digits. Surrounding whitespace is
     ;; tolerated because many XYZ writers pad the count, and a nil line is
     ;; treated as a non-match instead of blowing up inside re-matches (some
     ;; line sources may hand back nil for blank lines).
     :find-start (if (and l (re-matches #"\s*\d+\s*" l))
                   [(conj v i) (inc i) :comment]
                   [v (inc i) :find-start])
     ;; The comment line is skipped without inspection, so a comment that
     ;; happens to be all digits is never mistaken for a chunk start.
     :comment [v (inc i) :find-start])))
(defn chunk-starts
  "Scans `lines`, a reducible collection of XYZ file lines, and returns
  `[chunk-start-line ... total-num-of-lines]`: the index of every line that
  opens a chunk, followed by the total number of lines seen."
  [lines]
  ;; The 2-arg r/reduce seeds itself by calling (step-chunk-starts); the outer
  ;; 1-arg call is the completion step that appends the total line count.
  (step-chunk-starts (r/reduce step-chunk-starts lines)))
(defn chunk-ranges
  "Pairs each entry of `start+total` with its successor, returning
  `[[start0 start1] [start1 start2] ... [startN total]]`.
  Equivalent to `(partition 2 1 start+total)`, but eager, with every pair
  stored as a `(vector-of :int ...)`.
  Each pair is suitable as the start/end arguments to subvec for grabbing one
  group of items at a time."
  [start+total]
  (mapv (fn [[from to]] (vector-of :int from to))
        (partition 2 1 start+total)))
(defn index-xyz*
  "Builds the chunk index for `coll`, a reducible collection of XYZ lines.
  The result is a vec of `[start end]` index pairs, one per XYZ chunk in
  `coll`. Retrieve the chunks themselves with
  `(map #(apply subvec coll %) (index-xyz* coll))`."
  [coll]
  (chunk-ranges (chunk-starts coll)))
(defn index-xyz
  "Return the chunk index of the XYZ file `xyzfile`, which is handed to
  `iota/seq` to obtain the file's lines."
  [xyzfile]
  (let [lines (iota/seq xyzfile)]
    (index-xyz* lines)))
(defn foldable-chunks*
  "Return a foldable collection of the chunks in `coll`.

  `coll` must be usable with subvec. `index` is a collection of `[start end]`
  pairs; when omitted it is computed from `coll` with `index-xyz*`."
  ([coll]
   (let [ranges (index-xyz* coll)]
     (foldable-chunks* coll ranges)))
  ([coll index]
   (let [slice (fn [[from to]] (subvec coll from to))]
     ;; r/map builds no intermediate collection; each subvec is created only
     ;; when the result is reduced or folded.
     (r/map slice index))))
(defn foldable-chunks
  "Return a foldable collection of the chunks in the XYZ file `xyzfile`,
  which is opened with `iota/vec`."
  [xyzfile]
  (-> xyzfile
      iota/vec
      foldable-chunks*))
(comment
  "Silly example: count all the chunks."
  (->> (foldable-chunks "myfile.xyz")
       (r/map (constantly 1))
       (r/fold +))

  "Same."
  (->> (foldable-chunks "myfile.xyz")
       (r/fold + (fn ([] 0) ([x _] (inc x)))))

  "Get comment line of each chunk."
  ;; Each element is one chunk (a vector of lines) passed as a single
  ;; argument, so the lines have to be destructured out of that argument
  ;; rather than declared as separate parameters.
  (->> (foldable-chunks "myfile.xyz")
       (r/map (fn [[atom-count comment & atoms]] comment))
       (r/foldcat)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment