Skip to content

Instantly share code, notes, and snippets.

@atroche
Created March 1, 2017 08:04
Show Gist options
  • Save atroche/a0802d6a718c6fa841e54fa31f2b6f3e to your computer and use it in GitHub Desktop.
Save atroche/a0802d6a718c6fa841e54fa31f2b6f3e to your computer and use it in GitHub Desktop.
(ns sunshine.widefinder
(:import (java.io BufferedInputStream InputStreamReader BufferedReader RandomAccessFile FileInputStream File)))
(defn chunk-file
  "Partitions the file named by `filename` into `n` line-aligned chunks.

  Returns a realized list of (start end) byte-offset pairs. The first chunk
  starts at 0 and the last one ends at the file length. Every other boundary
  is placed just after the first newline found at or after the evenly spaced
  raw offset, so no line is split between two chunks. A pair whose start
  equals its end denotes an empty chunk (this happens when a single line
  spans several raw offsets).

  `n` values below 1 are treated as 1. An empty file yields an empty list."
  [filename n]
  (with-open [file (RandomAccessFile. filename "r")]
    (let [length (.length file)
          n      (max 1 (long n))
          lf     (int \newline)
          ;; Moves a raw offset forward to the start of the next line.
          align  (fn [offset]
                   (if (zero? offset)
                     0
                     (do (.seek file offset)
                         ;; Stop at EOF (-1) as well as at a newline: the
                         ;; last line may have no terminator, and .read
                         ;; keeps returning -1 once the end is reached.
                         (loop [b (.read file)]
                           (when-not (or (== b -1) (== b lf))
                             (recur (.read file))))
                         (.getFilePointer file))))
          ;; Integer arithmetic keeps offsets as longs and yields exactly n
          ;; starting points even when the length is not divisible by n.
          starts (mapv #(align (quot (* % length) n)) (range n))]
      (if (zero? length)
        ()
        (doall (partition 2 1 (conj starts length)))))))
(defn read-lines-range
  "Returns a lazy sequence of the lines of `file` that begin inside the byte
  range [start-byte, end-byte).

  `file` is anything `java.io.FileInputStream` accepts as its constructor
  argument. The content is decoded as US-ASCII, so one character is one byte.
  Each line is charged its length plus one byte for the newline terminator;
  this assumes single-byte `\\n` line endings.

  The underlying reader is closed only when the sequence has been consumed to
  its end, so callers should realize the whole sequence."
  [file start-byte end-byte]
  (let [reader (-> (doto (FileInputStream. file)
                     (.skip start-byte))
                   (BufferedInputStream. (* 8 131072))
                   (InputStreamReader. "US-ASCII")
                   (BufferedReader. 131072))]
    (letfn [(read-line [remaining]
              (lazy-seq
                (if-let [line (and (pos? remaining) (.readLine reader))]
                  ;; readLine strips the terminator, so account for it here;
                  ;; otherwise the budget shrinks too slowly and the reader
                  ;; runs past end-byte into the next chunk's lines.
                  (cons line (read-line (- remaining (count line) 1)))
                  ;; .close returns nil, which also terminates the sequence.
                  (.close reader))))]
      (read-line (- end-byte start-byte)))))
;; Times a parallel line count of "ten.json", split into roughly 32 MiB chunks.
(time
  (let [fname       "ten.json"
        chunk-bytes (* 32 1024 1024)
        ;; At least one chunk: a file smaller than chunk-bytes would otherwise
        ;; give a chunk count of 0 and a division by zero when partitioning.
        chunk-count (max 1 (quot (.length (File. fname)) chunk-bytes))]
    (println chunk-count)
    (->> (chunk-file fname chunk-count)
         ;; Count inside the worker. read-lines-range returns a lazy sequence,
         ;; so returning it unrealized from pmap would leave all of the file
         ;; reading to be done serially by the consumer.
         (pmap (fn [[start end]]
                 (count (read-lines-range fname start end))))
         (reduce +))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment