Created
March 1, 2017 08:04
-
-
Save atroche/a0802d6a718c6fa841e54fa31f2b6f3e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns sunshine.widefinder | |
(:import (java.io BufferedInputStream InputStreamReader BufferedReader RandomAccessFile FileInputStream File))) | |
(defn chunk-file
  "Partitions a file into at most n line-aligned chunks. Returns a list of
  (start end) byte offset pairs, start inclusive and end exclusive, that
  together cover the whole file.

  Every interior chunk boundary is placed just after a newline byte, so no
  line is split across two chunks. Fewer than n pairs are returned when
  several boundaries land on the same position (for example a line longer
  than a chunk), and an empty list is returned for an empty file. An n below
  1 is treated as 1."
  [filename n]
  (with-open [file (RandomAccessFile. filename "r")]
    (let [len     (.length file)
          n       (max 1 (long n))
          newline (int \newline)
          align   (fn [i]
                    ;; Integer arithmetic keeps the seek target an exact long
                    ;; instead of a Ratio and yields exactly n start points.
                    (let [offset (quot (* (long i) len) n)]
                      (.seek file offset)
                      (when-not (zero? offset)
                        ;; Advance to just past the next newline. .read
                        ;; returns -1 at end of file, which must also end the
                        ;; scan; otherwise a last line without a trailing
                        ;; newline would loop forever.
                        (loop [b (.read file)]
                          (when-not (or (= b newline) (neg? b))
                            (recur (.read file)))))
                      (.getFilePointer file)))
          ;; mapv is eager, so all seeking happens while the file is open.
          offsets (mapv align (range n))]
      ;; Offsets are non-decreasing, so distinct only drops boundaries that
      ;; coincide (which would otherwise produce zero-length chunks).
      (doall (partition 2 1 (distinct (conj offsets len)))))))
(defn read-lines-range
  "Returns a lazy sequence of the lines of file that start at or after
  start-byte and before end-byte.

  file is anything FileInputStream's constructor accepts (a path string or a
  java.io.File). start-byte is expected to be the first byte of a line. The
  content is decoded as US-ASCII, so one character corresponds to one byte,
  and lines are assumed to end in a single newline byte.

  The underlying reader is closed when the sequence is consumed to its end;
  a sequence that is abandoned part-way leaves the reader open."
  [file start-byte end-byte]
  (let [stream (FileInputStream. file)
        ;; Position through the channel: InputStream.skip is allowed to skip
        ;; fewer bytes than requested.
        _      (.position (.getChannel stream) (long start-byte))
        reader (-> stream
                   (BufferedInputStream. (* 8 131072))
                   (InputStreamReader. "US-ASCII")
                   (BufferedReader. 131072))]
    (letfn [(lines-from [remaining]
              (lazy-seq
                (if-let [^String line (and (pos? remaining) (.readLine reader))]
                  ;; readLine strips the line terminator, so count one extra
                  ;; byte for it; otherwise every chunk reads past end-byte
                  ;; into the next chunk's lines.
                  (cons line (lines-from (- remaining (.length line) 1)))
                  ;; .close returns nil, which also terminates the sequence.
                  (.close reader))))]
      (lines-from (- end-byte start-byte)))))
;; Counts the lines of ten.json, splitting the file into roughly 32 MiB
;; chunks that are read in parallel, and reports the elapsed time.
(time
  (let [fname       "ten.json"
        chunk-bytes (* 32 1024 1024)
        ;; Files smaller than one chunk would give a count of zero; always
        ;; use at least one chunk.
        chunk-count (max 1 (quot (.length (File. fname)) chunk-bytes))]
    (println chunk-count)
    (->> (chunk-file fname chunk-count)
         ;; Count inside the pmap worker: read-lines-range is lazy, so
         ;; returning the sequence itself would defer all the reading to the
         ;; calling thread and nothing would run in parallel.
         (pmap (fn [[start end]]
                 (count (read-lines-range fname start end))))
         (reduce +))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment