Skip to content

Instantly share code, notes, and snippets.

@mikeananev
Last active February 16, 2020 16:35
Show Gist options
  • Select an option

  • Save mikeananev/6f0a80d244e5aa29c62a3c798380e61e to your computer and use it in GitHub Desktop.

Select an option

Save mikeananev/6f0a80d244e5aa29c62a3c798380e61e to your computer and use it in GitHub Desktop.
Clojure RLE encoder/decoder
(ns org.rssys.rle.core
(:gen-class)
(:require [org.rssys.rle.naive :as rle1]
[org.rssys.rle.lazy :as rle2]))
(defn -main
"entry point to program."
[& args]
(condp = (first args)
"e" (do
(println "compress mode.")
(rle1/compress-file (second args) (first (drop 2 args))))
"d" (do
(println "decompress mode")
(rle1/decompress-file (second args) (first (drop 2 args))))
"e2" (do
(println "compress (lazy) mode.")
(rle2/compress-file (second args) (first (drop 2 args))))
(do
(println "help: <e|e2|d> <infile> <outfile>")))
(flush)
(System/exit 0))
(comment
(require '[hashp.core])
(compress-file "a1.txt" "a1.rle")
(decompress-file "a1.rle" "a2.txt"))
{
:mvn/repos {"clojars" {:url "https://repo.clojars.org/"}
"central" {:url "https://repo1.maven.org/maven2/"}}
;; don't change target/classes (do not remove it from :paths and
;; do not rename it, otherwise edit build.clj)
:paths ["src" "resources" "target/classes"]
:deps {
org.clojure/clojure {:mvn/version "1.10.1"}
}
:build {
;; uncomment :java-source-paths and :javac-options if you need to compile java sources
;; :java-source-paths "java-src"
;; ( default options: ["-cp" "src:target/classes" "-target" "1.8" "-source" "1.8" "-Xlint:-options"] )
;; :javac-options ["-cp" "src:target/classes" "-target" "1.8" "-source" "1.8" "-Xlint:-options"]
;; uncomment this if you want to specify uberjar name,
;; otherwise it will be artefact-version-standalone.jar
;;:uberjar-name "rle.jar"
:omit-source true
:main "org.rssys.rle.core"
:group-id "org.rssys"
:artifact-id "rle"
:artifact-version "0.1.0-SNAPSHOT"
}
:aliases {
:run {:extra-paths ["resources"]
:jvm-opts ["-Duser.timezone=UTC"
"-DLOG4J_CONFIGURATION_FILE=resources/log4j2.xml"]
:main-opts ["--main" "org.rssys.rle.core"]}
;; to run repl: clojure -R:bg -A:repl
:repl {:extra-deps {criterium {:mvn/version "0.4.5"}
nrepl {:mvn/version "0.6.0"}
healthsamurai/matcho {:mvn/version "0.3.3"}
hashp {:mvn/version "0.1.1"}}
:jvm-opts ["-Duser.timezone=UTC"]
:extra-paths ["dev/src" "resources" "test"]
:main-opts ["--main" "nrepl.cmdline"]}
:test {:extra-deps {lambdaisland/kaocha {:mvn/version "0.0-573"}
lambdaisland/kaocha-cloverage {:mvn/version "0.0-41"}
healthsamurai/matcho {:mvn/version "0.3.3"}}
:extra-paths ["resources" "test" "test/resources"]
:jvm-opts ["-Duser.timezone=UTC"
;;"-DLOG4J_CONFIGURATION_FILE=test/resources/log4j2-test.xml"
]
:main-opts ["--main" "kaocha.runner"]}
:bg {:extra-deps {badigeon/badigeon {:git/url "https://github.com/EwenG/badigeon.git"
:sha "c5d7d8f9c44fee2f193ef924cdf8a485aee539c5"
}}}
;; build uberjar (compile java, clj files): clojure -R:bg -A:uberjar
:uberjar {:main-opts ["-e" "(load-file,\"dev/src/build.clj\"),(build/clean),(build/uberjar)"]}
;; compile java classes only: clojure -R:bg -A:javac
:javac {:main-opts ["-e" "(load-file,\"dev/src/build.clj\"),(build/compile-java)"]}
;; run on Java 9+ to build standalone app: clojure -R:bg -A:standalone
:standalone {:main-opts ["-e" "(load-file,\"dev/src/build.clj\"),(build/clean),(build/standalone)"]}
}
}
(ns org.rssys.rle.lazy
(:require [clojure.java.io :as io])
(:import (java.io InputStream OutputStream)
(java.util Arrays)))
;; byte-seq creates a lazy chunked sequence of bytes from an InputStream.
;; It takes the optimal chunk "size" to model a physical resource
;; (for a disk this would be the block size).
(defn byte-seq [^InputStream is size]
(let [ib (byte-array size)]
((fn step []
(lazy-seq
(let [n (.read is ib)]
(when (not= -1 n)
(let [cb (chunk-buffer size)]
(dotimes [i size] (chunk-append cb (aget ib i)))
(chunk-cons (chunk cb) (step))))))))))
(defn byte-seq2
[^InputStream is]
(lazy-seq
(let [b (.read is)]
(when (not= -1 b)
(cons b (byte-seq2 is))))))
;; RLE encoding transducer
(def rle-encoder
(comp
(partition-by identity)
(map #(vector (count %) (first %)))
(map (fn [x] (if (= 1 (first x)) (second x) x)))
(partition-by vector?)
(map (fn [x] (if (vector? (first x)) x [(- (count x)) (into [] x)])))))
(defn write-bytes-array
"# write encoded bytes to output stream
if `len` > 0 then ba should contain one repeat byte.
if `len` < 0 then ba should contain non repeat bytes.
if ba > -128/127 then ba splits on blocks 128/127 bytes length."
[^OutputStream out-stream ba len]
(if (not (empty? ba))
(cond
(pos-int? len) (if (>= len Byte/MAX_VALUE)
(let [m (mod len Byte/MAX_VALUE)]
(dotimes [n (quot len Byte/MAX_VALUE)]
(.write out-stream (byte-array [(byte Byte/MAX_VALUE) (first ba)])))
(when (> m 0) (.write out-stream (byte-array [(byte m) (first ba)]))))
(.write out-stream (byte-array [(byte len) (first ba)])))
(neg-int? len) (if (>= Byte/MIN_VALUE len)
(let [int-len (- Byte/MIN_VALUE)
int-coll (partition int-len ba)
rest-len (- (- len) (* (count int-coll) int-len))]
(doseq [next-coll int-coll]
(.write out-stream (byte-array (into [(byte Byte/MIN_VALUE)] next-coll))))
(when (> rest-len 0)
(.write out-stream (byte-array (into [(byte (- rest-len))] (drop (* (count int-coll) int-len) ba))))))
(.write out-stream (byte-array (into [(byte len)] ba)))))))
(defn compress-file
[^String infile ^String outfile]
(with-open [in-stream (io/input-stream infile)
out-stream (io/output-stream outfile)]
(let [bs (byte-seq2 in-stream)]
(doseq [item (eduction rle-encoder bs)]
(if (coll? (first item))
(doseq [i item]
(write-bytes-array out-stream [(second i)] (first i)))
(write-bytes-array out-stream (second item) (first item)))))))
(defn decompress-file
"RLE decompressor"
[in-filename out-filename]
(let [out-bytes-counter (atom 0)]
(with-open [in-stream (io/input-stream in-filename)
out-stream (io/output-stream out-filename)]
(loop [next-byte (.read in-stream)]
(if (= next-byte -1)
(printf "decompressed size: %d\n" @out-bytes-counter)
(do
(if (neg-int? (unchecked-byte next-byte))
(let [ba (byte-array (- (unchecked-byte next-byte)))
len (.read in-stream ba)]
(.write out-stream ba)
(swap! out-bytes-counter + len))
(let [len (unchecked-byte next-byte)
ba (byte-array len)
data (.read in-stream)
_ (Arrays/fill ba (unchecked-byte data))]
(.write out-stream ba)
(swap! out-bytes-counter + len)))
(recur (.read in-stream))))))))
(comment
(compress-file "LICENSE" "abc.rle")
(decompress-file "abc.rle" "abc.txt")
(compress-file "abc.txt" "abc.rle")
(def s1 "abccchhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhq")
(def e (eduction rle-encoder (.getBytes s1)))
(def s2 (apply str (map char (rle-decode e))))
(= s1 s2))
(ns org.rssys.rle.naive
(:require [clojure.java.io :as io])
(:import (java.util Arrays)))
(def EOF -1)
(defn write-bytes-array
[^java.io.OutputStream out-stream ba repeated-bytes?]
(if (not (empty? ba))
(let [bytes-count (if repeated-bytes? (count ba) (- (count ba)))]
(if repeated-bytes?
(.write out-stream (byte-array (into [(byte bytes-count)] [(first ba)])))
(.write out-stream (byte-array (into [(byte bytes-count)] ba))))
(if repeated-bytes?
2
(inc (count ba))))
0))
(defn repeat-detected?
[out-bytes-array next-byte]
(= next-byte (peek out-bytes-array)))
(defn compress-file
"naive RLE compressor"
[in-filename out-filename]
(let [in-bytes-counter (atom 0)
out-bytes-counter (atom 0)]
(with-open [in-stream (io/input-stream in-filename)
out-stream (io/output-stream out-filename)]
(loop [next-byte (.read in-stream)
out-bytes-array []
repeat-found? false]
(if (= next-byte EOF)
(swap! out-bytes-counter + (write-bytes-array out-stream out-bytes-array repeat-found?))
(do
(swap! in-bytes-counter inc)
(if (and (not repeat-found?) (repeat-detected? out-bytes-array next-byte))
(do
(swap! out-bytes-counter + (write-bytes-array out-stream (pop out-bytes-array) repeat-found?))
(recur (.read in-stream) [next-byte next-byte] true))
(if (and (not repeat-found?) (not (repeat-detected? out-bytes-array next-byte)) (< Byte/MIN_VALUE (- (count out-bytes-array))))
(recur (.read in-stream) (conj out-bytes-array next-byte) false)
(if (and (not repeat-found?) (>= Byte/MIN_VALUE (- (count out-bytes-array))))
(do
(swap! out-bytes-counter + (write-bytes-array out-stream out-bytes-array repeat-found?))
(recur (.read in-stream) [next-byte] false))
(if (and repeat-found? (repeat-detected? out-bytes-array next-byte) (>= (count out-bytes-array) Byte/MAX_VALUE))
(do
(swap! out-bytes-counter + (write-bytes-array out-stream out-bytes-array repeat-found?))
(recur (.read in-stream) [next-byte] true))
(if (and repeat-found? (repeat-detected? out-bytes-array next-byte) (< (count out-bytes-array) Byte/MAX_VALUE))
(recur (.read in-stream) (conj out-bytes-array next-byte) true)
(if (and repeat-found? (not (repeat-detected? out-bytes-array next-byte)))
(do
(swap! out-bytes-counter + (write-bytes-array out-stream out-bytes-array repeat-found?))
(recur (.read in-stream) [next-byte] false))
(println "error in compressor!")))))))))))
(printf "source size: %d, compressed size: %d, ratio: %.3f\n" @in-bytes-counter @out-bytes-counter (/ (float @out-bytes-counter) @in-bytes-counter))))
(defn decompress-file
"RLE decompressor"
[in-filename out-filename]
(let [out-bytes-counter (atom 0)]
(with-open [in-stream (io/input-stream in-filename)
out-stream (io/output-stream out-filename)]
(loop [next-byte (.read in-stream)]
(if (= next-byte EOF)
(printf "decompressed size: %d\n" @out-bytes-counter)
(do
(if (neg-int? (unchecked-byte next-byte))
(let [ba (byte-array (- (unchecked-byte next-byte)))
len (.read in-stream ba)]
(.write out-stream ba)
(swap! out-bytes-counter + len))
(let [len (unchecked-byte next-byte)
ba (byte-array len)
data (.read in-stream)
_ (Arrays/fill ba (unchecked-byte data))]
(.write out-stream ba)
(swap! out-bytes-counter + len)))
(recur (.read in-stream))))))))
(comment
(decompress-file "abc.rle" "abc.txt")
(decompress-file "a.dat2" "a2.jar"))
(ns org.rssys.rle.naive-test
(:require [clojure.test :refer [deftest testing is]]
[matcho.core :refer [match]]
[org.rssys.rle.lazy :as sut]
[clojure.java.io :as io]))
(deftest compress-file-test
(testing "1 byte"
(let [s "a"
infile "a.txt"
outfile "a.rle"
decfile "a2.txt"]
(spit infile s)
(sut/compress-file infile outfile)
(sut/decompress-file outfile decfile)
(is (= s (slurp decfile)))
(io/delete-file infile)
(io/delete-file outfile)
(io/delete-file decfile)))
(testing "5 bytes"
(let [s "abcde"
infile "a.txt"
outfile "a.rle"
decfile "a2.txt"]
(spit infile s)
(sut/compress-file infile outfile)
(sut/decompress-file outfile decfile)
(is (= s (slurp decfile)))
(io/delete-file infile)
(io/delete-file outfile)
(io/delete-file decfile)))
(testing "2 equal bytes"
(let [s "aa"
infile "a.txt"
outfile "a.rle"
decfile "a2.txt"]
(spit infile s)
(sut/compress-file infile outfile)
(sut/decompress-file outfile decfile)
(is (= s (slurp decfile)))
(io/delete-file infile)
(io/delete-file outfile)
(io/delete-file decfile)))
(testing "15 bytes"
(let [s "1234567890abcdf"
infile "a.txt"
outfile "a.rle"
decfile "a2.txt"]
(spit infile s)
(sut/compress-file infile outfile)
(sut/decompress-file outfile decfile)
(is (= s (slurp decfile)))
(io/delete-file infile)
(io/delete-file outfile)
(io/delete-file decfile)))
(testing "15 equal bytes"
(let [s (apply str (repeat 15 "e"))
infile "a.txt"
outfile "a.rle"
decfile "a2.txt"]
(spit infile s)
(sut/compress-file infile outfile)
(sut/decompress-file outfile decfile)
(is (= s (slurp decfile)))
(io/delete-file infile)
(io/delete-file outfile)
(io/delete-file decfile)))
(testing "129 equal bytes"
(let [s (apply str (repeat 129 "e"))
infile "a.txt"
outfile "a.rle"
decfile "a2.txt"]
(spit infile s)
(sut/compress-file infile outfile)
(sut/decompress-file outfile decfile)
(is (= s (slurp decfile)))
(io/delete-file infile)
(io/delete-file outfile)
(io/delete-file decfile)))
(testing "128 not equal bytes"
(let [s (apply str (repeat 64 "ea"))
infile "a.txt"
outfile "a.rle"
decfile "a2.txt"]
(spit infile s)
(sut/compress-file infile outfile)
(sut/decompress-file outfile decfile)
(is (= s (slurp decfile)))
(io/delete-file infile)
(io/delete-file outfile)
(io/delete-file decfile)))
(testing "130 not equal bytes"
(let [s (apply str (repeat 65 "ea"))
infile "a.txt"
outfile "a.rle"
decfile "a2.txt"]
(spit infile s)
(sut/compress-file infile outfile)
(sut/decompress-file outfile decfile)
(is (= s (slurp decfile)))
(io/delete-file infile)
(io/delete-file outfile)
(io/delete-file decfile)))
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment