Created
February 24, 2019 22:28
-
-
Save mikeananev/b2026b712ecb73012e680805c56af45f to your computer and use it in GitHub Desktop.
Clojure compress / decompress data examples
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{:deps {org.clojure/clojure {:mvn/version "1.10.0"} | |
com.taoensso/nippy {:mvn/version "2.14.0"} | |
org.apache.commons/commons-compress {:mvn/version "1.18"}}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns lzw | |
(:require [clojure.set] | |
[clojure.java.io :refer [file output-stream input-stream] :as io] | |
[taoensso.nippy :as nippy]) | |
(:import (java.io DataOutputStream DataInputStream File) | |
java.util.zip.GZIPInputStream | |
java.util.zip.GZIPOutputStream | |
[java.util.zip ZipEntry ZipOutputStream ZipInputStream] | |
(org.apache.commons.compress.compressors CompressorStreamFactory) | |
(org.apache.commons.compress.archivers.sevenz SevenZOutputFile SevenZFile) | |
(org.apache.commons.compress.archivers ArchiveInputStream))) | |
;; GZIP | |
(defn gunzip
  "Decompresses gzip data.
  `input`  - gzip-compressed source; anything `io/input-stream` can open
             (file, URL, byte array, input stream ...).
  `output` - destination that `io/copy` can write an InputStream to
             (file, OutputStream or Writer).
  `opts`   - optional key/value pairs handed through to `io/copy`.
  The decompressing stream is closed when the copy finishes or throws."
  [input output & opts]
  (with-open [gz-in (GZIPInputStream. (io/input-stream input))]
    (apply io/copy gz-in output opts)))
(defn gzip
  "Compresses data into gzip format.
  `input`  - source that `io/copy` can read from
             (file, InputStream, Reader, byte array ...).
  `output` - destination; anything `io/output-stream` can open.
  `opts`   - optional key/value pairs handed through to `io/copy`.
  Everything copied is written through a GZIPOutputStream, which is closed
  (finishing the gzip stream) when the copy completes or throws."
  [input output & opts]
  (with-open [gz-out (GZIPOutputStream. (io/output-stream output))]
    (apply io/copy input gz-out opts)))
(comment | |
(gzip (file "big.txt") (file "big.gz")) | |
(gunzip (file "big.gz") (file "big.1.txt"))) | |
;; ZIP | |
(defn zip-file
  "Compresses a file or a whole folder into a zip archive.
  `input-file-or-folder` - file or folder to compress; folders are walked
                           recursively and only regular files are stored.
  `out-file`             - destination archive; anything `io/output-stream`
                           can open.
  Each entry is named after the file's path as returned by `.getPath`, with
  the platform separator replaced by `/`, because zip entry names must use
  forward slashes regardless of the operating system."
  [input-file-or-folder out-file]
  (with-open [zip (ZipOutputStream. (io/output-stream out-file))]
    (doseq [^File f (file-seq (io/file input-file-or-folder))
            :when (.isFile f)]
      ;; Normalise `\` to `/` so archives created on Windows are portable.
      (let [entry-name (.replace (.getPath f) File/separator "/")]
        (.putNextEntry zip (ZipEntry. entry-name))
        (io/copy f zip)
        (.closeEntry zip)))))
(comment (zip-file "./test" "test.zip")) | |
(defn unzip-file
  "Extracts a zip archive.
  `input`  - zip archive to extract; anything `io/input-stream` can open.
  `output` - folder to extract into; created on demand.
  Directory entries are created as folders; for file entries the parent
  folders are created first and the entry content is then copied out.
  Throws java.io.IOException when an entry name would resolve to a location
  outside `output` (e.g. a name containing `..`), so a crafted archive cannot
  overwrite arbitrary files. Returns nil."
  [input output]
  (let [root      (.getCanonicalFile (io/file output))
        root-path (.toPath root)]
    (with-open [stream (-> input io/input-stream ZipInputStream.)]
      (loop [entry (.getNextEntry stream)]
        (when entry
          ;; Resolve through File so both `/` and the platform separator in
          ;; entry names are understood, then canonicalise to collapse `..`.
          (let [out-file (.getCanonicalFile (File. root (.getName entry)))]
            (when-not (.startsWith (.toPath out-file) root-path)
              (throw (java.io.IOException.
                       (str "Zip entry is outside of the target folder: "
                            (.getName entry)))))
            (if (.isDirectory entry)
              (.mkdirs out-file)
              (do
                (some-> (.getParentFile out-file) .mkdirs)
                (io/copy stream out-file))))
          (recur (.getNextEntry stream)))))))
(comment (unzip-file "test.zip" "./test2")) | |
;; 7-zip | |
(defn compress7z-file
  "Compresses a single file into a new 7-zip archive.
  `input-filename` - name of the file to compress; also used as the entry
                     name inside the archive.
  `archive-name`   - name of the 7z archive to create.
  Both the input stream and the archive are closed when the function exits,
  including when reading or writing throws. Returns nil."
  [input-filename archive-name]
  (let [in-file (file input-filename)
        buf     (byte-array 1024)]
    ;; with-open closes in reverse order: the archive is finished first,
    ;; then the input stream is released.
    (with-open [in      (input-stream in-file)
                seven-z (SevenZOutputFile. (file archive-name))]
      (.putArchiveEntry seven-z
                        (.createArchiveEntry seven-z in-file input-filename))
      (loop [n (.read in buf)]
        (when (pos? n)
          (.write seven-z buf 0 n)
          (recur (.read in buf))))
      (.closeArchiveEntry seven-z))))
(comment
  (compress7z-file "big.txt" "big.7z"))
(defn decompress-7zip
  "Extracts a 7-zip archive.
  `input`  - name of the 7-zip archive to extract.
  `output` - name of the folder to extract into; created on demand.
  Directory entries are created as folders. For file entries the parent
  folders are created before the output file is opened, and the entry is
  copied in 1024-byte chunks until the archive reports the end of the entry.
  Throws java.io.IOException when an entry name would resolve to a location
  outside `output` (e.g. a name containing `..`). Returns nil."
  [input output]
  (let [root      (.getCanonicalFile (file output))
        root-path (.toPath root)
        buf       (byte-array 1024)]
    (with-open [s7-zip-archive (SevenZFile. (file input))]
      (loop [entry (.getNextEntry s7-zip-archive)]
        (when entry
          ;; Resolve through File and canonicalise so `..` segments collapse
          ;; before the containment check.
          (let [out-file (.getCanonicalFile (File. root (.getName entry)))]
            (when-not (.startsWith (.toPath out-file) root-path)
              (throw (java.io.IOException.
                       (str "7z entry is outside of the target folder: "
                            (.getName entry)))))
            (if (.isDirectory entry)
              (.mkdirs out-file)
              (do
                ;; The parent must exist before the output stream is opened.
                (some-> (.getParentFile out-file) .mkdirs)
                (with-open [out (output-stream out-file)]
                  (loop [n (.read s7-zip-archive buf)]
                    (when (pos? n)
                      (.write out buf 0 n)
                      (recur (.read s7-zip-archive buf))))))))
          (recur (.getNextEntry s7-zip-archive)))))))
(comment | |
(decompress-7zip "big.7z" "./")) | |
;; LZW | |
(defn make-dict
  "Builds the initial LZW compression dictionary: a map from every
  single-element vector [0] .. [255] to the code of the same value."
  []
  (into {}
        (map (fn [code] [[code] code]))
        (range 256)))
(defn compress-
  "Single LZW compression step, used as the reducing function of `compress`.
  State map keys:
    :dict  - map from byte sequences (vectors) to codes
    :index - next free code
    :w     - sequence matched so far
    :out   - vector of codes emitted so far
  `b` is the next input byte. It is reduced to its unsigned value (0..255)
  first, because JVM bytes are signed and values >= 0x80 arrive as negative
  numbers that are not keys of the initial dictionary.
  If `w` extended by `b` is already known, the match simply grows; otherwise
  the code for `w` is emitted, the extended sequence is registered under
  `index`, and matching restarts from `[b]`."
  [{:keys [dict index w out] :as a} b]
  (let [b      (bit-and (long b) 0xFF)
        buffer (conj w b)]
    (if (contains? dict buffer)
      (assoc a :w buffer)
      {:dict  (assoc dict buffer index)
       :index (inc index)
       :out   (conj out (get dict w))
       :w     [b]})))
(defn compress
  "LZW-compresses `data`, a sequence (or array) of bytes, and returns a vector
  of integer codes. Codes below 256 stand for single bytes; larger codes
  refer to sequences learned while compressing.
  Empty or nil input yields an empty vector."
  [data]
  (let [initial-data         {:dict (make-dict) :index 256 :w [] :out []}
        {:keys [dict w out]} (reduce compress- initial-data (seq data))]
    ;; `w` is only empty when no input was consumed; in that case there is no
    ;; pending match to flush and emitting (get dict []) would add a nil code.
    (if (empty? w)
      out
      (conj out (get dict w)))))
(defn decompress-
  "Single LZW decompression step, used as the reducing function of
  `decompress`.
  State map keys:
    :dict  - map from codes to byte sequences (vectors)
    :index - next free code
    :w     - sequence decoded in the previous step
    :out   - vector of decoded items so far
  `code` is looked up in `dict`; a code that is not there yet is decoded as
  `w` followed by the first element of `w`. The previous sequence extended by
  the first element of the decoded entry is then registered under `index`."
  [{:keys [dict index w out]} code]
  (let [known   (find dict code)
        entry   (if known
                  (val known)
                  (conj w (first w)))
        learned (conj w (first entry))]
    {:w     entry
     :out   (conj out entry)
     :index (inc index)
     :dict  (assoc dict index learned)}))
(defn decompress
  "Reverses `compress`: takes a sequence of LZW codes and returns a flat
  sequence of the decoded byte values.
  Empty or nil input yields an empty sequence."
  [data]
  (if (empty? data)
    ;; Without this guard the seed below would be [nil] and a single nil
    ;; would be returned as if it were decoded data.
    ()
    (let [f            [(first data)]
          initial-data {:dict  (clojure.set/map-invert (make-dict))
                        :index 256
                        :w     f
                        :out   f}
          result       (reduce decompress- initial-data (rest data))]
      (flatten (:out result)))))
;; end of LZW | |
(comment | |
(String. (byte-array (decompress (compress (.getBytes "ABACABACABADE"))))) | |
(nippy/freeze-to-file "a.lzw" (compress (.getBytes "ABACABACABADE"))) | |
(String. (byte-array (decompress (nippy/thaw-from-file "a.lzw")))) | |
(def s (slurp "src/lzw.clj")) | |
(def sb (.getBytes s)) | |
(nippy/freeze-to-file "lzw.lzw" (compress sb)) | |
(def ds (String. (byte-array (decompress (nippy/thaw-from-file "lzw.lzw"))))) | |
(= s ds) | |
(def cb (compress sb)) | |
(def db (decompress cb)) | |
(def ds (String. (byte-array db))) | |
(= ds s) | |
(count cb) | |
(count db) | |
(def cb (short-array (compress sb))) | |
(alength cb) | |
(with-open [out (DataOutputStream. (output-stream (file "lzw.lzw")))] | |
(run! (fn [w] (.writeShort out w)) cb)) | |
(def in (DataInputStream. (input-stream (file "lzw.lzw")))) | |
;;(def buf (short-array (/ (.length ^java.io.File (file "lzw.lzw")) 2))) | |
;;(alength buf) | |
(def rb (loop [buf [] | |
avail (> (.available in) 0)] | |
(if-not avail | |
buf | |
(recur (conj buf (.readShort in)) (> (.available in) 0))))) | |
(count rb) | |
(def dd (decompress rb)) | |
(String. (byte-array dd)) | |
(def s (slurp "big.txt")) | |
(def sb (.getBytes s)) | |
(alength sb) | |
(def cb (int-array (compress sb))) | |
(count cb) | |
(with-open [out (DataOutputStream. (output-stream (file "big.lzw")))] | |
(run! (fn [w] (.writeInt out w)) cb)) | |
(def in (DataInputStream. (input-stream (file "big.lzw")))) | |
;;(def buf (short-array (/ (.length ^java.io.File (file "lzw.lzw")) 2))) | |
;;(alength buf) | |
(def rb (loop [buf [] | |
avail (> (.available in) 0)] | |
(if-not avail | |
buf | |
(recur (conj buf (.readInt in)) (> (.available in) 0))))) | |
(count rb) | |
(def dd (decompress rb)) | |
(String. (byte-array dd)) | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
See also LZW compression