Skip to content

Instantly share code, notes, and snippets.

@zengxinhui
Last active November 2, 2023 14:55
Show Gist options
  • Save zengxinhui/7a17295781b5f5481a66e79a81345b9f to your computer and use it in GitHub Desktop.
Save zengxinhui/7a17295781b5f5481a66e79a81345b9f to your computer and use it in GitHub Desktop.
Find Duplicate Files
#!/usr/bin/env bb
;; babashka/clojure
(import 'java.security.MessageDigest)
;; Root directories that are walked recursively when looking for duplicates.
(def SCAN_DIRS
  ["."])

;; Minimum file length, in bytes, for a file to be considered at all.
(def MIN_FILE_SIZE 1)

;; Number of leading bytes of each file that are fed to SHA-256 (8 KiB).
(def SHA256_CHUNK_SIZE (* 8 1024))
(defn sha256
  "Returns the SHA-256 digest of the byte array `input` as a lowercase
  hexadecimal string (two hex digits per digest byte)."
  [input]
  (->> (.digest (MessageDigest/getInstance "SHA-256") input)
       ;; %02x renders each (signed) byte as its unsigned two-digit hex value.
       (map (fn [b] (format "%02x" b)))
       (apply str)))
(defn read-bytes
  "Reads up to SHA256_CHUNK_SIZE leading bytes of `file` and returns them in a
  byte array of exactly SHA256_CHUNK_SIZE bytes. When the file is shorter than
  the chunk size the remainder of the array stays zero-filled.

  `file` is anything accepted by clojure.java.io/input-stream. The stream is
  closed before returning."
  [file]
  (with-open [in (clojure.java.io/input-stream file)]
    (let [buf (byte-array SHA256_CHUNK_SIZE)]
      ;; A single InputStream.read call may legally return fewer bytes than
      ;; requested even when more data is available, so keep reading until the
      ;; buffer is full or the stream reports EOF (-1). Otherwise two identical
      ;; files could yield different buffers and therefore different hashes.
      (loop [off 0]
        (when (< off SHA256_CHUNK_SIZE)
          (let [n (.read in buf off (- SHA256_CHUNK_SIZE off))]
            (when (pos? n)
              (recur (+ off n))))))
      buf)))
;; Main script.
;;
;; 1. Walk every directory in SCAN_DIRS and index each regular file of at least
;;    MIN_FILE_SIZE bytes under the key [file-length sha256-of-leading-chunk].
;;    Files sharing a key are treated as duplicates of each other.
;; 2. Collect every distinct *set of directories* over which some duplicate
;;    group is spread (dup-dirs).
;; 3. For each such directory set, gather the files of all index groups spread
;;    over exactly that set, and print them as one block of "size fullname"
;;    lines followed by a blank line. Blocks are printed in ascending order of
;;    their total size.
(let [lm-size-hash (->> SCAN_DIRS
                        (mapcat (fn [scan-dir]
                                  (->> (clojure.java.io/file scan-dir)
                                       file-seq
                                       (filter #(.isFile %))
                                       (filter #(<= MIN_FILE_SIZE (.length %))))))
                        (reduce (fn [r file]
                                  (update r
                                          [(.length file) (sha256 (read-bytes file))]
                                          conj
                                          {:dir (.getParent file), :name (.getName file)}))
                                {}))
      ;; Set of directory sets; only index groups holding more than one file
      ;; contribute.
      dup-dirs (->> lm-size-hash
                    (keep #(when (< 1 (count (second %)))
                             (set (map :dir (second %)))))
                    (into #{}))
      reports (for [dup-dir dup-dirs
                    :let [entries (->> lm-size-hash
                                       (mapcat (fn [[[size hash] entries]]
                                                 (when (= dup-dir (set (map :dir entries)))
                                                   (map #(assoc %
                                                                :size size
                                                                :hash hash
                                                                :fullname (str (:dir %) "/" (:name %)))
                                                        entries)))))
                          groups (group-by :dir entries)
                          ;; Re-flatten so files of the same directory are adjacent.
                          entries (->> (vals groups) (mapcat identity))]]
                (if (< 1 (count groups))
                  ;; Spread over several directories: every collected group
                  ;; necessarily holds at least two files, so keep them all.
                  {:size (reduce + (map :size entries)), :entries entries}
                  ;; Single directory: the collection above also picked up the
                  ;; directory's unique files (their directory set matches too),
                  ;; so drop groups of one. Group by [size hash], not hash alone:
                  ;; the hash covers only the leading zero-padded chunk, so files
                  ;; of different sizes can share it without being duplicates.
                  (let [x (->> (group-by (juxt :size :hash) entries)
                               (remove #(= 1 (count (val %))))
                               (mapcat val))]
                    {:size (reduce + (map :size x)), :entries x})))]
  ;; Printing is purely a side effect, so use doseq instead of building vectors.
  (doseq [{:keys [entries]} (sort-by :size reports)]
    (doseq [{:keys [fullname size]} entries]
      (println size fullname))
    (println)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment