Last active
November 2, 2023 14:55
-
-
Save zengxinhui/7a17295781b5f5481a66e79a81345b9f to your computer and use it in GitHub Desktop.
Find Duplicate Files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bb
;; babashka/clojure
(import 'java.security.MessageDigest)

;; Root directories that are walked recursively when looking for duplicates.
(def SCAN_DIRS ["."])

;; Minimum file length in bytes; shorter files are skipped by the scan.
(def MIN_FILE_SIZE 1)

;; Size in bytes (8 KiB) of the buffer that is read from the start of each
;; file and fed to SHA-256.
(def SHA256_CHUNK_SIZE (* 8 1024))
(defn sha256
  "Returns the SHA-256 digest of `input` (a byte array) as a lowercase
  hexadecimal string, two hex digits per digest byte."
  [input]
  (let [md (MessageDigest/getInstance "SHA-256")
        digest (.digest md input)]
    ;; "%02x" renders each (signed) byte as its unsigned two-digit hex value.
    (apply str (map #(format "%02x" %) digest))))
(defn read-bytes
  "Reads up to SHA256_CHUNK_SIZE bytes from the start of `file` and returns
  them in a byte array of exactly SHA256_CHUNK_SIZE bytes. When the file is
  shorter than the chunk, the remainder of the array stays zero-filled."
  [file]
  (with-open [in (clojure.java.io/input-stream file)]
    (let [buf (byte-array SHA256_CHUNK_SIZE)]
      ;; A single `.read` may legally return fewer bytes than requested even
      ;; when more data is available, so keep reading until the buffer is
      ;; full or the stream reports end-of-file (-1).
      (loop [offset 0]
        (when (< offset SHA256_CHUNK_SIZE)
          (let [n (.read in buf offset (- SHA256_CHUNK_SIZE offset))]
            (when (pos? n)
              (recur (+ offset n))))))
      buf)))
;; Main script body: walk SCAN_DIRS, bucket files by [size, hash of leading
;; chunk], and print every group of apparent duplicates as "size fullname"
;; lines, one blank line after each group, smallest total size first.
(let [;; Map of [size hash] -> collection of {:dir parent-path, :name file-name}
      ;; for every regular file of at least MIN_FILE_SIZE bytes.
      lm-size-hash (->> SCAN_DIRS
                        (mapcat (fn [scan-dir]
                                  (->> (clojure.java.io/file scan-dir)
                                       file-seq
                                       (filter #(.isFile %))
                                       (filter #(<= MIN_FILE_SIZE (.length %))))))
                        (reduce (fn [acc file]
                                  (update acc
                                          [(.length file) (sha256 (read-bytes file))]
                                          conj
                                          {:dir (.getParent file), :name (.getName file)}))
                                {}))
      ;; Set of directories that the files of one bucket live in.
      dir-set (fn [files] (set (map :dir files)))
      ;; Every distinct directory set that holds at least one bucket with
      ;; more than one file (i.e. at least one duplicate).
      dup-dirs (->> lm-size-hash
                    (keep (fn [[_ files]]
                            (when (< 1 (count files))
                              (dir-set files))))
                    (into #{}))
      reports (for [dup-dir dup-dirs
                    :let [;; All files whose bucket spans exactly this
                          ;; directory set, annotated for reporting.
                          entries (->> lm-size-hash
                                       (mapcat (fn [[[size hash] files]]
                                                 (when (= dup-dir (dir-set files))
                                                   (map #(assoc %
                                                                :size size
                                                                :hash hash
                                                                :fullname (str (:dir %) "/" (:name %)))
                                                        files)))))
                          groups (group-by :dir entries)
                          ;; Re-order so files of the same directory are adjacent.
                          entries (mapcat identity (vals groups))]]
                (if (< 1 (count groups))
                  ;; Several directories: every bucket has >= 2 files, so all
                  ;; entries are duplicates.
                  {:size (reduce + (map :size entries)), :entries entries}
                  ;; Single directory: singleton buckets are mixed in, keep
                  ;; only real duplicates. Group by size AND hash: the hash
                  ;; covers just the zero-padded leading chunk, so files of
                  ;; different sizes can share a hash without being equal.
                  (let [dups (->> (group-by (juxt :size :hash) entries)
                                  (remove #(= 1 (count (val %))))
                                  (mapcat val))]
                    {:size (reduce + (map :size dups)), :entries dups})))]
  ;; Printing is a pure side effect, so use doseq rather than building vectors.
  (doseq [{:keys [entries]} (sort-by :size reports)]
    (doseq [{:keys [fullname size]} entries]
      (println size fullname))
    (println)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment