Created
January 4, 2021 20:06
-
-
Save anatolebeuzon/5a147269675ed50e9d0d60dd99abd524 to your computer and use it in GitHub Desktop.
CLI: set difference of two folders
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Suppose you have two folders, A and B, containing various files.
// You want to get the files in A that are not in B. This is what this CLI provides.
// A is called the 'in' set and B is called the 'discard' set.
// Items in A that are also in B will be _removed_ from A.
// File equality is based on MD5 hashes, not filenames.
package main

import (
	"crypto/md5"
	"encoding/hex"
	"flag"
	"fmt"
	"io"
	"os"
	"path/filepath"
)
// flags holds the command-line options. The fields are bound to their
// command-line flags in init.
var flags struct {
	discard string // folder whose files should be removed from the 'in' folder
	in      string // folder containing a mix of files to keep and to discard
	dryRun  bool   // when true, matching files are only counted, never removed
}
func init() { | |
flag.StringVar(&flags.discard, "discard", "", "folder containing the files you don't want to keep if they are found in the 'in' folder") | |
flag.StringVar(&flags.in, "in", "", "folder containing a mix of files to keep and to discard") | |
flag.BoolVar(&flags.dryRun, "dry-run", true, "by default, dry run is enabled, so no file is actually removed") | |
flag.Parse() | |
if flags.discard == "" || flags.in == "" { | |
flag.Usage() | |
os.Exit(1) | |
} | |
} | |
func main() { | |
fmt.Println("Loading MD5 hashes of files to discard...") | |
discard := make(map[string]struct{}) | |
doWithProgress(flags.discard, func(path string) { | |
discard[hash(path)] = struct{}{} | |
}) | |
fmt.Println("Looking for items to discard in the 'in' folder...") | |
deleted := 0 | |
doWithProgress(flags.in, func(path string) { | |
if _, ok := discard[hash(path)]; ok { | |
if !flags.dryRun { | |
err := os.Remove(path) | |
if err != nil { | |
panic(err) | |
} | |
} | |
deleted++ | |
} | |
}) | |
log := fmt.Sprintf("deleted %d files", deleted) | |
if flags.dryRun { | |
fmt.Println("would have " + log) | |
fmt.Println("To actually remove deduped files, re-run the command with '-dry-run=false'") | |
} else { | |
fmt.Println(log) | |
} | |
} | |
func doWithProgress(folder string, doFn func(path string)) { | |
todo := countFiles(folder) | |
done := 0 | |
filepath.Walk(folder, func(path string, info os.FileInfo, err error) error { | |
fmt.Printf("\r%d%%", int(100*done/todo)) | |
if !info.IsDir() { | |
doFn(path) | |
done++ | |
} | |
return nil | |
}) | |
fmt.Println("\r100%") | |
} | |
// countFiles returns the number of non-directory entries found while walking
// folder. Entries the walk cannot inspect are skipped rather than counted, so
// a missing or partly unreadable folder yields a smaller count, not a panic.
func countFiles(folder string) int {
	files := 0
	// The callback never returns an error, so Walk itself always returns nil.
	_ = filepath.Walk(folder, func(_ string, info os.FileInfo, err error) error {
		if err != nil {
			// info can be nil when err is set (e.g. folder does not exist).
			return nil
		}
		if !info.IsDir() {
			files++
		}
		return nil
	})
	return files
}
// hash returns the lowercase hexadecimal MD5 digest of the *contents* of the
// file at the provided path.
//
// The file is streamed into the hasher with io.Copy rather than read into
// memory first. hash panics if the file cannot be opened or read.
//
// MD5 is used here only to detect identical content; it is not
// collision-resistant against deliberately crafted files.
func hash(path string) string {
	f, err := os.Open(path)
	if err != nil {
		panic(err)
	}
	defer f.Close()

	h := md5.New()
	if _, err := io.Copy(h, f); err != nil {
		panic(err)
	}
	return hex.EncodeToString(h.Sum(nil))
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment