Last active
July 5, 2018 16:59
-
-
Save maddie/4b86f7ebc0d42fabe695bf7c4f208833 to your computer and use it in GitHub Desktop.
Find duplicate files in a directory recursively.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"crypto/sha1" | |
"encoding/json" | |
"flag" | |
"fmt" | |
"io" | |
"io/ioutil" | |
"os" | |
"path/filepath" | |
"regexp" | |
"runtime" | |
"strings" | |
"sync" | |
"time" | |
// this dependency is optional, you can always use raw value of instead of the humanized ones | |
"github.com/dustin/go-humanize" | |
) | |
var (
	// the root path for finding duplicates
	findPath = flag.String("p", "./", "path to find duplicates")
	// directories to be skipped during the walk, given as relative paths (repeatable -s flag)
	skipPaths stringSlice
	// how many hash workers should be spawned for working. this isn't really useful in most cases because
	// the hashing speed will most likely be IO bound instead of CPU bound, but for the sake of exercise
	// this option is provided and set its default to the max number of CPUs on the running machine
	concurrency = flag.Int("c", runtime.NumCPU(), "number of concurrent hash workers")
	// outputs the hashing log to a file, an empty string means no file will be written
	logFile = flag.String("l", "", "output log to file")
	// outputs the hash result to a JSON file, an empty string means no file will be written
	output = flag.String("o", "", "output result as JSON to file")
	// option to enable verbose output in os.Stdout, output to file (-o) will always be verbose
	verbose = flag.Bool("v", false, "turn on verbose logging")
	// destination for log lines; os.Stdout alone, or combined with the -l file via io.MultiWriter
	resultOutput io.Writer = os.Stdout
	// hash results: SHA-1 hex digest -> all file paths that produced it
	hashMap = make(map[string][]string)
	// unbuffered channel distributing file paths to the hash workers
	hashChan = make(chan string)
	// the open -l log file, nil when no log file was requested (used by printVerbose)
	outputFile *os.File
	// file paths this program failed to access during the walk
	// these are shown in the end summary
	failedPaths []failure
	// paths skipped because they matched the -s skip list
	skippedPaths []string
	// files that failed during hashing (note: a slice, not a map)
	hashErrors []failure
	// guards concurrent access to hashMap (and should guard hashErrors too)
	lock sync.Mutex
	// wait group tracking outstanding hash jobs
	wg sync.WaitGroup
)
// result is the JSON document written to the -o output file, summarizing
// a full scan run.
type result struct {
	// absolute working directory the scan ran against
	WorkDir string `json:"work_dir"`
	// skip list as given (possibly rebased onto the find path)
	Skip []string `json:"skip"`
	// total number of files successfully hashed
	TotalFilesHashed int `json:"total_hashed"`
	// number of digests that had more than one file (duplicate groups)
	TotalDuplicates int `json:"total_duplicates"`
	// every digest -> file paths, duplicates or not
	AllHashes map[string][]string `json:"all_hashes"`
	// only the digests with more than one file
	Duplicates map[string][]string `json:"duplicates"`
	// paths that could not be accessed during the walk
	FailedPaths []failure `json:"failed_paths"`
	// paths skipped due to the -s skip list
	SkippedPaths []string `json:"skipped_paths"`
	// files that failed during hashing
	Errors []failure `json:"errors"`
}
// failure records a path that could not be processed together with the
// error message explaining why.
type failure struct {
	Path   string `json:"path"`
	Reason string `json:"reason"`
}
// stringSlice is a repeatable flag value collecting every -s argument.
type stringSlice struct {
	strings []string
}

// String implements flag.Value, joining the collected entries with commas.
func (slice *stringSlice) String() string {
	return strings.Join(slice.strings, ",")
}

// Set implements flag.Value by appending one more entry; it never fails.
func (slice *stringSlice) Set(value string) error {
	slice.strings = append(slice.strings, value)
	return nil
}

// Contains reports whether value is exactly equal to one of the entries.
func (slice *stringSlice) Contains(value string) bool {
	for _, val := range slice.strings {
		if val == value {
			return true
		}
	}
	return false
}

// MatchesPrefix reports whether value lies inside any collected directory,
// i.e. starts with "<entry>/". The original compiled a quoted regexp
// (`^` + QuoteMeta(entry) + `/.*`) for every entry on every call; a plain
// string prefix check is exactly equivalent and avoids the per-call
// regexp compilation.
func (slice *stringSlice) MatchesPrefix(value string) bool {
	for _, val := range slice.strings {
		if strings.HasPrefix(value, val+"/") {
			return true
		}
	}
	return false
}
func main() { | |
// parse input flags | |
flag.Var(&skipPaths, "s", "paths to be skipped, use relative path. use multiple times for more paths") | |
flag.Parse() | |
// path usually defaults to working directory. if user gives an empty path, show help message and quit | |
if *findPath == "" { | |
flag.PrintDefaults() | |
os.Exit(1) | |
} | |
if len(skipPaths.strings) > 0 && !regexp.MustCompile(`^\./?`).MatchString(*findPath) { | |
for idx, p := range skipPaths.strings { | |
skipPaths.strings[idx] = filepath.Join(*findPath, p) | |
} | |
} | |
if len(skipPaths.strings) > 0 { | |
printf("skip paths: ") | |
for _, p := range skipPaths.strings { | |
printf("- %s", p) | |
} | |
} | |
printf("will be starting in 5 seconds...") | |
time.Sleep(5 * time.Second) | |
// combine output file and os.Stdout as resultOutput if an output file is given | |
if *logFile != "" { | |
// tries to open the output file, if the file already exists, it will be truncated | |
f, err := os.OpenFile(*logFile, os.O_CREATE|os.O_TRUNC|os.O_RDWR, 0644) | |
if err != nil { | |
printf("failed to open file for output: %+v", err) | |
os.Exit(1) | |
} | |
// save the file pointer for printVerbose | |
outputFile = f | |
// combine the writers | |
resultOutput = io.MultiWriter(os.Stdout, f) | |
} | |
// tries to get the given path's information first | |
dir, err := os.Stat(*findPath) | |
if err != nil { | |
printf("error opening directory: %+v", err) | |
os.Exit(1) | |
} | |
// only continue if the given path is a directory | |
if !dir.IsDir() { | |
printf("%s is not a directory", *findPath) | |
os.Exit(1) | |
} | |
// spawn hash workers | |
for i := 0; i < *concurrency; i++ { | |
go hashWorker() | |
} | |
// start a timer for calculating total time elapsed | |
start := time.Now() | |
// start walking through the given path recursively | |
walkDir(*findPath) | |
// wait for all hash workers to finish | |
wg.Wait() | |
workDir := *findPath | |
if p, err := filepath.Abs(*findPath); err == nil { | |
workDir = p | |
} | |
// from here to the end of the main function is the summarize part | |
printf("====================================================================================") | |
printf("working directory: %s", workDir) | |
printf("====================================================================================") | |
// a counter for showing how many files get hashed in the summary | |
filesTotal := 0 | |
// a counter for showing how many duplicates (group by SHA1 checksum) in the summary | |
dupes := 0 | |
// a map for recording duplicates | |
dupeMap := make(map[string][]string) | |
for hash, files := range hashMap { | |
filesTotal += len(files) | |
// more than one file has the same SHA1 checksum, duplicate(s) found | |
if len(files) > 1 { | |
dupes++ | |
var paths []string | |
for _, f := range files { | |
if workDir != *findPath { | |
f = filepath.Join(workDir, f) | |
} | |
paths = append(paths, f) | |
} | |
dupeMap[hash] = paths | |
printf("found %d duplicate entries with SHA1 hash %s", len(files), hash) | |
for _, p := range paths { | |
printf("- %s", p) | |
} | |
// newline for readability | |
printf("") | |
} | |
} | |
// summary | |
if dupes == 0 { | |
fmt.Println("no duplicates found") | |
} else { | |
printf("%d duplicates found", dupes) | |
} | |
if len(failedPaths) > 0 { | |
printf("failed to access these files/directories:") | |
for _, path := range failedPaths { | |
if workDir != *findPath { | |
path.Path = filepath.Join(workDir, path.Path) | |
} | |
printf("- %s", path) | |
} | |
} | |
// see if there's any files that failed to be hashed | |
if len(hashErrors) > 0 { | |
printf("failed to hash these files:") | |
for _, err := range hashErrors { | |
printf("- %s (%s)", err.Path, err.Reason) | |
} | |
} | |
printf("%d files hashed", filesTotal) | |
printf("total time used: %s", time.Since(start).String()) | |
// outputs result to a JSON file for later use | |
if *output != "" { | |
var result result | |
result.WorkDir = workDir | |
result.Skip = skipPaths.strings | |
result.TotalDuplicates = dupes | |
result.TotalFilesHashed = filesTotal | |
result.AllHashes = hashMap | |
result.Duplicates = dupeMap | |
result.FailedPaths = failedPaths | |
result.SkippedPaths = skippedPaths | |
result.Errors = hashErrors | |
b, err := json.Marshal(&result) | |
if err != nil { | |
printf("failed to generate result JSON: %+v", err) | |
os.Exit(1) | |
} | |
if err := ioutil.WriteFile(*output, b, 0644); err != nil { | |
printf("failed to write result JSON: %+v", err) | |
} | |
} | |
} | |
// hashWorker is the actual worker that consumes the jobs given by walkDir function | |
func hashWorker() { | |
// loop forever for incoming jobs | |
for { | |
// get a path from the channel | |
path := <-hashChan | |
// start hashing | |
hash := getFileMD5(path) | |
// skip hash errors | |
if hash != "" { | |
// add the result to the map | |
lock.Lock() | |
hashMap[hash] = append(hashMap[hash], path) | |
lock.Unlock() | |
} | |
// tell the wait group that this work is done | |
wg.Done() | |
} | |
} | |
// walkDir traverses the directory passed in recursively, passing files to hash workers and walk directories | |
// rootPath is the directory that will be walked recursively | |
func walkDir(rootPath string) { | |
filepath.Walk(rootPath, func(path string, info os.FileInfo, err error) error { | |
// skip root path given since it's already the one being scanned | |
if path == rootPath { | |
return nil | |
} | |
// skip paths | |
if skipPaths.MatchesPrefix(path) { | |
printf("path %s is in skip list, skipping", path) | |
skippedPaths = append(skippedPaths, path) | |
return nil | |
} | |
// if fails to open the given path, add it to the failedPaths array for summary, print logs, and continue | |
if err != nil { | |
printf("failed to open file or directory: %+v", err) | |
var f failure | |
f.Path = path | |
f.Reason = err.Error() | |
failedPaths = append(failedPaths, f) | |
return nil | |
} | |
// path is accessible, and if it's not a directory, pass it to the hash worker | |
if !info.IsDir() { | |
// tell wait group that a new job has been added | |
wg.Add(1) | |
hashChan <- path | |
} | |
return nil | |
}) | |
} | |
// getFileMD5 is the actual code for calculating SHA1 checksum for a given file | |
func getFileMD5(path string) string { | |
// a timer for calculating time elapsed for hashing each file | |
start := time.Now() | |
// try to access the given file, open it as read only | |
f, err := os.OpenFile(path, os.O_RDONLY, 0644) | |
if err != nil { | |
printf("error opening file for hashing: %+v", err) | |
// returns "error" as key for hashMap when there's an error occurred during hash | |
var f failure | |
f.Path = path | |
f.Reason = err.Error() | |
hashErrors = append(hashErrors, f) | |
return "" | |
} | |
// close the file when work is done | |
defer f.Close() | |
// get file properties | |
stat, err := f.Stat() | |
if err != nil { | |
printf("error getting file information: %+v", err) | |
var f failure | |
f.Path = path | |
f.Reason = err.Error() | |
hashErrors = append(hashErrors, f) | |
return "" | |
} | |
// the hash | |
h := sha1.New() | |
// copy the file content directly to the hash to avoid loading the file into memory | |
// this is for handling very large files in some cases | |
if _, err := io.Copy(h, f); err != nil { | |
printf("error reading file for hashing: %+v", err) | |
var f failure | |
f.Path = path | |
f.Reason = err.Error() | |
hashErrors = append(hashErrors, f) | |
return "" | |
} | |
// output the hash as hex string | |
hash := fmt.Sprintf("%x", h.Sum(nil)) | |
// you can substitute humanize here with just the raw value of stat.Size() to avoid external dependency | |
// be sure to change size %s to size %d if you decides to do so | |
printVerbose("hashed file %s (size %s, SHA1 %s), took %s", path, humanize.Bytes(uint64(stat.Size())), hash, time.Since(start).String()) | |
// return the hex string | |
return hash | |
} | |
// printf prints the given formatted string (with a trailing newline ensured
// by fprintf) to the resultOutput io.Writer. resultOutput is os.Stdout
// alone, or an io.MultiWriter combining os.Stdout and the log file when the
// -l flag is given.
func printf(format string, args ...interface{}) (int, error) {
	return fprintf(resultOutput, format, args...)
}
// printVerbose is for printing extra logs that are not necessarily helpful (noisy). Where it outputs depends on | |
// verbose flag (-v). If verbose = true, then it outputs to whatever resultOutput directs to (just os.Stdout or along | |
// with output file given by flag -o). Otherwise it will only write to the output file given by flag -o, if available. | |
func printVerbose(format string, args ...interface{}) (int, error) { | |
// check for verbose flag | |
if *verbose { | |
return printf(format, args...) | |
} | |
// check for output file | |
if outputFile != nil { | |
return fprintf(outputFile, format, args...) | |
} | |
// return nothing if none of the above were true/given | |
return 0, nil | |
} | |
// fprintf adds newline to printed string, and output it to the given writer | |
func fprintf(writer io.Writer, format string, args ...interface{}) (int, error) { | |
// check if there's a newline in the end, add one if there's none | |
if !strings.HasPrefix(format, "\n") { | |
format = format + "\n" | |
} | |
// print the formatted string to the given writer | |
return fmt.Fprintf(writer, format, args...) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment