Last active
October 22, 2021 20:52
-
-
Save awilliams/b025cdae00e89e7c57f8c1d3e2fa29a6 to your computer and use it in GitHub Desktop.
Duplicate image detector
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
// Duplicate image detector. | |
// | |
// Usage: | |
// ./dupimgs -dir . | |
// | |
// This will recursively search the given directory for files with | |
// {.jpg, .jpeg} extensions (currently hardcoded). For each file with | |
// such an extension, the MD5 hash of the file's contents will be calculated. | |
// After traversing all sub-directories and matching files, the paths of files | |
// with identical hashes will be printed to STDOUT. | |
import ( | |
"crypto/md5" | |
"flag" | |
"fmt" | |
"io" | |
"io/fs" | |
"os" | |
"path/filepath" | |
"strings" | |
) | |
// args holds the command-line options. The value each field is initialised
// with here serves as the default for its flag.
var args = struct{ dir string }{dir: "."}
func main() { | |
flag.StringVar(&args.dir, "dir", args.dir, "Directory to scan") | |
flag.Parse() | |
hashes := make(map[string][]string, 1024) | |
h := md5.New() | |
ignoredExts := make(map[string]int) | |
err := filepath.WalkDir(args.dir, func(path string, d fs.DirEntry, err error) error { | |
if err != nil { | |
return err | |
} | |
if d.IsDir() { | |
return nil | |
} | |
ext := strings.ToLower(filepath.Ext(d.Name())) | |
switch ext { | |
case ".jpg", ".jpeg": | |
// OK | |
default: | |
ignoredExts[ext]++ | |
return nil | |
} | |
fd, err := os.Open(path) | |
if err != nil { | |
return err | |
} | |
defer fd.Close() | |
h.Reset() | |
if _, err := io.Copy(h, fd); err != nil { | |
return err | |
} | |
hash := fmt.Sprintf("%x", h.Sum(nil)) | |
hashes[hash] = append(hashes[hash], path) | |
return nil | |
}) | |
if err != nil { | |
bail(err) | |
} | |
fmt.Printf("Inspected %d images\n\n", len(hashes)) | |
var dups int | |
for hash, paths := range hashes { | |
if len(paths) < 2 { | |
continue | |
} | |
dups++ | |
fmt.Printf("%d photos with hash %q:\n", len(paths), hash) | |
for _, p := range paths { | |
fmt.Printf(" - %s\n", p) | |
} | |
fmt.Println("===") | |
} | |
if len(ignoredExts) > 0 { | |
fmt.Println("The following file extensions were ignored:") | |
for ext, count := range ignoredExts { | |
fmt.Printf("- %-15s %d\n", ext, count) | |
} | |
} | |
if dups > 0 { | |
fmt.Printf("%d sets of duplicates found\n", dups) | |
} else { | |
fmt.Println("No duplicates found") | |
} | |
} | |
// bail writes err's message to STDERR, prefixed with "Error: ", and then
// terminates the process with exit status 1. It never returns.
func bail(err error) {
	const exitCode = 1

	msg := err.Error()
	fmt.Fprintf(os.Stderr, "Error: %s\n", msg)
	os.Exit(exitCode)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment