Created
October 3, 2016 11:50
-
-
Save wolf0403/07de296b5cb131b54099392b9ab97207 to your computer and use it in GitHub Desktop.
Scan multiple fs roots, looking for dup files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"crypto/sha1" | |
"encoding/json" | |
"flag" | |
"fmt" | |
"hash/crc64" | |
"io/ioutil" | |
"log" | |
"os" | |
"path/filepath" | |
"regexp" | |
"strconv" | |
"strings" | |
) | |
const config = "config.json" | |
var ( | |
writeback = flag.Bool("w", true, "write back fileinfo.json") | |
crcTable = crc64.MakeTable(crc64.ECMA) | |
) | |
type FolderInfo struct { | |
Name string `json:"name"` | |
Children []string `json:"children"` | |
} | |
type FileInfo struct { | |
Name string `json:"name"` | |
Size int64 `json:"size"` | |
First1k string `json:"cksum1k,omitempty"` | |
Cksum string `json:"cksum,omitempty"` | |
Sha1 string `json:"sha1,omitempty"` | |
} | |
type Runtime struct { | |
Roots []string `json:"roots"` | |
Filters map[string]bool `json:"filter"` | |
Files map[string]*FileInfo `json:"files"` | |
Folders map[string]*FolderInfo `json:"-"` | |
Dups [][]string `json:"dups"` | |
Links map[string][]string `json:"links"` | |
config string | |
} | |
func (rt *Runtime) Dump(key string) { | |
b, err := json.MarshalIndent(rt, "", " ") | |
if err != nil { | |
log.Fatalf("Error generating result: %v", err) | |
} | |
if err := ioutil.WriteFile(rt.config+key, b, 0644); err != nil { | |
log.Fatalf("Error writing result: %v", err) | |
} | |
} | |
// keyFn focus on low false same - | |
// if keyFn(f1) != keyFn(f2) then f1 != f2 | |
type keyFn func(fi *FileInfo) (string, error) | |
func keySize(fi *FileInfo) (string, error) { | |
return strconv.Itoa(int(fi.Size)), nil | |
} | |
func keyCRC64(fi *FileInfo) (string, error) { | |
b, err := ioutil.ReadFile(fi.Name) | |
if err != nil { | |
return "", fmt.Errorf("Error ReadFile %q: %v", fi.Name, err) | |
} | |
crc := crc64.Checksum(b, crcTable) | |
return fmt.Sprintf("%v", crc), nil | |
} | |
func keySHA1(fi *FileInfo) (string, error) { | |
b, err := ioutil.ReadFile(fi.Name) | |
if err != nil { | |
return "", fmt.Errorf("Error ReadFile %q: %v", fi.Name, err) | |
} | |
return fmt.Sprintf("%v", sha1.Sum(b)), nil | |
} | |
type fnConfig struct { | |
name string | |
fn keyFn | |
} | |
func (rt *Runtime) Dedup() error { | |
return rt.dedupByKey(rt.Files, []fnConfig{ | |
{"size", keySize}, | |
{"crc64", keyCRC64}, | |
{"sha1", keySHA1}, | |
}) | |
} | |
func (rt *Runtime) dedupByKey(files map[string]*FileInfo, fns []fnConfig) error { | |
log.Printf("Dedup by Key %s on %d files", fns[0].name, len(files)) | |
buckets := map[string][]*FileInfo{} | |
for _, fi := range files { | |
key, err := fns[0].fn(fi) | |
if err != nil { | |
log.Printf("Error get key of %q: %v", fi.Name, err) | |
continue | |
} | |
files, ok := buckets[key] | |
if !ok { | |
files = []*FileInfo{} | |
} | |
buckets[key] = append(files, fi) | |
} | |
for _, fis := range buckets { | |
if len(fis) == 1 { | |
log.Printf("unique: %q", fis[0].Name) | |
delete(rt.Files, fis[0].Name) | |
continue | |
} | |
if len(fns) == 1 { | |
// all keys tried, final. | |
names := []string{} | |
for _, fi := range fis { | |
names = append(names, fi.Name) | |
} | |
rt.Dups = append(rt.Dups, names) | |
continue | |
} | |
names := map[string]*FileInfo{} | |
for _, fi := range fis { | |
names[fi.Name] = fi | |
} | |
rt.dedupByKey(names, fns[1:]) | |
} | |
return nil | |
} | |
func getFolderInfo(path string, rt *Runtime) *FolderInfo { | |
if path == "" { | |
path = "." | |
} | |
path = strings.TrimSuffix(path, "/") + "/" | |
d, ok := rt.Folders[path] | |
if ok { | |
return d | |
} | |
p := filepath.Dir(path) | |
if p+"/" != path { | |
pdir := getFolderInfo(p, rt) | |
pdir.Children = append(pdir.Children, path) | |
} | |
d = &FolderInfo{Name: path} | |
rt.Folders[path] = d | |
return d | |
} | |
func getFileInfo(path string, dir *FolderInfo, rt *Runtime) *FileInfo { | |
f, ok := rt.Files[path] | |
if !ok { | |
f = &FileInfo{Name: path} | |
dir.Children = append(dir.Children, path) | |
rt.Files[path] = f | |
} | |
return f | |
} | |
func match(path, filter string) bool { | |
b, err := regexp.MatchString(filter, path) | |
if err != nil { | |
log.Panicf("match %q - %q failed: %v", path, filter, err) | |
} | |
return b | |
} | |
func ScanFS(root string, rt *Runtime) error { | |
log.Printf("Scanfs %q", root) | |
fs := []string{} | |
for filter, enabled := range rt.Filters { | |
if !enabled { | |
continue | |
} | |
fs = append(fs, filter) | |
} | |
filtered := func(path string) bool { | |
for _, f := range fs { | |
if match(path, f) { | |
return true | |
} | |
} | |
return false | |
} | |
walkFn := func(path string, info os.FileInfo, _ error) error { | |
if info == nil { | |
log.Printf("info is nil - %q", path) | |
return nil | |
} | |
if filtered(path) { | |
log.Printf("filtered: %q", path) | |
if info.IsDir() { | |
return filepath.SkipDir | |
} | |
return nil | |
} | |
if info.Mode()&os.ModeSymlink != 0 { | |
target, err := os.Readlink(path) | |
if err != nil { | |
log.Printf("readlink(%q) failed: %v", path, err) | |
return nil | |
} | |
rt.Links[target] = append(rt.Links[target], path) | |
return nil | |
} | |
if info.IsDir() { | |
getFolderInfo(path, rt) | |
return nil | |
} | |
dir := filepath.Dir(path) | |
d := getFolderInfo(dir, rt) | |
f := getFileInfo(path, d, rt) | |
f.Size = info.Size() | |
return nil | |
} | |
err := filepath.Walk(root, walkFn) | |
rt.Dump("-scanfs") | |
return err | |
} | |
func main() { | |
log.SetFlags(log.LstdFlags | log.Lshortfile) | |
b, err := ioutil.ReadFile(config) | |
if err != nil { | |
log.Fatalf("Error loading roots: %v", err) | |
} | |
rt := Runtime{config: config} | |
if err := json.Unmarshal(b, &rt); err != nil { | |
log.Fatalf("Error loading config: %v", err) | |
} | |
if rt.Files == nil { | |
rt.Files = map[string]*FileInfo{} | |
} | |
if rt.Folders == nil { | |
rt.Folders = map[string]*FolderInfo{} | |
} | |
if rt.Dups == nil { | |
rt.Dups = [][]string{} | |
} | |
if rt.Links == nil { | |
rt.Links = map[string][]string{} | |
} | |
if len(rt.Files) == 0 { | |
if len(rt.Roots) == 0 { | |
rt.Roots = []string{"."} | |
} | |
for _, line := range rt.Roots { | |
ScanFS(line, &rt) | |
} | |
} | |
rt.Dedup() | |
b, err = json.MarshalIndent(rt, "", " ") | |
if err != nil { | |
log.Fatalf("Error generating result: %v", err) | |
} | |
if *writeback { | |
if err := ioutil.WriteFile(config, b, 0644); err != nil { | |
log.Fatalf("Error writing result: %v", err) | |
} | |
} else { | |
fmt.Println(string(b)) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment