Last active
April 2, 2019 08:58
-
-
Save rjeczalik/06c5967654870a7f24defeeb12e1f26f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"bytes" | |
"errors" | |
"fmt" | |
"io" | |
"os" | |
"path/filepath" | |
"strconv" | |
"strings" | |
) | |
// usage is the command-line synopsis that main prints (via die) when the
// program is started with an unexpected number of arguments.
const usage = "usage: dedupindex BACKUP_META_DIR|BACKUP_META_FILE [COMPARE_META_FILE]"
// die writes v, formatted with its default representation and followed by a
// newline, to standard error and then terminates the process with exit
// status 1. It does not return.
func die(v interface{}) {
	fmt.Fprintf(os.Stderr, "%v\n", v)
	os.Exit(1)
}
func main() { | |
switch len(os.Args) { | |
case 2: | |
if err := summary(os.Args[1]); err != nil { | |
die(err) | |
} | |
case 3: | |
if err := compare(os.Args[1], os.Args[2]); err != nil { | |
die(err) | |
} | |
default: | |
die(usage) | |
} | |
} | |
// object is one parsed line of a backup metadata file.
type object struct {
	key  string // first tab-separated field
	hash string // second tab-separated field
	size int    // third tab-separated field, in bytes
}

// parseObject parses a single metadata line of the form
//
//	KEY<TAB>HASH<TAB>SIZE
//
// and returns the corresponding object.
//
// An error is returned when the line does not consist of exactly three
// tab-separated fields, when KEY or HASH is empty, when SIZE is not a valid
// integer, or when SIZE is negative. For an unparsable SIZE the underlying
// strconv error is wrapped, so callers can inspect it with errors.Is/As.
func parseObject(line string) (*object, error) {
	w := strings.Split(line, "\t")
	if len(w) != 3 || w[0] == "" || w[1] == "" {
		return nil, errors.New("malformed line: " + line)
	}

	size, err := strconv.Atoi(w[2])
	if err != nil {
		// Keep the offending line in the message; the strconv error alone
		// only shows the size field.
		return nil, fmt.Errorf("malformed line: %q: %w", line, err)
	}
	if size < 0 {
		// A negative byte count is meaningless and would silently reduce
		// the totals accumulated from it.
		return nil, fmt.Errorf("malformed line: %q: negative size %d", line, size)
	}

	return &object{
		key:  w[0],
		hash: w[1],
		size: size,
	}, nil
}
// size is a byte count that knows how to print itself in binary units.
type size int

// String formats s with two decimal places in B, KiB, MiB, GiB or TiB.
// The unit is the first one whose upper bound (the next power of 1024) is
// above s; anything of 1 TiB or more is expressed in TiB, and values below
// 1 KiB, including negative ones, are expressed in B.
func (s size) String() string {
	const (
		KiB = 1 << 10
		MiB = 1 << 20
		GiB = 1 << 30
		TiB = 1 << 40
	)

	v := float64(s)
	if s < KiB {
		return fmt.Sprintf("%.2f B", v)
	}
	if s < MiB {
		return fmt.Sprintf("%.2f KiB", v/KiB)
	}
	if s < GiB {
		return fmt.Sprintf("%.2f MiB", v/MiB)
	}
	if s < TiB {
		return fmt.Sprintf("%.2f GiB", v/GiB)
	}
	return fmt.Sprintf("%.2f TiB", v/TiB)
}
// index aggregates the objects read from metadata files, keyed by hash.
type index struct {
	// dups maps a hash to the number of occurrences seen for it.
	dups map[string]int
	// sizes maps a hash to the size recorded for it.
	sizes map[string]int
}
func (s *index) total() (count, size int) { | |
for hash, n := range s.dups { | |
n = max(n, 1) | |
size += n * s.sizes[hash] | |
count += n | |
} | |
return count, size | |
} | |
func (s *index) dedup() (count, size int) { | |
for _, n := range s.sizes { | |
size += n | |
} | |
return len(s.sizes), size | |
} | |
func (s *index) readFrom(rc io.ReadCloser) error { | |
defer rc.Close() | |
scanner := bufio.NewScanner(rc) | |
for scanner.Scan() { | |
line := strings.TrimSpace(scanner.Text()) | |
if line == "" { | |
continue // skip empty lines | |
} | |
obj, err := parseObject(line) | |
if err != nil { | |
return err | |
} | |
if size, ok := s.sizes[obj.hash]; ok && size != obj.size { | |
return fmt.Errorf("size conflict for %q: %d != %d", obj.hash, size, obj.size) | |
} | |
s.dups[obj.hash]++ | |
s.sizes[obj.hash] = obj.size | |
} | |
return scanner.Err() | |
} | |
func (s *index) summary() string { | |
var buf bytes.Buffer | |
totalObjects, totalSize := s.total() | |
dedupObjects, dedupSize := s.dedup() | |
fmt.Fprintf(&buf, "Total objects: %d\n", totalObjects) | |
fmt.Fprintf(&buf, "Total size: %s\n", size(totalSize)) | |
fmt.Fprintf(&buf, "Dedup objects: %d\n", dedupObjects) | |
fmt.Fprintf(&buf, "Dedup size: %s\n", size(dedupSize)) | |
return buf.String() | |
} | |
func summary(dir string, ignore ...string) error { | |
idx, err := buildIndex(dir) | |
if err != nil { | |
return err | |
} | |
fmt.Print(idx.summary()) | |
return nil | |
} | |
func compare(dir, backup string) error { | |
idx, err := buildIndex(dir, backup) | |
if err != nil { | |
return err | |
} | |
bkp, err := buildIndex(backup) | |
if err != nil { | |
return err | |
} | |
for key := range idx.sizes { | |
delete(bkp.dups, key) | |
delete(bkp.sizes, key) | |
} | |
fmt.Print(bkp.summary()) | |
return nil | |
} | |
func buildIndex(dir string, ignore ...string) (*index, error) { | |
var idx = index{ | |
dups: make(map[string]int), | |
sizes: make(map[string]int), | |
} | |
metaFiles := make(map[string]struct{}) | |
switch fi, err := os.Stat(dir); { | |
case err != nil: | |
return nil, err | |
case fi.IsDir(): | |
f, err := os.Open(dir) | |
if err != nil { | |
return nil, err | |
} | |
files, err := f.Readdirnames(-1) | |
f.Close() | |
if err != nil { | |
return nil, err | |
} | |
for _, file := range files { | |
metaFiles[filepath.Join(dir, file)] = struct{}{} | |
} | |
default: | |
metaFiles[dir] = struct{}{} | |
} | |
for file := range metaFiles { | |
for _, s := range ignore { | |
if strings.Contains(file, s) { | |
delete(metaFiles, file) | |
} | |
} | |
} | |
for file := range metaFiles { | |
f, err := os.Open(file) | |
if err != nil { | |
return nil, err | |
} | |
if err := idx.readFrom(f); err != nil { | |
return nil, err | |
} | |
} | |
return &idx, nil | |
} | |
// max returns the larger of i and j.
func max(i, j int) int {
	if j > i {
		return j
	}
	return i
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment