Skip to content

Instantly share code, notes, and snippets.

@rjeczalik
Last active April 2, 2019 08:58
Show Gist options
  • Save rjeczalik/06c5967654870a7f24defeeb12e1f26f to your computer and use it in GitHub Desktop.
Save rjeczalik/06c5967654870a7f24defeeb12e1f26f to your computer and use it in GitHub Desktop.
package main
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"strconv"
"strings"
)
// usage is the help text main prints (via die) when the program is started
// with an unsupported number of command-line arguments.
const usage = "usage: dedupindex BACKUP_META_DIR|BACKUP_META_FILE [COMPARE_META_FILE]"
// die writes v to standard error using fmt's default formatting, followed by
// a newline, and then terminates the process with exit status 1.
// It never returns to the caller.
func die(v interface{}) {
	const failure = 1

	stderr := os.Stderr
	fmt.Fprintln(stderr, v)
	os.Exit(failure)
}
// main dispatches on the number of command-line arguments:
//
//   - one argument: print a summary of the given meta directory or file;
//   - two arguments: compare the first against the second;
//   - anything else: print the usage text and exit with a failure status.
//
// Any error returned by the selected action is reported through die.
func main() {
	var err error

	switch argc := len(os.Args); argc {
	case 2:
		err = summary(os.Args[1])
	case 3:
		err = compare(os.Args[1], os.Args[2])
	default:
		die(usage)
	}

	if err != nil {
		die(err)
	}
}
// object is one record of a meta file, as decoded by parseObject.
type object struct {
	key  string // first tab-separated field of the record
	hash string // second tab-separated field; the index groups objects by it
	size int    // third field, in bytes
}

// parseObject decodes a single meta-file record of the form
//
//	KEY<TAB>HASH<TAB>SIZE
//
// where KEY and HASH must be non-empty and SIZE must be a decimal integer
// accepted by strconv.Atoi.
//
// It returns the decoded object, or a "malformed line" error when the record
// does not have exactly three fields, has an empty key or hash, or has a size
// that is not an integer. Every error message includes the offending line so
// the bad record can be located in the input.
func parseObject(line string) (*object, error) {
	w := strings.Split(line, "\t")
	if len(w) != 3 || w[0] == "" || w[1] == "" {
		return nil, errors.New("malformed line: " + line)
	}

	size, err := strconv.Atoi(w[2])
	if err != nil {
		// Keep the line in the message: the Atoi error alone only names the
		// bad token, which is not enough to find the record.
		return nil, errors.New("malformed line: " + line + ": " + err.Error())
	}

	return &object{
		key:  w[0],
		hash: w[1],
		size: size,
	}, nil
}
// size is a byte count that formats itself in binary (1024-based) units.
type size int

// String renders s with two decimal places in the largest of B, KiB, MiB,
// GiB or TiB whose threshold s has reached. Values below 1 KiB, including
// negative ones, are printed in bytes.
func (s size) String() string {
	const (
		KiB = 1024
		MiB = KiB * 1024
		GiB = MiB * 1024
		TiB = GiB * 1024
	)

	// Compare as float64 rather than as size: TiB (1<<40) is not representable
	// in a 32-bit int, so comparing it against s directly is a constant
	// overflow compile error on platforms where int is 32 bits wide.
	v := float64(s)

	switch {
	case v < KiB:
		return fmt.Sprintf("%.2f B", v)
	case v < MiB:
		return fmt.Sprintf("%.2f KiB", v/KiB)
	case v < GiB:
		return fmt.Sprintf("%.2f MiB", v/MiB)
	case v < TiB:
		return fmt.Sprintf("%.2f GiB", v/GiB)
	default:
		return fmt.Sprintf("%.2f TiB", v/TiB)
	}
}
// index accumulates the objects read from meta files, grouped by hash.
// Both maps are keyed by object hash and are filled together by readFrom.
type index struct {
dups map[string]int // hash -> number of occurrences
sizes map[string]int // hash -> size
}
// total returns the number of objects recorded in the index, counting every
// occurrence of a hash, together with their combined size: each hash
// contributes its size once per occurrence.
func (s *index) total() (count, size int) {
	for hash, occurrences := range s.dups {
		// A hash recorded with fewer than one occurrence still counts once.
		times := max(occurrences, 1)

		count += times
		size += times * s.sizes[hash]
	}
	return count, size
}
// dedup returns the number of distinct hashes in the index and the sum of
// their sizes, with each hash counted exactly once.
func (s *index) dedup() (count, size int) {
	count = len(s.sizes)
	for hash := range s.sizes {
		size += s.sizes[hash]
	}
	return count, size
}
// readFrom consumes rc line by line, parses every non-blank line with
// parseObject and records the resulting object in the index. rc is always
// closed before readFrom returns.
//
// It stops at the first line that fails to parse, or at the first object
// whose size differs from the size already recorded for the same hash, and
// returns that error. Otherwise it returns the scanner's error, if any.
func (s *index) readFrom(rc io.ReadCloser) error {
	defer rc.Close()

	sc := bufio.NewScanner(rc)
	for {
		if !sc.Scan() {
			return sc.Err()
		}

		text := strings.TrimSpace(sc.Text())
		if text == "" {
			// Blank lines carry no record.
			continue
		}

		obj, err := parseObject(text)
		if err != nil {
			return err
		}

		known, seen := s.sizes[obj.hash]
		if seen && known != obj.size {
			return fmt.Errorf("size conflict for %q: %d != %d", obj.hash, known, obj.size)
		}
		if !seen {
			s.sizes[obj.hash] = obj.size
		}
		s.dups[obj.hash]++
	}
}
// summary formats the index statistics as four newline-terminated lines:
// the object count and size reported by total, followed by the object count
// and size reported by dedup. Sizes are rendered through the size type.
func (s *index) summary() string {
	allObjects, allBytes := s.total()
	uniqObjects, uniqBytes := s.dedup()

	out := new(bytes.Buffer)
	fmt.Fprintf(out, "Total objects: %d\n", allObjects)
	fmt.Fprintf(out, "Total size: %s\n", size(allBytes))
	fmt.Fprintf(out, "Dedup objects: %d\n", uniqObjects)
	fmt.Fprintf(out, "Dedup size: %s\n", size(uniqBytes))
	return out.String()
}
// summary builds an index from dir, which may be a meta directory or a single
// meta file, and prints its statistics to standard output.
//
// The optional ignore strings are forwarded to buildIndex, which skips every
// meta file whose path contains one of them. It returns the error from
// buildIndex, or nil once the summary has been printed.
func summary(dir string, ignore ...string) error {
	// Forward ignore: the parameter used to be accepted but silently dropped.
	idx, err := buildIndex(dir, ignore...)
	if err != nil {
		return err
	}
	fmt.Print(idx.summary())
	return nil
}
// compare prints a summary of the objects listed in backup whose hashes do
// not occur anywhere in dir.
//
// The index for dir is built with backup passed as an ignore string, so meta
// files whose path contains backup do not take part in the comparison. Every
// hash present in that index is then dropped from the index built from backup
// alone, and what remains is summarised on standard output.
func compare(dir, backup string) error {
	existing, err := buildIndex(dir, backup)
	if err != nil {
		return err
	}

	fresh, err := buildIndex(backup)
	if err != nil {
		return err
	}

	for hash := range existing.sizes {
		delete(fresh.sizes, hash)
		delete(fresh.dups, hash)
	}

	fmt.Print(fresh.summary())
	return nil
}
// buildIndex reads the meta files designated by dir into a fresh index.
//
// dir may name a single meta file or a directory. For a directory, every
// entry directly inside it is treated as a meta file; there is no recursion.
// A candidate whose path contains any of the ignore strings as a substring is
// skipped. Files are processed in the order the directory lists them.
//
// The first file that cannot be opened or read aborts the build. Errors
// produced while reading a file are prefixed with that file's path so the
// offending meta file can be identified.
func buildIndex(dir string, ignore ...string) (*index, error) {
	fi, err := os.Stat(dir)
	if err != nil {
		return nil, err
	}

	var files []string
	if fi.IsDir() {
		d, err := os.Open(dir)
		if err != nil {
			return nil, err
		}
		names, err := d.Readdirnames(-1)
		d.Close()
		if err != nil {
			return nil, err
		}
		for _, name := range names {
			files = append(files, filepath.Join(dir, name))
		}
	} else {
		files = append(files, dir)
	}

	idx := &index{
		dups:  make(map[string]int),
		sizes: make(map[string]int),
	}

nextFile:
	for _, file := range files {
		for _, s := range ignore {
			if strings.Contains(file, s) {
				continue nextFile
			}
		}

		f, err := os.Open(file)
		if err != nil {
			return nil, err
		}
		// readFrom closes f on every path.
		if err := idx.readFrom(f); err != nil {
			return nil, fmt.Errorf("%s: %v", file, err)
		}
	}

	return idx, nil
}
// max returns the larger of i and j.
func max(i, j int) int {
	if j >= i {
		return j
	}
	return i
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment