Skip to content

Instantly share code, notes, and snippets.

@theonewolf
Created November 8, 2013 15:18
Show Gist options
  • Select an option

  • Save theonewolf/7372474 to your computer and use it in GitHub Desktop.

Select an option

Save theonewolf/7372474 to your computer and use it in GitHub Desktop.
Deduplication example for ACM XRDS blog
#!/usr/bin/env python
from sys import argv as args
def dedup(index, original, dedup, size):
    """Copy chunks of ``original`` into ``dedup`` in the order listed by ``index``.

    Args:
        index: iterable of text lines, each holding one integer chunk number.
            Blank or whitespace-only lines are skipped.
        original: seekable binary file object to read chunks from.
        dedup: writable binary file object that receives the chunks.
        size: chunk size; chunk ``n`` starts at offset ``n * size``.

    Raises:
        ValueError: if a non-blank index line is not an integer.
    """
    for line in index:
        entry = line.strip()
        if not entry:
            # Lines read from a file keep their newline, so a plain truth
            # test never filters blank lines; strip first, then skip.
            continue
        original.seek(int(entry) * size)
        dedup.write(original.read(size))


if __name__ == '__main__':
    # Usage: <script> ORIGINAL INDEX SIZE OUTPUT
    # The path variables are deliberately not named ``dedup``/``index``/
    # ``original``: reusing those names rebinds the function above (and the
    # path strings) before they are used.
    original_path = args[1]
    index_path = args[2]
    chunk_size = int(args[3])
    output_path = args[4]

    with open(original_path, 'rb') as original_file:
        with open(index_path, 'r') as index_file:
            with open(output_path, 'wb') as output_file:
                dedup(index_file, original_file, output_file, chunk_size)
#!/usr/bin/env python
from sys import argv as args

try:
    # hashlib supersedes the standalone md5 module, which no longer exists
    # on Python 3.
    from hashlib import md5 as hasher
except ImportError:
    from md5 import new as hasher
# Command-line arguments: the path of the file to read, and the chunk size
# (converted to int) that is passed on to the chunked read.
fname = args[1]
size = int(args[2])
def chunks(f, size):
    """Yield successive ``f.read(size)`` results until a read returns nothing.

    Args:
        f: file-like object exposing ``read(size)``.
        size: number passed to each ``read`` call; the final chunk may be
            shorter than this.

    Yields:
        Each non-empty result of ``f.read(size)``, in order.
    """
    buf = f.read(size)
    while buf:
        yield buf
        buf = f.read(size)
def hash(f, size):
    """Print the hex digest of every ``size``-sized chunk of ``f``, one per line.

    The name shadows the ``hash`` builtin within this module; it is kept
    unchanged so existing callers keep working.

    Args:
        f: file-like object handed to ``chunks``.
        size: chunk size handed to ``chunks``.
    """
    for chunk in chunks(f, size):
        # Single-argument call form of print behaves the same on Python 2
        # and Python 3.
        print(hasher(chunk).hexdigest())
# Read the input in binary mode and print one digest per chunk.
with open(fname, 'rb') as f:
    hash(f, size)
#!/usr/bin/env python
from sys import argv as args
# First command-line argument: path of the file whose lines are indexed.
fname = args[1]
def index(f):
    """Print, for each line of ``f``, the number of the first identical line.

    Lines are numbered from zero in the order they are read. A line that has
    not been seen before prints its own number; a repeated line prints the
    number of its first occurrence.

    Args:
        f: iterable of lines (for example an open file).
    """
    first_seen = {}
    # Iterate the file directly rather than materialising it with
    # readlines(); only the distinct lines need to be kept in memory.
    for i, line in enumerate(f):
        # setdefault records ``i`` for a new line and returns the stored
        # first-occurrence number for a repeated one.
        print(first_seen.setdefault(line, i))
# Read the input in binary mode and print the first-occurrence number of
# each of its lines.
with open(fname, 'rb') as f:
    index(f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment