Created
November 8, 2013 15:18
-
-
Save theonewolf/7372474 to your computer and use it in GitHub Desktop.
Deduplication example for ACM XRDS blog
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Copy the blocks listed in an index file from one file into another.

Usage: <script> ORIGINAL INDEX SIZE OUTPUT

ORIGINAL is read in binary mode, INDEX is a text file with one decimal
block number per line, SIZE is the block size in bytes, and OUTPUT is
created (or truncated) and written in binary mode.
"""
from sys import argv as args


def dedup(index, original, dedup, size):
    """Write one block of *original* to *dedup* per entry in *index*.

    Args:
        index: iterable of text lines, each holding one decimal block number.
        original: seekable binary file object the blocks are read from.
        dedup: binary file object the blocks are written to, in index order.
        size: block size in bytes; block ``n`` starts at offset ``n * size``.

    Raises:
        ValueError: if a non-blank line is not a valid integer.
    """
    for line in index:
        # Lines read from a file keep their trailing newline, so a bare
        # truthiness test never filters anything; strip first so blank
        # lines are skipped instead of crashing int().
        line = line.strip()
        if not line:
            continue
        original.seek(int(line) * size)
        dedup.write(original.read(size))


if __name__ == '__main__':
    # Paths and file handles get their own names: reusing ``dedup`` for the
    # output path/handle would clobber (or be clobbered by) the function.
    original_path = args[1]
    index_path = args[2]
    size = int(args[3])
    dedup_path = args[4]

    with open(original_path, 'rb') as original:
        with open(index_path, 'r') as index:
            with open(dedup_path, 'wb') as output:
                dedup(index, original, output, size)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Print the MD5 hex digest of every fixed-size chunk of a file.

Usage: <script> FILE SIZE

FILE is read in binary mode in pieces of SIZE bytes; one digest is printed
per piece, in file order.
"""
# hashlib replaces the standalone ``md5`` module, which was deprecated in
# Python 2.5 and no longer exists in Python 3.
from hashlib import md5 as hasher
from sys import argv as args


def chunks(f, size):
    """Yield successive ``f.read(size)`` results until a read comes back empty.

    Args:
        f: file-like object exposing ``read(size)``.
        size: number of bytes requested per read; the final chunk may be
            shorter.
    """
    buf = f.read(size)
    while buf:
        yield buf
        buf = f.read(size)


def hash(f, size):
    """Print the MD5 hex digest of each chunk of *f*, one per line.

    The name shadows the ``hash`` builtin inside this script; it is kept
    because it is the script's existing public name.

    Args:
        f: binary file-like object to read from.
        size: chunk size in bytes, passed through to ``chunks``.
    """
    for chunk in chunks(f, size):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(hasher(chunk).hexdigest())


if __name__ == '__main__':
    fname = args[1]
    size = int(args[2])

    with open(fname, 'rb') as f:
        hash(f, size)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Print, for each line of a file, the number of the first identical line.

Usage: <script> FILE

FILE is read in binary mode. Line numbers are 0-based.
"""
from sys import argv as args


def index(f):
    """Print the first-occurrence line number for every non-blank line of *f*.

    Lines are compared after stripping surrounding whitespace, so a final
    line without a trailing newline still matches an earlier identical one.
    Blank lines print nothing but still advance the line counter.

    Args:
        f: iterable of lines (for example an open file object).
    """
    uniques = {}
    # Iterate the file lazily instead of loading every line with readlines().
    for i, line in enumerate(f):
        key = line.strip()
        if not key:
            continue
        # setdefault records i only the first time key is seen and returns
        # the stored (first) line number on every later occurrence.
        print(uniques.setdefault(key, i))


if __name__ == '__main__':
    fname = args[1]

    with open(fname, 'rb') as f:
        index(f)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment