Skip to content

Instantly share code, notes, and snippets.

@hannahherbig
Created June 12, 2018 04:31
Show Gist options
  • Save hannahherbig/e100c935efafa0edd0a70c96482f1200 to your computer and use it in GitHub Desktop.
Save hannahherbig/e100c935efafa0edd0a70c96482f1200 to your computer and use it in GitHub Desktop.
import os
import math
import hashlib
from tqdm import tqdm
# Binary (1024-based) size suffixes, from bytes up to yobibytes.
units = 'BKMGTPEZY'


def human(x):
    """Format a byte count as a short human-readable string.

    The value is scaled by powers of 1024 and rendered with one decimal
    place followed by a single-letter suffix taken from ``units``
    (e.g. ``1536`` -> ``'1.5K'``).

    Zero and negative inputs are reported as ``'0.0B'``.  Values too large
    for the biggest suffix are expressed in that biggest suffix rather
    than raising ``IndexError``.
    """
    if x <= 0:
        x = 0
    # Pick the exponent by exact comparison instead of a floating-point
    # logarithm: this avoids rounding surprises at exact powers of 1024,
    # never goes negative for 0 < x < 1, and is clamped to the last unit.
    m = 0
    while m < len(units) - 1 and x >= 1024 ** (m + 1):
        m += 1
    n = x / 1024 ** m
    return '%.1f%s' % (n, units[m])
def calc(filename, t):
    """Return the SHA-256 hex digest of the file at *filename*.

    The file is read in binary mode in pieces of at most 1 MiB.  After
    each piece is hashed, ``t.update`` is called with the number of bytes
    just consumed, so *t* can act as a progress counter.
    """
    digest = hashlib.sha256()
    with open(filename, 'rb') as stream:
        # iter() with a sentinel keeps calling read1 until it returns b'',
        # which signals end of file.
        for block in iter(lambda: stream.read1(1048576), b''):
            digest.update(block)
            t.update(len(block))
    return digest.hexdigest()
# Collect every file below the current directory that needs hashing.
#   files - paths to hash, one per distinct stat result
#   total - combined size in bytes of those files (drives the progress bar)
#   stats - stat results already seen; os.stat_result is hashable and
#           compares by its fields, so a second path to the same inode
#           (an existing hard link) is recognised and skipped
files = []
total = 0
stats = set()
for dirpath, dirnames, filenames in os.walk('.'):
    for filename in filenames:
        path = os.path.join(dirpath, filename)
        try:
            stat = os.stat(path)
        except OSError:
            # Dangling symlink, or the file vanished/was unreadable while
            # walking: skip it instead of aborting the whole scan.
            continue
        if stat in stats:
            continue
        stats.add(stat)
        total += stat.st_size
        files.append(path)
# Hash every collected file; the first path seen for each digest is kept,
# and every later file with the same digest is replaced by a hard link to
# that first path.
hashes = {}  # sha256 hex digest -> first path seen with that content
saved = 0    # running total of bytes reclaimed by linking
with tqdm(total=total, unit='B', unit_scale=True, unit_divisor=1024) as t:
    for file in files:
        h = calc(file, t)
        if h not in hashes:
            hashes[h] = file
            continue
        keeper = hashes[h]
        size = os.path.getsize(file)
        # Create the link under a temporary name first and then swap it
        # into place.  Unlinking the duplicate before linking would lose
        # the file for good if the link step failed (different device,
        # permissions, link-count limit, ...).
        tmp = '%s.%d.tmp' % (file, os.getpid())
        try:
            os.link(keeper, tmp)
        except OSError as err:
            t.write('skipped %s: %s' % (file, err))
            continue
        try:
            os.replace(tmp, file)
        except OSError as err:
            # Remove only the temporary link we created ourselves.
            os.unlink(tmp)
            t.write('skipped %s: %s' % (file, err))
            continue
        saved += size
        t.write('%s %s %s' % (human(saved), file, keeper))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment