# List every regular file under the current directory as "<extent count> <path>",
# sorted by extent count (least fragmented first). The count is taken from the
# end of each filefrag line so that paths containing ':' are not truncated.
find . -type f -print0 | xargs -0 -r filefrag | awk '{ n = $(NF - 2); sub(/: [0-9]+ extents? found$/, ""); print n, $0 }' | sort -n
Start by calculating ssdeep fuzzy hashes of the files to find files with similar hashes.
Use these hashes to find "close" matches.
Then apply Python:
all close matches get compared against each other, pairwise.
import difflib
import mmap
import os

# Compare one candidate pair of paths (f1, f2) byte-for-byte and ask xfs_io to
# dedupe their longest common run when it is large enough to be worth it.
# f1, f2 and pcall are expected to be provided by the surrounding script.
min_dedupe_bytes = 1024 * 128
with open(f1, "r+b") as fb1, open(f2, "r+b") as fb2:
    l1 = os.stat(fb1.fileno()).st_size
    l2 = os.stat(fb2.fileno()).st_size
    # A shared run can never be longer than the smaller file, so skip pairs
    # that cannot pass the size check below. This also avoids the ValueError
    # mmap raises when asked to map an empty file.
    if min(l1, l2) > min_dedupe_bytes:
        with mmap.mmap(fb1.fileno(), 0) as m1, mmap.mmap(fb2.fileno(), 0) as m2:
            # Iterating an mmap yields 1-byte bytes objects while indexing it
            # yields ints, so SequenceMatcher would never see equal elements.
            # A memoryview yields ints both ways and copies nothing.
            with memoryview(m1) as v1, memoryview(m2) as v2:
                # autojunk=False: the default heuristic treats frequently
                # occurring elements of long sequences as junk, which would
                # break up matches in byte data.
                seqr = difflib.SequenceMatcher(None, v1, v2, autojunk=False)
                big = seqr.find_longest_match(0, l1, 0, l2)
        if big.size > min_dedupe_bytes:
            # NOTE(review): xfs_io(8) documents the command as
            # "dedupe infile src_offset dst_offset length", run with -c against
            # the destination file - confirm pcall builds that invocation from
            # these arguments.
            pcall("xfs_io", ["dedupe", f1, big.a, f2, big.b, big.size])
Apply difflib.SequenceMatcher, using either get_matching_blocks or find_longest_match to locate the shared ranges.