Last active
October 28, 2024 04:12
-
-
Save donnaken15/f95e8a143bb330fcf7d6268a4d6929e8 to your computer and use it in GitHub Desktop.
Group duplicates of files into hardlinks, check similarity using BLAKE2/SHA256 and matching file size, primarily for Windows/Cygwin/MSYS2 | Jump to comments: https://gist.github.com/donnaken15/f95e8a143bb330fcf7d6268a4d6929e8?permalink_comment_id=5166431#gistcomment-5166431
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/zsh | |
# I'M ACTIVELY FORGETTING HOW MY OWN SCRIPT WORKS!!!!!!!!!!!!!! | |
# Usage guard. Fewer than two arguments leaves nothing to compare, unless the
# single argument is a quoted glob pattern (it contains * or ?).
if [ $# -lt 2 ] && [[ "$1" != *[\*\?]* ]]; then
	if [ $# -eq 1 ]; then
		echo 'you must specify more than one file to be deduped'
		echo
	fi
	echo 'dedupe [input files]'
	echo '- replace multiple unchanging copies of'
	echo ' the same files with hardlinks to save space'
	echo '- as of now, it is recommended to execute this'
	echo ' only on files that exist on a singular device'
	exit 1
fi
# WSL SCREWS UP WINDOWS PATHS THAT AREN'T ENTERED WITH QUOTES | |
# Pick the implementation of `userval NAME DEFAULT`, which exports NAME.
# Normal runs: define a fallback that exports DEFAULT, then try to source the
# dotload utility with 'dedupe.conf'.
# NOTE(review): dotload presumably redefines userval so that values from the
# config file win over the defaults - confirm against dotload itself.
# Test runs (password=alpine, used by the PKGBUILD check()): NAME takes the
# value of the variable test_NAME when that is set, else DEFAULT.
# Written as if/else on purpose: in an `A && { ... } || { ... }` chain a
# failing `source` (dotload not installed) also runs the second branch, which
# would replace the fallback userval with the test-only one.
if [ "${password}" != "alpine" ]; then
	userval() { export "$1"="$2"; } # fallback function if below utility doesn't exist
	dotload_namespace=dk15
	dotload=$(command -v dotload || echo /usr/share/dk15/dotload)
	# best effort: when dotload is missing the fallback userval stays in place
	source "${dotload}" 'dedupe.conf' 2>/dev/null
else # for PKGBUILD check()
	userval() {
		local test="test_${1}" # :/
		export "$1"="${(P)test:-${2}}"
	}
fi
# makes me wonder if direct binary string comparison will also work somehow, OCD'ing because of memory usage | |
# Read the settings through userval (NAME DEFAULT). Every name listed in
# $bools is then normalised from a number to the command name true/false, so
# later code can run the variable directly as a condition (`$simulate_mode && ...`).
mp=4 # default for passes
userval passes $mp
userval hash b2
bools=(force_relink sanity_check hide_invalid batch_hashes simulate_mode scramble_list hide_errored process_gitdir)
for b in ${bools}; do
	userval $b 0
	if [ ${(P)b} -ne 0 ]; then
		export "$b=true"
	else
		export "$b=false"
	fi
done
# in simulation mode the link command is replaced by the do-nothing nop
if $simulate_mode; then
	link=nop
else
	link=ln
fi
if $sanity_check; then
	echo 'Configuration:'
	for c in passes hash ${bools}; do
		echo $c = ${(P)c}
	done
fi
# nop: stand-in for the link command; ignores its arguments and succeeds.
nop() { return 0; }
NUL='/dev/null'
# ANSI colour sequences, written with $'\e' so the escape byte is explicit
# rather than an invisible raw control character inside the string.
err=$'\e[91;1m'
rc=$'\e[0m'
# passes must lie in 1..4; anything else (including a non-number, which makes
# the test itself fail) is replaced by the default $mp.
[ $passes -ge 1 -a $passes -le 4 ] || {
	echo "Invalid config: passes = ${passes}. Setting to ${mp}." 1>&2
	passes=$mp
}
where=('command' '-v')
if "${where[@]}" fsutil.exe >$NUL; then
	fsutil=true
else
	fsutil=false
fi
if [ $passes -ge 3 -a $fsutil = false ]; then
	echo "${err}WARNING: Passes 3 and 4 may take a while to find and" 1>&2
	echo "merge groups of hardlinks using the inputs provided!${rc}" 1>&2
fi
# The hashing tool is named "<hash>sum". When it is missing, fall back to
# sha256; hash is reassigned before the notice is printed so that the notice
# names the fallback and not the tool that was just reported missing.
if ! "${where[@]}" "${hash}sum" >$NUL; then
	echo "${err}Cannot find ${hash}sum as a hashing program.${rc}" 1>&2
	hash=sha256
	echo $'\e[90m'"Faulting to ${hash}.${rc}" 1>&2
fi
if ! "${where[@]}" "${hash}sum" >$NUL; then
	echo "${err}Fallback hashing program ${hash}sum does not exist. Aborting...${rc}" 1>&2
	exit 1
fi
typeset -A hashsizes=( # raw byte lengths
	#[ck]=4 # can't use, outputs decimal, so isn't compatible with this current system and its execution
	[md5]=16
	[sha1]=20
	[sha224]=28
	[sha256]=32
	[sha384]=48
	[sha512]=64
)
# Build the option list for "<hash>sum" and work out the digest length.
hash_params=()
[ "$hash" != b3 ] && hash_params+=(-b)
if [ "$hash" = b2 -o "$hash" = b3 ]; then
	# variable-length hashes: the length comes from the configuration
	userval hash_length 10 # bytes, being conservative for big files
	if [ "$hash" = b2 ]; then
		# hash_length is in bytes; <<3 converts it to bits for b2sum -l
		hash_params+=(-l $((${hash_length}<<3)))
	else
		#hash_threads=1 --num-threads ${hash_threads}
		hash_params+=(-l ${hash_length} --no-names)
	fi
else
	# fixed-length hashes: the length comes from the table above
	if [ -z "${hashsizes[$hash]}" ]; then
		echo "${err}Unrecognized hash function: ${hash}${rc}" 1>&2
		# reassign first so the notice names the fallback actually used
		hash=sha256
		echo $'\e[90m'"Faulting to ${hash}.${rc}" 1>&2
	fi
	hash_length=${hashsizes[$hash]:-16}
fi
hashstrlen=$((${hash_length}<<1)) # two hex digits per byte
basecmd=("${hash}sum" "${hash_params[@]}" '--')
hashset=() # hashes seen so far
baseset=() # file belonging to each entry of hashset
counters=() # for skipping unique files in subsequent passes that had no duplicates counted up
filter=() # 0 = if pass>1, don't process; 1 = retry on later passes
# dsize: print the free space reported by df, in bytes.
# Uses the last line of `df --output=avail --total` (the grand total, tmpfs
# and devtmpfs excluded) multiplied by 1024.
# NOTE(review): assumes df reports 1K blocks here - confirm for BLOCK_SIZE /
# POSIXLY_CORRECT environments.
# Prints 0 and returns 1 when df yields no usable number, so callers that
# splice the output into a positional list still receive exactly one value.
dsize()
{
	local -a rows
	local last
	# assigned separately from `local`: a combined `local x=(...)` always
	# succeeds and would hide an empty or failed df run
	rows=($(df -xtmpfs -xdevtmpfs --output=avail --total 2>$NUL))
	(( ${#rows[@]} > 0 )) && last=${rows[-1]//[[:space:]]/}
	case "$last" in
		''|*[!0-9]*)
			echo 0
			return 1
			;;
	esac
	echo $(( last * 1024 ))
}
fsize=(stat -Lc%s) # attempts to reduce amount of new processes, but substitution will always require it
inode=(stat -Lc%i) # location of the data in the raw drive
mntpt=(df --output=target)
# Directories that the inode search with find must not report from, turned
# into `! -path DIR/*` arguments.
sfxlp=('/proc' '/dev' '/tmp' '/var/tmp' '/sys' '/boot')
sfind=()
for f in "${sfxlp[@]}"; do
	# No quote characters inside the element: it reaches find as one argument
	# already, and literal quotes would become part of the pattern so that it
	# never matches any path.
	sfind+=(! -path "${f}/*")
done
unset sfxlp
#profile=(date '+%s%N') | |
#profileend() { echo "profile script" $(($(($($profile) - $1)) / 1000000000.0)) } | |
# Parallel lists built from `mount`: for every line of the form
# "X: on /cygdrive/x", "X: on /mnt/x" or "X: on /x", remember the drive letter
# and the mount point, both lower-cased.
dls=() # drive letters, e.g. c:
mps=() # mount points, e.g. /cygdrive/c
mount |
	sed -n 's/^\(\w:\)\\\?\son\s\(\/cygdrive\/\w\|\/mnt\/\w\|\/\w\).*/\1 \2/p' |
	while IFS=' ' read -r letter point; do
		dls+=(${letter:l})
		mps+=(${point:l})
	done
# wsl_path FLAG PATH
# Wrapper around wslpath. FLAG is -u (convert to a Unix path) or another
# wslpath flag such as -m. Absolute Unix-style paths that use a different
# mount layout (/cygdrive/c, /mnt/c, /c) are first re-rooted onto the mount
# points collected in $mps / $dls.
# Prints the converted path. If the final wslpath call fails, a diagnostic goes
# to stderr and PATH is printed unchanged.
wsl_path() # i hate linux
{
	local l1="${2:l}" # lower-cased copy, used for matching only
	local test esc i
	# The null device is translated by name. l1 is lower case, so it is
	# compared with 'nul'; /dev/null is an absolute path, so it is checked
	# here, before the relative/absolute split.
	if [ "$1" = '-u' ]; then
		if [ "$l1" = 'nul' ]; then
			printf '%s\n' "$NUL"
			return
		fi
	elif [ "$l1" = "$NUL" ]; then
		echo 'NUL'
		return
	fi
	# anything that does not start with / is handed to wslpath untouched
	if [[ "$l1" != /* ]]; then
		wslpath $1 "$2"
		return
	fi
	if [[ "$l1" = /cygdrive/* ]]; then
		# '/cygdrive/' is 10 characters long: keep the rest of PATH ($2, not
		# the flag in $1) and put it under the first mount point's parent
		test=${mps[1][1,-2]}${2:10}
	else
		esc="${2//\\//}"
		# counted loop: with an empty drive list nothing must be tried
		for (( i = 1; i <= ${#dls}; i++ )); do
			if [[ "$l1" = /mnt${mps[$i][-2,-1]}* ]]; then # wsl default
				test="${mps[$i][1,-2]}${esc:5}"
				break
			elif [[ "$l1" = ${mps[$i][-2,-1]}/* ]]; then # msys
				test="${mps[$i]}/${esc:3}"
				break
			elif [[ "$l1" = ${dls[$i]}* ]]; then
				test="${mps[$i]}${esc:2}"
				break
			elif [[ "$l1" = ${mps[$i]}* ]]; then
				test="${mps[$i]}${esc:${#mps[$i]}}"
				break
			fi
		done
		test="${test:-$2}"
	fi
	if [ "$1" = '-u' ]; then
		# printf, not echo: echo would interpret backslashes in the path
		printf '%s\n' "$test"
		return
	fi
	wslpath $1 "$test" || {
		echo $'\e[31m'"MISHANDLED PATH: $test${rc}" 1>&2
		printf '%s\n' "$2"
	}
}
# fail FLAG PATH: last-resort path converter; prints PATH unchanged, without
# a trailing newline. printf keeps backslashes in PATH literal.
fail() { printf '%s' "$2"; }
# Choose the converter that is later called as `$wpath FLAG PATH`: cygpath
# when it is installed, otherwise the wsl_path wrapper when wslpath exists,
# otherwise fail.
wpath=$("${where[@]}" cygpath) || {
	if "${where[@]}" wslpath >$NUL; then
		wpath=wsl_path
	else
		# the pass-through function is named fail; the previously assigned
		# name wpathfail is not defined anywhere
		wpath=fail
	fi
}
# TODO: support b3sum windows, doesn't use unix paths | |
units="kmgt" | |
#total=0 | |
lasttest=0 | |
copycount=0 | |
expected=0 | |
rawphysd=0 | |
IFS=$'\n' | |
echo "[97mdedupe - Building file list...${rc}" | |
# should probably also generally avoid deduping files less than at least 8 bytes for whatever reason, in | |
# the case of like blank text files which would have one newline (LF or CR LF) or something | |
# this is why you should be selective with what files you want to dedupe, like certain extensions | |
# prep_check FILE: decide whether FILE can be deduped.
# Returns 0 when usable, 1 when FILE does not exist or is a directory,
# 2 when it is empty, 3 when it is not writable.
prep_check()
{
	if [[ ! -e "$1" || -d "$1" ]]; then
		return 1
	elif [[ ! -s "$1" ]]; then
		return 2
	elif [[ ! -w "$1" ]]; then
		return 3
	fi
	return 0
}
# prognums CURRENT TOTAL [SKIPPED]
# Prints the progress counter "(current/total)" followed by a carriage return
# and no newline. SKIPPED (default 0) is subtracted from both numbers.
prognums()
{
	local skipped=${3:-0}
	printf '(%s/%s)\r' "$(($1-skipped))" "$(($2-skipped))"
}
l=("$@") | |
k=0 | |
i=1 | |
set -- | |
preperrs=('not a file' 'blank' 'not writeable') | |
while [ $i -le ${#l} ]; do | |
f="${l[$i]}" | |
prognums $i ${#l} $k | |
[[ "$f" == */.git/* ]] && { | |
$process_gitdir || { | |
# there needs to be an alias thing for this or something | |
# to not have to specifically type out increment and continue | |
# over and over and over again | |
((i++)); continue | |
} | |
} | |
[[ "$f" == *[\*\?]* ]] && { # evade "argument list too long" with this, have to wrap the glob in quotes though | |
echo "[93mGot glob pattern: $f${rc}" | |
((k++)); prognums $i ${#l} $k # FIX: upper limit goes over final length of infiles array | |
#l+=(${~f}) | |
# need to split base path and pattern | |
# for now: (cd /.../.../ && dedupe '**/*.*') | |
[[ "$f" == *\*\** ]] && recurse=() || recurse=(-maxdepth 1) # >:( | |
find ${recurse} -wholename "$f" -print 2>$NUL | while read -r ff; do # just to not look frozen for >1 minute | |
[ -d "$ff" ] && continue | |
[[ "$ff" == */.git/* ]] && { | |
$process_gitdir || continue | |
} | |
l+=("$ff"); prognums $i ${#l} $k | |
done | |
((i++)); continue | |
} | |
# Decide whether $f is an absolute path: either it starts with /, or it
# starts with one of the known drive letters. Both results are reset for
# every file so that a decision made for one entry cannot carry over to the
# next one.
abs=false
ff=''
if [[ "$f" == /* ]]; then
	abs=true
else
	# counted loop: when no drive letters are known nothing is compared
	# (an empty drive entry would turn the pattern into a bare *)
	for (( j = 1; j <= ${#dls}; j++ )); do # check for windows path
		[[ "${f:l}" = ${dls[$j]}* ]] && {
			abs=true
			break
		}
	done
fi
# speed up when paths are relative, mount point stuff should still get handled just fine
# only absolute paths are converted; for the others ff stays empty and $f is used as given
$abs && ff="$(realpath -s "$($wpath -u "$f")")"
((i++)) | |
prep_check "${ff:-$f}" || { | |
ERR=$?; $hide_invalid || echo "${err}$f is ${preperrs[$ERR]}.${rc}" 1>&2 | |
continue | |
} | |
infiles+=("${ff:-$f}") | |
done | |
unset l k f | |
[ ${#infiles} -le 0 ] && { | |
echo '[91mThere are no files to process.[0m' 1>&2 | |
exit 1 | |
} | |
[ ${#infiles} -eq 1 ] && { | |
echo '[91mNot enough files entered to process.[0m' 1>&2 | |
exit 1 | |
} | |
$scramble_list && { | |
# for stress testing this script, because of zsh's handling of arrays and items..... | |
# and because this script is stupid and a total broken POS | |
local test=() | |
echo "\n[35mScrambling list...${rc}" | |
i=1 | |
shuf -i "1-${#infiles}" | while read -r s; do | |
test[$s]="${infiles[$i]}" | |
echo -n "($i/${#infiles})"$'\r' | |
((i++)) | |
done | |
infiles=(${test}) | |
} | |
# next totally sane step: move hash generation to an array creation block | |
# because time will be wasted on subsequent passes | |
# why not anyway, the array length and everything lined up | |
# should still be consistent with the file list | |
# here i am now, why | |
# Optional up-front hashing: the whole file list is fed to the hashing tool
# through xargs and one entry per output line is collected, in order.
# When the number of collected hashes differs from the number of input files,
# batching is switched off and the collected state is discarded.
if $batch_hashes; then
	IFS=$'\0' # join/split on NUL while talking to xargs -0
	echo $'\n\e[96mHashing files...\e[0m'
	i=1
	# wasted all this time to make a system that
	# splits the input args by the arg limit when
	# xargs already does just that
	echo -n "${infiles}" | xargs -P 1 -0 ${basecmd} 2>&1 | while read -r test; do
		# a line starting with "<tool>: " is an error report from the tool;
		# show it on stderr and keep a placeholder in the hash list
		if [[ "$test" == "${basecmd[1]}: "* ]]; then
			echo "${err}$test$rc" 1>&2
			hashset+=("-")
			continue
		fi
		[[ "$test" == ": "* ]] && continue # STUPID!!!!!!
		prognums $i ${#infiles}
		((i++))
		hashset+=(${test[0,$hashstrlen]})
		filter+=(0)
		counters+=(0)
	done
	die=(${pipestatus})
	# pipestatus lists echo, xargs, while in that order, so the xargs status
	# is element 2; xargs reports 127 when the command cannot be found
	if [ ${die[2]} -eq 127 ]; then
		echo "${err}${basecmd[1]} could not be found by xargs. Aborting...${rc}" 1>&2
		exit 127
	fi
	if [ ${#hashset} -ne ${#infiles} ]; then
		batch_hashes=false
		echo "\n${err}Batched hash list fails to match the number of files scanned: ${#hashset} != ${#infiles}${rc}" 1>&2
		hashset=()
		baseset=()
		filter=()
		counters=()
	else
		baseset=(${infiles}) # should use ${(P)name} instead
	fi
	unset i
	IFS=$'\n'
fi
# todo?: take timestamp at init and check if files change after | |
# just so nothing's falsely hardlinked because this is not | |
# fast enough (yet?) and metadata gets stored minutes earlier | |
# also because *a* user would be that dumb to try and mess with it | |
# errgate: decide whether the file pair described by the global $map may be
# hardlinked. Reads $map, $force_relink and $pass; its arguments are unused.
# Returns:
#   0  the pair can be linked
#   11 both entries are the same path            (map[12] equals map[1])
#   12 the mount points differ                   (map[10] vs map[11])
#   13 the inodes are already equal              (map[6] vs map[7]); this is
#      skipped only when force_relink is true and this is pass 1
#   14 the sizes differ                          (map[4] vs map[5])
errgate()
{
	[[ "${map[12]}" == "${map[1]}" ]] && return 11
	# try to utilize KSH [ extension instead: -ef
	# also applies to symlink :/
	[[ "${map[10]}" != "${map[11]}" ]] && return 12
	if [[ "${map[6]}" == "${map[7]}" ]]; then
		if ! $force_relink || [ $pass -gt 1 ]; then
			return 13
		fi
	fi
	# a size mismatch only counts when both sizes are known
	[[ -n "${map[4]}" && -n "${map[5]}" && "${map[4]}" != "${map[5]}" ]] && return 14
	return 0
}
test='' | |
dsize | read -r before | |
echo "[97m(${#infiles} files)${rc}" | |
for pass in {1..$passes}; do | |
batched=0 | |
i=1 | |
$sanity_check && echo "[33mNew pass: $pass${rc}" | |
for f in "${infiles[@]}"; do | |
$sanity_check && echo "[90m$i (${counters[$i]:--}) (${err}${filter[$i][2]:-[92m-}[90m): $f${rc}" | |
[ \ | |
$pass -gt 1 -a \( \ | |
${filter[$i]:-0} -ne 0 -o ${counters[$i]:-0} -le 0 \ | |
\) \ | |
] && { ((i++)) && continue } | |
map=( | |
"$f" | |
"" | |
"${f:t}" | |
) | |
$batch_hashes && # getting already hardlinked error on the first item now with this | |
map[2]="${hashset[$i]}" || | |
map[2]="`${basecmd} "$f"`" | |
{ | |
[ ! "${map[2]}" = "-" ] && | |
prep_check "$f" # imagining someone or something will rename it midway through hash batching | |
} || { | |
ERR=$? | |
[ $pass -eq 1 ] && filter+=(${ERR}) | |
((i++)) | |
echo "${err}[${ERR}] Invalid file $f.${rc}" | |
continue | |
} | |
# map, for substitution speed and earlier function: | |
# [1] = target, this file | |
# [2] = hash | |
# [3] = basename | |
# target and source metadata | |
# [4,5] = file sizes | |
# [6,7] = inodes | |
# [8] = free space | |
# [10,11] = mount points | |
# [12] = source, matching | |
# can't even memorize my own system of metadata handling | |
local tmp=${test:-0} | |
test=${map[2][0,$hashstrlen]} | |
check=${hashset[(Ie)$test]} # GETS LAST OCCURRENCE ACTUALLY..... | |
[ $batch_hashes = true -o $pass -gt 1 ] && { | |
[ $check -eq 0 ] && { # ?????????? | |
echo "${err}------------- why${rc}" 1>&2 | |
((i++)); continue | |
} | |
[ $check -eq $i ] && { | |
((i++)); continue | |
} | |
} | |
$batch_hashes || { | |
[ $check -eq 0 ] && { | |
hashset+=($test) | |
baseset+=("$f") | |
counters[$i]=0 | |
filter+=(0) | |
((i++)); continue | |
} | |
} | |
base="${baseset[$check]}" | |
map+=($( | |
$fsize "$f" "$base" | |
$inode "$f" "$base" | |
dsize | |
$mntpt "$f" "$base" | |
) # should reduce two stat calls into one(?) | |
"$base") | |
#for x in {1..${#map}}; do | |
# echo "map[$x] = ${map[$x]}" | |
#done | |
# why am i even doing (any (all) of) this????? | |
# The entry found for this hash is on a different mount point than $f. Walk
# the hash list backwards from that entry and look for another file with the
# same hash whose mount point equals that of $f; if one is found it replaces
# the source side of $map (path, mount point, inode, size).
[ ! "${map[10]}" = "${map[11]}" ] && {
	#[ $pass -eq 1 -a $batch_hashes = false ] && continue
	while true; do
		# plain arithmetic decrement; `check=((check--))` is parsed as an
		# array assignment, not as arithmetic, and never moves the index
		check=$((check - 1))
		[ $check -lt 1 ] && break
		[ $check -eq $i ] && continue
		[ ! $test = ${hashset[$check]} ] && continue
		ugh="${baseset[$check]}"
		# why[1] = size, why[2] = inode, why[4] = mount point
		# (why[3] is the header line that df prints)
		why=($(
			$fsize "$ugh"
			$inode "$ugh"
			$mntpt "$ugh"
		))
		[ ! "${map[10]}" = "${why[4]}" ] && continue
		# inode and size are compared afterwards by errgate
		map[12]="$ugh"
		map[11]="${why[4]}"
		map[7]="${why[2]}"
		map[5]="${why[1]}"
		# not tested yet
		break
	done
}
errgate ${map[@]} | |
ERR=$? | |
[ $pass -eq 1 ] && { | |
$batch_hashes && | |
filter[$i]=${ERR} || | |
filter+=(${ERR}) | |
} | |
[ $ERR -ne 0 ] && { | |
[ $pass -eq 1 -a $hide_errored = false ] && { | |
# Turn the status returned by errgate into the message shown to the user.
case ${ERR} in
	11) errstr="${map[10]}/.../${map[3]} cannot be linked to itself.";;
	# handle case insensitivity on windows, linux just doesn't have it for some reason
	12) errstr="Mount points do not match for "
		# append only the file pair; appending ${errstr} to itself printed
		# the sentence start twice
		errstr+="${map[10]}/.../${map[3]} and ${map[11]}/.../$(basename "$base")";;
	13) errstr="${map[10]}/.../${map[3]} is already hardlinked.";;
	14) errstr="[${hashset[$check]:0:15}, ${map[4]}] ${map[3]} and [${test:0:15}, ${map[5]}] "
		errstr+="$(basename "$base") have matching hashes but different size!!!!";;
	*) errstr="Uncaught error $ERR";;
esac
# there has to be a way to make this into an array thing instead | |
echo "${err}${errstr}${rc}" 1>&2 | |
} | |
((i++)); continue | |
} | |
$sanity_check && { | |
echo \ | |
"[33;1m$i[22m: ${test[0,32]} [90m(${map[4]}) ${rc}=" \ | |
"[36;1m$check[22m: ${hashset[$check][0,32]} [90m(${map[5]})${rc}" | |
} | |
s=${map[4]} | |
# prof=`$profile` | |
{ | |
# TODO: handle permission denied error just | |
# so pass number text isn't printed prematurely | |
lasttest=$tmp | |
[ "$f" -nt "$base" ] && { # absurd but muh archives/history reasons | |
target="$f" | |
source="$base" | |
swap=1 | |
} || { | |
target="$base" | |
source="$f" | |
swap=0 | |
} | |
[ $copycount -eq 0 -a $pass -eq 1 ] && echo 'Deduping...' | |
[ $batched -eq 0 -a $passes -gt 1 ] && echo "[92;1mPass ${pass}${rc}" | |
[ $pass -eq 1 ] && ((copycount++)) | |
case "$pass" in | |
1) ;& | |
2) dsize | read -r uhh | |
"$link" -f "$source" "$target" && { | |
rawphysd=$(($rawphysd + `dsize` - ${uhh})) | |
echo "[95;1m[${test:0:15}]${rc}" \ | |
"[97m${base:t}${rc}" \ | |
"[93;1m←${rc} [96;1m$f${rc}" | |
[ $pass -eq 1 ] && { | |
expected=$(($expected + $s)) | |
counters[$i]=$((${counters[$i]}+1)) | |
} | |
((batched++)) | |
# total=((total++)) | |
} | |
;; | |
3) ;& | |
4) ffs=0 | |
$fsutil && { | |
test2="`$wpath -m "$target"`" | |
point="${map[$((10+$swap))]}" | |
fsutil.exe hardlink list "$test2" >$NUL && { | |
ffs=0 | |
lasttest=$test | |
fsutil.exe hardlink list "$test2" | sed 's/\r//g; s/\\/\//g' | while read -r hl; do | |
map=($( | |
$wpath -u "${point}${hl}" && | |
dsize | |
)) && { | |
"$link" -f "${source}" "${map[1]}" && | |
ffs=$(($ffs+1)) && batched=$(($batched+1))&& # * # WHY NO ++ | |
rawphysd=$(($rawphysd + `dsize` - ${map[2]})) && { | |
[ $ffs -eq 1 ] && \ | |
echo "[95;1m[${test:0:15}]${rc}" \ | |
"[97m${source:t}${rc}" \ | |
'[90m<group merge>' | |
echo "[33;1m↑${rc} [96;1m${map[1]}${rc}" | |
} || false | |
} || echo "${err}Failed to hardlink ${hl}.${rc}" 1>&2 | |
done | |
[ $ffs -gt 0 ] | |
} || echo "${err}Cannot find hardlinks for $target.${rc}" 1>&2 | |
} || { | |
map=( | |
#"${map[$((10+$swap))]}" | |
"${map[$((11-$swap))]}" | |
#"${map[$((6+$swap))]}" | |
"${map[$((7-$swap))]}" | |
) && { | |
# NOT A GOOD IDEA!!!! | |
find "${map[1]}" -xdev ${sfind[@]} -inum ${map[2]} 2>$NUL | while read -r hl; do | |
uhh=`dsize` | |
"$link" -f "$source" "$hl" 2>$NUL && | |
ffs=$(($ffs+1)) && batched=$(($batched+1)) && # * MAKE FUNCTION | |
rawphysd=$(($rawphysd + `dsize` - ${uhh})) && { | |
[ $ffs -eq 1 ] && \ | |
echo "[95;1m[${test:0:15}]${rc}" \ | |
"[97m${source}${rc}" \ | |
'[90m<group merge>' | |
echo "[33;1m↑${rc} [96;1m${hl}${rc}" | |
} || echo "${err}Failed to hardlink ${hl}.${rc}" 1>&2 | |
done | |
[ $ffs -gt 0 ] | |
} || echo "${err}Cannot find hardlinks for $target.${rc}" 1>&2 | |
true | |
} | |
;; | |
esac | |
} | |
((i++)) | |
# profileend $prof | |
done | |
[ $batched -eq 0 ] && break | |
done | |
# hus BYTES: print BYTES in human-readable form.
# Below 1024 the value is printed as a whole number followed by "b".
# Otherwise it is divided by 1024 until it drops below 1024 and printed with
# two decimals plus the matching letter from $units ("kmgt") and "b".
# Returns 0 in both cases.
hus()
{
	local a u
	if [ $1 -lt 1024 ]; then
		printf '%.0fb\n' $1
		return 0
	fi
	a=$1
	u=0
	# subtracting just under 0.5 before %.0f turns rounding into truncation,
	# so 1023.9 still counts as "below 1024"; the loop also stops at the last
	# known unit instead of indexing past the end of $units
	until [ $(printf '%.0f' $(($a-0.499999999999))) -lt 1024 ] || [ $u -ge ${#units} ]; do # hate
		a=$(($a/1024.0))
		((u++))
	done
	# same trick for two decimals: subtract just under 0.005 to truncate
	printf '%.2f%sb\n' $(($a-0.004999)) ${units[$u]}
}
map=($( | |
hus $before | |
hus `dsize` | |
hus $expected | |
[ $rawphysd -lt 1 ] && echo || echo + | |
hus $rawphysd | |
)) | |
[ $copycount -gt 0 ] && echo '[92mDone![0m' | |
echo 'Free space:' | |
echo "Before: ${map[1]}" | |
echo "After : ${map[2]}" | |
echo "Expected space saved: ${map[3]}" | |
echo "Disk space difference: ${map[4]}${map[5]}" | |
echo "Found ${#hashset[@]} unique files, $copycount duplicates" | |
#read -rsn | |
exit 0 | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.TH dedupe 1 "25 Oct 2024" "0.5-25.10.24" | |
.SH NAME | |
dedupe | |
.SH SYNOPSIS
.B dedupe | |
[\fBFILES\fR] ... | |
.SH USAGE | |
Provide a list of at least two files that contain duplicates of one another to group them into hardlinks. | |
For deduping of an entire folder containing duplicate items without specifying every single file, add | |
.B **/*.ext | |
to the argument list to recursively search for matching files in the current directory. | |
It's recommended to run this on a singular drive at a time with folders of | |
old system backups or small to medium size archival data or old downloads. | |
.SH CONFIGURATION | |
Configuration is stored in the user's home directory at \fI~/.config/dk15/dedupe\fR | |
A list of acceptable values to change: | |
.TP | |
.BR passes " = " \fIcount\fR | |
Set the number of passes per command. \fIMinimum\fR = \fB1\fR, \fImaximum\fR = \fB4\fR. | |
For passes 1 and 2, the utility plainly hardlinks the files provided. | |
For passes 3 and 4 (fast on Windows via fsutil; elsewhere a much slower find-based search is used), the files' individual
hardlinks found on the current storage device are merged into the first copy | |
of said duplicates to ensure that there are not two or more separate hardlink | |
groups of the exact same file. | |
If there's not a single file that is processed on a pass, the utility will exit. | |
.TP | |
.\" actual suffering | |
.BR hash " = " \fIsum_prefix\fR | |
Set the preferred hashing function. Recommended functions are BLAKE2 (\fBb2\fR), | |
BLAKE3 (\fBb3\fR), or SHA256 (\fBsha256\fR). The name of the chosen function must match | |
the name of an existing executable that is suffixed with -sum, like the above examples.
.nr PI 2n | |
Only acceptable hash functions as of now (length, speed, security): | |
.\" how do i not have a second newline separating these | |
.IP * | |
\fBmd5\fR - 16 bytes, fast, weak | |
.IP * | |
\fBsha1\fR - 20 bytes, medium, weak | |
.IP * | |
\fBsha224\fR - 28 bytes, medium, okay | |
.IP * | |
\fBsha256\fR - 32 bytes, medium, strong | |
.IP * | |
\fBsha384\fR - 48 bytes, slow, strong | |
.IP * | |
\fBsha512\fR - 64 bytes, slow, strong | |
.IP * | |
\fBb2\fR (BLAKE2) - any length, fast, strong (64 bytes by default)
.IP * | |
\fBb3\fR (BLAKE3) - any length, very fast, strong (32 bytes by default)
.TP | |
.BR hash_length " = " \fIbyte_count\fR | |
BLAKE2 and BLAKE3 only. Set the preferred hash length, adjustable | |
in the case of handling an unusual amount of files at once. | |
.TP | |
.BR force_relink " = " [ \fI0\fR | \fI1\fR ] | |
Force the hardlinking of duplicates which have already been hardlinked. Only applies to pass 1. | |
.TP | |
.BR sanity_check " = " [ \fI0\fR | \fI1\fR ] | |
Print every file that is processed for deduplicating. | |
.TP | |
.BR hide_invalid " = " [ \fI0\fR | \fI1\fR ] | |
Don't print errors about files that don't exist or are blank. | |
.TP | |
.BR batch_hashes " = " [ \fI0\fR | \fI1\fR ] | |
Handle hashing all at once. | |
.B EXPERIMENTAL, DON'T TRY ON FILES THAT ARE IMPORTANT TO YOU. | |
.TP | |
.BR simulate_mode " = " [ \fI0\fR | \fI1\fR ] | |
Simulation mode, do not actually hardlink. | |
.TP | |
.BR hide_errored " = " [ \fI0\fR | \fI1\fR ] | |
Hide errors that display during pass 1. | |
.TP | |
.BR scramble_list " = " [ \fI0\fR | \fI1\fR ] | |
Scramble file list after parsing. For stress testing. | |
.TP | |
.BR process_gitdir " = " [ \fI0\fR | \fI1\fR ] | |
Don't ignore .git folders, bad idea unless the repo targeted is not | |
being modified and merely exists for the purpose of preservation. | |
.SH AUTHOR | |
donnaken15 <[email protected]> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# dedupe config | |
# | |
# Each pass processes the entire file list given to the command. | |
# Pass 1: Incrementally scan, index, hash each file, and check for duplicates to hardlink. | |
# Pass 2: Plainly hardlink exact copies of input files using the complete list of hashes. | |
# Pass 3, 4: Check input duplicate files that already share hardlinks, or just became | |
# hardlinks from the current execution, and merge them together with their sources. | |
# Separate groups of hardlinks of the same file are an issue which requires a complex explanation.
# Pass 1 and 2 are basically the same if batch_hashes is enabled. | |
passes = 4 | |
# Preferred hashing function, recommended: BLAKE2 or BLAKE3 | |
# The name of the chosen function must match the name of an | |
# existing executable that is suffixed with -sum. As such: | |
# SHA256: sha256sum, hash = sha256 | |
# BLAKE3: b3sum, hash = b3 | |
hash = b2 | |
# Preferred BLAKE hash length in bytes, adjustable in the | |
# case of handling an unusual amount of files at once. | |
hash_length = 20 | |
# Force hardlinking of duplicates which are already hardlinked. | |
# Only applies to pass 1. | |
force_relink = 0 | |
# Print every file that is processed. | |
sanity_check = 0 | |
# Don't print errors about files that don't exist or are blank. | |
hide_invalid = 0 | |
# Handle hashing all at once. | |
# EXPERIMENTAL, DON'T TRY ON FILES THAT ARE IMPORTANT TO YOU. | |
batch_hashes = 0 | |
# Simulation mode, do not actually hardlink. | |
# Will run through every pass however, to no effect. | |
simulate_mode = 0 | |
# Hide errors that display during pass 1. | |
hide_errored = 0 | |
# Scramble file list after parsing. For stress testing. | |
scramble_list = 0 | |
# Don't ignore .git folders, bad idea unless the repo targeted is not | |
# being modified and merely exists for the purpose of preservation. | |
process_gitdir = 0 |
repeatedly committing now because I'm testing on live ISO
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example logs: