donnaken15 · January 26, 2025 17:24 · donnaken15 · Aug 25, 2024
diff --git a/dedupe b/dedupe
 #!/bin/zsh
 # I'M ACTIVELY FORGETTING HOW MY OWN SCRIPT WORKS!!!!!!!!!!!!!!
 # Usage guard: with fewer than two arguments, and a first argument that
 # contains no glob character (* or ?), print the help text and exit 1.
 # A single quoted glob pattern is allowed through on its own.
 if (( $# < 2 )) && [[ "$1" != *[\*\?]* ]]; then
 	if (( $# == 1 )); then
 		echo 'you must specify more than one file to be deduped'
 		echo
 	fi
 	echo 'dedupe [input files]'
 	echo '- replace multiple unchanging copies of'
 	echo '  the same files with hardlinks to save space'
 	echo '- as of now, it is recommended to execute this'
 	echo '  only on files that exist on a singular device'
 	exit 1
 fi
 # WSL SCREWS UP WINDOWS PATHS THAT AREN'T ENTERED WITH QUOTES
 # Define userval NAME DEFAULT, the helper every setting below goes through.
 #
 # Normal run: install a plain "export NAME=DEFAULT" fallback, then try to
 # source the dotload utility with 'dedupe.conf'.
 # NOTE(review): dotload is presumably expected to replace userval with a
 # config-aware version; its behavior is not visible here.
 #
 # password=alpine (PKGBUILD check()): userval reads test_<NAME> instead and
 # falls back to DEFAULT when that variable is unset or empty.
 #
 # This is an explicit if/else rather than `A && { ... } || { ... }`: with the
 # latter, a missing or failing dotload made the first group return non-zero,
 # which ran the PKGBUILD branch too and overwrote the fallback userval.
 if [ ! "${password}" = "alpine" ]; then
 	userval() { export "$1"="$2"; } # fallback function if below utility doesn't exist
 	dotload_namespace=dk15
 	dotload=$(command -v dotload || echo /usr/share/dk15/dotload) && source "${dotload}" 'dedupe.conf' 2>/dev/null
 else # for PKGBUILD check()
 	userval() {
 		local test="test_${1}" # :/
 		export "$1"="${(P)test:-${2}}"
 	}
 fi
 # Lookup table turning an exit status into a command name: it is indexed with
 # $((1+$?)), so status 0 selects "true" and status 1 selects "false"
 # (zsh arrays are 1-based).
 bool=(true false)
 # makes me wonder if direct binary string comparison will also work somehow, OCD'ing because of memory usage
 # Default number of passes handed to userval below.
 mp=4
 userval passes $mp
 userval hash b2
 userval hash_workers 8
 # Names of the 0/1 switches. Each one is given a default of 0 through userval
 # and then re-exported as the word "true" or "false", so later code can run
 # the variable itself as a command ($sanity_check && ...).
 bools=(force_relink sanity_check hide_invalid batch_hashes simulate_mode scramble_list hide_errored process_gitdir)
 for b in ${bools}; do
 	userval $b 0
 	# ${(P)b} reads the variable whose name is stored in $b.
 	[ ${(P)b} -ne 0 ];
 	export "$b=${bool[$((1+$?))]}"
 	# BRANCH NOT EQUAL AMIRITE AHAHHAHAHAHAHAAAHAHHAHHHAHHHAHAHAH
 done
 # In simulation mode the link command is the no-op function nop instead of ln.
 $simulate_mode && link=nop || link=ln
 # With sanity_check on, dump the effective configuration.
 $sanity_check && {
 	echo 'Configuration:'
 	for c in passes hash hash_workers ${bools}; do
 		echo $c = ${(P)c}
 	done
 }
 # Stand-in for ln in simulation mode: ignores its arguments and succeeds.
 nop() {return 0}
 # Redirection target used throughout to discard output.
 NUL='/dev/null'
 # Colour prefix for error messages and the matching reset suffix.
 err='[91;1m'
 rc='[0m'
 # passes must lie in 1..4; anything else (including a value the numeric test
 # rejects) is reported and replaced by the default $mp.
 [ $passes -ge 1 -a $passes -le 4 ] || {
 	echo "Invalid config: passes = ${passes}. Setting to ${mp}." 1>&2
 	passes=$mp
 }
 # Command prefix used to probe whether a program is available.
 where=('command' '-v')
 # fsutil becomes "true" when fsutil.exe can be found, "false" otherwise.
 $where fsutil.exe >$NUL
 fsutil=${bool[$((1+$?))]}
 # Without fsutil.exe, passes 3 and 4 use a slower search; warn up front.
 [ $passes -ge 3 -a $fsutil = false ] && {
 	echo '[91;1mWARNING: Passes 3 and 4 may take a while to find and' 1>&2
 	echo 'merge groups of hardlinks using the inputs provided![0m' 1>&2
 }
 # Make sure the selected hashing program (<hash>sum) is installed. If it is
 # not, fall back to sha256; if the fallback is missing as well, abort.
 $where "${hash}sum" >$NUL || {
 	echo "${err}Cannot find ${hash}sum as a hashing program.${rc}" 1>&2
 	# Switch before printing so the message names the fallback, not the
 	# program that was just reported as missing.
 	hash=sha256
 	echo "[90mFaulting to ${hash}.${rc}" 1>&2
 }
 $where "${hash}sum" >$NUL || {
 	echo "${err}Fallback hashing program ${hash}sum does not exist. Aborting...${rc}" 1>&2
 	exit 1
 }
 # Digest size in bytes for each fixed-length hash name accepted in the "hash"
 # setting. b2 and b3 are absent because their length comes from hash_length.
 typeset -A hashsizes=( # raw byte lengths
 	[ck]=4
 	[md5]=16
 	[sha1]=20
 	[sha224]=28
 	[sha256]=32
 	[sha384]=48
 	[sha512]=64
 )
 # check local changes since i last pushed this
 # curl -LsSf gist.githubusercontent.com/donnaken15/f95e8a143bb330fcf7d6268a4d6929e8/raw/dedupe | git diff --no-index - dedupe
 # Build the hashing command line (basecmd) and the expected digest length.
 #   hash_params  options passed to the hashing program
 #   hash_length  digest length in bytes
 #   hashstrlen   digest length in hex characters (2 per byte)
 #   basecmd      program + options + '--', ready to take file names
 hash_params=()
 # Every program except b3sum is given -b.
 [ ! "$hash" = b3 ] && hash_params+=(-b)
 [ "$hash" = b2 -o "$hash" = b3 ] && {
 	userval hash_length 10 # bytes, being conservative for big files
 	[ "$hash" = b2 ] && {
 		# b2sum is given the length in bits (bytes << 3).
 		hash_params+=(-l $((${hash_length}<<3)))
 	} || {
 		#hash_threads=1 --num-threads ${hash_threads}
 		hash_params+=(-l ${hash_length})
 		# hashing a bunch of (usually small) files that use 16 threads
 		# each feels iffy, should scale up with file size
 		# makes me wonder now if it could use spare threads
 		# for processing other files somehow when multiple
 		# files are put in the input command
 	}
 } || {
 	[ -z ${hashsizes[$hash]} ] && {
 		echo "${err}Unrecognized hash function: ${hash}${rc}" 1>&2
 		# Switch before printing so the message names the fallback instead of
 		# repeating the unrecognized name.
 		hash=sha256
 		echo "[90mFaulting to ${hash}.${rc}" 1>&2
 	}
 	hash_length=${hashsizes[$hash]:-16}
 }
 hashstrlen=$((${hash_length}<<1))
 [ "$hash" = ck ] && {
 	# cksum has no <hash>sum-style output, so it is wrapped in a zsh -c script
 	# that prints "%08x *name" lines like the other programs.
 	hash_params=(-c 'cksum -a crc "$@" | cut -d" " -f1,3- | while read -r c f; do; printf "%08x *%s\n" $c "$f"; done') # AUTISM FUEL
 	basecmd=(zsh)
 } || {
 	basecmd=("${hash}sum")
 }
 basecmd+=(${hash_params} '--')
 # Parallel arrays indexed by file position:
 #   hashset   hash recorded for each file
 #   baseset   path recorded alongside each hashset entry
 hashset=()
 baseset=()
 counters=() # for skipping unique files in subsequent passes that had no duplicates counted up
 filter=() # 0 = if pass>1, don't process; 1 = retry on later passes
 # "skip" advances the loop index i and moves on to the next iteration; it is
 # only meaningful inside the loops that maintain i by hand.
 alias skip='{ ((i++)); continue }'
 # dsize — print the available space reported by df, in bytes.
 # Takes the last field of df's output (the --total row, with tmpfs and
 # devtmpfs excluded) and multiplies it by 1024.
 # NOTE(review): assumes df reports 1 KiB blocks — confirm for your df.
 dsize()
 {
 	local avail_blocks=($(df -xtmpfs -xdevtmpfs --output=avail --total 2>$NUL)) &&
 		echo $((${avail_blocks[-1]} * 1024)) # absurd
 }
 # Command prefixes for per-file metadata, kept as arrays so they can be
 # invoked directly as $fsize FILE..., $inode FILE..., $mntpt FILE...
 fsize=(stat -Lc%s) # attempts to reduce amount of new processes, but substitution will always require it
 inode=(stat -Lc%i) # location of the data in the raw drive
 mntpt=(df --output=target) # AYFS STAT ALREADY HAS %m
 # replace with stat -Lc"%s${lf}%i${lf}%m" soon
 # sfind: extra find(1) arguments that exclude system directories from the
 # inode search, as a flat list of "! -path <dir>/*" triples.
 sfxlp=('/proc' '/dev' '/tmp' '/var/tmp' '/sys' '/boot')
 sfind=()
 for f in "${sfxlp[@]}"; do
 	# The pattern must not carry literal quote characters: the array is passed
 	# to find without another round of shell parsing, so embedded quotes would
 	# become part of the pattern and it would never match a real path.
 	sfind+=(! -path "${f}/*")
 done
 unset sfxlp
 #profile=(date '+%s%N')
 #profileend() { echo "profile script" $(($(($($profile) - $1)) / 1000000000.0)) }
 # Build two parallel, lower-cased tables from the output of mount:
 #   dls[n]  a drive designator such as "c:"
 #   mps[n]  the directory it is mounted on (/cygdrive/x, /mnt/x or /x)
 # Lines of mount output that do not match the sed pattern are ignored, so
 # both arrays stay empty when no such mounts are listed.
 dls=() # drive letters
 mps=() # mount points
 (mount | sed -n 's/^\(\w:\)\\\?\son\s\(\/cygdrive\/\w\|\/mnt\/\w\|\/\w\).*/\1 \2/p') |
 while read -r lt; do
 	# Split the "X: /mount/point" line on the space.
 	l=(${(@s: :)lt}) # absurd
 	dls+=(${l[1]:l}) # C:
 	mps+=(${l[2]:l}) # /cygdrive/c/
 done
 # wsl_path FLAG PATH — path converter used when wslpath is the only tool.
 #   $1  conversion flag; -u is answered locally where possible, anything
 #       else is forwarded to wslpath
 #   $2  path to convert
 # Prints the converted path on stdout. If the final wslpath call fails, a
 # "MISHANDLED PATH" note goes to stderr and the original path is printed.
 #
 # Paths that do not start with "/" go straight to wslpath, except for the
 # NUL <-> /dev/null pair. Paths starting with "/" are first rewritten onto a
 # mount point from the mps/dls tables.
 wsl_path() # i hate linux
 {
 	local l1="${2:l}"
 	# esc and i are local so that a direct call cannot clobber the caller's
 	# loop index i.
 	local test esc i
 	[[ ! "$l1" = /* ]] && {
 		[ "$1" = '-u' -a "$l1" = 'NUL' ] && {
 			echo "$NUL"
 			return
 		}
 		[ ! "$1" = '-u' -a "$l1" = "$NUL" ] && {
 			echo 'NUL'
 			return
 		}
 		wslpath $1 "$2"
 		return
 	} || {
 		# /cygdrive/X/...: graft everything after "/cygdrive/" (10 characters)
 		# onto the first mount point minus its last character. The slice must
 		# come from the path ($2); slicing the flag ($1) always gave an empty
 		# string.
 		# NOTE(review): assumes all drive mounts share mps[1]'s prefix — confirm.
 		[[ "$l1" = /cygdrive/* ]] &&
 			test=${mps[1][1,-2]}${2:10} ||
 		{
 			esc="${2//\\//}"
 			for i in {1..${#dls}}; do
 				[[ "$l1" = /mnt${mps[$i][-2,-1]}* ]] && { # wsl default
 					test="${mps[$i][1,-2]}${esc:5}"
 					break
 				}
 				[[ "$l1" = ${mps[$i][-2,-1]}/* ]] && { # msys
 					test="${mps[$i]}/${esc:3}"
 					break
 				}
 				[[ "$l1" = ${dls[$i]}* ]] && {
 					test="${mps[$i]}${esc:2}"
 					break
 				}
 				[[ "$l1" = ${mps[$i]}* ]] && {
 					test="${mps[$i]}${esc:${#mps[$i]}}"
 					break
 				}
 			done
 			# No table entry matched: keep the path as given.
 			test="${test:-$2}"
 		}
 		[ "$1" = '-u' ] && {
 			echo "$test"
 			return
 		}
 	}
 	wslpath $1 "$test" || {
 		echo "[31mMISHANDLED PATH: $test${rc}" 1>&2
 		echo "$2"
 	}
 }
 # wpathfail FLAG PATH — identity converter for systems that have neither
 # cygpath nor wslpath: prints PATH unchanged. printf is used instead of
 # echo -n so backslashes in PATH are never interpreted as escapes.
 wpathfail() { printf '%s' "$2"; }
 # Older name of the same helper, kept so any existing caller still works.
 fail() { wpathfail "$@"; }
 # Pick the path converter: cygpath if installed, else the wsl_path wrapper
 # when wslpath exists, else the identity converter. The last choice used to
 # name a function that was never defined.
 wpath=`$where cygpath` || {
 	$where wslpath >$NUL && wpath=wsl_path || wpath=wpathfail
 }
 # TODO: support b3sum windows, doesn't use unix paths
 # Unit letters picked by hus when formatting sizes.
 units="kmgt"
 #total=0
 # Run-wide state for the dedupe loop and the final summary.
 lasttest=0
 copycount=0  # entries of counters that ended up non-zero
 failcount=0  # files rejected or failed while processing
 expected=0   # sum of the sizes of files linked during pass 1
 rawphysd=0   # accumulated change in dsize measured around each link
 # From here on, word splitting happens on newlines only.
 lf=$'\n'
 IFS=$lf
 echo "[97mdedupe - Building file list...${rc}"
 # should probably also generally avoid deduping files less than at least 8 bytes for whatever reason, in
 # the case of like blank text files which would have one newline (LF or CR LF) or something
 # this is why you should be selective with what files you want to dedupe, like certain extensions
 # prep_check PATH — decide whether PATH is a usable dedupe candidate.
 # Returns:
 #   0  PATH can be processed
 #   1  PATH does not exist or is a directory
 #   2  PATH is empty (zero size)
 #   3  PATH is not writable
 prep_check() # i don't want to have to rely on this as its own function
 {
 	local candidate="$1"
 	if [[ ! -e "$candidate" || -d "$candidate" ]]; then
 		return 1
 	fi
 	if [[ ! -s "$candidate" ]]; then
 		return 2
 	fi
 	if [[ ! -w "$candidate" ]]; then
 		return 3
 	fi
 	return 0
 }
 # prognums CURRENT TOTAL [OFFSET] — print a "(current/total)" progress
 # marker followed by a carriage return and no newline, so the next marker
 # overwrites it. OFFSET (default 0) is subtracted from both numbers.
 prognums()
 {
 	local offset=${3:-0}
 	printf '(%s/%s)\r' "$(($1-${offset}))" "$(($2-${offset}))"
 }
 # Build infiles from the command-line arguments.
 #   l  work list, seeded with the arguments; glob matches are appended to it
 #   k  number of glob-pattern arguments seen (subtracted from progress output)
 #   i  1-based index into l
 # Each entry is either expanded (quoted glob pattern), skipped (.git path
 # unless process_gitdir is on, or rejected by prep_check) or appended to
 # infiles.
 l=("$@")
 k=0
 i=1
 set --
 # Messages for prep_check return codes 1..3.
 preperrs=('not a file' 'blank' 'not writeable')
 while [ $i -le ${#l} ]; do
 	f="${l[$i]}"
 	# Start every entry from a clean slate. Without this, the first absolute
 	# path left abs=true for the rest of the run, so every later relative path
 	# was also pushed through the path converter and realpath.
 	abs=false
 	ff=''
 	prognums $i ${#l} $k
 	[[ "$f" == */.git/* ]] && { $process_gitdir || skip }
 	[[ "$f" == *[\*\?]* ]] && { # evade "argument list too long" with this, have to wrap the glob in quotes though
 		echo "[93mGot glob pattern: $f${rc}"
 		((k++)); prognums $i ${#l} $k # FIX: upper limit goes over final length of infiles array
 		#l+=(${~f})
 		# need to split base path and pattern
 		# for now: (cd /.../.../ && dedupe '**/*.*')
 		# A pattern containing ** searches recursively; otherwise only the
 		# starting directory itself is searched.
 		[[ "$f" == *\*\** ]] && recurse=() || recurse=(-maxdepth 1) # >:(
 		find ${recurse} -wholename "$f" -print 2>$NUL | while read -r ff; do # just to not look frozen for >1 minute
 			[ -d "$ff" ] && continue
 			[[ "$ff" == */.git/* ]] && { $process_gitdir || continue }
 			l+=("$ff"); prognums $i ${#l} $k
 		done
 		skip
 	}
 	# A path is absolute when it starts with "/" or with a known drive letter.
 	[[ "$f" == /* ]] && abs=true || {
 		for j in {1..${#dls}}; do # check for windows path
 			[[ "${f:l}" = ${dls[$j]}* ]] && {
 				abs=true
 				break
 			}
 		done
 	}
 	# speed up when paths are relative, mount point stuff should still get handled just fine
 	# but i know stupid programming will try to subvert my expectations by not working at all, as usual
 	${abs:-false} && ff="`realpath -s "$($wpath -u "$f")"`"
 	((i++))
 	prep_check "${ff:-$f}" || {
 		ERR=$?; $hide_invalid || echo "${err}$f is ${preperrs[$ERR]}.${rc}" 1>&2
 		continue
 	}
 	infiles+=("${ff:-$f}")
 done
 # The work list is no longer needed once infiles is built.
 unset l k f
 # At least two usable files are required. The message is chosen by the
 # number of files found: 0 -> 'There are no', 1 -> 'Not entered enough'.
 argerrs=('There are no' 'Not entered enough')
 [ ${#infiles} -le 1 ] && {
 	echo "${err}${argerrs[$((1+${#infiles}))]} files to process.[0m" 1>&2
 	exit 1
 }
 # Optional stress test: replace infiles with a random permutation of itself.
 # shuf emits the numbers 1..N in random order; the k-th number read is the
 # new position of the k-th file.
 $scramble_list && {
 	# for stress testing this script, because of zsh's handling of arrays and items.....
 	# and because this script is stupid and a total broken POS
 	local test=()
 	local scramble_status=()
 	echo "\n[35mScrambling list...${rc}"
 	i=1
 	shuf -i "1-${#infiles}" | while read -r s; do
 		test[$s]="${infiles[$i]}"
 		echo -n "($i/${#infiles})"$'\r'
 		((i++))
 	done
 	# zsh exposes per-stage exit codes as the lowercase array pipestatus, and
 	# any later command overwrites it, so it is captured immediately. The
 	# uppercase PIPESTATUS used before is not set by zsh, which made the guard
 	# a no-op and emptied infiles whenever shuf was missing or failed.
 	scramble_status=(${pipestatus})
 	# Only adopt the shuffled list when both stages succeeded and no entry
 	# was lost; otherwise the original order is kept.
 	[ ${scramble_status[1]:-1} -eq 0 -a ${scramble_status[2]:-1} -eq 0 -a ${#test} -eq ${#infiles} ] &&
 		infiles=(${test})
 }
 #perf=$(date +%s.%N)
 # Batch mode: hash every file up front with hash_workers parallel processes
 # and fill hashset/filter/counters at the matching infiles index.
 $batch_hashes && {
 	echo '\n[96mHashing files...[0m'
 	i=1
 	# Offset of the file name inside each output line: the hex digest plus two
 	# separator characters.
 	why=$(($hashstrlen+2))
 	# With IFS set to NUL, "${infiles}" is handed to xargs -0 as one
 	# NUL-separated stream.
 	IFS=$'\0'
 	# Files per worker, as a float; xargs -n receives its integer part.
 	fpw=$(((${#infiles}*1.0)/$hash_workers))
 	# NOTE(review): when there are fewer files than workers this recomputes
 	# fpw from its reciprocal — confirm the intended batch size.
 	[ $(($fpw >> 0)) -lt 1 ] && fpw=$((((1/$fpw)+0.499999999999)>>0))
 	echo -n "${infiles}" | xargs -n $(($fpw>>0)) -P $hash_workers -0 ${basecmd} | while read -r test; do
 	#	VARNAME[index]=() think
 	#	[[ "${test:l}" == *"argument list too long"* ]] && { echo '--------- why'; break }
 	#	[[ "$test" == "${basecmd[1]}: "* ]] && { echo "${err}$test$rc" 1>&2; continue }
 	#	[[ "$test" == ": "* ]] && continue # STUPID!!!!!!
 		# Find the infiles index whose value exactly equals the file name part
 		# of this output line, then store the digest part at that index.
 		j="${infiles[(Ie)${test:$why}]}"
 		hashset[$j]="${test[0,$hashstrlen]}"
 		filter[$j]=0
 		counters[$j]=0
 		prognums $i ${#infiles}
 		((i++))
 	done
 	# Exit codes of the pipeline stages; only referenced by the disabled check below.
 	die=(${pipestatus})
 	# i was advanced once per output line, so any shortfall against the number
 	# of input files is reported as missed.
 	missed=$((${#infiles}-$i+1))
 	[ $missed -gt 0 ] && echo "[91mMissed files: $missed${rc}"
 #	[ \( ${die[2]} -le 127 -o ${die[2]} -ge 123 \) -o ${#hashset} -ne ${#infiles} ] && {
 #		# hoping this is safe when assigning indexes instead
 #		# capture error log from above to print here
 #		batch_hashes=false
 #		IFS=,
 #		#echo "\n${err}Batched hash list failed to match the number of files scanned: ${#hashset} = ${#infiles}${rc}" 1>&2
 #		echo "\n${err}${hash}sum sucks, or your files do. ($die) (${#hashset} = ${#infiles}) ${rc}"
 #		hashset=()
 #		baseset=()
 #		filter=()
 #		counters=()
 #	} || baseset=(${infiles}) # should use ${(P)name} instead
 	# In batch mode the base list is simply the input list.
 	baseset=(${infiles}) # should use ${(P)name} instead
 	unset i
 	# Restore newline-only splitting.
 	IFS=$lf
 }
 #echo "------------- autistic: $(($(date +%s.%N)-$perf))"
 # errgate — decide whether the pair described by the global map array may be
 # hardlinked. Reads map, force_relink and pass; arguments are ignored.
 # Returns:
 #   0   the pair may be linked
 #   11  source path (map[12]) and target path (map[1]) are the same string
 #   12  mount points (map[10], map[11]) differ
 #   13  inodes (map[6], map[7]) are equal, i.e. already hardlinked; this is
 #       waived only when force_relink is on and this is pass 1
 #   14  sizes (map[4], map[5]) differ
 errgate()
 {
 	local relink_status
 	if [ "${map[12]}" = "${map[1]}" ]; then
 		return 11
 	fi
 	# try to utilize KSH [ extension instead: -ef
 	# also applies to plain symlink :/
 	if [ ! "${map[10]}" = "${map[11]}" ]; then
 		return 12
 	fi
 	# force_relink holds the word "true" or "false"; running it yields 0 or 1.
 	$force_relink
 	relink_status=$?
 	if [ \
 		${map[6]} = ${map[7]} -a \( \
 			$relink_status -eq 1 -o \( $relink_status -eq 0 -a $pass -gt 1 \) \
 		\) \
 	]; then
 		return 13
 	fi
 	if [ ! ${map[4]} = ${map[5]} ]; then
 		return 14
 	fi
 	return 0
 }
 # printf format for the per-file sanity_check line. Its arguments are: index,
 # highlight digit, counter, short hash, filter value, three colour components
 # taken from the hash, and the file name.
 sanity_fmt="[36m%6d[37m ([%d3m%2d[37m) [[35m%7s[37m] ([90m$err%s[37;22m): [38;2;%d;%d;%dm%s$rc\n"
 # printf format for the "file = match" line: one "index: hash (size)" group
 # is defined first and then used twice, joined by " = ".
 sanity_link_fmt='%6d[22m: %s [90m([37m%9s[90m)[0m'
 sanity_link_fmt="[36;1m$sanity_link_fmt = [33;1m$sanity_link_fmt\n"
 test=''
 # Free space before any linking, kept for the final summary.
 dsize | read -r before
 echo "[97m(${#infiles} files)${rc}"
 for pass in {1..$passes}; do
 	batched=0
 	i=1
 	$sanity_check && echo "[33mNew pass: $pass${rc}"
 	for f in "${infiles[@]}"; do
 		hash=${hashset[$i]}
 		[ ${counters[$i]:-0} -gt 0 ] && hilite=9 || hilite=3
 		$sanity_check &&
 			$batch_hashes && {
 				printf "$sanity_fmt" \
 					$i $hilite ${counters[$i]:--} \
 					"${hash:0:7}" "${filter[$i][2]:-[92m-}" \
 					"0x${hash:0:2}" "0x${hash:2:2}" "0x${hash:4:2}" $f 2>$NUL
 			}
 		[ \
 			$pass -gt 1 -a \( \
 				${filter[$i]:-0} -ne 0 -o ${counters[$i]:-0} -le 0 \
 			\) \
 		] && skip
 		map=("$f" "" "${f:t}")
 		[ \
 			\( $batch_hashes = true -a ! -z "${hash}" \) -o \
 			\( $batch_hashes = false -a $pass -gt 1 \) \
 		] && map[2]="${hash}" || map[2]="`${basecmd} "$f"`"
 		# imagining someone or something will rename it midway through hash batching
 		# NOW IT'S CAUSING PROBLEMS!!!!!
 	#	prep_check "$f" || {
 	#		ERR=$?
 	#		[ $pass -eq 1 ] && {
 	#			filter+=(${ERR})
 	#			echo "${err}[${ERR}] Invalid file $f.${rc}" 1>&2
 	#			((failcount++))
 	#		}
 	#		skip
 	#	}
 		# map, for substitution speed and earlier function:
 		# [1] = target, this file
 		# [2] = hash
 		# [3] = basename
 		# target and source metadata
 		# [4,5] = file sizes
 		# [6,7] = inodes
 		# [8] = free space
 		# [10,11] = mount points
 		# [12] = source, matching
 		# can't even memorize my own system of metadata handling
 		local tmp=${test:-0}
 		test=${map[2][0,$hashstrlen]}
 		check=${hashset[(Ie)$test]} # GETS LAST OCCURRENCE ACTUALLY.....
 	#	$sanity_check && echo "[$test] $i = $check"
 		$sanity_check && ! $batch_hashes && {
 			printf "$sanity_fmt" \
 				$i $hilite ${counters[$i]:--} \
 				"${map[2]:0:7}" "${filter[$i][2]:-[92m-}" \
 				"0x${map[2]:0:2}" "0x${map[2]:2:2}" "0x${map[2]:4:2}" "$f" 2>$NUL
 		}
 		[ $batch_hashes = true -o $pass -gt 1 ] && {
 			[ $check -eq 0 ] && { # ??????????
 				echo "${err}------------- why ($f)${rc}" 1>&2
 				((failcount++))
 				skip
 			}
 			[ $check -eq $i ] && skip
 		}
 		$batch_hashes || {
 			[ $check -eq 0 ] && {
 				hashset+=($test)
 				baseset+=("$f")
 				counters[$i]=0
 				filter+=(0)
 				skip
 			}
 		}
 		base="${baseset[$check]}"
 		# optimize this and change indexes, not even using dsize here
 		map+=($(
 			$fsize "$f" "$base"
 			$inode "$f" "$base"
 			dsize
 			$mntpt "$f" "$base"
 		)
 			"$base")
 	#	$sanity_check && for x in {1..${#map}}; do
 	#		echo "map[$x] = ${map[$x]}"
 	#	done
 		# why am i even doing (any (all) of) this?????
 		[ ! "${map[10]}" = "${map[11]}" ] && {
 			# ignore matches from other devices and keep searching
 			# this needs more extensive testing for safety
 		#	[ $pass -eq 1 -a $batch_hashes = false ] && continue
 		#	{
 			#	$sanity_check && echo 'horrible area'
 				while true; do
 					((check--))
 					[ $check -lt 1 ] && break
 					[ $check -eq $i ] && continue
 					[ ! $test = ${hashset[$check]} ] && continue
 				#	$sanity_check && echo "Matched hash $test ${hashset[$check]}"
 					ugh="${baseset[$check]}"
 					why=($(
 						$fsize "$ugh"
 						$inode "$ugh"
 						$mntpt "$ugh"
 					))
 				#	$sanity_check && echo "$i, $check, ${#hashset} $test ${hashset[$check]}, ${map[10]} = ${why[4]}"
 					[ ! "${map[10]}" = "${why[4]}" ] && continue
 				#	$sanity_check && echo Matched mount point "${map[10]}" = "${why[4]}"
 				#
 				#	# already handled in errgate
 				#	[ ! "${map[6]}" = "${why[2]}" ] && continue
 				#	$sanity_check && echo Matched inode "${map[6]}" = "${why[2]}"
 				#	[ ! "${map[4]}" = "${why[1]}" ] && continue
 				#	$sanity_check && echo Matched size "${map[4]}" = "${why[1]}"
 					map[12]="$ugh"
 					map[11]="${why[4]}"
 					map[7]="${why[2]}"
 					map[5]="${why[1]}"
 					base="$ugh" # god sake
 					# not tested yet
 					break
 				done
 		#	}
 		}
 		errgate ${map[@]}
 		ERR=$?
 		[ $pass -eq 1 ] && {
 			$batch_hashes && filter[$i]=${ERR} || filter+=(${ERR})
 		}
 		# errgate rejected this pair: explain why (pass 1 only, unless
 		# hide_errored is on), count the failure and move on to the next file.
 		[ $ERR -ne 0 ] && {
 			[ $pass -eq 1 -a $hide_errored = false ] && {
 				case ${ERR} in
 					11)	errstr="${map[10]}/.../${map[3]} cannot be linked to itself.";;
 						# handle case insensitivity on windows, linux just doesn't have it for some reason
 					12)	errstr="Mount points do not match for "
 						# Append only the two paths; appending ${errstr} to itself
 						# printed the "Mount points do not match for" prefix twice.
 						errstr+="${map[10]}/.../${map[3]} and ${map[11]}/.../$(basename "$base")";;
 					13)	errstr="${map[10]}/.../${map[3]} is already hardlinked.";;
 					14)	errstr="[${hashset[$check]:0:15}, ${map[4]}] ${map[3]} and [${test:0:15}, ${map[5]}] "
 						errstr+="$(basename "$base") have matching hashes but different size!!!!";;
 					*)	errstr="Uncaught error $ERR";;
 				esac
 				# there has to be a way to make this into an array thing instead
 				echo "${err}${errstr}${rc}" 1>&2
 			}
 			((failcount++))
 			skip
 		}
 		$sanity_check && printf "$sanity_link_fmt" $i "${test[0,16]}" "${map[4]}" $check "${hashset[$check][0,16]}" "${map[5]}"
 		s=${map[4]}
 	#	prof=`$profile`
 		{
 			# TODO: handle permission denied error just
 			# so pass number text isn't printed prematurely
 			lasttest=$tmp
 			[ "$f" -nt "$base" ] && { # absurd but muh archives/history reasons
 				target="$f"
 				source="$base"
 				swap=1
 			} || {
 				target="$base"
 				source="$f"
 				swap=0
 			}
 			[ $batched -eq 0 ] && {
 				[ $pass -eq 1 ] && echo 'Deduping...'
 				[ $passes -gt 1 ] && echo "[92;1mPass ${pass}${rc}"
 			}
 			case "$pass" in
 				1)	;&
 				2)	dsize | read -r uhh
 					"$link" -f "$source" "$target" && {
 						rawphysd=$(($rawphysd + `dsize` - ${uhh}))
 						echo "[95;1m[${test:0:15}]${rc}" \
 							"[97m${base:t}${rc}" \
 							"[93;1m←${rc} [96;1m$f${rc}"
 						[ $pass -eq 1 ] && {
 							expected=$(($expected + $s))
 							counters[$i]=$((${counters[$i]}+1))
 						}
 						((batched++))
 					#	((total++))
 					}
 					;;
 				3)	;&
 				4)	ffs=0
 					# realizing now this must be optimized to not have to hardlink every
 					# single already hardlinked file with only incrementally hardlinking
 					# another file each time and instead do it all in one iteration only
 					$fsutil && {
 						test2="`$wpath -m "$target"`"
 						point="${map[$((10+$swap))]}"
 						fsutil.exe hardlink list "$test2" >$NUL && {
 							ffs=0
 							lasttest=$test
 							fsutil.exe hardlink list "$test2" | sed 's/\r//g; s/\\/\//g' | while read -r hl; do
 								map=($(
 									$wpath -u "${point}${hl}" &&
 									dsize
 								)) && {
 									"$link" -f "${source}" "${map[1]}" &&
 									ffs=$(($ffs+1)) && batched=$(($batched+1)) && # * # WHY NO ++
 									rawphysd=$(($rawphysd + `dsize` - ${map[2]})) && {
 										[ $ffs -eq 1 ] && \
 											echo "[95;1m[${test:0:15}]" \
 												"[97m${source:t}" \
 												'[90m<group merge>'
 										echo "[33;1m↑${rc} [96;1m${map[1]}${rc}"
 									} || false
 								} || echo "${err}Failed to hardlink ${hl}.${rc}" 1>&2
 							done
 							counters[$i]=$((${counters[$i]}+1))
 							[ $ffs -gt 0 ]
 						} || echo "${err}Cannot find hardlinks for $target.${rc}" 1>&2
 					} || {
 						map=(
 							#"${map[$((10+$swap))]}"
 							"${map[$((11-$swap))]}"
 							#"${map[$((6+$swap))]}"
 							"${map[$((7-$swap))]}"
 						) && {
 							# NOT A GOOD IDEA!!!!
 							find "${map[1]}" -xdev ${sfind[@]} -inum ${map[2]} 2>$NUL | while read -r hl; do
 								uhh=`dsize`
 								"$link" -f "$source" "$hl" 2>$NUL &&
 								ffs=$(($ffs+1)) && batched=$(($batched+1)) && # * MAKE FUNCTION
 								rawphysd=$(($rawphysd + `dsize` - ${uhh})) && {
 									[ $ffs -eq 1 ] && \
 										echo "[95;1m[${test:0:15}]" \
 											"[97m${source}" \
 											'[90m<group merge>'
 									echo "[33;1m↑${rc} [96;1m${hl}${rc}"
 								} || echo "${err}Failed to hardlink ${hl}.${rc}" 1>&2
 							done
 							counters[$i]=$((${counters[$i]}+1))
 							[ $ffs -gt 0 ]
 						} || echo "${err}Cannot find hardlinks for $target.${rc}" 1>&2
 						true
 					}
 					;;
 			esac
 			counters[$check]=${counters[$i]}
 		}
 		((i++))
 	#	profileend $prof
 	done
 	[ $batched -eq 0 ] && break
 done
 # hus BYTES — print BYTES in human-readable form on stdout.
 # Values below 1024 are printed as "<n>b" and the function returns 1.
 # Larger values are divided by 1024.0 until they fall below 1024, then printed
 # with two decimals and the unit letter at position u of $units (1 = k).
 # Note: a and u are not declared local.
 hus()
 {
 	[ $1 -lt 1024 ] && {
 		printf '%.0fb\n' $1
 		return 1
 	}
 	a=$1
 	u=0
 	# NOTE(review): the "- 0.499999999999" and ">>0" presumably reduce the
 	# float to an integer for the -lt comparison — confirm.
 	until [ $((($a-0.499999999999)>>0)) -lt 1024 ]; do # hate
 		a=$(($a/1024.0))
 		((u++))
 	done
 	# NOTE(review): subtracting 0.004999 presumably keeps %.2f from rounding
 	# the value up — confirm.
 	printf '%.2f%sb\n' $(($a-0.004999)) ${units[$u]}
 }
 # Final report. map is reused to hold the formatted figures, one per line of
 # output: free space before, free space now, expected savings, then the
 # measured difference (preceded by a "+" line when rawphysd is 1 or more).
 map=($(
 	hus $before
 	hus `dsize`
 	hus $expected
 	[ $rawphysd -lt 1 ] && echo || echo +
 	hus $rawphysd
 ))
 # Classify every counters entry: 0 counts as unique, anything else as a
 # duplicate.
 unique=0
 for i in {1..${#counters}}; do
 	[ ${counters[$i]} -eq 0 ] && {
 		((unique++))
 		true # why
 	} || {
 		((copycount++))
 	}
 done
 [ $copycount -gt 0 ] && echo '[92mDone![0m'
 echo 'Free space:'
 echo "Before: ${map[1]}"
 echo "After : ${map[2]}"
 echo "Expected space saved: ${map[3]}"
 echo "Disk space difference: ${map[4]}${map[5]}"
 echo "Found ${#hashset[@]} files, $unique unique, $copycount duplicates, $failcount failed"
 #read -rsn
 exit 0

diff --git a/dedupe.1 b/dedupe.1
 .TH dedupe 1 "26 Jan 2025" "0.6-26.01.25"
 .SH NAME
 dedupe
 .SH SYNOPSIS
 .B dedupe
 [\fBFILES\fR] ...
 .SH USAGE
 Provide a list of at least two files that contain duplicates of one another to group them into hardlinks.

 For deduping of an entire folder containing duplicate items without specifying every single file, add
 .B **/*.ext
 to the argument list to recursively search for matching files in the current directory.

 NOTE: In the case of the "argument list too long" error, wrap the wildcard filter in quotes
 and the script will use an internal searching mechanism rather than the shell's.

 It's recommended to run this on a singular drive at a time with folders of
 old system backups or small to medium size archival data or old downloads.
 .SH CONFIGURATION
 Configuration is stored in the user's home directory at \fI~/.config/dk15/dedupe\fR

 A list of acceptable values to change:
 .TP
 .BR passes " = " \fIcount\fR
 Set the number of passes per command. \fIMinimum\fR = \fB1\fR, \fImaximum\fR = \fB4\fR.

 For passes 1 and 2, the utility plainly hardlinks the files provided.
 For passes 3 and 4, the files' individual hardlinks found on the current
 storage device are merged into the first copy of said duplicates to ensure
 that there are not two or more separate hardlink groups of the exact same file.
 On Linux, passes 3 and 4 run very slow compared to Windows, which is able to
 keep track of all hard links that are tied to physical files and instantly
 return them upon request.
 .\" you can't spell Windows without W
 .\" you can't spell Linux without L

 If there's not a single file that is processed on a pass, the utility will exit.
 .TP
 .\" actual suffering
 .BR hash " = " \fIsum_prefix\fR

 Set the preferred hashing function. Recommended functions are BLAKE2 (\fBb2\fR),
 BLAKE3 (\fBb3\fR), or SHA256 (\fBsha256\fR). The name of the chosen function must match
 the name of an existing executable that is suffixed with -sum, like the following examples.

 .nr PI 2n
 Only acceptable hash functions as of now (length, speed, security):
 .\" how do i not have a second newline separating these
 .IP *
 \fBck\fR	(CRC32)		-	4 bytes, fast, unknown
 .IP *
 \fBmd5\fR				-	16 bytes, fast, weak
 .IP *
 \fBsha1\fR				-	20 bytes, medium, weak
 .IP *
 \fBsha224\fR			-	28 bytes, medium, okay
 .IP *
 \fBsha256\fR			-	32 bytes, medium, strong
 .IP *
 \fBsha384\fR			-	48 bytes, slow, strong
 .IP *
 \fBsha512\fR			-	64 bytes, slow, strong
 .IP *
 \fBb2\fR	(BLAKE2)		-	any length, fast, strong (64 bytes by default)
 .IP *
 \fBb3\fR	(BLAKE3)		-	any length, very fast, strong (32 bytes by default)

 .TP
 .BR hash_length " = " \fIbyte_count\fR

 BLAKE2 and BLAKE3 only. Set the preferred hash length, adjustable
 in the case of handling an unusual amount of files at once.

 .TP
 .BR force_relink " = " [ \fI0\fR | \fI1\fR ]

 Force the hardlinking of duplicates which have already been hardlinked. Only applies to pass 1.

 .TP
 .BR sanity_check " = " [ \fI0\fR | \fI1\fR ]

 Print every file that is processed for deduplicating.

 .TP
 .BR hide_invalid " = " [ \fI0\fR | \fI1\fR ]

 Don't print errors about preparing files that don't exist or are blank.

 .TP
 .BR batch_hashes " = " [ \fI0\fR | \fI1\fR ]

 Handle hashing all at once.
 .B EXPERIMENTAL, DON'T TRY ON FILES THAT ARE IMPORTANT TO YOU.

 .TP
 .BR simulate_mode " = " [ \fI0\fR | \fI1\fR ]

 Simulation mode, do not actually hardlink.
 Will run through every pass however, to no effect.

 .TP
 .BR hide_errored " = " [ \fI0\fR | \fI1\fR ]

 Hide errors that display during pass 1.

 .TP
 .BR scramble_list " = " [ \fI0\fR | \fI1\fR ]

 Scramble file list after parsing. For stress testing.

 .TP
 .BR process_gitdir " = " [ \fI0\fR | \fI1\fR ]

 Don't ignore .git folders, bad idea unless the repo targeted is not
 being modified and merely exists for the purpose of preservation.

 .TP
 .BR hash_workers " = " num_processes

 Number of hashing processes to run in parallel.
 Recommended value is 8 because of I/O speeds.
 .SH AUTHOR
 donnaken15 <[email protected]>
diff --git a/dedupe.conf.example b/dedupe.conf.example
 # dedupe config
 #
 # Each pass processes the entire file list given to the command.
 # Pass 1: Incrementally scan, index, hash each file, and check for duplicates to hardlink.
 # Pass 2: Plainly hardlink exact copies of input files using the complete list of hashes.
 # Pass 3, 4: Check input duplicate files that already share hardlinks, or just became
 # hardlinks from the current execution, and merge them together with their sources.
 # Separate groups of hardlinks of the same file is an issue which requires a complex explanation.
 # Pass 1 and 2 are basically the same if batch_hashes is enabled.
 passes	=	4

 # Preferred hashing function, recommended: BLAKE2 or BLAKE3
 # The name of the chosen function must match the name of an
 # existing executable that is suffixed with -sum. As such:
 # SHA256: sha256sum, hash = sha256
 # BLAKE3: b3sum, hash = b3
 # CRC32: cksum, hash = ck
 hash	=	b2

 # Preferred BLAKE hash length in bytes, adjustable in the
 # case of handling an unusual amount of files at once.
 hash_length		=	20

 # Force hardlinking of duplicates which are already hardlinked.
 # Only applies to pass 1.
 force_relink	=	0

 # Print every file that is processed.
 sanity_check	=	0

 # Don't print errors about preparing files that don't exist or are blank.
 hide_invalid	=	0

 # Handle the hashing of input files all at once.
 batch_hashes	=	0

 # Simulation mode, do not actually hardlink.
 # Will run through every pass however, to no effect.
 simulate_mode	=	0

 # Hide errors that display during pass 1.
 hide_errored	=	0

 # Scramble file list after parsing. For stress testing.
 scramble_list	=	0

 # Don't ignore .git folders, bad idea unless the repo targeted is not
 # being modified and merely exists for the purpose of preservation.
 process_gitdir	=	0

 # Number of hashing processes to run in parallel.
 # Recommended value is 8 because of I/O speeds.
 hash_workers	=	8
	#!/bin/zsh
	# I'M ACTIVELY FORGETTING HOW MY OWN SCRIPT WORKS!!!!!!!!!!!!!!
	[ $# -lt 2 ] && [[ ! "$1" == [\\?]* ]] && {
	[ $# -eq 1 ] &&
	echo 'you must specify more than one file to be deduped' &&
	echo
	echo 'dedupe [input files]'
	echo '- replace multiple unchanging copies of'
	echo ' the same files with hardlinks to save space'
	echo '- as of now, it is recommended to execute this'
	echo ' only on files that exist on a singular device'
	exit 1
	}
	# WSL SCREWS UP WINDOWS PATHS THAT AREN'T ENTERED WITH QUOTES
	[ ! "${password}" = "alpine" ] && {
	userval() { export "$1"="$2" } # fallback function if below utility doesn't exist
	dotload_namespace=dk15
	dotload=$(command -v dotload \|\| echo /usr/share/dk15/dotload) && source "${dotload}" 'dedupe.conf' 2>/dev/null
	} \|\| { # for PKGBUILD check()
	userval() {
	local test="test_${1}" # :/
	export "$1"="${(P)test:-${2}}"
	}
	}
	bool=(true false)
	# makes me wonder if direct binary string comparison will also work somehow, OCD'ing because of memory usage
	mp=4
	userval passes $mp
	userval hash b2
	userval hash_workers 8
	bools=(force_relink sanity_check hide_invalid batch_hashes simulate_mode scramble_list hide_errored process_gitdir)
	for b in ${bools}; do
	userval $b 0
	[ ${(P)b} -ne 0 ];
	export "$b=${bool[$((1+$?))]}"
	# BRANCH NOT EQUAL AMIRITE AHAHHAHAHAHAHAAAHAHHAHHHAHHHAHAHAH
	done
	$simulate_mode && link=nop \|\| link=ln
	$sanity_check && {
	echo 'Configuration:'
	for c in passes hash hash_workers ${bools}; do
	echo $c = ${(P)c}
	done
	}
	nop() {return 0}
	NUL='/dev/null'
	err='[91;1m'
	rc='[0m'
	[ $passes -ge 1 -a $passes -le 4 ] \|\| {
	echo "Invalid config: passes = ${passes}. Setting to ${mp}." 1>&2
	passes=$mp
	}
	where=('command' '-v')
	$where fsutil.exe >$NUL
	fsutil=${bool[$((1+$?))]}
	[ $passes -ge 3 -a $fsutil = false ] && {
	echo '[91;1mWARNING: Passes 3 and 4 may take a while to find and' 1>&2
	echo 'merge groups of hardlinks using the inputs provided![0m' 1>&2
	}
	$where "${hash}sum" >$NUL \|\| {
	echo "${err}Cannot find ${hash}sum as a hashing program.${rc}" 1>&2
	echo "[90mFaulting to ${hash}.${rc}" 1>&2
	hash=sha256
	}
	$where "${hash}sum" >$NUL \|\| {
	echo "${err}Fallback hashing program ${hash}sum does not exist. Aborting...${rc}" 1>&2
	exit 1
	}
	typeset -A hashsizes=( # raw byte lengths
	[ck]=4
	[md5]=16
	[sha1]=20
	[sha224]=28
	[sha256]=32
	[sha384]=48
	[sha512]=64
	)
	# check local changes since i last pushed this
	# curl -LsSf gist.githubusercontent.com/donnaken15/f95e8a143bb330fcf7d6268a4d6929e8/raw/dedupe \| git diff --no-index - dedupe
	hash_params=()
	[ ! "$hash" = b3 ] && hash_params+=(-b)
	[ "$hash" = b2 -o "$hash" = b3 ] && {
	userval hash_length 10 # bytes, being conservative for big files
	[ "$hash" = b2 ] && {
	hash_params+=(-l $((${hash_length}<<3)))
	} \|\| {
	#hash_threads=1 --num-threads ${hash_threads}
	hash_params+=(-l ${hash_length})
	# hashing a bunch of (usually small) files that use 16 threads
	# each feels iffy, should scale up with file size
	# makes me wonder now if it could use spare threads
	# for processing other files somehow when multiple
	# files are put in the input command
	}
	} \|\| {
	[ -z ${hashsizes[$hash]} ] && {
	echo "${err}Unrecognized hash function: ${hash}${rc}" 1>&2
	echo "[90mFaulting to ${hash}.${rc}" 1>&2
	hash=sha256
	}
	hash_length=${hashsizes[$hash]:-16}
	}
	hashstrlen=$((${hash_length}<<1))
	[ "$hash" = ck ] && {
	hash_params=(-c 'cksum -a crc "$@" \| cut -d" " -f1,3- \| while read -r c f; do; printf "%08x *%s\n" $c "$f"; done') # AUTISM FUEL
	basecmd=(zsh)
	} \|\| {
	basecmd=("${hash}sum")
	}
	basecmd+=(${hash_params} '--')
	hashset=()
	baseset=()
	counters=() # for skipping unique files in subsequent passes that had no duplicates counted up
	filter=() # 0 = if pass>1, don't process; 1 = retry on later passes
	alias skip='{ ((i++)); continue }'
	dsize()
	{
	local test=(`df -xtmpfs -xdevtmpfs --output=avail --total 2>$NUL`) && echo $((${test[-1]} * 1024)) # absurd
	}
	fsize=(stat -Lc%s) # attempts to reduce amount of new processes, but substitution will always require it
	inode=(stat -Lc%i) # location of the data in the raw drive
	mntpt=(df --output=target) # AYFS STAT ALREADY HAS %m
	# replace with stat -Lc"%s${lf}%i${lf}%m" soon
	sfxlp=('/proc' '/dev' '/tmp' '/var/tmp' '/sys' '/boot')
	sfind=()
	for f in "${sfxlp[@]}"; do
	sfind+=(! -path "\"${f}/*\"")
	done
	unset sfxlp
	#profile=(date '+%s%N')
	#profileend() { echo "profile script" $(($(($($profile) - $1)) / 1000000000.0)) }
	dls=() # drive letters
	mps=() # mount points
	(mount \| sed -n 's/^\(\w:\)\\\?\son\s\(\/cygdrive\/\w\\|\/mnt\/\w\\|\/\w\).*/\1 \2/p') \|
	while read -r lt; do
	l=(${(@s: :)lt}) # absurd
	dls+=(${l[1]:l}) # C:
	mps+=(${l[2]:l}) # /cygdrive/c/
	done
	wsl_path() # i hate linux
	{
	local l1="${2:l}"
	local test
	[[ ! "$l1" = /* ]] && {
	[ "$1" = '-u' -a "$l1" = 'NUL' ] && {
	echo "$NUL"
	return
	}
	[ ! "$1" = '-u' -a "$l1" = "$NUL" ] && {
	echo 'NUL'
	return
	}
	wslpath $1 "$2"
	return
	} \|\| {
	[[ "$l1" = /cygdrive/* ]] &&
	test=${mps[1][1,-2]}${1:10} \|\|
	{
	esc="${2//\\//}"
	for i in {1..${#dls}}; do
	[[ "$l1" = /mnt${mps[$i][-2,-1]}* ]] && { # wsl default
	test="${mps[$i][1,-2]}${esc:5}"
	break
	}
	[[ "$l1" = ${mps[$i][-2,-1]}/* ]] && { # msys
	test="${mps[$i]}/${esc:3}"
	break
	}
	[[ "$l1" = ${dls[$i]}* ]] && {
	test="${mps[$i]}${esc:2}"
	break
	}
	[[ "$l1" = ${mps[$i]}* ]] && {
	test="${mps[$i]}${esc:${#mps[$i]}}"
	break
	}
	done
	test="${test:-$2}"
	}
	[ "$1" = '-u' ] && {
	echo "$test"
	return
	}
	}
	wslpath $1 "$test" \|\| {
	echo "[31mMISHANDLED PATH: $test${rc}" 1>&2
	echo "$2"
	}
	}
	fail() { echo -n "$2" }
	wpath=`$where cygpath` \|\| {
	$where wslpath >$NUL && wpath=wsl_path \|\| wpath=wpathfail
	}
	# TODO: support b3sum windows, doesn't use unix paths
	# unit letters indexed by hus() when scaling byte counts
	units="kmgt"
	#total=0
	# running state and statistics used by the pass loop and the final summary
	lasttest=0
	copycount=0
	failcount=0
	expected=0
	rawphysd=0
	# From here on word splitting happens on newlines only, so command output
	# is split per line rather than per space.
	lf=$'\n'
	IFS=$lf
	echo "[97mdedupe - Building file list...${rc}"
	# should probably also generally avoid deduping files less than at least 8 bytes for whatever reason, in
	# the case of like blank text files which would have one newline (LF or CR LF) or something
	# this is why you should be selective with what files you want to dedupe, like certain extensions
	# prep_check PATH
	# Decide whether PATH is usable as a dedupe candidate.
	# Returns 0 when usable, otherwise the reason:
	#   1 - missing, or a directory
	#   2 - zero length
	#   3 - not writable
	prep_check() # i don't want to have to rely on this as its own function
	{
		local candidate="$1"
		if [[ ! -e "$candidate" || -d "$candidate" ]]; then
			return 1
		fi
		if [[ ! -s "$candidate" ]]; then
			return 2
		fi
		if [[ ! -w "$candidate" ]]; then
			return 3
		fi
		return 0
	}
	# prognums CURRENT TOTAL [OFFSET]
	# Print a "(current/total)" progress marker followed by a carriage return
	# (no newline), so the next marker overwrites it. OFFSET, default 0, is
	# subtracted from both numbers.
	prognums()
	{
		local skipped=${3:-0}
		local shown=$(($1-skipped))
		local outof=$(($2-skipped))
		printf '(%s/%s)\r' "$shown" "$outof"
	}
	# Build the list of candidate files (infiles) from the command-line arguments.
	# l = working copy of the arguments (it grows when a glob argument is expanded),
	# i = index of the entry being examined, k = number of glob arguments seen
	# (passed to prognums as the offset for the progress display).
	# NOTE(review): `skip` is used below but is not defined in this part of the
	# file - presumably an alias/function that advances to the next entry; confirm.
	l=("$@")
	k=0
	i=1
	set --
	# messages for prep_check's return codes 1..3
	preperrs=('not a file' 'blank' 'not writeable')
	while [ $i -le ${#l} ]; do
	f="${l[$i]}"
	prognums $i ${#l} $k
	# NOTE(review): as it appears here this pattern has no wildcards, so it only
	# matches the exact string "/.git/" - check against the original file whether
	# surrounding asterisks were lost.
	[[ "$f" == /.git/ ]] && { $process_gitdir \|\| skip }
	[[ "$f" == [\\?]* ]] && { # evade "argument list too long" with this, have to wrap the glob in quotes though
	echo "[93mGot glob pattern: $f${rc}"
	((k++)); prognums $i ${#l} $k # FIX: upper limit goes over final length of infiles array
	#l+=(${~f})
	# need to split base path and pattern
	# for now: (cd /.../.../ && dedupe '*/.*')
	# choose between an unrestricted find and one limited to the current directory level
	[[ "$f" == \\** ]] && recurse=() \|\| recurse=(-maxdepth 1) # >:(
	# every non-directory match is appended to l so the outer loop visits it later
	find ${recurse} -wholename "$f" -print 2>$NUL \| while read -r ff; do # just to not look frozen for >1 minute
	[ -d "$ff" ] && continue
	[[ "$ff" == /.git/ ]] && { $process_gitdir \|\| continue }
	l+=("$ff"); prognums $i ${#l} $k
	done
	skip
	}
	# A path counts as absolute when it starts with "/" or with a known drive letter.
	# NOTE(review): abs is never reset to false, so once one absolute path has been
	# seen every later entry is also resolved through $wpath/realpath - confirm intended.
	[[ "$f" == /* ]] && abs=true \|\| {
	for j in {1..${#dls}}; do # check for windows path
	[[ "${f:l}" = ${dls[$j]}* ]] && {
	abs=true
	break
	}
	done
	}
	# speed up when paths are relative, mount point stuff should still get handled just fine
	# but i know stupid programming will try to subvert my expectations by not working at all, as usual
	${abs:-false} && ff="`realpath -s "$($wpath -u "$f")"`"
	((i++))
	# reject unusable entries; the message is suppressed when hide_invalid is true
	prep_check "${ff:-$f}" \|\| {
	ERR=$?; $hide_invalid \|\| echo "${err}$f is ${preperrs[$ERR]}.${rc}" 1>&2
	continue
	}
	infiles+=("${ff:-$f}")
	done
	unset l k f
	# At least two usable files are required; the message is picked by how many
	# were found (0 -> first entry, 1 -> second entry).
	argerrs=('There are no' 'Not entered enough')
	[ ${#infiles} -le 1 ] && {
	echo "${err}${argerrs[$((1+${#infiles}))]} files to process.[0m" 1>&2
	exit 1
	}
	# Optional shuffle of infiles (scramble_list): shuf emits a random permutation
	# of 1..N and the i-th file is stored at the i-th permuted position.
	$scramble_list && {
	# for stress testing this script, because of zsh's handling of arrays and items.....
	# and because this script is stupid and a total broken POS
	local test=()
	echo "\n[35mScrambling list...${rc}"
	i=1
	shuf -i "1-${#infiles}" \| while read -r s; do
	test[$s]="${infiles[$i]}"
	echo -n "($i/${#infiles})"$'\r'
	((i++))
	done
	# the shuffled list replaces infiles only if both exit-status checks pass
	# NOTE(review): zsh names this array `pipestatus` (lowercase, as used after the
	# batch hashing pipeline); `PIPESTATUS` is the bash spelling, and the second
	# check runs after another command has already executed - confirm this guard
	# tests what it is meant to.
	{
	(exit ${PIPESTATUS[1]}) &&
	(exit ${PIPESTATUS[2]}) &&
	infiles=(${test})
	}
	}
	#perf=$(date +%s.%N)
	# Batch mode: hash every file up front with parallel workers and fill the
	# per-file tables (hashset, filter, counters) by file index.
	# basecmd and hashstrlen are defined outside this part of the file.
	$batch_hashes && {
	echo '\n[96mHashing files...[0m'
	i=1
	# offset of the file name within one output line: hash length plus 2
	# (presumably the two separator characters printed by the *sum tools - confirm)
	why=$(($hashstrlen+2))
	# NUL as IFS so the quoted array below is joined with NUL bytes for `xargs -0`
	IFS=$'\0'
	# files per worker invocation (float); when it is below 1 it is replaced by
	# the rounded reciprocal
	fpw=$(((${#infiles}*1.0)/$hash_workers))
	[ $(($fpw >> 0)) -lt 1 ] && fpw=$((((1/$fpw)+0.499999999999)>>0))
	echo -n "${infiles}" \| xargs -n $(($fpw>>0)) -P $hash_workers -0 ${basecmd} \| while read -r test; do
	# VARNAME[index]=() think
	# [[ "${test:l}" == "argument list too long" ]] && { echo '--------- why'; break }
	# [[ "$test" == "${basecmd[1]}: "* ]] && { echo "${err}$test$rc" 1>&2; continue }
	# [[ "$test" == ": "* ]] && continue # STUPID!!!!!!
	# look up the index of the file named on this output line, then store its hash
	# and reset its filter/counter slots
	j="${infiles[(Ie)${test:$why}]}"
	hashset[$j]="${test[0,$hashstrlen]}"
	filter[$j]=0
	counters[$j]=0
	prognums $i ${#infiles}
	((i++))
	done
	die=(${pipestatus})
	# i ends one past the number of lines read, so this is the number of files
	# for which no output line was seen
	missed=$((${#infiles}-$i+1))
	[ $missed -gt 0 ] && echo "[91mMissed files: $missed${rc}"
	# [ \( ${die[2]} -le 127 -o ${die[2]} -ge 123 \) -o ${#hashset} -ne ${#infiles} ] && {
	# # hoping this is safe when assigning indexes instead
	# # capture error log from above to print here
	# batch_hashes=false
	# IFS=,
	# #echo "\n${err}Batched hash list failed to match the number of files scanned: ${#hashset} = ${#infiles}${rc}" 1>&2
	# echo "\n${err}${hash}sum sucks, or your files do. ($die) (${#hashset} = ${#infiles}) ${rc}"
	# hashset=()
	# baseset=()
	# filter=()
	# counters=()
	# } \|\| baseset=(${infiles}) # should use ${(P)name} instead
	# in batch mode baseset mirrors infiles, so hashset and baseset share indexes
	baseset=(${infiles}) # should use ${(P)name} instead
	unset i
	# restore newline-only splitting
	IFS=$lf
	}
	#echo "------------- autistic: $(($(date +%s.%N)-$perf))"
	# errgate
	# Decide whether the pair described by the global `map` array may be linked.
	# Any arguments are ignored; only `map`, `force_relink` and `pass` are read.
	# map slots used: [1] this file, [12] matching file, [10]/[11] mount points,
	# [6]/[7] inodes, [4]/[5] sizes.
	# Returns:
	#   0  - nothing prevents linking
	#   11 - both names are the same path
	#   12 - mount points differ
	#   13 - inodes already equal and either force_relink is off or pass > 1
	#   14 - sizes differ
	errgate()
	{
		local relink_rc
		if [ "${map[12]}" = "${map[1]}" ]; then
			return 11
		fi
		# try to utilize KSH [ extension instead: -ef
		# also applies to plain symlink :/
		if [ "${map[10]}" != "${map[11]}" ]; then
			return 12
		fi
		# force_relink holds the command name true or false; keep its exit status
		$force_relink
		relink_rc=$?
		[ \
			${map[6]} = ${map[7]} -a \( \
				$relink_rc -eq 1 -o \( $relink_rc -eq 0 -a $pass -gt 1 \) \
			\) \
		] && return 13
		# sizes agree (or the comparison could not be evaluated): accept
		[ ! ${map[4]} = ${map[5]} ] || return 0
		return 14
	}
	# printf templates for the optional sanity_check trace: one line per examined
	# file (sanity_fmt) and one line per linked pair (sanity_link_fmt).
	sanity_fmt="[36m%6d[37m ([%d3m%2d[37m) [[35m%7s[37m] ([90m$err%s[37;22m): [38;2;%d;%d;%dm%s$rc\n"
	sanity_link_fmt='%6d[22m: %s [90m([37m%9s[90m)[0m'
	sanity_link_fmt="[36;1m$sanity_link_fmt = [33;1m$sanity_link_fmt\n"
	test=''
	# value reported as "Before" in the final summary (dsize is defined elsewhere)
	dsize \| read -r before
	echo "[97m(${#infiles} files)${rc}"
	# Main loop: up to $passes passes over infiles. `batched` counts links made
	# in the current pass; a pass that makes none ends the loop early.
	# NOTE(review): `skip` is used throughout but is not defined in this part of
	# the file - presumably it advances i and continues; confirm.
	for pass in {1..$passes}; do
	batched=0
	i=1
	$sanity_check && echo "[33mNew pass: $pass${rc}"
	for f in "${infiles[@]}"; do
	hash=${hashset[$i]}
	[ ${counters[$i]:-0} -gt 0 ] && hilite=9 \|\| hilite=3
	$sanity_check &&
	$batch_hashes && {
	printf "$sanity_fmt" \
	$i $hilite ${counters[$i]:--} \
	"${hash:0:7}" "${filter[$i][2]:-[92m-}" \
	"0x${hash:0:2}" "0x${hash:2:2}" "0x${hash:4:2}" $f 2>$NUL
	}
	# after pass 1, only files with a clean filter entry and a positive counter are revisited
	[ \
	$pass -gt 1 -a \( \
	${filter[$i]:-0} -ne 0 -o ${counters[$i]:-0} -le 0 \
	\) \
	] && skip
	map=("$f" "" "${f:t}")
	# reuse the stored hash when there is one for this mode/pass, otherwise hash the file now
	[ \
	\( $batch_hashes = true -a ! -z "${hash}" \) -o \
	\( $batch_hashes = false -a $pass -gt 1 \) \
	] && map[2]="${hash}" \|\| map[2]="`${basecmd} "$f"`"
	# imagining someone or something will rename it midway through hash batching
	# NOW IT'S CAUSING PROBLEMS!!!!!
	# prep_check "$f" \|\| {
	# ERR=$?
	# [ $pass -eq 1 ] && {
	# filter+=(${ERR})
	# echo "${err}[${ERR}] Invalid file $f.${rc}" 1>&2
	# ((failcount++))
	# }
	# skip
	# }
	# map, for substitution speed and earlier function:
	# [1] = target, this file
	# [2] = hash
	# [3] = basename
	# target and source metadata
	# [4,5] = file sizes
	# [6,7] = inodes
	# [8] = free space
	# [10,11] = mount points
	# [12] = source, matching
	# can't even memorize my own system of metadata handling
	local tmp=${test:-0}
	test=${map[2][0,$hashstrlen]}
	# index in hashset of an entry carrying the same hash (0 when there is none)
	check=${hashset[(Ie)$test]} # GETS LAST OCCURRENCE ACTUALLY.....
	# $sanity_check && echo "[$test] $i = $check"
	$sanity_check && ! $batch_hashes && {
	printf "$sanity_fmt" \
	$i $hilite ${counters[$i]:--} \
	"${map[2]:0:7}" "${filter[$i][2]:-[92m-}" \
	"0x${map[2]:0:2}" "0x${map[2]:2:2}" "0x${map[2]:4:2}" "$f" 2>$NUL
	}
	[ $batch_hashes = true -o $pass -gt 1 ] && {
	[ $check -eq 0 ] && { # ??????????
	echo "${err}------------- why ($f)${rc}" 1>&2
	((failcount++))
	skip
	}
	# the only entry with this hash is the file itself: nothing to link
	[ $check -eq $i ] && skip
	}
	# incremental mode: a hash not seen before is recorded as a new base entry
	$batch_hashes \|\| {
	[ $check -eq 0 ] && {
	hashset+=($test)
	baseset+=("$f")
	counters[$i]=0
	filter+=(0)
	skip
	}
	}
	base="${baseset[$check]}"
	# optimize this and change indexes, not even using dsize here
	# append sizes, inodes, dsize output and mount points of (f, base), then base
	# itself, filling the slots listed in the table above
	map+=($(
	$fsize "$f" "$base"
	$inode "$f" "$base"
	dsize
	$mntpt "$f" "$base"
	)
	"$base")
	# $sanity_check && for x in {1..${#map}}; do
	# echo "map[$x] = ${map[$x]}"
	# done
	# why am i even doing (any (all) of) this?????
	# match lives on another mount point: walk backwards through hashset for an
	# entry with the same hash on this file's mount point and use it as the base
	[ ! "${map[10]}" = "${map[11]}" ] && {
	# ignore matches from other devices and keep searching
	# this needs more extensive testing for safety
	# [ $pass -eq 1 -a $batch_hashes = false ] && continue
	# {
	# $sanity_check && echo 'horrible area'
	while true; do
	((check--))
	[ $check -lt 1 ] && break
	[ $check -eq $i ] && continue
	[ ! $test = ${hashset[$check]} ] && continue
	# $sanity_check && echo "Matched hash $test ${hashset[$check]}"
	ugh="${baseset[$check]}"
	why=($(
	$fsize "$ugh"
	$inode "$ugh"
	$mntpt "$ugh"
	))
	# $sanity_check && echo "$i, $check, ${#hashset} $test ${hashset[$check]}, ${map[10]} = ${why[4]}"
	[ ! "${map[10]}" = "${why[4]}" ] && continue
	# $sanity_check && echo Matched mount point "${map[10]}" = "${why[4]}"
	#
	# # already handled in errgate
	# [ ! "${map[6]}" = "${why[2]}" ] && continue
	# $sanity_check && echo Matched inode "${map[6]}" = "${why[2]}"
	# [ ! "${map[4]}" = "${why[1]}" ] && continue
	# $sanity_check && echo Matched size "${map[4]}" = "${why[1]}"
	map[12]="$ugh"
	map[11]="${why[4]}"
	map[7]="${why[2]}"
	map[5]="${why[1]}"
	base="$ugh" # god sake
	# not tested yet
	break
	done
	# }
	}
	# errgate reads the global map; its return code is stored in filter during pass 1
	errgate ${map[@]}
	ERR=$?
	[ $pass -eq 1 ] && {
	$batch_hashes && filter[$i]=${ERR} \|\| filter+=(${ERR})
	}
	[ $ERR -ne 0 ] && {
	[ $pass -eq 1 -a $hide_errored = false ] && {
	# NOTE(review): in case 12 the second line appends "${errstr}" to errstr, so
	# the "Mount points do not match for" prefix is printed twice - confirm.
	case ${ERR} in
	11) errstr="${map[10]}/.../${map[3]} cannot be linked to itself.";;
	# handle case insensitivity on windows, linux just doesn't have it for some reason
	12) errstr="Mount points do not match for "
	errstr+="${errstr} ${map[10]}/.../${map[3]} and ${map[11]}/.../$(basename "$base")";;
	13) errstr="${map[10]}/.../${map[3]} is already hardlinked.";;
	14) errstr="[${hashset[$check]:0:15}, ${map[4]}] ${map[3]} and [${test:0:15}, ${map[5]}] "
	errstr+="$(basename "$base") have matching hashes but different size!!!!";;
	*) errstr="Uncaught error $ERR";;
	esac
	# there has to be a way to make this into an array thing instead
	echo "${err}${errstr}${rc}" 1>&2
	}
	((failcount++))
	skip
	}
	$sanity_check && printf "$sanity_link_fmt" $i "${test[0,16]}" "${map[4]}" $check "${hashset[$check][0,16]}" "${map[5]}"
	s=${map[4]}
	# prof=`$profile`
	{
	# TODO: handle permission denied error just
	# so pass number text isn't printed prematurely
	lasttest=$tmp
	# Link direction: `ln -f source target` replaces target. When f is newer than
	# base, f is replaced; otherwise base is replaced.
	[ "$f" -nt "$base" ] && { # absurd but muh archives/history reasons
	target="$f"
	source="$base"
	swap=1
	} \|\| {
	target="$base"
	source="$f"
	swap=0
	}
	# headers are printed once per pass, just before its first link
	[ $batched -eq 0 ] && {
	[ $pass -eq 1 ] && echo 'Deduping...'
	[ $passes -gt 1 ] && echo "[92;1mPass ${pass}${rc}"
	}
	case "$pass" in
	# passes 1 and 2: link this one pair ($link is ln, or nop in simulate mode)
	1) ;&
	2) dsize \| read -r uhh
	"$link" -f "$source" "$target" && {
	rawphysd=$(($rawphysd + `dsize` - ${uhh}))
	echo "[95;1m[${test:0:15}]${rc}" \
	"[97m${base:t}${rc}" \
	"[93;1m←${rc} [96;1m$f${rc}"
	[ $pass -eq 1 ] && {
	expected=$(($expected + $s))
	counters[$i]=$((${counters[$i]}+1))
	}
	((batched++))
	# ((total++))
	}
	;;
	# passes 3 and 4: relink every existing hardlink of the target to the source,
	# listing them via fsutil.exe when available and via find -inum otherwise
	3) ;&
	4) ffs=0
	# realizing now this must be optimized to not have to hardlink every
	# single already hardlinked file with only incrementally hardlinking
	# another file each time and instead do it all in one iteration only
	$fsutil && {
	test2="`$wpath -m "$target"`"
	point="${map[$((10+$swap))]}"
	fsutil.exe hardlink list "$test2" >$NUL && {
	ffs=0
	lasttest=$test
	fsutil.exe hardlink list "$test2" \| sed 's/\r//g; s/\\/\//g' \| while read -r hl; do
	map=($(
	$wpath -u "${point}${hl}" &&
	dsize
	)) && {
	"$link" -f "${source}" "${map[1]}" &&
	ffs=$(($ffs+1)) && batched=$(($batched+1)) && # * # WHY NO ++
	rawphysd=$(($rawphysd + `dsize` - ${map[2]})) && {
	[ $ffs -eq 1 ] && \
	echo "[95;1m[${test:0:15}]" \
	"[97m${source:t}" \
	'[90m<group merge>'
	echo "[33;1m↑${rc} [96;1m${map[1]}${rc}"
	} \|\| false
	} \|\| echo "${err}Failed to hardlink ${hl}.${rc}" 1>&2
	done
	counters[$i]=$((${counters[$i]}+1))
	[ $ffs -gt 0 ]
	} \|\| echo "${err}Cannot find hardlinks for $target.${rc}" 1>&2
	} \|\| {
	# map is reduced to (mount point, inode), both taken from the slots selected by $swap
	map=(
	#"${map[$((10+$swap))]}"
	"${map[$((11-$swap))]}"
	#"${map[$((6+$swap))]}"
	"${map[$((7-$swap))]}"
	) && {
	# NOT A GOOD IDEA!!!!
	find "${map[1]}" -xdev ${sfind[@]} -inum ${map[2]} 2>$NUL \| while read -r hl; do
	uhh=`dsize`
	"$link" -f "$source" "$hl" 2>$NUL &&
	ffs=$(($ffs+1)) && batched=$(($batched+1)) && # * MAKE FUNCTION
	rawphysd=$(($rawphysd + `dsize` - ${uhh})) && {
	[ $ffs -eq 1 ] && \
	echo "[95;1m[${test:0:15}]" \
	"[97m${source}" \
	'[90m<group merge>'
	echo "[33;1m↑${rc} [96;1m${hl}${rc}"
	} \|\| echo "${err}Failed to hardlink ${hl}.${rc}" 1>&2
	done
	counters[$i]=$((${counters[$i]}+1))
	[ $ffs -gt 0 ]
	} \|\| echo "${err}Cannot find hardlinks for $target.${rc}" 1>&2
	true
	}
	;;
	esac
	# the matched entry gets the same counter value as this file
	counters[$check]=${counters[$i]}
	}
	((i++))
	# profileend $prof
	done
	# nothing was linked in this pass: further passes are skipped
	[ $batched -eq 0 ] && break
	done
	# hus BYTES
	# Print BYTES in human-readable form.
	# Values below 1024 are printed as "<n>b" and the function returns 1.
	# Larger values are divided by 1024.0 until the (truncated) value is below
	# 1024, then printed with two decimals and the matching letter from $units.
	# Writes the globals a (scaled value) and u (number of divisions).
	hus()
	{
		if [ $1 -lt 1024 ]; then
			printf '%.0fb\n' $1
			return 1
		fi
		u=0
		a=$1
		while ! [ $((($a-0.499999999999)>>0)) -lt 1024 ]; do # hate
			u=$(($u+1))
			a=$(($a/1024.0))
		done
		# the small subtraction biases the %.2f rounding downwards
		printf '%.2f%sb\n' $(($a-0.004999)) ${units[$u]}
	}
	# Final report. map is reused to hold the formatted figures, one per output
	# line of the subshell: [1] value before, [2] value now, [3] expected saving,
	# then a "+" marker (only when rawphysd is at least 1) followed by the
	# formatted rawphysd.
	# NOTE(review): when rawphysd is below 1 the bare `echo` prints an empty line;
	# the later "${map[4]}${map[5]}" relies on that empty field being dropped by
	# word splitting - confirm.
	map=($(
	hus $before
	hus `dsize`
	hus $expected
	[ $rawphysd -lt 1 ] && echo \|\| echo +
	hus $rawphysd
	))
	# counters[n] == 0 means file n was never linked (unique); any other value
	# counts it as a duplicate
	unique=0
	for i in {1..${#counters}}; do
	[ ${counters[$i]} -eq 0 ] && {
	((unique++))
	true # why
	} \|\| {
	((copycount++))
	}
	done
	[ $copycount -gt 0 ] && echo '[92mDone![0m'
	echo 'Free space:'
	echo "Before: ${map[1]}"
	echo "After : ${map[2]}"
	echo "Expected space saved: ${map[3]}"
	echo "Disk space difference: ${map[4]}${map[5]}"
	echo "Found ${#hashset[@]} files, $unique unique, $copycount duplicates, $failcount failed"
	#read -rsn
	exit 0
	.TH dedupe 1 "26 Jan 2025" "0.6-26.01.25"
	.SH NAME
	dedupe
	.SH SYNOPSIS
	.B dedupe
	[\fBFILES\fR] ...
	.SH USAGE
	Provide a list of at least two files that contain duplicates of one another to group them into hardlinks.

	For deduping of an entire folder containing duplicate items without specifying every single file, add
	.B */.ext
	to the argument list to recursively search for matching files in the current directory.

	NOTE: In the case of the "argument list too long" error, wrap the wildcard filter in quotes
	and the script will use an internal searching mechanism rather than the shell's.

	It's recommended to run this on a singular drive at a time with folders of
	old system backups or small to medium size archival data or old downloads.
	.SH CONFIGURATION
	Configuration is stored in the user's home directory at \fI~/.config/dk15/dedupe\fR

	A list of acceptable values to change:
	.TP
	.BR passes " = " \fIcount\fR
	Set the number of passes per command. \fIMinimum\fR = \fB1\fR, \fImaximum\fR = \fB4\fR.

	For passes 1 and 2, the utility plainly hardlinks the files provided.
	For passes 3 and 4, the files' individual hardlinks found on the current
	storage device are merged into the first copy of said duplicates to ensure
	that there are not two or more separate hardlink groups of the exact same file.
	On Linux, passes 3 and 4 run very slow compared to Windows, which is able to
	keep track of all hard links that are tied to physical files and instantly
	return them upon request.
	.\" you can't spell Windows without W
	.\" you can't spell Linux without L

	If there's not a single file that is processed on a pass, the utility will exit.
	.TP
	.\" actual suffering
	.BR hash " = " \fIsum_prefix\fR

	Set the preferred hashing function. Recommended functions are BLAKE2 (\fBb2\fR),
	BLAKE3 (\fBb3\fR), or SHA256 (\fBsha256\fR). The name of the chosen function must match
	the name of an existing executable that is suffixed with -sum, like the following examples.

	.nr PI 2n
	Only acceptable hash functions as of now (length, speed, security):
	.\" how do i not have a second newline separating these
	.IP *
	\fBck\fR (CRC32) - 4 bytes, fast, unknown
	.IP *
	\fBmd5\fR - 16 bytes, fast, weak
	.IP *
	\fBsha1\fR - 20 bytes, medium, weak
	.IP *
	\fBsha224\fR - 28 bytes, medium, okay
	.IP *
	\fBsha256\fR - 32 bytes, medium, strong
	.IP *
	\fBsha384\fR - 48 bytes, slow, strong
	.IP *
	\fBsha512\fR - 64 bytes, slow, strong
	.IP *
	\fBb2\fR (BLAKE2) - any length, fast, strong (64 bytes by default)
	.IP *
	\fBb3\fR (BLAKE3) - any length, very fast, strong (32 bytes by default)

	.TP
	.BR hash_length " = " \fIbyte_count\fR

	BLAKE2 and BLAKE3 only. Set the preferred hash length, adjustable
	in the case of handling an unusual amount of files at once.

	.TP
	.BR force_relink " = " [ \fI0\fR \| \fI1\fR ]

	Force the hardlinking of duplicates which have already been hardlinked. Only applies to pass 1.

	.TP
	.BR sanity_check " = " [ \fI0\fR \| \fI1\fR ]

	Print every file that is processed for deduplicating.

	.TP
	.BR hide_invalid " = " [ \fI0\fR \| \fI1\fR ]

	Don't print errors about preparing files that don't exist or are blank.

	.TP
	.BR batch_hashes " = " [ \fI0\fR \| \fI1\fR ]

	Handle hashing all at once.
	.B EXPERIMENTAL, DON'T TRY ON FILES THAT ARE IMPORTANT TO YOU.

	.TP
	.BR simulate_mode " = " [ \fI0\fR \| \fI1\fR ]

	Simulation mode, do not actually hardlink.
	Will run through every pass however, to no effect.

	.TP
	.BR hide_errored " = " [ \fI0\fR \| \fI1\fR ]

	Hide errors that display during pass 1.

	.TP
	.BR scramble_list " = " [ \fI0\fR \| \fI1\fR ]

	Scramble file list after parsing. For stress testing.

	.TP
	.BR process_gitdir " = " [ \fI0\fR \| \fI1\fR ]

	Don't ignore .git folders, bad idea unless the repo targeted is not
	being modified and merely exists for the purpose of preservation.

	.TP
	.BR hash_workers " = " num_processes

	Number of hashing processes to run in parallel.
	Recommended value is 8 because of I/O speeds.
	.SH AUTHOR
	donnaken15 <[email protected]>
	# dedupe config
	#
	# Each pass processes the entire file list given to the command.
	# Pass 1: Incrementally scan, index, hash each file, and check for duplicates to hardlink.
	# Pass 2: Plainly hardlink exact copies of input files using the complete list of hashes.
	# Pass 3, 4: Check input duplicate files that already share hardlinks, or just became
	# hardlinks from the current execution, and merge them together with their sources.
	# Separate groups of hardlinks of the same file are an issue that requires a complex explanation.
	# Pass 1 and 2 are basically the same if batch_hashes is enabled.
	passes = 4

	# Preferred hashing function, recommended: BLAKE2 or BLAKE3
	# The name of the chosen function must match the name of an
	# existing executable that is suffixed with -sum. As such:
	# SHA256: sha256sum, hash = sha256
	# BLAKE3: b3sum, hash = b3
	# CRC32: cksum, hash = ck
	hash = b2

	# Preferred BLAKE hash length in bytes, adjustable in the
	# case of handling an unusual amount of files at once.
	hash_length = 20

	# Force hardlinking of duplicates which are already hardlinked.
	# Only applies to pass 1.
	force_relink = 0

	# Print every file that is processed.
	sanity_check = 0

	# Don't print errors about preparing files that don't exist or are blank.
	hide_invalid = 0

	# Handle the hashing of input files all at once.
	batch_hashes = 0

	# Simulation mode, do not actually hardlink.
	# Will run through every pass however, to no effect.
	simulate_mode = 0

	# Hide errors that display during pass 1.
	hide_errored = 0

	# Scramble file list after parsing. For stress testing.
	scramble_list = 0

	# Don't ignore .git folders, bad idea unless the repo targeted is not
	# being modified and merely exists for the purpose of preservation.
	process_gitdir = 0

	# Number of hashing processes to run in parallel.
	# Recommended value is 8 because of I/O speeds.
	hash_workers = 8