Skip to content

Instantly share code, notes, and snippets.

@temp3l
Last active July 31, 2020 19:17
Show Gist options
  • Save temp3l/b75e5ec564599799816913a7354ce5ad to your computer and use it in GitHub Desktop.
Save temp3l/b75e5ec564599799816913a7354ce5ad to your computer and use it in GitHub Desktop.
Split a file into line-range chunks with sed and process the chunks in parallel background jobs
#!/bin/bash
#
# Split FILE into line-range chunks and process the chunks in parallel
# background jobs. The input file is removed once the chunks have been
# processed.
#
# Usage: <script> FILE

# Number of chunks; each chunk is handled by its own background job.
worker=8
args=("$@")
# Path of the input file (first positional argument).
file=${args[0]}
# Feed the file to wc on stdin so it prints only the count (no file name to
# strip). Quoting keeps paths containing spaces or glob characters intact, and
# a missing argument now fails immediately instead of blocking on stdin.
lines=$(wc -l < "$file")
# Integer division. An unreadable input leaves the count empty; treat as 0.
linesPerWorker=$(( ${lines:-0} / worker ))
# 1-based line number at which the first chunk begins.
start=1
# Directory in which the temporary chunk files are created.
mkdir -p tmp
#######################################
# Block until fewer than the given number of background jobs of this shell
# are still running.
# Arguments:
#   $1 - job limit; values below 1 (or a missing value) are treated as 1,
#        i.e. "wait until no job is running".
# Returns:
#   0 once the number of running jobs has dropped below the limit.
#######################################
waitforjobs() {
  local limit=${1:-1}
  # A limit below 1 can never be satisfied (the job count is never negative);
  # without this clamp the loop would spin forever once all jobs are gone.
  (( limit >= 1 )) || limit=1
  # -r restricts the listing to running jobs, so jobs that have already
  # finished are never counted.
  while (( $(jobs -pr | wc -w) >= limit )); do
    # Sleep until any single job terminates, then re-count.
    wait -n
  done
  return 0
}
#######################################
# Copy one line range of a file into a temporary chunk file under ./tmp,
# run ./bulk2raw.lua on that chunk, then delete the chunk.
# Arguments:
#   $1 - worker index (accepted for the caller's convenience; not used here)
#   $2 - path of the file to read from
#   $3 - first line of the range (1-based)
#   $4 - last line of the range (inclusive)
# Outputs:
#   One progress line on stdout.
# Returns:
#   0 on success; otherwise the exit status of the step that failed
#   (mktemp, sed or ./bulk2raw.lua).
#######################################
splitfile() {
  local file=$2 start=$3 end=$4
  local tmpfile status=0

  # Declared separately above so a mktemp failure is not masked by 'local'.
  tmpfile=$(mktemp -p tmp) || return

  echo "--- work: ${tmpfile} $start:$end" #echo "${i} $file ${start} ${end} ${file} "

  # Print only lines start..end; 'q' at the last wanted line stops sed from
  # reading the remainder of the file.
  if sed -n "${start},${end} p;${end}q" -- "$file" > "$tmpfile"; then
    ./bulk2raw.lua "$tmpfile" "$file" || status=$?
  else
    status=$?
  fi

  # Always remove the chunk, but report the processing status rather than
  # the status of rm.
  rm -f -- "$tmpfile"
  return "$status"
}
# Refuse to continue without a real input file: every chunk job would fail
# and the cleanup step at the end would have nothing sensible to delete.
if [[ ! -f "$file" ]]; then
  printf 'error: input file not found: %s\n' "$file" >&2
  exit 1
fi

echo ""
echo "--- BSPLIT ${file} into ${worker} chunks"

# Start one background job per chunk and remember its PID so that each exit
# status can be collected afterwards.
pids=()
for (( i = 1; i <= worker; i++ )); do
  # Each chunk spans linesPerWorker + 1 lines (start..end inclusive). The
  # extra line per chunk absorbs the remainder of the integer division, so
  # the chunks together always reach the end of the file.
  end=$(( start + linesPerWorker ))
  splitfile "$i" "$file" "$start" "$end" &
  pids+=("$!")
  start=$(( start + linesPerWorker + 1 )) # first line of the next chunk
done

# Wait for every job individually; a single failure is enough to keep the
# input file.
failed=0
for pid in "${pids[@]}"; do
  wait "$pid" || failed=1
done

if (( failed )); then
  printf -- '--- SOME CHUNKS FAILED for: %s ...keeping input\n' "$file" >&2
  exit 1
fi

echo "--- ALL CHUNKS ${worker} PROCESSED for: ${file} ...deleting"
echo ""
rm -f -- "$file"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment