Created
December 8, 2021 03:40
-
-
Save davetang/674dbe5886d0b0859dda65db4760948e to your computer and use it in GitHub Desktop.
Run grep in parallel
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Strict mode: abort on an unhandled command failure (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# Print the command-line synopsis to stderr and terminate the script.
# Outputs: one usage line on stderr; nothing on stdout.
# Returns: never returns; exits the calling shell with status 1.
usage() {
  printf 'Usage: %s [ -l search_list ] [ -f file_to_grep ] [ -n split_num ] [ -p num_threads ]\n' "$0" >&2
  exit 1
}
# Parse and validate the command-line options.
#
# Sets four globals that the rest of the script reads:
#   list         file of fixed-string patterns to search for (-l)
#   file         file to be searched (-f)
#   num          number of pieces to split the file into (-n)
#   num_threads  number of jobs to run at once (-p)
# All four options are mandatory; -n and -p must be positive integers.
# Any violation prints the usage line and exits with status 1.
list=
file=
num=
num_threads=
positive_int='^[1-9][0-9]*$'

while getopts ":l:f:n:p:" options; do
  case "${options}" in
    l)
      list=${OPTARG}
      ;;
    f)
      file=${OPTARG}
      ;;
    n)
      num=${OPTARG}
      [[ ${num} =~ ${positive_int} ]] || usage
      ;;
    p)
      num_threads=${OPTARG}
      [[ ${num_threads} =~ ${positive_int} ]] || usage
      ;;
    :)
      # The leading ':' in the optstring routes "missing argument" here.
      >&2 echo "Error: -${OPTARG} requires an argument."
      exit 1
      ;;
    *)
      usage
      ;;
  esac
done

# Check each option directly instead of comparing OPTIND with an expected
# word count: OPTIND rejects valid attached forms such as "-n4" and accepts a
# repeated option standing in for a missing one.
if [[ -z ${list} || -z ${file} || -z ${num} || -z ${num_threads} ]]; then
  usage
fi
# Split ${file} into at most ${num} pieces, count in each piece the lines that
# contain any whole-word fixed string from ${list}, run those counts
# ${num_threads} at a time with parallel, and print the grand total on stdout.
#
# Reads globals: list, file, num, num_threads (set by the option parsing).
# Requires: split, wc, mktemp, grep, parallel, perl.

# check if input files exist
for check in "${list}" "${file}"; do
  if [[ ! -e ${check} ]]; then
    >&2 echo "${check} does not exist"
    exit 1
  fi
done

# split names its pieces with two-letter suffixes (aa..zz) by default, which
# caps the number of pieces at 26 * 26.
max_splits=$(( 26 * 26 ))
if [[ ${num} -gt ${max_splits} ]]; then
  >&2 echo "Please enter a number no greater than ${max_splits}"
  exit 1
fi

# An empty file cannot be split and cannot contain matches.
if [[ ! -s ${file} ]]; then
  echo 0
  exit 0
fi

# Keep every intermediate file in a private directory under the current
# directory, so no user file is overwritten, and remove it on every exit path.
work_dir=$(mktemp -d ./parallel_grep.XXXXXX)
trap 'rm -rf -- "${work_dir:?}"' EXIT

# calculate number of lines per split
# floor(total / num) + 1 equals ceil((total + 1) / num): the extra line covers
# a final line without a trailing newline (which wc -l does not count), so
# split never produces more than ${num} pieces.
total=$(wc -l < "${file}")
lines=$(( total / num + 1 ))

# split file to search
split -l "${lines}" -- "${file}" "${work_dir}/chunk."

# Work from the pieces that actually exist; there can be fewer than ${num}.
chunks=("${work_dir}"/chunk.*)

# file containing commands to run
cmd_txt=${work_dir}/commands.txt

# generate commands
#
# -w to prevent partial matches
# -F Interpret PATTERN as a list of fixed strings, separated by newlines, any of which is to be matched.
# -c count
#
# %q shell-quotes the paths so spaces and glob characters survive the shell
# that parallel uses to run each line. grep exits 1 when a piece has no
# matches; that is a valid count of 0, so only status 2 and above is an error.
for chunk in "${chunks[@]}"; do
  printf 'grep -w -c -F -f %q -- %q || [ "$?" -eq 1 ]\n' "${list}" "${chunk}"
done > "${cmd_txt}"

# "+ 0" forces numeric output even if no count line was read.
parallel -j "${num_threads}" < "${cmd_txt}" | perl -nle '$s += $_; END { print $s + 0 }'

# clean up is done by the EXIT trap
exit 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment