Skip to content

Instantly share code, notes, and snippets.

@sebastiancarlos
Last active August 5, 2023 13:57
Show Gist options
  • Save sebastiancarlos/c1dcc666f7cc3e85efabc5cbf9a99915 to your computer and use it in GitHub Desktop.
Save sebastiancarlos/c1dcc666f7cc3e85efabc5cbf9a99915 to your computer and use it in GitHub Desktop.
Grepdist - Print the number of matches in each 10% section of a file
#!/usr/bin/env bash
# All my gist code is licensed under the terms of the MIT license.
# GREPDIST
# Print the number of matches in each 10% section of the file
#
# Example with grep:
# $ grep -c 'Napoleon' war_and_peace.txt
# 576
#
# Example with grepdist:
# $ grepdist 'Napoleon' war_and_peace.txt
# Total: 576
# Section 1 (Lines 1 to 6603): 12 (2.1%)
# Section 2 (Lines 6604 to 13206): 3 (0.5%)
# Section 3 (Lines 13207 to 19809): 32 (5.6%)
# Section 4 (Lines 19810 to 26412): 36 (6.2%)
# Section 5 (Lines 26413 to 33015): 5 (0.9%)
# Section 6 (Lines 33016 to 39618): 154 (26.7%)
# Section 7 (Lines 39619 to 46221): 158 (27.4%)
# Section 8 (Lines 46222 to 52824): 23 (4.0%)
# Section 9 (Lines 52825 to 59427): 88 (15.3%)
# Section 10 (Lines 59428 to 66030): 65 (11.3%)
#
# Or pass a custom number of sections:
# $ grepdist -s 5 'Napoleon' war_and_peace.txt
# Total: 576
# Section 1 (Lines 1 to 13206): 15 (2.6%)
# Section 2 (Lines 13207 to 26412): 68 (11.8%)
# Section 3 (Lines 26413 to 39618): 159 (27.6%)
# Section 4 (Lines 39619 to 52824): 181 (31.4%)
# Section 5 (Lines 52825 to 66030): 153 (26.6%)
#
# It also supports these grep-style matching options (forwarded to grep):
# -i, --ignore-case
# -w, --word-regexp
# ARG_OPTIONAL_SINGLE([sections],[s],[the number of sections to divide the file into],[10])
# ARG_OPTIONAL_BOOLEAN([count],[c],[print only the total count of matches])
# ARG_OPTIONAL_BOOLEAN([ignore-case],[i],[ignore case distinctions in PATTERN and FILE])
# ARG_OPTIONAL_BOOLEAN([percentage],[p],[print only the percentage of matches in each section])
# ARG_OPTIONAL_BOOLEAN([word-regexp],[w],[force PATTERN to match only whole words])
# ARG_POSITIONAL_SINGLE([pattern],[the pattern to search for],[])
# ARG_POSITIONAL_SINGLE([file],[the file to search in],[])
# ARG_HELP([grepdist - search for a pattern in a file and print the total number of matches and the percentage of matches in each 10% section of the file])
# ARGBASH_GO()
# needed because of Argbash --> m4_ignore([
### START OF CODE GENERATED BY Argbash v2.10.0 one line above ###
# Argbash is a bash code generator used to get arguments parsing right.
# Argbash is FREE SOFTWARE, see https://argbash.dev for more info
# Print an error message to stderr and terminate the script.
# Arguments: $1 - message to print
#            $2 - exit status (optional, defaults to 1)
# Globals:   _PRINT_HELP (read) - when set to "yes", the usage text produced by
#            print_help is written to stderr before the message.
die()
{
  local _status="${2:-1}"
  if [ "${_PRINT_HELP:-no}" = yes ]; then
    print_help >&2
  fi
  echo "$1" >&2
  exit "${_status}"
}
# Tell whether a string starts with one of the script's short option letters.
# Used when splitting clustered short options such as -ci.
# Arguments: $1 - the text that follows an already recognised short option
# Returns:   0 if the first character of $1 is one of s, c, i, p, w, h;
#            1 otherwise (including when $1 is empty).
begins_with_short_option()
{
  local first_option all_short_options='scipwh'
  first_option="${1:0:1}"
  # The character is quoted inside the pattern so that it is compared
  # literally: glob characters such as ? or * must not count as options.
  [[ -n "$first_option" && "$all_short_options" == *"$first_option"* ]]
}
# THE DEFAULTS INITIALIZATION - POSITIONALS
# Starts empty; parse_commandline appends every positional argument it sees.
_positionals=()
# THE DEFAULTS INITIALIZATION - OPTIONALS
# Boolean switches hold the literal strings on/off and all start disabled.
_arg_word_regexp=off
_arg_percentage=off
_arg_ignore_case=off
_arg_count=off
# Number of sections used unless -s/--sections supplies another value.
_arg_sections=10
# Write the usage text to stdout: a one-line summary, the synopsis (with the
# program name taken from $0) and one tab-indented line per argument.
print_help()
{
  printf '%s\n' "grepdist - search for a pattern in a file and print the total number of matches and the percentage of matches in each 10% section of the file"
  printf 'Usage: %s [-s|--sections <arg>] [-c|--(no-)count] [-i|--(no-)ignore-case] [-p|--(no-)percentage] [-w|--(no-)word-regexp] [-h|--help] <pattern> <file>\n' "$0"
  # printf re-applies its format to each remaining argument, so this single
  # call emits one description per line.
  printf '\t%s\n' \
    "<pattern>: the pattern to search for" \
    "<file>: the file to search in" \
    "-s, --sections: the number of sections to divide the file into (default: '10')" \
    "-c, --count, --no-count: print only the total count of matches (off by default)" \
    "-i, --ignore-case, --no-ignore-case: ignore case distinctions in PATTERN and FILE (off by default)" \
    "-p, --percentage, --no-percentage: print only the percentage of matches in each section (off by default)" \
    "-w, --word-regexp, --no-word-regexp: force PATTERN to match only whole words (off by default)" \
    "-h, --help: Prints help"
}
# Walk over the command-line arguments and record them in global variables.
# Arguments: the script's command line ("$@")
# Globals written: _arg_sections, _arg_count, _arg_ignore_case,
#                  _arg_percentage, _arg_word_regexp, _positionals,
#                  _positionals_count, _last_positional, _key, _next
# Behaviour:
#   * -s/--sections takes a value (separate word, --sections=V or -sV).
#   * -c, -i, -p and -w are switches; the --no-<name> long form turns them off.
#   * Short switches may be clustered (e.g. -ci); the remainder of a cluster is
#     pushed back onto the argument list and parsed on the next iteration.
#   * -h/--help prints the usage text and exits with status 0.
#   * Anything else is collected as a positional argument.
parse_commandline()
{
  _positionals_count=0
  while [ "$#" -gt 0 ]; do
    _key="$1"
    case "$_key" in
      -s | --sections)
        if [ "$#" -lt 2 ]; then
          die "Missing value for the optional argument '$_key'." 1
        fi
        _arg_sections="$2"
        # Drop the option here; the shift after esac then drops its value.
        shift
        ;;
      --sections=*)
        _arg_sections="${_key#--sections=}"
        ;;
      -s*)
        _arg_sections="${_key#-s}"
        ;;
      -c | --count)
        _arg_count="on"
        ;;
      --no-count)
        _arg_count="off"
        ;;
      -c*)
        _arg_count="on"
        _next="${_key#-c}"
        if [[ -n "$_next" && "$_next" != "$_key" ]]; then
          if begins_with_short_option "$_next"; then
            # Replace the cluster with its remainder and parse that next.
            shift
            set -- "-${_next}" "$@"
            continue
          else
            die "The short option '$_key' can't be decomposed to ${_key:0:2} and -${_key:2}, because ${_key:0:2} doesn't accept value and '-${_key:2:1}' doesn't correspond to a short option."
          fi
        fi
        ;;
      -i | --ignore-case)
        _arg_ignore_case="on"
        ;;
      --no-ignore-case)
        _arg_ignore_case="off"
        ;;
      -i*)
        _arg_ignore_case="on"
        _next="${_key#-i}"
        if [[ -n "$_next" && "$_next" != "$_key" ]]; then
          if begins_with_short_option "$_next"; then
            shift
            set -- "-${_next}" "$@"
            continue
          else
            die "The short option '$_key' can't be decomposed to ${_key:0:2} and -${_key:2}, because ${_key:0:2} doesn't accept value and '-${_key:2:1}' doesn't correspond to a short option."
          fi
        fi
        ;;
      -p | --percentage)
        _arg_percentage="on"
        ;;
      --no-percentage)
        _arg_percentage="off"
        ;;
      -p*)
        _arg_percentage="on"
        _next="${_key#-p}"
        if [[ -n "$_next" && "$_next" != "$_key" ]]; then
          if begins_with_short_option "$_next"; then
            shift
            set -- "-${_next}" "$@"
            continue
          else
            die "The short option '$_key' can't be decomposed to ${_key:0:2} and -${_key:2}, because ${_key:0:2} doesn't accept value and '-${_key:2:1}' doesn't correspond to a short option."
          fi
        fi
        ;;
      -w | --word-regexp)
        _arg_word_regexp="on"
        ;;
      --no-word-regexp)
        _arg_word_regexp="off"
        ;;
      -w*)
        _arg_word_regexp="on"
        _next="${_key#-w}"
        if [[ -n "$_next" && "$_next" != "$_key" ]]; then
          if begins_with_short_option "$_next"; then
            shift
            set -- "-${_next}" "$@"
            continue
          else
            die "The short option '$_key' can't be decomposed to ${_key:0:2} and -${_key:2}, because ${_key:0:2} doesn't accept value and '-${_key:2:1}' doesn't correspond to a short option."
          fi
        fi
        ;;
      -h* | --help)
        # Covers -h, --help and any cluster that starts with -h.
        print_help
        exit 0
        ;;
      *)
        _last_positional="$1"
        _positionals+=("$_last_positional")
        _positionals_count=$((_positionals_count + 1))
        ;;
    esac
    shift
  done
}
# Verify that exactly two positional arguments (pattern and file) were given.
# Globals: _positionals_count (read), _last_positional (read, for the message)
# On a wrong count, die is invoked with _PRINT_HELP=yes and exit status 1, so
# the usage text is shown before the error message.
handle_passed_args_count()
{
  local _expected="'pattern' and 'file'"
  if ! [ "${_positionals_count}" -ge 2 ]; then
    _PRINT_HELP=yes die "FATAL ERROR: Not enough positional arguments - we require exactly 2 (namely: $_expected), but got only ${_positionals_count}." 1
  fi
  if ! [ "${_positionals_count}" -le 2 ]; then
    _PRINT_HELP=yes die "FATAL ERROR: There were spurious positional arguments --- we expect exactly 2 (namely: $_expected), but got ${_positionals_count} (the last one was: '${_last_positional}')." 1
  fi
}
assign_positional_args()
{
local _positional_name _shift_for=$1
_positional_names="_arg_pattern _arg_file "
shift "$_shift_for"
for _positional_name in ${_positional_names}
do
test $# -gt 0 || break
eval "$_positional_name=\${1}" || die "Error during argument parsing, possibly an Argbash bug." 1
shift
done
}
# Parse the script's command line into the _arg_* option variables and the
# _positionals array.
parse_commandline "$@"
# Stop with the usage text unless exactly two positionals were supplied.
handle_passed_args_count
# Store the positionals in _arg_pattern and _arg_file. The leading 1 is the
# number of arguments the function shifts away first, i.e. the 1 itself.
assign_positional_args 1 "${_positionals[@]}"
# OTHER STUFF GENERATED BY Argbash
### END OF CODE GENERATED BY Argbash (sortof) ### ])
# [ <-- needed because of Argbash
# ---------------------------------------------------------------------------
# Main program
#
# Inputs (set by the argument parser above):
#   _arg_pattern      pattern handed to grep
#   _arg_file         file to search; "-" means standard input
#   _arg_sections     number of sections to split the file into
#   _arg_ignore_case  on/off, forwarded to grep as -i
#   _arg_word_regexp  on/off, forwarded to grep as -w
#   _arg_percentage   on/off, print only the percentage per section
#   _arg_count        on/off, print only the count per section
#
# Output: a "Total: N" line, followed by one line per section unless there are
# no matching lines or the file has fewer lines than sections.
# ---------------------------------------------------------------------------

# The section count is used as a divisor and as a loop bound, so anything
# other than a positive decimal integer is rejected before it is used.
if ! [[ "$_arg_sections" =~ ^[0-9]+$ ]] || (( 10#$_arg_sections == 0 )); then
  die "grepdist: the number of sections must be a positive integer (got '$_arg_sections')." 1
fi
# Force base 10 so that a value such as 08 is not read as an octal literal.
sections=$((10#$_arg_sections))

# The input is read twice (once to count lines, once by grep). A regular file
# is read in place; standard input, pipes and the like are spooled to a
# temporary file first so that they can be re-read.
input=$_arg_file
if [[ "$_arg_file" == "-" || ! -f "$_arg_file" ]]; then
  spool=$(mktemp) || die "grepdist: cannot create a temporary file." 2
  trap 'rm -f -- "$spool"' EXIT
  cat -- "$_arg_file" > "$spool" || die "grepdist: cannot read '$_arg_file'." 2
  input=$spool
fi

# Total number of lines. NR also counts a final line that lacks a newline,
# which keeps the total consistent with the line numbers grep reports.
total=$(LC_ALL=C awk 'END { print NR }' < "$input") \
  || die "grepdist: cannot read '$_arg_file'." 2

# Build the grep options as an array so that each one stays a separate word.
grep_opts=(-n)
if [[ "$_arg_ignore_case" == "on" ]]; then
  grep_opts+=(-i)
fi
if [[ "$_arg_word_regexp" == "on" ]]; then
  grep_opts+=(-w)
fi

# Collect the matching lines, each prefixed with its line number by -n.
# -e keeps a pattern that starts with a dash from being taken as an option.
# grep exits with 0 on a match, 1 on no match, and more than 1 on a real error.
hits=$(grep "${grep_opts[@]}" -e "$_arg_pattern" < "$input")
grep_status=$?
if (( grep_status > 1 )); then
  die "grepdist: grep failed with exit status $grep_status." "$grep_status"
fi

# A single awk pass assigns every hit to its section and prints the report.
# LC_ALL=C keeps the decimal point in the percentages independent of locale.
# Percentages are computed at full precision and rounded to one decimal.
printf '%s\n' "$hits" | LC_ALL=C awk -F: \
  -v total="$total" \
  -v sections="$sections" \
  -v only_pct="$_arg_percentage" \
  -v only_count="$_arg_count" '
  BEGIN {
    # Every section spans len lines; the last one also takes the remainder.
    len = int(total / sections)
  }
  # Field 1 is the line number of a hit. The single blank record that printf
  # emits when there were no hits at all is skipped by this condition.
  $1 != "" {
    hits++
    if (len > 0) {
      s = int(($1 - 1) / len) + 1
      if (s > sections) s = sections
      count[s]++
    }
  }
  END {
    print "Total: " (hits + 0)
    # No breakdown when nothing matched or the file is shorter than the
    # number of sections.
    if (hits == 0 || len == 0) exit

    # Column widths: line numbers are padded to the width of the line total,
    # section numbers to the width of the section count, and match counts to
    # the width of the match total.
    line_w = length(total "")
    sect_w = length(sections "")
    hits_w = length(hits "")
    head_fmt  = "Section %" sect_w "d (Lines %" line_w "d to %" line_w "d):"
    count_fmt = "\t%" hits_w "d"

    for (s = 1; s <= sections; s++) {
      first = len * (s - 1) + 1
      last  = (s == sections) ? total : len * s
      pct   = sprintf("%.1f", count[s] * 100 / hits)

      printf head_fmt, s, first, last
      if (only_pct == "on") {
        printf "\t(%s%%)\n", pct
      } else if (only_count == "on") {
        printf count_fmt "\n", count[s]
      } else {
        printf count_fmt "\t(%s%%)\n", count[s], pct
      }
    }
  }
'
# ] <-- needed because of Argbash
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment