Last active
August 5, 2023 13:57
-
-
Save sebastiancarlos/c1dcc666f7cc3e85efabc5cbf9a99915 to your computer and use it in GitHub Desktop.
Grepdist - Print the number of matches in each 10% section of a file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# All my gist code is licensed under the terms of the MIT license.
# GREPDIST
# Print the number of matches in each 10% section of the file
#
# Example with grep:
# $ grep -c 'Napoleon' war_and_peace.txt
# 576
#
# Example with grepdist:
# $ grepdist 'Napoleon' war_and_peace.txt
# Total: 576
# Section 1 (Lines 1 to 6603): 12 (2.1%)
# Section 2 (Lines 6604 to 13206): 3 (0.5%)
# Section 3 (Lines 13207 to 19809): 32 (5.6%)
# Section 4 (Lines 19810 to 26412): 36 (6.2%)
# Section 5 (Lines 26413 to 33015): 5 (0.9%)
# Section 6 (Lines 33016 to 39618): 154 (26.7%)
# Section 7 (Lines 39619 to 46221): 158 (27.4%)
# Section 8 (Lines 46222 to 52824): 23 (4.0%)
# Section 9 (Lines 52825 to 59427): 88 (15.3%)
# Section 10 (Lines 59428 to 66030): 65 (11.3%)
#
# Or pass a custom number of sections:
# $ grepdist -s 5 'Napoleon' war_and_peace.txt
# Total: 576
# Section 1 (Lines 1 to 13206): 15 (2.6%)
# Section 2 (Lines 13207 to 26412): 68 (11.8%)
# Section 3 (Lines 26413 to 39618): 159 (27.6%)
# Section 4 (Lines 39619 to 52824): 181 (31.4%)
# Section 5 (Lines 52825 to 66030): 153 (26.6%)
#
# It also supports the same options as grep:
# -i, --ignore-case
# -w, --word-regexp
# ARG_OPTIONAL_SINGLE([sections],[s],[the number of sections to divide the file into],[10])
# ARG_OPTIONAL_BOOLEAN([count],[c],[print only the total count of matches])
# ARG_OPTIONAL_BOOLEAN([ignore-case],[i],[ignore case distinctions in PATTERN and FILE])
# ARG_OPTIONAL_BOOLEAN([percentage],[p],[print only the percentage of matches in each section])
# ARG_OPTIONAL_BOOLEAN([word-regexp],[w],[force PATTERN to match only whole words])
# ARG_POSITIONAL_SINGLE([pattern],[the pattern to search for],[])
# ARG_POSITIONAL_SINGLE([file],[the file to search in],[])
# ARG_HELP([grepdist - search for a pattern in a file and print the total number of matches and the percentage of matches in each 10% section of the file])
# ARGBASH_GO()
# needed because of Argbash --> m4_ignore([
### START OF CODE GENERATED BY Argbash v2.10.0 one line above ###
# Argbash is a bash code generator used to get arguments parsing right.
# Argbash is FREE SOFTWARE, see https://argbash.dev for more info
# Print an error message to stderr and terminate the script.
# Arguments:
#   $1 - message to print
#   $2 - exit status (optional, defaults to 1)
# Globals:
#   _PRINT_HELP - when set to "yes", the usage text is written to stderr
#                 before the message
die()
{
  local _ret="${2:-1}"
  if [[ "${_PRINT_HELP:-no}" == yes ]]; then
    print_help >&2
  fi
  # printf instead of echo so that a message such as "-n" is printed verbatim.
  printf '%s\n' "$1" >&2
  exit "${_ret}"
}
# Tell whether the first character of $1 is one of the script's short option
# letters (s, c, i, p, w, h).
# Arguments:
#   $1 - the remainder of a short-option cluster (without the leading dash)
# Returns:
#   0 if the first character is a known short option, 1 otherwise
begins_with_short_option()
{
  local first_option all_short_options='scipwh'
  first_option="${1:0:1}"
  # The quoted expansion is matched literally, so glob characters such as
  # '?' or '*' are not mistaken for a valid option letter.
  [[ -n "$first_option" && "$all_short_options" == *"$first_option"* ]]
}
# THE DEFAULTS INITIALIZATION - POSITIONALS
# Positional arguments are collected here in the order they were given.
_positionals=()
# THE DEFAULTS INITIALIZATION - OPTIONALS
# Boolean options hold the strings "on" / "off".
_arg_sections="10"
_arg_count="off"
_arg_ignore_case="off"
_arg_percentage="off"
_arg_word_regexp="off"
# Write the usage text to stdout: a one-line description, the synopsis
# (with $0 as the program name), and one tab-indented line per argument.
print_help()
{
  printf '%s\n' "grepdist - search for a pattern in a file and print the total number of matches and the percentage of matches in each 10% section of the file"
  printf 'Usage: %s [-s|--sections <arg>] [-c|--(no-)count] [-i|--(no-)ignore-case] [-p|--(no-)percentage] [-w|--(no-)word-regexp] [-h|--help] <pattern> <file>\n' "$0"
  # printf re-applies the format to every remaining argument, one line each.
  printf '\t%s\n' \
    "<pattern>: the pattern to search for" \
    "<file>: the file to search in" \
    "-s, --sections: the number of sections to divide the file into (default: '10')" \
    "-c, --count, --no-count: print only the total count of matches (off by default)" \
    "-i, --ignore-case, --no-ignore-case: ignore case distinctions in PATTERN and FILE (off by default)" \
    "-p, --percentage, --no-percentage: print only the percentage of matches in each section (off by default)" \
    "-w, --word-regexp, --no-word-regexp: force PATTERN to match only whole words (off by default)" \
    "-h, --help: Prints help"
}
# Parse the command line into the _arg_* option globals and the _positionals
# array.
# Arguments:
#   the script's command-line arguments ("$@")
# Globals written:
#   _arg_sections, _arg_count, _arg_ignore_case, _arg_percentage,
#   _arg_word_regexp, _positionals, _positionals_count, _last_positional,
#   _key, _next
# Exits:
#   via die (status 1) when -s/--sections lacks a value or when a short-option
#   cluster such as "-cx" cannot be decomposed; with status 0 after printing
#   the help for -h/--help.
# Anything that matches no option pattern is collected as a positional.
parse_commandline()
{
  _positionals_count=0
  while (( $# > 0 )); do
    _key="$1"
    case "$_key" in
      -s|--sections)
        (( $# >= 2 )) || die "Missing value for the optional argument '$_key'." 1
        _arg_sections="$2"
        shift
        ;;
      --sections=*)
        _arg_sections="${_key##--sections=}"
        ;;
      -s*)
        # Value glued to the short option, e.g. -s5
        _arg_sections="${_key##-s}"
        ;;
      -c|--count)
        _arg_count="on"
        ;;
      --no-count)
        _arg_count="off"
        ;;
      -i|--ignore-case)
        _arg_ignore_case="on"
        ;;
      --no-ignore-case)
        _arg_ignore_case="off"
        ;;
      -p|--percentage)
        _arg_percentage="on"
        ;;
      --no-percentage)
        _arg_percentage="off"
        ;;
      -w|--word-regexp)
        _arg_word_regexp="on"
        ;;
      --no-word-regexp)
        _arg_word_regexp="off"
        ;;
      -h*|--help)
        print_help
        exit 0
        ;;
      -c*|-i*|-p*|-w*)
        # Cluster of short options such as -ci or -ws5. The bare forms were
        # matched above, so at least one character follows the flag letter.
        case "${_key:0:2}" in
          -c) _arg_count="on" ;;
          -i) _arg_ignore_case="on" ;;
          -p) _arg_percentage="on" ;;
          -w) _arg_word_regexp="on" ;;
        esac
        _next="${_key:2}"
        if ! begins_with_short_option "$_next"; then
          die "The short option '$_key' can't be decomposed to ${_key:0:2} and -${_key:2}, because ${_key:0:2} doesn't accept value and '-${_key:2:1}' doesn't correspond to a short option."
        fi
        # Replace the cluster by "<flag> -<rest>"; the shift after esac drops
        # the flag, so the next iteration continues with "-<rest>".
        shift
        set -- "${_key:0:2}" "-${_next}" "$@"
        ;;
      *)
        _last_positional="$1"
        _positionals+=("$_last_positional")
        _positionals_count=$((_positionals_count + 1))
        ;;
    esac
    shift
  done
}
# Verify that exactly two positional arguments (pattern and file) were given.
# Globals read:
#   _positionals_count, _last_positional
# Exits:
#   via die with status 1, with _PRINT_HELP=yes set for that call so the usage
#   text is requested, when too few or too many positionals were passed.
handle_passed_args_count()
{
  local _required_args_string="'pattern' and 'file'"
  if (( _positionals_count < 2 )); then
    _PRINT_HELP=yes die "FATAL ERROR: Not enough positional arguments - we require exactly 2 (namely: $_required_args_string), but got only ${_positionals_count}." 1
  fi
  if (( _positionals_count > 2 )); then
    _PRINT_HELP=yes die "FATAL ERROR: There were spurious positional arguments --- we expect exactly 2 (namely: $_required_args_string), but got ${_positionals_count} (the last one was: '${_last_positional}')." 1
  fi
}
# Copy the collected positional values into their named variables.
# Arguments:
#   $1  - number of leading arguments to skip (the caller passes 1, which
#         skips this count itself)
#   ... - the positional values, in order
# Globals written:
#   _positional_names, _arg_pattern, _arg_file
# Variables for which no value was supplied are left untouched.
assign_positional_args()
{
  local _positional_name _shift_for=$1
  _positional_names="_arg_pattern _arg_file "
  shift "$_shift_for"
  # Intentionally unquoted: the list is split into one variable name per word.
  for _positional_name in ${_positional_names}; do
    (( $# > 0 )) || break
    # printf -v assigns by name without re-evaluating the value through eval.
    printf -v "$_positional_name" '%s' "$1" || die "Error during argument parsing, possibly an Argbash bug." 1
    shift
  done
}
# Entry sequence: parse the options, check that exactly two positionals were
# given, then bind them to _arg_pattern and _arg_file.
parse_commandline "$@"
handle_passed_args_count
# The leading 1 is the shift count that assign_positional_args consumes itself.
assign_positional_args 1 "${_positionals[@]}"
# OTHER STUFF GENERATED BY Argbash
### END OF CODE GENERATED BY Argbash (sortof) ### ])
# [ <-- needed because of Argbash
# Main body: count the matches of _arg_pattern in _arg_file and report how
# they are distributed over _arg_sections equally sized line ranges.
# Inputs (set by the argument parsing above): _arg_file, _arg_pattern,
# _arg_sections, _arg_count, _arg_ignore_case, _arg_percentage,
# _arg_word_regexp.

# The section count is used as a divisor and as a loop bound, so reject
# anything that is not a positive integer (10# forces base 10 for "08").
if [[ ! "$_arg_sections" =~ ^[0-9]+$ ]] || (( 10#$_arg_sections < 1 )); then
  die "The number of sections must be a positive integer, but got '$_arg_sections'." 1
fi
sections=$(( 10#$_arg_sections ))

# get file with cat (so that we can handle stdin too, by passing "-")
file=$(cat -- "$_arg_file") || die "Cannot read '$_arg_file'." 1

# total number of lines in the file; the command substitution above strips
# trailing newlines, so an empty input is counted as 0 lines, not 1
if [[ -z "$file" ]]; then
  total=0
else
  total=$(wc -l <<<"$file" | tr -d ' ')
fi
section_length=$(( total / sections ))

# pass grep flags -i and -w if needed (an array keeps each flag one word)
grep_flags=(-n)
if [[ "$_arg_ignore_case" == "on" ]]; then
  grep_flags+=(-i)
fi
if [[ "$_arg_word_regexp" == "on" ]]; then
  grep_flags+=(-w)
fi

# run grep; every output line is "<line number>:<matching line>".
# -e keeps a pattern that starts with '-' from being parsed as an option.
match_lines=""
if (( total > 0 )); then
  match_lines=$(grep "${grep_flags[@]}" -e "$_arg_pattern" <<<"$file")
  grep_status=$?
  # grep exits with 1 for "no match" and with a larger value on a real error
  if (( grep_status > 1 )); then
    die "grep failed for pattern '$_arg_pattern'." "$grep_status"
  fi
fi

# An empty result means zero matches; feeding an empty string to wc -l would
# count one (empty) line.
if [[ -z "$match_lines" ]]; then
  total_matches=0
else
  total_matches=$(wc -l <<<"$match_lines" | tr -d ' ')
fi

# print the total
echo "Total: $total_matches"

# if the section length is 0 (file too short) or there are no matches,
# the total is all there is to report
if (( section_length == 0 || total_matches == 0 )); then
  exit 0
fi

# One awk pass assigns every matching line number to its section and prints
# "<count> <percentage>" per section. Lines past the last full section belong
# to the last section, which runs to the end of the file.
section_stats=$(awk -F: -v len="$section_length" -v n="$sections" -v t="$total_matches" '
  $1 ~ /^[0-9]+$/ {
    s = int(($1 - 1) / len) + 1
    if (s > n) s = n
    count[s]++
  }
  END {
    for (s = 1; s <= n; s++)
      printf "%d %.1f\n", count[s], count[s] / t * 100
  }' <<<"$match_lines")

# print the sections
i=0
while read -r section_matches percentage; do
  i=$(( i + 1 ))
  section_start=$(( section_length * (i - 1) + 1 ))
  # if last section, then section end is the total number of lines
  if (( i == sections )); then
    section_end=$total
  else
    section_end=$(( section_length * i ))
  fi
  # Line numbers are padded to the number of digits in the total number of
  # lines; the section number is padded to the number of digits in the number
  # of sections.
  printf 'Section %*d (Lines %*d to %*d):' \
    "${#sections}" "$i" "${#total}" "$section_start" "${#total}" "$section_end"
  # Match counts are padded to the number of digits in the total match count.
  # NOTE(review): the help text describes -c as "print only the total count",
  # but the behaviour kept here is per-section counts without percentages.
  if [[ "$_arg_percentage" == "on" ]]; then
    printf '\t(%s%%)\n' "$percentage"
  elif [[ "$_arg_count" == "on" ]]; then
    printf '\t%*d\n' "${#total_matches}" "$section_matches"
  else
    printf '\t%*d\t(%s%%)\n' "${#total_matches}" "$section_matches" "$percentage"
  fi
done <<<"$section_stats"
# ] <-- needed because of Argbash
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment