sebastiancarlos · August 5, 2023 13:57
diff --git a/grepdist.sh b/grepdist.sh
 #!/usr/bin/env bash

 # All my gist code is licensed under the terms of the MIT license.

 # GREPDIST
 # Print the number of matches in each 10% section of the file
 #
 # Example with grep:
 # $ grep -c 'Napoleon' war_and_peace.txt
 # 576
 #
 # Example with grepdist:
 # $ grepdist 'Napoleon' war_and_peace.txt
 # Total: 576
 # Section  1 (Lines     1 to  6603):       12     (2.1%)
 # Section  2 (Lines  6604 to 13206):        3     (0.5%)
 # Section  3 (Lines 13207 to 19809):       32     (5.6%)
 # Section  4 (Lines 19810 to 26412):       36     (6.2%)
 # Section  5 (Lines 26413 to 33015):        5     (0.9%)
 # Section  6 (Lines 33016 to 39618):      154     (26.7%)
 # Section  7 (Lines 39619 to 46221):      158     (27.4%)
 # Section  8 (Lines 46222 to 52824):       23     (4.0%)
 # Section  9 (Lines 52825 to 59427):       88     (15.3%)
 # Section 10 (Lines 59428 to 66030):       65     (11.3%)
 #
 # Or pass a custom number of sections:
 # $ grepdist -s 5 'Napoleon' war_and_peace.txt
 # Total: 576
 # Section 1 (Lines     1 to 13206):        15     (2.6%)
 # Section 2 (Lines 13207 to 26412):        68     (11.8%)
 # Section 3 (Lines 26413 to 39618):       159     (27.6%)
 # Section 4 (Lines 39619 to 52824):       181     (31.4%)
 # Section 5 (Lines 52825 to 66030):       153     (26.6%)
 #
 # It also supports the same options as grep:
 # -i, --ignore-case
 # -w, --word-regexp

 # ARG_OPTIONAL_SINGLE([sections],[s],[the number of sections to divide the file into],[10])
 # ARG_OPTIONAL_BOOLEAN([count],[c],[print only the total count of matches])
 # ARG_OPTIONAL_BOOLEAN([ignore-case],[i],[ignore case distinctions in PATTERN and FILE])
 # ARG_OPTIONAL_BOOLEAN([percentage],[p],[print only the percentage of matches in each section])
 # ARG_OPTIONAL_BOOLEAN([word-regexp],[w],[force PATTERN to match only whole words])
 # ARG_POSITIONAL_SINGLE([pattern],[the pattern to search for],[])
 # ARG_POSITIONAL_SINGLE([file],[the file to search in],[])
 # ARG_HELP([grepdist - search for a pattern in a file and print the total number of matches and the percentage of matches in each 10% section of the file])
 # ARGBASH_GO()
 # needed because of Argbash --> m4_ignore([
 ### START OF CODE GENERATED BY Argbash v2.10.0 one line above ###
 # Argbash is a bash code generator used to get arguments parsing right.
 # Argbash is FREE SOFTWARE, see https://argbash.dev for more info


 # Print an error message to stderr and terminate the script.
 # Arguments: $1 - message to print
 #            $2 - exit status (optional, defaults to 1)
 # Globals:   _PRINT_HELP (read) - when set to "yes", the output of print_help
 #            is written to stderr before the message
 die()
 {
 	local _exit_code="${2:-1}"
 	if test "${_PRINT_HELP:-no}" = yes
 	then
 		print_help >&2
 	fi
 	echo "$1" >&2
 	exit "${_exit_code}"
 }


 # Tell whether a string starts with one of the script's short option letters.
 # Arguments: $1 - the text that follows a short option inside a cluster
 #                 (for "-ciw" the caller passes "iw")
 # Returns:   0 when the first character is one of s, c, i, p, w, h;
 #            1 otherwise (including for an empty string)
 begins_with_short_option()
 {
 	local first_option="${1:0:1}"
 	# The character is matched as data against a fixed bracket expression, so
 	# glob characters such as "*" or "?" in the input are never interpreted as
 	# patterns (which would make every such cluster look valid).
 	case "$first_option" in
 		[scipwh]) return 0 ;;
 		*) return 1 ;;
 	esac
 }

 # THE DEFAULTS INITIALIZATION - POSITIONALS
 # Starts empty; filled while the command line is parsed.
 _positionals=()
 # THE DEFAULTS INITIALIZATION - OPTIONALS
 # Number of sections the file is divided into.
 _arg_sections=10
 # Boolean options hold the strings "on" / "off".
 _arg_count=off
 _arg_ignore_case=off
 _arg_percentage=off
 _arg_word_regexp=off


 # Write the help text (summary, usage line, one line per argument) to stdout.
 # The usage line shows the script name as invoked ($0).
 print_help()
 {
 	printf '%s\n' "grepdist - search for a pattern in a file and print the total number of matches and the percentage of matches in each 10% section of the file"
 	printf 'Usage: %s [-s|--sections <arg>] [-c|--(no-)count] [-i|--(no-)ignore-case] [-p|--(no-)percentage] [-w|--(no-)word-regexp] [-h|--help] <pattern> <file>\n' "$0"
 	# printf re-applies its format to every remaining argument, so a single
 	# call emits one tab-indented line per description.
 	printf '\t%s\n' \
 		"<pattern>: the pattern to search for" \
 		"<file>: the file to search in" \
 		"-s, --sections: the number of sections to divide the file into (default: '10')" \
 		"-c, --count, --no-count: print only the total count of matches (off by default)" \
 		"-i, --ignore-case, --no-ignore-case: ignore case distinctions in PATTERN and FILE (off by default)" \
 		"-p, --percentage, --no-percentage: print only the percentage of matches in each section (off by default)" \
 		"-w, --word-regexp, --no-word-regexp: force PATTERN to match only whole words (off by default)" \
 		"-h, --help: Prints help"
 }


 # Abort (through die) with the Argbash message for a clustered short option
 # whose second character is not a known short option.
 # Arguments: $1 - the offending command-line word, e.g. "-cx"
 _reject_short_option_cluster()
 {
 	die "The short option '$1' can't be decomposed to ${1:0:2} and -${1:2}, because ${1:0:2} doesn't accept value and '-${1:2:1}' doesn't correspond to a short option."
 }

 # Parse the command line into the _arg_* option variables and the list of
 # positional words.
 # Arguments: the script's command-line words
 # Globals written: _arg_sections, _arg_count, _arg_ignore_case,
 #                  _arg_percentage, _arg_word_regexp, _positionals (appended),
 #                  _positionals_count, _last_positional
 # Exits: status 0 after printing help for -h/--help; through die when -s has
 #        no value or a short-option cluster cannot be decomposed.
 # Any word that matches no option pattern (including unrecognized options) is
 # recorded as a positional argument.
 parse_commandline()
 {
 	local _key _next
 	_positionals_count=0
 	while test $# -gt 0
 	do
 		_key="$1"
 		case "$_key" in
 			-s|--sections)
 				if test $# -lt 2
 				then
 					die "Missing value for the optional argument '$_key'." 1
 				fi
 				_arg_sections="$2"
 				# Drop the option here; its value is dropped by the shift
 				# at the bottom of the loop.
 				shift
 				;;
 			--sections=*)
 				_arg_sections="${_key##--sections=}"
 				;;
 			-s*)
 				# Value glued to the short option, e.g. "-s5".
 				_arg_sections="${_key##-s}"
 				;;
 			-c|--no-count|--count)
 				if [[ "$_key" == --no-* ]]
 				then
 					_arg_count="off"
 				else
 					_arg_count="on"
 				fi
 				;;
 			-c*)
 				# Clustered short options such as "-ci": re-queue them as
 				# "-c" "-i"; the shift at the bottom of the loop then
 				# consumes "-c" and the next iteration handles "-i".
 				# [[ ]] is used instead of "test ... -a ..." because the
 				# latter mis-parses operands such as "(" or "!".
 				_arg_count="on"
 				_next="${_key##-c}"
 				if [[ -n "$_next" && "$_next" != "$_key" ]]
 				then
 					begins_with_short_option "$_next" || _reject_short_option_cluster "$_key"
 					shift
 					set -- "-c" "-${_next}" "$@"
 				fi
 				;;
 			-i|--no-ignore-case|--ignore-case)
 				if [[ "$_key" == --no-* ]]
 				then
 					_arg_ignore_case="off"
 				else
 					_arg_ignore_case="on"
 				fi
 				;;
 			-i*)
 				_arg_ignore_case="on"
 				_next="${_key##-i}"
 				if [[ -n "$_next" && "$_next" != "$_key" ]]
 				then
 					begins_with_short_option "$_next" || _reject_short_option_cluster "$_key"
 					shift
 					set -- "-i" "-${_next}" "$@"
 				fi
 				;;
 			-p|--no-percentage|--percentage)
 				if [[ "$_key" == --no-* ]]
 				then
 					_arg_percentage="off"
 				else
 					_arg_percentage="on"
 				fi
 				;;
 			-p*)
 				_arg_percentage="on"
 				_next="${_key##-p}"
 				if [[ -n "$_next" && "$_next" != "$_key" ]]
 				then
 					begins_with_short_option "$_next" || _reject_short_option_cluster "$_key"
 					shift
 					set -- "-p" "-${_next}" "$@"
 				fi
 				;;
 			-w|--no-word-regexp|--word-regexp)
 				if [[ "$_key" == --no-* ]]
 				then
 					_arg_word_regexp="off"
 				else
 					_arg_word_regexp="on"
 				fi
 				;;
 			-w*)
 				_arg_word_regexp="on"
 				_next="${_key##-w}"
 				if [[ -n "$_next" && "$_next" != "$_key" ]]
 				then
 					begins_with_short_option "$_next" || _reject_short_option_cluster "$_key"
 					shift
 					set -- "-w" "-${_next}" "$@"
 				fi
 				;;
 			--help|-h*)
 				# "-h" alone or followed by anything prints help and stops.
 				print_help
 				exit 0
 				;;
 			*)
 				_last_positional="$1"
 				_positionals+=("$_last_positional")
 				_positionals_count=$((_positionals_count + 1))
 				;;
 		esac
 		shift
 	done
 }


 # Verify that exactly two positional arguments (pattern and file) were given.
 # Globals: _positionals_count (read), _last_positional (read)
 # Exits:   through die with status 1, with _PRINT_HELP=yes set for the call,
 #          when there are fewer or more than two positionals.
 handle_passed_args_count()
 {
 	local _required_args_string="'pattern' and 'file'"
 	if ! test "${_positionals_count}" -ge 2
 	then
 		_PRINT_HELP=yes die "FATAL ERROR: Not enough positional arguments - we require exactly 2 (namely: $_required_args_string), but got only ${_positionals_count}." 1
 	fi
 	if ! test "${_positionals_count}" -le 2
 	then
 		_PRINT_HELP=yes die "FATAL ERROR: There were spurious positional arguments --- we expect exactly 2 (namely: $_required_args_string), but got ${_positionals_count} (the last one was: '${_last_positional}')." 1
 	fi
 }


 # Copy positional values into their named variables.
 # Arguments: $1   - how many leading arguments (this one included) to drop
 #                   before assigning
 #            rest - the positional values, in order
 # Globals written: _positional_names, _arg_pattern, _arg_file
 # A variable whose value is missing is left untouched.
 assign_positional_args()
 {
 	local _positional_name _shift_for=$1
 	_positional_names="_arg_pattern _arg_file "

 	shift "$_shift_for"
 	# Intentionally unquoted: the space-separated list is split into names.
 	for _positional_name in ${_positional_names}
 	do
 		if test $# -eq 0
 		then
 			break
 		fi
 		# printf -v stores the value verbatim in the variable named by
 		# $_positional_name.
 		printf -v "$_positional_name" '%s' "${1}" || die "Error during argument parsing, possibly an Argbash bug." 1
 		shift
 	done
 }

 # Run the argument parser. The three steps depend on each other and must stay
 # in this order.
 # 1. Parse options and collect the positional words into _positionals.
 parse_commandline "$@"
 # 2. Abort unless exactly the two required positionals were supplied.
 handle_passed_args_count
 # 3. Copy the positionals into _arg_pattern and _arg_file; the leading "1" is
 #    the shift count that assign_positional_args drops before assigning.
 assign_positional_args 1 "${_positionals[@]}"

 # OTHER STUFF GENERATED BY Argbash

 ### END OF CODE GENERATED BY Argbash (sortof) ### ])
 # [ <-- needed because of Argbash

 # ---------------------------------------------------------------------------
 # Main logic.
 # Inputs (set by the argument parser): _arg_pattern, _arg_file, _arg_sections,
 # _arg_count, _arg_percentage, _arg_ignore_case, _arg_word_regexp.
 # Output: "Total: N", then one line per section unless the file has fewer
 # lines than sections or nothing matched.
 # ---------------------------------------------------------------------------

 # The section count must be a positive integer; anything else would make the
 # arithmetic below fail with confusing errors.
 case "$_arg_sections" in
   '' | *[!0-9]*)
     die "grepdist: invalid number of sections: '$_arg_sections'" 1
     ;;
 esac
 # Force base 10 so that a value such as "08" is not read as octal.
 _arg_sections=$((10#$_arg_sections))
 if [ "$_arg_sections" -eq 0 ]; then
   die "grepdist: the number of sections must be greater than zero" 1
 fi

 # get file with cat (so that we can handle stdin too, passed as "-").
 # The trailing "x" sentinel stops command substitution from stripping
 # trailing blank lines, which would shrink the line total.
 file=$(cat -- "$_arg_file" && printf x) || die "grepdist: cannot read '$_arg_file'" 1
 file=${file%x}

 # total number of lines in the file (an unterminated last line still counts,
 # an empty file has 0 lines)
 total=$(printf '%s' "$file" | awk 'END { print NR }')

 section_length=$((total / _arg_sections))

 # pass grep flags -i and -w if needed; an array keeps every flag a separate
 # word without relying on word splitting
 grep_flags=(-n)
 if [ "$_arg_ignore_case" = "on" ]; then
   grep_flags+=(-i)
 fi
 if [ "$_arg_word_regexp" = "on" ]; then
   grep_flags+=(-w)
 fi

 # run grep; -e protects patterns that start with "-". Exit status 1 only
 # means "no match"; anything above that is a real error (e.g. a bad regex).
 found=$(printf '%s' "$file" | grep "${grep_flags[@]}" -e "$_arg_pattern")
 grep_status=$?
 if [ "$grep_status" -gt 1 ]; then
   die "grepdist: grep failed for pattern '$_arg_pattern'" "$grep_status"
 fi

 # keep only the line numbers of the matches
 found=$(printf '%s\n' "$found" | cut -f1 -d:)

 # Count the matches without a trailing newline so that an empty result is
 # 0 matches rather than 1.
 total_matches=$(printf '%s' "$found" | awk 'END { print NR }')

 # print the total
 printf 'Total: %s\n' "$total_matches"

 # Nothing more to report when the section length is 0 (file too short) or
 # when there are no matches.
 if [ "$section_length" -eq 0 ] || [ "$total_matches" -eq 0 ]; then
   exit 0
 fi

 # What follows each section header: percentage only (-p, which wins over -c),
 # count only (-c), or both.
 if [ "$_arg_percentage" = "on" ]; then
   mode="percentage"
 elif [ "$_arg_count" = "on" ]; then
   mode="count"
 else
   mode="both"
 fi

 # Tally and print all sections in a single awk pass over the match list.
 printf '%s\n' "$found" | awk \
   -v sections="$_arg_sections" \
   -v section_length="$section_length" \
   -v total="$total" \
   -v total_matches="$total_matches" \
   -v mode="$mode" '
   # Assign each matching line number to its section. Lines past the last
   # full section (the remainder of the division) belong to the last section.
   {
     section = int(($1 - 1) / section_length) + 1
     if (section > sections) section = sections
     matches[section]++
   }
   END {
     # Line numbers are padded to the width of the line total, section
     # numbers to the width of the section count, and match counts to the
     # width of the match total.
     header_format = "Section %" length(sections "") "d (Lines %" length(total "") "d to %" length(total "") "d):"
     count_format = "%" length(total_matches "") "d"
     for (section = 1; section <= sections; section++) {
       first = section_length * (section - 1) + 1
       last = (section == sections) ? total : section_length * section
       header = sprintf(header_format, section, first, last)
       count = sprintf(count_format, matches[section])
       percentage = sprintf("(%.1f%%)", matches[section] / total_matches * 100)
       if (mode == "percentage")
         print header "\t" percentage
       else if (mode == "count")
         print header "\t" count
       else
         print header "\t" count "\t" percentage
     }
   }'

 # ] <-- needed because of Argbash
	#!/usr/bin/env bash

	# All my gist code is licensed under the terms of the MIT license.

	# GREPDIST
	# Print the number of matches in each 10% section of the file
	#
	# Example with grep:
	# $ grep -c 'Napoleon' war_and_peace.txt
	# 576
	#
	# Example with grepdist:
	# $ grepdist 'Napoleon' war_and_peace.txt
	# Total: 576
	# Section 1 (Lines 1 to 6603): 12 (2.1%)
	# Section 2 (Lines 6604 to 13206): 3 (0.5%)
	# Section 3 (Lines 13207 to 19809): 32 (5.6%)
	# Section 4 (Lines 19810 to 26412): 36 (6.2%)
	# Section 5 (Lines 26413 to 33015): 5 (0.9%)
	# Section 6 (Lines 33016 to 39618): 154 (26.7%)
	# Section 7 (Lines 39619 to 46221): 158 (27.4%)
	# Section 8 (Lines 46222 to 52824): 23 (4.0%)
	# Section 9 (Lines 52825 to 59427): 88 (15.3%)
	# Section 10 (Lines 59428 to 66030): 65 (11.3%)
	#
	# Or pass a custom number of sections:
	# $ grepdist -s 5 'Napoleon' war_and_peace.txt
	# Total: 576
	# Section 1 (Lines 1 to 13206): 15 (2.6%)
	# Section 2 (Lines 13207 to 26412): 68 (11.8%)
	# Section 3 (Lines 26413 to 39618): 159 (27.6%)
	# Section 4 (Lines 39619 to 52824): 181 (31.4%)
	# Section 5 (Lines 52825 to 66030): 153 (26.6%)
	#
	# It also supports the same options as grep:
	# -i, --ignore-case
	# -w, --word-regexp

	# ARG_OPTIONAL_SINGLE([sections],[s],[the number of sections to divide the file into],[10])
	# ARG_OPTIONAL_BOOLEAN([count],[c],[print only the total count of matches])
	# ARG_OPTIONAL_BOOLEAN([ignore-case],[i],[ignore case distinctions in PATTERN and FILE])
	# ARG_OPTIONAL_BOOLEAN([percentage],[p],[print only the percentage of matches in each section])
	# ARG_OPTIONAL_BOOLEAN([word-regexp],[w],[force PATTERN to match only whole words])
	# ARG_POSITIONAL_SINGLE([pattern],[the pattern to search for],[])
	# ARG_POSITIONAL_SINGLE([file],[the file to search in],[])
	# ARG_HELP([grepdist - search for a pattern in a file and print the total number of matches and the percentage of matches in each 10% section of the file])
	# ARGBASH_GO()
	# needed because of Argbash --> m4_ignore([
	### START OF CODE GENERATED BY Argbash v2.10.0 one line above ###
	# Argbash is a bash code generator used to get arguments parsing right.
	# Argbash is FREE SOFTWARE, see https://argbash.dev for more info


	# Print an error message to stderr and terminate the script.
	# Arguments: $1 - message to print
	#            $2 - exit status (optional, defaults to 1)
	# Globals:   _PRINT_HELP (read) - when set to "yes", the output of print_help
	#            is written to stderr before the message
	die()
	{
		local _exit_code="${2:-1}"
		if test "${_PRINT_HELP:-no}" = yes
		then
			print_help >&2
		fi
		echo "$1" >&2
		exit "${_exit_code}"
	}


	# Tell whether a string starts with one of the script's short option letters.
	# Arguments: $1 - the text that follows a short option inside a cluster
	#                 (for "-ciw" the caller passes "iw")
	# Returns:   0 when the first character is one of s, c, i, p, w, h;
	#            1 otherwise (including for an empty string)
	begins_with_short_option()
	{
		local first_option="${1:0:1}"
		# The character is matched as data against a fixed bracket expression,
		# so glob characters such as "*" or "?" in the input are never
		# interpreted as patterns.
		case "$first_option" in
			[scipwh]) return 0 ;;
			*) return 1 ;;
		esac
	}

	# THE DEFAULTS INITIALIZATION - POSITIONALS
	# Starts empty; filled while the command line is parsed.
	_positionals=()
	# THE DEFAULTS INITIALIZATION - OPTIONALS
	# Number of sections the file is divided into.
	_arg_sections=10
	# Boolean options hold the strings "on" / "off".
	_arg_count=off
	_arg_ignore_case=off
	_arg_percentage=off
	_arg_word_regexp=off


	# Write the help text (summary, usage line, one line per argument) to stdout.
	# The usage line shows the script name as invoked ($0).
	print_help()
	{
		printf '%s\n' "grepdist - search for a pattern in a file and print the total number of matches and the percentage of matches in each 10% section of the file"
		# Alternatives are separated by a plain "|"; an escaped "\|" would be
		# printed literally, backslash included.
		printf 'Usage: %s [-s|--sections <arg>] [-c|--(no-)count] [-i|--(no-)ignore-case] [-p|--(no-)percentage] [-w|--(no-)word-regexp] [-h|--help] <pattern> <file>\n' "$0"
		# printf re-applies its format to every remaining argument, so a
		# single call emits one tab-indented line per description.
		printf '\t%s\n' \
			"<pattern>: the pattern to search for" \
			"<file>: the file to search in" \
			"-s, --sections: the number of sections to divide the file into (default: '10')" \
			"-c, --count, --no-count: print only the total count of matches (off by default)" \
			"-i, --ignore-case, --no-ignore-case: ignore case distinctions in PATTERN and FILE (off by default)" \
			"-p, --percentage, --no-percentage: print only the percentage of matches in each section (off by default)" \
			"-w, --word-regexp, --no-word-regexp: force PATTERN to match only whole words (off by default)" \
			"-h, --help: Prints help"
	}


	# Abort (through die) with the Argbash message for a clustered short option
	# whose second character is not a known short option.
	# Arguments: $1 - the offending command-line word, e.g. "-cx"
	_reject_short_option_cluster()
	{
		die "The short option '$1' can't be decomposed to ${1:0:2} and -${1:2}, because ${1:0:2} doesn't accept value and '-${1:2:1}' doesn't correspond to a short option."
	}

	# Parse the command line into the _arg_* option variables and the list of
	# positional words.
	# Arguments: the script's command-line words
	# Globals written: _arg_sections, _arg_count, _arg_ignore_case,
	#                  _arg_percentage, _arg_word_regexp, _positionals (appended),
	#                  _positionals_count, _last_positional
	# Exits: status 0 after printing help for -h/--help; through die when -s has
	#        no value or a short-option cluster cannot be decomposed.
	# Any word that matches no option pattern (including unrecognized options) is
	# recorded as a positional argument.
	parse_commandline()
	{
		local _key _next
		_positionals_count=0
		while test $# -gt 0
		do
			_key="$1"
			# Alternatives in the case patterns are separated by a plain "|";
			# an escaped "\|" would only match a literal pipe character.
			case "$_key" in
				-s|--sections)
					if test $# -lt 2
					then
						die "Missing value for the optional argument '$_key'." 1
					fi
					_arg_sections="$2"
					# Drop the option here; its value is dropped by the
					# shift at the bottom of the loop.
					shift
					;;
				--sections=*)
					_arg_sections="${_key##--sections=}"
					;;
				-s*)
					# Value glued to the short option, e.g. "-s5".
					_arg_sections="${_key##-s}"
					;;
				-c|--no-count|--count)
					if [[ "$_key" == --no-* ]]
					then
						_arg_count="off"
					else
						_arg_count="on"
					fi
					;;
				-c*)
					# Clustered short options such as "-ci": re-queue them
					# as "-c" "-i"; the shift at the bottom of the loop then
					# consumes "-c" and the next iteration handles "-i".
					# [[ ]] is used instead of "test ... -a ..." because the
					# latter mis-parses operands such as "(" or "!".
					_arg_count="on"
					_next="${_key##-c}"
					if [[ -n "$_next" && "$_next" != "$_key" ]]
					then
						begins_with_short_option "$_next" || _reject_short_option_cluster "$_key"
						shift
						set -- "-c" "-${_next}" "$@"
					fi
					;;
				-i|--no-ignore-case|--ignore-case)
					if [[ "$_key" == --no-* ]]
					then
						_arg_ignore_case="off"
					else
						_arg_ignore_case="on"
					fi
					;;
				-i*)
					_arg_ignore_case="on"
					_next="${_key##-i}"
					if [[ -n "$_next" && "$_next" != "$_key" ]]
					then
						begins_with_short_option "$_next" || _reject_short_option_cluster "$_key"
						shift
						set -- "-i" "-${_next}" "$@"
					fi
					;;
				-p|--no-percentage|--percentage)
					if [[ "$_key" == --no-* ]]
					then
						_arg_percentage="off"
					else
						_arg_percentage="on"
					fi
					;;
				-p*)
					_arg_percentage="on"
					_next="${_key##-p}"
					if [[ -n "$_next" && "$_next" != "$_key" ]]
					then
						begins_with_short_option "$_next" || _reject_short_option_cluster "$_key"
						shift
						set -- "-p" "-${_next}" "$@"
					fi
					;;
				-w|--no-word-regexp|--word-regexp)
					if [[ "$_key" == --no-* ]]
					then
						_arg_word_regexp="off"
					else
						_arg_word_regexp="on"
					fi
					;;
				-w*)
					_arg_word_regexp="on"
					_next="${_key##-w}"
					if [[ -n "$_next" && "$_next" != "$_key" ]]
					then
						begins_with_short_option "$_next" || _reject_short_option_cluster "$_key"
						shift
						set -- "-w" "-${_next}" "$@"
					fi
					;;
				--help|-h*)
					# "-h" alone or followed by anything prints help and stops.
					print_help
					exit 0
					;;
				*)
					_last_positional="$1"
					_positionals+=("$_last_positional")
					_positionals_count=$((_positionals_count + 1))
					;;
			esac
			shift
		done
	}


	# Verify that exactly two positional arguments (pattern and file) were given.
	# Globals: _positionals_count (read), _last_positional (read)
	# Exits:   through die with status 1, with _PRINT_HELP=yes set for the call,
	#          when there are fewer or more than two positionals.
	handle_passed_args_count()
	{
		local _required_args_string="'pattern' and 'file'"
		# Plain if-statements replace the escaped "\|\|" lists, which handed
		# the whole die call to test as extra arguments so that die never ran.
		if ! test "${_positionals_count}" -ge 2
		then
			_PRINT_HELP=yes die "FATAL ERROR: Not enough positional arguments - we require exactly 2 (namely: $_required_args_string), but got only ${_positionals_count}." 1
		fi
		if ! test "${_positionals_count}" -le 2
		then
			_PRINT_HELP=yes die "FATAL ERROR: There were spurious positional arguments --- we expect exactly 2 (namely: $_required_args_string), but got ${_positionals_count} (the last one was: '${_last_positional}')." 1
		fi
	}


	# Copy positional values into their named variables.
	# Arguments: $1   - how many leading arguments (this one included) to drop
	#                   before assigning
	#            rest - the positional values, in order
	# Globals written: _positional_names, _arg_pattern, _arg_file
	# A variable whose value is missing is left untouched.
	assign_positional_args()
	{
		local _positional_name _shift_for=$1
		_positional_names="_arg_pattern _arg_file "

		shift "$_shift_for"
		# Intentionally unquoted: the space-separated list is split into names.
		for _positional_name in ${_positional_names}
		do
			# Stop as soon as the values run out, so the remaining variables
			# keep whatever they held before.
			if test $# -eq 0
			then
				break
			fi
			# printf -v stores the value verbatim in the variable named by
			# $_positional_name, without going through eval.
			printf -v "$_positional_name" '%s' "${1}" || die "Error during argument parsing, possibly an Argbash bug." 1
			shift
		done
	}

	# Run the argument parser. The three steps depend on each other and must stay
	# in this order.
	# 1. Parse options and collect the positional words into _positionals.
	parse_commandline "$@"
	# 2. Abort unless exactly the two required positionals were supplied.
	handle_passed_args_count
	# 3. Copy the positionals into _arg_pattern and _arg_file; the leading "1" is
	#    the shift count that assign_positional_args drops before assigning.
	assign_positional_args 1 "${_positionals[@]}"

	# OTHER STUFF GENERATED BY Argbash

	### END OF CODE GENERATED BY Argbash (sortof) ### ])
	# [ <-- needed because of Argbash

	# ---------------------------------------------------------------------------
	# Main logic.
	# Inputs (set by the argument parser): _arg_pattern, _arg_file, _arg_sections,
	# _arg_count, _arg_percentage, _arg_ignore_case, _arg_word_regexp.
	# Output: "Total: N", then one line per section unless the file has fewer
	# lines than sections or nothing matched.
	# Pipelines use a plain "|"; an escaped "\|" would be passed to the first
	# command as a literal argument instead of connecting the commands.
	# ---------------------------------------------------------------------------

	# The section count must be a positive integer; anything else would make the
	# arithmetic below fail with confusing errors.
	case "$_arg_sections" in
	  '' | *[!0-9]*)
	    die "grepdist: invalid number of sections: '$_arg_sections'" 1
	    ;;
	esac
	# Force base 10 so that a value such as "08" is not read as octal.
	_arg_sections=$((10#$_arg_sections))
	if [ "$_arg_sections" -eq 0 ]; then
	  die "grepdist: the number of sections must be greater than zero" 1
	fi

	# get file with cat (so that we can handle stdin too, passed as "-").
	# The trailing "x" sentinel stops command substitution from stripping
	# trailing blank lines, which would shrink the line total.
	file=$(cat -- "$_arg_file" && printf x) || die "grepdist: cannot read '$_arg_file'" 1
	file=${file%x}

	# total number of lines in the file (an unterminated last line still counts,
	# an empty file has 0 lines)
	total=$(printf '%s' "$file" | awk 'END { print NR }')

	section_length=$((total / _arg_sections))

	# pass grep flags -i and -w if needed; an array keeps every flag a separate
	# word without relying on word splitting
	grep_flags=(-n)
	if [ "$_arg_ignore_case" = "on" ]; then
	  grep_flags+=(-i)
	fi
	if [ "$_arg_word_regexp" = "on" ]; then
	  grep_flags+=(-w)
	fi

	# run grep; -e protects patterns that start with "-". Exit status 1 only
	# means "no match"; anything above that is a real error (e.g. a bad regex).
	found=$(printf '%s' "$file" | grep "${grep_flags[@]}" -e "$_arg_pattern")
	grep_status=$?
	if [ "$grep_status" -gt 1 ]; then
	  die "grepdist: grep failed for pattern '$_arg_pattern'" "$grep_status"
	fi

	# keep only the line numbers of the matches
	found=$(printf '%s\n' "$found" | cut -f1 -d:)

	# Count the matches without a trailing newline so that an empty result is
	# 0 matches rather than 1.
	total_matches=$(printf '%s' "$found" | awk 'END { print NR }')

	# print the total
	printf 'Total: %s\n' "$total_matches"

	# Nothing more to report when the section length is 0 (file too short) or
	# when there are no matches.
	if [ "$section_length" -eq 0 ] || [ "$total_matches" -eq 0 ]; then
	  exit 0
	fi

	# What follows each section header: percentage only (-p, which wins over -c),
	# count only (-c), or both.
	if [ "$_arg_percentage" = "on" ]; then
	  mode="percentage"
	elif [ "$_arg_count" = "on" ]; then
	  mode="count"
	else
	  mode="both"
	fi

	# Tally and print all sections in a single awk pass over the match list.
	printf '%s\n' "$found" | awk \
	  -v sections="$_arg_sections" \
	  -v section_length="$section_length" \
	  -v total="$total" \
	  -v total_matches="$total_matches" \
	  -v mode="$mode" '
	  # Assign each matching line number to its section. Lines past the last
	  # full section (the remainder of the division) belong to the last section.
	  {
	    section = int(($1 - 1) / section_length) + 1
	    if (section > sections) section = sections
	    matches[section]++
	  }
	  END {
	    # Line numbers are padded to the width of the line total, section
	    # numbers to the width of the section count, and match counts to the
	    # width of the match total.
	    header_format = "Section %" length(sections "") "d (Lines %" length(total "") "d to %" length(total "") "d):"
	    count_format = "%" length(total_matches "") "d"
	    for (section = 1; section <= sections; section++) {
	      first = section_length * (section - 1) + 1
	      last = (section == sections) ? total : section_length * section
	      header = sprintf(header_format, section, first, last)
	      count = sprintf(count_format, matches[section])
	      percentage = sprintf("(%.1f%%)", matches[section] / total_matches * 100)
	      if (mode == "percentage")
	        print header "\t" percentage
	      else if (mode == "count")
	        print header "\t" count
	      else
	        print header "\t" count "\t" percentage
	    }
	  }'

	# ] <-- needed because of Argbash
No results found