a collection of cool bash scripts
# cool bash codes
# search a directory for all lines that match a pattern (not perfect but useful) ------
## e.g. grep searches for all lines matching "::" in `R/` to determine package calls
## -h hides the file names; -i ignores case
## sed -E uses extended regular expressions to match groups and keep only the `pkg::` part;
## we then sort and use -u to keep unique matches
grep -hi :: -R R/* | sed -E 's/(.*)([ ]+[a-z]+::)(.*)/\2/g' | sort -u
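## a hedged alternative sketch: -o prints only the matching text, so no sed is needed;
## assumes a grep with -o/-E support (GNU and BSD grep both have them)
grep -rhioE '[a-zA-Z0-9._]+::' R/ | sort -u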
# COUNT COLUMNS -----------------
## for every file, print the file name and how many tab-delimited ($'\t') columns
## its first line has, then sort
find . -type f -exec awk -F $'\t' -v d={} '{print d, NF; exit}' {} \; | sort
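## a minimal sketch of the same idea using a shell loop instead of find -exec
## (the *.tsv glob is a hypothetical file set)
for f in *.tsv; do printf '%s\t' "$f"; head -n 1 "$f" | awk -F'\t' '{print NF}'; done | sort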
# find and kill active ssh connections --------------
lsof -i -n | grep ssh | awk '!seen[$2]++' | awk '{print $2}' | while read -r line; do kill "$line"; done
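## a shorter hedged variant: match, dedupe, and print the PID in one awk, then hand the PIDs to kill
## via xargs (-r, skip if empty, is a GNU xargs flag)
lsof -i -n | awk '/ssh/ && !seen[$2]++ {print $2}' | xargs -r kill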
# use awk to parse columns because `column` does poorly with empty space. -------------
## OFS = output field separator (we add a space, ', ');
## $1=$1 is a self-assignment that forces awk to rebuild the record with OFS, then we print the whole line $0
## to understand $1=$1, see point 27 of
## http://www.catonmat.net/blog/awk-one-liners-explained-part-two/
head -n 1000 some_file.txt | awk -F ',' '{OFS=", ";$1=$1; print $0}' | column -s $',' -t | less
## CONVERT CSV to PSV without removing commas between double quotes ----------------
# the first awk line delimits on quotes. assuming equally paired double quotes, it takes
# every other split and replaces commas with pipes
# the second sed chunk finds any pipe (|) and adds a space after it (| )
# this is because `column` incorrectly parses empty fields in a csv file i.e. || fails,
# but | | (with a space) does not. the trailing `1` is shorthand for the default block `{print $0}`
# the rest is standard, but we are now delimiting on '|' (pipes) not ','
awk -F'"' -v OFS='"' '{ for (i=1; i<=NF; i+=2) gsub(",", "|", $i) } 1' some_file_with_double_quotes.csv |
sed 's/|/| /g' |
column -s'|' -t |
less
# DETERMINE IF TEXT EXISTS IN FIRST 5 LINES, PRINT IF NOT ----------------------
# create an empty file. loop through psv files. if we do NOT find the word "value" in the
# first 5 lines, then echo the file name to `list_files.txt`
touch list_files.txt; for i in *psv; do if ! head -n 5 "$i" |
grep -q "value"; then echo "$i" >> list_files.txt; fi; done
# remove BOM at the beginning of a file ------------------
awk '{ gsub(/\xef\xbb\xbf/,""); print }' file_name.csv
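## a hedged in-place alternative, assuming GNU sed (-i and \x escapes are GNU extensions);
## only touches line 1, where a UTF-8 BOM would live
sed -i '1s/^\xEF\xBB\xBF//' file_name.csv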
# for a column of data, count occurrences of each unique value and sort ------------
awk -F'|' '{print $4}' some_file.psv | sort | uniq -c | sort -n | less
# COUNT BY DELIM; PAD WITH DELIM IF LESS THAN EXPECTED --------------
## count fields by the -F separator. if less than some value (e.g. 20), then pad the line with the
## missing field separators so it reaches the expected count.
## else print the line as-is
## source: https://stackoverflow.com/questions/37295695/how-to-use-printf-to-print-a-character-multiple-times
## understanding `%*s` found here:
## https://www.gnu.org/software/gawk/manual/html_node/Format-Modifiers.html#Format-Modifiers
## the magic is in the first part:
## {s=sprintf("%*s",20-1-NF,""); gsub(/ /," |",s); print NF "\t" s FS $0}
## `%*s` takes its field width from the next argument (20-1-NF) and pads the empty string ""
## to that width, i.e. it produces 20-1-NF spaces (none if 20-1-NF is 0).
## the `gsub` replaces each space with a separator (" |" in this case).
## then you can print the result as `s`.
cat some_file.psv |
awk -F'|' '{
  if(NF < 20) {
    s=sprintf("%*s",20-1-NF,"");
    gsub(/ /," |",s); print NF "\t" s FS $0
  } else {
    print NF "\t" $0
  }}'
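## a hedged variant of the same awk that only pads (no NF prefix), writing a repaired copy;
## the expected field count (20) and the output file name are assumptions
awk -F'|' '{ if (NF < 20) { s=sprintf("%*s",20-1-NF,""); gsub(/ /," |",s); print s FS $0 } else print }' \
  some_file.psv > some_file_padded.psv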
# FIND FILES THAT ARE SMALL AND REMOVE THEM -------------
## this avoids the "argument list too long" error you get using `ls` directly
## looks at the 5th column of `ls -l`, which has the file size in bytes.
## if below 100 bytes, print the path to the file
## can then delete or do something to said files, like `xargs rm -f`
find -L ./data-raw/graphite -type f | xargs ls -l |
awk '$5 < 100 {print $9}' | xargs rm -f
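## a hedged alternative that skips ls/awk entirely, assuming a find with -size and -print0
## (GNU and BSD find both support them); -100c means "smaller than 100 bytes"
find -L ./data-raw/graphite -type f -size -100c -print0 | xargs -0 rm -f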
# COUNT DELIMITERS PER LINE -----------------------
## useful for seeing if you have delimiter errors e.g. if you expect only three fields
## you'd only see two pipes (||) per line.
## if you see more or fewer, something is wrong
## here, the delimiter is paired with '\n' e.g. pipe delim = '|\n'
## tr -d means to delete; -c is the complement of a list of characters
## so we find and delete all characters (-d) EXCEPT
## pipes '|' and newlines '\n' (-c)
## we use `sort | uniq -c` to help order and count instances
cat host-status-apps.psv | tr -d -c '|\n' | sort | uniq -c
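## a hedged awk equivalent: NF-1 is the number of '|' separators on each line
awk -F'|' '{print NF-1}' host-status-apps.psv | sort -n | uniq -c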
# split a file into chunks (here 250M) e.g. --------------------
< big-file.csv parallel --pipe --block 250M 'cat > raw/chunks/big-file-chunk-{#}'
# same thing but faster
## the negative 4 (-4) is how many blocks each job slot should have
## resulting in e.g. 4*8 = 32 jobs to process
parallel --skip-first-line --pipepart --block -4 -j8 -a some-big-file.csv \
'tee raw/chunks/some-big-file-chunk-{#} > /dev/null'
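## a quick hedged sanity check after splitting: the final "total" line from wc -l should roughly
## match the source file's line count (minus the skipped header)
wc -l raw/chunks/some-big-file-chunk-* | tail -n 1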
# ITERATE URLS, SAVE CONTENT, APPENDING NUMS ------------------
## iterate through a file of urls, tmp.txt, then curl each and save as a set of files numbered by line
## should be in `bash` so we can export the function and use it in parallel
function curl_iter () { curl -s -g "$2" > "tbl${1}.csv"; }
export -f curl_iter
cat -n tmp.txt | grep http | tr '\t' ',' | parallel --colsep="," curl_iter
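## a hypothetical way to build tmp.txt, one URL per line (example.com is a placeholder);
## each URL then gets saved to tbl<line number>.csv by the pipeline above
printf '%s\n' 'https://example.com/a.csv' 'https://example.com/b.csv' > tmp.txt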
# FLATTEN SIMPLE JSON OBJECTS (NO NESTING) TO CSV ---------------------
## SOURCE: https://stackoverflow.com/a/32965227/3987905
curl 'https://jsonplaceholder.typicode.com/posts' |
jq -r '(. | map(keys) | add | unique) as $cols |
map(. as $row | $cols | map($row[.])) as $rows | $cols, $rows[] | @csv'
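## a hedged inline test of the same filter on a tiny hypothetical array:
## expected output is a header row ("id","title") followed by one CSV row per object
echo '[{"id":1,"title":"a"},{"id":2,"title":"b"}]' |
jq -r '(. | map(keys) | add | unique) as $cols |
map(. as $row | $cols | map($row[.])) as $rows | $cols, $rows[] | @csv'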
# BASIC CURL WITH STATUS CODE CHECK, LOG ---------------
## curl a URL, print status code and input (assumes a single input value per call)
curl_line() {
  line=$1
  URL="https://example.com/query?somevalue=${line}&format=JSON"
  # echo $URL
  req=$(curl -i -s "$URL")
  # grab the status code from the first response line; $'...' turns \r into a literal carriage return
  http_status=$(echo "$req" | sed -n $'1,/^\r$/p' | awk 'NR==1{print $2}')
  echo $http_status $line | tee -a log-curl-line.txt
  if [ "$http_status" == "200" ]
  then
    echo "$req" | # quote $req so the raw, interpolated text of the variable is echo'd with its newlines
    # remove the http header by excluding lines with carriage returns (hidden ^M);
    grep -v $'\r' |
    tee -a curl-line.json > /dev/null # append and suppress stdout
  fi
}
export -f curl_line # export function (only works in bash)
# need to remove the old files so we can start fresh and append (tee -a)
rm -f curl-line.json log-curl-line.txt
cat some-file-single-column-line.psv |
parallel -j1 curl_line
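## hedged usage sketch without an input file, feeding two hypothetical values straight to parallel
printf '%s\n' 123 456 | parallel -j1 curl_line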
# QUICKLY CONVERT any .md to .html (built to be added to ~/.zshrc) --------
function md2html () {
  /usr/local/bin/pandoc --standalone \
    --template=https://raw.githubusercontent.com/tajmone/pandoc-goodies/master/templates/html5/github/GitHub.html5 \
    --highlight-style=pygments \
    --css=https://bootswatch.com/3/lumen/bootstrap.min.css \
    --metadata pagetitle="$1" "$1" -o "${1%.*}.html"
}
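## hedged usage example: converts README.md (a placeholder name) to README.html in the same directory
md2html README.md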