a collection of cool bash scripts
# cool bash codes
# search a directory for all lines that match a pattern (not perfect but useful) ------
## e.g. grep searches for all lines matching "::" in `R/` to determine package calls
## -h hides the file names; -i ignores case
## sed -E uses extended regular expressions to match groups and keep only the `pkg::` part;
## we then sort and use -u to keep unique matches
grep -hi :: -R R/* | sed -E 's/(.*)([ ]+[a-z]+::)(.*)/\2/g' | sort -u
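## a hedged alternative sketch: -o prints only the matching text, so no sed is needed;
## assumes a grep with -o/-E support (GNU and BSD grep both have them)
grep -rhioE '[a-zA-Z0-9._]+::' R/ | sort -u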
# COUNT COLUMNS -----------------
## for every file, print the file name and how many tab-delimited ($'\t') columns
## its first line has, then sort
find . -type f -exec awk -F $'\t' -v d={} '{print d, NF; exit}' {} \; | sort
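## a minimal sketch of the same idea using a shell loop instead of find -exec
## (the *.tsv glob is a hypothetical file set)
for f in *.tsv; do printf '%s\t' "$f"; head -n 1 "$f" | awk -F'\t' '{print NF}'; done | sort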
# find and kill active ssh connections --------------
lsof -i -n | grep ssh | awk '!seen[$2]++' | awk '{print $2}' | while read -r line; do kill "$line"; done
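## a shorter hedged variant: match, dedupe, and print the PID in one awk, then hand the PIDs to kill
## via xargs (-r, skip if empty, is a GNU xargs flag)
lsof -i -n | awk '/ssh/ && !seen[$2]++ {print $2}' | xargs -r kill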
# use awk to parse columns because `column` does poorly with empty space. -------------
## OFS = output field separator (we add a space, ', ');
## $1=$1 is a self-assignment that forces awk to rebuild the record with OFS, then we print the whole line $0
## to understand $1=$1, see point 27 of
## http://www.catonmat.net/blog/awk-one-liners-explained-part-two/
head -n 1000 some_file.txt | awk -F ',' '{OFS=", ";$1=$1; print $0}' | column -s $',' -t | less
## CONVERT CSV to PSV without removing commas between double quotes ----------------
# the first awk line delimits on quotes. assuming equally paired double quotes, it takes
# every other split and replaces commas with pipes
# the second sed chunk finds any pipe (|) and adds a space after it (| )
# this is because `column` incorrectly parses empty fields in a csv file i.e. || fails,
# but | | (with a space) does not. the trailing `1` is shorthand for the default block `{print $0}`
# the rest is standard, but we are now delimiting on '|' (pipes) not ','
awk -F'"' -v OFS='"' '{ for (i=1; i<=NF; i+=2) gsub(",", "|", $i) } 1' some_file_with_double_quotes.csv |
sed 's/|/| /g' |
column -s'|' -t |
less
# DETERMINE IF TEXT EXISTS IN FIRST 5 LINES, PRINT IF NOT ----------------------
# create an empty file. loop through psv files. if we do NOT find the word "value" in the
# first 5 lines, then echo the file name to `list_files.txt`
touch list_files.txt; for i in *psv; do if ! head -n 5 "$i" |
grep -q "value"; then echo "$i" >> list_files.txt; fi; done
# remove BOM at the beginning of a file ------------------
awk '{ gsub(/\xef\xbb\xbf/,""); print }' file_name.csv
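## a hedged in-place alternative, assuming GNU sed (-i and \x escapes are GNU extensions);
## only touches line 1, where a UTF-8 BOM would live
sed -i '1s/^\xEF\xBB\xBF//' file_name.csv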
# for a column of data, count occurrences of each unique value and sort ------------
awk -F'|' '{print $4}' some_file.psv | sort | uniq -c | sort -n | less
# COUNT BY DELIM; PAD WITH DELIM IF LESS THAN EXPECTED --------------
## count fields by the -F separator. if less than some value (e.g. 20), then pad the line with the
## missing field separators so it reaches the expected count.
## else print the line as-is
## source: https://stackoverflow.com/questions/37295695/how-to-use-printf-to-print-a-character-multiple-times
## understanding `%*s` found here:
## https://www.gnu.org/software/gawk/manual/html_node/Format-Modifiers.html#Format-Modifiers
## the magic is in the first part:
## {s=sprintf("%*s",20-1-NF,""); gsub(/ /," |",s); print NF "\t" s FS $0}
## `%*s` takes its field width from the next argument (20-1-NF) and pads the empty string ""
## to that width, i.e. it produces 20-1-NF spaces (none if 20-1-NF is 0).
## the `gsub` replaces each space with a separator (" |" in this case).
## then you can print the result as `s`.
cat some_file.psv |
awk -F'|' '{
  if(NF < 20) {
    s=sprintf("%*s",20-1-NF,"");
    gsub(/ /," |",s); print NF "\t" s FS $0
  } else {
    print NF "\t" $0
  }}'
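## a hedged variant of the same awk that only pads (no NF prefix), writing a repaired copy;
## the expected field count (20) and the output file name are assumptions
awk -F'|' '{ if (NF < 20) { s=sprintf("%*s",20-1-NF,""); gsub(/ /," |",s); print s FS $0 } else print }' \
  some_file.psv > some_file_padded.psv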
# FIND FILES THAT ARE SMALL AND REMOVE THEM -------------
## this avoids the "argument list too long" error you get using `ls` directly
## looks at the 5th column of `ls -l`, which has the file size in bytes.
## if below 100 bytes, print the path to the file
## can then delete or do something to said files, like `xargs rm -f`
find -L ./data-raw/graphite -type f | xargs ls -l |
awk '$5 < 100 {print $9}' | xargs rm -f
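## a hedged alternative that skips ls/awk entirely, assuming a find with -size and -print0
## (GNU and BSD find both support them); -100c means "smaller than 100 bytes"
find -L ./data-raw/graphite -type f -size -100c -print0 | xargs -0 rm -f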
# COUNT DELIMITERS PER LINE -----------------------
## useful for seeing if you have delimiter errors e.g. if you expect only three fields
## you'd only see two pipes (||) per line.
## if you see more or fewer, something is wrong
## here, the delimiter is paired with '\n' e.g. pipe delim = '|\n'
## tr -d means to delete; -c is the complement of a list of characters
## so we find and delete all characters (-d) EXCEPT
## pipes '|' and newlines '\n' (-c)
## we use `sort | uniq -c` to help order and count instances
cat host-status-apps.psv | tr -d -c '|\n' | sort | uniq -c
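## a hedged awk equivalent: NF-1 is the number of '|' separators on each line
awk -F'|' '{print NF-1}' host-status-apps.psv | sort -n | uniq -c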
# split a file into chunks (here 250M) e.g. --------------------
< big-file.csv parallel --pipe --block 250M 'cat > raw/chunks/big-file-chunk-{#}'
# same thing but faster
## the negative 4 (-4) is how many blocks each job slot should have
## resulting in e.g. 4*8 = 32 jobs to process
parallel --skip-first-line --pipepart --block -4 -j8 -a some-big-file.csv \
'tee raw/chunks/some-big-file-chunk-{#} > /dev/null'
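## a quick hedged sanity check after splitting: the final "total" line from wc -l should roughly
## match the source file's line count (minus the skipped header)
wc -l raw/chunks/some-big-file-chunk-* | tail -n 1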
# ITERATE URLS, SAVE CONTENT, APPENDING NUMS ------------------
## iterate through a file of urls, tmp.txt, then curl each and save as a set of files numbered by line
## should be in `bash` so we can export the function and use it in parallel
function curl_iter () { curl -s -g "$2" > "tbl${1}.csv"; }
export -f curl_iter
cat -n tmp.txt | grep http | tr '\t' ',' | parallel --colsep="," curl_iter
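## a hypothetical way to build tmp.txt, one URL per line (example.com is a placeholder);
## each URL then gets saved to tbl<line number>.csv by the pipeline above
printf '%s\n' 'https://example.com/a.csv' 'https://example.com/b.csv' > tmp.txt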
# FLATTEN SIMPLE JSON OBJECTS (NO NESTING) TO CSV ---------------------
## SOURCE: https://stackoverflow.com/a/32965227/3987905
curl 'https://jsonplaceholder.typicode.com/posts' |
jq -r '(. | map(keys) | add | unique) as $cols |
map(. as $row | $cols | map($row[.])) as $rows | $cols, $rows[] | @csv'
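## a hedged inline test of the same filter on a tiny hypothetical array:
## expected output is a header row ("id","title") followed by one CSV row per object
echo '[{"id":1,"title":"a"},{"id":2,"title":"b"}]' |
jq -r '(. | map(keys) | add | unique) as $cols |
map(. as $row | $cols | map($row[.])) as $rows | $cols, $rows[] | @csv'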
# BASIC CURL WITH STATUS CODE CHECK, LOG ---------------
## curl a URL, print status code and input (assumes a single input value per call)
curl_line() {
  line=$1
  URL="https://example.com/query?somevalue=${line}&format=JSON"
  # echo $URL
  req=$(curl -i -s "$URL")
  # grab the status code from the first response line; $'...' turns \r into a literal carriage return
  http_status=$(echo "$req" | sed -n $'1,/^\r$/p' | awk 'NR==1{print $2}')
  echo $http_status $line | tee -a log-curl-line.txt
  if [ "$http_status" == "200" ]
  then
    echo "$req" | # quote $req so the raw, interpolated text of the variable is echo'd with its newlines
    # remove the http header by excluding lines with carriage returns (hidden ^M);
    grep -v $'\r' |
    tee -a curl-line.json > /dev/null # append and suppress stdout
  fi
}
export -f curl_line # export function (only works in bash)
# need to remove the old files so we can start fresh and append (tee -a)
rm -f curl-line.json log-curl-line.txt
cat some-file-single-column-line.psv |
parallel -j1 curl_line
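## hedged usage sketch without an input file, feeding two hypothetical values straight to parallel
printf '%s\n' 123 456 | parallel -j1 curl_line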
# QUICKLY CONVERT any .md to .html (built to be added to ~/.zshrc) --------
function md2html () {
  /usr/local/bin/pandoc --standalone \
    --template=https://raw.githubusercontent.com/tajmone/pandoc-goodies/master/templates/html5/github/GitHub.html5 \
    --highlight-style=pygments \
    --css=https://bootswatch.com/3/lumen/bootstrap.min.css \
    --metadata pagetitle="$1" "$1" -o "${1%.*}.html"
}
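## hedged usage example: converts README.md (a placeholder name) to README.html in the same directory
md2html README.md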