Last active
August 22, 2018 09:21
-
-
Save twolodzko/068b6514e450222e356f4ae157cc7eaa to your computer and use it in GitHub Desktop.
Handy command line data cleaning functions in bash / sed / awk
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Many of the functions are adapted from: | |
| # https://www.polydesmida.info/cookbook/functions.html | |
| nohead () | |
| { | |
| if [[ $# -eq 0 || "$1" == "-h" ]] ; then | |
| echo "Useage: nohead file [n=1]" | |
| echo "Omit header (n rows) from a file." | |
| return | |
| fi | |
| inpfile=$1 | |
| if [ -z "$2" ]; then | |
| nrows=2 | |
| else | |
| nrows=$(($2+1)) | |
| fi | |
| tail -n +"$nrows" "$inpfile" | |
| unset inpfile | |
| unset nrows | |
| } | |
| broken () | |
| { | |
| if [[ $# -eq 0 || "$1" == "-h" ]] ; then | |
| echo "Useage: broken file [separator=',']" | |
| echo "Count the rows groupped by the number of fields, to discover problems with separators." | |
| return | |
| fi | |
| inpfile=$1 | |
| if [ -z "$2" ]; then | |
| sep=',' | |
| else | |
| sep=$2 | |
| fi | |
| awk -F"$sep" '{print NF}' "$inpfile" | sort -n | uniq -c | |
| unset inpfile | |
| unset sep | |
| } | |
| cols () | |
| { | |
| if [[ $# -eq 0 || "$1" == "-h" ]] ; then | |
| echo "Useage: cols file [separator=',']" | |
| echo "Print the header of the file." | |
| return | |
| fi | |
| inpfile=$1 | |
| if [ -z "$2" ]; then | |
| sep=',' | |
| else | |
| sep=$2 | |
| fi | |
| head -n 1 "$inpfile" | tr "$sep" '\n' | nl -w1 | pr -t -2 | |
| unset inpfile | |
| unset sep | |
| } | |
| search () | |
| { | |
| if [[ $# -eq 0 || "$1" == "-h" ]] ; then | |
| echo "Useage: search file query [separator=',']" | |
| echo "Print rows containing the searched phrase." | |
| return | |
| fi | |
| inpfile=$1 | |
| phrase=$2 | |
| if [ -z "$3" ]; then | |
| sep=',' | |
| else | |
| sep=$3 | |
| fi | |
| awk -F"$sep" -v tgt="$phrase" '{for (i=1;i<=NF;i++) if ($i ~ tgt) print NR" f"i": "$i}' "$inpfile" | |
| unset inpfile | |
| unset phrase | |
| unset sep | |
| } | |
| getrow () | |
| { | |
| if [[ $# -eq 0 || "$1" == "-h" ]] ; then | |
| echo "Useage: getrow file row [separator=',']" | |
| echo "Print the record at the given row." | |
| return | |
| fi | |
| inpfile=$1 | |
| row=$2 | |
| if [ -z "$3" ]; then | |
| sep=',' | |
| else | |
| sep=$3 | |
| fi | |
| paste <(head -n1 "$inpfile" | tr "$sep" '\n') <(sed -n "${row}p" "$inpfile"| tr '\t' '\n') | nl | |
| unset inpfile | |
| unset row | |
| unset sep | |
| } | |
| getcols () | |
| { | |
| if [[ $# -eq 0 || "$1" == "-h" ]] ; then | |
| echo "Useage: getcols file cols [separator=',']" | |
| echo "Extract the record at the given row." | |
| return | |
| fi | |
| inpfile=$1 | |
| cols=$2 | |
| if [ -z "$3" ]; then | |
| sep=',' | |
| else | |
| sep=$3 | |
| fi | |
| awk -F"$sep" -vd="$cols" -vs="$sep" 'BEGIN{split(d, a, ",")} {for (i in a) printf "%s%s", $(a[i]), s; printf "\n"}' "$inpfile" | |
| unset inpfile | |
| unset cols | |
| unset sep | |
| } | |
| alphanum () | |
| { | |
| if [[ $# -eq 0 || "$1" == "-h" ]] ; then | |
| echo "Useage: alphanum file [ignore]" | |
| echo "Leave only the alphanumeric characters and user-specified characters." | |
| return | |
| fi | |
| inpfile=$1 | |
| if [ -z "$2" ]; then | |
| ignore='' | |
| else | |
| ignore=$2 | |
| fi | |
| awk -vs="$ignore" '{ gsub("[^A-Za-z0-9 "s"]", " "); gsub(/ +/, " "); gsub(/^ +| +$/, "") } 1' "$inpfile" | |
| unset inpfile | |
| unset ignore | |
| } | |
| uniqvals () | |
| { | |
| if [[ $# -eq 0 || "$1" == "-h" ]] ; then | |
| echo "Useage: uniqvals file column [separator=',']" | |
| echo "List and count the unique values in given column." | |
| return | |
| fi | |
| inpfile=$1 | |
| column=$2 | |
| if [ -z "$3" ]; then | |
| sep=',' | |
| else | |
| sep=$3 | |
| fi | |
| if [ "$sep" == '\t' ]; then | |
| tail -n +2 "$inpfile" | cut -f"$column" | sort | uniq -c | sed "s/^[ ]*//;s/ /\t/" | |
| else | |
| tail -n +2 "$inpfile" | cut -d"$sep" -f"$column" | sort | uniq -c | sed "s/^[ ]*//;s/ /\t/" | |
| fi | |
| unset inpfile | |
| unset column | |
| unset sep | |
| } | |
| # convert CSV to TSV | |
| csv2tsv () | |
| { | |
| if [[ $# -eq 0 || "$1" == "-h" ]] ; then | |
| echo "Useage: csv2tsv input_file [output_file]" | |
| echo "Convert CSV to TSV (tab separated)." | |
| return | |
| fi | |
| inpfile=$1 | |
| regexp='s/("([^"]*)")?,/\2\t/g' | |
| if [ -z "$2" ]; then | |
| sed -E "$regexp" "$inpfile" | |
| else | |
| sed -E "$regexp" "$inpfile" > "$2" | |
| fi | |
| unset inpfile | |
| unset regexp | |
| } | |
| # convert CSV to DSV | |
| csv2dsv () | |
| { | |
| if [[ $# -eq 0 || "$1" == "-h" ]] ; then | |
| echo "Useage: csv2dsv input_file [output_file]" | |
| echo "Convert CSV to DSV (vertical bar separated)." | |
| return | |
| fi | |
| inpfile=$1 | |
| regexp='s/("([^"]*)")?,/\2|/g' | |
| if [ -z "$2" ]; then | |
| sed -E "$regexp" "$inpfile" | |
| else | |
| sed -E "$regexp" "$inpfile" "$2" | |
| fi | |
| unset inpfile | |
| unset regexp | |
| } | |
| # convert CSV to TSV | |
| # https://stackoverflow.com/a/22421445/3986320 | |
| tsv2csv () | |
| { | |
| if [[ $# -eq 0 || "$1" == "-h" ]] ; then | |
| echo "Useage: tsv2csv input_file [output_file]" | |
| echo "Convert TSV to CSV." | |
| return | |
| fi | |
| inpfile=$1 | |
| if [ -z "$2" ]; then | |
| awk 'BEGIN { FS="\t"; OFS="," } { | |
| rebuilt=0 | |
| for(i=1; i<=NF; ++i) { | |
| if ($i ~ /,/ && $i !~ /^".*"$/) { $i = "\"" $i "\""; rebuilt=1 } | |
| } | |
| if (!rebuilt) { $1=$1 } | |
| }' "$inpfile" | |
| else | |
| awk 'BEGIN { FS="\t"; OFS="," } { | |
| rebuilt=0 | |
| for(i=1; i<=NF; ++i) { | |
| if ($i ~ /,/ && $i !~ /^".*"$/) { $i = "\"" $i "\""; rebuilt=1 } | |
| } | |
| if (!rebuilt) { $1=$1 } | |
| }' "$inpfile" > "$2" | |
| fi | |
| unset inpfile | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment