Skip to content

Instantly share code, notes, and snippets.

@twolodzko
Last active August 22, 2018 09:21
Show Gist options
  • Select an option

  • Save twolodzko/068b6514e450222e356f4ae157cc7eaa to your computer and use it in GitHub Desktop.

Select an option

Save twolodzko/068b6514e450222e356f4ae157cc7eaa to your computer and use it in GitHub Desktop.
Handy command line data cleaning functions in bash / sed / awk
# Many of the functions are adapted from:
# https://www.polydesmida.info/cookbook/functions.html
nohead ()
{
if [[ $# -eq 0 || "$1" == "-h" ]] ; then
echo "Useage: nohead file [n=1]"
echo "Omit header (n rows) from a file."
return
fi
inpfile=$1
if [ -z "$2" ]; then
nrows=2
else
nrows=$(($2+1))
fi
tail -n +"$nrows" "$inpfile"
unset inpfile
unset nrows
}
broken ()
{
if [[ $# -eq 0 || "$1" == "-h" ]] ; then
echo "Useage: broken file [separator=',']"
echo "Count the rows groupped by the number of fields, to discover problems with separators."
return
fi
inpfile=$1
if [ -z "$2" ]; then
sep=','
else
sep=$2
fi
awk -F"$sep" '{print NF}' "$inpfile" | sort -n | uniq -c
unset inpfile
unset sep
}
cols ()
{
if [[ $# -eq 0 || "$1" == "-h" ]] ; then
echo "Useage: cols file [separator=',']"
echo "Print the header of the file."
return
fi
inpfile=$1
if [ -z "$2" ]; then
sep=','
else
sep=$2
fi
head -n 1 "$inpfile" | tr "$sep" '\n' | nl -w1 | pr -t -2
unset inpfile
unset sep
}
search ()
{
if [[ $# -eq 0 || "$1" == "-h" ]] ; then
echo "Useage: search file query [separator=',']"
echo "Print rows containing the searched phrase."
return
fi
inpfile=$1
phrase=$2
if [ -z "$3" ]; then
sep=','
else
sep=$3
fi
awk -F"$sep" -v tgt="$phrase" '{for (i=1;i<=NF;i++) if ($i ~ tgt) print NR" f"i": "$i}' "$inpfile"
unset inpfile
unset phrase
unset sep
}
getrow ()
{
if [[ $# -eq 0 || "$1" == "-h" ]] ; then
echo "Useage: getrow file row [separator=',']"
echo "Print the record at the given row."
return
fi
inpfile=$1
row=$2
if [ -z "$3" ]; then
sep=','
else
sep=$3
fi
paste <(head -n1 "$inpfile" | tr "$sep" '\n') <(sed -n "${row}p" "$inpfile"| tr '\t' '\n') | nl
unset inpfile
unset row
unset sep
}
getcols ()
{
if [[ $# -eq 0 || "$1" == "-h" ]] ; then
echo "Useage: getcols file cols [separator=',']"
echo "Extract the record at the given row."
return
fi
inpfile=$1
cols=$2
if [ -z "$3" ]; then
sep=','
else
sep=$3
fi
awk -F"$sep" -vd="$cols" -vs="$sep" 'BEGIN{split(d, a, ",")} {for (i in a) printf "%s%s", $(a[i]), s; printf "\n"}' "$inpfile"
unset inpfile
unset cols
unset sep
}
alphanum ()
{
if [[ $# -eq 0 || "$1" == "-h" ]] ; then
echo "Useage: alphanum file [ignore]"
echo "Leave only the alphanumeric characters and user-specified characters."
return
fi
inpfile=$1
if [ -z "$2" ]; then
ignore=''
else
ignore=$2
fi
awk -vs="$ignore" '{ gsub("[^A-Za-z0-9 "s"]", " "); gsub(/ +/, " "); gsub(/^ +| +$/, "") } 1' "$inpfile"
unset inpfile
unset ignore
}
uniqvals ()
{
if [[ $# -eq 0 || "$1" == "-h" ]] ; then
echo "Useage: uniqvals file column [separator=',']"
echo "List and count the unique values in given column."
return
fi
inpfile=$1
column=$2
if [ -z "$3" ]; then
sep=','
else
sep=$3
fi
if [ "$sep" == '\t' ]; then
tail -n +2 "$inpfile" | cut -f"$column" | sort | uniq -c | sed "s/^[ ]*//;s/ /\t/"
else
tail -n +2 "$inpfile" | cut -d"$sep" -f"$column" | sort | uniq -c | sed "s/^[ ]*//;s/ /\t/"
fi
unset inpfile
unset column
unset sep
}
# convert CSV to TSV
csv2tsv ()
{
if [[ $# -eq 0 || "$1" == "-h" ]] ; then
echo "Useage: csv2tsv input_file [output_file]"
echo "Convert CSV to TSV (tab separated)."
return
fi
inpfile=$1
regexp='s/("([^"]*)")?,/\2\t/g'
if [ -z "$2" ]; then
sed -E "$regexp" "$inpfile"
else
sed -E "$regexp" "$inpfile" > "$2"
fi
unset inpfile
unset regexp
}
# convert CSV to DSV
csv2dsv ()
{
if [[ $# -eq 0 || "$1" == "-h" ]] ; then
echo "Useage: csv2dsv input_file [output_file]"
echo "Convert CSV to DSV (vertical bar separated)."
return
fi
inpfile=$1
regexp='s/("([^"]*)")?,/\2|/g'
if [ -z "$2" ]; then
sed -E "$regexp" "$inpfile"
else
sed -E "$regexp" "$inpfile" "$2"
fi
unset inpfile
unset regexp
}
# convert CSV to TSV
# https://stackoverflow.com/a/22421445/3986320
tsv2csv ()
{
if [[ $# -eq 0 || "$1" == "-h" ]] ; then
echo "Useage: tsv2csv input_file [output_file]"
echo "Convert TSV to CSV."
return
fi
inpfile=$1
if [ -z "$2" ]; then
awk 'BEGIN { FS="\t"; OFS="," } {
rebuilt=0
for(i=1; i<=NF; ++i) {
if ($i ~ /,/ && $i !~ /^".*"$/) { $i = "\"" $i "\""; rebuilt=1 }
}
if (!rebuilt) { $1=$1 }
print
}' "$inpfile"
else
awk 'BEGIN { FS="\t"; OFS="," } {
rebuilt=0
for(i=1; i<=NF; ++i) {
if ($i ~ /,/ && $i !~ /^".*"$/) { $i = "\"" $i "\""; rebuilt=1 }
}
if (!rebuilt) { $1=$1 }
print
}' "$inpfile" > "$2"
fi
unset inpfile
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment