twolodzko · August 22, 2018 09:21
diff --git a/data_cleaning.sh b/data_cleaning.sh

 # Many of the functions are adapted from:
 # https://www.polydesmida.info/cookbook/functions.html


 nohead ()
 {
 if [[ $# -eq 0 || "$1" == "-h" ]] ; then
  echo "Useage: nohead file [n=1]"
  echo "Omit header (n rows) from a file."
  return
 fi

 inpfile=$1

 if [ -z "$2" ]; then
  nrows=2
 else
  nrows=$(($2+1))
 fi

 tail -n +"$nrows" "$inpfile"

 unset inpfile
 unset nrows
 }


 broken ()
 {
 if [[ $# -eq 0 || "$1" == "-h" ]] ; then
  echo "Useage: broken file [separator=',']"
  echo "Count the rows groupped by the number of fields, to discover problems with separators."
  return
 fi

 inpfile=$1

 if [ -z "$2" ]; then
  sep=','
 else
  sep=$2
 fi

 awk -F"$sep" '{print NF}' "$inpfile" | sort -n | uniq -c

 unset inpfile
 unset sep
 }


 cols ()
 {
 if [[ $# -eq 0 || "$1" == "-h" ]] ; then
  echo "Useage: cols file [separator=',']"
  echo "Print the header of the file."
  return
 fi

 inpfile=$1

 if [ -z "$2" ]; then
  sep=','
 else
  sep=$2
 fi

 head -n 1 "$inpfile" | tr "$sep" '\n' | nl -w1 | pr -t -2

 unset inpfile
 unset sep
 }


 search ()
 {
 if [[ $# -eq 0 || "$1" == "-h" ]] ; then
  echo "Useage: search file query [separator=',']"
  echo "Print rows containing the searched phrase."
  return
 fi

 inpfile=$1
 phrase=$2

 if [ -z "$3" ]; then
  sep=','
 else
  sep=$3
 fi

 awk -F"$sep" -v tgt="$phrase" '{for (i=1;i<=NF;i++) if ($i ~ tgt) print NR" f"i": "$i}' "$inpfile"

 unset inpfile
 unset phrase
 unset sep
 }


 getrow ()
 {
 if [[ $# -eq 0 || "$1" == "-h" ]] ; then
  echo "Useage: getrow file row [separator=',']"
  echo "Print the record at the given row."
  return
 fi

 inpfile=$1
 row=$2

 if [ -z "$3" ]; then
  sep=','
 else
  sep=$3
 fi

 paste <(head -n1 "$inpfile" | tr "$sep" '\n') <(sed -n "${row}p" "$inpfile"| tr '\t' '\n') | nl

 unset inpfile
 unset row
 unset sep
 }


 getcols ()
 {
 if [[ $# -eq 0 || "$1" == "-h" ]] ; then
  echo "Useage: getcols file cols [separator=',']"
  echo "Extract the record at the given row."
  return
 fi

 inpfile=$1
 cols=$2

 if [ -z "$3" ]; then
  sep=','
 else
  sep=$3
 fi

 awk -F"$sep" -vd="$cols" -vs="$sep" 'BEGIN{split(d, a, ",")} {for (i in a) printf "%s%s", $(a[i]), s; printf "\n"}' "$inpfile"

 unset inpfile
 unset cols
 unset sep
 }


 alphanum ()
 {
 if [[ $# -eq 0 || "$1" == "-h" ]] ; then
  echo "Useage: alphanum file [ignore]"
  echo "Leave only the alphanumeric characters and user-specified characters."
  return
 fi

 inpfile=$1

 if [ -z "$2" ]; then
  ignore=''
 else
  ignore=$2
 fi

 awk -vs="$ignore" '{ gsub("[^A-Za-z0-9 "s"]", " "); gsub(/ +/, " "); gsub(/^ +| +$/, "") } 1' "$inpfile"

 unset inpfile
 unset ignore
 }


 uniqvals ()
 {
 if [[ $# -eq 0 || "$1" == "-h" ]] ; then
  echo "Useage: uniqvals file column [separator=',']"
  echo "List and count the unique values in given column."
  return
 fi

 inpfile=$1
 column=$2

 if [ -z "$3" ]; then
  sep=','
 else
  sep=$3
 fi

 if [ "$sep" == '\t' ]; then
  tail -n +2 "$inpfile" | cut -f"$column" | sort | uniq -c | sed "s/^[ ]*//;s/ /\t/"
 else
  tail -n +2 "$inpfile" | cut -d"$sep" -f"$column" | sort | uniq -c | sed "s/^[ ]*//;s/ /\t/"
 fi

 unset inpfile
 unset column
 unset sep
 }


 # convert CSV to TSV

 csv2tsv ()
 {
 if [[ $# -eq 0 || "$1" == "-h" ]] ; then
  echo "Useage: csv2tsv input_file [output_file]"
  echo "Convert CSV to TSV (tab separated)."
  return
 fi

 inpfile=$1
 regexp='s/("([^"]*)")?,/\2\t/g'

 if [ -z "$2" ]; then
  sed -E "$regexp" "$inpfile"
 else
  sed -E "$regexp" "$inpfile" > "$2"
 fi

 unset inpfile
 unset regexp
 } 


 # convert CSV to DSV

 csv2dsv ()
 {
 if [[ $# -eq 0 || "$1" == "-h" ]] ; then
  echo "Useage: csv2dsv input_file [output_file]"
  echo "Convert CSV to DSV (vertical bar separated)."
  return
 fi

 inpfile=$1
 regexp='s/("([^"]*)")?,/\2|/g'

 if [ -z "$2" ]; then
  sed -E "$regexp" "$inpfile"
 else
  sed -E "$regexp" "$inpfile" "$2"
 fi

 unset inpfile
 unset regexp
 } 


 # convert CSV to TSV
 # https://stackoverflow.com/a/22421445/3986320

 tsv2csv () 
 {
 if [[ $# -eq 0 || "$1" == "-h" ]] ; then
  echo "Useage: tsv2csv input_file [output_file]"
  echo "Convert TSV to CSV."
  return
 fi

 inpfile=$1

 if [ -z "$2" ]; then
  awk 'BEGIN { FS="\t"; OFS="," } {
    rebuilt=0
    for(i=1; i<=NF; ++i) {
      if ($i ~ /,/ && $i !~ /^".*"$/) { $i = "\"" $i "\""; rebuilt=1 }
    }
    if (!rebuilt) { $1=$1 }
    print
  }' "$inpfile"
 else
  awk 'BEGIN { FS="\t"; OFS="," } {
    rebuilt=0
    for(i=1; i<=NF; ++i) {
      if ($i ~ /,/ && $i !~ /^".*"$/) { $i = "\"" $i "\""; rebuilt=1 }
    }
    if (!rebuilt) { $1=$1 }
    print
  }' "$inpfile" > "$2"
 fi

 unset inpfile
 }

	# Many of the functions are adapted from:
	# https://www.polydesmida.info/cookbook/functions.html


	nohead ()
	{
	if [[ $# -eq 0 \|\| "$1" == "-h" ]] ; then
	echo "Useage: nohead file [n=1]"
	echo "Omit header (n rows) from a file."
	return
	fi

	inpfile=$1

	if [ -z "$2" ]; then
	nrows=2
	else
	nrows=$(($2+1))
	fi

	tail -n +"$nrows" "$inpfile"

	unset inpfile
	unset nrows
	}


	broken ()
	{
	if [[ $# -eq 0 \|\| "$1" == "-h" ]] ; then
	echo "Useage: broken file [separator=',']"
	echo "Count the rows groupped by the number of fields, to discover problems with separators."
	return
	fi

	inpfile=$1

	if [ -z "$2" ]; then
	sep=','
	else
	sep=$2
	fi

	awk -F"$sep" '{print NF}' "$inpfile" \| sort -n \| uniq -c

	unset inpfile
	unset sep
	}


	cols ()
	{
	if [[ $# -eq 0 \|\| "$1" == "-h" ]] ; then
	echo "Useage: cols file [separator=',']"
	echo "Print the header of the file."
	return
	fi

	inpfile=$1

	if [ -z "$2" ]; then
	sep=','
	else
	sep=$2
	fi

	head -n 1 "$inpfile" \| tr "$sep" '\n' \| nl -w1 \| pr -t -2

	unset inpfile
	unset sep
	}


	search ()
	{
	if [[ $# -eq 0 \|\| "$1" == "-h" ]] ; then
	echo "Useage: search file query [separator=',']"
	echo "Print rows containing the searched phrase."
	return
	fi

	inpfile=$1
	phrase=$2

	if [ -z "$3" ]; then
	sep=','
	else
	sep=$3
	fi

	awk -F"$sep" -v tgt="$phrase" '{for (i=1;i<=NF;i++) if ($i ~ tgt) print NR" f"i": "$i}' "$inpfile"

	unset inpfile
	unset phrase
	unset sep
	}


	getrow ()
	{
	if [[ $# -eq 0 \|\| "$1" == "-h" ]] ; then
	echo "Useage: getrow file row [separator=',']"
	echo "Print the record at the given row."
	return
	fi

	inpfile=$1
	row=$2

	if [ -z "$3" ]; then
	sep=','
	else
	sep=$3
	fi

	paste <(head -n1 "$inpfile" \| tr "$sep" '\n') <(sed -n "${row}p" "$inpfile"\| tr '\t' '\n') \| nl

	unset inpfile
	unset row
	unset sep
	}


	getcols ()
	{
	if [[ $# -eq 0 \|\| "$1" == "-h" ]] ; then
	echo "Useage: getcols file cols [separator=',']"
	echo "Extract the record at the given row."
	return
	fi

	inpfile=$1
	cols=$2

	if [ -z "$3" ]; then
	sep=','
	else
	sep=$3
	fi

	awk -F"$sep" -vd="$cols" -vs="$sep" 'BEGIN{split(d, a, ",")} {for (i in a) printf "%s%s", $(a[i]), s; printf "\n"}' "$inpfile"

	unset inpfile
	unset cols
	unset sep
	}


	alphanum ()
	{
	if [[ $# -eq 0 \|\| "$1" == "-h" ]] ; then
	echo "Useage: alphanum file [ignore]"
	echo "Leave only the alphanumeric characters and user-specified characters."
	return
	fi

	inpfile=$1

	if [ -z "$2" ]; then
	ignore=''
	else
	ignore=$2
	fi

	awk -vs="$ignore" '{ gsub("[^A-Za-z0-9 "s"]", " "); gsub(/ +/, " "); gsub(/^ +\| +$/, "") } 1' "$inpfile"

	unset inpfile
	unset ignore
	}


	uniqvals ()
	{
	if [[ $# -eq 0 \|\| "$1" == "-h" ]] ; then
	echo "Useage: uniqvals file column [separator=',']"
	echo "List and count the unique values in given column."
	return
	fi

	inpfile=$1
	column=$2

	if [ -z "$3" ]; then
	sep=','
	else
	sep=$3
	fi

	if [ "$sep" == '\t' ]; then
	tail -n +2 "$inpfile" \| cut -f"$column" \| sort \| uniq -c \| sed "s/^[ ]*//;s/ /\t/"
	else
	tail -n +2 "$inpfile" \| cut -d"$sep" -f"$column" \| sort \| uniq -c \| sed "s/^[ ]*//;s/ /\t/"
	fi

	unset inpfile
	unset column
	unset sep
	}


	# convert CSV to TSV

	csv2tsv ()
	{
	if [[ $# -eq 0 \|\| "$1" == "-h" ]] ; then
	echo "Useage: csv2tsv input_file [output_file]"
	echo "Convert CSV to TSV (tab separated)."
	return
	fi

	inpfile=$1
	regexp='s/("([^"]*)")?,/\2\t/g'

	if [ -z "$2" ]; then
	sed -E "$regexp" "$inpfile"
	else
	sed -E "$regexp" "$inpfile" > "$2"
	fi

	unset inpfile
	unset regexp
	}


	# convert CSV to DSV

	csv2dsv ()
	{
	if [[ $# -eq 0 \|\| "$1" == "-h" ]] ; then
	echo "Useage: csv2dsv input_file [output_file]"
	echo "Convert CSV to DSV (vertical bar separated)."
	return
	fi

	inpfile=$1
	regexp='s/("([^"]*)")?,/\2\|/g'

	if [ -z "$2" ]; then
	sed -E "$regexp" "$inpfile"
	else
	sed -E "$regexp" "$inpfile" "$2"
	fi

	unset inpfile
	unset regexp
	}


	# convert CSV to TSV
	# https://stackoverflow.com/a/22421445/3986320

	tsv2csv ()
	{
	if [[ $# -eq 0 \|\| "$1" == "-h" ]] ; then
	echo "Useage: tsv2csv input_file [output_file]"
	echo "Convert TSV to CSV."
	return
	fi

	inpfile=$1

	if [ -z "$2" ]; then
	awk 'BEGIN { FS="\t"; OFS="," } {
	rebuilt=0
	for(i=1; i<=NF; ++i) {
	if ($i ~ /,/ && $i !~ /^".*"$/) { $i = "\"" $i "\""; rebuilt=1 }
	}
	if (!rebuilt) { $1=$1 }
	print
	}' "$inpfile"
	else
	awk 'BEGIN { FS="\t"; OFS="," } {
	rebuilt=0
	for(i=1; i<=NF; ++i) {
	if ($i ~ /,/ && $i !~ /^".*"$/) { $i = "\"" $i "\""; rebuilt=1 }
	}
	if (!rebuilt) { $1=$1 }
	print
	}' "$inpfile" > "$2"
	fi

	unset inpfile
	}
No results found