elowy01 · September 22, 2022 16:11
diff --git a/AWK cheat sheet b/AWK cheat sheet
 awk '/gold/' coins.txt #look for all the records with the word gold and shows
 these rows
 //
 awk '{if ($3 < 1980) print $3, "    ",$5,$6,$7,$8}' coins.txt #$3 is a variable
 that stores the 3rd word of each row . "    " introduces 4 whitespaces for the
 printing
 //
 awk '{if ($3 >= 0) print $3}' filename #same as the previous one but we add the equal sign
 //
 NR holds the number of records read so far, i.e. the current record (line) number.
 In the following awk NR example, NR is the current line number inside the main block; in the END section NR holds the total number of records in the file.

 $ awk '{print "Processing Record - ",NR;}END {print NR, "Students Records are processed";}' student-marks
 Processing Record -  1
 Processing Record -  2
 Processing Record -  3
 Processing Record -  4
 Processing Record -  5
 5 Students Records are processed
 //
 awk 'END { print NR }' data #Count the lines in a file
 //
 NF # Number of fields (columns) in a record

 For example, if we have a file like the following:

 cat student-marks
 Jones 2143 78 84 77
 Gondrol 2321 56 58 45
 RinRao 2122 38 37
 Edwin 2537 78 67 45
 Dayan 2415 30 47

 The following awk will generate:

 $ awk '{print NR,"->",NF}' student-marks
 1 -> 5
 2 -> 5
 3 -> 4
 4 -> 5
 5 -> 4

 //
 awk -f <awk program file name> input-file1 #The commands can be written into a file, and then Awk 
 can be told to execute the commands
 //
 awk 'program' input-file1 input-file2.... #If the program is short, we can run the
 program from the command-line
 //
 $example++ #increments the specified variable by one
 //
 Example: 
 -rw-r--r--  1 arnold   user   1933 Nov  7 13:05 Makefile
 -rw-r--r--  1 arnold   user  10809 Nov  7 13:03 awk.h
 -rw-r--r--  1 arnold   user    983 Apr 13 12:14 awk.tab.h
 -rw-r--r--  1 arnold   user  31869 Jun 15 12:20 awk.y
 -rw-r--r--  1 arnold   user  22414 Nov  7 13:03 awk1.c
 -rw-r--r--  1 arnold   user  37455 Nov  7 13:03 awk2.c
 -rw-r--r--  1 arnold   user  27511 Dec  9 13:07 awk3.c
 -rw-r--r--  1 arnold   user   7989 Nov  7 13:03 awk4.c

 ls -l | awk '$6 == "Nov" { sum += $5 }
             END { print sum }'
 #when the 6th field is equal to Nov, the action is executed. In this case it adds the 5th
 field value to the sum variable. At the end we print the value of sum.
 //
 #another arithmetic operation
 awk '{sum+=$3-$2} END {print sum}' test.txt
 //
 /12/ { print $0 } ; /21/ { print $0 } #you might want to put more than one of 
 them on a line. This is accomplished by separating the statements 
 with a semicolon (;). 	     
 //
 awk '!/^#/ && $2==1 && $7==1 && $8==1' rawdatafile | wc -l #in this case the first
 line of rawdatafile starts with #. With the regex !/^#/ we tell awk not to
 consider lines starting with #. Besides, with | wc -l we count the number of lines that
 the awk command returns
 //
 #Some characters cannot be included literally in string constants ("foo") or regexp 
 constants (/foo/).Instead, they should be represented with escape sequences, which 
 are character sequences beginning with a backslash (\).
 //
 ^@chapter #matches @chapter at the beginning of a string
 //
 [^awk] #matches any character that is not an a, w, or k.
 //
 awk '{print $1}' prueba #print number 1 column
 //
 awk '{if ($2>90) print}' prueba #print the whole record, but only when column 2 is >90
 // 
 awk '/ENSP00000339623/ {print}' datafile1008 #searches for the regex and prints the
 matching records
 //
 awk '$1 !~/7/ {print}' prueba #prints all the records whose first field does not contain a 7
 //
 awk '{print $1 "\t\t" $2}' filename #prints the $1 and $2 columns leaving two tabs
 in between
 //
 awk '$3~/PATTERN/ {print}' filename.txt #search for a pattern in column 3 inside filename.txt
 //
 awk -F : #sets the field separator
 awk -F"\t"  '{print $2}' minus_ko_125_FDR.bed
 //	
 awk '{s += $1} END {print s}' prueba.txt #to sum column $1
 //
 #calculating number of columns in a tab-separated file
 awk -F'\t' '{print NF; exit}' filename
 //
 #skipping first line of a file
 awk 'NR!=1{print}' filename 
 //
 awk 'NR==10' file.txt #print only line 10 of file.txt
 //
 #equal to string or character
 awk '{if ($5=="U") print}' filename
 //
 #replace all runs of whitespace with a single tab
 awk -v OFS="\t" '$1=$1' file1
 //
 #regex in AWK
 /
 # selects, all input records with the uppercase letter ‘J’ somewhere in the first field:
 awk '$1 ~ /J/' inventory-shipped
 or
 awk '{ if ($1 ~ /J/) print }' inventory-shipped
 /
 #negating the REGEX now:
 awk '$1 !~ /J/' inventory-shipped
 //
 #Tab field separator
 awk 'BEGIN { FS = "\t" } ; { print $2 }'
 #Using REGEX in AWK
 awk 'BEGIN { FS = "[\t]" } {print $3}' results/linc_up.tfbs.sorted.tsv.tmp
 //
 #regex substitution within a field
 echo '02/08/2011 7,33 Shopping' | awk '{sub(/,/,".",$2)} 1'

 02/08/2011 7.33 Shopping

 //
 #Print all records from some pattern:
 awk '/pattern/{f=1}f' file
 //
 #doing arithmetic operations within AWK
 awk '{sum=$1+$2; print sum}' filename.txt
 //
 #piping in AWK
 cut -f1 test_path | awk 'BEGIN{OFS="\t"}{print "pg-trace-001:/nfs/1000g-work/ihec/drop/bp-raw-data/blueprint/data/"$1,"/ebi/ftp/pub/databases/blueprint/next_data/"$1}'
 //
 #string concatenation in awk
 awk -F'\t' '{print "string_to_concat" $1}'
 //
 #printing all columns except the first one:
 awk 'BEGIN{FS=OFS="\t"}{$1="";sub("\t","")}1'  filename
 //
 #concatenating a string to each line in a file
 awk '{print "prefix" $0}' file
 //
 #modifying a certain column in a file and printing the new columns separated by ;
 awk -F'\t'  '{ OFS=";"; $44=$44"something"; print}' file.txt
 //
 #getting sequence lengths in a FASTQ file:
 cat file.fastq | awk '{if(NR%4==2) print length($1)}' | sort -n | uniq -c
 //
 #add single quotes to a comma separated list of words:
 awk -F"," -v quote="'" -v OFS="','" '$1=$1 {print quote $0 quote}' file
 //
 #getting the max among a set of numbers:

 Suppose I have a file data.dat with three columns of numbers in plain text. I want to get the maximum value in column 3.

 > awk 'BEGIN {max = 0} {if ($3>max) max=$3} END {print max}' data.dat
 //
 #getting columns names and their position in file
 awk -F'\t' ' { for (i = 1; i <= NF; ++i) print i, $i; exit } ' file
 //
 #split in awk:
 awk '{split($0, a, ":")}'
 #           ^^  ^  ^^^
 #            |  |   |
 #       string  |   delimiter
 #               |
 #               array to store the pieces

 For example:

 echo "12|23|11" | awk '{split($0,a,"|"); print a[3],a[2],a[1]}'
 //
 # replace newlines (line breaks) with spaces:
 awk '{printf "%s ",$0} END {print ""}' yourfile.txt
 //
 # change chromosome notations
 (read at http://webappl.blogspot.com/2014/06/convert-vcf-chromosome-notation.html)
 1. Remove 'chr' from the chromosome notation:
 awk '{gsub(/^chr/,""); print}' with_chr.vcf > no_chr.vcf
 2. Add chr before chromosome id
 awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' no_chr.vcf > with_chr.vcf
	awk '/gold/' coins.txt #look for all the records with the word gold and shows
	these rows
	//
	awk '{if ($3 < 1980) print $3, "    ",$5,$6,$7,$8}' coins.txt #$3 is a variable
	that stores the 3rd word of each row . "    " introduces 4 whitespaces for the
	printing
	//
	awk '{if ($3 >= 0) print $3}' filename #same as the previous one but we add the equal sign
	//
	NR holds the number of records read so far, i.e. the current record (line) number.
	In the following awk NR example, NR is the current line number inside the main block; in the END section NR holds the total number of records in the file.

	$ awk '{print "Processing Record - ",NR;}END {print NR, "Students Records are processed";}' student-marks
	Processing Record - 1
	Processing Record - 2
	Processing Record - 3
	Processing Record - 4
	Processing Record - 5
	5 Students Records are processed
	//
	awk 'END { print NR }' data #Count the lines in a file
	//
	NF # Number of fields (columns) in a record

	For example, if we have a file like the following:

	cat student-marks
	Jones 2143 78 84 77
	Gondrol 2321 56 58 45
	RinRao 2122 38 37
	Edwin 2537 78 67 45
	Dayan 2415 30 47

	The following awk will generate:

	$ awk '{print NR,"->",NF}' student-marks
	1 -> 5
	2 -> 5
	3 -> 4
	4 -> 5
	5 -> 4

	//
	awk -f <awk program file name> input-file1 #The commands can be written into a file, and then Awk
	can be told to execute the commands
	//
	awk 'program' input-file1 input-file2.... #If the program is short, we can run the
	program from the command-line
	//
	$example++ #increments the specified variable by one
	//
	Example:
	-rw-r--r-- 1 arnold user 1933 Nov 7 13:05 Makefile
	-rw-r--r-- 1 arnold user 10809 Nov 7 13:03 awk.h
	-rw-r--r-- 1 arnold user 983 Apr 13 12:14 awk.tab.h
	-rw-r--r-- 1 arnold user 31869 Jun 15 12:20 awk.y
	-rw-r--r-- 1 arnold user 22414 Nov 7 13:03 awk1.c
	-rw-r--r-- 1 arnold user 37455 Nov 7 13:03 awk2.c
	-rw-r--r-- 1 arnold user 27511 Dec 9 13:07 awk3.c
	-rw-r--r-- 1 arnold user 7989 Nov 7 13:03 awk4.c

	ls -l | awk '$6 == "Nov" { sum += $5 }
	END { print sum }'
	#when the 6th field is equal to Nov, the action is executed. In this case it adds the 5th
	field value to the sum variable. At the end we print the value of sum.
	//
	#another arithmetic operation
	awk '{sum+=$3-$2} END {print sum}' test.txt
	//
	/12/ { print $0 } ; /21/ { print $0 } #you might want to put more than one of
	them on a line. This is accomplished by separating the statements
	with a semicolon (;).
	//
	awk '!/^#/ && $2==1 && $7==1 && $8==1' rawdatafile | wc -l #in this case the first
	line of rawdatafile starts with #. With the regex !/^#/ we tell awk not to
	consider lines starting with #. Besides, with | wc -l we count the number of lines that
	the awk command returns
	//
	#Some characters cannot be included literally in string constants ("foo") or regexp
	constants (/foo/).Instead, they should be represented with escape sequences, which
	are character sequences beginning with a backslash (\).
	//
	^@chapter #matches @chapter at the beginning of a string
	//
	[^awk] #matches any character that is not an a, w, or k.
	//
	awk '{print $1}' prueba #print number 1 column
	//
	awk '{if ($2>90) print}' prueba #print the whole record, but only when column 2 is >90
	//
	awk '/ENSP00000339623/ {print}' datafile1008 #searches for the regex and prints the
	matching records
	//
	awk '$1 !~/7/ {print}' prueba #prints all the records whose first field does not contain a 7
	//
	awk '{print $1 "\t\t" $2}' filename #prints the $1 and $2 columns leaving two tabs
	in between
	//
	awk '$3~/PATTERN/ {print}' filename.txt #search for a pattern in column 3 inside filename.txt
	//
	awk -F : #sets the field separator
	awk -F"\t" '{print $2}' minus_ko_125_FDR.bed
	//
	awk '{s += $1} END {print s}' prueba.txt #to sum column $1
	//
	#calculating number of columns in a tab-separated file
	awk -F'\t' '{print NF; exit}' filename
	//
	#skipping first line of a file
	awk 'NR!=1{print}' filename
	//
	awk 'NR==10' file.txt #print only line 10 of file.txt
	//
	#equal to string or character
	awk '{if ($5=="U") print}' filename
	//
	#replace all runs of whitespace with a single tab
	awk -v OFS="\t" '$1=$1' file1
	//
	#regex in AWK
	/
	# selects, all input records with the uppercase letter ‘J’ somewhere in the first field:
	awk '$1 ~ /J/' inventory-shipped
	or
	awk '{ if ($1 ~ /J/) print }' inventory-shipped
	/
	#negating the REGEX now:
	awk '$1 !~ /J/' inventory-shipped
	//
	#Tab field separator
	awk 'BEGIN { FS = "\t" } ; { print $2 }'
	#Using REGEX in AWK
	awk 'BEGIN { FS = "[\t]" } {print $3}' results/linc_up.tfbs.sorted.tsv.tmp
	//
	#regex substitution within a field
	echo '02/08/2011 7,33 Shopping' | awk '{sub(/,/,".",$2)} 1'

	02/08/2011 7.33 Shopping

	//
	#Print all records from some pattern:
	awk '/pattern/{f=1}f' file
	//
	#doing arithmetic operations within AWK
	awk '{sum=$1+$2; print sum}' filename.txt
	//
	#piping in AWK
	cut -f1 test_path | awk 'BEGIN{OFS="\t"}{print "pg-trace-001:/nfs/1000g-work/ihec/drop/bp-raw-data/blueprint/data/"$1,"/ebi/ftp/pub/databases/blueprint/next_data/"$1}'
	//
	#string concatenation in awk
	awk -F'\t' '{print "string_to_concat" $1}'
	//
	#printing all columns except the first one:
	awk 'BEGIN{FS=OFS="\t"}{$1="";sub("\t","")}1' filename
	//
	#concatenating a string to each line in a file
	awk '{print "prefix" $0}' file
	//
	#modifying a certain column in a file and printing the new columns separated by ;
	awk -F'\t' '{ OFS=";"; $44=$44"something"; print}' file.txt
	//
	#getting sequence lengths in a FASTQ file:
	cat file.fastq | awk '{if(NR%4==2) print length($1)}' | sort -n | uniq -c
	//
	#add single quotes to a comma separated list of words:
	awk -F"," -v quote="'" -v OFS="','" '$1=$1 {print quote $0 quote}' file
	//
	#getting the max among a set of numbers:

	Suppose I have a file data.dat with three columns of numbers in plain text. I want to get the maximum value in column 3.

	> awk 'BEGIN {max = 0} {if ($3>max) max=$3} END {print max}' data.dat
	//
	#getting columns names and their position in file
	awk -F'\t' ' { for (i = 1; i <= NF; ++i) print i, $i; exit } ' file
	//
	#split in awk:
	awk '{split($0, a, ":")}'
	#           ^^  ^  ^^^
	#            |  |   |
	#       string  |   delimiter
	#               |
	#               array to store the pieces

	For example:

	echo "12|23|11" | awk '{split($0,a,"|"); print a[3],a[2],a[1]}'
	//
	# replace newlines (line breaks) with spaces:
	awk '{printf "%s ",$0} END {print ""}' yourfile.txt
	//
	# change chromosome notations
	(read at http://webappl.blogspot.com/2014/06/convert-vcf-chromosome-notation.html)
	1. Remove 'chr' from the chromosome notation:
	awk '{gsub(/^chr/,""); print}' with_chr.vcf > no_chr.vcf
	2. Add chr before chromosome id
	awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' no_chr.vcf > with_chr.vcf