Last active
September 22, 2022 16:11
-
-
Save elowy01/ddcdd1ebfeca342e13ccf0900440ddf5 to your computer and use it in GitHub Desktop.
AWK cheat sheet
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
awk '/gold/' coins.txt #look for all the records containing the word gold and show | |
these rows | |
// | |
awk '{if ($3 < 1980) print $3, " ",$5,$6,$7,$8}' coins.txt #$3 is a variable | |
that stores the 3rd field of each row. The " " string literal adds extra whitespace between $3 and $5 in the | |
printed output | |
// | |
awk '{if ($3 >= 0) print $3}' filename #same as the previous one but we add the equal sign | |
// | |
NR holds the number of records read so far, i.e. the current line number. | |
In the following example, NR holds the current line number inside the main block; in the END section NR holds the total number of records in the file. | |
$ awk '{print "Processing Record - ",NR;}END {print NR, "Students Records are processed";}' student-marks | |
Processing Record - 1 | |
Processing Record - 2 | |
Processing Record - 3 | |
Processing Record - 4 | |
Processing Record - 5 | |
5 Students Records are processed | |
// | |
awk 'END { print NR }' data #Count the lines in a file | |
// | |
NF # Number of fields (columns) in a record | |
For example, if we have a file like the following: | |
cat student-marks | |
Jones 2143 78 84 77 | |
Gondrol 2321 56 58 45 | |
RinRao 2122 38 37 | |
Edwin 2537 78 67 45 | |
Dayan 2415 30 47 | |
The following awk will generate: | |
$ awk '{print NR,"->",NF}' student-marks | |
1 -> 5 | |
2 -> 5 | |
3 -> 4 | |
4 -> 5 | |
5 -> 4 | |
// | |
awk -f <awk program file name> input-file1 #The commands can be written into a file, and then Awk | |
can be told to execute the commands | |
// | |
awk 'program' input-file1 input-file2.... #If the program is short, we can run the | |
program from the command-line | |
// | |
$example++ #increments the specified variable by one | |
// | |
Example: | |
-rw-r--r-- 1 arnold user 1933 Nov 7 13:05 Makefile | |
-rw-r--r-- 1 arnold user 10809 Nov 7 13:03 awk.h | |
-rw-r--r-- 1 arnold user 983 Apr 13 12:14 awk.tab.h | |
-rw-r--r-- 1 arnold user 31869 Jun 15 12:20 awk.y | |
-rw-r--r-- 1 arnold user 22414 Nov 7 13:03 awk1.c | |
-rw-r--r-- 1 arnold user 37455 Nov 7 13:03 awk2.c | |
-rw-r--r-- 1 arnold user 27511 Dec 9 13:07 awk3.c | |
-rw-r--r-- 1 arnold user 7989 Nov 7 13:03 awk4.c | |
ls -l | awk '$6 == "Nov" { sum += $5 } | |
END { print sum }' | |
#when the 6th field is equal to Nov, awk executes the action. In this case it adds the 5th | |
field value to the sum variable. At the end we print the value of sum. | |
// | |
#another arithmetic operation | |
awk '{sum+=$3-$2} END {print sum}' test.txt | |
// | |
/12/ { print $0 } ; /21/ { print $0 } #you might want to put more than one of | |
them on a line. This is accomplished by separating the statements | |
with a semicolon (;). | |
// | |
awk '!/^#/ && $2==1 && $7==1 && $8==1' rawdatafile | wc -l #in this case the first | |
line of rawdatafile starts with #, so the !/^#/ condition tells awk to skip | |
lines that start with #. In addition, wc -l counts the number of lines | |
returned by the awk command | |
// | |
#Some characters cannot be included literally in string constants ("foo") or regexp | |
constants (/foo/). Instead, they should be represented with escape sequences, which | |
are character sequences beginning with a backslash (\). | |
// | |
^@chapter #matches @chapter at the beginning of a string | |
// | |
[^awk] #matches any character that is not an a, w, or k. | |
// | |
awk '{print $1}' prueba #print the first column | |
// | |
awk '{if ($2>90) print}' prueba #print the whole record, but only when column 2 is greater than 90 | |
// | |
awk '/ENSP00000339623/ {print}' datafile1008 #searches for the regex and prints the | |
matching records | |
// | |
awk '$1 !~/7/ {print}' prueba #prints all the records whose first field does not contain a 7 | |
// | |
awk '{print $1 "\t\t" $2}' filename #prints the $1 and $2 columns separated by two tabs | |
in the middle | |
// | |
awk '$3~/PATTERN/ {print}' filename.txt #search for a pattern in column 3 inside filename.txt | |
// | |
awk -F : #sets the field separator | |
awk -F"\t" '{print $2}' minus_ko_125_FDR.bed | |
// | |
awk '{s += $1} END {print s}' prueba.txt #to sum column $1 | |
// | |
#calculating number of columns in a tab-separated file | |
awk -F'\t' '{print NF; exit}' filename | |
// | |
#skipping first line of a file | |
awk 'NR!=1{print}' filename | |
// | |
awk 'NR==10' file.txt #print only line 10 of file.txt | |
// | |
#equal to string or character | |
awk '{if ($5=="U") print}' filename | |
// | |
#replace every run of whitespace between fields with a single tab | |
awk -v OFS="\t" '$1=$1' file1 | |
// | |
#regex in AWK | |
/ | |
# selects all input records with the uppercase letter ‘J’ somewhere in the first field: | |
awk '$1 ~ /J/' inventory-shipped | |
or | |
awk '{ if ($1 ~ /J/) print }' inventory-shipped | |
/ | |
#negating the REGEX now: | |
awk '$1 !~ /J/' inventory-shipped | |
// | |
#Tab field separator | |
awk 'BEGIN { FS = "\t" } ; { print $2 }' | |
#Using REGEX in AWK | |
awk 'BEGIN { FS = "[\t]" } {print $3}' results/linc_up.tfbs.sorted.tsv.tmp | |
// | |
#regex substitution within a field | |
echo '02/08/2011 7,33 Shopping' | awk '{sub(/,/,".",$2)} 1' | |
02/08/2011 7.33 Shopping | |
// | |
#Print all records from some pattern: | |
awk '/pattern/{f=1}f' file | |
// | |
#doing arithmetic operations within AWK (note: a bare print outputs the unchanged record; use "print sum" to output the computed value) | |
awk '{sum=$1+$2; print}' filename.txt | |
// | |
#piping in AWK | |
cut -f1 test_path | awk 'BEGIN{OFS="\t"}{print "pg-trace-001:/nfs/1000g-work/ihec/drop/bp-raw-data/blueprint/data/"$1,"/ebi/ftp/pub/databases/blueprint/next_data/"$1}' | |
// | |
#string concatenation in awk | |
awk -F'\t' '{print "string_to_concat" $1}' | |
// | |
#printing all columns except the first one: | |
awk 'BEGIN{FS=OFS="\t"}{$1="";sub("\t","")}1' filename | |
// | |
#concatenating a string to each line in a file | |
awk '{print "prefix" $0}' file | |
// | |
#modifying a certain column in a file and printing the new columns separated by ; | |
awk -F'\t' '{ OFS=";"; $44=$44"something"; print}' file.txt | |
// | |
#getting sequence lengths in a FASTQ file: | |
cat file.fastq | awk '{if(NR%4==2) print length($1)}' | sort -n | uniq -c | |
// | |
#add single quotes to a comma separated list of words: | |
awk -F"," -v quote="'" -v OFS="','" '$1=$1 {print quote $0 quote}' file | |
// | |
#getting the max among a set of numbers: | |
Suppose I have a file data.dat with three columns of numbers in plain text. I want to get the maximum value in column 3. | |
> awk 'BEGIN {max = 0} {if ($3>max) max=$3} END {print max}' data.dat | |
// | |
#getting column names and their positions in the file | |
awk -F'\t' ' { for (i = 1; i <= NF; ++i) print i, $i; exit } ' file | |
// | |
#split in awk: | |
awk '{split($0, a, ":")}' | |
# ^^ ^ ^^^ | |
# | | | | |
# string | delimiter | |
# | | |
# array to store the pieces | |
For example: | |
echo "12|23|11" | awk '{split($0,a,"|"); print a[3],a[2],a[1]}' | |
// | |
# replace newlines (line breaks) with spaces: | |
awk '{printf "%s ",$0} END {print ""}' yourfile.txt | |
// | |
# change chromosome notations | |
(read at http://webappl.blogspot.com/2014/06/convert-vcf-chromosome-notation.html) | |
1. Remove 'chr' from the chromosome notation: | |
awk '{gsub(/^chr/,""); print}' with_chr.vcf > no_chr.vcf | |
2. Add chr before chromosome id | |
awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' no_chr.vcf > with_chr.vcf |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment