Skip to content

Instantly share code, notes, and snippets.

@lindenb
Last active November 20, 2020 14:15
Show Gist options
  • Save lindenb/50a763141293994afdced2886c0dbe70 to your computer and use it in GitHub Desktop.
Save lindenb/50a763141293994afdced2886c0dbe70 to your computer and use it in GitHub Desktop.
a basic GWAK extension generating the reverse complement of a DNA sequence
*.o
*.so
*~
gawk-*

A basic C-based GWAK extension generating the reverse/complement of a DNA sequence.

This extension defines three new functions for gawk:

  • reverse_complement(string)
  • complement(string)
  • reverse(string)

Compilation

make

the script generate a shared library sequence.so

Example:

# set the directory to sequence.so
export AWKLIBPATH=/path/to/dir

$ echo "Hello ACAAGTTTTt" | gawk  --load sequence '{print complement($2);}' 
TGTTCAAAAa
$ echo "Hello ACAAGTTTTt" | gawk  --load sequence '{print reverse($2);}' 
tTTTTGAACA
$ echo "Hello ACAAGTTTTt" | gawk  --load sequence '{print reverse_complement($2);}' 
aAAAACTTGT

See also

Author

Pierre Lindenbaum PhD Institut du Thorax. Nantes

GAWK_VERSION=5.1.0
CFLAGS= -Igawk-$(GAWK_VERSION) -Wall
LDFLAGS= -Lgawk-$(GAWK_VERSION) -shared -fPIC
CC?=gcc
SHLIBEXT=.so
.PHONY: test all clean
all: test
test:sequence$(SHLIBEXT)
echo "Hello ACAAGTTTTt" | AWKLIBPATH=$${PWD} ./gawk-5.1.0/gawk --load sequence '{print complement($$2);}' | grep -w -F TGTTCAAAAa
echo "Hello ACAAGTTTTt" | AWKLIBPATH=$${PWD} ./gawk-5.1.0/gawk --load sequence '{print reverse($$2);}' | grep -w -F tTTTTGAACA
echo "Hello ACAAGTTTTt" | AWKLIBPATH=$${PWD} ./gawk-5.1.0/gawk --load sequence '{print reverse_complement($$2);}' | grep -w -F aAAAACTTGT
@echo "SUCCESS"
sequence$(SHLIBEXT): sequence.c gawk-$(GAWK_VERSION)/gawkapi.h
$(CC) -o $@ $(CFLAGS) $(LDFLAGS) $<
gawk-$(GAWK_VERSION)/gawkapi.h : gawk-$(GAWK_VERSION)/gawk
gawk-$(GAWK_VERSION)/gawk : gawk-$(GAWK_VERSION)/Makefile.in
cd $(dir $<) && ./configure && make
gawk-$(GAWK_VERSION)/Makefile.in :
wget -O jeter.tar.gz "https://ftp.gnu.org/gnu/gawk/gawk-$(GAWK_VERSION).tar.gz"
tar xvfz jeter.tar.gz
rm jeter.tar.gz
touch -c $@
clean:
rm -f sequence$(SHLIBEXT)
rm -rf gawk-$(GAWK_VERSION)
/**
A simple GAWK extension with functions to manipulate DNA.
Pierre Lindenbaum PhD - Institut du Thorax. Nantes. France.
2020
*/
#include <stdio.h>
#include <assert.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include "gawkapi.h"
static const gawk_api_t *api; /* for convenience macros to work */
static awk_ext_id_t ext_id;
static const char *ext_version = "sequence extension: version 1.0";
// #define DEBUG fprintf(stderr,"[DEBUG]%s:%d\n",__FILE__,__LINE__)
int plugin_is_GPL_compatible;
static awk_value_t* modify_sequence(awk_value_t* result,awk_bool_t reverse,awk_bool_t complement) {
awk_value_t param;
char* text = NULL;
size_t i;
make_null_string(result);
if (! get_argument(0, AWK_STRING,&param)) {
update_ERRNO_string("sequence: missing required STRING argument");
return result;
}
emalloc(text, char *, param.str_value.len, "sequence");
for(i=0;i< param.str_value.len;i++) {
char c= param.str_value.str[reverse ? (param.str_value.len -1) - i: i];
if(complement) {
switch(c) {
case 'a': c = 't'; break;
case 'A': c = 'T'; break;
case 't': c = 'a'; break;
case 'T': c = 'A'; break;
case 'g': c = 'c'; break;
case 'G': c = 'C'; break;
case 'c': c = 'g'; break;
case 'C': c = 'G'; break;
default: c='N'; break;
}
}
text[i]= c;
}
make_malloced_string(text,param.str_value.len, result);
return result;
}
static awk_value_t *do_reverse_complement(int nargs, awk_value_t *result, struct awk_ext_func *unused) {
return modify_sequence(result,awk_true,awk_true);
}
static awk_value_t *do_reverse(int nargs, awk_value_t *result, struct awk_ext_func *unused) {
return modify_sequence(result,awk_true,awk_false);
}
static awk_value_t *do_complement(int nargs, awk_value_t *result, struct awk_ext_func *unused) {
return modify_sequence(result,awk_false,awk_true);
}
static awk_bool_t init_sequence() {
return awk_true;
}
static awk_bool_t (*init_func)(void) = init_sequence;
static awk_ext_func_t func_table[] = {
{ "reverse_complement", do_reverse_complement, 1, 1, awk_true, NULL },
{ "reverse", do_reverse, 1, 1, awk_true, NULL },
{ "complement", do_complement, 1, 1, awk_true, NULL },
};
/* define the dl_load function using the boilerplate macro */
dl_load_func(func_table, sequence, "")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment