Skip to content

Instantly share code, notes, and snippets.

@lindenb
Last active September 9, 2019 14:24
Show Gist options
  • Save lindenb/c3f12405e24fbcc609dc6a7110063add to your computer and use it in GitHub Desktop.
Save lindenb/c3f12405e24fbcc609dc6a7110063add to your computer and use it in GitHub Desktop.
ode golf: detecting homopolymers of length N in the (human) genome. C biostar repeat
/**
https://www.biostars.org/p/379454/#379505
Code golf: detecting homopolymers of length N in the (human) genome
Author: Pierre Lindenbaum
compilation:
gcc -O3 -Wall -o biostar379454 biostar379454.c
*/
#include <stdio.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <sys/io.h>
#include <sys/mman.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <ctype.h>
#define DUMP if(len_repeat>=len) {fputs(seq_name,stdout);printf("\t%d\t%d\t%c[%d]\n",pos-len_repeat,pos,prev_c,len_repeat); }len_repeat=0;
#define BUF_STDOUT 1000000
int main(int argc, char const *argv[]) {
char *seq;
char* buff=NULL;
size_t size,x;
int len=0,prev_c=-1,len_repeat=0,pos=0;
char* seq_name=NULL;
struct stat s;
int fd;
if(argc!=3) {
fprintf(stderr,"Usage: %s fasta size.\n",argv[0]);
return EXIT_FAILURE;
}
fd = open (argv[1], O_RDONLY);
if(fd < 0) {
fprintf(stderr,"Cannot open: %s %s.\n",argv[1],strerror(errno));
return EXIT_FAILURE;
}
len = atoi(argv[2]);
buff=(char*)malloc(BUF_STDOUT);
if(buff==NULL) {
fprintf(stderr,"Out of memory\n");
return EXIT_FAILURE;
}
setvbuf(stdout, buff, _IOFBF, BUF_STDOUT);
if(len < 2) {
fprintf(stderr,"bad length %s.\n",argv[2]);
return EXIT_FAILURE;
}
/* Get the size of the file. */
if(fstat (fd, & s)!=0) {
fprintf(stderr,"Cannot stat: %s %s.\n",argv[1],strerror(errno));
return EXIT_FAILURE;
}
size = s.st_size;
seq = (char *) mmap (0, size, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0);
x=0;
while(x<size) {
if(seq[x]=='>') {
size_t x0=x;
DUMP;
free(seq_name);
while(seq[x]!='\n' && x < size) x++;
seq_name = strndup(&seq[x0+1],x-x0-1);
len_repeat=0;
pos=0;
prev_c=-1;
}
else {
int c=toupper(seq[x++]);
if(isspace(c)) continue;
++pos;
if(prev_c==c) {
++len_repeat;
}
else
{
DUMP;
prev_c=c;
}
}
}
fflush(stdout);
free(buff);
munmap(seq, size);
free(seq_name);
return 0;
}
@chunyangbao
Copy link

You mean the output is a fasta, not a BED? Thanks

@chunyangbao
Copy link

The output is a 4-column BED. I generated one by myself. It's very efficient!
1 2 13 A[11]
1 16 40 T[24]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment