Skip to content

Instantly share code, notes, and snippets.

@lindenb
Created April 19, 2019 14:36
Show Gist options
  • Save lindenb/b16c1485c2998af197654c23a9418209 to your computer and use it in GitHub Desktop.
Save lindenb/b16c1485c2998af197654c23a9418209 to your computer and use it in GitHub Desktop.
calculates mean bam coverage and prints a bed of the regions covered more than FACTOR*(mean-coverage) : htslib sam bam coverage depth mask
/**
Author: Pierre Lindenbaum PhD. 2019
This tools calculate the mean depth of a bam (ignoring the non-covered bases)
and output a BED file of the regions covered more than mean-depth*FACTOR
compilation: gcc -o a.out -O3 -Wall -I../htslib highcov.c -L../htslib -lm -lpthread -lhts -lz -llzma -lbz2
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <htslib/sam.h>
#define FACTOR 4.0
struct StepInfo
{
int min_mapQ;
htsFile* fp;
bam_hdr_t* hdr;
};
// This function reads a BAM alignment from one BAM file.
static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup
{
struct StepInfo *info = (struct StepInfo*)data;
int ret = -1;
for(;;)
{
ret = sam_read1(info->fp, info->hdr, b);
if ( ret<0 ) break;
if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
if ( (int)b->core.qual < info->min_mapQ ) continue;
break;
}
return ret;
}
#define WHERE fprintf(stderr,"%d\n",__LINE__)
#define BUFFER_SIZE 2000000
int main(int argc,char**argv) {
int step;
char* fn=NULL;
bam_mplp_t mplp;
struct StepInfo info;
info.min_mapQ = 10;
double mean_cov = 0.0;
char* buffer ;
if(argc!=2) {
fprintf(stderr,"Usage: %s file.bam > cov.bed\n",argv[0]);
return EXIT_FAILURE;
}
fn = argv[1];
buffer= malloc(sizeof(char)*BUFFER_SIZE);
if(buffer==NULL) {
fprintf(stderr,"out of memory\n");
return EXIT_FAILURE;
}
setvbuf ( stdout , buffer , _IOFBF ,BUFFER_SIZE);
for(step=0;step<2; ++step) //step 0 : get min/max/mean
{
struct StepInfo* info_ptr=&info;
const bam_pileup1_t **plp = (const bam_pileup1_t**)calloc(1, sizeof(bam_pileup1_t*));
int tid = 0;
int pos = 0;
int depth = 0;
info.fp = hts_open(fn, "rb");
if(info.fp==NULL) {
fprintf(stderr,"Cannot open %s %s",fn,strerror(errno));
return EXIT_FAILURE;
}
info.hdr = sam_hdr_read(info.fp);
if(info.hdr==NULL) {
fprintf(stderr,"Cannot read header for %s %s",fn,strerror(errno));
return EXIT_FAILURE;
}
mplp = bam_mplp_init(1, read_bam, (void**)&info_ptr); // initialization
if(step==0)
{
double sum_cov = 0.0;
long count_reads = 0;
while(bam_mplp_auto(mplp,&tid,&pos,&depth,plp)>0)
{
sum_cov+=depth;
++count_reads;
}
mean_cov = sum_cov/count_reads;
fprintf(stderr,"%s\t%ld\t%f\n",fn,count_reads,mean_cov);
}
else
{
int max_depth = (int)(mean_cov * FACTOR );
int bed_tid = -1;
int bed_start = -1;
int bed_end = -1;
for(;;)
{
int ret = bam_mplp_auto(mplp,&tid,&pos,&depth,plp);
if(ret<=0 || /* EOF */
depth < max_depth || /* Not high-cov region, close the bed */
(bed_tid>=0 && bed_tid != tid) || /* new contig */
(bed_tid>=0 && bed_end!=pos) /*closed bed */
) {
if(bed_tid>=0 && bed_start<bed_end) {
fputs(info.hdr->target_name[bed_tid],stdout);\
printf("\t%d", bed_start);\
printf("\t%d\n", bed_end);
bed_tid = -1;
bed_start = -1;
bed_end = -1;
}
if(ret<=0) break;
}
if(depth >= max_depth)
{
if(bed_tid==-1) {
bed_tid = tid;
bed_start = pos;
}
bed_end = pos+1;
}
}
}
bam_mplp_destroy(mplp);
bam_hdr_destroy(info.hdr);
hts_close(info.fp);
free(plp);
}
fflush(stdout);
fclose(stdout);
free(buffer);
return EXIT_SUCCESS;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment