|
#include <stdlib.h> |
|
#include <string.h> |
|
#include <stdio.h> |
|
#include "htslib/bgzf.h" |
|
#include "htslib/sam.h" |
|
|
|
typedef struct { |
|
uint64_t off, a_st, a_en; |
|
uint32_t cnt, n_bytes; |
|
} fsi1_t; |
|
|
|
typedef struct { |
|
uint32_t size, n_ctg; |
|
uint64_t *acc_len; |
|
uint32_t n, m; |
|
fsi1_t *a; |
|
} fsi_t; |
|
|
|
static fsi_t *read_header(BGZF *fp) |
|
{ |
|
fsi_t *fsi; |
|
int i; |
|
bam_hdr_t *h; |
|
if ((h = bam_hdr_read(fp)) == 0) return 0; // fail to read the header |
|
if (bgzf_tell(fp) & 0xffff) return 0; // the header is not in separate block(s) |
|
fsi = (fsi_t*)calloc(1, sizeof(fsi_t)); |
|
fsi->n_ctg = h->n_targets; |
|
fsi->acc_len = (uint64_t*)calloc(fsi->n_ctg + 1, 8); |
|
for (i = 0; i < fsi->n_ctg; ++i) |
|
fsi->acc_len[i+1] = fsi->acc_len[i] + h->target_len[i]; |
|
bam_hdr_destroy(h); |
|
return fsi; |
|
} |
|
|
|
#define kv_push(type, v, x) do { \ |
|
if ((v).n == (v).m) { \ |
|
(v).m = (v).m? (v).m<<1 : 2; \ |
|
(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ |
|
} \ |
|
(v).a[(v).n++] = (x); \ |
|
} while (0) |
|
|
|
fsi_t *fsi_build(BGZF *fp, int size) |
|
{ |
|
fsi_t *fsi; |
|
bam1_t *b; |
|
uint64_t off; |
|
fsi1_t t; |
|
int ret; |
|
|
|
if ((fsi = read_header(fp)) == 0) return 0; |
|
fsi->size = size; |
|
memset(&t, 0, sizeof(fsi1_t)); |
|
off = bgzf_tell(fp); |
|
b = bam_init1(); |
|
while ((ret = bam_read1(fp, b)) >= 0) { |
|
fsi1_t s; |
|
s.a_st = b->core.tid < 0? fsi->acc_len[fsi->n_ctg] : fsi->acc_len[b->core.tid] + b->core.pos; |
|
s.a_en = b->core.tid < 0? s.a_st : s.a_st + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); |
|
s.off = off>>16, s.n_bytes = ret, s.cnt = 1; |
|
if (t.n_bytes == 0 || (t.n_bytes > size && (off&0xffff) == 0)) { |
|
if (t.n_bytes > size) kv_push(fsi1_t, *fsi, t); |
|
t = s; |
|
} else { |
|
t.a_en = t.a_en > s.a_en? t.a_en : s.a_en; |
|
++t.cnt; |
|
t.n_bytes += s.n_bytes; |
|
} |
|
off = bgzf_tell(fp); |
|
} |
|
kv_push(fsi1_t, *fsi, t); |
|
t.off = off>>16, t.a_st = t.a_en = fsi->acc_len[fsi->n_ctg], t.n_bytes = t.cnt = 0; |
|
kv_push(fsi1_t, *fsi, t); |
|
bam_destroy1(b); |
|
return fsi; |
|
} |
|
|
|
void fsi_print(const fsi_t *fsi) |
|
{ |
|
int i; |
|
printf("N\t%d\t%d\n", fsi->n_ctg, fsi->n); |
|
for (i = 0; i < fsi->n_ctg; ++i) |
|
printf("L\t%lld\n", (long long)(fsi->acc_len[i+1] - fsi->acc_len[i])); |
|
for (i = 0; i < fsi->n; ++i) { |
|
fsi1_t *p = &fsi->a[i]; |
|
printf("I\t%lld\t%lld\t%d\t%u\t%u\n", (long long)p->off, (long long)p->a_st, (int)(p->a_en - p->a_st), p->cnt, p->n_bytes); |
|
} |
|
} |
|
|
|
void fsi_destroy(fsi_t *fsi) |
|
{ |
|
free(fsi->a); free(fsi->acc_len); free(fsi); |
|
} |
|
|
|
#include <unistd.h> |
|
|
|
int main(int argc, char *argv[]) |
|
{ |
|
int c, size = 1000000; |
|
fsi_t *fsi; |
|
BGZF *fp; |
|
|
|
while ((c = getopt(argc, argv, "s:")) >= 0) |
|
if (c == 's') size = atoi(optarg); |
|
if (argc - optind < 1) { |
|
fprintf(stderr, "Usage: fsi [-s %d] <in.bam>\n", size); |
|
return 1; |
|
} |
|
fp = bgzf_open(argv[optind], "r"); |
|
fsi = fsi_build(fp, size); |
|
fsi_print(fsi); |
|
fsi_destroy(fsi); |
|
return 0; |
|
} |