Last active
May 14, 2024 19:23
-
-
Save jmarshall/11fac803b1be9e1ce889b858879ebe2d to your computer and use it in GitHub Desktop.
Display BGZF compression statistics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* bgzfsize.c -- Display uncompressed sizes of BGZF files | |
Copyright (C) 2009 John Marshall | |
*/ | |
#include <string.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdarg.h> | |
#include <inttypes.h> | |
#include <errno.h> | |
#include <fcntl.h> | |
#include <unistd.h> | |
const char usage[] = | |
"Usage: bgzfsize [-v] FILE...\n" | |
"\n" | |
"Displays the uncompressed size of each specified BGZF (e.g., BAM) file,\n" | |
"by totalling the counts in the trailer of each BGZF block in the files.\n" | |
"If more than one file is specified, each filename is also displayed\n" | |
"alongside its corresponding file size.\n" | |
"\n" | |
"When -v is given, displays the compressed size, uncompressed size, and\n" | |
"compression ratio for each specified BGZF file and also for each BGZF block\n" | |
"within each file.\n"; | |
#if !defined(O_BINARY) && defined(_O_BINARY) | |
#define O_BINARY _O_BINARY | |
#elif !defined(O_BINARY) | |
#define O_BINARY 0 | |
#endif | |
static int verbose; | |
uint16_t to_uint16(void *pv) { | |
uint8_t *p = (uint8_t *)pv; | |
uint16_t k0 = p[0], k1 = p[1]; | |
return (k1 << 8) | k0; | |
} | |
uint32_t to_uint32(void *pv) { | |
uint8_t *p = (uint8_t *)pv; | |
uint32_t k0 = p[0], k1 = p[1], k2 = p[2], k3 = p[3]; | |
return (k3 << 24) | (k2 << 16) | (k1 << 8) | k0; | |
} | |
void print_data(intmax_t compressed, intmax_t uncompressed, int count, | |
const char* fname) { | |
if (verbose) { | |
double ratio = (uncompressed == 0)? 0 : | |
(uncompressed - compressed) / (double) uncompressed; | |
printf("%"PRIdMAX"\t%"PRIdMAX"\t%4.1f%%", compressed, uncompressed, | |
100.0 * ratio); | |
} | |
else | |
printf("%"PRIdMAX, uncompressed); | |
if (fname) printf("\t%s", fname); | |
if (verbose && count >= 0) printf(" (%d blocks)", count); | |
printf("\n"); | |
} | |
int fail(int use_perror, const char *format, ...) { | |
const char *message = strerror(errno); | |
va_list args; | |
fprintf(stderr, "bgzfsize: "); | |
va_start(args, format); | |
vfprintf(stderr, format, args); | |
va_end(args); | |
if (use_perror) | |
fprintf(stderr, ": %s\n", message); | |
else | |
fprintf(stderr, "\n"); | |
return 0; | |
} | |
#define HEADER_LEN 18 | |
#define FOOTER_LEN 8 | |
int count_blocks(int fd, const char *fname, int show_fname) { | |
char buffer[FOOTER_LEN + HEADER_LEN]; | |
char* const header = &buffer[FOOTER_LEN]; | |
size_t n = FOOTER_LEN; | |
intmax_t btotal = 0, itotal = 0; | |
ssize_t nread; | |
uint32_t bsize, isize; | |
int i, blockcount = 0; | |
for (i = 0; i < sizeof buffer; i++) buffer[i] = '\0'; | |
while (1) { | |
if (n < FOOTER_LEN + HEADER_LEN) { | |
ssize_t nread = read(fd, &buffer[n], sizeof buffer - n); | |
if (nread < 0) return fail(1, "reading \"%s\" failed", fname); | |
else if (nread == 0) break; | |
else n += nread; | |
} | |
if (! (n >= sizeof buffer && | |
header[0] == '\x1f' && header[1] == '\x8b' && | |
header[12] == '\x42' && header[13] == '\x43')) | |
return fail(0, "desynchronised at offset %"PRIuMAX" in \"%s\"", | |
(uintmax_t) lseek(fd, 0, SEEK_CUR), fname); | |
bsize = to_uint16(&header[16]) + 1; | |
if (lseek(fd, bsize - sizeof buffer, SEEK_CUR) < 0) | |
return fail(1, "seeking forward in \"%s\" failed", fname); | |
nread = read(fd, buffer, sizeof buffer); | |
if (nread < 0) | |
return fail(1, "reading \"%s\" failed", fname); | |
else if (nread < FOOTER_LEN) | |
return fail(0, "footer truncated in \"%s\"", fname); | |
else | |
n = nread; | |
isize = to_uint32(&buffer[4]); | |
itotal += isize; | |
/* Don't display the block header/footer overhead. */ | |
/* bsize -= FOOTER_LEN + HEADER_LEN; ...or maybe do */ | |
if (verbose) print_data(bsize, isize, -1, NULL); | |
btotal += bsize; | |
blockcount++; | |
} | |
print_data(btotal, itotal, blockcount, show_fname? fname : NULL); | |
return 1; | |
} | |
int main(int argc, char** argv) { | |
int i, status; | |
if (argc < 2) { | |
fputs(usage, stderr); | |
return EXIT_FAILURE; | |
} | |
else if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0) { | |
fputs(usage, stdout); | |
return EXIT_SUCCESS; | |
} | |
verbose = 0; | |
status = EXIT_SUCCESS; | |
for (i = 1; i < argc; i++) { | |
if (strcmp(argv[i], "-v") == 0) | |
verbose = 1; | |
else { | |
int fd = open(argv[i], O_RDONLY | O_BINARY); | |
if (fd >= 0) { | |
if (! count_blocks(fd, argv[i], (argc > 2))) | |
status = EXIT_FAILURE; | |
close(fd); | |
} | |
else { | |
fail(1, "can't open \"%s\"", argv[i]); | |
status = EXIT_FAILURE; | |
} | |
} | |
} | |
return status; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment