Created
May 12, 2019 03:18
-
-
Save mwgamera/6ede3366307cbcd34fea9b4773f37378 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
SeekGzip <https://github.com/chokkan/seekgzip/> provides a convenient
API wrapping Mark Adler's example code for random access to GZIP files
and defines a simple format to store the index. However, it directly
calls Adler's extract function for every read, which is very slow
when reading a contiguous sequence, as it restarts reading from the last
seek point every time, processing the same data over and over.
This is a drop-in replacement for the reading part of the SeekGzip
implementation that keeps the whole state around and uses the index
only when it's helpful.
All good ideas and parts of the code are by Mark Adler and Naoaki Okazaki;
bugs, lousy naming, and illegible parts are mine.
klg, May 2019
*/
#include <assert.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <zlib.h> | |
#include "seekgzip.h" | |
#ifndef BUFSIZE | |
#define BUFSIZE 16384 | |
#endif | |
#define WINSIZE 32768U | |
struct seekpoint { | |
off_t out; /* uncompressed offset */ | |
off_t in; int bits; /* compressed offset */ | |
unsigned char window[WINSIZE]; /* saved window */ | |
}; | |
/* Load array of seek points from SeekGzip index file, | |
return the number of allocated entries or zero on error. */ | |
static uint32_t zsek_load(const char *filename, | |
struct seekpoint **idx, int *error) { | |
gzFile gz; | |
uint32_t i, k; | |
*idx = NULL; | |
*error = SEEKGZIP_OPENERROR; | |
gz = gzopen(filename, "rb"); | |
if (!gz) | |
goto error; | |
*error = SEEKGZIP_IMCOMPATIBLE; /*sic*/ | |
if (gzgetc(gz) != 'Z') goto error; | |
if (gzgetc(gz) != 'S') goto error; | |
if (gzgetc(gz) != 'E') goto error; | |
if (gzgetc(gz) != 'K') goto error; | |
if (gzread(gz, &k, sizeof k) != sizeof k) | |
goto error; | |
if (k != sizeof(off_t)) | |
goto error; | |
if (gzread(gz, &k, sizeof k) != sizeof k) | |
goto error; | |
if (!k) | |
goto error; | |
*error = SEEKGZIP_OUTOFMEMORY; | |
*idx = malloc(k * sizeof **idx); | |
if (!*idx) | |
goto error; | |
for (i = 0; i < k; i++) { | |
struct seekpoint p; | |
gzread(gz, &p.out, sizeof p.out); | |
gzread(gz, &p.in, sizeof p.in); | |
gzread(gz, &p.bits, sizeof p.bits); | |
gzread(gz, &p.window, sizeof p.window); | |
(*idx)[i] = p; | |
} | |
*error = SEEKGZIP_ZLIBERROR; | |
if (gzclose(gz)) { | |
gz = NULL; | |
goto error; | |
} | |
*error = SEEKGZIP_SUCCESS; | |
return k; | |
error: | |
if (gz) | |
gzclose(gz); | |
if (*idx) | |
free(*idx); | |
return 0; | |
} | |
/* Find index of last seek point before given offset. */ | |
static int zsek_find(struct seekpoint *idx, uint32_t len, off_t off) { | |
int base = 0; | |
while (len) { | |
int m = len >> 1; | |
if (off < idx[base+m].out) { | |
len = m; | |
} else { | |
base += m + 1; | |
len -= m + 1; | |
} | |
} | |
return base - 1; | |
} | |
/* Initialize file and z_stream to read from given seek point. */ | |
static int zsek_initat(FILE *fp, z_stream *zs, struct seekpoint *sp) { | |
zs->zalloc = Z_NULL; | |
zs->zfree = Z_NULL; | |
zs->opaque = Z_NULL; | |
zs->avail_in = 0; | |
switch (inflateInit2(zs, -15)) { | |
case Z_OK: | |
break; | |
case Z_MEM_ERROR: | |
return SEEKGZIP_OUTOFMEMORY; | |
default: | |
return SEEKGZIP_ERROR; | |
} | |
if (fseeko(fp, sp->in - !!sp->bits, SEEK_SET)) | |
goto error; | |
if (sp->bits) { | |
int c = getc(fp); | |
if (c < 0) | |
goto error; | |
inflatePrime(zs, sp->bits, c >> (8 - sp->bits)); | |
} | |
inflateSetDictionary(zs, sp->window, WINSIZE); | |
return SEEKGZIP_SUCCESS; | |
error: | |
inflateEnd(zs); | |
if (ferror(fp)) | |
return SEEKGZIP_READERROR; | |
return SEEKGZIP_DATAERROR; | |
} | |
/* Inflate data using given input and output buffers. */ | |
static int zsek_read(FILE *fp, z_stream *zs, | |
unsigned char *inbuf, size_t insize, | |
unsigned char *outbuf, size_t outsize, | |
size_t *bytesread) { | |
*bytesread = 0; | |
zs->avail_out = outsize; | |
zs->next_out = outbuf; | |
do { | |
int ret; | |
if (!zs->avail_in) { | |
zs->next_in = inbuf; | |
zs->avail_in = fread(inbuf, 1, insize, fp); | |
if (!zs->avail_in) | |
return SEEKGZIP_DATAERROR; | |
if (ferror(fp)) | |
return SEEKGZIP_ERROR; | |
} | |
ret = inflate(zs, Z_NO_FLUSH); | |
*bytesread = outsize - zs->avail_out; | |
switch (ret) { | |
case Z_OK: | |
continue; | |
case Z_STREAM_END: | |
return SEEKGZIP_SUCCESS; | |
case Z_BUF_ERROR: | |
case Z_STREAM_ERROR: | |
return SEEKGZIP_ZLIBERROR; | |
case Z_DATA_ERROR: | |
case Z_NEED_DICT: | |
return SEEKGZIP_DATAERROR; | |
case Z_MEM_ERROR: | |
return SEEKGZIP_OUTOFMEMORY; | |
} | |
} while (zs->avail_out); | |
return SEEKGZIP_SUCCESS; | |
} | |
struct tag_seekgzip { | |
FILE *fp; /* original compressed file */ | |
uint32_t nseek, rseek; /* number of seek points, and last used seek point */ | |
struct seekpoint *seektbl; /* seek table */ | |
off_t offset, roff; /* current offset, and requested offset to use next */ | |
int error; /* error code */ | |
z_stream strm; /* saved stream state used in last read */ | |
unsigned char buf[BUFSIZE]; /* compressed input buffer */ | |
}; | |
/* Build a freshly malloc'ed string holding `a` immediately followed by
   `b`.  The caller owns the result and must free() it.  Returns NULL
   when the allocation fails. */
static char *mallocat(const char *a, const char *b) {
  const size_t head = strlen(a);
  const size_t tail = strlen(b);
  char *joined = malloc(head + tail + 1);

  if (joined != NULL) {
    memcpy(joined, a, head);
    memcpy(joined + head, b, tail);
    joined[head + tail] = '\0';
  }
  return joined;
}
/* SeekGzip public API */ | |
seekgzip_t *seekgzip_open(const char *filename, int *err) { | |
struct tag_seekgzip *zs = NULL; | |
FILE *fp; | |
char *idx_path; | |
int ret; | |
ret = SEEKGZIP_OPENERROR; | |
fp = fopen(filename, "rb"); | |
if (!fp) | |
goto error; | |
ret = SEEKGZIP_OUTOFMEMORY; | |
zs = malloc(sizeof *zs); | |
if (!zs) | |
goto error; | |
zs->fp = fp; | |
zs->rseek = zs->nseek = 0; | |
idx_path = mallocat(filename, ".idx"); | |
if (!idx_path) | |
goto error; | |
zs->nseek = zsek_load(idx_path, &zs->seektbl, &ret); | |
free(idx_path); | |
if (!zs->nseek) | |
goto error; | |
zs->roff = zs->offset = zs->seektbl->out; | |
ret = zsek_initat(zs->fp, &zs->strm, zs->seektbl); | |
if (ret) | |
goto error; | |
zs->error = 0; | |
if (err) | |
*err = 0; | |
return zs; | |
error: | |
if (fp) | |
fclose(fp); | |
if (zs) { | |
if (zs->nseek) | |
free(zs->seektbl); | |
free(zs); | |
} | |
if (err) | |
*err = ret; | |
return NULL; | |
} | |
/* Release everything owned by the handle: the inflate stream, the
   compressed file, the seek table and the handle itself.
   Passing NULL is allowed and does nothing. */
void seekgzip_close(seekgzip_t *zs) {
  if (zs == NULL)
    return;

  inflateEnd(&zs->strm);

  if (zs->fp != NULL) {
    fclose(zs->fp);
    zs->fp = NULL;
  }

  if (zs->nseek != 0) {
    free(zs->seektbl);
    zs->nseek = 0;
  }

  free(zs);
}
/* Read up to `size` bytes of uncompressed data, starting at the offset
   requested with seekgzip_seek() or, by default, where the previous
   read stopped.

   Returns the number of bytes stored in `buf`; this is less than `size`
   only when the end of the compressed stream is reached, and 0 at or
   past the end.  Returns -1 on error, with the code available from
   seekgzip_error().  A negative `size` is an error; a `size` of zero
   returns 0 without touching the stream. */
int seekgzip_read(seekgzip_t *zs, void *buf, int size) {
  int ret;
  size_t s;

  /* A negative size would turn into a huge size_t below. */
  if (size < 0) {
    zs->error = SEEKGZIP_ERROR;
    return -1;
  }
  if (size == 0)
    return 0;

  /* Use seek table only if needed or allows skipping some data,
     otherwise keep current context and continue from that. */
  if (zs->roff < zs->offset || (zs->rseek + 1 < zs->nseek &&
        zs->seektbl[zs->rseek + 1].out <= zs->roff)) {
    int sp = zsek_find(zs->seektbl, zs->nseek, zs->roff);
    /* The cast also catches -1, i.e. an offset before the first point. */
    if ((unsigned)sp >= zs->nseek) {
      zs->error = SEEKGZIP_ERROR;
      return -1;
    }
    inflateEnd(&zs->strm);
    ret = zsek_initat(zs->fp, &zs->strm, zs->seektbl + sp);
    if (ret)
      goto error;
    zs->offset = zs->seektbl[sp].out;
    zs->rseek = sp;
  }
  assert(zs->roff >= zs->offset);

  /* Read and discard anything that needs to be skipped. */
  while (zs->roff > zs->offset) {
    unsigned char discard[WINSIZE];
    s = sizeof discard;
    if (zs->roff - zs->offset < (ssize_t)s)
      s = zs->roff - zs->offset;
    ret = zsek_read(zs->fp, &zs->strm,
                    zs->buf, sizeof zs->buf,
                    discard, s, &s);
    zs->offset += s;
    if (ret)
      goto error;
    if (!s) {
      /* Seeking past EOF: the stream ended before the requested offset.
         Report end of data and continue from where the data ends.  The
         stdio EOF flag is deliberately not consulted, since it need not
         be set even though the compressed stream is finished. */
      zs->roff = zs->offset;
      return 0;
    }
  }
  assert(zs->roff == zs->offset);

  /* Finally, satisfy read request. */
  s = (size_t)size;
  ret = zsek_read(zs->fp, &zs->strm,
                  zs->buf, sizeof zs->buf,
                  buf, s, &s);
  zs->offset += s;
  zs->roff = zs->offset;
  if (ret)
    goto error;
  return (int)s;

error:
  zs->error = ret;
  return -1;
}
/* Set the uncompressed offset that the next seekgzip_read() starts at.
   Only the target is recorded here; the stream is repositioned by the
   read itself. */
void seekgzip_seek(seekgzip_t *zs, off_t offset) {
  zs->roff = offset;
}
/* Return the uncompressed offset that the next seekgzip_read() will
   start at. */
off_t seekgzip_tell(seekgzip_t *zs) {
  const off_t next = zs->roff;
  return next;
}
/* Return the error code stored in the handle.  It is 0 after a
   successful open and is overwritten by each failing seekgzip_read();
   successful reads leave it unchanged. */
int seekgzip_error(seekgzip_t *zs) {
  const int code = zs->error;
  return code;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment