Created
July 4, 2024 15:52
-
-
Save shapr/c76008cd4ab46f253f3aeac2130b8b7c to your computer and use it in GitHub Desktop.
byte pair encoding from http://www.pennelynn.com/Documents/CUJ/HTML/94HTML/19940045.HTM
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* compress.c */ | |
/* Copyright 1994 by Philip Gage */ | |
#include <stdio.h> | |
#define BLOCKSIZE 5000 /* Maximum block size */ | |
#define HASHSIZE 4096 /* Size of hash table */ | |
#define MAXCHARS 200 /* Char set per block */ | |
#define THRESHOLD 3 /* Minimum pair count */ | |
unsigned char buffer[BLOCKSIZE]; /* Data block */ | |
unsigned char leftcode[256]; /* Pair table */ | |
unsigned char rightcode[256]; /* Pair table */ | |
unsigned char left[HASHSIZE]; /* Hash table */ | |
unsigned char right[HASHSIZE]; /* Hash table */ | |
unsigned char count[HASHSIZE]; /* Pair count */ | |
int size; /* Size of current data block */ | |
/* Function prototypes */ | |
int lookup (unsigned char, unsigned char); | |
int fileread (FILE *); | |
void filewrite (FILE *); | |
void compress (FILE *, FILE *); | |
/* Return index of character pair in hash table */ | |
/* Deleted nodes have count of 1 for hashing */ | |
int lookup (unsigned char a, unsigned char b) | |
{ | |
int index; | |
/* Compute hash key from both characters */ | |
index= (a ^ (b << 5)) & (HASHSIZE-1); | |
/* Search for pair or first empty slot */ | |
while ((left[index] != a || right[index] != b) && | |
count[index] != 0) | |
index = (index + 1) & (HASHSIZE-1); | |
/* Store pair in table */ | |
left[index] = a; | |
right[index]= b; | |
return index; | |
} | |
/* Read next block from input file into buffer */ | |
int fileread (FILE *input) | |
{ | |
int c, index, used=0; | |
/* Reset hash table and pair table */ | |
for (c = 0; c < HASHSIZE; c++) | |
count[c] = 0; | |
for (c = 0; c < 256; c++) { | |
leftcode[c] = c; | |
rightcode[c] = 0; | |
} | |
size= 0; | |
/* Read data until full or few unused chars */ | |
while (size < BLOCKSIZE && used < MAXCHARS && | |
(c = getc(input)) != EOF) { | |
if (size > 0) { | |
index = lookup(buffer[size-1],c); | |
if (count[index] < 255) ++count[index]; | |
} | |
buffer[size++] = c; | |
/* Use rightcode to flag data chars found */ | |
if (!rightcode[c]) { | |
rightcode[c] = 1; | |
used++; | |
} | |
} | |
return c == EOF; | |
} | |
/* Write each pair table and data block to output */ | |
void filewrite (FILE *output) | |
{ | |
int i, len, c = 0; | |
/* For each character 0..255 */ | |
while (c < 256) { | |
/* If not a pair code, count run of literals */ | |
if (c == leftcode[c]) { | |
len = 1; c++; | |
while (len<127 && c<256 && c==leftcode[c]) { | |
len++; c++; | |
} | |
putc(len + 127,output); len = 0; | |
if (c == 256) break; | |
} | |
/* Else count run of pair codes */ | |
else { | |
len = 0; c++; | |
while (len<127 && c<256 && c!=leftcode[c] || | |
len<125 && c<254 && c+1!=leftcode[c+1]) { | |
len++; c++; | |
} | |
putc(len,output); | |
c -= len + 1; | |
} | |
/* Write range of pairs to output */ | |
for (i = 0; i <= len; i++) { | |
putc(leftcode[c],output); | |
if (c != leftcode[c]) | |
putc(rightcode[c],output); | |
c++; | |
} | |
} | |
/* Write size bytes and compressed data block */ | |
putc(size/256,output); | |
putc(size%256,output); | |
fwrite(buffer,size,1,output); | |
} | |
/* Compress from input file to output file */ | |
void compress (FILE *infile, FILE *outfile) | |
{ | |
int leftch, rightch, code, oldsize; | |
int index, r, w, best, done = 0; | |
/* Compress each data block until end of file */ | |
while (!done) { | |
done = fileread(infile); | |
code = 256; | |
/* Compress this block */ | |
for (;;) { | |
/* Get next unused char for pair code */ | |
for (code--; code >= 0; code--) | |
if (code==leftcode[code] && !rightcode[code]) | |
break; | |
/* Must quit if no unused chars left */ | |
if (code < 0) break; | |
/* Find most frequent pair of chars */ | |
for (best=2, index=0; index<HASHSIZE; index++) | |
if (count[index] > best) { | |
best = count[index]; | |
leftch = left[index]; | |
rightch = right[index]; | |
} | |
/* Done if no more compression possible */ | |
if (best < THRESHOLD) break; | |
/* Replace pairs in data, adjust pair counts */ | |
oldsize = size - 1; | |
for (w = 0, r = 0; r < oldsize; r++) { | |
if (buffer[r] == leftch && | |
buffer[r+1] == rightch) { | |
if (r > 0) { | |
index = lookup(buffer[w-1],leftch); | |
if (count[index] > 1) --count[index]; | |
index = lookup(buffer[w-1],code); | |
if (count[index] < 255) ++count[index]; | |
} | |
if (r < oldsize - 1) { | |
index = lookup(rightch,buffer[r+2]); | |
if (count[index] > 1) --count[index]; | |
index = lookup(code,buffer[r+2]); | |
if (count[index] < 255) ++count[index]; | |
} | |
buffer[w++] = code; | |
r++; size--; | |
} | |
else buffer[w++] = buffer[r]; | |
} | |
buffer[w] = buffer[r]; | |
/* Add to pair substitution table */ | |
leftcode[code] = leftch; | |
rightcode[code] = rightch; | |
/* Delete pair from hash table */ | |
index = lookup(leftch,rightch); | |
count[index] = 1; | |
} | |
filewrite(outfile); | |
} | |
} | |
void main (int argc, char *argv[]) | |
{ | |
FILE *infile, *outfile; | |
if (argc != 3) | |
printf("Usage: compress infile outfile\n"); | |
else if ((infile=fopen(argv[1],"rb"))==NULL) | |
printf("Error opening input %s\n",argv[1]); | |
else if ((outfile=fopen(argv[2],"wb"))==NULL) | |
printf("Error opening output %s\n",argv[2]); | |
else { | |
compress(infile,outfile); | |
fclose(outfile); | |
fclose(infile); | |
} | |
} | |
/*End of File */ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment