Created
February 11, 2013 05:29
-
-
Save danielrothfus/4752869 to your computer and use it in GitHub Desktop.
A very simple Huffman implementation I wrote for bit-by-bit I/O practice.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** \file derhuff.c | |
\author Daniel Rothfus | |
\version 1.0 | |
\brief A really simple Huffman implementation. | |
Derhuff is a simple utility to compress files using the Huffman coding scheme. | |
To compress a file, it analyzes the frequencies of all the bytes in the file | |
and builds a tree using these frequencies, which it writes to the output. With | |
this tree, the program then converts each byte of the input to the path to get | |
to the corresponding leaf in the tree. To decode, it just reads the tree from | |
the file and uses that to translate each path back to its corresponding byte. | |
This scheme works best on files with uneven distributions of character | |
frequencies, such as plain text. To see how to use this program, pass the -h | |
or --help flag to the executable. This program is released under the | |
zlib/libpng license included below: | |
Copyright (c) 2012 Daniel Rothfus | |
This software is provided 'as-is', without any express or implied warranty. In | |
no event will the authors be held liable for any damages arising from the use | |
of this software. | |
Permission is granted to anyone to use this software for any purpose, including | |
commercial applications, and to alter it and redistribute it freely, subject to | |
the following restrictions: | |
1. The origin of this software must not be misrepresented; you must not claim | |
that you wrote the original software. If you use this software in a product, | |
an acknowledgment in the product documentation would be appreciated but is | |
not required. | |
2. Altered source versions must be plainly marked as such, and must not be | |
misrepresented as being the original software. | |
3. This notice may not be removed or altered from any source distribution. */ | |
// Includes ------------------------------------------------------------------- | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <getopt.h> | |
#include <stdint.h> | |
// Types ----------------------------------------------------------------------- | |
/** \brief This type is used for representing offsets within a file and the size | |
of a file itself. */ | |
typedef uint64_t file_size; | |
/** \brief This type is used for direct lookup from a character to their huffman | |
equivalent. */ | |
typedef struct{ | |
uint8_t code[8]; | |
uint8_t length; | |
} translate_buffer; | |
/// \brief This is the basic data type that makes up a tree. */ | |
typedef struct node{ | |
struct node *on, *off; | |
int byte; | |
file_size occurrences; | |
} node; | |
// Prototypes ----------------------------------------------------------------- | |
/** \brief Print the help for this program. | |
\param programName The name of the current program as it was called from the | |
command line. */ | |
void PrintHelp(const char *programName); | |
/** \brief Swap a 64 bit value to/from network byte order if it is necessary. | |
\param value The value to swap the endianness of. | |
\return This returns the network byte order version of the host value or the | |
host version of the network byte order value. */ | |
uint64_t ToFromNetwork64(uint64_t value); | |
/** \brief This organizes a tree structure inside of a buffer of nodes. | |
\param bufStart The first node in the buffer containing the letter frequencies. | |
These nodes will reference each other such that a tree is created in the | |
buffer. | |
\paraom totalOcurrences The total number of units found of all types in the | |
input. | |
\return This returns a pointer to the root of the tree. */ | |
node *OrganizeTree(node *bufStart, file_size totalOcurrences); | |
/** \brief Write the encoding tree to the file. | |
\param output The file to write the tree to. | |
\return This returns an exit status for the program - 0 on success. */ | |
char WriteTree(FILE *output, node *treeRoot); | |
/** \brief Read a tree used for decoding from a file. | |
\param output The file to read the tree from. | |
\param nodeBuf A buffer of at least 511 nodes to write the tree to. | |
\param treeRoot A pointer to a pointer to assign to the root of the tree. | |
\return This returns an exit status for the program - 0 on success. */ | |
char ReadTree(FILE *input, node *nodeBuf, node **treeRoot); | |
/** \brief Compress a file and write the compressed file to the output file. | |
\param input The file to compress. | |
\param output The file to write the compressed data to. | |
\return This returns an exit code for the application. */ | |
char CompressFile(FILE *input, FILE *output); | |
/** \brief Decompress a given file to another file. | |
\param input The file to decompress. | |
\param output The file to put the decompressed data into. | |
\return This returns an exit code for the application. */ | |
char DecompressFile(FILE *input, FILE *output); | |
/** \brief Read all the frequencies of all the characters from the file from the | |
current point to the end. | |
\param file The file to read all the letter frequencies from. | |
\param counts A pointer to a buffer of at least 256 file size indicators, which | |
will be used to indicate the number of each byte. */ | |
file_size ReadFrequencies(FILE *file, node *counts); | |
/** \brief Initialize all the nodes in a buffer to the correct states. | |
\param allNodes A pointer to the buffer of at least 511 nodes. */ | |
void InitNodes(node *allNodes); | |
/** \brief This is a debug utility that prints out a tree to the standard output | |
in dot format, using the graph name test. | |
\param rootNode The root node of the tree to print. */ | |
void PrintTree(node *rootNode); | |
/** \brief A debug utility that prints the translate buffers to standard output. | |
\param buffs A point to a buffer of at least 256 translate_buffers. */ | |
void PrintTranslateBuffers(translate_buffer *buffers); | |
/** \brief Build the translate buffer for going directly from characters to | |
huffman equivalents. | |
\param rootNode The root node of the tree to convert. | |
\param buffers The translate buffers to fill with equivalents. There must be | |
size for at least 256 translate_buffers in the buffer. */ | |
void BuildTranslateBuffers(node *rootNode, translate_buffer *buffers); | |
// Constants ------------------------------------------------------------------- | |
/// \brief An array to tell getopt_long how to handle long program arguments | |
static const struct option progOptsLong[] = { | |
{ "compress", no_argument, NULL, 'c' }, | |
{ "decompress", no_argument, NULL, 'd' }, | |
{ "output", required_argument, NULL, 'o' }, | |
{ "help", no_argument, NULL, 'h' }, | |
{ NULL, no_argument, NULL, 0 } | |
}; | |
/// \brief A string to tell getopt_long how to handle short command line input | |
static const char *progOptsStr = "cdo:h"; | |
/// \brief A simple version number to allow for changes to the format later | |
static const uint8_t versionByte = 1; | |
// Implementations ------------------------------------------------------------- | |
/** \brief The entry point of the application. | |
\param argc The number of arguments. | |
\param argv The arguments to the application. */ | |
int main(int argc, char * argv[]){ | |
int option, programStatus = EXIT_SUCCESS; | |
char compress = 0, decompress = 0; | |
FILE *output = stdout, *input = NULL; | |
// Parse through option arguments | |
while((option = getopt_long(argc, argv, progOptsStr, progOptsLong, NULL)) != -1){ | |
switch(option){ | |
case 'c': | |
compress = 1; | |
break; | |
case 'd': | |
decompress = 1; | |
break; | |
case 'o': | |
output = fopen(optarg, "wb"); | |
if(!output){ | |
perror("Could not open output file"); | |
programStatus = EXIT_FAILURE; | |
goto cleanup; | |
} | |
break; | |
case '?': | |
fprintf(stderr, "Ambiguous option given!\n"); | |
return EXIT_FAILURE; | |
case 'h': | |
PrintHelp(argv[0]); | |
return EXIT_SUCCESS; | |
case ':': | |
fprintf(stderr, "Missing option argument!\n"); | |
return EXIT_FAILURE; | |
default: | |
fprintf(stderr, "Please pardon my insanity..."); | |
return EXIT_FAILURE; | |
} | |
} | |
// Get action | |
if(compress * decompress == 1 || compress + decompress == 0){ | |
fprintf(stderr, "Need either compress flag or decompress flag!\n"); | |
programStatus = EXIT_FAILURE; | |
goto cleanup; | |
} | |
// Get input file | |
argc -= optind; | |
argv += optind; | |
if(argc == 1){ | |
input = fopen(argv[0], "rb"); | |
if(!input){ | |
perror("Could not open input file"); | |
programStatus = EXIT_FAILURE; | |
goto cleanup; | |
} | |
} | |
else if(argc > 1){ | |
fprintf(stderr, "This program can only accept one input file!\n"); | |
programStatus = EXIT_FAILURE; | |
goto cleanup; | |
} | |
else{ | |
fprintf(stderr, "The input file is required!\n"); | |
programStatus = EXIT_FAILURE; | |
goto cleanup; | |
} | |
// Do the actual compression or decompression | |
if(compress){ | |
programStatus = CompressFile(input, output); | |
} | |
else{ | |
programStatus = DecompressFile(input, output); | |
} | |
cleanup: | |
if(input && input != stdin){ | |
fclose(input); | |
} | |
if(output && output != stdout){ | |
fclose(output); | |
} | |
return programStatus; | |
} | |
void PrintHelp(const char *programName){ | |
printf("Usage:\n\ | |
%s [-c|-d] [-o OUTPUT] INPUT\n\ | |
Options:\n\ | |
-c compress : Compress the input file to the output file. (Default)\n\ | |
-d decompress : Decompress the compressed input file to the output file.\n\ | |
-o output : Specify the output file.\n\ | |
-h help : Show this help message again.\n", programName); | |
} | |
#ifdef __BIG_ENDIAN__ | |
uint64_t ToFromNetwork64(uint64_t value){ | |
return value; | |
} | |
#elif defined(__LITTLE_ENDIAN__) | |
uint64_t ToFromNetwork64(uint64_t value){ | |
uint64_t toRet; | |
// Swap all bytes opposite each other | |
for(uint8_t c = 0; c < 8; c++){ | |
((uint8_t *)&toRet)[c] = ((uint8_t *)&value)[7 - c]; | |
} | |
return toRet; | |
} | |
#else | |
#error Could not detect endianness of environment! | |
#endif | |
node *OrganizeTree(node *bufStart, file_size totalOcurrences){ | |
node *rootNode = bufStart, *lowest1, *lowest2; | |
unsigned short combinations = 0; | |
// Combine two lowest nodes | |
while(rootNode->occurrences < totalOcurrences){ | |
lowest1 = lowest2 = NULL; | |
for(unsigned short c = 0; c < 256 + combinations; c++){ | |
if(bufStart[c].occurrences > 0){ | |
// Make sure we assign lowest1 | |
if(!lowest1){ | |
lowest1 = &bufStart[c]; | |
} | |
// Make sure we assign lowest2 | |
else if(!lowest2){ | |
lowest2 = &bufStart[c]; | |
} | |
// Check to see if it is lower than either of our others | |
else{ | |
if(bufStart[c].occurrences < lowest1->occurrences){ | |
if(lowest1->occurrences < lowest2->occurrences){ | |
lowest2 = lowest1; | |
} | |
lowest1 = &bufStart[c]; | |
} | |
else if(bufStart[c].occurrences < lowest2->occurrences){ | |
if(lowest2->occurrences < lowest1->occurrences){ | |
lowest1 = lowest2; | |
} | |
lowest2 = &bufStart[c]; | |
} | |
} | |
} | |
} | |
// Make sure that both have values | |
if(lowest2 == NULL || lowest1 == NULL){ | |
if(lowest2 == NULL){ | |
lowest2 = &bufStart[(lowest1->byte + 1) % 256]; | |
} | |
if(lowest1 == NULL){ | |
lowest1 = &bufStart[(lowest2->byte + 1) % 256]; | |
} | |
} | |
// Now combine two nodes | |
rootNode = &bufStart[256 + combinations]; | |
rootNode->on = lowest1; | |
rootNode->off = lowest2; | |
rootNode->occurrences = lowest1->occurrences + lowest2->occurrences; | |
lowest1->occurrences = lowest2->occurrences = 0; | |
combinations++; | |
} | |
return rootNode; | |
} | |
char WriteTree(FILE *output, node *treeRoot){ | |
// Check if we do have a tree to write to output | |
if(treeRoot){ | |
char status; | |
uint8_t bytesToWrite[2]; | |
uint8_t numBytesToWrite; | |
// Sort out what we have to write, accounting for the special case null | |
if(treeRoot->byte < 0){ | |
bytesToWrite[0] = 0; | |
numBytesToWrite = 1; | |
} | |
else if(treeRoot->byte == 0){ | |
bytesToWrite[0] = 1; | |
bytesToWrite[1] = 1; | |
numBytesToWrite = 2; | |
} | |
else{ | |
bytesToWrite[0] = treeRoot->byte; | |
numBytesToWrite = 1; | |
} | |
// Write current node value | |
if(fwrite(bytesToWrite, numBytesToWrite, 1, output) != 1){ | |
fprintf(stderr, "Could not write tree to output!\n"); | |
return EXIT_FAILURE; | |
} | |
// Write left and right trees | |
status = WriteTree(output, treeRoot->on); | |
if(status != EXIT_SUCCESS){ | |
return status; | |
} | |
return WriteTree(output, treeRoot->off); | |
} | |
else{ | |
return EXIT_SUCCESS; | |
} | |
} | |
static char _ReadSubtree(FILE *input, node *nodeBuf, node **assignTo, | |
unsigned char *internalsUsed){ | |
char status; | |
uint8_t read; | |
int charRead; | |
// Read in current character value | |
if(fread(&read, 1, 1, input) != 1){ | |
goto failure; | |
} | |
charRead = (read == 0) ? -1 : read; | |
// Check if this is the special case null byte | |
if(read == 1){ | |
if(fread(&read, 1, 1, input) != 1){ | |
goto failure; | |
} | |
if(read == 1){ | |
charRead = 0; | |
} | |
else{ | |
if(fseek(input, -1L, SEEK_CUR) != 0){ | |
goto failure; | |
} | |
charRead = 1; | |
} | |
} | |
// Assign assignTo to the correct node in the tree | |
if(charRead == -1){ | |
*assignTo = &nodeBuf[256 + (*internalsUsed)++]; | |
status = _ReadSubtree(input, nodeBuf, &(*assignTo)->on, internalsUsed); | |
if(status != EXIT_SUCCESS){ | |
return status; | |
} | |
return _ReadSubtree(input, nodeBuf, &(*assignTo)->off, internalsUsed); | |
} | |
else{ | |
*assignTo = &nodeBuf[charRead]; | |
return EXIT_SUCCESS; | |
} | |
failure: | |
fprintf(stderr, "Could not read tree from input!\n"); | |
return EXIT_FAILURE; | |
} | |
char ReadTree(FILE *input, node *nodeBuf, node **treeRoot){ | |
unsigned char internalsUsed = 0; | |
return _ReadSubtree(input, nodeBuf, treeRoot, &internalsUsed); | |
} | |
char CompressFile(FILE *inFile, FILE *outFile){ | |
file_size fileSize, netFileSize, bytesRead; | |
translate_buffer transBuf[256]; | |
node allNodes[511], *rootNode; | |
char status, codeByteInd, codeByteBitsLeft; | |
uint8_t toConv, outByte, outByteLeft, chunkSize, codeByte; | |
// Initialize all nodes to have the right values | |
InitNodes(allNodes); | |
// Get all letter counts and rewind | |
fileSize = ReadFrequencies(inFile, allNodes); | |
if(fseek(inFile, 0L, SEEK_SET) != 0){ | |
perror("Could not rewind input file"); | |
return EXIT_FAILURE; | |
}; | |
// Write out version byte | |
if(fwrite(&versionByte, 1, 1, outFile) != 1){ | |
fprintf(stderr, "Could version byte to output!\n"); | |
return EXIT_FAILURE; | |
} | |
// Write out file size | |
netFileSize = ToFromNetwork64(fileSize); | |
if(fwrite(&netFileSize, sizeof(file_size), 1, outFile) != 1){ | |
fprintf(stderr, "Could write file size to output!\n"); | |
return EXIT_FAILURE; | |
} | |
// Create and write out encoding tree | |
rootNode = OrganizeTree(allNodes, fileSize); | |
status = WriteTree(outFile, rootNode); | |
if(status != EXIT_SUCCESS){ | |
return status; | |
} | |
// Build translate buffer objects | |
BuildTranslateBuffers(rootNode, transBuf); | |
// Use translate buffers to translate | |
bytesRead = 0; | |
outByteLeft = 8; | |
while(fread(&toConv, 1, 1, inFile)){ | |
// For every byte in the code | |
codeByteInd = 0; | |
while((codeByteBitsLeft = transBuf[toConv].length - codeByteInd * 8) > 0){ | |
// Make sure we only write max 8 bits | |
codeByteBitsLeft = codeByteBitsLeft > 8 ? 8 : codeByteBitsLeft; | |
// For every bit in the current byte to write | |
codeByte = transBuf[toConv].code[codeByteInd]; | |
while(codeByteBitsLeft > 0){ | |
// Select the maximum amount we can write | |
chunkSize = outByteLeft < codeByteBitsLeft ? outByteLeft : codeByteBitsLeft; | |
outByte = (outByte & (255 >> outByteLeft)) | (codeByte << (8 - outByteLeft)); | |
codeByte = codeByte >> chunkSize; | |
codeByteBitsLeft -= chunkSize; | |
outByteLeft -= chunkSize; | |
// Check if we need to flush output | |
if(outByteLeft == 0){ | |
if(fwrite(&outByte, 1, 1, outFile) != 1){ | |
goto outFailure; | |
} | |
outByteLeft = 8; | |
} | |
} | |
codeByteInd++; | |
} | |
bytesRead++; | |
} | |
// Write that last chunk out | |
if(outByteLeft < 8){ | |
if(fwrite(&outByte, 1, 1, outFile) != 1){ | |
goto outFailure; | |
} | |
} | |
// Do a sanity check | |
if(bytesRead != fileSize){ | |
fprintf(stderr, "Input file size error!\n"); | |
} | |
return EXIT_SUCCESS; | |
outFailure: | |
fprintf(stderr, "Could not write to output file!\n"); | |
return EXIT_FAILURE; | |
} | |
char DecompressFile(FILE *inFile, FILE *outFile){ | |
node allNodes[511], *rootNode, *curNode; | |
file_size fileSize, bytesWritten; | |
char status, curBit; | |
uint8_t curByte, toWrite, tmpVersionByte; | |
// Initialize all nodes to have the right values | |
InitNodes(allNodes); | |
// Read in check byte and version byte and check them | |
if(fread(&tmpVersionByte, 1, 1, inFile) != 1){ | |
fprintf(stderr, "Could not read version byte!\n"); | |
return EXIT_FAILURE; | |
} | |
if(tmpVersionByte != versionByte){ | |
fprintf(stderr, "Cannot handle version of derhuff file!\n"); | |
return EXIT_FAILURE; | |
} | |
// Read in number of bytes in file | |
if(fread(&fileSize, sizeof(fileSize), 1, inFile) != 1){ | |
fprintf(stderr, "Could not read file size from input!\n"); | |
return EXIT_FAILURE; | |
} | |
fileSize = ToFromNetwork64(fileSize); | |
// Read in the tree used to encode the file | |
status = ReadTree(inFile, allNodes, &rootNode); | |
if(status != EXIT_SUCCESS){ | |
return status; | |
} | |
// Convert file using tree | |
curNode = rootNode; | |
curBit = 8; | |
bytesWritten = 0; | |
while(bytesWritten < fileSize){ | |
// Try to refill mini-buffer | |
if(curBit == 8){ | |
if(fread(&curByte, 1, 1, inFile) != 1){ | |
fprintf(stderr, "Input file not of expected size!\n"); | |
return EXIT_FAILURE; | |
} | |
curBit = 0; | |
} | |
// Take next decision | |
if(curByte & (1 << curBit)){ | |
curNode = curNode->on; | |
} | |
else{ | |
curNode = curNode->off; | |
} | |
// Check for an invalid current node | |
if(!curNode){ | |
fprintf(stderr, "Corrupt input file!\n"); | |
return EXIT_FAILURE; | |
} | |
// See if we reached an end byte | |
if(curNode->byte >= 0){ | |
toWrite = curNode->byte; | |
if(fwrite(&toWrite, 1, 1, outFile) != 1){ | |
fprintf(stderr, "Could not write to output file!\n"); | |
return EXIT_FAILURE; | |
} | |
bytesWritten++; | |
curNode = rootNode; | |
} | |
curBit++; | |
} | |
return EXIT_SUCCESS; | |
} | |
file_size ReadFrequencies(FILE *file, node *counts){ | |
file_size fileSize = 0; | |
uint8_t tmpByte; | |
// Scan through input file counting occurrences and file size | |
while(1){ | |
if(fread(&tmpByte, 1, 1, file) != 1){ | |
break; | |
} | |
fileSize++; | |
counts[tmpByte].occurrences++; | |
} | |
return fileSize; | |
} | |
void InitNodes(node *allNodes){ | |
// Clear everything except the ones representing characters | |
memset(allNodes, 0, sizeof(node) * 511); | |
for(unsigned short c = 0; c < 256; c++){ | |
allNodes[c].byte = c; | |
} | |
for(unsigned short c = 256; c < 511; c++){ | |
allNodes[c].byte = 255 - c; | |
} | |
} | |
static void _PrintSubtree(node *rootNode){ | |
if(rootNode->on){ | |
_PrintSubtree(rootNode->on); | |
printf("\t%d -- %d;\n", rootNode->byte ,rootNode->on->byte); | |
} | |
if(rootNode->off){ | |
_PrintSubtree(rootNode->off); | |
printf("\t%d -- %d;\n", rootNode->byte ,rootNode->off->byte); | |
} | |
} | |
void PrintTree(node *rootNode){ | |
printf("graph test{\n"); | |
_PrintSubtree(rootNode); | |
printf("}\n"); | |
} | |
static void _PrintCode(translate_buffer *buffer){ | |
for(unsigned short c = 0; c < 256 && c < buffer->length; c++){ | |
printf("%c", (buffer->code[c / 8] & (1 << (c % 8))) ? '1' : '0'); | |
} | |
} | |
void PrintTranslateBuffers(translate_buffer *buffers){ | |
for(unsigned short c = 0; c < 256; c++){ | |
printf("%c: ", c); | |
_PrintCode(&buffers[c]); | |
printf("\n"); | |
} | |
} | |
static void _BuildPlaceEntry(node *curNode, translate_buffer *modify, | |
translate_buffer *insertInto){ | |
uint8_t charIndex; | |
uint8_t bitIndex; | |
// Check for internal node or leaf | |
if(curNode->byte < 0){ | |
charIndex = modify->length / 8; | |
bitIndex = modify->length % 8; | |
modify->length++; | |
// Encode a 1 to the position | |
if(curNode->on){ | |
modify->code[charIndex] |= (1 << bitIndex); | |
_BuildPlaceEntry(curNode->on, modify, insertInto); | |
} | |
// Encode a 0 to the position | |
if(curNode->off){ | |
modify->code[charIndex] &= ~(1 << bitIndex); | |
_BuildPlaceEntry(curNode->off, modify, insertInto); | |
} | |
modify->length--; | |
} | |
else{ | |
memcpy(&insertInto[curNode->byte], modify, sizeof(translate_buffer)); | |
} | |
} | |
void BuildTranslateBuffers(node *rootNode, translate_buffer *buffers){ | |
translate_buffer modify; | |
memset(&modify, 0, sizeof(translate_buffer)); | |
_BuildPlaceEntry(rootNode, &modify, buffers); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment