Skip to content

Instantly share code, notes, and snippets.

@danielrothfus
Created February 11, 2013 05:29
Show Gist options
  • Save danielrothfus/4752869 to your computer and use it in GitHub Desktop.
Save danielrothfus/4752869 to your computer and use it in GitHub Desktop.
A very simple Huffman implementation I wrote for bit-by-bit I/O practice.
/** \file derhuff.c
\author Daniel Rothfus
\version 1.0
\brief A really simple Huffman implementation.
Derhuff is a simple utility to compress files using the Huffman coding scheme.
To compress a file, it analyzes the frequencies of all the bytes in the file
and builds a tree using these frequencies, which it writes to the output. With
this tree, the program then converts each byte of the input to the path to get
to the corresponding leaf in the tree. To decode, it just reads the tree from
the file and uses that to translate each path back to its corresponding byte.
This scheme works best on files with uneven distributions of character
frequencies, such as plain text. To see how to use this program, pass the -h
or --help flag to the executable. This program is released under the
zlib/libpng license included below:
Copyright (c) 2012 Daniel Rothfus
This software is provided 'as-is', without any express or implied warranty. In
no event will the authors be held liable for any damages arising from the use
of this software.
Permission is granted to anyone to use this software for any purpose, including
commercial applications, and to alter it and redistribute it freely, subject to
the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim
that you wrote the original software. If you use this software in a product,
an acknowledgment in the product documentation would be appreciated but is
not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution. */
// Includes -------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <getopt.h>
#include <stdint.h>
// Types -----------------------------------------------------------------------
/** \brief This type is used for representing offsets within a file and the size
of a file itself. */
typedef uint64_t file_size;
/** \brief This type is used for direct lookup from a character to their huffman
equivalent. */
typedef struct{
uint8_t code[8];
uint8_t length;
} translate_buffer;
/// \brief This is the basic data type that makes up a tree. */
typedef struct node{
struct node *on, *off;
int byte;
file_size occurrences;
} node;
// Prototypes -----------------------------------------------------------------
/** \brief Print the help for this program.
\param programName The name of the current program as it was called from the
command line. */
void PrintHelp(const char *programName);
/** \brief Swap a 64 bit value to/from network byte order if it is necessary.
\param value The value to swap the endianness of.
\return This returns the network byte order version of the host value or the
host version of the network byte order value. */
uint64_t ToFromNetwork64(uint64_t value);
/** \brief This organizes a tree structure inside of a buffer of nodes.
\param bufStart The first node in the buffer containing the letter frequencies.
These nodes will reference each other such that a tree is created in the
buffer.
\paraom totalOcurrences The total number of units found of all types in the
input.
\return This returns a pointer to the root of the tree. */
node *OrganizeTree(node *bufStart, file_size totalOcurrences);
/** \brief Write the encoding tree to the file.
\param output The file to write the tree to.
\return This returns an exit status for the program - 0 on success. */
char WriteTree(FILE *output, node *treeRoot);
/** \brief Read a tree used for decoding from a file.
\param output The file to read the tree from.
\param nodeBuf A buffer of at least 511 nodes to write the tree to.
\param treeRoot A pointer to a pointer to assign to the root of the tree.
\return This returns an exit status for the program - 0 on success. */
char ReadTree(FILE *input, node *nodeBuf, node **treeRoot);
/** \brief Compress a file and write the compressed file to the output file.
\param input The file to compress.
\param output The file to write the compressed data to.
\return This returns an exit code for the application. */
char CompressFile(FILE *input, FILE *output);
/** \brief Decompress a given file to another file.
\param input The file to decompress.
\param output The file to put the decompressed data into.
\return This returns an exit code for the application. */
char DecompressFile(FILE *input, FILE *output);
/** \brief Read all the frequencies of all the characters from the file from the
current point to the end.
\param file The file to read all the letter frequencies from.
\param counts A pointer to a buffer of at least 256 file size indicators, which
will be used to indicate the number of each byte. */
file_size ReadFrequencies(FILE *file, node *counts);
/** \brief Initialize all the nodes in a buffer to the correct states.
\param allNodes A pointer to the buffer of at least 511 nodes. */
void InitNodes(node *allNodes);
/** \brief This is a debug utility that prints out a tree to the standard output
in dot format, using the graph name test.
\param rootNode The root node of the tree to print. */
void PrintTree(node *rootNode);
/** \brief A debug utility that prints the translate buffers to standard output.
\param buffs A point to a buffer of at least 256 translate_buffers. */
void PrintTranslateBuffers(translate_buffer *buffers);
/** \brief Build the translate buffer for going directly from characters to
huffman equivalents.
\param rootNode The root node of the tree to convert.
\param buffers The translate buffers to fill with equivalents. There must be
size for at least 256 translate_buffers in the buffer. */
void BuildTranslateBuffers(node *rootNode, translate_buffer *buffers);
// Constants -------------------------------------------------------------------
/// \brief An array to tell getopt_long how to handle long program arguments
static const struct option progOptsLong[] = {
{ "compress", no_argument, NULL, 'c' },
{ "decompress", no_argument, NULL, 'd' },
{ "output", required_argument, NULL, 'o' },
{ "help", no_argument, NULL, 'h' },
{ NULL, no_argument, NULL, 0 }
};
/// \brief A string to tell getopt_long how to handle short command line input
static const char *progOptsStr = "cdo:h";
/// \brief A simple version number to allow for changes to the format later
static const uint8_t versionByte = 1;
// Implementations -------------------------------------------------------------
/** \brief The entry point of the application.
\param argc The number of arguments.
\param argv The arguments to the application. */
int main(int argc, char * argv[]){
int option, programStatus = EXIT_SUCCESS;
char compress = 0, decompress = 0;
FILE *output = stdout, *input = NULL;
// Parse through option arguments
while((option = getopt_long(argc, argv, progOptsStr, progOptsLong, NULL)) != -1){
switch(option){
case 'c':
compress = 1;
break;
case 'd':
decompress = 1;
break;
case 'o':
output = fopen(optarg, "wb");
if(!output){
perror("Could not open output file");
programStatus = EXIT_FAILURE;
goto cleanup;
}
break;
case '?':
fprintf(stderr, "Ambiguous option given!\n");
return EXIT_FAILURE;
case 'h':
PrintHelp(argv[0]);
return EXIT_SUCCESS;
case ':':
fprintf(stderr, "Missing option argument!\n");
return EXIT_FAILURE;
default:
fprintf(stderr, "Please pardon my insanity...");
return EXIT_FAILURE;
}
}
// Get action
if(compress * decompress == 1 || compress + decompress == 0){
fprintf(stderr, "Need either compress flag or decompress flag!\n");
programStatus = EXIT_FAILURE;
goto cleanup;
}
// Get input file
argc -= optind;
argv += optind;
if(argc == 1){
input = fopen(argv[0], "rb");
if(!input){
perror("Could not open input file");
programStatus = EXIT_FAILURE;
goto cleanup;
}
}
else if(argc > 1){
fprintf(stderr, "This program can only accept one input file!\n");
programStatus = EXIT_FAILURE;
goto cleanup;
}
else{
fprintf(stderr, "The input file is required!\n");
programStatus = EXIT_FAILURE;
goto cleanup;
}
// Do the actual compression or decompression
if(compress){
programStatus = CompressFile(input, output);
}
else{
programStatus = DecompressFile(input, output);
}
cleanup:
if(input && input != stdin){
fclose(input);
}
if(output && output != stdout){
fclose(output);
}
return programStatus;
}
void PrintHelp(const char *programName){
printf("Usage:\n\
%s [-c|-d] [-o OUTPUT] INPUT\n\
Options:\n\
-c compress : Compress the input file to the output file. (Default)\n\
-d decompress : Decompress the compressed input file to the output file.\n\
-o output : Specify the output file.\n\
-h help : Show this help message again.\n", programName);
}
#ifdef __BIG_ENDIAN__
uint64_t ToFromNetwork64(uint64_t value){
return value;
}
#elif defined(__LITTLE_ENDIAN__)
uint64_t ToFromNetwork64(uint64_t value){
uint64_t toRet;
// Swap all bytes opposite each other
for(uint8_t c = 0; c < 8; c++){
((uint8_t *)&toRet)[c] = ((uint8_t *)&value)[7 - c];
}
return toRet;
}
#else
#error Could not detect endianness of environment!
#endif
node *OrganizeTree(node *bufStart, file_size totalOcurrences){
node *rootNode = bufStart, *lowest1, *lowest2;
unsigned short combinations = 0;
// Combine two lowest nodes
while(rootNode->occurrences < totalOcurrences){
lowest1 = lowest2 = NULL;
for(unsigned short c = 0; c < 256 + combinations; c++){
if(bufStart[c].occurrences > 0){
// Make sure we assign lowest1
if(!lowest1){
lowest1 = &bufStart[c];
}
// Make sure we assign lowest2
else if(!lowest2){
lowest2 = &bufStart[c];
}
// Check to see if it is lower than either of our others
else{
if(bufStart[c].occurrences < lowest1->occurrences){
if(lowest1->occurrences < lowest2->occurrences){
lowest2 = lowest1;
}
lowest1 = &bufStart[c];
}
else if(bufStart[c].occurrences < lowest2->occurrences){
if(lowest2->occurrences < lowest1->occurrences){
lowest1 = lowest2;
}
lowest2 = &bufStart[c];
}
}
}
}
// Make sure that both have values
if(lowest2 == NULL || lowest1 == NULL){
if(lowest2 == NULL){
lowest2 = &bufStart[(lowest1->byte + 1) % 256];
}
if(lowest1 == NULL){
lowest1 = &bufStart[(lowest2->byte + 1) % 256];
}
}
// Now combine two nodes
rootNode = &bufStart[256 + combinations];
rootNode->on = lowest1;
rootNode->off = lowest2;
rootNode->occurrences = lowest1->occurrences + lowest2->occurrences;
lowest1->occurrences = lowest2->occurrences = 0;
combinations++;
}
return rootNode;
}
char WriteTree(FILE *output, node *treeRoot){
// Check if we do have a tree to write to output
if(treeRoot){
char status;
uint8_t bytesToWrite[2];
uint8_t numBytesToWrite;
// Sort out what we have to write, accounting for the special case null
if(treeRoot->byte < 0){
bytesToWrite[0] = 0;
numBytesToWrite = 1;
}
else if(treeRoot->byte == 0){
bytesToWrite[0] = 1;
bytesToWrite[1] = 1;
numBytesToWrite = 2;
}
else{
bytesToWrite[0] = treeRoot->byte;
numBytesToWrite = 1;
}
// Write current node value
if(fwrite(bytesToWrite, numBytesToWrite, 1, output) != 1){
fprintf(stderr, "Could not write tree to output!\n");
return EXIT_FAILURE;
}
// Write left and right trees
status = WriteTree(output, treeRoot->on);
if(status != EXIT_SUCCESS){
return status;
}
return WriteTree(output, treeRoot->off);
}
else{
return EXIT_SUCCESS;
}
}
static char _ReadSubtree(FILE *input, node *nodeBuf, node **assignTo,
unsigned char *internalsUsed){
char status;
uint8_t read;
int charRead;
// Read in current character value
if(fread(&read, 1, 1, input) != 1){
goto failure;
}
charRead = (read == 0) ? -1 : read;
// Check if this is the special case null byte
if(read == 1){
if(fread(&read, 1, 1, input) != 1){
goto failure;
}
if(read == 1){
charRead = 0;
}
else{
if(fseek(input, -1L, SEEK_CUR) != 0){
goto failure;
}
charRead = 1;
}
}
// Assign assignTo to the correct node in the tree
if(charRead == -1){
*assignTo = &nodeBuf[256 + (*internalsUsed)++];
status = _ReadSubtree(input, nodeBuf, &(*assignTo)->on, internalsUsed);
if(status != EXIT_SUCCESS){
return status;
}
return _ReadSubtree(input, nodeBuf, &(*assignTo)->off, internalsUsed);
}
else{
*assignTo = &nodeBuf[charRead];
return EXIT_SUCCESS;
}
failure:
fprintf(stderr, "Could not read tree from input!\n");
return EXIT_FAILURE;
}
char ReadTree(FILE *input, node *nodeBuf, node **treeRoot){
unsigned char internalsUsed = 0;
return _ReadSubtree(input, nodeBuf, treeRoot, &internalsUsed);
}
char CompressFile(FILE *inFile, FILE *outFile){
file_size fileSize, netFileSize, bytesRead;
translate_buffer transBuf[256];
node allNodes[511], *rootNode;
char status, codeByteInd, codeByteBitsLeft;
uint8_t toConv, outByte, outByteLeft, chunkSize, codeByte;
// Initialize all nodes to have the right values
InitNodes(allNodes);
// Get all letter counts and rewind
fileSize = ReadFrequencies(inFile, allNodes);
if(fseek(inFile, 0L, SEEK_SET) != 0){
perror("Could not rewind input file");
return EXIT_FAILURE;
};
// Write out version byte
if(fwrite(&versionByte, 1, 1, outFile) != 1){
fprintf(stderr, "Could version byte to output!\n");
return EXIT_FAILURE;
}
// Write out file size
netFileSize = ToFromNetwork64(fileSize);
if(fwrite(&netFileSize, sizeof(file_size), 1, outFile) != 1){
fprintf(stderr, "Could write file size to output!\n");
return EXIT_FAILURE;
}
// Create and write out encoding tree
rootNode = OrganizeTree(allNodes, fileSize);
status = WriteTree(outFile, rootNode);
if(status != EXIT_SUCCESS){
return status;
}
// Build translate buffer objects
BuildTranslateBuffers(rootNode, transBuf);
// Use translate buffers to translate
bytesRead = 0;
outByteLeft = 8;
while(fread(&toConv, 1, 1, inFile)){
// For every byte in the code
codeByteInd = 0;
while((codeByteBitsLeft = transBuf[toConv].length - codeByteInd * 8) > 0){
// Make sure we only write max 8 bits
codeByteBitsLeft = codeByteBitsLeft > 8 ? 8 : codeByteBitsLeft;
// For every bit in the current byte to write
codeByte = transBuf[toConv].code[codeByteInd];
while(codeByteBitsLeft > 0){
// Select the maximum amount we can write
chunkSize = outByteLeft < codeByteBitsLeft ? outByteLeft : codeByteBitsLeft;
outByte = (outByte & (255 >> outByteLeft)) | (codeByte << (8 - outByteLeft));
codeByte = codeByte >> chunkSize;
codeByteBitsLeft -= chunkSize;
outByteLeft -= chunkSize;
// Check if we need to flush output
if(outByteLeft == 0){
if(fwrite(&outByte, 1, 1, outFile) != 1){
goto outFailure;
}
outByteLeft = 8;
}
}
codeByteInd++;
}
bytesRead++;
}
// Write that last chunk out
if(outByteLeft < 8){
if(fwrite(&outByte, 1, 1, outFile) != 1){
goto outFailure;
}
}
// Do a sanity check
if(bytesRead != fileSize){
fprintf(stderr, "Input file size error!\n");
}
return EXIT_SUCCESS;
outFailure:
fprintf(stderr, "Could not write to output file!\n");
return EXIT_FAILURE;
}
char DecompressFile(FILE *inFile, FILE *outFile){
node allNodes[511], *rootNode, *curNode;
file_size fileSize, bytesWritten;
char status, curBit;
uint8_t curByte, toWrite, tmpVersionByte;
// Initialize all nodes to have the right values
InitNodes(allNodes);
// Read in check byte and version byte and check them
if(fread(&tmpVersionByte, 1, 1, inFile) != 1){
fprintf(stderr, "Could not read version byte!\n");
return EXIT_FAILURE;
}
if(tmpVersionByte != versionByte){
fprintf(stderr, "Cannot handle version of derhuff file!\n");
return EXIT_FAILURE;
}
// Read in number of bytes in file
if(fread(&fileSize, sizeof(fileSize), 1, inFile) != 1){
fprintf(stderr, "Could not read file size from input!\n");
return EXIT_FAILURE;
}
fileSize = ToFromNetwork64(fileSize);
// Read in the tree used to encode the file
status = ReadTree(inFile, allNodes, &rootNode);
if(status != EXIT_SUCCESS){
return status;
}
// Convert file using tree
curNode = rootNode;
curBit = 8;
bytesWritten = 0;
while(bytesWritten < fileSize){
// Try to refill mini-buffer
if(curBit == 8){
if(fread(&curByte, 1, 1, inFile) != 1){
fprintf(stderr, "Input file not of expected size!\n");
return EXIT_FAILURE;
}
curBit = 0;
}
// Take next decision
if(curByte & (1 << curBit)){
curNode = curNode->on;
}
else{
curNode = curNode->off;
}
// Check for an invalid current node
if(!curNode){
fprintf(stderr, "Corrupt input file!\n");
return EXIT_FAILURE;
}
// See if we reached an end byte
if(curNode->byte >= 0){
toWrite = curNode->byte;
if(fwrite(&toWrite, 1, 1, outFile) != 1){
fprintf(stderr, "Could not write to output file!\n");
return EXIT_FAILURE;
}
bytesWritten++;
curNode = rootNode;
}
curBit++;
}
return EXIT_SUCCESS;
}
file_size ReadFrequencies(FILE *file, node *counts){
file_size fileSize = 0;
uint8_t tmpByte;
// Scan through input file counting occurrences and file size
while(1){
if(fread(&tmpByte, 1, 1, file) != 1){
break;
}
fileSize++;
counts[tmpByte].occurrences++;
}
return fileSize;
}
void InitNodes(node *allNodes){
// Clear everything except the ones representing characters
memset(allNodes, 0, sizeof(node) * 511);
for(unsigned short c = 0; c < 256; c++){
allNodes[c].byte = c;
}
for(unsigned short c = 256; c < 511; c++){
allNodes[c].byte = 255 - c;
}
}
static void _PrintSubtree(node *rootNode){
if(rootNode->on){
_PrintSubtree(rootNode->on);
printf("\t%d -- %d;\n", rootNode->byte ,rootNode->on->byte);
}
if(rootNode->off){
_PrintSubtree(rootNode->off);
printf("\t%d -- %d;\n", rootNode->byte ,rootNode->off->byte);
}
}
void PrintTree(node *rootNode){
printf("graph test{\n");
_PrintSubtree(rootNode);
printf("}\n");
}
static void _PrintCode(translate_buffer *buffer){
for(unsigned short c = 0; c < 256 && c < buffer->length; c++){
printf("%c", (buffer->code[c / 8] & (1 << (c % 8))) ? '1' : '0');
}
}
void PrintTranslateBuffers(translate_buffer *buffers){
for(unsigned short c = 0; c < 256; c++){
printf("%c: ", c);
_PrintCode(&buffers[c]);
printf("\n");
}
}
static void _BuildPlaceEntry(node *curNode, translate_buffer *modify,
translate_buffer *insertInto){
uint8_t charIndex;
uint8_t bitIndex;
// Check for internal node or leaf
if(curNode->byte < 0){
charIndex = modify->length / 8;
bitIndex = modify->length % 8;
modify->length++;
// Encode a 1 to the position
if(curNode->on){
modify->code[charIndex] |= (1 << bitIndex);
_BuildPlaceEntry(curNode->on, modify, insertInto);
}
// Encode a 0 to the position
if(curNode->off){
modify->code[charIndex] &= ~(1 << bitIndex);
_BuildPlaceEntry(curNode->off, modify, insertInto);
}
modify->length--;
}
else{
memcpy(&insertInto[curNode->byte], modify, sizeof(translate_buffer));
}
}
void BuildTranslateBuffers(node *rootNode, translate_buffer *buffers){
translate_buffer modify;
memset(&modify, 0, sizeof(translate_buffer));
_BuildPlaceEntry(rootNode, &modify, buffers);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment