Skip to content

Instantly share code, notes, and snippets.

@Jnesselr
Created August 13, 2015 19:39
Show Gist options
  • Save Jnesselr/5394363edb242caf0a2a to your computer and use it in GitHub Desktop.
Save Jnesselr/5394363edb242caf0a2a to your computer and use it in GitHub Desktop.
Huffman compression and decompression (Puff) that was done for a school assignment and designed to be fast
/*
Name: Huff.cpp
Authors: Caleb Hopper and Justin Nesselrotte
Data: 10/27/2013
Description: This program takes as input from the console a
file name. It then writes out a compressed file with the same
file name but a .huf extension. It does this by using the
Huffman algorithm to compress the data.
*/
#include <iostream>
#include <string>
#include <fstream>
#include <ctime>
#include <iomanip>
#include <stack>
// Include the standard template library.
using namespace std;
// These are two buffers, and a buffer length. 8192 was chosen
// as an arbitrary size. readBuffer is used for reading the
// compressed file, and writeBuffer is used for the output file.
const int BUFFER_LENGTH = 16777216;
unsigned char readBuffer[BUFFER_LENGTH];
unsigned char writeBuffer[BUFFER_LENGTH];
// This is the structure that holds each node for the huffman table.
// It contains the glyph (0-256), the left and right pointers, the
// huffman code, and the bit length of that code (shift).
struct HuffEntry {
int glyph;
int left;
int right;
int frequency;
int code;
int shift;
};
// HuffTable is just an array of HuffEntrys
typedef HuffEntry* HuffTable;
/*
Name: sort
Inputs: data - The huffman table
length - The length of the huffman table
Outputs: A sorted huffman table
Description: This function takes the huffman table,
and sorts it for use as a min heap. It uses bubble
sort, which is not the most efficient method, but
was chosen for simplicity.
*/
void sort(HuffTable data, int length) {
bool flag = true;
for(int i=1; i<=length && flag; i++) {
flag = false;
for(int j=0; j<length-1; j++) {
if(data[j+1].frequency < data[j].frequency)
{
std::swap(data[j], data[j+1]);
flag = true;
}
}
}
}
/*
Name: getMinIndex
Inputs: table - The huffman table
left - the index for the left node
right - the index for the right node
Outputs: int - the index of the smallest one, or left if they are equal
Description: This function is a convenience method. It will return the
smaller frequency between two node indexes. If they're the same frequency,
then it chooses the left one.
*/
int inline getMinIndex(HuffTable table, int left, int right) {
if(table[left].frequency <= table[right].frequency) {
return left;
} else {
return right;
}
}
/*
Name: reheap
Inputs: table - The huffman table
changedNode - the last node that was changed in the heap.
length - the length of the heap.
Outputs: a valid min-heap
Description: This function takes a heap in the form of a huffman
table, and makes sure that all of the parent and child nodes around
changedNode are in the correct place. If they aren't, then it swaps
them and checks again.
*/
void reheap(HuffTable table, int changedNode, int length) {
int parent = (changedNode-1)/2;
int left = (changedNode*2)+1;
int right = (changedNode*2)+2;
if(changedNode >= length || length == 1)
return;
if(table[parent].frequency > table[changedNode].frequency) {
std::swap(table[parent], table[changedNode]);
if(changedNode != 0)
reheap(table, parent, length);
} else {
if(right < length) {
if(table[changedNode].frequency > table[left].frequency && table[changedNode].frequency > table[right].frequency) {
if(table[left].frequency <= table[right].frequency) {
std::swap(table[changedNode], table[left]);
reheap(table, left, length);
} else {
std::swap(table[changedNode], table[right]);
reheap(table, right, length);
}
} else if(table[changedNode].frequency > table[left].frequency || table[changedNode].frequency > table[right].frequency) {
int min = getMinIndex(table, left, right);
std::swap(table[changedNode], table[min]);
reheap(table, min, length);
}
} else if(left < length) {
if(table[changedNode].frequency > table[left].frequency) {
std::swap(table[changedNode], table[left]);
}
}
}
}
/*
Name: createHuffTable
Inputs: table - The Huffman table
length - the number of glyphs the table starts with
Outputs: The huffman table with all frequency nodes.
Description: This function takes the min heap of glyphs, and
adds all of the frequncy nodes needed to complete the max heap
for the huffman algorithm.
*/
void createHuffTable(HuffTable table, int length) {
// Sort the table for min heap
sort(table, length);
int h = length-1;
int f = length;
int m;
do {
// Choose the smaller of the two elements, or just the left one if we don't have a choice.
if(h >= 2) {
m = getMinIndex(table, 1, 2); // Left or right child
} else if(h >= 1) {
m = 1; // Only the left child is available
}
std::swap(table[m], table[f]);
if(m < h) {
std::swap(table[m], table[h]);
}
reheap(table, m, h);
std::swap(table[0], table[h]);
// Create a new frequency node
table[0].glyph = -1;
table[0].frequency = table[h].frequency + table[f].frequency;
table[0].left = h;
table[0].right = f;
table[0].code = 0;
table[0].shift = 0;
reheap(table, 0, h);
h--;
f++;
} while(f != 2*length-1);
}
/*
Name: assignCodes
Inputs: table - The Huffman Table
nodes - The shift and code for each table entry
shifts - An array that holds the shift value for each glyph.
shifts[glyph] is the bit length of glyph.
Outputs: None returned, but nodes and shifts are modified.
Description: This function takes the huffman table, and calculates
the huffman code for each glyph. It calculates them backwards to
speed up decompression.
*/
void assignCodes(HuffTable table, int* code, int* shifts) {
// The stacks are used to prevent a recursive call.
// Index is where it is at in the table, and level is
// the level in the tree that it is at.
stack<int> nextNodes;
stack<int> nextLevel;
int index, level;
nextNodes.push(0);
nextLevel.push(0);
do {
index = nextNodes.top();
nextNodes.pop();
level = nextLevel.top();
nextLevel.pop();
int left = table[index].left;
int right = table[index].right;
if(table[index].glyph == -1) {
// Frequency node
// left and right shifts/codes are based on the current
// one at index.
table[left].shift = table[index].shift+1;
table[left].code = table[index].code;
nextNodes.push(left);
nextLevel.push(level+1);
table[right].shift = table[index].shift+1;
table[right].code = table[index].code + (1 << level);
nextNodes.push(right);
nextLevel.push(level+1);
} else {
// Codes should already be set
code[table[index].glyph] = table[index].code;
shifts[table[index].glyph] = table[index].shift;
}
// Continue until all nodes have been visited.
} while(!nextNodes.empty());
}
/*
Name: compressFile
Inputs: filename - The name of the uncompressed file
fin - The file handler for the input file
table - The huffman table
codes - The huffman codes for each glyph used
shifts - The bit length of the huffman codes used
tableLength - The length of the huffman tablee
Outputs: Compressed file to fout
Description: This function finds the compressed file name,
and writes the compressed file to fout. It writes the name
of the original file, the huffman table, and then the data.
*/
void compressFile(string filename, ifstream &fin, HuffTable table, int* codes, int* shifts, int tableLength) {
// filename is original filename. Need to change it to .huf here for fout
int lastIndex = filename.find_last_of(".");
string outputFile = filename.substr(0, lastIndex) + ".huf";
ofstream fout(outputFile, ios::binary | ios::out);
// Write the name of the original input file
int stringLength = filename.length();
fout.write((char*)&stringLength, sizeof stringLength);
fout.write(filename.c_str(), filename.length());
// Write out the huffman table
fout.write((char*)&tableLength, sizeof tableLength);
for(int i=0; i<tableLength; i++) {
fout.write((char*)(&table[i]),12); // 12 is scud
}
// data holds the running data values.
unsigned int data = 0;
// location in writeBuffer
int writeLocation = 0;
// The number of bits that data takes up
int bitsUsed = 0;
// A constant thtat allows us to get just the last byte.
const unsigned int BYTE_MASK = 0xFF;
// A boolean to tell us if we need to write the buffer one last time at the end.
bool fileWritten = true;
// Length of data to read into readBuffer
streamsize length;
// Reset the input file
fin.clear();
fin.seekg(0);
do {
// Read in buffer_length bytes or less, and find out how many we got.
fin.read(reinterpret_cast<char*>(readBuffer), BUFFER_LENGTH);
length = fin.gcount();
// If we read bytes, loop through them byte by byte
if(length > 0) {
for(int i=0; i<length; i++) {
// we haven't written out the writeBuffer yet
fileWritten = false;
// The code for the glyph is found, and put at the left of all the other bits
// in data.
int glyph = readBuffer[i];
data = (codes[glyph] << bitsUsed) + data;
bitsUsed += shifts[glyph];
// Make sure data doesn't get any larger than 8 bits so there's
// room for the next bytes.
while(bitsUsed > 8) {
writeBuffer[writeLocation] = (unsigned char) data & BYTE_MASK;
data = data >> 8;
bitsUsed -= 8;
writeLocation++;
if(writeLocation == BUFFER_LENGTH) {
writeLocation = 0;
fileWritten = true;
fout.write(reinterpret_cast<char*>(writeBuffer), BUFFER_LENGTH);
}
}
}
}
} while(!fin.eof());
cout << bitsUsed << endl;
// Lets add our EOF character
data = (codes[256] << bitsUsed) + data;
bitsUsed += shifts[256];
// Get data down to a single byte length so we can write it out.
while(bitsUsed > 8) {
fileWritten = false;
writeBuffer[writeLocation] = (unsigned char) data & BYTE_MASK;
data = data >> 8;
bitsUsed -= 8;
writeLocation++;
if(writeLocation == BUFFER_LENGTH) {
writeLocation = 0;
fileWritten = true;
fout.write(reinterpret_cast<char*>(writeBuffer), BUFFER_LENGTH);
}
}
// Any leftover bits in data need to be written
if(bitsUsed > 0) {
writeBuffer[writeLocation] = data;
writeLocation++;
}
// Data left in writeBuffer at the end of the run
if(!fileWritten) {
fout.write(reinterpret_cast<char*>(writeBuffer), writeLocation);
}
// Flush and close the output stream.
fout.flush();
fout.close();
}
/*
Name: main
Inputs: None
Outputs: None
Description: This function takes the input file name,
reads the file, and then converts each glyph into its
huffman code before writing the compressed file out.
*/
void main() {
string filename;
// How many times each glyph is used
int glyphs[257] = {0}; // 0-256
// number of unique glyphs read from the file
int numberOfGlyphs = 0;
cout << "Please enter a file name: ";
cin >> filename;
// Open the file
ifstream fin(filename, ios::binary);
if(fin.fail()) {
cout << "Nope, file no es here" << endl;
return;
}
// Start the clock timer
clock_t start, end;
start = clock();
// Read the input file, and increase the value of glyphs at the
// glyph index.
do {
fin.read(reinterpret_cast<char*>(readBuffer), BUFFER_LENGTH);
streamsize length = fin.gcount();
if(length > 0) {
for(int i=0; i<length; i++) {
if(glyphs[readBuffer[i]] == 0)
numberOfGlyphs++;
glyphs[readBuffer[i]]++;
}
}
} while(!fin.eof());
// Special case for End of File
glyphs[256] = 1;
numberOfGlyphs++;
// The frequency nodes + number of glyphs
int tableLength = 2*numberOfGlyphs - 1;
// Dynamically create a table
HuffTable table = new HuffEntry[tableLength];
// These are the huffman codes for each glyph and their bit length
int codes[257];
int shifts[257];
// Our current position in the table
int tableIndex = 0;
// This adds the unique glyphs to the table
for(int i=0; i<257; i++) {
if(glyphs[i] > 0) {
table[tableIndex].frequency = glyphs[i];
table[tableIndex].glyph = i;
table[tableIndex].left = -1;
table[tableIndex].right = -1;
codes[tableIndex] = 0;
shifts[tableIndex] = 0;
tableIndex++;
}
}
// Create the full huffman table
createHuffTable(table, numberOfGlyphs);
// Assign the huffman code to each of the glyphs
assignCodes(table, codes, shifts);
// Compress the file, writing its header and compressed data
compressFile(filename, fin, table, codes, shifts, tableLength);
end = clock();
cout << setprecision(3) << fixed;
//outputTable(table,tableLength);
cout << "Time: " << (double(end-start) / CLOCKS_PER_SEC) << " seconds." << endl;
}
/*
Name: Puff.cpp
Authors: Caleb Hopper and Justin Nesselrotte
Data: 10/27/2013
Description: This program takes as input from the console a
file name. It then writes out an uncompressed file based on
the compressed input file. This is done by using the huffman
compression algorithm.
*/
#include<iostream>
#include<iomanip>
#include<fstream>
#include<string>
#include<stack>
#include<ctime>
#include<vector>
// Include the standard template library.
using namespace std;
// These global variables are bad. Please forgive us our sins.
// This is the maximum bit length of any huffman code. It will
// be used to speed up decompression later
int maxShift = 0;
// These are two buffers, and a buffer length. 8192 was chosen
// as an arbitrary size. readBuffer is used for reading the
// compressed file, and writeBuffer is used for the output file.
const int BUFFER_LENGTH = 8192;
unsigned char readBuffer[BUFFER_LENGTH];
unsigned char writeBuffer[BUFFER_LENGTH];
// This is the structure that holds each node for the huffman table.
// It contains the glyph (0-256), and the left and right pointers.
struct Node
{
int glyph;
int left;
int right;
};
// This node is used to help find the huffman codes. It's kept
// separate from Node to make it easier to read the table from
// the file.
struct AssignNode {
int shift;
int code;
};
/*
Name: assignCodes
Inputs: table - The Huffman Table
nodes - The shift and code for each table entry
shifts - An array that holds the shift value for each glyph.
shifts[glyph] is the bit length of glyph.
Outputs: None returned, but nodes and shifts are modified.
Description: This function takes the huffman table, and calculates
the huffman code for each glyph. It calculates them backwards to
speed up decompression.
*/
void assignCodes(Node* table, AssignNode* nodes, int* shifts) {
stack<int> nextNodes;
stack<int> nextLevel;
nodes[0].code= 0;
nodes[0].shift = 0;
int index, level;
nextNodes.push(0);
nextLevel.push(0);
do {
index = nextNodes.top();
nextNodes.pop();
level = nextLevel.top();
nextLevel.pop();
int left = table[index].left;
int right = table[index].right;
if(table[index].glyph == -1) {
// Frequency node
nodes[left].shift = nodes[index].shift+1;
nodes[left].code = nodes[index].code;
nextNodes.push(left);
nextLevel.push(level+1);
nodes[right].shift = nodes[index].shift+1;
nodes[right].code = nodes[index].code + (1 << level);
nextNodes.push(right);
nextLevel.push(level+1);
} else {
// Codes should already be set
shifts[table[index].glyph] = nodes[index].shift;
if(nodes[index].shift > maxShift)
maxShift = nodes[index].shift;
}
} while(!nextNodes.empty());
}
/*
Name: main
Inputs: None
Outputs: None
Description: This function takes the input file name,
reads the file, and then decompresses each huffman code
into it's glyph in the output file specified by the name
in the file header.
*/
void main()
{
// Number of nodes in the huffman table
int numberOfNodes = 0;
// Name of the input file
string nameOfFile = "";
// Size of the uncompressed file name
int lengthOfFileName = 0;
// Character pointer to hold the uncompressed file name
char * fileName;
// Node pointer for our huffman tree/table
Node * huffmanTree;
// AssignNode pointer for our codes and their bit lengths
AssignNode * assignedNodes;
// The bit length for every glyph.
int shifts[257] = {0};
// Integer pointer to the decompression lookup table
int* lookupTable;
// Get the name of the file
cout << "File name: ";
cin >> nameOfFile;
clock_t start,end;
// Start our timer
start = clock();
//Begin the reading of the header.
ifstream fin(nameOfFile, ios::binary);
if(fin.fail()) {
cout << "No file!" << endl;
}
//Read in the file's name.
fin.read((char*)&lengthOfFileName, sizeof lengthOfFileName);
fileName = new char[lengthOfFileName+1];
fin.read(fileName, lengthOfFileName);
fileName[lengthOfFileName] = '\0';
//Read in the Huffman Table.
fin.read((char*)&numberOfNodes, sizeof numberOfNodes);
huffmanTree = new Node[numberOfNodes];
assignedNodes = new AssignNode[numberOfNodes];
fin.read((char*)huffmanTree, (12 * numberOfNodes));
// Find the codes and their bit length for all of the glyphs
assignCodes(huffmanTree, assignedNodes, shifts);
// Create a lookup table that is 2^maxShift entries long
lookupTable = new int[((int)std::pow(2,maxShift))];
/*
These for loops fill the lookupTable with entries. For example, if
maxShift is 5, and the current glyph is 101, then this will store
the glyph into the lookup table at 00101, 01101, 10101, and 11101.
This allows us to read the 5 least significant bits, and determine
what the glyph is quickly.
*/
for(int i=0; i<numberOfNodes; i++) {
if(huffmanTree[i].glyph != -1) {
int shift = assignedNodes[i].shift;
int numPossible = (int) std::pow(2 , maxShift - shift);
for(int j=0; j<numPossible; j++) {
int code = (j << shift) + assignedNodes[i].code;
lookupTable[code] = huffmanTree[i].glyph;
}
}
}
// output file handler
ofstream fout(fileName, ios::binary);
// data holds the running data values.
unsigned int data = 0;
// Location in writeBuffer
int writeLocation = 0;
// The number of bits that are left to read from data.
int bitsLeft = 0;
// A constant that allows us to get just the last byte.
const unsigned int BYTE_MASK = 0xFF;
// A boolean to tell us if we need to write the buffer one last time at the end.
bool fileWritten = true;
// A boolean to tell us if we have hit the eof glyph yet.
bool eofHit = false;
// A boolean to tell us if we should expect the eof sometime soon, or if we should
// continue to read from the input file.
bool goToEOF = false;
// Holder for the glyph.
int glyph = 0;
// Length of the data read into readBuffer.
streamsize length;
// mask is just a bit mask to let us look up the glyph in the lookup table from data.
int mask = (1 << maxShift) - 1;
do {
// Read in buffer_length bytes or less, and find out how many we got.
fin.read(reinterpret_cast<char*>(readBuffer), BUFFER_LENGTH);
length = fin.gcount();
// If we read in less than the max buffer size, or it's the end of
// the input file, then we need to be on the lookout for the eof glyph
if(length < BUFFER_LENGTH || fin.eof())
goToEOF = true;
// If we read in some bytes, or we're still searching for eof
if(length > 0 || goToEOF) {
// Current byte we're reading
int i=0;
// If we haven't finished off our buffer, or we're searching for eof
while(i < length || goToEOF) {
// we haven't written out the writeBuffer yet
fileWritten = false;
// If we have enough room in data to put another byte from the buffer,
// and if we haven't finished our buffer...
if(bitsLeft <= (8*(sizeof bitsLeft - 1)) && i != length) {
// Read the byte into data before the current data
data = (readBuffer[i] << bitsLeft) + data;
bitsLeft += 8;
i++;
} else {
// data & mask is used to find which glyph we have
glyph = lookupTable[data & mask];
// Shift over the data to get to the next code
data = data >> shifts[glyph];
bitsLeft -= shifts[glyph];
// If we haven't hit the eof glyph
if(glyph != 256) {
// put the glyph in the write buffer
writeBuffer[writeLocation] = glyph;
writeLocation++;
// If we've filled the write buffer, write it to the file.
if(writeLocation == BUFFER_LENGTH) {
writeLocation = 0;
fileWritten = true;
fout.write(reinterpret_cast<char*>(writeBuffer), BUFFER_LENGTH);
}
} else {
// We've hit the eof, and we're going to stop searching for it.
eofHit = true;
goToEOF = false;
}
}
}
}
// Loop until the eof glyph is found.
} while(!eofHit);
// If you haven't written everything out yet, write it out.
if(!fileWritten) {
fout.write(reinterpret_cast<char*>(writeBuffer), writeLocation);
}
// Flush all remaining data, and close the file.
fout.flush();
fout.close();
// Find the ending time, and output how long the program took.
end = clock();
cout << setprecision(3) << fixed;
cout << "Time: " << (double(end-start) / CLOCKS_PER_SEC) << " seconds." << endl;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment