Created
August 13, 2015 19:39
-
-
Save Jnesselr/5394363edb242caf0a2a to your computer and use it in GitHub Desktop.
Huffman compression and decompression (Puff) that was done for a school assignment and designed to be fast
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Name: Huff.cpp | |
Authors: Caleb Hopper and Justin Nesselrotte | |
Data: 10/27/2013 | |
Description: This program takes as input from the console a | |
file name. It then writes out a compressed file with the same | |
file name but a .huf extension. It does this by using the | |
Huffman algorithm to compress the data. | |
*/ | |
#include <iostream> | |
#include <string> | |
#include <fstream> | |
#include <ctime> | |
#include <iomanip> | |
#include <stack> | |
// Include the standard template library. | |
using namespace std; | |
// These are two buffers, and a buffer length. 8192 was chosen | |
// as an arbitrary size. readBuffer is used for reading the | |
// compressed file, and writeBuffer is used for the output file. | |
const int BUFFER_LENGTH = 16777216; | |
unsigned char readBuffer[BUFFER_LENGTH]; | |
unsigned char writeBuffer[BUFFER_LENGTH]; | |
// This is the structure that holds each node for the huffman table. | |
// It contains the glyph (0-256), the left and right pointers, the | |
// huffman code, and the bit length of that code (shift). | |
struct HuffEntry { | |
int glyph; | |
int left; | |
int right; | |
int frequency; | |
int code; | |
int shift; | |
}; | |
// HuffTable is just an array of HuffEntrys | |
typedef HuffEntry* HuffTable; | |
/* | |
Name: sort | |
Inputs: data - The huffman table | |
length - The length of the huffman table | |
Outputs: A sorted huffman table | |
Description: This function takes the huffman table, | |
and sorts it for use as a min heap. It uses bubble | |
sort, which is not the most efficient method, but | |
was chosen for simplicity. | |
*/ | |
void sort(HuffTable data, int length) { | |
bool flag = true; | |
for(int i=1; i<=length && flag; i++) { | |
flag = false; | |
for(int j=0; j<length-1; j++) { | |
if(data[j+1].frequency < data[j].frequency) | |
{ | |
std::swap(data[j], data[j+1]); | |
flag = true; | |
} | |
} | |
} | |
} | |
/* | |
Name: getMinIndex | |
Inputs: table - The huffman table | |
left - the index for the left node | |
right - the index for the right node | |
Outputs: int - the index of the smallest one, or left if they are equal | |
Description: This function is a convenience method. It will return the | |
smaller frequency between two node indexes. If they're the same frequency, | |
then it chooses the left one. | |
*/ | |
int inline getMinIndex(HuffTable table, int left, int right) { | |
if(table[left].frequency <= table[right].frequency) { | |
return left; | |
} else { | |
return right; | |
} | |
} | |
/* | |
Name: reheap | |
Inputs: table - The huffman table | |
changedNode - the last node that was changed in the heap. | |
length - the length of the heap. | |
Outputs: a valid min-heap | |
Description: This function takes a heap in the form of a huffman | |
table, and makes sure that all of the parent and child nodes around | |
changedNode are in the correct place. If they aren't, then it swaps | |
them and checks again. | |
*/ | |
void reheap(HuffTable table, int changedNode, int length) { | |
int parent = (changedNode-1)/2; | |
int left = (changedNode*2)+1; | |
int right = (changedNode*2)+2; | |
if(changedNode >= length || length == 1) | |
return; | |
if(table[parent].frequency > table[changedNode].frequency) { | |
std::swap(table[parent], table[changedNode]); | |
if(changedNode != 0) | |
reheap(table, parent, length); | |
} else { | |
if(right < length) { | |
if(table[changedNode].frequency > table[left].frequency && table[changedNode].frequency > table[right].frequency) { | |
if(table[left].frequency <= table[right].frequency) { | |
std::swap(table[changedNode], table[left]); | |
reheap(table, left, length); | |
} else { | |
std::swap(table[changedNode], table[right]); | |
reheap(table, right, length); | |
} | |
} else if(table[changedNode].frequency > table[left].frequency || table[changedNode].frequency > table[right].frequency) { | |
int min = getMinIndex(table, left, right); | |
std::swap(table[changedNode], table[min]); | |
reheap(table, min, length); | |
} | |
} else if(left < length) { | |
if(table[changedNode].frequency > table[left].frequency) { | |
std::swap(table[changedNode], table[left]); | |
} | |
} | |
} | |
} | |
/* | |
Name: createHuffTable | |
Inputs: table - The Huffman table | |
length - the number of glyphs the table starts with | |
Outputs: The huffman table with all frequency nodes. | |
Description: This function takes the min heap of glyphs, and | |
adds all of the frequncy nodes needed to complete the max heap | |
for the huffman algorithm. | |
*/ | |
void createHuffTable(HuffTable table, int length) { | |
// Sort the table for min heap | |
sort(table, length); | |
int h = length-1; | |
int f = length; | |
int m; | |
do { | |
// Choose the smaller of the two elements, or just the left one if we don't have a choice. | |
if(h >= 2) { | |
m = getMinIndex(table, 1, 2); // Left or right child | |
} else if(h >= 1) { | |
m = 1; // Only the left child is available | |
} | |
std::swap(table[m], table[f]); | |
if(m < h) { | |
std::swap(table[m], table[h]); | |
} | |
reheap(table, m, h); | |
std::swap(table[0], table[h]); | |
// Create a new frequency node | |
table[0].glyph = -1; | |
table[0].frequency = table[h].frequency + table[f].frequency; | |
table[0].left = h; | |
table[0].right = f; | |
table[0].code = 0; | |
table[0].shift = 0; | |
reheap(table, 0, h); | |
h--; | |
f++; | |
} while(f != 2*length-1); | |
} | |
/* | |
Name: assignCodes | |
Inputs: table - The Huffman Table | |
nodes - The shift and code for each table entry | |
shifts - An array that holds the shift value for each glyph. | |
shifts[glyph] is the bit length of glyph. | |
Outputs: None returned, but nodes and shifts are modified. | |
Description: This function takes the huffman table, and calculates | |
the huffman code for each glyph. It calculates them backwards to | |
speed up decompression. | |
*/ | |
void assignCodes(HuffTable table, int* code, int* shifts) { | |
// The stacks are used to prevent a recursive call. | |
// Index is where it is at in the table, and level is | |
// the level in the tree that it is at. | |
stack<int> nextNodes; | |
stack<int> nextLevel; | |
int index, level; | |
nextNodes.push(0); | |
nextLevel.push(0); | |
do { | |
index = nextNodes.top(); | |
nextNodes.pop(); | |
level = nextLevel.top(); | |
nextLevel.pop(); | |
int left = table[index].left; | |
int right = table[index].right; | |
if(table[index].glyph == -1) { | |
// Frequency node | |
// left and right shifts/codes are based on the current | |
// one at index. | |
table[left].shift = table[index].shift+1; | |
table[left].code = table[index].code; | |
nextNodes.push(left); | |
nextLevel.push(level+1); | |
table[right].shift = table[index].shift+1; | |
table[right].code = table[index].code + (1 << level); | |
nextNodes.push(right); | |
nextLevel.push(level+1); | |
} else { | |
// Codes should already be set | |
code[table[index].glyph] = table[index].code; | |
shifts[table[index].glyph] = table[index].shift; | |
} | |
// Continue until all nodes have been visited. | |
} while(!nextNodes.empty()); | |
} | |
/* | |
Name: compressFile | |
Inputs: filename - The name of the uncompressed file | |
fin - The file handler for the input file | |
table - The huffman table | |
codes - The huffman codes for each glyph used | |
shifts - The bit length of the huffman codes used | |
tableLength - The length of the huffman tablee | |
Outputs: Compressed file to fout | |
Description: This function finds the compressed file name, | |
and writes the compressed file to fout. It writes the name | |
of the original file, the huffman table, and then the data. | |
*/ | |
void compressFile(string filename, ifstream &fin, HuffTable table, int* codes, int* shifts, int tableLength) { | |
// filename is original filename. Need to change it to .huf here for fout | |
int lastIndex = filename.find_last_of("."); | |
string outputFile = filename.substr(0, lastIndex) + ".huf"; | |
ofstream fout(outputFile, ios::binary | ios::out); | |
// Write the name of the original input file | |
int stringLength = filename.length(); | |
fout.write((char*)&stringLength, sizeof stringLength); | |
fout.write(filename.c_str(), filename.length()); | |
// Write out the huffman table | |
fout.write((char*)&tableLength, sizeof tableLength); | |
for(int i=0; i<tableLength; i++) { | |
fout.write((char*)(&table[i]),12); // 12 is scud | |
} | |
// data holds the running data values. | |
unsigned int data = 0; | |
// location in writeBuffer | |
int writeLocation = 0; | |
// The number of bits that data takes up | |
int bitsUsed = 0; | |
// A constant thtat allows us to get just the last byte. | |
const unsigned int BYTE_MASK = 0xFF; | |
// A boolean to tell us if we need to write the buffer one last time at the end. | |
bool fileWritten = true; | |
// Length of data to read into readBuffer | |
streamsize length; | |
// Reset the input file | |
fin.clear(); | |
fin.seekg(0); | |
do { | |
// Read in buffer_length bytes or less, and find out how many we got. | |
fin.read(reinterpret_cast<char*>(readBuffer), BUFFER_LENGTH); | |
length = fin.gcount(); | |
// If we read bytes, loop through them byte by byte | |
if(length > 0) { | |
for(int i=0; i<length; i++) { | |
// we haven't written out the writeBuffer yet | |
fileWritten = false; | |
// The code for the glyph is found, and put at the left of all the other bits | |
// in data. | |
int glyph = readBuffer[i]; | |
data = (codes[glyph] << bitsUsed) + data; | |
bitsUsed += shifts[glyph]; | |
// Make sure data doesn't get any larger than 8 bits so there's | |
// room for the next bytes. | |
while(bitsUsed > 8) { | |
writeBuffer[writeLocation] = (unsigned char) data & BYTE_MASK; | |
data = data >> 8; | |
bitsUsed -= 8; | |
writeLocation++; | |
if(writeLocation == BUFFER_LENGTH) { | |
writeLocation = 0; | |
fileWritten = true; | |
fout.write(reinterpret_cast<char*>(writeBuffer), BUFFER_LENGTH); | |
} | |
} | |
} | |
} | |
} while(!fin.eof()); | |
cout << bitsUsed << endl; | |
// Lets add our EOF character | |
data = (codes[256] << bitsUsed) + data; | |
bitsUsed += shifts[256]; | |
// Get data down to a single byte length so we can write it out. | |
while(bitsUsed > 8) { | |
fileWritten = false; | |
writeBuffer[writeLocation] = (unsigned char) data & BYTE_MASK; | |
data = data >> 8; | |
bitsUsed -= 8; | |
writeLocation++; | |
if(writeLocation == BUFFER_LENGTH) { | |
writeLocation = 0; | |
fileWritten = true; | |
fout.write(reinterpret_cast<char*>(writeBuffer), BUFFER_LENGTH); | |
} | |
} | |
// Any leftover bits in data need to be written | |
if(bitsUsed > 0) { | |
writeBuffer[writeLocation] = data; | |
writeLocation++; | |
} | |
// Data left in writeBuffer at the end of the run | |
if(!fileWritten) { | |
fout.write(reinterpret_cast<char*>(writeBuffer), writeLocation); | |
} | |
// Flush and close the output stream. | |
fout.flush(); | |
fout.close(); | |
} | |
/* | |
Name: main | |
Inputs: None | |
Outputs: None | |
Description: This function takes the input file name, | |
reads the file, and then converts each glyph into its | |
huffman code before writing the compressed file out. | |
*/ | |
void main() { | |
string filename; | |
// How many times each glyph is used | |
int glyphs[257] = {0}; // 0-256 | |
// number of unique glyphs read from the file | |
int numberOfGlyphs = 0; | |
cout << "Please enter a file name: "; | |
cin >> filename; | |
// Open the file | |
ifstream fin(filename, ios::binary); | |
if(fin.fail()) { | |
cout << "Nope, file no es here" << endl; | |
return; | |
} | |
// Start the clock timer | |
clock_t start, end; | |
start = clock(); | |
// Read the input file, and increase the value of glyphs at the | |
// glyph index. | |
do { | |
fin.read(reinterpret_cast<char*>(readBuffer), BUFFER_LENGTH); | |
streamsize length = fin.gcount(); | |
if(length > 0) { | |
for(int i=0; i<length; i++) { | |
if(glyphs[readBuffer[i]] == 0) | |
numberOfGlyphs++; | |
glyphs[readBuffer[i]]++; | |
} | |
} | |
} while(!fin.eof()); | |
// Special case for End of File | |
glyphs[256] = 1; | |
numberOfGlyphs++; | |
// The frequency nodes + number of glyphs | |
int tableLength = 2*numberOfGlyphs - 1; | |
// Dynamically create a table | |
HuffTable table = new HuffEntry[tableLength]; | |
// These are the huffman codes for each glyph and their bit length | |
int codes[257]; | |
int shifts[257]; | |
// Our current position in the table | |
int tableIndex = 0; | |
// This adds the unique glyphs to the table | |
for(int i=0; i<257; i++) { | |
if(glyphs[i] > 0) { | |
table[tableIndex].frequency = glyphs[i]; | |
table[tableIndex].glyph = i; | |
table[tableIndex].left = -1; | |
table[tableIndex].right = -1; | |
codes[tableIndex] = 0; | |
shifts[tableIndex] = 0; | |
tableIndex++; | |
} | |
} | |
// Create the full huffman table | |
createHuffTable(table, numberOfGlyphs); | |
// Assign the huffman code to each of the glyphs | |
assignCodes(table, codes, shifts); | |
// Compress the file, writing its header and compressed data | |
compressFile(filename, fin, table, codes, shifts, tableLength); | |
end = clock(); | |
cout << setprecision(3) << fixed; | |
//outputTable(table,tableLength); | |
cout << "Time: " << (double(end-start) / CLOCKS_PER_SEC) << " seconds." << endl; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Name: Puff.cpp | |
Authors: Caleb Hopper and Justin Nesselrotte | |
Data: 10/27/2013 | |
Description: This program takes as input from the console a | |
file name. It then writes out an uncompressed file based on | |
the compressed input file. This is done by using the huffman | |
compression algorithm. | |
*/ | |
#include<iostream> | |
#include<iomanip> | |
#include<fstream> | |
#include<string> | |
#include<stack> | |
#include<ctime> | |
#include<vector> | |
// Include the standard template library. | |
using namespace std; | |
// These global variables are bad. Please forgive us our sins. | |
// This is the maximum bit length of any huffman code. It will | |
// be used to speed up decompression later | |
int maxShift = 0; | |
// These are two buffers, and a buffer length. 8192 was chosen | |
// as an arbitrary size. readBuffer is used for reading the | |
// compressed file, and writeBuffer is used for the output file. | |
const int BUFFER_LENGTH = 8192; | |
unsigned char readBuffer[BUFFER_LENGTH]; | |
unsigned char writeBuffer[BUFFER_LENGTH]; | |
// This is the structure that holds each node for the huffman table. | |
// It contains the glyph (0-256), and the left and right pointers. | |
struct Node | |
{ | |
int glyph; | |
int left; | |
int right; | |
}; | |
// This node is used to help find the huffman codes. It's kept | |
// separate from Node to make it easier to read the table from | |
// the file. | |
struct AssignNode { | |
int shift; | |
int code; | |
}; | |
/* | |
Name: assignCodes | |
Inputs: table - The Huffman Table | |
nodes - The shift and code for each table entry | |
shifts - An array that holds the shift value for each glyph. | |
shifts[glyph] is the bit length of glyph. | |
Outputs: None returned, but nodes and shifts are modified. | |
Description: This function takes the huffman table, and calculates | |
the huffman code for each glyph. It calculates them backwards to | |
speed up decompression. | |
*/ | |
void assignCodes(Node* table, AssignNode* nodes, int* shifts) { | |
stack<int> nextNodes; | |
stack<int> nextLevel; | |
nodes[0].code= 0; | |
nodes[0].shift = 0; | |
int index, level; | |
nextNodes.push(0); | |
nextLevel.push(0); | |
do { | |
index = nextNodes.top(); | |
nextNodes.pop(); | |
level = nextLevel.top(); | |
nextLevel.pop(); | |
int left = table[index].left; | |
int right = table[index].right; | |
if(table[index].glyph == -1) { | |
// Frequency node | |
nodes[left].shift = nodes[index].shift+1; | |
nodes[left].code = nodes[index].code; | |
nextNodes.push(left); | |
nextLevel.push(level+1); | |
nodes[right].shift = nodes[index].shift+1; | |
nodes[right].code = nodes[index].code + (1 << level); | |
nextNodes.push(right); | |
nextLevel.push(level+1); | |
} else { | |
// Codes should already be set | |
shifts[table[index].glyph] = nodes[index].shift; | |
if(nodes[index].shift > maxShift) | |
maxShift = nodes[index].shift; | |
} | |
} while(!nextNodes.empty()); | |
} | |
/* | |
Name: main | |
Inputs: None | |
Outputs: None | |
Description: This function takes the input file name, | |
reads the file, and then decompresses each huffman code | |
into it's glyph in the output file specified by the name | |
in the file header. | |
*/ | |
void main() | |
{ | |
// Number of nodes in the huffman table | |
int numberOfNodes = 0; | |
// Name of the input file | |
string nameOfFile = ""; | |
// Size of the uncompressed file name | |
int lengthOfFileName = 0; | |
// Character pointer to hold the uncompressed file name | |
char * fileName; | |
// Node pointer for our huffman tree/table | |
Node * huffmanTree; | |
// AssignNode pointer for our codes and their bit lengths | |
AssignNode * assignedNodes; | |
// The bit length for every glyph. | |
int shifts[257] = {0}; | |
// Integer pointer to the decompression lookup table | |
int* lookupTable; | |
// Get the name of the file | |
cout << "File name: "; | |
cin >> nameOfFile; | |
clock_t start,end; | |
// Start our timer | |
start = clock(); | |
//Begin the reading of the header. | |
ifstream fin(nameOfFile, ios::binary); | |
if(fin.fail()) { | |
cout << "No file!" << endl; | |
} | |
//Read in the file's name. | |
fin.read((char*)&lengthOfFileName, sizeof lengthOfFileName); | |
fileName = new char[lengthOfFileName+1]; | |
fin.read(fileName, lengthOfFileName); | |
fileName[lengthOfFileName] = '\0'; | |
//Read in the Huffman Table. | |
fin.read((char*)&numberOfNodes, sizeof numberOfNodes); | |
huffmanTree = new Node[numberOfNodes]; | |
assignedNodes = new AssignNode[numberOfNodes]; | |
fin.read((char*)huffmanTree, (12 * numberOfNodes)); | |
// Find the codes and their bit length for all of the glyphs | |
assignCodes(huffmanTree, assignedNodes, shifts); | |
// Create a lookup table that is 2^maxShift entries long | |
lookupTable = new int[((int)std::pow(2,maxShift))]; | |
/* | |
These for loops fill the lookupTable with entries. For example, if | |
maxShift is 5, and the current glyph is 101, then this will store | |
the glyph into the lookup table at 00101, 01101, 10101, and 11101. | |
This allows us to read the 5 least significant bits, and determine | |
what the glyph is quickly. | |
*/ | |
for(int i=0; i<numberOfNodes; i++) { | |
if(huffmanTree[i].glyph != -1) { | |
int shift = assignedNodes[i].shift; | |
int numPossible = (int) std::pow(2 , maxShift - shift); | |
for(int j=0; j<numPossible; j++) { | |
int code = (j << shift) + assignedNodes[i].code; | |
lookupTable[code] = huffmanTree[i].glyph; | |
} | |
} | |
} | |
// output file handler | |
ofstream fout(fileName, ios::binary); | |
// data holds the running data values. | |
unsigned int data = 0; | |
// Location in writeBuffer | |
int writeLocation = 0; | |
// The number of bits that are left to read from data. | |
int bitsLeft = 0; | |
// A constant that allows us to get just the last byte. | |
const unsigned int BYTE_MASK = 0xFF; | |
// A boolean to tell us if we need to write the buffer one last time at the end. | |
bool fileWritten = true; | |
// A boolean to tell us if we have hit the eof glyph yet. | |
bool eofHit = false; | |
// A boolean to tell us if we should expect the eof sometime soon, or if we should | |
// continue to read from the input file. | |
bool goToEOF = false; | |
// Holder for the glyph. | |
int glyph = 0; | |
// Length of the data read into readBuffer. | |
streamsize length; | |
// mask is just a bit mask to let us look up the glyph in the lookup table from data. | |
int mask = (1 << maxShift) - 1; | |
do { | |
// Read in buffer_length bytes or less, and find out how many we got. | |
fin.read(reinterpret_cast<char*>(readBuffer), BUFFER_LENGTH); | |
length = fin.gcount(); | |
// If we read in less than the max buffer size, or it's the end of | |
// the input file, then we need to be on the lookout for the eof glyph | |
if(length < BUFFER_LENGTH || fin.eof()) | |
goToEOF = true; | |
// If we read in some bytes, or we're still searching for eof | |
if(length > 0 || goToEOF) { | |
// Current byte we're reading | |
int i=0; | |
// If we haven't finished off our buffer, or we're searching for eof | |
while(i < length || goToEOF) { | |
// we haven't written out the writeBuffer yet | |
fileWritten = false; | |
// If we have enough room in data to put another byte from the buffer, | |
// and if we haven't finished our buffer... | |
if(bitsLeft <= (8*(sizeof bitsLeft - 1)) && i != length) { | |
// Read the byte into data before the current data | |
data = (readBuffer[i] << bitsLeft) + data; | |
bitsLeft += 8; | |
i++; | |
} else { | |
// data & mask is used to find which glyph we have | |
glyph = lookupTable[data & mask]; | |
// Shift over the data to get to the next code | |
data = data >> shifts[glyph]; | |
bitsLeft -= shifts[glyph]; | |
// If we haven't hit the eof glyph | |
if(glyph != 256) { | |
// put the glyph in the write buffer | |
writeBuffer[writeLocation] = glyph; | |
writeLocation++; | |
// If we've filled the write buffer, write it to the file. | |
if(writeLocation == BUFFER_LENGTH) { | |
writeLocation = 0; | |
fileWritten = true; | |
fout.write(reinterpret_cast<char*>(writeBuffer), BUFFER_LENGTH); | |
} | |
} else { | |
// We've hit the eof, and we're going to stop searching for it. | |
eofHit = true; | |
goToEOF = false; | |
} | |
} | |
} | |
} | |
// Loop until the eof glyph is found. | |
} while(!eofHit); | |
// If you haven't written everything out yet, write it out. | |
if(!fileWritten) { | |
fout.write(reinterpret_cast<char*>(writeBuffer), writeLocation); | |
} | |
// Flush all remaining data, and close the file. | |
fout.flush(); | |
fout.close(); | |
// Find the ending time, and output how long the program took. | |
end = clock(); | |
cout << setprecision(3) << fixed; | |
cout << "Time: " << (double(end-start) / CLOCKS_PER_SEC) << " seconds." << endl; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment