Created
July 6, 2013 20:33
-
-
Save amintos/5941184 to your computer and use it in GitHub Desktop.
Simple LZ-Like compression in Java.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** Fast and simple text compression. | |
* | |
* This is a modified variant of LZ77: | |
* The algorithm inserts "marker bytes" whose bits state which | |
* of the following 8 symbols are actual symbols (0) or backward | |
* references (1). Backward references are actually two bytes, the | |
* first 6 bits encoding the length and the next 10 bits encoding | |
* the negative offset where the repetition occurred. | |
* The idea is related to LZO and LZJB compression. | |
* | |
* To detect backward references, the position of each trigram is | |
* stored to a limited-size hash table, eventually being | |
* overwritten by hash collisions. | |
* | |
* | |
* | |
* */ | |
public class Compressor { | |
private static final int TABLE_SIZE = 256; | |
private static final int MATCH_BITS = 6; | |
private static final int MATCH_MIN = 3; | |
private static final int MATCH_MAX = | |
((1 << MATCH_BITS) + (MATCH_MIN - 1)); | |
private static final int OFFSET_MASK = | |
((1 << (16 - MATCH_BITS)) - 1); | |
public static String compress(String text) { | |
int[] table = new int[TABLE_SIZE]; | |
char[] input = text.toCharArray(); | |
StringBuffer output = new StringBuffer(); | |
int len = input.length; | |
int src = 0; | |
int dst = 0; | |
int cpy = 0; | |
int copymask = 1 << 7; // next indicator bit to be set | |
int copymap = 0; // last indicator byte | |
for (int i = 0; i < TABLE_SIZE; ++i) { | |
table[i] = 0xdeadbeef; | |
} | |
while (src < len) { | |
copymask <<= 1; | |
if (copymask == 1 << 8) { | |
// need next indicator byte. | |
copymask = 1; | |
copymap = dst; | |
output.append((char)0); | |
dst++; | |
} | |
if (src > len - MATCH_MAX) { | |
// better not match past this point | |
output.append(input[src++]); | |
dst++; | |
continue; | |
} | |
// simple hash function | |
int hash = (((int)(input[src]) + 13) ^ | |
((int)(input[src + 1]) - 13) ^ | |
((int)(input[src + 2]))) % TABLE_SIZE; | |
// hash-table lookup | |
int offset = (src - table[hash]) & OFFSET_MASK; | |
// store current trigram position to hash table | |
table[hash] = src; | |
cpy = src - offset; | |
// check whether real match found or overwritten table entry | |
if (cpy >= 0 && cpy != src && | |
input[src] == input[cpy] && | |
input[src + 1] == input[cpy + 1] && | |
input[src + 2] == input[cpy + 2]) { | |
// toggle indicator bit in indicator byte. | |
output.setCharAt(copymap, (char) (output.charAt(copymap) | copymask)); | |
// do forward matching from there | |
int mlen; | |
for (mlen = MATCH_MIN; mlen < MATCH_MAX; ++mlen) { | |
if (input[src + mlen] != input[cpy + mlen]) break; | |
} | |
// output 2 bytes, 6 bit length + 10 bit offset | |
output.append( (char)( | |
((mlen - MATCH_MIN ) << (8 - MATCH_BITS)) | | |
(offset >> 8) )); | |
output.append( (char)(offset & 0xff) ); | |
src += mlen; | |
dst += 2; | |
// no match, just emit uncompressed symbol | |
} else { | |
output.append(input[src++]); | |
dst++; | |
} | |
} | |
return output.toString(); | |
} | |
public static String decompress(String source) { | |
char[] input = source.toCharArray(); | |
StringBuffer output = new StringBuffer(); | |
int src = 0; | |
int dst = 0; | |
int cpy = 0; | |
int copymask = 1 << 7; | |
int copymap = 0; | |
int len = input.length; | |
while (src < len) { | |
copymask <<= 1; | |
if (copymask == 1 << 8) { | |
// indicator byte. just read it. | |
copymask = 1; | |
copymap = input[src++]; | |
} | |
// indicator byte indicates a backward reference here | |
if ((copymap & copymask) > 0) { | |
// read and decode length and offset of reference | |
int mlen = (input[src] >> (8 - MATCH_BITS)) + MATCH_MIN; | |
int offset = ((input[src] << 8) | input[src + 1]) & OFFSET_MASK; | |
src += 2; | |
cpy = dst - offset; | |
// copy all bytes from reference | |
if (cpy >= 0) { | |
while (--mlen >= 0) { | |
output.append(output.charAt(cpy)); | |
cpy++; | |
dst++; | |
} | |
} else { | |
// stream broken. just emit what we have and cancel. | |
return output.toString(); | |
//throw new RuntimeException("Compressed data corrupted"); | |
} | |
// no match, just emit uncompressed symbol | |
} else { | |
output.append(input[src++]); | |
dst++; | |
} | |
} | |
return output.toString(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment