Skip to content

Instantly share code, notes, and snippets.

@takawitter
Created May 14, 2013 14:17
Show Gist options
  • Save takawitter/5576257 to your computer and use it in GitHub Desktop.
Save takawitter/5576257 to your computer and use it in GitHub Desktop.
ちょっとLZSSに浮気。http://ja.wikipedia.org/wiki/LZSS を参考に、さらに一致文字数が2文字未満の場合は無視するようにした。compress6.6秒、363万文字+282byte(bit列サイズ)になった。
import java.io.IOException;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
public class LZSS {
public static class LZSSData{
public LZSSData(BitSet match, StringBuilder dest, int size) {
this.match = match;
this.dest = dest;
this.size = size;
}
private BitSet match = new BitSet();
private StringBuilder dest = new StringBuilder();
private int size;
}
public static LZSSData compress(CharSequence src, int windowSize)
throws IOException{
BitSet match = new BitSet();
StringBuilder out = new StringBuilder();
int size = 0;
Map<Character, List<Integer>> startPoss = new HashMap<Character, List<Integer>>();
int n = src.length();
for(int i = 0; i < n; i++){
char target = src.charAt(i);
// find longest match
boolean found = false;
int start = 0;
int matchLen = 0;
List<Integer> poss = startPoss.get(target);
if(poss != null){
Iterator<Integer> it = poss.iterator();
while(it.hasNext()){
int s = it.next();
if((i - s) > windowSize){
it.remove();
continue;
}
int len = getMatchedLen(src, s + 1, i + 1, n) + 1;
if(len > matchLen){
start = i - s;
matchLen = len;
}
found = true;
}
poss.add(i);
int jn = Math.min(i + matchLen, n);
for(int j = i + 1; j < jn; j++){
List<Integer> p = startPoss.get(src.charAt(j));
if(p == null){
p = new LinkedList<Integer>();
startPoss.put(src.charAt(j), p);
}
p.add(j);
}
} else{
poss = new LinkedList<Integer>();
poss.add(i);
startPoss.put(target, poss);
}
if(found && matchLen > 1){
match.set(size);
out.append((char)start)
.append((char)matchLen);
i += matchLen - 1;
} else{
match.set(size, false);
out.append(target);
}
size++;
}
return new LZSSData(match, out, size);
}
public static void decompress(LZSSData src, StringBuilder out){
int index = 0;
int n = src.size;
for(int i = 0; i < n; i++){
if(src.match.get(i)){
int start = src.dest.charAt(index++);
int matchedLen = src.dest.charAt(index++);
int s = out.length() - start;
int e = s + matchedLen;
for(; s < e; s++){
out.append(out.charAt(s));
}
} else{
out.append(src.dest.charAt(index++));
}
}
}
private static int getMatchedLen(CharSequence src, int i1, int i2, int end){
int n = Math.min(i2 - i1, end - i2);
for(int i = 0; i < n; i++){
if(src.charAt(i1++) != src.charAt(i2++)) return i;
}
return 0;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment