Created
May 14, 2013 14:17
-
-
Save takawitter/5576257 to your computer and use it in GitHub Desktop.
ちょっとLZSSに浮気。http://ja.wikipedia.org/wiki/LZSS を参考に、さらに一致文字数が2文字未満の場合は無視するようにした。compress6.6秒、363万文字+282byte(bit列サイズ)になった。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.IOException; | |
import java.util.BitSet; | |
import java.util.HashMap; | |
import java.util.Iterator; | |
import java.util.LinkedList; | |
import java.util.List; | |
import java.util.Map; | |
public class LZSS { | |
public static class LZSSData{ | |
public LZSSData(BitSet match, StringBuilder dest, int size) { | |
this.match = match; | |
this.dest = dest; | |
this.size = size; | |
} | |
private BitSet match = new BitSet(); | |
private StringBuilder dest = new StringBuilder(); | |
private int size; | |
} | |
public static LZSSData compress(CharSequence src, int windowSize) | |
throws IOException{ | |
BitSet match = new BitSet(); | |
StringBuilder out = new StringBuilder(); | |
int size = 0; | |
Map<Character, List<Integer>> startPoss = new HashMap<Character, List<Integer>>(); | |
int n = src.length(); | |
for(int i = 0; i < n; i++){ | |
char target = src.charAt(i); | |
// find longest match | |
boolean found = false; | |
int start = 0; | |
int matchLen = 0; | |
List<Integer> poss = startPoss.get(target); | |
if(poss != null){ | |
Iterator<Integer> it = poss.iterator(); | |
while(it.hasNext()){ | |
int s = it.next(); | |
if((i - s) > windowSize){ | |
it.remove(); | |
continue; | |
} | |
int len = getMatchedLen(src, s + 1, i + 1, n) + 1; | |
if(len > matchLen){ | |
start = i - s; | |
matchLen = len; | |
} | |
found = true; | |
} | |
poss.add(i); | |
int jn = Math.min(i + matchLen, n); | |
for(int j = i + 1; j < jn; j++){ | |
List<Integer> p = startPoss.get(src.charAt(j)); | |
if(p == null){ | |
p = new LinkedList<Integer>(); | |
startPoss.put(src.charAt(j), p); | |
} | |
p.add(j); | |
} | |
} else{ | |
poss = new LinkedList<Integer>(); | |
poss.add(i); | |
startPoss.put(target, poss); | |
} | |
if(found && matchLen > 1){ | |
match.set(size); | |
out.append((char)start) | |
.append((char)matchLen); | |
i += matchLen - 1; | |
} else{ | |
match.set(size, false); | |
out.append(target); | |
} | |
size++; | |
} | |
return new LZSSData(match, out, size); | |
} | |
public static void decompress(LZSSData src, StringBuilder out){ | |
int index = 0; | |
int n = src.size; | |
for(int i = 0; i < n; i++){ | |
if(src.match.get(i)){ | |
int start = src.dest.charAt(index++); | |
int matchedLen = src.dest.charAt(index++); | |
int s = out.length() - start; | |
int e = s + matchedLen; | |
for(; s < e; s++){ | |
out.append(out.charAt(s)); | |
} | |
} else{ | |
out.append(src.dest.charAt(index++)); | |
} | |
} | |
} | |
private static int getMatchedLen(CharSequence src, int i1, int i2, int end){ | |
int n = Math.min(i2 - i1, end - i2); | |
for(int i = 0; i < n; i++){ | |
if(src.charAt(i1++) != src.charAt(i2++)) return i; | |
} | |
return 0; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment