Created
May 14, 2013 00:12
-
-
Save takawitter/5572607 to your computer and use it in GitHub Desktop.
「高速文字列解析の世界」を参考に、LZ77実装してみた。非常に単純な実装で、compress1はマッチする文字列を元配列をなめて求める方式。compress2は文字出現位置をマップで持つ方式。
Wikipedia日本語タイトルをTrieに格納してできたTail配列に対してやってみると、513万文字が435万文字になった。
compress1が31.7秒、compress2が8.9秒。ウィンドウサイズは8192。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.IOException; | |
import java.util.HashMap; | |
import java.util.Iterator; | |
import java.util.LinkedList; | |
import java.util.List; | |
import java.util.Map; | |
public class LZ77 { | |
public static void compress1(CharSequence src, Appendable out, int windowSize) | |
throws IOException{ | |
int n = src.length(); | |
for(int i = 0; i < n; i++){ | |
char target = src.charAt(i); | |
// find longest match | |
boolean found = false; | |
int start = 0; | |
int matchLen = 0; | |
char nonMatchChar = 0xff; | |
for(int s = Math.max(0, i - windowSize); s < i; s++){ | |
if(target == src.charAt(s)){ | |
int len = getMatchedLen(src, s + 1, i + 1, n) + 1; | |
if(len > matchLen){ | |
start = i - s; | |
matchLen = len; | |
nonMatchChar = (char)0xff; | |
if((i + matchLen) < n){ | |
nonMatchChar = src.charAt(i + matchLen); | |
} | |
} | |
found = true; | |
} | |
} | |
if(found){ | |
out.append((char)start) | |
.append((char)matchLen) | |
.append(nonMatchChar); | |
i += matchLen; | |
} else{ | |
out.append((char)0x00).append((char)0x00).append(target); | |
} | |
} | |
} | |
public static void compress2(CharSequence src, Appendable out, int windowSize) | |
throws IOException{ | |
Map<Character, List<Integer>> startPoss = new HashMap<Character, List<Integer>>(); | |
int n = src.length(); | |
for(int i = 0; i < n; i++){ | |
char target = src.charAt(i); | |
// find longest match | |
boolean found = false; | |
int start = 0; | |
int matchLen = 0; | |
char nonMatchChar = 0xff; | |
List<Integer> poss = startPoss.get(target); | |
if(poss != null){ | |
Iterator<Integer> it = poss.iterator(); | |
while(it.hasNext()){ | |
int s = it.next(); | |
if((i - s) > windowSize){ | |
it.remove(); | |
continue; | |
} | |
int len = getMatchedLen(src, s + 1, i + 1, n) + 1; | |
if(len > matchLen){ | |
start = i - s; | |
matchLen = len; | |
nonMatchChar = (char)0xff; | |
if((i + matchLen) < n){ | |
nonMatchChar = src.charAt(i + matchLen); | |
} | |
} | |
found = true; | |
} | |
poss.add(i); | |
int jn = Math.min(i + matchLen + 1, n); | |
for(int j = i + 1; j < jn; j++){ | |
List<Integer> p = startPoss.get(src.charAt(j)); | |
if(p == null){ | |
p = new LinkedList<Integer>(); | |
startPoss.put(src.charAt(j), p); | |
} | |
p.add(j); | |
} | |
} else{ | |
poss = new LinkedList<Integer>(); | |
poss.add(i); | |
startPoss.put(target, poss); | |
} | |
if(found){ | |
out.append((char)start) | |
.append((char)matchLen) | |
.append(nonMatchChar); | |
i += matchLen; | |
} else{ | |
out.append((char)0x00).append((char)0x00).append(target); | |
} | |
} | |
} | |
private static int getMatchedLen(CharSequence src, int i1, int i2, int end){ | |
int n = Math.min(i2 - i1, end - i2); | |
for(int i = 0; i < n; i++){ | |
if(src.charAt(i1++) != src.charAt(i2++)) return i; | |
} | |
return 0; | |
} | |
public static void decompress(CharSequence src, StringBuilder out){ | |
int n = src.length(); | |
for(int i = 0; i < n; i += 3){ | |
int start = src.charAt(i); | |
int matchedLen = src.charAt(i + 1); | |
char nonMatchChar = src.charAt(i + 2); | |
if(start != 0){ | |
int s = out.length() - start; | |
int e = s + matchedLen; | |
for(; s < e; s++){ | |
out.append(out.charAt(s)); | |
} | |
} | |
if(nonMatchChar != 0xff){ | |
out.append(nonMatchChar); | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment