Created
November 14, 2012 14:14
-
-
Save uwi/4072319 to your computer and use it in GitHub Desktop.
Suffix Automaton
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package utils.structure; | |
import java.util.Arrays; | |
import java.util.BitSet; | |
/** | |
* Suffix Automaton | |
* @author uwi | |
* see内のすべてを実装 | |
* @see http://e-maxx.ru/algo/suffix_automata | |
* @verified CF146C | |
*/ | |
public class SuffixAutomaton { | |
public int size; | |
public int[] len; // initialからの最短経路長 | |
public int[] link; // Failure Link | |
public int[][] next; // つぎ | |
public int[] original; // clonedの場合original | |
private SuffixAutomaton(int sz) { | |
size = sz; | |
len = new int[sz]; | |
link = new int[sz]; | |
next = new int[sz][]; | |
original = new int[sz]; | |
} | |
public static int enc(char c) { return c - 'a'; } | |
public static char dec(int n) { return (char)('a'+n); } | |
/** | |
* Suffix Automatonを作成し、トポロジカルソートして返す。 | |
* 1文字加える毎にcloneはたかだか1個しか作成されないので全体で2|a|-1個以下しかノードがない。 | |
* @param a | |
* @return | |
*/ | |
public static SuffixAutomaton buildSuffixAutomaton(char[] a) { | |
int n = a.length; | |
int[] len = new int[2*n]; | |
int[] link = new int[2*n]; | |
int[][] next = new int[2*n][26]; | |
int[] original = new int[2*n]; | |
Arrays.fill(link, -1); | |
for(int i = 0;i < 2*n;i++){ | |
Arrays.fill(next[i], -1); | |
} | |
Arrays.fill(original, -1); | |
len[0] = 0; | |
link[0] = -1; | |
int last = 0; | |
int sz = 1; | |
// extend | |
for(char c : a){ | |
int v = enc(c); | |
int cur = sz++; | |
len[cur] = len[last] + 1; | |
int p; | |
for(p = last; p != -1 && next[p][v] == -1; p = link[p]){ | |
next[p][v] = cur; | |
} | |
if(p == -1){ | |
link[cur] = 0; | |
}else{ | |
int q = next[p][v]; | |
if(len[p] + 1 == len[q]){ | |
link[cur] = q; | |
}else{ | |
int clone = sz++; | |
original[clone] = original[q] != -1 ? original[q] : q; | |
len[clone] = len[p]+1; | |
System.arraycopy(next[q], 0, next[clone], 0, next[q].length); | |
link[clone] = link[q]; | |
for(;p != -1 && next[p][v] == q; p = link[p]){ | |
next[p][v] = clone; | |
} | |
link[q] = link[cur] = clone; | |
} | |
} | |
last = cur; | |
} | |
// topological sort | |
int[] nct = new int[sz]; | |
for(int i = 0;i < sz;i++){ | |
for(int e : next[i]){ | |
if(e != -1)nct[e]++; | |
} | |
} | |
int[] ord = new int[sz]; | |
int p = 1; | |
ord[0] = 0; | |
for(int r = 0;r < p;r++){ | |
for(int e : next[ord[r]]){ | |
if(e != -1 && --nct[e] == 0)ord[p++] = e; | |
} | |
} | |
int[] iord = new int[sz]; | |
for(int i = 0;i < sz;i++)iord[ord[i]] = i; | |
SuffixAutomaton sa = new SuffixAutomaton(sz); | |
for(int i = 0;i < sz;i++){ | |
sa.len[i] = len[ord[i]]; | |
sa.link[i] = link[ord[i]] != -1 ? iord[link[ord[i]]] : -1; | |
sa.next[i] = next[ord[i]]; | |
for(int j = 0;j < sa.next[i].length;j++)sa.next[i][j] = sa.next[i][j] != -1 ? iord[sa.next[i][j]] : -1; | |
sa.original[i] = original[ord[i]] != -1 ? iord[original[ord[i]]] : -1; | |
} | |
return sa; | |
} | |
public String toString() | |
{ | |
StringBuilder sb = new StringBuilder(); | |
for(int i = 0;i < size;i++){ | |
sb.append("{"); | |
sb.append(i).append("|"); | |
sb.append("len:").append(len[i]).append(", "); | |
sb.append("link:").append(link[i]).append(", "); | |
sb.append("original:").append(original[i]).append(", "); | |
sb.append("next:{"); | |
for(int j = 0;j < 26;j++){ | |
if(next[i][j] != -1){ | |
sb.append(dec(j)).append(":").append(next[i][j]).append(","); | |
} | |
} | |
sb.append("}}\n"); | |
} | |
return sb.toString(); | |
} | |
/** | |
* @param str | |
* @return 1:saはstrを含む, -x:saはstrのx文字目まで含む | |
*/ | |
public int contains(char[] str) | |
{ | |
int cur = 0; | |
for(int i = 0;i < str.length;i++){ | |
int nex = next[cur][enc(str[i])]; | |
if(nex == -1)return -i; | |
cur = nex; | |
} | |
return 1; | |
} | |
/** | |
* 相異なる連続部分文字列の個数 | |
* @return | |
*/ | |
public long numberOfDistinctSubstrings() | |
{ | |
long[] dp = new long[size]; | |
for(int i = size-1;i >= 0;i--){ | |
dp[i] = 1; | |
for(int e : next[i]){ | |
if(e != -1)dp[i] += dp[e]; | |
} | |
} | |
return dp[0]-1; // remove empty | |
} | |
/** | |
* 相異なる連続部分文字列の長さの総和 | |
* @return | |
*/ | |
public long totalLengthOfDistinctSubstrings() | |
{ | |
long[] dp = new long[size]; | |
int[] count = new int[size]; | |
for(int i = size-1;i >= 0;i--){ | |
count[i] = 1; | |
dp[i] = 0; | |
for(int e : next[i]){ | |
if(e != -1){ | |
count[i] += count[e]; | |
dp[i] += dp[e] + count[e]; | |
} | |
} | |
} | |
return dp[0]; | |
} | |
/** | |
* 辞書順でK番目に小さい相異なる連続部分文字列を返す | |
* @param K (1-indexed) | |
* @return | |
*/ | |
public String kthDistinctSubstring(long K) | |
{ | |
if(K <= 0)return null; | |
long[] dp = new long[size]; | |
for(int i = size-1;i >= 0;i--){ | |
dp[i] = 1; | |
for(int e : next[i]){ | |
if(e != -1)dp[i] += dp[e]; | |
} | |
} | |
if(K > dp[0]-1)return null; | |
StringBuilder sb = new StringBuilder(); | |
int cur = 0; | |
K++; | |
while(true){ | |
if(K == 1)break; | |
K--; | |
for(int i = 0;i < next[cur].length;i++){ | |
int nex = next[cur][i]; | |
if(nex != -1){ | |
long nct = dp[nex]; | |
if(K <= nct){ | |
sb.append(dec(i)); | |
cur = nex; | |
break; | |
}else{ | |
K -= nct; | |
} | |
} | |
} | |
} | |
return sb.toString(); | |
} | |
/** | |
* シフトした文字列のうち辞書順最小のものを返す。 | |
* s+sに対しSAを構築してgreedyに最小|s|文字。 | |
* @param s | |
* @return | |
*/ | |
public static char[] smallestCyclicShift(char[] s) | |
{ | |
char[] ss = new char[s.length*2]; | |
System.arraycopy(s, 0, ss, 0, s.length); | |
System.arraycopy(s, 0, ss, s.length, s.length); | |
SuffixAutomaton sa = buildSuffixAutomaton(ss); | |
int cur = 0; | |
char[] ret = new char[s.length]; | |
for(int i = 0;i < s.length;i++){ | |
for(int j = 0;j < sa.next[cur].length;j++){ | |
int nex = sa.next[cur][j]; | |
if(nex != -1){ | |
ret[i] = dec(j); | |
cur = nex; | |
break; | |
} | |
} | |
} | |
return ret; | |
} | |
/** | |
* 各nodeについて、initial->nodeの連続部分文字列が元の文字列に何回出現するかカウントした配列を返す。 | |
* cloneされていないノードに1をつけて、len降順に走査してlinkに加算していく。 | |
* 同じ文字が現れた時最初のcloneにlinkが集まる仕掛けになっている。あとはlen降順で走査すればオリジナル文字列上のカウントが集まってくる。 | |
* @return | |
*/ | |
public int[] enumNumberOfOccurs() | |
{ | |
int n = size; | |
int[] dp = new int[n]; | |
for(int i = n-1;i >= 1;i--){ | |
if(original[i] == -1)dp[i] += 1; | |
dp[link[i]] += dp[i]; | |
} | |
return dp; | |
} | |
public int track(char[] str) | |
{ | |
int cur = 0; | |
for(int i = 0;i < str.length;i++){ | |
int nex = next[cur][enc(str[i])]; | |
if(nex == -1)return -1; | |
cur = nex; | |
} | |
return cur; | |
} | |
/** | |
* 初回出現位置 | |
* 非cloneの場合はlen-1, cloneの場合はoriginalのlen-1が部分列の末尾位置となる。 | |
* @param q | |
* @return | |
*/ | |
public int indexOf(char[] q) | |
{ | |
int cur = 0; | |
for(int i = 0;i < q.length;i++){ | |
int nex = next[cur][enc(q[i])]; | |
if(nex == -1)return -1; | |
cur = nex; | |
} | |
if(original[cur] != -1)cur = original[cur]; | |
return len[cur]-1 - q.length + 1; | |
} | |
public int[][] ilinks() | |
{ | |
int n = size; | |
int[] ip = new int[n]; | |
for(int i = 1;i < n;i++)ip[link[i]]++; | |
int[][] ret = new int[n][]; | |
for(int i = 0;i < n;i++)if(ip[i] > 0)ret[i] = new int[ip[i]]; | |
for(int i = 1;i < n;i++)ret[link[i]][--ip[link[i]]] = i; | |
return ret; | |
} | |
/** | |
* 出現位置全列挙 | |
* failure linkを逆にたどり、非cloneのものについて打点する。 | |
* @param q | |
* @param ilinks | |
* @return | |
*/ | |
public BitSet indexOfAll(char[] q, int[][] ilinks) | |
{ | |
BitSet ret = new BitSet(); | |
int cur = track(q); | |
if(cur == -1)return ret; | |
dfsIndexOfAll(cur, q.length, ilinks, ret); | |
return ret; | |
} | |
public void dfsIndexOfAll(int cur, int qlen, int[][] ilinks, BitSet bs) | |
{ | |
if(original[cur] == -1)bs.set(len[cur]-qlen); | |
if(ilinks[cur] != null)for(int e : ilinks[cur])dfsIndexOfAll(e, qlen, ilinks, bs); | |
} | |
/** | |
* 出現しない長さ最小-辞書順最小の文字列を返す。 | |
* @return | |
* NOT TESTED | |
*/ | |
public char[] nonOccuringString() | |
{ | |
int n = size; | |
int[] dp = new int[n]; | |
int[] prev = new int[n]; | |
Arrays.fill(prev, -1); | |
for(int i = n-1;i >= 0;i--){ | |
dp[i] = n+1; | |
for(int j = 0;j < next[i].length;j++){ | |
int e = next[i][j]; | |
int v = e != -1 ? dp[e] : 0; | |
if(v < dp[i]){ | |
dp[i] = v; | |
prev[i] = j; | |
} | |
} | |
dp[i] = prev[i] == -1 ? 0 : dp[i]+1; | |
} | |
int cur = 0; | |
char[] ret = new char[dp[0]]; | |
for(int i = 0;i < dp[0];i++, cur = next[cur][prev[cur]]){ | |
ret[i] = dec(prev[cur]); | |
} | |
return ret; | |
} | |
/** | |
* tとの連続文字列でのLCSのうち、Tで最も若い位置にあるものを得る。 | |
* saをtrieとみなし、Tで検索している感じ。 | |
* @param t | |
* @return | |
*/ | |
public char[] lcs(char[] t) | |
{ | |
if(t.length == 0)return new char[0]; | |
int v = 0, l = 0, best = 0, bestPos = 0; | |
for(int i = 0;i < t.length;i++){ | |
int e = enc(t[i]); | |
while(v != 0 && next[v][e] == -1){ | |
v = link[v]; | |
l = len[v]; | |
} | |
if(next[v][e] != -1){ | |
v = next[v][e]; | |
l++; | |
} | |
if(l > best){ | |
best = l; bestPos = i; | |
} | |
} | |
return Arrays.copyOfRange(t, bestPos-best+1, bestPos+1); | |
} | |
/** | |
* 複数Stringの連続文字列LCS. 異なるdelimiterでjoinしてSuffix Automatonをつくったのち、 | |
* nextでたどっていった際どのdelimiterへも直接たどりつけるようなノードのうち最大のlenのものを返す。 | |
* O(sum |S|*W) | |
* @param strs | |
* @return | |
* NOT TESTED | |
*/ | |
public static char[] lcs(char[][] strs) | |
{ | |
int m = strs.length; | |
StringBuilder sb = new StringBuilder(); | |
char delim = (char)('z'-(m-1)); | |
for(char[] str : strs){ | |
sb.append(new String(str)).append(delim++); | |
} | |
SuffixAutomaton sa = buildSuffixAutomaton(sb.toString().toCharArray()); | |
int n = sa.size; | |
int[] dp = new int[n]; | |
int besti = -1; | |
for(int i = n-1;i >= 0;i--){ | |
for(int j = 0;j < sa.next[i].length;j++){ | |
int nex = sa.next[i][j]; | |
if(nex != -1){ | |
if(j >= 25-(m-1) && j <= 25){ | |
// delim | |
dp[i] |= 1<<j-(25-(m-1)); | |
}else{ | |
// if(i >= nex){ | |
// U.tr(i, nex); | |
// } | |
dp[i] |= dp[nex]; | |
} | |
} | |
} | |
// U.tr(i, dp[i]); | |
if(dp[i] == (1<<m)-1){ | |
besti = i; | |
break; | |
} | |
} | |
if(besti == -1)return new char[0]; | |
char[] ret = new char[sa.len[besti]]; | |
int cur = besti; | |
int pos = sa.len[besti]-1; | |
for(int j = besti-1;j >= 0;j--){ | |
for(int k = 0;k < sa.next[j].length;k++){ | |
if(sa.next[j][k] == cur && sa.len[j] + 1 == sa.len[cur]){ | |
ret[pos--] = dec(k); | |
cur = j; | |
break; | |
} | |
} | |
} | |
return ret; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment