Created
November 23, 2014 17:34
-
-
Save lomereiter/6d400eec79f85994df86 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// building suffix trees in linear time using the Ukkonen's algorithm | |
// | |
// with -debug flag, LOTS of output is printed during construction | |
// so that each step of the algorithm can be studied | |
import std.stdio; | |
import std.array, std.range, std.algorithm; | |
class SuffixTree { | |
static struct Node { | |
inout(Node*) parent() @property inout { return isRoot ? null : incomingEdge.parent; } | |
Edge* incomingEdge; | |
Edge*[] edges; | |
Node* suffixLink; | |
int startPos; // -1 for internal nodes | |
string label() { | |
string result; | |
auto e = incomingEdge; | |
while (e !is null) { | |
result = e.label ~ result; | |
e = e.parent.incomingEdge; | |
} | |
if (result == "") | |
return "(root)"; | |
return result; | |
} | |
bool isLeaf() @property const { | |
return edges.empty; | |
} | |
bool isRoot() @property const { | |
return incomingEdge is null; | |
} | |
bool isInternal() @property const { | |
return !isLeaf && !isRoot; | |
} | |
size_t stringDepth() { | |
if (isRoot) return 0; | |
else | |
return label.length; | |
} | |
} | |
static struct Edge { | |
Node* parent; | |
Node* child; | |
string label; | |
string fullLabel() { | |
if (parent.isRoot) | |
return "::" ~ label; | |
else | |
return parent.label ~ "::" ~ label; | |
} | |
} | |
void print() { | |
void printNode(Node* node, size_t stringDepth) { | |
if (node is null) | |
return; | |
foreach (edge; node.edges) { | |
string label = edge.label; | |
if (label == "\0") | |
label = "$"; | |
if (edge.child.startPos >= 0) { | |
label = label[0 .. min($, max(0, cast(int)currentLength - stringDepth - edge.child.startPos))]; | |
if (label.empty) | |
label = "$"; | |
} | |
if (edge.child.isLeaf) | |
writeln(" ".repeat(stringDepth).joiner, label, "\t(", edge.child.startPos + 1, ")"); | |
else | |
writeln(" ".repeat(stringDepth).joiner, label); | |
printNode(edge.child, stringDepth + edge.label.length); | |
} | |
} | |
void printSuffixLinks(Node* node) { | |
if (node is null) | |
return; | |
if (node.suffixLink !is null) | |
writeln(node.label, " => ", node.suffixLink.label); | |
if (node.isInternal && node.suffixLink is null) | |
writeln("WARNING: node ", node.label, " doesn't have a suffix link!"); | |
foreach (child; node.edges.map!(e => e.child)) | |
printSuffixLinks(child); | |
} | |
printNode(root, 0); | |
writeln("### SUFFIX LINKS are listed below ###"); | |
printSuffixLinks(root); | |
} | |
this(string s) { | |
root = new Node(null, [], null, -1); | |
s ~= "$"; | |
foreach (i; 0 .. s.length) | |
runPhase(s, i); | |
} | |
static struct Match { | |
Node* lastNode; | |
Edge* edge; // null if lastNode is a leaf | |
size_t distance; // down the lastEdge if it's present | |
string unmatched; | |
bool full() @property { | |
return unmatched.empty; | |
} | |
bool splitsEdge() @property { | |
return distance > 0; | |
} | |
} | |
Match findMatch(string str, Node* from, bool existsForSure=false) { | |
if (str.empty) | |
return Match(from, null, 0, str); | |
auto v = from; | |
assert(v !is null); | |
Edge* prev_e, e; | |
size_t distance; | |
while (true) { | |
char c = str[0]; | |
foreach (edge; v.edges) { | |
assert(edge.label.length > 0); | |
version(perf) ++cmp; | |
if (edge.label[0] == c) { | |
e = edge; | |
break; | |
} | |
} | |
if (e is null) { | |
e = prev_e; | |
distance = prev_e is null ? 0 : prev_e.label.length; | |
break; | |
} | |
size_t pos; | |
size_t len = e.label.length; | |
if (existsForSure) { | |
if (len < str.length) { | |
str = str[len .. $]; | |
pos = len; | |
} else { | |
pos = str.length; | |
str = ""; | |
} | |
} else { | |
pos = 1; | |
str = str[1 .. $]; | |
while (pos < len && !str.empty && e.label[pos] == str[0]) { | |
++pos; | |
str = str[1 .. $]; | |
version(perf) ++cmp; | |
} | |
} | |
if (str.empty || pos < len || e.child is null) { | |
distance = pos; | |
break; | |
} | |
prev_e = e; | |
v = e.child; | |
version(perf) ++down; | |
e = null; | |
} | |
if (e is null) | |
return Match(from, null, 0, str); | |
bool full = distance == e.label.length; | |
if (full) | |
return Match(e.child, null, 0, str); | |
else | |
return Match(e.parent, e, distance, str); | |
} | |
Edge* addEdge(Node* parent, string label, Node** newNode) { | |
*newNode = new Node; | |
auto edge = new Edge(parent, *newNode, label); | |
assert(parent.edges.map!(x=>x.label[0]).filter!(x=>x==label[0]).empty); | |
parent.edges ~= edge; | |
(*newNode).incomingEdge = edge; | |
return edge; | |
} | |
debug { | |
string fullLabel(Edge* e) { | |
if (e.child.isInternal) | |
return e.fullLabel(); | |
else | |
return e.fullLabel()[0 .. 2 + currentLength - e.child.startPos]; | |
} | |
string relativeLabel(Edge* e) { | |
return fullLabel(e)[2 + e.parent.stringDepth .. $]; | |
} | |
} | |
// returns the new node | |
Node* splitEdge(Edge* e, size_t dist) { | |
if (e.label.length == dist) | |
return e.child; | |
debug { auto original = fullLabel(e); } | |
auto newLabel = e.label[0 .. dist]; | |
auto v = new Node; | |
auto e2 = new Edge(v, e.child, e.label[dist .. $]); | |
e.label = newLabel; | |
e.child.incomingEdge = e2; | |
e.child = v; | |
v.incomingEdge = e; | |
v.edges = [e2]; | |
v.startPos = -1; // internal | |
debug writeln("SPLIT EDGE ", original, " into ", e.fullLabel, " and ", relativeLabel(e2)); | |
return v; | |
} | |
Node* root; | |
size_t leafsToSkip; | |
size_t currentLength; | |
Node* lastAddedNode; | |
Node* lastAddedInternalNode; | |
version(perf) { | |
size_t up, down, cmp; | |
// approximate numbers of up/down movements in the tree and char comparisons; | |
// approx. in the sense that they differ from precise results by O(|s.length|), | |
// still allowing to judge whether the complexity is linear or not | |
} | |
void createSuffixLink(Node* xa, Node* a) { | |
if (xa is null || xa == root || xa.suffixLink !is null) | |
return; | |
assert(a !is null); | |
debug writeln("CREATE SUFFIX LINK: ", xa.label, " => ", a.label); | |
if (a == root) { | |
assert(xa.label.length == 1); | |
xa.suffixLink = a; | |
} else { | |
assert(xa.label[1 .. $] == a.label); | |
xa.suffixLink = a; | |
} | |
} | |
void addLeaf(Node* from, string label, size_t startPos) { | |
++leafsToSkip; | |
auto e = addEdge(from, label, &lastAddedNode); | |
lastAddedNode.startPos = cast(int)startPos; | |
debug writeln("ADD LEAF ", fullLabel(e)); | |
} | |
void runPhase(string s, size_t i) { | |
++currentLength; // updates edges leading to the leafs | |
debug writeln("PROCESSING SUFFIXES OF ", s[0 .. i + 1]); | |
Match match; | |
lastAddedInternalNode = null; | |
if (lastAddedNode is null) { | |
assert (i == 0); | |
addLeaf(root, s[0 .. $], 0); | |
} | |
foreach (j; leafsToSkip - 1 .. currentLength) { | |
debug writeln("FINDING MATCH FOR ", s[j .. i + 1], " WHEREAS LAST LEAF IS ", fullLabel(lastAddedNode.incomingEdge)); | |
if (j == leafsToSkip - 1) { | |
// [j* .. i] ----> [j* .. i + 1] | |
// so lastAddedNode already points to the right suffix | |
continue; | |
} | |
assert(lastAddedNode.isLeaf); | |
auto internalNode = lastAddedNode.parent; | |
Node* v = internalNode; | |
size_t internalNodeStringDepth = s.length - lastAddedNode.startPos - lastAddedNode.incomingEdge.label.length; | |
size_t lastEdgeLength = currentLength - internalNodeStringDepth - lastAddedNode.startPos; | |
size_t relStringDepth = lastEdgeLength; | |
size_t lastInternalNodeDepth = currentLength - lastAddedNode.startPos - relStringDepth; | |
debug writeln("CURRENT LENGTH = ", currentLength, ", LAST LEAF START POS = ", lastAddedNode.startPos + 1, | |
", INCOMING EDGE LENGTH = ", relStringDepth, ", LAST INTERNAL NODE DEPTH = ", lastInternalNodeDepth); | |
auto suffix = s[j .. i + 1]; | |
if (lastInternalNodeDepth == 0) { // last leaf was added to the root => go to the root | |
debug writeln("SEARCHING FROM ROOT: ", suffix); | |
match = findMatch(suffix, root); | |
} else { | |
while (v.suffixLink is null && !v.isRoot) { | |
relStringDepth += v.incomingEdge.label.length; | |
v = v.parent; | |
version(perf) ++up; | |
} | |
Match m; | |
if (v.isRoot) { | |
debug writeln("FINDING MATCH FOR ", suffix[0 .. lastInternalNodeDepth - 1]); | |
m = findMatch(suffix[0 .. lastInternalNodeDepth - 1], root, true); | |
} else { | |
debug writeln("SHORTCUT: ", internalNode.label, " ~~~> ", | |
v.label, " => ", v.suffixLink.label, | |
" | relative string depth of current suffix (", suffix, ") = ", relStringDepth); | |
auto relSuffixLinkLabel = suffix[$ - relStringDepth .. $][0 .. $ - lastEdgeLength]; | |
m = findMatch(relSuffixLinkLabel, v.suffixLink, true); | |
} | |
assert(m.full); // the tail of internalNode label must be already present | |
assert(!m.lastNode.isLeaf); | |
if (m.edge is null) { | |
createSuffixLink(internalNode, m.lastNode); | |
match = findMatch(suffix[$ - lastEdgeLength .. $], m.lastNode); | |
} else { | |
// suffix link will be created later, after the edge is split | |
match = findMatch(suffix[$ - lastEdgeLength - m.distance .. $], m.lastNode); | |
assert(!match.full); // and there MUST be a split | |
} | |
} | |
if (match.full) { | |
break; // all further suffixes will also be present, no new nodes are required | |
} | |
auto prev = lastAddedInternalNode; | |
// since this is not a full match, we will have to add another leaf | |
if (match.edge is null) { | |
auto label = s[match.unmatched.ptr - s.ptr .. $]; | |
addLeaf(match.lastNode, label, j); | |
lastAddedInternalNode = match.lastNode; | |
} else { | |
lastAddedInternalNode = splitEdge(match.edge, match.distance); | |
auto label = s[match.unmatched.ptr - s.ptr .. $]; | |
addLeaf(lastAddedInternalNode, label, j); | |
} | |
createSuffixLink(prev, lastAddedInternalNode); | |
} | |
createSuffixLink(lastAddedInternalNode, root); | |
debug print(); | |
debug writeln("==========================="); | |
} | |
} | |
struct TimeInterval { double[] ms; | |
string toString() const { | |
import std.conv; | |
return "time (in ms): median = " ~ ms[$ / 2].to!string ~ | |
", max = " ~ ms[$ - 1].to!string; | |
} | |
} | |
TimeInterval measureBuildTime(size_t n) { | |
import std.random; | |
import std.datetime; | |
immutable letters = "ACGT"; | |
auto buf = new char[n]; | |
assert(n <= 1000000); | |
immutable N = 5000000 / n; | |
StopWatch sw; | |
TimeInterval result; | |
version(perf) size_t[] up, down, cmp; | |
foreach (k; 0 .. N) { | |
foreach (j; 0 .. n) | |
buf[j] = letters[uniform(0, letters.length)]; | |
import core.memory; | |
sw.start(); | |
SuffixTree tree; | |
try { | |
tree = new SuffixTree(cast(string)buf); | |
} catch (Throwable e) { | |
writeln("FAIL"); | |
throw e; | |
// writeln(buf); | |
} | |
sw.stop(); | |
version(perf) { | |
up ~= tree.up; | |
down ~= tree.down; | |
cmp ~= tree.cmp; | |
} | |
auto ms = sw.peek().msecs; | |
result.ms ~= ms; | |
result.ms.sort(); | |
sw.reset(); | |
} | |
version(perf) { | |
auto mean(size_t[] a) { return a.reduce!"a+b"() / a.length; } | |
auto max(size_t[] a) { return a.reduce!(.max)(); } | |
auto min(size_t[] a) { return a.reduce!(.min)(); } | |
writeln("Amount of operations performed on each tree:"); | |
writeln("minimums: up = ", min(up), ", down = ", min(down), ", cmp = ", min(cmp)); | |
writeln("averages: up = ", mean(up), ", down = ", mean(down), ", cmp = ", mean(cmp)); | |
writeln("maximums: up = ", max(up), ", down = ", max(down), ", cmp = ", max(cmp)); | |
} | |
return result; | |
} | |
void main() { | |
// test performance on random strings from ACGT alphabet; | |
// time benchmarks are not very persuasive even with disabled asserts | |
// (because of degrading CPU cache performance, I guess...), | |
// so compilation should be done with -version=perf | |
// in order to observe the linear behaviour of the algorithm | |
writeln(measureBuildTime(1000)); | |
writeln(measureBuildTime(10000)); | |
writeln(measureBuildTime(100000)); | |
writeln(measureBuildTime(1000000)); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment