Skip to content

Instantly share code, notes, and snippets.

@lomereiter
Created November 23, 2014 17:34
Show Gist options
  • Save lomereiter/6d400eec79f85994df86 to your computer and use it in GitHub Desktop.
Save lomereiter/6d400eec79f85994df86 to your computer and use it in GitHub Desktop.
// building suffix trees in linear time using the Ukkonen's algorithm
//
// with -debug flag, LOTS of output is printed during construction
// so that each step of the algorithm can be studied
import std.stdio;
import std.array, std.range, std.algorithm;
class SuffixTree {
static struct Node {
inout(Node*) parent() @property inout { return isRoot ? null : incomingEdge.parent; }
Edge* incomingEdge;
Edge*[] edges;
Node* suffixLink;
int startPos; // -1 for internal nodes
string label() {
string result;
auto e = incomingEdge;
while (e !is null) {
result = e.label ~ result;
e = e.parent.incomingEdge;
}
if (result == "")
return "(root)";
return result;
}
bool isLeaf() @property const {
return edges.empty;
}
bool isRoot() @property const {
return incomingEdge is null;
}
bool isInternal() @property const {
return !isLeaf && !isRoot;
}
size_t stringDepth() {
if (isRoot) return 0;
else
return label.length;
}
}
static struct Edge {
Node* parent;
Node* child;
string label;
string fullLabel() {
if (parent.isRoot)
return "::" ~ label;
else
return parent.label ~ "::" ~ label;
}
}
void print() {
void printNode(Node* node, size_t stringDepth) {
if (node is null)
return;
foreach (edge; node.edges) {
string label = edge.label;
if (label == "\0")
label = "$";
if (edge.child.startPos >= 0) {
label = label[0 .. min($, max(0, cast(int)currentLength - stringDepth - edge.child.startPos))];
if (label.empty)
label = "$";
}
if (edge.child.isLeaf)
writeln(" ".repeat(stringDepth).joiner, label, "\t(", edge.child.startPos + 1, ")");
else
writeln(" ".repeat(stringDepth).joiner, label);
printNode(edge.child, stringDepth + edge.label.length);
}
}
void printSuffixLinks(Node* node) {
if (node is null)
return;
if (node.suffixLink !is null)
writeln(node.label, " => ", node.suffixLink.label);
if (node.isInternal && node.suffixLink is null)
writeln("WARNING: node ", node.label, " doesn't have a suffix link!");
foreach (child; node.edges.map!(e => e.child))
printSuffixLinks(child);
}
printNode(root, 0);
writeln("### SUFFIX LINKS are listed below ###");
printSuffixLinks(root);
}
this(string s) {
root = new Node(null, [], null, -1);
s ~= "$";
foreach (i; 0 .. s.length)
runPhase(s, i);
}
static struct Match {
Node* lastNode;
Edge* edge; // null if lastNode is a leaf
size_t distance; // down the lastEdge if it's present
string unmatched;
bool full() @property {
return unmatched.empty;
}
bool splitsEdge() @property {
return distance > 0;
}
}
Match findMatch(string str, Node* from, bool existsForSure=false) {
if (str.empty)
return Match(from, null, 0, str);
auto v = from;
assert(v !is null);
Edge* prev_e, e;
size_t distance;
while (true) {
char c = str[0];
foreach (edge; v.edges) {
assert(edge.label.length > 0);
version(perf) ++cmp;
if (edge.label[0] == c) {
e = edge;
break;
}
}
if (e is null) {
e = prev_e;
distance = prev_e is null ? 0 : prev_e.label.length;
break;
}
size_t pos;
size_t len = e.label.length;
if (existsForSure) {
if (len < str.length) {
str = str[len .. $];
pos = len;
} else {
pos = str.length;
str = "";
}
} else {
pos = 1;
str = str[1 .. $];
while (pos < len && !str.empty && e.label[pos] == str[0]) {
++pos;
str = str[1 .. $];
version(perf) ++cmp;
}
}
if (str.empty || pos < len || e.child is null) {
distance = pos;
break;
}
prev_e = e;
v = e.child;
version(perf) ++down;
e = null;
}
if (e is null)
return Match(from, null, 0, str);
bool full = distance == e.label.length;
if (full)
return Match(e.child, null, 0, str);
else
return Match(e.parent, e, distance, str);
}
Edge* addEdge(Node* parent, string label, Node** newNode) {
*newNode = new Node;
auto edge = new Edge(parent, *newNode, label);
assert(parent.edges.map!(x=>x.label[0]).filter!(x=>x==label[0]).empty);
parent.edges ~= edge;
(*newNode).incomingEdge = edge;
return edge;
}
debug {
string fullLabel(Edge* e) {
if (e.child.isInternal)
return e.fullLabel();
else
return e.fullLabel()[0 .. 2 + currentLength - e.child.startPos];
}
string relativeLabel(Edge* e) {
return fullLabel(e)[2 + e.parent.stringDepth .. $];
}
}
// returns the new node
Node* splitEdge(Edge* e, size_t dist) {
if (e.label.length == dist)
return e.child;
debug { auto original = fullLabel(e); }
auto newLabel = e.label[0 .. dist];
auto v = new Node;
auto e2 = new Edge(v, e.child, e.label[dist .. $]);
e.label = newLabel;
e.child.incomingEdge = e2;
e.child = v;
v.incomingEdge = e;
v.edges = [e2];
v.startPos = -1; // internal
debug writeln("SPLIT EDGE ", original, " into ", e.fullLabel, " and ", relativeLabel(e2));
return v;
}
Node* root;
size_t leafsToSkip;
size_t currentLength;
Node* lastAddedNode;
Node* lastAddedInternalNode;
version(perf) {
size_t up, down, cmp;
// approximate numbers of up/down movements in the tree and char comparisons;
// approx. in the sense that they differ from precise results by O(|s.length|),
// still allowing to judge whether the complexity is linear or not
}
void createSuffixLink(Node* xa, Node* a) {
if (xa is null || xa == root || xa.suffixLink !is null)
return;
assert(a !is null);
debug writeln("CREATE SUFFIX LINK: ", xa.label, " => ", a.label);
if (a == root) {
assert(xa.label.length == 1);
xa.suffixLink = a;
} else {
assert(xa.label[1 .. $] == a.label);
xa.suffixLink = a;
}
}
void addLeaf(Node* from, string label, size_t startPos) {
++leafsToSkip;
auto e = addEdge(from, label, &lastAddedNode);
lastAddedNode.startPos = cast(int)startPos;
debug writeln("ADD LEAF ", fullLabel(e));
}
void runPhase(string s, size_t i) {
++currentLength; // updates edges leading to the leafs
debug writeln("PROCESSING SUFFIXES OF ", s[0 .. i + 1]);
Match match;
lastAddedInternalNode = null;
if (lastAddedNode is null) {
assert (i == 0);
addLeaf(root, s[0 .. $], 0);
}
foreach (j; leafsToSkip - 1 .. currentLength) {
debug writeln("FINDING MATCH FOR ", s[j .. i + 1], " WHEREAS LAST LEAF IS ", fullLabel(lastAddedNode.incomingEdge));
if (j == leafsToSkip - 1) {
// [j* .. i] ----> [j* .. i + 1]
// so lastAddedNode already points to the right suffix
continue;
}
assert(lastAddedNode.isLeaf);
auto internalNode = lastAddedNode.parent;
Node* v = internalNode;
size_t internalNodeStringDepth = s.length - lastAddedNode.startPos - lastAddedNode.incomingEdge.label.length;
size_t lastEdgeLength = currentLength - internalNodeStringDepth - lastAddedNode.startPos;
size_t relStringDepth = lastEdgeLength;
size_t lastInternalNodeDepth = currentLength - lastAddedNode.startPos - relStringDepth;
debug writeln("CURRENT LENGTH = ", currentLength, ", LAST LEAF START POS = ", lastAddedNode.startPos + 1,
", INCOMING EDGE LENGTH = ", relStringDepth, ", LAST INTERNAL NODE DEPTH = ", lastInternalNodeDepth);
auto suffix = s[j .. i + 1];
if (lastInternalNodeDepth == 0) { // last leaf was added to the root => go to the root
debug writeln("SEARCHING FROM ROOT: ", suffix);
match = findMatch(suffix, root);
} else {
while (v.suffixLink is null && !v.isRoot) {
relStringDepth += v.incomingEdge.label.length;
v = v.parent;
version(perf) ++up;
}
Match m;
if (v.isRoot) {
debug writeln("FINDING MATCH FOR ", suffix[0 .. lastInternalNodeDepth - 1]);
m = findMatch(suffix[0 .. lastInternalNodeDepth - 1], root, true);
} else {
debug writeln("SHORTCUT: ", internalNode.label, " ~~~> ",
v.label, " => ", v.suffixLink.label,
" | relative string depth of current suffix (", suffix, ") = ", relStringDepth);
auto relSuffixLinkLabel = suffix[$ - relStringDepth .. $][0 .. $ - lastEdgeLength];
m = findMatch(relSuffixLinkLabel, v.suffixLink, true);
}
assert(m.full); // the tail of internalNode label must be already present
assert(!m.lastNode.isLeaf);
if (m.edge is null) {
createSuffixLink(internalNode, m.lastNode);
match = findMatch(suffix[$ - lastEdgeLength .. $], m.lastNode);
} else {
// suffix link will be created later, after the edge is split
match = findMatch(suffix[$ - lastEdgeLength - m.distance .. $], m.lastNode);
assert(!match.full); // and there MUST be a split
}
}
if (match.full) {
break; // all further suffixes will also be present, no new nodes are required
}
auto prev = lastAddedInternalNode;
// since this is not a full match, we will have to add another leaf
if (match.edge is null) {
auto label = s[match.unmatched.ptr - s.ptr .. $];
addLeaf(match.lastNode, label, j);
lastAddedInternalNode = match.lastNode;
} else {
lastAddedInternalNode = splitEdge(match.edge, match.distance);
auto label = s[match.unmatched.ptr - s.ptr .. $];
addLeaf(lastAddedInternalNode, label, j);
}
createSuffixLink(prev, lastAddedInternalNode);
}
createSuffixLink(lastAddedInternalNode, root);
debug print();
debug writeln("===========================");
}
}
struct TimeInterval { double[] ms;
string toString() const {
import std.conv;
return "time (in ms): median = " ~ ms[$ / 2].to!string ~
", max = " ~ ms[$ - 1].to!string;
}
}
TimeInterval measureBuildTime(size_t n) {
import std.random;
import std.datetime;
immutable letters = "ACGT";
auto buf = new char[n];
assert(n <= 1000000);
immutable N = 5000000 / n;
StopWatch sw;
TimeInterval result;
version(perf) size_t[] up, down, cmp;
foreach (k; 0 .. N) {
foreach (j; 0 .. n)
buf[j] = letters[uniform(0, letters.length)];
import core.memory;
sw.start();
SuffixTree tree;
try {
tree = new SuffixTree(cast(string)buf);
} catch (Throwable e) {
writeln("FAIL");
throw e;
// writeln(buf);
}
sw.stop();
version(perf) {
up ~= tree.up;
down ~= tree.down;
cmp ~= tree.cmp;
}
auto ms = sw.peek().msecs;
result.ms ~= ms;
result.ms.sort();
sw.reset();
}
version(perf) {
auto mean(size_t[] a) { return a.reduce!"a+b"() / a.length; }
auto max(size_t[] a) { return a.reduce!(.max)(); }
auto min(size_t[] a) { return a.reduce!(.min)(); }
writeln("Amount of operations performed on each tree:");
writeln("minimums: up = ", min(up), ", down = ", min(down), ", cmp = ", min(cmp));
writeln("averages: up = ", mean(up), ", down = ", mean(down), ", cmp = ", mean(cmp));
writeln("maximums: up = ", max(up), ", down = ", max(down), ", cmp = ", max(cmp));
}
return result;
}
void main() {
// test performance on random strings from ACGT alphabet;
// time benchmarks are not very persuasive even with disabled asserts
// (because of degrading CPU cache performance, I guess...),
// so compilation should be done with -version=perf
// in order to observe the linear behaviour of the algorithm
writeln(measureBuildTime(1000));
writeln(measureBuildTime(10000));
writeln(measureBuildTime(100000));
writeln(measureBuildTime(1000000));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment