// building suffix trees in linear time using the Ukkonen's algorithm // // with -debug flag, LOTS of output is printed during construction // so that each step of the algorithm can be studied import std.stdio; import std.array, std.range, std.algorithm; class SuffixTree { static struct Node { inout(Node*) parent() @property inout { return isRoot ? null : incomingEdge.parent; } Edge* incomingEdge; Edge*[] edges; Node* suffixLink; int startPos; // -1 for internal nodes string label() { string result; auto e = incomingEdge; while (e !is null) { result = e.label ~ result; e = e.parent.incomingEdge; } if (result == "") return "(root)"; return result; } bool isLeaf() @property const { return edges.empty; } bool isRoot() @property const { return incomingEdge is null; } bool isInternal() @property const { return !isLeaf && !isRoot; } size_t stringDepth() { if (isRoot) return 0; else return label.length; } } static struct Edge { Node* parent; Node* child; string label; string fullLabel() { if (parent.isRoot) return "::" ~ label; else return parent.label ~ "::" ~ label; } } void print() { void printNode(Node* node, size_t stringDepth) { if (node is null) return; foreach (edge; node.edges) { string label = edge.label; if (label == "\0") label = "$"; if (edge.child.startPos >= 0) { label = label[0 .. min($, max(0, cast(int)currentLength - stringDepth - edge.child.startPos))]; if (label.empty) label = "$"; } if (edge.child.isLeaf) writeln(" ".repeat(stringDepth).joiner, label, "\t(", edge.child.startPos + 1, ")"); else writeln(" ".repeat(stringDepth).joiner, label); printNode(edge.child, stringDepth + edge.label.length); } } void printSuffixLinks(Node* node) { if (node is null) return; if (node.suffixLink !is null) writeln(node.label, " => ", node.suffixLink.label); if (node.isInternal && node.suffixLink is null) writeln("WARNING: node ", node.label, " doesn't have a suffix link!"); foreach (child; node.edges.map!(e => e.child)) printSuffixLinks(child); } printNode(root, 0); writeln("### SUFFIX LINKS are listed below ###"); printSuffixLinks(root); } this(string s) { root = new Node(null, [], null, -1); s ~= "$"; foreach (i; 0 .. s.length) runPhase(s, i); } static struct Match { Node* lastNode; Edge* edge; // null if lastNode is a leaf size_t distance; // down the lastEdge if it's present string unmatched; bool full() @property { return unmatched.empty; } bool splitsEdge() @property { return distance > 0; } } Match findMatch(string str, Node* from, bool existsForSure=false) { if (str.empty) return Match(from, null, 0, str); auto v = from; assert(v !is null); Edge* prev_e, e; size_t distance; while (true) { char c = str[0]; foreach (edge; v.edges) { assert(edge.label.length > 0); version(perf) ++cmp; if (edge.label[0] == c) { e = edge; break; } } if (e is null) { e = prev_e; distance = prev_e is null ? 0 : prev_e.label.length; break; } size_t pos; size_t len = e.label.length; if (existsForSure) { if (len < str.length) { str = str[len .. $]; pos = len; } else { pos = str.length; str = ""; } } else { pos = 1; str = str[1 .. $]; while (pos < len && !str.empty && e.label[pos] == str[0]) { ++pos; str = str[1 .. $]; version(perf) ++cmp; } } if (str.empty || pos < len || e.child is null) { distance = pos; break; } prev_e = e; v = e.child; version(perf) ++down; e = null; } if (e is null) return Match(from, null, 0, str); bool full = distance == e.label.length; if (full) return Match(e.child, null, 0, str); else return Match(e.parent, e, distance, str); } Edge* addEdge(Node* parent, string label, Node** newNode) { *newNode = new Node; auto edge = new Edge(parent, *newNode, label); assert(parent.edges.map!(x=>x.label[0]).filter!(x=>x==label[0]).empty); parent.edges ~= edge; (*newNode).incomingEdge = edge; return edge; } debug { string fullLabel(Edge* e) { if (e.child.isInternal) return e.fullLabel(); else return e.fullLabel()[0 .. 2 + currentLength - e.child.startPos]; } string relativeLabel(Edge* e) { return fullLabel(e)[2 + e.parent.stringDepth .. $]; } } // returns the new node Node* splitEdge(Edge* e, size_t dist) { if (e.label.length == dist) return e.child; debug { auto original = fullLabel(e); } auto newLabel = e.label[0 .. dist]; auto v = new Node; auto e2 = new Edge(v, e.child, e.label[dist .. $]); e.label = newLabel; e.child.incomingEdge = e2; e.child = v; v.incomingEdge = e; v.edges = [e2]; v.startPos = -1; // internal debug writeln("SPLIT EDGE ", original, " into ", e.fullLabel, " and ", relativeLabel(e2)); return v; } Node* root; size_t leafsToSkip; size_t currentLength; Node* lastAddedNode; Node* lastAddedInternalNode; version(perf) { size_t up, down, cmp; // approximate numbers of up/down movements in the tree and char comparisons; // approx. in the sense that they differ from precise results by O(|s.length|), // still allowing to judge whether the complexity is linear or not } void createSuffixLink(Node* xa, Node* a) { if (xa is null || xa == root || xa.suffixLink !is null) return; assert(a !is null); debug writeln("CREATE SUFFIX LINK: ", xa.label, " => ", a.label); if (a == root) { assert(xa.label.length == 1); xa.suffixLink = a; } else { assert(xa.label[1 .. $] == a.label); xa.suffixLink = a; } } void addLeaf(Node* from, string label, size_t startPos) { ++leafsToSkip; auto e = addEdge(from, label, &lastAddedNode); lastAddedNode.startPos = cast(int)startPos; debug writeln("ADD LEAF ", fullLabel(e)); } void runPhase(string s, size_t i) { ++currentLength; // updates edges leading to the leafs debug writeln("PROCESSING SUFFIXES OF ", s[0 .. i + 1]); Match match; lastAddedInternalNode = null; if (lastAddedNode is null) { assert (i == 0); addLeaf(root, s[0 .. $], 0); } foreach (j; leafsToSkip - 1 .. currentLength) { debug writeln("FINDING MATCH FOR ", s[j .. i + 1], " WHEREAS LAST LEAF IS ", fullLabel(lastAddedNode.incomingEdge)); if (j == leafsToSkip - 1) { // [j* .. i] ----> [j* .. i + 1] // so lastAddedNode already points to the right suffix continue; } assert(lastAddedNode.isLeaf); auto internalNode = lastAddedNode.parent; Node* v = internalNode; size_t internalNodeStringDepth = s.length - lastAddedNode.startPos - lastAddedNode.incomingEdge.label.length; size_t lastEdgeLength = currentLength - internalNodeStringDepth - lastAddedNode.startPos; size_t relStringDepth = lastEdgeLength; size_t lastInternalNodeDepth = currentLength - lastAddedNode.startPos - relStringDepth; debug writeln("CURRENT LENGTH = ", currentLength, ", LAST LEAF START POS = ", lastAddedNode.startPos + 1, ", INCOMING EDGE LENGTH = ", relStringDepth, ", LAST INTERNAL NODE DEPTH = ", lastInternalNodeDepth); auto suffix = s[j .. i + 1]; if (lastInternalNodeDepth == 0) { // last leaf was added to the root => go to the root debug writeln("SEARCHING FROM ROOT: ", suffix); match = findMatch(suffix, root); } else { while (v.suffixLink is null && !v.isRoot) { relStringDepth += v.incomingEdge.label.length; v = v.parent; version(perf) ++up; } Match m; if (v.isRoot) { debug writeln("FINDING MATCH FOR ", suffix[0 .. lastInternalNodeDepth - 1]); m = findMatch(suffix[0 .. lastInternalNodeDepth - 1], root, true); } else { debug writeln("SHORTCUT: ", internalNode.label, " ~~~> ", v.label, " => ", v.suffixLink.label, " | relative string depth of current suffix (", suffix, ") = ", relStringDepth); auto relSuffixLinkLabel = suffix[$ - relStringDepth .. $][0 .. $ - lastEdgeLength]; m = findMatch(relSuffixLinkLabel, v.suffixLink, true); } assert(m.full); // the tail of internalNode label must be already present assert(!m.lastNode.isLeaf); if (m.edge is null) { createSuffixLink(internalNode, m.lastNode); match = findMatch(suffix[$ - lastEdgeLength .. $], m.lastNode); } else { // suffix link will be created later, after the edge is split match = findMatch(suffix[$ - lastEdgeLength - m.distance .. $], m.lastNode); assert(!match.full); // and there MUST be a split } } if (match.full) { break; // all further suffixes will also be present, no new nodes are required } auto prev = lastAddedInternalNode; // since this is not a full match, we will have to add another leaf if (match.edge is null) { auto label = s[match.unmatched.ptr - s.ptr .. $]; addLeaf(match.lastNode, label, j); lastAddedInternalNode = match.lastNode; } else { lastAddedInternalNode = splitEdge(match.edge, match.distance); auto label = s[match.unmatched.ptr - s.ptr .. $]; addLeaf(lastAddedInternalNode, label, j); } createSuffixLink(prev, lastAddedInternalNode); } createSuffixLink(lastAddedInternalNode, root); debug print(); debug writeln("==========================="); } } struct TimeInterval { double[] ms; string toString() const { import std.conv; return "time (in ms): median = " ~ ms[$ / 2].to!string ~ ", max = " ~ ms[$ - 1].to!string; } } TimeInterval measureBuildTime(size_t n) { import std.random; import std.datetime; immutable letters = "ACGT"; auto buf = new char[n]; assert(n <= 1000000); immutable N = 5000000 / n; StopWatch sw; TimeInterval result; version(perf) size_t[] up, down, cmp; foreach (k; 0 .. N) { foreach (j; 0 .. n) buf[j] = letters[uniform(0, letters.length)]; import core.memory; sw.start(); SuffixTree tree; try { tree = new SuffixTree(cast(string)buf); } catch (Throwable e) { writeln("FAIL"); throw e; // writeln(buf); } sw.stop(); version(perf) { up ~= tree.up; down ~= tree.down; cmp ~= tree.cmp; } auto ms = sw.peek().msecs; result.ms ~= ms; result.ms.sort(); sw.reset(); } version(perf) { auto mean(size_t[] a) { return a.reduce!"a+b"() / a.length; } auto max(size_t[] a) { return a.reduce!(.max)(); } auto min(size_t[] a) { return a.reduce!(.min)(); } writeln("Amount of operations performed on each tree:"); writeln("minimums: up = ", min(up), ", down = ", min(down), ", cmp = ", min(cmp)); writeln("averages: up = ", mean(up), ", down = ", mean(down), ", cmp = ", mean(cmp)); writeln("maximums: up = ", max(up), ", down = ", max(down), ", cmp = ", max(cmp)); } return result; } void main() { // test performance on random strings from ACGT alphabet; // time benchmarks are not very persuasive even with disabled asserts // (because of degrading CPU cache performance, I guess...), // so compilation should be done with -version=perf // in order to observe the linear behaviour of the algorithm writeln(measureBuildTime(1000)); writeln(measureBuildTime(10000)); writeln(measureBuildTime(100000)); writeln(measureBuildTime(1000000)); }