// building suffix trees in linear time using the Ukkonen's algorithm
//
// with -debug flag, LOTS of output is printed during construction
// so that each step of the algorithm can be studied

import std.stdio;
import std.array, std.range, std.algorithm;

class SuffixTree {
  static struct Node {
    inout(Node*) parent() @property inout { return isRoot ? null : incomingEdge.parent; }
    Edge* incomingEdge;
    Edge*[] edges;
    Node* suffixLink;
    int startPos; // -1 for internal nodes

    string label() {
      string result;
      auto e = incomingEdge;
      while (e !is null) {
        result = e.label ~ result;
        e = e.parent.incomingEdge;
      }
      if (result == "")
        return "(root)";
      return result;
    }

    bool isLeaf() @property const {
      return edges.empty;
    }

    bool isRoot() @property const {
      return incomingEdge is null;
    }

    bool isInternal() @property const {
      return !isLeaf && !isRoot;
    }

    size_t stringDepth() {
      if (isRoot) return 0;
      else
        return label.length;
    }
  }
  
  static struct Edge {
    Node* parent;
    Node* child;
    string label;

    string fullLabel() {
      if (parent.isRoot)
        return "::" ~ label;
      else 
        return parent.label ~ "::" ~ label;
    }
  }

  void print() {
    void printNode(Node* node, size_t stringDepth) {
      if (node is null)
        return;
      foreach (edge; node.edges) {
        string label = edge.label;
        if (label == "\0")
          label = "$";
        if (edge.child.startPos >= 0) {
          label = label[0 .. min($, max(0, cast(int)currentLength - stringDepth - edge.child.startPos))];
          if (label.empty)
            label = "$";
        }
        if (edge.child.isLeaf)
          writeln(" ".repeat(stringDepth).joiner, label, "\t(", edge.child.startPos + 1, ")");
        else
          writeln(" ".repeat(stringDepth).joiner, label);
        printNode(edge.child, stringDepth + edge.label.length);
      }
    }

    void printSuffixLinks(Node* node) {
      if (node is null)
        return;
      if (node.suffixLink !is null)
        writeln(node.label, " => ", node.suffixLink.label);
      if (node.isInternal && node.suffixLink is null)
        writeln("WARNING: node ", node.label, " doesn't have a suffix link!");
      foreach (child; node.edges.map!(e => e.child))
        printSuffixLinks(child);
    }
    printNode(root, 0);
    writeln("### SUFFIX LINKS are listed below ###");
    printSuffixLinks(root);
  }

  this(string s) {
    root = new Node(null, [], null, -1);
    s ~= "$";
    foreach (i; 0 .. s.length)
      runPhase(s, i);
  }

  static struct Match {
    Node* lastNode;
    Edge* edge; // null if lastNode is a leaf
    size_t distance; // down the lastEdge if it's present
    string unmatched;

    bool full() @property {
      return unmatched.empty;
    }

    bool splitsEdge() @property {
      return distance > 0;
    }
  }

  Match findMatch(string str, Node* from, bool existsForSure=false) {
    if (str.empty)
      return Match(from, null, 0, str);
    
    auto v = from;
    assert(v !is null);
    Edge* prev_e, e;

    size_t distance;

    while (true) {
      char c = str[0];
      foreach (edge; v.edges) {
        assert(edge.label.length > 0);
        version(perf) ++cmp;
        if (edge.label[0] == c) {
          e = edge;
          break;
        }
      }

      if (e is null) {
        e = prev_e;
        distance = prev_e is null ? 0 : prev_e.label.length;
        break;
      }

      size_t pos;
      size_t len = e.label.length;

      if (existsForSure) {
        if (len < str.length) {
          str = str[len .. $];
          pos = len;
        } else {
          pos = str.length;
          str = "";
        }
      } else {
        pos = 1;
        str = str[1 .. $];
        while (pos < len && !str.empty && e.label[pos] == str[0]) {
          ++pos;
          str = str[1 .. $];
          version(perf) ++cmp;
        }
      }

      if (str.empty || pos < len || e.child is null) {
        distance = pos;
        break;
      }

      prev_e = e;
      v = e.child;
      version(perf) ++down;
      e = null;
    }

    if (e is null)
      return Match(from, null, 0, str);
    bool full = distance == e.label.length;
    if (full)
      return Match(e.child, null, 0, str);
    else
      return Match(e.parent, e, distance, str);
  }

  Edge* addEdge(Node* parent, string label, Node** newNode) {
    *newNode = new Node;
    auto edge = new Edge(parent, *newNode, label);
    assert(parent.edges.map!(x=>x.label[0]).filter!(x=>x==label[0]).empty);
    parent.edges ~= edge;
    (*newNode).incomingEdge = edge;
    return edge;
  }

  debug {
    string fullLabel(Edge* e) {
      if (e.child.isInternal)
        return e.fullLabel();
      else
        return e.fullLabel()[0 .. 2 + currentLength - e.child.startPos];
    }

    string relativeLabel(Edge* e) {
      return fullLabel(e)[2 + e.parent.stringDepth .. $];
    }
  }

  // returns the new node
  Node* splitEdge(Edge* e, size_t dist) {
    if (e.label.length == dist)
      return e.child;

    debug { auto original = fullLabel(e); }

    auto newLabel = e.label[0 .. dist];
    auto v = new Node;
    auto e2 = new Edge(v, e.child, e.label[dist .. $]);

    e.label = newLabel;
    e.child.incomingEdge = e2;

    e.child = v;
    v.incomingEdge = e;
    v.edges = [e2];
    v.startPos = -1; // internal

    debug writeln("SPLIT EDGE ", original, " into ", e.fullLabel, " and ", relativeLabel(e2));

    return v;
  }

  Node* root;
  size_t leafsToSkip;
  size_t currentLength;

  Node* lastAddedNode;
  Node* lastAddedInternalNode;

  version(perf) {
    size_t up, down, cmp;
    // approximate numbers of up/down movements in the tree and char comparisons;
    // approx. in the sense that they differ from precise results by O(|s.length|),
    // still allowing to judge whether the complexity is linear or not
  }

  void createSuffixLink(Node* xa, Node* a) {
    if (xa is null || xa == root || xa.suffixLink !is null)
      return;

    assert(a !is null);
    debug writeln("CREATE SUFFIX LINK: ", xa.label, " => ", a.label);

    if (a == root) {
      assert(xa.label.length == 1);
      xa.suffixLink = a;
    } else {
      assert(xa.label[1 .. $] == a.label);
      xa.suffixLink = a;
    }
  }

  void addLeaf(Node* from, string label, size_t startPos) { 
    ++leafsToSkip;
    auto e = addEdge(from, label, &lastAddedNode);
    lastAddedNode.startPos = cast(int)startPos;
    debug writeln("ADD LEAF ", fullLabel(e));
  }

  void runPhase(string s, size_t i) {
    ++currentLength; // updates edges leading to the leafs

    debug writeln("PROCESSING SUFFIXES OF ", s[0 .. i + 1]);

    Match match;

    lastAddedInternalNode = null;
    if (lastAddedNode is null) {
      assert (i == 0);
      addLeaf(root, s[0 .. $], 0);
    }

    foreach (j; leafsToSkip - 1 .. currentLength) {
      debug writeln("FINDING MATCH FOR ", s[j .. i + 1], " WHEREAS LAST LEAF IS ", fullLabel(lastAddedNode.incomingEdge));
      if (j == leafsToSkip - 1) {
        // [j* .. i] ----> [j* .. i + 1]
        // so lastAddedNode already points to the right suffix
        continue;
      } 

      assert(lastAddedNode.isLeaf);

      auto internalNode = lastAddedNode.parent;
      Node* v = internalNode;
      size_t internalNodeStringDepth = s.length - lastAddedNode.startPos - lastAddedNode.incomingEdge.label.length;
      size_t lastEdgeLength = currentLength - internalNodeStringDepth - lastAddedNode.startPos;
      size_t relStringDepth = lastEdgeLength;
      size_t lastInternalNodeDepth = currentLength - lastAddedNode.startPos - relStringDepth;
      debug writeln("CURRENT LENGTH = ", currentLength, ", LAST LEAF START POS = ", lastAddedNode.startPos + 1,
          ", INCOMING EDGE LENGTH = ", relStringDepth, ", LAST INTERNAL NODE DEPTH = ", lastInternalNodeDepth);

      auto suffix = s[j .. i + 1];

      if (lastInternalNodeDepth == 0) { // last leaf was added to the root => go to the root
        debug writeln("SEARCHING FROM ROOT: ", suffix);
        match = findMatch(suffix, root);
      } else {
        while (v.suffixLink is null && !v.isRoot) {
          relStringDepth += v.incomingEdge.label.length;
          v = v.parent;
          version(perf) ++up;
        }

        Match m;

        if (v.isRoot) {
          debug writeln("FINDING MATCH FOR ", suffix[0 .. lastInternalNodeDepth - 1]);
          m = findMatch(suffix[0 .. lastInternalNodeDepth - 1], root, true);
        } else {
          debug writeln("SHORTCUT: ", internalNode.label, " ~~~> ",
              v.label, " => ", v.suffixLink.label, 
              " | relative string depth of current suffix (", suffix, ") = ", relStringDepth);

          auto relSuffixLinkLabel = suffix[$ - relStringDepth .. $][0 .. $ - lastEdgeLength];
          m = findMatch(relSuffixLinkLabel, v.suffixLink, true);
        }

        assert(m.full); // the tail of internalNode label must be already present
        assert(!m.lastNode.isLeaf);

        if (m.edge is null) {
          createSuffixLink(internalNode, m.lastNode);
          match = findMatch(suffix[$ - lastEdgeLength .. $], m.lastNode);
        } else {
          // suffix link will be created later, after the edge is split
          match = findMatch(suffix[$ - lastEdgeLength - m.distance .. $], m.lastNode);
          assert(!match.full); // and there MUST be a split
        }
      }

      if (match.full) {
        break; // all further suffixes will also be present, no new nodes are required
      }

      auto prev = lastAddedInternalNode;

      // since this is not a full match, we will have to add another leaf
      if (match.edge is null) {
        auto label = s[match.unmatched.ptr - s.ptr .. $];
        addLeaf(match.lastNode, label, j);
        lastAddedInternalNode = match.lastNode;
      } else {
        lastAddedInternalNode = splitEdge(match.edge, match.distance);
        auto label = s[match.unmatched.ptr - s.ptr .. $];
        addLeaf(lastAddedInternalNode, label, j);
      }

      createSuffixLink(prev, lastAddedInternalNode);
    }

    createSuffixLink(lastAddedInternalNode, root);

    debug print();
    debug writeln("===========================");
  }

}

struct TimeInterval { double[] ms; 
  string toString() const {
    import std.conv;
    return "time (in ms): median = " ~ ms[$ / 2].to!string ~ 
           ", max = " ~ ms[$ - 1].to!string;
  }
}

TimeInterval measureBuildTime(size_t n) {
  import std.random;
  import std.datetime;
  immutable letters = "ACGT";
  auto buf = new char[n];
  assert(n <= 1000000);
  immutable N = 5000000 / n;
  StopWatch sw;
  TimeInterval result;

  version(perf) size_t[] up, down, cmp;

  foreach (k; 0 .. N) {
    foreach (j; 0 .. n)
      buf[j] = letters[uniform(0, letters.length)];

    import core.memory;
 
    sw.start();
    SuffixTree tree;
    try {
      tree = new SuffixTree(cast(string)buf);
    } catch (Throwable e) {
      writeln("FAIL");
      throw e;
//      writeln(buf);
    }
    sw.stop();

    version(perf) {
      up ~= tree.up;
      down ~= tree.down;
      cmp ~= tree.cmp;
    }
    auto ms = sw.peek().msecs;
    result.ms ~= ms;
    result.ms.sort();
    sw.reset();
  }

  version(perf) {
    auto mean(size_t[] a) { return a.reduce!"a+b"() / a.length; }
    auto max(size_t[] a) { return a.reduce!(.max)(); }
    auto min(size_t[] a) { return a.reduce!(.min)(); }
    writeln("Amount of operations performed on each tree:");
    writeln("minimums: up = ", min(up), ", down = ", min(down), ", cmp = ", min(cmp));
    writeln("averages: up = ", mean(up), ", down = ", mean(down), ", cmp = ", mean(cmp));
    writeln("maximums: up = ", max(up), ", down = ", max(down), ", cmp = ", max(cmp));
  }
  return result;
}

void main() {
  // test performance on random strings from ACGT alphabet;
  // time benchmarks are not very persuasive even with disabled asserts
  // (because of degrading CPU cache performance, I guess...),
  // so compilation should be done with -version=perf 
  // in order to observe the linear behaviour of the algorithm
  writeln(measureBuildTime(1000));
  writeln(measureBuildTime(10000));
  writeln(measureBuildTime(100000));
  writeln(measureBuildTime(1000000));
}