Created
April 22, 2012 15:28
-
-
Save priyadarshan/2464631 to your computer and use it in GitHub Desktop.
An O(n) implementation of McCreight's suffix-tree algorithm.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;;; Original URL: http://fpn.mit.edu/Downloads/SuffixTree | |
;;; Copyright [email protected] 2004. | |
(defpackage :suffix-tree | |
(:export :build-suffix-tree :stree-root :*stree-print-string* | |
:random-string :subtree-size :count-leaves) | |
(:use :common-lisp)) | |
;; An O(n) implementation of McCreight's suffix-tree algorithm. | |
;; Usage example: | |
;; (build-suffix-tree "abracadabra$") | |
;; (declaim (optimize (speed 3) (safety 0) (debug 1))) | |
(in-package :suffix-tree) | |
(defmacro returning ((name initform) &body body) | |
`(let ((,name ,initform)) | |
,@body | |
,name)) | |
(defstruct (stree (:constructor create-stree (string root))) | |
(string "" :type string) | |
root) | |
(declaim (inline create-stree-node)) | |
(defstruct (stree-node :conc-name | |
(:constructor create-stree-node (parent start depth))) | |
parent | |
(start 0 :type fixnum) | |
(depth 0 :type fixnum) | |
children slink);;stree-node | |
;; Code for printing. | |
;; Basic idea is to use *stree-print-string* to communicate the | |
;; "string" associated with a suffix-tree. | |
;; Why does (let ((*stree-print-string* "abracadabra$")) | |
;; (stree-root (build-suffix-tree "abracadabra$"))) | |
;; not DWIM? | |
(defvar *stree-print-string* nil) | |
(defun stree-substring (node) | |
(subseq *stree-print-string* (start node) (+ (start node) (depth node)))) | |
(defmethod print-object ((s stree) (str stream)) | |
(let ((*stree-print-string* (stree-string s))) | |
(format str "#<stree(~A), ~A>" (stree-string s) (stree-root s)))) | |
(defmethod print-object ((s-n stree-node) (str stream)) | |
(if *stree-print-string* | |
(print-node-string s-n str) | |
(print-node-no-string s-n str))) | |
(defun print-node-string (s-n str) | |
(format str "<~A" (stree-substring s-n)) | |
(when (slink s-n) | |
(format str ", slink=~A" (stree-substring (slink s-n)))) | |
(when (children s-n) | |
(format str ", ch=~A" (children s-n))) | |
(format str ">"));;print-node-string | |
(defun print-node-no-string (s-n str) | |
(format str "#<NODE: s=~A, d=~D" | |
(start s-n) | |
(depth s-n)) | |
(let ((slink (slink s-n))) | |
(when slink | |
(format str ", slink=(~A ~A)" (start slink) (depth slink)))) | |
(when (children s-n) | |
(format str ", ch=~A" (children s-n))) | |
(format str ">"));;print-node-no-string | |
(defun tree-char (stree index) | |
(declare (type fixnum index)) | |
(char (stree-string stree) index)) | |
(defun make-root () | |
(returning (root (create-stree-node nil 0 0)) | |
(setf (parent root) root | |
(slink root) root))) | |
(defun add-child (node child) | |
(push child (children node)) | |
nil) | |
(defun replace-child (parent old-child new-child) | |
(do ((c (children parent) (cdr c))) | |
((eq (car c) old-child) | |
(progn | |
(setf (car c) new-child) | |
nil)) | |
()));;replace-child | |
(defun fork-node (node depth) | |
(let ((parent (parent node))) | |
(let ((new-node (create-stree-node parent (start node) depth))) | |
(replace-child parent node new-node) | |
(add-child new-node node) | |
(setf (parent node) new-node) | |
new-node)));;fork-node | |
(defun get-child (tree node char) | |
(labels ((get-child-rec (children) | |
(when children | |
(let ((cur-child (car children))) | |
(if (char= (tree-char tree (+ (start cur-child) | |
(depth node))) | |
char) | |
cur-child | |
(get-child-rec (cdr children))))))) | |
(get-child-rec (children node))));;get-child | |
(defun build-suffix-tree (string &optional num-children) | |
"User must pass a special-char delimited string." | |
(declare (type string string)) | |
(returning (tree (create-stree string (make-root))) | |
(let ((last (stree-root tree)) | |
(length (length string))) | |
(let ((n (or num-children length))) | |
(declare (type fixnum n)) | |
(dotimes (i n) | |
(declare (type fixnum i)) | |
(setf last (get-branch tree last i)) | |
(add-child last (create-stree-node last i (- length i))))))));;build-suffix-tree | |
(defun guarantee-slink (tree L i d) | |
(let ((dL (depth L))) | |
(declare (type fixnum d) | |
(type fixnum dL) | |
(type fixnum i)) | |
(cond ((= d dL) L) | |
((> d dL) | |
(guarantee-slink tree | |
(get-child tree L (tree-char tree (+ i dL))) | |
i d)) | |
(t (fork-node L d)))));;guarantee-slink | |
(defun get-branch (tree last i) | |
(declare (type fixnum i)) | |
(let* ((initial-depth (max 0 (- (depth last) 1))) | |
(slink (or (slink last) | |
(setf (slink last) | |
(guarantee-slink tree | |
(slink (parent last)) | |
i | |
initial-depth))))) | |
(labels ((tchar (d) (tree-char tree d)) | |
(sref (d) (declare (type fixnum d)) (tchar (+ i d))) | |
(get-branch-rec (N d) | |
(declare (type fixnum d)) | |
(if (= (depth N) d) | |
(let ((ch (get-child tree N (sref (depth N))))) | |
(if (null ch) | |
N | |
(get-branch-rec ch d))) | |
(if (char= (tchar (+ (start N) d)) (sref d)) | |
(get-branch-rec N (+ d 1)) | |
(fork-node N d))))) | |
(get-branch-rec slink initial-depth))));;get-branch | |
(defun random-string (n chars &optional (end-char #\$)) | |
(let ((nchars (length chars))) | |
(returning (s (make-string (+ n 1))) | |
(dotimes (i n) | |
(setf (char s i) (char chars (random nchars)))) | |
(setf (char s n) end-char)))) | |
;;;; suffix-tree.lisp -- -- ;;;; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
(http://groups.google.com/group/comp.lang.lisp/msg/d0828f590a78e958)
As promised, I've posted my current implementations in
http://fpn.mit.edu/Downloads/SuffixTree
suffix-tree.lisp is the basic implementation. suffix-tree-2.lisp is a
modified implementation in which the leaves are just fixnums rather
than being "nodes". Both implementations use the McCreight algorithm.
The implementations are both O(n) in time and space and are therefore
"asymptotically efficient." The constants are not as good as they
could be. suffix-tree uses about 80 bytes/char, and suffix-tree-2
uses about 50 bytes/char (under CMUCL 19a). I believe the best C
implementations use about 25 bytes/char.
Any comments on my code are appreciated; I'm always trying to improve
my skills. The printing code in particular seems kludgy to me and
doesn't even do what I want (see the comment in suffix-tree.lisp), any
insights on this are most welcome.
Cheers,
rif