Skip to content

Instantly share code, notes, and snippets.

@veer66
Created May 4, 2021 16:41
Show Gist options
  • Save veer66/60042bee7a3233ec94961060b2f832a7 to your computer and use it in GitHub Desktop.
Save veer66/60042bee7a3233ec94961060b2f832a7 to your computer and use it in GitHub Desktop.
2021
(defpackage #:cl-wordcut
(:use #:cl)
(:import-from #:str :join))
(in-package #:cl-wordcut)
(declaim (optimize (speed 3) (debug 0) (safety 0)))
(defconstant +default-dix-ptr-size+ 1024)
(defstruct node-key
(row-no 0 :type integer)
(offset 0 :type integer)
(ch #\0 :type character))
(defstruct node-ptr
(row-no 0 :type integer)
(is-final nil :type boolean)
payload)
(defun init-prefix-tree ()
(make-hash-table :test #'equalp))
(defun make-prefix-tree (sorted-words-with-payload)
(let ((prefix-tree (init-prefix-tree))
(i 0))
(loop for word-payload in sorted-words-with-payload
do
(destructuring-bind (word payload)
word-payload
(let* ((row-no 0)
(ch-list (coerce word 'list))
(ch-len (length ch-list))
(j 0))
(loop for ch in ch-list
do
(let ((node-key (make-node-key :row-no row-no :offset j :ch ch)))
(let ((ex-node-ptr (gethash node-key prefix-tree)))
(if ex-node-ptr
(setq row-no (node-ptr-row-no ex-node-ptr))
(let* ((is-final (= (+ j 1) ch-len))
(node-ptr (make-node-ptr :row-no i :is-final is-final :payload (when is-final payload))))
(setf (gethash node-key prefix-tree) node-ptr)
(setq row-no i))))
(incf j)))))
(incf i))
prefix-tree))
(defun lookup (prefix-tree row-no offset ch)
(gethash (make-node-key :row-no row-no :offset offset :ch ch)
prefix-tree))
(defconstant +unk+ 1)
(defconstant +dict+ 2)
(defconstant +init+ 3)
(defconstant +latin+ 4)
(defconstant +punc+ 5)
(defstruct link
(p-idx 0 :type integer)
(w 0 :type integer)
(unk 0 :type integer)
(kind 0 :type integer))
(defun betterp (l r)
(or (< (link-unk l)
(link-unk r))
(< (link-w l)
(link-w r))))
(defconstant +waiting+ 1)
(defconstant +activated+ 2)
(defconstant +completed+ 3)
(defstruct latin-transducer
(s 0 :type integer)
(e 0 :type integer)
(state +waiting+ :type integer))
(defconstant +cap-a-code+ (char-code #\A))
(defconstant +cap-z-code+ (char-code #\Z))
(defconstant +a-code+ (char-code #\a))
(defconstant +z-code+ (char-code #\z))
(defparameter *latin-tab* (make-hash-table))
(loop for ch from +cap-a-code+ to +cap-z-code+
do (setf (gethash ch *latin-tab*) t))
(loop for ch from +a-code+ to +z-code+
do (setf (gethash ch *latin-tab*) t))
(defun latin-p (ch)
(let ((code (char-code ch)))
(gethash ch *latin-tab*)))
(defun update-latin (transducer ch i ch-vec)
(if (eq +waiting+ (latin-transducer-state transducer))
(if (latin-p ch)
(progn
(setf (latin-transducer-s transducer) i)
(setf (latin-transducer-state transducer) +activated+)
(when (or (eq (length ch-vec) (+ i 1))
(not (latin-p (aref ch-vec (+ i 1)))))
(progn
(setf (latin-transducer-e transducer) (+ i 1))
(setf (latin-transducer-state transducer) +completed+)))))
(if (latin-p ch)
(when (or (eq (length ch-vec) (+ i 1))
(not (latin-p (aref ch-vec (+ i 1)))))
(progn
(setf (latin-transducer-e transducer) (+ i 1))
(setf (latin-transducer-state transducer) +completed+)))
(setf (latin-transducer-state transducer) +waiting+))))
(defclass punc-transducer ()
((s :accessor s :initform 0 :type integer)
(e :accessor e :initform 0 :type integer)
(state :accessor state :initform +waiting+ :type integer)
(link-kind :accessor transducer-link-kind :initform +punc+)))
(defun punc-p (ch)
(eq #\space ch))
(defmethod update-punc ((transducer punc-transducer) (ch character) (i integer) (ch-vec vector))
(if (eq +waiting+ (state transducer))
(if (punc-p ch)
(progn
(setf (slot-value transducer 's) i)
(setf (slot-value transducer 'state) +activated+)
(when (or (eq (length ch-vec) (+ i 1))
(not (punc-p (aref ch-vec (+ i 1)))))
(progn
(setf (slot-value transducer 'e) (+ i 1))
(setf (slot-value transducer 'state) +completed+)))))
(if (punc-p ch)
(when (or (eq (length ch-vec) (+ i 1))
(not (punc-p (aref ch-vec (+ i 1)))))
(progn
(setf (slot-value transducer 'e) (+ i 1))
(setf (slot-value transducer 'state) +completed+)))
(setf (slot-value transducer 'state) +waiting+))))
(defstruct dix-ptr
(s 0 :type integer)
(row-no 0 :type integer)
(is-final t :type boolean))
(defun build-path (dix s)
(let* ((left-boundary 0)
(s-len (length s))
(ch-list (coerce s 'list))
(ch-vec (make-array s-len :element-type 'character :initial-contents ch-list))
(path (make-array (+ s-len 1)))
(dix-ptrs (make-array +default-dix-ptr-size+ :fill-pointer 0))
(i 0)
(latin-transducer (make-latin-transducer))
(punc-transducer (make-instance 'punc-transducer))
)
(setf (aref path 0) (make-link :p-idx 0 :w 0 :unk 0 :kind +init+))
(loop for ch in ch-list
do
(let* ((unk-link (aref path left-boundary))
(link (make-link :p-idx left-boundary
:w (+ (link-w unk-link) 1)
:unk (+ (link-unk unk-link) 1)
:kind +unk+))
(j 0))
(vector-push-extend (make-dix-ptr :s i :row-no 0 :is-final nil) dix-ptrs)
(loop while (< j (length dix-ptrs))
do
(let* ((dix-ptr (aref dix-ptrs j))
(offset (- i (dix-ptr-s dix-ptr)))
(row-no (dix-ptr-row-no dix-ptr))
(child (lookup dix row-no offset ch)))
(if child
(progn
(setf (aref dix-ptrs j)
(make-dix-ptr :s (dix-ptr-s dix-ptr)
:row-no (node-ptr-row-no child)
:is-final (node-ptr-is-final child)))
(incf j))
(if (eq (+ j 1) (length dix-ptrs))
(progn
(incf j)
(vector-pop dix-ptrs))
(setf (aref dix-ptrs j) (vector-pop dix-ptrs))))))
(loop for dix-ptr across dix-ptrs
do
(when (dix-ptr-is-final dix-ptr)
(let* ((new-s (dix-ptr-s dix-ptr))
(dix-link (aref path new-s))
(new-link (make-link :p-idx new-s
:w (link-w dix-link)
:unk (link-unk dix-link)
:kind +dict+)))
(when (betterp new-link link)
(setq link new-link)))))
(update-latin latin-transducer ch i ch-vec)
;;(update-punc punc-transducer ch i ch-vec)
;; (when (eq (state latin-transducer) +completed+)
;; (let* ((s (s transducer))
;; (prev-link (aref path s))
;; (new-link (make-link :p-idx s
;; :w (+ (link-w prev-link) 1)
;; :unk (link-unk prev-link)
;; :kind +latin+)))
;; (when (betterp new-link link)
;; (setq link new-link))))
(unless (eq (link-kind link) +unk+)
(setf left-boundary i))
(setf (aref path (+ i 1)) link)
(incf i)))
path))
;; (defparameter *x* (make-array 8 :fill-pointer 0))
;; (vector-push-extend (make-dix-ptr :s 1 :row-no 0 :final nil) *x*)
;; (build-path (make-prefix-tree '(("กา" 1) ("กาม" 2)))
;; "กากาม")
;; (build-path (make-prefix-tree '(("กา" 1) ("กาม" 2)))
;; "KAKA")
;; (build-path (make-prefix-tree '(("กา" 1) ("กาม" 2)))
;; "A B")
;; (lookup (make-prefix-tree '(("A" 1))) 0 0 #\A)
;; (lookup (make-prefix-tree '(("กา" 1) ("กาม" 2))) 0 0 #\ก)
;; (lookup (make-prefix-tree '(("กา" 1) ("กาม" 2))) 0 1 #\ก) ;; nil
;; (lookup (make-prefix-tree '(("กา" 1) ("กาม" 2))) 0 1 #\า)
;; (alexandria:hash-table-alist (make-prefix-tree '(("กา" 1) ("กาม" 2))))
(defstruct range
(s 0 :type integer)
(e 0 :type integer))
(defun path-to-ranges (path)
(let ((e (- (length path) 1))
(ranges nil))
(loop while (> e 0)
do
(let* ((link (aref path e))
(s (link-p-idx link))
(r (make-range :s s :e e)))
(setq ranges (cons r ranges))
(setq e s)))
ranges))
;; (path-to-ranges (build-path (make-prefix-tree '(("กา" 1) ("กาม" 2)))
;; "กากาม"))
(defun ranges-to-toks (ranges str)
(loop for r in ranges
collect
(subseq str (range-s r) (range-e r))))
;; (ranges-to-toks
;; (path-to-ranges (build-path (make-prefix-tree '(("กา" 1) ("กาม" 2)))
;; "กากาม"))
;; "กากาม")
(defun tokenize (dix str)
(ranges-to-toks (path-to-ranges (build-path dix str))
str))
;; (tokenize (make-prefix-tree '(("กา" 1) ("กาม" 2))) "กากาม")
(defun tokenize-with-delim (dix str delim)
(join delim (tokenize dix str)))
;; (tokenize-with-delim (make-prefix-tree '(("กา" 1) ("กาม" 2))) "กากาม" "|")
(defparameter *dix*
(make-prefix-tree
(with-open-file (f #P"thai.txt" :external-format :utf-8)
(loop for line = (read-line f nil nil)
while line
collect (list line 1)))))
(defun bench1 ()
(time
(with-open-file (f #P"wik100k.txt" :external-format :utf-8)
(loop for line = (read-line f nil nil)
while line
do (tokenize-with-delim *dix* line "|")))))
(bench1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment