Created
May 4, 2021 16:41
-
-
Save veer66/60042bee7a3233ec94961060b2f832a7 to your computer and use it in GitHub Desktop.
2021
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| (defpackage #:cl-wordcut | |
| (:use #:cl) | |
| (:import-from #:str :join)) | |
| (in-package #:cl-wordcut) | |
| (declaim (optimize (speed 3) (debug 0) (safety 0))) | |
| (defconstant +default-dix-ptr-size+ 1024) | |
| (defstruct node-key | |
| (row-no 0 :type integer) | |
| (offset 0 :type integer) | |
| (ch #\0 :type character)) | |
| (defstruct node-ptr | |
| (row-no 0 :type integer) | |
| (is-final nil :type boolean) | |
| payload) | |
| (defun init-prefix-tree () | |
| (make-hash-table :test #'equalp)) | |
| (defun make-prefix-tree (sorted-words-with-payload) | |
| (let ((prefix-tree (init-prefix-tree)) | |
| (i 0)) | |
| (loop for word-payload in sorted-words-with-payload | |
| do | |
| (destructuring-bind (word payload) | |
| word-payload | |
| (let* ((row-no 0) | |
| (ch-list (coerce word 'list)) | |
| (ch-len (length ch-list)) | |
| (j 0)) | |
| (loop for ch in ch-list | |
| do | |
| (let ((node-key (make-node-key :row-no row-no :offset j :ch ch))) | |
| (let ((ex-node-ptr (gethash node-key prefix-tree))) | |
| (if ex-node-ptr | |
| (setq row-no (node-ptr-row-no ex-node-ptr)) | |
| (let* ((is-final (= (+ j 1) ch-len)) | |
| (node-ptr (make-node-ptr :row-no i :is-final is-final :payload (when is-final payload)))) | |
| (setf (gethash node-key prefix-tree) node-ptr) | |
| (setq row-no i)))) | |
| (incf j))))) | |
| (incf i)) | |
| prefix-tree)) | |
| (defun lookup (prefix-tree row-no offset ch) | |
| (gethash (make-node-key :row-no row-no :offset offset :ch ch) | |
| prefix-tree)) | |
| (defconstant +unk+ 1) | |
| (defconstant +dict+ 2) | |
| (defconstant +init+ 3) | |
| (defconstant +latin+ 4) | |
| (defconstant +punc+ 5) | |
| (defstruct link | |
| (p-idx 0 :type integer) | |
| (w 0 :type integer) | |
| (unk 0 :type integer) | |
| (kind 0 :type integer)) | |
| (defun betterp (l r) | |
| (or (< (link-unk l) | |
| (link-unk r)) | |
| (< (link-w l) | |
| (link-w r)))) | |
| (defconstant +waiting+ 1) | |
| (defconstant +activated+ 2) | |
| (defconstant +completed+ 3) | |
| (defstruct latin-transducer | |
| (s 0 :type integer) | |
| (e 0 :type integer) | |
| (state +waiting+ :type integer)) | |
| (defconstant +cap-a-code+ (char-code #\A)) | |
| (defconstant +cap-z-code+ (char-code #\Z)) | |
| (defconstant +a-code+ (char-code #\a)) | |
| (defconstant +z-code+ (char-code #\z)) | |
| (defparameter *latin-tab* (make-hash-table)) | |
| (loop for ch from +cap-a-code+ to +cap-z-code+ | |
| do (setf (gethash ch *latin-tab*) t)) | |
| (loop for ch from +a-code+ to +z-code+ | |
| do (setf (gethash ch *latin-tab*) t)) | |
| (defun latin-p (ch) | |
| (let ((code (char-code ch))) | |
| (gethash ch *latin-tab*))) | |
| (defun update-latin (transducer ch i ch-vec) | |
| (if (eq +waiting+ (latin-transducer-state transducer)) | |
| (if (latin-p ch) | |
| (progn | |
| (setf (latin-transducer-s transducer) i) | |
| (setf (latin-transducer-state transducer) +activated+) | |
| (when (or (eq (length ch-vec) (+ i 1)) | |
| (not (latin-p (aref ch-vec (+ i 1))))) | |
| (progn | |
| (setf (latin-transducer-e transducer) (+ i 1)) | |
| (setf (latin-transducer-state transducer) +completed+))))) | |
| (if (latin-p ch) | |
| (when (or (eq (length ch-vec) (+ i 1)) | |
| (not (latin-p (aref ch-vec (+ i 1))))) | |
| (progn | |
| (setf (latin-transducer-e transducer) (+ i 1)) | |
| (setf (latin-transducer-state transducer) +completed+))) | |
| (setf (latin-transducer-state transducer) +waiting+)))) | |
| (defclass punc-transducer () | |
| ((s :accessor s :initform 0 :type integer) | |
| (e :accessor e :initform 0 :type integer) | |
| (state :accessor state :initform +waiting+ :type integer) | |
| (link-kind :accessor transducer-link-kind :initform +punc+))) | |
| (defun punc-p (ch) | |
| (eq #\space ch)) | |
| (defmethod update-punc ((transducer punc-transducer) (ch character) (i integer) (ch-vec vector)) | |
| (if (eq +waiting+ (state transducer)) | |
| (if (punc-p ch) | |
| (progn | |
| (setf (slot-value transducer 's) i) | |
| (setf (slot-value transducer 'state) +activated+) | |
| (when (or (eq (length ch-vec) (+ i 1)) | |
| (not (punc-p (aref ch-vec (+ i 1))))) | |
| (progn | |
| (setf (slot-value transducer 'e) (+ i 1)) | |
| (setf (slot-value transducer 'state) +completed+))))) | |
| (if (punc-p ch) | |
| (when (or (eq (length ch-vec) (+ i 1)) | |
| (not (punc-p (aref ch-vec (+ i 1))))) | |
| (progn | |
| (setf (slot-value transducer 'e) (+ i 1)) | |
| (setf (slot-value transducer 'state) +completed+))) | |
| (setf (slot-value transducer 'state) +waiting+)))) | |
| (defstruct dix-ptr | |
| (s 0 :type integer) | |
| (row-no 0 :type integer) | |
| (is-final t :type boolean)) | |
| (defun build-path (dix s) | |
| (let* ((left-boundary 0) | |
| (s-len (length s)) | |
| (ch-list (coerce s 'list)) | |
| (ch-vec (make-array s-len :element-type 'character :initial-contents ch-list)) | |
| (path (make-array (+ s-len 1))) | |
| (dix-ptrs (make-array +default-dix-ptr-size+ :fill-pointer 0)) | |
| (i 0) | |
| (latin-transducer (make-latin-transducer)) | |
| (punc-transducer (make-instance 'punc-transducer)) | |
| ) | |
| (setf (aref path 0) (make-link :p-idx 0 :w 0 :unk 0 :kind +init+)) | |
| (loop for ch in ch-list | |
| do | |
| (let* ((unk-link (aref path left-boundary)) | |
| (link (make-link :p-idx left-boundary | |
| :w (+ (link-w unk-link) 1) | |
| :unk (+ (link-unk unk-link) 1) | |
| :kind +unk+)) | |
| (j 0)) | |
| (vector-push-extend (make-dix-ptr :s i :row-no 0 :is-final nil) dix-ptrs) | |
| (loop while (< j (length dix-ptrs)) | |
| do | |
| (let* ((dix-ptr (aref dix-ptrs j)) | |
| (offset (- i (dix-ptr-s dix-ptr))) | |
| (row-no (dix-ptr-row-no dix-ptr)) | |
| (child (lookup dix row-no offset ch))) | |
| (if child | |
| (progn | |
| (setf (aref dix-ptrs j) | |
| (make-dix-ptr :s (dix-ptr-s dix-ptr) | |
| :row-no (node-ptr-row-no child) | |
| :is-final (node-ptr-is-final child))) | |
| (incf j)) | |
| (if (eq (+ j 1) (length dix-ptrs)) | |
| (progn | |
| (incf j) | |
| (vector-pop dix-ptrs)) | |
| (setf (aref dix-ptrs j) (vector-pop dix-ptrs)))))) | |
| (loop for dix-ptr across dix-ptrs | |
| do | |
| (when (dix-ptr-is-final dix-ptr) | |
| (let* ((new-s (dix-ptr-s dix-ptr)) | |
| (dix-link (aref path new-s)) | |
| (new-link (make-link :p-idx new-s | |
| :w (link-w dix-link) | |
| :unk (link-unk dix-link) | |
| :kind +dict+))) | |
| (when (betterp new-link link) | |
| (setq link new-link))))) | |
| (update-latin latin-transducer ch i ch-vec) | |
| ;;(update-punc punc-transducer ch i ch-vec) | |
| ;; (when (eq (state latin-transducer) +completed+) | |
| ;; (let* ((s (s transducer)) | |
| ;; (prev-link (aref path s)) | |
| ;; (new-link (make-link :p-idx s | |
| ;; :w (+ (link-w prev-link) 1) | |
| ;; :unk (link-unk prev-link) | |
| ;; :kind +latin+))) | |
| ;; (when (betterp new-link link) | |
| ;; (setq link new-link)))) | |
| (unless (eq (link-kind link) +unk+) | |
| (setf left-boundary i)) | |
| (setf (aref path (+ i 1)) link) | |
| (incf i))) | |
| path)) | |
| ;; (defparameter *x* (make-array 8 :fill-pointer 0)) | |
| ;; (vector-push-extend (make-dix-ptr :s 1 :row-no 0 :final nil) *x*) | |
| ;; (build-path (make-prefix-tree '(("กา" 1) ("กาม" 2))) | |
| ;; "กากาม") | |
| ;; (build-path (make-prefix-tree '(("กา" 1) ("กาม" 2))) | |
| ;; "KAKA") | |
| ;; (build-path (make-prefix-tree '(("กา" 1) ("กาม" 2))) | |
| ;; "A B") | |
| ;; (lookup (make-prefix-tree '(("A" 1))) 0 0 #\A) | |
| ;; (lookup (make-prefix-tree '(("กา" 1) ("กาม" 2))) 0 0 #\ก) | |
| ;; (lookup (make-prefix-tree '(("กา" 1) ("กาม" 2))) 0 1 #\ก) ;; nil | |
| ;; (lookup (make-prefix-tree '(("กา" 1) ("กาม" 2))) 0 1 #\า) | |
| ;; (alexandria:hash-table-alist (make-prefix-tree '(("กา" 1) ("กาม" 2)))) | |
| (defstruct range | |
| (s 0 :type integer) | |
| (e 0 :type integer)) | |
| (defun path-to-ranges (path) | |
| (let ((e (- (length path) 1)) | |
| (ranges nil)) | |
| (loop while (> e 0) | |
| do | |
| (let* ((link (aref path e)) | |
| (s (link-p-idx link)) | |
| (r (make-range :s s :e e))) | |
| (setq ranges (cons r ranges)) | |
| (setq e s))) | |
| ranges)) | |
| ;; (path-to-ranges (build-path (make-prefix-tree '(("กา" 1) ("กาม" 2))) | |
| ;; "กากาม")) | |
| (defun ranges-to-toks (ranges str) | |
| (loop for r in ranges | |
| collect | |
| (subseq str (range-s r) (range-e r)))) | |
| ;; (ranges-to-toks | |
| ;; (path-to-ranges (build-path (make-prefix-tree '(("กา" 1) ("กาม" 2))) | |
| ;; "กากาม")) | |
| ;; "กากาม") | |
| (defun tokenize (dix str) | |
| (ranges-to-toks (path-to-ranges (build-path dix str)) | |
| str)) | |
| ;; (tokenize (make-prefix-tree '(("กา" 1) ("กาม" 2))) "กากาม") | |
| (defun tokenize-with-delim (dix str delim) | |
| (join delim (tokenize dix str))) | |
| ;; (tokenize-with-delim (make-prefix-tree '(("กา" 1) ("กาม" 2))) "กากาม" "|") | |
| (defparameter *dix* | |
| (make-prefix-tree | |
| (with-open-file (f #P"thai.txt" :external-format :utf-8) | |
| (loop for line = (read-line f nil nil) | |
| while line | |
| collect (list line 1))))) | |
| (defun bench1 () | |
| (time | |
| (with-open-file (f #P"wik100k.txt" :external-format :utf-8) | |
| (loop for line = (read-line f nil nil) | |
| while line | |
| do (tokenize-with-delim *dix* line "|"))))) | |
| (bench1) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment