Created
September 25, 2010 08:16
-
-
Save naoyat/596609 to your computer and use it in GitHub Desktop.
[n-gram] FSNLP p.200の図6.3の数字が出るのかどうか
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Gauche | |
(use srfi-1) | |
(use srfi-13) | |
(define (string-list< sl1 sl2) | |
(cond ((null? sl1) #t) | |
((null? sl2) #f) | |
((string< (car sl1) (car sl2)) #t) | |
((string> (car sl1) (car sl2)) #f) | |
(else (string-list< (cdr sl1) (cdr sl2))))) | |
(define (uniq-cnt ls) | |
(let1 ht (make-hash-table 'equal?) | |
(for-each | |
(lambda (elm) | |
(hash-table-put! ht elm (+ 1 (hash-table-get ht elm 0)))) | |
ls) | |
(hash-table-map ht cons))) | |
(define (n-gram n words) | |
(if (= n 1) (map list words) | |
(let1 s (append (cons "^" words) '("$")) | |
(let loop ((l (length s)) (rest s) (result '())) | |
(if (< l n) result | |
(loop (- l 1) (cdr rest) (cons (take rest n) result))))))) | |
(define (n-1-gram n n-gram) | |
(let ((n-1 (- n 1)) | |
(ht (make-hash-table 'equal?)) | |
(htc (make-hash-table 'equal?))) | |
(for-each | |
(lambda (p) | |
(let* ((ng (car p)) (cnt (cdr p)) | |
(n-1g (take ng n-1)) (w (car (last-pair ng)))) | |
; (print "- " n-1g " " w " " cnt) | |
(let1 ht* (or (hash-table-get ht n-1g #f) (make-hash-table 'equal?)) | |
(hash-table-put! ht* w cnt) | |
(hash-table-put! ht n-1g ht*)) | |
(hash-table-put! htc n-1g | |
(+ cnt (hash-table-get htc n-1g 0))) | |
)) | |
n-gram) | |
(values ht htc))) | |
(define (n-gram-pp ng) | |
(string-append "{ " (string-join (map (lambda (ws) (string-join ws "-")) ng) ", ") " }")) | |
(define (load-file-as-wordlist path) | |
(with-input-from-file path | |
(lambda () | |
(let loop ((buf '())) | |
(let1 line (read-line) | |
(cond ((eof-object? line) | |
(apply append (reverse! buf))) | |
(else | |
(let1 splitted (string-split line #\Space) | |
(loop (cons splitted buf)))))))))) | |
(define austen (load-file-as-wordlist "austen.txt")) | |
(define persuasion (load-file-as-wordlist "ja-pers-clean.txt")) | |
(define (not-blank? w) (not (string=? "" w))) | |
(define austen-N 617091) ;(length (filter not-blank? austen))) ; 617091 | |
(define austen-V 14585) ;(- (length austen-1-gram) 1)) ; 14585 | |
;; n-gram | |
(let* ((n 3) | |
(austen-n-gram (uniq-cnt (n-gram n austen))) | |
(persuasion-n-gram (n-gram n persuasion))) | |
(receive (ht hcnt) (n-1-gram n austen-n-gram) | |
(for-each | |
(lambda (ws) | |
(let* ((n-1g (take ws (- n 1))) | |
(w (car (last-pair ws))) | |
(den (hash-table-get hcnt n-1g #f)) | |
(ht* (hash-table-get ht n-1g #f)) | |
(num (if ht* (hash-table-get ht* w #f) #f)) | |
(r (if (and num den) (/. num den) 0))) | |
(format #t "p(~a|~a) = ~a\n" w (string-join n-1g ",") r))) | |
persuasion-n-gram) )) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment