Skip to content

Instantly share code, notes, and snippets.

@naoyat
Created September 25, 2010 08:16
Show Gist options
  • Save naoyat/596609 to your computer and use it in GitHub Desktop.
Save naoyat/596609 to your computer and use it in GitHub Desktop.
[n-gram] FSNLP p.200の図6.3の数字が出るのかどうか
;; Gauche
(use srfi-1)
(use srfi-13)
(define (string-list< sl1 sl2)
(cond ((null? sl1) #t)
((null? sl2) #f)
((string< (car sl1) (car sl2)) #t)
((string> (car sl1) (car sl2)) #f)
(else (string-list< (cdr sl1) (cdr sl2)))))
(define (uniq-cnt ls)
(let1 ht (make-hash-table 'equal?)
(for-each
(lambda (elm)
(hash-table-put! ht elm (+ 1 (hash-table-get ht elm 0))))
ls)
(hash-table-map ht cons)))
(define (n-gram n words)
(if (= n 1) (map list words)
(let1 s (append (cons "^" words) '("$"))
(let loop ((l (length s)) (rest s) (result '()))
(if (< l n) result
(loop (- l 1) (cdr rest) (cons (take rest n) result)))))))
(define (n-1-gram n n-gram)
(let ((n-1 (- n 1))
(ht (make-hash-table 'equal?))
(htc (make-hash-table 'equal?)))
(for-each
(lambda (p)
(let* ((ng (car p)) (cnt (cdr p))
(n-1g (take ng n-1)) (w (car (last-pair ng))))
; (print "- " n-1g " " w " " cnt)
(let1 ht* (or (hash-table-get ht n-1g #f) (make-hash-table 'equal?))
(hash-table-put! ht* w cnt)
(hash-table-put! ht n-1g ht*))
(hash-table-put! htc n-1g
(+ cnt (hash-table-get htc n-1g 0)))
))
n-gram)
(values ht htc)))
(define (n-gram-pp ng)
(string-append "{ " (string-join (map (lambda (ws) (string-join ws "-")) ng) ", ") " }"))
(define (load-file-as-wordlist path)
(with-input-from-file path
(lambda ()
(let loop ((buf '()))
(let1 line (read-line)
(cond ((eof-object? line)
(apply append (reverse! buf)))
(else
(let1 splitted (string-split line #\Space)
(loop (cons splitted buf))))))))))
(define austen (load-file-as-wordlist "austen.txt"))
(define persuasion (load-file-as-wordlist "ja-pers-clean.txt"))
(define (not-blank? w) (not (string=? "" w)))
(define austen-N 617091) ;(length (filter not-blank? austen))) ; 617091
(define austen-V 14585) ;(- (length austen-1-gram) 1)) ; 14585
;; n-gram
(let* ((n 3)
(austen-n-gram (uniq-cnt (n-gram n austen)))
(persuasion-n-gram (n-gram n persuasion)))
(receive (ht hcnt) (n-1-gram n austen-n-gram)
(for-each
(lambda (ws)
(let* ((n-1g (take ws (- n 1)))
(w (car (last-pair ws)))
(den (hash-table-get hcnt n-1g #f))
(ht* (hash-table-get ht n-1g #f))
(num (if ht* (hash-table-get ht* w #f) #f))
(r (if (and num den) (/. num den) 0)))
(format #t "p(~a|~a) = ~a\n" w (string-join n-1g ",") r)))
persuasion-n-gram) ))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment