Created
January 20, 2017 20:35
-
-
Save EarlGray/8d10c8cab4bc03314826dd6ca0a4d023 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(use-modules
  (ice-9 hash-table)
  (ice-9 rdelim)   ; read-line (used by for-each-file-line and tokenize helpers)
  (srfi srfi-1))
; Calls `thunk` on every line of the file at `filepath`.
; `thunk` receives each line as a string without the trailing newline.
(define (for-each-file-line filepath thunk)
  (with-input-from-file filepath (lambda ()
    (let loop ((line (read-line)))
      ; read-line returns the EOF object at end of file, which is truthy,
      ; so testing `line` itself (as the old code did) never terminates;
      ; test explicitly for EOF instead.
      (unless (eof-object? line)
        (thunk line)
        (loop (read-line)))))))
; Sum of all numeric elements of vector `v` (0 for an empty vector).
(define (vector-sum v)
  (let loop ((i (- (vector-length v) 1)) (acc 0))
    (if (negative? i)
        acc
        (loop (- i 1) (+ acc (vector-ref v i))))))
; returns a hash-table from words to #t
; A "word" is a maximal run of alphabetic characters; words are
; lower-cased before being stored.  Non-alphabetic characters only
; act as separators and are discarded.
(define (tokenize text)
  (let ((words (make-hash-table)))
    ; Walk the characters of `text`, accumulating the current word in
    ; reverse as a list of downcased characters.  The old version kept a
    ; one-character lookbehind (`prev`) and duplicated the word-flushing
    ; logic in two branches; folding over the character list needs neither.
    (let loop ((chars (string->list text))
               (word '()))
      ; Store the accumulated word (if any) into the result table.
      (define (flush-word!)
        (unless (null? word)
          (hash-set! words (list->string (reverse word)) #t)))
      (cond
        ((null? chars)
         (flush-word!)
         words)
        ((char-alphabetic? (car chars))
         (loop (cdr chars) (cons (char-downcase (car chars)) word)))
        (else
         ; separator: terminate the current word, if one was in progress
         (flush-word!)
         (loop (cdr chars) '()))))))
; Number of distinct classification labels (this classifier is binary:
; labels 0 and 1).
(define label-count 2)
; Increments the counter stored at `index` of `vect` in place.
; A #f slot counts as 0 and becomes 1.
(define (vector-inc-value vect index)
  ; The old code returned 1 without storing it when the slot held #f;
  ; in practice vectors here are 0-initialised, but store it properly anyway.
  (let ((value (vector-ref vect index)))
    (vector-set! vect index (if value (+ 1 value) 1))))
; Adds the words of `text` to the per-label frequency table.
(define (update-frequencies
          word-counts     ; hash table: word -> #(label0freq label1freq ...)
          text            ; a string with words
          training-label) ; label \in (0 .. label-count - 1)
  (hash-for-each
    (lambda (word _)
      ; fetch this word's count vector, creating a zeroed one on first sight
      (let ((counts (or (hash-ref word-counts word)
                        (let ((fresh (make-vector label-count 0)))
                          (hash-set! word-counts word fresh)
                          fresh))))
        (vector-inc-value counts training-label)))
    (tokenize text)))
; an example of using a training-stream:
; drains `training-stream`, printing each (label . text) pair it yields,
; then prints "Bye" once the stream returns #f.
(define (print-stream training-stream)
  (let loop ((item (training-stream)))
    (cond
      (item
       (newline)
       (write (car item)) (newline)   ; label
       (write (cdr item)) (newline)   ; text
       (loop (training-stream)))
      (else
       (display "Bye")
       (newline)))))
; training-stream is a function that produces
; - either #f as the end of stream;
; - or (label . text)
; returns (word-freqs . label-counts)
; where `word-freqs` is a hashtable from a word to vector of counts
;       `label-counts` is a vector of label counts
(define (word-freqs-for-stream training-stream)
  (let ((word-freqs (make-hash-table))
        (label-counts (make-vector label-count 0)))
    (let loop ((item (training-stream)))
      (cond
        (item
         (vector-inc-value label-counts (car item))
         (update-frequencies word-freqs (cdr item) (car item))
         (loop (training-stream)))
        (else
         (cons word-freqs label-counts))))))
; makes a word-freqs dict a dictionary of Pr(Class | Word):
;    Pr(Class | Word) = Count(Word /\ Class) / Count(Word)
; Mutates each count vector in place, replacing raw counts with
; inexact probabilities.
(define (make-probabilities word-freqs)
  (hash-for-each
    (lambda (word count-vect)
      ; total occurrences of this word across all labels
      (let ((n-word (exact->inexact (vector-sum count-vect))))
        (do ((i 0 (+ i 1)))
            ((>= i label-count))
          (vector-set! count-vect i
                       (/ (vector-ref count-vect i) n-word)))))
    word-freqs))
; returns Pr(Class | Word), or #f if the word was never seen in training
(define (probab-label-if-word word-probabs word label)
  (let ((probab-vect (hash-ref word-probabs word)))
    (and probab-vect (vector-ref probab-vect label))))
;                   p1 p2 ... pN
; p = ----------------------------------------------
;      p1 p2 ... pN + (1 - p1)(1 - p2)...(1 - pN)
; where
;   p = Pr(Class | Text) where Text = { Word1, Word2, ..., WordN }
;   p1 = Pr(Class | Word1)
;   ...
;   pN = Pr(Class | WordN)
; Computed in log space: eta = sum_i [ln(1 - p_i) - ln(p_i)],
; p = 1 / (1 + e^eta).  Words unknown to the model are skipped.
(define (probab-class-if-text word-probabs text label)
  (let ((probs '()))
    ; gather p_i for every known word of the text
    (hash-for-each
      (lambda (word _)
        (let ((p (probab-label-if-word word-probabs word label)))
          (when p
            (set! probs (cons p probs)))))
      (tokenize text))
    (let ((eta (fold (lambda (p acc)
                       (+ acc (- (log (- 1.0 p)) (log p))))
                     0.0
                     probs)))
      (/ 1.0 (+ 1.0 (exp eta))))))
; Dumps every word -> probability-vector entry of `word-probabs`,
; one "word: #(p0 p1 ...)" line per entry.
(define (debug-probabs word-probabs)
  (define (show-entry key val)
    (write key)
    (display ": ")
    (write val)
    (newline))
  (hash-for-each show-entry word-probabs))
; takes a training stream
; returns a closure `bayes-classifier`:
;    (define bayes-classifier (make-bayes-classifier training-stream))
;    (bayes-classifier text label)   =>  Pr(label | text)
(define (make-bayes-classifier training-stream)
  (let* ((wfs-lc (word-freqs-for-stream training-stream))
         (word-probabs (car wfs-lc)))
    ; convert raw counts to Pr(Class | Word) in place
    (make-probabilities word-probabs)
    (lambda (text label)
      (probab-class-if-text word-probabs text label))))
; vim: set sts=2 sw=2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(load "bayes.scm")
;; Labels: 0 - Europe, 1 - America
;; Toy training corpus: each entry is (label . text), where text is a
;; space-separated list of city names.
;; NOTE(review): several names are misspelled (Ottava, SoltLakeCity,
;; Kopenhagen, Sophia) and some cities appear under both labels; this is
;; harmless for the classifier as long as the spellings stay consistent
;; between training and query texts.
(define training-set
  '((0 . "London Paris NewYork Milan Kyiv Warsaw Milan Lisbon Madrid")
    (0 . "Lisbon Frankfurt Amsterdam SanFrancisco Oslo Warsaw Helsinki")
    (0 . "Helsinki Amsterdam SoltLakeCity Venice Oslo Kopenhagen Amsterdam")
    (0 . "Sophia Lisbon Berlin Miami Amsterdam Odessa Warsaw Prague")
    (0 . "Odessa Istanbul Amsterdam Sophia Dublin Philadelphia Venice Madrid")
    (0 . "Frankfurt Milan Berlin Kyiv Helsinki Warsaw Oslo Ottava Prague")
    (0 . "Odessa Frankfurt Washington Kyiv Kopenhagen Paris Berlin Madrid Warsaw")
    (0 . "Prague Madrid Sophia Dublin Miami Paris Istanbul Oslo Venice")
    (1 . "SanFrancisco Kansas Odessa Dallas Houston Washington Ottava ")
    (1 . "Ottava Kansas SanFrancisco NewYork Detroit Chicago Philadelphia")
    (1 . "Miami SanFrancisco Dallas Washington Odessa Seattle Paris")
    (1 . "Milan Ottava SanFrancisco Kyiv Chicago Philadelphia Toronto")
    (1 . "NewYork Dallas SoltLakeCity Detroit Kansas Seattle Ottava")
    (1 . "SoltLakeCity Portland Miami Oslo Philadelphia Toronto Sacramento")
    (1 . "Washington Lisbon Toronto Miami SoltLakeCity Portland Sacramento")
    (1 . "NewYork Washington LosAngeles Detroit Sacramento Seattle Prague")
    (1 . "LosAngeles Portland Amsterdam Miami Detroit Philadelphia Madrid")
    (1 . "Dallas LosAngeles Oklahoma Berlin Odessa Houston Chicago Kansas")))
; Returns a thunk that yields each (label . text) pair of `training-set`
; in order, then #f forever once the list is exhausted.
(define (produce-training-stream)
  (define ts training-set)
  (lambda ()
    (if (null? ts)               ; idiomatic empty test (was (eq? '() ts))
        #f
        (let ((item (car ts)))
          (set! ts (cdr ts))
          item))))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(use-modules
  (ice-9 hash-table)
  (ice-9 format)
  (ice-9 rdelim))
(load "bayes.scm")
; Root of the CSDMC2010 spam corpus checkout (relative to the CWD).
(define data-directory "CSDMC2010_SPAM/CSDMC2010_SPAM")
; Label file: presumably one "label filename" pair per line — see
; produce-mail-stream, which parses it that way.
(define training-labels-file (string-append data-directory "/SPAMTrain.label"))
; Directories holding the raw .eml training / testing messages.
(define training-mail-dir (string-append data-directory "/TRAINING"))
(define testing-mail-dir (string-append data-directory "/TESTING"))
; Full path of `filename` inside the training-mail directory.
(define (training-path filename)
  (format #f "~a/~a" training-mail-dir filename))
; Full path of `filename` inside the testing-mail directory.
(define (testing-path filename)
  (format #f "~a/~a" testing-mail-dir filename))
; Path of training mail number `num`, e.g. ".../TRAIN_00042.eml"
; (the number is zero-padded to 5 digits).
(define (training-mail-fpath-by-num num)
  ; (format #f ...) returns the string directly; no need for the old
  ; with-output-to-string + (format #t ...) detour.
  (training-path (format #f "TRAIN_~5,'0d.eml" num)))
; Path of testing mail number `num`, e.g. ".../TEST_00042.eml"
; (the number is zero-padded to 5 digits).
(define (testing-mail-fpath-by-num num)
  ; (format #f ...) returns the string directly; no need for the old
  ; with-output-to-string + (format #t ...) detour.
  (testing-path (format #f "TEST_~5,'0d.eml" num)))
; Returns the body of the mail file at `filename` as one string,
; skipping the header section (everything up to the first blank line).
(define (read-mail filename)
  (with-input-from-file filename
    (lambda ()
      ; Skip header lines; a blank line — or a lone "\r" left over from
      ; CRLF line endings — separates headers from the body.
      (let skip ((line (read-line)))
        (unless (or (eof-object? line)
                    (zero? (string-length line))
                    (string=? line "\r"))
          (skip (read-line))))
      ; everything after the separator is the body
      (read-string))))
; returns a thunk iterating over first n training mail (label . text)
; `n-mails` limits number of mails
; NOTE(review): tlport is never closed; acceptable for a one-shot script,
; but worth confirming if this stream is created repeatedly.
(define (produce-mail-stream n-mails)
  (define mail-num 0)
  (define tlport (open-input-file training-labels-file))
  (lambda ()
    (if (>= mail-num n-mails)
        #f
        ; each label-file line is "label filename"
        (let ((line (read-line tlport)))
          (if (eof-object? line)
              #f
              (let* ((tws (string-split line #\space))
                     (label (string->number (car tws)))
                     (tfname (cadr tws))
                     (text (read-mail (training-path tfname))))
                (display "### reading ")(display tfname)(newline)
                (set! mail-num (+ 1 mail-num))
                (cons label text)))))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment