(use-modules
  (ice-9 hash-table)
  (ice-9 rdelim)   ; read-line, used below
  (srfi srfi-1))   ; fold
; calls `thunk` with each line of the file at `filepath`
(define (for-each-file-line filepath thunk)
  (with-input-from-file filepath
    (lambda ()
      (let loop ((line (read-line)))
        (unless (eof-object? line)
          (thunk line)
          (loop (read-line)))))))
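; For illustration only (not part of the original gist): counting the lines of
; a hypothetical file with this helper:
;   (define n-lines 0)
;   (for-each-file-line "some-file.txt"
;     (lambda (line) (set! n-lines (+ 1 n-lines))))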
; sums the elements of a numeric vector
(define (vector-sum v)
  (let loop ((sum 0) (i 0))
    (if (< i (vector-length v))
        (loop (+ sum (vector-ref v i)) (+ 1 i))
        sum)))
; returns a hash-table from words (lowercased alphabetic runs) to #t
(define (tokenize text)
  (with-input-from-string text
    (lambda ()
      (let ((words (make-hash-table)))
        (let loop ((prev #\space)
                   (word '()))   ; chars of the current word so far, reversed
          (let ((cur (read-char)))
            (if (eof-object? cur)
                (begin
                  (if (char-alphabetic? prev)
                      (hash-set! words (list->string (reverse (cons prev word))) #t))
                  ;(display "## tokenized: ")
                  ;(hash-for-each (lambda (k v) (write k)(display " ")) words)(newline)
                  words)
                (let ((c (char-downcase cur)))
                  (if (char-alphabetic? c)
                      (if (char-alphabetic? prev)
                          (loop c (cons prev word))
                          (loop c '()))
                      (if (char-alphabetic? prev)
                          (let ((complete-word (list->string (reverse (cons prev word)))))
                            (hash-set! words complete-word #t)
                            (loop c '()))
                          (loop c '())))))))))))
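; For illustration only (not part of the original gist):
;   (hash-map->list (lambda (k v) k) (tokenize "London, Paris; London!"))
;   ;=> ("london" "paris")      ; key order is unspecified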
(define label-count 2)   ; number of classes; labels are 0 .. label-count - 1

; increments the counter stored at `index` of the vector `vect`
(define (vector-inc-value vect index)
  (let ((value (vector-ref vect index)))
    (vector-set! vect index (+ 1 value))))
(define (update-frequencies
          word-counts     ; a hashmap from a word to #(label0freq label1freq ...)
          text            ; a string with words
          training-label) ; label in (0 .. label-count - 1)
  (hash-for-each
    (lambda (word _)
      (unless (hash-ref word-counts word)
        (hash-set! word-counts word (make-vector label-count 0)))
      (let ((word-label-vector (hash-ref word-counts word)))
        (vector-inc-value word-label-vector training-label)))
    (tokenize text)))
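; For illustration only (not part of the original gist): after
;   (define wc (make-hash-table))
;   (update-frequencies wc "Paris London Paris" 0)
;   (update-frequencies wc "Paris Dallas" 1)
; (hash-ref wc "paris") is #(1 1) and (hash-ref wc "london") is #(1 0);
; repeated words within one text count once, since tokenize deduplicates.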
; an example of using a training-stream:
(define (print-stream training-stream)
  (let loop ((item (training-stream)))
    (if item
        (let ((label (car item))
              (text (cdr item)))
          (newline)
          (write label) (newline)
          (write text) (newline)
          (loop (training-stream)))
        (begin (display "Bye") (newline)))))
; training-stream is a function that produces
;  - either #f at the end of the stream,
;  - or a pair (label . text).
; Returns (word-freqs . label-counts),
; where `word-freqs` is a hashtable from a word to a vector of per-label counts
; and `label-counts` is a vector of label counts.
(define (word-freqs-for-stream training-stream)
  (let ((word-freqs (make-hash-table))
        (label-counts (make-vector label-count 0)))
    (let loop ()
      (let ((item (training-stream)))
        (if item
            (let ((label (car item))
                  (text (cdr item)))
              (vector-inc-value label-counts label)
              (update-frequencies word-freqs text label)
              (loop))
            (cons word-freqs label-counts))))))
; destructively turns a word-freqs table into a table of Pr(Class | Word):
;   Pr(Class | Word) = Count(Word /\ Class) / Count(Word)
(define (make-probabilities word-freqs)
  (hash-for-each
    (lambda (word count-vect)
      (let ((n-word (vector-sum count-vect)))
        (let loop ((i 0))
          (if (< i label-count)
              (let* ((n-word-and-class (vector-ref count-vect i))
                     (pr-class-if-word (/ n-word-and-class (exact->inexact n-word))))
                ;(display "## ")(write word)(display "\ti=")(write i)
                ;(display ", pr(class | word)=")(write pr-class-if-word)(newline)
                (vector-set! count-vect i pr-class-if-word)
                (loop (+ 1 i)))))))
    word-freqs))
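; For illustration only (not part of the original gist): a count vector #(3 1)
; for some word becomes #(0.75 0.25), i.e. Pr(label 0 | word) = 3/4 and
; Pr(label 1 | word) = 1/4.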
; returns Pr(Class | Word), or #f if the word was not seen during training
(define (probab-label-if-word word-probabs word label)
  (let ((probab-vect (hash-ref word-probabs word)))
    (if probab-vect
        (vector-ref probab-vect label)
        #f)))
; Combining the per-word probabilities into a per-text probability:
;
;                        p1 p2 ... pN
;   p = ----------------------------------------------
;        p1 p2 ... pN  +  (1 - p1)(1 - p2)...(1 - pN)
;
; where
;   p  = Pr(Class | Text),  Text = { Word1, Word2, ..., WordN }
;   p1 = Pr(Class | Word1)
;   ...
;   pN = Pr(Class | WordN)
;
; The code computes this in log space: with
;   eta = sum_i (ln(1 - p_i) - ln p_i)
; the formula above equals 1 / (1 + e^eta), which avoids underflow when
; multiplying many small probabilities.
(define (probab-class-if-text word-probabs text label)
  (let ((pr '())
        (folder (lambda (p_i sum)
                  (+ sum (- (log (- 1.0 p_i)) (log p_i))))))
    ; collect Pr(label | word) for every known word of the text
    (hash-for-each
      (lambda (word _)
        (let ((p_i (probab-label-if-word word-probabs word label)))
          (if p_i
              (set! pr (cons p_i pr)))))
      (tokenize text))
    ;(display "## p_i: ")(write pr)(newline)
    (let ((eta (fold folder 0.0 pr)))
      ; (display "## eta = ")(write eta)(newline)
      (/ 1.0 (+ 1.0 (exp eta))))))
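; A quick sanity check (not part of the original gist): for two words with
; p1 = p2 = 0.75, eta = 2 * (ln 0.25 - ln 0.75) ≈ -2.197, so
; 1 / (1 + e^eta) ≈ 0.9, matching 0.5625 / (0.5625 + 0.0625) from the
; product form above.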
(define (debug-probabs word-probabs)
  (hash-for-each
    (lambda (word probabs)
      (write word) (display ": ") (write probabs)
      (newline))
    word-probabs))
; takes a training stream and returns a closure `bayes-classifier`:
;   (define bayes-classifier (make-bayes-classifier training-stream))
;   (bayes-classifier text label)  => Pr(label | text)
(define (make-bayes-classifier training-stream)
  (let ((wfs-lc (word-freqs-for-stream training-stream)))
    (let ((word-probabs (car wfs-lc)))
      ; (debug-probabs word-probabs)
      (make-probabilities word-probabs)
      ; (debug-probabs word-probabs)
      (lambda (text label)
        (probab-class-if-text word-probabs text label)))))
; vim: set sts=2 sw=2
(load "bayes.scm")
;; Labels: 0 - Europe, 1 - America
(define training-set
  '((0 . "London Paris NewYork Milan Kyiv Warsaw Milan Lisbon Madrid")
    (0 . "Lisbon Frankfurt Amsterdam SanFrancisco Oslo Warsaw Helsinki")
    (0 . "Helsinki Amsterdam SoltLakeCity Venice Oslo Kopenhagen Amsterdam")
    (0 . "Sophia Lisbon Berlin Miami Amsterdam Odessa Warsaw Prague")
    (0 . "Odessa Istanbul Amsterdam Sophia Dublin Philadelphia Venice Madrid")
    (0 . "Frankfurt Milan Berlin Kyiv Helsinki Warsaw Oslo Ottava Prague")
    (0 . "Odessa Frankfurt Washington Kyiv Kopenhagen Paris Berlin Madrid Warsaw")
    (0 . "Prague Madrid Sophia Dublin Miami Paris Istanbul Oslo Venice")
    (1 . "SanFrancisco Kansas Odessa Dallas Houston Washington Ottava ")
    (1 . "Ottava Kansas SanFrancisco NewYork Detroit Chicago Philadelphia")
    (1 . "Miami SanFrancisco Dallas Washington Odessa Seattle Paris")
    (1 . "Milan Ottava SanFrancisco Kyiv Chicago Philadelphia Toronto")
    (1 . "NewYork Dallas SoltLakeCity Detroit Kansas Seattle Ottava")
    (1 . "SoltLakeCity Portland Miami Oslo Philadelphia Toronto Sacramento")
    (1 . "Washington Lisbon Toronto Miami SoltLakeCity Portland Sacramento")
    (1 . "NewYork Washington LosAngeles Detroit Sacramento Seattle Prague")
    (1 . "LosAngeles Portland Amsterdam Miami Detroit Philadelphia Madrid")
    (1 . "Dallas LosAngeles Oklahoma Berlin Odessa Houston Chicago Kansas")))
;; returns a thunk that yields one (label . text) pair per call, then #f
(define (produce-training-stream)
  (define ts training-set)
  (lambda ()
    (if (null? ts)
        #f
        (let ((item (car ts)))
          (set! ts (cdr ts))
          item))))
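;; A minimal usage sketch (not part of the original gist): train on the toy set
;; and query the probability that a text belongs to label 0 (Europe).
(define city-classifier (make-bayes-classifier (produce-training-stream)))
(display (city-classifier "Paris Berlin Oslo Madrid" 0)) (newline)       ; ~0.99
(display (city-classifier "Dallas Houston Seattle Chicago" 0)) (newline) ; 0.0, all four appear only in label-1 texts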

;; Spam classification example (a separate script in the gist, loading
;; bayes.scm) over the CSDMC2010 SPAM corpus.
(use-modules
  (ice-9 hash-table)
  (ice-9 format)
  (ice-9 rdelim))
(load "bayes.scm")
(define data-directory "CSDMC2010_SPAM/CSDMC2010_SPAM")
(define training-labels-file (string-append data-directory "/SPAMTrain.label"))
(define training-mail-dir (string-append data-directory "/TRAINING"))
(define testing-mail-dir (string-append data-directory "/TESTING"))
(define (training-path filename)
  (string-append training-mail-dir "/" filename))

(define (testing-path filename)
  (string-append testing-mail-dir "/" filename))

; e.g. (training-mail-fpath-by-num 7) => ".../TRAINING/TRAIN_00007.eml"
(define (training-mail-fpath-by-num num)
  (training-path
    (with-output-to-string
      (lambda () (format #t "TRAIN_~5,'0d.eml" num)))))

(define (testing-mail-fpath-by-num num)
  (testing-path
    (with-output-to-string
      (lambda () (format #t "TEST_~5,'0d.eml" num)))))
; reads a mail file: skips the header block (everything up to the first empty
; line) and returns the remaining body as a string
(define (read-mail filename)
  (with-input-from-file filename
    (lambda ()
      ; skip headers
      (let loop ()
        (let ((line (read-line)))
          (unless (or (eof-object? line)
                      (eq? (string-length line) 0)
                      (equal? line "\r"))
            ;(display "## ")(display (string-length line))
            ;(display ": ")(for-each write (string->list line))(newline)
            (loop))))
      (read-string))))
; returns a thunk that yields (label . text) for the first `n-mails` training
; mails listed in the labels file, then #f
(define (produce-mail-stream n-mails)
  (define mail-num 0)
  (define tlport (open-input-file training-labels-file))
  (lambda ()
    (if (< mail-num n-mails)
        (let ((line (read-line tlport)))
          (if (eof-object? line)
              #f
              (let* ((tws (string-split line #\space))
                     (label (string->number (car tws)))
                     (tfname (cadr tws)))
                (let ((text (read-mail (training-path tfname))))
                  (display "### reading ") (display tfname) (newline)
                  (set! mail-num (+ 1 mail-num))
                  (cons label text)))))
        #f)))
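; A minimal usage sketch (not part of the original gist), assuming the
; CSDMC2010 corpus is unpacked at `data-directory`: train on the first 100
; labelled mails and score an arbitrary text against label 1.
;   (define spam-classifier (make-bayes-classifier (produce-mail-stream 100)))
;   (display (spam-classifier "click here to claim your free prize" 1)) (newline)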