Created
September 29, 2012 22:40
-
-
Save damionjunk/3805350 to your computer and use it in GitHub Desktop.
Witten Bell Smoothing for Unigrams
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns witten-bell.core | |
(:import [java.io SequenceInputStream] | |
[java.util Enumeration Collections]) | |
(:require [clojure.java.io :as io] | |
[clojure.string :as s] | |
[witten-bell.io :as wio])) | |
(defn witten-bell
  "Given a seq of lines of text, returns a Witten-Bell probability map for
  seen and unseen events. The map is in the form:

    {:unseen <total probability mass for unseen n-grams>
     w_1     <discounted probability>
     ...
     w_n     <discounted probability>}

  Words are the runs of characters matching [A-Za-z_-]+ in each line. The
  words of all lines are concatenated before n-grams are formed, so an
  n-gram may span a line boundary. Each key w_i is a seq of n words.

  With N = total number of n-gram occurrences (tokens) and T = number of
  distinct n-grams (types):

    P(w_i)    = count(w_i) / (N + T)
    P(unseen) = T / (N + T)

  Options (keyword arguments):
    :n  n-gram order (default 1)
    :z  accepted for backward compatibility; currently unused (default 10)

  When the input yields no n-grams at all (no lines, no matching words, or
  fewer than n words), nothing has been observed, so the whole probability
  mass is assigned to unseen events and {:unseen 1} is returned."
  [lines & {:keys [n z] :or {n 1 z 10}}]
  (let [words  (mapcat #(re-seq #"[A-Za-z_-]+" %) lines)
        counts (frequencies (partition n 1 words))
        types  (count counts)
        tokens (reduce + (vals counts))
        denom  (+ types tokens)]
    (if (zero? denom)
      ;; Guard against dividing by zero when there is nothing to count.
      {:unseen 1}
      (into {:unseen (/ types denom)}
            (for [[gram c] counts]
              [gram (/ c denom)])))))
(defn witten-bell-file
  "Runs the witten-bell smoothing on a directory of files matching the
  provided filename (which can be a regular expression).

  The input streams returned by `wio/match-multi-input-stream` for `dir` and
  `filere` are chained into a single SequenceInputStream and read line by
  line.

  Options (keyword arguments):
    :dir  directory handed to `wio/match-multi-input-stream` (default ./)
    :n    n-gram order forwarded to `witten-bell` (default 1)

  Returns the probability map produced by `witten-bell`."
  [filere & {:keys [dir n] :or {dir "./" n 1}}]
  (with-open [rdr (io/reader
                   (SequenceInputStream.
                    (wio/coll->enumeration
                     (wio/match-multi-input-stream dir filere))))]
    ;; `witten-bell` takes keyword options, so the order must be passed as
    ;; `:n n`; a bare positional `n` is rejected or silently ignored.
    (witten-bell (line-seq rdr) :n n)))
(comment
  ;; REPL examples for `witten-bell` with the default unigram order (n = 1).
  ;; Keys are one-element seqs of words; values are exact ratios.

  ;; 10 tokens, 4 types => denominator 14.
  (witten-bell (seq ["the dog hates the cat" "the cat hates the dog"]))
  ;;=> {:unseen 2/7, ("cat") 1/7, ("hates") 1/7, ("dog") 1/7, ("the") 2/7}

  ;; 24 tokens, 12 types => denominator 36.
  (witten-bell (seq ["the dog hates the cat" "the cat hates the dog"
                     "the cat is a dog that hates all dog cats"
                     "dogs smell like cats"]))
  ;;=> {("a") 1/36, ("all") 1/36, :unseen 1/3, ("that") 1/36,
  ;;    ("hates") 1/12, ("is") 1/36, ("smell") 1/36, ("the") 5/36,
  ;;    ("cat") 1/12, ("dogs") 1/36, ("like") 1/36, ("dog") 1/9, ("cats") 1/18}
  )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment