A Clojure Implementation of the Viterbi Algorithm
======================================
In/IN
[ an/DT Oct./NNP 19/CD review/NN ]
of/IN ``/``
[ The/DT Misanthrope/NN ]
''/'' at/IN
[ Chicago/NNP 's/POS Goodman/NNP Theatre/NNP ]
(/(
[ ``/`` Revitalized/VBN Classics/NNS ]
Take/VBP
[ the/DT Stage/NN ]
in/IN
[ Windy/NNP City/NNP ]
,/, ''/''
[ Leisure/NN ]
&/CC
[ Arts/NNS ]
)/) ,/,
[ the/DT role/NN ]
of/IN
[ Celimene/NNP ]
,/, played/VBN by/IN
[ Kim/NNP Cattrall/NNP ]
,/, was/VBD mistakenly/RB attributed/VBN to/TO
[ Christina/NNP Haag/NNP ]
./.
[ Ms./NNP Haag/NNP ]
plays/VBZ
[ Elianti/NNP ]
./.
======================================
(/( See/VB :/: ``/``
[ Revitalized/VBN Classics/NNS ]
Take/VBP
[ the/DT Stage/NN ]
in/IN
[ Windy/NNP City/NNP ]
''/'' --/:
[ WSJ/NNP Oct./NNP 19/CD ]
,/,
[ 1989/CD ]
)/)
======================================
======================================
[ Rolls-Royce/NNP Motor/NNP Cars/NNPS Inc./NNP ]
said/VBD
[ it/PRP ]
expects/VBZ
[ its/PRP$ U.S./NNP sales/NNS ]
to/TO remain/VB
[ steady/JJ ]
at/IN about/IN
[ 1,200/CD cars/NNS ]
in/IN
[ 1990/CD ]
./.
[ The/DT luxury/NN auto/NN maker/NN last/JJ year/NN ]
sold/VBD
[ 1,214/CD cars/NNS ]
in/IN
[ the/DT U.S./NNP Howard/NNP Mosher/NNP ]
,/,
[ president/NN ]
and/CC
[ chief/JJ executive/NN officer/NN ]
,/, said/VBD
[ he/PRP ]
anticipates/VBZ
[ growth/NN ]
for/IN
[ the/DT luxury/NN auto/NN maker/NN ]
in/IN
[ Britain/NNP ]
and/CC
[ Europe/NNP ]
,/, and/CC in/IN
[ Far/JJ Eastern/JJ markets/NNS ]
./.
======================================
[ BELL/NNP INDUSTRIES/NNP Inc./NNP ]
increased/VBD
[ its/PRP$ quarterly/NN ]
to/TO
[ 10/CD cents/NNS ]
from/IN
[ seven/CD cents/NNS ]
[ a/DT share/NN ]
./.
[ The/DT new/JJ rate/NN ]
will/MD be/VB
[ payable/JJ Feb./NNP 15/CD ]
./.
[ A/DT record/NN date/NN has/VBZ n't/RB ]
been/VBN set/VBN ./.
[ Bell/NNP ]
,/, based/VBN in/IN
[ Los/NNP Angeles/NNP ]
,/, makes/VBZ and/CC distributes/VBZ
[ electronic/JJ ]
,/,
[ computer/NN ]
and/CC
[ building/NN products/NNS ]
./.
======================================
[ Investors/NNS ]
are/VBP appealing/VBG to/TO
[ the/DT Securities/NNPS ]
and/CC
[ Exchange/NNP Commission/NNP ]
not/RB to/TO limit/VB
[ their/PRP$ access/NN ]
to/TO
[ information/NN ]
about/IN
[ stock/NN purchases/NNS ]
and/CC
[ sales/NNS ]
by/IN
[ corporate/JJ insiders/NNS ]
./.
======================================
[ A/DT SEC/NNP proposal/NN ]
to/TO ease/VB reporting/NN
[ requirements/NNS ]
for/IN
[ some/DT company/NN executives/NNS ]
would/MD undermine/VB
[ the/DT usefulness/NN ]
of/IN
[ information/NN ]
on/IN
[ insider/NN trades/NNS ]
as/IN
[ a/DT stock-picking/JJ tool/NN ]
,/,
[ individual/JJ investors/NNS ]
and/CC
[ professional/JJ money/NN managers/NNS ]
contend/VBP ./.
(ns tagger.core
  "Running Viterbi on some text"
  (:require [clojure.string :as str]
            [clojure.contrib.generic.functor :as functor]
            [clojure.contrib.math :as math]
            [clojure.set :as set]
            [clojure.data :as data]
            [clojure.data.finger-tree :as ft]))

(def ^:dynamic *epsilon* 0.01)

(defn applyAll
  "Applies each function in fs to x, returning the results in order."
  [fs x]
  (map #(% x) fs))

(defn split-evenly
  "Splits coll into 10 roughly equal chunks (for 10-fold cross validation)."
  [coll]
  (partition (quot (count coll) 10) coll))

(defn nil?-zero
  "Applies f to args, returning 0 instead of nil when there is no value."
  [f & args]
  (let [val (apply f args)]
    (if (nil? val)
      0
      val)))
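;; Illustrative REPL check of nil?-zero; the count map here is hypothetical,
;; not taken from the corpus:
(comment
  (nil?-zero {"NN" 3} "NN") ;=> 3
  (nil?-zero {"NN" 3} "VB") ;=> 0
  )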
;; Things to watch out for:
;; There are 1942 sentences
;; There are 44 escaped slashes ("\/") inside words
;; There are 5 tokens whose tag contains a |, which joins two alternative tags into one
;; Counts needed are (illustrative shapes sketched below):
;; word counts W
;; tag counts T
;; word-tag counts W-T
;; previous tag to current tag counts T(i+1)-Ti
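;; A minimal sketch of the count structures built further down; the numbers are
;; hypothetical, not taken from the corpus:
;;   W       {"the" 120, "role" 4, ...}        -> word-count
;;   T       {"DT" 130, "NN" 210, ...}         -> tag-count
;;   W-T     {"DT" {"the" 120, "a" 60}, ...}   -> nested-tag-word-count
;;   T+1-T   {"DT" {"NN" 90, "JJ" 25}, ...}    -> nested-previous-tag-tag-count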
;; Problem set
(def str-to-tags (slurp "resources/treebank3_sect2.txt"))
;; A sample set to play with
;; (def str-to-tags (slurp "resources/sample.txt"))
;; A sample set containing the tags with |
;; (def str-to-tags (slurp "resources/sample2.txt"))
;; A sample set containing words with \/ (escaped /'s)
;; (def str-to-tags (slurp "resources/sample3.txt"))
;; A small sample set
;; (def str-to-tags (slurp "resources/sample4.txt"))
;; A quarter of the problem set
;; (def str-to-tags (slurp "resources/sample5.txt"))
;; 10 sentences
;; (def str-to-tags (slurp "resources/sample6.txt"))

(defn str->tags
  "Splits the raw corpus text on whitespace into word/TAG tokens."
  [string]
  (remove empty? (str/split string #"\s")))

(defn tag->W-T
  "Converts a token such as In/IN into a W-T pair such as [\"In\" \"IN\"]."
  [tag]
  (str/split tag #"/"))
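;; Illustrative REPL examples, a minimal sketch using tokens from the sample data:
(comment
  (str->tags "In/IN [ an/DT")  ;=> ("In/IN" "[" "an/DT")
  (tag->W-T "In/IN")           ;=> ["In" "IN"]
  )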
(defn sentence->tags
  "Returns just the tags of a sentence of [word tag] pairs."
  [sentence]
  (map second sentence))

(defn strip-tags
  "Returns just the word of a [word tag] pair."
  [tag]
  (first tag))

(defn this-and-that
  "Pairs each element of xs with all the other elements, giving the
   held-out/remaining splits used for cross validation."
  [xs]
  (map-indexed (fn [i x]
                 [x (concat (take i xs)
                            (drop (inc i) xs))])
               xs))
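;; Illustrative sketch of the cross-validation split on a hypothetical
;; three-chunk input:
(comment
  (this-and-that [:a :b :c])
  ;=> ([:a (:b :c)] [:b (:a :c)] [:c (:a :b)])
  )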
(def cleaned-tag-str
  "All [word tag] pairs, dropping tokens (brackets, separators, escaped slashes)
   that did not split into exactly two parts."
  (filter #(= (count %) 2) (map tag->W-T (str->tags str-to-tags))))

(defn split-sentences
  "Splits the [word tag] pairs into sentences, closing a sentence at each token tagged \".\"."
  [tag-str]
  (reduce #(if (= (second (first %2)) ".")
             (ft/conjr (pop %1) (ft/conjr (peek %1) (first %2)))
             (ft/conjr %1 %2))
          (ft/double-list)
          (map #(apply ft/double-list %) (partition-by #(= "." (second %)) tag-str))))
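;; Illustrative shape on a hypothetical one-sentence input; the result is a
;; finger-tree of per-sentence finger-trees, forced to ordinary seqs here so
;; the printed form is clear:
(comment
  (map seq (split-sentences [["Ms." "NNP"] ["plays" "VBZ"] ["." "."]]))
  ;=> ((["Ms." "NNP"] ["plays" "VBZ"] ["." "."]))
  )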
(defn split-sentences-start-end
  "Splits the [word tag] pairs into sentences, wrapping each sentence in
   [\"START\" \"START\"] and [\"END\" \"END\"] markers."
  [tag-str]
  (reduce #(if (= (second (first %2)) ".")
             (conj (vec (drop-last %1))
                   (conj (vec (conj (vec (conj (last %1) ["START" "START"]))
                                    (first %2)))
                         ["END" "END"]))
             (conj (vec %1) %2))
          []
          (partition-by #(= "." (second %)) tag-str)))

(def sentences (split-sentences cleaned-tag-str))

(def testing-and-training-sentences
  "A list containing 10 pairs of testing sentences and training sentences."
  (map (fn [[fst rst]] (ft/double-list fst (apply concat rst)))
       (this-and-that (split-evenly sentences))))

(defn add-start-end
  "Adds [\"START\" \"START\"] and [\"END\" \"END\"] markers around every sentence."
  [sentence-list]
  (map #(ft/consl (ft/conjr % ["END" "END"]) ["START" "START"]) sentence-list))

(def testing-and-training-sentences-start-end
  (map #(map add-start-end %) testing-and-training-sentences))

(def training-tag-list-start-end
  (map (comp #(map sentence->tags %) second) testing-and-training-sentences-start-end))

(def testing-and-training-tag-list-start-end
  (map (fn [sample] (map #(map sentence->tags %) sample)) testing-and-training-sentences-start-end))

(defn insert
  "Adds key k to the count map m with count 1 if absent, otherwise increments its count."
  [m k]
  (let [val (m k)]
    (assoc m k (inc (if (nil? val) 0 val)))))

(defn nested-insert
  "Increments the count stored under [tag word] in the nested map m, starting at 0 if absent."
  [m [word tag]]
  (let [val (get-in m [tag word])]
    (assoc-in m [tag word] (inc (if (nil? val) 0 val)))))
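;; Illustrative REPL checks of the two counting helpers (hypothetical inputs):
(comment
  (insert {"DT" 2} "DT")          ;=> {"DT" 3}
  (insert {} "NN")                ;=> {"NN" 1}
  (nested-insert {} ["the" "DT"]) ;=> {"DT" {"the" 1}}
  )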
(defn word-count
  "Counts how often each word occurs in a seq of [word tag] pairs."
  [tagged-str]
  (reduce #(insert %1 (first %2)) {} tagged-str))

(defn tag-count
  "Counts how often each tag occurs in a seq of [word tag] pairs."
  [tagged-str]
  (reduce #(insert %1 (second %2)) {} tagged-str))

(defn nested-tag-word-count
  "Nested counts in the format {tag {word count}}."
  [tagged-str]
  (reduce #(nested-insert %1 %2) {} tagged-str))
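;; Illustrative sketch on a tiny hand-made tagged sequence (not corpus data):
(comment
  (let [mini [["the" "DT"] ["the" "DT"] ["role" "NN"]]]
    [(word-count mini) (tag-count mini) (nested-tag-word-count mini)])
  ;=> [{"the" 2, "role" 1} {"DT" 2, "NN" 1} {"DT" {"the" 2}, "NN" {"role" 1}}]
  )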
(def tag-count-training-list (map #(tag-count (apply concat (second %))) testing-and-training-sentences))
(def word-count-training-list (map #(word-count (apply concat (second %))) testing-and-training-sentences))
(def nested-tag-word-count-training-list (map #(nested-tag-word-count (apply concat (second %))) testing-and-training-sentences))

(defn out-of-step-list
  "Pairs each tag with the tag that precedes it, giving [tag previous-tag] vectors."
  [tag-list]
  (map vector (rest tag-list) tag-list))
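;; Illustrative check on a hypothetical tag sequence:
(comment
  (out-of-step-list ["START" "DT" "NN" "END"])
  ;=> (["DT" "START"] ["NN" "DT"] ["END" "NN"])
  )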
(def training-previous-tag-tag-list-start-end
  (map #(map out-of-step-list %) training-tag-list-start-end))

(def training-tag-count-start-end
  (map (comp frequencies flatten) training-tag-list-start-end))

(defn nested-previous-tag-tag-count
  "Nested counts in the format {prior-tag {tag count}}."
  [previous-tag-tag-list]
  (reduce #(nested-insert %1 %2) {} (apply concat previous-tag-tag-list)))

(def nested-previous-tag-tag-count-training-list
  (map nested-previous-tag-tag-count training-previous-tag-tag-list-start-end))

(defn unique-keys-count [m]
  (count (keys m)))

(def unique-words-count-training-list (map count word-count-training-list))

;; Calculating probabilities
(defn make-prob-fn-map
  "Builds the smoothed emission and transition probability functions for one
   training fold from its count maps."
  [[nested-t-w-count word-count unique-wc nested-prior-t-t-count tag-count-st-end unique-tc]]
  {:prob-word-given-tag ;; Construct Emission Probabilities
   (fn [word tag]
     (let [word-given-tag (nil?-zero get-in nested-t-w-count [tag word])
           tc (nil?-zero word-count word)]
       (/ (+ word-given-tag *epsilon*)
          (+ tc (* *epsilon* unique-tc)))))
   :prob-tag-given-previous-tag ;; Construct Transition Probabilities
   (fn [tag previous-tag]
     (let [tag-given-prior-tag-prob (nil?-zero get-in nested-prior-t-t-count [previous-tag tag])
           tc (nil?-zero tag-count-st-end previous-tag)]
       (/ (+ tag-given-prior-tag-prob *epsilon*)
          (+ tc (* *epsilon* unique-tc)))))})
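;; Both probability functions use the same add-epsilon smoothing:
;;   (count(event) + *epsilon*) / (count(context) + *epsilon* * number-of-unique-tags)
;; A worked example with hypothetical counts: event count 3, context count 5,
;; 46 unique tags, *epsilon* = 0.01:
;;   (3 + 0.01) / (5 + 0.01 * 46) = 3.01 / 5.46 ~= 0.551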
(defn viterbi-init
  "Builds the initial Viterbi table: a map of state -> [probability path] for
   the first observation."
  [v path obs states start-p emit-p]
  (reduce
   #(into %1 {%2 [(* (start-p %2)
                     (emit-p (first obs) %2))
                  (conj path %2)]})
   {}
   states))

(defn extract-prob-state
  "Extracts the current probability and state from the Viterbi table v for a given state."
  [v path]
  [(first (v path)) path])

(defn viterbi-step
  "Performs one Viterbi step: given the prior state -> [probability path] table
   and a single observation, returns the best [probability path] for each state."
  [prior obs states trans-p emit-p]
  (apply merge
         (map (comp (fn [[path v]] {(last path) [v path]})
                    #(apply max-key val %)
                    #(apply merge %))
              ((fn [obs]
                 (map #(applyAll (map (comp (fn [[v past-st]]
                                              (fn [current-st]
                                                {(conj (second (prior past-st)) current-st)
                                                 (* v (trans-p current-st past-st)
                                                    (emit-p obs current-st))}))
                                            (partial extract-prob-state prior))
                                      states)
                                 %)
                      states))
               obs))))

(defn viterbi
  "Runs Viterbi over all observations, returning the final state -> [probability path] table."
  [observations states start-prob trans-prob emit-prob]
  (let [init (viterbi-init [] [] observations states start-prob emit-prob)]
    (reduce #(viterbi-step %1 %2 states trans-prob emit-prob) init (rest observations))))

(defn viterbi-solution
  "Returns the table entry whose path has the highest probability."
  [observations states start-prob trans-prob emit-prob]
  (apply max-key #(first (val %)) (viterbi observations states start-prob trans-prob emit-prob)))

(defn extract-path
  "Extracts the tag path from a viterbi-solution entry."
  [solution]
  (second (second solution)))
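;; A minimal end-to-end sketch on a hypothetical two-state HMM; the toy numbers
;; have nothing to do with the treebank counts above:
(comment
  (let [states #{"NN" "VB"}
        start  {"NN" 0.6, "VB" 0.4}
        trans  {"NN" {"NN" 0.3, "VB" 0.7}, "VB" {"NN" 0.8, "VB" 0.2}}
        emit   {"NN" {"dog" 0.9, "barks" 0.1}, "VB" {"dog" 0.2, "barks" 0.8}}]
    (extract-path
     (viterbi-solution ["dog" "barks"]
                       states
                       start
                       (fn [tag prev] (get-in trans [prev tag]))
                       (fn [word tag] (get-in emit [tag word])))))
  ;=> ["NN" "VB"]
  )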
(defn extract-tag-count [sample]
  (reduce insert {} (flatten (map second (second sample)))))
(defn extract-states
  "The set of tags seen in the training half of a sample."
  [sample]
  (into #{} (flatten (map #(map second %) (second sample)))))
(defn extract-observations
  "The word sequences of the testing half of a sample."
  [sample]
  (map #(map first %) (first sample)))
(defn extract-testing-tags
  "The tag sequences of the testing half of a sample."
  [sample]
  (map #(map second %) (first sample)))

(defn compare-matches
  "Takes [expected-tags predicted-tags] pairs and returns, for each pair, the fraction of matching positions."
  [pairs]
  (map (comp (fn [m] (/ (nil?-zero m true) (+ (nil?-zero m true) (nil?-zero m false))))
             frequencies
             (fn [[a b]] (map #(= %1 %2) a b)))
       pairs))

(defn average-accuracy [accuracy-scores]
  (/ (apply + accuracy-scores) (double (count accuracy-scores))))
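;; Illustrative check with hand-made tag sequences (hypothetical, one fold, two sentences):
(comment
  (compare-matches [[["DT" "NN" "."] ["DT" "VB" "."]]
                    [["NNP" "VBZ"] ["NNP" "VBZ"]]])
  ;=> (2/3 1)
  (average-accuracy [2/3 1]) ;; ~0.833, i.e. 5/6 as a double
  )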
(defn run-viterbi
  "Runs Viterbi over each cross-validation fold, estimating transition and
   emission probabilities from that fold's training sentences, and pairs each
   test sentence's expected tags with the predicted path."
  []
  (let [states (map extract-states testing-and-training-sentences)
        observations (map extract-observations testing-and-training-sentences)
        prob-map (map make-prob-fn-map (map vector nested-tag-word-count-training-list word-count-training-list unique-words-count-training-list nested-previous-tag-tag-count-training-list training-tag-count-start-end (map count states)))
        transition-prob (map :prob-tag-given-previous-tag prob-map)
        emission-prob (map :prob-word-given-tag prob-map)
        start-prob (map (fn [trans-p] #(trans-p % "START")) transition-prob)
        testing-tags-list (map extract-testing-tags testing-and-training-sentences)]
    (map #(map vector %1 %2)
         testing-tags-list
         (map (fn [[obs-list sts start-p trans-p emit-p]]
                (map (fn [obs]
                       (extract-path (viterbi-solution obs sts start-p trans-p emit-p)))
                     obs-list))
              (map vector observations states start-prob transition-prob emission-prob)))))

(defn -main
  "Reports the mean per-sentence tagging accuracy, averaged across the folds."
  []
  (average-accuracy (map (comp average-accuracy compare-matches) (run-viterbi))))
;; Checking functions
(defn close-to-1
  "True when val is within 1e-6 of 1."
  [val]
  (> 0.000001 (math/abs (- 1 val))))

;; Assert that the probabilities are sensible
(defn check-probs? []
  (assert
   (let [states (map extract-states testing-and-training-sentences)
         observations (map extract-observations testing-and-training-sentences)
         prob-map (map make-prob-fn-map (map vector nested-tag-word-count-training-list word-count-training-list unique-words-count-training-list nested-previous-tag-tag-count-training-list training-tag-count-start-end (map count states)))
         transition-prob (map :prob-tag-given-previous-tag prob-map)
         emission-prob (map :prob-word-given-tag prob-map)
         start-prob (map (fn [trans-p] #(trans-p % "START")) transition-prob)
         testing-tags-list (map extract-testing-tags testing-and-training-sentences)]
     (every? true?
             (map close-to-1
                  (map (fn [[start-pr st]] (apply + (map start-pr st)))
                       (map vector start-prob states)))))
   "Start probabilities are not sensible")
  (assert
   (let [states (map extract-states testing-and-training-sentences)
         observations (map extract-observations testing-and-training-sentences)
         prob-map (map make-prob-fn-map (map vector nested-tag-word-count-training-list word-count-training-list unique-words-count-training-list nested-previous-tag-tag-count-training-list training-tag-count-start-end (map count states)))
         transition-prob (map :prob-tag-given-previous-tag prob-map)
         emission-prob (map :prob-word-given-tag prob-map)
         start-prob (map #(partial % "START") transition-prob)
         testing-tags-list (map extract-testing-tags testing-and-training-sentences)]
     (every? true?
             (map (fn [[emit-pr st wctl]]
                    (every? true?
                            (map close-to-1
                                 (map (fn [word] (apply + (map #(emit-pr word %) st)))
                                      (keys wctl)))))
                  (map vector emission-prob states word-count-training-list))))
   "Emission probabilities are not sensible")
  (assert
   (every? true?
           (let [states (map extract-states testing-and-training-sentences)
                 observations (map extract-observations testing-and-training-sentences)
                 prob-map (map make-prob-fn-map (map vector nested-tag-word-count-training-list word-count-training-list unique-words-count-training-list nested-previous-tag-tag-count-training-list training-tag-count-start-end (map count states)))
                 transition-prob (map :prob-tag-given-previous-tag prob-map)
                 emission-prob (map :prob-word-given-tag prob-map)
                 start-prob (map #(partial % "START") transition-prob)
                 testing-tags-list (map extract-testing-tags testing-and-training-sentences)]
             (map (fn [[trans-pr st]]
                    (every? true?
                            (map close-to-1
                                 (map (fn [prior] (apply + (map #(trans-pr % prior) st)))
                                      (disj st ".")))))
                  (map vector transition-prob states))))
   "Transition probabilities are not sensible"))