A Clojure Implementation of the Viterbi Algorithm
======================================
In/IN
[ an/DT Oct./NNP 19/CD review/NN ]
of/IN ``/``
[ The/DT Misanthrope/NN ]
''/'' at/IN
[ Chicago/NNP 's/POS Goodman/NNP Theatre/NNP ]
(/(
[ ``/`` Revitalized/VBN Classics/NNS ]
Take/VBP
[ the/DT Stage/NN ]
in/IN
[ Windy/NNP City/NNP ]
,/, ''/''
[ Leisure/NN ]
&/CC
[ Arts/NNS ]
)/) ,/,
[ the/DT role/NN ]
of/IN
[ Celimene/NNP ]
,/, played/VBN by/IN
[ Kim/NNP Cattrall/NNP ]
,/, was/VBD mistakenly/RB attributed/VBN to/TO
[ Christina/NNP Haag/NNP ]
./.
[ Ms./NNP Haag/NNP ]
plays/VBZ
[ Elianti/NNP ]
./.
======================================
(/( See/VB :/: ``/``
[ Revitalized/VBN Classics/NNS ]
Take/VBP
[ the/DT Stage/NN ]
in/IN
[ Windy/NNP City/NNP ]
''/'' --/:
[ WSJ/NNP Oct./NNP 19/CD ]
,/,
[ 1989/CD ]
)/)
======================================
======================================
[ Rolls-Royce/NNP Motor/NNP Cars/NNPS Inc./NNP ]
said/VBD
[ it/PRP ]
expects/VBZ
[ its/PRP$ U.S./NNP sales/NNS ]
to/TO remain/VB
[ steady/JJ ]
at/IN about/IN
[ 1,200/CD cars/NNS ]
in/IN
[ 1990/CD ]
./.
[ The/DT luxury/NN auto/NN maker/NN last/JJ year/NN ]
sold/VBD
[ 1,214/CD cars/NNS ]
in/IN
[ the/DT U.S./NNP Howard/NNP Mosher/NNP ]
,/,
[ president/NN ]
and/CC
[ chief/JJ executive/NN officer/NN ]
,/, said/VBD
[ he/PRP ]
anticipates/VBZ
[ growth/NN ]
for/IN
[ the/DT luxury/NN auto/NN maker/NN ]
in/IN
[ Britain/NNP ]
and/CC
[ Europe/NNP ]
,/, and/CC in/IN
[ Far/JJ Eastern/JJ markets/NNS ]
./.
======================================
[ BELL/NNP INDUSTRIES/NNP Inc./NNP ]
increased/VBD
[ its/PRP$ quarterly/NN ]
to/TO
[ 10/CD cents/NNS ]
from/IN
[ seven/CD cents/NNS ]
[ a/DT share/NN ]
./.
[ The/DT new/JJ rate/NN ]
will/MD be/VB
[ payable/JJ Feb./NNP 15/CD ]
./.
[ A/DT record/NN date/NN has/VBZ n't/RB ]
been/VBN set/VBN ./.
[ Bell/NNP ]
,/, based/VBN in/IN
[ Los/NNP Angeles/NNP ]
,/, makes/VBZ and/CC distributes/VBZ
[ electronic/JJ ]
,/,
[ computer/NN ]
and/CC
[ building/NN products/NNS ]
./.
======================================
[ Investors/NNS ]
are/VBP appealing/VBG to/TO
[ the/DT Securities/NNPS ]
and/CC
[ Exchange/NNP Commission/NNP ]
not/RB to/TO limit/VB
[ their/PRP$ access/NN ]
to/TO
[ information/NN ]
about/IN
[ stock/NN purchases/NNS ]
and/CC
[ sales/NNS ]
by/IN
[ corporate/JJ insiders/NNS ]
./.
======================================
[ A/DT SEC/NNP proposal/NN ]
to/TO ease/VB reporting/NN
[ requirements/NNS ]
for/IN
[ some/DT company/NN executives/NNS ]
would/MD undermine/VB
[ the/DT usefulness/NN ]
of/IN
[ information/NN ]
on/IN
[ insider/NN trades/NNS ]
as/IN
[ a/DT stock-picking/JJ tool/NN ]
,/,
[ individual/JJ investors/NNS ]
and/CC
[ professional/JJ money/NN managers/NNS ]
contend/VBP ./.
(ns tagger.core
  "Running Viterbi on some text"
  (:require [clojure.string :as str]
            [clojure.contrib.generic.functor :as functor]
            [clojure.contrib.math :as math]
            [clojure.set :as set]
            [clojure.data :as data]
            [clojure.data.finger-tree :as ft]))

(def ^:dynamic *epsilon* 0.01)

(defn applyAll
  "Applies each function in fs to x, returning the results in order."
  [fs x]
  (map #(% x) fs))

(defn split-evenly
  "Splits coll into 10 roughly equal chunks (for 10-fold cross validation)."
  [coll]
  (partition (quot (count coll) 10) coll))

(defn nil?-zero
  "Applies f to args, returning 0 instead of nil when there is no value."
  [f & args]
  (let [val (apply f args)]
    (if (nil? val)
      0
      val)))
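;; Illustrative REPL check of nil?-zero; the count map here is hypothetical,
;; not taken from the corpus:
(comment
  (nil?-zero {"NN" 3} "NN") ;=> 3
  (nil?-zero {"NN" 3} "VB") ;=> 0
  )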
;; Things to watch out for:
;; There are 1942 sentences
;; There are 44 escaped slashes ("\/") inside words
;; There are 5 tokens whose tag contains a |, which joins two alternative tags into one
;; Counts needed are (illustrative shapes sketched below):
;; word counts W
;; tag counts T
;; word-tag counts W-T
;; previous tag to current tag counts T(i+1)-Ti
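;; A minimal sketch of the count structures built further down; the numbers are
;; hypothetical, not taken from the corpus:
;;   W       {"the" 120, "role" 4, ...}        -> word-count
;;   T       {"DT" 130, "NN" 210, ...}         -> tag-count
;;   W-T     {"DT" {"the" 120, "a" 60}, ...}   -> nested-tag-word-count
;;   T+1-T   {"DT" {"NN" 90, "JJ" 25}, ...}    -> nested-previous-tag-tag-count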
;; Problem set
(def str-to-tags (slurp "resources/treebank3_sect2.txt"))
;; A sample set to play with
;; (def str-to-tags (slurp "resources/sample.txt"))
;; A sample set containing the tags with |
;; (def str-to-tags (slurp "resources/sample2.txt"))
;; A sample set containing words with \/ (escaped /'s)
;; (def str-to-tags (slurp "resources/sample3.txt"))
;; A small sample set
;; (def str-to-tags (slurp "resources/sample4.txt"))
;; A quarter of the problem set
;; (def str-to-tags (slurp "resources/sample5.txt"))
;; 10 sentences
;; (def str-to-tags (slurp "resources/sample6.txt"))

(defn str->tags
  "Splits the raw corpus text on whitespace into word/TAG tokens."
  [string]
  (remove empty? (str/split string #"\s")))

(defn tag->W-T
  "Converts a token such as In/IN into a W-T pair such as [\"In\" \"IN\"]."
  [tag]
  (str/split tag #"/"))
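;; Illustrative REPL examples, a minimal sketch using tokens from the sample data:
(comment
  (str->tags "In/IN [ an/DT")  ;=> ("In/IN" "[" "an/DT")
  (tag->W-T "In/IN")           ;=> ["In" "IN"]
  )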
(defn sentence->tags
  "Returns just the tags of a sentence of [word tag] pairs."
  [sentence]
  (map second sentence))

(defn strip-tags
  "Returns just the word of a [word tag] pair."
  [tag]
  (first tag))

(defn this-and-that
  "Pairs each element of xs with all the other elements, giving the
   held-out/remaining splits used for cross validation."
  [xs]
  (map-indexed (fn [i x]
                 [x (concat (take i xs)
                            (drop (inc i) xs))])
               xs))
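;; Illustrative sketch of the cross-validation split on a hypothetical
;; three-chunk input:
(comment
  (this-and-that [:a :b :c])
  ;=> ([:a (:b :c)] [:b (:a :c)] [:c (:a :b)])
  )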
(def cleaned-tag-str
  "All [word tag] pairs, dropping tokens (brackets, separators, escaped slashes)
   that did not split into exactly two parts."
  (filter #(= (count %) 2) (map tag->W-T (str->tags str-to-tags))))

(defn split-sentences
  "Splits the [word tag] pairs into sentences, closing a sentence at each token tagged \".\"."
  [tag-str]
  (reduce #(if (= (second (first %2)) ".")
             (ft/conjr (pop %1) (ft/conjr (peek %1) (first %2)))
             (ft/conjr %1 %2))
          (ft/double-list)
          (map #(apply ft/double-list %) (partition-by #(= "." (second %)) tag-str))))
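;; Illustrative shape on a hypothetical one-sentence input; the result is a
;; finger-tree of per-sentence finger-trees, forced to ordinary seqs here so
;; the printed form is clear:
(comment
  (map seq (split-sentences [["Ms." "NNP"] ["plays" "VBZ"] ["." "."]]))
  ;=> ((["Ms." "NNP"] ["plays" "VBZ"] ["." "."]))
  )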
(defn split-sentences-start-end
  "Splits the [word tag] pairs into sentences, wrapping each sentence in
   [\"START\" \"START\"] and [\"END\" \"END\"] markers."
  [tag-str]
  (reduce #(if (= (second (first %2)) ".")
             (conj (vec (drop-last %1))
                   (conj (vec (conj (vec (conj (last %1) ["START" "START"]))
                                    (first %2)))
                         ["END" "END"]))
             (conj (vec %1) %2))
          []
          (partition-by #(= "." (second %)) tag-str)))

(def sentences (split-sentences cleaned-tag-str))

(def testing-and-training-sentences
  "A list containing 10 pairs of testing sentences and training sentences."
  (map (fn [[fst rst]] (ft/double-list fst (apply concat rst)))
       (this-and-that (split-evenly sentences))))

(defn add-start-end
  "Adds [\"START\" \"START\"] and [\"END\" \"END\"] markers around every sentence."
  [sentence-list]
  (map #(ft/consl (ft/conjr % ["END" "END"]) ["START" "START"]) sentence-list))

(def testing-and-training-sentences-start-end
  (map #(map add-start-end %) testing-and-training-sentences))

(def training-tag-list-start-end
  (map (comp #(map sentence->tags %) second) testing-and-training-sentences-start-end))

(def testing-and-training-tag-list-start-end
  (map (fn [sample] (map #(map sentence->tags %) sample)) testing-and-training-sentences-start-end))

(defn insert
  "Adds key k to the count map m with count 1 if absent, otherwise increments its count."
  [m k]
  (let [val (m k)]
    (assoc m k (inc (if (nil? val) 0 val)))))

(defn nested-insert
  "Increments the count stored under [tag word] in the nested map m, starting at 0 if absent."
  [m [word tag]]
  (let [val (get-in m [tag word])]
    (assoc-in m [tag word] (inc (if (nil? val) 0 val)))))
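;; Illustrative REPL checks of the two counting helpers (hypothetical inputs):
(comment
  (insert {"DT" 2} "DT")          ;=> {"DT" 3}
  (insert {} "NN")                ;=> {"NN" 1}
  (nested-insert {} ["the" "DT"]) ;=> {"DT" {"the" 1}}
  )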
(defn word-count
  "Counts how often each word occurs in a seq of [word tag] pairs."
  [tagged-str]
  (reduce #(insert %1 (first %2)) {} tagged-str))

(defn tag-count
  "Counts how often each tag occurs in a seq of [word tag] pairs."
  [tagged-str]
  (reduce #(insert %1 (second %2)) {} tagged-str))

(defn nested-tag-word-count
  "Nested counts in the format {tag {word count}}."
  [tagged-str]
  (reduce #(nested-insert %1 %2) {} tagged-str))
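;; Illustrative sketch on a tiny hand-made tagged sequence (not corpus data):
(comment
  (let [mini [["the" "DT"] ["the" "DT"] ["role" "NN"]]]
    [(word-count mini) (tag-count mini) (nested-tag-word-count mini)])
  ;=> [{"the" 2, "role" 1} {"DT" 2, "NN" 1} {"DT" {"the" 2}, "NN" {"role" 1}}]
  )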
(def tag-count-training-list (map #(tag-count (apply concat (second %))) testing-and-training-sentences))
(def word-count-training-list (map #(word-count (apply concat (second %))) testing-and-training-sentences))
(def nested-tag-word-count-training-list (map #(nested-tag-word-count (apply concat (second %))) testing-and-training-sentences))

(defn out-of-step-list
  "Pairs each tag with the tag that precedes it, giving [tag previous-tag] vectors."
  [tag-list]
  (map vector (rest tag-list) tag-list))
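;; Illustrative check on a hypothetical tag sequence:
(comment
  (out-of-step-list ["START" "DT" "NN" "END"])
  ;=> (["DT" "START"] ["NN" "DT"] ["END" "NN"])
  )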
(def training-previous-tag-tag-list-start-end
  (map #(map out-of-step-list %) training-tag-list-start-end))

(def training-tag-count-start-end
  (map (comp frequencies flatten) training-tag-list-start-end))

(defn nested-previous-tag-tag-count
  "Nested counts in the format {prior-tag {tag count}}."
  [previous-tag-tag-list]
  (reduce #(nested-insert %1 %2) {} (apply concat previous-tag-tag-list)))

(def nested-previous-tag-tag-count-training-list
  (map nested-previous-tag-tag-count training-previous-tag-tag-list-start-end))

(defn unique-keys-count [m]
  (count (keys m)))

(def unique-words-count-training-list (map count word-count-training-list))

;; Calculating probabilities
(defn make-prob-fn-map
  "Builds the smoothed emission and transition probability functions for one
   training fold from its count maps."
  [[nested-t-w-count word-count unique-wc nested-prior-t-t-count tag-count-st-end unique-tc]]
  {:prob-word-given-tag ;; Construct Emission Probabilities
   (fn [word tag]
     (let [word-given-tag (nil?-zero get-in nested-t-w-count [tag word])
           tc (nil?-zero word-count word)]
       (/ (+ word-given-tag *epsilon*)
          (+ tc (* *epsilon* unique-tc)))))
   :prob-tag-given-previous-tag ;; Construct Transition Probabilities
   (fn [tag previous-tag]
     (let [tag-given-prior-tag-prob (nil?-zero get-in nested-prior-t-t-count [previous-tag tag])
           tc (nil?-zero tag-count-st-end previous-tag)]
       (/ (+ tag-given-prior-tag-prob *epsilon*)
          (+ tc (* *epsilon* unique-tc)))))})
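;; Both probability functions use the same add-epsilon smoothing:
;;   (count(event) + *epsilon*) / (count(context) + *epsilon* * number-of-unique-tags)
;; A worked example with hypothetical counts: event count 3, context count 5,
;; 46 unique tags, *epsilon* = 0.01:
;;   (3 + 0.01) / (5 + 0.01 * 46) = 3.01 / 5.46 ~= 0.551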
(defn viterbi-init
  "Builds the initial Viterbi table: a map of state -> [probability path] for
   the first observation."
  [v path obs states start-p emit-p]
  (reduce
   #(into %1 {%2 [(* (start-p %2)
                     (emit-p (first obs) %2))
                  (conj path %2)]})
   {}
   states))

(defn extract-prob-state
  "Extracts the current probability and state from the Viterbi table v for a given state."
  [v path]
  [(first (v path)) path])

(defn viterbi-step
  "Performs one Viterbi step: given the prior state -> [probability path] table
   and a single observation, returns the best [probability path] for each state."
  [prior obs states trans-p emit-p]
  (apply merge
         (map (comp (fn [[path v]] {(last path) [v path]})
                    #(apply max-key val %)
                    #(apply merge %))
              ((fn [obs]
                 (map #(applyAll (map (comp (fn [[v past-st]]
                                              (fn [current-st]
                                                {(conj (second (prior past-st)) current-st)
                                                 (* v (trans-p current-st past-st)
                                                    (emit-p obs current-st))}))
                                            (partial extract-prob-state prior))
                                      states)
                                 %)
                      states))
               obs))))

(defn viterbi
  "Runs Viterbi over all observations, returning the final state -> [probability path] table."
  [observations states start-prob trans-prob emit-prob]
  (let [init (viterbi-init [] [] observations states start-prob emit-prob)]
    (reduce #(viterbi-step %1 %2 states trans-prob emit-prob) init (rest observations))))

(defn viterbi-solution
  "Returns the table entry whose path has the highest probability."
  [observations states start-prob trans-prob emit-prob]
  (apply max-key #(first (val %)) (viterbi observations states start-prob trans-prob emit-prob)))

(defn extract-path
  "Extracts the tag path from a viterbi-solution entry."
  [solution]
  (second (second solution)))
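;; A minimal end-to-end sketch on a hypothetical two-state HMM; the toy numbers
;; have nothing to do with the treebank counts above:
(comment
  (let [states #{"NN" "VB"}
        start  {"NN" 0.6, "VB" 0.4}
        trans  {"NN" {"NN" 0.3, "VB" 0.7}, "VB" {"NN" 0.8, "VB" 0.2}}
        emit   {"NN" {"dog" 0.9, "barks" 0.1}, "VB" {"dog" 0.2, "barks" 0.8}}]
    (extract-path
     (viterbi-solution ["dog" "barks"]
                       states
                       start
                       (fn [tag prev] (get-in trans [prev tag]))
                       (fn [word tag] (get-in emit [tag word])))))
  ;=> ["NN" "VB"]
  )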
(defn extract-tag-count [sample]
  (reduce insert {} (flatten (map second (second sample)))))
(defn extract-states
  "The set of tags seen in the training half of a sample."
  [sample]
  (into #{} (flatten (map #(map second %) (second sample)))))
(defn extract-observations
  "The word sequences of the testing half of a sample."
  [sample]
  (map #(map first %) (first sample)))
(defn extract-testing-tags
  "The tag sequences of the testing half of a sample."
  [sample]
  (map #(map second %) (first sample)))

(defn compare-matches
  "Takes [expected-tags predicted-tags] pairs and returns, for each pair, the fraction of matching positions."
  [pairs]
  (map (comp (fn [m] (/ (nil?-zero m true) (+ (nil?-zero m true) (nil?-zero m false))))
             frequencies
             (fn [[a b]] (map #(= %1 %2) a b)))
       pairs))

(defn average-accuracy [accuracy-scores]
  (/ (apply + accuracy-scores) (double (count accuracy-scores))))
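;; Illustrative check with hand-made tag sequences (hypothetical, one fold, two sentences):
(comment
  (compare-matches [[["DT" "NN" "."] ["DT" "VB" "."]]
                    [["NNP" "VBZ"] ["NNP" "VBZ"]]])
  ;=> (2/3 1)
  (average-accuracy [2/3 1]) ;; ~0.833, i.e. 5/6 as a double
  )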
(defn run-viterbi
  "Runs Viterbi over each cross-validation fold, estimating transition and
   emission probabilities from that fold's training sentences, and pairs each
   test sentence's expected tags with the predicted path."
  []
  (let [states (map extract-states testing-and-training-sentences)
        observations (map extract-observations testing-and-training-sentences)
        prob-map (map make-prob-fn-map (map vector nested-tag-word-count-training-list word-count-training-list unique-words-count-training-list nested-previous-tag-tag-count-training-list training-tag-count-start-end (map count states)))
        transition-prob (map :prob-tag-given-previous-tag prob-map)
        emission-prob (map :prob-word-given-tag prob-map)
        start-prob (map (fn [trans-p] #(trans-p % "START")) transition-prob)
        testing-tags-list (map extract-testing-tags testing-and-training-sentences)]
    (map #(map vector %1 %2)
         testing-tags-list
         (map (fn [[obs-list sts start-p trans-p emit-p]]
                (map (fn [obs]
                       (extract-path (viterbi-solution obs sts start-p trans-p emit-p)))
                     obs-list))
              (map vector observations states start-prob transition-prob emission-prob)))))

(defn -main
  "Reports the mean per-sentence tagging accuracy, averaged across the folds."
  []
  (average-accuracy (map (comp average-accuracy compare-matches) (run-viterbi))))
;; Checking functions
(defn close-to-1
  "True when val is within 1e-6 of 1."
  [val]
  (> 0.000001 (math/abs (- 1 val))))

;; Assert that the probabilities are sensible
(defn check-probs? []
  (assert
   (let [states (map extract-states testing-and-training-sentences)
         observations (map extract-observations testing-and-training-sentences)
         prob-map (map make-prob-fn-map (map vector nested-tag-word-count-training-list word-count-training-list unique-words-count-training-list nested-previous-tag-tag-count-training-list training-tag-count-start-end (map count states)))
         transition-prob (map :prob-tag-given-previous-tag prob-map)
         emission-prob (map :prob-word-given-tag prob-map)
         start-prob (map (fn [trans-p] #(trans-p % "START")) transition-prob)
         testing-tags-list (map extract-testing-tags testing-and-training-sentences)]
     (every? true?
             (map close-to-1
                  (map (fn [[start-pr st]] (apply + (map start-pr st)))
                       (map vector start-prob states)))))
   "Start probabilities are not sensible")
  (assert
   (let [states (map extract-states testing-and-training-sentences)
         observations (map extract-observations testing-and-training-sentences)
         prob-map (map make-prob-fn-map (map vector nested-tag-word-count-training-list word-count-training-list unique-words-count-training-list nested-previous-tag-tag-count-training-list training-tag-count-start-end (map count states)))
         transition-prob (map :prob-tag-given-previous-tag prob-map)
         emission-prob (map :prob-word-given-tag prob-map)
         start-prob (map #(partial % "START") transition-prob)
         testing-tags-list (map extract-testing-tags testing-and-training-sentences)]
     (every? true?
             (map (fn [[emit-pr st wctl]]
                    (every? true?
                            (map close-to-1
                                 (map (fn [word] (apply + (map #(emit-pr word %) st)))
                                      (keys wctl)))))
                  (map vector emission-prob states word-count-training-list))))
   "Emission probabilities are not sensible")
  (assert
   (every? true?
           (let [states (map extract-states testing-and-training-sentences)
                 observations (map extract-observations testing-and-training-sentences)
                 prob-map (map make-prob-fn-map (map vector nested-tag-word-count-training-list word-count-training-list unique-words-count-training-list nested-previous-tag-tag-count-training-list training-tag-count-start-end (map count states)))
                 transition-prob (map :prob-tag-given-previous-tag prob-map)
                 emission-prob (map :prob-word-given-tag prob-map)
                 start-prob (map #(partial % "START") transition-prob)
                 testing-tags-list (map extract-testing-tags testing-and-training-sentences)]
             (map (fn [[trans-pr st]]
                    (every? true?
                            (map close-to-1
                                 (map (fn [prior] (apply + (map #(trans-pr % prior) st)))
                                      (disj st ".")))))
                  (map vector transition-prob states))))
   "Transition probabilities are not sensible"))