Created
January 25, 2016 09:27
-
-
Save bricef/17e7e3a768d48a5e5082 to your computer and use it in GitHub Desktop.
Start to Bayes filtering in F#
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Learn more about F# at http://fsharp.net | |
// See the 'F# Tutorial' project for more help. | |
open System.IO | |
open System | |
// 01 - Defining the problem | |
/// The two classes a message can be assigned to.
type Label =
    | Ham
    | Spam
/// Placeholder classifier: returns Spam for every input without inspecting `msg`.
let classify msg : Label =
    Label.Spam
/// Per-label word table: the outer map is keyed by Label, and each inner map
/// associates a word with a floating-point count.
type Corpus = Map<Label, Map<string, float>>
// 02 - Checking the classifier | |
/// A sequence of (label, message text) pairs.
type LabeledData = (Label * string) seq
/// Fraction of messages in `labeledData` for which `classifier` returns the
/// label recorded alongside the message.
/// The sequence is traversed twice (once to count hits, once for its length);
/// an empty sequence yields NaN because of the 0/0 division.
let accuracyRate (classifier:string->Label) (labeledData:LabeledData) : float =
    let hits =
        labeledData
        |> Seq.filter (fun (expected, msg) -> classifier msg = expected)
        |> Seq.length
    float hits / float (Seq.length labeledData)
// 05 - Using Bayes | |
// first steps | |
/// Starting corpus: both labels are present, each with an empty word table.
let corpus:Corpus =
    [ Ham; Spam ]
    |> List.map (fun label -> label, Map.empty)
    |> Map.ofList
/// Sum of all word counts stored under `label`.
/// Raises KeyNotFoundException when `label` has no entry in the corpus.
let word_count label (corpus:Corpus) : float =
    corpus.[label]
    |> Map.fold (fun total _ count -> total + count) 0.0
/// P(word | Spam): the count of `word` in the Spam table divided by the total
/// Spam word count.
/// Raises KeyNotFoundException when `word` is absent from the Spam table.
let p_word_given_spam word (corpus:Corpus) =
    let occurrences = corpus |> Map.find Spam |> Map.find word
    occurrences / word_count Spam corpus
/// Combined word count across the Ham and Spam tables.
let total_word_count corpus =
    let hamTotal = word_count Ham corpus
    let spamTotal = word_count Spam corpus
    hamTotal + spamTotal
/// P(Spam): the share of all counted words that sit in the Spam table.
let p_spam (corpus:Corpus) =
    let spamTotal = word_count Spam corpus
    let overall = total_word_count corpus
    spamTotal / overall
// generalise | |
/// P(word | label): the count of `word` under `label` divided by that label's
/// total word count. A word with no entry under `label` is given a
/// pseudo-count of 0.001 instead of raising.
let p_word_given_label word label (corpus:Corpus) =
    let occurrences = defaultArg (Map.tryFind word corpus.[label]) 0.001
    occurrences / word_count label corpus
/// P(label): the share of all counted words that sit in `label`'s table.
let p_label label (corpus:Corpus) : float =
    let labelTotal = word_count label corpus
    let overall = total_word_count corpus
    labelTotal / overall
/// P(word): the count of `word` summed over the Spam and Ham tables, divided
/// by the total word count of the corpus.
/// A word missing from a label's table contributes a pseudo-count of 0.001
/// for that label.
let p_word word (corpus:Corpus) =
    let spamcount = defaultArg (corpus.[Spam].TryFind word) 0.001
    let hamcount = defaultArg (corpus.[Ham].TryFind word) 0.001
    // The sum must be parenthesised: `/` binds tighter than `+`, so the
    // unparenthesised form divided only the ham count and added the raw spam
    // count on top, producing a value that is not a probability.
    (spamcount + hamcount) / total_word_count corpus
/// Bayes' rule: P(label | word) = P(word | label) * P(label) / P(word).
let p_label_given_word label word (corpus:Corpus) =
    let likelihood = p_word_given_label word label corpus
    // The prior has to be for the requested label; it used to be hard-coded to
    // Spam, which made the result wrong whenever `label` was Ham.
    let prior = p_label label corpus
    let evidence = p_word word corpus
    likelihood * prior / evidence
/// Looks up `key` in `map`, returning the stored count, or the pseudo-count
/// 0.001 when the key is absent.
let getDefaultCount map key =
    match Map.tryFind key map with
    | Some count -> count
    | None -> 0.001
/// Splits a message into tokens on single space characters.
/// Consecutive spaces produce empty-string tokens; no other whitespace or
/// punctuation is treated as a separator.
let tokenise (msg:string) =
    let separators = [| ' ' |]
    msg.Split(separators)
// 05 - Building your corpus
/// Returns a copy of `corpus` in which the count of `word` under `label` is
/// one higher than before.
/// A word not yet recorded starts from 0.0, so stored values are true
/// occurrence counts (the 0.001 smoothing pseudo-count belongs to lookups of
/// unseen words, not to the stored tallies). A label with no table yet is
/// treated as having an empty one rather than raising.
let addCorpus label word (corpus:Corpus) : Corpus =
    let labelMap = defaultArg (Map.tryFind label corpus) Map.empty
    let previous = defaultArg (Map.tryFind word labelMap) 0.0
    let newcount:float = previous + 1.0
    Map.add label (Map.add word newcount labelMap) corpus
/// Builds a corpus from labeled messages: each message is tokenised and every
/// token is added under the message's label. Accumulation starts from the
/// module-level `corpus` value.
let buildCorpus (labeledData:LabeledData) : Corpus =
    // Fold all tokens of one message into the accumulated corpus.
    let absorbMessage (acc:Corpus) (label, msg) : Corpus =
        tokenise msg
        |> Array.fold (fun soFar token -> addCorpus label token soFar) acc
    labeledData |> Seq.fold absorbMessage corpus
[<EntryPoint>]
let main argv =
    // 03 - Loading the data set
    // Each line is split on tabs: field 0 is the label text, field 1 the message.
    // Any label text other than "ham" is treated as Spam.
    let parseRecord (line:string) =
        let fields = line.Split([|'\t'|])
        let label = if fields.[0] = "ham" then Ham else Spam
        label, fields.[1]
    let sourceFile = Path.Combine(__SOURCE_DIRECTORY__, "SMSSpamCollection.txt")
    let labeledMessages = File.ReadAllLines(sourceFile) |> Array.map parseRecord
    // Each message independently lands in the test set with probability `fraction`;
    // the rest form the training set.
    let fraction = 0.1
    let rng = Random()
    let isHeldOut _ = rng.NextDouble() < fraction
    let test, training = Array.partition isHeldOut labeledMessages
    let corpus = buildCorpus training
    let p = p_label_given_word Spam "me" corpus
    // Prints the probability only; (accuracyRate classify test) is not wired in here.
    printfn "%A" p
    0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment