Skip to content

Instantly share code, notes, and snippets.

@bricef
Created January 25, 2016 09:27
Show Gist options
  • Save bricef/17e7e3a768d48a5e5082 to your computer and use it in GitHub Desktop.
Save bricef/17e7e3a768d48a5e5082 to your computer and use it in GitHub Desktop.
Start to Bayes filtering in F#
// Learn more about F# at http://fsharp.net
// See the 'F# Tutorial' project for more help.
open System.IO
open System
// 01 - Defining the problem
/// The two classes a message can be assigned to.
type Label =
    | Ham
    | Spam
/// Placeholder classifier: ignores `msg` and labels every message as Spam.
let classify msg : Label = Spam
/// Per-label word statistics: for each Label, a map from a word to a
/// floating-point count (see addCorpus for how counts are accumulated).
type Corpus = Map<Label, Map<string, float>>
// 02 - Checking the classifier
/// A sequence of (label, message text) pairs.
type LabeledData = seq<Label*string>
/// Fraction of the labelled messages for which `classifier` returns the
/// expected label.
///
/// classifier  - function under test, mapping a message to a Label.
/// labeledData - (expected label, message) pairs to score against.
///
/// Returns a value in [0.0, 1.0], or NaN when `labeledData` is empty
/// (the rate is undefined for zero samples).
let accuracyRate (classifier:string->Label) (labeledData:LabeledData) : float =
    // Count hits and samples in a single pass: `labeledData` is a seq and may
    // be lazy, so it must not be enumerated a second time just to get a length.
    let hits, total =
        labeledData
        |> Seq.fold
            (fun (hits, total) (label, msg) ->
                let hit = if classifier msg = label then 1 else 0
                (hits + hit, total + 1))
            (0, 0)
    if total = 0 then nan else float hits / float total
// 05 - Using Bayes
// first steps
/// The empty starting corpus: both labels present, no words recorded yet.
let corpus:Corpus =
    [ Ham; Spam ]
    |> List.map (fun label -> (label, Map.empty))
    |> Map.ofList
/// Sum of all word counts recorded under `label` in `corpus`.
/// Raises KeyNotFoundException if `label` has no entry in `corpus`.
let word_count label (corpus:Corpus) : float=
    corpus.[label]
    |> Map.fold (fun total _ count -> total + count) 0.0
/// Estimate of P(word | Spam): the word's count among spam messages divided
/// by the total spam word count.
///
/// A word that was never seen in spam falls back to a pseudo-count of 0.001
/// (the same default p_word_given_label uses) instead of raising
/// KeyNotFoundException.
let p_word_given_spam word (corpus:Corpus)=
    let spamWords = corpus.[Spam]
    let count = defaultArg (spamWords.TryFind word) 0.001
    count / (word_count Spam corpus)
/// Combined word count across both labels (Ham plus Spam).
let total_word_count corpus =
    let hamTotal = word_count Ham corpus
    let spamTotal = word_count Spam corpus
    hamTotal + spamTotal
/// Share of all counted words that were recorded under Spam.
let p_spam (corpus:Corpus)=
    let spamTotal = word_count Spam corpus
    spamTotal / total_word_count corpus
// generalise
/// Estimate of P(word | label): the word's count under `label` divided by the
/// total word count for that label. A word not recorded under `label` is
/// given a pseudo-count of 0.001.
let p_word_given_label word label (corpus:Corpus)=
    let labelWords = corpus.[label]
    let occurrences = defaultArg (labelWords.TryFind word) 0.001
    occurrences / word_count label corpus
/// Share of all counted words that were recorded under `label`.
let p_label label (corpus:Corpus) : float=
    let labelTotal = word_count label corpus
    let grandTotal = total_word_count corpus
    labelTotal / grandTotal
/// Estimate of P(word): the word's count summed over both labels, divided by
/// the total word count of the corpus. For each label under which the word
/// was not recorded, a pseudo-count of 0.001 is used.
let p_word word (corpus:Corpus) =
    let spamcount = defaultArg (corpus.[Spam].TryFind word) 0.001
    let hamcount = defaultArg (corpus.[Ham].TryFind word) 0.001
    // Parenthesise the sum: `/` binds tighter than `+`, so without the
    // parentheses only hamcount would be divided by the total.
    (spamcount + hamcount) / total_word_count corpus
/// Estimate of P(label | word) via Bayes' rule:
///     P(word | label) * P(label) / P(word)
let p_label_given_word label word (corpus:Corpus)=
    let likelihood = p_word_given_label word label corpus
    // The prior must be for the requested label, not hard-coded to Spam.
    let prior = p_label label corpus
    let evidence = p_word word corpus
    likelihood * prior / evidence
/// Looks up `key` in `map`; returns the stored count, or 0.001 when the key
/// is absent.
let getDefaultCount map key =
    match Map.tryFind key map with
    | Some count -> count
    | None -> 0.001
/// Splits a message into words on the space character.
///
/// Empty entries are dropped, so leading, trailing or repeated spaces do not
/// produce "" tokens (which would otherwise be counted as a word when the
/// corpus is built). An empty message yields an empty array.
let tokenise (msg:string) =
    msg.Split([|' '|], StringSplitOptions.RemoveEmptyEntries)
// 05 - Building your corpus
/// Returns a copy of `corpus` in which the count of `word` under `label` has
/// been increased by 1.0. Raises KeyNotFoundException if `label` has no entry
/// in `corpus`.
///
/// NOTE(review): the starting value for an unseen word comes from
/// getDefaultCount (0.001), so a word's first occurrence is stored as 1.001
/// rather than 1.0 - confirm whether that is intended.
let addCorpus label word (corpus:Corpus) : Corpus =
    let wordsForLabel = corpus.[label]
    let bumped = getDefaultCount wordsForLabel word + 1.0
    corpus |> Map.add label (wordsForLabel |> Map.add word bumped)
/// Builds a Corpus from labelled messages: every message is tokenised and
/// each token is added under the message's label, starting from the
/// module-level empty `corpus`.
let buildCorpus (labeledData:LabeledData) : Corpus =
    // Fold all tokens of a single message into the accumulated corpus.
    let absorbMessage (acc:Corpus) (label, msg) : Corpus =
        tokenise msg
        |> Seq.fold (fun soFar word -> addCorpus label word soFar) acc
    labeledData |> Seq.fold absorbMessage corpus
/// Entry point: loads SMSSpamCollection.txt from the source directory, holds
/// out a random ~10% of the messages, builds a corpus from the rest and prints
/// the estimated P(Spam | "me"). Always returns exit code 0.
[<EntryPoint>]
let main argv =
    // 03 - Loading the data set
    // Turn one tab-separated record into a (label, message) pair; any first
    // field other than "ham" is treated as Spam.
    let parseRecord (line:string) =
        let fields = line.Split([|'\t'|])
        let label = if fields.[0] = "ham" then Ham else Spam
        (label, fields.[1])
    let datapath = Path.Combine(__SOURCE_DIRECTORY__, "SMSSpamCollection.txt")
    let labeledMessages =
        File.ReadAllLines(datapath)
        |> Array.map parseRecord
    // Randomly route roughly `fraction` of the messages into the test set.
    let fraction = 0.1
    let rng = Random()
    let test, training =
        labeledMessages
        |> Array.partition (fun _ -> rng.NextDouble() < fraction)
    let trained = buildCorpus training
    let p = p_label_given_word Spam "me" trained
    // The accuracy check is not wired in yet: (accuracyRate classify test)
    printfn "%A" p
    0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment