Created
January 20, 2016 16:05
-
-
Save bricef/6fba1c3545c29b3b3b7d to your computer and use it in GitHub Desktop.
Spam Filtering in F# - without probability combinations
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open System.IO | |
open System | |
/// The two classes a message can be assigned to.
type Label =
    | Ham
    | Spam

/// Placeholder classifier: ignores its argument and always answers `Spam`.
let classify msg : Label =
    Spam

/// For each label, a map from word to a floating-point count.
type Corpus = Map<Label, Map<string, float>>

/// A sequence of (label, message text) pairs.
type LabeledData = seq<Label * string>
/// Fraction of the messages in `labeledData` for which `classifier` returns
/// the label the message is paired with.
///
/// classifier:  function from message text to a predicted label.
/// labeledData: (expected label, message text) pairs.
/// Returns a value in [0.0, 1.0], or NaN when `labeledData` is empty
/// (the result is then 0.0 / 0.0).
let accuracyRate (classifier:string->Label) (labeledData:LabeledData) : float =
    // Count hits and items in a single pass so a lazy or non-repeatable
    // sequence is enumerated only once.
    let correct, total =
        labeledData
        |> Seq.fold
            (fun (correct, total) (label, msg) ->
                let hit = if classifier msg = label then 1.0 else 0.0
                (correct + hit, total + 1.0))
            (0.0, 0.0)
    correct / total
/// The empty starting corpus: both labels are present, each with no words.
let corpus:Corpus =
    Map.empty
    |> Map.add Ham Map.empty
    |> Map.add Spam Map.empty
/// Looks `key` up in `map`, returning the stored value when present and
/// 0.001 when the key is absent.
let getDefaultCount map key =
    match Map.tryFind key map with
    | Some count -> count
    | None -> 0.001
/// Builds a classifier from `corpus` and returns it as a function from a
/// message to a `Label`.
///
/// NOTE: the per-word probability helpers defined below are not yet combined
/// into a decision. The returned classifier is still a placeholder that
/// answers `Spam` for every message.
let makeClassifier (corpus:Corpus) =
    // Sum of all word counts stored under `label`. Indexing the corpus raises
    // if `label` is not one of its keys.
    let word_count (label:Label) : float =
        corpus.[label] |> Map.toArray |> Array.sumBy snd
    let total_word_count =
        word_count Ham + word_count Spam
    // Count of `word` under `label`; 0.001 stands in for words that were
    // never recorded under that label.
    let count_of word label =
        defaultArg (corpus.[label].TryFind word) 0.001
    // P(word | label)
    let p_word_given_label word label =
        count_of word label / word_count label
    // P(label)
    let p_label label : float =
        word_count label / total_word_count
    // P(word): both counts are summed BEFORE dividing by the total.
    let p_word word =
        (count_of word Spam + count_of word Ham) / total_word_count
    // Bayes' rule: P(label | word) = P(word | label) * P(label) / P(word),
    // using the prior of the label being asked about.
    let p_label_given_word label word =
        p_word_given_label word label * p_label label / p_word word
    let classify msg : Label = Spam
    classify
/// Splits `msg` into words on the space character.
///
/// Runs of consecutive spaces, and leading or trailing spaces, produce no
/// empty tokens; an empty message yields an empty array. No other
/// normalisation (case, punctuation) is applied.
let tokenise (msg:string) =
    msg.Split([|' '|], System.StringSplitOptions.RemoveEmptyEntries)
/// Builds a corpus of per-label word counts from labelled messages.
///
/// Each message is split with `tokenise`, and every resulting word adds 1.0
/// to that word's count under the message's label.
let buildCorpus (labeledData:LabeledData) : Corpus =
    // Record one more occurrence of `word` under `label`.
    let addWord label (acc:Corpus) word : Corpus =
        let counts = acc.[label]
        // Unseen words start from zero so the first occurrence is stored as
        // exactly 1.0; the 0.001 smoothing value is not a training count.
        let seen = defaultArg (Map.tryFind word counts) 0.0
        Map.add label (Map.add word (seen + 1.0) counts) acc
    // Fold every word of one labelled message into the accumulator.
    let addMessage (acc:Corpus) (label, msg) : Corpus =
        tokenise msg |> Seq.fold (addWord label) acc
    // Seed with the module-level empty `corpus`, which already holds an entry
    // for each label.
    Seq.fold addMessage corpus labeledData
/// Entry point: loads "SMSSpamCollection.txt" from the source directory,
/// holds out roughly 10% of the messages at random as a test set, builds a
/// corpus from the rest, and prints the classifier's accuracy on the test set.
/// Always returns exit code 0.
[<EntryPoint>]
let main argv =
    // Parse one "label<TAB>message" line. Lines without both fields are
    // skipped instead of raising an index-out-of-range exception.
    let parseLine (line:string) =
        match line.Split([|'\t'|]) with
        | fields when fields.Length >= 2 ->
            let label =
                match fields.[0] with
                | "ham" -> Ham
                | _ -> Spam
            Some (label, fields.[1])
        | _ -> None
    let datapath = Path.Combine(__SOURCE_DIRECTORY__, "SMSSpamCollection.txt")
    let labeledMessages = File.ReadAllLines(datapath) |> Array.choose parseLine
    let fraction = 0.1
    let r = new Random ()
    // Each message independently lands in the test set with probability `fraction`.
    let test, training = labeledMessages |> Array.partition (fun _ -> r.NextDouble() < fraction)
    let trained = buildCorpus training
    // The probability helpers are private to makeClassifier, so evaluate the
    // classifier it returns rather than calling them from here.
    let classifier : string -> Label = makeClassifier trained
    printfn "%A" (accuracyRate classifier test)
    0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment