bricef · January 20, 2016 16:05
diff --git a/NaiveBayesClassifier.fs b/NaiveBayesClassifier.fs

 open System.IO
 open System

 /// The two classes a message can be assigned to.
 type Label =
    | Ham
    | Spam

 /// Placeholder classifier: ignores its argument and labels everything as Spam.
 let classify msg : Label =
    Spam

 /// For each label, a map from word to a floating-point count for that word.
 type Corpus = Map<Label, Map<string, float>>
 /// A sequence of (label, message text) pairs.
 type LabeledData = seq<Label*string>

 /// Fraction of entries in `labeledData` for which `classifier` returns the
 /// label recorded alongside the message.
 ///
 /// The data is traversed exactly once, so lazily produced or non-repeatable
 /// sequences are not evaluated a second time just to obtain their length.
 /// An empty input yields NaN (0.0 / 0.0).
 let accuracyRate (classifier:string->Label) (labeledData:LabeledData) : float =
    let correct, total =
        labeledData
        |> Seq.fold
            (fun (hits, seen) (label, msg) ->
                let hit = if classifier msg = label then 1.0 else 0.0
                (hits + hit, seen + 1.0))
            (0.0, 0.0)
    correct / total


 /// Starting corpus: both labels are present, each with an empty word map.
 let corpus:Corpus =
    [ Ham; Spam ]
    |> List.map (fun label -> (label, Map.empty))
    |> Map.ofList

 /// Looks up `key` in `map`; yields 0.001 when the key is not present.
 let getDefaultCount map key =
    match Map.tryFind key map with
    | Some found -> found
    | None -> 0.001

 /// Builds a naive Bayes classifier from the per-label word counts in `corpus`.
 ///
 /// The returned function splits a message on spaces and picks the label with
 /// the larger value of  log P(label) + sum over words of log P(word | label).
 /// The evidence term P(word) is the same for both labels, so it is left out
 /// of the comparison. Spam is returned when the corpus holds no counts at all
 /// and when the two scores tie.
 ///
 /// `corpus` must contain an entry for both Ham and Spam.
 let makeClassifier (corpus:Corpus) =
    /// Sum of all word counts stored under `label`.
    let word_count (label:Label) : float =
        Map.toArray corpus.[label] |> Array.sumBy snd

    let total_word_count =
        (word_count Ham) + (word_count Spam)

    /// P(label), estimated as the label's share of all counted words.
    let p_label label : float =
        (word_count label) / total_word_count

    /// P(word | label); a word never seen under `label` is given the small
    /// pseudo-count 0.001 so that it does not zero out the whole product.
    let p_word_given_label word label =
        let wordcount = defaultArg (corpus.[label].TryFind word) 0.001
        wordcount / (word_count label)

    /// Log-space score of `label` for the given words. A label with no counted
    /// words can never win, and returning -infinity here avoids the
    /// 0/0 and log 0 that the formulas above would otherwise produce.
    let score label (words:string []) =
        if word_count label = 0.0 then
            -infinity
        else
            words
            |> Array.fold
                (fun acc word -> acc + log (p_word_given_label word label))
                (log (p_label label))

    let classify (msg:string) : Label =
        // Empty tokens (from repeated spaces) carry no information; skip them.
        let words = msg.Split([|' '|], System.StringSplitOptions.RemoveEmptyEntries)
        if score Ham words > score Spam words then Ham else Spam

    classify


 /// Splits a message into words on the space character.
 /// Leading, trailing and repeated spaces do not produce empty-string tokens,
 /// so the empty string is never reported as a word.
 let tokenise (msg:string) =
    msg.Split([|' '|], StringSplitOptions.RemoveEmptyEntries)


 /// Builds per-label word counts from labeled messages.
 ///
 /// Each message is tokenised with `tokenise`, and every token adds 1.0 to its
 /// count under the message's label, starting from the module-level `corpus`.
 let buildCorpus (labeledData:LabeledData) : Corpus =
    /// Returns `corpus` with the count of `word` under `label` increased by one.
    let addCorpus label word (corpus:Corpus) : Corpus =
        let labelMap = corpus.[label]
        // Counting starts from 0.0. The 0.001 default of getDefaultCount is a
        // smoothing value for lookups and would skew every stored count.
        let newcount:float = (defaultArg (Map.tryFind word labelMap) 0.0) + 1.0
        Map.add label (Map.add word newcount labelMap) corpus

    let addWordsToCorpus words label (corpus:Corpus) : Corpus =
        Seq.fold (fun c word -> addCorpus label word c) corpus words

    labeledData
    |> Seq.map (fun (l,msg) -> (l, tokenise msg))
    |> Seq.fold
        (fun acc (label,words) -> addWordsToCorpus words label acc)
        corpus
        
 /// Entry point: reads tab-separated "label<TAB>message" lines from
 /// SMSSpamCollection.txt next to this source file, holds out roughly 10% of
 /// them at random as a test set, builds a corpus from the rest and prints the
 /// accuracy of the resulting classifier on the held-out messages.
 [<EntryPoint>]
 let main argv =
    /// Turns the split fields of one line into a (label, message) pair.
    /// Any first field other than "ham" is treated as Spam.
    let strings2labels (labeledData:string []) =
        let label =
            match labeledData.[0] with
            | "ham" -> Ham
            | _ -> Spam
        (label, labeledData.[1])
    let datapath = Path.Combine(__SOURCE_DIRECTORY__, "SMSSpamCollection.txt")
    let labeledMessages =
        File.ReadAllLines(datapath)
        |> Array.map (fun (s:string) -> s.Split([|'\t'|]))
        // Lines without a tab have no message field; skip them instead of
        // failing on labeledData.[1].
        |> Array.filter (fun fields -> fields.Length >= 2)
        |> Array.map strings2labels
    let fraction = 0.1
    let r = new Random ()
    let test, training = labeledMessages |> Array.partition (fun _ -> r.NextDouble() < fraction)

    let corpus = buildCorpus training
    let classifier = makeClassifier corpus

    printfn "%A" (accuracyRate classifier test)

    0

	open System.IO
	open System

	/// The two classes a message can be assigned to.
	type Label = Ham | Spam

	/// Placeholder classifier: ignores its argument and labels everything as Spam.
	let classify msg : Label =
	    Spam

	/// For each label, a map from word to a floating-point count for that word.
	type Corpus = Map<Label, Map<string, float>>
	/// A sequence of (label, message text) pairs.
	type LabeledData = seq<Label*string>

	/// Fraction of entries in `labeledData` for which `classifier` returns the
	/// label recorded alongside the message.
	///
	/// The data is traversed exactly once, so lazily produced or non-repeatable
	/// sequences are not evaluated a second time just to obtain their length.
	/// An empty input yields NaN (0.0 / 0.0).
	let accuracyRate (classifier:string->Label) (labeledData:LabeledData) : float =
	    let correct, total =
	        labeledData
	        |> Seq.fold
	            (fun (hits, seen) (label, msg) ->
	                let hit = if classifier msg = label then 1.0 else 0.0
	                (hits + hit, seen + 1.0))
	            (0.0, 0.0)
	    correct / total


	/// Starting corpus: both labels are present, each with an empty word map.
	let corpus:Corpus =
	    Map.ofList [( Ham, Map.empty );( Spam, Map.empty )]

	/// Looks up `key` in `map`; yields 0.001 when the key is not present.
	let getDefaultCount map key =
	    defaultArg (Map.tryFind key map) 0.001

	/// Builds a naive Bayes classifier from the per-label word counts in `corpus`.
	///
	/// The returned function splits a message on spaces and picks the label with
	/// the larger value of  log P(label) + sum over words of log P(word | label).
	/// The evidence term P(word) is the same for both labels, so it is left out
	/// of the comparison. Spam is returned when the corpus holds no counts at all
	/// and when the two scores tie.
	///
	/// `corpus` must contain an entry for both Ham and Spam.
	let makeClassifier (corpus:Corpus) =
	    /// Sum of all word counts stored under `label`.
	    let word_count (label:Label) : float =
	        Map.toArray corpus.[label] |> Array.sumBy snd

	    let total_word_count =
	        (word_count Ham) + (word_count Spam)

	    /// P(label), estimated as the label's share of all counted words.
	    let p_label label : float =
	        (word_count label) / total_word_count

	    /// P(word | label); a word never seen under `label` is given the small
	    /// pseudo-count 0.001 so that it does not zero out the whole product.
	    let p_word_given_label word label =
	        let wordcount = defaultArg (corpus.[label].TryFind word) 0.001
	        wordcount / (word_count label)

	    /// Log-space score of `label` for the given words. A label with no counted
	    /// words can never win, and returning -infinity here avoids the
	    /// 0/0 and log 0 that the formulas above would otherwise produce.
	    let score label (words:string []) =
	        if word_count label = 0.0 then
	            -infinity
	        else
	            words
	            |> Array.fold
	                (fun acc word -> acc + log (p_word_given_label word label))
	                (log (p_label label))

	    let classify (msg:string) : Label =
	        // Empty tokens (from repeated spaces) carry no information; skip them.
	        let words = msg.Split([|' '|], System.StringSplitOptions.RemoveEmptyEntries)
	        if score Ham words > score Spam words then Ham else Spam

	    classify


	/// Splits a message into words on the space character.
	/// Leading, trailing and repeated spaces do not produce empty-string tokens,
	/// so the empty string is never reported as a word.
	let tokenise (msg:string) =
	    msg.Split([|' '|], StringSplitOptions.RemoveEmptyEntries)


	/// Builds per-label word counts from labeled messages.
	///
	/// Each message is tokenised with `tokenise`, and every token adds 1.0 to its
	/// count under the message's label, starting from the module-level `corpus`.
	let buildCorpus (labeledData:LabeledData) : Corpus =
	    /// Returns `corpus` with the count of `word` under `label` increased by one.
	    let addCorpus label word (corpus:Corpus) : Corpus =
	        let labelMap = corpus.[label]
	        // Counting starts from 0.0. The 0.001 default of getDefaultCount is a
	        // smoothing value for lookups and would skew every stored count.
	        let newcount:float = (defaultArg (Map.tryFind word labelMap) 0.0) + 1.0
	        Map.add label (Map.add word newcount labelMap) corpus

	    let addWordsToCorpus words label (corpus:Corpus) : Corpus =
	        Seq.fold (fun c word -> addCorpus label word c) corpus words

	    labeledData
	    |> Seq.map (fun (l,msg) -> (l, tokenise msg))
	    |> Seq.fold
	        (fun acc (label,words) -> addWordsToCorpus words label acc)
	        corpus

	/// Entry point: reads tab-separated "label<TAB>message" lines from
	/// SMSSpamCollection.txt next to this source file, holds out roughly 10% of
	/// them at random as a test set, builds a corpus from the rest and prints the
	/// accuracy of the resulting classifier on the held-out messages.
	[<EntryPoint>]
	let main argv =
	    /// Turns the split fields of one line into a (label, message) pair.
	    /// Any first field other than "ham" is treated as Spam.
	    let strings2labels (labeledData:string []) =
	        let label =
	            match labeledData.[0] with
	            | "ham" -> Ham
	            | _ -> Spam
	        (label, labeledData.[1])
	    let datapath = Path.Combine(__SOURCE_DIRECTORY__, "SMSSpamCollection.txt")
	    let labeledMessages =
	        File.ReadAllLines(datapath)
	        |> Array.map (fun (s:string) -> s.Split([|'\t'|]))
	        // Lines without a tab have no message field; skip them instead of
	        // failing on labeledData.[1].
	        |> Array.filter (fun fields -> fields.Length >= 2)
	        |> Array.map strings2labels
	    let fraction = 0.1
	    let r = new Random ()
	    let test, training = labeledMessages |> Array.partition (fun _ -> r.NextDouble() < fraction)

	    let corpus = buildCorpus training
	    let classifier = makeClassifier corpus

	    printfn "%A" (accuracyRate classifier test)

	    0