Skip to content

Instantly share code, notes, and snippets.

@paralax
Last active May 2, 2017 21:08
Show Gist options
  • Save paralax/86b00c34e915c2c3d77e to your computer and use it in GitHub Desktop.
Save paralax/86b00c34e915c2c3d77e to your computer and use it in GitHub Desktop.
Working on classifying text via character n-grams.
/// Kullback-Leibler divergence D(P || Q), in bits (log base 2), between two
/// n-gram count tables.
/// https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
///
/// `p` and `q` map each n-gram to its raw occurrence count. Each count is
/// divided by the sum of all counts in its own table to get a relative
/// frequency.
///
/// An n-gram of `p` that is missing from `q`, or whose count in `q` is not
/// positive, is assigned the fixed floor probability 0.00001 so that the
/// logarithm stays finite. Because of this floor the `q` side is not
/// re-normalised, so the result is a smoothed approximation of the divergence.
///
/// Entries of `p` whose count is not positive are skipped (the usual
/// 0 * log 0 = 0 convention); without this a single zero count would turn the
/// whole sum into NaN. An empty `p` yields 0.0.
let KullbackLeiblerD (p : Map<string,int>) (q : Map<string,int>) : double =
    // Probability used for n-grams that carry no mass in q.
    let unseenProbability = 0.00001

    // Sum of all counts in a table, as a float, without building a temporary list.
    let totalCount (table : Map<string,int>) : double =
        table |> Map.fold (fun acc _ count -> acc + count) 0 |> float

    let pTotal = totalCount p
    let qTotal = totalCount q

    // Relative frequency of `gram` in q, or the floor when q has no mass for it.
    let qProbability (gram : string) : double =
        match Map.tryFind gram q with
        | Some count when count > 0 -> float count / qTotal
        | _ -> unseenProbability

    p
    |> Map.toSeq
    // A term with P(i) = 0 contributes nothing; evaluating it would give 0 * -inf = NaN.
    |> Seq.filter (fun (_, count) -> count > 0)
    |> Seq.sumBy (fun (gram, count) ->
        let pProbability = float count / pTotal
        pProbability * System.Math.Log(pProbability / qProbability gram, 2.0))
/// Builds a table of character n-gram counts for `s`.
///
/// A window of `n` consecutive characters (the `char` values returned by
/// `ToCharArray`) is slid over the string one position at a time; each window
/// is turned back into a string and the number of times each distinct string
/// occurs is recorded. A string with fewer than `n` characters produces an
/// empty map.
let ngrams (s : string) (n: int) : Map<string,int> =
    let chars = s.ToCharArray()
    chars
    |> Seq.windowed n
    |> Seq.map (fun (window : char[]) -> System.String(window))
    |> Seq.countBy id
    |> Map.ofSeq
// Two sample sentences that share most of their words in a different order.
let s = "the lazy dog jumped over the quick fox"
let r = "the quick brown fox jumped over the lazy dog"

// Divergence of s from r over character trigrams, then over bigrams.
// The trailing comments are the F# Interactive results recorded with the original script.
(ngrams s 3, ngrams r 3) ||> KullbackLeiblerD // val it : double = 0.8020886671
(ngrams s 2, ngrams r 2) ||> KullbackLeiblerD // val it : double = 0.3597882936
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment