Skip to content

Instantly share code, notes, and snippets.

@bholzer
Created December 15, 2016 01:53
Show Gist options
  • Save bholzer/f917e7b31744fb243c026e5ea2f29846 to your computer and use it in GitHub Desktop.
Save bholzer/f917e7b31744fb243c026e5ea2f29846 to your computer and use it in GitHub Desktop.
/** Scores how closely the letter distribution of `bytes` matches typical English text
  * using Pearson's chi-squared statistic; lower scores mean a closer match.
  *
  * Only ASCII letters (case-insensitively) and the space character are scored. Every
  * other byte is ignored and does not count towards the sample size. Bytes are
  * classified directly, so the result does not depend on the platform default charset
  * or locale.
  *
  * @param bytes raw bytes of the candidate text
  * @return the chi-squared statistic summed over every category in the frequency table,
  *         or 0.0 when `bytes` contains no scorable characters
  */
def englishChi2(bytes: Array[Byte]): Double = {
  // NOTE(review): these entries sum to about 1.19 (the letter shares alone sum to ~1.0
  // and the space share is added on top), so expected counts do not add up to the
  // sample size. Left unchanged so scores stay comparable with existing usage.
  val englishLetterFrequency = Map[Char, Double](
    'A' -> 0.08167, 'B' -> 0.01492, 'C' -> 0.02782, 'D' -> 0.04253,
    'E' -> 0.12702, 'F' -> 0.02228, 'G' -> 0.02015, 'H' -> 0.06094,
    'I' -> 0.06966, 'J' -> 0.00153, 'K' -> 0.00772, 'L' -> 0.04025,
    'M' -> 0.02406, 'N' -> 0.06749, 'O' -> 0.07507, 'P' -> 0.01929,
    'Q' -> 0.00095, 'R' -> 0.05987, 'S' -> 0.06327, 'T' -> 0.09056,
    'U' -> 0.02758, 'V' -> 0.00978, 'W' -> 0.02360, 'X' -> 0.00150,
    'Y' -> 0.01974, 'Z' -> 0.00074, ' ' -> 0.19181
  )

  // Single pass over the input: upper-case ASCII letters by hand (avoids locale-sensitive
  // String.toUpperCase) and count only the characters present in the table.
  val observedCounts = bytes.foldLeft(Map.empty[Char, Int]) { (counts, byte) =>
    val raw = (byte & 0xff).toChar
    val symbol = if (raw >= 'a' && raw <= 'z') (raw - 'a' + 'A').toChar else raw
    if (englishLetterFrequency.contains(symbol))
      counts.updated(symbol, counts.getOrElse(symbol, 0) + 1)
    else
      counts
  }

  val scoredLength = observedCounts.values.sum

  // https://en.wikipedia.org/wiki/Chi-squared_test
  if (scoredLength == 0) {
    // With no observations every expected count is zero and the statistic is 0/0;
    // keep returning 0.0 as before instead of NaN.
    0.0
  } else {
    // Sum over ALL categories: a letter that never occurs still contributes its full
    // expected count, which is what penalises text built from only a few letters.
    englishLetterFrequency.foldLeft(0.0) { case (chi, (symbol, probability)) =>
      val expectedCount = scoredLength * probability
      val difference = observedCounts.getOrElse(symbol, 0) - expectedCount
      chi + (difference * difference / expectedCount)
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment