Last active
March 30, 2016 22:47
-
-
Save dskinner/ee01bec5e25046fdc5c5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"math" | |
"os" | |
"sort" | |
"strings" | |
) | |
// dict is the list of known words that EntropyDict looks for inside its
// input; a word found there is counted as one symbol.
var dict = []string{
	"daniel",
	"hello",
	"world",
	"世界",
	"ハローワールド",
	"とても幸せ",
}
// Match is a list of candidate symbols. It implements sort.Interface so that
// sorting places the longest strings (measured in bytes) first.
type Match []string

// Len returns the number of candidates held in the list.
func (m Match) Len() int {
	return len(m)
}

// Less reports whether the candidate at index i is strictly longer, in bytes,
// than the one at index j, which yields a longest-first ordering.
func (m Match) Less(i, j int) bool {
	return len(m[j]) < len(m[i])
}

// Swap exchanges the candidates at indexes i and j.
func (m Match) Swap(i, j int) {
	m[j], m[i] = m[i], m[j]
}
func EntropyDict(s string) (e float64) { | |
var m Match | |
// ranges overlapping occurrences | |
for _, w := range dict { | |
if strings.Contains(s, w) { | |
m = append(m, w) | |
} | |
} | |
// tack on runes | |
for _, r := range s { | |
m = append(m, string(r)) | |
} | |
sort.Sort(m) | |
// remove most of the overlap | |
var l []string | |
LOOP: | |
for _, a := range m { | |
for _, b := range l { | |
if strings.Contains(b, a) { | |
continue LOOP | |
} | |
} | |
l = append(l, a) | |
} | |
for _, w := range l { | |
n := strings.Count(s, w) | |
p := float64(n) / float64(len(s)) | |
e += p * math.Log(p) / math.Log(2) | |
} | |
// | |
return math.Abs(e) | |
} | |
// Entropy returns the Shannon entropy of s in bits per character, computed
// over the frequency of each distinct rune in s.
//
// Frequencies are relative to the number of runes in s, not its byte length,
// so multi-byte text is handled correctly (for example "世界" yields 1).
// Invalid UTF-8 bytes are each counted as the replacement rune U+FFFD.
// The result is 0 for the empty string.
func Entropy(s string) (e float64) {
	// Single pass: build a histogram of runes and count the total.
	counts := make(map[rune]int)
	total := 0
	for _, r := range s {
		counts[r]++
		total++
	}

	// Walk s again rather than the map so terms are summed in first-occurrence
	// order; map iteration order is random and would make the floating-point
	// sum vary between runs.
	for _, r := range s {
		n, ok := counts[r]
		if !ok {
			continue // this rune's term has already been added
		}
		delete(counts, r)
		p := float64(n) / float64(total)
		e += p * math.Log2(p)
	}

	return math.Abs(e)
}
func main() { | |
for _, arg := range os.Args[1:] { | |
fmt.Printf("word: %s\ndict: %v\nreg: %v\n", arg, EntropyDict(arg), Entropy(arg)) | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// test data sourced from the following url: | |
// https://crackstation.net/buy-crackstation-wordlist-password-cracking-dictionary.htm | |
package main | |
import ( | |
"bufio" | |
"fmt" | |
"os" | |
"testing" | |
) | |
func testEntropy(t *testing.T) { | |
f, err := os.Open("testdata/dict.txt") | |
if err != nil { | |
t.Error(err) | |
} | |
sc := bufio.NewScanner(f) | |
var tl, mt int | |
for sc.Scan() { | |
tl += 1 | |
w := sc.Text() | |
e := Entropy(w) | |
if e < 4 { | |
mt += 1 | |
fmt.Println(w) | |
} | |
} | |
fmt.Printf("total: %v\nmatch: %v\n", tl, mt) | |
} | |
func TestXKCD(t *testing.T) { | |
e := Entropy("correcthorsebatterystaple") | |
if fmt.Sprintf("%.2f", e) != "3.36" { | |
t.Fatal(e) | |
} | |
} | |
func BenchmarkEntropy(b *testing.B) { | |
for i := 0; i < b.N; i++ { | |
Entropy("correcthorsebatterystaple") | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment