Skip to content

Instantly share code, notes, and snippets.

@PirosB3
Created January 22, 2015 20:28
Show Gist options
  • Save PirosB3/884ceef51dfaedf9bbc9 to your computer and use it in GitHub Desktop.
Save PirosB3/884ceef51dfaedf9bbc9 to your computer and use it in GitHub Desktop.
import nltk
from collections import defaultdict
from itertools import count
from heapq import heapify, heappop, heappush
def huffman(seq, frq):
num = count()
trees = list(zip(frq, num, seq)) # ex (22, 2, 'a',)
heapify(trees)
while len(trees) > 1:
fa, _, a = heappop(trees)
fb, _, b = heappop(trees)
n = next(num)
heappush(trees, (fa + fb, n, [a, b]))
return trees[0][-1]
def codes(tree, prefix=""):
if len(tree) == 1:
yield (tree, prefix)
return
for bit, child in zip("01", tree):
for pair in codes(child, prefix + bit):
yield pair
def test_it_works():
chars = defaultdict(int)
words = nltk.corpus.brown.words()[0:10000]
for w in words:
for c in w.lower():
chars[c] += 1
seq, frq = zip(*chars.items())
tree = huffman(seq, frq)
print dict(codes(tree))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment