Last active
March 25, 2019 11:48
-
-
Save samolisov/bdd493e8d3fffe0d0a3ed8a0b2bad51d to your computer and use it in GitHub Desktop.
Cormen et al, 16.3 Huffman code (test for German)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import heapq | |
def main(): | |
freqmap = {'a':45, 'b':13, 'c':12, 'd':16, 'e':9, 'f':5} | |
char_to_code = huffman(freqmap) | |
print(char_to_code) | |
print(invert_dictionary(char_to_code)) | |
print() | |
# see https://en.wikipedia.org/wiki/Letter_frequency#Relative_frequencies_of_letters_in_other_languages | |
freqmap_german = {'a':6.516, 'b':1.886, 'c':2.732, 'd':5.076, 'e':16.396, 'f':1.656, 'g':3.009, 'h':4.577, 'i':6.550, | |
'j':0.268, 'k':1.417, 'l':3.437, 'm':2.534, 'n':9.776, 'o':2.594, 'p':0.670, 'q':0.018, 'r':7.003, | |
's':7.270, 't':6.154, 'u':4.166, 'v':0.846, 'w':1.921, 'x':0.034, 'y':0.039, 'z':1.134, | |
'ß':0.307, 'ä':0.578, 'ö':0.443, 'ü':0.995} | |
char_to_code = huffman(freqmap_german) | |
print(char_to_code) | |
print() | |
text = "aaaa" | |
print(text) | |
char_to_code, encoded = huffman_encode(text) | |
print(char_to_code) | |
decoded = huffman_decode(invert_dictionary(char_to_code), encoded) | |
print(decoded) | |
print("Is text encoded/decoded correctly?:", (decoded == text)) | |
print() | |
text = "Du grosses Gestirn! Was wäre dein Glück, wenn du nicht Die hättest, welchen du leuchtest!" | |
print(text) | |
char_to_code, encoded = huffman_encode(text) | |
decoded = huffman_decode(invert_dictionary(char_to_code), encoded) | |
print(decoded) | |
print("Is text encoded/decoded correctly?:", (decoded == text)) | |
def huffman(freqmap): | |
""" | |
Builds a code map emloyeing Huffman's algorithm (see Cormen et al, chapter 16.3). The frequences are given in the 'freqmap' | |
parameter. The function returns a map of "char" - "code". Because we are using a queue implemented as a binary min-heap (see the | |
documentation for the standard library: https://docs.python.org/3/library/heapq.html), the total running time of the algorithm | |
on a set of n characters is O(n lg n). We can reduce the running time to O(n lg lg n) by replacing the binary min-heap with | |
a van Emde Boas tree (see Cormen et al, chapter 20). | |
Args: | |
----- | |
freqmap: a map of a character to its frequency in a text, dictionary. For example, | |
{'a':45, 'b':13, 'c':12, 'd':16, 'e':9, 'f':5}. | |
Returns: | |
-------- | |
a map of a charater to its Huffman's prefix-code, for example, | |
{'a': '0', 'b': '101', 'c': '100', 'd': '111', 'e': '1101', 'f': '1100'}. | |
""" | |
class Node: | |
def __init__(self, freq, char, left = None, right = None): | |
self.__left = left | |
self.__right = right | |
self.__freq = freq | |
self.__char = char | |
def addLeft(self, left): | |
self.__left = left | |
def addRight(self, right): | |
self.__right = right | |
def freq(self): | |
return self.__freq | |
def char(self): | |
return self.__char | |
def hasLeft(self): | |
return self.__left != None | |
def hasRight(self): | |
return self.__right != None | |
def isTerminal(self): | |
return not(self.hasLeft()) and not(self.hasRight()) | |
def left(self): | |
return self.__left | |
def right(self): | |
return self.__right | |
def __lt__(self, other): | |
return self.freq() < other.freq() | |
queue = [Node(freq, key) for key, freq in freqmap.items()] | |
heapq.heapify(queue) | |
for i in range(len(freqmap) - 1): | |
left_node = heapq.heappop(queue) | |
right_node = heapq.heappop(queue) | |
heapq.heappush(queue, Node(left_node.freq() + right_node.freq(), None, left_node, right_node)) | |
root = heapq.heappop(queue) | |
# special case: freqmap with one element only | |
if root.isTerminal(): | |
return {root.char(): '0'} | |
stack = [(root, "")] | |
char_to_code = {} | |
while len(stack) > 0: | |
node, code = stack.pop() | |
if node.hasLeft(): | |
stack.append((node.left(), code + '0')) | |
if node.hasRight(): | |
stack.append((node.right(), code + '1')) | |
if node.isTerminal(): | |
char_to_code[node.char()] = code | |
return dict(sorted(char_to_code.items())) # sort the dict for better readability | |
def huffman_encode(text): | |
""" | |
Encodes the text 'text' to a binary representation (each binary number encoded as a character '1' or '0' for | |
the demonstration purposes) using a based on character frequency variable-length prefix-code. The prefix-code | |
is called as Huffman code (see the function 'huffman' that is used to build the Huffman code). | |
Args: | |
----- | |
text (string): a text for encoding. | |
Returns: | |
-------- | |
char_to_code: a map of a charater to its Huffman's prefix-code, for example, | |
{'a': '0', 'b': '101', 'c': '100', 'd': '111', 'e': '1101', 'f': '1100'}. | |
encoded: the encoded text 'text'. | |
""" | |
def build_freqmap(text): | |
""" | |
Analyses the 'text' text and builds a frequency map for the text: a map of a character to a normalized | |
number of how many times the character appears in the text. | |
Args: | |
----- | |
text (string): a text for analysis. | |
Returns: | |
-------- | |
a map of a charater to a normaliized number of occuriences of the character in the text, | |
e.g. {'a':45, 'b':13, 'c':12, 'd':16, 'e':9, 'f':5} | |
""" | |
length = len(text) | |
freqmap = {} | |
# get the frequences | |
for i in text: | |
freqmap[i] = freqmap.get(i, 0) + 1 | |
# normalize the frequences | |
for key in freqmap.keys(): | |
freqmap[key] = (freqmap[key] / length) * 100 | |
return freqmap | |
char_to_code = huffman(build_freqmap(text)) | |
encoded_symbols = [] | |
for x in text: | |
encoded_symbols.append(char_to_code[x]) | |
return char_to_code, ''.join(encoded_symbols) | |
def huffman_decode(code_to_char, encoded): | |
""" | |
Decodes an encoded using a variable-length prefix-code called as Huffman code (see the function 'huffman' that | |
is used to build the Huffman code) text. | |
Args: | |
----- | |
code_to_char: a map of a Huffman's prefix-code of a character to the character, for example, | |
{'0': 'a', '101': 'b', '100': 'c', '111': 'd', '1101': 'e', '1100': 'f'}. | |
encoded (string): an encoded text to be decoded. | |
Returns: | |
-------- | |
the decoded text (string). | |
""" | |
decoded_symbols = [] | |
symbol_start = 0 | |
symbol_end = 1 | |
length = len(encoded) | |
while symbol_end <= length: | |
key = encoded[symbol_start:symbol_end] | |
if key in code_to_char: | |
decoded_symbols.append(code_to_char[key]) | |
symbol_start = symbol_end | |
else: | |
symbol_end = symbol_end + 1 | |
if symbol_start < length: | |
raise ValueError("The encoded string cannot be recognized") | |
return ''.join(decoded_symbols) | |
def invert_dictionary(dictionary): | |
return dict(map(reversed, dictionary.items())) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment