Last active
October 3, 2016 12:36
-
-
Save lazyatom/b5a830f2c445c87f06994c1cd8d61a26 to your computer and use it in GitHub Desktop.
Huffman encoding and decoding
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# Demonstrating Huffman encoding of strings to compress them | |
# This script accepts a string as an argument, and builds a | |
# Huffman tree to help compress them, then decodes the message using | |
# the same tree to verify that it worked OK. | |
# probabilities_from_book = { | |
# a: 0.25, | |
# b: 0.21, | |
# c: 0.18, | |
# d: 0.14, | |
# e: 0.09, | |
# f: 0.07, | |
# g: 0.06 | |
# } | |
class Node < Struct.new(:symbol, :probability, :left, :right, :label) | |
def to_s | |
[symbol, probability, label].inspect | |
end | |
end | |
def huffman_tree_for_probabilities(probabilities) | |
subtrees = probabilities.map { |s, prob| | |
Node.new(s, prob) | |
} | |
until subtrees.length == 1 | |
x, y = subtrees.pop(2) | |
y.label = '0' | |
x.label = '1' | |
new_subtree = Node.new(nil, x.probability + y.probability, x, y) | |
subtrees.push(new_subtree) | |
subtrees.sort_by! { |s| -s.probability } | |
end | |
subtrees.first | |
end | |
def table_for(tree, label='') | |
compound_label = "#{label}#{tree.label}" | |
if tree.symbol | |
{tree.symbol => compound_label} | |
else | |
table_for(tree.left, compound_label). | |
merge(table_for(tree.right, compound_label)) | |
end | |
end | |
def probabilities_from_string(string) | |
characters = string.split('') | |
uniq_characters = characters.uniq | |
character_counts = uniq_characters.inject({}) { |a,c| a.merge(c => characters.select { |x| x == c }.count) } | |
probabilities = character_counts.inject({}) { |a, (char,count)| a.merge(char => count.to_f / string.length) } | |
end | |
def huffman_tree_for_text(string) | |
huffman_tree_for_probabilities(probabilities_from_string(string)) | |
end | |
def encoded_as_bits(string) | |
table = table_for(huffman_tree_for_text(string)) | |
encoded_string = string.split('').map { |c| table[c] }.join | |
end | |
def decode(binary_string, huffman_tree) | |
table = table_for(huffman_tree).invert | |
bits = binary_string.split('') | |
current_word = [] | |
symbols = [] | |
while bits.any? | |
current_word << bits.shift | |
if symbol = table[current_word.join('')] | |
symbols << symbol | |
current_word.clear | |
end | |
end | |
symbols.join | |
end | |
if ARGV.empty? | |
puts "Usage: ruby huffman.rb <string to compress>" | |
exit | |
end | |
input_string = ARGV.join(' ') | |
input_string_length_in_bits = input_string.length * 8 | |
puts "input length in bits #{input_string_length_in_bits} (#{input_string.length} * 8)" | |
huffman_tree = huffman_tree_for_text(input_string) | |
encoded_string = encoded_as_bits(input_string) | |
puts | |
puts "Binary #{encoded_string}" | |
puts "Encoded string length in bits: #{encoded_string.length}" | |
puts "Compression: #{(encoded_string.length.to_f / (input_string_length_in_bits)) * 100}%" | |
puts | |
puts "Decoded: #{decode(encoded_string, huffman_tree)}" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ ruby huffman.rb this is a test of compression | |
input length in bits 232 (29 * 8) | |
Binary 1111100000100011001000110100101101111101000111111001111100110111010111001110001101111010000001001110110 | |
Encoded string length in bits: 103 | |
Compression: 44.396551724137936% | |
Decoded: this is a test of compression |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment