This can be used to encode a string using huffman compression in Python and decode it in Python or c.
Last active
November 30, 2022 12:24
-
-
Save dorianim/f2d61e9ae836dd073c0162547667282d to your computer and use it in GitHub Desktop.
Super simple Huffman text compression in Python and c
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class HuffmanTree: | |
def __init__(self, char: str, count: int, left=None, right=None): | |
self.char = char | |
self.count = count | |
self.left: HuffmanTree | None = left | |
self.rigth: HuffmanTree | None = right | |
def __repr__(self) -> str: | |
return f"Node('{self.char}'({ord(self.char)}), {self.count})" | |
def as_byte_list(self): | |
if self.char == "\x00": | |
return [ord(self.char)] + self.left.as_byte_list() + self.rigth.as_byte_list() | |
return [ord(self.char)] | |
@staticmethod | |
def __count_chars(string: str) -> dict: | |
counts = {} | |
for char in string: | |
if char in counts: | |
counts[char] += 1 | |
else: | |
counts[char] = 1 | |
return counts | |
@staticmethod | |
def from_string(string: str): | |
char_counts = HuffmanTree.__count_chars(string) | |
nodes = [] | |
for char in char_counts: | |
nodes.append(HuffmanTree(char, char_counts[char])) | |
while len(nodes) > 1: | |
nodes.sort(key=lambda node: node.count, reverse=True) | |
right = nodes.pop() | |
left = nodes.pop() | |
parent_node = HuffmanTree("\x00", | |
left.count + right.count, left, right) | |
nodes.append(parent_node) | |
return nodes[0] | |
def __calculate_lut(tree: HuffmanTree, dict={}, val=[]) -> dict: | |
if tree.char != "\x00": | |
dict[tree.char] = val | |
else: | |
dict = __calculate_lut(tree.left, dict, val + [0b1]) | |
dict = __calculate_lut(tree.rigth, dict, val + [0b0]) | |
return dict | |
def __group_bits_as_bytes(bits: list) -> list: | |
byte_list = [] | |
while len(bits) > 0: | |
byte = 0b0 | |
for _ in range(8): | |
byte = (byte << 1) | |
if len(bits) > 0: | |
byte |= bits.pop(0) | |
byte_list.append(byte) | |
return byte_list | |
def encode(string) -> tuple: | |
"""Encode a string as a huffman compressed list of bytes | |
The string must not conatain \x00 and \x01! | |
Args: | |
string (str): the string to encode | |
Returns: | |
tuple: the list and the HuffmanTree | |
""" | |
if "\x00" in string or "\x01" in string: | |
raise Exception("string must not conatain \\x00 or \\x01!") | |
string += "\x01" | |
tree = HuffmanTree.from_string(string) | |
lut = __calculate_lut(tree) | |
encoded = [] | |
for char in string: | |
encoded += lut[char] | |
encoded = __group_bits_as_bytes(encoded) | |
return encoded, tree | |
def __find_char(tree: HuffmanTree, encoded: list, current_index: list) -> str: | |
# the current index is a list with one element, so it acts like a pointer | |
if tree.char != "\x00": | |
return tree.char | |
current_bit = (encoded[int(current_index[0] / 8)] >> | |
(7 - (current_index[0] % 8))) & 1 | |
current_index[0] += 1 | |
if current_bit == 1: | |
tree = tree.left | |
else: | |
tree = tree.rigth | |
return __find_char(tree, encoded, current_index) | |
def decode(encoded: list, tree: HuffmanTree) -> str: | |
"""Decode a huffman compressed list of bytes to a string | |
Args: | |
code (list): the encoded list | |
tree (HuffmanTree): the tree | |
Returns: | |
str: the decoded string | |
""" | |
string = "" | |
current_index = [0] | |
while True: | |
c = __find_char(tree, encoded, current_index) | |
if c == "\x01": | |
break | |
string += c | |
return string | |
def byte_list_to_c(name: str, bytes: list) -> str: | |
# remove last comma | |
headerString = "const uint8_t " + name + "[] = {\n " | |
counter = 0 | |
for byte in bytes: | |
headerString += f"{hex(byte).zfill(2)}, " | |
counter += 1 | |
if counter % 16 == 0: | |
headerString += "\n " | |
# remove last comma | |
headerString = headerString[:-2] | |
headerString += "\n};" | |
return headerString | |
if __name__ == "__main__": | |
string = "Hello World!" | |
encoded, tree = encode(string) | |
print(byte_list_to_c("huffman_tree", tree.as_byte_list())) | |
print(byte_list_to_c("encoded_string", encoded)) | |
decoded = decode(encoded, tree) | |
assert decoded == string | |
print( | |
f"Without compression: {len(string)}, with compression: {len(encoded)}, gain: {int((1 - len(encoded) / len(string)) * 100)}%") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdint.h> | |
const uint8_t huffman_tree[] = { | |
0x0, 0x0, 0x0, 0x0, 0x20, 0x57, 0x0, 0x48, 0x65, 0x0, 0x0, 0x21, 0x1, 0x0, 0x72, 0x64, | |
0x0, 0x6c, 0x6f}; | |
const uint8_t encoded_string[] = { | |
0xdc, 0x53, 0xf8, 0x96, 0x2e, 0x80}; | |
typedef struct Node | |
{ | |
char c; | |
struct Node *p; | |
struct Node *l; | |
struct Node *r; | |
} Node; | |
Node *build_tree(const uint8_t *raw_tree) | |
{ | |
Node *root = NULL; | |
Node *last_node = NULL; | |
int children_missing = 0; | |
do | |
{ | |
Node *current_node = (Node *)malloc(sizeof(Node)); | |
current_node->p = last_node; | |
current_node->c = *raw_tree; | |
raw_tree++; | |
if (current_node->c == 0) | |
{ | |
children_missing += 2; | |
last_node = current_node; | |
} | |
if (root == NULL) | |
{ | |
root = current_node; | |
continue; | |
} | |
if (current_node->p->l == NULL) | |
{ | |
current_node->p->l = current_node; | |
} | |
else | |
{ | |
current_node->p->r = current_node; | |
} | |
children_missing--; | |
while (last_node != NULL && last_node->r != NULL) | |
{ | |
last_node = last_node->p; | |
} | |
} while (children_missing > 0); | |
return root; | |
} | |
void print_tree(Node *tree, int level) | |
{ | |
if (tree == NULL) | |
return; | |
level += 5; | |
print_tree(tree->r, level); | |
for (int i = 5; i < level; i++) | |
printf(" "); | |
if (tree->c == 0) | |
printf("<\n"); | |
else if (tree->c == ' ') | |
printf("␣\n"); | |
else | |
printf("%c\n", tree->c); | |
print_tree(tree->l, level); | |
} | |
char find_char(Node *tree, const uint8_t encoded[], size_t *current_index) | |
{ | |
if (tree->c != 0) | |
return tree->c; | |
uint8_t current_bit = (encoded[*current_index / 8] >> (7 - (*current_index % 8))) & 1; | |
(*current_index)++; | |
if (current_bit == 1) | |
tree = tree->l; | |
else | |
tree = tree->r; | |
return find_char(tree, encoded, current_index); | |
} | |
char *decode(Node *tree, const uint8_t encoded[]) | |
{ | |
char *string = (char *)malloc(sizeof(char)); | |
size_t current_length = 1; | |
size_t current_index = 0; | |
for (;;) | |
{ | |
char c = find_char(tree, encoded, ¤t_index); | |
if (c == 1) | |
{ | |
string[current_length - 1] = 0; | |
break; | |
} | |
string[current_length - 1] = c; | |
string = (char *)realloc((void *)string, sizeof(char) * current_length++); | |
} | |
return string; | |
} | |
int main(int argc, char **argv) | |
{ | |
Node *tree = build_tree(huffman_tree); | |
char *string = decode(tree, encoded_string); | |
printf("%s\n", string); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment