dorianim · November 30, 2022 12:24
diff --git a/README.md b/README.md
diff --git a/huffman.py b/huffman.py
 class HuffmanTree:
    def __init__(self, char: str, count: int, left=None, right=None):
        self.char = char
        self.count = count
        self.left: HuffmanTree | None = left
        self.rigth: HuffmanTree | None = right

    def __repr__(self) -> str:
        return f"Node('{self.char}'({ord(self.char)}), {self.count})"

    def as_byte_list(self):
        if self.char == "\x00":
            return [ord(self.char)] + self.left.as_byte_list() + self.rigth.as_byte_list()

        return [ord(self.char)]

    @staticmethod
    def __count_chars(string: str) -> dict:
        counts = {}
        for char in string:
            if char in counts:
                counts[char] += 1
            else:
                counts[char] = 1

        return counts

    @staticmethod
    def from_string(string: str):
        char_counts = HuffmanTree.__count_chars(string)

        nodes = []
        for char in char_counts:
            nodes.append(HuffmanTree(char, char_counts[char]))

        while len(nodes) > 1:
            nodes.sort(key=lambda node: node.count, reverse=True)

            right = nodes.pop()
            left = nodes.pop()

            parent_node = HuffmanTree("\x00",
                                      left.count + right.count, left, right)
            nodes.append(parent_node)

        return nodes[0]


 def __calculate_lut(tree: HuffmanTree, dict={}, val=[]) -> dict:
    if tree.char != "\x00":
        dict[tree.char] = val
    else:
        dict = __calculate_lut(tree.left, dict, val + [0b1])
        dict = __calculate_lut(tree.rigth, dict, val + [0b0])

    return dict


 def __group_bits_as_bytes(bits: list) -> list:
    byte_list = []
    while len(bits) > 0:
        byte = 0b0
        for _ in range(8):
            byte = (byte << 1)
            if len(bits) > 0:
                byte |= bits.pop(0)
        byte_list.append(byte)

    return byte_list


 def encode(string) -> tuple:
    """Encode a string as a huffman compressed list of bytes
    The string must not conatain \x00 and \x01!

    Args:
        string (str): the string to encode

    Returns:
        tuple: the list and the HuffmanTree
    """
    if "\x00" in string or "\x01" in string:
        raise Exception("string must not conatain \\x00 or \\x01!")

    string += "\x01"
    tree = HuffmanTree.from_string(string)
    lut = __calculate_lut(tree)
    encoded = []
    for char in string:
        encoded += lut[char]

    encoded = __group_bits_as_bytes(encoded)
    return encoded, tree


 def __find_char(tree: HuffmanTree, encoded: list, current_index: list) -> str:
    # the current index is a list with one element, so it acts like a pointer
    if tree.char != "\x00":
        return tree.char

    current_bit = (encoded[int(current_index[0] / 8)] >>
                   (7 - (current_index[0] % 8))) & 1
    current_index[0] += 1

    if current_bit == 1:
        tree = tree.left
    else:
        tree = tree.rigth

    return __find_char(tree, encoded, current_index)


 def decode(encoded: list, tree: HuffmanTree) -> str:
    """Decode a huffman compressed list of bytes to a string

    Args:
        code (list): the encoded list
        tree (HuffmanTree): the tree

    Returns:
        str: the decoded string
    """
    string = ""
    current_index = [0]
    while True:
        c = __find_char(tree, encoded, current_index)

        if c == "\x01":
            break

        string += c

    return string


 def byte_list_to_c(name: str, bytes: list) -> str:

    # remove last comma
    headerString = "const uint8_t " + name + "[] = {\n  "
    counter = 0
    for byte in bytes:
        headerString += f"{hex(byte).zfill(2)}, "
        counter += 1
        if counter % 16 == 0:
            headerString += "\n  "

    # remove last comma
    headerString = headerString[:-2]
    headerString += "\n};"
    return headerString


 if __name__ == "__main__":
    string = "Hello World!"

    encoded, tree = encode(string)
    print(byte_list_to_c("huffman_tree", tree.as_byte_list()))
    print(byte_list_to_c("encoded_string", encoded))

    decoded = decode(encoded, tree)
    assert decoded == string

    print(
        f"Without compression: {len(string)}, with compression: {len(encoded)}, gain: {int((1 - len(encoded) / len(string)) * 100)}%")
diff --git a/huffmann.c b/huffmann.c
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>

 const uint8_t huffman_tree[] = {
    0x0, 0x0, 0x0, 0x0, 0x20, 0x57, 0x0, 0x48, 0x65, 0x0, 0x0, 0x21, 0x1, 0x0, 0x72, 0x64,
    0x0, 0x6c, 0x6f};
 const uint8_t encoded_string[] = {
    0xdc, 0x53, 0xf8, 0x96, 0x2e, 0x80};

 typedef struct Node
 {
    char c;
    struct Node *p;
    struct Node *l;
    struct Node *r;
 } Node;

 Node *build_tree(const uint8_t *raw_tree)
 {
    Node *root = NULL;
    Node *last_node = NULL;

    int children_missing = 0;
    do
    {
        Node *current_node = (Node *)malloc(sizeof(Node));

        current_node->p = last_node;
        current_node->c = *raw_tree;
        raw_tree++;

        if (current_node->c == 0)
        {
            children_missing += 2;
            last_node = current_node;
        }

        if (root == NULL)
        {
            root = current_node;
            continue;
        }

        if (current_node->p->l == NULL)
        {
            current_node->p->l = current_node;
        }
        else
        {
            current_node->p->r = current_node;
        }
        children_missing--;

        while (last_node != NULL && last_node->r != NULL)
        {
            last_node = last_node->p;
        }
    } while (children_missing > 0);

    return root;
 }

 void print_tree(Node *tree, int level)
 {
    if (tree == NULL)
        return;

    level += 5;

    print_tree(tree->r, level);

    for (int i = 5; i < level; i++)
        printf(" ");

    if (tree->c == 0)
        printf("<\n");
    else if (tree->c == ' ')
        printf("␣\n");
    else
        printf("%c\n", tree->c);

    print_tree(tree->l, level);
 }

 char find_char(Node *tree, const uint8_t encoded[], size_t *current_index)
 {
    if (tree->c != 0)
        return tree->c;

    uint8_t current_bit = (encoded[*current_index / 8] >> (7 - (*current_index % 8))) & 1;
    (*current_index)++;

    if (current_bit == 1)
        tree = tree->l;
    else
        tree = tree->r;

    return find_char(tree, encoded, current_index);
 }

 char *decode(Node *tree, const uint8_t encoded[])
 {
    char *string = (char *)malloc(sizeof(char));
    size_t current_length = 1;
    size_t current_index = 0;
    for (;;)
    {
        char c = find_char(tree, encoded, &current_index);
        if (c == 1)
        {
            string[current_length - 1] = 0;
            break;
        }
        string[current_length - 1] = c;
        string = (char *)realloc((void *)string, sizeof(char) * current_length++);
    }
    return string;
 }

 int main(int argc, char **argv)
 {
    Node *tree = build_tree(huffman_tree);
    char *string = decode(tree, encoded_string);
    printf("%s\n", string);
 }
	class HuffmanTree:
	def __init__(self, char: str, count: int, left=None, right=None):
	self.char = char
	self.count = count
	self.left: HuffmanTree \| None = left
	self.rigth: HuffmanTree \| None = right

	def __repr__(self) -> str:
	return f"Node('{self.char}'({ord(self.char)}), {self.count})"

	def as_byte_list(self):
	if self.char == "\x00":
	return [ord(self.char)] + self.left.as_byte_list() + self.rigth.as_byte_list()

	return [ord(self.char)]

	@staticmethod
	def __count_chars(string: str) -> dict:
	counts = {}
	for char in string:
	if char in counts:
	counts[char] += 1
	else:
	counts[char] = 1

	return counts

	@staticmethod
	def from_string(string: str):
	char_counts = HuffmanTree.__count_chars(string)

	nodes = []
	for char in char_counts:
	nodes.append(HuffmanTree(char, char_counts[char]))

	while len(nodes) > 1:
	nodes.sort(key=lambda node: node.count, reverse=True)

	right = nodes.pop()
	left = nodes.pop()

	parent_node = HuffmanTree("\x00",
	left.count + right.count, left, right)
	nodes.append(parent_node)

	return nodes[0]


	def __calculate_lut(tree: HuffmanTree, dict={}, val=[]) -> dict:
	if tree.char != "\x00":
	dict[tree.char] = val
	else:
	dict = __calculate_lut(tree.left, dict, val + [0b1])
	dict = __calculate_lut(tree.rigth, dict, val + [0b0])

	return dict


	def __group_bits_as_bytes(bits: list) -> list:
	byte_list = []
	while len(bits) > 0:
	byte = 0b0
	for _ in range(8):
	byte = (byte << 1)
	if len(bits) > 0:
	byte \|= bits.pop(0)
	byte_list.append(byte)

	return byte_list


	def encode(string) -> tuple:
	"""Encode a string as a huffman compressed list of bytes
	The string must not conatain \x00 and \x01!

	Args:
	string (str): the string to encode

	Returns:
	tuple: the list and the HuffmanTree
	"""
	if "\x00" in string or "\x01" in string:
	raise Exception("string must not conatain \\x00 or \\x01!")

	string += "\x01"
	tree = HuffmanTree.from_string(string)
	lut = __calculate_lut(tree)
	encoded = []
	for char in string:
	encoded += lut[char]

	encoded = __group_bits_as_bytes(encoded)
	return encoded, tree


	def __find_char(tree: HuffmanTree, encoded: list, current_index: list) -> str:
	# the current index is a list with one element, so it acts like a pointer
	if tree.char != "\x00":
	return tree.char

	current_bit = (encoded[int(current_index[0] / 8)] >>
	(7 - (current_index[0] % 8))) & 1
	current_index[0] += 1

	if current_bit == 1:
	tree = tree.left
	else:
	tree = tree.rigth

	return __find_char(tree, encoded, current_index)


	def decode(encoded: list, tree: HuffmanTree) -> str:
	"""Decode a huffman compressed list of bytes to a string

	Args:
	code (list): the encoded list
	tree (HuffmanTree): the tree

	Returns:
	str: the decoded string
	"""
	string = ""
	current_index = [0]
	while True:
	c = __find_char(tree, encoded, current_index)

	if c == "\x01":
	break

	string += c

	return string


	def byte_list_to_c(name: str, bytes: list) -> str:

	# remove last comma
	headerString = "const uint8_t " + name + "[] = {\n "
	counter = 0
	for byte in bytes:
	headerString += f"{hex(byte).zfill(2)}, "
	counter += 1
	if counter % 16 == 0:
	headerString += "\n "

	# remove last comma
	headerString = headerString[:-2]
	headerString += "\n};"
	return headerString


	if __name__ == "__main__":
	string = "Hello World!"

	encoded, tree = encode(string)
	print(byte_list_to_c("huffman_tree", tree.as_byte_list()))
	print(byte_list_to_c("encoded_string", encoded))

	decoded = decode(encoded, tree)
	assert decoded == string

	print(
	f"Without compression: {len(string)}, with compression: {len(encoded)}, gain: {int((1 - len(encoded) / len(string)) * 100)}%")
	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>

	const uint8_t huffman_tree[] = {
	0x0, 0x0, 0x0, 0x0, 0x20, 0x57, 0x0, 0x48, 0x65, 0x0, 0x0, 0x21, 0x1, 0x0, 0x72, 0x64,
	0x0, 0x6c, 0x6f};
	const uint8_t encoded_string[] = {
	0xdc, 0x53, 0xf8, 0x96, 0x2e, 0x80};

	typedef struct Node
	{
	char c;
	struct Node *p;
	struct Node *l;
	struct Node *r;
	} Node;

	Node build_tree(const uint8_t raw_tree)
	{
	Node *root = NULL;
	Node *last_node = NULL;

	int children_missing = 0;
	do
	{
	Node current_node = (Node )malloc(sizeof(Node));

	current_node->p = last_node;
	current_node->c = *raw_tree;
	raw_tree++;

	if (current_node->c == 0)
	{
	children_missing += 2;
	last_node = current_node;
	}

	if (root == NULL)
	{
	root = current_node;
	continue;
	}

	if (current_node->p->l == NULL)
	{
	current_node->p->l = current_node;
	}
	else
	{
	current_node->p->r = current_node;
	}
	children_missing--;

	while (last_node != NULL && last_node->r != NULL)
	{
	last_node = last_node->p;
	}
	} while (children_missing > 0);

	return root;
	}

	void print_tree(Node *tree, int level)
	{
	if (tree == NULL)
	return;

	level += 5;

	print_tree(tree->r, level);

	for (int i = 5; i < level; i++)
	printf(" ");

	if (tree->c == 0)
	printf("<\n");
	else if (tree->c == ' ')
	printf("␣\n");
	else
	printf("%c\n", tree->c);

	print_tree(tree->l, level);
	}

	char find_char(Node tree, const uint8_t encoded[], size_t current_index)
	{
	if (tree->c != 0)
	return tree->c;

	uint8_t current_bit = (encoded[current_index / 8] >> (7 - (current_index % 8))) & 1;
	(*current_index)++;

	if (current_bit == 1)
	tree = tree->l;
	else
	tree = tree->r;

	return find_char(tree, encoded, current_index);
	}

	char decode(Node tree, const uint8_t encoded[])
	{
	char string = (char )malloc(sizeof(char));
	size_t current_length = 1;
	size_t current_index = 0;
	for (;;)
	{
	char c = find_char(tree, encoded, &current_index);
	if (c == 1)
	{
	string[current_length - 1] = 0;
	break;
	}
	string[current_length - 1] = c;
	string = (char )realloc((void )string, sizeof(char) * current_length++);
	}
	return string;
	}

	int main(int argc, char **argv)
	{
	Node *tree = build_tree(huffman_tree);
	char *string = decode(tree, encoded_string);
	printf("%s\n", string);
	}