Created
March 20, 2026 14:09
-
-
Save TobeTek/0e7cd69ef9c2a1c6ce7cc2ebcfc1d8dd to your computer and use it in GitHub Desktop.
An Attempt at Huffman Compression with Python: Supporting code for the tutorial walkthrough
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| class MinHeap: | |
| @classmethod | |
| def from_list(cls, arr: list): | |
| instance = cls() | |
| instance.heap = arr | |
| print(f"Creating heap from list: {arr}") | |
| n = len(instance.heap) | |
| for i in range(n // 2 - 1, -1, -1): | |
| print(f"\tHeapifying at index {i}") | |
| instance._sift_down(i) | |
| print(f"Final heap: {instance.heap}") | |
| return instance | |
| def __init__(self): | |
| self.heap = [] | |
| def push(self, val): | |
| self.heap.append(val) | |
| print(f"Pushed {val}, heap: {self.heap}") | |
| self._sift_up(len(self.heap) - 1) | |
| def pop(self): | |
| if len(self.heap) == 0: | |
| print("Heap is empty") | |
| return None | |
| if len(self.heap) == 1: | |
| return self.heap.pop() | |
| root = self.heap[0] | |
| self.heap[0] = self.heap.pop() | |
| print(f"Popped {root}, heap: {self.heap}") | |
| self._sift_down(0) | |
| return root | |
| def _sift_up(self, index): | |
| while index > 0: | |
| parent = (index - 1) // 2 | |
| if self.heap[index] < self.heap[parent]: | |
| print(f"\tSift up: swapping {self.heap[index]} with {self.heap[parent]}") | |
| self.heap[index], self.heap[parent] = self.heap[parent], self.heap[index] | |
| index = parent | |
| else: | |
| break | |
| def _sift_down(self, index): | |
| smallest = index | |
| left, right = 2 * index + 1, 2 * index + 2 | |
| if left < len(self.heap) and self.heap[left] < self.heap[smallest]: | |
| smallest = left | |
| if right < len(self.heap) and self.heap[right] < self.heap[smallest]: | |
| smallest = right | |
| if smallest != index: | |
| print(f"\tSift down: swapping {self.heap[index]} with {self.heap[smallest]}") | |
| self.heap[index], self.heap[smallest] = self.heap[smallest], self.heap[index] | |
| self._sift_down(smallest) | |
| def __len__(self): | |
| return len(self.heap) | |
| if __name__ == "__main__": | |
| heap = MinHeap.from_list([10, 5, 20, 1, 15]) | |
| heap.pop() | |
| heap.pop() | |
| print('Final heap:', heap.heap) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from collections import Counter | |
| from typing import Any, Dict, List, Tuple | |
| from heap import MinHeap | |
| def count_frequencies(text: str) -> Dict[str, int]: | |
| words = text.split() | |
| return dict(Counter(words)) | |
| def build_huffman_tree(frequencies: Dict[str, int]) -> Tuple[Dict[str, str], Any]: | |
| heap = [(f, [(s, (0, 0))]) for s, f in frequencies.items()] | |
| heap = MinHeap.from_list(heap) | |
| while len(heap) > 1: | |
| a_freq, a_list = heap.pop() | |
| b_freq, b_list = heap.pop() | |
| merged_list = [] | |
| for symbol, (bits, val) in a_list: | |
| merged_list.append((symbol, (bits + 1, val))) | |
| for symbol, (bits, val) in b_list: | |
| # Logic: (1 << bits) + val effectively adds a '1' to the front | |
| merged_list.append((symbol, (bits + 1, (1 << bits) + val))) | |
| new_freq = a_freq + b_freq | |
| heap.push((new_freq, merged_list)) | |
| final_list = heap.pop()[1] | |
| # Convert their (bitsize, value) format back to your string format "0101" | |
| codes = {} | |
| for symbol, (bits, val) in final_list: | |
| codes[symbol] = bin(val)[2:].rjust(bits, "0") | |
| return codes, final_list | |
| def huffman_encode(text: str, codes: Dict[str, str]) -> str: | |
| words = text.split() | |
| return "".join(codes[word] for word in words if word in codes) | |
| def huffman_decode(encoded: str, codes: Dict[str, str]) -> str: | |
| reverse_codes = {v: k for k, v in codes.items()} | |
| decoded = [] | |
| current_buffer = "" | |
| for bit in encoded: | |
| current_buffer += bit | |
| if current_buffer in reverse_codes: | |
| symbol = reverse_codes[current_buffer] | |
| decoded.append(symbol) | |
| current_buffer = "" | |
| return " ".join(decoded) | |
| if __name__ == "__main__": | |
| text= "one ring to rule them all one ring to find them one ring to bring them and in the darkness bind them" | |
| frequencies = count_frequencies(text) | |
| codes, _ = build_huffman_tree(frequencies) | |
| print("\nHUFFMAN CODES:") | |
| for symbol, code in sorted(codes.items(), key=lambda x: (len(x[1]), x[0])): | |
| print(f"{symbol:10}: {code}") | |
| encoded = huffman_encode(text, codes) | |
| decoded = huffman_decode(encoded, codes) | |
| print(f"\nEncoded : {encoded}") | |
| print(f"Decoded : {decoded}") | |
| print(f"Success : {text == decoded}") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment