Skip to content

Instantly share code, notes, and snippets.

@potat-dev
Last active November 6, 2024 16:25
Show Gist options
  • Save potat-dev/75f9e71e44a29ac6eb9346767fef2034 to your computer and use it in GitHub Desktop.
Save potat-dev/75f9e71e44a29ac6eb9346767fef2034 to your computer and use it in GitHub Desktop.
takes a text string and creates different types of binary codes for it (Huffman, Shannon, Gilbert-Moore)
import math
from collections import Counter, defaultdict
from typing import Dict, List, Tuple
def calculate_frequencies(text: str) -> Dict[str, float]:
print(f"\nДлина текста: {len(text)} символов")
# Count occurrences
counts = Counter(text)
print("\nКоличество каждого символа:")
for char, count in sorted(counts.items()):
print(f"'{char}': {count} раз")
# Calculate frequencies
total = len(text)
frequencies = {char: round(count/total, 2) for char, count in counts.items()}
print("\nЧастоты символов p(x):")
for char, freq in sorted(frequencies.items()):
print(f"p('{char}') = {count}/{total} ≈ {freq:.2f}")
return frequencies
def calculate_entropy(frequencies: Dict[str, float]) -> float:
components = []
total = 0
print("\nЭнтропия символов H(X):")
for char, p in sorted(frequencies.items()):
component = -p * math.log2(p)
components.append(component)
print(f"Для символа '{char}': -({p:.2f}) × log₂({p:.2f}) = {component:.4f}")
total += component
print(f"\nH(X) = {' + '.join([f'{c:.4f}' for c in components])} = {total:.4f} бит")
return total
def calculate_uniform_code_params(alphabet_size: int, text_length: int) -> Tuple[int, int]:
l = math.ceil(math.log2(alphabet_size))
N = l * text_length
print(f"\nРазмер алфавита M = {alphabet_size}")
print(f"Длина кода l = ⌈log₂({alphabet_size})⌉ = ⌈{math.log2(alphabet_size):.4f}⌉ = {l}")
print(f"Размер закодированной последовательности N = l × {text_length} = {l} × {text_length} = {N} бит")
return l, N
def build_huffman_codes(frequencies: Dict[str, float]) -> Dict[str, str]:
class HuffmanNode:
def __init__(self, char=None, freq=0):
self.char = char
self.freq = freq
self.left = None
self.right = None
def __lt__(self, other):
return self.freq < other.freq
if len(frequencies) <= 1:
return {next(iter(frequencies.keys())): '0'}
# Create initial nodes
nodes = [HuffmanNode(char, freq) for char, freq in frequencies.items()]
print("\nНачальные узлы:")
for node in sorted(nodes, key=lambda x: (-x.freq, x.char)):
print(f"Символ: '{node.char}', Частота: {node.freq:.2f}")
# Build tree
print("\nПостроение дерева:")
while len(nodes) > 1:
nodes.sort(key=lambda x: (x.freq, x.char if x.char else ''))
left = nodes.pop(0)
right = nodes.pop(0)
print(f"Объединяем: {left.char or 'внутр.'} ({left.freq:.2f}) и {right.char or 'внутр.'} ({right.freq:.2f})")
internal = HuffmanNode(freq=left.freq + right.freq)
internal.left = left
internal.right = right
nodes.append(internal)
# Generate codes
codes = {}
def traverse(node, code=''):
if node.char is not None:
codes[node.char] = code
print(f"Символ: '{node.char}', Код: {code}")
return
traverse(node.left, code + '0')
traverse(node.right, code + '1')
print("\nПолученные коды:")
traverse(nodes[0])
return codes
def build_shannon_codes(frequencies: Dict[str, float]) -> Tuple[Dict[str, str], Dict[str, float], Dict[str, int]]:
# Sort by frequency
sorted_chars = sorted(frequencies.items(), key=lambda x: (-x[1], x[0]))
print("\nСимволы, отсортированные по частоте:")
for char, freq in sorted_chars:
print(f"'{char}': {freq:.2f}")
# Calculate q(x)
q = defaultdict(float)
running_sum = 0
print("\nРасчет q(x):")
for char, _ in sorted_chars:
q[char] = running_sum
print(f"q('{char}') = {running_sum:.4f}")
running_sum += frequencies[char]
# Calculate lengths and codes
l = {char: math.ceil(-math.log2(frequencies[char])) for char, _ in sorted_chars}
codes = {}
print("\nПостроение кодов:")
for char, _ in sorted_chars:
length = l[char]
code_value = int(q[char] * 2**length)
codes[char] = format(code_value, f'0{length}b')
print(f"Символ: '{char}'")
print(f" l(x) = ⌈-log₂({frequencies[char]:.4f})⌉ = {length}")
print(f" Код: {codes[char]}")
print(f" q(x)2: {q[char]:.5f}")
return codes, q, l
def build_gilbert_moore_codes(text: str, frequencies: Dict[str, float]) -> Tuple[Dict[str, str], Dict[str, float], Dict[str, float], Dict[str, int]]:
# Get unique characters in order of appearance
seen = []
for char in text:
if char not in seen:
seen.append(char)
print("\nПорядок появления символов в тексте:")
print(''.join(seen))
# Calculate q(x) and σ(x)
q = defaultdict(float)
running_sum = 0
print("\nРасчет q(x) и σ(x):")
for char in seen:
q[char] = running_sum + frequencies[char]/2
print(f"'{char}': q(x) = {running_sum:.4f} + {frequencies[char]:.2f}/2 = {q[char]:.4f}")
running_sum += frequencies[char]
sigma = {char: q[char] for char in seen}
# Calculate lengths and codes
l = {char: math.ceil(-math.log2(frequencies[char])) + 1 for char in seen}
codes = {}
print("\nПостроение кодов:")
for char in seen:
length = l[char]
code_value = int(sigma[char] * 2**length)
codes[char] = format(code_value, f'0{length}b')
print(f"Символ: '{char}'")
print(f" l(x) = ⌈-log₂({frequencies[char]:.4f})⌉ + 1 = {length}")
print(f" Код: {codes[char]}")
print(f" σ(x)2: {sigma[char]:.5f}")
return codes, q, sigma, l
def calculate_avg_length(frequencies: Dict[str, float], codes: Dict[str, str]) -> float:
components = []
for char in frequencies:
component = frequencies[char] * len(codes[char])
components.append(component)
print(f"p('{char}') × l('{char}') = {frequencies[char]:.2f} × {len(codes[char])} = {component:.4f}")
avg_length = sum(components)
print(f"\nСредняя длина l = {' + '.join([f'{c:.4f}' for c in components])} = {avg_length:.4f}")
return avg_length
def calculate_sequence_size(text: str, codes: Dict[str, str]) -> int:
N = sum(len(codes[char]) for char in text)
print(f"Размер последовательности N = {N} бит")
return N
# Main code
# text = "дятел лечит древний дуб, добрый дятел дубу люб."
# text = "даже шею, даже уши ты испачкал в черной туши."
text = "шишкодробные шишки высушиваются в шишкосушилках."
print("Исходный текст:", text)
# Calculate frequencies
print("\nШаг 1: Расчет частот символов")
frequencies = calculate_frequencies(text)
# Calculate entropy
print("\nШаг 2: Расчет энтропии H(X)")
print("\nH(X) = -∑p(x)log₂p(x)")
H = calculate_entropy(frequencies)
# Uniform code parameters
print("\nШаг 3: Расчет параметров равномерного кода")
uniform_l, uniform_N = calculate_uniform_code_params(len(frequencies), len(text))
# Huffman coding
print("\nШаг 4: Построение кода Хаффмана")
huffman_codes = build_huffman_codes(frequencies)
print("\nРасчет средней длины кода Хаффмана:")
huffman_l = calculate_avg_length(frequencies, huffman_codes)
huffman_N = calculate_sequence_size(text, huffman_codes)
huffman_k = uniform_N / huffman_N
print(f"Коэффициент сжатия κ = {uniform_N}/{huffman_N} = {huffman_k:.4f}")
# Shannon coding
print("\nШаг 5: Построение кода Шеннона")
shannon_codes, shannon_q, shannon_l = build_shannon_codes(frequencies)
print("\nРасчет средней длины кода Шеннона:")
shannon_avg_l = calculate_avg_length(frequencies, shannon_codes)
shannon_N = calculate_sequence_size(text, shannon_codes)
shannon_k = uniform_N / shannon_N
print(f"Коэффициент сжатия κ = {uniform_N}/{shannon_N} = {shannon_k:.4f}")
# Gilbert-Moore coding
print("\nШаг 6: Построение кода Гилберта-Мура")
gm_codes, gm_q, gm_sigma, gm_l = build_gilbert_moore_codes(text, frequencies)
print("\nРасчет средней длины кода Гилберта-Мура:")
gm_avg_l = calculate_avg_length(frequencies, gm_codes)
gm_N = calculate_sequence_size(text, gm_codes)
gm_k = uniform_N / gm_N
print(f"Коэффициент сжатия κ = {uniform_N}/{gm_N} = {gm_k:.4f}")
# Print final tables
print("\n" + "="*50)
print("ИТОГОВЫЕ ТАБЛИЦЫ:")
print("="*50)
print("\nКод Хаффмана")
print(f"{'x':4} {'p(x)':>6} {'c(x)':8} {'l(x)':>4}")
for char in sorted(frequencies.keys()):
print(f"{char:4} {frequencies[char]:6.2f} {huffman_codes[char]:8} {len(huffman_codes[char]):4}")
print("\nКод Шеннона")
print(f"{'x':4} {'p(x)':>6} {'q(x)':>8} {'l(x)':>4} {'q(x)2':8} {'c(x)':8}")
for char in sorted(frequencies.keys()):
print(f"{char:4} {frequencies[char]:6.2f} {shannon_q[char]:8.4f} {shannon_l[char]:4} {format(int(shannon_q[char] * 2**shannon_l[char]), f'0{shannon_l[char]}b'):8} {shannon_codes[char]:8}")
print("\nКод Гилберта-Мура")
print(f"{'x':4} {'p(x)':>6} {'q(x)':>8} {'σ(x)':>8} {'σ(x)2':8} {'l(x)':>4} {'c(x)':8}")
for char in sorted(frequencies.keys()):
print(f"{char:4} {frequencies[char]:6.2f} {gm_q[char]:8.4f} {gm_sigma[char]:8.4f} {format(int(gm_sigma[char] * 2**gm_l[char]), f'0{gm_l[char]}b'):8} {gm_l[char]:4} {gm_codes[char]:8}")
print("\nСравнительная таблица")
print(f"{'Код':16} {'l':>8} {'N':>8} {'κ':>8}")
print(f"{'Равномерный':16} {uniform_l:8} {uniform_N:8} {1:8.4f}")
print(f"{'Гилберт-Мур':16} {gm_avg_l:8.2f} {gm_N:8} {gm_k:8.4f}")
print(f"{'Шеннон':16} {shannon_avg_l:8.2f} {shannon_N:8} {shannon_k:8.4f}")
print(f"{'Хаффман':16} {huffman_l:8.2f} {huffman_N:8} {huffman_k:8.4f}")
print(f"\nH(X) = {H:.4f}")

1. Calculating Character Frequencies p(x)

  • First, we need to count how many times each character appears in the text and divide by total length
  • For text "дятел лечит древний дуб, добрый дятел дубу люб."
  • We count ALL characters (letters, spaces, punctuation)
  • Formula: p(x) = count(x) / total_length
  • Round to 2 decimal places
  • Example: if 'д' appears 5 times in text of length 50, p('д') = 5/50 = 0.10 or 10%

2. Calculating Entropy H(X)

  • Entropy measures average information content
  • Formula: H(X) = -∑p(x)log₂p(x) for all characters x
  • Take each probability p(x) we calculated
  • Multiply by its log₂ (negative)
  • Sum all these values
  • Higher entropy means more information/uncertainty

3. Uniform Code

  • Simplest encoding where all characters use same number of bits
  • Length l = ⌈log₂M⌉ where M is alphabet size (number of unique characters)
  • Example: if we have 8 unique characters, l = ⌈log₂8⌉ = 3 bits
  • Total sequence size N = l × text_length
  • This serves as baseline for compression ratios

4. Huffman Code

  • Gives optimal prefix codes (no codeword is prefix of another)
  • More frequent characters get shorter codes
  • Calculate average length l = ∑p(x)×length(code(x))
  • Calculate N = sum of all code lengths in encoded text
  • κ = uniform_N / huffman_N

Steps:

  1. Sort characters by frequency (ascending)
  2. Take two lowest frequency characters
  3. Create new node with their combined frequency
  4. Repeat until single tree remains
  5. Traverse tree: left=0, right=1

5. Shannon Code

  • Also gives prefix codes
  • Theoretically near optimal
  • Calculate same parameters (l, N, κ)

Steps:

  1. Sort characters by frequency (descending)
  2. Calculate q(x) = sum of frequencies before x
  3. For each character:
    • l(x) = ⌈-log₂p(x)⌉
    • c(x) = binary of (q(x) × 2^l(x)) truncated to l(x) bits

6. Gilbert-Moore Code

Special features:

  • Uses order of appearance in text
  • More suitable for adaptive coding

Steps:

  1. Keep characters in order of appearance
  2. For each character x:
    • q(x) = (sum of previous frequencies) + (current frequency/2)
    • σ(x) = q(x)
    • l(x) = ⌈-log₂p(x)⌉ + 1
    • c(x) = binary of (σ(x) × 2^l(x)) truncated to l(x) bits

For all methods

  • Average length l = ∑p(x)×length(code(x))
  • Sequence size N = sum of code lengths for entire text
  • Compression ratio κ = uniform_N / method_N

Specific example

Consider just "дятел":

  1. Frequencies:

    • д: 1/5 = 0.20
    • я: 1/5 = 0.20
    • т: 1/5 = 0.20
    • е: 1/5 = 0.20
    • л: 1/5 = 0.20
  2. Entropy: H = -5×(0.20×log₂(0.20)) ≈ 2.32 bits

  3. Uniform code:

    • 5 characters need l = ⌈log₂5⌉ = 3 bits each
    • N = 3×5 = 15 bits
  4. Huffman code (example):

    • All equal frequencies so arbitrary order:
    • д: 00
    • я: 01
    • т: 10
    • е: 110
    • л: 111
  5. Shannon code:

    • Sort by frequency (all equal here)
    • Calculate cumulative probabilities
    • Generate codes based on these values
  6. Gilbert-Moore:

    • Uses order as they appear in text
    • Accounts for future probabilities

The key difference between these methods:

  • Uniform: Simple but inefficient
  • Huffman: Optimal for fixed codes
  • Shannon: Theoretical basis for compression
  • Gilbert-Moore: Good for adaptive coding

The comparison table at the end shows:

  • Average code length (l)
  • Total sequence size (N)
  • Compression ratio (κ)

This helps evaluate which method is most efficient for the given text.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment