|
import math |
|
from collections import Counter, defaultdict |
|
from typing import Dict, List, Tuple |
|
|
|
def calculate_frequencies(text: str) -> Dict[str, float]: |
|
print(f"\nДлина текста: {len(text)} символов") |
|
|
|
# Count occurrences |
|
counts = Counter(text) |
|
print("\nКоличество каждого символа:") |
|
for char, count in sorted(counts.items()): |
|
print(f"'{char}': {count} раз") |
|
|
|
# Calculate frequencies |
|
total = len(text) |
|
frequencies = {char: round(count/total, 2) for char, count in counts.items()} |
|
|
|
print("\nЧастоты символов p(x):") |
|
for char, freq in sorted(frequencies.items()): |
|
print(f"p('{char}') = {count}/{total} ≈ {freq:.2f}") |
|
|
|
return frequencies |
|
|
|
def calculate_entropy(frequencies: Dict[str, float]) -> float: |
|
components = [] |
|
total = 0 |
|
print("\nЭнтропия символов H(X):") |
|
for char, p in sorted(frequencies.items()): |
|
component = -p * math.log2(p) |
|
components.append(component) |
|
print(f"Для символа '{char}': -({p:.2f}) × log₂({p:.2f}) = {component:.4f}") |
|
total += component |
|
|
|
print(f"\nH(X) = {' + '.join([f'{c:.4f}' for c in components])} = {total:.4f} бит") |
|
return total |
|
|
|
def calculate_uniform_code_params(alphabet_size: int, text_length: int) -> Tuple[int, int]: |
|
l = math.ceil(math.log2(alphabet_size)) |
|
N = l * text_length |
|
|
|
print(f"\nРазмер алфавита M = {alphabet_size}") |
|
print(f"Длина кода l = ⌈log₂({alphabet_size})⌉ = ⌈{math.log2(alphabet_size):.4f}⌉ = {l}") |
|
print(f"Размер закодированной последовательности N = l × {text_length} = {l} × {text_length} = {N} бит") |
|
|
|
return l, N |
|
|
|
def build_huffman_codes(frequencies: Dict[str, float]) -> Dict[str, str]: |
|
class HuffmanNode: |
|
def __init__(self, char=None, freq=0): |
|
self.char = char |
|
self.freq = freq |
|
self.left = None |
|
self.right = None |
|
|
|
def __lt__(self, other): |
|
return self.freq < other.freq |
|
|
|
if len(frequencies) <= 1: |
|
return {next(iter(frequencies.keys())): '0'} |
|
|
|
# Create initial nodes |
|
nodes = [HuffmanNode(char, freq) for char, freq in frequencies.items()] |
|
print("\nНачальные узлы:") |
|
for node in sorted(nodes, key=lambda x: (-x.freq, x.char)): |
|
print(f"Символ: '{node.char}', Частота: {node.freq:.2f}") |
|
|
|
# Build tree |
|
print("\nПостроение дерева:") |
|
while len(nodes) > 1: |
|
nodes.sort(key=lambda x: (x.freq, x.char if x.char else '')) |
|
left = nodes.pop(0) |
|
right = nodes.pop(0) |
|
print(f"Объединяем: {left.char or 'внутр.'} ({left.freq:.2f}) и {right.char or 'внутр.'} ({right.freq:.2f})") |
|
|
|
internal = HuffmanNode(freq=left.freq + right.freq) |
|
internal.left = left |
|
internal.right = right |
|
nodes.append(internal) |
|
|
|
# Generate codes |
|
codes = {} |
|
def traverse(node, code=''): |
|
if node.char is not None: |
|
codes[node.char] = code |
|
print(f"Символ: '{node.char}', Код: {code}") |
|
return |
|
traverse(node.left, code + '0') |
|
traverse(node.right, code + '1') |
|
|
|
print("\nПолученные коды:") |
|
traverse(nodes[0]) |
|
return codes |
|
|
|
def build_shannon_codes(frequencies: Dict[str, float]) -> Tuple[Dict[str, str], Dict[str, float], Dict[str, int]]: |
|
# Sort by frequency |
|
sorted_chars = sorted(frequencies.items(), key=lambda x: (-x[1], x[0])) |
|
print("\nСимволы, отсортированные по частоте:") |
|
for char, freq in sorted_chars: |
|
print(f"'{char}': {freq:.2f}") |
|
|
|
# Calculate q(x) |
|
q = defaultdict(float) |
|
running_sum = 0 |
|
print("\nРасчет q(x):") |
|
for char, _ in sorted_chars: |
|
q[char] = running_sum |
|
print(f"q('{char}') = {running_sum:.4f}") |
|
running_sum += frequencies[char] |
|
|
|
# Calculate lengths and codes |
|
l = {char: math.ceil(-math.log2(frequencies[char])) for char, _ in sorted_chars} |
|
codes = {} |
|
print("\nПостроение кодов:") |
|
for char, _ in sorted_chars: |
|
length = l[char] |
|
code_value = int(q[char] * 2**length) |
|
codes[char] = format(code_value, f'0{length}b') |
|
print(f"Символ: '{char}'") |
|
print(f" l(x) = ⌈-log₂({frequencies[char]:.4f})⌉ = {length}") |
|
print(f" Код: {codes[char]}") |
|
print(f" q(x)2: {q[char]:.5f}") |
|
|
|
return codes, q, l |
|
|
|
def build_gilbert_moore_codes(text: str, frequencies: Dict[str, float]) -> Tuple[Dict[str, str], Dict[str, float], Dict[str, float], Dict[str, int]]: |
|
# Get unique characters in order of appearance |
|
seen = [] |
|
for char in text: |
|
if char not in seen: |
|
seen.append(char) |
|
|
|
print("\nПорядок появления символов в тексте:") |
|
print(''.join(seen)) |
|
|
|
# Calculate q(x) and σ(x) |
|
q = defaultdict(float) |
|
running_sum = 0 |
|
print("\nРасчет q(x) и σ(x):") |
|
for char in seen: |
|
q[char] = running_sum + frequencies[char]/2 |
|
print(f"'{char}': q(x) = {running_sum:.4f} + {frequencies[char]:.2f}/2 = {q[char]:.4f}") |
|
running_sum += frequencies[char] |
|
|
|
sigma = {char: q[char] for char in seen} |
|
|
|
# Calculate lengths and codes |
|
l = {char: math.ceil(-math.log2(frequencies[char])) + 1 for char in seen} |
|
codes = {} |
|
print("\nПостроение кодов:") |
|
for char in seen: |
|
length = l[char] |
|
code_value = int(sigma[char] * 2**length) |
|
codes[char] = format(code_value, f'0{length}b') |
|
print(f"Символ: '{char}'") |
|
print(f" l(x) = ⌈-log₂({frequencies[char]:.4f})⌉ + 1 = {length}") |
|
print(f" Код: {codes[char]}") |
|
print(f" σ(x)2: {sigma[char]:.5f}") |
|
|
|
return codes, q, sigma, l |
|
|
|
def calculate_avg_length(frequencies: Dict[str, float], codes: Dict[str, str]) -> float: |
|
components = [] |
|
for char in frequencies: |
|
component = frequencies[char] * len(codes[char]) |
|
components.append(component) |
|
print(f"p('{char}') × l('{char}') = {frequencies[char]:.2f} × {len(codes[char])} = {component:.4f}") |
|
|
|
avg_length = sum(components) |
|
print(f"\nСредняя длина l = {' + '.join([f'{c:.4f}' for c in components])} = {avg_length:.4f}") |
|
return avg_length |
|
|
|
def calculate_sequence_size(text: str, codes: Dict[str, str]) -> int: |
|
N = sum(len(codes[char]) for char in text) |
|
print(f"Размер последовательности N = {N} бит") |
|
return N |
|
|
|
# Main code |
|
# text = "дятел лечит древний дуб, добрый дятел дубу люб." |
|
# text = "даже шею, даже уши ты испачкал в черной туши." |
|
text = "шишкодробные шишки высушиваются в шишкосушилках." |
|
print("Исходный текст:", text) |
|
|
|
# Calculate frequencies |
|
print("\nШаг 1: Расчет частот символов") |
|
frequencies = calculate_frequencies(text) |
|
|
|
# Calculate entropy |
|
print("\nШаг 2: Расчет энтропии H(X)") |
|
print("\nH(X) = -∑p(x)log₂p(x)") |
|
H = calculate_entropy(frequencies) |
|
|
|
# Uniform code parameters |
|
print("\nШаг 3: Расчет параметров равномерного кода") |
|
uniform_l, uniform_N = calculate_uniform_code_params(len(frequencies), len(text)) |
|
|
|
# Huffman coding |
|
print("\nШаг 4: Построение кода Хаффмана") |
|
huffman_codes = build_huffman_codes(frequencies) |
|
print("\nРасчет средней длины кода Хаффмана:") |
|
huffman_l = calculate_avg_length(frequencies, huffman_codes) |
|
huffman_N = calculate_sequence_size(text, huffman_codes) |
|
huffman_k = uniform_N / huffman_N |
|
print(f"Коэффициент сжатия κ = {uniform_N}/{huffman_N} = {huffman_k:.4f}") |
|
|
|
# Shannon coding |
|
print("\nШаг 5: Построение кода Шеннона") |
|
shannon_codes, shannon_q, shannon_l = build_shannon_codes(frequencies) |
|
print("\nРасчет средней длины кода Шеннона:") |
|
shannon_avg_l = calculate_avg_length(frequencies, shannon_codes) |
|
shannon_N = calculate_sequence_size(text, shannon_codes) |
|
shannon_k = uniform_N / shannon_N |
|
print(f"Коэффициент сжатия κ = {uniform_N}/{shannon_N} = {shannon_k:.4f}") |
|
|
|
# Gilbert-Moore coding |
|
print("\nШаг 6: Построение кода Гилберта-Мура") |
|
gm_codes, gm_q, gm_sigma, gm_l = build_gilbert_moore_codes(text, frequencies) |
|
print("\nРасчет средней длины кода Гилберта-Мура:") |
|
gm_avg_l = calculate_avg_length(frequencies, gm_codes) |
|
gm_N = calculate_sequence_size(text, gm_codes) |
|
gm_k = uniform_N / gm_N |
|
print(f"Коэффициент сжатия κ = {uniform_N}/{gm_N} = {gm_k:.4f}") |
|
|
|
# Print final tables |
|
print("\n" + "="*50) |
|
print("ИТОГОВЫЕ ТАБЛИЦЫ:") |
|
print("="*50) |
|
|
|
print("\nКод Хаффмана") |
|
print(f"{'x':4} {'p(x)':>6} {'c(x)':8} {'l(x)':>4}") |
|
for char in sorted(frequencies.keys()): |
|
print(f"{char:4} {frequencies[char]:6.2f} {huffman_codes[char]:8} {len(huffman_codes[char]):4}") |
|
|
|
print("\nКод Шеннона") |
|
print(f"{'x':4} {'p(x)':>6} {'q(x)':>8} {'l(x)':>4} {'q(x)2':8} {'c(x)':8}") |
|
for char in sorted(frequencies.keys()): |
|
print(f"{char:4} {frequencies[char]:6.2f} {shannon_q[char]:8.4f} {shannon_l[char]:4} {format(int(shannon_q[char] * 2**shannon_l[char]), f'0{shannon_l[char]}b'):8} {shannon_codes[char]:8}") |
|
|
|
print("\nКод Гилберта-Мура") |
|
print(f"{'x':4} {'p(x)':>6} {'q(x)':>8} {'σ(x)':>8} {'σ(x)2':8} {'l(x)':>4} {'c(x)':8}") |
|
for char in sorted(frequencies.keys()): |
|
print(f"{char:4} {frequencies[char]:6.2f} {gm_q[char]:8.4f} {gm_sigma[char]:8.4f} {format(int(gm_sigma[char] * 2**gm_l[char]), f'0{gm_l[char]}b'):8} {gm_l[char]:4} {gm_codes[char]:8}") |
|
|
|
print("\nСравнительная таблица") |
|
print(f"{'Код':16} {'l':>8} {'N':>8} {'κ':>8}") |
|
print(f"{'Равномерный':16} {uniform_l:8} {uniform_N:8} {1:8.4f}") |
|
print(f"{'Гилберт-Мур':16} {gm_avg_l:8.2f} {gm_N:8} {gm_k:8.4f}") |
|
print(f"{'Шеннон':16} {shannon_avg_l:8.2f} {shannon_N:8} {shannon_k:8.4f}") |
|
print(f"{'Хаффман':16} {huffman_l:8.2f} {huffman_N:8} {huffman_k:8.4f}") |
|
print(f"\nH(X) = {H:.4f}") |