Created
February 6, 2023 20:45
-
-
Save Dhravya/a8406fee264c7f9ce2944a7bea3e878e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from rich import print | |
import time | |
FORBIDDEN_CHARS = ["⍼", "⳧"] | |
text= "really really really long text that has a lot of repeating stuff so that it can be coompressed" | |
def find_all_non_overlapping_repeating_substrings(text: str): | |
text = (text + ' ') if text[-1] != ' ' else text | |
repeating_substrings = {} | |
f = text | |
for i in range(len(text)): | |
for j in range(i + 2, min(i + 15, len(text))): | |
substring = text[i:j] | |
if substring in text[j:]: | |
if substring in repeating_substrings: | |
repeating_substrings[substring] += 1 | |
else: | |
repeating_substrings[substring] = 2 | |
for substring in list(repeating_substrings.keys()): | |
if repeating_substrings[substring] < 3: | |
repeating_substrings.pop(substring) | |
for substring in list(repeating_substrings.keys()): | |
for substring2 in list(repeating_substrings.keys()): | |
if substring != substring2 and substring in substring2: | |
repeating_substrings.pop(substring) | |
break | |
n_replace_ran = 0 | |
final_list = [] | |
def replace_shit(txt): | |
nonlocal n_replace_ran | |
nonlocal f | |
n_replace_ran += 1 | |
if f.count(txt) == value: | |
f = f.replace(txt, "") | |
final_list.append(txt) | |
n_replace_ran = 0 | |
else: | |
if 30 > n_replace_ran >= 15: | |
replace_shit(txt[:-1]) | |
elif n_replace_ran >= 30: | |
return | |
else: | |
replace_shit(txt[1:]) | |
for key in repeating_substrings: | |
value = repeating_substrings[key] | |
replace_shit(key) | |
return final_list | |
def find_all_unused_unicode(text: str, number: int = 10000): | |
# return [chr(i) for i in range(0, 10000) if chr(i) not in text] | |
found = [] | |
for i in range(500, 10000): | |
if chr(i) not in text: | |
found.append(chr(i)) | |
if len(found) >= number: | |
break | |
return found | |
def compress(text: str): | |
if any(char in text for char in FORBIDDEN_CHARS): | |
raise ValueError("Text contains forbidden characters") | |
substrings = find_all_non_overlapping_repeating_substrings(text) | |
unused_chars = find_all_unused_unicode(text, len(substrings)) | |
mapping = {} | |
for i in range(len(substrings)): | |
mapping[substrings[i]] = unused_chars[i] | |
for key in mapping: | |
text = text.replace(key, mapping[key]) | |
reference_string = "" | |
for key in mapping: | |
reference_string += f"{key}⳧{mapping[key]}⍼" | |
if len(substrings) == 0: | |
return text | |
return reference_string + "⍼⍼⍼" + text | |
def decompress(text: str): | |
if "⍼⍼⍼" not in text: | |
return text | |
reference_string, text = text.split("⍼⍼⍼⍼") | |
mapping = {} | |
for pair in reference_string.split("⍼"): | |
if pair != "": | |
try: | |
key, value = pair.split("⳧") | |
mapping[value] = key | |
except ValueError: | |
pass | |
for key in mapping: | |
text = text.replace(key, mapping[key]) | |
return text | |
def test(text): | |
compress_time = time.time() | |
compressed = compress(text) | |
print(f"Compression took {time.time() - compress_time:.2f} seconds") | |
decompress_time = time.time() | |
decompressed = decompress(compressed) | |
print(f"Decompression took {time.time() - decompress_time:.2f} seconds") | |
print("Compression/decompression successful:", decompressed == text) | |
if (decompressed != text): | |
loss = (len(text) - len(decompressed)) / len(text) * 100 | |
print("Loss in quality: ", loss, "%") | |
print(decompressed) | |
print("Is compressed smaller than original:", len(compressed) < len(text)) | |
print( | |
f"Compressed text is smaller by {(len(text) - len(compressed)) / len(text) * 100:.2f}%") | |
test(text) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment