Created
April 11, 2023 06:43
-
-
Save audy/03dc2558faca4fd841ce145cc72f2d3c to your computer and use it in GitHub Desktop.
Encode ASCII text as DNA sequences
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import sys
from itertools import product
from typing import Dict, Tuple
def generate_codon_mapping(
    dna_alphabet="GATC",
    alphabet="|abcdefghijklmnopqrstuvwxyz.!?,;() 01234567890",
    codon_length=3,
) -> Dict[str, str]:
    """
    Return a dictionary mapping codon -> character.

    Codons are every string of ``codon_length`` letters drawn from
    ``dna_alphabet``, generated in ``itertools.product`` order. The i-th
    codon is paired with the i-th character of ``alphabet``; any surplus
    codons are left unassigned.

    "|" is reserved as a special character meaning START so you can decode
    DNA sequences that have been concatenated together.

    Raises:
        AssertionError: if there are fewer codons than characters in
            ``alphabet``.
    """
    codons = ["".join(letters) for letters in product(dna_alphabet, repeat=codon_length)]
    if len(codons) < len(alphabet):
        # Raised explicitly rather than via ``assert`` so the check is not
        # stripped when Python runs with -O.
        raise AssertionError("We require more codons!")
    return dict(zip(codons, alphabet))
def encode(input_string: str) -> str:
    """
    Encode ``input_string`` as a DNA sequence, one codon per character.

    Uses the default mapping from ``generate_codon_mapping()``, inverted to
    character -> codon. An empty input yields an empty string.

    Raises:
        KeyError: if ``input_string`` contains a character that is not in
            the mapping.
    """
    # Invert codon -> character. If a character appears under more than one
    # codon, the codon seen last wins.
    character_to_codon = {
        character: codon for codon, character in generate_codon_mapping().items()
    }
    try:
        return "".join(character_to_codon[character] for character in input_string)
    except KeyError as err:
        # Same exception type as a plain lookup, but say what went wrong.
        raise KeyError(
            f"cannot encode character {err.args[0]!r}: not in the codon alphabet"
        ) from None
def decode(input_string: str) -> str:
    """
    Raw transcribe function. Does not care about start codons.

    Splits ``input_string`` into consecutive 3-letter codons and looks each
    one up in the default mapping from ``generate_codon_mapping()``. An empty
    input yields an empty string.

    Raises:
        KeyError: if a codon is not in the mapping. This includes a trailing
            fragment when the input length is not a multiple of 3.
    """
    codon_to_character = generate_codon_mapping()
    codon_length = 3  # matches the default codon_length of generate_codon_mapping
    return "".join(
        codon_to_character[input_string[start : start + codon_length]]
        for start in range(0, len(input_string), codon_length)
    )
def test_encode_decode():
    """Check that decoding an encoded string returns the original text."""
    # Covers the START marker, letters, punctuation, spaces and a digit.
    test_string = "|the (quick?) fox jumped over; the lazy! d0g."
    assert decode(encode(test_string)) == test_string
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment