Last active
January 28, 2021 00:09
-
-
Save damiankao/81b6ebd123b9ccf98e0e47f1ddd3ddd5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
Naive probabilistic approach to reverse-engineering the vaccine sequence:
1. For every codon-position, base, and amino-acid result, calculate a probability of base change. | |
2. Apply this probability to the viral sequence and generate a vaccine sequence. | |
3. Compare the generated vaccine sequence to the known vaccine sequence and check for % match. | |
This code is in reference to: | |
https://berthub.eu/articles/posts/part-2-reverse-engineering-source-code-of-the-biontech-pfizer-vaccine/ | |
Requires python3 | |
''' | |
from collections import Counter,defaultdict | |
import random | |
codon_comparison = [x.strip().split(',')[1:] for x in '''0,ATG,ATG | |
3,TTT,TTC | |
6,GTT,GTG | |
9,TTT,TTC | |
12,CTT,CTG | |
15,GTT,GTG | |
18,TTA,CTG | |
21,TTG,CTG | |
24,CCA,CCT | |
27,CTA,CTG | |
30,GTC,GTG | |
33,TCT,TCC | |
36,AGT,AGC | |
39,CAG,CAG | |
42,TGT,TGT | |
45,GTT,GTG | |
48,AAT,AAC | |
51,CTT,CTG | |
54,ACA,ACC | |
57,ACC,ACC | |
60,AGA,AGA | |
63,ACT,ACA | |
66,CAA,CAG | |
69,TTA,CTG | |
72,CCC,CCT | |
75,CCT,CCA | |
78,GCA,GCC | |
81,TAC,TAC | |
84,ACT,ACC | |
87,AAT,AAC | |
90,TCT,AGC | |
93,TTC,TTT | |
96,ACA,ACC | |
99,CGT,AGA | |
102,GGT,GGC | |
105,GTT,GTG | |
108,TAT,TAC | |
111,TAC,TAC | |
114,CCT,CCC | |
117,GAC,GAC | |
120,AAA,AAG | |
123,GTT,GTG | |
126,TTC,TTC | |
129,AGA,AGA | |
132,TCC,TCC | |
135,TCA,AGC | |
138,GTT,GTG | |
141,TTA,CTG | |
144,CAT,CAC | |
147,TCA,TCT | |
150,ACT,ACC | |
153,CAG,CAG | |
156,GAC,GAC | |
159,TTG,CTG | |
162,TTC,TTC | |
165,TTA,CTG | |
168,CCT,CCT | |
171,TTC,TTC | |
174,TTT,TTC | |
177,TCC,AGC | |
180,AAT,AAC | |
183,GTT,GTG | |
186,ACT,ACC | |
189,TGG,TGG | |
192,TTC,TTC | |
195,CAT,CAC | |
198,GCT,GCC | |
201,ATA,ATC | |
204,CAT,CAC | |
207,GTC,GTG | |
210,TCT,TCC | |
213,GGG,GGC | |
216,ACC,ACC | |
219,AAT,AAT | |
222,GGT,GGC | |
225,ACT,ACC | |
228,AAG,AAG | |
231,AGG,AGA | |
234,TTT,TTC | |
237,GAT,GAC | |
240,AAC,AAC | |
243,CCT,CCC | |
246,GTC,GTG | |
249,CTA,CTG | |
252,CCA,CCC | |
255,TTT,TTC | |
258,AAT,AAC | |
261,GAT,GAC | |
264,GGT,GGG | |
267,GTT,GTG | |
270,TAT,TAC | |
273,TTT,TTT | |
276,GCT,GCC | |
279,TCC,AGC | |
282,ACT,ACC | |
285,GAG,GAG | |
288,AAG,AAG | |
291,TCT,TCC | |
294,AAC,AAC | |
297,ATA,ATC | |
300,ATA,ATC | |
303,AGA,AGA | |
306,GGC,GGC | |
309,TGG,TGG | |
312,ATT,ATC | |
315,TTT,TTC | |
318,GGT,GGC | |
321,ACT,ACC | |
324,ACT,ACA | |
327,TTA,CTG | |
330,GAT,GAC | |
333,TCG,AGC | |
336,AAG,AAG | |
339,ACC,ACC | |
342,CAG,CAG | |
345,TCC,AGC | |
348,CTA,CTG | |
351,CTT,CTG | |
354,ATT,ATC | |
357,GTT,GTG | |
360,AAT,AAC | |
363,AAC,AAC | |
366,GCT,GCC | |
369,ACT,ACC | |
372,AAT,AAC | |
375,GTT,GTG | |
378,GTT,GTC | |
381,ATT,ATC | |
384,AAA,AAA | |
387,GTC,GTG | |
390,TGT,TGC | |
393,GAA,GAG | |
396,TTT,TTC | |
399,CAA,CAG | |
402,TTT,TTC | |
405,TGT,TGC | |
408,AAT,AAC | |
411,GAT,GAC | |
414,CCA,CCC | |
417,TTT,TTC | |
420,TTG,CTG | |
423,GGT,GGC | |
426,GTT,GTC | |
429,TAT,TAC | |
432,TAC,TAC | |
435,CAC,CAC | |
438,AAA,AAG | |
441,AAC,AAC | |
444,AAC,AAC | |
447,AAA,AAG | |
450,AGT,AGC | |
453,TGG,TGG | |
456,ATG,ATG | |
459,GAA,GAA | |
462,AGT,AGC | |
465,GAG,GAG | |
468,TTC,TTC | |
471,AGA,CGG | |
474,GTT,GTG | |
477,TAT,TAC | |
480,TCT,AGC | |
483,AGT,AGC | |
486,GCG,GCC | |
489,AAT,AAC | |
492,AAT,AAC | |
495,TGC,TGC | |
498,ACT,ACC | |
501,TTT,TTC | |
504,GAA,GAG | |
507,TAT,TAC | |
510,GTC,GTG | |
513,TCT,TCC | |
516,CAG,CAG | |
519,CCT,CCT | |
522,TTT,TTC | |
525,CTT,CTG | |
528,ATG,ATG | |
531,GAC,GAC | |
534,CTT,CTG | |
537,GAA,GAA | |
540,GGA,GGC | |
543,AAA,AAG | |
546,CAG,CAG | |
549,GGT,GGC | |
552,AAT,AAC | |
555,TTC,TTC | |
558,AAA,AAG | |
561,AAT,AAC | |
564,CTT,CTG | |
567,AGG,CGC | |
570,GAA,GAG | |
573,TTT,TTC | |
576,GTG,GTG | |
579,TTT,TTT | |
582,AAG,AAG | |
585,AAT,AAC | |
588,ATT,ATC | |
591,GAT,GAC | |
594,GGT,GGC | |
597,TAT,TAC | |
600,TTT,TTC | |
603,AAA,AAG | |
606,ATA,ATC | |
609,TAT,TAC | |
612,TCT,AGC | |
615,AAG,AAG | |
618,CAC,CAC | |
621,ACG,ACC | |
624,CCT,CCT | |
627,ATT,ATC | |
630,AAT,AAC | |
633,TTA,CTC | |
636,GTG,GTG | |
639,CGT,CGG | |
642,GAT,GAT | |
645,CTC,CTG | |
648,CCT,CCT | |
651,CAG,CAG | |
654,GGT,GGC | |
657,TTT,TTC | |
660,TCG,TCT | |
663,GCT,GCT | |
666,TTA,CTG | |
669,GAA,GAA | |
672,CCA,CCC | |
675,TTG,CTG | |
678,GTA,GTG | |
681,GAT,GAT | |
684,TTG,CTG | |
687,CCA,CCC | |
690,ATA,ATC | |
693,GGT,GGC | |
696,ATT,ATC | |
699,AAC,AAC | |
702,ATC,ATC | |
705,ACT,ACC | |
708,AGG,CGG | |
711,TTT,TTT | |
714,CAA,CAG | |
717,ACT,ACA | |
720,TTA,CTG | |
723,CTT,CTG | |
726,GCT,GCC | |
729,TTA,CTG | |
732,CAT,CAC | |
735,AGA,AGA | |
738,AGT,AGC | |
741,TAT,TAC | |
744,TTG,CTG | |
747,ACT,ACA | |
750,CCT,CCT | |
753,GGT,GGC | |
756,GAT,GAT | |
759,TCT,AGC | |
762,TCT,AGC | |
765,TCA,AGC | |
768,GGT,GGA | |
771,TGG,TGG | |
774,ACA,ACA | |
777,GCT,GCT | |
780,GGT,GGT | |
783,GCT,GCC | |
786,GCA,GCC | |
789,GCT,GCT | |
792,TAT,TAC | |
795,TAT,TAT | |
798,GTG,GTG | |
801,GGT,GGC | |
804,TAT,TAC | |
807,CTT,CTG | |
810,CAA,CAG | |
813,CCT,CCT | |
816,AGG,AGA | |
819,ACT,ACC | |
822,TTT,TTC | |
825,CTA,CTG | |
828,TTA,CTG | |
831,AAA,AAG | |
834,TAT,TAC | |
837,AAT,AAC | |
840,GAA,GAG | |
843,AAT,AAC | |
846,GGA,GGC | |
849,ACC,ACC | |
852,ATT,ATC | |
855,ACA,ACC | |
858,GAT,GAC | |
861,GCT,GCC | |
864,GTA,GTG | |
867,GAC,GAT | |
870,TGT,TGT | |
873,GCA,GCT | |
876,CTT,CTG | |
879,GAC,GAT | |
882,CCT,CCT | |
885,CTC,CTG | |
888,TCA,AGC | |
891,GAA,GAG | |
894,ACA,ACA | |
897,AAG,AAG | |
900,TGT,TGC | |
903,ACG,ACC | |
906,TTG,CTG | |
909,AAA,AAG | |
912,TCC,TCC | |
915,TTC,TTC | |
918,ACT,ACC | |
921,GTA,GTG | |
924,GAA,GAA | |
927,AAA,AAG | |
930,GGA,GGC | |
933,ATC,ATC | |
936,TAT,TAC | |
939,CAA,CAG | |
942,ACT,ACC | |
945,TCT,AGC | |
948,AAC,AAC | |
951,TTT,TTC | |
954,AGA,CGG | |
957,GTC,GTG | |
960,CAA,CAG | |
963,CCA,CCC | |
966,ACA,ACC | |
969,GAA,GAA | |
972,TCT,TCC | |
975,ATT,ATC | |
978,GTT,GTG | |
981,AGA,CGG | |
984,TTT,TTC | |
987,CCT,CCC | |
990,AAT,AAT | |
993,ATT,ATC | |
996,ACA,ACC | |
999,AAC,AAT | |
1002,TTG,CTG | |
1005,TGC,TGC | |
1008,CCT,CCC | |
1011,TTT,TTC | |
1014,GGT,GGC | |
1017,GAA,GAG | |
1020,GTT,GTG | |
1023,TTT,TTC | |
1026,AAC,AAT | |
1029,GCC,GCC | |
1032,ACC,ACC | |
1035,AGA,AGA | |
1038,TTT,TTC | |
1041,GCA,GCC | |
1044,TCT,TCT | |
1047,GTT,GTG | |
1050,TAT,TAC | |
1053,GCT,GCC | |
1056,TGG,TGG | |
1059,AAC,AAC | |
1062,AGG,CGG | |
1065,AAG,AAG | |
1068,AGA,CGG | |
1071,ATC,ATC | |
1074,AGC,AGC | |
1077,AAC,AAT | |
1080,TGT,TGC | |
1083,GTT,GTG | |
1086,GCT,GCC | |
1089,GAT,GAC | |
1092,TAT,TAC | |
1095,TCT,TCC | |
1098,GTC,GTG | |
1101,CTA,CTG | |
1104,TAT,TAC | |
1107,AAT,AAC | |
1110,TCC,TCC | |
1113,GCA,GCC | |
1116,TCA,AGC | |
1119,TTT,TTC | |
1122,TCC,AGC | |
1125,ACT,ACC | |
1128,TTT,TTC | |
1131,AAG,AAG | |
1134,TGT,TGC | |
1137,TAT,TAC | |
1140,GGA,GGC | |
1143,GTG,GTG | |
1146,TCT,TCC | |
1149,CCT,CCT | |
1152,ACT,ACC | |
1155,AAA,AAG | |
1158,TTA,CTG | |
1161,AAT,AAC | |
1164,GAT,GAC | |
1167,CTC,CTG | |
1170,TGC,TGC | |
1173,TTT,TTC | |
1176,ACT,ACA | |
1179,AAT,AAC | |
1182,GTC,GTG | |
1185,TAT,TAC | |
1188,GCA,GCC | |
1191,GAT,GAC | |
1194,TCA,AGC | |
1197,TTT,TTC | |
1200,GTA,GTG | |
1203,ATT,ATC | |
1206,AGA,CGG | |
1209,GGT,GGA | |
1212,GAT,GAT | |
1215,GAA,GAA | |
1218,GTC,GTG | |
1221,AGA,CGG | |
1224,CAA,CAG | |
1227,ATC,ATT | |
1230,GCT,GCC | |
1233,CCA,CCT | |
1236,GGG,GGA | |
1239,CAA,CAG | |
1242,ACT,ACA | |
1245,GGA,GGC | |
1248,AAG,AAG | |
1251,ATT,ATC | |
1254,GCT,GCC | |
1257,GAT,GAC | |
1260,TAT,TAC | |
1263,AAT,AAC | |
1266,TAT,TAC | |
1269,AAA,AAG | |
1272,TTA,CTG | |
1275,CCA,CCC | |
1278,GAT,GAC | |
1281,GAT,GAC | |
1284,TTT,TTC | |
1287,ACA,ACC | |
1290,GGC,GGC | |
1293,TGC,TGT | |
1296,GTT,GTG | |
1299,ATA,ATT | |
1302,GCT,GCC | |
1305,TGG,TGG | |
1308,AAT,AAC | |
1311,TCT,AGC | |
1314,AAC,AAC | |
1317,AAT,AAC | |
1320,CTT,CTG | |
1323,GAT,GAC | |
1326,TCT,TCC | |
1329,AAG,AAA | |
1332,GTT,GTC | |
1335,GGT,GGC | |
1338,GGT,GGC | |
1341,AAT,AAC | |
1344,TAT,TAC | |
1347,AAT,AAT | |
1350,TAC,TAC | |
1353,CTG,CTG | |
1356,TAT,TAC | |
1359,AGA,CGG | |
1362,TTG,CTG | |
1365,TTT,TTC | |
1368,AGG,CGG | |
1371,AAG,AAG | |
1374,TCT,TCC | |
1377,AAT,AAT | |
1380,CTC,CTG | |
1383,AAA,AAG | |
1386,CCT,CCC | |
1389,TTT,TTC | |
1392,GAG,GAG | |
1395,AGA,CGG | |
1398,GAT,GAC | |
1401,ATT,ATC | |
1404,TCA,TCC | |
1407,ACT,ACC | |
1410,GAA,GAG | |
1413,ATC,ATC | |
1416,TAT,TAT | |
1419,CAG,CAG | |
1422,GCC,GCC | |
1425,GGT,GGC | |
1428,AGC,AGC | |
1431,ACA,ACC | |
1434,CCT,CCT | |
1437,TGT,TGT | |
1440,AAT,AAC | |
1443,GGT,GGC | |
1446,GTT,GTG | |
1449,GAA,GAA | |
1452,GGT,GGC | |
1455,TTT,TTC | |
1458,AAT,AAC | |
1461,TGT,TGC | |
1464,TAC,TAC | |
1467,TTT,TTC | |
1470,CCT,CCA | |
1473,TTA,CTG | |
1476,CAA,CAG | |
1479,TCA,TCC | |
1482,TAT,TAC | |
1485,GGT,GGC | |
1488,TTC,TTT | |
1491,CAA,CAG | |
1494,CCC,CCC | |
1497,ACT,ACA | |
1500,AAT,AAT | |
1503,GGT,GGC | |
1506,GTT,GTG | |
1509,GGT,GGC | |
1512,TAC,TAT | |
1515,CAA,CAG | |
1518,CCA,CCC | |
1521,TAC,TAC | |
1524,AGA,AGA | |
1527,GTA,GTG | |
1530,GTA,GTG | |
1533,GTA,GTG | |
1536,CTT,CTG | |
1539,TCT,AGC | |
1542,TTT,TTC | |
1545,GAA,GAA | |
1548,CTT,CTG | |
1551,CTA,CTG | |
1554,CAT,CAT | |
1557,GCA,GCC | |
1560,CCA,CCT | |
1563,GCA,GCC | |
1566,ACT,ACA | |
1569,GTT,GTG | |
1572,TGT,TGC | |
1575,GGA,GGC | |
1578,CCT,CCT | |
1581,AAA,AAG | |
1584,AAG,AAA | |
1587,TCT,AGC | |
1590,ACT,ACC | |
1593,AAT,AAT | |
1596,TTG,CTC | |
1599,GTT,GTG | |
1602,AAA,AAG | |
1605,AAC,AAC | |
1608,AAA,AAA | |
1611,TGT,TGC | |
1614,GTC,GTG | |
1617,AAT,AAC | |
1620,TTC,TTC | |
1623,AAC,AAC | |
1626,TTC,TTC | |
1629,AAT,AAC | |
1632,GGT,GGC | |
1635,TTA,CTG | |
1638,ACA,ACC | |
1641,GGC,GGC | |
1644,ACA,ACC | |
1647,GGT,GGC | |
1650,GTT,GTG | |
1653,CTT,CTG | |
1656,ACT,ACA | |
1659,GAG,GAG | |
1662,TCT,AGC | |
1665,AAC,AAC | |
1668,AAA,AAG | |
1671,AAG,AAG | |
1674,TTT,TTC | |
1677,CTG,CTG | |
1680,CCT,CCA | |
1683,TTC,TTC | |
1686,CAA,CAG | |
1689,CAA,CAG | |
1692,TTT,TTT | |
1695,GGC,GGC | |
1698,AGA,CGG | |
1701,GAC,GAT | |
1704,ATT,ATC | |
1707,GCT,GCC | |
1710,GAC,GAT | |
1713,ACT,ACC | |
1716,ACT,ACA | |
1719,GAT,GAC | |
1722,GCT,GCC | |
1725,GTC,GTT | |
1728,CGT,AGA | |
1731,GAT,GAT | |
1734,CCA,CCC | |
1737,CAG,CAG | |
1740,ACA,ACA | |
1743,CTT,CTG | |
1746,GAG,GAA | |
1749,ATT,ATC | |
1752,CTT,CTG | |
1755,GAC,GAC | |
1758,ATT,ATC | |
1761,ACA,ACC | |
1764,CCA,CCT | |
1767,TGT,TGC | |
1770,TCT,AGC | |
1773,TTT,TTC | |
1776,GGT,GGC | |
1779,GGT,GGA | |
1782,GTC,GTG | |
1785,AGT,TCT | |
1788,GTT,GTG | |
1791,ATA,ATC | |
1794,ACA,ACC | |
1797,CCA,CCT | |
1800,GGA,GGC | |
1803,ACA,ACC | |
1806,AAT,AAC | |
1809,ACT,ACC | |
1812,TCT,AGC | |
1815,AAC,AAT | |
1818,CAG,CAG | |
1821,GTT,GTG | |
1824,GCT,GCA | |
1827,GTT,GTG | |
1830,CTT,CTG | |
1833,TAT,TAC | |
1836,CAG,CAG | |
1839,GAT,GAC | |
1842,GTT,GTG | |
1845,AAC,AAC | |
1848,TGC,TGT | |
1851,ACA,ACC | |
1854,GAA,GAA | |
1857,GTC,GTG | |
1860,CCT,CCC | |
1863,GTT,GTG | |
1866,GCT,GCC | |
1869,ATT,ATT | |
1872,CAT,CAC | |
1875,GCA,GCC | |
1878,GAT,GAT | |
1881,CAA,CAG | |
1884,CTT,CTG | |
1887,ACT,ACA | |
1890,CCT,CCT | |
1893,ACT,ACA | |
1896,TGG,TGG | |
1899,CGT,CGG | |
1902,GTT,GTG | |
1905,TAT,TAC | |
1908,TCT,TCC | |
1911,ACA,ACC | |
1914,GGT,GGC | |
1917,TCT,AGC | |
1920,AAT,AAT | |
1923,GTT,GTG | |
1926,TTT,TTT | |
1929,CAA,CAG | |
1932,ACA,ACC | |
1935,CGT,AGA | |
1938,GCA,GCC | |
1941,GGC,GGC | |
1944,TGT,TGT | |
1947,TTA,CTG | |
1950,ATA,ATC | |
1953,GGG,GGA | |
1956,GCT,GCC | |
1959,GAA,GAG | |
1962,CAT,CAC | |
1965,GTC,GTG | |
1968,AAC,AAC | |
1971,AAC,AAT | |
1974,TCA,AGC | |
1977,TAT,TAC | |
1980,GAG,GAG | |
1983,TGT,TGC | |
1986,GAC,GAC | |
1989,ATA,ATC | |
1992,CCC,CCC | |
1995,ATT,ATC | |
1998,GGT,GGC | |
2001,GCA,GCT | |
2004,GGT,GGA | |
2007,ATA,ATC | |
2010,TGC,TGC | |
2013,GCT,GCC | |
2016,AGT,AGC | |
2019,TAT,TAC | |
2022,CAG,CAG | |
2025,ACT,ACA | |
2028,CAG,CAG | |
2031,ACT,ACA | |
2034,AAT,AAC | |
2037,TCT,AGC | |
2040,CCT,CCT | |
2043,CGG,CGG | |
2046,CGG,AGA | |
2049,GCA,GCC | |
2052,CGT,AGA | |
2055,AGT,AGC | |
2058,GTA,GTG | |
2061,GCT,GCC | |
2064,AGT,AGC | |
2067,CAA,CAG | |
2070,TCC,AGC | |
2073,ATC,ATC | |
2076,ATT,ATT | |
2079,GCC,GCC | |
2082,TAC,TAC | |
2085,ACT,ACA | |
2088,ATG,ATG | |
2091,TCA,TCT | |
2094,CTT,CTG | |
2097,GGT,GGC | |
2100,GCA,GCC | |
2103,GAA,GAG | |
2106,AAT,AAC | |
2109,TCA,AGC | |
2112,GTT,GTG | |
2115,GCT,GCC | |
2118,TAC,TAC | |
2121,TCT,TCC | |
2124,AAT,AAC | |
2127,AAC,AAC | |
2130,TCT,TCT | |
2133,ATT,ATC | |
2136,GCC,GCT | |
2139,ATA,ATC | |
2142,CCC,CCC | |
2145,ACA,ACC | |
2148,AAT,AAC | |
2151,TTT,TTC | |
2154,ACT,ACC | |
2157,ATT,ATC | |
2160,AGT,AGC | |
2163,GTT,GTG | |
2166,ACC,ACC | |
2169,ACA,ACA | |
2172,GAA,GAG | |
2175,ATT,ATC | |
2178,CTA,CTG | |
2181,CCA,CCT | |
2184,GTG,GTG | |
2187,TCT,TCC | |
2190,ATG,ATG | |
2193,ACC,ACC | |
2196,AAG,AAG | |
2199,ACA,ACC | |
2202,TCA,AGC | |
2205,GTA,GTG | |
2208,GAT,GAC | |
2211,TGT,TGC | |
2214,ACA,ACC | |
2217,ATG,ATG | |
2220,TAC,TAC | |
2223,ATT,ATC | |
2226,TGT,TGC | |
2229,GGT,GGC | |
2232,GAT,GAT | |
2235,TCA,TCC | |
2238,ACT,ACC | |
2241,GAA,GAG | |
2244,TGC,TGC | |
2247,AGC,TCC | |
2250,AAT,AAC | |
2253,CTT,CTG | |
2256,TTG,CTG | |
2259,TTG,CTG | |
2262,CAA,CAG | |
2265,TAT,TAC | |
2268,GGC,GGC | |
2271,AGT,AGC | |
2274,TTT,TTC | |
2277,TGT,TGC | |
2280,ACA,ACC | |
2283,CAA,CAG | |
2286,TTA,CTG | |
2289,AAC,AAT | |
2292,CGT,AGA | |
2295,GCT,GCC | |
2298,TTA,CTG | |
2301,ACT,ACA | |
2304,GGA,GGG | |
2307,ATA,ATC | |
2310,GCT,GCC | |
2313,GTT,GTG | |
2316,GAA,GAA | |
2319,CAA,CAG | |
2322,GAC,GAC | |
2325,AAA,AAG | |
2328,AAC,AAC | |
2331,ACC,ACC | |
2334,CAA,CAA | |
2337,GAA,GAG | |
2340,GTT,GTG | |
2343,TTT,TTC | |
2346,GCA,GCC | |
2349,CAA,CAA | |
2352,GTC,GTG | |
2355,AAA,AAG | |
2358,CAA,CAG | |
2361,ATT,ATC | |
2364,TAC,TAC | |
2367,AAA,AAG | |
2370,ACA,ACC | |
2373,CCA,CCT | |
2376,CCA,CCT | |
2379,ATT,ATC | |
2382,AAA,AAG | |
2385,GAT,GAC | |
2388,TTT,TTC | |
2391,GGT,GGC | |
2394,GGT,GGC | |
2397,TTT,TTC | |
2400,AAT,AAT | |
2403,TTT,TTC | |
2406,TCA,AGC | |
2409,CAA,CAG | |
2412,ATA,ATT | |
2415,TTA,CTG | |
2418,CCA,CCC | |
2421,GAT,GAT | |
2424,CCA,CCT | |
2427,TCA,AGC | |
2430,AAA,AAG | |
2433,CCA,CCC | |
2436,AGC,AGC | |
2439,AAG,AAG | |
2442,AGG,CGG | |
2445,TCA,AGC | |
2448,TTT,TTC | |
2451,ATT,ATC | |
2454,GAA,GAG | |
2457,GAT,GAC | |
2460,CTA,CTG | |
2463,CTT,CTG | |
2466,TTC,TTC | |
2469,AAC,AAC | |
2472,AAA,AAA | |
2475,GTG,GTG | |
2478,ACA,ACA | |
2481,CTT,CTG | |
2484,GCA,GCC | |
2487,GAT,GAC | |
2490,GCT,GCC | |
2493,GGC,GGC | |
2496,TTC,TTC | |
2499,ATC,ATC | |
2502,AAA,AAG | |
2505,CAA,CAG | |
2508,TAT,TAT | |
2511,GGT,GGC | |
2514,GAT,GAT | |
2517,TGC,TGT | |
2520,CTT,CTG | |
2523,GGT,GGC | |
2526,GAT,GAC | |
2529,ATT,ATT | |
2532,GCT,GCC | |
2535,GCT,GCC | |
2538,AGA,AGG | |
2541,GAC,GAT | |
2544,CTC,CTG | |
2547,ATT,ATT | |
2550,TGT,TGC | |
2553,GCA,GCC | |
2556,CAA,CAG | |
2559,AAG,AAG | |
2562,TTT,TTT | |
2565,AAC,AAC | |
2568,GGC,GGA | |
2571,CTT,CTG | |
2574,ACT,ACA | |
2577,GTT,GTG | |
2580,TTG,CTG | |
2583,CCA,CCT | |
2586,CCT,CCT | |
2589,TTG,CTG | |
2592,CTC,CTG | |
2595,ACA,ACC | |
2598,GAT,GAT | |
2601,GAA,GAG | |
2604,ATG,ATG | |
2607,ATT,ATC | |
2610,GCT,GCC | |
2613,CAA,CAG | |
2616,TAC,TAC | |
2619,ACT,ACA | |
2622,TCT,TCT | |
2625,GCA,GCC | |
2628,CTG,CTG | |
2631,TTA,CTG | |
2634,GCG,GCC | |
2637,GGT,GGC | |
2640,ACA,ACA | |
2643,ATC,ATC | |
2646,ACT,ACA | |
2649,TCT,AGC | |
2652,GGT,GGC | |
2655,TGG,TGG | |
2658,ACC,ACA | |
2661,TTT,TTT | |
2664,GGT,GGA | |
2667,GCA,GCA | |
2670,GGT,GGC | |
2673,GCT,GCC | |
2676,GCA,GCT | |
2679,TTA,CTG | |
2682,CAA,CAG | |
2685,ATA,ATC | |
2688,CCA,CCC | |
2691,TTT,TTT | |
2694,GCT,GCT | |
2697,ATG,ATG | |
2700,CAA,CAG | |
2703,ATG,ATG | |
2706,GCT,GCC | |
2709,TAT,TAC | |
2712,AGG,CGG | |
2715,TTT,TTC | |
2718,AAT,AAC | |
2721,GGT,GGC | |
2724,ATT,ATC | |
2727,GGA,GGA | |
2730,GTT,GTG | |
2733,ACA,ACC | |
2736,CAG,CAG | |
2739,AAT,AAT | |
2742,GTT,GTG | |
2745,CTC,CTG | |
2748,TAT,TAC | |
2751,GAG,GAG | |
2754,AAC,AAC | |
2757,CAA,CAG | |
2760,AAA,AAG | |
2763,TTG,CTG | |
2766,ATT,ATC | |
2769,GCC,GCC | |
2772,AAC,AAC | |
2775,CAA,CAG | |
2778,TTT,TTC | |
2781,AAT,AAC | |
2784,AGT,AGC | |
2787,GCT,GCC | |
2790,ATT,ATC | |
2793,GGC,GGC | |
2796,AAA,AAG | |
2799,ATT,ATC | |
2802,CAA,CAG | |
2805,GAC,GAC | |
2808,TCA,AGC | |
2811,CTT,CTG | |
2814,TCT,AGC | |
2817,TCC,AGC | |
2820,ACA,ACA | |
2823,GCA,GCA | |
2826,AGT,AGC | |
2829,GCA,GCC | |
2832,CTT,CTG | |
2835,GGA,GGA | |
2838,AAA,AAG | |
2841,CTT,CTG | |
2844,CAA,CAG | |
2847,GAT,GAC | |
2850,GTG,GTG | |
2853,GTC,GTC | |
2856,AAC,AAC | |
2859,CAA,CAG | |
2862,AAT,AAT | |
2865,GCA,GCC | |
2868,CAA,CAG | |
2871,GCT,GCA | |
2874,TTA,CTG | |
2877,AAC,AAC | |
2880,ACG,ACC | |
2883,CTT,CTG | |
2886,GTT,GTC | |
2889,AAA,AAG | |
2892,CAA,CAG | |
2895,CTT,CTG | |
2898,AGC,TCC | |
2901,TCC,TCC | |
2904,AAT,AAC | |
2907,TTT,TTC | |
2910,GGT,GGC | |
2913,GCA,GCC | |
2916,ATT,ATC | |
2919,TCA,AGC | |
2922,AGT,TCT | |
2925,GTT,GTG | |
2928,TTA,CTG | |
2931,AAT,AAC | |
2934,GAT,GAT | |
2937,ATC,ATC | |
2940,CTT,CTG | |
2943,TCA,AGC | |
2946,CGT,AGA | |
2949,CTT,CTG | |
2952,GAC,GAC | |
2955,AAA,CCT | |
2958,GTT,CCT | |
2961,GAG,GAG | |
2964,GCT,GCC | |
2967,GAA,GAG | |
2970,GTG,GTG | |
2973,CAA,CAG | |
2976,ATT,ATC | |
2979,GAT,GAC | |
2982,AGG,AGA | |
2985,TTG,CTG | |
2988,ATC,ATC | |
2991,ACA,ACA | |
2994,GGC,GGC | |
2997,AGA,AGA | |
3000,CTT,CTG | |
3003,CAA,CAG | |
3006,AGT,AGC | |
3009,TTG,CTC | |
3012,CAG,CAG | |
3015,ACA,ACA | |
3018,TAT,TAC | |
3021,GTG,GTG | |
3024,ACT,ACC | |
3027,CAA,CAG | |
3030,CAA,CAG | |
3033,TTA,CTG | |
3036,ATT,ATC | |
3039,AGA,AGA | |
3042,GCT,GCC | |
3045,GCA,GCC | |
3048,GAA,GAG | |
3051,ATC,ATT | |
3054,AGA,AGA | |
3057,GCT,GCC | |
3060,TCT,TCT | |
3063,GCT,GCC | |
3066,AAT,AAT | |
3069,CTT,CTG | |
3072,GCT,GCC | |
3075,GCT,GCC | |
3078,ACT,ACC | |
3081,AAA,AAG | |
3084,ATG,ATG | |
3087,TCA,TCT | |
3090,GAG,GAG | |
3093,TGT,TGT | |
3096,GTA,GTG | |
3099,CTT,CTG | |
3102,GGA,GGC | |
3105,CAA,CAG | |
3108,TCA,AGC | |
3111,AAA,AAG | |
3114,AGA,AGA | |
3117,GTT,GTG | |
3120,GAT,GAC | |
3123,TTT,TTT | |
3126,TGT,TGC | |
3129,GGA,GGC | |
3132,AAG,AAG | |
3135,GGC,GGC | |
3138,TAT,TAC | |
3141,CAT,CAC | |
3144,CTT,CTG | |
3147,ATG,ATG | |
3150,TCC,AGC | |
3153,TTC,TTC | |
3156,CCT,CCT | |
3159,CAG,CAG | |
3162,TCA,TCT | |
3165,GCA,GCC | |
3168,CCT,CCT | |
3171,CAT,CAC | |
3174,GGT,GGC | |
3177,GTA,GTG | |
3180,GTC,GTG | |
3183,TTC,TTT | |
3186,TTG,CTG | |
3189,CAT,CAC | |
3192,GTG,GTG | |
3195,ACT,ACA | |
3198,TAT,TAT | |
3201,GTC,GTG | |
3204,CCT,CCC | |
3207,GCA,GCT | |
3210,CAA,CAA | |
3213,GAA,GAG | |
3216,AAG,AAG | |
3219,AAC,AAT | |
3222,TTC,TTC | |
3225,ACA,ACC | |
3228,ACT,ACC | |
3231,GCT,GCT | |
3234,CCT,CCA | |
3237,GCC,GCC | |
3240,ATT,ATC | |
3243,TGT,TGC | |
3246,CAT,CAC | |
3249,GAT,GAC | |
3252,GGA,GGC | |
3255,AAA,AAA | |
3258,GCA,GCC | |
3261,CAC,CAC | |
3264,TTT,TTT | |
3267,CCT,CCT | |
3270,CGT,AGA | |
3273,GAA,GAA | |
3276,GGT,GGC | |
3279,GTC,GTG | |
3282,TTT,TTC | |
3285,GTT,GTG | |
3288,TCA,TCC | |
3291,AAT,AAC | |
3294,GGC,GGC | |
3297,ACA,ACC | |
3300,CAC,CAT | |
3303,TGG,TGG | |
3306,TTT,TTC | |
3309,GTA,GTG | |
3312,ACA,ACA | |
3315,CAA,CAG | |
3318,AGG,CGG | |
3321,AAT,AAC | |
3324,TTT,TTC | |
3327,TAT,TAC | |
3330,GAA,GAG | |
3333,CCA,CCC | |
3336,CAA,CAG | |
3339,ATC,ATC | |
3342,ATT,ATC | |
3345,ACT,ACC | |
3348,ACA,ACC | |
3351,GAC,GAC | |
3354,AAC,AAC | |
3357,ACA,ACC | |
3360,TTT,TTC | |
3363,GTG,GTG | |
3366,TCT,TCT | |
3369,GGT,GGC | |
3372,AAC,AAC | |
3375,TGT,TGC | |
3378,GAT,GAC | |
3381,GTT,GTC | |
3384,GTA,GTG | |
3387,ATA,ATC | |
3390,GGA,GGC | |
3393,ATT,ATT | |
3396,GTC,GTG | |
3399,AAC,AAC | |
3402,AAC,AAT | |
3405,ACA,ACC | |
3408,GTT,GTG | |
3411,TAT,TAC | |
3414,GAT,GAC | |
3417,CCT,CCT | |
3420,TTG,CTG | |
3423,CAA,CAG | |
3426,CCT,CCC | |
3429,GAA,GAG | |
3432,TTA,CTG | |
3435,GAC,GAC | |
3438,TCA,AGC | |
3441,TTC,TTC | |
3444,AAG,AAA | |
3447,GAG,GAG | |
3450,GAG,GAA | |
3453,TTA,CTG | |
3456,GAT,GAC | |
3459,AAA,AAG | |
3462,TAT,TAC | |
3465,TTT,TTT | |
3468,AAG,AAG | |
3471,AAT,AAC | |
3474,CAT,CAC | |
3477,ACA,ACA | |
3480,TCA,AGC | |
3483,CCA,CCC | |
3486,GAT,GAC | |
3489,GTT,GTG | |
3492,GAT,GAC | |
3495,TTA,CTG | |
3498,GGT,GGC | |
3501,GAC,GAT | |
3504,ATC,ATC | |
3507,TCT,AGC | |
3510,GGC,GGA | |
3513,ATT,ATC | |
3516,AAT,AAT | |
3519,GCT,GCC | |
3522,TCA,AGC | |
3525,GTT,GTC | |
3528,GTA,GTG | |
3531,AAC,AAC | |
3534,ATT,ATC | |
3537,CAA,CAG | |
3540,AAA,AAA | |
3543,GAA,GAG | |
3546,ATT,ATC | |
3549,GAC,GAC | |
3552,CGC,CGG | |
3555,CTC,CTG | |
3558,AAT,AAC | |
3561,GAG,GAG | |
3564,GTT,GTG | |
3567,GCC,GCC | |
3570,AAG,AAG | |
3573,AAT,AAT | |
3576,TTA,CTG | |
3579,AAT,AAC | |
3582,GAA,GAG | |
3585,TCT,AGC | |
3588,CTC,CTG | |
3591,ATC,ATC | |
3594,GAT,GAC | |
3597,CTC,CTG | |
3600,CAA,CAA | |
3603,GAA,GAA | |
3606,CTT,CTG | |
3609,GGA,GGG | |
3612,AAG,AAG | |
3615,TAT,TAC | |
3618,GAG,GAG | |
3621,CAG,CAG | |
3624,TAT,TAC | |
3627,ATA,ATC | |
3630,AAA,AAG | |
3633,TGG,TGG | |
3636,CCA,CCC | |
3639,TGG,TGG | |
3642,TAC,TAC | |
3645,ATT,ATC | |
3648,TGG,TGG | |
3651,CTA,CTG | |
3654,GGT,GGC | |
3657,TTT,TTT | |
3660,ATA,ATC | |
3663,GCT,GCC | |
3666,GGC,GGA | |
3669,TTG,CTG | |
3672,ATT,ATT | |
3675,GCC,GCC | |
3678,ATA,ATC | |
3681,GTA,GTG | |
3684,ATG,ATG | |
3687,GTG,GTC | |
3690,ACA,ACA | |
3693,ATT,ATC | |
3696,ATG,ATG | |
3699,CTT,CTG | |
3702,TGC,TGT | |
3705,TGT,TGC | |
3708,ATG,ATG | |
3711,ACC,ACC | |
3714,AGT,AGC | |
3717,TGC,TGC | |
3720,TGT,TGT | |
3723,AGT,AGC | |
3726,TGT,TGC | |
3729,CTC,CTG | |
3732,AAG,AAG | |
3735,GGC,GGC | |
3738,TGT,TGT | |
3741,TGT,TGT | |
3744,TCT,AGC | |
3747,TGT,TGT | |
3750,GGA,GGC | |
3753,TCC,AGC | |
3756,TGC,TGC | |
3759,TGC,TGC | |
3762,AAA,AAG | |
3765,TTT,TTC | |
3768,GAT,GAC | |
3771,GAA,GAG | |
3774,GAC,GAC | |
3777,GAC,GAT | |
3780,TCT,TCT | |
3783,GAG,GAG | |
3786,CCA,CCC | |
3789,GTG,GTG | |
3792,CTC,CTG | |
3795,AAA,AAG | |
3798,GGA,GGC | |
3801,GTC,GTG | |
3804,AAA,AAA | |
3807,TTA,CTG | |
3810,CAT,CAC | |
3813,TAC,TAC | |
3816,ACA,ACA | |
3819,TAA,TGA'''.strip().split('\n')] | |
# Codon translation table: DNA triplet -> one-letter amino-acid code.
# The three stop codons (TAA, TAG, TGA) are mapped to '_'.
aa = {
    'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
    'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
    'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
    'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
    'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
    'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
    'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
    'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
    'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
    'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
    'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
    'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': '_',
    'TGC': 'C', 'TGT': 'C', 'TGA': '_', 'TGG': 'W',
}
def delta_codon(viral, vaccine):
    """Return the positions at which two codons differ.

    Walks `viral` base by base and compares each base with the base at
    the same index in `vaccine`.

    Returns a list of `(index, vaccine_base)` tuples, one per mismatching
    position, in ascending index order. The list is empty when the two
    codons agree everywhere. `vaccine` is indexed directly, so it must be
    at least as long as `viral`.
    """
    return [
        (pos, vaccine[pos])
        for pos, viral_base in enumerate(viral)
        if viral_base != vaccine[pos]
    ]
# Tally, for every (codon position, viral base, viral amino acid) key:
#   cases[key]['all']  - how many times that key occurs in the viral codons
#   cases[key][base]   - how many times the vaccine has `base` there instead
cases = defaultdict(lambda: defaultdict(int))
for pair in codon_comparison:
    viral_codon, vaccine_codon = pair[0], pair[1]
    residue = aa[viral_codon]

    # Every base of the viral codon counts as one observation of its key.
    for pos, viral_base in enumerate(viral_codon):
        cases[(pos, viral_base, residue)]['all'] += 1

    # Each mismatch counts as one substitution towards the vaccine base.
    for pos, vaccine_base in delta_codon(viral_codon, vaccine_codon):
        cases[(pos, viral_codon[pos], residue)][vaccine_base] += 1
# Turn the raw tallies into substitution probabilities:
#   probs[key][base] = (times the vaccine used `base`) / (occurrences of key)
# Keys that were never substituted (only the 'all' counter) are left out.
probs = defaultdict(lambda: defaultdict(float))
for key, tally in cases.items():
    if len(tally) <= 1:
        continue
    occurrences = float(tally['all'])
    for variant, count in tally.items():
        if variant == 'all':
            continue
        probs[key][variant] = count / occurrences
def simulate(pr):
    """Generate a randomised vaccine-like codon list from the viral codons.

    `pr` maps a `(position, base, amino_acid)` key to a mapping of
    `{replacement_base: probability}`. For every base of every viral codon
    (the first element of each `codon_comparison` entry) whose key is in
    `pr`, a base is drawn from the replacement bases plus the original
    base. The original base gets the probability mass not assigned to any
    replacement. Bases whose key is absent from `pr` are copied unchanged.

    Returns a list of codon strings, one per entry of `codon_comparison`
    and in the same order. The output is random; seed `random` beforehand
    for reproducible results.
    """
    new_codons = []
    for pair in codon_comparison:
        codon = pair[0]
        vr_aa = aa[codon]
        new_bases = []
        for i, base in enumerate(codon):
            key = (i, base, vr_aa)
            new_base = base
            if key in pr:
                candidates = list(pr[key].keys())
                weights = list(pr[key].values())
                # Whatever probability is left over means "keep the base".
                # Floating-point rounding can push `1 - sum` slightly below
                # zero when the substitution probabilities add up to 1;
                # random.choices expects non-negative weights, so clamp.
                weights.append(max(0.0, 1 - sum(weights)))
                candidates.append(base)
                new_base = random.choices(candidates, weights=weights, k=1)[0]
            new_bases.append(new_base)
        new_codons.append(''.join(new_bases))
    return new_codons
# Known vaccine sequence: the second codon of every comparison entry.
vaccine_seq = ''.join(pair[1] for pair in codon_comparison)
# One random draw of a generated sequence from the learned probabilities.
simulated_codons = simulate(probs)
simulate_seq = ''.join(simulated_codons)
def match_percentage(a, b):
    """Return the percentage of positions in `a` that match `b`.

    Each element of `a` is compared with the element of `b` at the same
    index. The number of equal positions is divided by `len(a)` and
    scaled to 0-100. `b` is indexed directly, so it must be at least as
    long as `a`; any extra tail of `b` is ignored.
    """
    matches = sum(1 for pos, a_base in enumerate(a) if a_base == b[pos])
    return float(matches) / len(a) * 100
# Report how closely the generated sequence matches the real vaccine,
# as a FASTA-style header line followed by the sequence itself.
mp = match_percentage(vaccine_seq, simulate_seq)
header = f'>generated vaccine sequence, {mp}% match with known vaccine sequence'
print(header)
print(simulate_seq)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment