Last active
December 20, 2016 14:59
-
-
Save manrysh/515c5ce4fef3f684b3701b8447ba4872 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#The LCS (Longest common substring) problem is to find the longest string which is a substring in two or more strings. | |
#Unlike subsequence, substring emphasizes on its continuity. | |
#Kmer refers to all the possible substrings whose length is K in a string. It is widely used in sequence assembly. | |
#To compare two sequences, basically is to find all the common Kmers between the two strings. | |
#In order to extend this method to multisequence alignment, LCS is not a very good idea because the longest substring might not be exist in the next string. | |
#Here use DP to list all common substring between two sequence. And then compare with other sequences | |
#data_set_is_on_the_bottom | |
from Bio import SeqIO | |
import numpy as np | |
import itertools | |
import collections | |
f0 = open('a.txt','rU') | |
f = open('b.txt','w') | |
records = list(SeqIO.parse(f0,'fasta')) | |
def kmer(s1,s2): | |
l = [] | |
m = len(str(s1)) | |
n = len(str(s2)) | |
a,b = 0,0 | |
matrix = np.zeros((m,n)) | |
for i in range(m): | |
for j in range(n): | |
if s1[i] == s2[j]: | |
matrix[i][j] = 1 | |
for i in range(m): | |
for j in range(n): | |
string = '' | |
if matrix[i][j] == 1: | |
a = i | |
b = j | |
while a < m-1 and b < n-1: | |
string += s1[a] | |
if matrix[a+1][b+1] == 1: | |
a+=1 | |
b+=1 | |
else: | |
break | |
else: | |
if matrix[a][b] ==1: | |
string += s1[a] | |
if len(string)>1: | |
l.append(string) | |
l = list(set(l)) | |
return l | |
ori_kmer = kmer(records[0].seq,records[1].seq) | |
for i in range(2,len(records)): | |
for k in ori_kmer: | |
if k in records[i]: | |
pass | |
else: | |
ori_kmer.remove(k) | |
result = sorted(ori_kmer,key=len) | |
for i in result: | |
f.write(str(i)+'\n') | |
''' | |
>Rosalind_9328 | |
GGTCAGCGTAACCAATCGAGTACCACAGTCTGCATCGCTTTAGTCGTTCTTGAGACGGGA | |
CATGTCCATAGGTGAAGCGGAGACAACTGAATTATGTACCTTTGCTGCAGGTCTGCGATG | |
CGGAAGTATAACGTCCCGATTGTAATGTGATTGGTGTAACCGTCACACGCCCCTTAGACG | |
TTTGCACCGGAGATAGCCAATAACGTTCTAGCAGTGTCTCGGGATGAGTTTCTACCACAC | |
GCGCATGTGCCTGCTTCGCAATTACGAATGCCTTCAATTCATTATGAAGCTCGGAGGATG | |
GACAAACCGTCTGGCTTGTAGATAAAGGCCCGCGACGAAGTGGCCAAGACACCGCACCTA | |
CGCTAATATCATGATACCTCACCATTGAGAACTGTTCCACTAGTATGACGGAGCATAAGG | |
GGAGTTAGTAACGATGCCGTTAGCACAGTCGTAAACGCGAGCCACCGTCCCCCAAGTTCT | |
CTAATTCAAGTATGCATCTGTAGCGGATCGGACCTGGCCTGGCATTGATTCATTTGAATA | |
AGCCGCAGGAAGTATTCCGATCAATGACAAAAACTCTGGCGGCGGATTCTGTTGACGCTT | |
CTATGGATAGATTCGTGGTTGCCCTAGTCCTAAGATACGAGAAATCACGATGCGAGTTGA | |
GTCCAAGTCCAAGGGGGCGAGTTCCGATCACTAAATCCACTTATTCCTTGCGGGTGATGG | |
GTTTACGCGACAAAAGTGTAAACTAATCCGACAATCTAGAGTTTATTTCCAAATTTACAC | |
ACGCTACAGAGGCCTCAGAAGCGGGAGGCCGGCCAATTGCTTTGTAGAACTCACACCGTC | |
TCACGTCAAACCAAGTTCTAAATTAAGCTGTATCATACGATCACGATGCAACGGACGCCA | |
TACCCCGGATATATAGCTCCACAGCGCCCAGAACCGTAGCCTATGACGACGGGACATGTT | |
TGGGTAATCGAATATTGCGAAGGTTTTCTCTCAGAGGCAA | |
>Rosalind_4644 | |
AATGTCACTGACTTGAAGACGCTGTGAGGTAGTCCACCCGCAGATCCCGTCCATACTCGT | |
CTCGGAAAAATAGCGCAGGTGCTATTACTCTATCGTATATACAGGGAATGTGTGATACAC | |
GGCAAACCGGAGACAACGGTGGGCAGGCAGCGACACGCCGCCGCTGAGTCAAGATCAGTC | |
AGCCTAACATCGAGTCATGGACTCAGCTTGCCCTAGTAGTTAATCGTGAGGATCTGTTAT | |
TATTCTAGGATGACGAGCCGTTCACAATTCCTTACTGCAAGTATCAGTACGTTGCTATAG | |
ATTTCACTCCCCTCGCCCACAAGTGCATGACTTGGTGACCACCTGTGGTCCAAGATCCTG | |
TTCAACTTGTAGTGGCGCAAGGGGAGTTACAACAGATCAATACCCACCAACAATCCTAGC | |
CTCATTGAACTGGGGAACTGTGCAGTTTCTACGCTAATATCATGATACCTCACCATTGAG | |
AACTGTTCCACTAGTATGACGGAGCATAAGGGGAGTTAGTAACGATGCCGTTAGCACAGT | |
CGTAAACGCGAGCCACCGTCCCCCAAGTTCTCTAATTCAAGTATGCATCTGTAGCGGATC | |
GGACCTGGCCTGGCATTGATTCATTTGAATAAGCCGCAGGAAGTATTCCGATCAATGACA | |
AAAACTCTGGCGGCGGATTCTGTTGACGCTTCGATCTATGAGTCAAGACTAGGTCTTACA | |
ACAGGAGTTGTTCGTCATCTATATAGCAAATGTCAAGCAACAGAGAGCATAGAGATATTG | |
AGTGATCAGTTTGTCACGTCAGAACCCTGAAGGATGAACCCTGCAAGTCAGGTCTGAAAC | |
GCATTTTTGGATGAAGAAGTTAAAAAGTCCCAGTTCACGGACATAACTTAATATTGCCAA | |
GCAAGGCAGTCCCCCAGAGTTACACCCGCTTCTGACACTATAGGGTCTCAGACTCGCAAT | |
TGAACATCGGCGTACCGCGGCGGCCATGGGGGCGAACACG | |
>Rosalind_2381 | |
AATCTTGCTGCCATCCGCCTCGTCAATCAGACCAACCCATATGCTCCGTCAGGCTGCTCG | |
TACCCTTCACCTACTCTCGCGATGTACTGGAACGCCTCATACGATGCCTGGGCATCGCAT | |
AGCGGCTTGGCTACGCTAATATCATGATACCTCACCATTGAGAACTGTTCCACTAGTATG | |
ACGGAGCATAAGGGGAGTTAGTAACGATGCCGTTAGCACAGTCGTAAACGCGAGCCACCG | |
TCCCCCAAGTTCTCTAATTCAAGTATGCATCTGTAGCGGATCGGACCTGGCCTGGCATTG | |
ATTCATTTGAATAAGCCGCAGGAAGTATTCCGATCAATGACAAAAACTCTGGCGGCGGAT | |
TCTGTTGACGCTTCGCGTTCCCAAGAGGTACGTAGATCTAGGGCTCGATACTCACATTTT | |
AGTGAAACTAAGGACGGCACGTGCGTGGGATAGAAAAACACAGTCGACCATGCTCTTCAG | |
TTAGATACACGTAGAGGTTTCTCGTGCATGCCCACTGGATAGTCCCGTTAAACCGATTTA | |
AACTACTTGCCGCAATTCACCCTACCATCTCGGGCCTATCAAAACTGCAACCCCAGACCA | |
TGTTACCGGCGGGTGTGTACTCCTTAGGGAAGAAACACGAAAGATCTTCCAGATATACAC | |
GTGGGCCGCAATCTGCTAGTCCAACGGCCTGGGATTCTCAGATAGAATATGTTTGAACTA | |
GCACTCTAGCTGGGAGATCTCGCAAGAAAGCGGGTACGCAAGCGCTGGGTTAGGTGATCG | |
CCGCGCATCCGCTCTAATGAACGGCCGTCGCAACCGGGCTTGATGATATTTGAGTTCGGT | |
AGCTAATGATTCGGGTGTTCGTCGCGTGCATTTGCAATAATGGAGTGGAGCGACGCTTAA | |
TCAGGTGATCGCTTCGCGGACTCTCTCGGCGCGGTACTCATAAACTCAACAAAAGGCCCC | |
ACGCAGGAAAGACGCCGAAGTATTCAGTTTGCCCAGGGCC | |
>Rosalind_3828 | |
TGTTCCGGAGCGCGTGAATTGGCTACGCTAATATCATGATACCTCACCATTGAGAACTGT | |
TCCACTAGTATGACGGAGCATAAGGGGAGTTAGTAACGATGCCGTTAGCACAGTCGTAAA | |
CGCGAGCCACCGTCCCCCAAGTTCTCTAATTCAAGTATGCATCTGTAGCGGATCGGACCT | |
GGCCTGGCATTGATTCATTTGAATAAGCCGCAGGAAGTATTCCGATCAATGACAAAAACT | |
CTGGCGGCGGATTCTGTTGACGCTTCGTTCTGACTACATTTCCCCCTACATGGCCTGGTG | |
TGCGGAGGCTGCCGACTGTCGCATCTGGCCAAGATCCTGGCCTTAATTTCGTACTTAACT | |
AGCATTTGCACATTCCAATAAACAATCGCCATAGCGACAGCCCAAACCAGAAAGCGTTCG | |
AACACCTGGTTACCACGTATTGCTTGAGTGATGCGGAGCCATCAATATTACAGACTTCTC | |
GCACCAATTACTTCCATTATCTTCAACATTAGTAAAATGTAGGTCACCCTGACAATCACA | |
TACGGTCGCGAAGGGGAATTATGCGGGTCAGTTCTGCTCAGACGGGTCGTCGAGAACCTT | |
TCGCACTTAGCCCAGCGCATTCCAGCTCATCCTCTCATCGGGGTTCTTCGCTGGAAGCAA | |
GCTCAGGTTCATAGTAACCAAGCTCAAGGGCCATGCCGAAATAAAGCAGCCAACAGTGGG | |
AAGAGATTCGACGTGGGGACTGTGTATTGCCGACCCCCGACTCCTTAGACGACCATCCTC | |
GGTTCGGTTAGGTCAGGTGGCGTACTCATGGAGAAAACCTAATGCACCGTTCAAACTCTA | |
TCGTAACTGTGTGATTCATCCGTGGTAGTGGCGGTGCGAACCGACTCCCTGGGCCTAACG | |
GTGATTTCGCTTACTTGAAATCCTGTAGGTCTGGCAGGGGCATAGGCCTGTGACTCGTGT | |
CATACGGCGCGAAAGACAGTGTCGTCGGTAGCCAACATTA | |
>Rosalind_4660 | |
ACTAAACGCTTGCGCCCAACTATGTGTCCCGGAGGCACTTAAAGTGCAGGTTGGAACGCC | |
GGGGATCCGGTTGCGAGTTATGCTATTGATAGTGACGCGGAGTAACGCACGGATTAGAGT | |
AAGCTCTTAGATTCCGGGACGCCATTGCGACGCTGGTTCAGCGGTTGAATTGGGACGTGG | |
TAATCAACACCGCCTGGCCACAAAGTGCTTCTACGCTAATATCATGATACCTCACCATTG | |
AGAACTGTTCCACTAGTATGACGGAGCATAAGGGGAGTTAGTAACGATGCCGTTAGCACA | |
GTCGTAAACGCGAGCCACCGTCCCCCAAGTTCTCTAATTCAAGTATGCATCTGTAGCGGA | |
TCGGACCTGGCCTGGCATTGATTCATTTGAATAAGCCGCAGGAAGTATTCCGATCAATGA | |
CAAAAACTCTGGCGGCGGATTCTGTTGACGCTTCTCATGTCAGGTTATTTATACTATCTA | |
TCAACTCCTAGCGCTGACGGCTAAGGCGAGTCGCTCATGTCGTCCCTCGACGGATGTTCT | |
CATAGCGCCTTCGACAAGATGCTTCATCTCACGTATAAGTATCCTCTCGATGTTAGCTAC | |
GCCCTTTTAATCATATATGGTCGAAGTTTTCGCCGTCGCGGCTTACGACAACGCATTAGA | |
GACCTTTATTCTGTGCACCGACCTTAGCACCCACCGGATTTAGAGTCATGTGTAGCTGTA | |
AAAGTATACATTGCCGGTATCGCCATCCGCATTCCTAGCGGCGAAATCCCTGCACGCACA | |
TCTGACCTTGCAATTGTCTACTTGCCTTACAGAAAATGCCCTGCAGTATCCAAAACTCAT | |
GGCCTTATAGGACTATTCACTATTTCAGACTATTTGAGCGCGGTCAATAAACCTCACCAG | |
TGTTCTCTATCGATTTCTAGTAGACAGGAATATCAGATTGCAACGGTCTGTGCGCGTGTT | |
AAGAATGCCAGCCCTCTTCTACGATATATACTGTTCCGAC | |
''' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment