Created
June 26, 2016 10:19
-
-
Save diallobakary4/1e020faf2973866ca66826697c43a848 to your computer and use it in GitHub Desktop.
kj
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# CODE CHALLENGE: Solve the String Composition Problem. | |
# Input: An integer k and a string Text. | |
# Output: Compositionk(Text) (the k-mers can be provided in any order). | |
def StringKmerCompo(sequence, kmerlenght): | |
compo,i =[],0 | |
while i < (len(sequence)-kmerlenght+1): | |
compo.append(sequence[i:i+kmerlenght]) | |
i+=1 | |
return compo | |
#Test de la fonction StringKmerCompo | |
# seq = "TAATTAGTCTTCGCGAGGGGTTAAATATAGCGTGCAGATGTAATAACCAACCTTCGCAGCAGGTATGGGCGTTATGCATCCTTATACCGAATGATGTTACACATAAATCTCGGAAGAGCGCAGGATACATGGAATGAGCATAACGGGGCGCGCTATAGATAAGTTGTTTGAGCTAAGAGGTACACGTGCCGCTAAATAGGTCGCCGCATGATATTACGATTGCACGTCATAATTTTGTAGTGATAGTTATATGATTTCAACCCACCGCATTACAGCCACTCTGCGTCGCTCCTAAGGACATCCGTGCCCGAATCGCACTATCAGACCCACGAGTCCGTGGTTGACAGCTAGTACCAGGAGATATCGAATTCACCACAGGCGTCAATTCATAGGGACGCAAGTATAAGTTGCACCGTCTAGCCCGCGCCCCTGTCCCACTGCGGATCAGTAGAAAAGTGGTGCCATGGCACAATACCCCTCGACAGACGAATACACCGAGGAGTTATTCTGAGGCAACCTCGTGGATCATCTAAGTCAGCGCGCTAGAAGCGCTTGTATAAGGCAGATAAAATTTAATAAGAGCTTGGAATGCCAGGTCTACGATCACTATCCTTAGATGGCACTAACGATCCGAATTCCCTTTCGCAAAAATGGATAGGACATGTGAAGAAGGTTAATGGCACCTGGGTGTTTCCACCTTCCTGACTCCGACAACGTTTAAAGAGTCCGAGTCGGTCGTTCTAAAGCAGTTGAGGAAAATAACGCGGACTCTCAACAACGTCTCTGTGGCCAGTCTCAAGCTGGGCTTAAGTGACGTAGGAAAGCCATCCCTCGGGTGTACGAGGTGCAACCCCATGATGCTCCCGAGCTCCAGAAACATGGGGCTCGCCGGCCCCCTCGCACTTCTCACCCCTCATATACGGGCTTGCATCTGAGTTGGCTCTAACACTTAGAGAAGACCTCTGGCTACACGGTAGACAATCTGTTTGTTAATAACCTAGGTCCCTCTGGGAGATTAGTGCGATATTGGTATTATTTCATGAATCATATCGGACCGGATCGCTGTTTGGCGATAGGCATCCATACGATACATAAAAAAGATTACGTCTAGGCGCTATGCCCTGATCCTCCCGAGATAATGACATGCTTTGCAGGATTGCTATAGAATGGGTCAGTGCAGCTACATCTGATAATTAATATTCATCACCCGTGAGTGAAACGGTGAGTTCTTCAAAGAATCTCATCGTTTACATACCGGGGCCGGTTACTTATTCAAAAAGGGGGTCAAAACTTAACTGATTTCGTCTCGGCTAGATTAGACCGGACGAAAAATCGAACCGTCAAGTGCATTGTAGGCGACGGACAGTGATCGGCGGTAATCGATCATATGGCGGTTTTCCGAATTAGAAACCGGCATGCAATCACCAGCGTCGACCCAGCTAGCTATGGCGGACTCAGCGGGACCGTGGCGACGGGCACTGGAAACCTTGTTCAGATAGGTAAGGGGTTCTCCCAGTTATGTGTGTCGGTAAGCGAAAATTGGGATAACGAATTTTTGAGGTAAACCTATGTAGGTATTTCATATGTCGCAGTAACAGTCGGTCCGGCTTCGGGCACATACAACACCTAGTTTCCTTCCTGGAGACGCTCACTACGTAGAAGCCGTACTGCCGAGACATCGGGGCTTCGAGTAGGACGGCAAATAATAACAGACGCACCGATCCTCGGTTCTGCGTTGACTAAGGCTGTGCCCGTACCTCTCCGCGGGCTAGTTGATGTCTCAAACTTGTGAAGCACCGCCTCCCCCACATACAATCTGAAGCGAGAAAAATGTTTCCGAAGCATAGTAATTAAGCGAAGGCGTAATTCGGTTTCGAAACGTAGTATTTTTCCTCTACCTCCGTGTCTAGGACCCTCAGGCAGACGATGGCCACAAGACCGGACCAAACACACCAGGCGCGAAGGTATCCGCCCTGCTCTCGTGACAGTGTCCTTTAGTTACCGCCGATGGATCGTTCCACTTGTCCTAACACTCGGCTGAGATGATTATGTTCACAAGCTATATTACCGTTCCGGCCACAGGCTTTTATGCGACAGTAGCGCGCGCTGGTACGGGGTTATAGTTTTTGCATCTGTCCGAGTTGGTGTATGTACTACCACATATCTCCTATGTCGTAGCAATGTACTTATGAAGTTCAATAGTACTCTCCTCTCAACTGGAGCAGCGATCAGTTGTAGAGCCGGTTAAGACGTCAGGCAACTCAATCTGAGAATGGGCTCTCGCCGGCATACGTTGTAGTCGCGGTCCCACTGACTCGAACCTGGATTCGAATTTGACGTCCCTACGACTTAGGGCTTTACACCGGCCAACCAGTTAGAGTGGCTCCGCCGGATACTAACTGAGTGATTAAGGGGGCCGCTATAGTCGGTCATTGAGATTACGTTCGCTACGAACGGGTTAGAAGTTTCGACATTATAAACTAGGCAGAATCTCCCTATTGTGACTTGGCTACTTGAGCGAATCAGGCAGGGCCGCAGACATGGGGCGCAGCGTTAGAAAAGGTTTCCGCCGCTATTAGGCCGGAAGAAATTTTGAGCGCAGGTCTTATTCGATATCCAAAGTGGACTCGTTTGATAGCGCGAGGCCTTTTAGCCTCGATGCTCGAAATTGCCGGCAGTGCGGGGGGTGCAGCTCGCGACATAGTCACCTCCGTCTCACACCGTGAAGCTCAGATTATGCTACCTTCGACGGTACCCGTTTGCCATTCATTAAAATAATTAGGCAACAGTGGAACGTAGTGGCATCCCGGAGTCAGGTACCGATGTGTCGATCGGCTCTGCCATCTAGTGTGACCTCGAACGGAGTTAATCCTGAACATCTACCGATAACGTGCAAGTTACCAGAAGCCGTCGAGGCGAACAGAGCCATTTTCACACCAAGCTGATAAACTGTCGAGTAAACACCAGCATGGATGAGTCAATTATGTGACCAAATACGAGGACTGAAATGCAGATGGCATGCACCGACACTCTTGTGTCTGTACATATACCGTGTTTTAGTTAGATTGTCAGGTTTGGGCCCAGGGTTACGGAGCTGAAGTGGGCTTGATAGGTCAGGGCCGAAAAGACCCGACCTAACAGGTAACCGCGCTGACACATGTCGCCTCTGCCGCCTTTAAAGCGATACGACGGCAACGTCGTCCTGTTCCTGAGATTTTGTCACTTCGAGGCGTAGGAAGGAGAATGTACAGTGAACGGTTTTACTCTCTGCATGGCGGAATCTAGTCCTGTAGCGGTAGATGAAGTTACCAATGGACCGAGCATCTCAAAAGTAGCTTAGAAGCTGAGAGCGCGTCAGGAGGCTTATATGTGTTGGGACTGCCAACTTAGTTCGCGAGCCATGGGTTCTAGTTAATTCTATGAGGCCAGCCTCCCGATTATAAGCGGTTCGTGACATTGCGGGCATTTCGCCGAAGGGGCGCGCTTGCCAGTCGGAGTGTGCATATAGGCTGTCACTACGGGATAGTATCCGCGACGGGACCACTGATCAGGGTGAACTATATAGTTGGACCAGCTGCTAGCTAACCCCAAAATTTAAACATTCATAGCTTAGAATGTTAAGTGTCAGATGCTGCACGTCAGGTCCCTTACAGAATCCTCCGCGAGAGCTCTATCGAGAATGATAAAGTCGCATTCACCTCGCTTAGCATTCGAGATCAGCGATTGTTACGGACACAAGGCCAAGGGTAGGGGATTTTCTCCGATTCTTTCGGCCTCGGCAGTGAGGGCACCGTATTGCGGATCGCTTTATCGAGCGTCGCTGTCTCTCCTGTTAACCGGGCCAGATGATACAATTCGCATAACTTTTTAGCCAGTAGGCCGAAGCTACAGGTGTTAGGTTCACAGTGCAGCATGTATGGAAACCGTTCGTCGCCGGGGAGGGCTCGGCTGAGACGCTGCGTCACGACTTTTGACCTTGATCTCCCGTGATTCGTATTTGAAGACAAAATCGACTGTACAGTACAGTCAGGGTCTTGTGGGCTAGGAACATGTTCCTGTTCGACACTGACCGCAAAGAGCTCCAACGCACGCTGTCGATAGGCGCCCATACAAACCGCTCCGCCTCGGTTACCCTCTCGCTGTGGGTCGGGCGAGACAAGCTGCCTTCCTACGGGGGTTGCAGTTCGGCCGAATGAAGTGTGAATTCCAGAACAACGTAGGCGTTATCACTGCTGGGATAAGCCTTGACTTCTGTTGCCTATACGAAACCTGCGTGCTCACTCTCCGAGTGACGCACGCTATTACCATCGTAGACAAACGGTCGTCACAGCAGCAAGTCTTTGGAGAGATGTATGCCGAAATAGGCACATAATTTCCCGTAATATCCGACACCTGCGCATAAAAATGTGCAAACTAGAACTTGAGTGCTGGCACGGATATATCACGGCGAATGGAATATGAGCACTCGTGGGTAACCGGATACAACATTCCTCAGGTCTGCATAACTACCTGTGGTCTCCAATGACCGCACCGTGGACTTAGTGAGAGGATAAGACCCACCCCCATGCCAACAGAGGAAGGCGGCTTGTGCCTTCTACTAAGGAGAAATGCACTAGAGATGTCGTTTGGCTGTCCGCAGGGTCTAGGAAAATGGGATTGATACTTGGCTTTACCCCGGTGGTACTCCGATCGCGGGAGTAATACACTCAATATCACCTTGCACTGTTGAAGAACTGGGATGGAGAGTACAGGGCATCGAGGACGTTCAATACTCTGAAGCCGTCTACGCTAGGGCGCGAGGTGAGACAGCTTTTCGTCAGTTGTGGCGACAGTACACTATTTGAGGCATCAAGATGCAGACAAGCTGAGAGATGTGTAAACTCGAGAGTGCCCGCTGGTACTTGGCTAATTACCAGGTCGTCTCTTCGAAGAGCAGAGGCACGCGCACGGTGGAGGAATGTGACTGTCCCTCCAGCTCCCCCCTGCTTCTGATACCCAGT" | |
# for e in StringKmerCompo(seq,100): | |
# print e | |
# | |
# String Spelled by a Genome Path Problem. Reconstruct a string from its genome path. | |
# Input: A sequence of k-mers Pattern1, … ,Patternn such that the last k - 1 symbols of Patterni are | |
# equal to the first k-1 symbols of Patterni+1 for 1 ≤ i ≤ n-1. | |
# Output: A string Text of length k+n-1 such that the i-th k-mer in Text is equal to Patterni (for 1 ≤ i ≤ n). | |
def GenomePath(reads): | |
path = reads[0] | |
for e in reads[1:]: | |
path += e[len(e)-1:] | |
return path | |
#Test de la fonction GenomePath | |
# DNAstring_open = open("dataset_198_3.txt", 'r') | |
# reads = DNAstring_open.readlines() | |
# reads2 = [] | |
# for e in reads: | |
# reads2.append(e[:-1]) | |
# | |
# print GenomePath(reads2) | |
# CODE CHALLENGE: Solve the Overlap Graph Problem (restated below). | |
# Input: A collection Patterns of k-mers. | |
# Output: The overlap graph Overlap(Patterns), in the form of an adjacency list. | |
# (You may return the edges in any order.) | |
def OverlapGraph(reads): | |
#key a read, values: all suffix = prefix reads | |
matrix = {} | |
for e in reads: | |
matrix[e] = [] | |
for e in matrix: | |
for b in reads: | |
if e[1:]==b[:-1]: | |
matrix[e].append(b) | |
return matrix | |
#Test de la fonction GenomePath | |
# DNAstring_open = open("dataset_198_3.txt", 'r') | |
# reads = DNAstring_open.readlines() | |
# reads2 = [] | |
# for e in reads: | |
# reads2.append(e[:-1]) | |
# | |
# result = OverlapGraph(reads2) | |
# output = open("output.txt", 'w') | |
# # Formatting the output for coursera | |
# for e in result: | |
# if result[e]: | |
# print e +" - > "+ result[e][0] | |
# output.writelines(e +" -> "+ result[e][0] +"\n") | |
# seq = "AAGATTCTCTAAGA" | |
# result = OverlapGraph(StringKmerCompo(seq,3)) | |
# for e in result: | |
# if result[e]: | |
# print e +" - > "+ str(result[e])[1:-1].replace("'","") | |
# PathGraphk(Text) is the path consisting of |Text| - k + 1 edges, | |
# where the i-th edge of this path is labeled by the i-th k-mer in Text | |
# and the i-th node of the path is labeled by the i-th (k - 1)-mer in Text. | |
def PathGraph(text,k): | |
#path pattern: [edge,[node, node]] | |
pathEdge = [] | |
#path of successive k-mers | |
pathEdge = StringKmerCompo(text,k) | |
pathGraph = [] | |
for e in pathEdge: | |
pathGraph.append([e[:-1],e[1:]]) | |
return pathGraph | |
#Test de la fonction PathGraph | |
#print PathGraph("AAGATTCTCTAAGA", 3) | |
# In general, given a genome Text, PathGraphk(Text) is the path consisting of |Text| - k + 1 edges, | |
# where the i-th edge of this path is labeled by the i-th k-mer in Text and the i-th node | |
# of the path is labeled by the i-th (k - 1)-mer in Text. The de Bruijn graph DeBruijnk(Text) | |
# is formed by gluing identically labeled nodes in PathGraphk(Text). | |
# Input: An integer k and a string Text. | |
# Output: DeBruijnk(Text), in the form of an adjacency list. | |
def DeBruijnGraph(seq,k): | |
path = PathGraph(seq,k) | |
i = 0 | |
for e in path: | |
for b in path[i+1:]: | |
if e[0] == b[0] : | |
e.append(b[1]) | |
i += 1 | |
# Just remove duplicate edges | |
finalPath = [] | |
check = {} | |
check = set(check) | |
for e in path: | |
check.add(e[0]) | |
for e in path: | |
if e[0] in check: | |
finalPath.append(e) | |
check.remove(e[0]) | |
return finalPath | |
#test de la fonction DeBruijnGraph | |
# for e in DeBruijnGraph("TCGTGTGGTCGTGAGCATTAACGCCTACGCTGCCGGCATCATCTATCCGTTCCCATTAACGGTGAGGTTTTGTTAAGGCAAATAGGCAAAGGCTGATTAGTCCCCCATAGTGCCATTGCAATTGACATATGTTCCGGAAAAAACCACAGGCGTCCCGTGGAACTATAAAGTGTCGATTCGCCGGAAGCTTCAGTTTTGATTAACCAGAGCGTTTTTTCTATGCGCCATCATCCAGATGCACAAATTCTCGATTGTAGGATAGTGACTACATGATGATCACTGGAAGGCCGGCAAGTCCACAATTTCCGGATGCGAATTAGGCTCCTATCGGATCTTATTGTTGTCAGCTCCCCCGTTCTTAATCTTCACAACTTGCAATAATAGTTATCGCGTTACTGACTGTAGCTTAAGCAAGCTTTAACGGCACACCCCAAAACCCGTCTGGTCGGATAACACAAGCGTGTAGGACCAGGAAAGCTGAGCCCTGCATATCCACCACTTCGGTGATCCAGGGGGCGTCAGTTAAGCCACCAGCAAGACCATGATCAACCCGCGGTACGCGACGTTCTTGAACGATCCCTGCAAACACATAACGAATTTTGAGAAATTTGTGGAATGAATGCCAGCTCGCTTGATTCCCAAAACGAATGACACGTCGAACCCCAGGTTACAGCCTCTGAGTAACTGGTACGGTTTACACACTCACGGTGCAGGTTCGCGTCGCGTGCGCTAGGCTATTCCGAGTTTCGGAACGCAACGAGCAACCCCGCTAATCGGAACGCCAGGCGCTCTGGGGTCCTACGCGGGCAGTTCAGTAACACCCTGTATGTGATAGACCTAGTCGGTGGCCTTATATAGGATTAATCCTTTTATGATCAGGAACACTAACTTTAAGTCCGTTAATAGAAGAGTAATCAAATTGTCAAATTCTTGATCGCAGGGAAAGGTCGGGCTCCTGACAGAGCACGATAAGGTCTCCGATCTCGTTTAGGAGCGCGATTTAATTCGCTGGAATTTAACAGGCTCTGCAGATGCGAGATAGGTCCGAATTCCCGACCAATATAGATGCGCCGGGCCCGAGGTGACGTGGTGACATGCAATCGTGCTTCAGAGTGCCCAACGTCCAACTAAGAATCACGGAACACAATATAACACGCTAATTAGGCTAAATGTTCCTCAGACGGTCTGGACGGTTTTGCCGTCTCATGAGTCTCTAAGCGTTCCGATGAGCCATTATGTGTCCCTTGAAAGCCTTTTAGCCCCATCTATGGGACTAAGTCGGGTGGTCCAAAAAAACCCAACTGTCGTTAACAGCCTTGTGGTCCCTATCCGTCCGGAAAGTAACGCGGCAATCGTACTTGCTCATTGCAAAGTCAGCCGCGGTATTGCGCTCGTAGATCTTTGACATTCGCATACGGAATGGTCGGTAGTTCAATTAAATGAGCACTCCAAGGTCTTGCGCTCTCTTGCTAAGCCGGGTAAATACCTATTCTTGCAATCGTGACCCGAGAGAGACATCCTTATCGTAACGGTCGCGGCAAATCTTCAGGGGTCCGGTTCGCTCCGTAATGTCGCCCATATTACGTTGTTTCGGGTGGTGACGTTGGCGTATGAGCGTGACTTCCACTCCGATTCATCACGTAGGGTGGCCCTGATATGACTCTGTTGAAAGGTGGCGGCACCAAGATATTGGATGGTGGCATATGGACCTGATCTAAGGAGGAACTCTGAAAAATCCTAGAAGTCAGGAAGTCTGAGAGGGGACGCCATCATGGTTCCTTCACAATAGCTCGAGACAAAAGGCGTTTTTTTCCCAGCCAGCTCGCTGTGGACACGGCGCTAGGGTCACGCACACTGAGCAGTATACAATTTTGCGGTTGATCAAAGTTAGATGATGTATTTCGTAAGGCTCCTCATTCCAATTGCAGTAGGCCTCGGGCAACCACGGCGACACATTAGGAGTCTATGTGCCAGATCAACACCGTTAGCGTGAGTCGAAG", 12): | |
# print e[0] +" -> "+ str(e[1:])[1:-1].replace("'","") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment