Last active
March 31, 2021 13:47
-
-
Save claraj/0dea4bef2e9ac5e84462b4dbdb4ffe2c to your computer and use it in GitHub Desktop.
Python bioinformatics 101
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
dna3 = 'ACGACGGATACGCGGGAGCTATTCATCTGTGTTGAGAAACACCGGAGAACTTATTGGTCTGTCAAGATTGCGACTGTGGTATAGCTCACCCGGTCGCGGCTTTCTAGT' \ | |
'TAGTGGCCAGCTCCCGTGTATTTGGAAGCTGAGAGAAGGACCCCTGTGGTTCGAATCAGCTCACGAGCGCTGGCACACCGCAATCAGCCGGCTAATAAAATTCGTACG' \ | |
'GACTGCCCCACACAAGAAGACGGTAAATTTATCAACACTATAGTTGCTATACACCAGGAGCGAGCGTAAATTTGTAGCGGTCAGATTAACTTGCTGGGAACGAACCAT' \ | |
'TGTCGCCCTCTGCAGCAAGTTAGTTGGCATCATTGGTACTGCCCTTCACTGGTAGCAGCTCCCCCTGTAATATATCCGTGGCCACTATTCAAGGGCTCAAATAGGCGA' \ | |
'CCCAGAGACCATTATAGGCGGTACAGCGCTGGTAGGTTTGCCTGGGCAGATATCGTTAGCCCCTTCTGCGCGCTATAAGATAGCGAAGGATAATTCTGCGGGACCA' \ | |
'TGGTCGTCTCCTAACCTCAGGGTGGGATTCCTGGCAGGTGGACCGGGCGCGCATCGAGAGCATTCGGGGTTCCTACCAGCCAGGGAAATCGGGTCGACCACTAGGCAA' \ | |
'TGAGCGGCTCACACCGATTTTCTTAAGAGACGTAACAAAGCCCGCATTAACGGCTGGAGTGAATCACCGTACGACTACCTAAGCCTCATTGGGATCCACTGTAAACCC' \ | |
'CTTCGCCGGTGTTGGGTGTCCGCAACGCCTCTGCTTTTTGCGTACAGTCGGCGTGGTGGAGTCCGCGGCCATACTGGCGGTTGGTTTGTAGAACAGTGTAACGACGTG' \ | |
'TGTCACTGCCCCCCGTAGCTTCTATTGCCCTGTTTGGGAGGTTCTATAGGGGTTACAGAGTAGTTTTAAGTTTTAGCACGACAGCACCAGTATTGCCAGTGACGCCGT' \ | |
'TGAGGCCGCAAAAGTGATTAACCCCCGTGGGACCGGATACGTTCCCAGCGGCAATCCTTGTCTTACCGCCGGACTGCGGAGCGAAGGGAGAAGTAACCGTGGTAATTA' | |
dna4 = 'CAGAGCAATGTCTGTTAGATAATCTCTCGTCTGGATAGCGAGAAGTTTCCGGAAGACGATTGTTTCCAACGAAAGGGCTGATAACTACACTCTGTCGCGCTTCTTTCG' \ | |
'TGTTCGCCAAGGGCACATTGGTTTAAAAGTGATCTCGAGAGACGTTTTCCTGACTTGTTGTGTTATATCAACGTAACTTTTAAGTCATATTTTCTCCCTACCCCAGAC' \ | |
'TAGACGGGTTCCTTTCATCGTCCACCGAGTTGCTTACGAGCAUGACACTTAGCCGGGGAAAAGTTCGCAATTCCGCGACAGCGTCAGGTGTCAAACAGATCCAAGCGA' \ | |
'AGGCCGCCGTGTAACGGAGAATTGTGGGCGCAGTCAAATAGCTAATTATTGGGAAAGGCCAAGTGGAGTCCGTCAGCGGAACAGCCTGGGCGGACGCGCTGCCGCTCG' \ | |
'TTCACCTCGCCTGCCTTCGTGTTGGGGACCGGATACGTTCCCAGCGGCAATCCTTGTCTTACCGCCGGACTGCGGAGCGAAGGGAGAAGTAACCGTGGTAATTAGCGA' \ | |
'GAGACCGTTGAGGCGCGGGGCGATCCGCCCTTGAGTGGACTCCAAACACATTCGACGAAGGGGTGGGAACATAAGTTAATTGGAGGGTCGGGGAAGTCCCACGCCCGG' \ | |
'TCCCTACATGATTGCACATAGTTCGTTCACCAACGGGCGATCTTCCTCACACTAGAGGAACGAGTAGTACTCCAGACATTGAGTCAGTTGCAGACCAAGTGGAGGGAA' \ | |
'CGATTTTTAUGGGCCGCTCAGGTACTAGTGCTAGACCCTACAAACGGCACTGGTGACCCGCTCCCGAGTTTGCGCTGTTACGTGTCCCTTAAAGTATACTTCGATCCT' \ | |
'AACATCGCGGCCATACGACGCTTAAATATTTCACCAGTTGTGTTTCGCGCAUGGAGTTGTTCTGTGTTATCGGCGAGTCTCCATTGCACGTCATCAACTAAAAACCAC' \ | |
'GGCCACACAGACATGCCTTGATTCTTCCCGCGACGGTAGGTTTGCCTGGGCAGATATCGTTAGCCCCTTCTGCGCGCTATAAGATAGCGATAGTAGGTTTAACTATCA' | |
print(len(dna3))

# How many A, C, G, and T are in dna3?
# What about dna4?
# Can you write a function?
# And thinking more generally - what if you don't know what letters there are?

# First approach: one counter variable per base, all starting at zero.
a_count = c_count = g_count = t_count = 0

for base in dna3:
    # Each character is a single base, so at most one branch can match.
    if base == 'A':
        a_count += 1
    elif base == 'C':
        c_count += 1
    elif base == 'G':
        g_count += 1
    elif base == 'T':
        t_count += 1

print(a_count, c_count, g_count, t_count)
# Second approach: keep the four tallies in a single dictionary keyed by base.
# How could you generate this from the text?
base_counts = dict.fromkeys('ACGT', 0)

# Work with dictionary keys and values: each branch bumps the matching key.
for base in dna3:
    if base == 'A':
        base_counts['A'] = base_counts['A'] + 1
    elif base == 'C':
        base_counts['C'] = base_counts['C'] + 1
    elif base == 'G':
        base_counts['G'] = base_counts['G'] + 1
    elif base == 'T':
        base_counts['T'] = base_counts['T'] + 1

print(base_counts)

# Start again from zero before counting a second time.
base_counts = dict.fromkeys('ACGT', 0)

# But now that we have a dictionary, we can make this much more general:
# the base itself is the key, so the if-chain is not needed.
# Note: a character that is not already a key raises KeyError here.
for base in dna3:
    base_counts[base] = base_counts[base] + 1

print(base_counts)

# Question: how could you generate this initial dictionary from the text?
# Question: can you put the counting code into a function?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example of real-world use of Python string manipulation - DNA analysis | |
# | |
# DNA is made of ATGC | |
# | |
# A-T are always paired | |
# G-C are always paired | |
# | |
# So if you have a sequence of one side of a DNA molecule, can you use Python to generate the other side? | |
# | |
dna1 = 'ACCAGTACCAGTGT' | |
dna2 = 'GTACACCAGGTCTA' | |
# Remember | |
# A pairs with T and T pairs with A | |
# C pairs with G and G pairs with C | |
# so for dna1, it begins ACCA... so output string will start TGGT... | |
# In DNA, the A, T, C, G are codes for generating proteins | |
# you are made of 100's of different kinds of proteins. | |
# Most of your DNA doesn't appear to make proteins - only about 1% of it encodes protein. | |
# A part of DNA that encodes a protein is called a gene. So how do you find which parts do, | |
# or where your genes are in your DNA? | |
# | |
# So biologists are interested in where certain codes are. One code is ATG and this | |
# is called a 'start codon' and that means 'start making a protein here' | |
# | |
# Does string 1 have any genes in it? | |
# Does string 2 have any genes in it? | |
# | |
# If so, what's the index of where that gene is? | |
dna3 = 'ACGACGGATACGCGGGAGCTATTCATCTGTGTTGAGAAACACCGGAGAACTTATTGGTCTGTCAAGATTGCGACTGTGGTATAGCTCACCCGGTCGCGGCTTTCTAGT' \ | |
'TAGTGGCCAGCTCCCGTGTATTTGGAAGCTGAGAGAAGGACCCCTGTGGTTCGAATCAGCTCACGAGCGCTGGCACACCGCAATCAGCCGGCTAATAAAATTCGTACG' \ | |
'GACTGCCCCACACAAGAAGACGGTAAATTTATCAACACTATAGTTGCTATACACCAGGAGCGAGCGTAAATTTGTAGCGGTCAGATTAACTTGCTGGGAACGAACCAT' \ | |
'TGTCGCCCTCTGCAGCAAGTTAGTTGGCATCATTGGTACTGCCCTTCACTGGTAGCAGCTCCCCCTGTAATATATCCGTGGCCACTATTCAAGGGCTCAAATAGGCGA' \ | |
'CCCAGAGACCATTATAGGCGGTACAGCGCTGGTAGGTTTGCCTGGGCAGATATCGTTAGCCCCTTCTGCGCGCTATAAGATAGCGAAGGATAATTCTGCGGGACCA' \ | |
'TGGTCGTCTCCTAACCTCAGGGTGGGATTCCTGGCAGGTGGACCGGGCGCGCATCGAGAGCATTCGGGGTTCCTACCAGCCAGGGAAATCGGGTCGACCACTAGGCAA' \ | |
'TGAGCGGCTCACACCGATTTTCTTAAGAGACGTAACAAAGCCCGCATTAACGGCTGGAGTGAATCACCGTACGACTACCTAAGCCTCATTGGGATCCACTGTAAACCC' \ | |
'CTTCGCCGGTGTTGGGTGTCCGCAACGCCTCTGCTTTTTGCGTACAGTCGGCGTGGTGGAGTCCGCGGCCATACTGGCGGTTGGTTTGTAGAACAGTGTAACGACGTG' \ | |
'TGTCACTGCCCCCCGTAGCTTCTATTGCCCTGTTTGGGAGGTTCTATAGGGGTTACAGAGTAGTTTTAAGTTTTAGCACGACAGCACCAGTATTGCCAGTGACGCCGT' \ | |
'TGAGGCCGCAAAAGTGATTAACCCCCGTGGGACCGGATACGTTCCCAGCGGCAATCCTTGTCTTACCGCCGGACTGCGGAGCGAAGGGAGAAGTAACCGTGGTAATTA' | |
dna4 = 'CAGAGCAATGTCTGTTAGATAATCTCTCGTCTGGATAGCGAGAAGTTTCCGGAAGACGATTGTTTCCAACGAAAGGGCTGATAACTACACTCTGTCGCGCTTCTTTCG' \ | |
'TGTTCGCCAAGGGCACATTGGTTTAAAAGTGATCTCGAGAGACGTTTTCCTGACTTGTTGTGTTATATCAACGTAACTTTTAAGTCATATTTTCTCCCTACCCCAGAC' \ | |
'TAGACGGGTTCCTTTCATCGTCCACCGAGTTGCTTACGAGCAUGACACTTAGCCGGGGAAAAGTTCGCAATTCCGCGACAGCGTCAGGTGTCAAACAGATCCAAGCGA' \ | |
'AGGCCGCCGTGTAACGGAGAATTGTGGGCGCAGTCAAATAGCTAATTATTGGGAAAGGCCAAGTGGAGTCCGTCAGCGGAACAGCCTGGGCGGACGCGCTGCCGCTCG' \ | |
'TTCACCTCGCCTGCCTTCGTGTTGGGGACCGGATACGTTCCCAGCGGCAATCCTTGTCTTACCGCCGGACTGCGGAGCGAAGGGAGAAGTAACCGTGGTAATTAGCGA' \ | |
'GAGACCGTTGAGGCGCGGGGCGATCCGCCCTTGAGTGGACTCCAAACACATTCGACGAAGGGGTGGGAACATAAGTTAATTGGAGGGTCGGGGAAGTCCCACGCCCGG' \ | |
'TCCCTACATGATTGCACATAGTTCGTTCACCAACGGGCGATCTTCCTCACACTAGAGGAACGAGTAGTACTCCAGACATTGAGTCAGTTGCAGACCAAGTGGAGGGAA' \ | |
'CGATTTTTAUGGGCCGCTCAGGTACTAGTGCTAGACCCTACAAACGGCACTGGTGACCCGCTCCCGAGTTTGCGCTGTTACGTGTCCCTTAAAGTATACTTCGATCCT' \ | |
'AACATCGCGGCCATACGACGCTTAAATATTTCACCAGTTGTGTTTCGCGCAUGGAGTTGTTCTGTGTTATCGGCGAGTCTCCATTGCACGTCATCAACTAAAAACCAC' \ | |
'GGCCACACAGACATGCCTTGATTCTTCCCGCGACGGTAGGTTTGCCTGGGCAGATATCGTTAGCCCCTTCTGCGCGCTATAAGATAGCGATAGTAGGTTTAACTATCA' | |
print(dna4.upper()) | |
# Why use the \ line continuation and not triple-quoted strings? |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# How many A, C, G, T characters are in this string? | |
# | |
# if | |
# | |
# dna = 'CGATC' | |
# | |
# We could count easily, there are | |
# 1 A | |
# 2 C | |
# 1 G | |
# 1 T | |
# | |
# What about a longer string? | |
# | |
# Create a much longer example string. How would you count the number of each character? | |
dna = 'ACGACGGATACGCGGGAGCTATTCATCTGTGTTGAGAAACACCGGAGAACTTATTGGTCTGTCAAGATTGCGACTGTGGTATAGCTCACCCGGTCGCGGCTTTCTAGT' \ | |
'TAGTGGCCAGCTCCCGTGTATTTGGAAGCTGAGAGAAGGACCCCTGTGGTTCGAATCAGCTCACGAGCGCTGGCACACCGCAATCAGCCGGCTAATAAAATTCGTACG' \ | |
'GACTGCCCCACACAAGAAGACGGTAAATTTATCAACACTATAGTTGCTATACACCAGGAGCGAGCGTAAATTTGTAGCGGTCAGATTAACTTGCTGGGAACGAACCAT' \ | |
'TGTCGCCCTCTGCAGCAAGTTAGTTGGCATCATTGGTACTGCCCTTCACTGGTAGCAGCTCCCCCTGTAATATATCCGTGGCCACTATTCAAGGGCTCAAATAGGCGA' \ | |
'CCCAGAGACCATTATAGGCGGTACAGCGCTGGTAGGTTTGCCTGGGCAGATATCGTTAGCCCCTTCTGCGCGCTATAAGATAGCGAAGGATAATTCTGCGGGACCA' \ | |
'TGGTCGTCTCCTAACCTCAGGGTGGGATTCCTGGCAGGTGGACCGGGCGCGCATCGAGAGCATTCGGGGTTCCTACCAGCCAGGGAAATCGGGTCGACCACTAGGCAA' \ | |
'TGAGCGGCTCACACCGATTTTCTTAAGAGACGTAACAAAGCCCGCATTAACGGCTGGAGTGAATCACCGTACGACTACCTAAGCCTCATTGGGATCCACTGTAAACCC' \ | |
'CTTCGCCGGTGTTGGGTGTCCGCAACGCCTCTGCTTTTTGCGTACAGTCGGCGTGGTGGAGTCCGCGGCCATACTGGCGGTTGGTTTGTAGAACAGTGTAACGACGTG' \ | |
'TGTCACTGCCCCCCGTAGCTTCTATTGCCCTGTTTGGGAGGTTCTATAGGGGTTACAGAGTAGTTTTAAGTTTTAGCACGACAGCACCAGTATTGCCAGTGACGCCGT' \ | |
'TGAGGCCGCAAAAGTGATTAACCCCCGTGGGACCGGATACGTTCCCAGCGGCAATCCTTGTCTTACCGCCGGACTGCGGAGCGAAGGGAGAAGTAACCGTGGTAATTA' | |
print(dna) # All on one line |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
dna = """ACGACGGATACGCGGGAGCTATTCATCTGTGTTGAGAAACACCGGAGAACTTATTGGTCTGTCAAGATTGCGACTGTGGTATAGCTCACCCGGTCGCGGCTTTCTAGT' \ | |
'TAGTGGCCAGCTCCCGTGTATTTGGAAGCTGAGAGAAGGACCCCTGTGGTTCGAATCAGCTCACGAGCGCTGGCACACCGCAATCAGCCGGCTAATAAAATTCGTACG' \ | |
'GACTGCCCCACACAAGAAGACGGTAAATTTATCAACACTATAGTTGCTATACACCAGGAGCGAGCGTAAATTTGTAGCGGTCAGATTAACTTGCTGGGAACGAACCAT' \ | |
'TGTCGCCCTCTGCAGCAAGTTAGTTGGCATCATTGGTACTGCCCTTCACTGGTAGCAGCTCCCCCTGTAATATATCCGTGGCCACTATTCAAGGGCTCAAATAGGCGA' \ | |
'CCCAGAGACCATTATAGGCGGTACAGCGCTGGTAGGTTTGCCTGGGCAGATATCGTTAGCCCCTTCTGCGCGCTATAAGATAGCGAAGGATAATTCTGCGGGACCA' \ | |
'TGGTCGTCTCCTAACCTCAGGGTGGGATTCCTGGCAGGTGGACCGGGCGCGCATCGAGAGCATTCGGGGTTCCTACCAGCCAGGGAAATCGGGTCGACCACTAGGCAA' \ | |
'TGAGCGGCTCACACCGATTTTCTTAAGAGACGTAACAAAGCCCGCATTAACGGCTGGAGTGAATCACCGTACGACTACCTAAGCCTCATTGGGATCCACTGTAAACCC' \ | |
'CTTCGCCGGTGTTGGGTGTCCGCAACGCCTCTGCTTTTTGCGTACAGTCGGCGTGGTGGAGTCCGCGGCCATACTGGCGGTTGGTTTGTAGAACAGTGTAACGACGTG' \ | |
'TGTCACTGCCCCCCGTAGCTTCTATTGCCCTGTTTGGGAGGTTCTATAGGGGTTACAGAGTAGTTTTAAGTTTTAGCACGACAGCACCAGTATTGCCAGTGACGCCGT' \ | |
'TGAGGCCGCAAAAGTGATTAACCCCCGTGGGACCGGATACGTTCCCAGCGGCAATCCTTGTCTTACCGCCGGACTGCGGAGCGAAGGGAGAAGTAACCGTGGTAATTA' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sources: | |
DNA structure & image; https://www.nature.com/scitable/topicpage/discovery-of-dna-structure-and-function-watson-397/ | |
Generate random DNA sequences http://bioinformatics.org/sms2/random_dna.html | |
https://en.wikipedia.org/wiki/Start_codon | |
General DNA info https://www.coursera.org/learn/dna-decoded | |
Learning Bioinformatics - solving biology problems with programming: https://www.coursera.org/learn/bioinformatics | |
Python bioinformatics problems: http://rosalind.info |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def other_strand(dna):
    """Return the bases that pair with each base of *dna*, in the same order.

    A pairs with T, T pairs with A, C pairs with G and G pairs with C.

    Args:
        dna: String of bases for one strand, using upper-case A, C, G, T.

    Returns:
        A string holding the partner base for every recognised base in
        *dna*. Any character that is not an upper-case A, C, G or T is
        skipped, so the result can be shorter than the input.
    """
    pairs = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    # Build the result with a single join rather than growing a string one
    # character at a time, which copies the whole string on every step.
    return ''.join(pairs[base] for base in dna if base in pairs)
# One side of each DNA molecule; how can you make the other side?
dna1 = 'ACCAGTACCAGTGT'
dna2 = 'GTACACCAGGTCTA'

# Pairing rules: A <-> T and C <-> G.
# Expected first line of output (the partner of dna1): TGGTCATGGTCACA
print(other_strand(dna1), other_strand(dna2), sep='\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment