-
-
Save tahwaru/a1a4be817211503823fb4a5426ee9851 to your computer and use it in GitHub Desktop.
Using Python for Research: Case Studies 1 & 2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def read_seq(inputfile):
    """Read a sequence file and return its contents as one contiguous string.

    Args:
        inputfile: Path of the text file to read.

    Returns:
        The file contents with all newline and carriage-return characters
        removed.
    """
    with open(inputfile, "r") as f:
        seq = f.read()
    # Drop line breaks so the sequence can be sliced by character position.
    return seq.replace("\n", "").replace("\r", "")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Codon table: maps each three-letter DNA codon to a one-letter amino-acid
# code. "_" is the marker used for the stop codons (TAA, TAG, TGA).
table = {
    'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
    'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
    'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
    'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
    'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
    'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
    'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
    'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
    'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
    'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
    'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
    'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': '_',
    'TGC': 'C', 'TGT': 'C', 'TGA': '_', 'TGG': 'W',
}


def translate(seq):
    """Translate a DNA sequence into a string of amino-acid codes.

    The sequence is read in consecutive, non-overlapping three-letter codons
    starting at index 0, and each codon is looked up in ``table``.

    Args:
        seq: DNA string made of the upper-case letters used as keys in
            ``table``.

    Returns:
        The translated protein string. An empty string is returned when
        ``len(seq)`` is not a multiple of 3 (no partial translation is
        attempted), and also for an empty input.

    Raises:
        KeyError: If a codon is not a key of ``table`` (for example
            lower-case input or ambiguous bases).
    """
    if len(seq) % 3 != 0:
        # Kept from the original behaviour: incomplete reading frames yield "".
        return ""
    # join() avoids repeated string concatenation inside the loop.
    return "".join(table[seq[i:i + 3]] for i in range(0, len(seq), 3))
dna = read_seq("dna.txt")
prt = read_seq("protein.txt")
# Check accuracy: the translated coding region should equal the reference
# protein.
# NCBI's CDS range includes the terminating stop codon at its end, so the
# last translated symbol (the stop marker) is dropped before comparing.
print(prt == translate(dna[20:938])[:-1])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string

# Cipher alphabet: a space followed by the 26 lower-case ASCII letters.
alphabet = " " + string.ascii_lowercase
# Map every alphabet character to its index in `alphabet`.
positions = {char: index for index, char in enumerate(alphabet)}

message = "hi my name is caesar"
# Shift every character one position forward. The sum must be parenthesised:
# `positions[c] + 1 % 27` parses as `positions[c] + (1 % 27)`, which never
# wraps around and raises IndexError for "z".
encoded_message = "".join(
    alphabet[(positions[char] + 1) % len(alphabet)] for char in message
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def encoding(message, key):
    """Shift every character of ``message`` by ``key`` alphabet positions.

    Uses the module-level ``alphabet`` string and ``positions`` mapping
    (character -> index in ``alphabet``). Shifts wrap around the end of the
    alphabet, and a negative ``key`` shifts backwards, so
    ``encoding(encoding(m, k), -k) == m``.

    Args:
        message: Text whose characters are all keys of ``positions``.
        key: Integer shift; may be negative.

    Returns:
        The shifted message as a string.

    Raises:
        KeyError: If ``message`` contains a character not in ``positions``.
    """
    # Derive the modulus from the alphabet instead of hard-coding 27.
    size = len(alphabet)
    return "".join(alphabet[(positions[char] + key) % size] for char in message)
encoded_message = encoding(message, 3)
print(encoded_message)
# Encoding again with the negated key reverses the shift.
decoded_message = encoding(encoded_message, -3)
print(decoded_message)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter

# Small sample used to compare count_words and count_words_fast.
text = "This is my test text. We're friends."
def count_words(text):
    """Count how often each word occurs in ``text``.

    The text is lower-cased, the punctuation characters ``. , ; : ' "`` are
    removed, and the result is split on single spaces.

    Args:
        text: The string to analyse.

    Returns:
        A dict mapping each word to its number of occurrences. Because the
        split is on single spaces, runs of spaces (and an empty input)
        produce an empty-string "word".
    """
    text = text.lower()
    skips = [".", ",", ";", ":", "'", '"']
    for ch in skips:
        # str.replace returns a new string; without the reassignment the
        # punctuation was never actually removed.
        text = text.replace(ch, "")
    word_counts = {}
    for word in text.split(" "):
        word_counts[word] = word_counts.get(word, 0) + 1
    return word_counts
def count_words_fast(text):
    """Count how often each word occurs in ``text`` using ``Counter``.

    Applies the same normalisation as ``count_words``: the text is
    lower-cased, the punctuation characters ``. , ; : ' "`` are removed, and
    the result is split on single spaces.

    Args:
        text: The string to analyse.

    Returns:
        A ``collections.Counter`` mapping each word to its number of
        occurrences.
    """
    text = text.lower()
    skips = [".", ",", ";", ":", "'", '"']
    for ch in skips:
        # str.replace returns a new string; without the reassignment the
        # punctuation was never actually removed.
        text = text.replace(ch, "")
    return Counter(text.split(" "))
# Both implementations should produce the same word counts.
print(count_words(text) == count_words_fast(text))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def read_book(title_path):
    """Read a book and return it as a single string.

    Args:
        title_path: Path of the UTF-8 encoded text file to read.

    Returns:
        The file contents with all newline and carriage-return characters
        removed.
    """
    with open(title_path, "r", encoding="utf8") as current_file:
        text = current_file.read()
    # NOTE(review): line breaks are deleted rather than replaced by a space,
    # so the last word of one line is joined to the first word of the next.
    # Confirm this is intended before relying on the word counts.
    return text.replace("\n", "").replace("\r", "")
def word_stats(word_counts):
    """Return summary statistics for a word-count mapping.

    Args:
        word_counts: Mapping of word -> number of occurrences, such as the
            result of ``count_words``.

    Returns:
        A tuple ``(num_unique, counts)`` where ``num_unique`` is the number
        of distinct words and ``counts`` is the mapping's values view (the
        per-word frequencies).
    """
    num_unique = len(word_counts)
    counts = word_counts.values()
    return (num_unique, counts)
text = read_book("filepath")
word_counts = count_words(text)
(num_unique, counts) = word_stats(word_counts)
# Number of distinct words, followed by the total number of words.
print(num_unique, sum(counts))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

import pandas as pd

book_dir = "./Books"
# One row per book: language, author, title, total word count, distinct words.
stats = pd.DataFrame(columns=('language', 'author', 'title', 'length', 'unique'))
title_num = 1
# The loops walk book_dir/<language>/<author>/<title>; os.listdir returns
# the names of all entries in a directory.
for language in os.listdir(book_dir):
    language_dir = os.path.join(book_dir, language)
    for author in os.listdir(language_dir):
        author_dir = os.path.join(language_dir, author)
        for title in os.listdir(author_dir):
            inputfile = os.path.join(author_dir, title)
            text = read_book(inputfile)
            (num_unique, counts) = word_stats(count_words(text))
            # Rows are labelled 1, 2, 3, ... in the order books are visited.
            stats.loc[title_num] = (
                language,
                author.capitalize(),
                title.replace(".txt", ""),
                sum(counts),
                num_unique,
            )
            title_num += 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 10))
# One log-log scatter series per language: book length against the number of
# unique words, each in its own colour.
for language, color in (
    ("English", "crimson"),
    ("French", "forestgreen"),
    ("German", "orange"),
    ("Portuguese", "blueviolet"),
):
    subset = stats[stats.language == language]
    plt.loglog(subset.length, subset.unique, 'o', label=language, color=color)
plt.legend()
plt.xlabel("Book length")
plt.ylabel("Number of unique words")
plt.savefig("lang_plt.png")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment