Using Python for Research: Case Studies 1 & 2
def read_seq(inputfile):
    """Read and return the input sequence with special characters removed."""
    with open(inputfile, "r") as f:
        seq = f.read()
    seq = seq.replace("\n", "")
    seq = seq.replace("\r", "")
    return seq
table = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
    'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
    'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
}
def translate(seq):
    """Translate a DNA sequence into its protein sequence."""
    protein = ""
    if len(seq) % 3 == 0:
        for i in range(0, len(seq), 3):
            codon = seq[i:i+3]
            protein += table[codon]
    return protein
dna = read_seq("dna.txt")
prt = read_seq("protein.txt")
# Check that the translation matches the downloaded protein sequence.
print(prt == translate(dna[20:938])[:-1])
# When NCBI provides the CDS, the DNA segment it gives ends with a stop codon,
# so that final symbol has to be removed.
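
# An equivalent check (a minimal sketch, relying on the comment above that the
# last three bases of the 20:938 slice are the stop codon): stop the slice three
# bases earlier so the stop codon is never translated, instead of dropping the
# trailing "_" from the protein string.
print(prt == translate(dna[20:935]))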
import string

alphabet = " " + string.ascii_lowercase
positions = {}
for i in range(27):
    positions[alphabet[i]] = i

message = "hi my name is caesar"
encoded_message = ""
for char in message:
    # Parentheses matter: without them, "% 27" binds to 1 and the wrap-around is lost.
    encoded_message += alphabet[(positions[char] + 1) % 27]
def encoding(message, key):
    """Shift every character of message by key positions in the 27-character alphabet."""
    encoding_list = []
    for char in message:
        position = positions[char]
        encoded_position = (position + key) % 27
        encoding_list.append(alphabet[encoded_position])
    encoded_string = "".join(encoding_list)
    return encoded_string

encoded_message = encoding(message, 3)
print(encoded_message)
decoded_message = encoding(encoded_message, -3)
# print your decoded message here!
print(decoded_message)
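
# A quick sanity check (sketch): encoding with key 3 and then with key -3
# should round-trip back to the original message.
print(decoded_message == message)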
from collections import Counter

text = "This is my test text. We're friends."

def count_words(text):
    """Count how often each word appears in text, ignoring case and punctuation."""
    text = text.lower()
    skips = [".", ",", ";", ":", "'", '"']
    for ch in skips:
        text = text.replace(ch, "")  # str.replace returns a new string; reassign it
    word_counts = {}
    for word in text.split(" "):
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1
    return word_counts

def count_words_fast(text):
    """Same as count_words, but let collections.Counter do the counting."""
    text = text.lower()
    skips = [".", ",", ";", ":", "'", '"']
    for ch in skips:
        text = text.replace(ch, "")
    word_counts = Counter(text.split(" "))
    return word_counts

print(count_words(text) == count_words_fast(text))
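
# A small usage sketch: the Counter returned by count_words_fast can list the
# most frequent words directly, e.g. the top three in the test sentence above.
print(count_words_fast(text).most_common(3))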
def read_book(title_path):
    """
    Read a book and return it as a string.
    """
    with open(title_path, "r", encoding="utf8") as current_file:
        text = current_file.read()
        text = text.replace("\n", "").replace("\r", "")
    return text

def word_stats(word_counts):
    """Return the number of unique words and their individual counts."""
    num_unique = len(word_counts)
    counts = word_counts.values()
    return (num_unique, counts)

text = read_book("filepath")
word_counts = count_words(text)
(num_unique, counts) = word_stats(word_counts)
print(num_unique, sum(counts))  # how many distinct words there are, and the total number of words
import os
import pandas as pd

book_dir = "./Books"
stats = pd.DataFrame(columns=("language", "author", "title", "length", "unique"))
title_num = 1
for language in os.listdir(book_dir):  # os.listdir returns every file name under the path
    for author in os.listdir(book_dir + "/" + language):
        for title in os.listdir(book_dir + "/" + language + "/" + author):
            inputfile = book_dir + "/" + language + "/" + author + "/" + title
            # print(inputfile)
            text = read_book(inputfile)
            (num_unique, counts) = word_stats(count_words(text))
            stats.loc[title_num] = language, author.capitalize(), title.replace(".txt", ""), sum(counts), num_unique
            title_num += 1
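
# A quick look at the assembled table (a sketch; it assumes the ./Books
# directory layout language/author/title.txt used in the loop above).
print(stats.head())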
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 10))
subset = stats[stats.language == "English"]
plt.loglog(subset.length, subset.unique, 'o', label="English", color="crimson")
subset = stats[stats.language == "French"]
plt.loglog(subset.length, subset.unique, 'o', label="French", color="forestgreen")
subset = stats[stats.language == "German"]
plt.loglog(subset.length, subset.unique, 'o', label="German", color="orange")
subset = stats[stats.language == "Portuguese"]
plt.loglog(subset.length, subset.unique, 'o', label="Portuguese", color="blueviolet")
plt.legend()
plt.xlabel("Book length")
plt.ylabel("Number of unique words")
plt.savefig("lang_plt.png")