tahwaru · February 27, 2021 11:54
diff --git a/01_read.py b/01_read.py
 def read_seq(inputfile):
     """Read a sequence file and return its text without line-break characters.

     Args:
         inputfile: Path of the text file to read.

     Returns:
         The file content with every newline and carriage-return character
         removed.
     """
     with open(inputfile, 'r') as handle:
         raw = handle.read()
     # Delete both kinds of line-break character in a single pass.
     return raw.translate({ord("\n"): None, ord("\r"): None})
diff --git a/02_translate.py b/02_translate.py
 # Codon table: maps each three-letter DNA codon to a one-letter amino acid.
 # The stop codons TAA, TAG and TGA are represented by '_'.
 table = {
     'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
     'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
     'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
     'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
     'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
     'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
     'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
     'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
     'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
     'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
     'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
     'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
     'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
     'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
     'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
     'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
 }


 def translate(seq):
     """Translate a DNA sequence into a string of one-letter amino acids.

     Args:
         seq: DNA string whose codons are keys of ``table`` (upper-case
             A/C/G/T).

     Returns:
         One character per codon ('_' for a stop codon). An empty string is
         returned when the length of ``seq`` is not a multiple of three.

     Raises:
         KeyError: If a codon of ``seq`` is not a key of ``table``.
     """
     # A sequence that cannot be split into whole codons yields no protein.
     if len(seq) % 3 != 0:
         return ""
     # Build the result with one join instead of growing a string per codon.
     return "".join(table[seq[i:i + 3]] for i in range(0, len(seq), 3))


 # Load the DNA sequence and the protein sequence it is compared with.
 dna=read_seq("dna.txt")
 prt=read_seq("protein.txt")

 # Check accuracy: translate dna[20:938] and compare it with the protein sequence.
 print (prt == translate(dna[20:938])[:-1])
 # The CDS that NCBI provides ends with a stop codon, so the last translated
 # character is dropped ([:-1]) before comparing.
diff --git a/03_hw1-1.py b/03_hw1-1.py
 import string
 # Cipher alphabet: a space followed by the 26 lower-case ASCII letters.
 alphabet = " " + string.ascii_lowercase

 # Map every character of the alphabet to its index (' ' -> 0, ..., 'z' -> 26).
 positions = {char: index for index, char in enumerate(alphabet)}

 message = "hi my name is caesar"

 # Shift every character one position forward, wrapping around at the end.
 # The parentheses matter: without them ``1 % 27`` is evaluated first, so the
 # modulo never wraps and 'z' would index past the end of the alphabet.
 encoded_message = "".join(
     alphabet[(positions[char] + 1) % len(alphabet)] for char in message
 )
diff --git a/03_hw1-2.py b/03_hw1-2.py
 def encoding(message,key):
     """Shift every character of ``message`` by ``key`` alphabet positions.

     Relies on the module-level ``alphabet`` string and ``positions`` mapping.

     Args:
         message: Text whose characters are all keys of ``positions``.
         key: Number of positions to shift; a negative value shifts backwards.

     Returns:
         The shifted text as a new string.
     """
     return "".join(
         alphabet[(positions[symbol] + key) % 27] for symbol in message
     )

 # Encode the message with a shift of 3.
 encoded_message=encoding(message,3)

 print (encoded_message)

 # Applying the opposite shift (-3) to the encoded text decodes it.
 decoded_message = encoding(encoded_message,-3)

 # print your decoded message here!
 print (decoded_message)
diff --git a/04_count_words.py b/04_count_words.py
 from collections import Counter
 text="This is my test text. We're friends."


 def _clean_text(text):
     """Return ``text`` lower-cased with the skipped punctuation removed."""
     text = text.lower()
     for mark in [".", ",", ";", ":", "'", '"']:
         # str.replace returns a new string; the result must be kept,
         # otherwise the punctuation is never actually removed.
         text = text.replace(mark, "")
     return text


 def count_words(text):
     """Count how often each word occurs in ``text``.

     The text is lower-cased, the characters . , ; : ' " are removed and the
     result is split on single spaces (so consecutive spaces produce
     empty-string entries).

     Args:
         text: The string to analyse.

     Returns:
         A dict mapping each word to its number of occurrences.
     """
     word_counts = {}
     for word in _clean_text(text).split(" "):
         word_counts[word] = word_counts.get(word, 0) + 1
     return word_counts


 def count_words_fast(text):
     """Count words like ``count_words`` but using ``collections.Counter``.

     Args:
         text: The string to analyse.

     Returns:
         A Counter mapping each word to its number of occurrences.
     """
     return Counter(_clean_text(text).split(" "))


 # Both implementations must agree on the sample text.
 print (count_words(text)==count_words_fast(text))
diff --git a/05_read_and_stats.py b/05_read_and_stats.py
 def read_book(title_path):
     """Load a book from disk as one string without line breaks.

     Args:
         title_path: Path of a UTF-8 encoded text file.

     Returns:
         The file content with all newline and carriage-return characters
         removed.
     """
     with open(title_path, "r", encoding="utf8") as book_file:
         content = book_file.read()
     # Strip both line-break characters in one pass.
     return content.translate(str.maketrans("", "", "\n\r"))

 def word_stats(word_counts):
     """Summarise a word-count mapping.

     Args:
         word_counts: Mapping from word to number of occurrences.

     Returns:
         A tuple ``(num_unique, counts)``: the number of keys in the mapping
         and the view of its values.
     """
     return len(word_counts), word_counts.values()


 # Read the book at the given path, count its words and summarise the counts.
 text=read_book("filepath")
 word_counts=count_words(text)
 (num_unique, counts)=word_stats(word_counts)
 print (num_unique, sum(counts)) ## number of distinct words, and total number of words
diff --git a/06_books.py b/06_books.py
 import os
 book_dir = "./Books"

 import pandas as pd
 # One row per book; rows are labelled 1, 2, 3, ... in the order they are read.
 stats = pd.DataFrame(columns=('language', 'author', 'title', 'length', 'unique'))
 title_num = 1

 # Directory layout walked below: <book_dir>/<language>/<author>/<title>
 for language in os.listdir(book_dir):  # os.listdir yields every entry name under a path
     for author in os.listdir('/'.join((book_dir, language))):
         for title in os.listdir('/'.join((book_dir, language, author))):
             inputfile = '/'.join((book_dir, language, author, title))
             text = read_book(inputfile)
             (num_unique, counts) = word_stats(count_words(text))
             # length = total number of words, unique = number of distinct words
             stats.loc[title_num] = (
                 language,
                 author.capitalize(),
                 title.replace(".txt", ""),
                 sum(counts),
                 num_unique,
             )
             title_num += 1
diff --git a/07_plot.py b/07_plot.py
 import matplotlib.pyplot as plt

 plt.figure(figsize=(10,10))
 # One log-log scatter series per language, each drawn in its own colour.
 for language_name, colour in (
     ("English", "crimson"),
     ("French", "forestgreen"),
     ("German", "orange"),
     ("Portuguese", "blueviolet"),
 ):
     subset = stats[stats.language == language_name]
     plt.loglog(subset.length, subset.unique, 'o', label=language_name, color=colour)
 plt.legend()
 plt.xlabel("Book length")
 plt.ylabel("Number of unique words")
 plt.savefig("lang_plt.png")
	def read_seq(inputfile):
	    """Read a sequence file and return its text without line-break characters.

	    Args:
	        inputfile: Path of the text file to read.

	    Returns:
	        The file content with every newline and carriage-return character
	        removed.
	    """
	    # The nested indentation of this copy was lost; it is restored here.
	    with open(inputfile, 'r') as handle:
	        raw = handle.read()
	    # Delete both kinds of line-break character in a single pass.
	    return raw.translate({ord("\n"): None, ord("\r"): None})
	# Codon table: maps each three-letter DNA codon to a one-letter amino acid.
	# The stop codons TAA, TAG and TGA are represented by '_'.
	table = {
	    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
	    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
	    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
	    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
	    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
	    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
	    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
	    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
	    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
	    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
	    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
	    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
	    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
	    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
	    'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
	    'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
	}


	def translate(seq):
	    """Translate a DNA sequence into a string of one-letter amino acids.

	    Args:
	        seq: DNA string whose codons are keys of ``table`` (upper-case
	            A/C/G/T).

	    Returns:
	        One character per codon ('_' for a stop codon). An empty string is
	        returned when the length of ``seq`` is not a multiple of three.

	    Raises:
	        KeyError: If a codon of ``seq`` is not a key of ``table``.
	    """
	    # A sequence that cannot be split into whole codons yields no protein.
	    if len(seq) % 3 != 0:
	        return ""
	    # Build the result with one join instead of growing a string per codon.
	    return "".join(table[seq[i:i + 3]] for i in range(0, len(seq), 3))


	# Load the DNA sequence and the protein sequence it is compared with.
	dna=read_seq("dna.txt")
	prt=read_seq("protein.txt")

	# Check accuracy: translate dna[20:938] and compare it with the protein sequence.
	print (prt == translate(dna[20:938])[:-1])
	# The CDS that NCBI provides ends with a stop codon, so the last translated
	# character is dropped ([:-1]) before comparing.
	import string
	# Cipher alphabet: a space followed by the 26 lower-case ASCII letters.
	alphabet = " " + string.ascii_lowercase

	# Map every character of the alphabet to its index (' ' -> 0, ..., 'z' -> 26).
	positions = {char: index for index, char in enumerate(alphabet)}

	message = "hi my name is caesar"

	# Shift every character one position forward, wrapping around at the end.
	# The parentheses matter: without them ``1 % 27`` is evaluated first, so the
	# modulo never wraps and 'z' would index past the end of the alphabet.
	encoded_message = "".join(
	    alphabet[(positions[char] + 1) % len(alphabet)] for char in message
	)
	def encoding(message,key):
	    """Shift every character of ``message`` by ``key`` alphabet positions.

	    Relies on the module-level ``alphabet`` string and ``positions`` mapping.

	    Args:
	        message: Text whose characters are all keys of ``positions``.
	        key: Number of positions to shift; a negative value shifts backwards.

	    Returns:
	        The shifted text as a new string.
	    """
	    # The nested indentation of this copy was lost; it is restored here.
	    return "".join(
	        alphabet[(positions[symbol] + key) % 27] for symbol in message
	    )

	# Encode the message with a shift of 3.
	encoded_message=encoding(message,3)

	print (encoded_message)

	# Applying the opposite shift (-3) to the encoded text decodes it.
	decoded_message = encoding(encoded_message,-3)

	# print your decoded message here!
	print (decoded_message)
	from collections import Counter
	text="This is my test text. We're friends."


	def _clean_text(text):
	    """Return ``text`` lower-cased with the skipped punctuation removed."""
	    text = text.lower()
	    for mark in [".", ",", ";", ":", "'", '"']:
	        # str.replace returns a new string; the result must be kept,
	        # otherwise the punctuation is never actually removed.
	        text = text.replace(mark, "")
	    return text


	def count_words(text):
	    """Count how often each word occurs in ``text``.

	    The text is lower-cased, the characters . , ; : ' " are removed and the
	    result is split on single spaces (so consecutive spaces produce
	    empty-string entries).

	    Args:
	        text: The string to analyse.

	    Returns:
	        A dict mapping each word to its number of occurrences.
	    """
	    word_counts = {}
	    for word in _clean_text(text).split(" "):
	        word_counts[word] = word_counts.get(word, 0) + 1
	    return word_counts


	def count_words_fast(text):
	    """Count words like ``count_words`` but using ``collections.Counter``.

	    Args:
	        text: The string to analyse.

	    Returns:
	        A Counter mapping each word to its number of occurrences.
	    """
	    return Counter(_clean_text(text).split(" "))


	# Both implementations must agree on the sample text.
	print (count_words(text)==count_words_fast(text))
	def read_book(title_path):
	    """Load a book from disk as one string without line breaks.

	    Args:
	        title_path: Path of a UTF-8 encoded text file.

	    Returns:
	        The file content with all newline and carriage-return characters
	        removed.
	    """
	    # The nested indentation of this copy was lost; it is restored here.
	    with open(title_path, "r", encoding="utf8") as book_file:
	        content = book_file.read()
	    # Strip both line-break characters in one pass.
	    return content.translate(str.maketrans("", "", "\n\r"))

	def word_stats(word_counts):
	    """Summarise a word-count mapping.

	    Args:
	        word_counts: Mapping from word to number of occurrences.

	    Returns:
	        A tuple ``(num_unique, counts)``: the number of keys in the mapping
	        and the view of its values.
	    """
	    # The nested indentation of this copy was lost; it is restored here.
	    return len(word_counts), word_counts.values()


	# Read the book at the given path, count its words and summarise the counts.
	text=read_book("filepath")
	word_counts=count_words(text)
	(num_unique, counts)=word_stats(word_counts)
	print (num_unique, sum(counts)) ## number of distinct words, and total number of words
	import os
	book_dir = "./Books"

	import pandas as pd
	# One row per book; rows are labelled 1, 2, 3, ... in the order they are read.
	stats = pd.DataFrame(columns=('language', 'author', 'title', 'length', 'unique'))
	title_num = 1

	# Directory layout walked below: <book_dir>/<language>/<author>/<title>
	# (the loop nesting of this copy was lost and is restored here).
	for language in os.listdir(book_dir):  # os.listdir yields every entry name under a path
	    for author in os.listdir('/'.join((book_dir, language))):
	        for title in os.listdir('/'.join((book_dir, language, author))):
	            inputfile = '/'.join((book_dir, language, author, title))
	            text = read_book(inputfile)
	            (num_unique, counts) = word_stats(count_words(text))
	            # length = total number of words, unique = number of distinct words
	            stats.loc[title_num] = (
	                language,
	                author.capitalize(),
	                title.replace(".txt", ""),
	                sum(counts),
	                num_unique,
	            )
	            title_num += 1
	import matplotlib.pyplot as plt

	plt.figure(figsize=(10,10))
	# One log-log scatter series per language, each drawn in its own colour.
	for language_name, colour in (
	    ("English", "crimson"),
	    ("French", "forestgreen"),
	    ("German", "orange"),
	    ("Portuguese", "blueviolet"),
	):
	    subset = stats[stats.language == language_name]
	    plt.loglog(subset.length, subset.unique, 'o', label=language_name, color=colour)
	plt.legend()
	plt.xlabel("Book length")
	plt.ylabel("Number of unique words")
	plt.savefig("lang_plt.png")