bryanyang0528 · July 13, 2014 06:03
diff --git a/gistfile1.py b/gistfile1.py
 import codecs
 #處理編碼的套件
 import operator
 ##處理字典檔排序的套件

 # Read the text file, decoding it from UTF-8 into unicode text.
 # The context manager closes the file even if reading fails; `text` stays
 # bound to the (now closed) file object after the block.
 with codecs.open("text.txt","r","utf-8") as text:
     # Splice all lines into one continuous string by dropping every "\n".
     # Any future preprocessing (e.g. stripping punctuation) belongs here.
     text_new = text.read().replace('\n', '')

 def ngram(text,n):
     """Count every contiguous n-character substring of ``text``.

     Args:
         text: The string to scan (already preprocessed by the caller).
         n: Length of each substring (n-gram) in characters; intended to be
             a positive integer.

     Returns:
         A list of ``(substring, count)`` tuples sorted by count, highest
         first.  The list is empty when ``text`` is shorter than ``n``.
     """
     words_freq = {}  # maps substring -> number of occurrences

     # Slide a window of width n over the text; there are len(text)-(n-1)
     # start positions.  Counting while scanning keeps this a single pass
     # instead of calling list.count() once per distinct substring.
     for start in range(len(text) - (n - 1)):
         word = text[start:start + n]
         words_freq[word] = words_freq.get(word, 0) + 1

     # items() (rather than iteritems()) works on both Python 2 and 3.
     # The dict is turned into a list of pairs ordered by descending count.
     return sorted(words_freq.items(), key=operator.itemgetter(1), reverse=True)

 words_freqs = ngram(text_new,3)

 for i in words_freqs:
    print i[0],i[1]
    
 '''
 道：" 35
 笑道： 13
 "那僧 9
 聽了， 8
 "士隱 8
 。士隱 7
 。"那 7
 那僧道 6
 ....
 '''

 words_freqs = ngram(text_new,2)

 for i in words_freqs:
    print i[0],i[1]
    
 '''
 ：" 45
 道： 36
 士隱 33
 雨村 25
 ，不 24
 。" 22
 那僧 17
 ，便 16
 ...
 '''
	import codecs
	#處理編碼的套件
	import operator
	##處理字典檔排序的套件

	# Read the text file, decoding it from UTF-8 into unicode text.
	# The context manager closes the file even if reading fails; `text` stays
	# bound to the (now closed) file object after the block.
	with codecs.open("text.txt","r","utf-8") as text:
	    # Splice all lines into one continuous string by dropping every "\n".
	    # Any future preprocessing (e.g. stripping punctuation) belongs here.
	    text_new = text.read().replace('\n', '')

	def ngram(text,n):
	    """Count every contiguous n-character substring of ``text``.

	    Args:
	        text: The string to scan (already preprocessed by the caller).
	        n: Length of each substring (n-gram) in characters; intended to be
	            a positive integer.

	    Returns:
	        A list of ``(substring, count)`` tuples sorted by count, highest
	        first.  The list is empty when ``text`` is shorter than ``n``.
	    """
	    words_freq = {}  # maps substring -> number of occurrences

	    # Slide a window of width n over the text; there are len(text)-(n-1)
	    # start positions.  Counting while scanning keeps this a single pass
	    # instead of calling list.count() once per distinct substring.
	    for start in range(len(text) - (n - 1)):
	        word = text[start:start + n]
	        words_freq[word] = words_freq.get(word, 0) + 1

	    # items() (rather than iteritems()) works on both Python 2 and 3.
	    # The dict is turned into a list of pairs ordered by descending count.
	    return sorted(words_freq.items(), key=operator.itemgetter(1), reverse=True)

	words_freqs = ngram(text_new,3)

	for i in words_freqs:
	print i[0],i[1]

	'''
	道：" 35
	笑道： 13
	"那僧 9
	聽了， 8
	"士隱 8
	。士隱 7
	。"那 7
	那僧道 6
	....
	'''

	words_freqs = ngram(text_new,2)

	for i in words_freqs:
	print i[0],i[1]

	'''
	：" 45
	道： 36
	士隱 33
	雨村 25
	，不 24
	。" 22
	那僧 17
	，便 16
	...
	'''