Created
July 13, 2014 06:03
-
-
Save bryanyang0528/42aebdd0e5b97480da94 to your computer and use it in GitHub Desktop.
N-GRAM
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import codecs  # encoding-aware file reading
import operator  # item getters used as sort keys for the frequency table
from collections import Counter  # n-gram frequency counting
# Read the corpus saved as a text file; codecs.open decodes it as UTF-8.
# The handle keeps its module-level name `text`, and the `with` block makes
# sure it is closed as soon as the content has been read.
with codecs.open("text.txt", "r", "utf-8") as text:
    # Join the lines of the article into one continuous string by dropping
    # the "\n" separators. Any further clean-up (for example stripping
    # punctuation) would also belong here.
    text_new = text.read().replace("\n", "")
def ngram(text, n):
    """Count every run of ``n`` consecutive characters in ``text``.

    A window of length ``n`` is slid over ``text`` one position at a time and
    each distinct slice is tallied.

    Args:
        text: The preprocessed text to scan.
        n: Length of each n-gram, in characters. Must be at least 1.

    Returns:
        A list of ``(ngram, count)`` tuples sorted by count, highest first.
        The list is empty when ``text`` is shorter than ``n``.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    # A non-positive length would make the slices below empty or nonsensical,
    # so reject it up front instead of returning meaningless counts.
    if n < 1:
        raise ValueError("n must be a positive integer, got %r" % (n,))

    # One pass over the text: Counter tallies each window as it is produced,
    # avoiding a full list rescan per distinct n-gram.
    counts = Counter(
        text[start:start + n] for start in range(len(text) - n + 1)
    )

    # Turn the mapping into a list of pairs ordered by frequency.
    return sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
# Count every 3-character sequence in the corpus and print one
# "<ngram> <count>" line per entry, most frequent first.
words_freqs = ngram(text_new, 3)
for word, count in words_freqs:
    # A single pre-formatted string prints the same text whether `print` is
    # the Python 2 statement or the Python 3 function.
    print("%s %s" % (word, count))
# Sample output (truncated):
#   道:" 35
#   笑道: 13
#   "那僧 9
#   聽了, 8
#   "士隱 8
#   。士隱 7
#   。"那 7
#   那僧道 6
#   ....
# Count every 2-character sequence in the corpus and print one
# "<ngram> <count>" line per entry, most frequent first.
words_freqs = ngram(text_new, 2)
for word, count in words_freqs:
    # A single pre-formatted string prints the same text whether `print` is
    # the Python 2 statement or the Python 3 function.
    print("%s %s" % (word, count))
# Sample output (truncated):
#   :" 45
#   道: 36
#   士隱 33
#   雨村 25
#   ,不 24
#   。" 22
#   那僧 17
#   ,便 16
#   ...
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment