Skip to content

Instantly share code, notes, and snippets.

@bryanyang0528
Created July 13, 2014 06:03
Show Gist options
  • Save bryanyang0528/42aebdd0e5b97480da94 to your computer and use it in GitHub Desktop.
Save bryanyang0528/42aebdd0e5b97480da94 to your computer and use it in GitHub Desktop.
N-GRAM
import codecs
#處理編碼的套件
import operator
##處理字典檔排序的套件
# Read the UTF-8 encoded input file. The context manager guarantees the
# handle is closed once the contents have been loaded.
with codecs.open("text.txt", "r", "utf-8") as text:
    # Join all lines into one continuous string by dropping the line feeds.
    # Any further preprocessing (e.g. stripping punctuation) belongs here.
    text_new = "".join(line.replace("\n", "") for line in text.readlines())
def ngram(text, n):
    """Count every overlapping n-character substring of *text*.

    Args:
        text: The string to analyse.
        n: Length, in characters, of each n-gram; must be at least 1.

    Returns:
        A list of ``(ngram, count)`` tuples sorted by count, highest first.
        The list is empty when *text* is shorter than *n*.

    Raises:
        ValueError: If *n* is smaller than 1.
    """
    # Imported locally so the function does not depend on a file-level import.
    from collections import Counter

    if n < 1:
        raise ValueError("n must be a positive integer, got %r" % (n,))
    # A window of width n fits at len(text) - (n - 1) start offsets; range()
    # is empty when the text is shorter than n. Counting in a single pass
    # avoids rescanning the whole list once per distinct n-gram.
    words_freq = Counter(text[w:w + n] for w in range(len(text) - (n - 1)))
    # most_common() with no argument returns all (ngram, count) pairs as a
    # list ordered from most to least frequent.
    return words_freq.most_common()
# Report every 3-character n-gram with its count, most frequent first.
words_freqs = ngram(text_new, 3)
for word, count in words_freqs:
    # A single pre-formatted argument prints "word count" identically under
    # both the Python 2 print statement and the Python 3 print function.
    print("%s %s" % (word, count))
# Sample output of the loop above:
'''
道:" 35
笑道: 13
"那僧 9
聽了, 8
"士隱 8
。士隱 7
。"那 7
那僧道 6
....
'''
# Report every 2-character n-gram with its count, most frequent first.
words_freqs = ngram(text_new, 2)
for word, count in words_freqs:
    # A single pre-formatted argument prints "word count" identically under
    # both the Python 2 print statement and the Python 3 print function.
    print("%s %s" % (word, count))
# Sample output of the loop above:
'''
:" 45
道: 36
士隱 33
雨村 25
,不 24
。" 22
那僧 17
,便 16
...
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment