Created
July 16, 2014 11:34
-
-
Save bryanyang0528/fd83acb74ff5db1728a3 to your computer and use it in GitHub Desktop.
N-gram v2.0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import codecs | |
#處理編碼的套件 | |
import operator | |
##處理字典檔排序的套件 | |
# Characters that end a text fragment in cutSentence().
# The literal is written with escapes so that it stays ASCII-only: the
# full-width forms then survive any text normalisation of this file, and the
# same source is valid on Python 2 and Python 3.
# NOTE(review): the previous literal listed ':', ';' and ',' twice, which
# suggests the full-width variants had been collapsed to ASCII; both forms are
# listed here, plus the closing partners of the corner bracket and the single
# curly quote -- confirm against the intended delimiter set.
cutlist = (
    u"<>/:;,.!?\"'\\\n\r@#$%^&*()"             # ASCII punctuation, line breaks
    u"\u3001\u3002"                            # ideographic comma / full stop
    u"\u2018\u2019\u201c\u201d"                # curly single and double quotes
    u"\u300c\u300d\u300a\u300b"                # corner / double angle brackets
    u"\uff1a\uff1b\uff0c\uff01\uff1f\uff02"    # full-width : ; , ! ? "
)


def cutSentence(text_path, keywords):
    """Split a UTF-8 text file into fragments at every ``cutlist`` character.

    Args:
        text_path: Path of the UTF-8 encoded text file to read.
        keywords: Iterable of strings that are deleted from each line before
            it is split (longTermPriority passes the terms already accepted).

    Returns:
        A list of the non-empty text fragments, in file order. The end of a
        line always ends the current fragment.
    """
    textList = []
    # The context manager closes the file even if decoding fails part-way.
    with codecs.open(text_path, "r", "utf-8") as text:
        for line in text:
            line = line.strip()  # drop surrounding whitespace and the newline
            for keyword in keywords:
                # NOTE: deleting a keyword joins the text on either side of it.
                # str.replace also tolerates an empty keyword, unlike split().
                line = line.replace(keyword, u"")
            sentence = []
            for word in line:
                if word not in cutlist:
                    sentence.append(word)
                elif sentence:
                    # A delimiter closes the fragment collected so far;
                    # consecutive delimiters produce nothing.
                    textList.append(u"".join(sentence))
                    sentence = []
            # strip() removed the line break, so flush here: otherwise the
            # fragment would run into the next line and the last fragment of
            # the file would be lost.
            if sentence:
                textList.append(u"".join(sentence))
    return textList
def ngram(textLists, n, minFreq):
    """Count the n-character substrings of each fragment; keep frequent ones.

    Args:
        textLists: Iterable of text fragments (strings), such as the list
            returned by cutSentence.
        n: Length, in characters, of the substrings to extract.
        minFreq: Minimum number of occurrences a substring needs in order to
            be returned.

    Returns:
        A list of ``(substring, count)`` tuples sorted by count, highest
        first. Substrings with equal counts keep the order in which they
        first appeared.
    """
    words_freq = {}   # substring -> number of occurrences
    first_seen = []   # substrings in order of first appearance (tie order)
    for textList in textLists:
        # A fragment of length L has L - (n - 1) windows of length n; the
        # range is empty when the fragment is shorter than n.
        for w in range(len(textList) - (n - 1)):
            word = textList[w:w + n]
            if word in words_freq:
                words_freq[word] += 1
            else:
                words_freq[word] = 1
                first_seen.append(word)
    # sorted() is stable, also with reverse=True, so ties stay in first-seen
    # order instead of depending on dict iteration order.
    ranked = sorted(
        ((word, words_freq[word]) for word in first_seen),
        key=operator.itemgetter(1),
        reverse=True,
    )
    return [pair for pair in ranked if pair[1] >= minFreq]
def longTermPriority(path, maxTermLength, minFreq):
    """Collect frequent terms from a text file, longest term lengths first.

    For every length from ``maxTermLength`` down to 2 the file is cut again
    with cutSentence, passing the terms accepted so far as the keywords to
    delete, and ngram is run on the result for that length.

    Args:
        path: Path of the text file handed to cutSentence.
        maxTermLength: Longest term length, in characters, to look for.
        minFreq: Minimum frequency handed to ngram.

    Returns:
        The concatenated ngram results, i.e. ``(term, count)`` entries,
        with the longest term lengths first.
    """
    accepted_terms = []       # terms found so far, removed on the next pass
    accepted_with_freq = []   # the same terms together with their counts
    for length in reversed(range(2, maxTermLength + 1)):
        fragments = cutSentence(path, accepted_terms)
        hits = ngram(fragments, length, minFreq)
        accepted_terms.extend(hit[0] for hit in hits)
        accepted_with_freq.extend(hits)
    return accepted_with_freq
# Longest term: 6 characters; keep terms that occur at least 5 times.
longTermFreq = longTermPriority("text.txt", 6, 5)
for i in longTermFreq:
    # A single %-formatted argument prints "<term> <count>" both as a
    # Python 2 print statement and as a Python 3 print() call.
    print("%s %s" % (i[0], i[1]))
''' | |
士隱聽了 5 | |
空空道人 5 | |
那僧道 6 | |
那道人 5 | |
士隱 40 | |
雨村 25 | |
笑道 17 | |
那僧 11 | |
不知 10 | |
道人 9 | |
一段 8 | |
弟子 8 | |
不過 8 | |
世人 8 | |
丫鬟 8 | |
去了 7 | |
了一 7 | |
風流 7 | |
故事 7 | |
石頭 7 | |
有一 7 | |
二人 7 | |
紅塵 6 | |
意欲 6 | |
聽了 6 | |
英蓮 6 | |
富貴 6 | |
不可 6 | |
原來 6 | |
這一 6 | |
蠢物 6 | |
神仙 6 | |
如此 6 | |
人都 5 | |
其中 5 | |
又有 5 | |
心中 5 | |
幾個 5 | |
之人 5 | |
之事 5 | |
明白 5 | |
有些 5 | |
只有 5 | |
聽得 5 | |
不能 5 | |
也不 5 | |
自己 5 | |
封肅 5 | |
如今 5 | |
說著 5 | |
不了 5 | |
下世 5 | |
女子 5 | |
''' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment