Created
July 16, 2014 11:51
-
-
Save bryanyang0528/9c5647ef22e0c67d081f to your computer and use it in GitHub Desktop.
N-Gram v2.0 part2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def ngram(textLists, n, minFreq):
    """Count the n-grams found in a collection of texts.

    A window of length ``n`` is slid over every text in ``textLists``; each
    window position yields one n-gram. Occurrences are tallied across all
    texts together, not per text.

    Args:
        textLists: Iterable of preprocessed texts. Each text must support
            ``len()`` and slicing, and its slices must be hashable
            (e.g. ``str``).
        n: Length of each n-gram. Expected to be >= 1; texts shorter than
            ``n`` contribute nothing.
        minFreq: Minimum number of occurrences an n-gram needs in order to
            be included in the result.

    Returns:
        A list of ``(ngram, count)`` tuples sorted by ``count`` in
        descending order. On Python 3.7+ n-grams with equal counts keep the
        order in which they were first seen.
    """
    # Imported locally so the function does not depend on module-level
    # imports that may or may not be present in the surrounding file.
    from collections import Counter

    # Counter tallies in a single pass; the previous list.count() call per
    # distinct n-gram made the counting step quadratic.
    counts = Counter(
        text[start:start + n]
        for text in textLists
        # A text of length L holds L - (n - 1) windows of length n.
        for start in range(len(text) - (n - 1))
    )

    # most_common() returns (ngram, count) pairs sorted by count, descending,
    # and works on both Python 2.7 and Python 3 (unlike dict.iteritems()).
    return [pair for pair in counts.most_common() if pair[1] >= minFreq]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment