Skip to content

Instantly share code, notes, and snippets.

@bryanyang0528
Created July 16, 2014 11:51
Show Gist options
  • Save bryanyang0528/9c5647ef22e0c67d081f to your computer and use it in GitHub Desktop.
Save bryanyang0528/9c5647ef22e0c67d081f to your computer and use it in GitHub Desktop.
N-Gram v2.0 part2
def ngram(textLists, n, minFreq):
    """Count character n-grams across several texts and keep the frequent ones.

    Every contiguous slice of length ``n`` is taken from each text, the
    occurrences are tallied over all texts together, and the n-grams seen
    at least ``minFreq`` times are returned.

    Args:
        textLists: Iterable of preprocessed texts (strings). Each text is
            sliced independently, so n-grams never span two texts.
        n: Length of each n-gram (expected to be a positive integer).
        minFreq: Minimum number of occurrences an n-gram needs in order
            to be included in the result.

    Returns:
        A list of ``(ngram, count)`` tuples sorted by count, highest first.
        Texts shorter than ``n`` contribute nothing; an empty input yields
        an empty list.
    """
    # Local import keeps the function self-contained; Counter is available
    # on both Python 2.7 and Python 3.
    from collections import Counter

    # One pass over all sliding windows. Counter replaces the previous
    # list.count()-per-word approach, which was quadratic in the number of
    # extracted n-grams.
    counts = Counter(
        text[start:start + n]
        for text in textLists
        # A text of length L has L - (n - 1) windows of width n.
        for start in range(len(text) - (n - 1))
    )

    # most_common() sorts by count in descending order (stable for ties),
    # replacing the Python-2-only dict.iteritems() + sorted() combination.
    return [pair for pair in counts.most_common() if pair[1] >= minFreq]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment