bryanyang0528 · July 16, 2014 11:51
diff --git a/gistfile1.py b/gistfile1.py
 def ngram(textLists, n, minFreq):
     """Count the n-grams found in a collection of texts.

     Every text is scanned on its own with a sliding window of length
     ``n``, so an n-gram never spans two texts.

     Args:
         textLists: Iterable of pre-processed texts. Each text must support
             ``len()`` and slicing, and its slices must be hashable
             (strings satisfy this).
         n: Length of each n-gram. Expected to be a positive integer; the
             value is not validated.
         minFreq: Minimum number of occurrences an n-gram needs in order
             to be included in the result.

     Returns:
         A list of ``(ngram, count)`` tuples sorted by count, highest
         first, containing only n-grams seen at least ``minFreq`` times.
     """
     # Imported locally so the function does not depend on module-level
     # imports being present.
     from collections import Counter

     # A Counter tallies every window in a single pass instead of calling
     # list.count() once per distinct n-gram, which was quadratic.
     counts = Counter(
         text[start:start + n]
         for text in textLists
         # A text of length L has L - (n - 1) windows of length n.
         for start in range(len(text) - (n - 1))
     )

     # most_common() returns (ngram, count) pairs ordered by descending
     # count and, unlike dict.iteritems(), exists on Python 2.7 and 3.
     return [pair for pair in counts.most_common() if pair[1] >= minFreq]
	def ngram(textLists, n, minFreq):
	    """Return the n-grams that occur at least ``minFreq`` times.

	    Args:
	        textLists: Iterable of pre-processed texts. Each text must
	            support ``len()`` and slicing, and its slices must be
	            hashable (strings satisfy this).
	        n: Length of each n-gram. Expected to be a positive integer;
	            the value is not validated.
	        minFreq: Minimum occurrence count for an n-gram to be kept.

	    Returns:
	        A list of ``(ngram, count)`` tuples ordered by descending
	        count.
	    """
	    # Local import keeps the function usable without any module-level
	    # imports.
	    from collections import Counter

	    counts = Counter()
	    for text in textLists:
	        # Number of length-n windows in this text; zero or negative
	        # when the text is shorter than n, which yields no windows.
	        window_total = len(text) - (n - 1)
	        # Tally each window as it is produced rather than collecting
	        # all windows first and calling list.count() per distinct one.
	        counts.update(text[pos:pos + n] for pos in range(window_total))

	    # most_common() replaces the Python 2-only sort over
	    # dict.iteritems(): same (ngram, count) pairs, highest count first.
	    ranked = counts.most_common()
	    return [entry for entry in ranked if entry[1] >= minFreq]