Skip to content

Instantly share code, notes, and snippets.

@bryanyang0528
Last active August 29, 2015 14:04
Show Gist options
  • Save bryanyang0528/f5d87adb7d8729530393 to your computer and use it in GitHub Desktop.
Save bryanyang0528/f5d87adb7d8729530393 to your computer and use it in GitHub Desktop.
N-gram v2.0 part1
import codecs
#處理編碼的套件
import operator
##處理字典檔排序的套件
# Characters treated as sentence delimiters (ASCII and CJK punctuation, plus
# backslash, newline and carriage return).
# Written as a unicode literal so it compares correctly against decoded text
# on both Python 2 and Python 3.3+, and with the inner double quote escaped so
# the literal is not terminated early.
# NOTE(review): several marks (':', ';', ',', '!', '"') appear twice; possibly
# full-width variants were intended for the duplicates -- confirm against the
# original gist.
cutlist = u"<>/::;;,、\"’,.。!?「\"\'\\\n\r《》“”!@#$%^&*()"
def cutSentence(text_path, keywords):
    """Split a UTF-8 text file into punctuation-delimited fragments.

    Each line is stripped of surrounding whitespace, every occurrence of each
    string in ``keywords`` is removed, and the remaining characters are
    accumulated until a character found in the module-level ``cutlist`` is
    reached; the accumulated fragment is then emitted. Because lines are
    stripped before scanning, a line break on its own does not end a
    fragment: text continues across lines until the next delimiter.

    Args:
        text_path: Path of the text file to read (decoded as UTF-8).
        keywords: Iterable of strings to delete from every line before
            splitting. Empty strings are ignored.

    Returns:
        A list of fragments in order of appearance. Consecutive delimiters
        produce empty-string entries. Any text following the last delimiter
        is returned as the final fragment instead of being dropped.
    """
    textList = []
    pieces = []  # characters of the fragment currently being built

    # The context manager guarantees the file is closed, even on error.
    with codecs.open(text_path, "r", "utf-8") as text:
        for line in text:
            line = line.strip()  # drop surrounding whitespace / line ending
            for keyword in keywords:  # remove the unwanted keywords
                line = line.replace(keyword, "")
            for word in line:
                if word in cutlist:
                    # Delimiter reached: emit the fragment and start a new one.
                    textList.append("".join(pieces))
                    pieces = []
                else:
                    pieces.append(word)

    # Keep trailing text that was not terminated by a delimiter.
    if pieces:
        textList.append("".join(pieces))
    return textList
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment