Last active
August 29, 2015 14:04
-
-
Save bryanyang0528/f5d87adb7d8729530393 to your computer and use it in GitHub Desktop.
N-gram v2.0 part1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import codecs | |
# Package for handling text encodings
import operator | |
# Package used for sorting dictionary entries
cutlist = "<>/::;;,、"’,.。!?「\"\'\\\n\r《》“”!@#$%^&*()".decode("utf-8") ##列出標點符號,並轉換成utf-8的格式 | |
def cutSentence(text_path, keywords):
    """Split a UTF-8 text file into segments delimited by punctuation.

    Each line is stripped of surrounding whitespace, every occurrence of each
    entry in ``keywords`` is deleted from it, and the remaining characters are
    grouped into segments that end wherever a character from the module-level
    ``cutlist`` appears. The delimiter characters themselves are discarded.
    The end of a line also ends the current segment.

    Args:
        text_path: Path of the text file to read; it is decoded as UTF-8.
        keywords: Iterable of strings to remove from every line before it is
            split. Empty strings are ignored.

    Returns:
        A list of segment strings in file order. Adjacent delimiter
        characters yield empty strings, so callers should be ready to skip
        them.
    """
    textList = []
    pending = []  # characters of the segment currently being built

    # The ``with`` block closes the file even if reading or decoding raises.
    with codecs.open(text_path, "r", "utf-8") as text:
        for line in text:
            line = line.strip()
            for keyword in keywords:
                if keyword:  # an empty string has nothing to remove
                    line = line.replace(keyword, "")
            for char in line:
                if char in cutlist:
                    # Delimiter reached: emit what has been collected so far.
                    textList.append("".join(pending))
                    pending = []
                else:
                    pending.append(char)
            # strip() removed the line break, so close the segment here;
            # otherwise text would run on into the next line and the last
            # segment of the file would never be returned.
            if pending:
                textList.append("".join(pending))
                pending = []
    return textList
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment