huangziwei · April 20, 2016 06:24
diff --git a/getSentences.py b/getSentences.py
 class getSentences(object):
     """Stream sentences from every file matched directly under a directory.

     Iterating an instance opens each path matched by ``dirname + '/*'``,
     treats every line of the file as one paragraph and yields that
     paragraph's sentences one by one as plain strings.
     """

     def __init__(self, dirname):
         """Remember *dirname*, the directory whose entries will be read."""
         self.dirname = dirname

     def split_paragraph(self, raw):
         """Split one paragraph of text into a list of sentences.

         Newline characters and the curly double quotes “ and ” are removed
         first.  A sentence then ends at one of the marks 。 ！ … ？ unless
         that mark is the last character of the text or is directly followed
         by another such mark or by a closing bracket ）, so runs like ？！
         or 。） stay attached to the sentence they close.  Text left after
         the last split point becomes the final sentence.

         Args:
             raw: The paragraph text.

         Returns:
             The sentences in order; an empty list when nothing is left
             after the newline/quote removal.
         """
         # Punctuation marks that decide where a sentence is split.
         stops = frozenset(['。', '！', '…', '？'])
         # Closing marks that must stay with the terminator preceding them.
         closers = frozenset(['）'])

         raw = re.sub('\n|“|”', '', raw)
         sents = []
         start = 0
         last = len(raw) - 1
         for i, char in enumerate(raw):
             # The final character is never split here; the tail handling
             # below picks it up, and this also keeps raw[i + 1] in range.
             if i >= last or char not in stops:
                 continue
             following = raw[i + 1]
             if following in stops or following in closers:
                 # Still inside a punctuation run; split at its last mark.
                 continue
             sents.append(raw[start:i + 1])
             start = i + 1
         if start < len(raw):
             sents.append(raw[start:])

         return sents

     def __iter__(self):
         """Yield every sentence of every file matched by ``dirname + '/*'``.

         Sentences are yielded as raw strings; any tokenisation is left to
         the consumer.
         """
         for path in glob.glob(self.dirname + '/*'):
             # The context manager closes each file as soon as it has been
             # consumed (or the generator is closed) instead of leaking it.
             with codecs.open(path, 'r') as handle:
                 for paragraph in handle:
                     for line in self.split_paragraph(paragraph):
                         yield line
	class getSentences(object):
	    """Yield the sentences of every file matched by ``dirname + '/*'``.

	    Each line of a file is handled as one paragraph and split into
	    sentences; iterating the instance yields those sentences as strings.
	    """

	    def __init__(self, dirname):
	        """Store *dirname*, the directory whose entries will be read."""
	        self.dirname = dirname

	    def split_paragraph(self, raw):
	        """Return the sentences contained in the paragraph *raw*.

	        Newlines and the curly double quotes “ ” are stripped, then the
	        text is cut after each of 。 ！ … ？ that is neither the last
	        character nor directly followed by another of those marks or by
	        a closing bracket ）.  Any text after the last cut is returned as
	        the final sentence.
	        """

	        def is_punt(char):
	            # Punctuation marks that decide where a sentence is split.
	            return char in ('。', '！', '…', '？')

	        def is_other_punt(char):
	            # Closing marks that stay attached to the preceding mark.
	            return char in ('）',)

	        # Plain alternation: remove a newline OR either curly quote.
	        # (Escaped pipes would only match the literal text "\n|“|”".)
	        raw = re.sub('\n|“|”', '', raw)
	        sents = []
	        start = 0
	        for i, char in enumerate(raw):
	            # i < len(raw) - 1 keeps raw[i + 1] in range; a terminator in
	            # last position is collected by the tail handling below.
	            if (i < len(raw) - 1
	                    and is_punt(char)
	                    and not (is_punt(raw[i + 1])
	                             or is_other_punt(raw[i + 1]))):
	                sents.append(raw[start:i + 1])
	                start = i + 1
	        if start < len(raw):
	            sents.append(raw[start:])

	        return sents

	    def __iter__(self):
	        """Yield each sentence of each matched file as a raw string."""
	        for path in glob.glob(self.dirname + '/*'):
	            # Close every file deterministically instead of leaking it.
	            with codecs.open(path, 'r') as handle:
	                for paragraph in handle:
	                    for line in self.split_paragraph(paragraph):
	                        yield line