Created March 26, 2019 07:49
-
-
Save Elfsong/13b92d83010e08d08ee457ca1464f2ea to your computer and use it in GitHub Desktop.
故事对话分割
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import os | |
import re | |
import jieba | |
# Absolute project root on the author's machine; adjust before running elsewhere.
ROOT_PATH = "/Users/elfsong/PycharmProjects/BERT_demo"
# "Test text" directory located directly under the project root.
RESOURCE_PATH = os.path.join(ROOT_PATH, "Test text")
def avsplit1(s, n):
    """Split ``s`` into ``n`` consecutive slices whose lengths differ by at most one.

    The first ``len(s) % n`` slices each receive one extra element, so the
    longer slices always come first.  When ``n`` is larger than ``len(s)``
    the trailing slices are empty.

    Args:
        s: A sliceable sequence with a length (a ``str`` everywhere in this file).
        n: Number of slices to produce; must be a positive integer.

    Returns:
        A list of exactly ``n`` slices of ``s`` which, concatenated in order,
        reproduce ``s``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # n == 0 used to surface as a bare ZeroDivisionError, and a negative n
        # silently returned [] (losing all of ``s``); reject both explicitly.
        raise ValueError("n must be a positive integer, got {!r}".format(n))

    base, extra = divmod(len(s), n)
    parts = []
    start = 0
    for position in range(n):
        # The first ``extra`` slices absorb the remainder, one element each.
        end = start + base + (1 if position < extra else 0)
        parts.append(s[start:end])
        start = end
    return parts
def split_jieba(s, n):
    """Pack the jieba segments of ``s`` greedily into chunks of at most ``n`` characters.

    Segments come from ``jieba.cut(s, cut_all=False)`` and are never split:
    a segment that does not fit into the current chunk starts a new one, so
    a single segment longer than ``n`` ends up as an over-long chunk of its
    own.

    Args:
        s: Text to segment.
        n: Maximum number of characters to accumulate in one chunk.

    Returns:
        The non-empty chunks, in order.
    """
    chunks = []
    buffer = ""
    for word in jieba.cut(s, cut_all=False):
        if len(buffer) + len(word) > n:
            # Adding this segment would overflow: close the chunk, start anew.
            chunks.append(buffer)
            buffer = word
        else:
            buffer += word
    chunks.append(buffer)
    # The very first flush (or an empty input) can leave an empty chunk behind.
    return list(filter(None, chunks))
def cut_sent(para):
    """Split a paragraph of (Chinese) text into clause-level pieces.

    Ideographic spaces (U+3000) are removed, a line break is inserted after
    each recognised break mark, trailing whitespace is stripped, and the
    text is split on the line breaks.

    A break is only inserted when the mark is followed by another character,
    and never directly in front of a closing curly quote, so a mark stays
    attached to the quote that closes it.  Because the following character
    is consumed by the match, only every other mark in a run of consecutive
    marks produces a break; in particular a two-character Chinese ellipsis
    followed by text is already split between its two characters by the
    first rule.

    Both the ASCII and the full-width forms of ``, : ! ~ ; ?`` are treated as
    break marks.  NOTE(review): the original character classes listed the
    ASCII marks twice (``,,``, ``;;``, ``?\\?``), which looks like full-width
    variants that were folded to ASCII somewhere along the way; the
    full-width code points were restored on that assumption.

    Args:
        para: The text to split.

    Returns:
        The non-empty pieces, in their original order.
    """
    para = para.replace("\u3000", "")
    # Raw strings throughout: the original non-raw literals relied on invalid
    # escape sequences such as "\-" and "\.", which newer Python versions warn
    # about and intend to reject.
    # Single-character break marks (full-width forms written as \uXXXX).
    para = re.sub(
        r"([,\uff0c、:\uff1a。\-—…!\uff01~\uff5e;\uff1b?\uff1f])([^”’])",
        r"\1\n\2",
        para,
    )
    # English ellipsis written as six dots.
    para = re.sub(r"(\.{6})([^”’])", r"\1\n\2", para)
    # Chinese ellipsis (two … characters).
    para = re.sub(r"(…{2})([^”’])", r"\1\n\2", para)
    # Sentence-final mark plus closing quote: break after the quote unless a
    # comma or another sentence-final mark follows.
    para = re.sub(
        r"([。!\uff01?\uff1f][”’])([^,\uff0c。!\uff01?\uff1f])",
        r"\1\n\2",
        para,
    )
    para = para.rstrip()
    return [sentence for sentence in para.split("\n") if sentence]
def finetune(sentence_list, perfered_length):
    """Regroup text pieces into chunks of roughly ``perfered_length`` characters.

    Stage 1 walks ``sentence_list`` and greedily concatenates neighbouring
    pieces while the running chunk stays within ``perfered_length``.  A
    running chunk that is already longer than ``perfered_length`` is cut into
    ``len(chunk) // perfered_length + 1`` near-equal parts with ``avsplit1``;
    the final running chunk is always passed through the same split.

    Stage 2 tidies every chunk boundary, in this order:
      1. a chunk must not start with closing punctuation - the leading
         character is moved to the end of the previous chunk;
      2. a chunk must not end with an opening bracket - the bracket is moved
         to the start of the following chunk;
      3. a chunk of at most ``0.3 * perfered_length`` characters is appended
         to the previous chunk.

    Args:
        sentence_list: Sequence of strings, e.g. the output of ``cut_sent``.
        perfered_length: Target chunk length in characters; a positive integer.

    Returns:
        The list of non-empty chunks; an empty list for empty input.

    Raises:
        ValueError: If ``perfered_length`` is smaller than 1.
    """
    if perfered_length < 1:
        # Previously 0 raised ZeroDivisionError and negative values made the
        # even split return nothing, silently dropping text.
        raise ValueError(
            "perfered_length must be a positive integer, got {!r}".format(perfered_length)
        )
    if not sentence_list:
        # Previously sentence_list[0] raised IndexError here.
        return []

    def split_evenly(text):
        # NOTE(review): a chunk whose length is an exact multiple of
        # perfered_length gets one part more than strictly necessary (a final
        # chunk of exactly perfered_length characters is halved).  Kept as in
        # the original; confirm whether a ceiling division was intended.
        return avsplit1(text, len(text) // perfered_length + 1)

    # Stage 1: greedy packing.
    chunks = []
    current = sentence_list[0]
    for sentence in sentence_list[1:]:
        if len(current) > perfered_length:
            chunks.extend(split_evenly(current))
            current = sentence
        elif len(current) + len(sentence) <= perfered_length:
            current += sentence
        else:
            chunks.append(current)
            current = sentence
    chunks.extend(split_evenly(current))

    # Stage 2: boundary clean-up.  ASCII and full-width (\uffXX) forms of the
    # comma, question mark, exclamation mark and square brackets are accepted.
    closing_marks = frozenset(
        [",", "\uff0c", "。", "、", "?", "\uff1f", "!", "\uff01", "…", "]", "\uff3d", "》"]
    )
    opening_marks = frozenset(["[", "\uff3b", "《"])
    short_limit = perfered_length * 0.3

    # Each chunk is compared with the last chunk actually kept, never with an
    # emptied slot.  The original indexed result[index][-1] on a slot that a
    # previous merge had set to "" (IndexError), or parked moved punctuation
    # in that empty slot, producing a chunk that starts with punctuation.
    merged = []
    for chunk in chunks:
        if not chunk:
            continue
        if not merged:
            merged.append(chunk)
            continue
        previous = merged.pop()
        if chunk[0] in closing_marks:
            previous += chunk[0]
            chunk = chunk[1:]
        if previous[-1] in opening_marks:
            chunk = previous[-1] + chunk
            previous = previous[:-1]
        if len(chunk) <= short_limit:
            previous += chunk
            chunk = ""
        merged.extend(part for part in (previous, chunk) if part)
    return merged
# Demo: cut a sample sentence into clauses, regroup them into chunks of about
# 15 characters and print one chunk per line.
sentence_list = cut_sent(
    "蚂蚁家族里,有一只[小蚂蚁],每天不干活,不觅食,就知道吃、睡和玩儿。"
)
result = finetune(sentence_list, perfered_length=15)
for sentence in result:
    print(sentence)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment