Created
June 21, 2017 06:17
-
-
Save alantian/e9b13f5203dca90d3a70121a775f4921 to your computer and use it in GitHub Desktop.
extract dialogue.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
from joblib import Parallel, delayed | |
def go_lines(lines):
    """Extract dialogue pieces, speaker names and adjacent-piece pairs.

    A piece is the text between a line starting with ``@Talk`` and the next
    line starting with ``@``; its lines are concatenated without a separator.
    Every ``name=...`` token found on an ``@Talk`` line is recorded as a
    speaker name.  Each piece is then reduced to the characters in the range
    U+2E80..U+9FFF, and every occurrence of a recorded name inside a piece is
    masked with one ``○`` per character of the name.

    Args:
        lines: Sequence of script lines (strings), without line terminators.

    Returns:
        A dict with three keys:
        ``'flow'`` -- list of cleaned pieces in order of appearance,
        ``'names'`` -- set of distinct speaker names,
        ``'dialogue'`` -- list of ``(piece, next_piece)`` tuples built from
        consecutive entries of ``flow``.
    """
    flow = []
    names = set()
    n = len(lines)
    i = 0
    while i < n:
        if not lines[i].startswith('@Talk'):
            i += 1
            continue

        # Record the speaker name(s) carried by the tag line, if any.
        names.update(
            token[len('name='):]
            for token in lines[i].split()
            if token.startswith('name=')
        )

        # Find the line that closes this piece (the next '@...' line).
        j = i + 1
        while j < n and not lines[j].startswith('@'):
            j += 1

        # j == n means the input ended before a closing '@' line was found.
        # Indexing lines[j] in that case raised IndexError; such an
        # unterminated piece is dropped instead.
        if j < n:
            flow.append(''.join(lines[i + 1:j]))

        # Resume after the closing line.  NOTE: the closing line itself is not
        # re-examined, so an '@Talk' line that directly closes the previous
        # piece does not start a new one.
        i = j + 1

    # Keep only characters in U+2E80..U+9FFF (the CJK range used by the
    # original filter); everything else, including ASCII, is discarded.
    flow = [
        ''.join(c for c in piece if 0x2E80 <= ord(c) <= 0x9FFF)
        for piece in flow
    ]

    # Mask speaker names.  Longer names go first so that a name containing
    # another name is masked as a whole, and the result does not depend on
    # set iteration order.
    ordered_names = sorted(names, key=lambda s: (-len(s), s))
    masked = []
    for piece in flow:
        for name in ordered_names:
            piece = piece.replace(name, '○' * len(name))
        masked.append(piece)
    flow = masked

    # Pair every piece with the piece that follows it.
    dialogue = list(zip(flow, flow[1:]))
    return {'flow': flow, 'names': names, 'dialogue': dialogue}
def go(filename):
    """Read the script file *filename* and return ``go_lines`` of its lines.

    Args:
        filename: Path of the script file to read.

    Returns:
        The dict produced by ``go_lines`` for the file's lines.
    """
    # The file is decoded as UTF-8 explicitly rather than with the platform
    # default; the input directory is named 'text-data-utf8', so the files are
    # presumably UTF-8 -- confirm if other encodings are expected.
    # The ``with`` block closes the handle, which was previously left open.
    with open(filename, encoding='utf-8') as fin:
        # Drop line terminators (any mix of '\r' and '\n' at either end).
        lines = [line.strip('\r\n') for line in fin]
    return go_lines(lines)
# Driver: walk the input directory, extract dialogue pairs from every matching
# script file in parallel, and write them as tab-separated lines.
input_dir = 'text-data-utf8'
output_dir = 'text-dialogue'
os.makedirs(output_dir, exist_ok=True)

# Every non-hidden file whose name ends in 'ks' (case-insensitive), at any
# depth below input_dir.
filepath_list = [
    os.path.join(dir_path, filename)
    for dir_path, _, filename_list in os.walk(input_dir)
    for filename in filename_list
    if not filename.startswith('.') and filename.lower().endswith('ks')
]

# One ``go`` call per file, spread over all available CPUs (n_jobs=-1).
result_list = Parallel(n_jobs=-1)(delayed(go)(filepath) for filepath in filepath_list)

# Flatten the per-file 'dialogue' lists into one list of (question, answer).
result = [pair for file_result in result_list for pair in file_result['dialogue']]

# The output is CJK text plus '○', so it is written as UTF-8 explicitly
# instead of relying on the platform default encoding.
with open(os.path.join(output_dir, 'dialogue.txt'), 'w', encoding='utf-8') as fout:
    for _q, _a in result:
        print('%s\t%s' % (_q, _a), file=fout)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment