Skip to content

Instantly share code, notes, and snippets.

@alantian
Created June 21, 2017 06:17
Show Gist options
  • Save alantian/e9b13f5203dca90d3a70121a775f4921 to your computer and use it in GitHub Desktop.
Save alantian/e9b13f5203dca90d3a70121a775f4921 to your computer and use it in GitHub Desktop.
extract dialogue.
import os
import re
from joblib import Parallel, delayed
def go_lines(lines):
    """Extract dialogue pieces, speaker names and adjacent pairs from a script.

    A line starting with ``@Talk`` opens a piece of dialogue; the piece is
    the concatenation of the following lines up to (not including) the next
    line that starts with ``@``.  A ``@Talk`` whose text runs to the end of
    the input without a terminating ``@`` line is dropped.

    Args:
        lines: Sequence of script lines, already stripped of line endings.

    Returns:
        A dict with three keys:
          * ``'flow'``: list of pieces, in order, reduced to characters in
            the code point range U+2E80..U+9FFF, with every occurrence of a
            known speaker name replaced by ``'○'`` repeated to the name's
            length.
          * ``'names'``: set of the distinct values seen in ``name=`` tokens
            on ``@Talk`` lines.
          * ``'dialogue'``: list of ``(piece, next_piece)`` tuples for each
            pair of consecutive entries in ``flow``.
    """
    name_prefix = 'name='
    flow = []
    names = set()
    n = len(lines)
    i = 0
    while i < n:
        if not lines[i].startswith('@Talk'):
            i += 1
            continue

        # Collect the speaker name(s) from any ``name=...`` token.
        for token in lines[i].split():
            if token.startswith(name_prefix):
                names.add(token[len(name_prefix):])

        # The piece spans every line up to the next ``@`` directive.
        j = i + 1
        while j < n and not lines[j].startswith('@'):
            j += 1

        # Only keep the piece when a terminating directive was found;
        # j == n means the input ended first (indexing lines[j] there
        # would raise IndexError).
        if j < n:
            flow.append(''.join(lines[i + 1:j]))

        # Resume AT the terminating line rather than after it, so that a
        # terminator which is itself an ``@Talk`` is not silently skipped.
        i = j

    # Keep only characters in U+2E80..U+9FFF (CJK radicals through CJK
    # unified ideographs, which also covers kana and CJK punctuation).
    flow = [
        ''.join(ch for ch in piece if 0x2E80 <= ord(ch) <= 0x9FFF)
        for piece in flow
    ]

    # Mask speaker names.  Longest names go first (ties broken
    # alphabetically) so the result does not depend on set iteration order
    # when one name is contained in another.  Names are matched verbatim
    # against the already-filtered text, so a name containing characters
    # outside the kept range never matches.
    masking_order = sorted(names, key=lambda name: (-len(name), name))
    for name in masking_order:
        mask = '○' * len(name)
        flow = [piece.replace(name, mask) for piece in flow]

    # Pair each piece with the one that follows it.
    dialogue = list(zip(flow, flow[1:]))
    return {'flow': flow, 'names': names, 'dialogue': dialogue}
def go(filename):
    """Read one script file and return the result of ``go_lines`` on it.

    Args:
        filename: Path of the script file to read.

    Returns:
        The dict produced by ``go_lines`` for the file's lines.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # (the input directory is named ``text-data-utf8``), and use a context
    # manager so the handle is closed even if reading fails.
    with open(filename, encoding='utf-8') as fin:
        lines = [line.strip('\r\n') for line in fin]
    return go_lines(lines)
# Where the input scripts live and where the dialogue corpus is written.
input_dir = 'text-data-utf8'
output_dir = 'text-dialogue'
os.makedirs(output_dir, exist_ok=True)

# Collect every non-hidden file under input_dir whose name ends in "ks"
# (case-insensitive).
# NOTE(review): this presumably targets the ".ks" extension, but as written
# it also matches names such as "books" -- confirm before tightening.
filepath_list = []
for dir_path, _subdirs, filename_list in os.walk(input_dir):
    for filename in filename_list:
        if not filename.startswith('.') and filename.lower().endswith('ks'):
            filepath_list.append(os.path.join(dir_path, filename))

# Process the files in parallel on all available cores, then flatten the
# per-file dialogue pairs into one list (file order is preserved).
result_list = Parallel(n_jobs=-1)(delayed(go)(filepath) for filepath in filepath_list)
result = [pair for file_result in result_list for pair in file_result['dialogue']]

# One tab-separated pair per line.  The text is CJK plus the mask character,
# so write UTF-8 explicitly rather than the platform default encoding.
with open(os.path.join(output_dir, 'dialogue.txt'), 'w', encoding='utf-8') as fout:
    for question, answer in result:
        print(question, answer, sep='\t', file=fout)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment