Created
September 12, 2016 19:24
-
-
Save benoit-pierre/3d415adf9b4299de9e59ef3f9046e24d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
from collections import namedtuple
import sys
import re
from plover.dictionary.base import create_dictionary
# Tokenizer for the RTF subset this script understands.  The adjacent raw
# string pieces concatenate into a single pattern; exactly one of the named
# groups below is set on every successful match:
#   cchar  - a control symbol: backslash followed by one of - _ ~ \ { } *
#   cword  - a control word: backslash + letters, with an optional signed
#            integer parameter (cparam) and one optional trailing space
#   group  - a bare opening or closing brace
#   text   - a run of characters containing no newline, backslash or brace
#   nl     - a run of CR/LF characters
# NOTE: any other backslash sequence (for instance a backslash followed by
# an apostrophe) matches none of the alternatives.
RTF_TOKEN = re.compile(
    r'\\((?P<cchar>[-_~\\{}*])'
    r'|(?P<cword>[A-Za-z]+)(?P<cparam>-?[0-9]+)? ?)'
    r'|(?P<group>[{}])'
    r'|(?P<text>[^\n\r\\{}]+)'
    r'|(?P<nl>[\n\r]+)'
)

# Generic token: `kind` is the name of the regex group that matched and
# `value` is the matched string.
Token = namedtuple('Token', 'kind value')
class ControlWord(namedtuple('ControlWord', 'kind name param')):
    """Control word token with fields ``kind``, ``name`` and ``param``.

    Equality looks at ``name`` only: two control words are equal when their
    names match (``param`` is ignored), and a control word is equal to any
    object its ``name`` is equal to, e.g. a plain string.
    """

    def __eq__(self, other):
        """Return True when *other* (or its name) equals this token's name."""
        if isinstance(other, ControlWord):
            other = other.name
        return self.name == other

    def __ne__(self, other):
        """Negation of ``__eq__`` (Python 2 does not derive it itself)."""
        return not self.__eq__(other)

    def __hash__(self):
        """Hash by name so hashing stays consistent with ``__eq__``.

        Overriding ``__eq__`` alone leaves the class unhashable on Python 3.
        """
        return hash(self.name)
class Group(object):
    """Mutable state for one brace-delimited group while it is being read.

    Attributes set by the constructor:
      destination - value given by the caller (defaults to None)
      ignorable   - flag given by the caller (defaults to False)
      text        - string accumulated for the group, initially empty
    """

    def __init__(self, destination=None, ignorable=False):
        self.destination = destination
        self.ignorable = ignorable
        self.text = ''

    def __repr__(self):
        """Return a debugging representation listing all three attributes."""
        return '%s(destination=%r, ignorable=%r, text=%r)' % (
            type(self).__name__, self.destination, self.ignorable, self.text,
        )
# Text appended to the current group for each control symbol; any symbol not
# listed here is appended unchanged.  '*' contributes nothing by itself.
_CCHAR_OUTPUT = {
    '*': '',
    '~': '{^ ^}',
    '_': '{^-^}',
}

# Text appended to the current group for control words that take no further
# input.  Control words not listed here (other than cxfing) are ignored.
_CWORD_OUTPUT = {
    'par': '{#Return}{#Return}',
    'cxds': '{^}',
    'cxfc': '{-|}',
}


def _tokenize(rtf_text):
    """Split *rtf_text* into a list of Token / ControlWord objects.

    Newline runs are dropped.  Control word parameters are converted to int
    (None when absent).

    Raises:
        ValueError: if RTF_TOKEN matches nothing at the current position.
    """
    tokens = []
    pos = 0
    while pos < len(rtf_text):
        m = RTF_TOKEN.match(rtf_text, pos)
        if m is None:
            # Explicit raise rather than assert: asserts vanish under -O.
            raise ValueError('unsupported RTF syntax at offset %d: %r'
                             % (pos, rtf_text[pos:pos + 20]))
        pos = m.end()
        if m.group('nl') is not None:
            continue
        name, param = m.group('cword', 'cparam')
        if name is not None:
            if param is not None:
                param = int(param)
            tokens.append(ControlWord('cword', name, param))
            continue
        for kind in ('cchar', 'group', 'text'):
            value = m.group(kind)
            if value is not None:
                tokens.append(Token(kind, value))
                break
        else:
            # Every alternative of the pattern sets one of the groups above.
            raise ValueError('unclassified token at offset %d' % m.start())
    return tokens


def _strip_envelope(tokens):
    """Check for the outer ``{``, ``rtf`` control word and final ``}``.

    Returns the tokens between the ``rtf`` control word and the closing
    brace.

    Raises:
        ValueError: if the expected leading or trailing tokens are missing.
    """
    well_formed = (
        len(tokens) >= 3
        and tokens[0] == Token('group', '{')
        and tokens[1] == ControlWord('cword', 'rtf', 1)
        and tokens[-1] == Token('group', '}')
    )
    if not well_formed:
        raise ValueError('input is not wrapped in an {\\rtf ... } group')
    return tokens[2:-1]


def _token_at(tokens, index):
    """Return ``tokens[index]``, or None when *index* is past the end."""
    return tokens[index] if index < len(tokens) else None


def _format_punctuation(contents):
    """Return the text emitted when a ``cxp`` group holding *contents* closes.

    The original code referenced an undefined name ``contents`` in the last
    two branches; it is taken here to be the group's raw (unstripped) text.
    NOTE(review): confirm the raw text, not the stripped one, is intended.
    """
    stripped = contents.strip()
    if stripped in ('.', '!', '?', ',', ';', ':'):
        return '{' + stripped + '}'
    if stripped == "'":
        return "{^'}"
    if stripped:
        # '-' and '/' as well as unknown punctuation are shown as given.
        return '{^' + contents + '^}'
    return ''


def _convert(tokens, dictionary):
    """Walk *tokens* and store one entry in *dictionary* per ``cxs`` group.

    The text of a ``cxs`` group, split on '/', becomes the key; the text
    accumulated at the top level until the next ``cxs`` group (or the end of
    the input) becomes the value.

    Raises:
        ValueError: on unbalanced braces, a ``cxs`` group that is not at the
            top level, a ``cxfing`` control word not followed by text, or an
            unexpected token.
    """
    group = Group(None)
    group_stack = [group]
    steno = None
    n = 0
    while n < len(tokens):
        token = tokens[n]
        if token.kind == 'cchar':
            group.text += _CCHAR_OUTPUT.get(token.value, token.value)
        elif token.kind == 'cword':
            if token.name in _CWORD_OUTPUT:
                group.text += _CWORD_OUTPUT[token.name]
            elif token.name == 'cxfing':
                following = _token_at(tokens, n + 1)
                if following is None or following.kind != 'text':
                    raise ValueError('cxfing is not followed by text')
                group.text += '{&' + following.value + '}'
                n += 1  # the text token has been consumed
        elif token.kind == 'group':
            if token.value == '{':
                ignorable = False
                destination = None
                following = _token_at(tokens, n + 1)
                if following is not None and following.kind == 'cword':
                    n += 1
                    destination = following
                elif following == Token('cchar', '*'):
                    ignorable = True
                    after_star = _token_at(tokens, n + 2)
                    if after_star is not None and after_star.kind == 'cword':
                        n += 2
                        destination = after_star
                if destination == 'cxs':
                    if len(group_stack) != 1:
                        raise ValueError('cxs group is not at the top level')
                    # A new stroke starts: flush the pending entry, if any.
                    if steno is not None:
                        dictionary[steno] = group.text
                    # Always start the next translation from scratch so text
                    # collected before the first stroke is not carried over.
                    group.text = ''
                group = Group(destination, ignorable)
                group_stack.append(group)
            elif token.value == '}':
                if len(group_stack) < 2:
                    raise ValueError('unbalanced closing brace')
                emitted = ''
                if group.destination == 'cxs':
                    steno = tuple(group.text.split('/'))
                elif group.destination == 'cxp':
                    emitted = _format_punctuation(group.text)
                elif group.destination == 'cxfing':
                    emitted = '{&' + group.text + '}'
                elif not group.ignorable:
                    emitted = group.text
                group_stack.pop()
                group = group_stack[-1]
                group.text += emitted
            else:
                raise ValueError('unexpected group token %r' % (token.value,))
        elif token.kind == 'text':
            group.text += token.value
        else:
            raise ValueError('unexpected token kind %r' % (token.kind,))
        n += 1
    if len(group_stack) != 1:
        raise ValueError('unbalanced opening brace')
    if steno is not None:
        dictionary[steno] = group.text


# Usage: <script> INPUT.rtf OUTPUT_DICTIONARY
# The input is read as bytes and decoded as cp1252; the output dictionary is
# created from the second argument and saved once conversion is complete.
with open(sys.argv[1], 'rb') as rtf_file:
    rtf_text = rtf_file.read().decode('cp1252')
dictionary = create_dictionary(sys.argv[2])
_convert(_strip_envelope(_tokenize(rtf_text)), dictionary)
dictionary.save()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment