Last active
November 22, 2018 04:58
-
-
Save ikegami-yukino/68e322082c680d84fd886043718a173c to your computer and use it in GitHub Desktop.
Convert CSJ's xml to plain text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob
import html
import re
import sys

import jaconv
# Patterns applied to the OrthographicTranscription attribute of <SUW> lines.
# Raw strings are used so that regex escapes such as "\)" are not parsed as
# (invalid) Python string escapes.

# Captures the value of the OrthographicTranscription="..." attribute.
re_ogt = re.compile(r' OrthographicTranscription="([^"]+)"')
# Captures the text after a ";" up to (not including) the next ")".
# NOTE(review): "\;" matches a plain ";", so this is the same pattern as
# re_semicolon below -- confirm whether re_a was meant to be more specific.
re_a = re.compile(r'\;([^\)]+)\)?')
re_semicolon = re.compile(r';([^\)]+)\)?')
# Matches a whole "(D ...)" group so it can be removed from the text.
re_d = re.compile(r'\(D [^\)]+\)')
def main(pattern):
    """Print the orthographic transcription text found in matching XML files.

    Every file matched by ``pattern`` is read line by line. Only lines that
    contain ``<SUW`` are considered; the value of their
    ``OrthographicTranscription`` attribute is cleaned of annotation markup
    and written to stdout without a separator. A newline is written after a
    line that carries ``ClauseBoundaryLabel="[文末]"``.

    Args:
        pattern: Glob pattern (expanded with ``glob.glob``) selecting the
            XML files to convert.
    """
    for path in glob.glob(pattern):
        # True after an "(A ..." transcription without ";" has been seen and
        # until a transcription containing a ";" part resets it. While set,
        # transcriptions that match no other rule are dropped.
        start_a = False
        # NOTE(review): the file is opened with the platform default
        # encoding -- confirm this matches the corpus files.
        with open(path) as f:
            for l in f:
                if "<SUW" not in l:
                    continue
                attr = re_ogt.search(l)
                if attr is None:
                    # <SUW> line without an OrthographicTranscription
                    # attribute: nothing to print (previously this raised
                    # AttributeError on None).
                    continue
                ogt = attr.group(1)
                # Transcriptions that open with one of these tags are
                # dropped entirely.
                if ogt.startswith(("(F ", "(D", "(M ", "(?")):
                    continue
                # Search once and reuse the match instead of searching twice.
                after_semicolon = re_a.search(ogt)
                if after_semicolon:
                    # Keep only the part that follows the ";".
                    ogt = after_semicolon.group(1)
                    start_a = False
                elif ogt.startswith("(A ") and ";" not in ogt:
                    start_a = True
                    continue
                elif "(R " in ogt:
                    # Drop the first three and the last character.
                    ogt = ogt[3:-1]
                elif re_semicolon.search(ogt):
                    # NOTE(review): with the current definitions re_a and
                    # re_semicolon match the same strings, so this branch
                    # cannot be reached; kept in case re_a is narrowed.
                    ogt = re_semicolon.search(ogt).group(1)
                elif ogt.startswith("(O "):
                    ogt = ogt[3:]
                elif start_a:
                    continue
                # Remove embedded "(D ...)" groups, decode HTML entities and
                # strip "<FV>" markers (including a truncated "FV>").
                ogt = re_d.sub('', ogt)
                ogt = html.unescape(ogt).replace('<FV>', '').replace('FV>', '')
                # A closing parenthesis left over from a tag is dropped.
                ogt = ogt[:-1] if ogt.endswith(')') else ogt
                # Full-width ASCII letters and digits become half-width;
                # kana are left untouched.
                ogt = jaconv.z2h(ogt, kana=False, ascii=True, digit=True)
                print(ogt, end='')
                if 'ClauseBoundaryLabel="[文末]"' in l:
                    print()
if __name__ == '__main__':
    # main() expands the pattern itself with glob, and only the first
    # command-line argument is used, so the pattern should be quoted to keep
    # the shell from expanding it.
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit("usage: {} 'GLOB_PATTERN'".format(sys.argv[0]))
    main(sys.argv[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment