Skip to content

Instantly share code, notes, and snippets.

@hiropppe
Last active November 30, 2015 14:41
Show Gist options
  • Save hiropppe/26ea2f98dfbc404a3b94 to your computer and use it in GitHub Desktop.
Save hiropppe/26ea2f98dfbc404a3b94 to your computer and use it in GitHub Desktop.
KNBコーパスをCaboCha形式へ変換するスクリプト
#!/usr/bin/env/python
# Usage: knbc2cabocha.py [format]
# convert KNB corpus to CaboCha format
import sys, re
header, data, head, body, t2c = "", "", [], [], {}
pat_d = re.compile (r'^\d+')
pat_s = re.compile (r'\s+')
for line in sys.stdin:
if line[0] == '#':
header = line
elif line[0] == 'E':
try:
head = [pat_d.sub (lambda m: t2c[m.group ()], h) for h in head]
for i in xrange (len (body)):
data += "* %d %s\n%s" % (i, head[i], body[i])
sys.stdout.write ("%s%s" % (data, line))
except KeyError:
sys.stderr.write ("fail to convert: %s" % header)
header, data, head, body, t2c = "", "", [], [], {}
elif line[0] == '*' or line[0] == '+':
if line[0] == '*':
body[len (body):], head[len (head):] = [""], [""]
head[-1] = line[:-1].split(' ')[1]
t2c[str (len (t2c))] = str (len (head) - 1)
else:
f = pat_s.split (line)[:11]
if len (sys.argv) >= 2: # KNP
if f[0] == f[2]:
f[2] = '*'
body[-1] += ' '.join (f[0:3] + f[3:10:2] + ['*']) + "\n"
else:
body[-1] += "%s\t%s\n" % (f[0], ','.join (f[3:10:2] + [f[2], f[1], '*']))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment