emlyn · December 19, 2015 02:38
diff --git a/wikioccs2stanford.py b/wikioccs2stanford.py
 #!/usr/bin/env python
 
 import argparse
 from nltk.tokenize import wordpunct_tokenize

 def split_parts(fulltext, entities):
   """Cut fulltext into consecutive [text, uri] parts around entity mentions.

   entities is an iterable of (offset, uri, form) triples. They are processed
   in sorted order, and a triple whose offset equals that of the previously
   processed triple is skipped. Each form is located by string search near its
   (shift-corrected) offset instead of being trusted blindly; a mismatch is
   reported on stdout and carried forward as a shift for the entities after it.

   Returns a list of [text, uri] lists that cover the text in order, with uri
   set to None for unannotated stretches. Returns [] as soon as one form cannot
   be found in the remaining text.
   """
   # Messages are printed as print(<one formatted string>): that form produces
   # the same output as a Python 2 print statement and also parses on Python 3.
   text = fulltext  # tail of fulltext that has not been emitted yet
   shift = 0        # accumulated difference between given and found positions
   done = 0         # number of characters of fulltext already emitted
   parts = []
   lastoffset = None
   for offset, uri, form in sorted(entities):
     if offset == lastoffset:
       continue
     lastoffset = offset
     length = len(form)
     # Turn the absolute offset into an index into the remaining tail.
     offset += shift - done
     if offset < 0:
       print('Warning: offset negative (%d) for "%s", resetting to 0' % (offset, form))
       offset = 0
     # Prefer the last occurrence ending at or before the expected end ...
     i = text.rfind(form, 0, offset + length)
     if i < 0:
       # ... otherwise take the first occurrence at or after the expected start.
       i = text.find(form, offset)
       if i < 0:
         print('Error: "%s" not found at offset %d in:\n%s' % (form, offset, text))
         return []
     if i != offset:
       print('Warning: form "%s" shifted by %d at %d.' % (form, i - offset, offset))
       shift += i - offset
     if i > 0:
       parts.append([text[:i], None])
     parts.append([text[i:i + length], uri])
     text = text[i + length:]
     done += i + length
   if text:
     parts.append([text, None])
   return parts

 def write_entities(out, parts):
   """Write one tab-separated line per token of every [text, uri] part.

   Each part's text is decoded from UTF-8 and tokenized with
   wordpunct_tokenize. Every token yields a line "token<TAB>label<TAB>uri":
   the label is 'ENT' when the part has a uri and 'O' otherwise, and the uri
   column is filled only for the first token of an annotated part.
   """
   for segment, link in parts:
     # The label depends only on the part, so pick it once per part.
     if link:
       label = 'ENT'
     else:
       label = 'O'
     decoded = segment.decode('utf8')
     for index, token in enumerate(wordpunct_tokenize(decoded)):
       uri_column = ''
       if link and index == 0:
         uri_column = link
       out.write('%s\t%s\t%s\n' % (token.encode('utf8'), label, uri_column))

 def main(input, output):
   """Convert the TSV file at path input into the token file at path output.

   Every input line is split on tabs; fields 2 to 5 are read as uri, form,
   text and offset (offset must parse as an integer). Consecutive lines that
   carry the same text are grouped, and each group is split around its entities
   and written out as one token per line.

   Lines with a field count other than 6 are reported on stdout; lines whose
   fields cannot be unpacked or whose offset is not an integer are reported
   and skipped.
   """
   with open(output, 'w') as out:
     last_text = ''
     entities = []
     # Open the input in a with-block so the handle is closed deterministically.
     with open(input, 'r') as source:
       for line in source:
         fields = line.split('\t')
         if len(fields) != 6:
           print("Warning: unexpected length %d (expecting 6): %s" % (len(fields), line))
         try:
           uri, form, text, offset = fields[1:5]
           offset = int(offset)
         except ValueError as e:
           # Both a wrong number of fields and a non-numeric offset raise
           # ValueError. One pre-formatted argument keeps the message the same
           # on Python 2 and Python 3.
           print("Error: %s \n%s" % (e, line))
           continue
         if text != last_text:
           # A new text starts: flush the entities collected for the previous one.
           write_entities(out, split_parts(last_text, entities))
           last_text = text
           entities = []
         entities.append([offset, uri, form])
     # Flush the final group.
     write_entities(out, split_parts(last_text, entities))

 if __name__ == '__main__':
   # Command-line entry point: two positional arguments, input then output path.
   parser = argparse.ArgumentParser(description="Wiki Occs to Stanford NER converter.")
   parser.add_argument('input', metavar='in_file', help='Wiki Occs TSV file to read')
   parser.add_argument('output', metavar='out_file', help='Stanford NER training file to write')
   args = parser.parse_args()
   # The argument names 'input' and 'output' match main()'s parameter names.
   main(**vars(args))
	#!/usr/bin/env python

	import argparse
	from nltk.tokenize import wordpunct_tokenize

	def split_parts(fulltext, entities):
	  """Cut fulltext into consecutive [text, uri] parts around entity mentions.

	  entities is an iterable of (offset, uri, form) triples. They are processed
	  in sorted order, and a triple whose offset equals that of the previously
	  processed triple is skipped. Each form is located by string search near its
	  (shift-corrected) offset instead of being trusted blindly; a mismatch is
	  reported on stdout and carried forward as a shift for the entities after it.

	  Returns a list of [text, uri] lists that cover the text in order, with uri
	  set to None for unannotated stretches. Returns [] as soon as one form cannot
	  be found in the remaining text.
	  """
	  # Messages are printed as print(<one formatted string>): that form produces
	  # the same output as a Python 2 print statement and also parses on Python 3.
	  text = fulltext  # tail of fulltext that has not been emitted yet
	  shift = 0        # accumulated difference between given and found positions
	  done = 0         # number of characters of fulltext already emitted
	  parts = []
	  lastoffset = None
	  for offset, uri, form in sorted(entities):
	    if offset == lastoffset:
	      continue
	    lastoffset = offset
	    length = len(form)
	    # Turn the absolute offset into an index into the remaining tail.
	    offset += shift - done
	    if offset < 0:
	      print('Warning: offset negative (%d) for "%s", resetting to 0' % (offset, form))
	      offset = 0
	    # Prefer the last occurrence ending at or before the expected end ...
	    i = text.rfind(form, 0, offset + length)
	    if i < 0:
	      # ... otherwise take the first occurrence at or after the expected start.
	      i = text.find(form, offset)
	      if i < 0:
	        print('Error: "%s" not found at offset %d in:\n%s' % (form, offset, text))
	        return []
	    if i != offset:
	      print('Warning: form "%s" shifted by %d at %d.' % (form, i - offset, offset))
	      shift += i - offset
	    if i > 0:
	      parts.append([text[:i], None])
	    parts.append([text[i:i + length], uri])
	    text = text[i + length:]
	    done += i + length
	  if text:
	    parts.append([text, None])
	  return parts

	def write_entities(out, parts):
	  """Write one tab-separated line per token of every [text, uri] part.

	  Each part's text is decoded from UTF-8 and tokenized with
	  wordpunct_tokenize. Every token yields a line "token<TAB>label<TAB>uri":
	  the label is 'ENT' when the part has a uri and 'O' otherwise, and the uri
	  column is filled only for the first token of an annotated part.
	  """
	  for text, uri in parts:
	    # The label depends only on the part, so pick it once per part.
	    label = 'ENT' if uri else 'O'
	    tokens = wordpunct_tokenize(text.decode('utf8'))
	    for i, token in enumerate(tokens):
	      uri_column = uri if uri and i == 0 else ''
	      out.write('%s\t%s\t%s\n' % (token.encode('utf8'), label, uri_column))

	def main(input, output):
	  """Convert the TSV file at path input into the token file at path output.

	  Every input line is split on tabs; fields 2 to 5 are read as uri, form,
	  text and offset (offset must parse as an integer). Consecutive lines that
	  carry the same text are grouped, and each group is split around its entities
	  and written out as one token per line.

	  Lines with a field count other than 6 are reported on stdout; lines whose
	  fields cannot be unpacked or whose offset is not an integer are reported
	  and skipped.
	  """
	  with open(output, 'w') as out:
	    last_text = ''
	    entities = []
	    # Open the input in a with-block so the handle is closed deterministically.
	    with open(input, 'r') as source:
	      for line in source:
	        fields = line.split('\t')
	        if len(fields) != 6:
	          print("Warning: unexpected length %d (expecting 6): %s" % (len(fields), line))
	        try:
	          uri, form, text, offset = fields[1:5]
	          offset = int(offset)
	        except ValueError as e:
	          # Both a wrong number of fields and a non-numeric offset raise
	          # ValueError. One pre-formatted argument keeps the message the same
	          # on Python 2 and Python 3.
	          print("Error: %s \n%s" % (e, line))
	          continue
	        if text != last_text:
	          # A new text starts: flush the entities collected for the previous one.
	          write_entities(out, split_parts(last_text, entities))
	          last_text = text
	          entities = []
	        entities.append([offset, uri, form])
	    # Flush the final group.
	    write_entities(out, split_parts(last_text, entities))

	if __name__ == '__main__':
	  # Command-line entry point: two positional arguments, input then output path.
	  parser = argparse.ArgumentParser(description="Wiki Occs to Stanford NER converter.")
	  parser.add_argument('input', metavar='in_file', help='Wiki Occs TSV file to read')
	  parser.add_argument('output', metavar='out_file', help='Stanford NER training file to write')
	  args = parser.parse_args()
	  # The argument names 'input' and 'output' match main()'s parameter names.
	  main(**vars(args))