Skip to content

Instantly share code, notes, and snippets.

@emlyn
Last active December 19, 2015 02:38
Show Gist options
  • Save emlyn/5884369 to your computer and use it in GitHub Desktop.
Save emlyn/5884369 to your computer and use it in GitHub Desktop.
Converts a Wikipedia occurrences file generated by DBpedia Spotlight's ExtractOccsFromWikipedia (and ExtractCandidateMap) into a format suitable for training Stanford NER. Note: the occurrences file must not be URI-sorted.
#!/usr/bin/env python
import argparse
from nltk.tokenize import wordpunct_tokenize
def split_parts(fulltext, entities):
    """Split *fulltext* into consecutive annotated and unannotated pieces.

    Args:
        fulltext: The text that the entity offsets refer to.
        entities: Iterable of ``(offset, uri, form)`` triples, where *form*
            is the surface string expected at (or near) character *offset*
            of *fulltext* and *uri* is the label attached to it.

    Returns:
        A list of ``[text, uri]`` pairs covering *fulltext* from left to
        right; *uri* is ``None`` for the stretches between entities.  If a
        surface form cannot be located at all, an error is printed and an
        empty list is returned.
    """
    remaining = fulltext  # suffix of fulltext that has not been emitted yet
    shift = 0             # accumulated drift between stated and real offsets
    consumed = 0          # number of characters of fulltext already emitted
    parts = []
    last_offset = None
    for offset, uri, form in sorted(entities):
        # Only the first entity (in sort order) at a given offset is kept.
        if offset == last_offset:
            continue
        last_offset = offset
        length = len(form)
        # Turn the absolute offset into an index into `remaining`, corrected
        # by the drift observed on earlier entities.
        offset += shift - consumed
        if offset < 0:
            print('Warning: offset negative (%d) for "%s", resetting to 0'
                  % (offset, form))
            offset = 0
        # Prefer the last occurrence ending at or before the expected end;
        # if there is none, search forward from the expected start.
        index = remaining.rfind(form, 0, offset + length)
        if index < 0:
            index = remaining.find(form, offset)
        if index < 0:
            print('Error: "%s" not found at offset %d in:\n%s'
                  % (form, offset, remaining))
            return []
        if index != offset:
            print('Warning: form "%s" shifted by %d at %d.'
                  % (form, index - offset, offset))
            shift += index - offset
        if index > 0:
            parts.append([remaining[:index], None])
        parts.append([remaining[index:index + length], uri])
        remaining = remaining[index + length:]
        consumed += index + length
    if remaining:
        parts.append([remaining, None])
    return parts
def write_entities(out, parts):
    """Write one tab-separated line per token of *parts* to *out*.

    Each line is ``token<TAB>label<TAB>uri``: *label* is ``ENT`` for tokens
    of a piece that carries a URI and ``O`` otherwise, and *uri* is filled
    in only on the first token of such a piece (empty on all other lines).

    Args:
        out: Object with a ``write`` method accepting native strings.
        parts: Iterable of ``(text, uri)`` pairs, as returned by
            ``split_parts``.
    """
    for text, uri in parts:
        # Tokenize text, not bytes: byte strings (what Python 2 reads from a
        # file) are decoded as UTF-8; already-decoded text is used as is.
        if isinstance(text, bytes):
            text = text.decode('utf8')
        label = 'ENT' if uri else 'O'
        for index, token in enumerate(wordpunct_tokenize(text)):
            # Convert back to the native ``str`` type so that %-formatting
            # never emits a ``b'...'`` repr (Python 3) and never mixes
            # unicode with byte strings (Python 2).
            if not isinstance(token, str):
                token = token.encode('utf8')
            first_uri = uri if uri and index == 0 else ''
            out.write('%s\t%s\t%s\n' % (token, label, first_uri))
def main(input, output):
    """Convert the occurrences TSV file *input* into the file *output*.

    Every input line is split on tabs; fields 1-4 (zero-based) are read as
    URI, surface form, text and integer offset.  Consecutive lines that
    share the same text are collected and written out together, so lines
    belonging to one text must be adjacent in the input.

    Args:
        input: Path of the tab-separated occurrences file to read.
        output: Path of the training file to write (overwritten).
    """
    # Both files are closed on exit, including when an error propagates.
    with open(output, 'w') as out, open(input, 'r') as occs:
        last_text = ''
        entities = []
        for line in occs:
            fields = line.split('\t')
            if len(fields) != 6:
                print("Warning: unexpected length %d (expecting 6): %s"
                      % (len(fields), line))
            try:
                uri, form, text, offset = fields[1:5]
                offset = int(offset)
            except ValueError as e:
                # Too few fields to unpack, or a non-numeric offset:
                # report the line and skip it.
                print("Error: %s \n%s" % (e, line))
                continue
            if text != last_text:
                # A new text starts: flush what was collected for the
                # previous one.
                write_entities(out, split_parts(last_text, entities))
                last_text = text
                entities = []
            entities.append([offset, uri, form])
        # Flush the entities of the final text.
        write_entities(out, split_parts(last_text, entities))
if __name__ == '__main__':
    # Command-line entry point: the two positional arguments are passed to
    # main() by keyword, so their names must match main()'s parameters.
    parser = argparse.ArgumentParser(description="Wiki Occs to Stanford NER converter.")
    parser.add_argument('input', metavar='in_file', help='Wiki Occs TSV file to read')
    parser.add_argument('output', metavar='out_file', help='Stanford NER training file to write')
    args = parser.parse_args()
    main(**vars(args))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment