Convert a Wikipedia occurrences file generated by DBpedia Spotlight's ExtractOccsFromWikipedia (and ExtractCandidateMap) to a format suitable for training Stanford NER (note: the occurrences file must not be URI-sorted).
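For orientation (the column layout here is inferred from the script below, not taken from the DBpedia Spotlight documentation): each input line is expected to carry six tab-separated columns, of which columns 2-5 are used: entity URI, surface form, context text, and the character offset of the form within that context. A hypothetical tab-separated input line (angle brackets mark placeholders) might look like:

    <id>    http://dbpedia.org/resource/Berlin    Berlin    Berlin is a city.    0    <extra>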
#!/usr/bin/env python
# NB: written for Python 2 (print statements, byte-string file I/O).
import argparse
from nltk.tokenize import wordpunct_tokenize
def split_parts(fulltext, entities):
    """Split fulltext into [chunk, uri] pairs: uri is set for chunks that are
    annotated entities, None for the plain text in between."""
    text = fulltext
    shift = 0  # cumulative correction applied when a form is found away from its stated offset
    done = 0   # number of characters already consumed from fulltext
    parts = []
    lastoffset = None
    for offset, uri, form in sorted(entities):
        # Skip duplicate annotations at the same offset.
        if offset == lastoffset:
            continue
        lastoffset = offset
        l = len(form)
        # Translate the absolute offset into an offset within the remaining text.
        offset += shift - done
        if offset < 0:
            print 'Warning: offset negative (%d) for "%s", resetting to 0' % (offset, form)
            offset = 0
        # Look for the surface form near the expected position: first backwards
        # from offset+l, then forwards from offset.
        i = text.rfind(form, 0, offset + l)
        if i < 0:
            i = text.find(form, offset)
        if i < 0:
            print 'Error: "%s" not found at offset %d in:\n%s' % (form, offset, text)
            return []
        if i != offset:
            print 'Warning: form "%s" shifted by %d at %d.' % (form, i - offset, offset)
            shift += i - offset
        # Emit any plain text before the entity, then the entity itself.
        if i > 0:
            parts.append([text[:i], None])
        parts.append([text[i:i + l], uri])
        text = text[i + l:]
        done += i + l
    # Whatever remains after the last entity is plain text.
    if len(text) > 0:
        parts.append([text, None])
    return parts
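# Illustrative example (not part of the original gist): for the context
# "Berlin is a city." with the single annotation
# [0, 'http://dbpedia.org/resource/Berlin', 'Berlin'], split_parts returns
# [['Berlin', 'http://dbpedia.org/resource/Berlin'], [' is a city.', None]];
# the un-annotated remainder keeps its leading space and is tokenized later
# by write_entities.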
def write_entities(out, parts):
    """Write one token per line: token, ENT/O label, and the entity URI on the
    first token of each entity (empty otherwise)."""
    for text, uri in parts:
        tokens = wordpunct_tokenize(text.decode('utf8'))
        for i, t in enumerate(tokens):
            e = 'ENT' if uri else 'O'
            u = uri if uri and i == 0 else ''
            out.write('%s\t%s\t%s\n' % (t.encode('utf8'), e, u))
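# Continuing the example above, write_entities would emit one tab-separated
# line per token (the third column is empty for O tokens):
#     Berlin    ENT    http://dbpedia.org/resource/Berlin
#     is        O
#     a         O
#     city      O
#     .         O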
def main(input, output):
    with open(output, 'w') as out:
        last_text = ''
        entities = []
        for line in open(input, 'r'):
            parts = line.split('\t')
            if len(parts) != 6:
                print "Warning: unexpected length %d (expecting 6): %s" % (len(parts), line)
            try:
                # Columns 2-5 of the occurrences file: URI, surface form,
                # context text and character offset.
                uri, form, text, offset = parts[1:5]
                offset = int(offset)
            except Exception, e:
                print "Error:", e, "\n", line
                continue
            # Occurrences for the same context are consecutive (hence the file
            # must not be uri-sorted); flush the previous context when it changes.
            if text != last_text:
                write_entities(out, split_parts(last_text, entities))
                last_text = text
                entities = []
            entities.append([offset, uri, form])
        # Flush the final context.
        write_entities(out, split_parts(last_text, entities))
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Wiki Occs to Stanford NER converter.")
    parser.add_argument('input', metavar='in_file', help='Wiki Occs TSV file to read')
    parser.add_argument('output', metavar='out_file', help='Stanford NER training file to write')
    args = parser.parse_args()
    main(**vars(args))
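Assuming the gist is saved as occs_to_ner.py (a name chosen here for illustration) and NLTK is installed, a typical invocation would be:

    python occs_to_ner.py occs.tsv ner_train.tsv

The resulting file is a tab-separated token/label/URI listing. To train Stanford NER's CRFClassifier on it, a minimal properties sketch could look like the following (paths are placeholders; depending on the Stanford NER version, the extra URI column may need to be dropped or declared in the map):

    trainFile = ner_train.tsv
    serializeTo = wiki-ner-model.ser.gz
    map = word=0,answer=1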