Last active
January 31, 2020 06:24
-
-
Save philschmid/8475b5b2b2ee640a2f1a3ed40b882ecd to your computer and use it in GitHub Desktop.
Automatic NER labeling dataset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def add_entity_to_words(sentence='', entities=''):
    """Assign BIO-style NER labels to every token of ``sentence``.

    Each (lower-cased) token is matched against the entity surface forms in
    ``entities`` (an iterable of dicts with ``'name'`` and ``'type'`` keys).
    Tokens that start a multi-word entity (or form a single-word entity) get a
    ``B-<TYPE>`` label, continuation tokens get ``I-<TYPE>``, and all other
    tokens get the conventional ``'O'`` label.

    Returns a list of ``[word, label]`` pairs, one per token, in sentence order.
    Relies on the sibling helpers ``tokenize_to_word``, ``next_word``,
    ``prev_word`` and ``split_ent`` defined elsewhere in this file.
    """
    matches = []  # one {'idx', 'word', 'type'} record per labelled token
    sentence = sentence.lower()
    words_in_sentence = tokenize_to_word(sentence)
    for wrd_idx, word in enumerate(words_in_sentence):
        if len(word) <= 1:
            continue  # ignore punctuation / single characters for matching
        r_word = f"{word}"
        word = word.lower()
        for ent in entities:
            # cheap substring pre-filter before the exact tuple comparisons
            if word not in ent['name'].lower():
                continue
            en_tpl = split_ent(ent['name'].lower())
            next_wrd = next_word(words_in_sentence, wrd_idx)
            prev_wrd = prev_word(words_in_sentence, wrd_idx)
            if [prev_wrd, word, next_wrd] == en_tpl:
                # middle token of a three-word entity -> inside tag
                matches.append({'idx': wrd_idx, 'word': r_word, 'type': f"I-{ent['type']}"})
            elif [word, next_wrd] == en_tpl or [word, next_wrd] == en_tpl[:-1]:
                # first token of a multi-word entity -> begin tag
                matches.append({'idx': wrd_idx, 'word': r_word, 'type': f"B-{ent['type']}"})
            elif [prev_wrd, word] == en_tpl or [prev_wrd, word] == en_tpl[1:]:
                # trailing token of a multi-word entity -> inside tag
                matches.append({'idx': wrd_idx, 'word': r_word, 'type': f"I-{ent['type']}"})
            elif [word] == en_tpl:
                # single-word entity: only label it if no multi-word entity
                # covering this position also matches (longer match wins)
                part_of_longer = (
                    any(split_ent(d['name'].lower()) == [word, next_wrd] for d in entities)
                    or any(split_ent(d['name'].lower()) == [prev_wrd, word] for d in entities)
                )
                if not part_of_longer:
                    matches.append({'idx': wrd_idx, 'word': r_word, 'type': f"B-{ent['type']}"})
    return_sentence = []
    for idx, word in enumerate(words_in_sentence):
        labelled = next((m for m in matches if m['idx'] == idx), None)
        if labelled is not None:
            return_sentence.append([labelled['word'], labelled['type']])
        else:
            # NOTE(review): the original output loop was garbled (it appended
            # one row per entity record instead of per token); unmatched
            # tokens receive the standard 'O' (outside) label here.
            return_sentence.append([word, 'O'])
    return return_sentence
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Gazetteer of entity surface forms used for automatic NER labelling:
# each entry maps a literal name to the entity type it should be tagged with.
entities = [
    {"name": "Data Architect", "type": "JOB"},
    {"name": "architect", "type": "JOB"},
    {"name": "snowflake", "type": "TECHNOLOGY"},
    {"name": "Data Integration", "type": "TASK"},
]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv


def write_ner_tsv(row='', outpath='.', filename=None):
    """Write one labelled document as a CoNLL-style tab-separated file.

    row: mapping with a 'title' (used for the header row) and 'text'
         (the raw document body to tokenize and label).
    outpath: directory the file is written into.
    filename: output file name; falls back to '<title>.tsv' when omitted.

    Uses the sibling helpers ``tokenize_to_sen`` / ``add_entity_to_words``
    and the module-level ``entities`` gazetteer.
    """
    # NOTE(review): the original opened a hard-coded path and never used the
    # ``filename`` parameter, which is clearly meant to supply the file name.
    target = filename if filename is not None else f"{row['title']}.tsv"
    with open(f"{outpath}/{target}", "wt") as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        # header row: marker column plus the document title
        tsv_writer.writerow(['#', row['title'], ''])
        # label each sentence and emit one [word, tag] row per token
        for sen in tokenize_to_sen(row['text']):
            for word in add_entity_to_words(sen, entities):
                tsv_writer.writerow(word)
            # blank row separates sentences in CoNLL-style files
            tsv_writer.writerow(['', '', ''])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from segtok.segmenter import split_single
# NOTE(review): original had 'segtok.tokenzier' (typo) -> ImportError.
from segtok.tokenizer import word_tokenizer


def tokenize_to_word(sen=''):
    """Split a single sentence into word tokens via segtok."""
    return word_tokenizer(sen)


def tokenize_to_sen(text=''):
    """Split raw text into its non-empty sentences via segtok."""
    return [sent for sent in split_single(text) if len(sent) > 0]
def prev_word(sen, idx):
    """Return the lower-cased word before position ``idx`` in ``sen``.

    Returns None when ``idx`` is the first position or the lookup falls
    outside the sequence.
    """
    if idx < 1:
        return None
    try:
        return sen[idx - 1].lower()
    except IndexError:  # was a bare except: only out-of-range is expected
        return None
def next_word(sen, idx):
    """Return the lower-cased word after position ``idx`` in ``sen``.

    Returns None when ``idx`` is the last position (or otherwise out of
    range).
    """
    try:
        return sen[idx + 1].lower()
    except IndexError:  # was a bare except: only out-of-range is expected
        return None
def split_ent(ent):
    """Split an entity name on spaces into its component words.

    Non-string inputs (anything without ``.split``) are wrapped in a
    single-element list instead of raising.
    """
    try:
        return ent.split(" ")
    except AttributeError:  # was a bare except: only non-strings lack .split
        return [ent]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment