@philschmid
Last active January 31, 2020 06:24
Automatic NER labeling dataset: label raw text against a small entity dictionary with BIO tags and write the result as CoNLL-style TSV files.
def add_entity_to_words(sentence='', entities=()):
    """Label every word of `sentence` with a BIO tag based on the `entities` list."""
    res_sen_tpl = []
    words_in_sentence = tokenize_to_word(sentence)
    for wrd_idx, word in enumerate(words_in_sentence):
        if len(word) > 1:
            r_word = word  ## keep the original casing for the output
            word = word.lower()
            for ent in entities:
                if word in ent['name'].lower():
                    en_tpl = split_ent(ent['name'].lower())
                    next_wrd = next_word(words_in_sentence, wrd_idx)
                    prev_wrd = prev_word(words_in_sentence, wrd_idx)
                    ## word is the middle of a three-word entity
                    if [prev_wrd, word, next_wrd] == en_tpl:
                        res_sen_tpl.append({'idx': wrd_idx, 'word': r_word, 'type': f"I-{ent['type']}"})
                    ## word begins a multi-word entity
                    elif [word, next_wrd] == en_tpl or [word, next_wrd] == en_tpl[:-1]:
                        res_sen_tpl.append({'idx': wrd_idx, 'word': r_word, 'type': f"B-{ent['type']}"})
                    ## word continues a multi-word entity
                    elif [prev_wrd, word] == en_tpl or [prev_wrd, word] == en_tpl[1:]:
                        res_sen_tpl.append({'idx': wrd_idx, 'word': r_word, 'type': f"I-{ent['type']}"})
                    ## single-word entity, unless it is part of a longer entity
                    elif [word] == en_tpl:
                        if any(split_ent(d['name'].lower()) == [word, next_wrd] for d in entities):
                            pass
                        elif any(split_ent(d['name'].lower()) == [prev_wrd, word] for d in entities):
                            pass
                        else:
                            res_sen_tpl.append({'idx': wrd_idx, 'word': r_word, 'type': f"B-{ent['type']}"})
    ## merge the collected entity labels back into the full token sequence;
    ## every token without an entity label is tagged as outside ('O')
    labels = {d['idx']: d for d in res_sen_tpl}
    return_sentence = []
    for idx, word in enumerate(words_in_sentence):
        if idx in labels:
            return_sentence.append([labels[idx]['word'], labels[idx]['type']])
        else:
            return_sentence.append([word, 'O'])
    return return_sentence
entities = [
    {"name": "Data Architect", "type": "JOB"},
    {"name": "architect", "type": "JOB"},
    {"name": "snowflake", "type": "TECHNOLOGY"},
    {"name": "Data Integration", "type": "TASK"},
]
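
For a quick sanity check, here is what the labelling looks like on a made-up sentence with the entity list above (the exact token split depends on segtok, so the commented output is approximate):

sentence = "We hire a Data Architect with snowflake experience."
print(add_entity_to_words(sentence, entities))
## roughly: [['We', 'O'], ['hire', 'O'], ['a', 'O'],
##           ['Data', 'B-JOB'], ['Architect', 'I-JOB'],
##           ['with', 'O'], ['snowflake', 'B-TECHNOLOGY'],
##           ['experience', 'O'], ['.', 'O']]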
import csv

def write_ner_tsv(row, outpath='.', filename=None):
    """Write one labelled document as a CoNLL-style TSV file."""
    with open(f"{outpath}/{filename}", "wt") as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        ## write header
        tsv_writer.writerow(['#', row['title'], ''])
        ## tokenize into sentences
        list_of_sentences = tokenize_to_sen(row['text'])
        ## loop through sentences
        for sen in list_of_sentences:
            labelled_sen = add_entity_to_words(sen, entities)
            for word in labelled_sen:
                tsv_writer.writerow(word)
            ## empty row marks the end of the sentence
            tsv_writer.writerow(['', '', ''])
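
A minimal usage sketch; the `row` dict and file name below are invented for illustration, the function only assumes `title` and `text` keys:

row = {
    "title": "Job posting",
    "text": "We hire a Data Architect. Data Integration experience is a plus.",
}
write_ner_tsv(row, outpath=".", filename="example.tsv")
## writes one token and its tag per line, tab-separated,
## with an empty row after each sentence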
from segtok.segmenter import split_single
from segtok.tokenizer import word_tokenizer

def tokenize_to_word(sen=''):
    return word_tokenizer(sen)

def tokenize_to_sen(text=''):
    return [sent for sent in split_single(text) if len(sent) > 0]

def prev_word(sen, idx):
    ## there is no previous word for the first token
    if idx < 1:
        return None
    return sen[idx - 1].lower()

def next_word(sen, idx):
    ## there is no next word for the last token
    try:
        return sen[idx + 1].lower()
    except IndexError:
        return None

def split_ent(ent):
    ## multi-word entity names are split into their single words
    try:
        return ent.split(" ")
    except AttributeError:
        return [ent]
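
For reference, how the segtok-based helpers behave on their own (outputs assume segtok's default behaviour and may differ slightly between versions):

print(tokenize_to_sen("First sentence. Second sentence."))
## ['First sentence.', 'Second sentence.']
print(tokenize_to_word("Data Architect"))
## ['Data', 'Architect']
print(split_ent("Data Integration"))
## ['Data', 'Integration']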