@philschmid
Last active January 31, 2020 06:24
Automatic NER labeling dataset: label raw text against a small entity dictionary with BIO tags and write the result as CoNLL-style TSV files.
def add_entity_to_words(sentence='', entities=()):
    """Label every word of `sentence` with a BIO tag based on the `entities` list."""
    res_sen_tpl = []
    words_in_sentence = tokenize_to_word(sentence)
    for wrd_idx, word in enumerate(words_in_sentence):
        if len(word) > 1:
            r_word = word  ## keep the original casing for the output
            word = word.lower()
            for ent in entities:
                if word in ent['name'].lower():
                    en_tpl = split_ent(ent['name'].lower())
                    next_wrd = next_word(words_in_sentence, wrd_idx)
                    prev_wrd = prev_word(words_in_sentence, wrd_idx)
                    ## word is the middle of a three-word entity
                    if [prev_wrd, word, next_wrd] == en_tpl:
                        res_sen_tpl.append({'idx': wrd_idx, 'word': r_word, 'type': f"I-{ent['type']}"})
                    ## word begins a multi-word entity
                    elif [word, next_wrd] == en_tpl or [word, next_wrd] == en_tpl[:-1]:
                        res_sen_tpl.append({'idx': wrd_idx, 'word': r_word, 'type': f"B-{ent['type']}"})
                    ## word continues a multi-word entity
                    elif [prev_wrd, word] == en_tpl or [prev_wrd, word] == en_tpl[1:]:
                        res_sen_tpl.append({'idx': wrd_idx, 'word': r_word, 'type': f"I-{ent['type']}"})
                    ## single-word entity, unless it is part of a longer entity
                    elif [word] == en_tpl:
                        if any(split_ent(d['name'].lower()) == [word, next_wrd] for d in entities):
                            pass
                        elif any(split_ent(d['name'].lower()) == [prev_wrd, word] for d in entities):
                            pass
                        else:
                            res_sen_tpl.append({'idx': wrd_idx, 'word': r_word, 'type': f"B-{ent['type']}"})
    ## merge the collected entity labels back into the full token sequence;
    ## every token without an entity label is tagged as outside ('O')
    labels = {d['idx']: d for d in res_sen_tpl}
    return_sentence = []
    for idx, word in enumerate(words_in_sentence):
        if idx in labels:
            return_sentence.append([labels[idx]['word'], labels[idx]['type']])
        else:
            return_sentence.append([word, 'O'])
    return return_sentence
entities = [
    {"name": "Data Architect", "type": "JOB"},
    {"name": "architect", "type": "JOB"},
    {"name": "snowflake", "type": "TECHNOLOGY"},
    {"name": "Data Integration", "type": "TASK"},
]
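
For a quick sanity check, here is what the labelling looks like on a made-up sentence with the entity list above (the exact token split depends on segtok, so the commented output is approximate):

sentence = "We hire a Data Architect with snowflake experience."
print(add_entity_to_words(sentence, entities))
## roughly: [['We', 'O'], ['hire', 'O'], ['a', 'O'],
##           ['Data', 'B-JOB'], ['Architect', 'I-JOB'],
##           ['with', 'O'], ['snowflake', 'B-TECHNOLOGY'],
##           ['experience', 'O'], ['.', 'O']]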
import csv

def write_ner_tsv(row, outpath='.', filename=None):
    """Write one labelled document as a CoNLL-style TSV file."""
    with open(f"{outpath}/{filename}", "wt") as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        ## write header
        tsv_writer.writerow(['#', row['title'], ''])
        ## tokenize into sentences
        list_of_sentences = tokenize_to_sen(row['text'])
        ## loop through sentences
        for sen in list_of_sentences:
            labelled_sen = add_entity_to_words(sen, entities)
            for word in labelled_sen:
                tsv_writer.writerow(word)
            ## empty row marks the end of the sentence
            tsv_writer.writerow(['', '', ''])
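
A minimal usage sketch; the `row` dict and file name below are invented for illustration, the function only assumes `title` and `text` keys:

row = {
    "title": "Job posting",
    "text": "We hire a Data Architect. Data Integration experience is a plus.",
}
write_ner_tsv(row, outpath=".", filename="example.tsv")
## writes one token and its tag per line, tab-separated,
## with an empty row after each sentence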
from segtok.segmenter import split_single
from segtok.tokenizer import word_tokenizer

def tokenize_to_word(sen=''):
    return word_tokenizer(sen)

def tokenize_to_sen(text=''):
    return [sent for sent in split_single(text) if len(sent) > 0]

def prev_word(sen, idx):
    ## there is no previous word for the first token
    if idx < 1:
        return None
    return sen[idx - 1].lower()

def next_word(sen, idx):
    ## there is no next word for the last token
    try:
        return sen[idx + 1].lower()
    except IndexError:
        return None

def split_ent(ent):
    ## multi-word entity names are split into their single words
    try:
        return ent.split(" ")
    except AttributeError:
        return [ent]
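
For reference, how the segtok-based helpers behave on their own (outputs assume segtok's default behaviour and may differ slightly between versions):

print(tokenize_to_sen("First sentence. Second sentence."))
## ['First sentence.', 'Second sentence.']
print(tokenize_to_word("Data Architect"))
## ['Data', 'Architect']
print(split_ent("Data Integration"))
## ['Data', 'Integration']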