Last active
September 12, 2019 01:23
-
-
Save yuya-kanai/afaba9389ed729365f2bce7a42957690 to your computer and use it in GitHub Desktop.
Data labeling tool for Jupyter Notebook.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv

import gensim
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import Checkbox
# Word vectors in word2vec text format, loaded once when the module is
# imported; `ambiguous_word_labeling` queries this model for similar words.
word_model = gensim.models.KeyedVectors.load_word2vec_format('model.vec', binary=False)
def _load_word_set(path):
    """Return the words stored in the first row of the CSV file at *path*.

    A missing file, or a file without any row, yields an empty set so that
    the very first save in a fresh directory does not crash.
    """
    try:
        with open(path, 'r', newline='') as f:
            first_row = next(csv.reader(f), [])
    except FileNotFoundError:
        return set()
    return set(first_row)


def _write_word_set(path, words):
    """Write *words* to *path* as a single, fully quoted CSV row."""
    with open(path, 'w', newline='') as f:
        # Sorted so that repeated saves of the same set produce the same file.
        csv.writer(f, quoting=csv.QUOTE_ALL).writerow(sorted(words))


def ambiguous_word_labeling(ambiguous_word_list):
    """Display a notebook UI for labeling ambiguous words and their neighbours.

    For every word in *ambiguous_word_list* that ``word_model`` can resolve,
    an accordion panel is shown containing a three-way toggle
    ('キープ' / '無視' / '捨てる') and a grid of checkboxes listing the word
    together with the words returned by ``word_model.most_similar``.

    Pressing "Save" merges the labeled words into ``keep.csv`` and
    ``remove.csv`` in the current working directory:

    * 'キープ'  -> the word group is added to ``keep.csv``
    * '捨てる'  -> the word group is added to ``remove.csv``
    * '無視'    -> the word group is removed from both files

    Args:
        ambiguous_word_list: iterable of words to look up in ``word_model``.

    Returns:
        None. The widgets are rendered with ``IPython.display.display``.
    """
    # title -> {'state': current toggle value, 'derived_words': [words...]}
    foundational_words = {}

    def update_state(change, target):
        """Record the newly selected label for the word group *target*."""
        foundational_words[target]['state'] = change.new

    def save(change):
        """Merge the current labels into keep.csv / remove.csv."""
        keep_word_set = _load_word_set('keep.csv')
        remove_word_set = _load_word_set('remove.csv')
        disregard_word_set = set()

        destination_by_state = {
            'キープ': keep_word_set,
            '無視': disregard_word_set,
            '捨てる': remove_word_set,
        }
        for entry in foundational_words.values():
            destination = destination_by_state.get(entry['state'])
            if destination is not None:
                destination.update(entry['derived_words'])

        # Ignored words must not survive from an earlier session either.
        keep_word_set -= disregard_word_set
        remove_word_set -= disregard_word_set

        print("keep: ", keep_word_set)
        print("remove: ", remove_word_set)
        _write_word_set('keep.csv', keep_word_set)
        _write_word_set('remove.csv', remove_word_set)
        print('done')

    accordion_children = []
    # Titles are tracked alongside the panels actually created, so that a
    # skipped word cannot shift the titles of the panels that follow it.
    accordion_titles = []
    for title in ambiguous_word_list:
        try:
            similar_words = word_model.most_similar(positive=[title])
        except KeyError:
            # Presumably the word is not in the model's vocabulary; skip it.
            continue

        mapped_words = [word for word, _score in similar_words]
        mapped_words.append(title)
        foundational_words[title] = {
            'state': 'キープ',
            'derived_words': mapped_words,
        }

        # NOTE(review): the checkboxes are display-only; `save` does not read
        # their values, so unchecking a word has no effect on the output.
        items = [Checkbox(True, description=str(word)) for word in mapped_words]
        gridbox = widgets.GridBox(
            items,
            layout=widgets.Layout(grid_template_columns="repeat(4, 160px)"),
        )
        toggle_buttons = widgets.ToggleButtons(
            options=['キープ', '無視', '捨てる'],
            description='ラベル',
            value='キープ',
            disabled=False,
            button_style='',
            tooltips=['Description of slow', 'Description of regular', 'Description of fast'],
        )
        # Bind `title` as a default argument to avoid the late-binding closure
        # pitfall: each toggle must update its own word group.
        toggle_buttons.observe(
            lambda change, title=title: update_state(change, target=title),
            names='value',
        )
        accordion_children.append(widgets.VBox([toggle_buttons, gridbox]))
        accordion_titles.append(title)

    accordion = widgets.Accordion(children=accordion_children)
    for index, title in enumerate(accordion_titles):
        accordion.set_title(index, title)

    # `Button` has no `value` trait, so none is passed.
    save_button = widgets.Button(
        description='Save',
        button_style='success',
        tooltip='save file',
        icon='save',
    )
    save_button.on_click(save)

    display(widgets.VBox([accordion, save_button]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment