#!/usr/bin/env python
# encoding: utf-8
print 'Content-Type: text/html; charset=UTF-8\n\n'
print "<html><body>"
import os
import sys
import cgi
import pickle
import pipes  # for pipes.quote, used to shell-escape the submitted sentence below
from collections import defaultdict
from normalizer import normalize, _convert_marks, _delete_cyclic_word
import MeCab
#os.chdir("/Users/tomoki/Sites/VisualizeYahooSamples")
#sys.path.append("/Users/tomoki/Sites/VisualizeYahooSamples")
#from YahooVisualizePreprocess import *

# Japanese negation words (base forms) used to merge a verb with its negation.
hiteiword = ["ない", "ぬ", "まい", "ん"]
def morphological_analysis(text):
    # Parse the MeCab result written to cur_output.txt. Each line has the form
    # "surface<TAB>POS,POS-sub1,...,conjugation type,conjugation form,base form,..."
    # (IPAdic features: index 0 = POS, 1 = POS subcategory, 5 = conjugation form, 6 = base form).
    cur_data_wakati = open("cur_output.txt").read().split("\n")[0:-2]
    original_words = [res.split("\t")[0] for res in cur_data_wakati]
    normed_words = [res.split("\t")[1].split(",")[6] for res in cur_data_wakati]
    hinshi_array = [res.split("\t")[1].split(",")[0] for res in cur_data_wakati]
    forms = [res.split("\t")[1].split(",")[5] for res in cur_data_wakati]
    nums = [res.split("\t")[1].split(",")[1] for res in cur_data_wakati]
    wordarray = ""
    wordarray_origin = ""
    index = 0
    while index < len(original_words):
        word = original_words[index]
        hinshi = hinshi_array[index]
        # Keep nouns, verbs and adjectives that are not numerals and have a known base form.
        if (hinshi in ("名詞", "動詞", "形容詞")) and (nums[index] != "数") and (normed_words[index] != "*"):
            if (hinshi == "動詞") and (forms[index] == "未然形"):
                # Verb in irrealis form followed by a negation word: merge the pair.
                if ((index + 1) < len(original_words)) and (normed_words[index+1] in hiteiword):
                    wordarray = wordarray + word + normed_words[index+1] + " "
                    wordarray_origin = wordarray_origin + word + normed_words[index+1] + " "
                    index += 2
                    continue
            elif hinshi == "動詞":
                # Verb followed by another verb (compound verb): concatenate without a space.
                if ((index + 1) < len(original_words)) and (hinshi_array[index+1] == "動詞"):
                    wordarray = wordarray + word
                    wordarray_origin = wordarray_origin + word
                    index += 1
                    continue
                # Verb + auxiliary verb in irrealis form: merge both with the base form
                # of the token that follows (pattern 4).
                if ((index + 2) < len(original_words)) and (hinshi_array[index+1] == "助動詞") and (forms[index+1] == "未然形"):
                    wordarray = wordarray + word + original_words[index+1] + normed_words[index+2] + " "
                    wordarray_origin = wordarray_origin + word + original_words[index+1] + original_words[index+2] + " "
                    index += 3
                    continue
            # Default for content words: emit the normalized (base) form.
            wordarray = wordarray + normed_words[index] + " "
            wordarray_origin = wordarray_origin + word + " "
        else:
            wordarray = wordarray + word + " "
            wordarray_origin = wordarray_origin + word + " "
        index += 1
    wordarray = wordarray + "\n"
    wordarray_origin = wordarray_origin + "\n"
    return (wordarray, wordarray_origin)
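# A minimal sketch of the expected input/output, assuming cur_output.txt holds
# standard IPAdic-formatted MeCab output (surface and features tab-separated)
# for the sentence "株価が上がらない" ("stock prices do not rise"):
#
#   株価    名詞,一般,*,*,*,*,株価,カブカ,カブカ
#   が      助詞,格助詞,一般,*,*,*,が,ガ,ガ
#   上がら  動詞,自立,*,*,五段・ラ行,未然形,上がる,アガラ,アガラ
#   ない    助動詞,*,*,*,特殊・ナイ,基本形,ない,ナイ,ナイ
#   EOS
#
# The irrealis verb 上がら merges with the following negation ない, so the function
# would return ("株価 が 上がらない \n", "株価 が 上がらない \n").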
def output_colored_text(text, polarity_word_dic, word2vecdic, weight_cluster, Cluster_label_dic, context=True):
    # Note: morphological_analysis reads the MeCab result from cur_output.txt; its text
    # argument is unused, so MeCab must have been run on the same sentence beforehand.
    wakati_text, wakati_text_origin = morphological_analysis(normalize(text).encode("utf-8"))
    #wakati_text, wakati_text_origin = morphological_analysis(text.encode("utf-8"))
    wakati_text_origin_list = wakati_text_origin.split()
    weight_sort = []
    word_list = []
    for word_index, word in enumerate(wakati_text.split()):
        if word in polarity_word_dic:
            if context:
                # Context-aware weight: the word's polarity score scaled by the weight of its cluster.
                weight_sort.append(weight_cluster[Cluster_label_dic[word2vecdic[word.decode("utf-8")]]][1] * polarity_word_dic[word])
            else:
                weight_sort.append(polarity_word_dic[word])
            #word_list.append(word)
            #word_list.append(wakati_text_origin_list[word_index])
            word_list.append(wakati_text_origin_list[word_index] + "[" + str(word2vecdic[word.decode("utf-8")]) + "]")
        else:
            weight_sort.append(0)
            #word_list.append(word)
            word_list.append(wakati_text_origin_list[word_index])
    return word_list, weight_sort
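# Sketch of the return values, with hypothetical dictionaries: for the tokens
# "株価 上昇" with polarity_word_dic = {"上昇": 0.8} and an assumed word2vec
# id of 123 for 上昇, the function would return something like
#   word_list   = ["株価", "上昇[123]"]
#   weight_sort = [0, 0.8 * w]   where w is the weight of 上昇's cluster (context=True)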
import matplotlib
matplotlib.use("Agg")  # non-interactive backend so the import works under CGI (no display)
import matplotlib.pyplot as plt
def rescale_score_by_abs(score, max_score, min_score):
    """
    Rescale a positive score to the range [0.5, 1.0] and a negative score to the
    range [0.0, 0.5], using the extremal scores max_score and min_score for normalization.
    """
    # CASE 1: positive AND negative scores occur --------------------
    if max_score > 0 and min_score < 0:
        if max_score >= abs(min_score):  # deepest color is positive
            if score >= 0:
                return 0.5 + 0.5 * (score / max_score)
            else:
                return 0.5 - 0.5 * (abs(score) / max_score)
        else:  # deepest color is negative
            if score >= 0:
                return 0.5 + 0.5 * (score / abs(min_score))
            else:
                return 0.5 - 0.5 * (score / min_score)
    # CASE 2: ONLY positive scores occur -----------------------------
    elif max_score > 0 and min_score >= 0:
        if max_score == min_score:
            return 1.0
        else:
            return 0.5 + 0.5 * (score / max_score)
    # CASE 3: ONLY negative scores occur -----------------------------
    elif max_score <= 0 and min_score < 0:
        if max_score == min_score:
            return 0.0
        else:
            return 0.5 - 0.5 * (score / min_score)
    # CASE 4: all scores are zero: return the neutral midpoint.
    else:
        return 0.5
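# Worked examples (hypothetical scores) for the rescaling above: with
# max_score = 2.0 and min_score = -1.0, the positive side is deepest, so
#   rescale_score_by_abs( 2.0, 2.0, -1.0) -> 1.0   (strongest positive)
#   rescale_score_by_abs(-1.0, 2.0, -1.0) -> 0.25  (negative, scaled by the larger |max_score|)
#   rescale_score_by_abs( 0.0, 2.0, -1.0) -> 0.5   (neutral midpoint)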
def getRGB(c_tuple):
    return "#%02x%02x%02x" % (int(c_tuple[0] * 255), int(c_tuple[1] * 255), int(c_tuple[2] * 255))
def span_word(word, score, colormap):
    return "<span style=\"background-color:" + getRGB(colormap(score)) + "\">" + word + "</span>"
def html_heatmap(words, scores, cmap_name="bwr", context=True):
    colormap = plt.get_cmap(cmap_name)
    assert len(words) == len(scores)
    max_s = max(scores)
    min_s = min(scores)
    output_text = ""
    for idx, w in enumerate(words):
        if context:
            #score = 0.5 + scores[idx]/max(abs(max_s), abs(min_s))
            score = rescale_score_by_abs(scores[idx], max_s, min_s)
        else:
            score = 0.5 + scores[idx]
        output_text = output_text + span_word(w, score, colormap) + " "
    return output_text + "\n"
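# A small usage sketch (hypothetical weights): html_heatmap(["株価", "上昇"], [0.0, 0.8])
# rescales 0.0 -> 0.5 (white) and 0.8 -> 1.0 (red) and returns roughly
#   <span style="background-color:#ffffff">株価</span> <span style="background-color:#ff0000">上昇</span>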
DimensionN = 1000
word2vecdic = pickle.load(open("word2vecdic_kihon4_" + str(DimensionN) + ".pkl", "r"))
#polarity_dic_list = pickle.load(open("result_for_PAKDD2018/polarity_dic_list.pkl"))
polarity_dic_mean = pickle.load(open("VisualizeYahooSamples/polarity_dic_mean.pkl"))
#polarity_dic_mean = defaultdict(int)
#for polarity_dic in polarity_dic_list:
#    for word in polarity_dic:
#        polarity_dic_mean[word] += polarity_dic[word]/5.0
weight_cluster = pickle.load(open("VisualizeYahooSamples/weight_cluster.pkl"))
Cluster_label_dic = pickle.load(open("VisualizeYahooSamples/Cluster_label_dic.pkl"))
#text = "株価が上昇する"  # "Stock prices rise"
#word_list, weight_sort = output_colored_text(text, polarity_dic_mean, word2vecdic, weight_cluster, Cluster_label_dic, context=True)
#print html_heatmap(word_list, weight_sort, context=True)
#display(HTML(html_heatmap(word_list, weight_sort, context=True)))
form = cgi.FieldStorage()
form_check = 0
if "sentence" in form:
    form_check = 1
if form_check == 0:
    print "<h1>ERROR !</h1>"
else:
    print "<h2>Japanese Sentence (Original)</h2><hr>"
    print cgi.escape(form["sentence"].value)  # escape user input before echoing it back
    print "<br>"
    # Run MeCab on the submitted sentence; pipes.quote guards against shell injection.
    os.system("echo " + pipes.quote(form["sentence"].value) + " | /usr/local/bin/mecab" +
              " -u /Users/tomoki/Mypython/ruiternews/yahoo_financeboard/userdictionary/ruiter-keyword.dic," +
              "/Users/tomoki/Mypython/ruiternews/yahoo_financeboard/userdictionary/wikipedia-keyword.dic," +
              "/Users/tomoki/Mypython/ruiternews/yahoo_financeboard/userdictionary/hatena-keyword.dic" +
              " > cur_output.txt")
    word_list, weight_sort = output_colored_text(form["sentence"].value, polarity_dic_mean, word2vecdic, weight_cluster, Cluster_label_dic, context=True)
    print "<h2>Japanese Sentence (Colored)</h2><hr>"
    print html_heatmap(word_list, weight_sort, context=True)
    #print "<b>mail: </b>", form["mail"].value
    #html_data = open("VisualizeYahooSamples/VisResult/body/output_0.html").read()
    #print html_data
print "</body></html>"