Skip to content

Instantly share code, notes, and snippets.

@ItoTomoki
Created July 4, 2018 05:07
Show Gist options
  • Save ItoTomoki/40658c0448b8a9942243c10b1cbb5b61 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# encoding: utf-8
# Python 2 CGI script: renders a sentiment "heatmap" of a Japanese sentence
# submitted via an HTML form, coloring each token by its polarity score.
# The Content-Type header and opening HTML are printed before anything else
# so that all later output (including error messages) lands inside a valid
# HTTP response body.
print 'Content-Type: text/html; charset=UTF-8\n\n'
print "<html><body>"
import os
#os.chdir("/Users/tomoki/Sites/VisualizeYahooSamples")
import sys,os
import cgi
import pickle
from collections import defaultdict
from normalizer import normalize,_convert_marks,_delete_cyclic_word
import MeCab
#sys.path.append("/Users/tomoki/Sites/VisualizeYahooSamples")
#from YahooVisualizePreprocess import *
# Japanese negation auxiliaries ("nai", "nu", "mai", "n"). A verb in
# imperfective (未然形) form followed by one of these is merged with it into
# a single negated token during morphological analysis below.
hiteiword = ["ない", "ぬ", "まい", "ん"]
def morphological_analysis(text):
    """Build two parallel space-separated token strings from MeCab output.

    NOTE: `text` is unused — the MeCab result is read from "cur_output.txt",
    which the caller must have written beforehand (kept for interface
    compatibility).

    Returns a tuple (wordarray, wordarray_origin), each ending in "\\n":
      - wordarray: tokens with nouns/adjectives replaced by their base form
        and negated verbs merged with their negation auxiliary;
      - wordarray_origin: the corresponding surface forms.
    """
    # Use a context manager so the file handle is always closed
    # (the original leaked it). Drop the trailing "EOS" / empty lines.
    with open("cur_output.txt") as f:
        rows = f.read().split("\n")[0:-2]
    # Parse each MeCab line ("surface\tfeature,feature,...") exactly once
    # instead of re-splitting it five times.
    original_words = []
    normed_words = []   # field 6: base (dictionary) form
    hinshi_array = []   # field 0: part of speech
    forms = []          # field 5: conjugation form
    nums = []           # field 1: POS sub-category ("数" = number)
    for row in rows:
        surface = row.split("\t")[0]
        fields = row.split("\t")[1].split(",")
        original_words.append(surface)
        normed_words.append(fields[6])
        hinshi_array.append(fields[0])
        forms.append(fields[5])
        nums.append(fields[1])
    # Accumulate pieces in lists and join once (the original used quadratic
    # string concatenation).
    out = []
    out_origin = []
    index = 0
    n = len(original_words)
    while index < n:
        word = original_words[index]
        hinshi = hinshi_array[index]
        is_content = hinshi in ("名詞", "動詞", "形容詞")
        if is_content and nums[index] != "数" and normed_words[index] != "*":
            if hinshi == "動詞" and forms[index] == "未然形":
                # Imperfective verb + negation auxiliary -> one negated token.
                if index + 1 < n and normed_words[index + 1] in hiteiword:
                    out.append(word + normed_words[index + 1] + " ")
                    out_origin.append(word + normed_words[index + 1] + " ")
                    index += 2
                    continue
                # Otherwise the verb is silently dropped (original behavior).
            elif hinshi == "動詞":
                if index + 1 < n and hinshi_array[index + 1] == "動詞":
                    # Compound verb: glue to the next verb, no separator.
                    out.append(word)
                    out_origin.append(word)
                    index += 1
                    continue
                # BUG FIX: this pattern lived in a second, unreachable
                # `elif (hinshi == "動詞")` branch in the original (dead code
                # shadowed by the branch above); it is reactivated here.
                # Verb + auxiliary verb in imperfective form (+ optional
                # negation word) -> one merged token. Both arms of the
                # original inner if/else appended the identical string, so
                # they are collapsed.
                if (index + 2 < n
                        and hinshi_array[index + 1] == "助動詞"
                        and forms[index + 1] == "未然形"):
                    out.append(word + original_words[index + 1]
                               + normed_words[index + 2] + " ")
                    out_origin.append(word + original_words[index + 1]
                                      + original_words[index + 2] + " ")
                    index += 3
                    continue
            else:
                # Noun / adjective: emit the normalized (base) form.
                out.append(normed_words[index] + " ")
                out_origin.append(word + " ")
        else:
            # Function words, numbers, unknown base forms: keep the surface.
            out.append(word + " ")
            out_origin.append(word + " ")
        index += 1
    return ("".join(out) + "\n", "".join(out_origin) + "\n")
def output_colored_text(text, polarity_word_dic, word2vecdic, weight_cluster, Cluster_label_dic, context = True):
    """Tokenize *text* and compute a display weight per token.

    Parameters:
      text              -- raw sentence (unicode); normalized then tokenized
                           via morphological_analysis (reads cur_output.txt).
      polarity_word_dic -- token -> polarity score.
      word2vecdic       -- token (unicode) -> cluster/vector id.
      weight_cluster    -- cluster label -> weight pair; index [1] is used.
      Cluster_label_dic -- vector id -> cluster label.
      context           -- if True, scale each polarity by its cluster weight.

    Returns (word_list, weight_sort): the surface tokens (polarity-bearing
    ones annotated with "[vector-id]") and their weights (0 for tokens
    without a polarity entry).
    """
    wakati_text, wakati_text_origin = morphological_analysis(normalize(text).encode("utf-8"))
    wakati_text_origin_list = wakati_text_origin.split()
    weight_sort = []
    word_list = []
    for word_index, word in enumerate(wakati_text.split()):
        # Membership test on the dict itself — `in dic.keys()` builds a list
        # and scans it linearly under Python 2.
        if word in polarity_word_dic:
            vec_id = word2vecdic[word.decode("utf-8")]
            if context:
                weight_sort.append(weight_cluster[Cluster_label_dic[vec_id]][1] * polarity_word_dic[word])
            else:
                weight_sort.append(polarity_word_dic[word])
            word_list.append(wakati_text_origin_list[word_index] + "[" + str(vec_id) + "]")
        else:
            weight_sort.append(0)
            word_list.append(wakati_text_origin_list[word_index])
    return word_list, weight_sort
import matplotlib.pyplot as plt
def rescale_score_by_abs (score, max_score, min_score):
    """
    Rescale a positive score to the range [0.5, 1.0] and a negative score to
    the range [0.0, 0.5], using the extremal scores max_score and min_score
    for normalization (0.5 is neutral).

    Fixes over the original:
      - `score` is coerced to float so Python 2 integer division cannot
        silently truncate the ratios;
      - the max_score == min_score == 0 case returns a neutral 0.5 instead
        of falling off the end and returning None.
    """
    score = float(score)
    # CASE 1: positive AND negative scores occur --------------------
    if max_score > 0 and min_score < 0:
        if max_score >= abs(min_score):  # deepest color is positive
            if score >= 0:
                return 0.5 + 0.5*(score/max_score)
            else:
                return 0.5 - 0.5*(abs(score)/max_score)
        else:                            # deepest color is negative
            if score >= 0:
                return 0.5 + 0.5*(score/abs(min_score))
            else:
                # score/min_score is positive (both negative), so this
                # maps into [0.0, 0.5].
                return 0.5 - 0.5*(score/min_score)
    # CASE 2: ONLY positive scores occur -----------------------------
    elif max_score > 0 and min_score >= 0:
        if max_score == min_score:
            return 1.0
        return 0.5 + 0.5*(score/max_score)
    # CASE 3: ONLY negative scores occur -----------------------------
    elif max_score <= 0 and min_score < 0:
        if max_score == min_score:
            return 0.0
        return 0.5 - 0.5*(score/min_score)
    # CASE 4: max_score == min_score == 0 — all scores neutral.
    return 0.5
def getRGB (c_tuple):
    """Convert an (r, g, b[, a]) tuple of floats in [0, 1] to a '#rrggbb' hex string."""
    channels = [int(component * 255) for component in c_tuple[:3]]
    return "#{0:02x}{1:02x}{2:02x}".format(channels[0], channels[1], channels[2])
def span_word (word, score, colormap):
    """Wrap *word* in an HTML <span> whose background color encodes *score* through *colormap*."""
    color = getRGB(colormap(score))
    return '<span style="background-color:{0}">{1}</span>'.format(color, word)
def html_heatmap (words, scores, cmap_name="bwr", context = True):
    """Render *words* as colored HTML spans, one per score, ending in "\\n".

    Parameters:
      words     -- token strings; must be the same length as *scores*.
      scores    -- per-token weights.
      cmap_name -- matplotlib colormap name (default "bwr": blue/white/red).
      context   -- if True, rescale scores against the sentence extremes;
                   otherwise shift by 0.5 (scores presumably already lie in
                   [-0.5, 0.5] — TODO confirm with callers).

    Raises AssertionError on a length mismatch and ValueError from max()/min()
    if *scores* is empty (unchanged from the original).
    """
    colormap = plt.get_cmap(cmap_name)
    assert len(words) == len(scores)
    max_s = max(scores)
    min_s = min(scores)
    # Collect pieces and join once instead of quadratic string concatenation.
    pieces = []
    for w, raw in zip(words, scores):
        score = rescale_score_by_abs(raw, max_s, min_s) if context else 0.5 + raw
        pieces.append(span_word(w, score, colormap) + " ")
    return "".join(pieces) + "\n"
DimentionN = 1000
# NOTE(security): pickle.load executes arbitrary code from the file — these
# model artifacts must come from a trusted location.
word2vecdic = pickle.load(open(("word2vecdic_kihon4_" + str(DimentionN) + ".pkl"), "r"))
polarity_dic_mean = pickle.load(open("VisualizeYahooSamples/polarity_dic_mean.pkl"))
weight_cluster = pickle.load(open("VisualizeYahooSamples/weight_cluster.pkl"))
Cluster_label_dic = pickle.load(open("VisualizeYahooSamples/Cluster_label_dic.pkl"))

form = cgi.FieldStorage()
if "sentence" not in form:  # `has_key` is removed in Python 3; `in` works in both
    print("<h1>ERROR !</h1>")
else:
    sentence = form["sentence"].value
    print("<h2>Japanese Sentence (Original)</h2><hr>")
    # SECURITY FIX: escape the user-supplied sentence before echoing it into
    # the page (the original printed it raw — stored/reflected XSS).
    print(cgi.escape(sentence))
    print("<br>")
    # SECURITY FIX: the original built a shell command with
    # os.system("echo " + sentence + " | mecab ...") — shell injection.
    # Run mecab directly with an argument list and feed the sentence on stdin.
    import subprocess
    user_dics = ",".join([
        "/Users/tomoki/Mypython/ruiternews/yahoo_financeboard/userdictionary/ruiter-keyword.dic",
        "/Users/tomoki/Mypython/ruiternews/yahoo_financeboard/userdictionary/wikipedia-keyword.dic",
        "/Users/tomoki/Mypython/ruiternews/yahoo_financeboard/userdictionary/hatena-keyword.dic",
    ])
    with open("cur_output.txt", "w") as mecab_out:
        proc = subprocess.Popen(["/usr/local/bin/mecab", "-u", user_dics],
                                stdin=subprocess.PIPE, stdout=mecab_out)
        # echo appended a trailing newline; preserve that for mecab.
        proc.communicate(sentence + "\n")
    word_list, weight_sort = output_colored_text(sentence, polarity_dic_mean, word2vecdic, weight_cluster, Cluster_label_dic, context = True)
    print("<h2>Japanese Sentence (Colored)</h2><hr>")
    print(html_heatmap(word_list, weight_sort, context=True))
print("</body></html>")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment