#!/usr/bin/env python
# encoding: utf-8
print 'Content-Type: text/html; charset=UTF-8\n\n'
print "<html><body>"
import os
import sys
import cgi
import pickle
import pipes  # for pipes.quote, used to shell-escape the submitted sentence below
from collections import defaultdict
from normalizer import normalize, _convert_marks, _delete_cyclic_word
import MeCab
#os.chdir("/Users/tomoki/Sites/VisualizeYahooSamples")
#sys.path.append("/Users/tomoki/Sites/VisualizeYahooSamples")
#from YahooVisualizePreprocess import *

# Japanese negation words (base forms) used to merge a verb with its negation.
hiteiword = ["ない", "ぬ", "まい", "ん"]
def morphological_analysis(text):
    # Parse the MeCab result written to cur_output.txt. Each line has the form
    # "surface<TAB>POS,POS-sub1,...,conjugation type,conjugation form,base form,..."
    # (IPAdic features: index 0 = POS, 1 = POS subcategory, 5 = conjugation form, 6 = base form).
    cur_data_wakati = open("cur_output.txt").read().split("\n")[0:-2]
    original_words = [res.split("\t")[0] for res in cur_data_wakati]
    normed_words = [res.split("\t")[1].split(",")[6] for res in cur_data_wakati]
    hinshi_array = [res.split("\t")[1].split(",")[0] for res in cur_data_wakati]
    forms = [res.split("\t")[1].split(",")[5] for res in cur_data_wakati]
    nums = [res.split("\t")[1].split(",")[1] for res in cur_data_wakati]
    wordarray = ""
    wordarray_origin = ""
    index = 0
    while index < len(original_words):
        word = original_words[index]
        hinshi = hinshi_array[index]
        # Keep nouns, verbs and adjectives that are not numerals and have a known base form.
        if (hinshi in ("名詞", "動詞", "形容詞")) and (nums[index] != "数") and (normed_words[index] != "*"):
            if (hinshi == "動詞") and (forms[index] == "未然形"):
                # Verb in irrealis form followed by a negation word: merge the pair.
                if ((index + 1) < len(original_words)) and (normed_words[index+1] in hiteiword):
                    wordarray = wordarray + word + normed_words[index+1] + " "
                    wordarray_origin = wordarray_origin + word + normed_words[index+1] + " "
                    index += 2
                    continue
            elif hinshi == "動詞":
                # Verb followed by another verb (compound verb): concatenate without a space.
                if ((index + 1) < len(original_words)) and (hinshi_array[index+1] == "動詞"):
                    wordarray = wordarray + word
                    wordarray_origin = wordarray_origin + word
                    index += 1
                    continue
                # Verb + auxiliary verb in irrealis form: merge both with the base form
                # of the token that follows (pattern 4).
                if ((index + 2) < len(original_words)) and (hinshi_array[index+1] == "助動詞") and (forms[index+1] == "未然形"):
                    wordarray = wordarray + word + original_words[index+1] + normed_words[index+2] + " "
                    wordarray_origin = wordarray_origin + word + original_words[index+1] + original_words[index+2] + " "
                    index += 3
                    continue
            # Default for content words: emit the normalized (base) form.
            wordarray = wordarray + normed_words[index] + " "
            wordarray_origin = wordarray_origin + word + " "
        else:
            wordarray = wordarray + word + " "
            wordarray_origin = wordarray_origin + word + " "
        index += 1
    wordarray = wordarray + "\n"
    wordarray_origin = wordarray_origin + "\n"
    return (wordarray, wordarray_origin)
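# A minimal sketch of the expected input/output, assuming cur_output.txt holds
# standard IPAdic-formatted MeCab output (surface and features tab-separated)
# for the sentence "株価が上がらない" ("stock prices do not rise"):
#
#   株価    名詞,一般,*,*,*,*,株価,カブカ,カブカ
#   が      助詞,格助詞,一般,*,*,*,が,ガ,ガ
#   上がら  動詞,自立,*,*,五段・ラ行,未然形,上がる,アガラ,アガラ
#   ない    助動詞,*,*,*,特殊・ナイ,基本形,ない,ナイ,ナイ
#   EOS
#
# The irrealis verb 上がら merges with the following negation ない, so the function
# would return ("株価 が 上がらない \n", "株価 が 上がらない \n").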
def output_colored_text(text, polarity_word_dic, word2vecdic, weight_cluster, Cluster_label_dic, context=True):
    # Note: morphological_analysis reads the MeCab result from cur_output.txt; its text
    # argument is unused, so MeCab must have been run on the same sentence beforehand.
    wakati_text, wakati_text_origin = morphological_analysis(normalize(text).encode("utf-8"))
    #wakati_text, wakati_text_origin = morphological_analysis(text.encode("utf-8"))
    wakati_text_origin_list = wakati_text_origin.split()
    weight_sort = []
    word_list = []
    for word_index, word in enumerate(wakati_text.split()):
        if word in polarity_word_dic:
            if context:
                # Context-aware weight: the word's polarity score scaled by the weight of its cluster.
                weight_sort.append(weight_cluster[Cluster_label_dic[word2vecdic[word.decode("utf-8")]]][1] * polarity_word_dic[word])
            else:
                weight_sort.append(polarity_word_dic[word])
            #word_list.append(word)
            #word_list.append(wakati_text_origin_list[word_index])
            word_list.append(wakati_text_origin_list[word_index] + "[" + str(word2vecdic[word.decode("utf-8")]) + "]")
        else:
            weight_sort.append(0)
            #word_list.append(word)
            word_list.append(wakati_text_origin_list[word_index])
    return word_list, weight_sort
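# Sketch of the return values, with hypothetical dictionaries: for the tokens
# "株価 上昇" with polarity_word_dic = {"上昇": 0.8} and an assumed word2vec
# id of 123 for 上昇, the function would return something like
#   word_list   = ["株価", "上昇[123]"]
#   weight_sort = [0, 0.8 * w]   where w is the weight of 上昇's cluster (context=True)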
import matplotlib
matplotlib.use("Agg")  # non-interactive backend so the import works under CGI (no display)
import matplotlib.pyplot as plt
def rescale_score_by_abs(score, max_score, min_score):
    """
    Rescale a positive score to the range [0.5, 1.0] and a negative score to the
    range [0.0, 0.5], using the extremal scores max_score and min_score for normalization.
    """
    # CASE 1: positive AND negative scores occur --------------------
    if max_score > 0 and min_score < 0:
        if max_score >= abs(min_score):  # deepest color is positive
            if score >= 0:
                return 0.5 + 0.5 * (score / max_score)
            else:
                return 0.5 - 0.5 * (abs(score) / max_score)
        else:  # deepest color is negative
            if score >= 0:
                return 0.5 + 0.5 * (score / abs(min_score))
            else:
                return 0.5 - 0.5 * (score / min_score)
    # CASE 2: ONLY positive scores occur -----------------------------
    elif max_score > 0 and min_score >= 0:
        if max_score == min_score:
            return 1.0
        else:
            return 0.5 + 0.5 * (score / max_score)
    # CASE 3: ONLY negative scores occur -----------------------------
    elif max_score <= 0 and min_score < 0:
        if max_score == min_score:
            return 0.0
        else:
            return 0.5 - 0.5 * (score / min_score)
    # CASE 4: all scores are zero: return the neutral midpoint.
    else:
        return 0.5
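# Worked examples (hypothetical scores) for the rescaling above: with
# max_score = 2.0 and min_score = -1.0, the positive side is deepest, so
#   rescale_score_by_abs( 2.0, 2.0, -1.0) -> 1.0   (strongest positive)
#   rescale_score_by_abs(-1.0, 2.0, -1.0) -> 0.25  (negative, scaled by the larger |max_score|)
#   rescale_score_by_abs( 0.0, 2.0, -1.0) -> 0.5   (neutral midpoint)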
def getRGB(c_tuple):
    return "#%02x%02x%02x" % (int(c_tuple[0] * 255), int(c_tuple[1] * 255), int(c_tuple[2] * 255))
def span_word(word, score, colormap):
    return "<span style=\"background-color:" + getRGB(colormap(score)) + "\">" + word + "</span>"
def html_heatmap(words, scores, cmap_name="bwr", context=True):
    colormap = plt.get_cmap(cmap_name)
    assert len(words) == len(scores)
    max_s = max(scores)
    min_s = min(scores)
    output_text = ""
    for idx, w in enumerate(words):
        if context:
            #score = 0.5 + scores[idx]/max(abs(max_s), abs(min_s))
            score = rescale_score_by_abs(scores[idx], max_s, min_s)
        else:
            score = 0.5 + scores[idx]
        output_text = output_text + span_word(w, score, colormap) + " "
    return output_text + "\n"
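# A small usage sketch (hypothetical weights): html_heatmap(["株価", "上昇"], [0.0, 0.8])
# rescales 0.0 -> 0.5 (white) and 0.8 -> 1.0 (red) and returns roughly
#   <span style="background-color:#ffffff">株価</span> <span style="background-color:#ff0000">上昇</span>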
DimensionN = 1000
word2vecdic = pickle.load(open("word2vecdic_kihon4_" + str(DimensionN) + ".pkl", "r"))
#polarity_dic_list = pickle.load(open("result_for_PAKDD2018/polarity_dic_list.pkl"))
polarity_dic_mean = pickle.load(open("VisualizeYahooSamples/polarity_dic_mean.pkl"))
#polarity_dic_mean = defaultdict(int)
#for polarity_dic in polarity_dic_list:
#    for word in polarity_dic:
#        polarity_dic_mean[word] += polarity_dic[word]/5.0
weight_cluster = pickle.load(open("VisualizeYahooSamples/weight_cluster.pkl"))
Cluster_label_dic = pickle.load(open("VisualizeYahooSamples/Cluster_label_dic.pkl"))
#text = "株価が上昇する"  # "Stock prices rise"
#word_list, weight_sort = output_colored_text(text, polarity_dic_mean, word2vecdic, weight_cluster, Cluster_label_dic, context=True)
#print html_heatmap(word_list, weight_sort, context=True)
#display(HTML(html_heatmap(word_list, weight_sort, context=True)))
form = cgi.FieldStorage()
form_check = 0
if "sentence" in form:
    form_check = 1
if form_check == 0:
    print "<h1>ERROR !</h1>"
else:
    print "<h2>Japanese Sentence (Original)</h2><hr>"
    print cgi.escape(form["sentence"].value)  # escape user input before echoing it back
    print "<br>"
    # Run MeCab on the submitted sentence; pipes.quote guards against shell injection.
    os.system("echo " + pipes.quote(form["sentence"].value) + " | /usr/local/bin/mecab" +
              " -u /Users/tomoki/Mypython/ruiternews/yahoo_financeboard/userdictionary/ruiter-keyword.dic," +
              "/Users/tomoki/Mypython/ruiternews/yahoo_financeboard/userdictionary/wikipedia-keyword.dic," +
              "/Users/tomoki/Mypython/ruiternews/yahoo_financeboard/userdictionary/hatena-keyword.dic" +
              " > cur_output.txt")
    word_list, weight_sort = output_colored_text(form["sentence"].value, polarity_dic_mean, word2vecdic, weight_cluster, Cluster_label_dic, context=True)
    print "<h2>Japanese Sentence (Colored)</h2><hr>"
    print html_heatmap(word_list, weight_sort, context=True)
    #print "<b>mail: </b>", form["mail"].value
    #html_data = open("VisualizeYahooSamples/VisResult/body/output_0.html").read()
    #print html_data
print "</body></html>"