wesslen · February 3, 2020 03:25
diff --git a/spacy-sent-sim.py b/spacy-sent-sim.py
 import spacy
 import numpy as np
 import pandas as pd
 import altair as alt
 #alt.renderers.enable('default') # if in jupyter, need to activate

 def cos_sim(t1, t2):
     """Return the cosine similarity between the vectors of two objects.

     Args:
         t1: Object exposing ``.vector`` (array-like) and ``.vector_norm``
             (the length of that vector).
         t2: Second object with the same two attributes.

     Returns:
         ``dot(t1.vector, t2.vector) / (t1.vector_norm * t2.vector_norm)``,
         or ``0.0`` when either norm is zero.
     """
     norm_product = t1.vector_norm * t2.vector_norm
     # A zero norm means there is no direction to compare; dividing would
     # produce nan and a RuntimeWarning, so report "no similarity" instead.
     if norm_product == 0:
         return 0.0
     return np.dot(t1.vector, t2.vector) / norm_product

 # Load the spaCy pipeline used for all parsing below; cos_sim reads the
 # .vector / .vector_norm attributes of the sentences and tokens it produces.
 nlp = spacy.load("en_core_web_lg")

 # Two multi-sentence texts that are compared sentence by sentence below.
 text1="Anchoring effect is the tendency to focus too heavily on one piece of information when making decisions. In this paper, we present a novel, systematic study and resulting analyses that investigate the effects of anchoring effect on human decision-making using visual analytic systems. Visual analytics interfaces typically contain multiple views that present various aspects of information such as spatial, temporal, and categorical. These views are designed to present complex, heterogeneous data in accessible forms that aid decision-making. However, human decision-making is often hindered by the use of heuristics, or cognitive biases, such as anchoring effect. Anchoring effect can be triggered by the order in which information is presented or the magnitude of information presented. Through carefully designed laboratory experiments, we present evidence of anchoring effect in analysis with visual analytics interfaces when users are primed by representation of different pieces of information. We also describe detailed analyses of users' interaction logs which reveal the impact of anchoring bias on the visual representation preferred and paths of analysis. We discuss implications for future research to possibly detect and alleviate anchoring bias."
 text2="The attraction effect is a well-studied cognitive bias in decision making research, where one's choice between two alternatives is influenced by the presence of an irrelevant (dominated) third alternative. We examine whether this cognitive bias, so far only tested with three alternatives and simple presentation formats such as numerical tables, text and pictures, also appears in visualizations. Since visualizations can be used to support decision making - e.g., when choosing a house to buy or an employee to hire - a systematic bias could have important implications. In a first crowdsource experiment, we indeed partially replicated the attraction effect with three alternatives presented as a numerical table, and observed similar effects when they were presented as a scatterplot. In a second experiment, we investigated if the effect extends to larger sets of alternatives, where the number of alternatives is too large for numerical tables to be practical. Our findings indicate that the bias persists for larger sets of alternatives presented as scatterplots. We discuss implications for future research on how to further study and possibly alleviate the attraction effect."

 # Parse both texts; the sentence spans are read from doc1.sents / doc2.sents.
 doc1 = nlp(text1)
 doc2 = nlp(text2)

 # Sentence-level similarity: every sentence of doc1 against every sentence
 # of doc2. Segment once so the spans can be addressed by position.
 sents1 = list(doc1.sents)
 sents2 = list(doc2.sents)

 # Position -> sentence text lookups for each document.
 sents1_dict = dict(enumerate(span.text for span in sents1))
 sents2_dict = dict(enumerate(span.text for span in sents2))

 # Pre-allocated matrix filled cell by cell, see
 # http://akuederle.com/create-numpy-array-with-for-loop
 # Rows follow doc1 sentences, columns follow doc2 sentences.
 sent_sims = np.empty((len(sents1), len(sents2)))
 for row, col in np.ndindex(*sent_sims.shape):
     sent_sims[row, col] = cos_sim(sents1[row], sents2[col])

 # Long format: one record per (level_0 = row index, level_1 = column index).
 source = pd.DataFrame(sent_sims)
 df = source.stack().to_frame(name='similarity').reset_index()

 # Heatmap of the similarity matrix.
 alt.Chart(df).mark_rect().encode(
     x='level_0:O',
     y='level_1:O',
     color='similarity:Q',
 )

 # print(d1_df)
 # print(d2_df)
 # print(source)


 ### word-level similarity for a pair of sentences

 # Two example sentences that are compared token by token.
 sent1 = "Last month, the government put Wuhan in a virtual lockdown, sealing off the city and banning most public transportation and private cars from its streets in a desperate effort to contain the outbreak."
 sent2 = "One of them spans about eight acres, has 1,000 beds and is scheduled to open on Monday."

 s1, s2 = nlp(sent1), nlp(sent2)

 # Token texts, in document order, for each sentence.
 s1_words = [token.text for token in s1]
 s2_words = [token.text for token in s2]

 # Rows follow the tokens of s1, columns follow the tokens of s2.
 word_sims = np.empty((len(s1), len(s2)))
 for row, col in np.ndindex(*word_sims.shape):
     word_sims[row, col] = cos_sim(s1[row], s2[col])

 # Long format: one record per (level_0 = s1 token index, level_1 = s2 token index).
 source = pd.DataFrame(word_sims)
 df = source.stack().to_frame(name='similarity').reset_index()

 # Heatmap with s2 token positions on x and s1 token positions on y.
 c = alt.Chart(df).mark_rect().encode(
     x='level_1:O',
     y='level_0:O',
     color='similarity:Q',
 )

 # to do: change chart to include words from s1_words and s2_words
 print(s1_words)
 print(s2_words)
 c
	import spacy
	import numpy as np
	import pandas as pd
	import altair as alt
	#alt.renderers.enable('default') # if in jupyter, need to activate

	def cos_sim(t1, t2):
	    """Return the cosine similarity between the vectors of two objects.

	    Args:
	        t1: Object exposing ``.vector`` (array-like) and ``.vector_norm``
	            (the length of that vector).
	        t2: Second object with the same two attributes.

	    Returns:
	        ``dot(t1.vector, t2.vector) / (t1.vector_norm * t2.vector_norm)``,
	        or ``0.0`` when either norm is zero.
	    """
	    norm_product = t1.vector_norm * t2.vector_norm
	    # A zero norm means there is no direction to compare; dividing would
	    # produce nan and a RuntimeWarning, so report "no similarity" instead.
	    if norm_product == 0:
	        return 0.0
	    return np.dot(t1.vector, t2.vector) / norm_product

	# Load the spaCy pipeline used for all parsing below; cos_sim reads the
	# .vector / .vector_norm attributes of the sentences and tokens it produces.
	nlp = spacy.load("en_core_web_lg")

	# Two multi-sentence texts that are compared sentence by sentence below.
	text1="Anchoring effect is the tendency to focus too heavily on one piece of information when making decisions. In this paper, we present a novel, systematic study and resulting analyses that investigate the effects of anchoring effect on human decision-making using visual analytic systems. Visual analytics interfaces typically contain multiple views that present various aspects of information such as spatial, temporal, and categorical. These views are designed to present complex, heterogeneous data in accessible forms that aid decision-making. However, human decision-making is often hindered by the use of heuristics, or cognitive biases, such as anchoring effect. Anchoring effect can be triggered by the order in which information is presented or the magnitude of information presented. Through carefully designed laboratory experiments, we present evidence of anchoring effect in analysis with visual analytics interfaces when users are primed by representation of different pieces of information. We also describe detailed analyses of users' interaction logs which reveal the impact of anchoring bias on the visual representation preferred and paths of analysis. We discuss implications for future research to possibly detect and alleviate anchoring bias."
	text2="The attraction effect is a well-studied cognitive bias in decision making research, where one's choice between two alternatives is influenced by the presence of an irrelevant (dominated) third alternative. We examine whether this cognitive bias, so far only tested with three alternatives and simple presentation formats such as numerical tables, text and pictures, also appears in visualizations. Since visualizations can be used to support decision making - e.g., when choosing a house to buy or an employee to hire - a systematic bias could have important implications. In a first crowdsource experiment, we indeed partially replicated the attraction effect with three alternatives presented as a numerical table, and observed similar effects when they were presented as a scatterplot. In a second experiment, we investigated if the effect extends to larger sets of alternatives, where the number of alternatives is too large for numerical tables to be practical. Our findings indicate that the bias persists for larger sets of alternatives presented as scatterplots. We discuss implications for future research on how to further study and possibly alleviate the attraction effect."

	# Parse both texts; the sentence spans are read from doc1.sents / doc2.sents.
	doc1 = nlp(text1)
	doc2 = nlp(text2)

	# Sentence-level similarity: every sentence of doc1 against every sentence
	# of doc2. Segment once so the spans can be addressed by position.
	sents1 = list(doc1.sents)
	sents2 = list(doc2.sents)

	# Position -> sentence text lookups for each document.
	sents1_dict = dict(enumerate(span.text for span in sents1))
	sents2_dict = dict(enumerate(span.text for span in sents2))

	# Pre-allocated matrix filled cell by cell, see
	# http://akuederle.com/create-numpy-array-with-for-loop
	# Rows follow doc1 sentences, columns follow doc2 sentences.
	sent_sims = np.empty((len(sents1), len(sents2)))
	for i, sent1 in enumerate(sents1):
	    for j, sent2 in enumerate(sents2):
	        # Loop nesting restored: the assignment belongs to the inner loop.
	        sent_sims[i, j] = cos_sim(sent1, sent2)

	# Long format: one record per (level_0 = row index, level_1 = column index).
	source = pd.DataFrame(sent_sims)

	df = pd.DataFrame(source.stack(), columns=['similarity']).reset_index()

	# Heatmap of the similarity matrix.
	alt.Chart(df).mark_rect().encode(
	    x='level_0:O',
	    y='level_1:O',
	    color='similarity:Q'
	)

	# print(d1_df)
	# print(d2_df)
	# print(source)


	### word-level similarity for a pair of sentences

	# Two example sentences that are compared token by token.
	sent1= "Last month, the government put Wuhan in a virtual lockdown, sealing off the city and banning most public transportation and private cars from its streets in a desperate effort to contain the outbreak."
	sent2= "One of them spans about eight acres, has 1,000 beds and is scheduled to open on Monday."

	s1 = nlp(sent1)
	s2 = nlp(sent2)

	# Token texts, in document order, for each sentence.
	s1_words = [token.text for token in s1]
	s2_words = [token.text for token in s2]

	# Rows follow the tokens of s1, columns follow the tokens of s2.
	word_sims = np.empty((len(s1_words), len(s2_words)))

	for i, word1 in enumerate(s1):
	    for j, word2 in enumerate(s2):
	        # Loop nesting restored: the assignment belongs to the inner loop.
	        word_sims[i, j] = cos_sim(word1, word2)

	# Long format: one record per (level_0 = s1 token index, level_1 = s2 token index).
	source = pd.DataFrame(word_sims)

	df = pd.DataFrame(source.stack(), columns=['similarity']).reset_index()

	# Heatmap with s2 token positions on x and s1 token positions on y.
	c = alt.Chart(df).mark_rect().encode(
	    x='level_1:O',
	    y='level_0:O',
	    color='similarity:Q'
	)

	# to do: change chart to include words from s1_words and s2_words
	print(s1_words)
	print(s2_words)
	c
No results found