Code to make a network out of the shortest N cosine-distances (or, equivalently, the strongest N associations) between a set of words in a gensim word2vec model.
""" | |
Code to make a network out of the shortest N cosine-distances (or, equivalently, the strongest N associations) | |
between a set of words in a gensim word2vec model. | |
To use: | |
Set the filenames for the word2vec model. | |
Set `my_words` to be a list of your own choosing. | |
Set `num_top_dists` to be a number or a factor of the length of `my_words.` | |
Choose between the two methods below to produce distances, and comment-out the other one. | |
""" | |
# Import gensim and load the model (plain-text word2vec format;
# KeyedVectors replaces the removed Word2Vec.load_word2vec_format)
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('[model].txt.gz', fvocab='[vocab].txt')
# Set the words we want to find connections between
my_words = ['a', 'b', ...]
my_words = [word for word in my_words if word in model]  # filter out words not in the model

# The number of connections we want: either a factor of the number of words or a fixed number
num_top_conns = len(my_words) * 2
#######
# Make a list of all word-to-word distances [each as a tuple of (word1, word2, dist)]
dists = []

## Method 1 to find distances: use gensim to get the similarity between each word pair
for i1, word1 in enumerate(my_words):
    for i2, word2 in enumerate(my_words):
        if i1 >= i2: continue  # skip self-pairs and duplicate pairs
        cosine_similarity = model.similarity(word1, word2)
        cosine_distance = 1 - cosine_similarity
        dists.append((word1, word2, cosine_distance))
## Or, Method 2 to find distances: use scipy (faster)
import numpy as np
from scipy.spatial.distance import pdist, squareform

matrix = np.array([model[word] for word in my_words])
dist_matrix = squareform(pdist(matrix, 'cosine'))  # pairwise cosine distances
for i1, word1 in enumerate(my_words):
    for i2, word2 in enumerate(my_words):
        if i1 >= i2: continue
        cosine_distance = dist_matrix[i1, i2]  # look up the precomputed distance
        dists.append((word1, word2, cosine_distance))
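The two methods should agree, since scipy's `'cosine'` metric is defined as 1 minus cosine similarity. A small self-contained check on toy vectors (not the model's) illustrates this:

```python
# Toy vectors: scipy's 'cosine' pdist equals 1 - cosine similarity.
import numpy as np
from scipy.spatial.distance import pdist, squareform

vecs = np.array([[1.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
dist_matrix = squareform(pdist(vecs, 'cosine'))

def cos_sim(a, b):
    # plain cosine similarity: dot product over the product of norms
    return a @ b / (np.linalg.norm(a) * np.linalg.norm(b))

for i in range(3):
    for j in range(3):
        assert abs(dist_matrix[i, j] - (1 - cos_sim(vecs[i], vecs[j]))) < 1e-9
```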
######
# Sort the list by ascending distance
dists.sort(key=lambda _tuple: _tuple[-1])

# Get the top connections
top_conns = dists[:num_top_conns]

# Make a network
import networkx as nx
g = nx.Graph()
for word1, word2, dist in top_conns:
    weight = 1 - dist  # cosine similarity makes more sense for edge weight
    g.add_edge(word1, word2, weight=float(weight))

# Write the network
nx.write_graphml(g, 'my-semantic-network.graphml')
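To confirm the GraphML output preserves the edge weights, a minimal round-trip sketch (using a hypothetical toy graph and filename, not the model's output):

```python
# Round-trip a small graph through GraphML: networkx stores the float
# weights as edge attributes and restores them on read.
import networkx as nx

g = nx.Graph()
g.add_edge('ship', 'sea', weight=0.62)  # toy similarity values
g.add_edge('sea', 'sky', weight=0.55)

nx.write_graphml(g, 'toy-network.graphml')
h = nx.read_graphml('toy-network.graphml')

# Node labels and weights come back intact; weights stay floats.
assert set(h.nodes) == {'ship', 'sea', 'sky'}
assert abs(h['ship']['sea']['weight'] - 0.62) < 1e-9
```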